patch-2.1.48 linux/fs/buffer.c

diff -u --recursive --new-file v2.1.47/linux/fs/buffer.c linux/fs/buffer.c
@@ -49,8 +49,9 @@
 
 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
 #define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)
-#define MAX_UNUSED_BUFFERS 30 /* don't ever have more than this number of 
-				 unused buffer heads */
+#define NR_RESERVED (2*MAX_BUF_PER_PAGE)
+#define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this 
+					     number of unused buffer heads */
 #define HASH_PAGES         4  /* number of pages to use for the hash table */
 #define HASH_PAGES_ORDER   2
 #define NR_HASH (HASH_PAGES*PAGE_SIZE/sizeof(struct buffer_head *))
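
The reserve is sized as two pages' worth of the smallest (512-byte) buffers.  With a
4096-byte PAGE_SIZE (an assumption here; it is architecture-dependent in the kernel)
that works out to 8 buffer heads per page, a reserve of 16, and at most 36 unused
heads kept around.  A stand-alone sketch of the arithmetic, reusing the constants
from the hunk above:

#include <stdio.h>

#define PAGE_SIZE          4096	/* assumed value; arch-dependent in the kernel */
#define MAX_BUF_PER_PAGE   (PAGE_SIZE / 512)
#define NR_RESERVED        (2*MAX_BUF_PER_PAGE)
#define MAX_UNUSED_BUFFERS NR_RESERVED+20

int main(void)
{
	/* prints 8, 16 and 36 respectively */
	printf("per page %d, reserved %d, max unused %d\n",
	       MAX_BUF_PER_PAGE, NR_RESERVED, MAX_UNUSED_BUFFERS);
	return 0;
}
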
@@ -560,6 +561,7 @@
 		if (!bh)
 			return NULL;
 		bh->b_count++;
+		bh->b_lru_time = jiffies;
 		wait_on_buffer(bh);
 		if (bh->b_dev == dev		&&
 		    bh->b_blocknr == block	&&
@@ -642,34 +644,20 @@
 	}
 }
 
-/* Check if a buffer is OK to be reclaimed. */
-static inline int can_reclaim(struct buffer_head *bh, int size)
-{
-	if (bh->b_count			|| 
-	    buffer_protected(bh)	||
-	    buffer_locked(bh))
-		return 0;
-			 
-	if (buffer_dirty(bh)) {
-		refile_buffer(bh);
-		return 0;
-	}
-
-	if (bh->b_size != size)
-		return 0;
-
-	return 1;
-}
-
-/* Find a candidate buffer to be reclaimed. */
-static struct buffer_head *find_candidate(struct buffer_head *list,
+/*
+ * Find a candidate buffer to be reclaimed. 
+ * N.B. Must search the entire BUF_LOCKED list rather than terminating
+ * when the first locked buffer is found.  Buffers are unlocked at 
+ * completion of IO, and under some conditions there may be (many)
+ * unlocked buffers after the first locked one.
+ */
+static struct buffer_head *find_candidate(struct buffer_head *bh,
 					  int *list_len, int size)
 {
-	struct buffer_head *bh;
-	
-	for (bh = list; 
-	     bh && (*list_len) > 0; 
-	     bh = bh->b_next_free, (*list_len)--) {
+	if (!bh)
+		goto no_candidate;
+
+	for (; (*list_len) > 0; bh = bh->b_next_free, (*list_len)--) {
 		if (size != bh->b_size) {
 			/* This provides a mechanism for freeing blocks
 			 * of other sizes, this is necessary now that we
@@ -680,110 +668,144 @@
 				break;
 			continue;
 		}
-
-		if (buffer_locked(bh) && bh->b_list == BUF_LOCKED) {
-			/* Buffers are written in the order they are placed 
-			 * on the locked list. If we encounter a locked
-			 * buffer here, this means that the rest of them
-			 * are also locked.
-			 */
-			(*list_len) = 0;
-			return NULL;
-		}
-
-		if (can_reclaim(bh,size))
-		    return bh;
+		else if (!bh->b_count		&& 
+			 !buffer_locked(bh)	&& 
+			 !buffer_protected(bh)	&&
+			 !buffer_dirty(bh)) 
+			return bh;
 	}
 
+no_candidate:
 	return NULL;
 }
 
 static void refill_freelist(int size)
 {
-	struct buffer_head * bh;
+	struct buffer_head * bh, * next;
 	struct buffer_head * candidate[BUF_DIRTY];
-	unsigned int best_time, winner;
 	int buffers[BUF_DIRTY];
 	int i;
-	int needed;
+	int needed, obtained=0;
 
 	refilled = 1;
-	/* If there are too many dirty buffers, we wake up the update process
-	 * now so as to ensure that there are still clean buffers available
-	 * for user processes to use (and dirty).
-	 */
 	
 	/* We are going to try to locate this much memory. */
 	needed = bdf_prm.b_un.nrefill * size;  
 
-	while ((nr_free_pages > min_free_pages*2)	&&
-	       (needed > 0)				&&
-	       grow_buffers(GFP_BUFFER, size))
-		needed -= PAGE_SIZE;
+	while ((nr_free_pages > min_free_pages*2) && 
+		grow_buffers(GFP_BUFFER, size)) {
+		obtained += PAGE_SIZE;
+		if (obtained >= needed)
+			return;
+	}
 
+	/*
+	 * Update the needed amount based on the number of potentially
+	 * freeable buffers. We don't want to free more than one quarter
+	 * of the available buffers.
+	 */
+	i = (nr_buffers_type[BUF_CLEAN] + nr_buffers_type[BUF_LOCKED]) >> 2;
+	if (i < bdf_prm.b_un.nrefill) {
+		needed = i * size;
+		if (needed < PAGE_SIZE)
+			needed = PAGE_SIZE;
+	}
+
+	/* 
+	 * OK, we cannot grow the buffer cache, now try to get some
+	 * from the lru list.
+	 */
 repeat:
-	if(needed <= 0)
+	if (obtained >= needed)
 		return;
 
-	/* OK, we cannot grow the buffer cache, now try to get some
-	 * from the lru list.
-	 *
+	/*
 	 * First set the candidate pointers to usable buffers.  This
-	 * should be quick nearly all of the time.
+	 * should be quick nearly all of the time.  N.B. There must be 
+	 * no blocking calls after setting up the candidate[] array!
 	 */
-
-	for(i=0; i<BUF_DIRTY; i++) {
+	for (i = BUF_CLEAN; i<BUF_DIRTY; i++) {
 		buffers[i] = nr_buffers_type[i];
 		candidate[i] = find_candidate(lru_list[i], &buffers[i], size);
 	}
 	
-	/* Now see which candidate wins the election. */
-	
-	winner = best_time = UINT_MAX;	
-	for(i=0; i<BUF_DIRTY; i++) {
-		if(!candidate[i])
-			continue;
-		if(candidate[i]->b_lru_time < best_time) {
-			best_time = candidate[i]->b_lru_time;
-			winner = i;
-		}
-	}
-	
-	/* If we have a winner, use it, and then get a new candidate from that list. */
-	if(winner != UINT_MAX) {
-		i = winner;
-		while (needed>0 && (bh=candidate[i])) {
-			candidate[i] = bh->b_next_free;
-			if(candidate[i] == bh)
-				candidate[i] = NULL;  /* Got last one */		
-			remove_from_queues(bh);
-			bh->b_dev = B_FREE;
-			put_last_free(bh);
-			needed -= bh->b_size;
-			buffers[i]--;
-			if(buffers[i] == 0)
-				candidate[i] = NULL;
-		
-			if (candidate[i] && !can_reclaim(candidate[i],size))
-				candidate[i] = find_candidate(candidate[i],
-							      &buffers[i], size);
+	/*
+	 * Select the older of the available buffers until we reach our goal.
+	 */
+	for (;;) {
+		i = BUF_CLEAN;
+		if (!candidate[BUF_CLEAN]) {
+			if (!candidate[BUF_LOCKED])
+				break;
+			i = BUF_LOCKED;
 		}
-		goto repeat;
-	}
+		else if (candidate[BUF_LOCKED] &&
+				(candidate[BUF_LOCKED]->b_lru_time < 
+				 candidate[BUF_CLEAN ]->b_lru_time))
+			i = BUF_LOCKED;
+		/*
+		 * Free the selected buffer and get the next candidate.
+		 */
+		bh = candidate[i];
+		next = bh->b_next_free;
+
+		obtained += bh->b_size;
+		remove_from_queues(bh);
+		put_last_free(bh);
+		if (obtained >= needed)
+			return;
+
+		if (--buffers[i] && bh != next)
+			candidate[i] = find_candidate(next, &buffers[i], size);
+		else
+			candidate[i] = NULL;
+	}	
+
+	/*
+	 * If we achieved at least half of our goal, return now.
+	 */
+	if (obtained >= (needed >> 1))
+		return;
 	
 	/* Too bad, that was not enough. Try a little harder to grow some. */
 	if (nr_free_pages > min_free_pages + 5) {
 		if (grow_buffers(GFP_BUFFER, size)) {
-			needed -= PAGE_SIZE;
+			obtained += PAGE_SIZE;
 			goto repeat;
 		}
 	}
-	
-	/* And repeat until we find something good. */
+
+	/*
+	 * Make one more attempt to allocate some buffers.
+	 */
 	if (grow_buffers(GFP_ATOMIC, size))
-		needed -= PAGE_SIZE;
-	else
-		wakeup_bdflush(1);
+		obtained += PAGE_SIZE;
+
+	/*
+	 * If we got any buffers, or another task freed some, 
+	 * return now to let this task proceed.
+	 */
+	if (obtained || free_list[BUFSIZE_INDEX(size)]) {
+#ifdef BUFFER_DEBUG
+printk("refill_freelist: obtained %d of %d, free list=%d\n", 
+obtained, needed, free_list[BUFSIZE_INDEX(size)] != NULL);
+#endif
+		return;
+	}
+
+	/*
+	 * System is _very_ low on memory ... wake up bdflush and wait.
+	 */
+#ifdef BUFFER_DEBUG
+printk("refill_freelist: waking bdflush\n");
+#endif
+	wakeup_bdflush(1);
+	/*
+	 * While we slept, other tasks may have needed buffers and entered
+	 * refill_freelist.  This could be a big problem ... reset the 
+	 * needed amount to the absolute minimum.
+	 */
+	needed = size;
 	goto repeat;
 }
 
@@ -831,6 +853,8 @@
 	 * and that it's unused (b_count=0), unlocked (buffer_locked=0), and clean.
 	 */
 	bh->b_count=1;
+	bh->b_list	= BUF_CLEAN;
+	bh->b_lru_time	= jiffies;
 	bh->b_flushtime=0;
 	bh->b_state=(1<<BH_Touched);
 	bh->b_dev=dev;
@@ -856,6 +880,16 @@
 
 
 /*
+ * Put a buffer into the appropriate list, without side-effects.
+ */
+static inline void file_buffer(struct buffer_head *bh, int list)
+{
+	remove_from_queues(bh);
+	bh->b_list = list;
+	insert_into_queues(bh);
+}
+
+/*
  * A buffer may need to be moved from one buffer list to another
  * (e.g. in case it is not shared any more). Handle this.
  */
@@ -873,15 +907,9 @@
 		dispose = BUF_LOCKED;
 	else
 		dispose = BUF_CLEAN;
-	if(dispose == BUF_CLEAN)
-		buf->b_lru_time = jiffies;
 	if(dispose != buf->b_list) {
-		if(dispose == BUF_DIRTY)
-			 buf->b_lru_time = jiffies;
-		remove_from_queues(buf);
-		buf->b_list = dispose;
-		insert_into_queues(buf);
-		if (dispose == BUF_DIRTY) {
+		file_buffer(buf, dispose);
+		if(dispose == BUF_DIRTY) {
 			int too_many = (nr_buffers * bdf_prm.b_un.nfract/100);
 
 			/* This buffer is dirty, maybe we need to start flushing.
@@ -1034,34 +1062,11 @@
 	nr_unused_buffer_heads++;
 	bh->b_next_free = unused_list;
 	unused_list = bh;
+	if (!waitqueue_active(&buffer_wait))
+		return;
 	wake_up(&buffer_wait);
 }
 
-static void get_more_buffer_heads(void)
-{
-	struct buffer_head * bh;
-
-	while (!unused_list) {
-		/* This is critical.  We can't swap out pages to get
-		 * more buffer heads, because the swap-out may need
-		 * more buffer-heads itself.  Thus SLAB_ATOMIC.
-		 */
-		if((bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC)) != NULL) {
-			put_unused_buffer_head(bh);
-			nr_buffer_heads++;
-			return;
-		}
-
-		/* Uhhuh. We're _really_ low on memory. Now we just
-		 * wait for old buffer heads to become free due to
-		 * finishing IO..
-		 */
-		run_task_queue(&tq_disk);
-		sleep_on(&buffer_wait);
-	}
-
-}
-
 /* 
  * We can't put completed temporary IO buffer_heads directly onto the
  * unused_list when they become unlocked, since the device driver
@@ -1083,18 +1088,59 @@
 	}
 }
 
-static struct buffer_head * get_unused_buffer_head(void)
+/*
+ * Reserve NR_RESERVED buffer heads for async IO requests to avoid
+ * no-buffer-head deadlock.  Return NULL on failure; waiting for
+ * buffer heads is now handled in create_buffers().
+ */ 
+static struct buffer_head * get_unused_buffer_head(int async)
 {
 	struct buffer_head * bh;
 
 	recover_reusable_buffer_heads();
-	get_more_buffer_heads();
-	if (!unused_list)
-		return NULL;
-	bh = unused_list;
-	unused_list = bh->b_next_free;
-	nr_unused_buffer_heads--;
-	return bh;
+	if (nr_unused_buffer_heads > NR_RESERVED) {
+		bh = unused_list;
+		unused_list = bh->b_next_free;
+		nr_unused_buffer_heads--;
+		return bh;
+	}
+
+	/* This is critical.  We can't swap out pages to get
+	 * more buffer heads, because the swap-out may need
+	 * more buffer-heads itself.  Thus SLAB_ATOMIC.
+	 */
+	if((bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC)) != NULL) {
+		memset(bh, 0, sizeof(*bh));
+		nr_buffer_heads++;
+		return bh;
+	}
+
+	/*
+	 * If we need an async buffer, use the reserved buffer heads.
+	 */
+	if (async && unused_list) {
+		bh = unused_list;
+		unused_list = bh->b_next_free;
+		nr_unused_buffer_heads--;
+		return bh;
+	}
+
+#if 0
+	/*
+	 * (Pending further analysis ...)
+	 * Ordinary (non-async) requests can use a different memory priority
+	 * to free up pages. Any swapping thus generated will use async
+	 * buffer heads.
+	 */
+	if(!async &&
+	   (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
+		memset(bh, 0, sizeof(*bh));
+		nr_buffer_heads++;
+		return bh;
+	}
+#endif
+
+	return NULL;
 }
 
 /*
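
The policy in the new get_unused_buffer_head() is: take from the unused list only
while it holds more than the reserve, then try a fresh atomic allocation, and only
let async (paging/swapping) requests consume the reserve itself.  A user-space sketch
of that fallback order, with malloc() standing in for the atomic slab allocation and
all names hypothetical:

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#define POOL_RESERVED 16

struct head {
	struct head *next;
};

static struct head *pool;	/* free list, like unused_list */
static int pool_count;		/* like nr_unused_buffer_heads */

struct head *get_head(int async)
{
	struct head *h;

	/* 1. Use the free pool while it holds more than the reserve. */
	if (pool_count > POOL_RESERVED) {
		h = pool;
		pool = h->next;
		pool_count--;
		return h;
	}

	/* 2. Try a fresh (non-blocking) allocation. */
	h = malloc(sizeof(*h));
	if (h) {
		memset(h, 0, sizeof(*h));
		return h;
	}

	/* 3. Only async requests may consume the reserve itself. */
	if (async && pool) {
		h = pool;
		pool = h->next;
		pool_count--;
		return h;
	}

	return NULL;	/* ordinary request: caller must cope with failure */
}

int main(void)
{
	struct head *h = get_head(0);	/* empty pool: falls through to malloc */

	printf("got %p\n", (void *) h);
	free(h);
	return 0;
}
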
@@ -1102,16 +1148,22 @@
  * the size of each buffer.. Use the bh->b_this_page linked list to
  * follow the buffers created.  Return NULL if unable to create more
  * buffers.
+ * The async flag is used to differentiate async IO (paging, swapping)
+ * from ordinary buffer allocations, and only async requests are allowed
+ * to sleep waiting for buffer heads. 
  */
-static struct buffer_head * create_buffers(unsigned long page, unsigned long size)
+static struct buffer_head * create_buffers(unsigned long page, 
+						unsigned long size, int async)
 {
+	struct wait_queue wait = { current, NULL };
 	struct buffer_head *bh, *head;
 	long offset;
 
+try_again:
 	head = NULL;
 	offset = PAGE_SIZE;
 	while ((offset -= size) >= 0) {
-		bh = get_unused_buffer_head();
+		bh = get_unused_buffer_head(async);
 		if (!bh)
 			goto no_grow;
 
@@ -1138,7 +1190,35 @@
 		bh = bh->b_this_page;
 		put_unused_buffer_head(head);
 	}
-	return NULL;
+
+	/*
+	 * Return failure for non-async IO requests.  Async IO requests
+	 * are not allowed to fail, so we have to wait until buffer heads
+	 * become available.  But we don't want tasks sleeping with 
+	 * partially complete buffers, so all were released above.
+	 */
+	if (!async)
+		return NULL;
+
+	/* Uhhuh. We're _really_ low on memory. Now we just
+	 * wait for old buffer heads to become free due to
+	 * finishing IO.  Since this is an async request and
+	 * the reserve list is empty, we're sure there are 
+	 * async buffer heads in use.
+	 */
+	run_task_queue(&tq_disk);
+
+	/* 
+	 * Set our state for sleeping, then check again for buffer heads.
+	 * This ensures we won't miss a wake_up from an interrupt.
+	 */
+	add_wait_queue(&buffer_wait, &wait);
+	current->state = TASK_UNINTERRUPTIBLE;
+	recover_reusable_buffer_heads();
+	schedule();
+	remove_wait_queue(&buffer_wait, &wait);
+	current->state = TASK_RUNNING;
+	goto try_again;
 }
 
 /* Run the hooks that have to be done when a page I/O has completed. */
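
The wait at the end of create_buffers() follows the classic ordering for avoiding a
lost wakeup: register on the wait queue and set the task state first, re-check for
reusable buffer heads, and only then call schedule().  In user space the same
guarantee comes from re-checking the condition while holding the mutex that protects
it before blocking on a condition variable; a small sketch (names hypothetical):

#include <pthread.h>

static pthread_mutex_t lock  = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  freed = PTHREAD_COND_INITIALIZER;
static int free_heads;			/* stands in for reusable buffer heads */

void wait_for_head(void)
{
	pthread_mutex_lock(&lock);
	while (free_heads == 0)		/* re-check after every wakeup */
		pthread_cond_wait(&freed, &lock);
	free_heads--;
	pthread_mutex_unlock(&lock);
}

void release_head(void)			/* e.g. called on IO completion */
{
	pthread_mutex_lock(&lock);
	free_heads++;
	pthread_cond_signal(&freed);	/* cheap when nobody is waiting */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	release_head();			/* make one head available ... */
	wait_for_head();		/* ... so this returns immediately */
	return 0;
}
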
@@ -1189,12 +1269,13 @@
 	clear_bit(PG_uptodate, &page->flags);
 	clear_bit(PG_error, &page->flags);
 	/*
-	 * Allocate buffer heads pointing to this page, just for I/O.
+	 * Allocate async buffer heads pointing to this page, just for I/O.
 	 * They do _not_ show up in the buffer hash table!
 	 * They are _not_ registered in page->buffers either!
 	 */
-	bh = create_buffers(page_address(page), size);
+	bh = create_buffers(page_address(page), size, 1);
 	if (!bh) {
+		/* WSH: exit here leaves page->count incremented */
 		clear_bit(PG_locked, &page->flags);
 		wake_up(&page->wait);
 		return -ENOMEM;
@@ -1405,16 +1486,15 @@
 		return 0;
 	}
 
-	isize = BUFSIZE_INDEX(size);
-
 	if (!(page = __get_free_page(pri)))
 		return 0;
-	bh = create_buffers(page, size);
+	bh = create_buffers(page, size, 0);
 	if (!bh) {
 		free_page(page);
 		return 0;
 	}
 
+	isize = BUFSIZE_INDEX(size);
 	insert_point = free_list[isize];
 
 	tmp = bh;
@@ -1554,6 +1634,18 @@
 				      SLAB_HWCACHE_ALIGN, NULL, NULL);
 	if(!bh_cachep)
 		panic("Cannot create buffer head SLAB cache\n");
+	/*
+	 * Allocate the reserved buffer heads.
+	 */
+	while (nr_buffer_heads < NR_RESERVED) {
+		struct buffer_head * bh;
+
+		bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC);
+		if (!bh)
+			break;
+		put_unused_buffer_head(bh);
+		nr_buffer_heads++;
+	}
 
 	lru_list[BUF_CLEAN] = 0;
 	grow_buffers(GFP_KERNEL, BLOCK_SIZE);
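
buffer_init() now primes the reserve before any IO happens, so the NR_RESERVED heads
are available even if the slab allocator later runs dry.  A sketch of that
best-effort priming loop, reusing the illustrative pool names from the earlier
sketch:

#include <stdlib.h>
#include <stdio.h>

#define POOL_RESERVED 16

struct head { struct head *next; };

static struct head *pool;
static int pool_count;

static void put_head(struct head *h)
{
	h->next = pool;
	pool = h;
	pool_count++;
}

int init_pool(void)
{
	while (pool_count < POOL_RESERVED) {
		struct head *h = malloc(sizeof(*h));

		if (!h)
			break;		/* best effort, as in the patch */
		put_head(h);
	}
	return pool_count;
}

int main(void)
{
	printf("reserved %d heads\n", init_pool());
	return 0;
}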
