patch-2.4.10 linux/fs/buffer.c
- Lines: 743
- Date: Thu Sep 20 17:25:51 2001
- Orig file: v2.4.9/linux/fs/buffer.c
- Orig date: Mon Aug 27 12:41:46 2001
diff -u --recursive --new-file v2.4.9/linux/fs/buffer.c linux/fs/buffer.c
@@ -103,7 +103,8 @@
atomic_t buffermem_pages = ATOMIC_INIT(0);
/* Here is the parameter block for the bdflush process. If you add or
- * remove any of the parameters, make sure to update kernel/sysctl.c.
+ * remove any of the parameters, make sure to update kernel/sysctl.c
+ * and the documentation at linux/Documentation/sysctl/vm.txt.
*/
#define N_PARAM 9
@@ -130,10 +131,11 @@
/* These are the min and max parameter values that we will allow to be assigned */
int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 0, 0, 0};
-int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,600*HZ, 6000*HZ, 100, 0, 0};
+int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,10000*HZ, 6000*HZ, 100, 0, 0};
inline void unlock_buffer(struct buffer_head *bh)
{
+ clear_bit(BH_Wait_IO, &bh->b_state);
clear_bit(BH_Lock, &bh->b_state);
smp_mb__after_clear_bit();
if (waitqueue_active(&bh->b_wait))
@@ -207,7 +209,7 @@
int nr;
next = lru_list[BUF_DIRTY];
- nr = nr_buffers_type[BUF_DIRTY] * 2;
+ nr = nr_buffers_type[BUF_DIRTY];
count = 0;
while (next && --nr >= 0) {
struct buffer_head * bh = next;
@@ -261,7 +263,7 @@
int nr;
next = lru_list[index];
- nr = nr_buffers_type[index] * 2;
+ nr = nr_buffers_type[index];
while (next && --nr >= 0) {
struct buffer_head *bh = next;
next = bh->b_next_free;
@@ -308,7 +310,7 @@
* We will ultimately want to put these in a separate list, but for
* now we search all of the lists for dirty buffers.
*/
-static int sync_buffers(kdev_t dev, int wait)
+int sync_buffers(kdev_t dev, int wait)
{
int err = 0;
@@ -623,6 +625,16 @@
spin_unlock(&lru_list_lock);
}
+void buffer_insert_inode_data_queue(struct buffer_head *bh, struct inode *inode)
+{
+ spin_lock(&lru_list_lock);
+ if (bh->b_inode)
+ list_del(&bh->b_inode_buffers);
+ bh->b_inode = inode;
+ list_add(&bh->b_inode_buffers, &inode->i_dirty_data_buffers);
+ spin_unlock(&lru_list_lock);
+}
+
/* The caller must have the lru_list lock before calling the
remove_inode_queue functions. */
static void __remove_inode_queue(struct buffer_head *bh)
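The new buffer_insert_inode_data_queue() mirrors buffer_insert_inode_queue() but files the buffer on the inode's new i_dirty_data_buffers list, so dirty data buffers can be tracked separately from dirty metadata. Its in-tree caller is the __block_commit_write() hunk later in this patch; condensed, the calling pattern is roughly:

        /* Condensed from the __block_commit_write() hunk below;
         * not a standalone function. */
        set_bit(BH_Uptodate, &bh->b_state);
        if (!atomic_set_buffer_dirty(bh)) {
                __mark_dirty(bh);
                /* data buffers now go on inode->i_dirty_data_buffers,
                 * not inode->i_dirty_buffers */
                buffer_insert_inode_data_queue(bh, inode);
                need_balance_dirty = 1;
        }
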
@@ -642,13 +654,12 @@
int ret;
spin_lock(&lru_list_lock);
- ret = !list_empty(&inode->i_dirty_buffers);
+ ret = !list_empty(&inode->i_dirty_buffers) || !list_empty(&inode->i_dirty_data_buffers);
spin_unlock(&lru_list_lock);
return ret;
}
-
/* If invalidate_buffers() will trash dirty buffers, it means some kind
of fs corruption is going on. Trashing dirty data always imply losing
information that was supposed to be just stored on the physical layer
@@ -668,8 +679,16 @@
These are two special cases. Normal usage imply the device driver
to issue a sync on the device (without waiting I/O completion) and
- then an invalidate_buffers call that doesn't trash dirty buffers. */
-void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
+ then an invalidate_buffers call that doesn't trash dirty buffers.
+
+ For handling cache coherency with the blkdev pagecache the 'update' case
+ has been introduced. It is needed to re-read from disk any pinned
+ buffer. NOTE: re-reading from disk is destructive so we can do it only
+ when we assume nobody is changing the buffercache under our I/O and when
+ we think the disk contains more recent information than the buffercache.
+ The update == 1 pass marks the buffers we need to update; the update == 2
+ pass does the actual I/O. */
+void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers, int update)
{
int i, nlist, slept;
struct buffer_head * bh, * bh_next;
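Given the semantics described in the comment above, a caller that knows the disk holds newer data than the buffer cache (the blkdev pagecache coherency case) is expected to run both passes back to back, roughly as follows; the wrapper name below is illustrative and not part of this file:

        /* Illustrative only -- not part of this patch.  Re-read pinned
         * buffers from disk once the disk is known to be more recent. */
        static inline void update_buffers_example(kdev_t dev)
        {
                __invalidate_buffers(dev, 0, 1);  /* pass 1: mark pinned, uptodate buffers */
                __invalidate_buffers(dev, 0, 2);  /* pass 2: re-read the marked buffers */
        }
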
@@ -700,13 +719,36 @@
}
write_lock(&hash_table_lock);
- if (!atomic_read(&bh->b_count) &&
- (destroy_dirty_buffers || !buffer_dirty(bh))) {
- remove_inode_queue(bh);
- __remove_from_queues(bh);
- put_last_free(bh);
+ /* All buffers in the lru lists are mapped */
+ if (!buffer_mapped(bh))
+ BUG();
+ if (!atomic_read(&bh->b_count)) {
+ if (destroy_dirty_buffers || !buffer_dirty(bh)) {
+ remove_inode_queue(bh);
+ __remove_from_queues(bh);
+ put_last_free(bh);
+ }
+ } else if (update) {
+ if ((update == 2) ^ buffer_uptodate(bh) &&
+ (update == 2) ^ buffer_req(bh)) {
+ write_unlock(&hash_table_lock);
+ atomic_inc(&bh->b_count);
+ spin_unlock(&lru_list_lock);
+
+ if (update == 2) {
+ ll_rw_block(READ, 1, &bh);
+ wait_on_buffer(bh);
+ } else {
+ lock_buffer(bh);
+ clear_bit(BH_Uptodate, &bh->b_state);
+ clear_bit(BH_Req, &bh->b_state);
+ unlock_buffer(bh);
+ }
+
+ atomic_dec(&bh->b_count);
+ goto retry;
+ }
}
- /* else complain loudly? */
write_unlock(&hash_table_lock);
if (slept)
@@ -796,27 +838,13 @@
static void free_more_memory(void)
{
- balance_dirty(NODEV);
- page_launder(GFP_NOFS, 0);
+ balance_dirty();
wakeup_bdflush();
- wakeup_kswapd();
current->policy |= SCHED_YIELD;
__set_current_state(TASK_RUNNING);
schedule();
}
-/*
- * We used to try various strange things. Let's not.
- * We'll just try to balance dirty buffers, and possibly
- * launder some pages and do our best to make more memory
- * available.
- */
-static void refill_freelist(int size)
-{
- if (!grow_buffers(size))
- free_more_memory();
-}
-
void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
{
bh->b_list = BUF_CLEAN;
@@ -852,17 +880,17 @@
* that unlock the page..
*/
spin_lock_irqsave(&page_uptodate_lock, flags);
+ mark_buffer_async(bh, 0);
unlock_buffer(bh);
tmp = bh->b_this_page;
while (tmp != bh) {
- if (tmp->b_end_io == end_buffer_io_async && buffer_locked(tmp))
+ if (buffer_async(tmp) && buffer_locked(tmp))
goto still_busy;
tmp = tmp->b_this_page;
}
/* OK, the async IO on this page is complete. */
spin_unlock_irqrestore(&page_uptodate_lock, flags);
- put_bh(bh);
/*
* if none of the buffers had errors then we can set the
@@ -882,13 +910,13 @@
return;
still_busy:
- put_bh(bh);
spin_unlock_irqrestore(&page_uptodate_lock, flags);
return;
}
-void set_buffer_async_io(struct buffer_head *bh) {
+inline void set_buffer_async_io(struct buffer_head *bh) {
bh->b_end_io = end_buffer_io_async ;
+ mark_buffer_async(bh, 1);
}
/*
@@ -960,6 +988,54 @@
return err2;
}
+int fsync_inode_data_buffers(struct inode *inode)
+{
+ struct buffer_head *bh;
+ struct inode tmp;
+ int err = 0, err2;
+
+ INIT_LIST_HEAD(&tmp.i_dirty_data_buffers);
+
+ spin_lock(&lru_list_lock);
+
+ while (!list_empty(&inode->i_dirty_data_buffers)) {
+ bh = BH_ENTRY(inode->i_dirty_data_buffers.next);
+ list_del(&bh->b_inode_buffers);
+ if (!buffer_dirty(bh) && !buffer_locked(bh))
+ bh->b_inode = NULL;
+ else {
+ bh->b_inode = &tmp;
+ list_add(&bh->b_inode_buffers, &tmp.i_dirty_data_buffers);
+ if (buffer_dirty(bh)) {
+ get_bh(bh);
+ spin_unlock(&lru_list_lock);
+ ll_rw_block(WRITE, 1, &bh);
+ brelse(bh);
+ spin_lock(&lru_list_lock);
+ }
+ }
+ }
+
+ while (!list_empty(&tmp.i_dirty_data_buffers)) {
+ bh = BH_ENTRY(tmp.i_dirty_data_buffers.prev);
+ remove_inode_queue(bh);
+ get_bh(bh);
+ spin_unlock(&lru_list_lock);
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh))
+ err = -EIO;
+ brelse(bh);
+ spin_lock(&lru_list_lock);
+ }
+
+ spin_unlock(&lru_list_lock);
+ err2 = osync_inode_data_buffers(inode);
+
+ if (err)
+ return err;
+ else
+ return err2;
+}
/*
* osync is designed to support O_SYNC io. It waits synchronously for
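fsync_inode_data_buffers() is the data-list counterpart of fsync_inode_buffers(): it writes out and waits on everything queued on i_dirty_data_buffers, then lets osync_inode_data_buffers() catch buffers that were redirtied or already in flight. A filesystem fsync path is therefore expected to flush both per-inode lists; a hedged sketch (the function name is illustrative):

        /* Illustrative only -- not taken from this patch. */
        static int example_fsync_buffers(struct inode *inode)
        {
                int err, err2;

                err  = fsync_inode_buffers(inode);       /* i_dirty_buffers (metadata) */
                err2 = fsync_inode_data_buffers(inode);  /* i_dirty_data_buffers (data) */
                return err ? err : err2;
        }
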
@@ -1001,6 +1077,35 @@
return err;
}
+int osync_inode_data_buffers(struct inode *inode)
+{
+ struct buffer_head *bh;
+ struct list_head *list;
+ int err = 0;
+
+ spin_lock(&lru_list_lock);
+
+ repeat:
+
+ for (list = inode->i_dirty_data_buffers.prev;
+ bh = BH_ENTRY(list), list != &inode->i_dirty_data_buffers;
+ list = bh->b_inode_buffers.prev) {
+ if (buffer_locked(bh)) {
+ get_bh(bh);
+ spin_unlock(&lru_list_lock);
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh))
+ err = -EIO;
+ brelse(bh);
+ spin_lock(&lru_list_lock);
+ goto repeat;
+ }
+ }
+
+ spin_unlock(&lru_list_lock);
+ return err;
+}
+
/*
* Invalidate any and all dirty buffers on a given inode. We are
@@ -1009,15 +1114,13 @@
*/
void invalidate_inode_buffers(struct inode *inode)
{
- struct list_head *list, *next;
+ struct list_head * entry;
spin_lock(&lru_list_lock);
- list = inode->i_dirty_buffers.next;
- while (list != &inode->i_dirty_buffers) {
- next = list->next;
- remove_inode_queue(BH_ENTRY(list));
- list = next;
- }
+ while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers)
+ remove_inode_queue(BH_ENTRY(entry));
+ while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers)
+ remove_inode_queue(BH_ENTRY(entry));
spin_unlock(&lru_list_lock);
}
@@ -1069,6 +1172,7 @@
out:
write_unlock(&hash_table_lock);
spin_unlock(&lru_list_lock);
+ touch_buffer(bh);
return bh;
}
@@ -1078,7 +1182,10 @@
*/
write_unlock(&hash_table_lock);
spin_unlock(&lru_list_lock);
- refill_freelist(size);
+
+ if (!grow_buffers(size))
+ free_more_memory();
+
/* FIXME: getblk should fail if there's no enough memory */
goto repeat;
}
@@ -1086,7 +1193,7 @@
/* -1 -> no need to flush
0 -> async flush
1 -> sync flush (wait for I/O completion) */
-int balance_dirty_state(kdev_t dev)
+static int balance_dirty_state(void)
{
unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
@@ -1114,16 +1221,16 @@
* pressures on different devices - thus the (currently unused)
* 'dev' parameter.
*/
-void balance_dirty(kdev_t dev)
+void balance_dirty(void)
{
- int state = balance_dirty_state(dev);
+ int state = balance_dirty_state();
if (state < 0)
return;
/* If we're getting into imbalance, start write-out */
spin_lock(&lru_list_lock);
- write_some_buffers(dev);
+ write_some_buffers(NODEV);
/*
* And if we're _really_ out of balance, wait for
@@ -1132,12 +1239,12 @@
* This will throttle heavy writers.
*/
if (state > 0) {
- wait_for_some_buffers(dev);
+ wait_for_some_buffers(NODEV);
wakeup_bdflush();
}
}
-static __inline__ void __mark_dirty(struct buffer_head *bh)
+inline void __mark_dirty(struct buffer_head *bh)
{
bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
refile_buffer(bh);
@@ -1155,7 +1262,7 @@
{
if (!atomic_set_buffer_dirty(bh)) {
__mark_dirty(bh);
- balance_dirty(bh->b_dev);
+ balance_dirty();
}
}
@@ -1170,8 +1277,6 @@
dispose = BUF_LOCKED;
if (buffer_dirty(bh))
dispose = BUF_DIRTY;
- if (buffer_protected(bh))
- dispose = BUF_PROTECTED;
if (dispose != bh->b_list) {
__remove_from_lru_list(bh, bh->b_list);
bh->b_list = dispose;
@@ -1211,11 +1316,11 @@
/* grab the lru lock here to block bdflush. */
spin_lock(&lru_list_lock);
write_lock(&hash_table_lock);
- if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf) || buffer_protected(buf))
+ if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf))
goto in_use;
__hash_unlink(buf);
- remove_inode_queue(buf);
write_unlock(&hash_table_lock);
+ remove_inode_queue(buf);
__remove_from_lru_list(buf, buf->b_list);
spin_unlock(&lru_list_lock);
put_last_free(buf);
@@ -1226,16 +1331,19 @@
spin_unlock(&lru_list_lock);
}
-/*
- * bread() reads a specified block and returns the buffer that contains
- * it. It returns NULL if the block was unreadable.
+/**
+ * bread() - reads a specified block and returns the bh
+ * @block: number of block
+ * @size: size (in bytes) to read
+ *
+ * Reads a specified block, and returns the buffer head that
+ * contains it. It returns NULL if the block was unreadable.
*/
struct buffer_head * bread(kdev_t dev, int block, int size)
{
struct buffer_head * bh;
bh = getblk(dev, block, size);
- touch_buffer(bh);
if (buffer_uptodate(bh))
return bh;
ll_rw_block(READ, 1, &bh);
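Note that touch_buffer() has moved into getblk() (see the hunk above), so bread() no longer touches the buffer itself. Usage of bread() is unchanged: read a block synchronously and drop the reference with brelse() when done, for example:

        /* Typical caller pattern; 'dev', 'block' and 'size' are
         * whatever the filesystem needs to read. */
        struct buffer_head *bh = bread(dev, block, size);
        if (!bh)
                return -EIO;
        /* ... use bh->b_data ... */
        brelse(bh);
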
@@ -1436,7 +1544,7 @@
* we have truncated the file and are going to free the
* blocks on-disk..
*/
-int block_flushpage(struct page *page, unsigned long offset)
+int discard_bh_page(struct page *page, unsigned long offset, int drop_pagecache)
{
struct buffer_head *head, *bh, *next;
unsigned int curr_off = 0;
@@ -1473,7 +1581,8 @@
*/
if (!offset) {
if (!try_to_free_buffers(page, 0)) {
- atomic_inc(&buffermem_pages);
+ if (drop_pagecache)
+ atomic_inc(&buffermem_pages);
return 0;
}
}
@@ -1481,7 +1590,7 @@
return 1;
}
-static void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize)
+void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize)
{
struct buffer_head *bh, *head, *tail;
@@ -1590,8 +1699,7 @@
/* Stage 2: lock the buffers, mark them clean */
do {
lock_buffer(bh);
- bh->b_end_io = end_buffer_io_async;
- get_bh(bh);
+ set_buffer_async_io(bh);
set_bit(BH_Uptodate, &bh->b_state);
clear_bit(BH_Dirty, &bh->b_state);
bh = bh->b_this_page;
@@ -1705,14 +1813,14 @@
set_bit(BH_Uptodate, &bh->b_state);
if (!atomic_set_buffer_dirty(bh)) {
__mark_dirty(bh);
- buffer_insert_inode_queue(bh, inode);
+ buffer_insert_inode_data_queue(bh, inode);
need_balance_dirty = 1;
}
}
}
if (need_balance_dirty)
- balance_dirty(bh->b_dev);
+ balance_dirty();
/*
* is this a partial write that happened to make all buffers
* uptodate then we can optimize away a bogus readpage() for
@@ -1792,8 +1900,7 @@
for (i = 0; i < nr; i++) {
struct buffer_head * bh = arr[i];
lock_buffer(bh);
- bh->b_end_io = end_buffer_io_async;
- get_bh(bh);
+ set_buffer_async_io(bh);
}
/* Stage 3: start the IO */
@@ -2034,6 +2141,47 @@
return tmp.b_blocknr;
}
+int generic_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize, get_block_t * get_block)
+{
+ int i, nr_blocks, retval;
+ unsigned long * blocks = iobuf->blocks;
+
+ nr_blocks = iobuf->length / blocksize;
+ /* build the blocklist */
+ for (i = 0; i < nr_blocks; i++, blocknr++) {
+ struct buffer_head bh;
+
+ bh.b_state = 0;
+ bh.b_dev = inode->i_dev;
+ bh.b_size = blocksize;
+
+ retval = get_block(inode, blocknr, &bh, rw == READ ? 0 : 1);
+ if (retval)
+ goto out;
+
+ if (rw == READ) {
+ if (buffer_new(&bh))
+ BUG();
+ if (!buffer_mapped(&bh)) {
+ /* there was a hole in the filesystem */
+ blocks[i] = -1UL;
+ continue;
+ }
+ } else {
+ if (buffer_new(&bh))
+ unmap_underlying_metadata(&bh);
+ if (!buffer_mapped(&bh))
+ BUG();
+ }
+ blocks[i] = bh.b_blocknr;
+ }
+
+ retval = brw_kiovec(rw, 1, &iobuf, inode->i_dev, iobuf->blocks, blocksize);
+
+ out:
+ return retval;
+}
+
/*
* IO completion routine for a buffer_head being used for kiobuf IO: we
* can't dispatch the kiobuf callback until io_count reaches 0.
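generic_direct_IO() maps the request one block at a time through the filesystem's get_block callback, records holes as -1UL (which brw_kiovec(), per the hunk below, zero-fills on reads), and then submits everything through brw_kiovec(). A filesystem's direct_IO address-space operation would wrap it with its own get_block routine, roughly like this (the names below are illustrative):

        /* Illustrative wrapper -- not part of this hunk.  The filesystem
         * supplies its own get_block implementation. */
        static int example_direct_IO(int rw, struct inode *inode,
                                     struct kiobuf *iobuf,
                                     unsigned long blocknr, int blocksize)
        {
                return generic_direct_IO(rw, inode, iobuf, blocknr,
                                         blocksize, example_get_block);
        }
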
@@ -2149,6 +2297,18 @@
while (length > 0) {
blocknr = b[bufind++];
+ if (blocknr == -1UL) {
+ if (rw == READ) {
+ /* there was a hole in the filesystem */
+ memset(kmap(map) + offset, 0, size);
+ flush_dcache_page(map);
+ kunmap(map);
+
+ transferred += size;
+ goto skip_block;
+ } else
+ BUG();
+ }
tmp = bhs[bhind++];
tmp->b_dev = B_FREE;
@@ -2167,9 +2327,6 @@
} else
set_bit(BH_Uptodate, &tmp->b_state);
- length -= size;
- offset += size;
-
atomic_inc(&iobuf->io_count);
submit_bh(rw, tmp);
/*
@@ -2184,7 +2341,11 @@
goto finished;
bhind = 0;
}
-
+
+ skip_block:
+ length -= size;
+ offset += size;
+
if (offset >= PAGE_SIZE) {
offset = 0;
break;
@@ -2237,8 +2398,7 @@
lock_buffer(bh);
bh->b_blocknr = *(b++);
set_bit(BH_Mapped, &bh->b_state);
- bh->b_end_io = end_buffer_io_async;
- get_bh(bh);
+ set_buffer_async_io(bh);
bh = bh->b_this_page;
} while (bh != head);
@@ -2350,40 +2510,37 @@
return 0;
}
-/*
- * Sync all the buffers on one page..
- *
- * If we have old buffers that are locked, we'll
- * wait on them, but we won't wait on the new ones
- * we're writing out now.
- *
- * This all is required so that we can free up memory
- * later.
- *
- * Wait:
- * 0 - no wait (this does not get called - see try_to_free_buffers below)
- * 1 - start IO for dirty buffers
- * 2 - wait for completion of locked buffers
- */
-static void sync_page_buffers(struct buffer_head *bh, unsigned int gfp_mask)
+static int sync_page_buffers(struct buffer_head *bh, unsigned int gfp_mask)
{
- struct buffer_head * tmp = bh;
+ struct buffer_head * p = bh;
+ int tryagain = 1;
do {
- struct buffer_head *p = tmp;
- tmp = tmp->b_this_page;
- if (buffer_locked(p)) {
- if (gfp_mask & __GFP_WAIT)
- __wait_on_buffer(p);
- } else if (buffer_dirty(p))
- ll_rw_block(WRITE, 1, &p);
- } while (tmp != bh);
+ if (buffer_dirty(p) || buffer_locked(p)) {
+ if (test_and_set_bit(BH_Wait_IO, &p->b_state)) {
+ if (buffer_dirty(p)) {
+ ll_rw_block(WRITE, 1, &p);
+ tryagain = 0;
+ } else if (buffer_locked(p)) {
+ if (gfp_mask & __GFP_WAIT) {
+ wait_on_buffer(p);
+ tryagain = 1;
+ } else
+ tryagain = 0;
+ }
+ } else
+ tryagain = 0;
+ }
+ p = p->b_this_page;
+ } while (p != bh);
+
+ return tryagain;
}
/*
* Can the buffer be thrown out?
*/
-#define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
+#define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock))
#define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
/*
@@ -2448,14 +2605,16 @@
write_unlock(&hash_table_lock);
spin_unlock(&lru_list_lock);
if (gfp_mask & __GFP_IO) {
- sync_page_buffers(bh, gfp_mask);
- /* We waited synchronously, so we can free the buffers. */
- if (gfp_mask & __GFP_WAIT) {
- gfp_mask = 0; /* no IO or waiting this time around */
- goto cleaned_buffers_try_again;
+ if ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)) {
+ if (sync_page_buffers(bh, gfp_mask)) {
+ /* no IO or waiting next time */
+ gfp_mask = 0;
+ goto cleaned_buffers_try_again;
+ }
}
- wakeup_bdflush();
}
+ if (balance_dirty_state() >= 0)
+ wakeup_bdflush();
return 0;
}
@@ -2466,9 +2625,8 @@
#ifdef CONFIG_SMP
struct buffer_head * bh;
int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
- int protected = 0;
int nlist;
- static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", };
+ static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", };
#endif
printk("Buffer memory: %6dkB\n",
@@ -2478,7 +2636,7 @@
if (!spin_trylock(&lru_list_lock))
return;
for(nlist = 0; nlist < NR_LIST; nlist++) {
- found = locked = dirty = used = lastused = protected = 0;
+ found = locked = dirty = used = lastused = 0;
bh = lru_list[nlist];
if(!bh) continue;
@@ -2486,8 +2644,6 @@
found++;
if (buffer_locked(bh))
locked++;
- if (buffer_protected(bh))
- protected++;
if (buffer_dirty(bh))
dirty++;
if (atomic_read(&bh->b_count))
@@ -2501,9 +2657,9 @@
buf_types[nlist], found, tmp);
}
printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
- "%d locked, %d protected, %d dirty\n",
+ "%d locked, %d dirty\n",
buf_types[nlist], found, size_buffers_type[nlist]>>10,
- used, lastused, locked, protected, dirty);
+ used, lastused, locked, dirty);
}
spin_unlock(&lru_list_lock);
#endif
@@ -2706,7 +2862,7 @@
CHECK_EMERGENCY_SYNC
spin_lock(&lru_list_lock);
- if (!write_some_buffers(NODEV) || balance_dirty_state(NODEV) < 0) {
+ if (!write_some_buffers(NODEV) || balance_dirty_state() < 0) {
wait_for_some_buffers(NODEV);
interruptible_sleep_on(&bdflush_wait);
}