From: Daniel McNeil <daniel@osdl.org>

Should fix the final data-exposure problems with combined O_DIRECT and buffered I/O against local filesystems.

The problem is that there are windows in which the normal kupdate writeback
function has moved pages onto a different page list (locked_pages or
io_pages).  This writeback will discover that the page already has buffers
which are under I/O (due to JBD commit activity) and
__block_write_full_page() will then put the page back onto the dirty_pages
list.

But this leaves a window in which the page is not visible to
filemap_datawrite()/filemap_fdatawait().  Consequently the sync which
O_DIRECT performs to flush out any pending buffered writes will fail to
wait on some blocks which already have I/O in flight.  With enough luck, an
O_DIRECT read can get in there and peek at unwritten data.

We fix this by introducing serialisation between regular memory-cleansing
writeback (WB_SYNC_NONE) and data-integrity writeback (WB_SYNC_ALL).  A
per-address_space rwsem is added.

memory-cleansing writeback uses a read_trylock() so it doesn't get blocked
by sync() operations.  And the sync() operations use down_write() so they
have exclusive access to the address_space's writeback, thus eliminating
all these races.



---

 fs/buffer.c         |    3 +--
 fs/inode.c          |    1 +
 include/linux/fs.h  |    1 +
 mm/page-writeback.c |   27 +++++++++++++++++++++++++--
 mm/swap_state.c     |    1 +
 5 files changed, 29 insertions(+), 4 deletions(-)

diff -puN fs/buffer.c~O_DIRECT-vs-buffered-fix fs/buffer.c
--- 25/fs/buffer.c~O_DIRECT-vs-buffered-fix	2004-02-19 23:27:36.000000000 -0800
+++ 25-akpm/fs/buffer.c	2004-02-19 23:27:36.000000000 -0800
@@ -1816,8 +1816,7 @@ static int __block_write_full_page(struc
 			lock_buffer(bh);
 		} else {
 			if (test_set_buffer_locked(bh)) {
-				if (buffer_dirty(bh))
-					__set_page_dirty_nobuffers(page);
+				__set_page_dirty_nobuffers(page);
 				continue;
 			}
 		}
diff -puN fs/inode.c~O_DIRECT-vs-buffered-fix fs/inode.c
--- 25/fs/inode.c~O_DIRECT-vs-buffered-fix	2004-02-19 23:27:36.000000000 -0800
+++ 25-akpm/fs/inode.c	2004-02-19 23:27:36.000000000 -0800
@@ -195,6 +195,7 @@ void inode_init_once(struct inode *inode
 	INIT_LIST_HEAD(&inode->i_data.i_mmap_shared);
 	spin_lock_init(&inode->i_lock);
 	i_size_ordered_init(inode);
+	init_rwsem(&inode->i_data.wb_rwsema);
 }
 
 EXPORT_SYMBOL(inode_init_once);
diff -puN include/linux/fs.h~O_DIRECT-vs-buffered-fix include/linux/fs.h
--- 25/include/linux/fs.h~O_DIRECT-vs-buffered-fix	2004-02-19 23:27:36.000000000 -0800
+++ 25-akpm/include/linux/fs.h	2004-02-19 23:27:36.000000000 -0800
@@ -338,6 +338,7 @@ struct address_space {
 	spinlock_t		private_lock;	/* for use by the address_space */
 	struct list_head	private_list;	/* ditto */
 	struct address_space	*assoc_mapping;	/* ditto */
+	struct rw_semaphore	wb_rwsema;	/* serialize SYNC writebacks */
 };
 
 struct block_device {
diff -puN mm/page-writeback.c~O_DIRECT-vs-buffered-fix mm/page-writeback.c
--- 25/mm/page-writeback.c~O_DIRECT-vs-buffered-fix	2004-02-19 23:27:36.000000000 -0800
+++ 25-akpm/mm/page-writeback.c	2004-02-19 23:27:36.000000000 -0800
@@ -482,9 +482,32 @@ void __init page_writeback_init(void)
 
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
+	int ret;
+	if (wbc->sync_mode == WB_SYNC_NONE) {
+		if (!down_read_trylock(&mapping->wb_rwsema))
+			/*
+			 * SYNC writeback in progress
+			 */
+			return 0;
+	} else {
+		/*
+		 * Only allow 1 SYNC writeback at a time, to be able
+		 * to wait for all i/o without worrying about racing
+		 * WB_SYNC_NONE writers.
+		 */
+		down_write(&mapping->wb_rwsema);
+	}
+
 	if (mapping->a_ops->writepages)
-		return mapping->a_ops->writepages(mapping, wbc);
-	return generic_writepages(mapping, wbc);
+		ret = mapping->a_ops->writepages(mapping, wbc);
+	else
+		ret = generic_writepages(mapping, wbc);
+	if (wbc->sync_mode == WB_SYNC_NONE) {
+		up_read(&mapping->wb_rwsema);
+	} else {
+		up_write(&mapping->wb_rwsema);
+	}
+	return ret;
 }
 
 /**
diff -puN mm/swap_state.c~O_DIRECT-vs-buffered-fix mm/swap_state.c
--- 25/mm/swap_state.c~O_DIRECT-vs-buffered-fix	2004-02-19 23:27:36.000000000 -0800
+++ 25-akpm/mm/swap_state.c	2004-02-19 23:27:36.000000000 -0800
@@ -38,6 +38,7 @@ struct address_space swapper_space = {
 	.truncate_count  = ATOMIC_INIT(0),
 	.private_lock	= SPIN_LOCK_UNLOCKED,
 	.private_list	= LIST_HEAD_INIT(swapper_space.private_list),
+	.wb_rwsema	= __RWSEM_INITIALIZER(swapper_space.wb_rwsema)
 };
 
 #define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)

_