From: Daniel McNeil <daniel@osdl.org>

This patch moves the serializing of writebacks up one level to before where
the dirty_pages are moved to the io_pages list.  This prevents writebacks
from missing any pages that are moved back to the dirty list by a
WB_SYNC_NONE writeback.  I have not seen this race in testing -- just by
looking at the code.  It also skips the serialization for blockdevs.

Also this patch changes filemap_fdatawrite() to leave the page on the
locked_pages list until the i/o has finished.  This prevents parallel
filemap_fdatawait()s from missing a page that should be waited on.  I have
not seen this in testing, either.


---

 25-akpm/fs/fs-writeback.c   |   25 ++++++++++++++++++++-
 25-akpm/mm/filemap.c        |   51 ++++++++++++++++++++++++++++++++++++--------
 25-akpm/mm/page-writeback.c |   23 +------------------
 3 files changed, 68 insertions(+), 31 deletions(-)

diff -puN fs/fs-writeback.c~serialise-writeback-fdatawait fs/fs-writeback.c
--- 25/fs/fs-writeback.c~serialise-writeback-fdatawait	Tue Feb 24 17:37:25 2004
+++ 25-akpm/fs/fs-writeback.c	Tue Feb 24 17:38:43 2004
@@ -139,6 +139,7 @@ __sync_single_inode(struct inode *inode,
 	struct address_space *mapping = inode->i_mapping;
 	struct super_block *sb = inode->i_sb;
 	int wait = wbc->sync_mode == WB_SYNC_ALL;
+	int blkdev = inode->i_sb == blockdev_superblock;
 
 	BUG_ON(inode->i_state & I_LOCK);
 
@@ -146,6 +147,22 @@ __sync_single_inode(struct inode *inode,
 	dirty = inode->i_state & I_DIRTY;
 	inode->i_state |= I_LOCK;
 	inode->i_state &= ~I_DIRTY;
+	spin_unlock(&inode_lock);
+
+	/*
+	 * Serialize writebacks except for blockdevs
+	 */
+	if (!blkdev) {
+		/*
+		 * Only allow 1 SYNC writeback at a time, to be able to wait
+		 * for all i/o without worrying about racing WB_SYNC_NONE
+		 * writers.
+		 */
+		if (wait)
+			down_write(&mapping->wb_rwsema);
+		else
+			down_read(&mapping->wb_rwsema);
+	}
 
 	/*
 	 * smp_rmb(); note: if you remove write_lock below, you must add this.
@@ -157,10 +174,16 @@ __sync_single_inode(struct inode *inode,
 	if (wait || !wbc->for_kupdate || list_empty(&mapping->io_pages))
 		list_splice_init(&mapping->dirty_pages, &mapping->io_pages);
 	spin_unlock(&mapping->page_lock);
-	spin_unlock(&inode_lock);
 
 	do_writepages(mapping, wbc);
 
+	if (!blkdev) {
+		if (wait)
+			up_write(&mapping->wb_rwsema);
+		else
+			up_read(&mapping->wb_rwsema);
+	}
+
 	/* Don't write the inode if only I_DIRTY_PAGES was set */
 	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC))
 		write_inode(inode, wait);
diff -puN mm/filemap.c~serialise-writeback-fdatawait mm/filemap.c
--- 25/mm/filemap.c~serialise-writeback-fdatawait	Tue Feb 24 17:37:25 2004
+++ 25-akpm/mm/filemap.c	Tue Feb 24 17:39:18 2004
@@ -128,6 +128,8 @@ static inline int sync_page(struct page 
 	return 0;
 }
 
+extern struct super_block *blockdev_superblock;
+
 /**
  * filemap_fdatawrite - start writeback against all of a mapping's dirty pages
  * @mapping: address space structure to write
@@ -144,14 +146,29 @@ static int __filemap_fdatawrite(struct a
 		.sync_mode = sync_mode,
 		.nr_to_write = mapping->nrpages * 2,
 	};
+	int blkdev = mapping->host->i_sb == blockdev_superblock;
 
 	if (mapping->backing_dev_info->memory_backed)
 		return 0;
 
+	if (!blkdev) {
+		if (sync_mode == WB_SYNC_NONE)
+			down_read(&mapping->wb_rwsema);
+		else
+			down_write(&mapping->wb_rwsema);
+	}
+
 	spin_lock(&mapping->page_lock);
 	list_splice_init(&mapping->dirty_pages, &mapping->io_pages);
 	spin_unlock(&mapping->page_lock);
 	ret = do_writepages(mapping, &wbc);
+
+	if (!blkdev) {
+		if (sync_mode == WB_SYNC_NONE)
+			up_read(&mapping->wb_rwsema);
+		else
+			up_write(&mapping->wb_rwsema);
+	}
 	return ret;
 }
 
@@ -188,13 +205,24 @@ restart:
 		struct page *page;
 
 		page = list_entry(mapping->locked_pages.next,struct page,list);
-		list_del(&page->list);
-		if (PageDirty(page))
-			list_add(&page->list, &mapping->dirty_pages);
-		else
+		/*
+		 * Leave page on locked list until i/o has finished
+		 * so parallel filemap_fdatawait()s will all see the page.
+		 */
+
+		if (!PageDirty(page) && !PageLocked(page) &&
+		    !PageWriteback(page)) {
+
+			/*
+			 * The page is checked if locked because it might
+			 * be in process of being setup for writeback with
+			 * PG_dirty cleared and PG_writeback not set yet.
+			 * The page is not dirty and i/o has finished
+			 * so we can move it to the clean list.
+			 */
+			list_del(&page->list);
 			list_add(&page->list, &mapping->clean_pages);
 
-		if (!PageWriteback(page)) {
 			if (++progress > 32) {
 				if (need_resched()) {
 					spin_unlock(&mapping->page_lock);
@@ -209,10 +237,15 @@ restart:
 		page_cache_get(page);
 		spin_unlock(&mapping->page_lock);
 
-		wait_on_page_writeback(page);
-		if (PageError(page))
-			ret = -EIO;
-
+		lock_page(page);
+		if (PageDirty(page) && mapping->a_ops->writepage) {
+			write_one_page(page, 1);
+		} else {
+			wait_on_page_writeback(page);
+			unlock_page(page);
+		}
+ 		if (PageError(page))
+ 			ret = -EIO;
 		page_cache_release(page);
 		spin_lock(&mapping->page_lock);
 	}
diff -puN mm/page-writeback.c~serialise-writeback-fdatawait mm/page-writeback.c
--- 25/mm/page-writeback.c~serialise-writeback-fdatawait	Tue Feb 24 17:37:25 2004
+++ 25-akpm/mm/page-writeback.c	Tue Feb 24 17:37:25 2004
@@ -482,28 +482,9 @@ void __init page_writeback_init(void)
 
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
-	int ret;
-
-	/*
-	 * Only allow 1 SYNC writeback at a time, to be able to wait for all
-	 * I/O without worrying about racing WB_SYNC_NONE writers.
-	 */
-	if (wbc->sync_mode == WB_SYNC_NONE)
-		down_read(&mapping->wb_rwsema);
-	else
-		down_write(&mapping->wb_rwsema);
-
 	if (mapping->a_ops->writepages)
-		ret = mapping->a_ops->writepages(mapping, wbc);
-	else
-		ret = generic_writepages(mapping, wbc);
-
-	if (wbc->sync_mode == WB_SYNC_NONE)
-		up_read(&mapping->wb_rwsema);
-	else
-		up_write(&mapping->wb_rwsema);
-
-	return ret;
+		return mapping->a_ops->writepages(mapping, wbc);
+	return generic_writepages(mapping, wbc);
 }
 
 /**

_