From: Suparna Bhattacharya <suparna@in.ibm.com>

The attached patch plugs some inaccuracies and associated
inefficiencies in sync_page_range handling for AIO 
O_SYNC writes.

- Correctly propagate the return value from sync_page_range
  when some but not all pages in the range have completed
  writeback (Otherwise, O_SYNC AIO writes could report
  completion before all the relevant writeouts actually 
  completed)
- Reset and check page dirty settings prior to attempting
  writeouts to avoid blocking synchronously for a writeback
  initiated otherwise.
- Disable repeated calls to writeout or generic_osync_inode
  during AIO retries for the same iocb.
- Don't block synchronously for data writeouts to complete
  when writing out inode state for generic_osync_inode in
  AIO context. The wait_on_page_range call after this will
  take care of completing the iocb only after the data has
  been written out. [TBD: Is this particular change OK for 
  data=ordered, or does it need rethinking ?]

I have run aio-stress with modifications for AIO O_SYNC 
writes on ext2, ext3, jfs, xfs and blockdev.



 include/linux/aio.h |    5 +++++
 mm/filemap.c        |   20 ++++++--------------
 mm/page-writeback.c |   20 ++++++++++++++++++--
 3 files changed, 29 insertions(+), 16 deletions(-)

diff -puN include/linux/aio.h~aio-osync-fix-2 include/linux/aio.h
--- 25/include/linux/aio.h~aio-osync-fix-2	2003-08-30 15:42:35.000000000 -0700
+++ 25-akpm/include/linux/aio.h	2003-08-30 15:42:35.000000000 -0700
@@ -29,21 +29,26 @@ struct kioctx;
 #define KIF_LOCKED		0
 #define KIF_KICKED		1
 #define KIF_CANCELLED		2
+#define KIF_SYNCED		3
 
 #define kiocbTryLock(iocb)	test_and_set_bit(KIF_LOCKED, &(iocb)->ki_flags)
 #define kiocbTryKick(iocb)	test_and_set_bit(KIF_KICKED, &(iocb)->ki_flags)
+#define kiocbTrySync(iocb)	test_and_set_bit(KIF_SYNCED, &(iocb)->ki_flags)
 
 #define kiocbSetLocked(iocb)	set_bit(KIF_LOCKED, &(iocb)->ki_flags)
 #define kiocbSetKicked(iocb)	set_bit(KIF_KICKED, &(iocb)->ki_flags)
 #define kiocbSetCancelled(iocb)	set_bit(KIF_CANCELLED, &(iocb)->ki_flags)
+#define kiocbSetSynced(iocb)	set_bit(KIF_SYNCED, &(iocb)->ki_flags)
 
 #define kiocbClearLocked(iocb)	clear_bit(KIF_LOCKED, &(iocb)->ki_flags)
 #define kiocbClearKicked(iocb)	clear_bit(KIF_KICKED, &(iocb)->ki_flags)
 #define kiocbClearCancelled(iocb)	clear_bit(KIF_CANCELLED, &(iocb)->ki_flags)
+#define kiocbClearSynced(iocb)	clear_bit(KIF_SYNCED, &(iocb)->ki_flags)
 
 #define kiocbIsLocked(iocb)	test_bit(KIF_LOCKED, &(iocb)->ki_flags)
 #define kiocbIsKicked(iocb)	test_bit(KIF_KICKED, &(iocb)->ki_flags)
 #define kiocbIsCancelled(iocb)	test_bit(KIF_CANCELLED, &(iocb)->ki_flags)
+#define kiocbIsSynced(iocb)	test_bit(KIF_SYNCED, &(iocb)->ki_flags)
 
 struct kiocb {
 	struct list_head	ki_run_list;
diff -puN mm/filemap.c~aio-osync-fix-2 mm/filemap.c
--- 25/mm/filemap.c~aio-osync-fix-2	2003-08-30 15:42:35.000000000 -0700
+++ 25-akpm/mm/filemap.c	2003-08-30 15:42:35.000000000 -0700
@@ -1953,13 +1953,9 @@ generic_file_aio_write_nolock(struct kio
 
 osync:
 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-		ssize_t err;
-
-		err = sync_page_range_nolock(inode, mapping, pos, ret);
-		if (err < 0)
-			ret = err;
-		else
-			*ppos = pos + err;
+		ret = sync_page_range_nolock(inode, mapping, pos, ret);
+		if (ret >= 0)
+			*ppos = pos + ret;
 	}
 	return ret;
 }
@@ -2023,13 +2019,9 @@ ssize_t generic_file_aio_write(struct ki
 
 osync:
 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-		ssize_t err;
-
-		err = sync_page_range(inode, mapping, pos, ret);
-		if (err < 0)
-			ret = err;
-		else
-			iocb->ki_pos = pos + err;
+		ret = sync_page_range(inode, mapping, pos, ret);
+		if (ret >= 0)
+			iocb->ki_pos = pos + ret;
 	}
 	return ret;
 }
diff -puN mm/page-writeback.c~aio-osync-fix-2 mm/page-writeback.c
--- 25/mm/page-writeback.c~aio-osync-fix-2	2003-08-30 15:42:35.000000000 -0700
+++ 25-akpm/mm/page-writeback.c	2003-08-30 15:42:35.000000000 -0700
@@ -641,6 +641,10 @@ static int page_writer(struct page *page
 		.nr_to_write	= 1,
 	};
 
+	if (!test_clear_page_dirty(page)) {
+		unlock_page(page);
+		return 0;
+	}
 	wait_on_page_writeback(page);
 	return page->mapping->a_ops->writepage(page, &wbc);
 }
@@ -662,8 +666,13 @@ write_out_page_range(struct address_spac
 ssize_t sync_page_range(struct inode *inode, struct address_space *mapping,
 			loff_t pos, size_t count)
 {
-	int ret;
+	int ret = 0;
 
+	if (in_aio()) {
+		/* Already issued writeouts for this iocb ? */
+		if (kiocbTrySync(io_wait_to_kiocb(current->io_wait)))
+			goto do_wait; /* just need to check if done */
+	}
 	if (!mapping->a_ops->writepage)
 		return 0;
 	if (mapping->backing_dev_info->memory_backed)
@@ -674,6 +683,7 @@ ssize_t sync_page_range(struct inode *in
 		ret = generic_osync_inode(inode, OSYNC_METADATA);
 		up(&inode->i_sem);
 	}
+do_wait:
 	if (ret >= 0)
 		ret = wait_on_page_range(mapping, pos, count);
 	return ret;
@@ -688,8 +698,13 @@ ssize_t sync_page_range(struct inode *in
 ssize_t sync_page_range_nolock(struct inode *inode, struct address_space
 	*mapping, loff_t pos, size_t count)
 {
-	int ret;
+	ssize_t ret = 0;
 
+	if (in_aio()) {
+		/* Already issued writeouts for this iocb ? */
+		if (kiocbTrySync(io_wait_to_kiocb(current->io_wait)))
+			goto do_wait; /* just need to check if done */
+	}
 	if (!mapping->a_ops->writepage)
 		return 0;
 	if (mapping->backing_dev_info->memory_backed)
@@ -698,6 +713,7 @@ ssize_t sync_page_range_nolock(struct in
 	if (ret >= 0) {
 		ret = generic_osync_inode(inode, OSYNC_METADATA);
 	}
+do_wait:
 	if (ret >= 0)
 		ret = wait_on_page_range(mapping, pos, count);
 	return ret;

_