diff -purN -X /home/mbligh/.diff.exclude 540-O_SYNC-speedup/fs/aio.c 545-aio-O_SYNC/fs/aio.c
--- 540-O_SYNC-speedup/fs/aio.c	2004-02-28 11:21:18.000000000 -0800
+++ 545-aio-O_SYNC/fs/aio.c	2004-02-28 11:21:41.000000000 -0800
@@ -27,6 +27,7 @@
 #include <linux/aio.h>
 #include <linux/highmem.h>
 #include <linux/workqueue.h>
+#include <linux/writeback.h>
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -1308,28 +1309,40 @@ static ssize_t aio_pread(struct kiocb *i
 static ssize_t aio_pwrite(struct kiocb *iocb)
 {
 	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
 	ssize_t ret = 0;
 
 	ret = file->f_op->aio_write(iocb, iocb->ki_buf,
-		iocb->ki_left, iocb->ki_pos);
+				iocb->ki_left, iocb->ki_pos);
 
 	/*
-	 * TBD: Even if iocb->ki_left = 0, could we need to
-	 * wait for data to be sync'd ? Or can we assume
-	 * that aio_fdsync/aio_fsync would be called explicitly
-	 * as required.
+	 * Even if iocb->ki_left = 0, we may need to wait
+	 * for a balance_dirty_pages() call to complete
 	 */
 	if (ret > 0) {
-		iocb->ki_buf += ret;
+		iocb->ki_buf += iocb->ki_buf ? ret : 0; /* don't advance NULL ki_buf */
 		iocb->ki_left -= ret;
 
 		ret = -EIOCBRETRY;
 	}
 
 	/* This means we must have transferred all that we could */
-	/* No need to retry anymore */
-	if (ret == 0)
+	/* No need to retry anymore unless we need to osync data */
+	if (ret == 0) {
 		ret = iocb->ki_nbytes - iocb->ki_left;
+		if (!iocb->ki_buf)
+			return ret;
+
+		/* Set things up for potential O_SYNC */
+		if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+			iocb->ki_buf = NULL;
+			iocb->ki_pos -= ret; /* back up fpos */
+			iocb->ki_left = ret; /* sync what we have written out */
+			iocb->ki_nbytes = ret;
+			ret = -EIOCBRETRY;
+		}
+	}
 
 	return ret;
 }
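
Note on the aio_pwrite() rework above: a single iocb now drives a two-phase
state machine. Phase one retries f_op->aio_write() until all requested bytes
have been copied; phase two, entered only for O_SYNC files or IS_SYNC()
inodes, reuses the same iocb with ki_buf set to NULL as the marker, rewinds
ki_pos, shrinks ki_nbytes/ki_left to the bytes actually written, and returns
-EIOCBRETRY so the AIO core re-drives the iocb to perform the sync. The
user-space sketch below is a model of that control flow only; every name in
it is a stand-in, not kernel code, and the errno value is illustrative.

	#include <stdio.h>

	#define EIOCBRETRY 530		/* illustrative stand-in */

	struct mini_iocb {
		const char *buf;	/* ki_buf: NULL in the sync phase */
		long pos;		/* ki_pos */
		unsigned long left;	/* ki_left */
		unsigned long nbytes;	/* ki_nbytes */
	};

	/* Stand-in for f_op->aio_write(): short writes of at most 4
	 * bytes, or, when buf is NULL, a "sync" of the whole range. */
	static long fake_aio_write(struct mini_iocb *io)
	{
		unsigned long chunk;

		if (!io->buf)
			chunk = io->left;	/* sync pass */
		else
			chunk = io->left < 4 ? io->left : 4;
		io->pos += chunk;
		return chunk;
	}

	static long mini_pwrite(struct mini_iocb *io, int o_sync)
	{
		long ret = fake_aio_write(io);

		if (ret > 0) {			/* made progress: retry */
			if (io->buf)
				io->buf += ret;
			io->left -= ret;
			return -EIOCBRETRY;
		}
		if (ret == 0) {			/* everything transferred */
			ret = io->nbytes - io->left;
			if (!io->buf)		/* sync phase finished */
				return ret;
			if (ret > 0 && o_sync) {
				io->buf = NULL;		/* enter sync phase */
				io->pos -= ret;		/* back up fpos */
				io->left = io->nbytes = ret;
				return -EIOCBRETRY;
			}
		}
		return ret;
	}

	int main(void)
	{
		struct mini_iocb io = { "0123456789", 0, 10, 10 };
		long ret;

		while ((ret = mini_pwrite(&io, 1)) == -EIOCBRETRY)
			printf("retry: pos=%ld left=%lu phase=%s\n", io.pos,
			       io.left, io.buf ? "write" : "sync");
		printf("completed: %ld bytes\n", ret);
		return 0;
	}
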
diff -purN -X /home/mbligh/.diff.exclude 540-O_SYNC-speedup/include/linux/aio.h 545-aio-O_SYNC/include/linux/aio.h
--- 540-O_SYNC-speedup/include/linux/aio.h	2004-02-28 11:21:09.000000000 -0800
+++ 545-aio-O_SYNC/include/linux/aio.h	2004-02-28 11:21:41.000000000 -0800
@@ -29,21 +29,26 @@ struct kioctx;
 #define KIF_LOCKED		0
 #define KIF_KICKED		1
 #define KIF_CANCELLED		2
+#define KIF_SYNCED		3
 
 #define kiocbTryLock(iocb)	test_and_set_bit(KIF_LOCKED, &(iocb)->ki_flags)
 #define kiocbTryKick(iocb)	test_and_set_bit(KIF_KICKED, &(iocb)->ki_flags)
+#define kiocbTrySync(iocb)	test_and_set_bit(KIF_SYNCED, &(iocb)->ki_flags)
 
 #define kiocbSetLocked(iocb)	set_bit(KIF_LOCKED, &(iocb)->ki_flags)
 #define kiocbSetKicked(iocb)	set_bit(KIF_KICKED, &(iocb)->ki_flags)
 #define kiocbSetCancelled(iocb)	set_bit(KIF_CANCELLED, &(iocb)->ki_flags)
+#define kiocbSetSynced(iocb)	set_bit(KIF_SYNCED, &(iocb)->ki_flags)
 
 #define kiocbClearLocked(iocb)	clear_bit(KIF_LOCKED, &(iocb)->ki_flags)
 #define kiocbClearKicked(iocb)	clear_bit(KIF_KICKED, &(iocb)->ki_flags)
 #define kiocbClearCancelled(iocb)	clear_bit(KIF_CANCELLED, &(iocb)->ki_flags)
+#define kiocbClearSynced(iocb)	clear_bit(KIF_SYNCED, &(iocb)->ki_flags)
 
 #define kiocbIsLocked(iocb)	test_bit(KIF_LOCKED, &(iocb)->ki_flags)
 #define kiocbIsKicked(iocb)	test_bit(KIF_KICKED, &(iocb)->ki_flags)
 #define kiocbIsCancelled(iocb)	test_bit(KIF_CANCELLED, &(iocb)->ki_flags)
+#define kiocbIsSynced(iocb)	test_bit(KIF_SYNCED, &(iocb)->ki_flags)
 
 struct kiocb {
 	struct list_head	ki_run_list;
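
The new KIF_SYNCED bit exists so that writeouts are issued only once per
iocb no matter how often the AIO core re-drives it: kiocbTrySync() is built
on test_and_set_bit(), which returns the bit's old value, so exactly one
pass through sync_page_range() starts the writeback and every later retry
falls through to the wait. A minimal user-space rendering of that try-once
idiom, using C11 atomics in place of the kernel's bitops (illustration
only):

	#include <stdatomic.h>
	#include <stdio.h>

	#define KIF_SYNCED 3	/* mirrors the new flag above */

	static atomic_ulong ki_flags;

	/* Like kiocbTrySync(): nonzero means someone already "won". */
	static int kiocb_try_sync(void)
	{
		unsigned long old = atomic_fetch_or(&ki_flags,
						    1UL << KIF_SYNCED);
		return (old >> KIF_SYNCED) & 1;
	}

	int main(void)
	{
		if (!kiocb_try_sync())
			printf("first pass: issue writeouts\n");
		if (kiocb_try_sync())
			printf("retry: already issued, just wait\n");
		return 0;
	}
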
diff -purN -X /home/mbligh/.diff.exclude 540-O_SYNC-speedup/include/linux/writeback.h 545-aio-O_SYNC/include/linux/writeback.h
--- 540-O_SYNC-speedup/include/linux/writeback.h	2004-02-28 11:21:36.000000000 -0800
+++ 545-aio-O_SYNC/include/linux/writeback.h	2004-02-28 11:21:41.000000000 -0800
@@ -87,8 +87,10 @@ void page_writeback_init(void);
 void balance_dirty_pages_ratelimited(struct address_space *mapping);
 int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
-int sync_page_range(struct inode *inode, struct address_space *mapping,
+ssize_t sync_page_range(struct inode *inode, struct address_space *mapping,
 			loff_t pos, size_t count);
+ssize_t sync_page_range_nolock(struct inode *inode,
+		struct address_space *mapping, loff_t pos, size_t count);
 
 /* pdflush.c */
 extern int nr_pdflush_threads;	/* Global so it can be exported to sysctl
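
Widening these prototypes from int to ssize_t changes the return contract:
a caller now gets either byte progress (how much of the range is known to
be written back) or a negative code, where -EIOCBRETRY means "re-drive the
iocb later", not that the sync failed. A hypothetical caller, with an
illustrative errno value, might dispatch like this:

	#include <stdio.h>
	#include <sys/types.h>

	#define EIOCBRETRY 530	/* illustrative stand-in */

	static void handle_sync_ret(ssize_t ret, size_t count)
	{
		if (ret == -EIOCBRETRY)
			printf("partial: re-drive the iocb later\n");
		else if (ret < 0)
			printf("hard error: %zd\n", ret);
		else
			printf("%zd of %zu bytes synced\n", ret, count);
	}

	int main(void)
	{
		handle_sync_ret(-EIOCBRETRY, 8192);
		handle_sync_ret(8192, 8192);
		return 0;
	}
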
diff -purN -X /home/mbligh/.diff.exclude 540-O_SYNC-speedup/mm/filemap.c 545-aio-O_SYNC/mm/filemap.c
--- 540-O_SYNC-speedup/mm/filemap.c	2004-02-28 11:21:36.000000000 -0800
+++ 545-aio-O_SYNC/mm/filemap.c	2004-02-28 11:21:41.000000000 -0800
@@ -944,22 +944,19 @@ __generic_file_aio_read(struct kiocb *io
 out:
 	return retval;
 }
-
 EXPORT_SYMBOL(__generic_file_aio_read);
 
-ssize_t
-generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
+ssize_t generic_file_aio_read(struct kiocb *iocb, char __user *buf,
+				size_t count, loff_t pos)
 {
 	struct iovec local_iov = { .iov_base = buf, .iov_len = count };
 
-	BUG_ON(iocb->ki_pos != pos);
 	return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos);
 }
-
 EXPORT_SYMBOL(generic_file_aio_read);
 
-ssize_t
-generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
+ssize_t generic_file_read(struct file *filp, char __user *buf,
+				size_t count, loff_t *ppos)
 {
 	struct iovec local_iov = { .iov_base = buf, .iov_len = count };
 	struct kiocb kiocb;
@@ -971,10 +968,10 @@ generic_file_read(struct file *filp, cha
 		ret = wait_on_sync_kiocb(&kiocb);
 	return ret;
 }
-
 EXPORT_SYMBOL(generic_file_read);
 
-int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
+int file_send_actor(read_descriptor_t * desc, struct page *page,
+				unsigned long offset, unsigned long size)
 {
 	ssize_t written;
 	unsigned long count = desc->count;
@@ -1821,7 +1818,7 @@ EXPORT_SYMBOL(generic_write_checks);
  *							okir@monad.swb.de
  */
 ssize_t
-generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
+__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
 				unsigned long nr_segs, loff_t *ppos)
 {
 	struct file *file = iocb->ki_filp;
@@ -1990,7 +1987,7 @@ generic_file_aio_write_nolock(struct kio
 	 */
 	if (likely(status >= 0)) {
 		if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-			if (!a_ops->writepage || !is_sync_kiocb(iocb))
+			if (!a_ops->writepage)
 				status = generic_osync_inode(inode, mapping,
 						OSYNC_METADATA|OSYNC_DATA);
 		}
@@ -2007,6 +2004,54 @@ out:
 EXPORT_SYMBOL(generic_file_aio_write_nolock);
 
 ssize_t
+generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long nr_segs, loff_t *ppos)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	ssize_t ret;
+	loff_t pos = *ppos;
+
+	if (!iov->iov_base && !is_sync_kiocb(iocb)) {
+		/* nothing to transfer, may just need to sync data */
+		ret = iov->iov_len; /* vector AIO not supported yet */
+		goto osync;
+	}
+
+	ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, ppos);
+
+	/*
+	 * Avoid doing a sync in parts for aio - it's more efficient to
+	 * call in again after all the data has been copied
+	 */
+	if (!is_sync_kiocb(iocb))
+		return ret;
+
+osync:
+	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+		ret = sync_page_range_nolock(inode, mapping, pos, ret);
+		if (ret >= 0)
+			*ppos = pos + ret;
+	}
+	return ret;
+}
+
+ssize_t
+__generic_file_write_nolock(struct file *file, const struct iovec *iov,
+				unsigned long nr_segs, loff_t *ppos)
+{
+	struct kiocb kiocb;
+	ssize_t ret;
+
+	init_sync_kiocb(&kiocb, file);
+	ret = __generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
+	if (-EIOCBQUEUED == ret)
+		ret = wait_on_sync_kiocb(&kiocb);
+	return ret;
+}
+
+ssize_t
 generic_file_write_nolock(struct file *file, const struct iovec *iov,
 				unsigned long nr_segs, loff_t *ppos)
 {
@@ -2032,19 +2078,29 @@ ssize_t generic_file_aio_write(struct ki
 	struct iovec local_iov = { .iov_base = (void __user *)buf,
 					.iov_len = count };
 
-	BUG_ON(iocb->ki_pos != pos);
+	if (!buf && !is_sync_kiocb(iocb)) {
+		/* nothing to transfer, may just need to sync data */
+		ret = count;
+		goto osync;
+	}
 
 	down(&inode->i_sem);
-	ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
+	ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1,
 						&iocb->ki_pos);
 	up(&inode->i_sem);
 
-	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-		ssize_t err;
+	/*
+	 * Avoid doing a sync in parts for aio - it's more efficient to
+	 * call in again after all the data has been copied
+	 */
+	if (!is_sync_kiocb(iocb))
+		return ret;
 
-		err = sync_page_range(inode, mapping, pos, ret);
-		if (err < 0)
-			ret = err;
+osync:
+	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+		ret = sync_page_range(inode, mapping, pos, ret);
+		if (ret >= 0)
+			iocb->ki_pos = pos + ret;
 	}
 	return ret;
 }
@@ -2060,7 +2116,7 @@ ssize_t generic_file_write(struct file *
 					.iov_len = count };
 
 	down(&inode->i_sem);
-	ret = generic_file_write_nolock(file, &local_iov, 1, ppos);
+	ret = __generic_file_write_nolock(file, &local_iov, 1, ppos);
 	up(&inode->i_sem);
 
 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
@@ -2097,11 +2153,11 @@ ssize_t generic_file_writev(struct file 
 	ssize_t ret;
 
 	down(&inode->i_sem);
-	ret = generic_file_write_nolock(file, iov, nr_segs, ppos);
+	ret = __generic_file_write_nolock(file, iov, nr_segs, ppos);
 	up(&inode->i_sem);
 
 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-		int err;
+		ssize_t err;
 
 		err = sync_page_range(inode, mapping, *ppos - ret, ret);
 		if (err < 0)
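
To summarize the filemap.c restructuring: the old
generic_file_aio_write_nolock() becomes __generic_file_aio_write_nolock()
and only copies data; the osync work moves into thin wrappers that run it
once, after all retries have transferred everything, instead of once per
retry. A NULL buffer on an AIO kiocb is the handshake from aio_pwrite()
above meaning "nothing left to copy, only sync". Condensed into a
compilable user-space model (all helpers below are stand-ins, not the
kernel functions):

	#include <stdio.h>

	/* Stand-ins for __generic_file_aio_write_nolock() and
	 * sync_page_range(); both just report full success here. */
	static long copy_data(long count)  { return count; }
	static long sync_range(long count) { return count; }

	static long model_aio_write(const char *buf, long count,
				    int is_sync_kiocb, int o_sync)
	{
		long ret;

		if (!buf && !is_sync_kiocb) {
			ret = count;	/* phase 2: nothing to copy */
			goto osync;
		}
		ret = copy_data(count);
		if (!is_sync_kiocb)
			return ret;	/* aio: sync on a later pass */
	osync:
		if (ret > 0 && o_sync)
			ret = sync_range(ret);
		return ret;
	}

	int main(void)
	{
		/* synchronous O_SYNC write: copy, then sync inline */
		printf("sync:  %ld\n", model_aio_write("x", 10, 1, 1));
		/* aio pass 1: copy only, no partial syncs */
		printf("aio-1: %ld\n", model_aio_write("x", 10, 0, 1));
		/* aio pass 2 (buf == NULL): sync only */
		printf("aio-2: %ld\n", model_aio_write(NULL, 10, 0, 1));
		return 0;
	}
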
diff -purN -X /home/mbligh/.diff.exclude 540-O_SYNC-speedup/mm/page-writeback.c 545-aio-O_SYNC/mm/page-writeback.c
--- 540-O_SYNC-speedup/mm/page-writeback.c	2004-02-28 11:21:36.000000000 -0800
+++ 545-aio-O_SYNC/mm/page-writeback.c	2004-02-28 11:21:41.000000000 -0800
@@ -571,16 +571,19 @@ int test_clear_page_dirty(struct page *p
 EXPORT_SYMBOL(test_clear_page_dirty);
 
 
-static int operate_on_page_range(struct address_space *mapping,
+static ssize_t operate_on_page_range(struct address_space *mapping,
 		loff_t pos, size_t count, int (*operator)(struct page *))
 {
 	pgoff_t first = pos >> PAGE_CACHE_SHIFT;
 	pgoff_t last = (pos + count - 1) >> PAGE_CACHE_SHIFT;	/* inclusive */
 	pgoff_t next = first;
 	struct pagevec pvec;
-	ssize_t ret = 0;
+	ssize_t ret = 0, bytes = 0;
 	int i;
 
+	if (count == 0)
+		return 0;
+
 	pagevec_init(&pvec, 0);
 	while (pagevec_lookup(&pvec, mapping, next,
 				min((pgoff_t)PAGEVEC_SIZE, last - next + 1))) {
@@ -595,6 +598,8 @@ static int operate_on_page_range(struct 
 			}
 			next = page->index + 1;
 			ret = (*operator)(page);
+			if (ret == -EIOCBRETRY)
+				break;
 			if (PageError(page)) {
 				if (!ret)
 					ret = -EIO;
@@ -603,20 +608,22 @@ static int operate_on_page_range(struct 
 				break;
 		}
 		pagevec_release(&pvec);
-		if (next > last)
+		if ((next > last) || (ret == -EIOCBRETRY))
 			break;
 	}
-	return ret;
+	bytes = ((loff_t)next << PAGE_CACHE_SHIFT) - pos;
+	if (bytes > count)
+		bytes = count;
+	return (bytes && (!ret || (ret == -EIOCBRETRY))) ? bytes : ret;
 }
 
 static int page_waiter(struct page *page)
 {
 	unlock_page(page);
-	wait_on_page_writeback(page);
-	return 0;
+	return wait_on_page_writeback_wq(page, current->io_wait);
 }
 
-static int
+static ssize_t
 wait_on_page_range(struct address_space *mapping, loff_t pos, size_t count)
 {
 	return operate_on_page_range(mapping, pos, count, page_waiter);
@@ -629,11 +636,15 @@ static int page_writer(struct page *page
 		.nr_to_write	= 1,
 	};
 
+	if (!test_clear_page_dirty(page)) {
+		unlock_page(page);
+		return 0;
+	}
 	wait_on_page_writeback(page);
 	return page->mapping->a_ops->writepage(page, &wbc);
 }
 
-static int
+static ssize_t
 write_out_page_range(struct address_space *mapping, loff_t pos, size_t count)
 {
 	return operate_on_page_range(mapping, pos, count, page_writer);
@@ -647,22 +658,58 @@ write_out_page_range(struct address_spac
  * We need to re-take i_sem during the generic_osync_inode list walk because
  * it is otherwise livelockable.
  */
-int sync_page_range(struct inode *inode, struct address_space *mapping,
+ssize_t sync_page_range(struct inode *inode, struct address_space *mapping,
 			loff_t pos, size_t count)
 {
-	int ret;
+	ssize_t ret = 0;
 
+	if (in_aio()) {
+		/* Already issued writeouts for this iocb? */
+		if (kiocbTrySync(io_wait_to_kiocb(current->io_wait)))
+			goto do_wait; /* just need to check if done */
+	}
 	if (!mapping->a_ops->writepage)
 		return 0;
 	if (mapping->backing_dev_info->memory_backed)
 		return 0;
 	ret = write_out_page_range(mapping, pos, count);
-	if (ret == 0) {
+	if (ret >= 0) {
 		down(&inode->i_sem);
 		ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
 		up(&inode->i_sem);
 	}
-	if (ret == 0)
+do_wait:
+	if (ret >= 0)
+		ret = wait_on_page_range(mapping, pos, count);
+	return ret;
+}
+
+/*
+ * Prefer sync_page_range() over calling sync_page_range_nolock()
+ * with i_sem held: holding i_sem across the whole sync blocks
+ * parallel O_SYNC writes until every page in this range has
+ * been written out.
+ */
+ssize_t sync_page_range_nolock(struct inode *inode,
+	struct address_space *mapping, loff_t pos, size_t count)
+{
+	ssize_t ret = 0;
+
+	if (in_aio()) {
+		/* Already issued writeouts for this iocb? */
+		if (kiocbTrySync(io_wait_to_kiocb(current->io_wait)))
+			goto do_wait; /* just need to check if done */
+	}
+	if (!mapping->a_ops->writepage)
+		return 0;
+	if (mapping->backing_dev_info->memory_backed)
+		return 0;
+	ret = write_out_page_range(mapping, pos, count);
+	if (ret >= 0) {
+		ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
+	}
+do_wait:
+	if (ret >= 0)
 		ret = wait_on_page_range(mapping, pos, count);
 	return ret;
 }
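
End-to-end, the page-writeback.c changes make a sync restartable:
operate_on_page_range() now reports byte progress (how far the page walk
got, clamped to the request) instead of a bare status, page_waiter() waits
on current->io_wait so an AIO retry returns -EIOCBRETRY instead of
blocking, and the KIF_SYNCED test in sync_page_range() keeps the writeout
from being reissued on every retry. The progress arithmetic is easy to
sanity-check in user space (a 4K page size is assumed here):

	#include <stdio.h>

	#define PAGE_CACHE_SHIFT 12	/* assume 4K pages */

	/* bytes covered when the walk stopped before page 'next' */
	static long progress(unsigned long next, long long pos,
			     unsigned long count)
	{
		long long bytes = ((long long)next << PAGE_CACHE_SHIFT) - pos;

		if (bytes > (long long)count)
			bytes = count;
		return (long)bytes;
	}

	int main(void)
	{
		/* request spans pages 1..3 (pos 5000, count 10000) and
		 * the walk stopped after page 2, so next == 3 */
		printf("%ld of 10000 bytes done\n", progress(3, 5000, 10000));
		/* prints 7288: (3 << 12) - 5000 */
		return 0;
	}
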