From: Suparna Bhattacharya <suparna@in.ibm.com>

I picked up Badari's forward port of Stephen Tweedie's
DIO fixes from 2.4, and have updated it with the following:

- Drop i_sem after i/o submission for DIO reads/writes.
- Release i_alloc_sem on DIO completion.
- Modify other write paths i.e. generic_file_aio_write, 
  generic_file_writev and xfs write to handle acquistion of 
  i_alloc_sem and align with i_sem and i_alloc_sem being released 
  directly by the DIO code, rather than by the caller.

I had to debate a bit about how the _write_nolock() cases
should be handled. The assumption I made is that for a regular
file, the caller would have already acquired i_alloc_sem and
i_sem. When its not a regular file the DIO code doesn't touch
i_sem/i_alloc_sem anyway.

Let me know how this looks to you.

I'm not quite sure how to test these changes to be sure
it all works. I have run aio-stress for O_DIRECT reads
and writes. I still need to play with running fsx (there
seem to be some problems there as I'm seeing short writes
- need to investigate whether the problem exists with mm5
as such or if its due to this patch).



 fs/buffer.c             |   32 ++++++++++++++++----------
 fs/direct-io.c          |   21 ++++++++++++++++-
 fs/inode.c              |    1 
 fs/open.c               |    2 +
 fs/xfs/linux/xfs_file.c |    2 +
 include/linux/fs.h      |    2 +
 mm/filemap.c            |   58 +++++++++++++++++++++++++++++++++++++++++++++---
 7 files changed, 102 insertions(+), 16 deletions(-)

diff -puN fs/buffer.c~O_DIRECT-race-fixes fs/buffer.c
--- 25/fs/buffer.c~O_DIRECT-race-fixes	2003-09-16 18:39:34.000000000 -0700
+++ 25-akpm/fs/buffer.c	2003-09-16 18:39:34.000000000 -0700
@@ -374,26 +374,18 @@ out:
 	return ret;
 }
 
-asmlinkage long sys_fdatasync(unsigned int fd)
+int do_fdatasync(struct file * file)
 {
-	struct file * file;
 	struct dentry * dentry;
 	struct inode * inode;
 	int ret, err;
 
-	ret = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
+	if (unlikely(!file->f_op || !file->f_op->fsync))
+		return -EINVAL;
 
 	dentry = file->f_dentry;
 	inode = dentry->d_inode;
 
-	ret = -EINVAL;
-	if (!file->f_op || !file->f_op->fsync)
-		goto out_putf;
-
-	down(&inode->i_sem);
 	current->flags |= PF_SYNCWRITE;
 	ret = filemap_fdatawrite(inode->i_mapping);
 	err = file->f_op->fsync(file, dentry, 1);
@@ -403,8 +395,24 @@ asmlinkage long sys_fdatasync(unsigned i
 	if (!ret)
 		ret = err;
 	current->flags &= ~PF_SYNCWRITE;
-	up(&inode->i_sem);
+	return ret;
+}
+
+asmlinkage long sys_fdatasync(unsigned int fd)
+{
+	struct file * file;
+	struct inode * inode;
+	int ret, err;
+
+	ret = -EBADF;
+	file = fget(fd);
+	if (!file)
+		goto out;
 
+	inode = file->f_dentry->d_inode;
+	down(&inode->i_sem);
+	ret = do_fdatasync(file);
+	up(&inode->i_sem);
 out_putf:
 	fput(file);
 out:
diff -puN fs/direct-io.c~O_DIRECT-race-fixes fs/direct-io.c
--- 25/fs/direct-io.c~O_DIRECT-race-fixes	2003-09-16 18:39:34.000000000 -0700
+++ 25-akpm/fs/direct-io.c	2003-09-16 18:39:34.000000000 -0700
@@ -58,6 +58,7 @@ struct dio {
 	struct bio *bio;		/* bio under assembly */
 	struct inode *inode;
 	int rw;
+	int is_reg_file;
 	unsigned blkbits;		/* doesn't change */
 	unsigned blkfactor;		/* When we're using an alignment which
 					   is finer than the filesystem's soft
@@ -205,6 +206,8 @@ static void dio_complete(struct dio *dio
 {
 	if (dio->end_io)
 		dio->end_io(dio->inode, offset, bytes, dio->map_bh.b_private);
+	if (dio->is_reg_file)
+		up_read(&dio->inode->i_alloc_sem);
 }
 
 /*
@@ -448,6 +451,7 @@ static int get_more_blocks(struct dio *d
 	unsigned long fs_count;	/* Number of filesystem-sized blocks */
 	unsigned long dio_count;/* Number of dio_block-sized blocks */
 	unsigned long blkmask;
+	int beyond_eof = 0;
 
 	/*
 	 * If there was a memory error and we've overwritten all the
@@ -465,8 +469,13 @@ static int get_more_blocks(struct dio *d
 		if (dio_count & blkmask)	
 			fs_count++;
 
+		if (dio->is_reg_file) {
+			if (dio->block_in_file > dio->inode->i_size) {
+				beyond_eof = 1;
+			}
+		}
 		ret = (*dio->get_blocks)(dio->inode, fs_startblk, fs_count,
-				map_bh, dio->rw == WRITE);
+				map_bh, (dio->rw == WRITE) && beyond_eof);
 	}
 	return ret;
 }
@@ -773,6 +782,10 @@ do_holes:
 			if (!buffer_mapped(map_bh)) {
 				char *kaddr;
 
+				if (dio->rw == WRITE) {
+					return -ENOTBLK;
+				}
+
 				if (dio->block_in_file >=
 					i_size_read(dio->inode)>>blkbits) {
 					/* We hit eof */
@@ -863,6 +876,10 @@ direct_io_worker(int rw, struct kiocb *i
 	dio->start_zero_done = 0;
 	dio->block_in_file = offset >> blkbits;
 	dio->blocks_available = 0;
+	if (S_ISREG(inode->i_mode))
+		dio->is_reg_file = 1;
+	else
+		dio->is_reg_file = 0;
 
 	dio->cur_page = NULL;
 
@@ -949,6 +966,8 @@ direct_io_worker(int rw, struct kiocb *i
 	 * OK, all BIOs are submitted, so we can decrement bio_count to truly
 	 * reflect the number of to-be-processed BIOs.
 	 */
+	if (dio->is_reg_file)
+		up(&dio->inode->i_sem); 	/* Don't need i_sem any more */
 	if (dio->is_async) {
 		if (ret == 0)
 			ret = dio->result;	/* Bytes written */
diff -puN fs/inode.c~O_DIRECT-race-fixes fs/inode.c
--- 25/fs/inode.c~O_DIRECT-race-fixes	2003-09-16 18:39:34.000000000 -0700
+++ 25-akpm/fs/inode.c	2003-09-16 18:39:34.000000000 -0700
@@ -183,6 +183,7 @@ void inode_init_once(struct inode *inode
 	INIT_LIST_HEAD(&inode->i_dentry);
 	INIT_LIST_HEAD(&inode->i_devices);
 	sema_init(&inode->i_sem, 1);
+	init_rwsem(&inode->i_alloc_sem);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
 	spin_lock_init(&inode->i_data.page_lock);
 	init_MUTEX(&inode->i_data.i_shared_sem);
diff -puN fs/open.c~O_DIRECT-race-fixes fs/open.c
--- 25/fs/open.c~O_DIRECT-race-fixes	2003-09-16 18:39:34.000000000 -0700
+++ 25-akpm/fs/open.c	2003-09-16 18:39:34.000000000 -0700
@@ -187,11 +187,13 @@ int do_truncate(struct dentry *dentry, l
 	if (length < 0)
 		return -EINVAL;
 
+	down_write(&dentry->d_inode->i_alloc_sem);
 	newattrs.ia_size = length;
 	newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
 	down(&dentry->d_inode->i_sem);
 	err = notify_change(dentry, &newattrs);
 	up(&dentry->d_inode->i_sem);
+	up_write(&dentry->d_inode->i_alloc_sem);
 	return err;
 }
 
diff -puN fs/xfs/linux/xfs_file.c~O_DIRECT-race-fixes fs/xfs/linux/xfs_file.c
--- 25/fs/xfs/linux/xfs_file.c~O_DIRECT-race-fixes	2003-09-16 18:39:34.000000000 -0700
+++ 25-akpm/fs/xfs/linux/xfs_file.c	2003-09-16 18:39:34.000000000 -0700
@@ -94,6 +94,8 @@ linvfs_write(
 	BUG_ON(iocb->ki_pos != pos);
 
 	if (direct) {
+		down_read(&inode->i_alloc_sem);
+		down(&inode->i_sem);
 		VOP_WRITE(vp, iocb, &iov, 1, &iocb->ki_pos, NULL, error);
 	} else {
 		down(&inode->i_sem);
diff -puN include/linux/fs.h~O_DIRECT-race-fixes include/linux/fs.h
--- 25/include/linux/fs.h~O_DIRECT-race-fixes	2003-09-16 18:39:34.000000000 -0700
+++ 25-akpm/include/linux/fs.h	2003-09-16 18:39:34.000000000 -0700
@@ -388,6 +388,7 @@ struct inode {
 	unsigned short          i_bytes;
 	spinlock_t		i_lock;	/* i_blocks, i_bytes, maybe i_size */
 	struct semaphore	i_sem;
+	struct rw_semaphore	i_alloc_sem;
 	struct inode_operations	*i_op;
 	struct file_operations	*i_fop;	/* former ->i_op->default_file_ops */
 	struct super_block	*i_sb;
@@ -1204,6 +1205,7 @@ static inline void invalidate_remote_ino
 }
 extern void invalidate_inode_pages2(struct address_space *mapping);
 extern void write_inode_now(struct inode *, int);
+extern int do_fdatasync(struct file *);
 extern int filemap_fdatawrite(struct address_space *);
 extern int filemap_flush(struct address_space *);
 extern int filemap_fdatawait(struct address_space *);
diff -puN mm/filemap.c~O_DIRECT-race-fixes mm/filemap.c
--- 25/mm/filemap.c~O_DIRECT-race-fixes	2003-09-16 18:39:34.000000000 -0700
+++ 25-akpm/mm/filemap.c	2003-09-16 18:39:34.000000000 -0700
@@ -883,6 +883,10 @@ __generic_file_aio_read(struct kiocb *io
 		retval = 0;
 		if (!count)
 			goto out; /* skip atime */
+		if (S_ISREG(inode->i_mode)) {
+			down_read(&inode->i_alloc_sem);
+			down(&inode->i_sem);
+		}
 		size = i_size_read(inode);
 		if (pos < size) {
 			retval = generic_file_direct_IO(READ, iocb,
@@ -893,6 +897,12 @@ __generic_file_aio_read(struct kiocb *io
 				*ppos = pos + retval;
 		}
 		update_atime(filp->f_dentry->d_inode);
+		/*
+		 * i_sem and i_alloc_sem release are handled by
+		 * the direct i/o code, the former is released after
+		 * i/o submission, the latter set up to be released
+		 * on actual i/o completion.
+		 */
 		goto out;
 	}
 
@@ -1839,10 +1849,18 @@ __generic_file_aio_write_nolock(struct k
 			status = generic_osync_inode(inode, OSYNC_METADATA);
 		if (written >= 0 && !is_sync_kiocb(iocb))
 			written = -EIOCBQUEUED;
+		if (written == -ENOTBLK) {
+			if (S_ISREG(inode->i_mode)) {
+				down(&inode->i_sem);
+			}
+			goto do_buffer_io;
+		}
 		goto out_status;
 	}
 
+do_buffer_io:
 	buf = iov->iov_base;
+	written = 0; /* Reset in case this is a fallback from O_DIRECT */
 	do {
 		unsigned long index;
 		unsigned long offset;
@@ -1938,6 +1956,14 @@ __generic_file_aio_write_nolock(struct k
 		}
 	}
 
+	if (unlikely(file->f_flags & O_DIRECT)) {
+		if (written)
+			do_fdatasync(file);
+		if (S_ISREG(inode->i_mode)) {
+			up(&inode->i_sem);
+		}
+	}
+
 out_status:	
 	err = written ? written : status;
 out:
@@ -2026,9 +2052,16 @@ ssize_t generic_file_aio_write(struct ki
 	}
 
 	down(&inode->i_sem);
+	if (file->f_flags & O_DIRECT)
+		down_read(&inode->i_alloc_sem);
 	ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1,
 						&iocb->ki_pos);
-	up(&inode->i_sem);
+	/*
+	 * For O_DIRECT, i_sem would have already been released and
+	 * i_alloc_sem is released on i/o completion.
+	 */
+	if (!(file->f_flags & O_DIRECT))
+		up(&inode->i_sem);
 
 	/*
 	 * Avoid doing a sync in parts for aio - its more efficient to
@@ -2057,9 +2090,16 @@ ssize_t generic_file_write(struct file *
 	struct iovec local_iov = { .iov_base = (void __user *)buf,
 					.iov_len = count };
 
+	if (file->f_flags & O_DIRECT)
+		down_read(&inode->i_alloc_sem);
 	down(&inode->i_sem);
 	ret = __generic_file_write_nolock(file, &local_iov, 1, ppos);
-	up(&inode->i_sem);
+	/*
+	 * For O_DIRECT, i_sem would have already been released and
+	 * i_alloc_sem is released on i/o completion
+	 */
+	if (!(file->f_flags & O_DIRECT))
+		up(&inode->i_sem);
 
 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 		ssize_t err;
@@ -2090,9 +2130,16 @@ ssize_t generic_file_writev(struct file 
 	struct inode *inode = file->f_dentry->d_inode;
 	ssize_t ret;
 
+	if (file->f_flags & O_DIRECT)
+		down_read(&inode->i_alloc_sem);
 	down(&inode->i_sem);
 	ret = __generic_file_write_nolock(file, iov, nr_segs, ppos);
-	up(&inode->i_sem);
+	/*
+	 * For O_DIRECT, i_sem would have already been released and
+	 * i_alloc_sem is released on i/o completion.
+	 */
+	if (!(file->f_flags & O_DIRECT))
+		up(&inode->i_sem);
 
 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 		ssize_t err;
@@ -2105,6 +2152,11 @@ ssize_t generic_file_writev(struct file 
 	return ret;
 }
 
+/*
+ * For regular files, i_sem and i_alloc_sem should be held already.
+ * i_sem may be dropped later once we've mapped the new IO.
+ * i_alloc_sem is kept until the IO completes.
+ */
 ssize_t
 generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	loff_t offset, unsigned long nr_segs)

_