From: Oleg Drokin <green@namesys.com>

This patch implements direct I/O (O_DIRECT) support for reiserfs v3.  It is
mostly a port of the 2.4 implementation.

Thanks to Mingming Cao from IBM for some clues in porting.



 25-akpm/fs/reiserfs/file.c            |   39 +++++++++++++
 25-akpm/fs/reiserfs/inode.c           |   97 +++++++++++++++++++++++++++++-----
 25-akpm/fs/reiserfs/tail_conversion.c |    4 +
 3 files changed, 126 insertions(+), 14 deletions(-)

diff -puN fs/reiserfs/file.c~reiserfs-direct-io fs/reiserfs/file.c
--- 25/fs/reiserfs/file.c~reiserfs-direct-io	Tue Sep  2 15:14:22 2003
+++ 25-akpm/fs/reiserfs/file.c	Tue Sep  2 15:14:22 2003
@@ -485,6 +485,11 @@ int reiserfs_allocate_blocks_for_region(
     /* Now the final thing, if we have grew the file, we must update it's size*/
     if ( pos + write_bytes > inode->i_size) {
 	inode->i_size = pos + write_bytes; // Set new size
+	/* If the file has grown so much that tail packing is no longer possible, reset
+	   the "need to pack" flag */
+	if ( (have_large_tails (inode->i_sb) && inode->i_size > i_block_size (inode)*4) ||
+	     (have_small_tails (inode->i_sb) && inode->i_size > i_block_size(inode)) )
+	    REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ;
     }
 
     /* Amount of on-disk blocks used by file have changed, update it */
@@ -999,9 +1004,41 @@ ssize_t reiserfs_file_write( struct file
     struct page * prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME];
 				/* To simplify coding at this time, we store
 				   locked pages in array for now */
-    if ( count <= PAGE_CACHE_SIZE || file->f_flags & O_DIRECT)
+    if ( count <= PAGE_CACHE_SIZE )
         return generic_file_write(file, buf, count, ppos);
 
+    if ( file->f_flags & O_DIRECT) { // Direct IO needs some special treatment.
+	int result, after_file_end = 0;
+	if ( (*ppos + count >= inode->i_size) || (file->f_flags & O_APPEND) ) {
+	    /* If we are appending to the file, we need to add a savelink here.
+	       If we crash while doing direct io, finish_unfinished will
+	       cut the garbage from the file end. */
+	    struct reiserfs_transaction_handle th;
+	    reiserfs_write_lock(inode->i_sb);
+	    journal_begin(&th, inode->i_sb,  JOURNAL_PER_BALANCE_CNT );
+	    reiserfs_update_inode_transaction(inode);
+	    add_save_link (&th, inode, 1 /* Truncate */);
+	    journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT );
+	    reiserfs_write_unlock(inode->i_sb);
+	    after_file_end = 1;
+	}
+	result = generic_file_write(file, buf, count, ppos);
+
+	if ( after_file_end ) { /* Now update i_size and remove the savelink */
+	    struct reiserfs_transaction_handle th;
+	    reiserfs_write_lock(inode->i_sb);
+	    journal_begin(&th, inode->i_sb, 1);
+	    reiserfs_update_inode_transaction(inode);
+	    reiserfs_update_sd(&th, inode);
+	    journal_end(&th, inode->i_sb, 1);
+	    remove_save_link (inode, 1/* truncate */);
+	    reiserfs_write_unlock(inode->i_sb);
+	}
+
+	return result;
+    }
+
+
     if ( unlikely((ssize_t) count < 0 ))
         return -EINVAL;
 
diff -puN fs/reiserfs/inode.c~reiserfs-direct-io fs/reiserfs/inode.c
--- 25/fs/reiserfs/inode.c~reiserfs-direct-io	Tue Sep  2 15:14:22 2003
+++ 25-akpm/fs/reiserfs/inode.c	Tue Sep  2 15:14:22 2003
@@ -306,7 +306,7 @@ research:
 	** read old data off disk.  Set the up to date bit on the buffer instead
 	** and jump to the end
 	*/
-	    if (PageUptodate(bh_result->b_page)) {
+	    if (!bh_result->b_page || PageUptodate(bh_result->b_page)) {
 		set_buffer_uptodate(bh_result);
 		goto finished ;
     }
@@ -420,6 +420,45 @@ static int reiserfs_get_block_create_0 (
     return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE) ;
 }
 
+/* Block-mapping helper passed to blockdev_direct_IO() for direct_IO requests:
+   maps one block via reiserfs_get_block(); max_blocks is not used here.
+   Returns 0 or an error, -EINVAL if the block maps to blocknr 0 (tail page). */
+static int reiserfs_get_blocks_direct_io(struct inode *inode,
+					 sector_t iblock,
+					 unsigned long max_blocks,
+					 struct buffer_head *bh_result,
+					 int create)
+{
+    int ret ;
+
+    bh_result->b_page = NULL; /* b_page == NULL marks a direct io request */
+
+    /* Set b_size before calling reiserfs_get_block(), since it is
+       referenced in convert_tail_for_hole(), which may be called from
+       reiserfs_get_block() */
+    bh_result->b_size = (1 << inode->i_blkbits);
+
+    ret = reiserfs_get_block(inode, iblock, bh_result, create) ;
+
+    /* don't allow direct io onto tail pages */
+    if (ret == 0 && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
+        /* make sure future calls to the direct io funcs for this offset
+        ** in the file fail by unmapping the buffer */
+        reiserfs_unmap_buffer(bh_result);
+        ret = -EINVAL ;
+    }
+    /* Possible unpacked tail. Flush the data before pages have disappeared
+       and clear the pack-on-close flag; this is done regardless of ret. */
+    if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
+        lock_kernel();
+        reiserfs_commit_for_inode(inode);
+        REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
+        unlock_kernel();
+    }
+    return ret ;
+}
+
+
 /*
 ** helper function for when reiserfs_get_block is called for a hole
 ** but the file tail is still in a direct item
@@ -448,7 +487,10 @@ static int convert_tail_for_hole(struct 
     tail_end = (tail_start | (bh_result->b_size - 1)) + 1 ;
 
     index = tail_offset >> PAGE_CACHE_SHIFT ;
-    if (index != hole_page->index) {
+    /* hole_page can be NULL in case of direct_io; we are sure
+       that we cannot get here if we write with O_DIRECT into
+       a tail page */
+    if (!hole_page || index != hole_page->index) {
 	tail_page = grab_cache_page(inode->i_mapping, index) ;
 	retval = -ENOMEM;
 	if (!tail_page) {
@@ -554,7 +596,12 @@ int reiserfs_get_block (struct inode * i
 	return ret;
     }
 
-    REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ;
+    /* If the file is small enough that it might have a tail, and tails are
+    ** enabled, we should mark it as possibly needing tail packing on close
+    */
+    if ( (have_large_tails (inode->i_sb) && inode->i_size < i_block_size (inode)*4) ||
+	 (have_small_tails (inode->i_sb) && inode->i_size < i_block_size(inode)) )
+	REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ;
 
     windex = push_journal_writer("reiserfs_get_block") ;
   
@@ -745,21 +792,26 @@ int reiserfs_get_block (struct inode * i
 	    */
 	    set_buffer_uptodate (unbh);
 
-	    /* we've converted the tail, so we must 
-	    ** flush unbh before the transaction commits
-	    */
-	    add_to_flushlist(inode, unbh) ;
-
-	    /* mark it dirty now to prevent commit_write from adding
-	     ** this buffer to the inode's dirty buffer list
+	    /* unbh->b_page == NULL in case of DIRECT_IO request, this means the buffer
+	       will disappear shortly, so it should not be added to the flush list or dirtied
 	     */
+	    if ( unbh->b_page ) {
+		/* we've converted the tail, so we must
+		** flush unbh before the transaction commits
+		*/
+		add_to_flushlist(inode, unbh) ;
+
+		/* mark it dirty now to prevent commit_write from adding
+		** this buffer to the inode's dirty buffer list
+		*/
 		/*
 		 * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty().
 		 * It's still atomic, but it sets the page dirty too,
 		 * which makes it eligible for writeback at any time by the
 		 * VM (which was also the case with __mark_buffer_dirty())
 		 */
-	    mark_buffer_dirty(unbh) ;
+		mark_buffer_dirty(unbh) ;
+	    }
 
 	    //inode->i_blocks += inode->i_sb->s_blocksize / 512;
 	    //mark_tail_converted (inode);
@@ -2204,6 +2256,13 @@ static int reiserfs_commit_write(struct 
     if (pos > inode->i_size) {
 	struct reiserfs_transaction_handle th ;
 	reiserfs_write_lock(inode->i_sb);
+	/* If the file has grown beyond the border where it
+	   can have a tail, unmark it as needing tail
+	   packing */
+	if ( (have_large_tails (inode->i_sb) && inode->i_size > i_block_size (inode)*4) ||
+	     (have_small_tails (inode->i_sb) && inode->i_size > i_block_size(inode)) )
+	    REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ;
+
 	journal_begin(&th, inode->i_sb, 1) ;
 	reiserfs_update_inode_transaction(inode) ;
 	inode->i_size = pos ;
@@ -2310,6 +2369,19 @@ static int reiserfs_releasepage(struct p
     return ret ;
 }
 
+/* ->direct_IO method: hands the request to blockdev_direct_IO() on the inode's block
+   device, with reiserfs_get_blocks_direct_io as block mapper. We thank Mingming Cao for guidance. */
+static int reiserfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
+			      loff_t offset, unsigned long nr_segs)
+{
+    struct file *file = iocb->ki_filp;
+    struct inode *inode = file->f_dentry->d_inode->i_mapping->host; /* host inode of the file's mapping */
+
+    return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+			      offset, nr_segs, reiserfs_get_blocks_direct_io, NULL);
+}
+
+
 struct address_space_operations reiserfs_address_space_operations = {
     .writepage = reiserfs_writepage,
     .readpage = reiserfs_readpage, 
@@ -2318,5 +2390,6 @@ struct address_space_operations reiserfs
     .sync_page = block_sync_page,
     .prepare_write = reiserfs_prepare_write,
     .commit_write = reiserfs_commit_write,
-    .bmap = reiserfs_aop_bmap
+    .bmap = reiserfs_aop_bmap,
+    .direct_IO = reiserfs_direct_IO
 } ;
diff -puN fs/reiserfs/tail_conversion.c~reiserfs-direct-io fs/reiserfs/tail_conversion.c
--- 25/fs/reiserfs/tail_conversion.c~reiserfs-direct-io	Tue Sep  2 15:14:22 2003
+++ 25-akpm/fs/reiserfs/tail_conversion.c	Tue Sep  2 15:14:22 2003
@@ -104,8 +104,10 @@ int direct2indirect (struct reiserfs_tra
 	/* we only send the unbh pointer if the buffer is not up to date.
 	** this avoids overwriting good data from writepage() with old data
 	** from the disk or buffer cache
+	** Special case: unbh->b_page will be NULL if we are coming through
+	** the DIRECT_IO handler here.
 	*/
-	if (buffer_uptodate(unbh) || PageUptodate(unbh->b_page)) {
+	if (!unbh->b_page || buffer_uptodate(unbh) || PageUptodate(unbh->b_page)) {
 	    up_to_date_bh = NULL ;
 	} else {
 	    up_to_date_bh = unbh ;

_