- When invalidating pages, take care to shoot down any ptes which map them
  as well.

  This ensures that the next mmap access to the page will generate a major
  fault, so NFS's server-side modifications are picked up.

  This also allows us to call invalidate_complete_page() on all pages, so
  filesystems such as ext3 get a chance to invalidate the buffer_heads.

- Don't mark in-pagetable pages as non-uptodate any more.  That broke a
  previous guarantee that mapped-into-user-process pages are always uptodate.

- Check the return value of invalidate_complete_page().  It can fail if
  someone redirties a page after generic_file_direct_IO() has written it back
  (the new per-page flow is sketched below the list).

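In outline, the new per-page flow in invalidate_inode_pages2() is (a
simplified sketch of the mm/truncate.c change below; the pagevec walk and
the truncate-race check are omitted):

	lock_page(page);
	wait_on_page_writeback(page);
	while (page_mapped(page))
		unmap_mapping_range(mapping, ...);	/* shoot down the ptes */
	was_dirty = test_clear_page_dirty(page);
	if (!invalidate_complete_page(mapping, page)) {
		if (was_dirty)
			set_page_dirty(page);	/* put the dirty bit back */
		ret = -EIO;
	}
	unlock_page(page);
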
But we still have a problem.  If invalidate_inode_pages2() calls
unmap_mapping_range(), that can cause zap_pte_range() to dirty the pagecache
pages.  That will redirty the page's buffers and will cause
invalidate_complete_page() to fail.

So, in generic_file_direct_IO() we do a complete pte shootdown on the file
up-front, prior to writing back dirty pagecache.  This is only done for
O_DIRECT writes.  It _could_ be done for O_DIRECT reads too, providing full
mmap-vs-direct-IO coherency for both O_DIRECT reads and O_DIRECT writes, but
permitting the pte shootdown on O_DIRECT reads trivially allows people to nuke
other people's mapped pagecache.
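
Concretely, the up-front shootdown is just this, from the mm/filemap.c hunk
below:

	if (rw == WRITE && mapping_mapped(mapping))
		unmap_mapping_range(mapping, 0, -1, 0);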

NFS also uses invalidate_inode_pages2() for handling server-side modification
notifications.  But in the NFS case the clear_page_dirty() in
invalidate_inode_pages2() is sufficient, because NFS doesn't have to worry
about the "dirty buffers against a clean page" problem. (I think)

Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 25-akpm/include/linux/fs.h |    2 -
 25-akpm/mm/filemap.c       |   18 ++++++++++--
 25-akpm/mm/truncate.c      |   67 ++++++++++++++++++++++++++++-----------------
 3 files changed, 59 insertions(+), 28 deletions(-)

diff -puN mm/truncate.c~invalidate_inode_pages-mmap-coherency-fix mm/truncate.c
--- 25/mm/truncate.c~invalidate_inode_pages-mmap-coherency-fix	Fri Dec 10 15:13:55 2004
+++ 25-akpm/mm/truncate.c	Fri Dec 10 15:13:55 2004
@@ -65,6 +65,8 @@ truncate_complete_page(struct address_sp
  * be marked dirty at any time too.  So we re-check the dirtiness inside
  * ->tree_lock.  That provides exclusion against the __set_page_dirty
  * functions.
+ *
+ * Returns non-zero if the page was successfully invalidated.
  */
 static int
 invalidate_complete_page(struct address_space *mapping, struct page *page)
@@ -277,50 +279,67 @@ unsigned long invalidate_inode_pages(str
 EXPORT_SYMBOL(invalidate_inode_pages);
 
 /**
- * invalidate_inode_pages2 - remove all unmapped pages from an address_space
+ * invalidate_inode_pages2 - remove all pages from an address_space
  * @mapping - the address_space
  *
- * invalidate_inode_pages2() is like truncate_inode_pages(), except for the case
- * where the page is seen to be mapped into process pagetables.  In that case,
- * the page is marked clean but is left attached to its address_space.
- *
- * The page is also marked not uptodate so that a subsequent pagefault will
- * perform I/O to bringthe page's contents back into sync with its backing
- * store.
+ * Any pages which are found to be mapped into pagetables are unmapped prior to
+ * invalidation.
  *
- * FIXME: invalidate_inode_pages2() is probably trivially livelockable.
+ * Returns -EIO if any pages could not be invalidated.
  */
-void invalidate_inode_pages2(struct address_space *mapping)
+int invalidate_inode_pages2(struct address_space *mapping)
 {
 	struct pagevec pvec;
 	pgoff_t next = 0;
 	int i;
+	int ret = 0;
+	int did_full_unmap = 0;
 
 	pagevec_init(&pvec, 0);
-	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
-		for (i = 0; i < pagevec_count(&pvec); i++) {
+	while (!ret && pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+		for (i = 0; !ret && i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
+			int was_dirty;
 
 			lock_page(page);
-			if (page->mapping == mapping) {	/* truncate race? */
-				wait_on_page_writeback(page);
-				next = page->index + 1;
-				if (page_mapped(page)) {
-					clear_page_dirty(page);
-					ClearPageUptodate(page);
+			if (page->mapping != mapping) {	/* truncate race? */
+				unlock_page(page);
+				continue;
+			}
+			wait_on_page_writeback(page);
+			next = page->index + 1;
+			while (page_mapped(page)) {
+				if (!did_full_unmap) {
+					/*
+					 * Zap the rest of the file in one hit.
+					 * FIXME: invalidate_inode_pages2()
+					 * should take start/end offsets.
+					 */
+					unmap_mapping_range(mapping,
+						page->index << PAGE_CACHE_SHIFT,
+					  	-1, 0);
+					did_full_unmap = 1;
 				} else {
-					if (!invalidate_complete_page(mapping,
-								      page)) {
-						clear_page_dirty(page);
-						ClearPageUptodate(page);
-					}
+					/*
+					 * Just zap this page
+					 */
+					unmap_mapping_range(mapping,
+					  page->index << PAGE_CACHE_SHIFT,
+					  (page->index << PAGE_CACHE_SHIFT)+1,
+					  0);
 				}
 			}
+			was_dirty = test_clear_page_dirty(page);
+			if (!invalidate_complete_page(mapping, page)) {
+				if (was_dirty)
+					set_page_dirty(page);
+				ret = -EIO;
+			}
 			unlock_page(page);
 		}
 		pagevec_release(&pvec);
 		cond_resched();
 	}
+	return ret;
 }
-
 EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
diff -puN include/linux/fs.h~invalidate_inode_pages-mmap-coherency-fix include/linux/fs.h
--- 25/include/linux/fs.h~invalidate_inode_pages-mmap-coherency-fix	Fri Dec 10 15:13:55 2004
+++ 25-akpm/include/linux/fs.h	Fri Dec 10 15:13:55 2004
@@ -1428,7 +1428,7 @@ static inline void invalidate_remote_ino
 	    S_ISLNK(inode->i_mode))
 		invalidate_inode_pages(inode->i_mapping);
 }
-extern void invalidate_inode_pages2(struct address_space *mapping);
+extern int invalidate_inode_pages2(struct address_space *mapping);
 extern void write_inode_now(struct inode *, int);
 extern int filemap_fdatawrite(struct address_space *);
 extern int filemap_flush(struct address_space *);
diff -puN mm/filemap.c~invalidate_inode_pages-mmap-coherency-fix mm/filemap.c
--- 25/mm/filemap.c~invalidate_inode_pages-mmap-coherency-fix	Fri Dec 10 15:13:55 2004
+++ 25-akpm/mm/filemap.c	Fri Dec 10 15:13:55 2004
@@ -2320,7 +2320,8 @@ ssize_t generic_file_writev(struct file 
 EXPORT_SYMBOL(generic_file_writev);
 
 /*
- * Called under i_sem for writes to S_ISREG files
+ * Called under i_sem for writes to S_ISREG files.   Returns -EIO if something
+ * went wrong during pagecache shootdown.
  */
 ssize_t
 generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
@@ -2330,12 +2331,23 @@ generic_file_direct_IO(int rw, struct ki
 	struct address_space *mapping = file->f_mapping;
 	ssize_t retval;
 
+	/*
+	 * If it's a write, unmap all mmappings of the file up-front.  This
+	 * will cause any pte dirty bits to be propagated into the pageframes
+	 * for the subsequent filemap_write_and_wait().
+	 */
+	if (rw == WRITE && mapping_mapped(mapping))
+		unmap_mapping_range(mapping, 0, -1, 0);
+
 	retval = filemap_write_and_wait(mapping);
 	if (retval == 0) {
 		retval = mapping->a_ops->direct_IO(rw, iocb, iov,
 						offset, nr_segs);
-		if (rw == WRITE && mapping->nrpages)
-			invalidate_inode_pages2(mapping);
+		if (rw == WRITE && mapping->nrpages) {
+			int err = invalidate_inode_pages2(mapping);
+			if (err)
+				retval = err;
+		}
 	}
 	return retval;
 }
_