From: Hugh Dickins <hugh@veritas.com>

Despite its restart_pgoff pretensions, unmap_mapping_range_vma was fatally
unable to distinguish a vma still to be restarted from one whose
vm_area_struct had been freed and reused for the top part of a !new_below
split of an isomorphic vma yet to be scanned.

The obvious answer is to note restart_vma in the struct address_space, and
cancel it when that vma is freed; but I'm reluctant to enlarge every struct
inode just for this.  Another answer is to flag a valid restart in the
vm_area_struct; but vm_flags is protected by down_write of mmap_sem, which
we cannot take within down_write of i_sem.  If we're going to need yet
another field, better to record the restart_addr itself: restart_vma only
recorded the last restart, but a busy tree could well use more.

Actually, we don't need another field: we can neatly (though naughtily)
keep restart_addr in vm_truncate_count, provided mapping->truncate_count
leaps over those values which look like a page-aligned address.  Zero
remains good for forcing a scan (though now interpreted as restart_addr 0),
and it turns out no change is needed to any of the vm_truncate_count
settings in dup_mmap, vma_link, vma_adjust, move_one_page.
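
To illustrate the trick, here is a minimal userspace sketch (not part of
the patch: PAGE_SIZE is hardwired to 4K, and the resetting of per-vma
counts when the counter wraps to zero is left out).  Restart addresses
are page-aligned, so their low bits are clear; by bumping the counter
again whenever an increment lands it on a page-aligned value, the two
kinds of value can never collide in vm_truncate_count:

	#include <assert.h>

	#define PAGE_SIZE	4096UL
	#define PAGE_MASK	(~(PAGE_SIZE - 1))

	/* True iff value is page-aligned, i.e. a possible restart_addr */
	#define is_restart_addr(v)	(!((v) & ~PAGE_MASK))

	static unsigned long bump_truncate_count(unsigned long count)
	{
		count++;
		if (is_restart_addr(count))	/* leap over aligned values */
			count++;
		return count;
	}

	int main(void)
	{
		/* 4095 -> 4096 is page-aligned, so we land on 4097 */
		assert(bump_truncate_count(4095) == 4097);
		/* ordinary increments are untouched: 4097 -> 4098 */
		assert(bump_truncate_count(4097) == 4098);
		return 0;
	}

So a vm_truncate_count for which is_restart_addr() holds can only be a
saved restart_addr (or the initial zero, now read as restart_addr 0),
never a live truncate_count.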

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 25-akpm/include/linux/mm.h |    7 +----
 25-akpm/mm/memory.c        |   57 +++++++++++++++------------------------------
 2 files changed, 22 insertions(+), 42 deletions(-)

diff -puN include/linux/mm.h~vmtrunc-restart_addr-in-truncate_count include/linux/mm.h
--- 25/include/linux/mm.h~vmtrunc-restart_addr-in-truncate_count	Thu Oct  7 14:03:45 2004
+++ 25-akpm/include/linux/mm.h	Thu Oct  7 14:03:45 2004
@@ -103,7 +103,7 @@ struct vm_area_struct {
 					   units, *not* PAGE_CACHE_SIZE */
 	struct file * vm_file;		/* File we map to (can be NULL). */
 	void * vm_private_data;		/* was vm_pte (shared mem) */
-	unsigned int vm_truncate_count;	/* compare mapping->truncate_count */
+	unsigned long vm_truncate_count;/* truncate_count or restart_addr */
 
 #ifdef CONFIG_NUMA
 	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
@@ -557,11 +557,8 @@ struct zap_details {
 	pgoff_t	first_index;			/* Lowest page->index to unmap */
 	pgoff_t last_index;			/* Highest page->index to unmap */
 	spinlock_t *i_mmap_lock;		/* For unmap_mapping_range: */
-	struct vm_area_struct *restart_vma;	/* Where lock was dropped */
-	pgoff_t restart_pgoff;			/* File offset for restart */
-	unsigned long restart_addr;		/* Where we should restart */
 	unsigned long break_addr;		/* Where unmap_vmas stopped */
-	unsigned int truncate_count;		/* Compare vm_truncate_count */
+	unsigned long truncate_count;		/* Compare vm_truncate_count */
 };
 
 void zap_page_range(struct vm_area_struct *vma, unsigned long address,
diff -puN mm/memory.c~vmtrunc-restart_addr-in-truncate_count mm/memory.c
--- 25/mm/memory.c~vmtrunc-restart_addr-in-truncate_count	Thu Oct  7 14:03:45 2004
+++ 25-akpm/mm/memory.c	Thu Oct  7 14:03:45 2004
@@ -1180,20 +1180,13 @@ no_new_page:
  *
  * In order to make forward progress despite repeatedly restarting some
  * large vma, note the break_addr set by unmap_vmas when it breaks out:
- * and restart from that address when we reach that vma again (so long
- * as another such was not inserted earlier while the lock was dropped).
- *
- * There is no guarantee that the restart_vma will remain intact when
- * the lock is regained: but if it has been freed to some other use,
- * it cannot then appear in the tree or list of vmas, so no harm done;
- * if it has been reused for a new vma of the same mapping, nopage
- * checks on i_size and truncate_count ensure it cannot be mapping any
- * of the truncated pages, so the area below restart_addr is still safe
- * to skip - but we must check pgoff to prevent spurious unmapping; or
- * restart_vma may have been split or merged, shrunk or extended - but
- * never shifted, so restart_addr and restart_pgoff remain in synch
- * (even in the case of mremap move, which makes a copy of the vma).
+ * and restart from that address when we reach that vma again.  It might
+ * have been split or merged, shrunk or extended, but never shifted: so
+ * restart_addr remains valid so long as it remains in the vma's range.
+ * unmap_mapping_range forces truncate_count to leap over page-aligned
+ * values so we can save vma's restart_addr in its truncate_count field.
  */
+#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
 
 static void reset_vma_truncate_counts(struct address_space *mapping)
 {
@@ -1210,26 +1203,20 @@ static int unmap_mapping_range_vma(struc
 		unsigned long start_addr, unsigned long end_addr,
 		struct zap_details *details)
 {
+	unsigned long restart_addr;
 	int need_break;
 
-	if (vma == details->restart_vma &&
-	    start_addr < details->restart_addr) {
-		/*
-		 * Be careful: this vm_area_struct may have been reused
-		 * meanwhile.  If pgoff matches up, it's probably the
-		 * same one (perhaps shrunk or extended), but no harm is
-		 * done if actually it's new.  If pgoff does not match,
-		 * it would likely be wrong to unmap from restart_addr,
-		 * but it must be new, so we can just mark it completed.
-		 */
-		start_addr = details->restart_addr;
-		if (linear_page_index(vma, start_addr) !=
-		    details->restart_pgoff || start_addr >= end_addr) {
+again:
+	restart_addr = vma->vm_truncate_count;
+	if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
+		start_addr = restart_addr;
+		if (start_addr >= end_addr) {
+			/* Top of vma has been split off since last time */
 			vma->vm_truncate_count = details->truncate_count;
 			return 0;
 		}
 	}
-again:
+
 	details->break_addr = end_addr;
 	zap_page_range(vma, start_addr, end_addr - start_addr, details);
 
@@ -1248,14 +1235,10 @@ again:
 		if (!need_break)
 			return 0;
 	} else {
-		if (!need_break) {
-			start_addr = details->break_addr;
+		/* Note restart_addr in vma's truncate_count field */
+		vma->vm_truncate_count = details->break_addr;
+		if (!need_break)
 			goto again;
-		}
-		details->restart_vma = vma;
-		details->restart_pgoff =
-			linear_page_index(vma, details->break_addr);
-		details->restart_addr = details->break_addr;
 	}
 
 	spin_unlock(details->i_mmap_lock);
@@ -1357,15 +1340,15 @@ void unmap_mapping_range(struct address_
 	if (details.last_index < details.first_index)
 		details.last_index = ULONG_MAX;
 	details.i_mmap_lock = &mapping->i_mmap_lock;
-	details.restart_vma = NULL;
 
 	spin_lock(&mapping->i_mmap_lock);
 
 	/* Protect against page faults, and endless unmapping loops */
 	mapping->truncate_count++;
-	if (unlikely(mapping->truncate_count == 0)) {
+	if (unlikely(is_restart_addr(mapping->truncate_count))) {
+		if (mapping->truncate_count == 0)
+			reset_vma_truncate_counts(mapping);
 		mapping->truncate_count++;
-		reset_vma_truncate_counts(mapping);
 	}
 	details.truncate_count = mapping->truncate_count;
 
_