From: Hugh Dickins <hugh@veritas.com>

Let's lighten the global spinlock mmlist_lock.

What's it for?
1. Its original role is to guard mmlist.
2. It later got a second role, to prevent get_task_mm from raising
   mm_users from the dead, just after it went down to 0.

Firstly consider the second: __exit_mm sets tsk->mm NULL while holding
task_lock before calling mmput; so mmlist_lock only guards against the
exceptional case, of get_task_mm on a kernel workthread which did AIO's
use_mm (which transiently sets its tsk->mm without raising mm_users) on an
mm now exiting.

Well, I don't think get_task_mm should succeed at all on use_mm tasks.
It's mainly used by /proc/pid and ptrace, and it seems at best confusing
for those to present the kernel thread as having a user mm, which it won't
have a moment later.  So define PF_BORROWED_MM, set in use_mm and cleared
in unuse_mm (though we could just leave it set), and have get_task_mm
return NULL when the flag is set.
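
Roughly, the check becomes (a condensed sketch of the kernel/fork.c hunk
below, not the literal diff):

	struct mm_struct *get_task_mm(struct task_struct *task)
	{
		struct mm_struct *mm = NULL;

		task_lock(task);
		/* refuse an mm merely borrowed via use_mm */
		if (task->mm && !(task->flags & PF_BORROWED_MM)) {
			mm = task->mm;
			atomic_inc(&mm->mm_users);	/* caller must mmput */
		}
		task_unlock(task);
		return mm;
	}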

Secondly consider the first: what's mmlist for?
1. Its original role was for swap_out to scan: rmap ended that in 2.5.27.
2. In 2.4.10 it got a second role, for try_to_unuse to scan for swapoff.

So, make mmlist a list of mms which may have pages on swap: add an mm to
mmlist when its first swap entry is assigned, in try_to_unmap_one (pageout)
or in copy_page_range (fork); and have mmput remove it from mmlist as
before, except that usually the list is empty and there's no need to lock.
drain_mmlist is added to swapoff, to empty out the mmlist if no swap is
then in use.
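
The pattern at both sites is the same (sketching the mm/rmap.c hunk below;
copy_page_range does likewise with dst->mmlist):

	swap_duplicate(entry);
	if (list_empty(&mm->mmlist)) {
		/* first swap entry in this mm: let swapoff find it */
		spin_lock(&mmlist_lock);
		list_add(&mm->mmlist, &init_mm.mmlist);
		spin_unlock(&mmlist_lock);
	}
	set_pte(pte, swp_entry_to_pte(entry));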

mmput leaves mm on mmlist until after its exit_mmap, so try_to_unmap_one
can still add mm to mmlist without worrying about the mm_users 0 case; but
try_to_unuse must avoid the mm_users 0 case (when an mm might be removed
from mmlist, and freed, while it's down in unuse_process): use
atomic_inc_return, now that all architectures support it.
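
That is, while walking mmlist under mmlist_lock in try_to_unuse (sketching
the mm/swapfile.c hunk below): if the increment returns 1, mm_users was 0,
so mmput is already tearing the mm down and it must be skipped, not
resurrected.

	mm = list_entry(p, struct mm_struct, mmlist);
	if (atomic_inc_return(&mm->mm_users) == 1) {
		/* was 0: mm is in mmput, don't resurrect it */
		atomic_dec(&mm->mm_users);
		continue;
	}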

Some of the detailed comments in try_to_unuse had grown out of date:
updated and trimmed some, but left SWAP_MAP_MAX for another occasion.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 25-akpm/arch/i386/mm/pgtable.c |    6 +----
 25-akpm/fs/aio.c               |    2 +
 25-akpm/fs/exec.c              |    6 -----
 25-akpm/include/linux/sched.h  |    5 +---
 25-akpm/kernel/fork.c          |   37 +++++++++---------------------
 25-akpm/mm/memory.c            |    9 ++++++-
 25-akpm/mm/rmap.c              |    6 +++++
 25-akpm/mm/swapfile.c          |   49 +++++++++++++++++++++++++++--------------
 8 files changed, 64 insertions(+), 56 deletions(-)

diff -puN arch/i386/mm/pgtable.c~lighten-mmlist_lock arch/i386/mm/pgtable.c
--- 25/arch/i386/mm/pgtable.c~lighten-mmlist_lock	2004-09-26 17:20:46.823297456 -0700
+++ 25-akpm/arch/i386/mm/pgtable.c	2004-09-26 17:20:46.837295328 -0700
@@ -165,10 +165,8 @@ void pmd_ctor(void *pmd, kmem_cache_t *c
  * against pageattr.c; it is the unique case in which a valid change
  * of kernel pagetables can't be lazily synchronized by vmalloc faults.
  * vmalloc faults work because attached pagetables are never freed.
- * If the locking proves to be non-performant, a ticketing scheme with
- * checks at dup_mmap(), exec(), and other mmlist addition points
- * could be used. The locking scheme was chosen on the basis of
- * manfred's recommendations and having no core impact whatsoever.
+ * The locking scheme was chosen on the basis of manfred's
+ * recommendations and having no core impact whatsoever.
  * -- wli
  */
 spinlock_t pgd_lock = SPIN_LOCK_UNLOCKED;
diff -puN fs/aio.c~lighten-mmlist_lock fs/aio.c
--- 25/fs/aio.c~lighten-mmlist_lock	2004-09-26 17:20:46.825297152 -0700
+++ 25-akpm/fs/aio.c	2004-09-26 17:20:46.839295024 -0700
@@ -572,6 +572,7 @@ void use_mm(struct mm_struct *mm)
 	struct task_struct *tsk = current;
 
 	task_lock(tsk);
+	tsk->flags |= PF_BORROWED_MM;
 	active_mm = tsk->active_mm;
 	atomic_inc(&mm->mm_count);
 	tsk->mm = mm;
@@ -598,6 +599,7 @@ void unuse_mm(struct mm_struct *mm)
 	struct task_struct *tsk = current;
 
 	task_lock(tsk);
+	tsk->flags &= ~PF_BORROWED_MM;
 	tsk->mm = NULL;
 	/* active_mm is still 'mm' */
 	enter_lazy_tlb(mm, tsk);
diff -puN fs/exec.c~lighten-mmlist_lock fs/exec.c
--- 25/fs/exec.c~lighten-mmlist_lock	2004-09-26 17:20:46.826297000 -0700
+++ 25-akpm/fs/exec.c	2004-09-26 17:20:46.840294872 -0700
@@ -531,12 +531,6 @@ static int exec_mmap(struct mm_struct *m
 	struct task_struct *tsk;
 	struct mm_struct * old_mm, *active_mm;
 
-	/* Add it to the list of mm's */
-	spin_lock(&mmlist_lock);
-	list_add(&mm->mmlist, &init_mm.mmlist);
-	mmlist_nr++;
-	spin_unlock(&mmlist_lock);
-
 	/* Notify parent that we're no longer interested in the old VM */
 	tsk = current;
 	old_mm = current->mm;
diff -puN include/linux/sched.h~lighten-mmlist_lock include/linux/sched.h
--- 25/include/linux/sched.h~lighten-mmlist_lock	2004-09-26 17:20:46.828296696 -0700
+++ 25-akpm/include/linux/sched.h	2004-09-26 17:20:46.841294720 -0700
@@ -218,7 +218,7 @@ struct mm_struct {
 	struct rw_semaphore mmap_sem;
 	spinlock_t page_table_lock;		/* Protects task page tables and mm->rss */
 
-	struct list_head mmlist;		/* List of all active mm's.  These are globally strung
+	struct list_head mmlist;		/* List of maybe swapped mm's.  These are globally strung
 						 * together off init_mm.mmlist, and are protected
 						 * by mmlist_lock
 						 */
@@ -252,8 +252,6 @@ struct mm_struct {
 	struct kioctx		default_kioctx;
 };
 
-extern int mmlist_nr;
-
 struct sighand_struct {
 	atomic_t		count;
 	struct k_sigaction	action[_NSIG];
@@ -840,6 +838,7 @@ do { if (atomic_dec_and_test(&(tsk)->usa
 #define PF_UISLEEP	0x00400000	/* Uninterruptible sleep */
 #define PF_SINBINNED	0x00800000	/* I am sinbinned */
 #define PF_UNPRIV_RT	0x01000000	/* I wanted to be RT but had insufficient privilege*/
+#define PF_BORROWED_MM	0x02000000	/* I am a kthread doing use_mm */
 
 /*
  * Scheduling statistics for a task/thread
diff -puN kernel/fork.c~lighten-mmlist_lock kernel/fork.c
--- 25/kernel/fork.c~lighten-mmlist_lock	2004-09-26 17:20:46.830296392 -0700
+++ 25-akpm/kernel/fork.c	2004-09-26 17:20:46.842294568 -0700
@@ -184,17 +184,6 @@ static inline int dup_mmap(struct mm_str
 	rb_parent = NULL;
 	pprev = &mm->mmap;
 
-	/*
-	 * Add it to the mmlist after the parent.
-	 * Doing it this way means that we can order the list,
-	 * and fork() won't mess up the ordering significantly.
-	 * Add it first so that swapoff can see any swap entries.
-	 */
-	spin_lock(&mmlist_lock);
-	list_add(&mm->mmlist, &current->mm->mmlist);
-	mmlist_nr++;
-	spin_unlock(&mmlist_lock);
-
 	for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
 		struct file *file;
 
@@ -294,7 +283,6 @@ static inline void mm_free_pgd(struct mm
 #endif /* CONFIG_MMU */
 
 spinlock_t mmlist_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
-int mmlist_nr;
 
 #define allocate_mm()	(kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
 #define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
@@ -306,6 +294,7 @@ static struct mm_struct * mm_init(struct
 	atomic_set(&mm->mm_users, 1);
 	atomic_set(&mm->mm_count, 1);
 	init_rwsem(&mm->mmap_sem);
+	INIT_LIST_HEAD(&mm->mmlist);
 	mm->core_waiters = 0;
 	mm->page_table_lock = SPIN_LOCK_UNLOCKED;
 	mm->ioctx_list_lock = RW_LOCK_UNLOCKED;
@@ -354,12 +343,14 @@ void fastcall __mmdrop(struct mm_struct 
  */
 void mmput(struct mm_struct *mm)
 {
-	if (atomic_dec_and_lock(&mm->mm_users, &mmlist_lock)) {
-		list_del(&mm->mmlist);
-		mmlist_nr--;
-		spin_unlock(&mmlist_lock);
+	if (atomic_dec_and_test(&mm->mm_users)) {
 		exit_aio(mm);
 		exit_mmap(mm);
+		if (!list_empty(&mm->mmlist)) {
+			spin_lock(&mmlist_lock);
+			list_del(&mm->mmlist);
+			spin_unlock(&mmlist_lock);
+		}
 		put_swap_token(mm);
 		mmdrop(mm);
 	}
@@ -369,15 +360,11 @@ EXPORT_SYMBOL_GPL(mmput);
 /**
  * get_task_mm - acquire a reference to the task's mm
  *
- * Returns %NULL if the task has no mm.  Checks if the use count
- * of the mm is non-zero and if so returns a reference to it, after
+ * Returns %NULL if the task has no mm.  Checks PF_BORROWED_MM (meaning
+ * this kernel workthread has transiently adopted a user mm with use_mm,
+ * to do its AIO) is not set and if so returns a reference to it, after
  * bumping up the use count.  User must release the mm via mmput()
  * after use.  Typically used by /proc and ptrace.
- *
- * If the use count is zero, it means that this mm is going away,
- * so return %NULL.  This only happens in the case of an AIO daemon
- * which has temporarily adopted an mm (see use_mm), in the course
- * of its final mmput, before exit_aio has completed.
  */
 struct mm_struct *get_task_mm(struct task_struct *task)
 {
@@ -386,12 +373,10 @@ struct mm_struct *get_task_mm(struct tas
 	task_lock(task);
 	mm = task->mm;
 	if (mm) {
-		spin_lock(&mmlist_lock);
-		if (!atomic_read(&mm->mm_users))
+		if (task->flags & PF_BORROWED_MM)
 			mm = NULL;
 		else
 			atomic_inc(&mm->mm_users);
-		spin_unlock(&mmlist_lock);
 	}
 	task_unlock(task);
 	return mm;
diff -puN mm/memory.c~lighten-mmlist_lock mm/memory.c
--- 25/mm/memory.c~lighten-mmlist_lock	2004-09-26 17:20:46.831296240 -0700
+++ 25-akpm/mm/memory.c	2004-09-26 17:20:46.844294264 -0700
@@ -288,8 +288,15 @@ skip_copy_pte_range:
 					goto cont_copy_pte_range_noset;
 				/* pte contains position in swap, so copy. */
 				if (!pte_present(pte)) {
-					if (!pte_file(pte))
+					if (!pte_file(pte)) {
 						swap_duplicate(pte_to_swp_entry(pte));
+						if (list_empty(&dst->mmlist)) {
+							spin_lock(&mmlist_lock);
+							list_add(&dst->mmlist,
+								 &src->mmlist);
+							spin_unlock(&mmlist_lock);
+						}
+					}
 					set_pte(dst_pte, pte);
 					goto cont_copy_pte_range_noset;
 				}
diff -puN mm/rmap.c~lighten-mmlist_lock mm/rmap.c
--- 25/mm/rmap.c~lighten-mmlist_lock	2004-09-26 17:20:46.833295936 -0700
+++ 25-akpm/mm/rmap.c	2004-09-26 17:20:46.845294112 -0700
@@ -35,6 +35,7 @@
  *         mm->page_table_lock
  *           zone->lru_lock (in mark_page_accessed)
  *           swap_list_lock (in swap_free etc's swap_info_get)
+ *             mmlist_lock (in mmput, drain_mmlist and others)
  *             swap_device_lock (in swap_duplicate, swap_info_get)
  *             mapping->private_lock (in __set_page_dirty_buffers)
  *             inode_lock (in set_page_dirty's __mark_inode_dirty)
@@ -576,6 +577,11 @@ static int try_to_unmap_one(struct page 
 		 */
 		BUG_ON(!PageSwapCache(page));
 		swap_duplicate(entry);
+		if (list_empty(&mm->mmlist)) {
+			spin_lock(&mmlist_lock);
+			list_add(&mm->mmlist, &init_mm.mmlist);
+			spin_unlock(&mmlist_lock);
+		}
 		set_pte(pte, swp_entry_to_pte(entry));
 		BUG_ON(pte_file(*pte));
 	}
diff -puN mm/swapfile.c~lighten-mmlist_lock mm/swapfile.c
--- 25/mm/swapfile.c~lighten-mmlist_lock	2004-09-26 17:20:46.834295784 -0700
+++ 25-akpm/mm/swapfile.c	2004-09-26 17:20:46.847293808 -0700
@@ -648,11 +648,12 @@ static int try_to_unuse(unsigned int typ
 	 *
 	 * A simpler strategy would be to start at the last mm we
 	 * freed the previous entry from; but that would take less
-	 * advantage of mmlist ordering (now preserved by swap_out()),
-	 * which clusters forked address spaces together, most recent
-	 * child immediately after parent.  If we race with dup_mmap(),
-	 * we very much want to resolve parent before child, otherwise
-	 * we may miss some entries: using last mm would invert that.
+	 * advantage of mmlist ordering, which clusters forked mms
+	 * together, child after parent.  If we race with dup_mmap(), we
+	 * prefer to resolve parent before child, lest we miss entries
+	 * duplicated after we scanned child: using last mm would invert
+	 * that.  Though it's only a serious concern when an overflowed
+	 * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
 	 */
 	start_mm = &init_mm;
 	atomic_inc(&init_mm.mm_users);
@@ -660,15 +661,7 @@ static int try_to_unuse(unsigned int typ
 	/*
 	 * Keep on scanning until all entries have gone.  Usually,
 	 * one pass through swap_map is enough, but not necessarily:
-	 * mmput() removes mm from mmlist before exit_mmap() and its
-	 * zap_page_range().  That's not too bad, those entries are
-	 * on their way out, and handled faster there than here.
-	 * do_munmap() behaves similarly, taking the range out of mm's
-	 * vma list before zap_page_range().  But unfortunately, when
-	 * unmapping a part of a vma, it takes the whole out first,
-	 * then reinserts what's left after (might even reschedule if
-	 * open() method called) - so swap entries may be invisible
-	 * to swapoff for a while, then reappear - but that is rare.
+	 * there are races when an instance of an entry might be missed.
 	 */
 	while ((i = find_next_to_unuse(si, i)) != 0) {
 		if (signal_pending(current)) {
@@ -720,7 +713,7 @@ static int try_to_unuse(unsigned int typ
 		wait_on_page_writeback(page);
 
 		/*
-		 * Remove all references to entry, without blocking.
+		 * Remove all references to entry.
 		 * Whenever we reach init_mm, there's no address space
 		 * to search, but use it as a reminder to search shmem.
 		 */
@@ -745,7 +738,10 @@ static int try_to_unuse(unsigned int typ
 			while (*swap_map > 1 && !retval &&
 					(p = p->next) != &start_mm->mmlist) {
 				mm = list_entry(p, struct mm_struct, mmlist);
-				atomic_inc(&mm->mm_users);
+				if (atomic_inc_return(&mm->mm_users) == 1) {
+					atomic_dec(&mm->mm_users);
+					continue;
+				}
 				spin_unlock(&mmlist_lock);
 				mmput(prev_mm);
 				prev_mm = mm;
@@ -859,6 +855,26 @@ static int try_to_unuse(unsigned int typ
 }
 
 /*
+ * After a successful try_to_unuse, if no swap is now in use, we know we
+ * can empty the mmlist.  swap_list_lock must be held on entry and exit.
+ * Note that mmlist_lock nests inside swap_list_lock, and an mm must be
+ * added to the mmlist just after page_duplicate - before would be racy.
+ */
+static void drain_mmlist(void)
+{
+	struct list_head *p, *next;
+	unsigned int i;
+
+	for (i = 0; i < nr_swapfiles; i++)
+		if (swap_info[i].inuse_pages)
+			return;
+	spin_lock(&mmlist_lock);
+	list_for_each_safe(p, next, &init_mm.mmlist)
+		list_del_init(p);
+	spin_unlock(&mmlist_lock);
+}
+
+/*
  * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
  * corresponds to page offset `offset'.
  */
@@ -1172,6 +1188,7 @@ asmlinkage long sys_swapoff(const char _
 	}
 	down(&swapon_sem);
 	swap_list_lock();
+	drain_mmlist();
 	swap_device_lock(p);
 	swap_file = p->swap_file;
 	p->swap_file = NULL;
_