From: Hugh Dickins <hugh@veritas.com>

With page_map_lock gone, how do we stabilize page->mapping's anon_vma while
acquiring anon_vma->lock in page_referenced_anon and try_to_unmap_anon?

The page cannot actually be freed (vmscan holds a reference), but however much
we check page_mapped (which guarantees that the anon_vma is in use - or would
guarantee that, if we added suitable barriers), there's no locking against the
page becoming unmapped the instant after, and the anon_vma then being freed.

It's okay to take anon_vma->lock after it's freed, so long as the memory
remains a struct anon_vma (its list would become empty, or the structure might
be reused for an unrelated anon_vma: no problem either way, since we always
check that the page located is the right one); but we'd get corruption if that
memory were reused for some other purpose.

This is not unique: it's liable to be a problem whenever the kernel tries to
approach a structure obliquely.  It's generally solved with an atomic
reference count; but one advantage of anon_vma over anonmm is that it does not
have such a count, and it would be a backward step to add one.

Therefore...  implement a SLAB_DESTROY_BY_RCU flag, to guarantee that such a
kmem_cache_alloc'ed structure cannot be freed for some other use while
rcu_read_lock is held (i.e. while preemption is disabled); and use that for
anon_vma.
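
For illustration only, a minimal sketch of the reader-side pattern the flag
makes safe (struct obj, struct container and their fields are invented; the
real instance is page_lock_anon_vma() in the rmap.c hunk below, where the
cheap liveness check is page_mapped()):

	static struct obj *obj_lock(struct container *c)
	{
		struct obj *obj = NULL;

		rcu_read_lock();
		if (!c->live)		/* cheap check, like page_mapped() */
			goto out;
		obj = c->obj;
		if (!obj)
			goto out;
		/*
		 * Even if obj was freed the instant after those checks, its
		 * memory still holds a struct obj until a grace period has
		 * elapsed, so taking its lock cannot corrupt anything else;
		 * the caller must still check that whatever it finds under
		 * the lock really is what it was looking for.  The freeing
		 * side clears c->live before calling kmem_cache_free.
		 */
		spin_lock(&obj->lock);
	out:
		rcu_read_unlock();
		return obj;		/* caller drops obj->lock when done */
	}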

Fix concerns raised by Manfred: this flag is incompatible with poisoning and
with a destructor, and kmem_cache_destroy needs to do a synchronize_kernel()
first.
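
As a sketch of what that means for a cache's owner (hypothetical obj cache,
not taken from this patch): poisoning and a destructor would scribble on or
tear down memory that a stale RCU reader may still be looking at, and the
object's lock must be initialized once by the constructor rather than at each
allocation, since such a reader may take it even while the object is "free":

	static kmem_cache_t *obj_cachep;

	static void obj_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
	{
		struct obj *obj = data;

		spin_lock_init(&obj->lock);	/* once per slab object */
	}

	void __init obj_cache_init(void)
	{
		/* no SLAB_POISON, and the destructor argument stays NULL */
		obj_cachep = kmem_cache_create("obj", sizeof(struct obj), 0,
				SLAB_DESTROY_BY_RCU|SLAB_PANIC, obj_ctor, NULL);
	}

	void obj_cache_exit(void)
	{
		/* now waits for a grace period before the slabs go back */
		kmem_cache_destroy(obj_cachep);
	}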

I hope SLAB_DESTROY_BY_RCU may be useful elsewhere; but though it's safe for
little anon_vma, I'd be reluctant to use it on any caches whose immediate
shrinkage under pressure is important to the system.

Signed-off-by: Hugh Dickins <hugh@veritas.com>

Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 25-akpm/include/linux/slab.h |    1 
 25-akpm/mm/rmap.c            |   50 ++++++++++++++++++++-----------
 25-akpm/mm/slab.c            |   69 ++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 96 insertions(+), 24 deletions(-)

diff -puN include/linux/slab.h~rmaplock-3-5-slab_destroy_by_rcu include/linux/slab.h
--- 25/include/linux/slab.h~rmaplock-3-5-slab_destroy_by_rcu	Thu Aug  5 15:42:42 2004
+++ 25-akpm/include/linux/slab.h	Thu Aug  5 15:42:42 2004
@@ -45,6 +45,7 @@ typedef struct kmem_cache_s kmem_cache_t
 #define SLAB_RECLAIM_ACCOUNT	0x00020000UL	/* track pages allocated to indicate
 						   what is reclaimable later*/
 #define SLAB_PANIC		0x00040000UL	/* panic if kmem_cache_create() fails */
+#define SLAB_DESTROY_BY_RCU	0x00080000UL	/* defer freeing pages to RCU */
 
 /* flags passed to a constructor func */
 #define	SLAB_CTOR_CONSTRUCTOR	0x001UL		/* if not set, then deconstructor */
diff -puN mm/rmap.c~rmaplock-3-5-slab_destroy_by_rcu mm/rmap.c
--- 25/mm/rmap.c~rmaplock-3-5-slab_destroy_by_rcu	Thu Aug  5 15:42:42 2004
+++ 25-akpm/mm/rmap.c	Thu Aug  5 15:42:42 2004
@@ -30,6 +30,7 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/rmap.h>
+#include <linux/rcupdate.h>
 
 #include <asm/tlbflush.h>
 
@@ -159,8 +160,31 @@ static void anon_vma_ctor(void *data, km
 
 void __init anon_vma_init(void)
 {
-	anon_vma_cachep = kmem_cache_create("anon_vma",
-		sizeof(struct anon_vma), 0, SLAB_PANIC, anon_vma_ctor, NULL);
+	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
+			0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL);
+}
+
+/*
+ * Getting a lock on a stable anon_vma from a page off the LRU is
+ * tricky: page_lock_anon_vma relies on RCU to guard against the races.
+ */
+static struct anon_vma *page_lock_anon_vma(struct page *page)
+{
+	struct anon_vma *anon_vma = NULL;
+	unsigned long anon_mapping;
+
+	rcu_read_lock();
+	anon_mapping = (unsigned long) page->mapping;
+	if (!(anon_mapping & PAGE_MAPPING_ANON))
+		goto out;
+	if (!page_mapped(page))
+		goto out;
+
+	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
+	spin_lock(&anon_vma->lock);
+out:
+	rcu_read_unlock();
+	return anon_vma;
 }
 
 /*
@@ -241,19 +265,15 @@ out:
 static int page_referenced_anon(struct page *page)
 {
 	unsigned int mapcount;
-	struct anon_vma *anon_vma = (void *) page->mapping - PAGE_MAPPING_ANON;
+	struct anon_vma *anon_vma;
 	struct vm_area_struct *vma;
 	int referenced = 0;
 
-	/*
-	 * Recheck mapcount: it is not safe to take anon_vma->lock after
-	 * last page_remove_rmap, since struct anon_vma might be reused.
-	 */
-	mapcount = page_mapcount(page);
-	if (!mapcount)
+	anon_vma = page_lock_anon_vma(page);
+	if (!anon_vma)
 		return referenced;
 
-	spin_lock(&anon_vma->lock);
+	mapcount = page_mapcount(page);
 	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
 		referenced += page_referenced_one(page, vma, &mapcount);
 		if (!mapcount)
@@ -637,18 +657,14 @@ out_unlock:
 
 static int try_to_unmap_anon(struct page *page)
 {
-	struct anon_vma *anon_vma = (void *) page->mapping - PAGE_MAPPING_ANON;
+	struct anon_vma *anon_vma;
 	struct vm_area_struct *vma;
 	int ret = SWAP_AGAIN;
 
-	/*
-	 * Recheck mapped: it is not safe to take anon_vma->lock after
-	 * last page_remove_rmap, since struct anon_vma might be reused.
-	 */
-	if (!page_mapped(page))
+	anon_vma = page_lock_anon_vma(page);
+	if (!anon_vma)
 		return ret;
 
-	spin_lock(&anon_vma->lock);
 	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
 		ret = try_to_unmap_one(page, vma);
 		if (ret == SWAP_FAIL || !page_mapped(page))
diff -puN mm/slab.c~rmaplock-3-5-slab_destroy_by_rcu mm/slab.c
--- 25/mm/slab.c~rmaplock-3-5-slab_destroy_by_rcu	Thu Aug  5 15:42:42 2004
+++ 25-akpm/mm/slab.c	Thu Aug  5 15:42:42 2004
@@ -91,6 +91,7 @@
 #include	<linux/cpu.h>
 #include	<linux/sysctl.h>
 #include	<linux/module.h>
+#include	<linux/rcupdate.h>
 
 #include	<asm/uaccess.h>
 #include	<asm/cacheflush.h>
@@ -139,11 +140,13 @@
 			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
 			 SLAB_NO_REAP | SLAB_CACHE_DMA | \
 			 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
-			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC)
+			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
+			 SLAB_DESTROY_BY_RCU)
 #else
 # define CREATE_MASK	(SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \
 			 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
-			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC)
+			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
+			 SLAB_DESTROY_BY_RCU)
 #endif
 
 /*
@@ -190,6 +193,28 @@ struct slab {
 };
 
 /*
+ * struct slab_rcu
+ *
+ * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
+ * arrange for kmem_freepages to be called via RCU.  This is useful if
+ * we need to approach a kernel structure obliquely, from its address
+ * obtained without the usual locking.  We can lock the structure to
+ * stabilize it and check it's still at the given address, only if we
+ * can be sure that the memory has not been meanwhile reused for some
+ * other kind of object (which our subsystem's lock might corrupt).
+ *
+ * rcu_read_lock before reading the address, then rcu_read_unlock after
+ * taking the spinlock within the structure expected at that address.
+ *
+ * We assume struct slab_rcu can overlay struct slab when destroying.
+ */
+struct slab_rcu {
+	struct rcu_head		head;
+	kmem_cache_t		*cachep;
+	void			*addr;
+};
+
+/*
  * struct array_cache
  *
  * Per cpu structures
@@ -873,6 +898,16 @@ static void kmem_freepages(kmem_cache_t 
 		atomic_sub(1<<cachep->gfporder, &slab_reclaim_pages);
 }
 
+static void kmem_rcu_free(struct rcu_head *head)
+{
+	struct slab_rcu *slab_rcu = (struct slab_rcu *) head;
+	kmem_cache_t *cachep = slab_rcu->cachep;
+
+	kmem_freepages(cachep, slab_rcu->addr);
+	if (OFF_SLAB(cachep))
+		kmem_cache_free(cachep->slabp_cache, slab_rcu);
+}
+
 #if DEBUG
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
@@ -1026,6 +1061,8 @@ static void check_poison_obj(kmem_cache_
  */
 static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
 {
+	void *addr = slabp->s_mem - slabp->colouroff;
+
 #if DEBUG
 	int i;
 	for (i = 0; i < cachep->num; i++) {
@@ -1061,10 +1098,19 @@ static void slab_destroy (kmem_cache_t *
 		}
 	}
 #endif
-	
-	kmem_freepages(cachep, slabp->s_mem-slabp->colouroff);
-	if (OFF_SLAB(cachep))
-		kmem_cache_free(cachep->slabp_cache, slabp);
+
+	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
+		struct slab_rcu *slab_rcu;
+
+		slab_rcu = (struct slab_rcu *) slabp;
+		slab_rcu->cachep = cachep;
+		slab_rcu->addr = addr;
+		call_rcu(&slab_rcu->head, kmem_rcu_free);
+	} else {
+		kmem_freepages(cachep, addr);
+		if (OFF_SLAB(cachep))
+			kmem_cache_free(cachep->slabp_cache, slabp);
+	}
 }
 
 /**
@@ -1139,9 +1185,15 @@ kmem_cache_create (const char *name, siz
 	 */
 	if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD)))
 		flags |= SLAB_RED_ZONE|SLAB_STORE_USER;
-	flags |= SLAB_POISON;
+	if (!(flags & SLAB_DESTROY_BY_RCU))
+		flags |= SLAB_POISON;
 #endif
+	if (flags & SLAB_DESTROY_BY_RCU)
+		BUG_ON(flags & SLAB_POISON);
 #endif
+	if (flags & SLAB_DESTROY_BY_RCU)
+		BUG_ON(dtor);
+
 	/*
 	 * Always checks flags, a caller might be expecting debug
 	 * support which isn't available.
@@ -1553,6 +1605,9 @@ int kmem_cache_destroy (kmem_cache_t * c
 		return 1;
 	}
 
+	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
+		synchronize_kernel();
+
 	/* no cpu_online check required here since we clear the percpu
 	 * array on cpu offline and set this to NULL.
 	 */
_