This code is playing with page->lru of pages which came from slab.  But to
remove page->list we need to convert slab over to using page->lru.  So we
cannot allow the i386 pagetable code to go scribbling on the ->lru field of
active slab pages.
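
To make the conflict concrete, here is an abbreviated sketch of the two
struct page fields in play (the real 2.6 definition in include/linux/mm.h
has many more members):

	struct page {
		/* ... */
		struct list_head list;	/* slab currently chains its pages
					 * through here; this is the field
					 * we want to remove */
		struct list_head lru;	/* pageout lists; once slab is
					 * converted to use this field, no
					 * other code may touch ->lru on an
					 * active slab page */
		/* ... */
	};

The code being removed did exactly that: pgds were allocated from a slab
cache, yet pgd_ctor()/pgd_dtor() also chained their backing pages onto
pgd_list through virt_to_page(pgd)->lru.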

The slab-based pgd/pmd caching was a pretty thin optimisation anyway, and
shrinking the pageframe (on all architectures) is more important.
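
With the slab constructors gone the allocators become straightforward.  A
condensed sketch of the new pgd_alloc() paths from the pgtable.c hunk below
(error handling trimmed):

	#ifdef CONFIG_X86_PAE
		/*
		 * A PAE pgd is four 8-byte entries and must be 16-byte
		 * aligned, so it comes from a small dedicated slab cache;
		 * each user slot gets its own zeroed pmd page.
		 */
		pgd_t *pgd = kmem_cache_alloc(pae_pgd_cachep, GFP_KERNEL);
		int i;

		for (i = 0; i < USER_PTRS_PER_PGD; i++) {
			unsigned long pmd = __get_free_page(GFP_KERNEL);

			clear_page(pmd);
			set_pgd(pgd + i, __pgd(1 + __pa(pmd)));
		}
	#else
		/* A non-PAE pgd is exactly one page. */
		pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL);
	#endif

In both configurations the kernel part of the pgd is then copied from
swapper_pg_dir, just as the old pgd_ctor() did.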


---

 25-akpm/arch/i386/mm/init.c               |   30 ++------
 25-akpm/arch/i386/mm/pageattr.c           |   25 +++----
 25-akpm/arch/i386/mm/pgtable.c            |  102 ++++++++++--------------------
 25-akpm/include/asm-i386/pgtable-3level.h |    2 
 25-akpm/include/asm-i386/pgtable.h        |   30 ++++----
 5 files changed, 75 insertions(+), 114 deletions(-)
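
One behavioural change worth noting before the diff: with pgd_list gone,
the non-PAE set_pmd_pte() in pageattr.c can no longer walk a private list
of pgd pages, so it propagates kernel pagetable changes by walking every
mm hanging off init_mm.mmlist instead (condensed from the pageattr.c hunk
below):

	struct list_head *l;

	spin_lock(&mmlist_lock);
	list_for_each(l, &init_mm.mmlist) {
		struct mm_struct *mm = list_entry(l, struct mm_struct, mmlist);
		pmd_t *pmd = pmd_offset(pgd_offset(mm, address), address);

		set_pte_atomic((pte_t *)pmd, pte);	/* fix up this mm's pmd */
	}
	spin_unlock(&mmlist_lock);

Nothing is needed in the PAE case, where the kernel pmds are shared between
all pgds.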

diff -puN arch/i386/mm/init.c~unslabify-pgds-and-pmds arch/i386/mm/init.c
--- 25/arch/i386/mm/init.c~unslabify-pgds-and-pmds	2004-03-19 17:24:38.637651760 -0800
+++ 25-akpm/arch/i386/mm/init.c	2004-03-19 17:25:43.623772368 -0800
@@ -523,30 +523,20 @@ void __init mem_init(void)
 #endif
 }
 
-kmem_cache_t *pgd_cache;
-kmem_cache_t *pmd_cache;
+#ifdef CONFIG_X86_PAE
+struct kmem_cache_s *pae_pgd_cachep;
 
 void __init pgtable_cache_init(void)
 {
-	if (PTRS_PER_PMD > 1) {
-		pmd_cache = kmem_cache_create("pmd",
-					PTRS_PER_PMD*sizeof(pmd_t),
-					PTRS_PER_PMD*sizeof(pmd_t),
-					0,
-					pmd_ctor,
-					NULL);
-		if (!pmd_cache)
-			panic("pgtable_cache_init(): cannot create pmd cache");
-	}
-	pgd_cache = kmem_cache_create("pgd",
-				PTRS_PER_PGD*sizeof(pgd_t),
-				PTRS_PER_PGD*sizeof(pgd_t),
-				0,
-				pgd_ctor,
-				PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
-	if (!pgd_cache)
-		panic("pgtable_cache_init(): Cannot create pgd cache");
+	/*
+	 * PAE pgds must be 16-byte aligned:
+	 */
+	pae_pgd_cachep = kmem_cache_create("pae_pgd", 32, 0,
+		SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, NULL, NULL);
+	if (!pae_pgd_cachep)
+		panic("init_pae(): Cannot alloc pae_pgd SLAB cache");
 }
+#endif
 
 /*
  * This function cannot be __init, since exceptions don't work in that
diff -puN arch/i386/mm/pageattr.c~unslabify-pgds-and-pmds arch/i386/mm/pageattr.c
--- 25/arch/i386/mm/pageattr.c~unslabify-pgds-and-pmds	2004-03-19 17:24:38.639651456 -0800
+++ 25-akpm/arch/i386/mm/pageattr.c	2004-03-19 17:24:38.646650392 -0800
@@ -67,22 +67,19 @@ static void flush_kernel_map(void *dummy
 
 static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) 
 { 
-	struct page *page;
-	unsigned long flags;
-
 	set_pte_atomic(kpte, pte); 	/* change init_mm */
-	if (PTRS_PER_PMD > 1)
-		return;
-
-	spin_lock_irqsave(&pgd_lock, flags);
-	list_for_each_entry(page, &pgd_list, lru) {
-		pgd_t *pgd;
-		pmd_t *pmd;
-		pgd = (pgd_t *)page_address(page) + pgd_index(address);
-		pmd = pmd_offset(pgd, address);
-		set_pte_atomic((pte_t *)pmd, pte);
+#ifndef CONFIG_X86_PAE
+	{
+		struct list_head *l;
+		spin_lock(&mmlist_lock);
+		list_for_each(l, &init_mm.mmlist) {
+			struct mm_struct *mm = list_entry(l, struct mm_struct, mmlist);
+			pmd_t *pmd = pmd_offset(pgd_offset(mm, address), address);
+			set_pte_atomic((pte_t *)pmd, pte);
+		}
+		spin_unlock(&mmlist_lock);
 	}
-	spin_unlock_irqrestore(&pgd_lock, flags);
+#endif
 }
 
 /* 
diff -puN arch/i386/mm/pgtable.c~unslabify-pgds-and-pmds arch/i386/mm/pgtable.c
--- 25/arch/i386/mm/pgtable.c~unslabify-pgds-and-pmds	2004-03-19 17:24:38.640651304 -0800
+++ 25-akpm/arch/i386/mm/pgtable.c	2004-03-19 17:24:38.647650240 -0800
@@ -12,7 +12,6 @@
 #include <linux/highmem.h>
 #include <linux/slab.h>
 #include <linux/pagemap.h>
-#include <linux/spinlock.h>
 
 #include <asm/system.h>
 #include <asm/pgtable.h>
@@ -152,88 +151,61 @@ struct page *pte_alloc_one(struct mm_str
 	return pte;
 }
 
-void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
-{
-	memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
-}
-
-/*
- * List of all pgd's needed for non-PAE so it can invalidate entries
- * in both cached and uncached pgd's; not needed for PAE since the
- * kernel pmd is shared. If PAE were not to share the pmd a similar
- * tactic would be needed. This is essentially codepath-based locking
- * against pageattr.c; it is the unique case in which a valid change
- * of kernel pagetables can't be lazily synchronized by vmalloc faults.
- * vmalloc faults work because attached pagetables are never freed.
- * If the locking proves to be non-performant, a ticketing scheme with
- * checks at dup_mmap(), exec(), and other mmlist addition points
- * could be used. The locking scheme was chosen on the basis of
- * manfred's recommendations and having no core impact whatsoever.
- * -- wli
- */
-spinlock_t pgd_lock = SPIN_LOCK_UNLOCKED;
-LIST_HEAD(pgd_list);
+#ifdef CONFIG_X86_PAE
 
-void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
+pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	unsigned long flags;
-
-	if (PTRS_PER_PMD == 1)
-		spin_lock_irqsave(&pgd_lock, flags);
+	int i;
+	pgd_t *pgd = kmem_cache_alloc(pae_pgd_cachep, GFP_KERNEL);
 
-	memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD,
+	if (pgd) {
+		for (i = 0; i < USER_PTRS_PER_PGD; i++) {
+			unsigned long pmd = __get_free_page(GFP_KERNEL);
+			if (!pmd)
+				goto out_oom;
+			clear_page(pmd);
+			set_pgd(pgd + i, __pgd(1 + __pa(pmd)));
+		}
+		memcpy(pgd + USER_PTRS_PER_PGD,
 			swapper_pg_dir + USER_PTRS_PER_PGD,
 			(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
-
-	if (PTRS_PER_PMD > 1)
-		return;
-
-	list_add(&virt_to_page(pgd)->lru, &pgd_list);
-	spin_unlock_irqrestore(&pgd_lock, flags);
-	memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
+	}
+	return pgd;
+out_oom:
+	for (i--; i >= 0; i--)
+		free_page((unsigned long)__va(pgd_val(pgd[i])-1));
+	kmem_cache_free(pae_pgd_cachep, pgd);
+	return NULL;
 }
 
-/* never called when PTRS_PER_PMD > 1 */
-void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
+void pgd_free(pgd_t *pgd)
 {
-	unsigned long flags; /* can be called from interrupt context */
+	int i;
 
-	spin_lock_irqsave(&pgd_lock, flags);
-	list_del(&virt_to_page(pgd)->lru);
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	for (i = 0; i < USER_PTRS_PER_PGD; i++)
+		free_page((unsigned long)__va(pgd_val(pgd[i])-1));
+	kmem_cache_free(pae_pgd_cachep, pgd);
 }
 
+#else
+
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	int i;
-	pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
-
-	if (PTRS_PER_PMD == 1 || !pgd)
-		return pgd;
+	pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL);
 
-	for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
-		pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
-		if (!pmd)
-			goto out_oom;
-		set_pgd(&pgd[i], __pgd(1 + __pa((u64)((u32)pmd))));
+	if (pgd) {
+		memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));
+		memcpy(pgd + USER_PTRS_PER_PGD,
+			swapper_pg_dir + USER_PTRS_PER_PGD,
+			(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
 	}
 	return pgd;
-
-out_oom:
-	for (i--; i >= 0; i--)
-		kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
-	kmem_cache_free(pgd_cache, pgd);
-	return NULL;
 }
 
 void pgd_free(pgd_t *pgd)
 {
-	int i;
-
-	/* in the PAE case user pgd entries are overwritten before usage */
-	if (PTRS_PER_PMD > 1)
-		for (i = 0; i < USER_PTRS_PER_PGD; ++i)
-			kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
-	/* in the non-PAE case, clear_page_tables() clears user pgd entries */
-	kmem_cache_free(pgd_cache, pgd);
+	free_page((unsigned long)pgd);
 }
+
+#endif /* CONFIG_X86_PAE */
+
diff -puN include/asm-i386/pgtable-3level.h~unslabify-pgds-and-pmds include/asm-i386/pgtable-3level.h
--- 25/include/asm-i386/pgtable-3level.h~unslabify-pgds-and-pmds	2004-03-19 17:24:38.641651152 -0800
+++ 25-akpm/include/asm-i386/pgtable-3level.h	2004-03-19 17:24:38.647650240 -0800
@@ -123,4 +123,6 @@ static inline pmd_t pfn_pmd(unsigned lon
 #define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) })
 #define PTE_FILE_MAX_BITS       32
 
+extern struct kmem_cache_s *pae_pgd_cachep;
+
 #endif /* _I386_PGTABLE_3LEVEL_H */
diff -puN include/asm-i386/pgtable.h~unslabify-pgds-and-pmds include/asm-i386/pgtable.h
--- 25/include/asm-i386/pgtable.h~unslabify-pgds-and-pmds	2004-03-19 17:24:38.643650848 -0800
+++ 25-akpm/include/asm-i386/pgtable.h	2004-03-19 17:24:38.648650088 -0800
@@ -21,27 +21,15 @@
 #include <asm/bitops.h>
 #endif
 
-#include <linux/slab.h>
-#include <linux/list.h>
-#include <linux/spinlock.h>
+extern pgd_t swapper_pg_dir[1024];
+extern void paging_init(void);
 
 /*
  * ZERO_PAGE is a global shared page that is always zero: used
  * for zero-mapped memory areas etc..
  */
-#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
 extern unsigned long empty_zero_page[1024];
-extern pgd_t swapper_pg_dir[1024];
-extern kmem_cache_t *pgd_cache;
-extern kmem_cache_t *pmd_cache;
-extern spinlock_t pgd_lock;
-extern struct list_head pgd_list;
-
-void pmd_ctor(void *, kmem_cache_t *, unsigned long);
-void pgd_ctor(void *, kmem_cache_t *, unsigned long);
-void pgd_dtor(void *, kmem_cache_t *, unsigned long);
-void pgtable_cache_init(void);
-void paging_init(void);
+#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
 
 #endif /* !__ASSEMBLY__ */
 
@@ -53,8 +41,20 @@ void paging_init(void);
 #ifndef __ASSEMBLY__
 #ifdef CONFIG_X86_PAE
 # include <asm/pgtable-3level.h>
+
+/*
+ * Need to initialise the X86 PAE caches
+ */
+extern void pgtable_cache_init(void);
+
 #else
 # include <asm/pgtable-2level.h>
+
+/*
+ * No page table caches to initialise
+ */
+#define pgtable_cache_init()	do { } while (0)
+
 #endif
 #endif
 

_