From: William Lee Irwin III <wli@holomorphy.com>

In case a more explicit description is required (the only difference
from before is stubbing out touch_all_pages() for CONFIG_DISCONTIGMEM):

(a) fixes oops on page->lru list poison in pgd_dtor() on PAE
(b) fixes pmd corruption due to userspace teardown not zeroing out
	the kernel pmd entries
(c) fixes pgd_ctor() performance problems on larger/PAE systems
(d) fixes touch_all_pages() boot-time triplefault on i386 discontig
(e) preserves pgd and pmd preconstruction
(f) preserves pgtable.c's #ifdef-lessness
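
The invariant behind (a) is worth spelling out: pgd_dtor() does a
list_del(), so it may only be installed where pgd_ctor() did the
matching list_add(), i.e. in the non-PAE, non-XKVA case; anywhere
else the slab destructor trips over ->lru list poison. A sketch that
just condenses the pgtable_cache_init() hunk below (PTRS_PER_PMD == 1
means non-PAE, TASK_SIZE > PAGE_OFFSET means XKVA):

	/*
	 * pgd_ctor() preconstructs everywhere except PAE+XKVA,
	 * where pgd_alloc() overwrites every pgd entry anyway.
	 */
	ctor = (PTRS_PER_PMD == 1 || TASK_SIZE <= PAGE_OFFSET)
			? pgd_ctor : NULL;
	/*
	 * Only non-PAE, non-XKVA pgds sit on pgd_list, so only
	 * they may take the list_del()ing destructor.
	 */
	dtor = (PTRS_PER_PMD == 1 && TASK_SIZE <= PAGE_OFFSET)
			? pgd_dtor : NULL;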

Successfully tested on a 28-CPU/56GB box (sorry, I lost a node to a
temporary hardware failure) with XKVA enabled. Without this patch (and
the cpumask_arith.h fixes) no runtime whatsoever is possible; in fact,
the touch_all_pages() issue takes the machine out well before
console_init().
vs. 2.6.0-test2-mm4
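
To make (e) concrete: slab ctors run when the cache grows, not on
every allocation, so kpmd_ctor() copies the shared kernel pmd entries
once per constructed object and pgd_alloc()'s hot path does no
memcpy() at all (that's the anti-cache-pollution point repeated in
the comments below). A condensed sketch of the allocation side, taken
from the pgd_alloc() hunk with error handling elided:

	/*
	 * kpmd_ctor() already installed the NR_SHARED_PMDS kernel
	 * entries when this object was first constructed, so there
	 * is nothing to copy here in the hot path.
	 */
	pmd_t *pmd = kmem_cache_alloc(kpmd_cache, GFP_KERNEL);
	set_pgd(&pgd[USER_PTRS_PER_PGD - 1],
			__pgd(1 + __pa((u64)((u32)pmd))));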



 arch/i386/mm/init.c        |   39 +++++++++-
 arch/i386/mm/pgtable.c     |  170 +++++++++++++++++++++++++++------------------
 include/asm-i386/pgtable.h |    4 -
 3 files changed, 142 insertions(+), 71 deletions(-)

diff -puN arch/i386/mm/init.c~4g4g-wli-fixes arch/i386/mm/init.c
--- 25/arch/i386/mm/init.c~4g4g-wli-fixes	2003-08-05 08:41:16.000000000 -0700
+++ 25-akpm/arch/i386/mm/init.c	2003-08-05 08:41:16.000000000 -0700
@@ -332,6 +332,11 @@ void __init zone_sizes_init(void)
 extern void zone_sizes_init(void);
 #endif /* !CONFIG_DISCONTIGMEM */
 
+#ifdef CONFIG_DISCONTIGMEM
+void __init touch_all_pages(void)
+{
+}
+#else
 void __init touch_all_pages(void)
 {
 	int i;
@@ -346,6 +351,7 @@ void __init touch_all_pages(void)
 
 	printk("done.\n");
 }
+#endif
 
 /*
  * paging_init() sets up the page tables - note that the first 8MB are
@@ -515,11 +521,13 @@ void __init mem_init(void)
 	load_LDT(&init_mm.context);
 }
 
-kmem_cache_t *pgd_cache;
-kmem_cache_t *pmd_cache;
+kmem_cache_t *pgd_cache, *pmd_cache, *kpmd_cache;
 
 void __init pgtable_cache_init(void)
 {
+	void (*ctor)(void *, kmem_cache_t *, unsigned long);
+	void (*dtor)(void *, kmem_cache_t *, unsigned long);
+
 	if (PTRS_PER_PMD > 1) {
 		pmd_cache = kmem_cache_create("pmd",
 					PTRS_PER_PMD*sizeof(pmd_t),
@@ -529,13 +537,36 @@ void __init pgtable_cache_init(void)
 					NULL);
 		if (!pmd_cache)
 			panic("pgtable_cache_init(): cannot create pmd cache");
+
+		if (TASK_SIZE > PAGE_OFFSET) {
+			kpmd_cache = kmem_cache_create("kpmd",
+					PTRS_PER_PMD*sizeof(pmd_t),
+					0,
+					SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
+					kpmd_ctor,
+					NULL);
+			if (!kpmd_cache)
+				panic("pgtable_cache_init(): "
+						"cannot create kpmd cache");
+		}
 	}
+
+	if (PTRS_PER_PMD == 1 || TASK_SIZE <= PAGE_OFFSET)
+		ctor = pgd_ctor;
+	else
+		ctor = NULL;
+
+	if (PTRS_PER_PMD == 1 && TASK_SIZE <= PAGE_OFFSET)
+		dtor = pgd_dtor;
+	else
+		dtor = NULL;
+
 	pgd_cache = kmem_cache_create("pgd",
 				PTRS_PER_PGD*sizeof(pgd_t),
 				0,
 				SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
-				pgd_ctor,
-				pgd_dtor);
+				ctor,
+				dtor);
 	if (!pgd_cache)
 		panic("pgtable_cache_init(): Cannot create pgd cache");
 }
diff -puN arch/i386/mm/pgtable.c~4g4g-wli-fixes arch/i386/mm/pgtable.c
--- 25/arch/i386/mm/pgtable.c~4g4g-wli-fixes	2003-08-05 08:41:16.000000000 -0700
+++ 25-akpm/arch/i386/mm/pgtable.c	2003-08-05 08:41:16.000000000 -0700
@@ -158,6 +158,17 @@ void pmd_ctor(void *pmd, kmem_cache_t *c
 	memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
 }
 
+void kpmd_ctor(void *__pmd, kmem_cache_t *cache, unsigned long flags)
+{
+	pmd_t *kpmd, *pmd;
+	kpmd = pmd_offset(&swapper_pg_dir[PTRS_PER_PGD-1],
+				(PTRS_PER_PMD - NR_SHARED_PMDS)*PMD_SIZE);
+	pmd = (pmd_t *)__pmd + (PTRS_PER_PMD - NR_SHARED_PMDS);
+
+	memset(__pmd, 0, (PTRS_PER_PMD - NR_SHARED_PMDS)*sizeof(pmd_t));
+	memcpy(pmd, kpmd, NR_SHARED_PMDS*sizeof(pmd_t));
+}
+
 /*
  * List of all pgd's needed so it can invalidate entries in both cached
  * and uncached pgd's. This is essentially codepath-based locking
@@ -169,24 +180,60 @@ void pmd_ctor(void *pmd, kmem_cache_t *c
  * could be used. The locking scheme was chosen on the basis of
  * manfred's recommendations and having no core impact whatsoever.
  * -- wli
+ *
+ * The entire issue goes away when XKVA is configured.
  */
 spinlock_t pgd_lock = SPIN_LOCK_UNLOCKED;
 LIST_HEAD(pgd_list);
 
+/*
+ * This is not that hard to figure out.
+ * (a) PTRS_PER_PMD == 1 means non-PAE.
+ * (b) PTRS_PER_PMD > 1 means PAE.
+ * (c) TASK_SIZE > PAGE_OFFSET means XKVA.
+ * (d) TASK_SIZE <= PAGE_OFFSET means non-XKVA.
+ *
+ * Do *NOT* back out the preconstruction like the patch I'm cleaning
+ * up after this very instant did, or at all, for that matter.
+ * This is never called when PTRS_PER_PMD > 1 && TASK_SIZE > PAGE_OFFSET.
+ * -- wli
+ */
 void pgd_ctor(void *__pgd, kmem_cache_t *cache, unsigned long unused)
 {
+	pgd_t *pgd = (pgd_t *)__pgd;
 	unsigned long flags;
-	pgd_t *pgd0 = __pgd;
+
+	if (PTRS_PER_PMD == 1) {
+		if (TASK_SIZE <= PAGE_OFFSET)
+			spin_lock_irqsave(&pgd_lock, flags);
+		else
+ 			memcpy(&pgd[PTRS_PER_PGD - NR_SHARED_PMDS],
+ 				&swapper_pg_dir[PTRS_PER_PGD - NR_SHARED_PMDS],
+ 				NR_SHARED_PMDS * sizeof(pgd_t));
+	}
+
+	if (TASK_SIZE <= PAGE_OFFSET)
+ 		memcpy(pgd + USER_PTRS_PER_PGD,
+ 			swapper_pg_dir + USER_PTRS_PER_PGD,
+ 			(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
 
 	if (PTRS_PER_PMD > 1)
 		return;
 
-	spin_lock_irqsave(&pgd_lock, flags);
-	list_add(&virt_to_page(pgd0)->lru, &pgd_list);
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	if (TASK_SIZE > PAGE_OFFSET)
+		memset(pgd, 0, (PTRS_PER_PGD - NR_SHARED_PMDS)*sizeof(pgd_t));
+	else {
+		list_add(&virt_to_page(pgd)->lru, &pgd_list);
+		spin_unlock_irqrestore(&pgd_lock, flags);
+		memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
+	}
 }
 
-/* never called when PTRS_PER_PMD > 1 */
+/*
+ * Never called when PTRS_PER_PMD > 1 || TASK_SIZE > PAGE_OFFSET:
+ * with PAE we would list_del() multiple times, and with non-PAE
+ * XKVA all the AGP pgd shootdown code is unnecessary.
+ */
 void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
 {
 	unsigned long flags; /* can be called from interrupt context */
@@ -196,87 +243,80 @@ void pgd_dtor(void *pgd, kmem_cache_t *c
 	spin_unlock_irqrestore(&pgd_lock, flags);
 }
 
-#ifdef CONFIG_X86_PAE
-
+/*
+ * See the comments above pgd_ctor() wrt. preconstruction.
+ * Do *NOT* memcpy() here. If you do, you back out important
+ * anti-cache-pollution code (the preconstruction does the
+ * copying once per constructed object instead).
+ */
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	int i;
 	pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
 
-	if (pgd) {
-#ifdef CONFIG_X86_4G_VM_LAYOUT
-		pmd_t *pmd0, *kernel_pmd0;
-#endif
-		pmd_t *pmd;
+	if (PTRS_PER_PMD == 1 || !pgd)
+		return pgd;
 
-		for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
-			pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
-			if (!pmd)
-				goto out_oom;
-			set_pgd(&pgd[i], __pgd(1 + __pa((u64)((u32)pmd))));
-		}
+	/*
+	 * In the 4G userspace case alias the top 16 MB virtual
+	 * memory range into the user mappings as well (these
+	 * include the trampoline and CPU data structures).
+	 */
+	for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
+		kmem_cache_t *cache;
+		pmd_t *pmd;
 
-#ifdef CONFIG_X86_4G_VM_LAYOUT
-		/*
-		 * In the 4G userspace case alias the top 16 MB virtual
-		 * memory range into the user mappings as well (these
-		 * include the trampoline and CPU data structures).
-		 */
-		pmd0 = pmd;
-		kernel_pmd0 = (pmd_t *)__va(pgd_val(swapper_pg_dir[PTRS_PER_PGD-1]) & PAGE_MASK);
-		memcpy(pmd0 + PTRS_PER_PMD - NR_SHARED_PMDS, kernel_pmd0 + PTRS_PER_PMD - NR_SHARED_PMDS, sizeof(pmd_t) * NR_SHARED_PMDS);
-#else
-		memcpy(pgd + USER_PTRS_PER_PGD,
-			swapper_pg_dir + USER_PTRS_PER_PGD,
-			(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
-#endif
+		if (TASK_SIZE > PAGE_OFFSET && i == USER_PTRS_PER_PGD - 1)
+			cache = kpmd_cache;
+		else
+			cache = pmd_cache;
+
+		pmd = kmem_cache_alloc(cache, GFP_KERNEL);
+		if (!pmd)
+			goto out_oom;
+		set_pgd(&pgd[i], __pgd(1 + __pa((u64)((u32)pmd))));
 	}
+
 	return pgd;
 out_oom:
+	/*
+	 * We don't have to handle kpmd_cache here: the kpmd is the
+	 * last allocation, so either it failed and there is nothing
+	 * of its own to free, or the whole operation succeeded.
+	 */
 	for (i--; i >= 0; i--)
 		kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
 	kmem_cache_free(pgd_cache, pgd);
 	return NULL;
 }
 
-#else /* ! PAE */
-
-pgd_t *pgd_alloc(struct mm_struct *mm)
-{
-	pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
-
-	if (pgd) {
-#ifdef CONFIG_X86_4G_VM_LAYOUT
-		memset(pgd, 0, PTRS_PER_PGD * sizeof(pgd_t));
-		/*
-		 * In the 4G userspace case alias the top 16 MB virtual
-		 * memory range into the user mappings as well (these
-		 * include the trampoline and CPU data structures).
-		 */
- 		memcpy(pgd + PTRS_PER_PGD-NR_SHARED_PMDS,
- 			swapper_pg_dir + PTRS_PER_PGD-NR_SHARED_PMDS,
- 			NR_SHARED_PMDS * sizeof(pgd_t));
-#else
- 		memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));
- 		memcpy(pgd + USER_PTRS_PER_PGD,
- 			swapper_pg_dir + USER_PTRS_PER_PGD,
- 			(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
-#endif
-	}
-	return pgd;
-}
-
-#endif /* CONFIG_X86_PAE */
-
 void pgd_free(pgd_t *pgd)
 {
 	int i;
 
-	/* in the PAE case user pgd entries are overwritten before usage */
-	if (PTRS_PER_PMD > 1)
-		for (i = 0; i < USER_PTRS_PER_PGD; ++i)
-			kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
 	/* in the non-PAE case, clear_page_tables() clears user pgd entries */
+	if (PTRS_PER_PMD == 1)
+		goto out_free;
+
+	/* in the PAE case user pgd entries are overwritten before usage */
+	for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
+		kmem_cache_t *cache;
+		pmd_t *pmd = __va(pgd_val(pgd[i]) - 1);
+
+		/*
+		 * Only userspace pmds are cleared for us by
+		 * mm/memory.c; it's a slab cache invariant that
+		 * the kernel pmd is kept in its separate slab at
+		 * all times, else we'll have bad pmds.
+		 */
+		if (TASK_SIZE > PAGE_OFFSET && i == USER_PTRS_PER_PGD - 1)
+			cache = kpmd_cache;
+		else
+			cache = pmd_cache;
+
+		kmem_cache_free(cache, pmd);
+	}
+out_free:
 	kmem_cache_free(pgd_cache, pgd);
 }
 
diff -puN include/asm-i386/pgtable.h~4g4g-wli-fixes include/asm-i386/pgtable.h
--- 25/include/asm-i386/pgtable.h~4g4g-wli-fixes	2003-08-05 08:41:16.000000000 -0700
+++ 25-akpm/include/asm-i386/pgtable.h	2003-08-05 08:41:16.000000000 -0700
@@ -32,12 +32,12 @@
 #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
 extern unsigned long empty_zero_page[1024];
 extern pgd_t swapper_pg_dir[1024];
-extern kmem_cache_t *pgd_cache;
-extern kmem_cache_t *pmd_cache;
+extern kmem_cache_t *pgd_cache, *pmd_cache, *kpmd_cache;
 extern spinlock_t pgd_lock;
 extern struct list_head pgd_list;
 
 void pmd_ctor(void *, kmem_cache_t *, unsigned long);
+void kpmd_ctor(void *, kmem_cache_t *, unsigned long);
 void pgd_ctor(void *, kmem_cache_t *, unsigned long);
 void pgd_dtor(void *, kmem_cache_t *, unsigned long);
 void pgtable_cache_init(void);

_