From: Anton Blanchard <anton@samba.org>

The current SLB handling code has a number of problems:

- We loop trying to find an empty SLB entry before deciding to cast one
  out.  On large working sets this really hurts since the SLB is always full
  and we end up looping through all 64 entries unnecessarily.

- During castout we currently invalidate the entry we are replacing.  This
  is to avoid a nasty race where the entry is in the ERAT but not the SLB and
  another CPU does a tlbie that removes the ERAT entry at a critical point.
  If this race is avoided, the slbie at castout can be removed.

- The SLB prefault code doesn't work properly.

The following patch addresses all the above concerns and adds some more
optimisations:

- use CPU feature sections to nop out segment-table-only code on SLB machines

- slbie the kernel stack segment on context switch (avoids having to slbie
  at each castout; see the sketch below)

- optimise the flush on context switch: the lazy TLB code avoids it being
  called when going from userspace to a kernel thread, but it still gets
  called when going from a kernel thread back to userspace.  In many cases
  we are returning to the same userspace task, so we now check for this and
  avoid the flush

- use the optimised POWER4 mtcrf where possible
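
For reference, here is a small standalone C model of the new castout policy
(this is not kernel code: pick_castout(), the slb[] array and the demo ESID
value are illustrative; SLB_NUM_ENTRIES and the round-robin index mirror the
patch).  It shows the choice made in do_slb_bolted/make_slbe: take the next
entry round robin, never reuse the bolted entry 0, and never evict the entry
mapping the current kernel stack.  No slbie is needed at castout because the
kernel stack segment is now flushed at context switch instead.

#include <stdio.h>

#define SLB_NUM_ENTRIES	64

struct slb_entry {
	unsigned long esid;
	int valid;
};

static struct slb_entry slb[SLB_NUM_ENTRIES];
static unsigned long next_round_robin = 1;	/* models PACASTABRR */

static unsigned long pick_castout(unsigned long stack_esid)
{
	unsigned long entry;

	/*
	 * Take the next entry, round robin.  Wrap back to 1 so the
	 * bolted entry 0 is never replaced, and skip any entry that
	 * currently maps the kernel stack segment.
	 */
	do {
		entry = next_round_robin++;
		if (next_round_robin >= SLB_NUM_ENTRIES)
			next_round_robin = 1;
	} while (slb[entry].valid && slb[entry].esid == stack_esid);

	return entry;
}

int main(void)
{
	unsigned long stack_esid = 0xc00;	/* arbitrary ESID for the demo */
	int i;

	/* pretend entry 2 currently holds the kernel stack segment */
	slb[2].valid = 1;
	slb[2].esid = stack_esid;

	for (i = 0; i < 5; i++)
		printf("castout entry %lu\n", pick_castout(stack_esid));

	return 0;
}

Built with a plain C compiler this prints the sequence of castout slots,
skipping the slot that holds the stack segment.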



---

 arch/ppc64/kernel/head.S        |  157 ++++++++------
 arch/ppc64/kernel/pacaData.c    |    1 
 arch/ppc64/kernel/process.c     |   24 ++
 arch/ppc64/kernel/stab.c        |  432 ++++++++++++++++++++++------------------
 include/asm-ppc64/cputable.h    |    9 
 include/asm-ppc64/mmu.h         |    8 
 include/asm-ppc64/mmu_context.h |   11 -
 include/asm-ppc64/paca.h        |   23 --
 8 files changed, 384 insertions(+), 281 deletions(-)

diff -puN arch/ppc64/kernel/head.S~ppc64-slb_rewrite arch/ppc64/kernel/head.S
--- 25/arch/ppc64/kernel/head.S~ppc64-slb_rewrite	2004-01-26 19:44:52.000000000 -0800
+++ 25-akpm/arch/ppc64/kernel/head.S	2004-01-26 19:44:52.000000000 -0800
@@ -646,12 +646,14 @@ fast_exception_return:
  */
 	.globl DataAccess_common
 DataAccess_common:
+BEGIN_FTR_SECTION
 	mfspr   r22,DAR
 	srdi    r22,r22,60
 	cmpi    0,r22,0xc
 
 	/* Segment fault on a bolted segment. Go off and map that segment. */
 	beq-	.do_stab_bolted
+END_FTR_SECTION_IFCLR(CPU_FTR_SLB)
 stab_bolted_user_return:
 	EXCEPTION_PROLOG_COMMON
 	ld      r3,_DSISR(r1)
@@ -661,10 +663,12 @@ stab_bolted_user_return:
 	rlwinm	r4,r3,32-23,29,29	/* DSISR_STORE -> _PAGE_RW */
 	ld      r3,_DAR(r1)             /* into the hash table */
 
+BEGIN_FTR_SECTION
 	beq+	2f			/* If so handle it */
 	li	r4,0x300                /* Trap number */
 	bl	.do_stab_SI
 	b	1f
+END_FTR_SECTION_IFCLR(CPU_FTR_SLB)
 
 2:	li	r5,0x300
 	bl	.do_hash_page_DSI 	/* Try to handle as hpte fault */
@@ -690,7 +694,7 @@ DataAccessSLB_common:
 	EXCEPTION_PROLOG_COMMON
 	ld      r3,_DAR(r1)
 	li      r4,0x380                /* Exception vector  */
-	bl	.ste_allocate
+	bl	.slb_allocate
 	or.	r3,r3,r3		/* Check return code */
 	beq     fast_exception_return   /* Return if we succeeded */
 	addi	r3,r1,STACK_FRAME_OVERHEAD
@@ -705,12 +709,14 @@ DataAccessSLB_common:
 InstructionAccess_common:
 	EXCEPTION_PROLOG_COMMON
 
+BEGIN_FTR_SECTION
 	andis.	r0,r23,0x0020		/* no ste found? */
 	beq+	2f
 	mr	r3,r22			/* SRR0 at interrupt */
 	li	r4,0x400		/* Trap number       */
 	bl	.do_stab_SI
 	b	1f
+END_FTR_SECTION_IFCLR(CPU_FTR_SLB)
 
 2:	mr	r3,r22
 	li	r5,0x400
@@ -730,7 +736,7 @@ InstructionAccessSLB_common:
 	EXCEPTION_PROLOG_COMMON
 	mr      r3,r22                  /* SRR0 = NIA        */
 	li	r4,0x480                /* Exception vector  */
-	bl	.ste_allocate
+	bl	.slb_allocate
 	or.	r3,r3,r3		/* Check return code */
 	beq+	fast_exception_return   /* Return if we succeeded */
 
@@ -1006,48 +1012,27 @@ _GLOBAL(do_stab_bolted)
  * r20 - r23, SRR0 and SRR1 are saved in the exception frame.
  * We assume we aren't going to take any exceptions during this procedure.
  */
+/* XXX note fix masking in get_kernel_vsid to match */
 _GLOBAL(do_slb_bolted)
-	stw     r23,EX_CCR(r21) /* save CR in exc. frame */
+	stw	r23,EX_CCR(r21)		/* save CR in exc. frame */
 
-	/* (((ea >> 28) & 0x1fff) << 15) | (ea >> 60) */
-	mfspr	r21,DAR
-	rldicl  r20,r21,36,32   /* Permits a full 32b of ESID */
-	rldicr  r20,r20,15,48
-	rldicl  r21,r21,4,60
-	or      r20,r20,r21
-
-	li      r21,9           /* VSID_RANDOMIZER */
-	sldi    r21,r21,32
-	oris    r21,r21,58231
-	ori     r21,r21,39831
-
-	mulld   r20,r20,r21
-	clrldi  r20,r20,28      /* r20 = vsid */
-
-	/* Search the SLB for a free entry */
-	li      r22,1
-1:
-	slbmfee	r23,r22
-	rldicl  r23,r23,37,63
-	cmpwi   r23,0
-	beq     4f              /* Found an invalid entry              */
-
-	addi	r22,r22,1
-	cmpldi	r22,64
-	blt	1b
+	/*
+	 * We take the next entry, round robin. Previously we tried
+	 * to find a free slot first but that took too long. Unfortunately
+	 * we dont have any LRU information to help us choose a slot.
+	 */
 
-	/* No free entry - just take the next entry, round-robin */
-	/* XXX we should get the number of SLB entries from the naca */
+	/* r20 = paca */
+	/* use a cpu feature mask if we ever change our slb size */
 SLB_NUM_ENTRIES = 64
-2:	mfspr	r21,SPRG3
-	ld	r22,PACASTABRR(r21)
-	addi	r23,r22,1
-	cmpdi	r23,SLB_NUM_ENTRIES
-	blt	3f
-	li	r23,1
-3:	std	r23,PACASTABRR(r21)
+1:	ld	r22,PACASTABRR(r20)
+	addi	r21,r22,1
+	cmpdi	r21,SLB_NUM_ENTRIES
+	blt+	2f
+	li	r21,1			/* dont touch bolted slot 0 */
+2:	std	r21,PACASTABRR(r20)
 
-	/* r20 = vsid, r22 = entry */
+	/* r20 = paca, r22 = entry */
 
 	/* 
 	 * Never cast out the segment for our kernel stack. Since we
@@ -1056,48 +1041,86 @@ SLB_NUM_ENTRIES = 64
 	 * which gets invalidated due to a tlbie from another cpu at a
 	 * non recoverable point (after setting srr0/1) - Anton
 	 */
-	slbmfee	r23,r22
-	srdi	r23,r23,28
+	slbmfee	r21,r22
+	srdi	r21,r21,27
 	/*
 	 * This is incorrect (r1 is not the kernel stack) if we entered
 	 * from userspace but there is no critical window from userspace
 	 * so this should be OK. Also if we cast out the userspace stack
 	 * segment while in userspace we will fault it straight back in.
 	 */
-	srdi	r21,r1,28
-	cmpd	r21,r23
-	beq-	2b
-
-	/* Put together the vsid portion of the entry. */
-4:	li      r21,0
-	rldimi  r21,r20,12,0
-	ori     r20,r21,1024
-	ori	r20,r20,128    /* set class bit for kernel region */
-#ifndef CONFIG_PPC_ISERIES
-	ori	r20,r20,256    /* map kernel region with large ptes */
-#endif
+	srdi	r23,r1,27
+	ori	r23,r23,1
+	cmpd	r23,r21
+	beq-	1b
+
+	/* r20 = paca, r22 = entry */
+
+	/* (((ea >> 28) & 0x1fff) << 15) | (ea >> 60) */
+	mfspr	r21,DAR
+	rldicl	r23,r21,36,51
+	sldi	r23,r23,15
+	srdi	r21,r21,60
+	or	r23,r23,r21
+
+	/* VSID_RANDOMIZER */
+	li	r21,9
+	sldi	r21,r21,32
+	oris	r21,r21,58231
+	ori	r21,r21,39831
+
+	/* vsid = (ordinal * VSID_RANDOMIZER) & VSID_MASK */
+	mulld	r23,r23,r21
+	clrldi	r23,r23,28
+
+	/* r20 = paca, r22 = entry, r23 = vsid */
+
+	/* Put together slb word1 */
+	sldi	r23,r23,12
+
+BEGIN_FTR_SECTION
+	/* set kp and c bits */
+	ori	r23,r23,0x480
+END_FTR_SECTION_IFCLR(CPU_FTR_16M_PAGE)
+BEGIN_FTR_SECTION
+	/* set kp, l and c bits */
+	ori	r23,r23,0x580
+END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
+
+	/* r20 = paca, r22 = entry, r23 = slb word1 */
 
-	/* Put together the esid portion of the entry. */
-	mfspr	r21,DAR        /* Get the new esid                     */
-	rldicl  r21,r21,36,28  /* Permits a full 36b of ESID           */
-	li      r23,0
-	rldimi  r23,r21,28,0   /* Insert esid  */
-	oris    r21,r23,2048   /* valid bit    */
-	rldimi  r21,r22,0,52   /* Insert entry */
+	/* Put together slb word0 */
+	mfspr	r21,DAR
+	rldicr	r21,r21,0,35	/* get the new esid */
+	oris	r21,r21,2048	/* set valid bit */
+	rldimi	r21,r22,0,52	/* insert entry */
+
+	/* r20 = paca, r21 = slb word0, r23 = slb word1 */
 
 	/* 
 	 * No need for an isync before or after this slbmte. The exception
 	 * we enter with and the rfid we exit with are context synchronizing .
 	 */
-	slbmte  r20,r21
+	slbmte	r23,r21
 
 	/* All done -- return from exception. */
-	mfsprg  r20,3                   /* Load the PACA pointer  */
-	ld      r21,PACAEXCSP(r20)      /* Get the exception frame pointer */
-	addi    r21,r21,EXC_FRAME_SIZE
+	ld	r21,PACAEXCSP(r20)	/* Get the exception frame pointer */
+	addi	r21,r21,EXC_FRAME_SIZE
 	lwz	r23,EX_CCR(r21)		/* get saved CR */
 	/* note that this is almost identical to maskable_exception_exit */
-	mtcr    r23                     /* restore CR */
+
+	/*
+	 * Until everyone updates binutils hardwire the POWER4 optimised
+	 * single field mtcrf
+	 */
+#if 0
+	.machine	push
+	.machine	"power4"
+	mtcrf	0x80,r23
+	.machine	pop
+#else
+	.long 0x7ef80120
+#endif
 
 	mfmsr	r22
 	li	r23, MSR_RI
@@ -1107,10 +1130,10 @@ SLB_NUM_ENTRIES = 64
 	ld	r22,EX_SRR0(r21)	/* Get SRR0 from exc. frame */
 	ld	r23,EX_SRR1(r21)	/* Get SRR1 from exc. frame */
 	mtspr	SRR0,r22
-	mtspr   SRR1,r23
+	mtspr	SRR1,r23
 	ld	r22,EX_R22(r21)		/* restore r22 and r23 */
 	ld	r23,EX_R23(r21)
-	mfspr	r20,SPRG2
+	ld	r20,EX_R20(r21)
 	mfspr	r21,SPRG1
 	rfid
 
diff -puN arch/ppc64/kernel/pacaData.c~ppc64-slb_rewrite arch/ppc64/kernel/pacaData.c
--- 25/arch/ppc64/kernel/pacaData.c~ppc64-slb_rewrite	2004-01-26 19:44:52.000000000 -0800
+++ 25-akpm/arch/ppc64/kernel/pacaData.c	2004-01-26 19:44:52.000000000 -0800
@@ -41,7 +41,6 @@ struct systemcfg *systemcfg;
 	.xStab_data = {							    \
 		.real = (asrr),		/* Real pointer to segment table */ \
 		.virt = (asrv),		/* Virt pointer to segment table */ \
-		.next_round_robin = 1	/* Round robin index */		    \
 	},								    \
 	.lpQueuePtr = (lpq),		/* &xItLpQueue, */		    \
 	/* .xRtas = {							    \
diff -puN arch/ppc64/kernel/process.c~ppc64-slb_rewrite arch/ppc64/kernel/process.c
--- 25/arch/ppc64/kernel/process.c~ppc64-slb_rewrite	2004-01-26 19:44:52.000000000 -0800
+++ 25-akpm/arch/ppc64/kernel/process.c	2004-01-26 19:44:52.000000000 -0800
@@ -151,7 +151,31 @@ struct task_struct *__switch_to(struct t
 
 	local_irq_save(flags);
 	last = _switch(old_thread, new_thread);
+
+	/*
+	 * force our kernel stack out of the ERAT and SLB, this is to
+	 * avoid the race where it hangs around in the ERAT but not the
+	 * SLB and the ERAT gets invalidated at just the wrong moment by
+	 * another CPU doing a tlbie.
+	 *
+	 * We definitely dont want to flush our bolted segment, so check
+	 * for that first.
+	 */
+	if ((cur_cpu_spec->cpu_features & CPU_FTR_SLB) &&
+	    GET_ESID((unsigned long)_get_SP()) != GET_ESID(PAGE_OFFSET)) {
+		union {
+			unsigned long word0;
+			slb_dword0 data;
+		} esid_data;
+
+		esid_data.word0 = 0;
+		/* class bit is in valid field for slbie instruction */
+		esid_data.data.v = 1;
+		esid_data.data.esid = GET_ESID((unsigned long)_get_SP());
+		asm volatile("isync; slbie %0; isync" : : "r" (esid_data));
+	}
 	local_irq_restore(flags);
+
 	return last;
 }
 
diff -puN arch/ppc64/kernel/stab.c~ppc64-slb_rewrite arch/ppc64/kernel/stab.c
--- 25/arch/ppc64/kernel/stab.c~ppc64-slb_rewrite	2004-01-26 19:44:52.000000000 -0800
+++ 25-akpm/arch/ppc64/kernel/stab.c	2004-01-26 19:44:52.000000000 -0800
@@ -12,8 +12,6 @@
  *      2 of the License, or (at your option) any later version.
  */
 
-/* XXX Note: Changes for bolted region have not been merged - Anton */
-
 #include <linux/config.h>
 #include <asm/pgtable.h>
 #include <asm/mmu.h>
@@ -59,6 +57,15 @@ void stab_initialize(unsigned long stab)
 	}
 }
 
+/* Both the segment table and SLB code uses the following cache */
+#define NR_STAB_CACHE_ENTRIES 8
+DEFINE_PER_CPU(long, stab_cache_ptr);
+DEFINE_PER_CPU(long, stab_cache[NR_STAB_CACHE_ENTRIES]);
+
+/*
+ * Segment table stuff
+ */
+
 /*
  * Create a segment table entry for the given esid/vsid pair.
  */
@@ -91,14 +98,8 @@ int make_ste(unsigned long stab, unsigne
 
 	/*
 	 * Could not find empty entry, pick one with a round robin selection.
-	 * Search all entries in the two groups.  Note that the first time
-	 * we get here, we start with entry 1 so the initializer
-	 * can be common with the SLB castout code.
+	 * Search all entries in the two groups.
 	 */
-
-	/* This assumes we never castout when initializing the stab. */
-	PMC_SW_PROCESSOR(stab_capacity_castouts); 
-
 	castout_entry = get_paca()->xStab_data.next_round_robin;
 	for (i = 0; i < 16; i++) {
 		if (castout_entry < 8) {
@@ -123,23 +124,169 @@ int make_ste(unsigned long stab, unsigne
 	/* Modify the old entry to the new value. */
 
 	/* Force previous translations to complete. DRENG */
-	asm volatile("isync" : : : "memory" );
+	asm volatile("isync" : : : "memory");
 
 	castout_ste->dw0.dw0.v = 0;
-	asm volatile("sync" : : : "memory" );    /* Order update */
+	asm volatile("sync" : : : "memory");    /* Order update */
 	castout_ste->dw1.dw1.vsid = vsid;
 	old_esid = castout_ste->dw0.dw0.esid;
 	castout_ste->dw0.dw0.esid = esid;
 	castout_ste->dw0.dw0.kp = 1;
-	asm volatile("eieio" : : : "memory" );   /* Order update */
+	asm volatile("eieio" : : : "memory");   /* Order update */
 	castout_ste->dw0.dw0.v  = 1;
 	asm volatile("slbie  %0" : : "r" (old_esid << SID_SHIFT)); 
 	/* Ensure completion of slbie */
-	asm volatile("sync" : : : "memory" );
+	asm volatile("sync" : : : "memory");
 
 	return (global_entry | (castout_entry & 0x7));
 }
 
+static inline void __ste_allocate(unsigned long esid, unsigned long vsid,
+				  mm_context_t context)
+{
+	unsigned char stab_entry;
+	unsigned long *offset;
+	int region_id = REGION_ID(esid << SID_SHIFT);
+
+	stab_entry = make_ste(get_paca()->xStab_data.virt, esid, vsid);
+
+	if (region_id != USER_REGION_ID)
+		return;
+
+	offset = &__get_cpu_var(stab_cache_ptr);
+	if (*offset < NR_STAB_CACHE_ENTRIES) {
+		__get_cpu_var(stab_cache[*offset]) = stab_entry;
+	}
+	(*offset)++;
+}
+
+/*
+ * Allocate a segment table entry for the given ea.
+ */
+int ste_allocate(unsigned long ea)
+{
+	unsigned long vsid, esid;
+	mm_context_t context;
+
+	/* Check for invalid effective addresses. */
+	if (!IS_VALID_EA(ea))
+		return 1;
+
+	/* Kernel or user address? */
+	if (REGION_ID(ea) >= KERNEL_REGION_ID) {
+		vsid = get_kernel_vsid(ea);
+		context = REGION_ID(ea);
+	} else {
+		if (!current->mm)
+			return 1;
+
+		context = current->mm->context;
+		vsid = get_vsid(context, ea);
+	}
+
+	esid = GET_ESID(ea);
+	__ste_allocate(esid, vsid, context);
+	/* Order update */
+	asm volatile("sync":::"memory");
+
+	return 0;
+}
+
+/*
+ * preload some userspace segments into the segment table.
+ */
+static void preload_stab(struct task_struct *tsk, struct mm_struct *mm)
+{
+	unsigned long pc = KSTK_EIP(tsk);
+	unsigned long stack = KSTK_ESP(tsk);
+	unsigned long unmapped_base;
+	unsigned long pc_esid = GET_ESID(pc);
+	unsigned long stack_esid = GET_ESID(stack);
+	unsigned long unmapped_base_esid;
+	unsigned long vsid;
+
+	if (test_tsk_thread_flag(tsk, TIF_32BIT))
+		unmapped_base = TASK_UNMAPPED_BASE_USER32;
+	else
+		unmapped_base = TASK_UNMAPPED_BASE_USER64;
+
+	unmapped_base_esid = GET_ESID(unmapped_base);
+
+	if (!IS_VALID_EA(pc) || (REGION_ID(pc) >= KERNEL_REGION_ID))
+		return;
+	vsid = get_vsid(mm->context, pc);
+	__ste_allocate(pc_esid, vsid, mm->context);
+
+	if (pc_esid == stack_esid)
+		return;
+
+	if (!IS_VALID_EA(stack) || (REGION_ID(stack) >= KERNEL_REGION_ID))
+		return;
+	vsid = get_vsid(mm->context, stack);
+	__ste_allocate(stack_esid, vsid, mm->context);
+
+	if (pc_esid == unmapped_base_esid || stack_esid == unmapped_base_esid)
+		return;
+
+	if (!IS_VALID_EA(unmapped_base) ||
+	    (REGION_ID(unmapped_base) >= KERNEL_REGION_ID))
+		return;
+	vsid = get_vsid(mm->context, unmapped_base);
+	__ste_allocate(unmapped_base_esid, vsid, mm->context);
+
+	/* Order update */
+	asm volatile("sync" : : : "memory");
+}
+
+/* Flush all user entries from the segment table of the current processor. */
+void flush_stab(struct task_struct *tsk, struct mm_struct *mm)
+{
+	STE *stab = (STE *) get_paca()->xStab_data.virt;
+	STE *ste;
+	unsigned long *offset = &__get_cpu_var(stab_cache_ptr);
+
+	/* Force previous translations to complete. DRENG */
+	asm volatile("isync" : : : "memory");
+
+	if (*offset <= NR_STAB_CACHE_ENTRIES) {
+		int i;
+
+		for (i = 0; i < *offset; i++) {
+			ste = stab + __get_cpu_var(stab_cache[i]);
+			ste->dw0.dw0.v = 0;
+		}
+
+		asm volatile("sync; slbia; sync":::"memory");
+	} else {
+		unsigned long entry;
+
+		/* Invalidate all entries. */
+		ste = stab;
+
+		/* Never flush the first entry. */
+		ste += 1;
+		for (entry = 1;
+		     entry < (PAGE_SIZE / sizeof(STE));
+		     entry++, ste++) {
+			unsigned long ea;
+			ea = ste->dw0.dw0.esid << SID_SHIFT;
+			if (ea < KERNELBASE) {
+				ste->dw0.dw0.v = 0;
+			}
+		}
+
+		asm volatile("sync; slbia; sync":::"memory");
+	}
+
+	*offset = 0;
+
+	preload_stab(tsk, mm);
+}
+
+/*
+ * SLB stuff
+ */
+
 /*
  * Create a segment buffer entry for the given esid/vsid pair.
  *
@@ -160,22 +307,11 @@ void make_slbe(unsigned long esid, unsig
 	} vsid_data;
 
 	/*
-	 * Find an empty entry, if one exists. Must start at 0 because
-	 * we use this code to load SLB entry 0 at boot.
-	 */
-	for (entry = 0; entry < naca->slb_size; entry++) {
-		asm volatile("slbmfee  %0,%1" 
-			     : "=r" (esid_data) : "r" (entry)); 
-		if (!esid_data.data.v)
-			goto write_entry;
-	}
-
-	/*
-	 * Could not find empty entry, pick one with a round robin selection.
+	 * We take the next entry, round robin. Previously we tried
+	 * to find a free slot first but that took too long. Unfortunately
+	 * we dont have any LRU information to help us choose a slot.
 	 */
 
-	PMC_SW_PROCESSOR(stab_capacity_castouts); 
-
 	/* 
 	 * Never cast out the segment for our kernel stack. Since we
 	 * dont invalidate the ERAT we could have a valid translation
@@ -190,13 +326,13 @@ void make_slbe(unsigned long esid, unsig
 		if (castout_entry >= naca->slb_size)
 			castout_entry = 1; 
 		asm volatile("slbmfee  %0,%1" : "=r" (esid_data) : "r" (entry));
-	} while (esid_data.data.esid == GET_ESID((unsigned long)_get_SP()));
+	} while (esid_data.data.v &&
+		 esid_data.data.esid == GET_ESID((unsigned long)_get_SP()));
 
 	get_paca()->xStab_data.next_round_robin = castout_entry;
 
 	/* slbie not needed as the previous mapping is still valid. */
 
-write_entry:	
 	/* 
 	 * Write the new SLB entry.
 	 */
@@ -220,211 +356,129 @@ write_entry:	
 	asm volatile("slbmte  %0,%1" : : "r" (vsid_data), "r" (esid_data)); 
 }
 
-static inline void __ste_allocate(unsigned long esid, unsigned long vsid,
-				  int kernel_segment, mm_context_t context)
+static inline void __slb_allocate(unsigned long esid, unsigned long vsid,
+				  mm_context_t context)
 {
-	if (cur_cpu_spec->cpu_features & CPU_FTR_SLB) {
-		int large = 0;
+	int large = 0;
+	int region_id = REGION_ID(esid << SID_SHIFT);
+	unsigned long *offset;
 
-#ifndef CONFIG_PPC_ISERIES
-		if (REGION_ID(esid << SID_SHIFT) == KERNEL_REGION_ID)
+	if (cur_cpu_spec->cpu_features & CPU_FTR_16M_PAGE) {
+		if (region_id == KERNEL_REGION_ID)
 			large = 1;
-		else if (REGION_ID(esid << SID_SHIFT) == USER_REGION_ID)
+		else if (region_id == USER_REGION_ID)
 			large = in_hugepage_area(context, esid << SID_SHIFT);
-#endif
-		make_slbe(esid, vsid, large, kernel_segment);
-	} else {
-		unsigned char top_entry, stab_entry, *segments; 
+	}
 
-		stab_entry = make_ste(get_paca()->xStab_data.virt, esid, vsid);
-		PMC_SW_PROCESSOR_A(stab_entry_use, stab_entry & 0xf); 
+	make_slbe(esid, vsid, large, region_id != USER_REGION_ID);
 
-		segments = get_paca()->xSegments;		
-		top_entry = get_paca()->stab_cache_pointer;
-		if (!kernel_segment && top_entry < STAB_CACHE_SIZE) {
-			segments[top_entry] = stab_entry;
-			if (top_entry == STAB_CACHE_SIZE)
-				top_entry = 0xff;
-			top_entry++;
-			get_paca()->stab_cache_pointer = top_entry;
-		}
+	if (region_id != USER_REGION_ID)
+		return;
+
+	offset = &__get_cpu_var(stab_cache_ptr);
+	if (*offset < NR_STAB_CACHE_ENTRIES) {
+		__get_cpu_var(stab_cache[*offset]) = esid;
 	}
+	(*offset)++;
 }
 
 /*
  * Allocate a segment table entry for the given ea.
  */
-int ste_allocate(unsigned long ea)
+int slb_allocate(unsigned long ea)
 {
 	unsigned long vsid, esid;
-	int kernel_segment = 0;
 	mm_context_t context;
 
-	PMC_SW_PROCESSOR(stab_faults); 
-
 	/* Check for invalid effective addresses. */
-	if (!IS_VALID_EA(ea))
+	if (unlikely(!IS_VALID_EA(ea)))
 		return 1;
 
 	/* Kernel or user address? */
 	if (REGION_ID(ea) >= KERNEL_REGION_ID) {
-		kernel_segment = 1;
-		vsid = get_kernel_vsid(ea);
 		context = REGION_ID(ea);
+		vsid = get_kernel_vsid(ea);
 	} else {
-		if (! current->mm)
+		if (unlikely(!current->mm))
 			return 1;
 
 		context = current->mm->context;
-		
 		vsid = get_vsid(context, ea);
 	}
 
 	esid = GET_ESID(ea);
-	__ste_allocate(esid, vsid, kernel_segment, context);
-	if (!(cur_cpu_spec->cpu_features & CPU_FTR_SLB)) {
-		/* Order update */
-		asm volatile("sync":::"memory"); 
-	}
+	__slb_allocate(esid, vsid, context);
 
 	return 0;
 }
 
-unsigned long ppc64_preload_all_segments;
-unsigned long ppc64_stab_preload = 1;
-#define STAB_PRESSURE 0
-#define USE_SLBIE_ON_STAB 0
-
 /*
- * preload all 16 segments for a 32 bit process and the PC and SP segments
- * for a 64 bit process.
+ * preload some userspace segments into the SLB.
  */
-static void preload_stab(struct task_struct *tsk, struct mm_struct *mm)
+static void preload_slb(struct task_struct *tsk, struct mm_struct *mm)
 {
-	if (ppc64_preload_all_segments &&
-	    test_tsk_thread_flag(tsk, TIF_32BIT)) {
-		unsigned long esid, vsid;
-
-		for (esid = 0; esid < 16; esid++) {
-			unsigned long ea = esid << SID_SHIFT;
-			vsid = get_vsid(mm->context, ea);
-			__ste_allocate(esid, vsid, 0, mm->context);
-		}
-	} else {
-		unsigned long pc = KSTK_EIP(tsk);
-		unsigned long stack = KSTK_ESP(tsk);
-		unsigned long pc_segment = pc & ~SID_MASK;
-		unsigned long stack_segment = stack & ~SID_MASK;
-		unsigned long vsid;
-
-		if (pc) {
-			if (!IS_VALID_EA(pc) || 
-			    (REGION_ID(pc) >= KERNEL_REGION_ID))
-				return;
-			vsid = get_vsid(mm->context, pc);
-			__ste_allocate(GET_ESID(pc), vsid, 0, mm->context);
-		}
-
-		if (stack && (pc_segment != stack_segment)) {
-			if (!IS_VALID_EA(stack) || 
-			    (REGION_ID(stack) >= KERNEL_REGION_ID))
-				return;
-			vsid = get_vsid(mm->context, stack);
-			__ste_allocate(GET_ESID(stack), vsid, 0, mm->context);
-		}
-	}
-
-	if (!(cur_cpu_spec->cpu_features & CPU_FTR_SLB)) {
-		/* Order update */
-		asm volatile("sync" : : : "memory"); 
-	}
+	unsigned long pc = KSTK_EIP(tsk);
+	unsigned long stack = KSTK_ESP(tsk);
+	unsigned long unmapped_base;
+	unsigned long pc_esid = GET_ESID(pc);
+	unsigned long stack_esid = GET_ESID(stack);
+	unsigned long unmapped_base_esid;
+	unsigned long vsid;
+
+	if (test_tsk_thread_flag(tsk, TIF_32BIT))
+		unmapped_base = TASK_UNMAPPED_BASE_USER32;
+	else
+		unmapped_base = TASK_UNMAPPED_BASE_USER64;
+
+	unmapped_base_esid = GET_ESID(unmapped_base);
+
+	if (!IS_VALID_EA(pc) || (REGION_ID(pc) >= KERNEL_REGION_ID))
+		return;
+	vsid = get_vsid(mm->context, pc);
+	__slb_allocate(pc_esid, vsid, mm->context);
+
+	if (pc_esid == stack_esid)
+		return;
+
+	if (!IS_VALID_EA(stack) || (REGION_ID(stack) >= KERNEL_REGION_ID))
+		return;
+	vsid = get_vsid(mm->context, stack);
+	__slb_allocate(stack_esid, vsid, mm->context);
+
+	if (pc_esid == unmapped_base_esid || stack_esid == unmapped_base_esid)
+		return;
+
+	if (!IS_VALID_EA(unmapped_base) ||
+	    (REGION_ID(unmapped_base) >= KERNEL_REGION_ID))
+		return;
+	vsid = get_vsid(mm->context, unmapped_base);
+	__slb_allocate(unmapped_base_esid, vsid, mm->context);
 }
 
 /* Flush all user entries from the segment table of the current processor. */
-void flush_stab(struct task_struct *tsk, struct mm_struct *mm)
+void flush_slb(struct task_struct *tsk, struct mm_struct *mm)
 {
-	if (cur_cpu_spec->cpu_features & CPU_FTR_SLB) {
-		/*
-		 * XXX disable 32bit slb invalidate optimisation until we fix
-		 * the issue where a 32bit app execed out of a 64bit app can
-		 * cause segments above 4GB not to be flushed - Anton
-		 */
-		if (0 && !STAB_PRESSURE && test_thread_flag(TIF_32BIT)) {
-			union {
-				unsigned long word0;
-				slb_dword0 data;
-			} esid_data;
-			unsigned long esid;
-
-			asm volatile("isync" : : : "memory");
-			for (esid = 0; esid < 16; esid++) {
-				esid_data.word0 = 0;
-				esid_data.data.esid = esid;
-				asm volatile("slbie %0" : : "r" (esid_data));
-			}
-			asm volatile("isync" : : : "memory");
-		} else {
-			asm volatile("isync; slbia; isync":::"memory");
-		}
+	unsigned long *offset = &__get_cpu_var(stab_cache_ptr);
 
-		PMC_SW_PROCESSOR(stab_invalidations);
-	} else {
-		STE *stab = (STE *) get_paca()->xStab_data.virt;
-		STE *ste;
-		unsigned long flags;
+	if (*offset <= NR_STAB_CACHE_ENTRIES) {
+		int i;
+		union {
+			unsigned long word0;
+			slb_dword0 data;
+		} esid_data;
 
-		/* Force previous translations to complete. DRENG */
 		asm volatile("isync" : : : "memory");
-
-		local_irq_save(flags);
-		if (get_paca()->stab_cache_pointer != 0xff && !STAB_PRESSURE) {
-			int i;
-			unsigned char *segments = get_paca()->xSegments;
-
-			for (i = 0; i < get_paca()->stab_cache_pointer; i++) {
-				ste = stab + segments[i]; 
-				ste->dw0.dw0.v = 0;
-				PMC_SW_PROCESSOR(stab_invalidations); 
-			}
-
-#if USE_SLBIE_ON_STAB
-			asm volatile("sync":::"memory");
-			for (i = 0; i < get_paca()->stab_cache_pointer; i++) {
-				ste = stab + segments[i]; 
-				asm volatile("slbie  %0" : :
-					"r" (ste->dw0.dw0.esid << SID_SHIFT)); 
-			}
-			asm volatile("sync":::"memory");
-#else
-			asm volatile("sync; slbia; sync":::"memory");
-#endif
-
-		} else {
-			unsigned long entry;
-
-			/* Invalidate all entries. */
-			ste = stab;
-
-			/* Never flush the first entry. */ 
-			ste += 1;
-			for (entry = 1;
-			     entry < (PAGE_SIZE / sizeof(STE)); 
-			     entry++, ste++) {
-				unsigned long ea;
-				ea = ste->dw0.dw0.esid << SID_SHIFT;
-				if (STAB_PRESSURE || ea < KERNELBASE) {
-					ste->dw0.dw0.v = 0;
-					PMC_SW_PROCESSOR(stab_invalidations); 
-				}
-			}
-
-			asm volatile("sync; slbia; sync":::"memory");
+		for (i = 0; i < *offset; i++) {
+			esid_data.word0 = 0;
+			esid_data.data.esid = __get_cpu_var(stab_cache[i]);
+			asm volatile("slbie %0" : : "r" (esid_data));
 		}
-
-		get_paca()->stab_cache_pointer = 0;
-		local_irq_restore(flags);
+		asm volatile("isync" : : : "memory");
+	} else {
+		asm volatile("isync; slbia; isync" : : : "memory");
 	}
 
-	if (ppc64_stab_preload)
-		preload_stab(tsk, mm);
+	*offset = 0;
+
+	preload_slb(tsk, mm);
 }
diff -puN include/asm-ppc64/cputable.h~ppc64-slb_rewrite include/asm-ppc64/cputable.h
--- 25/include/asm-ppc64/cputable.h~ppc64-slb_rewrite	2004-01-26 19:44:52.000000000 -0800
+++ 25-akpm/include/asm-ppc64/cputable.h	2004-01-26 19:44:52.000000000 -0800
@@ -135,10 +135,17 @@ extern firmware_feature_t firmware_featu
 #define COMMON_USER_PPC64	(PPC_FEATURE_32 | PPC_FEATURE_64 | \
 			         PPC_FEATURE_HAS_FPU | PPC_FEATURE_HAS_MMU)
 
-#define CPU_FTR_PPCAS_ARCH_V2   (CPU_FTR_SLB | CPU_FTR_16M_PAGE | \
+#define CPU_FTR_PPCAS_ARCH_V2_BASE (CPU_FTR_SLB | \
                                  CPU_FTR_TLBIEL | CPU_FTR_NOEXECUTE | \
                                  CPU_FTR_NODSISRALIGN)
 
+/* iSeries doesn't support large pages */
+#ifdef CONFIG_PPC_ISERIES
+#define CPU_FTR_PPCAS_ARCH_V2	(CPU_FTR_PPCAS_ARCH_V2_BASE)
+#else
+#define CPU_FTR_PPCAS_ARCH_V2	(CPU_FTR_PPCAS_ARCH_V2_BASE | CPU_FTR_16M_PAGE)
+#endif
+
 #define COMMON_PPC64_FW	(0)
 #endif
 
diff -puN include/asm-ppc64/mmu_context.h~ppc64-slb_rewrite include/asm-ppc64/mmu_context.h
--- 25/include/asm-ppc64/mmu_context.h~ppc64-slb_rewrite	2004-01-26 19:44:52.000000000 -0800
+++ 25-akpm/include/asm-ppc64/mmu_context.h	2004-01-26 19:44:52.000000000 -0800
@@ -139,6 +139,7 @@ destroy_context(struct mm_struct *mm)
 }
 
 extern void flush_stab(struct task_struct *tsk, struct mm_struct *mm);
+extern void flush_slb(struct task_struct *tsk, struct mm_struct *mm);
 
 /*
  * switch_mm is the entry point called from the architecture independent
@@ -154,7 +155,15 @@ static inline void switch_mm(struct mm_s
  END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 	 : : );
 #endif /* CONFIG_ALTIVEC */
-	flush_stab(tsk, next);
+
+	/* No need to flush userspace segments if the mm doesnt change */
+	if (prev == next)
+		return;
+
+	if (cur_cpu_spec->cpu_features & CPU_FTR_SLB)
+		flush_slb(tsk, next);
+	else
+		flush_stab(tsk, next);
 	cpu_set(smp_processor_id(), next->cpu_vm_mask);
 }
 
diff -puN include/asm-ppc64/mmu.h~ppc64-slb_rewrite include/asm-ppc64/mmu.h
--- 25/include/asm-ppc64/mmu.h~ppc64-slb_rewrite	2004-01-26 19:44:52.000000000 -0800
+++ 25-akpm/include/asm-ppc64/mmu.h	2004-01-26 19:44:52.000000000 -0800
@@ -28,14 +28,6 @@ typedef unsigned long mm_context_t;
 #endif
 
 /*
- * Define the size of the cache used for segment table entries.  The first
- * entry is used as a cache pointer, therefore the actual number of entries
- * stored is one less than defined here.  Do not change this value without
- * considering the impact it will have on the layout of the paca in paca.h.
- */
-#define STAB_CACHE_SIZE 16
-
-/*
  * Hardware Segment Lookaside Buffer Entry
  * This structure has been padded out to two 64b doublewords (actual SLBE's are
  * 94 bits).  This padding facilites use by the segment management
diff -puN include/asm-ppc64/paca.h~ppc64-slb_rewrite include/asm-ppc64/paca.h
--- 25/include/asm-ppc64/paca.h~ppc64-slb_rewrite	2004-01-26 19:44:52.000000000 -0800
+++ 25-akpm/include/asm-ppc64/paca.h	2004-01-26 19:44:52.000000000 -0800
@@ -63,20 +63,15 @@ struct paca_struct {
 	u16 xPacaIndex;			/* Logical processor number		0x18 */
         u16 xHwProcNum;                 /* Physical processor number            0x1A */
 	u32 default_decr;		/* Default decrementer value		0x1c */	
-	u64 unused1;
-	u64 xKsave;			/* Saved Kernel stack addr or zero	0x28 */
-	u64 pvr;			/* Processor version register		0x30 */
-	u8 *exception_sp;		/*					0x38 */
-
-	struct ItLpQueue *lpQueuePtr;	/* LpQueue handled by this processor    0x40 */
-	u64  xTOC;			/* Kernel TOC address			0x48 */
-	STAB xStab_data;		/* Segment table information		0x50,0x58,0x60 */
-	u8 xSegments[STAB_CACHE_SIZE];	/* Cache of used stab entries		0x68,0x70 */
-	u8 xProcEnabled;		/* 1=soft enabled			0x78 */
-	u8 unused2;
-	u8 prof_enabled;		/* 1=iSeries profiling enabled          0x7A */
-	u8 stab_cache_pointer;	
-	u8 resv1[4];			/*					0x7B-0x7F */
+	u64 xKsave;			/* Saved Kernel stack addr or zero	0x20 */
+	u64 pvr;			/* Processor version register		0x28 */
+	struct ItLpQueue *lpQueuePtr;	/* LpQueue handled by this processor    0x30 */
+	u64  xTOC;			/* Kernel TOC address			0x38 */
+	STAB xStab_data;		/* Segment table information		0x40,0x48,0x50 */
+	u8 *exception_sp;		/*                                      0x58 */
+	u8 xProcEnabled;		/*                                      0x59 */
+	u8 prof_enabled;		/* 1=iSeries profiling enabled          0x60 */
+	u8 resv1[30];			/*					0x61-0x7F */
 
 /*=====================================================================================
  * CACHE_LINE_2 0x0080 - 0x00FF

_