From: Hugh Dickins <hugh@veritas.com>

Repeated -j3 kernel builds, run in tandem on dual PIII, have been collapsing
recently on -mm with 4G/4G split, SMP and preemption.  Typically 'make' fails
with Error 139 because 'as' or another child process got SIGSEGV; maybe within ten minutes,
maybe after ten hours.

This patch seems to fix that (ran successfully overnight on test4-mm1, will
run over the weekend on test4-mm3-1).  Please cast a critical eye over it; I
expect Ingo or someone else will find it can be improved.

The problem is that a task may be preempted just after it has entered
kernelspace, while still using the transitional "virtual stack", i.e. with
%esp pointing to the high per-cpu kmap of the kernel stack.  If the task
resumes on another cpu, that %esp needs to be repointed into the new cpu's
kmap.
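
For illustration only, here is a minimal C sketch of that repointing, as done
by the __switch_to hunk below; rebase_virtual_esp is a hypothetical helper
name, and THREAD_SIZE = 8192 is assumed (8K kernel stacks, matching the
0xffffe000/0x00001fff masks used in entry.S):

#define THREAD_SIZE	8192	/* assumed: 8K kernel stacks */

/* Hypothetical sketch: keep the task's offset within its kernel stack,
 * and rebase it onto the new cpu's kmap of that stack. */
static unsigned long rebase_virtual_esp(unsigned long esp, void *new_vstack)
{
	esp &= THREAD_SIZE - 1;			/* offset within the stack */
	esp |= (unsigned long) new_vstack;	/* new cpu's virtual stack base */
	return esp;
}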

The corresponding returns to userspace look okay to me: interrupts are
disabled over the critical points.  And in general no copy of %esp is taken
while on the virtual stack, e.g. the pointer to pt_regs is, and must be, set
only after switching to the real stack.  But there is one place, in
__SWITCH_KERNELSPACE itself, where we need to check whether the task has
moved and repeat the calculation if so.
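
In C terms, the check-and-repeat added to __SWITCH_KERNELSPACE below amounts
to something like this sketch (esp_moved is a hypothetical name; 0xffffe000
is the stack-base mask for the assumed 8K stacks):

/* base_snapshot is %esp & 0xffffe000 taken when the calculation began;
 * if the stack-base bits of the current %esp differ, a preemption has
 * moved the task to another cpu's kmap, and the real-stack calculation
 * must be redone (this is what "xorl %esp, %ebp; testl; jnz 0b" checks). */
static int esp_moved(unsigned long base_snapshot, unsigned long esp_now)
{
	return ((base_snapshot ^ esp_now) & 0xffffe000) != 0;
}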



 arch/i386/kernel/entry.S   |   17 ++++++++++++++++-
 arch/i386/kernel/process.c |   20 +++++++++++++++++---
 2 files changed, 33 insertions(+), 4 deletions(-)

diff -puN arch/i386/kernel/entry.S~4g4g-preempt-vstack-fix arch/i386/kernel/entry.S
--- 25/arch/i386/kernel/entry.S~4g4g-preempt-vstack-fix	2003-08-30 15:42:11.000000000 -0700
+++ 25-akpm/arch/i386/kernel/entry.S	2003-08-30 15:42:11.000000000 -0700
@@ -103,6 +103,20 @@ TSS_ESP0_OFFSET = (4 - 0x200)
 
 #ifdef CONFIG_X86_SWITCH_PAGETABLES
 
+#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)
+/*
+ * If task is preempted in __SWITCH_KERNELSPACE, and moved to another cpu,
+ * __switch_to repoints %esp to the appropriate virtual stack; but %ebp is
+ * left stale, so we must check whether to repeat the real stack calculation.
+ */
+#define repeat_if_esp_changed				\
+	xorl %esp, %ebp;				\
+	testl $0xffffe000, %ebp;			\
+	jnz 0b
+#else
+#define repeat_if_esp_changed
+#endif
+
 /* clobbers ebx, edx and ebp */
 
 #define __SWITCH_KERNELSPACE				\
@@ -117,12 +131,13 @@ TSS_ESP0_OFFSET = (4 - 0x200)
 	movl $swapper_pg_dir-__PAGE_OFFSET, %edx;	\
 							\
 	/* GET_THREAD_INFO(%ebp) intermixed */		\
-							\
+0:							\
 	movl %esp, %ebp;				\
 	movl %esp, %ebx;				\
 	andl $0xffffe000, %ebp;				\
 	andl $0x00001fff, %ebx;				\
 	orl TI_real_stack(%ebp), %ebx;			\
+	repeat_if_esp_changed;				\
 							\
 	movl %edx, %cr3;				\
 	movl %ebx, %esp;				\
diff -puN arch/i386/kernel/process.c~4g4g-preempt-vstack-fix arch/i386/kernel/process.c
--- 25/arch/i386/kernel/process.c~4g4g-preempt-vstack-fix	2003-08-30 15:42:11.000000000 -0700
+++ 25-akpm/arch/i386/kernel/process.c	2003-08-30 15:42:11.000000000 -0700
@@ -479,13 +479,27 @@ struct task_struct * __switch_to(struct 
 	__kmap_atomic(next->stack_page1, KM_VSTACK1);
 
 	/*
-	 * Reload esp0:
-	 */
-	/*
 	 * NOTE: here we rely on the task being the stack as well
 	 */
 	next_p->thread_info->virtual_stack = (void *)__kmap_atomic_vaddr(KM_VSTACK0);
+
+#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)
+	/*
+	 * If next was preempted on entry from userspace to kernel,
+	 * and now it's on a different cpu, we need to adjust %esp.
+	 * This assumes that entry.S does not copy %esp while on the
+	 * virtual stack (with interrupts enabled): which is so,
+	 * except within __SWITCH_KERNELSPACE itself.
+	 */
+	if (unlikely(next->esp >= TASK_SIZE)) {
+		next->esp &= THREAD_SIZE - 1;
+		next->esp |= (unsigned long) next_p->thread_info->virtual_stack;
+	}
+#endif
 #endif
+	/*
+	 * Reload esp0:
+	 */
 	load_esp0(tss, virtual_esp0(next_p));
 
 	/*
