From: "Chen, Kenneth W" <kenneth.w.chen@intel.com>

For architectures like ia64, the switch stack structure is fairly large
(currently 528 bytes).  For context-switch-intensive applications, we found
that a significant number of cache misses occur in the switch_to() function. 
The following patch adds a hook in the schedule() function to prefetch
switch stack structure as soon as 'next' task is determined.  This allows
maximum overlap in prefetch cache lines for that structure.

Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 arch/ia64/kernel/entry.S  |   23 +++++++++++++++++++++++
 include/asm-ia64/system.h |    1 +
 include/linux/sched.h     |    5 +++++
 kernel/sched.c            |    1 +
 4 files changed, 30 insertions(+)

diff -puN arch/ia64/kernel/entry.S~prefetch-kernel-stacks-to-speed-up-context-switch arch/ia64/kernel/entry.S
--- devel/arch/ia64/kernel/entry.S~prefetch-kernel-stacks-to-speed-up-context-switch	2005-09-07 20:10:38.000000000 -0700
+++ devel-akpm/arch/ia64/kernel/entry.S	2005-09-07 20:10:38.000000000 -0700
@@ -470,6 +470,29 @@ ENTRY(load_switch_stack)
 	br.cond.sptk.many b7
 END(load_switch_stack)
 
+GLOBAL_ENTRY(prefetch_stack)
+	add r14 = -IA64_SWITCH_STACK_SIZE, sp
+	add r15 = IA64_TASK_THREAD_KSP_OFFSET, in0
+	;;
+	ld8 r16 = [r15]				// load next's stack pointer
+	lfetch.fault.excl [r14], 128
+	;;
+	lfetch.fault.excl [r14], 128
+	lfetch.fault [r16], 128
+	;;
+	lfetch.fault.excl [r14], 128
+	lfetch.fault [r16], 128
+	;;
+	lfetch.fault.excl [r14], 128
+	lfetch.fault [r16], 128
+	;;
+	lfetch.fault.excl [r14], 128
+	lfetch.fault [r16], 128
+	;;
+	lfetch.fault [r16], 128
+	br.ret.sptk.many rp
+END(prefetch_stack)
+
 GLOBAL_ENTRY(execve)
 	mov r15=__NR_execve			// put syscall number in place
 	break __BREAK_SYSCALL
diff -puN include/asm-ia64/system.h~prefetch-kernel-stacks-to-speed-up-context-switch include/asm-ia64/system.h
--- devel/include/asm-ia64/system.h~prefetch-kernel-stacks-to-speed-up-context-switch	2005-09-07 20:10:38.000000000 -0700
+++ devel-akpm/include/asm-ia64/system.h	2005-09-07 20:10:38.000000000 -0700
@@ -275,6 +275,7 @@ extern void ia64_load_extra (struct task
  */
 #define __ARCH_WANT_UNLOCKED_CTXSW
 
+#define ARCH_HAS_PREFETCH_SWITCH_STACK
 #define ia64_platform_is(x) (strcmp(x, platform_name) == 0)
 
 void cpu_idle_wait(void);
diff -puN include/linux/sched.h~prefetch-kernel-stacks-to-speed-up-context-switch include/linux/sched.h
--- devel/include/linux/sched.h~prefetch-kernel-stacks-to-speed-up-context-switch	2005-09-07 20:10:38.000000000 -0700
+++ devel-akpm/include/linux/sched.h	2005-09-07 20:10:38.000000000 -0700
@@ -604,6 +604,11 @@ extern int groups_search(struct group_in
 #define GROUP_AT(gi, i) \
     ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK])
 
+#ifdef ARCH_HAS_PREFETCH_SWITCH_STACK
+extern void prefetch_stack(struct task_struct*);
+#else
+static inline void prefetch_stack(struct task_struct *t) { }
+#endif
 
 struct audit_context;		/* See audit.c */
 struct mempolicy;
diff -puN kernel/sched.c~prefetch-kernel-stacks-to-speed-up-context-switch kernel/sched.c
--- devel/kernel/sched.c~prefetch-kernel-stacks-to-speed-up-context-switch	2005-09-07 20:10:38.000000000 -0700
+++ devel-akpm/kernel/sched.c	2005-09-07 20:10:38.000000000 -0700
@@ -2888,6 +2888,7 @@ switch_tasks:
 	if (next == rq->idle)
 		schedstat_inc(rq, sched_goidle);
 	prefetch(next);
+	prefetch_stack(next);
 	clear_tsk_need_resched(prev);
 	rcu_qsctr_inc(task_cpu(prev));
 
_