From: "Chen, Kenneth W" <kenneth.w.chen@intel.com> For architecture like ia64, the switch stack structure is fairly large (currently 528 bytes). For context switch intensive application, we found that significant amount of cache misses occurs in switch_to() function. The following patch adds a hook in the schedule() function to prefetch switch stack structure as soon as 'next' task is determined. This allows maximum overlap in prefetch cache lines for that structure. Signed-off-by: Ken Chen <kenneth.w.chen@intel.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: "Luck, Tony" <tony.luck@intel.com> Signed-off-by: Andrew Morton <akpm@osdl.org> --- arch/ia64/kernel/entry.S | 23 +++++++++++++++++++++++ include/asm-ia64/system.h | 1 + include/linux/sched.h | 5 +++++ kernel/sched.c | 1 + 4 files changed, 30 insertions(+) diff -puN arch/ia64/kernel/entry.S~prefetch-kernel-stacks-to-speed-up-context-switch arch/ia64/kernel/entry.S --- devel/arch/ia64/kernel/entry.S~prefetch-kernel-stacks-to-speed-up-context-switch 2005-09-07 20:10:38.000000000 -0700 +++ devel-akpm/arch/ia64/kernel/entry.S 2005-09-07 20:10:38.000000000 -0700 @@ -470,6 +470,29 @@ ENTRY(load_switch_stack) br.cond.sptk.many b7 END(load_switch_stack) +GLOBAL_ENTRY(prefetch_stack) + add r14 = -IA64_SWITCH_STACK_SIZE, sp + add r15 = IA64_TASK_THREAD_KSP_OFFSET, in0 + ;; + ld8 r16 = [r15] // load next's stack pointer + lfetch.fault.excl [r14], 128 + ;; + lfetch.fault.excl [r14], 128 + lfetch.fault [r16], 128 + ;; + lfetch.fault.excl [r14], 128 + lfetch.fault [r16], 128 + ;; + lfetch.fault.excl [r14], 128 + lfetch.fault [r16], 128 + ;; + lfetch.fault.excl [r14], 128 + lfetch.fault [r16], 128 + ;; + lfetch.fault [r16], 128 + br.ret.sptk.many rp +END(prefetch_switch_stack) + GLOBAL_ENTRY(execve) mov r15=__NR_execve // put syscall number in place break __BREAK_SYSCALL diff -puN include/asm-ia64/system.h~prefetch-kernel-stacks-to-speed-up-context-switch include/asm-ia64/system.h --- devel/include/asm-ia64/system.h~prefetch-kernel-stacks-to-speed-up-context-switch 2005-09-07 20:10:38.000000000 -0700 +++ devel-akpm/include/asm-ia64/system.h 2005-09-07 20:10:38.000000000 -0700 @@ -275,6 +275,7 @@ extern void ia64_load_extra (struct task */ #define __ARCH_WANT_UNLOCKED_CTXSW +#define ARCH_HAS_PREFETCH_SWITCH_STACK #define ia64_platform_is(x) (strcmp(x, platform_name) == 0) void cpu_idle_wait(void); diff -puN include/linux/sched.h~prefetch-kernel-stacks-to-speed-up-context-switch include/linux/sched.h --- devel/include/linux/sched.h~prefetch-kernel-stacks-to-speed-up-context-switch 2005-09-07 20:10:38.000000000 -0700 +++ devel-akpm/include/linux/sched.h 2005-09-07 20:10:38.000000000 -0700 @@ -604,6 +604,11 @@ extern int groups_search(struct group_in #define GROUP_AT(gi, i) \ ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK]) +#ifdef ARCH_HAS_PREFETCH_SWITCH_STACK +extern void prefetch_stack(struct task_struct*); +#else +static inline void prefetch_stack(struct task_struct *t) { } +#endif struct audit_context; /* See audit.c */ struct mempolicy; diff -puN kernel/sched.c~prefetch-kernel-stacks-to-speed-up-context-switch kernel/sched.c --- devel/kernel/sched.c~prefetch-kernel-stacks-to-speed-up-context-switch 2005-09-07 20:10:38.000000000 -0700 +++ devel-akpm/kernel/sched.c 2005-09-07 20:10:38.000000000 -0700 @@ -2888,6 +2888,7 @@ switch_tasks: if (next == rq->idle) schedstat_inc(rq, sched_goidle); prefetch(next); + prefetch_stack(next); clear_tsk_need_resched(prev); rcu_qsctr_inc(task_cpu(prev)); _