From: Nick Piggin <piggin@cyberone.com.au>

The following patch builds a scheduling description for the i386
architecture using cpu_sibling_map to set up SMT if CONFIG_SCHED_SMT is
set.

It could be made fancier by collapsing degenerate domains at runtime (i.e.
when there is only 1 sibling per CPU, or only 1 NUMA node in the system).
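
For reference, the structure this ends up building gives each CPU one
sched_domain per level (sibling, physical, plus a node level on NUMA), with
each domain pointing at a circular list of sched_groups that partitions the
domain's span.  Below is a rough userspace-only sketch of the non-NUMA case
for a hypothetical box with two packages of two siblings each; it is purely
illustrative and all of the toy_* names are invented:

#include <stdio.h>

struct toy_group {
	unsigned long cpumask;	/* bit i set => CPU i is in this group */
	struct toy_group *next;	/* circular list, like sched_group->next */
};

struct toy_domain {
	unsigned long span;		/* CPUs this domain balances over */
	struct toy_domain *parent;	/* sibling level -> physical level */
	struct toy_group *groups;	/* ring of groups partitioning span */
};

static struct toy_group sib[4];		/* one group per sibling CPU */
static struct toy_group phys[2];	/* one group per physical package */
static struct toy_domain sib_dom[4], phys_dom[4];

int main(void)
{
	int i;

	for (i = 0; i < 4; i++) {
		int pkg = i / 2;	/* CPUs 0,1 -> package 0; CPUs 2,3 -> package 1 */

		sib[i].cpumask = 1UL << i;
		sib[i].next = &sib[pkg * 2 + (i + 1) % 2];

		sib_dom[i].span = 3UL << (pkg * 2);	/* this package's siblings */
		sib_dom[i].parent = &phys_dom[i];
		sib_dom[i].groups = &sib[i];

		phys_dom[i].span = 0xf;			/* the whole (non-NUMA) machine */
		phys_dom[i].parent = NULL;
		phys_dom[i].groups = &phys[pkg];
	}
	phys[0].cpumask = 0x3;  phys[0].next = &phys[1];
	phys[1].cpumask = 0xc;  phys[1].next = &phys[0];

	/* walk CPU 0's sibling-level group ring once, as the balancer would */
	struct toy_group *g = sib_dom[0].groups;
	do {
		printf("sibling-level group: %#lx\n", g->cpumask);
		g = g->next;
	} while (g != sib_dom[0].groups);

	return 0;
}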


From: Zwane Mwaikambo <zwane@arm.linux.org.uk>

   This fixes an oops caused by cpu_sibling_map being uninitialised when a
   system with no MP table (most UP boxen) boots a CONFIG_SCHED_SMT kernel.
   The cpu_group lists also end up not being terminated properly, but the
   oops hits first.  Patch tested on UP w/o MP table, 2x P2, and UP Xeon
   w/ no siblings.

From: "Martin J. Bligh" <mbligh@aracnet.com>,
      Nick Piggin <piggin@cyberone.com.au>

   Change arch_init_sched_domains to use cpu_online_map

From: Anton Blanchard <anton@samba.org>

   Fix build with NR_CPUS > BITS_PER_LONG


---

 25-akpm/arch/i386/Kconfig            |   10 +
 25-akpm/arch/i386/kernel/smpboot.c   |  206 ++++++++++++++++++++++++++++++++++-
 25-akpm/include/asm-i386/processor.h |    5 
 25-akpm/include/linux/sched.h        |   16 ++
 25-akpm/kernel/sched.c               |   35 +----
 5 files changed, 246 insertions(+), 26 deletions(-)

diff -puN arch/i386/Kconfig~sched-domains-i386-ht arch/i386/Kconfig
--- 25/arch/i386/Kconfig~sched-domains-i386-ht	2004-04-27 20:37:24.050283672 -0700
+++ 25-akpm/arch/i386/Kconfig	2004-04-27 20:37:24.061282000 -0700
@@ -479,6 +479,16 @@ config NR_CPUS
 	  This is purely to save memory - each supported CPU adds
 	  approximately eight kilobytes to the kernel image.
 
+config SCHED_SMT
+	bool "SMT (Hyperthreading) scheduler support"
+	depends on SMP
+	default n
+	help
+	  SMT scheduler support improves the CPU scheduler's decision making
+	  when dealing with Intel Pentium 4 chips with HyperThreading, at a
+	  cost of slightly increased overhead in some places.  If unsure, say
+	  N here.
+
 config PREEMPT
 	bool "Preemptible Kernel"
 	help
diff -puN arch/i386/kernel/smpboot.c~sched-domains-i386-ht arch/i386/kernel/smpboot.c
--- 25/arch/i386/kernel/smpboot.c~sched-domains-i386-ht	2004-04-27 20:37:24.052283368 -0700
+++ 25-akpm/arch/i386/kernel/smpboot.c	2004-04-27 20:37:24.063281696 -0700
@@ -39,6 +39,7 @@
 #include <linux/kernel.h>
 
 #include <linux/mm.h>
+#include <linux/sched.h>
 #include <linux/kernel_stat.h>
 #include <linux/smp_lock.h>
 #include <linux/irq.h>
@@ -955,6 +956,8 @@ static void __init smp_boot_cpus(unsigne
 
 	current_thread_info()->cpu = 0;
 	smp_tune_scheduling();
+	cpus_clear(cpu_sibling_map[0]);
+	cpu_set(0, cpu_sibling_map[0]);
 
 	/*
 	 * If we couldn't find an SMP configuration at boot time,
@@ -1085,7 +1088,7 @@ static void __init smp_boot_cpus(unsigne
 	 * efficiently.
 	 */
 	for (cpu = 0; cpu < NR_CPUS; cpu++)
-		cpu_sibling_map[cpu] = CPU_MASK_NONE;
+		cpus_clear(cpu_sibling_map[cpu]);
 
 	for (cpu = 0; cpu < NR_CPUS; cpu++) {
 		int siblings = 0;
@@ -1122,6 +1125,207 @@ static void __init smp_boot_cpus(unsigne
 		synchronize_tsc_bp();
 }
 
+#ifdef CONFIG_SCHED_SMT
+#ifdef CONFIG_NUMA
+static struct sched_group sched_group_cpus[NR_CPUS];
+static struct sched_group sched_group_phys[NR_CPUS];
+static struct sched_group sched_group_nodes[MAX_NUMNODES];
+static DEFINE_PER_CPU(struct sched_domain, phys_domains);
+static DEFINE_PER_CPU(struct sched_domain, node_domains);
+__init void arch_init_sched_domains(void)
+{
+	int i;
+	struct sched_group *first_cpu = NULL, *last_cpu = NULL;
+
+	/* Set up domains */
+	for_each_cpu_mask(i, cpu_online_map) {
+		struct sched_domain *cpu_domain = cpu_sched_domain(i);
+		struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
+		struct sched_domain *node_domain = &per_cpu(node_domains, i);
+		int node = cpu_to_node(i);
+		cpumask_t nodemask = node_to_cpumask(node);
+
+		*cpu_domain = SD_SIBLING_INIT;
+		cpu_domain->span = cpu_sibling_map[i];
+
+		*phys_domain = SD_CPU_INIT;
+		phys_domain->span = nodemask;
+		phys_domain->flags |= SD_FLAG_IDLE;
+
+		*node_domain = SD_NODE_INIT;
+		node_domain->span = cpu_online_map;
+	}
+
+	/* Set up CPU (sibling) groups */
+	for_each_cpu_mask(i, cpu_online_map) {
+		struct sched_domain *cpu_domain = cpu_sched_domain(i);
+		int j;
+		first_cpu = last_cpu = NULL;
+
+		if (i != first_cpu(cpu_domain->span))
+			continue;
+
+		for_each_cpu_mask(j, cpu_domain->span) {
+			struct sched_group *cpu = &sched_group_cpus[j];
+
+			cpus_clear(cpu->cpumask);
+			cpu_set(j, cpu->cpumask);
+
+			if (!first_cpu)
+				first_cpu = cpu;
+			if (last_cpu)
+				last_cpu->next = cpu;
+			last_cpu = cpu;
+		}
+		last_cpu->next = first_cpu;
+	}
+
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		int j;
+		cpumask_t nodemask;
+		cpus_and(nodemask, node_to_cpumask(i), cpu_online_map);
+
+		if (cpus_empty(nodemask))
+			continue;
+
+		first_cpu = last_cpu = NULL;
+		/* Set up physical groups */
+		for_each_cpu_mask(j, nodemask) {
+			struct sched_domain *cpu_domain = cpu_sched_domain(j);
+			struct sched_group *cpu = &sched_group_phys[j];
+
+			if (j != first_cpu(cpu_domain->span))
+				continue;
+
+			cpu->cpumask = cpu_domain->span;
+
+			if (!first_cpu)
+				first_cpu = cpu;
+			if (last_cpu)
+				last_cpu->next = cpu;
+			last_cpu = cpu;
+		}
+		last_cpu->next = first_cpu;
+	}
+
+	/* Set up nodes */
+	first_cpu = last_cpu = NULL;
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		struct sched_group *cpu = &sched_group_nodes[i];
+		cpumask_t nodemask;
+		cpus_and(nodemask, node_to_cpumask(i), cpu_online_map);
+
+		if (cpus_empty(nodemask))
+			continue;
+
+		cpu->cpumask = nodemask;
+
+		if (!first_cpu)
+			first_cpu = cpu;
+		if (last_cpu)
+			last_cpu->next = cpu;
+		last_cpu = cpu;
+	}
+	last_cpu->next = first_cpu;
+
+
+	mb();
+	for_each_cpu_mask(i, cpu_online_map) {
+		int node = cpu_to_node(i);
+		struct sched_domain *cpu_domain = cpu_sched_domain(i);
+		struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
+		struct sched_domain *node_domain = &per_cpu(node_domains, i);
+		struct sched_group *cpu_group = &sched_group_cpus[i];
+		struct sched_group *phys_group = &sched_group_phys[first_cpu(cpu_domain->span)];
+		struct sched_group *node_group = &sched_group_nodes[node];
+
+		cpu_domain->parent = phys_domain;
+		phys_domain->parent = node_domain;
+
+		node_domain->groups = node_group;
+		phys_domain->groups = phys_group;
+		cpu_domain->groups = cpu_group;
+	}
+}
+#else /* CONFIG_NUMA */
+static struct sched_group sched_group_cpus[NR_CPUS];
+static struct sched_group sched_group_phys[NR_CPUS];
+static DEFINE_PER_CPU(struct sched_domain, phys_domains);
+__init void arch_init_sched_domains(void)
+{
+	int i;
+	struct sched_group *first_cpu = NULL, *last_cpu = NULL;
+
+	/* Set up domains */
+	for_each_cpu_mask(i, cpu_online_map) {
+		struct sched_domain *cpu_domain = cpu_sched_domain(i);
+		struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
+
+		*cpu_domain = SD_SIBLING_INIT;
+		cpu_domain->span = cpu_sibling_map[i];
+
+		*phys_domain = SD_CPU_INIT;
+		phys_domain->span = cpu_online_map;
+		phys_domain->flags |= SD_FLAG_IDLE;
+	}
+
+	/* Set up CPU (sibling) groups */
+	for_each_cpu_mask(i, cpu_online_map) {
+		struct sched_domain *cpu_domain = cpu_sched_domain(i);
+		int j;
+		first_cpu = last_cpu = NULL;
+
+		if (i != first_cpu(cpu_domain->span))
+			continue;
+
+		for_each_cpu_mask(j, cpu_domain->span) {
+			struct sched_group *cpu = &sched_group_cpus[j];
+
+			cpus_clear(cpu->cpumask);
+			cpu_set(j, cpu->cpumask);
+
+			if (!first_cpu)
+				first_cpu = cpu;
+			if (last_cpu)
+				last_cpu->next = cpu;
+			last_cpu = cpu;
+		}
+		last_cpu->next = first_cpu;
+	}
+
+	first_cpu = last_cpu = NULL;
+	/* Set up physical groups */
+	for_each_cpu_mask(i, cpu_online_map) {
+		struct sched_domain *cpu_domain = cpu_sched_domain(i);
+		struct sched_group *cpu = &sched_group_phys[i];
+
+		if (i != first_cpu(cpu_domain->span))
+			continue;
+
+		cpu->cpumask = cpu_domain->span;
+
+		if (!first_cpu)
+			first_cpu = cpu;
+		if (last_cpu)
+			last_cpu->next = cpu;
+		last_cpu = cpu;
+	}
+	last_cpu->next = first_cpu;
+
+	mb();
+	for_each_cpu_mask(i, cpu_online_map) {
+		struct sched_domain *cpu_domain = cpu_sched_domain(i);
+		struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
+		struct sched_group *cpu_group = &sched_group_cpus[i];
+		struct sched_group *phys_group = &sched_group_phys[first_cpu(cpu_domain->span)];
+		cpu_domain->parent = phys_domain;
+		phys_domain->groups = phys_group;
+		cpu_domain->groups = cpu_group;
+	}
+}
+#endif /* CONFIG_NUMA */
+#endif /* CONFIG_SCHED_SMT */
+
 /* These are wrappers to interface to the new boot process.  Someone
    who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
 void __init smp_prepare_cpus(unsigned int max_cpus)
diff -puN include/asm-i386/processor.h~sched-domains-i386-ht include/asm-i386/processor.h
--- 25/include/asm-i386/processor.h~sched-domains-i386-ht	2004-04-27 20:37:24.054283064 -0700
+++ 25-akpm/include/asm-i386/processor.h	2004-04-27 20:37:24.063281696 -0700
@@ -648,4 +648,9 @@ extern inline void prefetchw(const void 
 
 extern void select_idle_routine(const struct cpuinfo_x86 *c);
 
+#ifdef CONFIG_SCHED_SMT
+#define ARCH_HAS_SCHED_DOMAIN
+#define ARCH_HAS_SCHED_WAKE_BALANCE
+#endif
+
 #endif /* __ASM_I386_PROCESSOR_H */
diff -puN include/linux/sched.h~sched-domains-i386-ht include/linux/sched.h
--- 25/include/linux/sched.h~sched-domains-i386-ht	2004-04-27 20:37:24.055282912 -0700
+++ 25-akpm/include/linux/sched.h	2004-04-27 20:37:24.065281392 -0700
@@ -577,6 +577,22 @@ struct sched_domain {
 	unsigned int nr_balance_failed; /* initialise to 0 */
 };
 
+/* Common values for SMT siblings */
+#define SD_SIBLING_INIT (struct sched_domain) {		\
+	.span			= CPU_MASK_NONE,	\
+	.parent			= NULL,			\
+	.groups			= NULL,			\
+	.min_interval		= 1,			\
+	.max_interval		= 2,			\
+	.busy_factor		= 8,			\
+	.imbalance_pct		= 110,			\
+	.cache_hot_time		= 0,			\
+	.cache_nice_tries	= 0,			\
+	.flags			= SD_FLAG_FASTMIGRATE | SD_FLAG_NEWIDLE | SD_FLAG_WAKE,\
+	.balance_interval	= 1,			\
+	.nr_balance_failed	= 0,			\
+}
+
 /* Common values for CPUs */
 #define SD_CPU_INIT (struct sched_domain) {		\
 	.span			= CPU_MASK_NONE,	\
diff -puN kernel/sched.c~sched-domains-i386-ht kernel/sched.c
--- 25/kernel/sched.c~sched-domains-i386-ht	2004-04-27 20:37:24.057282608 -0700
+++ 25-akpm/kernel/sched.c	2004-04-27 20:37:24.067281088 -0700
@@ -3302,28 +3302,20 @@ DEFINE_PER_CPU(struct sched_domain, node
 static void __init arch_init_sched_domains(void)
 {
 	int i;
-	cpumask_t all_cpus = CPU_MASK_NONE;
 	struct sched_group *first_node = NULL, *last_node = NULL;
 
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_possible(i))
-			continue;
-
-		cpu_set(i, all_cpus);
-	}
-
 	/* Set up domains */
-	for_each_cpu_mask(i, all_cpus) {
+	for_each_cpu_mask(i, cpu_online_map) {
 		int node = cpu_to_node(i);
 		cpumask_t nodemask = node_to_cpumask(node);
 		struct sched_domain *node_domain = &per_cpu(node_domains, i);
 		struct sched_domain *cpu_domain = cpu_sched_domain(i);
 
 		*node_domain = SD_NODE_INIT;
-		node_domain->span = all_cpus;
+		node_domain->span = cpu_online_map;
 
 		*cpu_domain = SD_CPU_INIT;
-		cpus_and(cpu_domain->span, nodemask, all_cpus);
+		cpus_and(cpu_domain->span, nodemask, cpu_online_map);
 		cpu_domain->parent = node_domain;
 	}
 
@@ -3333,8 +3325,9 @@ static void __init arch_init_sched_domai
 		int j;
 		cpumask_t nodemask;
 		struct sched_group *node = &sched_group_nodes[i];
+		cpumask_t tmp = node_to_cpumask(i);
 
-		cpus_and(nodemask, node_to_cpumask(i), all_cpus);
+		cpus_and(nodemask, tmp, cpu_online_map);
 
 		if (cpus_empty(nodemask))
 			continue;
@@ -3364,7 +3357,7 @@ static void __init arch_init_sched_domai
 	last_node->next = first_node;
 
 	mb();
-	for_each_cpu_mask(i, all_cpus) {
+	for_each_cpu_mask(i, cpu_online_map) {
 		struct sched_domain *node_domain = &per_cpu(node_domains, i);
 		struct sched_domain *cpu_domain = cpu_sched_domain(i);
 		node_domain->groups = &sched_group_nodes[cpu_to_node(i)];
@@ -3376,26 +3369,18 @@ static void __init arch_init_sched_domai
 static void __init arch_init_sched_domains(void)
 {
 	int i;
-	cpumask_t all_cpus = CPU_MASK_NONE;
 	struct sched_group *first_cpu = NULL, *last_cpu = NULL;
 
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_possible(i))
-			continue;
-
-		cpu_set(i, all_cpus);
-	}
-
 	/* Set up domains */
-	for_each_cpu_mask(i, all_cpus) {
+	for_each_cpu_mask(i, cpu_online_map) {
 		struct sched_domain *cpu_domain = cpu_sched_domain(i);
 
 		*cpu_domain = SD_CPU_INIT;
-		cpu_domain->span = all_cpus;
+		cpu_domain->span = cpu_online_map;
 	}
 
 	/* Set up CPU groups */
-	for_each_cpu_mask(i, all_cpus) {
+	for_each_cpu_mask(i, cpu_online_map) {
 		struct sched_group *cpu = &sched_group_cpus[i];
 
 		cpus_clear(cpu->cpumask);
@@ -3410,7 +3395,7 @@ static void __init arch_init_sched_domai
 	last_cpu->next = first_cpu;
 
 	mb();
-	for_each_cpu_mask(i, all_cpus) {
+	for_each_cpu_mask(i, cpu_online_map) {
 		struct sched_domain *cpu_domain = cpu_sched_domain(i);
 		cpu_domain->groups = &sched_group_cpus[i];
 	}

_