diff -purN -X /home/mbligh/.diff.exclude 160-lotsa_sds/Documentation/sched-domains.txt 180-sched_domains/Documentation/sched-domains.txt --- 160-lotsa_sds/Documentation/sched-domains.txt 1969-12-31 16:00:00.000000000 -0800 +++ 180-sched_domains/Documentation/sched-domains.txt 2004-03-12 11:07:47.000000000 -0800 @@ -0,0 +1,55 @@ +Each CPU has a "base" scheduling domain (struct sched_domain). These are +accessed via cpu_sched_domain(i) and this_sched_domain() macros. The domain +hierarchy is built from these base domains via the ->parent pointer. ->parent +MUST be NULL terminated, and domain structures should be per-CPU as they +are locklessly updated. + +Each scheduling domain spans a number of CPUs (stored in the ->span field). +A domain's span MUST be a superset of it child's span, and a base domain +for CPU i MUST span at least i. The top domain for each CPU will generally +span all CPUs in the system although strictly it doesn't have to, but this +could lead to a case where some CPUs will never be given tasks to run unless +the CPUs allowed mask is explicitly set. A sched domain's span means "balance +process load among these CPUs". + +Each scheduling domain must have one or more CPU groups (struct sched_group) +which are organised as a circular one way linked list from the ->groups +pointer. The union of cpumasks of these groups MUST be the same as the +domain's span. The intersection of cpumasks from any two of these groups +MUST be the empty set. The group pointed to by the ->groups pointer MUST +contain the CPU to which the domain belongs. Groups may be shared among +CPUs as they contain read only data after they have been set up. + +Balancing within a sched domain occurs between groups. That is, each group +is treated as one entity. The load of a group is defined as the sum of the +load of each of its member CPUs, and only when the load of a group becomes +out of balance are tasks moved between groups. + +In kernel/sched.c, rebalance_tick is run periodically on each CPU. This +function takes its CPU's base sched domain and checks to see if has reached +its rebalance interval. If so, then it will run load_balance on that domain. +rebalance_tick then checks the parent sched_domain (if it exists), and the +parent of the parent and so forth. + +*** Implementing sched domains *** +The "base" domain will "span" the first level of the hierarchy. In the case +of SMT, you'll span all siblings of the physical CPU, with each group being +a single virtual CPU. + +In SMP, the parent of the base domain will span all physical CPUs in the +node. Each group being a single physical CPU. Then with NUMA, the parent +of the SMP domain will span the entire machine, with each group having the +cpumask of a node. Or, you could do multi-level NUMA or Opteron, for example, +might have just one domain covering its one NUMA level. + +The implementor should read comments in include/linux/sched.h: +struct sched_domain fields, SD_FLAG_*, SD_*_INIT to get an idea of +the specifics and what to tune. + +Implementors should change the line +#undef SCHED_DOMAIN_DEBUG +to +#define SCHED_DOMAIN_DEBUG +in kernel/sched.c as this enables an error checking parse of the sched domains +which should catch most possible errors (described above). It also prints out +the domain structure in a visual format. diff -purN -X /home/mbligh/.diff.exclude 160-lotsa_sds/arch/i386/Kconfig 180-sched_domains/arch/i386/Kconfig --- 160-lotsa_sds/arch/i386/Kconfig 2004-03-12 11:07:26.000000000 -0800 +++ 180-sched_domains/arch/i386/Kconfig 2004-03-12 11:07:47.000000000 -0800 @@ -526,6 +526,16 @@ config NR_CPUS This is purely to save memory - each supported CPU adds approximately eight kilobytes to the kernel image. +config SCHED_SMT + bool "SMT (Hyperthreading) scheduler support" + depends on SMP + default off + help + SMT scheduler support improves the CPU scheduler's decision making + when dealing with Intel Pentium 4 chips with HyperThreading at a + cost of slightly increased overhead in some places. If unsure say + N here. + config PREEMPT bool "Preemptible Kernel" help diff -purN -X /home/mbligh/.diff.exclude 160-lotsa_sds/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c 180-sched_domains/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c --- 160-lotsa_sds/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c 2004-02-18 14:56:46.000000000 -0800 +++ 180-sched_domains/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c 2004-03-12 11:07:47.000000000 -0800 @@ -57,8 +57,7 @@ static int cpufreq_p4_setdc(unsigned int u32 l, h; cpumask_t cpus_allowed, affected_cpu_map; struct cpufreq_freqs freqs; - int hyperthreading = 0; - int sibling = 0; + int j; if (!cpu_online(cpu) || (newstate > DC_DISABLE) || (newstate == DC_RESV)) @@ -68,13 +67,10 @@ static int cpufreq_p4_setdc(unsigned int cpus_allowed = current->cpus_allowed; /* only run on CPU to be set, or on its sibling */ - affected_cpu_map = cpumask_of_cpu(cpu); -#ifdef CONFIG_X86_HT - hyperthreading = ((cpu_has_ht) && (smp_num_siblings == 2)); - if (hyperthreading) { - sibling = cpu_sibling_map[cpu]; - cpu_set(sibling, affected_cpu_map); - } +#ifdef CONFIG_SMP + affected_cpu_map = cpu_sibling_map[cpu]; +#else + affected_cpu_map = cpumask_of_cpu(cpu); #endif set_cpus_allowed(current, affected_cpu_map); BUG_ON(!cpu_isset(smp_processor_id(), affected_cpu_map)); @@ -97,11 +93,11 @@ static int cpufreq_p4_setdc(unsigned int /* notifiers */ freqs.old = stock_freq * l / 8; freqs.new = stock_freq * newstate / 8; - freqs.cpu = cpu; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - if (hyperthreading) { - freqs.cpu = sibling; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + for_each_cpu(j) { + if (cpu_isset(j, affected_cpu_map)) { + freqs.cpu = j; + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + } } rdmsr(MSR_IA32_THERM_STATUS, l, h); @@ -132,10 +128,11 @@ static int cpufreq_p4_setdc(unsigned int set_cpus_allowed(current, cpus_allowed); /* notifiers */ - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - if (hyperthreading) { - freqs.cpu = cpu; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + for_each_cpu(j) { + if (cpu_isset(j, affected_cpu_map)) { + freqs.cpu = j; + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + } } return 0; diff -purN -X /home/mbligh/.diff.exclude 160-lotsa_sds/arch/i386/kernel/io_apic.c 180-sched_domains/arch/i386/kernel/io_apic.c --- 160-lotsa_sds/arch/i386/kernel/io_apic.c 2004-03-11 14:33:36.000000000 -0800 +++ 180-sched_domains/arch/i386/kernel/io_apic.c 2004-03-12 11:07:47.000000000 -0800 @@ -317,8 +317,7 @@ struct irq_cpu_info { #define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask) -#define CPU_TO_PACKAGEINDEX(i) \ - ((physical_balance && i > cpu_sibling_map[i]) ? cpu_sibling_map[i] : i) +#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i])) #define MAX_BALANCED_IRQ_INTERVAL (5*HZ) #define MIN_BALANCED_IRQ_INTERVAL (HZ/2) @@ -401,6 +400,7 @@ static void do_irq_balance(void) unsigned long max_cpu_irq = 0, min_cpu_irq = (~0); unsigned long move_this_load = 0; int max_loaded = 0, min_loaded = 0; + int load; unsigned long useful_load_threshold = balanced_irq_interval + 10; int selected_irq; int tmp_loaded, first_attempt = 1; @@ -452,7 +452,7 @@ static void do_irq_balance(void) for (i = 0; i < NR_CPUS; i++) { if (!cpu_online(i)) continue; - if (physical_balance && i > cpu_sibling_map[i]) + if (i != CPU_TO_PACKAGEINDEX(i)) continue; if (min_cpu_irq > CPU_IRQ(i)) { min_cpu_irq = CPU_IRQ(i); @@ -471,7 +471,7 @@ tryanothercpu: for (i = 0; i < NR_CPUS; i++) { if (!cpu_online(i)) continue; - if (physical_balance && i > cpu_sibling_map[i]) + if (i != CPU_TO_PACKAGEINDEX(i)) continue; if (max_cpu_irq <= CPU_IRQ(i)) continue; @@ -551,9 +551,14 @@ tryanotherirq: * We seek the least loaded sibling by making the comparison * (A+B)/2 vs B */ - if (physical_balance && (CPU_IRQ(min_loaded) >> 1) > - CPU_IRQ(cpu_sibling_map[min_loaded])) - min_loaded = cpu_sibling_map[min_loaded]; + load = CPU_IRQ(min_loaded) >> 1; + for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) { + if (load > CPU_IRQ(j)) { + /* This won't change cpu_sibling_map[min_loaded] */ + load = CPU_IRQ(j); + min_loaded = j; + } + } cpus_and(allowed_mask, cpu_online_map, irq_affinity[selected_irq]); target_cpu_mask = cpumask_of_cpu(min_loaded); diff -purN -X /home/mbligh/.diff.exclude 160-lotsa_sds/arch/i386/kernel/smpboot.c 180-sched_domains/arch/i386/kernel/smpboot.c --- 160-lotsa_sds/arch/i386/kernel/smpboot.c 2004-03-11 14:33:36.000000000 -0800 +++ 180-sched_domains/arch/i386/kernel/smpboot.c 2004-03-12 11:07:47.000000000 -0800 @@ -39,6 +39,7 @@ #include <linux/kernel.h> #include <linux/mm.h> +#include <linux/sched.h> #include <linux/kernel_stat.h> #include <linux/smp_lock.h> #include <linux/irq.h> @@ -934,7 +935,7 @@ static int boot_cpu_logical_apicid; /* Where the IO area was mapped on multiquad, always 0 otherwise */ void *xquad_portio; -int cpu_sibling_map[NR_CPUS] __cacheline_aligned; +cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; static void __init smp_boot_cpus(unsigned int max_cpus) { @@ -953,6 +954,8 @@ static void __init smp_boot_cpus(unsigne current_thread_info()->cpu = 0; smp_tune_scheduling(); + cpus_clear(cpu_sibling_map[0]); + cpu_set(0, cpu_sibling_map[0]); /* * If we couldn't find an SMP configuration at boot time, @@ -1079,32 +1082,34 @@ static void __init smp_boot_cpus(unsigne Dprintk("Boot done.\n"); /* - * If Hyper-Threading is avaialble, construct cpu_sibling_map[], so - * that we can tell the sibling CPU efficiently. + * construct cpu_sibling_map[], so that we can tell sibling CPUs + * efficiently. */ - if (cpu_has_ht && smp_num_siblings > 1) { - for (cpu = 0; cpu < NR_CPUS; cpu++) - cpu_sibling_map[cpu] = NO_PROC_ID; - - for (cpu = 0; cpu < NR_CPUS; cpu++) { - int i; - if (!cpu_isset(cpu, cpu_callout_map)) - continue; + for (cpu = 0; cpu < NR_CPUS; cpu++) + cpus_clear(cpu_sibling_map[cpu]); + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + int siblings = 0; + int i; + if (!cpu_isset(cpu, cpu_callout_map)) + continue; + if (smp_num_siblings > 1) { for (i = 0; i < NR_CPUS; i++) { - if (i == cpu || !cpu_isset(i, cpu_callout_map)) + if (!cpu_isset(i, cpu_callout_map)) continue; if (phys_proc_id[cpu] == phys_proc_id[i]) { - cpu_sibling_map[cpu] = i; - printk("cpu_sibling_map[%d] = %d\n", cpu, cpu_sibling_map[cpu]); - break; + siblings++; + cpu_set(i, cpu_sibling_map[cpu]); } } - if (cpu_sibling_map[cpu] == NO_PROC_ID) { - smp_num_siblings = 1; - printk(KERN_WARNING "WARNING: No sibling found for CPU %d.\n", cpu); - } + } else { + siblings++; + cpu_set(cpu, cpu_sibling_map[cpu]); } + + if (siblings != smp_num_siblings) + printk(KERN_WARNING "WARNING: %d siblings found for CPU%d, should be %d\n", siblings, cpu, smp_num_siblings); } smpboot_setup_io_apic(); @@ -1118,6 +1123,216 @@ static void __init smp_boot_cpus(unsigne synchronize_tsc_bp(); } +#ifdef CONFIG_SCHED_SMT +#ifdef CONFIG_NUMA +static struct sched_group sched_group_cpus[NR_CPUS]; +static struct sched_group sched_group_phys[NR_CPUS]; +static struct sched_group sched_group_nodes[MAX_NUMNODES]; +static DEFINE_PER_CPU(struct sched_domain, phys_domains); +static DEFINE_PER_CPU(struct sched_domain, node_domains); +__init void arch_init_sched_domains(void) +{ + int i; + struct sched_group *first_cpu = NULL, *last_cpu = NULL; + + /* Set up domains */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_domain *phys_domain = &per_cpu(phys_domains, i); + struct sched_domain *node_domain = &per_cpu(node_domains, i); + int node = cpu_to_node(i); + cpumask_t nodemask = node_to_cpumask(node); + + *cpu_domain = SD_SIBLING_INIT; + cpu_domain->span = cpu_sibling_map[i]; + + *phys_domain = SD_CPU_INIT; + phys_domain->span = nodemask; + + *node_domain = SD_NODE_INIT; + node_domain->span = cpu_possible_map; + } + + /* Set up CPU (sibling) groups */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = cpu_sched_domain(i); + int j; + first_cpu = last_cpu = NULL; + + if (i != first_cpu(cpu_domain->span)) + continue; + + for_each_cpu_mask(j, cpu_domain->span) { + struct sched_group *cpu = &sched_group_cpus[j]; + + cpu->cpumask = CPU_MASK_NONE; + cpu_set(j, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + } + + for (i = 0; i < MAX_NUMNODES; i++) { + int j; + cpumask_t nodemask; + struct sched_group *node = &sched_group_nodes[i]; + cpus_and(nodemask, node_to_cpumask(i), cpu_possible_map); + + if (cpus_empty(nodemask)) + continue; + + first_cpu = last_cpu = NULL; + /* Set up physical groups */ + for_each_cpu_mask(j, nodemask) { + struct sched_domain *cpu_domain = cpu_sched_domain(j); + struct sched_group *cpu = &sched_group_phys[j]; + + if (j != first_cpu(cpu_domain->span)) + continue; + + cpu->cpumask = cpu_domain->span; + /* + * Make each extra sibling increase power by 10% of + * the basic CPU. This is very arbitrary. + */ + cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10; + node->cpu_power += cpu->cpu_power; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + } + + /* Set up nodes */ + first_cpu = last_cpu = NULL; + for (i = 0; i < MAX_NUMNODES; i++) { + struct sched_group *cpu = &sched_group_nodes[i]; + cpumask_t nodemask; + cpus_and(nodemask, node_to_cpumask(i), cpu_possible_map); + + if (cpus_empty(nodemask)) + continue; + + cpu->cpumask = nodemask; + /* ->cpu_power already setup */ + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + + mb(); + for_each_cpu(i) { + int node = cpu_to_node(i); + struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_domain *phys_domain = &per_cpu(phys_domains, i); + struct sched_domain *node_domain = &per_cpu(node_domains, i); + struct sched_group *cpu_group = &sched_group_cpus[i]; + struct sched_group *phys_group = &sched_group_phys[first_cpu(cpu_domain->span)]; + struct sched_group *node_group = &sched_group_nodes[node]; + + cpu_domain->parent = phys_domain; + phys_domain->parent = node_domain; + + node_domain->groups = node_group; + phys_domain->groups = phys_group; + cpu_domain->groups = cpu_group; + } +} +#else /* CONFIG_NUMA */ +static struct sched_group sched_group_cpus[NR_CPUS]; +static struct sched_group sched_group_phys[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_domain, phys_domains); +__init void arch_init_sched_domains(void) +{ + int i; + struct sched_group *first_cpu = NULL, *last_cpu = NULL; + + /* Set up domains */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_domain *phys_domain = &per_cpu(phys_domains, i); + + *cpu_domain = SD_SIBLING_INIT; + cpu_domain->span = cpu_sibling_map[i]; + + *phys_domain = SD_CPU_INIT; + phys_domain->span = cpu_possible_map; + } + + /* Set up CPU (sibling) groups */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = cpu_sched_domain(i); + int j; + first_cpu = last_cpu = NULL; + + if (i != first_cpu(cpu_domain->span)) + continue; + + for_each_cpu_mask(j, cpu_domain->span) { + struct sched_group *cpu = &sched_group_cpus[j]; + + cpus_clear(cpu->cpumask); + cpu_set(j, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + } + + first_cpu = last_cpu = NULL; + /* Set up physical groups */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_group *cpu = &sched_group_phys[i]; + + if (i != first_cpu(cpu_domain->span)) + continue; + + cpu->cpumask = cpu_domain->span; + /* See SMT+NUMA setup for comment */ + cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + + mb(); + for_each_cpu(i) { + struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_domain *phys_domain = &per_cpu(phys_domains, i); + struct sched_group *cpu_group = &sched_group_cpus[i]; + struct sched_group *phys_group = &sched_group_phys[first_cpu(cpu_domain->span)]; + cpu_domain->parent = phys_domain; + phys_domain->groups = phys_group; + cpu_domain->groups = cpu_group; + } +} +#endif /* CONFIG_NUMA */ +#endif /* CONFIG_SCHED_SMT */ + /* These are wrappers to interface to the new boot process. Someone who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */ void __init smp_prepare_cpus(unsigned int max_cpus) diff -purN -X /home/mbligh/.diff.exclude 160-lotsa_sds/arch/i386/oprofile/op_model_p4.c 180-sched_domains/arch/i386/oprofile/op_model_p4.c --- 160-lotsa_sds/arch/i386/oprofile/op_model_p4.c 2003-10-01 11:40:41.000000000 -0700 +++ 180-sched_domains/arch/i386/oprofile/op_model_p4.c 2004-03-12 11:07:47.000000000 -0800 @@ -382,11 +382,8 @@ static struct p4_event_binding p4_events static unsigned int get_stagger(void) { #ifdef CONFIG_SMP - int cpu; - if (smp_num_siblings > 1) { - cpu = smp_processor_id(); - return (cpu_sibling_map[cpu] > cpu) ? 0 : 1; - } + int cpu = smp_processor_id(); + return (cpu != first_cpu(cpu_sibling_map[cpu])); #endif return 0; } diff -purN -X /home/mbligh/.diff.exclude 160-lotsa_sds/include/asm-i386/processor.h 180-sched_domains/include/asm-i386/processor.h --- 160-lotsa_sds/include/asm-i386/processor.h 2004-03-12 11:07:27.000000000 -0800 +++ 180-sched_domains/include/asm-i386/processor.h 2004-03-12 11:07:47.000000000 -0800 @@ -660,4 +660,9 @@ extern inline void prefetchw(const void extern void select_idle_routine(const struct cpuinfo_x86 *c); +#ifdef CONFIG_SCHED_SMT +#define ARCH_HAS_SCHED_DOMAIN +#define ARCH_HAS_SCHED_WAKE_BALANCE +#endif + #endif /* __ASM_I386_PROCESSOR_H */ diff -purN -X /home/mbligh/.diff.exclude 160-lotsa_sds/include/asm-i386/smp.h 180-sched_domains/include/asm-i386/smp.h --- 160-lotsa_sds/include/asm-i386/smp.h 2004-03-11 14:35:15.000000000 -0800 +++ 180-sched_domains/include/asm-i386/smp.h 2004-03-12 11:07:47.000000000 -0800 @@ -34,7 +34,7 @@ extern void smp_alloc_memory(void); extern int pic_mode; extern int smp_num_siblings; -extern int cpu_sibling_map[]; +extern cpumask_t cpu_sibling_map[]; extern void smp_flush_tlb(void); extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs); diff -purN -X /home/mbligh/.diff.exclude 160-lotsa_sds/include/asm-ppc64/mmu_context.h 180-sched_domains/include/asm-ppc64/mmu_context.h --- 160-lotsa_sds/include/asm-ppc64/mmu_context.h 2004-03-11 14:35:23.000000000 -0800 +++ 180-sched_domains/include/asm-ppc64/mmu_context.h 2004-03-12 11:07:47.000000000 -0800 @@ -156,6 +156,8 @@ static inline void switch_mm(struct mm_s if (!cpu_isset(smp_processor_id(), next->cpu_vm_mask)) cpu_set(smp_processor_id(), next->cpu_vm_mask); + cpu_set(smp_processor_id(), next->cpu_vm_mask); + /* No need to flush userspace segments if the mm doesnt change */ if (prev == next) return; diff -purN -X /home/mbligh/.diff.exclude 160-lotsa_sds/include/linux/sched.h 180-sched_domains/include/linux/sched.h --- 160-lotsa_sds/include/linux/sched.h 2004-03-11 14:35:34.000000000 -0800 +++ 180-sched_domains/include/linux/sched.h 2004-03-12 11:07:47.000000000 -0800 @@ -147,6 +147,7 @@ extern spinlock_t mmlist_lock; typedef struct task_struct task_t; extern void sched_init(void); +extern void sched_init_smp(void); extern void init_idle(task_t *idle, int cpu); extern void show_state(void); @@ -529,6 +530,103 @@ do { if (atomic_dec_and_test(&(tsk)->usa #define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */ #ifdef CONFIG_SMP +#define SCHED_LOAD_SHIFT 7 /* increase resolution of load calculations */ +#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT) + +#define SD_FLAG_NEWIDLE 1 /* Balance when about to become idle */ +#define SD_FLAG_EXEC 2 /* Balance on exec */ +#define SD_FLAG_WAKE 4 /* Balance on task wakeup */ +#define SD_FLAG_FASTMIGRATE 8 /* Sync wakes put task on waking CPU */ + +struct sched_group { + struct sched_group *next; /* Must be a circular list */ + cpumask_t cpumask; + + /* + * CPU power of this group, SCHED_LOAD_SCALE being max power for a + * single CPU. This should be read only (except for setup). Although + * it will need to be written to at cpu hot(un)plug time, perhaps the + * cpucontrol semaphore will provide enough exclusion? + */ + unsigned long cpu_power; +}; + +struct sched_domain { + /* These fields must be setup */ + struct sched_domain *parent; /* top domain must be null terminated */ + struct sched_group *groups; /* the balancing groups of the domain */ + cpumask_t span; /* span of all CPUs in this domain */ + unsigned long min_interval; /* Minimum balance interval ms */ + unsigned long max_interval; /* Maximum balance interval ms */ + unsigned int busy_factor; /* less balancing by factor if busy */ + unsigned int imbalance_pct; /* No balance until over watermark */ + unsigned long long cache_hot_time; /* Task considered cache hot (ns) */ + unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ + int flags; /* See SD_FLAG_* */ + + /* Runtime fields. */ + unsigned long last_balance; /* init to jiffies. units in jiffies */ + unsigned int balance_interval; /* initialise to 1. units in ms. */ + unsigned int nr_balance_failed; /* initialise to 0 */ +}; + +/* Common values for SMT siblings */ +#define SD_SIBLING_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 1, \ + .max_interval = 2, \ + .busy_factor = 8, \ + .imbalance_pct = 110, \ + .cache_hot_time = 0, \ + .cache_nice_tries = 0, \ + .flags = SD_FLAG_FASTMIGRATE | SD_FLAG_NEWIDLE | SD_FLAG_WAKE,\ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ +} + +/* Common values for CPUs */ +#define SD_CPU_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 1, \ + .max_interval = 8, \ + .busy_factor = 32, \ + .imbalance_pct = 125, \ + .cache_hot_time = (5*1000000), \ + .cache_nice_tries = 2, \ + .flags = SD_FLAG_FASTMIGRATE | SD_FLAG_NEWIDLE,\ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ +} + +#ifdef CONFIG_NUMA +/* Common values for NUMA nodes */ +#define SD_NODE_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 20, \ + .max_interval = 1000*fls(num_online_cpus()),\ + .busy_factor = 4, \ + .imbalance_pct = 125, \ + .cache_hot_time = (5*1000000), \ + .cache_nice_tries = 1, \ + .flags = SD_FLAG_EXEC, \ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ +} +#endif + +DECLARE_PER_CPU(struct sched_domain, base_domains); +#define cpu_sched_domain(cpu) (&per_cpu(base_domains, (cpu))) +#define this_sched_domain() (&__get_cpu_var(base_domains)) + extern int set_cpus_allowed(task_t *p, cpumask_t new_mask); #else static inline int set_cpus_allowed(task_t *p, cpumask_t new_mask) @@ -541,10 +639,8 @@ extern unsigned long long sched_clock(vo #ifdef CONFIG_NUMA extern void sched_balance_exec(void); -extern void node_nr_running_init(void); #else #define sched_balance_exec() {} -#define node_nr_running_init() {} #endif extern void set_user_nice(task_t *p, long nice); diff -purN -X /home/mbligh/.diff.exclude 160-lotsa_sds/init/main.c 180-sched_domains/init/main.c --- 160-lotsa_sds/init/main.c 2004-03-11 14:35:37.000000000 -0800 +++ 180-sched_domains/init/main.c 2004-03-12 11:07:47.000000000 -0800 @@ -568,7 +568,6 @@ static void do_pre_smp_initcalls(void) migration_init(); #endif - node_nr_running_init(); spawn_ksoftirqd(); } @@ -599,6 +598,7 @@ static int init(void * unused) do_pre_smp_initcalls(); smp_init(); + sched_init_smp(); do_basic_setup(); prepare_namespace(); diff -purN -X /home/mbligh/.diff.exclude 160-lotsa_sds/kernel/sched.c 180-sched_domains/kernel/sched.c --- 160-lotsa_sds/kernel/sched.c 2004-03-12 11:06:24.000000000 -0800 +++ 180-sched_domains/kernel/sched.c 2004-03-12 11:07:47.000000000 -0800 @@ -91,7 +91,6 @@ #define MAX_SLEEP_AVG (AVG_TIMESLICE * MAX_BONUS) #define STARVATION_LIMIT (MAX_SLEEP_AVG) #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) -#define NODE_THRESHOLD 125 #define CREDIT_LIMIT 100 /* @@ -187,7 +186,7 @@ static inline unsigned int task_timeslic typedef struct runqueue runqueue_t; struct prio_array { - int nr_active; + unsigned int nr_active; unsigned long bitmap[BITMAP_SIZE]; struct list_head queue[MAX_PRIO]; }; @@ -202,23 +201,33 @@ struct prio_array { struct runqueue { spinlock_t lock; unsigned long nr_running, nr_switches, expired_timestamp, - nr_uninterruptible, timestamp_last_tick; + nr_uninterruptible; + unsigned long long timestamp_last_tick; task_t *curr, *idle; struct mm_struct *prev_mm; prio_array_t *active, *expired, arrays[2]; - int best_expired_prio, prev_cpu_load[NR_CPUS]; -#ifdef CONFIG_NUMA - atomic_t *node_nr_running; - int prev_node_load[MAX_NUMNODES]; + int best_expired_prio; + + atomic_t nr_iowait; + +#ifdef CONFIG_SMP + unsigned long cpu_load[NR_CPUS]; #endif + /* For active balancing */ + int active_balance; + int push_cpu; + task_t *migration_thread; struct list_head migration_queue; - - atomic_t nr_iowait; }; static DEFINE_PER_CPU(struct runqueue, runqueues); +#ifdef CONFIG_SMP +/* Mandatory scheduling domains */ +DEFINE_PER_CPU(struct sched_domain, base_domains); +#endif + #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() (&__get_cpu_var(runqueues)) #define task_rq(p) cpu_rq(task_cpu(p)) @@ -233,51 +242,16 @@ static DEFINE_PER_CPU(struct runqueue, r # define task_running(rq, p) ((rq)->curr == (p)) #endif -#ifdef CONFIG_NUMA - -/* - * Keep track of running tasks. - */ - -static atomic_t node_nr_running[MAX_NUMNODES] ____cacheline_maxaligned_in_smp = - {[0 ...MAX_NUMNODES-1] = ATOMIC_INIT(0)}; - -static inline void nr_running_init(struct runqueue *rq) -{ - rq->node_nr_running = &node_nr_running[0]; -} - static inline void nr_running_inc(runqueue_t *rq) { - atomic_inc(rq->node_nr_running); rq->nr_running++; } static inline void nr_running_dec(runqueue_t *rq) { - atomic_dec(rq->node_nr_running); rq->nr_running--; } -__init void node_nr_running_init(void) -{ - int i; - - for (i = 0; i < NR_CPUS; i++) { - if (cpu_possible(i)) - cpu_rq(i)->node_nr_running = - &node_nr_running[cpu_to_node(i)]; - } -} - -#else /* !CONFIG_NUMA */ - -# define nr_running_init(rq) do { } while (0) -# define nr_running_inc(rq) do { (rq)->nr_running++; } while (0) -# define nr_running_dec(rq) do { (rq)->nr_running--; } while (0) - -#endif /* CONFIG_NUMA */ - /* * task_rq_lock - lock the runqueue a given task resides on and disable * interrupts. Note the ordering: we can safely lookup the task_rq without @@ -545,37 +519,30 @@ inline int task_curr(task_t *p) typedef struct { struct list_head list; task_t *task; + int dest_cpu; struct completion done; } migration_req_t; /* - * The task's runqueue lock must be held, and the new mask must be valid. + * The task's runqueue lock must be held. * Returns true if you have to wait for migration thread. */ -static int __set_cpus_allowed(task_t *p, cpumask_t new_mask, - migration_req_t *req) +static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) { runqueue_t *rq = task_rq(p); - p->cpus_allowed = new_mask; - /* - * Can the task run on the task's current CPU? If not then - * migrate the thread off to a proper CPU. - */ - if (cpu_isset(task_cpu(p), new_mask)) - return 0; - /* * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. */ if (!p->array && !task_running(rq, p)) { - set_task_cpu(p, any_online_cpu(p->cpus_allowed)); + set_task_cpu(p, dest_cpu); return 0; } init_completion(&req->done); req->task = p; + req->dest_cpu = dest_cpu; list_add(&req->list, &rq->migration_queue); return 1; } @@ -636,7 +603,76 @@ void kick_process(task_t *p) } EXPORT_SYMBOL_GPL(kick_process); +/* + * Return a low guess at the load of cpu. Update previous history if update + * is true + */ +static inline unsigned long get_low_cpu_load(int cpu, int update) +{ + runqueue_t *rq = cpu_rq(cpu); + runqueue_t *this_rq = this_rq(); + unsigned long nr = rq->nr_running << SCHED_LOAD_SHIFT; + unsigned long load = this_rq->cpu_load[cpu]; + unsigned long ret = min(nr, load); + + if (update) + this_rq->cpu_load[cpu] = (nr + load) / 2; + + return ret; +} + +static inline unsigned long get_high_cpu_load(int cpu, int update) +{ + runqueue_t *rq = cpu_rq(cpu); + runqueue_t *this_rq = this_rq(); + unsigned long nr = rq->nr_running << SCHED_LOAD_SHIFT; + unsigned long load = this_rq->cpu_load[cpu]; + unsigned long ret = max(nr, load); + + if (update) + this_rq->cpu_load[cpu] = (nr + load) / 2; + + return ret; +} + +#endif + +/* + * sched_balance_wake can be used with SMT architectures to wake a + * task onto an idle sibling if cpu is not idle. Returns cpu if + * cpu is idle or no siblings are idle, otherwise returns an idle + * sibling. + */ +#if defined(CONFIG_SMP) && defined(ARCH_HAS_SCHED_WAKE_BALANCE) +static int sched_balance_wake(int cpu, task_t *p) +{ + cpumask_t tmp; + struct sched_domain *domain; + int i; + + if (idle_cpu(cpu)) + return cpu; + + domain = cpu_sched_domain(cpu); + if (!(domain->flags & SD_FLAG_WAKE)) + return cpu; + + cpus_and(tmp, domain->span, cpu_online_map); + for_each_cpu_mask(i, tmp) { + if (!cpu_isset(i, p->cpus_allowed)) + continue; + if (idle_cpu(i)) + return i; + } + + return cpu; +} +#else +static inline int sched_balance_wake(int cpu, task_t *p) +{ + return cpu; +} #endif /*** @@ -659,44 +695,108 @@ static int try_to_wake_up(task_t * p, un int success = 0; long old_state; runqueue_t *rq; + int cpu, this_cpu; +#ifdef CONFIG_SMP + unsigned long long now; + unsigned long load, this_load; + int new_cpu; + struct sched_domain *sd; + runqueue_t *this_rq; +#endif -repeat_lock_task: rq = task_rq_lock(p, &flags); old_state = p->state; - if (old_state & state) { - if (!p->array) { - /* - * Fast-migrate the task if it's not running or runnable - * currently. Do not violate hard affinity. - */ - if (unlikely(sync && !task_running(rq, p) && - (task_cpu(p) != smp_processor_id()) && - cpu_isset(smp_processor_id(), - p->cpus_allowed))) { - - set_task_cpu(p, smp_processor_id()); - task_rq_unlock(rq, &flags); - goto repeat_lock_task; - } - if (old_state == TASK_UNINTERRUPTIBLE) { - rq->nr_uninterruptible--; - /* - * Tasks on involuntary sleep don't earn - * sleep_avg beyond just interactive state. - */ - p->activated = -1; - } - if (sync && (task_cpu(p) == smp_processor_id())) - __activate_task(p, rq); - else { - activate_task(p, rq); - if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); - } - success = 1; + if (!(old_state & state)) + goto out; + + if (p->array) + goto out_running; + + this_cpu = smp_processor_id(); + cpu = task_cpu(p); + +#ifdef CONFIG_SMP + if (cpu == this_cpu) + goto out_activate; + + if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed) + || task_running(rq, p))) + goto out_activate; + + /* Passive load balancing */ + load = get_low_cpu_load(cpu, 1); + this_load = get_high_cpu_load(this_cpu, 1) + SCHED_LOAD_SCALE; + if (load > this_load) { + new_cpu = sched_balance_wake(this_cpu, p); + set_task_cpu(p, new_cpu); + goto repeat_lock_task; + } + + this_rq = this_rq(); + now = sched_clock(); + sd = cpu_sched_domain(this_cpu); + + /* + * Fast-migrate the task if it's not running or + * runnable currently. Do not violate hard affinity. + */ + do { + if (!(sd->flags & SD_FLAG_FASTMIGRATE)) + break; + if (now - p->timestamp < sd->cache_hot_time) + break; + + if (cpu_isset(cpu, sd->span)) { + new_cpu = sched_balance_wake(this_cpu, p); + set_task_cpu(p, new_cpu); + goto repeat_lock_task; } - p->state = TASK_RUNNING; + sd = sd->parent; + } while (sd); + + new_cpu = sched_balance_wake(cpu, p); + if (new_cpu != cpu) { + set_task_cpu(p, new_cpu); + goto repeat_lock_task; + } + goto out_activate; + +repeat_lock_task: + task_rq_unlock(rq, &flags); + rq = task_rq_lock(p, &flags); + old_state = p->state; + if (!(old_state & state)) + goto out; + + if (p->array) + goto out_running; + + this_cpu = smp_processor_id(); + cpu = task_cpu(p); + +out_activate: +#endif /* CONFIG_SMP */ + if (old_state == TASK_UNINTERRUPTIBLE) { + rq->nr_uninterruptible--; + /* + * Tasks on involuntary sleep don't earn + * sleep_avg beyond just interactive state. + */ + p->activated = -1; + } + + if (sync && cpu == this_cpu) { + __activate_task(p, rq); + } else { + activate_task(p, rq); + if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); } + success = 1; + +out_running: + p->state = TASK_RUNNING; +out: task_rq_unlock(rq, &flags); return success; @@ -755,8 +855,8 @@ void fastcall sched_fork(task_t *p) p->timestamp = sched_clock(); if (!current->time_slice) { /* - * This case is rare, it happens when the parent has only - * a single jiffy left from its timeslice. Taking the + * This case is rare, it happens when the parent has only + * a single jiffy left from its timeslice. Taking the * runqueue lock is not a problem. */ current->time_slice = 1; @@ -872,7 +972,7 @@ static inline void finish_task_switch(ta * still held, otherwise prev could be scheduled on another cpu, die * there before we look at prev->state, and then the reference would * be dropped twice. - * Manfred Spraul <manfred@colorfullife.com> + * Manfred Spraul <manfred@colorfullife.com> */ prev_task_flags = prev->flags; finish_arch_switch(rq, prev); @@ -934,7 +1034,7 @@ unsigned long nr_running(void) { unsigned long i, sum = 0; - for (i = 0; i < NR_CPUS; i++) + for_each_cpu(i) sum += cpu_rq(i)->nr_running; return sum; @@ -1004,6 +1104,14 @@ static inline void double_rq_unlock(runq spin_unlock(&rq2->lock); } +enum idle_type +{ + IDLE, + NOT_IDLE, + NEWLY_IDLE, +}; + +#ifdef CONFIG_SMP #ifdef CONFIG_NUMA /* * If dest_cpu is allowed for this process, migrate the task to it. @@ -1016,28 +1124,18 @@ static void sched_migrate_task(task_t *p runqueue_t *rq; migration_req_t req; unsigned long flags; - cpumask_t old_mask, new_mask = cpumask_of_cpu(dest_cpu); rq = task_rq_lock(p, &flags); - old_mask = p->cpus_allowed; - if (!cpu_isset(dest_cpu, old_mask) || !cpu_online(dest_cpu)) + if (!cpu_isset(dest_cpu, p->cpus_allowed)) goto out; /* force the process onto the specified CPU */ - if (__set_cpus_allowed(p, new_mask, &req)) { + if (migrate_task(p, dest_cpu, &req)) { /* Need to wait for migration thread. */ task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); wait_for_completion(&req.done); - - /* If we raced with sys_sched_setaffinity, don't - * restore mask. */ - rq = task_rq_lock(p, &flags); - if (likely(cpus_equal(p->cpus_allowed, new_mask))) { - /* Restore old mask: won't need migration - * thread, since current cpu is allowed. */ - BUG_ON(__set_cpus_allowed(p, old_mask, NULL)); - } + return; } out: task_rq_unlock(rq, &flags); @@ -1047,215 +1145,81 @@ out: * Find the least loaded CPU. Slightly favor the current CPU by * setting its runqueue length as the minimum to start. */ -static int sched_best_cpu(struct task_struct *p) +static int sched_best_cpu(struct task_struct *p, struct sched_domain *domain) { - int i, minload, load, best_cpu, node = 0; - cpumask_t cpumask; + cpumask_t tmp; + int i, min_load, this_cpu, best_cpu; - best_cpu = task_cpu(p); - if (cpu_rq(best_cpu)->nr_running <= 2) - return best_cpu; + best_cpu = this_cpu = task_cpu(p); + min_load = INT_MAX; - minload = 10000000; - for_each_node_with_cpus(i) { - /* - * Node load is always divided by nr_cpus_node to normalise - * load values in case cpu count differs from node to node. - * We first multiply node_nr_running by 10 to get a little - * better resolution. - */ - load = 10 * atomic_read(&node_nr_running[i]) / nr_cpus_node(i); - if (load < minload) { - minload = load; - node = i; - } - } + cpus_and(tmp, domain->span, cpu_online_map); + for_each_cpu_mask(i, tmp) { + unsigned long load; + if (i == this_cpu) + load = get_low_cpu_load(i, 0); + else + load = get_high_cpu_load(i, 0) + SCHED_LOAD_SCALE; - minload = 10000000; - cpumask = node_to_cpumask(node); - for (i = 0; i < NR_CPUS; ++i) { - if (!cpu_isset(i, cpumask)) - continue; - if (cpu_rq(i)->nr_running < minload) { + if (min_load > load) { best_cpu = i; - minload = cpu_rq(i)->nr_running; + min_load = load; } + } return best_cpu; } void sched_balance_exec(void) { + struct sched_domain *domain = this_sched_domain(); int new_cpu; + int this_cpu = smp_processor_id(); + if (numnodes == 1) + return; - if (numnodes > 1) { - new_cpu = sched_best_cpu(current); - if (new_cpu != smp_processor_id()) - sched_migrate_task(current, new_cpu); - } -} + if (this_rq()->nr_running <= 1) + return; -/* - * Find the busiest node. All previous node loads contribute with a - * geometrically deccaying weight to the load measure: - * load_{t} = load_{t-1}/2 + nr_node_running_{t} - * This way sudden load peaks are flattened out a bit. - * Node load is divided by nr_cpus_node() in order to compare nodes - * of different cpu count but also [first] multiplied by 10 to - * provide better resolution. - */ -static int find_busiest_node(int this_node) -{ - int i, node = -1, load, this_load, maxload; - - if (!nr_cpus_node(this_node)) - return node; - this_load = maxload = (this_rq()->prev_node_load[this_node] >> 1) - + (10 * atomic_read(&node_nr_running[this_node]) - / nr_cpus_node(this_node)); - this_rq()->prev_node_load[this_node] = this_load; - for_each_node_with_cpus(i) { - if (i == this_node) - continue; - load = (this_rq()->prev_node_load[i] >> 1) - + (10 * atomic_read(&node_nr_running[i]) - / nr_cpus_node(i)); - this_rq()->prev_node_load[i] = load; - if (load > maxload && (100*load > NODE_THRESHOLD*this_load)) { - maxload = load; - node = i; - } + while (domain->parent && !(domain->flags & SD_FLAG_EXEC)) + domain = domain->parent; + + if (domain->flags & SD_FLAG_EXEC) { + new_cpu = sched_best_cpu(current, domain); + if (new_cpu != this_cpu) + sched_migrate_task(current, new_cpu); } - return node; } - #endif /* CONFIG_NUMA */ -#ifdef CONFIG_SMP - /* - * double_lock_balance - lock the busiest runqueue - * - * this_rq is locked already. Recalculate nr_running if we have to - * drop the runqueue lock. + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. */ -static inline -unsigned int double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest, - int this_cpu, int idle, - unsigned int nr_running) +static inline void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) { if (unlikely(!spin_trylock(&busiest->lock))) { if (busiest < this_rq) { spin_unlock(&this_rq->lock); spin_lock(&busiest->lock); spin_lock(&this_rq->lock); - /* Need to recalculate nr_running */ - if (idle || (this_rq->nr_running > - this_rq->prev_cpu_load[this_cpu])) - nr_running = this_rq->nr_running; - else - nr_running = this_rq->prev_cpu_load[this_cpu]; } else spin_lock(&busiest->lock); } - return nr_running; -} - -/* - * find_busiest_queue - find the busiest runqueue among the cpus in cpumask. - */ -static inline -runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle, - int *imbalance, cpumask_t cpumask) -{ - int nr_running, load, max_load, i; - runqueue_t *busiest, *rq_src; - - /* - * We search all runqueues to find the most busy one. - * We do this lockless to reduce cache-bouncing overhead, - * we re-check the 'best' source CPU later on again, with - * the lock held. - * - * We fend off statistical fluctuations in runqueue lengths by - * saving the runqueue length (as seen by the balancing CPU) during - * the previous load-balancing operation and using the smaller one - * of the current and saved lengths. If a runqueue is long enough - * for a longer amount of time then we recognize it and pull tasks - * from it. - * - * The 'current runqueue length' is a statistical maximum variable, - * for that one we take the longer one - to avoid fluctuations in - * the other direction. So for a load-balance to happen it needs - * stable long runqueue on the target CPU and stable short runqueue - * on the local runqueue. - * - * We make an exception if this CPU is about to become idle - in - * that case we are less picky about moving a task across CPUs and - * take what can be taken. - */ - if (idle || (this_rq->nr_running > this_rq->prev_cpu_load[this_cpu])) - nr_running = this_rq->nr_running; - else - nr_running = this_rq->prev_cpu_load[this_cpu]; - - busiest = NULL; - max_load = 1; - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_isset(i, cpumask)) - continue; - - rq_src = cpu_rq(i); - if (idle || (rq_src->nr_running < this_rq->prev_cpu_load[i])) - load = rq_src->nr_running; - else - load = this_rq->prev_cpu_load[i]; - this_rq->prev_cpu_load[i] = rq_src->nr_running; - - if ((load > max_load) && (rq_src != this_rq)) { - busiest = rq_src; - max_load = load; - } - } - - if (likely(!busiest)) - goto out; - - *imbalance = max_load - nr_running; - - /* It needs an at least ~25% imbalance to trigger balancing. */ - if (!idle && ((*imbalance)*4 < max_load)) { - busiest = NULL; - goto out; - } - - nr_running = double_lock_balance(this_rq, busiest, this_cpu, - idle, nr_running); - /* - * Make sure nothing changed since we checked the - * runqueue length. - */ - if (busiest->nr_running <= nr_running) { - spin_unlock(&busiest->lock); - busiest = NULL; - } -out: - return busiest; } /* * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. */ -static inline -void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, - runqueue_t *this_rq, int this_cpu) +static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, + task_t *p, runqueue_t *this_rq, prio_array_t *this_array, + int this_cpu) { dequeue_task(p, src_array); nr_running_dec(src_rq); set_task_cpu(p, this_cpu); nr_running_inc(this_rq); - enqueue_task(p, this_rq->active); + enqueue_task(p, this_array); p->timestamp = sched_clock() - (src_rq->timestamp_last_tick - p->timestamp); /* @@ -1263,69 +1227,71 @@ void pull_task(runqueue_t *src_rq, prio_ * to be always true for them. */ if (TASK_PREEMPTS_CURR(p, this_rq)) - set_need_resched(); + resched_task(this_rq->curr); } /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static inline -int can_migrate_task(task_t *tsk, runqueue_t *rq, int this_cpu, int idle) +int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, + struct sched_domain *domain, enum idle_type idle) { - unsigned long delta = rq->timestamp_last_tick - tsk->timestamp; - /* * We do not migrate tasks that are: * 1) running (obviously), or * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. */ - if (task_running(rq, tsk)) - return 0; - if (!cpu_isset(this_cpu, tsk->cpus_allowed)) + if (task_running(rq, p)) return 0; - if (!idle && (delta <= JIFFIES_TO_NS(cache_decay_ticks))) + if (!cpu_isset(this_cpu, p->cpus_allowed)) return 0; + + /* Aggressive migration if we've failed balancing */ + if (idle == NEWLY_IDLE || + domain->nr_balance_failed < domain->cache_nice_tries) { + if ((rq->timestamp_last_tick - p->timestamp) + < domain->cache_hot_time) + return 0; + } + return 1; } /* - * Current runqueue is empty, or rebalance tick: if there is an - * inbalance (current runqueue is too short) then pull from - * busiest runqueue(s). - * - * We call this with the current runqueue locked, - * irqs disabled. - */ -static void load_balance(runqueue_t *this_rq, int idle, cpumask_t cpumask) + * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, + * as part of a balancing operation within "domain". Returns the number of + * tasks moved. + * + * Called with both runqueues locked. + */ +static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, + unsigned long max_nr_move, struct sched_domain *domain, + enum idle_type idle) { - int imbalance, idx, this_cpu = smp_processor_id(); - runqueue_t *busiest; - prio_array_t *array; + int idx; + int pulled = 0; + prio_array_t *array, *dst_array; struct list_head *head, *curr; task_t *tmp; - busiest = find_busiest_queue(this_rq, this_cpu, idle, - &imbalance, cpumask); - if (!busiest) + if (max_nr_move <= 0 || busiest->nr_running <= 1) goto out; /* - * We only want to steal a number of tasks equal to 1/2 the imbalance, - * otherwise we'll just shift the imbalance to the new queue: - */ - imbalance /= 2; - - /* * We first consider expired tasks. Those will likely not be * executed in the near future, and they are most likely to * be cache-cold, thus switching CPUs has the least effect * on them. */ - if (busiest->expired->nr_active) + if (busiest->expired->nr_active) { array = busiest->expired; - else + dst_array = this_rq->expired; + } else { array = busiest->active; + dst_array = this_rq->active; + } new_array: /* Start searching at priority 0: */ @@ -1338,9 +1304,10 @@ skip_bitmap: if (idx >= MAX_PRIO) { if (array == busiest->expired) { array = busiest->active; + dst_array = this_rq->active; goto new_array; } - goto out_unlock; + goto out; } head = array->queue + idx; @@ -1350,100 +1317,445 @@ skip_queue: curr = curr->prev; - if (!can_migrate_task(tmp, busiest, this_cpu, idle)) { + if (!can_migrate_task(tmp, busiest, this_cpu, domain, idle)) { if (curr != head) goto skip_queue; idx++; goto skip_bitmap; } - pull_task(busiest, array, tmp, this_rq, this_cpu); + pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); + pulled++; - /* Only migrate one task if we are idle */ - if (!idle && --imbalance) { + /* We only want to steal up to the prescribed number of tasks. */ + if (pulled < max_nr_move) { if (curr != head) goto skip_queue; idx++; goto skip_bitmap; } -out_unlock: - spin_unlock(&busiest->lock); out: - ; + return pulled; } /* - * One of the idle_cpu_tick() and busy_cpu_tick() functions will - * get called every timer tick, on every CPU. Our balancing action - * frequency and balancing agressivity depends on whether the CPU is - * idle or not. + * find_busiest_group finds and returns the busiest CPU group within the + * domain. It calculates and returns the number of tasks which should be + * moved to restore balance via the imbalance parameter. + */ +static struct sched_group * +find_busiest_group(struct sched_domain *domain, int this_cpu, + unsigned long *imbalance, enum idle_type idle) +{ + unsigned long max_load, avg_load, total_load, this_load; + unsigned int total_pwr; + int modify; + struct sched_group *busiest = NULL, *this = NULL, *group = domain->groups; + + max_load = 0; + this_load = 0; + total_load = 0; + total_pwr = 0; + + if (group == NULL) + goto out_balanced; + + /* + * Don't modify when we newly become idle because that ruins our + * statistics: its triggered by some value of nr_running (ie. 0). + * Timer based balancing is a good statistic though. + */ + if (idle == NEWLY_IDLE) + modify = 0; + else + modify = 1; + + do { + cpumask_t tmp; + unsigned long load; + int local_group; + int i, nr_cpus = 0; + + local_group = cpu_isset(this_cpu, group->cpumask); + + /* Tally up the load of all CPUs in the group */ + avg_load = 0; + cpus_and(tmp, group->cpumask, cpu_online_map); + for_each_cpu_mask(i, tmp) { + /* Bias balancing toward cpus of our domain */ + if (local_group) { + load = get_high_cpu_load(i, modify); + } else + load = get_low_cpu_load(i, modify); + + nr_cpus++; + avg_load += load; + } + + if (!nr_cpus) + goto nextgroup; + + total_load += avg_load; + total_pwr += group->cpu_power; + + /* Adjust by relative CPU power of the group */ + avg_load = (avg_load << SCHED_LOAD_SHIFT) / group->cpu_power; + + if (local_group) { + this_load = avg_load; + this = group; + goto nextgroup; + } + if (avg_load > max_load) { + max_load = avg_load; + busiest = group; + } +nextgroup: + group = group->next; + } while (group != domain->groups); + + if (!busiest || this_load >= max_load) + goto out_balanced; + + avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; + + if (idle == NOT_IDLE) { + if (this_load >= avg_load || + 100*max_load <= domain->imbalance_pct*this_load) + goto out_balanced; + } + + /* + * We're trying to get all the cpus to the average_load, so we don't + * want to push ourselves above the average load, nor do we wish to + * reduce the max loaded cpu below the average load, as either of these + * actions would just result in more rebalancing later, and ping-pong + * tasks around. Thus we look for the minimum possible imbalance. + * Negative imbalances (*we* are more loaded than anyone else) will + * be counted as no imbalance for these purposes -- we can't fix that + * by pulling tasks to us. Be careful of negative numbers as they'll + * appear as very large values with unsigned longs. + */ + *imbalance = (min(max_load - avg_load, avg_load - this_load) + 1) / 2; + + if (*imbalance <= SCHED_LOAD_SCALE/2) { + unsigned long pwr_now = 0, pwr_move = 0; + unsigned long tmp; + + /* + * OK, we don't have enough imbalance to justify moving tasks, + * however we may be able to increase total CPU power used by + * moving them. + */ + + pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); + pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); + pwr_now >>= SCHED_LOAD_SHIFT; + + /* Amount of load we'd subtract */ + tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; + if (max_load > tmp) + pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, + max_load - tmp); + + /* Amount of load we'd add */ + tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; + pwr_move += this->cpu_power*min(this->cpu_power, this_load + tmp); + pwr_move >>= SCHED_LOAD_SHIFT; + + /* Move if we gain another 8th of a CPU worth of throughput */ + if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8) + goto out_balanced; + *imbalance = 1; + return busiest; + } + + /* How many tasks to actually move to equalise the imbalance */ + *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power)) + >> SCHED_LOAD_SHIFT; + /* Get rid of the scaling factor, rounding *up* as we divide */ + *imbalance = (*imbalance + SCHED_LOAD_SCALE/2) >> SCHED_LOAD_SHIFT; + + return busiest; + +out_balanced: + *imbalance = 0; + return NULL; +} + +/* + * find_busiest_queue - find the busiest runqueue among the cpus in group. + */ +static runqueue_t *find_busiest_queue(struct sched_group *group) +{ + cpumask_t tmp; + int i; + unsigned long max_load = 0; + runqueue_t *busiest = NULL; + + cpus_and(tmp, group->cpumask, cpu_online_map); + for_each_cpu_mask(i, tmp) { + unsigned long load; + + load = get_low_cpu_load(i, 0); + + if (load >= max_load) { + max_load = load; + busiest = cpu_rq(i); + } + } + + return busiest; +} + +/* + * Check this_cpu to ensure it is balanced within domain. Attempt to move + * tasks if there is an imbalance. * - * busy-rebalance every 200 msecs. idle-rebalance every 1 msec. (or on - * systems with HZ=100, every 10 msecs.) + * Called with this_rq unlocked. + */ +static int load_balance(int this_cpu, runqueue_t *this_rq, + struct sched_domain *domain, enum idle_type idle) +{ + struct sched_group *group; + runqueue_t *busiest = NULL; + unsigned long imbalance; + int balanced = 0, failed = 0; + int nr_moved = 0; + + spin_lock(&this_rq->lock); + + group = find_busiest_group(domain, this_cpu, &imbalance, idle); + if (!group) { + balanced = 1; + goto out; + } + + busiest = find_busiest_queue(group); + if (!busiest || busiest == this_rq) { + balanced = 1; + goto out; + } + + /* Attempt to move tasks */ + double_lock_balance(this_rq, busiest); + + nr_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, domain, idle); + spin_unlock(&busiest->lock); +out: + spin_unlock(&this_rq->lock); + + if (!balanced && nr_moved == 0) + failed = 1; + + if (failed && busiest && + domain->nr_balance_failed > domain->cache_nice_tries) { + int wake = 0; + + spin_lock(&busiest->lock); + if (!busiest->active_balance) { + busiest->active_balance = 1; + busiest->push_cpu = this_cpu; + wake = 1; + } + spin_unlock(&busiest->lock); + if (wake) + wake_up_process(busiest->migration_thread); + } + + if (failed) + domain->nr_balance_failed++; + else + domain->nr_balance_failed = 0; + + if (balanced) { + if (domain->balance_interval < domain->max_interval) + domain->balance_interval *= 2; + } else { + domain->balance_interval = domain->min_interval; + } + + return nr_moved; +} + +/* + * Check this_cpu to ensure it is balanced within domain. Attempt to move + * tasks if there is an imbalance. * - * On NUMA, do a node-rebalance every 400 msecs. + * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). + * this_rq is locked. */ -#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) -#define BUSY_REBALANCE_TICK (HZ/5 ?: 1) -#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 5) -#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2) +static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, + struct sched_domain *domain) +{ + struct sched_group *group; + runqueue_t *busiest = NULL; + unsigned long imbalance; + int nr_moved = 0; -#ifdef CONFIG_NUMA -static void balance_node(runqueue_t *this_rq, int idle, int this_cpu) + group = find_busiest_group(domain, this_cpu, &imbalance, NEWLY_IDLE); + if (!group) + goto out; + + busiest = find_busiest_queue(group); + if (!busiest || busiest == this_rq) + goto out; + + /* Attempt to move tasks */ + double_lock_balance(this_rq, busiest); + + nr_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, domain, NEWLY_IDLE); + + spin_unlock(&busiest->lock); + +out: + return nr_moved; +} + +/* + * idle_balance is called by schedule() if this_cpu is about to become + * idle. Attempts to pull tasks from other CPUs. + */ +static inline void idle_balance(int this_cpu, runqueue_t *this_rq) { - int node = find_busiest_node(cpu_to_node(this_cpu)); + struct sched_domain *domain = this_sched_domain(); - if (node >= 0) { - cpumask_t cpumask = node_to_cpumask(node); - cpu_set(this_cpu, cpumask); - spin_lock(&this_rq->lock); - load_balance(this_rq, idle, cpumask); - spin_unlock(&this_rq->lock); - } + do { + if (unlikely(!domain->groups)) + /* hasn't been setup yet */ + break; + + if (domain->flags & SD_FLAG_NEWIDLE) { + if (load_balance_newidle(this_cpu, this_rq, domain)) { + /* We've pulled tasks over so stop searching */ + break; + } + } + + domain = domain->parent; + } while (domain); } -#endif -static void rebalance_tick(runqueue_t *this_rq, int idle) +/* + * active_load_balance is run by migration threads. It pushes a running + * task off the cpu. It can be required to correctly have at least 1 task + * running on each physical CPU where possible, and not have a physical / + * logical imbalance. + * + * Called with busiest locked. + */ +static void active_load_balance(runqueue_t *busiest, int busiest_cpu) { -#ifdef CONFIG_NUMA - int this_cpu = smp_processor_id(); -#endif - unsigned long j = jiffies; + int i; + struct sched_domain *sd = cpu_sched_domain(busiest_cpu); + struct sched_group *group, *busy_group; - /* - * First do inter-node rebalancing, then intra-node rebalancing, - * if both events happen in the same tick. The inter-node - * rebalancing does not necessarily have to create a perfect - * balance within the node, since we load-balance the most loaded - * node with the current CPU. (ie. other CPUs in the local node - * are not balanced.) - */ - if (idle) { -#ifdef CONFIG_NUMA - if (!(j % IDLE_NODE_REBALANCE_TICK)) - balance_node(this_rq, idle, this_cpu); -#endif - if (!(j % IDLE_REBALANCE_TICK)) { - spin_lock(&this_rq->lock); - load_balance(this_rq, idle, cpu_to_node_mask(this_cpu)); - spin_unlock(&this_rq->lock); + if (busiest->nr_running <= 1) + return; + + /* sd->parent should never cause a NULL dereference, if it did so, + * then push_cpu was set to a buggy value */ + while (!cpu_isset(busiest->push_cpu, sd->span)) { + sd = sd->parent; + if (!sd->parent && !cpu_isset(busiest->push_cpu, sd->span)) { + WARN_ON(1); + return; } + } + + if (!sd->groups) { + WARN_ON(1); return; } -#ifdef CONFIG_NUMA - if (!(j % BUSY_NODE_REBALANCE_TICK)) - balance_node(this_rq, idle, this_cpu); -#endif - if (!(j % BUSY_REBALANCE_TICK)) { - spin_lock(&this_rq->lock); - load_balance(this_rq, idle, cpu_to_node_mask(this_cpu)); - spin_unlock(&this_rq->lock); + + group = sd->groups; + while (!cpu_isset(busiest_cpu, group->cpumask)) { + group = group->next; + if (group == sd->groups) { + WARN_ON(1); + return; + } } + busy_group = group; + + group = sd->groups; + do { + cpumask_t tmp; + runqueue_t *rq; + int push_cpu = 0, nr = 0; + + if (group == busy_group) + goto next_group; + + cpus_and(tmp, group->cpumask, cpu_online_map); + for_each_cpu_mask(i, tmp) { + if (!idle_cpu(i)) + goto next_group; + push_cpu = i; + nr++; + } + if (nr == 0) + goto next_group; + + rq = cpu_rq(push_cpu); + double_lock_balance(busiest, rq); + move_tasks(rq, push_cpu, busiest, 1, sd, IDLE); + spin_unlock(&rq->lock); +next_group: + group = group->next; + } while (group != sd->groups); +} + +/* + * rebalance_tick will get called every timer tick, on every CPU. + * + * It checks each scheduling domain to see if it is due to be balanced, + * and initiates a balancing operation if so. + * + * Balancing parameters are set up in arch_init_sched_domains. + */ + +/* Don't have all balancing operations going off at once */ +#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) + +static void rebalance_tick(int this_cpu, runqueue_t *this_rq, enum idle_type idle) +{ + unsigned long j = jiffies + CPU_OFFSET(this_cpu); + struct sched_domain *domain = this_sched_domain(); + + /* Run through all this CPU's domains */ + do { + unsigned long interval; + + if (unlikely(!domain->groups)) + break; + + interval = domain->balance_interval; + if (idle != IDLE) + interval *= domain->busy_factor; + + /* scale ms to jiffies */ + interval = interval * HZ / 1000; + if (unlikely(interval == 0)) + interval = 1; + + if (j - domain->last_balance >= interval) { + if (load_balance(this_cpu, this_rq, domain, idle)) { + /* We've pulled tasks over so no longer idle */ + idle = NOT_IDLE; + } + domain->last_balance += interval; + } + + domain = domain->parent; + } while (domain); } #else /* * on UP we do not need to balance between CPUs: */ -static inline void rebalance_tick(runqueue_t *this_rq, int idle) +static inline void rebalance_tick(int this_cpu, runqueue_t *this_rq, enum idle_type idle) { } #endif @@ -1501,7 +1813,7 @@ void scheduler_tick(int user_ticks, int cpustat->iowait += sys_ticks; else cpustat->idle += sys_ticks; - rebalance_tick(rq, 1); + rebalance_tick(cpu, rq, IDLE); return; } if (TASK_NICE(p) > 0) @@ -1585,7 +1897,7 @@ void scheduler_tick(int user_ticks, int out_unlock: spin_unlock(&rq->lock); out: - rebalance_tick(rq, 0); + rebalance_tick(cpu, rq, NOT_IDLE); } void scheduling_functions_start_here(void) { } @@ -1654,7 +1966,7 @@ need_resched: if (unlikely(!rq->nr_running)) { #ifdef CONFIG_SMP - load_balance(rq, 1, cpu_to_node_mask(smp_processor_id())); + idle_balance(smp_processor_id(), rq); #endif if (!rq->nr_running) { next = rq->idle; @@ -2720,7 +3032,12 @@ int set_cpus_allowed(task_t *p, cpumask_ goto out; } - if (__set_cpus_allowed(p, new_mask, &req)) { + p->cpus_allowed = new_mask; + /* Can the task run on the task's current CPU? If so, we're done */ + if (cpu_isset(task_cpu(p), new_mask)) + goto out; + + if (migrate_task(p, any_online_cpu(new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); @@ -2734,8 +3051,16 @@ out: EXPORT_SYMBOL_GPL(set_cpus_allowed); -/* Move (not current) task off this cpu, onto dest cpu. */ -static void move_task_away(struct task_struct *p, int dest_cpu) +/* + * Move (not current) task off this cpu, onto dest cpu. We're doing + * this because either it can't run here any more (set_cpus_allowed() + * away from this CPU, or CPU going down), or because we're + * attempting to rebalance this task on exec (sched_balance_exec). + * + * So we race with normal scheduler movements, but that's OK, as long + * as the task is no longer on this CPU. + */ +static void __migrate_task(struct task_struct *p, int dest_cpu) { runqueue_t *rq_dest; unsigned long flags; @@ -2744,14 +3069,18 @@ static void move_task_away(struct task_s local_irq_save(flags); double_rq_lock(this_rq(), rq_dest); + /* Already moved. */ if (task_cpu(p) != smp_processor_id()) - goto out; /* Already moved */ + goto out; + /* Affinity changed (again). */ + if (!cpu_isset(dest_cpu, p->cpus_allowed)) + goto out; set_task_cpu(p, dest_cpu); if (p->array) { deactivate_task(p, this_rq()); activate_task(p, rq_dest); - if (p->prio < rq_dest->curr->prio) + if (TASK_PREEMPTS_CURR(p, rq_dest)) resched_task(rq_dest->curr); } p->timestamp = rq_dest->timestamp_last_tick; @@ -2788,7 +3117,13 @@ static int migration_thread(void * data) refrigerator(PF_IOTHREAD); spin_lock_irq(&rq->lock); + if (rq->active_balance) { + active_load_balance(rq, cpu); + rq->active_balance = 0; + } + head = &rq->migration_queue; + current->state = TASK_INTERRUPTIBLE; if (list_empty(head)) { spin_unlock_irq(&rq->lock); @@ -2799,8 +3134,7 @@ static int migration_thread(void * data) list_del_init(head->next); spin_unlock_irq(&rq->lock); - move_task_away(req->task, - any_online_cpu(req->task->cpus_allowed)); + __migrate_task(req->task, req->dest_cpu); complete(&req->done); } return 0; @@ -2867,6 +3201,210 @@ int __init migration_init(void) spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; EXPORT_SYMBOL(kernel_flag); +#ifdef CONFIG_SMP +#ifdef ARCH_HAS_SCHED_DOMAIN +extern void __init arch_init_sched_domains(void); +#else +static struct sched_group sched_group_cpus[NR_CPUS]; +#ifdef CONFIG_NUMA +static struct sched_group sched_group_nodes[MAX_NUMNODES]; +DEFINE_PER_CPU(struct sched_domain, node_domains); +static void __init arch_init_sched_domains(void) +{ + int i; + struct sched_group *first_node = NULL, *last_node = NULL; + + /* Set up domains */ + for_each_cpu(i) { + int node = cpu_to_node(i); + cpumask_t nodemask = node_to_cpumask(node); + struct sched_domain *node_domain = &per_cpu(node_domains, i); + struct sched_domain *cpu_domain = cpu_sched_domain(i); + + *node_domain = SD_NODE_INIT; + node_domain->span = cpu_possible_map; + + *cpu_domain = SD_CPU_INIT; + cpus_and(cpu_domain->span, nodemask, cpu_possible_map); + cpu_domain->parent = node_domain; + } + + /* Set up groups */ + for (i = 0; i < MAX_NUMNODES; i++) { + struct sched_group *first_cpu = NULL, *last_cpu = NULL; + int j; + cpumask_t nodemask; + struct sched_group *node = &sched_group_nodes[i]; + cpumask_t tmp = node_to_cpumask(i); + + cpus_and(nodemask, tmp, cpu_possible_map); + + if (cpus_empty(nodemask)) + continue; + + node->cpumask = nodemask; + node->cpu_power = SCHED_LOAD_SCALE * cpus_weight(node->cpumask); + + for_each_cpu_mask(j, node->cpumask) { + struct sched_group *cpu = &sched_group_cpus[j]; + + cpus_clear(cpu->cpumask); + cpu_set(j, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + + if (!first_node) + first_node = node; + if (last_node) + last_node->next = node; + last_node = node; + } + last_node->next = first_node; + + mb(); + for_each_cpu(i) { + struct sched_domain *node_domain = &per_cpu(node_domains, i); + struct sched_domain *cpu_domain = cpu_sched_domain(i); + node_domain->groups = &sched_group_nodes[cpu_to_node(i)]; + cpu_domain->groups = &sched_group_cpus[i]; + } +} + +#else /* CONFIG_NUMA */ +static void __init arch_init_sched_domains(void) +{ + int i; + struct sched_group *first_cpu = NULL, *last_cpu = NULL; + + /* Set up domains */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = cpu_sched_domain(i); + + *cpu_domain = SD_CPU_INIT; + cpu_domain->span = cpu_possible_map; + } + + /* Set up CPU groups */ + for_each_cpu_mask(i, cpu_possible_map) { + struct sched_group *cpu = &sched_group_cpus[i]; + + cpus_clear(cpu->cpumask); + cpu_set(i, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + + mb(); + for_each_cpu(i) { + struct sched_domain *cpu_domain = cpu_sched_domain(i); + cpu_domain->groups = &sched_group_cpus[i]; + } +} + +#endif /* CONFIG_NUMA */ +#endif /* ARCH_HAS_SCHED_DOMAIN */ + +#undef SCHED_DOMAIN_DEBUG +#ifdef SCHED_DOMAIN_DEBUG +void sched_domain_debug(void) +{ + int i; + + for_each_cpu(i) { + int level = 0; + struct sched_domain *cpu_domain = cpu_sched_domain(i); + + printk(KERN_DEBUG "CPU%d: %s\n", + i, (cpu_online(i) ? " online" : "offline")); + + do { + int j; + char str[NR_CPUS]; + struct sched_group *group = cpu_domain->groups; + cpumask_t groupmask, tmp; + + cpumask_snprintf(str, NR_CPUS, cpu_domain->span); + cpus_clear(groupmask); + + printk(KERN_DEBUG); + for (j = 0; j < level + 1; j++) + printk(" "); + printk("domain %d: span %s\n", level, str); + + if (!cpu_isset(i, cpu_domain->span)) + printk(KERN_DEBUG "ERROR domain->span does not contain CPU%d\n", i); + if (!cpu_isset(i, group->cpumask)) + printk(KERN_DEBUG "ERROR domain->groups does not contain CPU%d\n", i); + + printk(KERN_DEBUG); + for (j = 0; j < level + 2; j++) + printk(" "); + printk("groups:"); + do { + if (group == NULL) { + printk(" ERROR: NULL"); + break; + } + + if (cpus_weight(group->cpumask) == 0) + printk(" ERROR empty group:"); + + cpus_and(tmp, groupmask, group->cpumask); + if (cpus_weight(tmp) > 0) + printk(" ERROR repeated CPUs:"); + + cpus_or(groupmask, groupmask, group->cpumask); + + cpumask_snprintf(str, NR_CPUS, group->cpumask); + printk(" %s", str); + + group = group->next; + } while (group != cpu_domain->groups); + printk("\n"); + + if (!cpus_equal(cpu_domain->span, groupmask)) + printk(KERN_DEBUG "ERROR groups don't span domain->span\n"); + + level++; + cpu_domain = cpu_domain->parent; + + if (cpu_domain) { + cpus_and(tmp, groupmask, cpu_domain->span); + if (!cpus_equal(tmp, groupmask)) + printk(KERN_DEBUG "ERROR parent span is not a superset of domain->span\n"); + } + + } while (cpu_domain); + } +} +#else +#define sched_domain_debug() {} +#endif + +void __init sched_init_smp(void) +{ + arch_init_sched_domains(); + sched_domain_debug(); +} +#else +void __init sched_init_smp(void) +{ +} +#endif /* CONFIG_SMP */ + void __init sched_init(void) { runqueue_t *rq; @@ -2874,6 +3412,11 @@ void __init sched_init(void) for (i = 0; i < NR_CPUS; i++) { prio_array_t *array; +#ifdef CONFIG_SMP + struct sched_domain *domain; + domain = cpu_sched_domain(i); + memset(domain, 0, sizeof(struct sched_domain)); +#endif rq = cpu_rq(i); rq->active = rq->arrays; @@ -2883,7 +3426,6 @@ void __init sched_init(void) spin_lock_init(&rq->lock); INIT_LIST_HEAD(&rq->migration_queue); atomic_set(&rq->nr_iowait, 0); - nr_running_init(rq); for (j = 0; j < 2; j++) { array = rq->arrays + j;