From: Nick Piggin Remove the races from the sched domain setup code. This involves creating a dummy "init" domain during sched_init (which is called early). When topology information becomes available, the sched domains are then built and attached. The attach mechanism is asynchronous and uses the migration threads, which perform the switch on the target CPU with interrupts off. This is a quiescent state, so domains can still be lockless on the read side. It also allows us to change the domains at runtime without much more work. This is something SGI is interested in, to elegantly do soft partitioning of their systems without having to use hard cpu affinities (which cause balancing problems of their own). The current setup code also appears to have a race somewhere: it is unable to boot on a 384 CPU system. --- 25-akpm/arch/i386/kernel/smpboot.c | 133 +++++++++++++----------------- 25-akpm/arch/ppc64/kernel/smp.c | 67 +++++++-------- 25-akpm/include/linux/sched.h | 10 -- 25-akpm/kernel/sched.c | 163 ++++++++++++++++++++++++++----------- 4 files changed, 214 insertions(+), 159 deletions(-) diff -puN arch/i386/kernel/smpboot.c~sched-domain-setup-lock arch/i386/kernel/smpboot.c --- 25/arch/i386/kernel/smpboot.c~sched-domain-setup-lock 2004-04-12 02:02:55.637928504 -0700 +++ 25-akpm/arch/i386/kernel/smpboot.c 2004-04-12 02:02:55.647926984 -0700 @@ -1130,16 +1130,17 @@ static void __init smp_boot_cpus(unsigne static struct sched_group sched_group_cpus[NR_CPUS]; static struct sched_group sched_group_phys[NR_CPUS]; static struct sched_group sched_group_nodes[MAX_NUMNODES]; +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); static DEFINE_PER_CPU(struct sched_domain, phys_domains); static DEFINE_PER_CPU(struct sched_domain, node_domains); __init void arch_init_sched_domains(void) { int i; - struct sched_group *first_cpu = NULL, *last_cpu = NULL; + struct sched_group *first = NULL, *last = NULL; /* Set up domains */ for_each_cpu(i) { - struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct 
sched_domain *cpu_domain = &per_cpu(cpu_domains, i); struct sched_domain *phys_domain = &per_cpu(phys_domains, i); struct sched_domain *node_domain = &per_cpu(node_domains, i); int node = cpu_to_node(i); @@ -1147,26 +1148,27 @@ __init void arch_init_sched_domains(void *cpu_domain = SD_SIBLING_INIT; cpu_domain->span = cpu_sibling_map[i]; + cpu_domain->parent = phys_domain; + cpu_domain->groups = &sched_group_cpus[i]; *phys_domain = SD_CPU_INIT; phys_domain->span = nodemask; + phys_domain->parent = node_domain; + phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)]; *node_domain = SD_NODE_INIT; node_domain->span = cpu_possible_map; + node_domain->groups = &sched_group_nodes[cpu_to_node(i)]; } /* Set up CPU (sibling) groups */ for_each_cpu(i) { - struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); int j; - first_cpu = last_cpu = NULL; + first = last = NULL; - if (i != first_cpu(cpu_domain->span)) { - cpu_sched_domain(i)->flags |= SD_SHARE_CPUPOWER; - cpu_sched_domain(first_cpu(cpu_domain->span))->flags |= - SD_SHARE_CPUPOWER; + if (i != first_cpu(cpu_domain->span)) continue; - } for_each_cpu_mask(j, cpu_domain->span) { struct sched_group *cpu = &sched_group_cpus[j]; @@ -1175,13 +1177,13 @@ __init void arch_init_sched_domains(void cpu_set(j, cpu->cpumask); cpu->cpu_power = SCHED_LOAD_SCALE; - if (!first_cpu) - first_cpu = cpu; - if (last_cpu) - last_cpu->next = cpu; - last_cpu = cpu; + if (!first) + first = cpu; + if (last) + last->next = cpu; + last = cpu; } - last_cpu->next = first_cpu; + last->next = first; } for (i = 0; i < MAX_NUMNODES; i++) { @@ -1193,10 +1195,10 @@ __init void arch_init_sched_domains(void if (cpus_empty(nodemask)) continue; - first_cpu = last_cpu = NULL; + first = last = NULL; /* Set up physical groups */ for_each_cpu_mask(j, nodemask) { - struct sched_domain *cpu_domain = cpu_sched_domain(j); + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, j); struct 
sched_group *cpu = &sched_group_phys[j]; if (j != first_cpu(cpu_domain->span)) @@ -1210,17 +1212,17 @@ __init void arch_init_sched_domains(void cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10; node->cpu_power += cpu->cpu_power; - if (!first_cpu) - first_cpu = cpu; - if (last_cpu) - last_cpu->next = cpu; - last_cpu = cpu; + if (!first) + first = cpu; + if (last) + last->next = cpu; + last = cpu; } - last_cpu->next = first_cpu; + last->next = first; } /* Set up nodes */ - first_cpu = last_cpu = NULL; + first = last = NULL; for (i = 0; i < MAX_NUMNODES; i++) { struct sched_group *cpu = &sched_group_nodes[i]; cpumask_t nodemask; @@ -1232,65 +1234,53 @@ __init void arch_init_sched_domains(void cpu->cpumask = nodemask; /* ->cpu_power already setup */ - if (!first_cpu) - first_cpu = cpu; - if (last_cpu) - last_cpu->next = cpu; - last_cpu = cpu; + if (!first) + first = cpu; + if (last) + last->next = cpu; + last = cpu; } - last_cpu->next = first_cpu; + last->next = first; mb(); for_each_cpu(i) { - int node = cpu_to_node(i); - struct sched_domain *cpu_domain = cpu_sched_domain(i); - struct sched_domain *phys_domain = &per_cpu(phys_domains, i); - struct sched_domain *node_domain = &per_cpu(node_domains, i); - struct sched_group *cpu_group = &sched_group_cpus[i]; - struct sched_group *phys_group = &sched_group_phys[first_cpu(cpu_domain->span)]; - struct sched_group *node_group = &sched_group_nodes[node]; - - cpu_domain->parent = phys_domain; - phys_domain->parent = node_domain; - - node_domain->groups = node_group; - phys_domain->groups = phys_group; - cpu_domain->groups = cpu_group; + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + cpu_attach_domain(cpu_domain, i); } } #else /* !CONFIG_NUMA */ static struct sched_group sched_group_cpus[NR_CPUS]; static struct sched_group sched_group_phys[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); static DEFINE_PER_CPU(struct sched_domain, phys_domains); __init 
void arch_init_sched_domains(void) { int i; - struct sched_group *first_cpu = NULL, *last_cpu = NULL; + struct sched_group *first = NULL, *last = NULL; /* Set up domains */ for_each_cpu(i) { - struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); struct sched_domain *phys_domain = &per_cpu(phys_domains, i); *cpu_domain = SD_SIBLING_INIT; cpu_domain->span = cpu_sibling_map[i]; + cpu_domain->parent = phys_domain; + cpu_domain->groups = &sched_group_cpus[i]; *phys_domain = SD_CPU_INIT; phys_domain->span = cpu_possible_map; + phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)]; } /* Set up CPU (sibling) groups */ for_each_cpu(i) { - struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); int j; - first_cpu = last_cpu = NULL; + first = last = NULL; - if (i != first_cpu(cpu_domain->span)) { - cpu_sched_domain(i)->flags |= SD_SHARE_CPUPOWER; - cpu_sched_domain(first_cpu(cpu_domain->span))->flags |= - SD_SHARE_CPUPOWER; + if (i != first_cpu(cpu_domain->span)) continue; - } for_each_cpu_mask(j, cpu_domain->span) { struct sched_group *cpu = &sched_group_cpus[j]; @@ -1299,19 +1289,19 @@ __init void arch_init_sched_domains(void cpu_set(j, cpu->cpumask); cpu->cpu_power = SCHED_LOAD_SCALE; - if (!first_cpu) - first_cpu = cpu; - if (last_cpu) - last_cpu->next = cpu; - last_cpu = cpu; + if (!first) + first = cpu; + if (last) + last->next = cpu; + last = cpu; } - last_cpu->next = first_cpu; + last->next = first; } - first_cpu = last_cpu = NULL; + first = last = NULL; /* Set up physical groups */ for_each_cpu(i) { - struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); struct sched_group *cpu = &sched_group_phys[i]; if (i != first_cpu(cpu_domain->span)) @@ -1321,23 +1311,18 @@ __init void arch_init_sched_domains(void /* See SMT+NUMA setup for comment */ cpu->cpu_power = 
SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10; - if (!first_cpu) - first_cpu = cpu; - if (last_cpu) - last_cpu->next = cpu; - last_cpu = cpu; + if (!first) + first = cpu; + if (last) + last->next = cpu; + last = cpu; } - last_cpu->next = first_cpu; + last->next = first; mb(); for_each_cpu(i) { - struct sched_domain *cpu_domain = cpu_sched_domain(i); - struct sched_domain *phys_domain = &per_cpu(phys_domains, i); - struct sched_group *cpu_group = &sched_group_cpus[i]; - struct sched_group *phys_group = &sched_group_phys[first_cpu(cpu_domain->span)]; - cpu_domain->parent = phys_domain; - phys_domain->groups = phys_group; - cpu_domain->groups = cpu_group; + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + cpu_attach_domain(cpu_domain, i); } } #endif /* CONFIG_NUMA */ diff -puN arch/ppc64/kernel/smp.c~sched-domain-setup-lock arch/ppc64/kernel/smp.c --- 25/arch/ppc64/kernel/smp.c~sched-domain-setup-lock 2004-04-12 02:02:55.639928200 -0700 +++ 25-akpm/arch/ppc64/kernel/smp.c 2004-04-12 02:02:55.650926528 -0700 @@ -997,6 +997,7 @@ void __init smp_cpus_done(unsigned int m static struct sched_group sched_group_cpus[NR_CPUS]; static struct sched_group sched_group_phys[NR_CPUS]; static struct sched_group sched_group_nodes[MAX_NUMNODES]; +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); static DEFINE_PER_CPU(struct sched_domain, phys_domains); static DEFINE_PER_CPU(struct sched_domain, node_domains); __init void arch_init_sched_domains(void) @@ -1006,7 +1007,7 @@ __init void arch_init_sched_domains(void /* Set up domains */ for_each_cpu(i) { - struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); struct sched_domain *phys_domain = &per_cpu(phys_domains, i); struct sched_domain *node_domain = &per_cpu(node_domains, i); int node = cpu_to_node(i); @@ -1019,25 +1020,30 @@ __init void arch_init_sched_domains(void cpus_or(cpu_domain->span, my_cpumask, sibling_cpumask); else 
cpu_domain->span = my_cpumask; + cpu_domain->groups = &sched_group_cpus[i]; + cpu_domain->parent = phys_domain; *phys_domain = SD_CPU_INIT; phys_domain->span = nodemask; // phys_domain->cache_hot_time = XXX; + phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)]; + phys_domain->parent = node_domain; *node_domain = SD_NODE_INIT; node_domain->span = cpu_possible_map; // node_domain->cache_hot_time = XXX; + node_domain->groups = &sched_group_nodes[node]; } /* Set up CPU (sibling) groups */ for_each_cpu(i) { - struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); int j; first_cpu = last_cpu = NULL; if (i != first_cpu(cpu_domain->span)) { - cpu_sched_domain(i)->flags |= SD_SHARE_CPUPOWER; - cpu_sched_domain(first_cpu(cpu_domain->span))->flags |= + &per_cpu(cpu_domains, i)->flags |= SD_SHARE_CPUPOWER; + &per_cpu(cpu_domains, first_cpu(cpu_domain->span))->flags |= SD_SHARE_CPUPOWER; continue; } @@ -1071,7 +1077,7 @@ __init void arch_init_sched_domains(void first_cpu = last_cpu = NULL; /* Set up physical groups */ for_each_cpu_mask(j, nodemask) { - struct sched_domain *cpu_domain = cpu_sched_domain(j); + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, j); struct sched_group *cpu = &sched_group_phys[j]; if (j != first_cpu(cpu_domain->span)) @@ -1119,24 +1125,14 @@ __init void arch_init_sched_domains(void mb(); for_each_cpu(i) { int node = cpu_to_node(i); - struct sched_domain *cpu_domain = cpu_sched_domain(i); - struct sched_domain *phys_domain = &per_cpu(phys_domains, i); - struct sched_domain *node_domain = &per_cpu(node_domains, i); - struct sched_group *cpu_group = &sched_group_cpus[i]; - struct sched_group *phys_group = &sched_group_phys[first_cpu(cpu_domain->span)]; - struct sched_group *node_group = &sched_group_nodes[node]; - - cpu_domain->parent = phys_domain; - phys_domain->parent = node_domain; - - node_domain->groups = node_group; - phys_domain->groups = phys_group; - 
cpu_domain->groups = cpu_group; + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + cpu_attach_domain(cpu_domain, i); } } #else /* !CONFIG_NUMA */ static struct sched_group sched_group_cpus[NR_CPUS]; static struct sched_group sched_group_phys[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); static DEFINE_PER_CPU(struct sched_domain, phys_domains); __init void arch_init_sched_domains(void) { @@ -1145,7 +1141,7 @@ __init void arch_init_sched_domains(void /* Set up domains */ for_each_cpu(i) { - struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); struct sched_domain *phys_domain = &per_cpu(phys_domains, i); cpumask_t my_cpumask = cpumask_of_cpu(i); cpumask_t sibling_cpumask = cpumask_of_cpu(i ^ 0x1); @@ -1155,21 +1151,24 @@ __init void arch_init_sched_domains(void cpus_or(cpu_domain->span, my_cpumask, sibling_cpumask); else cpu_domain->span = my_cpumask; + cpu_domain->groups = &sched_group_cpus[i]; + cpu_domain->parent = phys_domain; *phys_domain = SD_CPU_INIT; phys_domain->span = cpu_possible_map; // phys_domain->cache_hot_time = XXX; + phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)]; } /* Set up CPU (sibling) groups */ for_each_cpu(i) { - struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); int j; first_cpu = last_cpu = NULL; if (i != first_cpu(cpu_domain->span)) { - cpu_sched_domain(i)->flags |= SD_SHARE_CPUPOWER; - cpu_sched_domain(first_cpu(cpu_domain->span))->flags |= + &per_cpu(cpu_domains, i)->flags |= SD_SHARE_CPUPOWER; + &per_cpu(cpu_domains, first_cpu(cpu_domain->span))->flags |= SD_SHARE_CPUPOWER; continue; } @@ -1193,7 +1192,7 @@ __init void arch_init_sched_domains(void first_cpu = last_cpu = NULL; /* Set up physical groups */ for_each_cpu(i) { - struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); struct 
sched_group *cpu = &sched_group_phys[i]; if (i != first_cpu(cpu_domain->span)) @@ -1213,19 +1212,20 @@ __init void arch_init_sched_domains(void mb(); for_each_cpu(i) { - struct sched_domain *cpu_domain = cpu_sched_domain(i); - struct sched_domain *phys_domain = &per_cpu(phys_domains, i); - struct sched_group *cpu_group = &sched_group_cpus[i]; - struct sched_group *phys_group = &sched_group_phys[first_cpu(cpu_domain->span)]; - cpu_domain->parent = phys_domain; - phys_domain->groups = phys_group; - cpu_domain->groups = cpu_group; + struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); + cpu_attach_domain(cpu_sd, i); } } #endif /* CONFIG_NUMA */ #else /* !CONFIG_SCHED_SMT */ +#ifdef CONFIG_NUMA +#error ppc64 has no NUMA scheduler defined without CONFIG_SCHED_SMT. \ + Please enable CONFIG_SCHED_SMT or bug Anton. +#endif + static struct sched_group sched_group_cpus[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); __init void arch_init_sched_domains(void) { @@ -1234,11 +1234,12 @@ __init void arch_init_sched_domains(void /* Set up domains */ for_each_cpu(i) { - struct sched_domain *cpu_sd = cpu_sched_domain(i); + struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); *cpu_sd = SD_CPU_INIT; cpu_sd->span = cpu_possible_map; // cpu_sd->cache_hot_time = XXX; + cpu_sd->groups = &sched_group_cpus[i]; } /* Set up CPU groups */ @@ -1259,8 +1260,8 @@ __init void arch_init_sched_domains(void mb(); for_each_cpu(i) { - struct sched_domain *cpu_sd = cpu_sched_domain(i); - cpu_sd->groups = &sched_group_cpus[i]; + struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); + cpu_attach_domain(cpu_sd, i); } } #endif diff -puN include/linux/sched.h~sched-domain-setup-lock include/linux/sched.h --- 25/include/linux/sched.h~sched-domain-setup-lock 2004-04-12 02:02:55.641927896 -0700 +++ 25-akpm/include/linux/sched.h 2004-04-12 02:02:55.652926224 -0700 @@ -597,7 +597,8 @@ struct sched_domain { .per_cpu_gain = 15, \ .flags = SD_BALANCE_NEWIDLE \ | SD_WAKE_AFFINE \ - | 
SD_WAKE_IDLE, \ + | SD_WAKE_IDLE \ + | SD_SHARE_CPUPOWER, \ .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ @@ -642,12 +643,7 @@ struct sched_domain { } #endif -DECLARE_PER_CPU(struct sched_domain, base_domains); -#define cpu_sched_domain(cpu) (&per_cpu(base_domains, (cpu))) -#define this_sched_domain() (&__get_cpu_var(base_domains)) - -#define for_each_domain(cpu, domain) \ - for (domain = cpu_sched_domain(cpu); domain; domain = domain->parent) +extern void cpu_attach_domain(struct sched_domain *sd, int cpu); extern int set_cpus_allowed(task_t *p, cpumask_t new_mask); #else diff -puN kernel/sched.c~sched-domain-setup-lock kernel/sched.c --- 25/kernel/sched.c~sched-domain-setup-lock 2004-04-12 02:02:55.642927744 -0700 +++ 25-akpm/kernel/sched.c 2004-04-12 02:02:55.656925616 -0700 @@ -228,20 +228,22 @@ struct runqueue { int best_expired_prio; atomic_t nr_iowait; +#ifdef CONFIG_SMP + struct sched_domain *sd; + /* For active balancing */ int active_balance; int push_cpu; task_t *migration_thread; struct list_head migration_queue; +#endif }; static DEFINE_PER_CPU(struct runqueue, runqueues); -#ifdef CONFIG_SMP -/* Mandatory scheduling domains */ -DEFINE_PER_CPU(struct sched_domain, base_domains); -#endif +#define for_each_domain(cpu, domain) \ + for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() (&__get_cpu_var(runqueues)) @@ -531,10 +533,22 @@ inline int task_curr(task_t *p) } #ifdef CONFIG_SMP +enum request_type { + REQ_MOVE_TASK, + REQ_SET_DOMAIN, +}; + typedef struct { struct list_head list; + enum request_type type; + + /* For REQ_MOVE_TASK */ task_t *task; int dest_cpu; + + /* For REQ_SET_DOMAIN */ + struct sched_domain *sd; + struct completion done; } migration_req_t; @@ -556,6 +570,7 @@ static int migrate_task(task_t *p, int d } init_completion(&req->done); + req->type = REQ_MOVE_TASK; req->task = p; req->dest_cpu = dest_cpu; list_add(&req->list, 
&rq->migration_queue); @@ -644,13 +659,14 @@ static inline unsigned long get_high_cpu static int wake_idle(int cpu, task_t *p) { cpumask_t tmp; + runqueue_t *rq = cpu_rq(cpu); struct sched_domain *sd; int i; if (idle_cpu(cpu)) return cpu; - sd = cpu_sched_domain(cpu); + sd = rq->sd; if (!(sd->flags & SD_WAKE_IDLE)) return cpu; @@ -1381,9 +1397,6 @@ find_busiest_group(struct sched_domain * struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; - if (unlikely(!group)) - return NULL; - max_load = this_load = total_load = total_pwr = 0; do { @@ -1660,9 +1673,6 @@ static inline void idle_balance(int this return; for_each_domain(this_cpu, sd) { - if (unlikely(!sd->groups)) - return; - if (sd->flags & SD_BALANCE_NEWIDLE) { if (load_balance_newidle(this_cpu, this_rq, sd)) { /* We've pulled tasks over so stop searching */ @@ -1761,9 +1771,6 @@ static void rebalance_tick(int this_cpu, for_each_domain(this_cpu, sd) { unsigned long interval = sd->balance_interval; - if (unlikely(!sd->groups)) - return; - if (idle != IDLE) interval *= sd->busy_factor; @@ -1954,17 +1961,19 @@ out: static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq) { int i; - struct sched_domain *sd = cpu_sched_domain(cpu); + struct sched_domain *sd = rq->sd; cpumask_t sibling_map; if (!(sd->flags & SD_SHARE_CPUPOWER)) return; cpus_and(sibling_map, sd->span, cpu_online_map); - cpu_clear(cpu, sibling_map); for_each_cpu_mask(i, sibling_map) { runqueue_t *smt_rq; + if (i == cpu) + continue; + smt_rq = cpu_rq(i); /* @@ -1978,7 +1987,7 @@ static inline void wake_sleeping_depende static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p) { - struct sched_domain *sd = cpu_sched_domain(cpu); + struct sched_domain *sd = rq->sd; cpumask_t sibling_map; int ret = 0, i; @@ -1986,11 +1995,13 @@ static inline int dependent_sleeper(int return 0; cpus_and(sibling_map, sd->span, cpu_online_map); - cpu_clear(cpu, 
sibling_map); for_each_cpu_mask(i, sibling_map) { runqueue_t *smt_rq; task_t *smt_curr; + if (i == cpu) + continue; + smt_rq = cpu_rq(i); smt_curr = smt_rq->curr; @@ -3263,10 +3274,19 @@ static int migration_thread(void * data) } req = list_entry(head->next, migration_req_t, list); list_del_init(head->next); + spin_unlock(&rq->lock); - __migrate_task(req->task, req->dest_cpu); + if (req->type == REQ_MOVE_TASK) { + __migrate_task(req->task, req->dest_cpu); + } else if (req->type == REQ_SET_DOMAIN) { + rq->sd = req->sd; + } else { + WARN_ON(1); + } + local_irq_enable(); + complete(&req->done); } return 0; @@ -3402,13 +3422,42 @@ spinlock_t kernel_flag __cacheline_align EXPORT_SYMBOL(kernel_flag); #ifdef CONFIG_SMP +/* Attach the domain 'sd' to 'cpu' as its base domain */ +void cpu_attach_domain(struct sched_domain *sd, int cpu) +{ + migration_req_t req; + unsigned long flags; + runqueue_t *rq = cpu_rq(cpu); + int local = 1; + + spin_lock_irqsave(&rq->lock, flags); + + if (cpu == smp_processor_id() || cpu_is_offline(cpu)) { + rq->sd = sd; + } else { + init_completion(&req.done); + req.type = REQ_SET_DOMAIN; + req.sd = sd; + list_add(&req.list, &rq->migration_queue); + local = 0; + } + + spin_unlock_irqrestore(&rq->lock, flags); + + if (!local) { + wake_up_process(rq->migration_thread); + wait_for_completion(&req.done); + } +} + #ifdef ARCH_HAS_SCHED_DOMAIN extern void __init arch_init_sched_domains(void); #else static struct sched_group sched_group_cpus[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); #ifdef CONFIG_NUMA static struct sched_group sched_group_nodes[MAX_NUMNODES]; -DEFINE_PER_CPU(struct sched_domain, node_domains); +static DEFINE_PER_CPU(struct sched_domain, node_domains); static void __init arch_init_sched_domains(void) { int i; @@ -3419,13 +3468,15 @@ static void __init arch_init_sched_domai int node = cpu_to_node(i); cpumask_t nodemask = node_to_cpumask(node); struct sched_domain *node_sd = &per_cpu(node_domains, i); - struct 
sched_domain *cpu_sd = cpu_sched_domain(i); + struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); *node_sd = SD_NODE_INIT; node_sd->span = cpu_possible_map; + node_sd->groups = &sched_group_nodes[cpu_to_node(i)]; *cpu_sd = SD_CPU_INIT; cpus_and(cpu_sd->span, nodemask, cpu_possible_map); + cpu_sd->groups = &sched_group_cpus[i]; cpu_sd->parent = node_sd; } @@ -3470,10 +3521,8 @@ static void __init arch_init_sched_domai mb(); for_each_cpu(i) { - struct sched_domain *node_sd = &per_cpu(node_domains, i); - struct sched_domain *cpu_sd = cpu_sched_domain(i); - node_sd->groups = &sched_group_nodes[cpu_to_node(i)]; - cpu_sd->groups = &sched_group_cpus[i]; + struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); + cpu_attach_domain(cpu_sd, i); } } @@ -3485,10 +3534,11 @@ static void __init arch_init_sched_domai /* Set up domains */ for_each_cpu(i) { - struct sched_domain *cpu_sd = cpu_sched_domain(i); + struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); *cpu_sd = SD_CPU_INIT; cpu_sd->span = cpu_possible_map; + cpu_sd->groups = &sched_group_cpus[i]; } /* Set up CPU groups */ @@ -3507,10 +3557,10 @@ static void __init arch_init_sched_domai } last_cpu->next = first_cpu; - mb(); + mb(); /* domains were modified outside the lock */ for_each_cpu(i) { - struct sched_domain *cpu_sd = cpu_sched_domain(i); - cpu_sd->groups = &sched_group_cpus[i]; + struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); + cpu_attach_domain(cpu_sd, i); } } @@ -3524,8 +3574,11 @@ void sched_domain_debug(void) int i; for_each_cpu(i) { + runqueue_t *rq = cpu_rq(i); + struct sched_domain *sd; int level = 0; - struct sched_domain *cpu_sd = cpu_sched_domain(i); + + sd = rq->sd; printk(KERN_DEBUG "CPU%d: %s\n", i, (cpu_online(i) ? 
" online" : "offline")); @@ -3533,10 +3586,10 @@ void sched_domain_debug(void) do { int j; char str[NR_CPUS]; - struct sched_group *group = cpu_sd->groups; + struct sched_group *group = sd->groups; cpumask_t groupmask, tmp; - cpumask_scnprintf(str, NR_CPUS, cpu_sd->span); + cpumask_scnprintf(str, NR_CPUS, sd->span); cpus_clear(groupmask); printk(KERN_DEBUG); @@ -3544,7 +3597,7 @@ void sched_domain_debug(void) printk(" "); printk("domain %d: span %s\n", level, str); - if (!cpu_isset(i, cpu_sd->span)) + if (!cpu_isset(i, sd->span)) printk(KERN_DEBUG "ERROR domain->span does not contain CPU%d\n", i); if (!cpu_isset(i, group->cpumask)) printk(KERN_DEBUG "ERROR domain->groups does not contain CPU%d\n", i); @@ -3574,22 +3627,22 @@ void sched_domain_debug(void) printk(" %s", str); group = group->next; - } while (group != cpu_sd->groups); + } while (group != sd->groups); printk("\n"); - if (!cpus_equal(cpu_sd->span, groupmask)) + if (!cpus_equal(sd->span, groupmask)) printk(KERN_DEBUG "ERROR groups don't span domain->span\n"); level++; - cpu_sd = cpu_sd->parent; + sd = sd->parent; - if (cpu_sd) { - cpus_and(tmp, groupmask, cpu_sd->span); + if (sd) { + cpus_and(tmp, groupmask, sd->span); if (!cpus_equal(tmp, groupmask)) printk(KERN_DEBUG "ERROR parent span is not a superset of domain->span\n"); } - } while (cpu_sd); + } while (sd); } } #else @@ -3612,21 +3665,41 @@ void __init sched_init(void) runqueue_t *rq; int i, j, k; - for (i = 0; i < NR_CPUS; i++) { - prio_array_t *array; #ifdef CONFIG_SMP - struct sched_domain *domain; - domain = cpu_sched_domain(i); - memset(domain, 0, sizeof(struct sched_domain)); + /* Set up an initial dummy domain for early boot */ + static struct sched_domain sched_domain_init; + static struct sched_group sched_group_init; + cpumask_t cpu_mask_all = CPU_MASK_ALL; + + memset(&sched_domain_init, 0, sizeof(struct sched_domain)); + sched_domain_init.span = cpu_mask_all; + sched_domain_init.groups = &sched_group_init; + sched_domain_init.last_balance 
= jiffies; + sched_domain_init.balance_interval = INT_MAX; /* Don't balance */ + + memset(&sched_group_init, 0, sizeof(struct sched_group)); + sched_group_init.cpumask = cpu_mask_all; + sched_group_init.next = &sched_group_init; + sched_group_init.cpu_power = SCHED_LOAD_SCALE; #endif + for (i = 0; i < NR_CPUS; i++) { + prio_array_t *array; + rq = cpu_rq(i); + spin_lock_init(&rq->lock); rq->active = rq->arrays; rq->expired = rq->arrays + 1; rq->best_expired_prio = MAX_PRIO; - spin_lock_init(&rq->lock); +#ifdef CONFIG_SMP + rq->sd = &sched_domain_init; + rq->cpu_load = 0; + rq->active_balance = 0; + rq->push_cpu = 0; + rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); +#endif atomic_set(&rq->nr_iowait, 0); for (j = 0; j < 2; j++) { _