From: Nick Piggin <nickpiggin@yahoo.com.au>

One of the problems with the multilevel balance-on-fork/exec is that it needs
to jump through hoops to satisfy sched-domains' locking semantics (that is,
you may traverse your own domain when not preemptible, and you may traverse
another CPU's domain only while holding that CPU's runqueue lock).
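As a sketch, those two rules look like this (illustrative only; this_cpu,
other_cpu, rq, sd and flags are just local variables for the example):

	/* Rule 1: your own domain tree, with preemption disabled. */
	preempt_disable();
	for_each_domain(this_cpu, sd) {
		/* ... */
	}
	preempt_enable();

	/* Rule 2: another CPU's domain tree, under its runqueue lock. */
	rq = cpu_rq(other_cpu);
	spin_lock_irqsave(&rq->lock, flags);
	for_each_domain(other_cpu, sd) {
		/* ... */
	}
	spin_unlock_irqrestore(&rq->lock, flags);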

balance-on-exec potentially had to migrate across more than one CPU before
finding a final CPU to migrate to, and balance-on-fork potentially had to
take multiple runqueue locks.

So bite the bullet and make sched-domains go completely RCU.  This actually
simplifies the code quite a bit.
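In concrete terms (a sketch of the resulting discipline, not code lifted
verbatim from the patch): preempt_disable() now acts as the RCU read-side
critical section for any CPU's domain tree, and the update side publishes
the new tree and waits for a quiescent state before freeing the old one:

	/* Reader: any CPU's domains, from a preempt-disabled section. */
	preempt_disable();
	for_each_domain(cpu, sd) {
		/* ... use sd ... */
	}
	preempt_enable();

	/* Updater: publish the new tree (cpu_attach_domain), then wait
	 * for all preempt-disabled sections to finish before tearing
	 * down the old domains (update_sched_domains). */
	rq->sd = sd;
	synchronize_kernel();
	arch_destroy_sched_domains();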

From: Ingo Molnar <mingo@elte.hu>

schedstats RCU fix, and a nice comment on for_each_domain, from Ingo.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 kernel/sched.c |   58 +++++++++++++--------------------------------------------
 1 files changed, 14 insertions(+), 44 deletions(-)

diff -puN kernel/sched.c~sched-rcu-domains kernel/sched.c
--- 25/kernel/sched.c~sched-rcu-domains	2005-06-18 02:55:38.000000000 -0700
+++ 25-akpm/kernel/sched.c	2005-06-18 02:55:38.000000000 -0700
@@ -262,6 +262,13 @@ struct runqueue {
 
 static DEFINE_PER_CPU(struct runqueue, runqueues);
 
+/*
+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+ * See update_sched_domains(): synchronize_kernel() for details.
+ *
+ * The domain tree of any CPU may only be accessed from within
+ * preempt-disabled sections.
+ */
 #define for_each_domain(cpu, domain) \
 	for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
 
@@ -397,6 +404,7 @@ static int show_schedstat(struct seq_fil
 
 #ifdef CONFIG_SMP
 		/* domain-specific stats */
+		preempt_disable();
 		for_each_domain(cpu, sd) {
 			enum idle_type itype;
 			char mask_str[NR_CPUS];
@@ -421,6 +429,7 @@ static int show_schedstat(struct seq_fil
 			    sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
 			    sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
 		}
+		preempt_enable();
 #endif
 	}
 	return 0;
@@ -826,22 +835,12 @@ inline int task_curr(const task_t *p)
 }
 
 #ifdef CONFIG_SMP
-enum request_type {
-	REQ_MOVE_TASK,
-	REQ_SET_DOMAIN,
-};
-
 typedef struct {
 	struct list_head list;
-	enum request_type type;
 
-	/* For REQ_MOVE_TASK */
 	task_t *task;
 	int dest_cpu;
 
-	/* For REQ_SET_DOMAIN */
-	struct sched_domain *sd;
-
 	struct completion done;
 } migration_req_t;
 
@@ -863,7 +862,6 @@ static int migrate_task(task_t *p, int d
 	}
 
 	init_completion(&req->done);
-	req->type = REQ_MOVE_TASK;
 	req->task = p;
 	req->dest_cpu = dest_cpu;
 	list_add(&req->list, &rq->migration_queue);
@@ -4381,17 +4379,9 @@ static int migration_thread(void * data)
 		req = list_entry(head->next, migration_req_t, list);
 		list_del_init(head->next);
 
-		if (req->type == REQ_MOVE_TASK) {
-			spin_unlock(&rq->lock);
-			__migrate_task(req->task, cpu, req->dest_cpu);
-			local_irq_enable();
-		} else if (req->type == REQ_SET_DOMAIN) {
-			rq->sd = req->sd;
-			spin_unlock_irq(&rq->lock);
-		} else {
-			spin_unlock_irq(&rq->lock);
-			WARN_ON(1);
-		}
+		spin_unlock(&rq->lock);
+		__migrate_task(req->task, cpu, req->dest_cpu);
+		local_irq_enable();
 
 		complete(&req->done);
 	}
@@ -4622,7 +4612,6 @@ static int migration_call(struct notifie
 			migration_req_t *req;
 			req = list_entry(rq->migration_queue.next,
 					 migration_req_t, list);
-			BUG_ON(req->type != REQ_MOVE_TASK);
 			list_del_init(&req->list);
 			complete(&req->done);
 		}
@@ -4926,10 +4915,7 @@ static int __devinit sd_parent_degenerat
  */
 void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu)
 {
-	migration_req_t req;
-	unsigned long flags;
 	runqueue_t *rq = cpu_rq(cpu);
-	int local = 1;
 	struct sched_domain *tmp;
 
 	/* Remove the sched domains which do not contribute to scheduling. */
@@ -4946,24 +4932,7 @@ void __devinit cpu_attach_domain(struct 
 
 	sched_domain_debug(sd, cpu);
 
-	spin_lock_irqsave(&rq->lock, flags);
-
-	if (cpu == smp_processor_id() || !cpu_online(cpu)) {
-		rq->sd = sd;
-	} else {
-		init_completion(&req.done);
-		req.type = REQ_SET_DOMAIN;
-		req.sd = sd;
-		list_add(&req.list, &rq->migration_queue);
-		local = 0;
-	}
-
-	spin_unlock_irqrestore(&rq->lock, flags);
-
-	if (!local) {
-		wake_up_process(rq->migration_thread);
-		wait_for_completion(&req.done);
-	}
+	rq->sd = sd;
 }
 
 /* cpus with isolated domains */
@@ -5238,6 +5207,7 @@ static int update_sched_domains(struct n
 	case CPU_DOWN_PREPARE:
 		for_each_online_cpu(i)
 			cpu_attach_domain(NULL, i);
+		synchronize_kernel();
 		arch_destroy_sched_domains();
 		return NOTIFY_OK;
 
_