From: Ingo Molnar <mingo@elte.hu>

Implement balancing during clone().  It does the following things:

- introduces SD_BALANCE_CLONE, which an architecture can use to limit
  the search-idlest-CPU scope on clone().  E.g. 512-CPU systems
  probably do not want to enable this.

- uses the highest sd for the imbalance_pct, not this_rq (which didn't
  make sense).

- unifies balance-on-exec and balance-on-clone via the find_idlest_cpu()
  function.  Gets rid of sched_best_cpu(), which was still a bit
  inconsistent IMO: it used a plain 'min_load < load' comparison as the
  condition for balancing, while a more correct approach is to use half
  of the imbalance_pct, like passive balancing does.  (A standalone
  sketch of this threshold check follows the notes below.)

- the patch also reintroduces the possibility of doing SD_BALANCE_EXEC
  on SMP systems, and activates it - to get it tested.

- NOTE: there's one thing in this patch that is slightly unclean: I
  introduced wake_up_forked_thread().  I did this to make it easier to
  get rid of this patch later (wake_up_forked_process() has lots of
  dependencies in various architectures).  If this capability remains
  in the kernel then I'll clean it up and introduce a single function
  for wake_up_forked_process/thread.

- NOTE2: I added the SD_BALANCE_CLONE flag to the NUMA CPU template
  too.  Some NUMA architectures probably want to disable this (a toy
  example of masking the flag back out follows below).
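
For reference, here is a small standalone sketch (plain userspace C, not
part of the patch) of the threshold check that find_idlest_cpu() applies:
the new task is charged to the current CPU as one full unit of load, and
the child only gets migrated if the idlest CPU wins by more than half of
the domain's imbalance_pct surplus.  The SCHED_LOAD_SCALE value and the
sample loads used here are illustrative only:

	#include <stdio.h>

	#define SCHED_LOAD_SCALE	128UL	/* one task's worth of load (illustrative) */

	/* same comparison as find_idlest_cpu(), at half the threshold: */
	static int should_migrate(unsigned long min_load,
				  unsigned long this_load,
				  unsigned int imbalance_pct)
	{
		return min_load*(100 + (imbalance_pct-100)/2) < this_load*100;
	}

	int main(void)
	{
		/* two runnable tasks here, plus the freshly cloned child ... */
		unsigned long this_load = 2*SCHED_LOAD_SCALE + SCHED_LOAD_SCALE;
		/* ... versus two runnable tasks on the idlest CPU: */
		unsigned long min_load = 2*SCHED_LOAD_SCALE;

		printf("migrate: %d\n", should_migrate(min_load, this_load, 125));
		return 0;
	}

E.g. with an imbalance_pct of 125 the effective threshold is ~112%: in
the sample above (384 vs. 256 load units) the child is migrated, while
it would stay local if the idlest CPU were at least about as loaded as
the current CPU including the child.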
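
And for NOTE2: an architecture that wants exec balancing but not clone
balancing at the node level only has to mask the flag back out of its
node-level domain.  The snippet below is a toy, compilable stand-in (the
struct, template and setup sequence are made up for illustration - in
the kernel this would just be a one-line 'flags &= ~SD_BALANCE_CLONE'
wherever the architecture sets up its node domain):

	#include <stdio.h>

	#define SD_BALANCE_EXEC		2	/* flag values as in this patch */
	#define SD_BALANCE_CLONE	4
	#define SD_WAKE_BALANCE		32

	/* toy stand-in for struct sched_domain - only the flags field */
	struct toy_domain {
		int flags;
	};

	/* roughly what the patched NUMA template enables */
	#define TOY_NODE_INIT (struct toy_domain) {		\
		.flags	= SD_BALANCE_EXEC			\
			| SD_BALANCE_CLONE			\
			| SD_WAKE_BALANCE,			\
	}

	int main(void)
	{
		struct toy_domain node_sd = TOY_NODE_INIT;

		/* arch decision: keep clone()d children on the local node */
		node_sd.flags &= ~SD_BALANCE_CLONE;

		printf("clone balancing: %s\n",
			(node_sd.flags & SD_BALANCE_CLONE) ? "on" : "off");
		return 0;
	}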


---

 25-akpm/include/linux/sched.h |   23 ++++-
 25-akpm/kernel/fork.c         |   20 ++++
 25-akpm/kernel/sched.c        |  169 +++++++++++++++++++++++++++++++++---------
 3 files changed, 167 insertions(+), 45 deletions(-)

diff -puN include/linux/sched.h~sched-balance-context include/linux/sched.h
--- 25/include/linux/sched.h~sched-balance-context	2004-04-12 22:53:15.499176072 -0700
+++ 25-akpm/include/linux/sched.h	2004-04-12 22:53:15.508174704 -0700
@@ -546,10 +546,11 @@ do { if (atomic_dec_and_test(&(tsk)->usa
 
 #define SD_BALANCE_NEWIDLE	1	/* Balance when about to become idle */
 #define SD_BALANCE_EXEC		2	/* Balance on exec */
-#define SD_WAKE_IDLE		4	/* Wake to idle CPU on task wakeup */
-#define SD_WAKE_AFFINE		8	/* Wake task to waking CPU */
-#define SD_WAKE_BALANCE		16	/* Perform balancing at task wakeup */
-#define SD_SHARE_CPUPOWER	32	/* Domain members share cpu power */
+#define SD_BALANCE_CLONE	4	/* Balance on clone */
+#define SD_WAKE_IDLE		8	/* Wake to idle CPU on task wakeup */
+#define SD_WAKE_AFFINE		16	/* Wake task to waking CPU */
+#define SD_WAKE_BALANCE		32	/* Perform balancing at task wakeup */
+#define SD_SHARE_CPUPOWER	64	/* Domain members share cpu power */
 
 struct sched_group {
 	struct sched_group *next;	/* Must be a circular list */
@@ -597,6 +598,8 @@ struct sched_domain {
 	.cache_nice_tries	= 0,			\
 	.per_cpu_gain		= 15,			\
 	.flags			= SD_BALANCE_NEWIDLE	\
+				| SD_BALANCE_EXEC	\
+				| SD_BALANCE_CLONE	\
 				| SD_WAKE_AFFINE	\
 				| SD_WAKE_IDLE		\
 				| SD_SHARE_CPUPOWER,	\
@@ -618,6 +621,8 @@ struct sched_domain {
 	.cache_nice_tries	= 1,			\
 	.per_cpu_gain		= 100,			\
 	.flags			= SD_BALANCE_NEWIDLE	\
+				| SD_BALANCE_EXEC	\
+				| SD_BALANCE_CLONE	\
 				| SD_WAKE_AFFINE	\
 				| SD_WAKE_BALANCE,	\
 	.last_balance		= jiffies,		\
@@ -639,6 +644,7 @@ struct sched_domain {
 	.cache_nice_tries	= 1,			\
 	.per_cpu_gain		= 100,			\
 	.flags			= SD_BALANCE_EXEC	\
+				| SD_BALANCE_CLONE	\
 				| SD_WAKE_BALANCE,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
@@ -658,7 +664,7 @@ static inline int set_cpus_allowed(task_
 
 extern unsigned long long sched_clock(void);
 
-#ifdef CONFIG_NUMA
+#ifdef CONFIG_SMP
 extern void sched_balance_exec(void);
 #else
 #define sched_balance_exec()   {}
@@ -716,12 +722,17 @@ extern void do_timer(struct pt_regs *);
 
 extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state));
 extern int FASTCALL(wake_up_process(struct task_struct * tsk));
+extern void FASTCALL(wake_up_forked_process(struct task_struct * tsk));
 #ifdef CONFIG_SMP
  extern void kick_process(struct task_struct *tsk);
+ extern void FASTCALL(wake_up_forked_thread(struct task_struct * tsk));
 #else
  static inline void kick_process(struct task_struct *tsk) { }
+ static inline void wake_up_forked_thread(struct task_struct * tsk)
+ {
+	return wake_up_forked_process(tsk);
+ }
 #endif
-extern void FASTCALL(wake_up_forked_process(struct task_struct * tsk));
 extern void FASTCALL(sched_fork(task_t * p));
 extern void FASTCALL(sched_exit(task_t * p));
 
diff -puN kernel/fork.c~sched-balance-context kernel/fork.c
--- 25/kernel/fork.c~sched-balance-context	2004-04-12 22:53:15.502175616 -0700
+++ 25-akpm/kernel/fork.c	2004-04-12 22:53:15.509174552 -0700
@@ -1177,9 +1177,23 @@ long do_fork(unsigned long clone_flags,
 			set_tsk_thread_flag(p, TIF_SIGPENDING);
 		}
 
-		if (!(clone_flags & CLONE_STOPPED))
-			wake_up_forked_process(p);	/* do this last */
-		else
+		if (!(clone_flags & CLONE_STOPPED)) {
+			/*
+			 * Do the wakeup last. On SMP we treat fork() and
+			 * CLONE_VM separately, because fork() has already
+			 * created cache footprint on this CPU (due to
+			 * copying the pagetables), hence migration would
+			 * probably be costy. Threads on the other hand
+			 * probably be costly. Threads, on the other hand,
+			 * are less tied to the current CPU, and if
+			 * migrate this fresh thread now, before it
+			 * accumulates a larger cache footprint:
+			 */
+			if (clone_flags & CLONE_VM)
+				wake_up_forked_thread(p);
+			else
+				wake_up_forked_process(p);
+		} else
 			p->state = TASK_STOPPED;
 		++total_forks;
 
diff -puN kernel/sched.c~sched-balance-context kernel/sched.c
--- 25/kernel/sched.c~sched-balance-context	2004-04-12 22:53:15.503175464 -0700
+++ 25-akpm/kernel/sched.c	2004-04-12 22:53:15.512174096 -0700
@@ -1156,7 +1156,133 @@ enum idle_type
 };
 
 #ifdef CONFIG_SMP
-#ifdef CONFIG_NUMA
+
+/*
+ * find_idlest_cpu - find the least busy runqueue.
+ */
+static int find_idlest_cpu(struct task_struct *p, int this_cpu,
+			   struct sched_domain *sd)
+{
+	unsigned long load, min_load, this_load;
+	int i, min_cpu;
+	cpumask_t mask;
+
+	min_cpu = UINT_MAX;
+	min_load = ULONG_MAX;
+
+	cpus_and(mask, sd->span, cpu_online_map);
+	cpus_and(mask, mask, p->cpus_allowed);
+
+	for_each_cpu_mask(i, mask) {
+		load = target_load(i);
+
+		if (load < min_load) {
+			min_cpu = i;
+			min_load = load;
+
+			/* break out early on an idle CPU: */
+			if (!min_load)
+				break;
+		}
+	}
+
+	/* add one task's worth of load to account for the new task */
+	this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
+
+	/*
+	 * Would the addition of the new task to the current
+	 * CPU create an imbalance between this CPU and the
+	 * idlest CPU?
+	 *
+	 * Use half of the balancing threshold - a new context
+	 * is a good opportunity to balance.
+	 */
+	if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
+		return min_cpu;
+
+	return this_cpu;
+}
+
+/*
+ * wake_up_forked_thread - wake up a freshly forked thread.
+ *
+ * This function will do some initial scheduler statistics housekeeping
+ * that must be done for every newly created context, and it also does
+ * runqueue balancing.
+ */
+void fastcall wake_up_forked_thread(task_t * p)
+{
+	unsigned long flags;
+	int this_cpu = get_cpu(), cpu;
+	struct sched_domain *tmp, *sd = NULL;
+	runqueue_t *this_rq = cpu_rq(this_cpu), *rq;
+
+	/*
+	 * Find the largest domain that this CPU is part of that
+	 * is willing to balance on clone:
+	 */
+	for_each_domain(this_cpu, tmp)
+		if (tmp->flags & SD_BALANCE_CLONE)
+			sd = tmp;
+	if (sd)
+		cpu = find_idlest_cpu(p, this_cpu, sd);
+	else
+		cpu = this_cpu;
+
+	local_irq_save(flags);
+lock_again:
+	rq = cpu_rq(cpu);
+	double_rq_lock(this_rq, rq);
+
+	BUG_ON(p->state != TASK_RUNNING);
+
+	/*
+	 * We did find_idlest_cpu() unlocked, so in theory
+	 * the mask could have changed - just don't migrate
+	 * in this case:
+	 */
+	if (unlikely(!cpu_isset(cpu, p->cpus_allowed))) {
+		cpu = this_cpu;
+		double_rq_unlock(this_rq, rq);
+		goto lock_again;
+	}
+	/*
+	 * We decrease the sleep average of forking parents
+	 * and children as well, to keep max-interactive tasks
+	 * from forking tasks that are max-interactive.
+	 */
+	current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
+		PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
+
+	p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
+		CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
+
+	p->interactive_credit = 0;
+
+	p->prio = effective_prio(p);
+	set_task_cpu(p, cpu);
+
+	if (cpu == this_cpu) {
+		if (unlikely(!current->array))
+			__activate_task(p, rq);
+		else {
+			p->prio = current->prio;
+			list_add_tail(&p->run_list, &current->run_list);
+			p->array = current->array;
+			p->array->nr_active++;
+			rq->nr_running++;
+		}
+	} else {
+		__activate_task(p, rq);
+		if (TASK_PREEMPTS_CURR(p, rq))
+			resched_task(rq->curr);
+	}
+
+	double_rq_unlock(this_rq, rq);
+	local_irq_restore(flags);
+	put_cpu();
+}
+
 /*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
@@ -1197,34 +1323,6 @@ out:
 }
 
 /*
- * Find the least loaded CPU.  Slightly favor the current CPU by
- * setting its load as the minimum to start.
- */
-static int sched_best_cpu(struct task_struct *p, struct sched_domain *sd)
-{
-	cpumask_t tmp;
-	int i, min_load, this_cpu, best_cpu;
-
-	best_cpu = this_cpu = task_cpu(p);
-	min_load = INT_MAX;
-
-	cpus_and(tmp, sd->span, cpu_online_map);
-	for_each_cpu_mask(i, tmp) {
-		unsigned long load;
-		if (i == this_cpu)
-			load = source_load(i);
-		else
-			load = target_load(i) + SCHED_LOAD_SCALE;
-
-		if (min_load > load) {
-			best_cpu = i;
-			min_load = load;
-		}
-	}
-	return best_cpu;
-}
-
-/*
  * sched_balance_exec(): find the highest-level, exec-balance-capable
  * domain and try to migrate the task to the least loaded CPU.
  *
@@ -1233,19 +1331,19 @@ static int sched_best_cpu(struct task_st
  */
 void sched_balance_exec(void)
 {
-	struct sched_domain *sd, *best_sd = NULL;
+	struct sched_domain *tmp, *sd = NULL;
 	int new_cpu, this_cpu = get_cpu();
 
 	/* Prefer the current CPU if there's only this task running */
 	if (this_rq()->nr_running <= 1)
 		goto out;
 
-	for_each_domain(this_cpu, sd)
-		if (sd->flags & SD_BALANCE_EXEC)
-			best_sd = sd;
+	for_each_domain(this_cpu, tmp)
+		if (tmp->flags & SD_BALANCE_EXEC)
+			sd = tmp;
 
-	if (best_sd) {
-		new_cpu = sched_best_cpu(current, best_sd);
+	if (sd) {
+		new_cpu = find_idlest_cpu(current, this_cpu, sd);
 		if (new_cpu != this_cpu) {
 			put_cpu();
 			sched_migrate_task(current, new_cpu);
@@ -1255,7 +1353,6 @@ void sched_balance_exec(void)
 out:
 	put_cpu();
 }
-#endif /* CONFIG_NUMA */
 
 /*
  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.

_