Patches contributed by Eötvös Loránd University


commit 7a6c6bcee029a978f866511d6e41dbc7301fde4c
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Oct 15 17:00:19 2007 +0200

    sched: enable wake-idle on CONFIG_SCHED_MC=y
    
    most multicore CPUs today have shared L2 caches, so tune things so
    that the spreading amongst cores is more aggressive.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/topology.h b/include/linux/topology.h
index 865a63e65578..47729f18bfdf 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -98,7 +98,7 @@
 	.cache_nice_tries	= 0,			\
 	.busy_idx		= 0,			\
 	.idle_idx		= 0,			\
-	.newidle_idx		= 1,			\
+	.newidle_idx		= 0,			\
 	.wake_idx		= 0,			\
 	.forkexec_idx		= 0,			\
 	.flags			= SD_LOAD_BALANCE	\
@@ -128,14 +128,15 @@
 	.imbalance_pct		= 125,			\
 	.cache_nice_tries	= 1,			\
 	.busy_idx		= 2,			\
-	.idle_idx		= 1,			\
-	.newidle_idx		= 2,			\
+	.idle_idx		= 0,			\
+	.newidle_idx		= 0,			\
 	.wake_idx		= 1,			\
 	.forkexec_idx		= 1,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
 				| SD_WAKE_AFFINE	\
+				| SD_WAKE_IDLE		\
 				| SD_SHARE_PKG_RESOURCES\
 				| BALANCE_FOR_MC_POWER,	\
 	.last_balance		= jiffies,		\
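
For context on the flag being added: SD_WAKE_IDLE lets the wakeup path place a woken task on an idle core of the package instead of waking it on its previous, busy CPU, which is what makes the spreading across cores more aggressive. A minimal user-space-style sketch of that selection rule follows; the mask representation, names and the idle test are illustrative stand-ins for the kernel's cpumask and idle_cpu() helpers.

#include <stdbool.h>

#define NR_CPUS 8

/* Illustrative stand-ins for kernel state. */
static bool cpu_is_idle[NR_CPUS];	/* idle_cpu(i) in the kernel      */
static unsigned long domain_span;	/* CPUs covered by the MC domain  */
static unsigned long cpus_allowed;	/* the task's affinity mask       */

/*
 * Wake-idle rule: when the domain has SD_WAKE_IDLE set, prefer any
 * allowed idle CPU in the domain over the CPU the task last ran on.
 */
static int wake_idle_sketch(int prev_cpu, bool sd_wake_idle)
{
	unsigned long candidates = domain_span & cpus_allowed;
	int i;

	if (!sd_wake_idle)
		return prev_cpu;

	for (i = 0; i < NR_CPUS; i++)
		if ((candidates & (1UL << i)) && cpu_is_idle[i])
			return i;

	return prev_cpu;
}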

commit 95dbb421d12fdd9796ed153853daf3679809274f
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Oct 15 17:00:19 2007 +0200

    sched: reintroduce topology.h tunings
    
    reintroduce the 2.6.22 topology.h tunings again - they result in
    slightly better balancing.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/topology.h b/include/linux/topology.h
index 525d437b1253..865a63e65578 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -98,7 +98,7 @@
 	.cache_nice_tries	= 0,			\
 	.busy_idx		= 0,			\
 	.idle_idx		= 0,			\
-	.newidle_idx		= 0,			\
+	.newidle_idx		= 1,			\
 	.wake_idx		= 0,			\
 	.forkexec_idx		= 0,			\
 	.flags			= SD_LOAD_BALANCE	\
@@ -128,15 +128,14 @@
 	.imbalance_pct		= 125,			\
 	.cache_nice_tries	= 1,			\
 	.busy_idx		= 2,			\
-	.idle_idx		= 0,			\
-	.newidle_idx		= 0,			\
+	.idle_idx		= 1,			\
+	.newidle_idx		= 2,			\
 	.wake_idx		= 1,			\
 	.forkexec_idx		= 1,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
 				| SD_WAKE_AFFINE	\
-				| SD_WAKE_IDLE		\
 				| SD_SHARE_PKG_RESOURCES\
 				| BALANCE_FOR_MC_POWER,	\
 	.last_balance		= jiffies,		\
@@ -159,15 +158,14 @@
 	.imbalance_pct		= 125,			\
 	.cache_nice_tries	= 1,			\
 	.busy_idx		= 2,			\
-	.idle_idx		= 0,			\
-	.newidle_idx		= 0,			\
+	.idle_idx		= 1,			\
+	.newidle_idx		= 2,			\
 	.wake_idx		= 1,			\
 	.forkexec_idx		= 1,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
 				| SD_WAKE_AFFINE	\
-				| SD_WAKE_IDLE		\
 				| BALANCE_FOR_PKG_POWER,\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
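
The *_idx values restored here select which entry of the runqueue's cpu_load[] history a given balancing path consults: index 0 is the instantaneous load, while higher indexes are progressively slower-decaying averages, so a low index reacts quickly and a high index balances more conservatively. Below is a sketch of the lookup, modelled on the source_load() helper of this kernel generation; the load numbers and names are purely illustrative.

#define CPU_LOAD_IDX_MAX 5

/* Illustrative decayed load history for one runqueue: entry 0 is the
 * most recent sample, later entries decay more slowly. */
static unsigned long cpu_load[CPU_LOAD_IDX_MAX] = { 900, 760, 640, 580, 540 };

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

/*
 * Sketch of source_load(): idx 0 uses the raw current load, while a
 * higher idx uses an older, smoother average, capped by the current
 * load so a busy history cannot make a now-idle CPU look loaded.
 */
static unsigned long source_load_sketch(unsigned long current_load, int idx)
{
	if (idx == 0)
		return current_load;

	return min_ul(cpu_load[idx - 1], current_load);
}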

commit 6bc1665ba71de0f207391b01b187b21b2619c15c
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Oct 15 17:00:18 2007 +0200

    sched: allow the immediate migration of cache-cold tasks
    
    allow the immediate migration of cache-cold tasks.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 3b27c3a553aa..750612751a7f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1016,6 +1016,11 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 	if (p->sched_class != &fair_sched_class)
 		return 0;
 
+	if (sysctl_sched_migration_cost == -1)
+		return 1;
+	if (sysctl_sched_migration_cost == 0)
+		return 0;
+
 	delta = now - p->se.exec_start;
 
 	return delta < (s64)sysctl_sched_migration_cost;
@@ -2189,7 +2194,8 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 	 * 2) too many balance attempts have failed.
 	 */
 
-	if (sd->nr_balance_failed > sd->cache_nice_tries) {
+	if (!task_hot(p, rq->clock, sd) ||
+			sd->nr_balance_failed > sd->cache_nice_tries) {
 #ifdef CONFIG_SCHEDSTATS
 		if (task_hot(p, rq->clock, sd)) {
 			schedstat_inc(sd, lb_hot_gained[idle]);
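
In effect, the pull decision now has two independent triggers: the task is cache-cold, or the domain has already failed too many balance attempts. Together with the new special cases, sched_migration_cost can be set to -1 to treat every task as cache-hot (never migrate early) or to 0 to treat every task as cold (always allow immediate migration). A condensed sketch of the combined logic; the function names are illustrative stand-ins for the kernel's task_hot() and can_migrate_task().

/* -1: everything is hot; 0: everything is cold; otherwise compare the
 * time since the task last started executing against the threshold. */
static int task_hot_sketch(long long now_ns, long long exec_start_ns,
			   long long migration_cost_ns)
{
	if (migration_cost_ns == -1)
		return 1;
	if (migration_cost_ns == 0)
		return 0;

	return (now_ns - exec_start_ns) < migration_cost_ns;
}

/* A task may be pulled if it is cold, or if balancing keeps failing. */
static int may_pull_sketch(int hot, unsigned int nr_balance_failed,
			   unsigned int cache_nice_tries)
{
	return !hot || nr_balance_failed > cache_nice_tries;
}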

commit cc367732ff0b1c63d0d7bdd11e6d1661794ef6a3
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Oct 15 17:00:18 2007 +0200

    sched: debug, improve migration statistics
    
    add new migration statistics when SCHED_DEBUG and SCHEDSTATS
    is enabled. Available in /proc/<PID>/sched.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index fcc9a5ada1a2..3a6e05e77715 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -931,6 +931,24 @@ struct sched_entity {
 	u64			block_max;
 	u64			exec_max;
 	u64			slice_max;
+
+	u64			nr_migrations;
+	u64			nr_migrations_cold;
+	u64			nr_failed_migrations_affine;
+	u64			nr_failed_migrations_running;
+	u64			nr_failed_migrations_hot;
+	u64			nr_forced_migrations;
+	u64			nr_forced2_migrations;
+
+	u64			nr_wakeups;
+	u64			nr_wakeups_sync;
+	u64			nr_wakeups_migrate;
+	u64			nr_wakeups_local;
+	u64			nr_wakeups_remote;
+	u64			nr_wakeups_affine;
+	u64			nr_wakeups_affine_attempts;
+	u64			nr_wakeups_passive;
+	u64			nr_wakeups_idle;
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched.c b/kernel/sched.c
index 945ab1322e18..3b27c3a553aa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1005,6 +1005,23 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 
 #ifdef CONFIG_SMP
 
+/*
+ * Is this task likely cache-hot:
+ */
+static inline int
+task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
+{
+	s64 delta;
+
+	if (p->sched_class != &fair_sched_class)
+		return 0;
+
+	delta = now - p->se.exec_start;
+
+	return delta < (s64)sysctl_sched_migration_cost;
+}
+
+
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
 	int old_cpu = task_cpu(p);
@@ -1022,6 +1039,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 		p->se.sleep_start -= clock_offset;
 	if (p->se.block_start)
 		p->se.block_start -= clock_offset;
+	if (old_cpu != new_cpu) {
+		schedstat_inc(p, se.nr_migrations);
+		if (task_hot(p, old_rq->clock, NULL))
+			schedstat_inc(p, se.nr_forced2_migrations);
+	}
 #endif
 	p->se.vruntime -= old_cfsrq->min_vruntime -
 					 new_cfsrq->min_vruntime;
@@ -1394,8 +1416,13 @@ static int wake_idle(int cpu, struct task_struct *p)
 		if (sd->flags & SD_WAKE_IDLE) {
 			cpus_and(tmp, sd->span, p->cpus_allowed);
 			for_each_cpu_mask(i, tmp) {
-				if (idle_cpu(i))
+				if (idle_cpu(i)) {
+					if (i != task_cpu(p)) {
+						schedstat_inc(p,
+							se.nr_wakeups_idle);
+					}
 					return i;
+				}
 			}
 		} else {
 			break;
@@ -1426,7 +1453,7 @@ static inline int wake_idle(int cpu, struct task_struct *p)
  */
 static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 {
-	int cpu, this_cpu, success = 0;
+	int cpu, orig_cpu, this_cpu, success = 0;
 	unsigned long flags;
 	long old_state;
 	struct rq *rq;
@@ -1445,6 +1472,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 		goto out_running;
 
 	cpu = task_cpu(p);
+	orig_cpu = cpu;
 	this_cpu = smp_processor_id();
 
 #ifdef CONFIG_SMP
@@ -1488,6 +1516,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 			unsigned long tl = this_load;
 			unsigned long tl_per_task;
 
+			schedstat_inc(p, se.nr_wakeups_affine_attempts);
 			tl_per_task = cpu_avg_load_per_task(this_cpu);
 
 			/*
@@ -1507,6 +1536,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 				 * there is no bad imbalance.
 				 */
 				schedstat_inc(this_sd, ttwu_move_affine);
+				schedstat_inc(p, se.nr_wakeups_affine);
 				goto out_set_cpu;
 			}
 		}
@@ -1518,6 +1548,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 		if (this_sd->flags & SD_WAKE_BALANCE) {
 			if (imbalance*this_load <= 100*load) {
 				schedstat_inc(this_sd, ttwu_move_balance);
+				schedstat_inc(p, se.nr_wakeups_passive);
 				goto out_set_cpu;
 			}
 		}
@@ -1543,6 +1574,15 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 
 out_activate:
 #endif /* CONFIG_SMP */
+	schedstat_inc(p, se.nr_wakeups);
+	if (sync)
+		schedstat_inc(p, se.nr_wakeups_sync);
+	if (orig_cpu != cpu)
+		schedstat_inc(p, se.nr_wakeups_migrate);
+	if (cpu == this_cpu)
+		schedstat_inc(p, se.nr_wakeups_local);
+	else
+		schedstat_inc(p, se.nr_wakeups_remote);
 	update_rq_clock(rq);
 	activate_task(rq, p, 1);
 	/*
@@ -2118,22 +2158,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
 	check_preempt_curr(this_rq, p);
 }
 
-/*
- * Is this task likely cache-hot:
- */
-static inline int
-task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
-{
-	s64 delta;
-
-	if (p->sched_class != &fair_sched_class)
-		return 0;
-
-	delta = now - p->se.exec_start;
-
-	return delta < (s64)sysctl_sched_migration_cost;
-}
-
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
@@ -2148,12 +2172,16 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
 	 * 3) are cache-hot on their current CPU.
 	 */
-	if (!cpu_isset(this_cpu, p->cpus_allowed))
+	if (!cpu_isset(this_cpu, p->cpus_allowed)) {
+		schedstat_inc(p, se.nr_failed_migrations_affine);
 		return 0;
+	}
 	*all_pinned = 0;
 
-	if (task_running(rq, p))
+	if (task_running(rq, p)) {
+		schedstat_inc(p, se.nr_failed_migrations_running);
 		return 0;
+	}
 
 	/*
 	 * Aggressive migration if:
@@ -2163,14 +2191,18 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 
 	if (sd->nr_balance_failed > sd->cache_nice_tries) {
 #ifdef CONFIG_SCHEDSTATS
-		if (task_hot(p, rq->clock, sd))
+		if (task_hot(p, rq->clock, sd)) {
 			schedstat_inc(sd, lb_hot_gained[idle]);
+			schedstat_inc(p, se.nr_forced_migrations);
+		}
 #endif
 		return 1;
 	}
 
-	if (task_hot(p, rq->clock, sd))
+	if (task_hot(p, rq->clock, sd)) {
+		schedstat_inc(p, se.nr_failed_migrations_hot);
 		return 0;
+	}
 	return 1;
 }
 
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 755815937417..27e82cbccaa5 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -260,6 +260,7 @@ __initcall(init_sched_debug_procfs);
 
 void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 {
+	unsigned long nr_switches;
 	unsigned long flags;
 	int num_threads = 1;
 
@@ -273,8 +274,12 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads);
 	SEQ_printf(m,
 		"---------------------------------------------------------\n");
+#define __P(F) \
+	SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F)
 #define P(F) \
 	SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F)
+#define __PN(F) \
+	SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
 #define PN(F) \
 	SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
 
@@ -282,6 +287,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	PN(se.vruntime);
 	PN(se.sum_exec_runtime);
 
+	nr_switches = p->nvcsw + p->nivcsw;
+
 #ifdef CONFIG_SCHEDSTATS
 	PN(se.wait_start);
 	PN(se.sleep_start);
@@ -292,14 +299,55 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	PN(se.slice_max);
 	PN(se.wait_max);
 	P(sched_info.bkl_count);
+	P(se.nr_migrations);
+	P(se.nr_migrations_cold);
+	P(se.nr_failed_migrations_affine);
+	P(se.nr_failed_migrations_running);
+	P(se.nr_failed_migrations_hot);
+	P(se.nr_forced_migrations);
+	P(se.nr_forced2_migrations);
+	P(se.nr_wakeups);
+	P(se.nr_wakeups_sync);
+	P(se.nr_wakeups_migrate);
+	P(se.nr_wakeups_local);
+	P(se.nr_wakeups_remote);
+	P(se.nr_wakeups_affine);
+	P(se.nr_wakeups_affine_attempts);
+	P(se.nr_wakeups_passive);
+	P(se.nr_wakeups_idle);
+
+	{
+		u64 avg_atom, avg_per_cpu;
+
+		avg_atom = p->se.sum_exec_runtime;
+		if (nr_switches)
+			do_div(avg_atom, nr_switches);
+		else
+			avg_atom = -1LL;
+
+		avg_per_cpu = p->se.sum_exec_runtime;
+		if (p->se.nr_migrations)
+			avg_per_cpu = div64_64(avg_per_cpu, p->se.nr_migrations);
+		else
+			avg_per_cpu = -1LL;
+
+		__PN(avg_atom);
+		__PN(avg_per_cpu);
+	}
 #endif
+	__P(nr_switches);
 	SEQ_printf(m, "%-35s:%21Ld\n",
-		   "nr_switches", (long long)(p->nvcsw + p->nivcsw));
+		   "nr_voluntary_switches", (long long)p->nvcsw);
+	SEQ_printf(m, "%-35s:%21Ld\n",
+		   "nr_involuntary_switches", (long long)p->nivcsw);
+
 	P(se.load.weight);
 	P(policy);
 	P(prio);
-#undef P
 #undef PN
+#undef __PN
+#undef P
+#undef __P
 
 	{
 		u64 t0, t1;
@@ -314,13 +362,32 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 void proc_sched_set_task(struct task_struct *p)
 {
 #ifdef CONFIG_SCHEDSTATS
-	p->se.sleep_max			= 0;
-	p->se.block_max			= 0;
-	p->se.exec_max			= 0;
-	p->se.slice_max			= 0;
-	p->se.wait_max			= 0;
-	p->sched_info.bkl_count		= 0;
+	p->se.wait_max				= 0;
+	p->se.sleep_max				= 0;
+	p->se.sum_sleep_runtime			= 0;
+	p->se.block_max				= 0;
+	p->se.exec_max				= 0;
+	p->se.slice_max				= 0;
+	p->se.nr_migrations			= 0;
+	p->se.nr_migrations_cold		= 0;
+	p->se.nr_failed_migrations_affine	= 0;
+	p->se.nr_failed_migrations_running	= 0;
+	p->se.nr_failed_migrations_hot		= 0;
+	p->se.nr_forced_migrations		= 0;
+	p->se.nr_forced2_migrations		= 0;
+	p->se.nr_wakeups			= 0;
+	p->se.nr_wakeups_sync			= 0;
+	p->se.nr_wakeups_migrate		= 0;
+	p->se.nr_wakeups_local			= 0;
+	p->se.nr_wakeups_remote			= 0;
+	p->se.nr_wakeups_affine			= 0;
+	p->se.nr_wakeups_affine_attempts	= 0;
+	p->se.nr_wakeups_passive		= 0;
+	p->se.nr_wakeups_idle			= 0;
+	p->sched_info.bkl_count			= 0;
 #endif
-	p->se.sum_exec_runtime		= 0;
-	p->se.prev_sum_exec_runtime	= 0;
+	p->se.sum_exec_runtime			= 0;
+	p->se.prev_sum_exec_runtime		= 0;
+	p->nvcsw				= 0;
+	p->nivcsw				= 0;
 }
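
The counters above are exported one per line in "name : value" form, so they are easy to read from user space once SCHED_DEBUG and SCHEDSTATS are enabled. Below is a small reader for the calling process; the filter strings are only an example. Note that writing to /proc/<PID>/sched ends up in proc_sched_set_task() (its caller in fs/proc/base.c is not shown in this patch), which zeroes the statistics.

#include <stdio.h>
#include <string.h>

/* Print the migration and wakeup counters of the calling process. */
int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/sched", "r");

	if (!f) {
		perror("fopen /proc/self/sched");
		return 1;
	}

	while (fgets(line, sizeof(line), f))
		if (strstr(line, "nr_migrations") || strstr(line, "nr_wakeups"))
			fputs(line, stdout);

	fclose(f);
	return 0;
}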

commit 2d92f22784b7b8879ebe3254e44c92cb8792b0dd
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Oct 15 17:00:18 2007 +0200

    sched: debug: increase width of debug line
    
    increase width of debug line - in preparation of more debugging info.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 0aab455a7b41..755815937417 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -198,7 +198,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
 	u64 now = ktime_to_ns(ktime_get());
 	int cpu;
 
-	SEQ_printf(m, "Sched Debug Version: v0.05-v20, %s %.*s\n",
+	SEQ_printf(m, "Sched Debug Version: v0.06-v22, %s %.*s\n",
 		init_utsname()->release,
 		(int)strcspn(init_utsname()->version, " "),
 		init_utsname()->version);
@@ -271,11 +271,12 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	rcu_read_unlock();
 
 	SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads);
-	SEQ_printf(m, "----------------------------------------------\n");
+	SEQ_printf(m,
+		"---------------------------------------------------------\n");
 #define P(F) \
-	SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F)
+	SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F)
 #define PN(F) \
-	SEQ_printf(m, "%-25s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+	SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
 
 	PN(se.exec_start);
 	PN(se.vruntime);
@@ -292,7 +293,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	PN(se.wait_max);
 	P(sched_info.bkl_count);
 #endif
-	SEQ_printf(m, "%-25s:%20Ld\n",
+	SEQ_printf(m, "%-35s:%21Ld\n",
 		   "nr_switches", (long long)(p->nvcsw + p->nivcsw));
 	P(se.load.weight);
 	P(policy);
@@ -305,7 +306,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 
 		t0 = sched_clock();
 		t1 = sched_clock();
-		SEQ_printf(m, "%-25s:%20Ld\n",
+		SEQ_printf(m, "%-35s:%21Ld\n",
 			   "clock-delta", (long long)(t1-t0));
 	}
 }

commit da84d96176729fb48a8458561e5d8647103168b8
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Oct 15 17:00:18 2007 +0200

    sched: reintroduce cache-hot affinity
    
    reintroduce a simplified version of cache-hot/cold scheduling
    affinity. This improves performance with certain SMP workloads,
    such as sysbench.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8be5b57768c0..fcc9a5ada1a2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1415,6 +1415,7 @@ extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_batch_wakeup_granularity;
 extern unsigned int sysctl_sched_child_runs_first;
 extern unsigned int sysctl_sched_features;
+extern unsigned int sysctl_sched_migration_cost;
 #endif
 
 extern unsigned int sysctl_sched_compat_yield;
diff --git a/kernel/sched.c b/kernel/sched.c
index 791dd08c692f..089d8b12ab76 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2118,6 +2118,17 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
 	check_preempt_curr(this_rq, p);
 }
 
+/*
+ * Is this task likely cache-hot:
+ */
+static inline int
+task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd)
+{
+	s64 delta = now - p->se.exec_start;
+
+	return delta < (long long)sysctl_sched_migration_cost;
+}
+
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
@@ -2139,6 +2150,22 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 	if (task_running(rq, p))
 		return 0;
 
+	/*
+	 * Aggressive migration if:
+	 * 1) task is cache cold, or
+	 * 2) too many balance attempts have failed.
+	 */
+
+	if (sd->nr_balance_failed > sd->cache_nice_tries) {
+#ifdef CONFIG_SCHEDSTATS
+		if (task_hot(p, rq->clock, sd))
+			schedstat_inc(sd, lb_hot_gained[idle]);
+#endif
+		return 1;
+	}
+
+	if (task_hot(p, rq->clock, sd))
+		return 0;
 	return 1;
 }
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index cea1fa32b3f4..a17b785d7000 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -74,6 +74,8 @@ const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
  */
 const_debug unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
 
+const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+
 /**************************************************************
  * CFS operations on generic schedulable entities:
  */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 230ca4eb57fe..ec14aa8ac51f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -277,6 +277,14 @@ static ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_migration_cost",
+		.data		= &sysctl_sched_migration_cost,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #endif
 	{
 		.ctl_name	= CTL_UNNUMBERED,
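
Because the threshold is wired into the kernel sysctl table, it can be inspected and tuned at runtime through /proc/sys/kernel/sched_migration_cost; the value is in nanoseconds, and the default set in sched_fair.c above is 500000, i.e. 0.5 ms. A minimal sketch of reading and raising it, assuming root privileges; the 1 ms value is only an example.

#include <stdio.h>

int main(void)
{
	unsigned int cost = 0;
	FILE *f = fopen("/proc/sys/kernel/sched_migration_cost", "r+");

	if (!f) {
		perror("sched_migration_cost");
		return 1;
	}

	if (fscanf(f, "%u", &cost) == 1)
		printf("current cache-hot threshold: %u ns\n", cost);

	rewind(f);			/* reposition before switching to writing */
	fprintf(f, "%u\n", 1000000);	/* treat tasks as cache-hot for 1 ms */

	fclose(f);
	return 0;
}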

commit e5f32a3856caabe745381279f7f32e3b581b59dc
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Oct 15 17:00:18 2007 +0200

    sched: speed up context-switches a bit
    
    speed up context-switches a bit by not clearing p->exec_start.
    
    (as a side-effect, this also makes p->exec_start a universal timestamp
    available to cache-hot estimations.)
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c240b72b4e62..cea1fa32b3f4 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -379,15 +379,6 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	se->exec_start = rq_of(cfs_rq)->clock;
 }
 
-/*
- * We are descheduling a task - update its stats:
- */
-static inline void
-update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-	se->exec_start = 0;
-}
-
 /**************************************************
  * Scheduling class queueing methods:
  */
@@ -609,8 +600,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 	if (prev->on_rq)
 		update_curr(cfs_rq);
 
-	update_stats_curr_end(cfs_rq, prev);
-
 	check_spread(cfs_rq, prev);
 	if (prev->on_rq) {
 		update_stats_wait_start(cfs_rq, prev);

commit 91c234b4e3419c786cac2d5b7a7b96443e512e3a
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Oct 15 17:00:18 2007 +0200

    sched: do not wakeup-preempt with SCHED_BATCH tasks
    
    do not wakeup-preempt with SCHED_BATCH tasks, their preemption
    is batched too, driven by the tick.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ec1592eb8d08..c240b72b4e62 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -828,6 +828,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 		resched_task(curr);
 		return;
 	}
+	/*
+	 * Batch tasks do not preempt (their preemption is driven by
+	 * the tick):
+	 */
+	if (unlikely(p->policy == SCHED_BATCH))
+		return;
 
 	if (sched_feat(WAKEUP_PREEMPT)) {
 		while (!is_same_group(se, pse)) {
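
A task opts into SCHED_BATCH through the ordinary sched_setscheduler() interface (the static priority must be 0); after this patch such a task no longer preempts the currently running task when it wakes up, and is instead scheduled in at the next tick-driven opportunity. A minimal sketch:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

/* Move the calling process into SCHED_BATCH. */
int main(void)
{
	struct sched_param param = { .sched_priority = 0 };

	if (sched_setscheduler(0, SCHED_BATCH, &param) != 0) {
		perror("sched_setscheduler");
		return 1;
	}

	printf("policy is now %d (SCHED_BATCH)\n", sched_getscheduler(0));
	return 0;
}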

commit 178be793485d70d871a0fd46b29e9e3e7da636ad
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Oct 15 17:00:18 2007 +0200

    sched: do not normalize kernel threads via SysRq-N
    
    do not normalize kernel threads via SysRq-N: the migration threads,
    softlockup threads, etc. might be essential for the system to
    function properly. So only zap user tasks.
    
    pointed out by Andi Kleen.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index fc61b1fc67d5..791dd08c692f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -365,15 +365,6 @@ static inline int cpu_of(struct rq *rq)
 #endif
 }
 
-static inline int is_migration_thread(struct task_struct *p, struct rq *rq)
-{
-#ifdef CONFIG_SMP
-	return p == rq->migration_thread;
-#else
-	return 0;
-#endif
-}
-
 /*
  * Update the per-runqueue clock, as finegrained as the platform can give
  * us, but without assuming monotonicity, etc.:
@@ -6563,6 +6554,12 @@ void normalize_rt_tasks(void)
 
 	read_lock_irq(&tasklist_lock);
 	do_each_thread(g, p) {
+		/*
+		 * Only normalize user tasks:
+		 */
+		if (!p->mm)
+			continue;
+
 		p->se.exec_start		= 0;
 #ifdef CONFIG_SCHEDSTATS
 		p->se.wait_start		= 0;
@@ -6584,8 +6581,7 @@ void normalize_rt_tasks(void)
 		spin_lock_irqsave(&p->pi_lock, flags);
 		rq = __task_rq_lock(p);
 
-		if (!is_migration_thread(p, rq))
-			normalize_task(rq, p);
+		normalize_task(rq, p);
 
 		__task_rq_unlock(rq);
 		spin_unlock_irqrestore(&p->pi_lock, flags);
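
normalize_rt_tasks() is what runs behind SysRq-N, which can also be triggered from user space via /proc/sysrq-trigger when CONFIG_MAGIC_SYSRQ is enabled; with this patch it skips anything without an mm, i.e. kernel threads. A minimal sketch of triggering it (requires root); the 'n' key is the SysRq command that normalizes RT tasks.

#include <stdio.h>

/* Emulate pressing SysRq-N: normalize all (user) real-time tasks. */
int main(void)
{
	FILE *f = fopen("/proc/sysrq-trigger", "w");

	if (!f) {
		perror("/proc/sysrq-trigger");
		return 1;
	}

	fputc('n', f);
	fclose(f);
	return 0;
}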

commit d5036e89dcf7c19b3d03219d7d385bc96965b7fe
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Oct 15 17:00:15 2007 +0200

    sched: clean up is_migration_thread()
    
    clean up is_migration_thread() and turn it into an inline function.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 2c6295b395a9..7ef66bd753e7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,12 +75,6 @@ unsigned long long __attribute__((weak)) sched_clock(void)
 	return (unsigned long long)jiffies * (1000000000 / HZ);
 }
 
-#ifdef CONFIG_SMP
-#define is_migration_thread(p, rq) ((p) == (rq)->migration_thread)
-#else
-#define is_migration_thread(p, rq) 0
-#endif
-
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -371,6 +365,15 @@ static inline int cpu_of(struct rq *rq)
 #endif
 }
 
+static inline int is_migration_thread(struct task_struct *p, struct rq *rq)
+{
+#ifdef CONFIG_SMP
+	return p == rq->migration_thread;
+#else
+	return 0;
+#endif
+}
+
 /*
  * Update the per-runqueue clock, as finegrained as the platform can give
  * us, but without assuming monotonicity, etc.:
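
The practical gain of the conversion is the usual one of static inline helpers over function-like macros: the compiler checks the argument types, and both the SMP and !SMP variants live in one readable function body. An illustrative, non-kernel contrast using made-up types:

struct task { int pid; };
struct rq { struct task *migration_thread; };

/* Macro form: purely textual expansion, no argument type checking. */
#define is_migration_thread_macro(p, rq)	((p) == (rq)->migration_thread)

/* Inline form: p and rq are type-checked, and the helper shows up by
 * name in compiler diagnostics and debuggers. */
static inline int is_migration_thread_inline(struct task *p, struct rq *rq)
{
	return p == rq->migration_thread;
}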