Patches contributed by Eötvös Loránd University


commit 7a6c6bcee029a978f866511d6e41dbc7301fde4c
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Oct 15 17:00:19 2007 +0200

    sched: enable wake-idle on CONFIG_SCHED_MC=y
    
    most multicore CPUs today have shared L2 caches, so tune things so
    that the spreading amongst cores is more aggressive.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/topology.h b/include/linux/topology.h
index 865a63e65578..47729f18bfdf 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -98,7 +98,7 @@
 	.cache_nice_tries	= 0,			\
 	.busy_idx		= 0,			\
 	.idle_idx		= 0,			\
-	.newidle_idx		= 1,			\
+	.newidle_idx		= 0,			\
 	.wake_idx		= 0,			\
 	.forkexec_idx		= 0,			\
 	.flags			= SD_LOAD_BALANCE	\
@@ -128,14 +128,15 @@
 	.imbalance_pct		= 125,			\
 	.cache_nice_tries	= 1,			\
 	.busy_idx		= 2,			\
-	.idle_idx		= 1,			\
-	.newidle_idx		= 2,			\
+	.idle_idx		= 0,			\
+	.newidle_idx		= 0,			\
 	.wake_idx		= 1,			\
 	.forkexec_idx		= 1,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
 				| SD_WAKE_AFFINE	\
+				| SD_WAKE_IDLE		\
 				| SD_SHARE_PKG_RESOURCES\
 				| BALANCE_FOR_MC_POWER,	\
 	.last_balance		= jiffies,		\
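
For context on the flag being added: SD_WAKE_IDLE lets the wakeup path place a woken task on an idle core of the package instead of waking it on its previous, busy CPU, which is what makes the spreading across cores more aggressive. A minimal user-space-style sketch of that selection rule follows; the mask representation, names and the idle test are illustrative stand-ins for the kernel's cpumask and idle_cpu() helpers.

#include <stdbool.h>

#define NR_CPUS 8

/* Illustrative stand-ins for kernel state. */
static bool cpu_is_idle[NR_CPUS];	/* idle_cpu(i) in the kernel      */
static unsigned long domain_span;	/* CPUs covered by the MC domain  */
static unsigned long cpus_allowed;	/* the task's affinity mask       */

/*
 * Wake-idle rule: when the domain has SD_WAKE_IDLE set, prefer any
 * allowed idle CPU in the domain over the CPU the task last ran on.
 */
static int wake_idle_sketch(int prev_cpu, bool sd_wake_idle)
{
	unsigned long candidates = domain_span & cpus_allowed;
	int i;

	if (!sd_wake_idle)
		return prev_cpu;

	for (i = 0; i < NR_CPUS; i++)
		if ((candidates & (1UL << i)) && cpu_is_idle[i])
			return i;

	return prev_cpu;
}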

commit 95dbb421d12fdd9796ed153853daf3679809274f
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Oct 15 17:00:19 2007 +0200

    sched: reintroduce topology.h tunings
    
    reintroduce the 2.6.22 topology.h tunings again - they result in
    slightly better balancing.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/topology.h b/include/linux/topology.h
index 525d437b1253..865a63e65578 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -98,7 +98,7 @@
 	.cache_nice_tries	= 0,			\
 	.busy_idx		= 0,			\
 	.idle_idx		= 0,			\
-	.newidle_idx		= 0,			\
+	.newidle_idx		= 1,			\
 	.wake_idx		= 0,			\
 	.forkexec_idx		= 0,			\
 	.flags			= SD_LOAD_BALANCE	\
@@ -128,15 +128,14 @@
 	.imbalance_pct		= 125,			\
 	.cache_nice_tries	= 1,			\
 	.busy_idx		= 2,			\
-	.idle_idx		= 0,			\
-	.newidle_idx		= 0,			\
+	.idle_idx		= 1,			\
+	.newidle_idx		= 2,			\
 	.wake_idx		= 1,			\
 	.forkexec_idx		= 1,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
 				| SD_WAKE_AFFINE	\
-				| SD_WAKE_IDLE		\
 				| SD_SHARE_PKG_RESOURCES\
 				| BALANCE_FOR_MC_POWER,	\
 	.last_balance		= jiffies,		\
@@ -159,15 +158,14 @@
 	.imbalance_pct		= 125,			\
 	.cache_nice_tries	= 1,			\
 	.busy_idx		= 2,			\
-	.idle_idx		= 0,			\
-	.newidle_idx		= 0,			\
+	.idle_idx		= 1,			\
+	.newidle_idx		= 2,			\
 	.wake_idx		= 1,			\
 	.forkexec_idx		= 1,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
 				| SD_WAKE_AFFINE	\
-				| SD_WAKE_IDLE		\
 				| BALANCE_FOR_PKG_POWER,\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
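
The *_idx values restored here select which entry of the runqueue's cpu_load[] history a given balancing path consults: index 0 is the instantaneous load, while higher indexes are progressively slower-decaying averages, so a low index reacts quickly and a high index balances more conservatively. Below is a sketch of the lookup, modelled on the source_load() helper of this kernel generation; the load numbers and names are purely illustrative.

#define CPU_LOAD_IDX_MAX 5

/* Illustrative decayed load history for one runqueue: entry 0 is the
 * most recent sample, later entries decay more slowly. */
static unsigned long cpu_load[CPU_LOAD_IDX_MAX] = { 900, 760, 640, 580, 540 };

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

/*
 * Sketch of source_load(): idx 0 uses the raw current load, while a
 * higher idx uses an older, smoother average, capped by the current
 * load so a busy history cannot make a now-idle CPU look loaded.
 */
static unsigned long source_load_sketch(unsigned long current_load, int idx)
{
	if (idx == 0)
		return current_load;

	return min_ul(cpu_load[idx - 1], current_load);
}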

commit 6bc1665ba71de0f207391b01b187b21b2619c15c
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Oct 15 17:00:18 2007 +0200

    sched: allow the immediate migration of cache-cold tasks
    
    allow the immediate migration of cache-cold tasks.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 3b27c3a553aa..750612751a7f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1016,6 +1016,11 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 	if (p->sched_class != &fair_sched_class)
 		return 0;
 
+	if (sysctl_sched_migration_cost == -1)
+		return 1;
+	if (sysctl_sched_migration_cost == 0)
+		return 0;
+
 	delta = now - p->se.exec_start;
 
 	return delta < (s64)sysctl_sched_migration_cost;
@@ -2189,7 +2194,8 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 	 * 2) too many balance attempts have failed.
 	 */
 
-	if (sd->nr_balance_failed > sd->cache_nice_tries) {
+	if (!task_hot(p, rq->clock, sd) ||
+			sd->nr_balance_failed > sd->cache_nice_tries) {
 #ifdef CONFIG_SCHEDSTATS
 		if (task_hot(p, rq->clock, sd)) {
 			schedstat_inc(sd, lb_hot_gained[idle]);
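
In effect, the pull decision now has two independent triggers: the task is cache-cold, or the domain has already failed too many balance attempts. Together with the new special cases, sched_migration_cost can be set to -1 to treat every task as cache-hot (never migrate early) or to 0 to treat every task as cold (always allow immediate migration). A condensed sketch of the combined logic; the function names are illustrative stand-ins for the kernel's task_hot() and can_migrate_task().

/* -1: everything is hot; 0: everything is cold; otherwise compare the
 * time since the task last started executing against the threshold. */
static int task_hot_sketch(long long now_ns, long long exec_start_ns,
			   long long migration_cost_ns)
{
	if (migration_cost_ns == -1)
		return 1;
	if (migration_cost_ns == 0)
		return 0;

	return (now_ns - exec_start_ns) < migration_cost_ns;
}

/* A task may be pulled if it is cold, or if balancing keeps failing. */
static int may_pull_sketch(int hot, unsigned int nr_balance_failed,
			   unsigned int cache_nice_tries)
{
	return !hot || nr_balance_failed > cache_nice_tries;
}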

commit cc367732ff0b1c63d0d7bdd11e6d1661794ef6a3
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Oct 15 17:00:18 2007 +0200

    sched: debug, improve migration statistics
    
    add new migration statistics when SCHED_DEBUG and SCHEDSTATS
    is enabled. Available in /proc/<PID>/sched.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index fcc9a5ada1a2..3a6e05e77715 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -931,6 +931,24 @@ struct sched_entity {
 	u64			block_max;
 	u64			exec_max;
 	u64			slice_max;
+
+	u64			nr_migrations;
+	u64			nr_migrations_cold;
+	u64			nr_failed_migrations_affine;
+	u64			nr_failed_migrations_running;
+	u64			nr_failed_migrations_hot;
+	u64			nr_forced_migrations;
+	u64			nr_forced2_migrations;
+
+	u64			nr_wakeups;
+	u64			nr_wakeups_sync;
+	u64			nr_wakeups_migrate;
+	u64			nr_wakeups_local;
+	u64			nr_wakeups_remote;
+	u64			nr_wakeups_affine;
+	u64			nr_wakeups_affine_attempts;
+	u64			nr_wakeups_passive;
+	u64			nr_wakeups_idle;
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched.c b/kernel/sched.c
index 945ab1322e18..3b27c3a553aa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1005,6 +1005,23 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 
 #ifdef CONFIG_SMP
 
+/*
+ * Is this task likely cache-hot:
+ */
+static inline int
+task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
+{
+	s64 delta;
+
+	if (p->sched_class != &fair_sched_class)
+		return 0;
+
+	delta = now - p->se.exec_start;
+
+	return delta < (s64)sysctl_sched_migration_cost;
+}
+
+
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
 	int old_cpu = task_cpu(p);
@@ -1022,6 +1039,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 		p->se.sleep_start -= clock_offset;
 	if (p->se.block_start)
 		p->se.block_start -= clock_offset;
+	if (old_cpu != new_cpu) {
+		schedstat_inc(p, se.nr_migrations);
+		if (task_hot(p, old_rq->clock, NULL))
+			schedstat_inc(p, se.nr_forced2_migrations);
+	}
 #endif
 	p->se.vruntime -= old_cfsrq->min_vruntime -
 					 new_cfsrq->min_vruntime;
@@ -1394,8 +1416,13 @@ static int wake_idle(int cpu, struct task_struct *p)
 		if (sd->flags & SD_WAKE_IDLE) {
 			cpus_and(tmp, sd->span, p->cpus_allowed);
 			for_each_cpu_mask(i, tmp) {
-				if (idle_cpu(i))
+				if (idle_cpu(i)) {
+					if (i != task_cpu(p)) {
+						schedstat_inc(p,
+							se.nr_wakeups_idle);
+					}
 					return i;
+				}
 			}
 		} else {
 			break;
@@ -1426,7 +1453,7 @@ static inline int wake_idle(int cpu, struct task_struct *p)
  */
 static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 {
-	int cpu, this_cpu, success = 0;
+	int cpu, orig_cpu, this_cpu, success = 0;
 	unsigned long flags;
 	long old_state;
 	struct rq *rq;
@@ -1445,6 +1472,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 		goto out_running;
 
 	cpu = task_cpu(p);
+	orig_cpu = cpu;
 	this_cpu = smp_processor_id();
 
 #ifdef CONFIG_SMP
@@ -1488,6 +1516,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 			unsigned long tl = this_load;
 			unsigned long tl_per_task;
 
+			schedstat_inc(p, se.nr_wakeups_affine_attempts);
 			tl_per_task = cpu_avg_load_per_task(this_cpu);
 
 			/*
@@ -1507,6 +1536,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 				 * there is no bad imbalance.
 				 */
 				schedstat_inc(this_sd, ttwu_move_affine);
+				schedstat_inc(p, se.nr_wakeups_affine);
 				goto out_set_cpu;
 			}
 		}
@@ -1518,6 +1548,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 		if (this_sd->flags & SD_WAKE_BALANCE) {
 			if (imbalance*this_load <= 100*load) {
 				schedstat_inc(this_sd, ttwu_move_balance);
+				schedstat_inc(p, se.nr_wakeups_passive);
 				goto out_set_cpu;
 			}
 		}
@@ -1543,6 +1574,15 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 
 out_activate:
 #endif /* CONFIG_SMP */
+	schedstat_inc(p, se.nr_wakeups);
+	if (sync)
+		schedstat_inc(p, se.nr_wakeups_sync);
+	if (orig_cpu != cpu)
+		schedstat_inc(p, se.nr_wakeups_migrate);
+	if (cpu == this_cpu)
+		schedstat_inc(p, se.nr_wakeups_local);
+	else
+		schedstat_inc(p, se.nr_wakeups_remote);
 	update_rq_clock(rq);
 	activate_task(rq, p, 1);
 	/*
@@ -2118,22 +2158,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
 	check_preempt_curr(this_rq, p);
 }
 
-/*
- * Is this task likely cache-hot:
- */
-static inline int
-task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
-{
-	s64 delta;
-
-	if (p->sched_class != &fair_sched_class)
-		return 0;
-
-	delta = now - p->se.exec_start;
-
-	return delta < (s64)sysctl_sched_migration_cost;
-}
-
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
@@ -2148,12 +2172,16 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
 	 * 3) are cache-hot on their current CPU.
 	 */
-	if (!cpu_isset(this_cpu, p->cpus_allowed))
+	if (!cpu_isset(this_cpu, p->cpus_allowed)) {
+		schedstat_inc(p, se.nr_failed_migrations_affine);
 		return 0;
+	}
 	*all_pinned = 0;
 
-	if (task_running(rq, p))
+	if (task_running(rq, p)) {
+		schedstat_inc(p, se.nr_failed_migrations_running);
 		return 0;
+	}
 
 	/*
 	 * Aggressive migration if:
@@ -2163,14 +2191,18 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 
 	if (sd->nr_balance_failed > sd->cache_nice_tries) {
 #ifdef CONFIG_SCHEDSTATS
-		if (task_hot(p, rq->clock, sd))
+		if (task_hot(p, rq->clock, sd)) {
 			schedstat_inc(sd, lb_hot_gained[idle]);
+			schedstat_inc(p, se.nr_forced_migrations);
+		}
 #endif
 		return 1;
 	}
 
-	if (task_hot(p, rq->clock, sd))
+	if (task_hot(p, rq->clock, sd)) {
+		schedstat_inc(p, se.nr_failed_migrations_hot);
 		return 0;
+	}
 	return 1;
 }
 
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 755815937417..27e82cbccaa5 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -260,6 +260,7 @@ __initcall(init_sched_debug_procfs);
 
 void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 {
+	unsigned long nr_switches;
 	unsigned long flags;
 	int num_threads = 1;
 
@@ -273,8 +274,12 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads);
 	SEQ_printf(m,
 		"---------------------------------------------------------\n");
+#define __P(F) \
+	SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F)
 #define P(F) \
 	SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F)
+#define __PN(F) \
+	SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
 #define PN(F) \
 	SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
 
@@ -282,6 +287,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	PN(se.vruntime);
 	PN(se.sum_exec_runtime);
 
+	nr_switches = p->nvcsw + p->nivcsw;
+
 #ifdef CONFIG_SCHEDSTATS
 	PN(se.wait_start);
 	PN(se.sleep_start);
@@ -292,14 +299,55 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	PN(se.slice_max);
 	PN(se.wait_max);
 	P(sched_info.bkl_count);
+	P(se.nr_migrations);
+	P(se.nr_migrations_cold);
+	P(se.nr_failed_migrations_affine);
+	P(se.nr_failed_migrations_running);
+	P(se.nr_failed_migrations_hot);
+	P(se.nr_forced_migrations);
+	P(se.nr_forced2_migrations);
+	P(se.nr_wakeups);
+	P(se.nr_wakeups_sync);
+	P(se.nr_wakeups_migrate);
+	P(se.nr_wakeups_local);
+	P(se.nr_wakeups_remote);
+	P(se.nr_wakeups_affine);
+	P(se.nr_wakeups_affine_attempts);
+	P(se.nr_wakeups_passive);
+	P(se.nr_wakeups_idle);
+
+	{
+		u64 avg_atom, avg_per_cpu;
+
+		avg_atom = p->se.sum_exec_runtime;
+		if (nr_switches)
+			do_div(avg_atom, nr_switches);
+		else
+			avg_atom = -1LL;
+
+		avg_per_cpu = p->se.sum_exec_runtime;
+		if (p->se.nr_migrations)
+			avg_per_cpu = div64_64(avg_per_cpu, p->se.nr_migrations);
+		else
+			avg_per_cpu = -1LL;
+
+		__PN(avg_atom);
+		__PN(avg_per_cpu);
+	}
 #endif
+	__P(nr_switches);
 	SEQ_printf(m, "%-35s:%21Ld\n",
-		   "nr_switches", (long long)(p->nvcsw + p->nivcsw));
+		   "nr_voluntary_switches", (long long)p->nvcsw);
+	SEQ_printf(m, "%-35s:%21Ld\n",
+		   "nr_involuntary_switches", (long long)p->nivcsw);
+
 	P(se.load.weight);
 	P(policy);
 	P(prio);
-#undef P
 #undef PN
+#undef __PN
+#undef P
+#undef __P
 
 	{
 		u64 t0, t1;
@@ -314,13 +362,32 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 void proc_sched_set_task(struct task_struct *p)
 {
 #ifdef CONFIG_SCHEDSTATS
-	p->se.sleep_max			= 0;
-	p->se.block_max			= 0;
-	p->se.exec_max			= 0;
-	p->se.slice_max			= 0;
-	p->se.wait_max			= 0;
-	p->sched_info.bkl_count		= 0;
+	p->se.wait_max				= 0;
+	p->se.sleep_max				= 0;
+	p->se.sum_sleep_runtime			= 0;
+	p->se.block_max				= 0;
+	p->se.exec_max				= 0;
+	p->se.slice_max				= 0;
+	p->se.nr_migrations			= 0;
+	p->se.nr_migrations_cold		= 0;
+	p->se.nr_failed_migrations_affine	= 0;
+	p->se.nr_failed_migrations_running	= 0;
+	p->se.nr_failed_migrations_hot		= 0;
+	p->se.nr_forced_migrations		= 0;
+	p->se.nr_forced2_migrations		= 0;
+	p->se.nr_wakeups			= 0;
+	p->se.nr_wakeups_sync			= 0;
+	p->se.nr_wakeups_migrate		= 0;
+	p->se.nr_wakeups_local			= 0;
+	p->se.nr_wakeups_remote			= 0;
+	p->se.nr_wakeups_affine			= 0;
+	p->se.nr_wakeups_affine_attempts	= 0;
+	p->se.nr_wakeups_passive		= 0;
+	p->se.nr_wakeups_idle			= 0;
+	p->sched_info.bkl_count			= 0;
 #endif
-	p->se.sum_exec_runtime		= 0;
-	p->se.prev_sum_exec_runtime	= 0;
+	p->se.sum_exec_runtime			= 0;
+	p->se.prev_sum_exec_runtime		= 0;
+	p->nvcsw				= 0;
+	p->nivcsw				= 0;
 }
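
The counters above are exported one per line in "name : value" form, so they are easy to read from user space once SCHED_DEBUG and SCHEDSTATS are enabled. Below is a small reader for the calling process; the filter strings are only an example. Note that writing to /proc/<PID>/sched ends up in proc_sched_set_task() (its caller in fs/proc/base.c is not shown in this patch), which zeroes the statistics.

#include <stdio.h>
#include <string.h>

/* Print the migration and wakeup counters of the calling process. */
int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/sched", "r");

	if (!f) {
		perror("fopen /proc/self/sched");
		return 1;
	}

	while (fgets(line, sizeof(line), f))
		if (strstr(line, "nr_migrations") || strstr(line, "nr_wakeups"))
			fputs(line, stdout);

	fclose(f);
	return 0;
}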

commit 2d92f22784b7b8879ebe3254e44c92cb8792b0dd
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Oct 15 17:00:18 2007 +0200

    sched: debug: increase width of debug line
    
    increase width of debug line - in preparation of more debugging info.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 0aab455a7b41..755815937417 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -198,7 +198,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
 	u64 now = ktime_to_ns(ktime_get());
 	int cpu;
 
-	SEQ_printf(m, "Sched Debug Version: v0.05-v20, %s %.*s\n",
+	SEQ_printf(m, "Sched Debug Version: v0.06-v22, %s %.*s\n",
 		init_utsname()->release,
 		(int)strcspn(init_utsname()->version, " "),
 		init_utsname()->version);
@@ -271,11 +271,12 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	rcu_read_unlock();
 
 	SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads);
-	SEQ_printf(m, "----------------------------------------------\n");
+	SEQ_printf(m,
+		"---------------------------------------------------------\n");
 #define P(F) \
-	SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F)
+	SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F)
 #define PN(F) \
-	SEQ_printf(m, "%-25s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+	SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
 
 	PN(se.exec_start);
 	PN(se.vruntime);
@@ -292,7 +293,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	PN(se.wait_max);
 	P(sched_info.bkl_count);
 #endif
-	SEQ_printf(m, "%-25s:%20Ld\n",
+	SEQ_printf(m, "%-35s:%21Ld\n",
 		   "nr_switches", (long long)(p->nvcsw + p->nivcsw));
 	P(se.load.weight);
 	P(policy);
@@ -305,7 +306,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 
 		t0 = sched_clock();
 		t1 = sched_clock();
-		SEQ_printf(m, "%-25s:%20Ld\n",
+		SEQ_printf(m, "%-35s:%21Ld\n",
 			   "clock-delta", (long long)(t1-t0));
 	}
 }

commit da84d96176729fb48a8458561e5d8647103168b8
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Oct 15 17:00:18 2007 +0200

    sched: reintroduce cache-hot affinity
    
    reintroduce a simplified version of cache-hot/cold scheduling
    affinity. This improves performance with certain SMP workloads,
    such as sysbench.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8be5b57768c0..fcc9a5ada1a2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1415,6 +1415,7 @@ extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_batch_wakeup_granularity;
 extern unsigned int sysctl_sched_child_runs_first;
 extern unsigned int sysctl_sched_features;
+extern unsigned int sysctl_sched_migration_cost;
 #endif
 
 extern unsigned int sysctl_sched_compat_yield;
diff --git a/kernel/sched.c b/kernel/sched.c
index 791dd08c692f..089d8b12ab76 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2118,6 +2118,17 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
 	check_preempt_curr(this_rq, p);
 }
 
+/*
+ * Is this task likely cache-hot:
+ */
+static inline int
+task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd)
+{
+	s64 delta = now - p->se.exec_start;
+
+	return delta < (long long)sysctl_sched_migration_cost;
+}
+
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
@@ -2139,6 +2150,22 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 	if (task_running(rq, p))
 		return 0;
 
+	/*
+	 * Aggressive migration if:
+	 * 1) task is cache cold, or
+	 * 2) too many balance attempts have failed.
+	 */
+
+	if (sd->nr_balance_failed > sd->cache_nice_tries) {
+#ifdef CONFIG_SCHEDSTATS
+		if (task_hot(p, rq->clock, sd))
+			schedstat_inc(sd, lb_hot_gained[idle]);
+#endif
+		return 1;
+	}
+
+	if (task_hot(p, rq->clock, sd))
+		return 0;
 	return 1;
 }
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index cea1fa32b3f4..a17b785d7000 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -74,6 +74,8 @@ const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
  */
 const_debug unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
 
+const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+
 /**************************************************************
  * CFS operations on generic schedulable entities:
  */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 230ca4eb57fe..ec14aa8ac51f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -277,6 +277,14 @@ static ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_migration_cost",
+		.data		= &sysctl_sched_migration_cost,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #endif
 	{
 		.ctl_name	= CTL_UNNUMBERED,
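
Because the threshold is wired into the kernel sysctl table, it can be inspected and tuned at runtime through /proc/sys/kernel/sched_migration_cost; the value is in nanoseconds, and the default set in sched_fair.c above is 500000, i.e. 0.5 ms. A minimal sketch of reading and raising it, assuming root privileges; the 1 ms value is only an example.

#include <stdio.h>

int main(void)
{
	unsigned int cost = 0;
	FILE *f = fopen("/proc/sys/kernel/sched_migration_cost", "r+");

	if (!f) {
		perror("sched_migration_cost");
		return 1;
	}

	if (fscanf(f, "%u", &cost) == 1)
		printf("current cache-hot threshold: %u ns\n", cost);

	rewind(f);			/* reposition before switching to writing */
	fprintf(f, "%u\n", 1000000);	/* treat tasks as cache-hot for 1 ms */

	fclose(f);
	return 0;
}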

commit e5f32a3856caabe745381279f7f32e3b581b59dc
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Oct 15 17:00:18 2007 +0200

    sched: speed up context-switches a bit
    
    speed up context-switches a bit by not clearing p->exec_start.
    
    (as a side-effect, this also makes p->exec_start a universal timestamp
    available to cache-hot estimations.)
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c240b72b4e62..cea1fa32b3f4 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -379,15 +379,6 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	se->exec_start = rq_of(cfs_rq)->clock;
 }
 
-/*
- * We are descheduling a task - update its stats:
- */
-static inline void
-update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-	se->exec_start = 0;
-}
-
 /**************************************************
  * Scheduling class queueing methods:
  */
@@ -609,8 +600,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 	if (prev->on_rq)
 		update_curr(cfs_rq);
 
-	update_stats_curr_end(cfs_rq, prev);
-
 	check_spread(cfs_rq, prev);
 	if (prev->on_rq) {
 		update_stats_wait_start(cfs_rq, prev);

commit 91c234b4e3419c786cac2d5b7a7b96443e512e3a
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Oct 15 17:00:18 2007 +0200

    sched: do not wakeup-preempt with SCHED_BATCH tasks
    
    do not wakeup-preempt with SCHED_BATCH tasks, their preemption
    is batched too, driven by the tick.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ec1592eb8d08..c240b72b4e62 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -828,6 +828,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 		resched_task(curr);
 		return;
 	}
+	/*
+	 * Batch tasks do not preempt (their preemption is driven by
+	 * the tick):
+	 */
+	if (unlikely(p->policy == SCHED_BATCH))
+		return;
 
 	if (sched_feat(WAKEUP_PREEMPT)) {
 		while (!is_same_group(se, pse)) {
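
A task opts into SCHED_BATCH through the ordinary sched_setscheduler() interface (the static priority must be 0); after this patch such a task no longer preempts the currently running task when it wakes up, and is instead scheduled in at the next tick-driven opportunity. A minimal sketch:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

/* Move the calling process into SCHED_BATCH. */
int main(void)
{
	struct sched_param param = { .sched_priority = 0 };

	if (sched_setscheduler(0, SCHED_BATCH, &param) != 0) {
		perror("sched_setscheduler");
		return 1;
	}

	printf("policy is now %d (SCHED_BATCH)\n", sched_getscheduler(0));
	return 0;
}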

commit 178be793485d70d871a0fd46b29e9e3e7da636ad
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Oct 15 17:00:18 2007 +0200

    sched: do not normalize kernel threads via SysRq-N
    
    do not normalize kernel threads via SysRq-N: the migration threads,
    softlockup threads, etc. might be essential for the system to
    function properly. So only zap user tasks.
    
    pointed out by Andi Kleen.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index fc61b1fc67d5..791dd08c692f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -365,15 +365,6 @@ static inline int cpu_of(struct rq *rq)
 #endif
 }
 
-static inline int is_migration_thread(struct task_struct *p, struct rq *rq)
-{
-#ifdef CONFIG_SMP
-	return p == rq->migration_thread;
-#else
-	return 0;
-#endif
-}
-
 /*
  * Update the per-runqueue clock, as finegrained as the platform can give
  * us, but without assuming monotonicity, etc.:
@@ -6563,6 +6554,12 @@ void normalize_rt_tasks(void)
 
 	read_lock_irq(&tasklist_lock);
 	do_each_thread(g, p) {
+		/*
+		 * Only normalize user tasks:
+		 */
+		if (!p->mm)
+			continue;
+
 		p->se.exec_start		= 0;
 #ifdef CONFIG_SCHEDSTATS
 		p->se.wait_start		= 0;
@@ -6584,8 +6581,7 @@ void normalize_rt_tasks(void)
 		spin_lock_irqsave(&p->pi_lock, flags);
 		rq = __task_rq_lock(p);
 
-		if (!is_migration_thread(p, rq))
-			normalize_task(rq, p);
+		normalize_task(rq, p);
 
 		__task_rq_unlock(rq);
 		spin_unlock_irqrestore(&p->pi_lock, flags);
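
normalize_rt_tasks() is what runs behind SysRq-N, which can also be triggered from user space via /proc/sysrq-trigger when CONFIG_MAGIC_SYSRQ is enabled; with this patch it skips anything without an mm, i.e. kernel threads. A minimal sketch of triggering it (requires root); the 'n' key is the SysRq command that normalizes RT tasks.

#include <stdio.h>

/* Emulate pressing SysRq-N: normalize all (user) real-time tasks. */
int main(void)
{
	FILE *f = fopen("/proc/sysrq-trigger", "w");

	if (!f) {
		perror("/proc/sysrq-trigger");
		return 1;
	}

	fputc('n', f);
	fclose(f);
	return 0;
}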

commit d5036e89dcf7c19b3d03219d7d385bc96965b7fe
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Oct 15 17:00:15 2007 +0200

    sched: clean up is_migration_thread()
    
    clean up is_migration_thread() and turn it into an inline function.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 2c6295b395a9..7ef66bd753e7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,12 +75,6 @@ unsigned long long __attribute__((weak)) sched_clock(void)
 	return (unsigned long long)jiffies * (1000000000 / HZ);
 }
 
-#ifdef CONFIG_SMP
-#define is_migration_thread(p, rq) ((p) == (rq)->migration_thread)
-#else
-#define is_migration_thread(p, rq) 0
-#endif
-
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -371,6 +365,15 @@ static inline int cpu_of(struct rq *rq)
 #endif
 }
 
+static inline int is_migration_thread(struct task_struct *p, struct rq *rq)
+{
+#ifdef CONFIG_SMP
+	return p == rq->migration_thread;
+#else
+	return 0;
+#endif
+}
+
 /*
  * Update the per-runqueue clock, as finegrained as the platform can give
  * us, but without assuming monotonicity, etc.:
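
The practical gain of the conversion is the usual one of static inline helpers over function-like macros: the compiler checks the argument types, and both the SMP and !SMP variants live in one readable function body. An illustrative, non-kernel contrast using made-up types:

struct task { int pid; };
struct rq { struct task *migration_thread; };

/* Macro form: purely textual expansion, no argument type checking. */
#define is_migration_thread_macro(p, rq)	((p) == (rq)->migration_thread)

/* Inline form: p and rq are type-checked, and the helper shows up by
 * name in compiler diagnostics and debuggers. */
static inline int is_migration_thread_inline(struct task *p, struct rq *rq)
{
	return p == rq->migration_thread;
}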