Patches contributed by Eötvös Lorand University

<<Prev 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312[313]314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 Next>>

commit c65cc8705256ad7524c97564b4fe3ca9782bf6d1
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Jul 9 18:51:58 2007 +0200

    sched: uninline set_task_cpu()
    
    uninline set_task_cpu(): CFS will add more code to it.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8764cda0feca..4b912e753ca0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1633,10 +1633,7 @@ static inline unsigned int task_cpu(const struct task_struct *p)
 	return task_thread_info(p)->cpu;
 }
 
-static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
-{
-	task_thread_info(p)->cpu = cpu;
-}
+extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
 
 #else
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 46b23f0fee24..d6624978feb2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1111,6 +1111,12 @@ unsigned long weighted_cpuload(const int cpu)
 }
 
 #ifdef CONFIG_SMP
+
+void set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+	task_thread_info(p)->cpu = cpu;
+}
+
 struct migration_req {
 	struct list_head list;

commit 0437e109e1841607f2988891eaa36c531c6aa6ac
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Jul 9 18:51:57 2007 +0200

    sched: zap the migration init / cache-hot balancing code
    
    the SMP load-balancer uses the boot-time migration-cost estimation
    code to attempt to improve the quality of balancing. The reason for
    this code is that the discrete priority queues do not preserve
    the order of scheduling accurately, so the load-balancer skips
    tasks that were running on a CPU 'recently'.
    
    this code is fundamental fragile: the boot-time migration cost detector
    doesnt really work on systems that had large L3 caches, it caused boot
    delays on large systems and the whole cache-hot concept made the
    balancing code pretty undeterministic as well.
    
    (and hey, i wrote most of it, so i can say it out loud that it sucks ;-)
    
    under CFS the same purpose of cache affinity can be achieved without
    any special cache-hot special-case: tasks are sorted in the 'timeline'
    tree and the SMP balancer picks tasks from the left side of the
    tree, thus the most cache-cold task is balanced automatically.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index af50f9bbe68e..4d880b3d1f35 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1014,49 +1014,6 @@ and is between 256 and 4096 characters. It is defined in the file
 
 	mga=		[HW,DRM]
 
-	migration_cost=
-			[KNL,SMP] debug: override scheduler migration costs
-			Format: <level-1-usecs>,<level-2-usecs>,...
-			This debugging option can be used to override the
-			default scheduler migration cost matrix. The numbers
-			are indexed by 'CPU domain distance'.
-			E.g. migration_cost=1000,2000,3000 on an SMT NUMA
-			box will set up an intra-core migration cost of
-			1 msec, an inter-core migration cost of 2 msecs,
-			and an inter-node migration cost of 3 msecs.
-
-			WARNING: using the wrong values here can break
-			scheduler performance, so it's only for scheduler
-			development purposes, not production environments.
-
-	migration_debug=
-			[KNL,SMP] migration cost auto-detect verbosity
-			Format=<0|1|2>
-			If a system's migration matrix reported at bootup
-			seems erroneous then this option can be used to
-			increase verbosity of the detection process.
-			We default to 0 (no extra messages), 1 will print
-			some more information, and 2 will be really
-			verbose (probably only useful if you also have a
-			serial console attached to the system).
-
-	migration_factor=
-			[KNL,SMP] multiply/divide migration costs by a factor
-			Format=<percent>
-			This debug option can be used to proportionally
-			increase or decrease the auto-detected migration
-			costs for all entries of the migration matrix.
-			E.g. migration_factor=150 will increase migration
-			costs by 50%. (and thus the scheduler will be less
-			eager migrating cache-hot tasks)
-			migration_factor=80 will decrease migration costs
-			by 20%. (thus the scheduler will be more eager to
-			migrate tasks)
-
-			WARNING: using the wrong values here can break
-			scheduler performance, so it's only for scheduler
-			development purposes, not production environments.
-
 	mousedev.tap_time=
 			[MOUSE] Maximum time between finger touching and
 			leaving touchpad surface for touch to be considered
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 88baed1e7e83..0b2954534b8e 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -941,17 +941,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
 }
 #endif
 
-static void smp_tune_scheduling(void)
-{
-	if (cpu_khz) {
-		/* cache size in kB */
-		long cachesize = boot_cpu_data.x86_cache_size;
-
-		if (cachesize > 0)
-			max_cache_size = cachesize * 1024;
-	}
-}
-
 /*
  * Cycle through the processors sending APIC IPIs to boot each.
  */
@@ -980,7 +969,6 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
 	x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
 
 	current_thread_info()->cpu = 0;
-	smp_tune_scheduling();
 
 	set_cpu_sibling_map(0);
 
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index eaa6a24bc0b6..188fb73c6845 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -805,7 +805,6 @@ static void __cpuinit
 get_max_cacheline_size (void)
 {
 	unsigned long line_size, max = 1;
-	unsigned int cache_size = 0;
 	u64 l, levels, unique_caches;
         pal_cache_config_info_t cci;
         s64 status;
@@ -835,8 +834,6 @@ get_max_cacheline_size (void)
 		line_size = 1 << cci.pcci_line_size;
 		if (line_size > max)
 			max = line_size;
-		if (cache_size < cci.pcci_cache_size)
-			cache_size = cci.pcci_cache_size;
 		if (!cci.pcci_unified) {
 			status = ia64_pal_cache_config_info(l,
 						    /* cache_type (instruction)= */ 1,
@@ -853,9 +850,6 @@ get_max_cacheline_size (void)
 			ia64_i_cache_stride_shift = cci.pcci_stride;
 	}
   out:
-#ifdef CONFIG_SMP
-	max_cache_size = max(max_cache_size, cache_size);
-#endif
 	if (max > ia64_max_cacheline_size)
 		ia64_max_cacheline_size = max;
 }
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index 67edfa7ed93a..a1b017f2dbb3 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -51,16 +51,6 @@ int __cpu_logical_map[NR_CPUS];		/* Map logical to physical */
 EXPORT_SYMBOL(phys_cpu_present_map);
 EXPORT_SYMBOL(cpu_online_map);
 
-/* This happens early in bootup, can't really do it better */
-static void smp_tune_scheduling (void)
-{
-	struct cache_desc *cd = &current_cpu_data.scache;
-	unsigned long cachesize = cd->linesz * cd->sets * cd->ways;
-
-	if (cachesize > max_cache_size)
-		max_cache_size = cachesize;
-}
-
 extern void __init calibrate_delay(void);
 extern ATTRIB_NORET void cpu_idle(void);
 
@@ -228,7 +218,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 {
 	init_new_context(current, &init_mm);
 	current_thread_info()->cpu = 0;
-	smp_tune_scheduling();
 	plat_prepare_cpus(max_cpus);
 #ifndef CONFIG_HOTPLUG_CPU
 	cpu_present_map = cpu_possible_map;
diff --git a/arch/sparc/kernel/smp.c b/arch/sparc/kernel/smp.c
index 4d9ad59031bb..4fea3ac7bff0 100644
--- a/arch/sparc/kernel/smp.c
+++ b/arch/sparc/kernel/smp.c
@@ -68,16 +68,6 @@ void __cpuinit smp_store_cpu_info(int id)
 	cpu_data(id).prom_node = cpu_node;
 	cpu_data(id).mid = cpu_get_hwmid(cpu_node);
 
-	/* this is required to tune the scheduler correctly */
-	/* is it possible to have CPUs with different cache sizes? */
-	if (id == boot_cpu_id) {
-		int cache_line,cache_nlines;
-		cache_line = 0x20;
-		cache_line = prom_getintdefault(cpu_node, "ecache-line-size", cache_line);
-		cache_nlines = 0x8000;
-		cache_nlines = prom_getintdefault(cpu_node, "ecache-nlines", cache_nlines);
-		max_cache_size = cache_line * cache_nlines;
-	}
 	if (cpu_data(id).mid < 0)
 		panic("No MID found for CPU%d at node 0x%08d", id, cpu_node);
 }
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index 4dcd7d0b60f2..40e40f968d61 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -1163,32 +1163,6 @@ int setup_profiling_timer(unsigned int multiplier)
 	return -EINVAL;
 }
 
-static void __init smp_tune_scheduling(void)
-{
-	unsigned int smallest = ~0U;
-	int i;
-
-	for (i = 0; i < NR_CPUS; i++) {
-		unsigned int val = cpu_data(i).ecache_size;
-
-		if (val && val < smallest)
-			smallest = val;
-	}
-
-	/* Any value less than 256K is nonsense.  */
-	if (smallest < (256U * 1024U))
-		smallest = 256 * 1024;
-
-	max_cache_size = smallest;
-
-	if (smallest < 1U * 1024U * 1024U)
-		printk(KERN_INFO "Using max_cache_size of %uKB\n",
-		       smallest / 1024U);
-	else
-		printk(KERN_INFO "Using max_cache_size of %uMB\n",
-		       smallest / 1024U / 1024U);
-}
-
 /* Constrain the number of cpus to max_cpus.  */
 void __init smp_prepare_cpus(unsigned int max_cpus)
 {
@@ -1206,7 +1180,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 	}
 
 	cpu_data(boot_cpu_id).udelay_val = loops_per_jiffy;
-	smp_tune_scheduling();
 }
 
 void __devinit smp_prepare_boot_cpu(void)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7e74262f98e1..8764cda0feca 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -754,12 +754,6 @@ struct sched_domain {
 extern int partition_sched_domains(cpumask_t *partition1,
 				    cpumask_t *partition2);
 
-/*
- * Maximum cache size the migration-costs auto-tuning code will
- * search from:
- */
-extern unsigned int max_cache_size;
-
 #endif	/* CONFIG_SMP */
 
 
diff --git a/kernel/sched.c b/kernel/sched.c
index ac054d9a0719..46b23f0fee24 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5797,483 +5797,6 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
 
 #define SD_NODES_PER_DOMAIN 16
 
-/*
- * Self-tuning task migration cost measurement between source and target CPUs.
- *
- * This is done by measuring the cost of manipulating buffers of varying
- * sizes. For a given buffer-size here are the steps that are taken:
- *
- * 1) the source CPU reads+dirties a shared buffer
- * 2) the target CPU reads+dirties the same shared buffer
- *
- * We measure how long they take, in the following 4 scenarios:
- *
- *  - source: CPU1, target: CPU2 | cost1
- *  - source: CPU2, target: CPU1 | cost2
- *  - source: CPU1, target: CPU1 | cost3
- *  - source: CPU2, target: CPU2 | cost4
- *
- * We then calculate the cost3+cost4-cost1-cost2 difference - this is
- * the cost of migration.
- *
- * We then start off from a small buffer-size and iterate up to larger
- * buffer sizes, in 5% steps - measuring each buffer-size separately, and
- * doing a maximum search for the cost. (The maximum cost for a migration
- * normally occurs when the working set size is around the effective cache
- * size.)
- */
-#define SEARCH_SCOPE		2
-#define MIN_CACHE_SIZE		(64*1024U)
-#define DEFAULT_CACHE_SIZE	(5*1024*1024U)
-#define ITERATIONS		1
-#define SIZE_THRESH		130
-#define COST_THRESH		130
-
-/*
- * The migration cost is a function of 'domain distance'. Domain
- * distance is the number of steps a CPU has to iterate down its
- * domain tree to share a domain with the other CPU. The farther
- * two CPUs are from each other, the larger the distance gets.
- *
- * Note that we use the distance only to cache measurement results,
- * the distance value is not used numerically otherwise. When two
- * CPUs have the same distance it is assumed that the migration
- * cost is the same. (this is a simplification but quite practical)
- */
-#define MAX_DOMAIN_DISTANCE 32
-
-static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
-		{ [ 0 ... MAX_DOMAIN_DISTANCE-1 ] =
-/*
- * Architectures may override the migration cost and thus avoid
- * boot-time calibration. Unit is nanoseconds. Mostly useful for
- * virtualized hardware:
- */
-#ifdef CONFIG_DEFAULT_MIGRATION_COST
-			CONFIG_DEFAULT_MIGRATION_COST
-#else
-			-1LL
-#endif
-};
-
-/*
- * Allow override of migration cost - in units of microseconds.
- * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
- * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs:
- */
-static int __init migration_cost_setup(char *str)
-{
-	int ints[MAX_DOMAIN_DISTANCE+1], i;
-
-	str = get_options(str, ARRAY_SIZE(ints), ints);
-
-	printk("#ints: %d\n", ints[0]);
-	for (i = 1; i <= ints[0]; i++) {
-		migration_cost[i-1] = (unsigned long long)ints[i]*1000;
-		printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
-	}
-	return 1;
-}
-
-__setup ("migration_cost=", migration_cost_setup);
-
-/*
- * Global multiplier (divisor) for migration-cutoff values,
- * in percentiles. E.g. use a value of 150 to get 1.5 times
- * longer cache-hot cutoff times.
- *
- * (We scale it from 100 to 128 to long long handling easier.)
- */
-
-#define MIGRATION_FACTOR_SCALE 128
-
-static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
-
-static int __init setup_migration_factor(char *str)
-{
-	get_option(&str, &migration_factor);
-	migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
-	return 1;
-}
-
-__setup("migration_factor=", setup_migration_factor);
-
-/*
- * Estimated distance of two CPUs, measured via the number of domains
- * we have to pass for the two CPUs to be in the same span:
- */
-static unsigned long domain_distance(int cpu1, int cpu2)
-{
-	unsigned long distance = 0;
-	struct sched_domain *sd;
-
-	for_each_domain(cpu1, sd) {
-		WARN_ON(!cpu_isset(cpu1, sd->span));
-		if (cpu_isset(cpu2, sd->span))
-			return distance;
-		distance++;
-	}
-	if (distance >= MAX_DOMAIN_DISTANCE) {
-		WARN_ON(1);
-		distance = MAX_DOMAIN_DISTANCE-1;
-	}
-
-	return distance;
-}
-
-static unsigned int migration_debug;
-
-static int __init setup_migration_debug(char *str)
-{
-	get_option(&str, &migration_debug);
-	return 1;
-}
-
-__setup("migration_debug=", setup_migration_debug);
-
-/*
- * Maximum cache-size that the scheduler should try to measure.
- * Architectures with larger caches should tune this up during
- * bootup. Gets used in the domain-setup code (i.e. during SMP
- * bootup).
- */
-unsigned int max_cache_size;
-
-static int __init setup_max_cache_size(char *str)
-{
-	get_option(&str, &max_cache_size);
-	return 1;
-}
-
-__setup("max_cache_size=", setup_max_cache_size);
-
-/*
- * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
- * is the operation that is timed, so we try to generate unpredictable
- * cachemisses that still end up filling the L2 cache:
- */
-static void touch_cache(void *__cache, unsigned long __size)
-{
-	unsigned long size = __size / sizeof(long);
-	unsigned long chunk1 = size / 3;
-	unsigned long chunk2 = 2 * size / 3;
-	unsigned long *cache = __cache;
-	int i;
-
-	for (i = 0; i < size/6; i += 8) {
-		switch (i % 6) {
-			case 0: cache[i]++;
-			case 1: cache[size-1-i]++;
-			case 2: cache[chunk1-i]++;
-			case 3: cache[chunk1+i]++;
-			case 4: cache[chunk2-i]++;
-			case 5: cache[chunk2+i]++;
-		}
-	}
-}
-
-/*
- * Measure the cache-cost of one task migration. Returns in units of nsec.
- */
-static unsigned long long
-measure_one(void *cache, unsigned long size, int source, int target)
-{
-	cpumask_t mask, saved_mask;
-	unsigned long long t0, t1, t2, t3, cost;
-
-	saved_mask = current->cpus_allowed;
-
-	/*
-	 * Flush source caches to RAM and invalidate them:
-	 */
-	sched_cacheflush();
-
-	/*
-	 * Migrate to the source CPU:
-	 */
-	mask = cpumask_of_cpu(source);
-	set_cpus_allowed(current, mask);
-	WARN_ON(smp_processor_id() != source);
-
-	/*
-	 * Dirty the working set:
-	 */
-	t0 = sched_clock();
-	touch_cache(cache, size);
-	t1 = sched_clock();
-
-	/*
-	 * Migrate to the target CPU, dirty the L2 cache and access
-	 * the shared buffer. (which represents the working set
-	 * of a migrated task.)
-	 */
-	mask = cpumask_of_cpu(target);
-	set_cpus_allowed(current, mask);
-	WARN_ON(smp_processor_id() != target);
-
-	t2 = sched_clock();
-	touch_cache(cache, size);
-	t3 = sched_clock();
-
-	cost = t1-t0 + t3-t2;
-
-	if (migration_debug >= 2)
-		printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
-			source, target, t1-t0, t1-t0, t3-t2, cost);
-	/*
-	 * Flush target caches to RAM and invalidate them:
-	 */
-	sched_cacheflush();
-
-	set_cpus_allowed(current, saved_mask);
-
-	return cost;
-}
-
-/*
- * Measure a series of task migrations and return the average
- * result. Since this code runs early during bootup the system
- * is 'undisturbed' and the average latency makes sense.
- *
- * The algorithm in essence auto-detects the relevant cache-size,
- * so it will properly detect different cachesizes for different
- * cache-hierarchies, depending on how the CPUs are connected.
- *
- * Architectures can prime the upper limit of the search range via
- * max_cache_size, otherwise the search range defaults to 20MB...64K.
- */
-static unsigned long long
-measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
-{
-	unsigned long long cost1, cost2;
-	int i;
-
-	/*
-	 * Measure the migration cost of 'size' bytes, over an
-	 * average of 10 runs:
-	 *
-	 * (We perturb the cache size by a small (0..4k)
-	 *  value to compensate size/alignment related artifacts.
-	 *  We also subtract the cost of the operation done on
-	 *  the same CPU.)
-	 */
-	cost1 = 0;
-
-	/*
-	 * dry run, to make sure we start off cache-cold on cpu1,
-	 * and to get any vmalloc pagefaults in advance:
-	 */
-	measure_one(cache, size, cpu1, cpu2);
-	for (i = 0; i < ITERATIONS; i++)
-		cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2);
-
-	measure_one(cache, size, cpu2, cpu1);
-	for (i = 0; i < ITERATIONS; i++)
-		cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1);
-
-	/*
-	 * (We measure the non-migrating [cached] cost on both
-	 *  cpu1 and cpu2, to handle CPUs with different speeds)
-	 */
-	cost2 = 0;
-
-	measure_one(cache, size, cpu1, cpu1);
-	for (i = 0; i < ITERATIONS; i++)
-		cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1);
-
-	measure_one(cache, size, cpu2, cpu2);
-	for (i = 0; i < ITERATIONS; i++)
-		cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2);
-
-	/*
-	 * Get the per-iteration migration cost:
-	 */
-	do_div(cost1, 2 * ITERATIONS);
-	do_div(cost2, 2 * ITERATIONS);
-
-	return cost1 - cost2;
-}
-
-static unsigned long long measure_migration_cost(int cpu1, int cpu2)
-{
-	unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
-	unsigned int max_size, size, size_found = 0;
-	long long cost = 0, prev_cost;
-	void *cache;
-
-	/*
-	 * Search from max_cache_size*5 down to 64K - the real relevant
-	 * cachesize has to lie somewhere inbetween.
-	 */
-	if (max_cache_size) {
-		max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
-		size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
-	} else {
-		/*
-		 * Since we have no estimation about the relevant
-		 * search range
-		 */
-		max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
-		size = MIN_CACHE_SIZE;
-	}
-
-	if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
-		printk("cpu %d and %d not both online!\n", cpu1, cpu2);
-		return 0;
-	}
-
-	/*
-	 * Allocate the working set:
-	 */
-	cache = vmalloc(max_size);
-	if (!cache) {
-		printk("could not vmalloc %d bytes for cache!\n", 2 * max_size);
-		return 1000000; /* return 1 msec on very small boxen */
-	}
-
-	while (size <= max_size) {
-		prev_cost = cost;
-		cost = measure_cost(cpu1, cpu2, cache, size);
-
-		/*
-		 * Update the max:
-		 */
-		if (cost > 0) {
-			if (max_cost < cost) {
-				max_cost = cost;
-				size_found = size;
-			}
-		}
-		/*
-		 * Calculate average fluctuation, we use this to prevent
-		 * noise from triggering an early break out of the loop:
-		 */
-		fluct = abs(cost - prev_cost);
-		avg_fluct = (avg_fluct + fluct)/2;
-
-		if (migration_debug)
-			printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): "
-				"(%8Ld %8Ld)\n",
-				cpu1, cpu2, size,
-				(long)cost / 1000000,
-				((long)cost / 100000) % 10,
-				(long)max_cost / 1000000,
-				((long)max_cost / 100000) % 10,
-				domain_distance(cpu1, cpu2),
-				cost, avg_fluct);
-
-		/*
-		 * If we iterated at least 20% past the previous maximum,
-		 * and the cost has dropped by more than 20% already,
-		 * (taking fluctuations into account) then we assume to
-		 * have found the maximum and break out of the loop early:
-		 */
-		if (size_found && (size*100 > size_found*SIZE_THRESH))
-			if (cost+avg_fluct <= 0 ||
-				max_cost*100 > (cost+avg_fluct)*COST_THRESH) {
-
-				if (migration_debug)
-					printk("-> found max.\n");
-				break;
-			}
-		/*
-		 * Increase the cachesize in 10% steps:
-		 */
-		size = size * 10 / 9;
-	}
-
-	if (migration_debug)
-		printk("[%d][%d] working set size found: %d, cost: %Ld\n",
-			cpu1, cpu2, size_found, max_cost);
-
-	vfree(cache);
-
-	/*
-	 * A task is considered 'cache cold' if at least 2 times
-	 * the worst-case cost of migration has passed.
-	 *
-	 * (this limit is only listened to if the load-balancing
-	 * situation is 'nice' - if there is a large imbalance we
-	 * ignore it for the sake of CPU utilization and
-	 * processing fairness.)
-	 */
-	return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
-}
-
-static void calibrate_migration_costs(const cpumask_t *cpu_map)
-{
-	int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
-	unsigned long j0, j1, distance, max_distance = 0;
-	struct sched_domain *sd;
-
-	j0 = jiffies;
-
-	/*
-	 * First pass - calculate the cacheflush times:
-	 */
-	for_each_cpu_mask(cpu1, *cpu_map) {
-		for_each_cpu_mask(cpu2, *cpu_map) {
-			if (cpu1 == cpu2)
-				continue;
-			distance = domain_distance(cpu1, cpu2);
-			max_distance = max(max_distance, distance);
-			/*
-			 * No result cached yet?
-			 */
-			if (migration_cost[distance] == -1LL)
-				migration_cost[distance] =
-					measure_migration_cost(cpu1, cpu2);
-		}
-	}
-	/*
-	 * Second pass - update the sched domain hierarchy with
-	 * the new cache-hot-time estimations:
-	 */
-	for_each_cpu_mask(cpu, *cpu_map) {
-		distance = 0;
-		for_each_domain(cpu, sd) {
-			sd->cache_hot_time = migration_cost[distance];
-			distance++;
-		}
-	}
-	/*
-	 * Print the matrix:
-	 */
-	if (migration_debug)
-		printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
-			max_cache_size,
-#ifdef CONFIG_X86
-			cpu_khz/1000
-#else
-			-1
-#endif
-		);
-	if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) {
-		printk("migration_cost=");
-		for (distance = 0; distance <= max_distance; distance++) {
-			if (distance)
-				printk(",");
-			printk("%ld", (long)migration_cost[distance] / 1000);
-		}
-		printk("\n");
-	}
-	j1 = jiffies;
-	if (migration_debug)
-		printk("migration: %ld seconds\n", (j1-j0) / HZ);
-
-	/*
-	 * Move back to the original CPU. NUMA-Q gets confused
-	 * if we migrate to another quad during bootup.
-	 */
-	if (raw_smp_processor_id() != orig_cpu) {
-		cpumask_t mask = cpumask_of_cpu(orig_cpu),
-			saved_mask = current->cpus_allowed;
-
-		set_cpus_allowed(current, mask);
-		set_cpus_allowed(current, saved_mask);
-	}
-}
-
 #ifdef CONFIG_NUMA
 
 /**
@@ -6803,10 +6326,6 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 #endif
 		cpu_attach_domain(sd, i);
 	}
-	/*
-	 * Tune cache-hot values:
-	 */
-	calibrate_migration_costs(cpu_map);
 
 	return 0;

commit 0e6aca43e08a62a48d6770e9a159dbec167bf4c6
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Jul 9 18:51:57 2007 +0200

    sched: add SCHED_IDLE policy
    
    this patch adds the SCHED_IDLE policy to sched.h.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2acfb23f3681..7e74262f98e1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -34,6 +34,8 @@
 #define SCHED_FIFO		1
 #define SCHED_RR		2
 #define SCHED_BATCH		3
+/* SCHED_ISO: reserved but not implemented yet */
+#define SCHED_IDLE		5
 
 #ifdef __KERNEL__

commit d15bcfdbe1818478891d714343f037cfe60875f0
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Jul 9 18:51:57 2007 +0200

    sched: rename idle_type/SCHED_IDLE
    
    enum idle_type (used by the load-balancer) clashes with the
    SCHED_IDLE name that we want to introduce. 'CPU_IDLE' instead
    of 'SCHED_IDLE' is more descriptive as well.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 693f0e6c54d4..2acfb23f3681 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -639,12 +639,11 @@ static inline int sched_info_on(void)
 #endif
 }
 
-enum idle_type
-{
-	SCHED_IDLE,
-	NOT_IDLE,
-	NEWLY_IDLE,
-	MAX_IDLE_TYPES
+enum cpu_idle_type {
+	CPU_IDLE,
+	CPU_NOT_IDLE,
+	CPU_NEWLY_IDLE,
+	CPU_MAX_IDLE_TYPES
 };
 
 /*
@@ -719,14 +718,14 @@ struct sched_domain {
 
 #ifdef CONFIG_SCHEDSTATS
 	/* load_balance() stats */
-	unsigned long lb_cnt[MAX_IDLE_TYPES];
-	unsigned long lb_failed[MAX_IDLE_TYPES];
-	unsigned long lb_balanced[MAX_IDLE_TYPES];
-	unsigned long lb_imbalance[MAX_IDLE_TYPES];
-	unsigned long lb_gained[MAX_IDLE_TYPES];
-	unsigned long lb_hot_gained[MAX_IDLE_TYPES];
-	unsigned long lb_nobusyg[MAX_IDLE_TYPES];
-	unsigned long lb_nobusyq[MAX_IDLE_TYPES];
+	unsigned long lb_cnt[CPU_MAX_IDLE_TYPES];
+	unsigned long lb_failed[CPU_MAX_IDLE_TYPES];
+	unsigned long lb_balanced[CPU_MAX_IDLE_TYPES];
+	unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES];
+	unsigned long lb_gained[CPU_MAX_IDLE_TYPES];
+	unsigned long lb_hot_gained[CPU_MAX_IDLE_TYPES];
+	unsigned long lb_nobusyg[CPU_MAX_IDLE_TYPES];
+	unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES];
 
 	/* Active load balancing */
 	unsigned long alb_cnt;
diff --git a/kernel/sched.c b/kernel/sched.c
index 50e1a3122699..ac054d9a0719 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -496,12 +496,12 @@ static int show_schedstat(struct seq_file *seq, void *v)
 		/* domain-specific stats */
 		preempt_disable();
 		for_each_domain(cpu, sd) {
-			enum idle_type itype;
+			enum cpu_idle_type itype;
 			char mask_str[NR_CPUS];
 
 			cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
 			seq_printf(seq, "domain%d %s", dcnt++, mask_str);
-			for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
+			for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
 					itype++) {
 				seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
 						"%lu",
@@ -2208,7 +2208,7 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array,
  */
 static
 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
-		     struct sched_domain *sd, enum idle_type idle,
+		     struct sched_domain *sd, enum cpu_idle_type idle,
 		     int *all_pinned)
 {
 	/*
@@ -2254,7 +2254,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
  */
 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		      unsigned long max_nr_move, unsigned long max_load_move,
-		      struct sched_domain *sd, enum idle_type idle,
+		      struct sched_domain *sd, enum cpu_idle_type idle,
 		      int *all_pinned)
 {
 	int idx, pulled = 0, pinned = 0, this_best_prio, best_prio,
@@ -2372,7 +2372,7 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
  */
 static struct sched_group *
 find_busiest_group(struct sched_domain *sd, int this_cpu,
-		   unsigned long *imbalance, enum idle_type idle, int *sd_idle,
+		   unsigned long *imbalance, enum cpu_idle_type idle, int *sd_idle,
 		   cpumask_t *cpus, int *balance)
 {
 	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
@@ -2391,9 +2391,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	max_load = this_load = total_load = total_pwr = 0;
 	busiest_load_per_task = busiest_nr_running = 0;
 	this_load_per_task = this_nr_running = 0;
-	if (idle == NOT_IDLE)
+	if (idle == CPU_NOT_IDLE)
 		load_idx = sd->busy_idx;
-	else if (idle == NEWLY_IDLE)
+	else if (idle == CPU_NEWLY_IDLE)
 		load_idx = sd->newidle_idx;
 	else
 		load_idx = sd->idle_idx;
@@ -2477,7 +2477,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		 * Busy processors will not participate in power savings
 		 * balance.
 		 */
- 		if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+ 		if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
  			goto group_next;
 
 		/*
@@ -2639,7 +2639,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 
 out_balanced:
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-	if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
 		goto ret;
 
 	if (this == group_leader && group_leader != group_min) {
@@ -2656,7 +2656,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
  * find_busiest_queue - find the busiest runqueue among the cpus in group.
  */
 static struct rq *
-find_busiest_queue(struct sched_group *group, enum idle_type idle,
+find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
 		   unsigned long imbalance, cpumask_t *cpus)
 {
 	struct rq *busiest = NULL, *rq;
@@ -2698,7 +2698,7 @@ static inline unsigned long minus_1_or_zero(unsigned long n)
  * tasks if there is an imbalance.
  */
 static int load_balance(int this_cpu, struct rq *this_rq,
-			struct sched_domain *sd, enum idle_type idle,
+			struct sched_domain *sd, enum cpu_idle_type idle,
 			int *balance)
 {
 	int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
@@ -2712,9 +2712,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	 * When power savings policy is enabled for the parent domain, idle
 	 * sibling can pick up load irrespective of busy siblings. In this case,
 	 * let the state of idle sibling percolate up as IDLE, instead of
-	 * portraying it as NOT_IDLE.
+	 * portraying it as CPU_NOT_IDLE.
 	 */
-	if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
+	if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 		sd_idle = 1;
 
@@ -2848,7 +2848,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
  * Check this_cpu to ensure it is balanced within domain. Attempt to move
  * tasks if there is an imbalance.
  *
- * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
+ * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
  * this_rq is locked.
  */
 static int
@@ -2865,31 +2865,31 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
 	 * When power savings policy is enabled for the parent domain, idle
 	 * sibling can pick up load irrespective of busy siblings. In this case,
 	 * let the state of idle sibling percolate up as IDLE, instead of
-	 * portraying it as NOT_IDLE.
+	 * portraying it as CPU_NOT_IDLE.
 	 */
 	if (sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 		sd_idle = 1;
 
-	schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
+	schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
 redo:
-	group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
+	group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
 				   &sd_idle, &cpus, NULL);
 	if (!group) {
-		schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
+		schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
 		goto out_balanced;
 	}
 
-	busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance,
+	busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
 				&cpus);
 	if (!busiest) {
-		schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
+		schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
 		goto out_balanced;
 	}
 
 	BUG_ON(busiest == this_rq);
 
-	schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
+	schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
 
 	nr_moved = 0;
 	if (busiest->nr_running > 1) {
@@ -2897,7 +2897,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
 		double_lock_balance(this_rq, busiest);
 		nr_moved = move_tasks(this_rq, this_cpu, busiest,
 					minus_1_or_zero(busiest->nr_running),
-					imbalance, sd, NEWLY_IDLE, NULL);
+					imbalance, sd, CPU_NEWLY_IDLE, NULL);
 		spin_unlock(&busiest->lock);
 
 		if (!nr_moved) {
@@ -2908,7 +2908,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
 	}
 
 	if (!nr_moved) {
-		schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
+		schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
 		if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 		    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 			return -1;
@@ -2918,7 +2918,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
 	return nr_moved;
 
 out_balanced:
-	schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+	schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
 	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 		return -1;
@@ -3003,7 +3003,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 		schedstat_inc(sd, alb_cnt);
 
 		if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
-			       RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE,
+			       RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
 			       NULL))
 			schedstat_inc(sd, alb_pushed);
 		else
@@ -3120,7 +3120,7 @@ static DEFINE_SPINLOCK(balancing);
  *
  * Balancing parameters are set up in arch_init_sched_domains.
  */
-static inline void rebalance_domains(int cpu, enum idle_type idle)
+static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
 {
 	int balance = 1;
 	struct rq *rq = cpu_rq(cpu);
@@ -3134,7 +3134,7 @@ static inline void rebalance_domains(int cpu, enum idle_type idle)
 			continue;
 
 		interval = sd->balance_interval;
-		if (idle != SCHED_IDLE)
+		if (idle != CPU_IDLE)
 			interval *= sd->busy_factor;
 
 		/* scale ms to jiffies */
@@ -3154,7 +3154,7 @@ static inline void rebalance_domains(int cpu, enum idle_type idle)
 				 * longer idle, or one of our SMT siblings is
 				 * not idle.
 				 */
-				idle = NOT_IDLE;
+				idle = CPU_NOT_IDLE;
 			}
 			sd->last_balance = jiffies;
 		}
@@ -3184,7 +3184,7 @@ static void run_rebalance_domains(struct softirq_action *h)
 {
 	int local_cpu = smp_processor_id();
 	struct rq *local_rq = cpu_rq(local_cpu);
-	enum idle_type idle = local_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE;
+	enum cpu_idle_type idle = local_rq->idle_at_tick ? CPU_IDLE : CPU_NOT_IDLE;
 
 	rebalance_domains(local_cpu, idle);
 
@@ -3210,7 +3210,7 @@ static void run_rebalance_domains(struct softirq_action *h)
 			if (need_resched())
 				break;
 
-			rebalance_domains(balance_cpu, SCHED_IDLE);
+			rebalance_domains(balance_cpu, CPU_IDLE);
 
 			rq = cpu_rq(balance_cpu);
 			if (time_after(local_rq->next_balance, rq->next_balance))

commit a0f98a1cb7d27c656de450ba56efd31bdc59065e
Author: Ingo Molnar <mingo@elte.hu>
Date:   Sun Jun 17 18:37:45 2007 +0200

    sched: fix SysRq-N (normalize RT tasks)
    
    Gene Heskett reported the following problem while testing CFS: SysRq-N
    is not always effective in normalizing tasks back to SCHED_OTHER.
    
    The reason for that turns out to be the following bug:
    
     - normalize_rt_tasks() uses for_each_process() to iterate through all
       tasks in the system.  The problem is, this method does not iterate
       through all tasks, it iterates through all thread groups.
    
    The proper mechanism to enumerate over all threads is to use a
    do_each_thread() + while_each_thread() loop.
    
    Reported-by: Gene Heskett <gene.heskett@gmail.com>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/kernel/sched.c b/kernel/sched.c
index 13cdab3b4c48..49be34e1f0b8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7071,12 +7071,13 @@ EXPORT_SYMBOL(__might_sleep);
 void normalize_rt_tasks(void)
 {
 	struct prio_array *array;
-	struct task_struct *p;
+	struct task_struct *g, *p;
 	unsigned long flags;
 	struct rq *rq;
 
 	read_lock_irq(&tasklist_lock);
-	for_each_process(p) {
+
+	do_each_thread(g, p) {
 		if (!rt_task(p))
 			continue;
 
@@ -7094,7 +7095,8 @@ void normalize_rt_tasks(void)
 
 		__task_rq_unlock(rq);
 		spin_unlock_irqrestore(&p->pi_lock, flags);
-	}
+	} while_each_thread(g, p);
+
 	read_unlock_irq(&tasklist_lock);
 }

commit c1a834dc704763673df10282995257f2de93cbe9
Author: Ingo Molnar <mingo@elte.hu>
Date:   Fri Jun 1 00:47:16 2007 -0700

    timer stats: speedups
    
    Make timer-stats have almost zero overhead when enabled in the config but
    not used.  (this way distros can enable it more easily)
    
    Also update the documentation about overhead of timer_stats - it was
    written for the first version which had a global lock and a linear list
    walk based lookup ;-)
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
    Cc: Thomas Gleixner <tglx@linutronix.de>
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/Documentation/hrtimer/timer_stats.txt b/Documentation/hrtimer/timer_stats.txt
index 27f782e3593f..22b0814d0ad0 100644
--- a/Documentation/hrtimer/timer_stats.txt
+++ b/Documentation/hrtimer/timer_stats.txt
@@ -2,9 +2,10 @@ timer_stats - timer usage statistics
 ------------------------------------
 
 timer_stats is a debugging facility to make the timer (ab)usage in a Linux
-system visible to kernel and userspace developers. It is not intended for
-production usage as it adds significant overhead to the (hr)timer code and the
-(hr)timer data structures.
+system visible to kernel and userspace developers. If enabled in the config
+but not used it has almost zero runtime overhead, and a relatively small
+data structure overhead. Even if collection is enabled runtime all the
+locking is per-CPU and lookup is hashed.
 
 timer_stats should be used by kernel and userspace developers to verify that
 their code does not make unduly use of timers. This helps to avoid unnecessary
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index fa3d380ca8c0..321693724ad7 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -236,10 +236,15 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
 	/*
 	 * It doesnt matter which lock we take:
 	 */
-	spinlock_t *lock = &per_cpu(lookup_lock, raw_smp_processor_id());
+	spinlock_t *lock;
 	struct entry *entry, input;
 	unsigned long flags;
 
+	if (likely(!active))
+		return;
+
+	lock = &per_cpu(lookup_lock, raw_smp_processor_id());
+
 	input.timer = timer;
 	input.start_func = startf;
 	input.expire_func = timerf;
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 1ba77ca7d165..da95e10cfd70 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -126,7 +126,10 @@ config TIMER_STATS
 	  reprogrammed. The statistics can be read from /proc/timer_stats.
 	  The statistics collection is started by writing 1 to /proc/timer_stats,
 	  writing 0 stops it. This feature is useful to collect information
-	  about timer usage patterns in kernel and userspace.
+	  about timer usage patterns in kernel and userspace. This feature
+	  is lightweight if enabled in the kernel config but not activated
+	  (it defaults to deactivated on bootup and will only be activated
+	  if some application like powertop activates it explicitly).
 
 config DEBUG_SLAB
 	bool "Debug slab memory allocations"

commit 8f0c45cdf87dc9141e87b0ad2fc6fff216a95f79
Author: Ingo Molnar <mingo@elte.hu>
Date:   Tue May 8 00:28:26 2007 -0700

    enhance initcall_debug, measure latency
    
    enhance the initcall_debug boot option:
    
     - measure the time the initcall took to execute and report
       it in units of milliseconds.
    
     - show the return code of initcalls (useful to see failures and
       to make sure that an initcall hung)
    
    [akpm@linux-foundation.org: fix printk warning]
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/init/main.c b/init/main.c
index 949c27b59d12..c1537e0ddceb 100644
--- a/init/main.c
+++ b/init/main.c
@@ -648,6 +648,7 @@ static void __init do_initcalls(void)
 	int count = preempt_count();
 
 	for (call = __initcall_start; call < __initcall_end; call++) {
+		ktime_t t0, t1, delta;
 		char *msg = NULL;
 		char msgbuf[40];
 		int result;
@@ -657,10 +658,26 @@ static void __init do_initcalls(void)
 			print_fn_descriptor_symbol(": %s()",
 					(unsigned long) *call);
 			printk("\n");
+			t0 = ktime_get();
 		}
 
 		result = (*call)();
 
+		if (initcall_debug) {
+			t1 = ktime_get();
+			delta = ktime_sub(t1, t0);
+
+			printk("initcall 0x%p", *call);
+			print_fn_descriptor_symbol(": %s()",
+					(unsigned long) *call);
+			printk(" returned %d.\n", result);
+
+			printk("initcall 0x%p ran for %Ld msecs: ",
+				*call, (unsigned long long)delta.tv64 >> 20);
+			print_fn_descriptor_symbol("%s()\n",
+				(unsigned long) *call);
+		}
+
 		if (result && result != -ENODEV && initcall_debug) {
 			sprintf(msgbuf, "error code %d", result);
 			msg = msgbuf;

commit 3c43f03908de98fa8f7a9e8fc9411ebf4c2de298
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed May 2 19:27:04 2007 +0200

    [PATCH] x86: default to physical mode on hotplug CPU kernels
    
    Default to physical mode on hotplug CPU kernels.  Furher simplify and clean up
    the APIC initialization code.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
    Signed-off-by: Andi Kleen <ak@suse.de>
    Cc: Suresh Siddha <suresh.b.siddha@intel.com>
    Cc: Andi Kleen <ak@suse.de>
    Cc: "Li, Shaohua" <shaohua.li@intel.com>
    Cc: "Eric W. Biederman" <ebiederm@xmission.com>
    Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c
index 9ea5b8ecc7e1..280898b045b2 100644
--- a/arch/i386/kernel/acpi/boot.c
+++ b/arch/i386/kernel/acpi/boot.c
@@ -874,7 +874,7 @@ static void __init acpi_process_madt(void)
 				acpi_ioapic = 1;
 
 				smp_found_config = 1;
-				clustered_apic_check();
+				setup_apic_routing();
 			}
 		}
 		if (error == -EINVAL) {
diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c
index 4f5983c98669..0952eccd8f28 100644
--- a/arch/i386/kernel/mpparse.c
+++ b/arch/i386/kernel/mpparse.c
@@ -477,7 +477,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
 		}
 		++mpc_record;
 	}
-	clustered_apic_check();
+	setup_apic_routing();
 	if (!num_processors)
 		printk(KERN_ERR "SMP mptable: no processors registered!\n");
 	return num_processors;
diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c
index 025f26ebb8d7..c08650a427e2 100644
--- a/arch/x86_64/kernel/genapic.c
+++ b/arch/x86_64/kernel/genapic.c
@@ -35,11 +35,8 @@ struct genapic __read_mostly *genapic = &apic_flat;
 /*
  * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
  */
-void __init clustered_apic_check(void)
+void __init setup_apic_routing(void)
 {
-	unsigned int i, max_apic = 0;
-	u8 id;
-
 #ifdef CONFIG_ACPI
 	/*
 	 * Quirk: some x86_64 machines can only use physical APIC mode
@@ -49,17 +46,10 @@ void __init clustered_apic_check(void)
 	if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
 			(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
 		genapic = &apic_physflat;
+	else
 #endif
 
-	for (i = 0; i < NR_CPUS; i++) {
-		id = bios_cpu_apicid[i];
-		if (id == BAD_APICID)
-			continue;
-		if (id > max_apic)
-			max_apic = id;
-	}
-
-	if (max_apic < 8)
+	if (cpus_weight(cpu_possible_map) <= 8)
 		genapic = &apic_flat;
 	else
 		genapic = &apic_physflat;
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index 455aa0b932f0..d0dc4891599b 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -300,7 +300,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
 			}
 		}
 	}
-	clustered_apic_check();
+	setup_apic_routing();
 	if (!num_processors)
 		printk(KERN_ERR "MPTABLE: no processors registered!\n");
 	return num_processors;
diff --git a/include/asm-i386/genapic.h b/include/asm-i386/genapic.h
index 8ffbb0f07457..33e3ffe1766c 100644
--- a/include/asm-i386/genapic.h
+++ b/include/asm-i386/genapic.h
@@ -36,7 +36,7 @@ struct genapic {
 	void (*init_apic_ldr)(void);
 	physid_mask_t (*ioapic_phys_id_map)(physid_mask_t map);
 
-	void (*clustered_apic_check)(void);
+	void (*setup_apic_routing)(void);
 	int (*multi_timer_check)(int apic, int irq);
 	int (*apicid_to_node)(int logical_apicid); 
 	int (*cpu_to_logical_apicid)(int cpu);
@@ -99,7 +99,7 @@ struct genapic {
 	APICFUNC(check_apicid_present) \
 	APICFUNC(init_apic_ldr) \
 	APICFUNC(ioapic_phys_id_map) \
-	APICFUNC(clustered_apic_check) \
+	APICFUNC(setup_apic_routing) \
 	APICFUNC(multi_timer_check) \
 	APICFUNC(apicid_to_node) \
 	APICFUNC(cpu_to_logical_apicid) \
diff --git a/include/asm-i386/mach-bigsmp/mach_apic.h b/include/asm-i386/mach-bigsmp/mach_apic.h
index 18b19a773440..ebd319f838ab 100644
--- a/include/asm-i386/mach-bigsmp/mach_apic.h
+++ b/include/asm-i386/mach-bigsmp/mach_apic.h
@@ -71,7 +71,7 @@ static inline void init_apic_ldr(void)
 	apic_write_around(APIC_LDR, val);
 }
 
-static inline void clustered_apic_check(void)
+static inline void setup_apic_routing(void)
 {
 	printk("Enabling APIC mode:  %s.  Using %d I/O APICs\n",
 		"Physflat", nr_ioapics);
diff --git a/include/asm-i386/mach-default/mach_apic.h b/include/asm-i386/mach-default/mach_apic.h
index 3ef6292db780..6db1c3babe9a 100644
--- a/include/asm-i386/mach-default/mach_apic.h
+++ b/include/asm-i386/mach-default/mach_apic.h
@@ -54,7 +54,7 @@ static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map)
 	return phys_map;
 }
 
-static inline void clustered_apic_check(void)
+static inline void setup_apic_routing(void)
 {
 	printk("Enabling APIC mode:  %s.  Using %d I/O APICs\n",
 					"Flat", nr_ioapics);
diff --git a/include/asm-i386/mach-es7000/mach_apic.h b/include/asm-i386/mach-es7000/mach_apic.h
index 26333685a7fb..8e8b3949173a 100644
--- a/include/asm-i386/mach-es7000/mach_apic.h
+++ b/include/asm-i386/mach-es7000/mach_apic.h
@@ -81,7 +81,7 @@ static inline void enable_apic_mode(void)
 }
 
 extern int apic_version [MAX_APICS];
-static inline void clustered_apic_check(void)
+static inline void setup_apic_routing(void)
 {
 	int apic = bios_cpu_apicid[smp_processor_id()];
 	printk("Enabling APIC mode:  %s.  Using %d I/O APICs, target cpus %lx\n",
diff --git a/include/asm-i386/mach-generic/mach_apic.h b/include/asm-i386/mach-generic/mach_apic.h
index d9dc039da94a..a236e7021528 100644
--- a/include/asm-i386/mach-generic/mach_apic.h
+++ b/include/asm-i386/mach-generic/mach_apic.h
@@ -13,7 +13,7 @@
 #define apic_id_registered (genapic->apic_id_registered)
 #define init_apic_ldr (genapic->init_apic_ldr)
 #define ioapic_phys_id_map (genapic->ioapic_phys_id_map)
-#define clustered_apic_check (genapic->clustered_apic_check) 
+#define setup_apic_routing (genapic->setup_apic_routing)
 #define multi_timer_check (genapic->multi_timer_check)
 #define apicid_to_node (genapic->apicid_to_node)
 #define cpu_to_logical_apicid (genapic->cpu_to_logical_apicid) 
diff --git a/include/asm-i386/mach-numaq/mach_apic.h b/include/asm-i386/mach-numaq/mach_apic.h
index 9d158095da82..5e5e7dd2692e 100644
--- a/include/asm-i386/mach-numaq/mach_apic.h
+++ b/include/asm-i386/mach-numaq/mach_apic.h
@@ -34,7 +34,7 @@ static inline void init_apic_ldr(void)
 	/* Already done in NUMA-Q firmware */
 }
 
-static inline void clustered_apic_check(void)
+static inline void setup_apic_routing(void)
 {
 	printk("Enabling APIC mode:  %s.  Using %d I/O APICs\n",
 		"NUMA-Q", nr_ioapics);
diff --git a/include/asm-i386/mach-summit/mach_apic.h b/include/asm-i386/mach-summit/mach_apic.h
index 43e5bd8f4a19..732f776aab8e 100644
--- a/include/asm-i386/mach-summit/mach_apic.h
+++ b/include/asm-i386/mach-summit/mach_apic.h
@@ -80,7 +80,7 @@ static inline int apic_id_registered(void)
 	return 1;
 }
 
-static inline void clustered_apic_check(void)
+static inline void setup_apic_routing(void)
 {
 	printk("Enabling APIC mode:  Summit.  Using %d I/O APICs\n",
 						nr_ioapics);
diff --git a/include/asm-i386/mach-visws/mach_apic.h b/include/asm-i386/mach-visws/mach_apic.h
index 18afe6b6fc4d..efac6f0d139f 100644
--- a/include/asm-i386/mach-visws/mach_apic.h
+++ b/include/asm-i386/mach-visws/mach_apic.h
@@ -47,7 +47,7 @@ static inline void summit_check(char *oem, char *productid)
 {
 }
 
-static inline void clustered_apic_check(void)
+static inline void setup_apic_routing(void)
 {
 }
 
diff --git a/include/asm-x86_64/apic.h b/include/asm-x86_64/apic.h
index 7cfb39cbd918..2f3b013595ab 100644
--- a/include/asm-x86_64/apic.h
+++ b/include/asm-x86_64/apic.h
@@ -83,7 +83,7 @@ extern void setup_secondary_APIC_clock (void);
 extern int APIC_init_uniprocessor (void);
 extern void disable_APIC_timer(void);
 extern void enable_APIC_timer(void);
-extern void clustered_apic_check(void);
+extern void setup_apic_routing(void);
 
 extern void setup_APIC_extened_lvt(unsigned char lvt_off, unsigned char vector,
 				   unsigned char msg_type, unsigned char mask);

commit 424df390101f9dfe015f0633aaec696c159b37a8
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed May 2 19:27:04 2007 +0200

    [PATCH] x86-64: remove clustered APIC mode
    
    Remove now unused clustered APIC mode code.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
    Signed-off-by: Andi Kleen <ak@suse.de>
    Cc: Suresh Siddha <suresh.b.siddha@intel.com>
    Cc: Andi Kleen <ak@suse.de>
    Cc: "Li, Shaohua" <shaohua.li@intel.com>
    Cc: "Eric W. Biederman" <ebiederm@xmission.com>
    Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index bb47e86f3d02..6879b4f01e88 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -21,8 +21,7 @@ obj-$(CONFIG_MICROCODE)		+= microcode.o
 obj-$(CONFIG_X86_CPUID)		+= cpuid.o
 obj-$(CONFIG_SMP)		+= smp.o smpboot.o trampoline.o tsc_sync.o
 obj-y				+= apic.o  nmi.o
-obj-y				+= io_apic.o mpparse.o \
-		genapic.o genapic_cluster.o genapic_flat.o
+obj-y				+= io_apic.o mpparse.o genapic.o genapic_flat.o
 obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o crash.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
 obj-$(CONFIG_PM)		+= suspend.o
diff --git a/arch/x86_64/kernel/genapic_cluster.c b/arch/x86_64/kernel/genapic_cluster.c
deleted file mode 100644
index 73d76308b955..000000000000
--- a/arch/x86_64/kernel/genapic_cluster.c
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright 2004 James Cleverdon, IBM.
- * Subject to the GNU Public License, v.2
- *
- * Clustered APIC subarch code.  Up to 255 CPUs, physical delivery.
- * (A more realistic maximum is around 230 CPUs.)
- *
- * Hacked for x86-64 by James Cleverdon from i386 architecture code by
- * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
- * James Cleverdon.
- */
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/init.h>
-#include <asm/smp.h>
-#include <asm/ipi.h>
-
-
-/*
- * Set up the logical destination ID.
- *
- * Intel recommends to set DFR, LDR and TPR before enabling
- * an APIC.  See e.g. "AP-388 82489DX User's Manual" (Intel
- * document number 292116).  So here it goes...
- */
-static void cluster_init_apic_ldr(void)
-{
-	unsigned long val, id;
-	long i, count;
-	u8 lid;
-	u8 my_id = hard_smp_processor_id();
-	u8 my_cluster = APIC_CLUSTER(my_id);
-
-	/* Create logical APIC IDs by counting CPUs already in cluster. */
-	for (count = 0, i = NR_CPUS; --i >= 0; ) {
-		lid = x86_cpu_to_log_apicid[i];
-		if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster)
-			++count;
-	}
-	/*
-	 * We only have a 4 wide bitmap in cluster mode.  There's no way
-	 * to get above 60 CPUs and still give each one it's own bit.
-	 * But, we're using physical IRQ delivery, so we don't care.
-	 * Use bit 3 for the 4th through Nth CPU in each cluster.
-	 */
-	if (count >= XAPIC_DEST_CPUS_SHIFT)
-		count = 3;
-	id = my_cluster | (1UL << count);
-	x86_cpu_to_log_apicid[smp_processor_id()] = id;
-	apic_write(APIC_DFR, APIC_DFR_CLUSTER);
-	val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
-	val |= SET_APIC_LOGICAL_ID(id);
-	apic_write(APIC_LDR, val);
-}
-
-/* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */
-
-static cpumask_t cluster_target_cpus(void)
-{
-	return cpumask_of_cpu(0);
-}
-
-static cpumask_t cluster_vector_allocation_domain(int cpu)
-{
-	cpumask_t domain = CPU_MASK_NONE;
-	cpu_set(cpu, domain);
-	return domain;
-}
-
-static void cluster_send_IPI_mask(cpumask_t mask, int vector)
-{
-	send_IPI_mask_sequence(mask, vector);
-}
-
-static void cluster_send_IPI_allbutself(int vector)
-{
-	cpumask_t mask = cpu_online_map;
-
-	cpu_clear(smp_processor_id(), mask);
-
-	if (!cpus_empty(mask))
-		cluster_send_IPI_mask(mask, vector);
-}
-
-static void cluster_send_IPI_all(int vector)
-{
-	cluster_send_IPI_mask(cpu_online_map, vector);
-}
-
-static int cluster_apic_id_registered(void)
-{
-	return 1;
-}
-
-static unsigned int cluster_cpu_mask_to_apicid(cpumask_t cpumask)
-{
-	int cpu;
-
-	/*
-	 * We're using fixed IRQ delivery, can only return one phys APIC ID.
-	 * May as well be the first.
-	 */
-	cpu = first_cpu(cpumask);
-	if ((unsigned)cpu < NR_CPUS)
-		return x86_cpu_to_apicid[cpu];
-	else
-		return BAD_APICID;
-}
-
-/* cpuid returns the value latched in the HW at reset, not the APIC ID
- * register's value.  For any box whose BIOS changes APIC IDs, like
- * clustered APIC systems, we must use hard_smp_processor_id.
- *
- * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID.
- */
-static unsigned int phys_pkg_id(int index_msb)
-{
-	return hard_smp_processor_id() >> index_msb;
-}
-
-struct genapic apic_cluster = {
-	.name = "clustered",
-	.int_delivery_mode = dest_Fixed,
-	.int_dest_mode = (APIC_DEST_PHYSICAL != 0),
-	.target_cpus = cluster_target_cpus,
-	.vector_allocation_domain = cluster_vector_allocation_domain,
-	.apic_id_registered = cluster_apic_id_registered,
-	.init_apic_ldr = cluster_init_apic_ldr,
-	.send_IPI_all = cluster_send_IPI_all,
-	.send_IPI_allbutself = cluster_send_IPI_allbutself,
-	.send_IPI_mask = cluster_send_IPI_mask,
-	.cpu_mask_to_apicid = cluster_cpu_mask_to_apicid,
-	.phys_pkg_id = phys_pkg_id,
-};

commit 07c7c4744400f93a7c52b32159c31d823e1747a5
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed May 2 19:27:04 2007 +0200

    [PATCH] x86-64: always use physical delivery mode on > 8 CPUs
    
    Remove clustered APIC mode.  There's little point in the use of clustered APIC
    mode, broadcasting is limited to within the cluster only, and chipsets have
    bugs in this area as well.  So default to physical APIC mode when the CPU
    count is large, and default to logical APIC mode when the CPU count is 8 or
    smaller.
    
    (this patch only removes the use of genapic_cluster and cleans up the
    resulting genapic.c file - removal of all remaining traces of clustered
    mode will be done by another patch.)
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
    Signed-off-by: Andi Kleen <ak@suse.de>
    Cc: Suresh Siddha <suresh.b.siddha@intel.com>
    Cc: Andi Kleen <ak@suse.de>
    Cc: "Li, Shaohua" <shaohua.li@intel.com>
    Cc: "Eric W. Biederman" <ebiederm@xmission.com>
    Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c
index 2f2b8fc6e2f3..025f26ebb8d7 100644
--- a/arch/x86_64/kernel/genapic.c
+++ b/arch/x86_64/kernel/genapic.c
@@ -11,26 +11,24 @@
 #include <linux/threads.h>
 #include <linux/cpumask.h>
 #include <linux/string.h>
+#include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/ctype.h>
 #include <linux/init.h>
-#include <linux/module.h>
 
 #include <asm/smp.h>
 #include <asm/ipi.h>
 
-#if defined(CONFIG_ACPI)
+#ifdef CONFIG_ACPI
 #include <acpi/acpi_bus.h>
 #endif
 
 /* which logical CPU number maps to which CPU (physical APIC ID) */
-u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
+u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly
+					= { [0 ... NR_CPUS-1] = BAD_APICID };
 EXPORT_SYMBOL(x86_cpu_to_apicid);
-u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
 
-extern struct genapic apic_cluster;
-extern struct genapic apic_flat;
-extern struct genapic apic_physflat;
+u8 x86_cpu_to_log_apicid[NR_CPUS]	= { [0 ... NR_CPUS-1] = BAD_APICID };
 
 struct genapic __read_mostly *genapic = &apic_flat;
 
@@ -39,76 +37,37 @@ struct genapic __read_mostly *genapic = &apic_flat;
  */
 void __init clustered_apic_check(void)
 {
-	int i;
-	u8 clusters, max_cluster;
+	unsigned int i, max_apic = 0;
 	u8 id;
-	u8 cluster_cnt[NUM_APIC_CLUSTERS];
-	int max_apic = 0;
 
 #ifdef CONFIG_ACPI
 	/*
-	 * Some x86_64 machines use physical APIC mode regardless of how many
-	 * procs/clusters are present (x86_64 ES7000 is an example).
+	 * Quirk: some x86_64 machines can only use physical APIC mode
+	 * regardless of how many processors are present (x86_64 ES7000
+	 * is an example).
 	 */
-	if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID)
-		if (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) {
-			genapic = &apic_cluster;
-			goto print;
-		}
+	if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
+			(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
+		genapic = &apic_physflat;
 #endif
 
-	memset(cluster_cnt, 0, sizeof(cluster_cnt));
 	for (i = 0; i < NR_CPUS; i++) {
 		id = bios_cpu_apicid[i];
 		if (id == BAD_APICID)
 			continue;
 		if (id > max_apic)
 			max_apic = id;
-		cluster_cnt[APIC_CLUSTERID(id)]++;
 	}
 
-	/*
-	 * Don't use clustered mode on AMD platforms, default
-	 * to flat logical mode.
-	 */
- 	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
-		/*
-		 * Switch to physical flat mode if more than 8 APICs
-		 * (In the case of 8 CPUs APIC ID goes from 0 to 7):
-		 */
-		if (max_apic >= 8)
-			genapic = &apic_physflat;
- 		goto print;
- 	}
-
-	clusters = 0;
-	max_cluster = 0;
-
-	for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
-		if (cluster_cnt[i] > 0) {
-			++clusters;
-			if (cluster_cnt[i] > max_cluster)
-				max_cluster = cluster_cnt[i];
-		}
-	}
-
-	/*
-	 * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode,
-	 * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical
-	 * else physical mode.
-	 * (We don't use lowest priority delivery + HW APIC IRQ steering, so
-	 * can ignore the clustered logical case and go straight to physical.)
-	 */
-	if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster)
+	if (max_apic < 8)
 		genapic = &apic_flat;
 	else
-		genapic = &apic_cluster;
+		genapic = &apic_physflat;
 
-print:
 	printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
 }
 
-/* Same for both flat and clustered. */
+/* Same for both flat and physical. */
 
 void send_IPI_self(int vector)
 {
diff --git a/include/asm-x86_64/genapic.h b/include/asm-x86_64/genapic.h
index a0e9a4b93484..d7e516ccbaa4 100644
--- a/include/asm-x86_64/genapic.h
+++ b/include/asm-x86_64/genapic.h
@@ -29,7 +29,9 @@ struct genapic {
 	unsigned int (*phys_pkg_id)(int index_msb);
 };
 
-
 extern struct genapic *genapic;
 
+extern struct genapic apic_flat;
+extern struct genapic apic_physflat;
+
 #endif