Patches contributed by Eötvös Lorand University


commit eb24073bc1fe3e569a855cf38d529fb650c35524
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed Sep 16 21:09:13 2009 +0200

    sched: Fix TASK_WAKING & loadaverage breakage
    
    Fix this:
    
    top - 21:54:00 up  2:59,  1 user,  load average: 432512.33, 426421.74, 417432.74
    
    Which happens because we now set TASK_WAKING before activate_task().
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 5049d959bb26..969dfaef2465 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2343,7 +2343,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 	/*
 	 * In order to handle concurrent wakeups and release the rq->lock
 	 * we put the task in TASK_WAKING state.
+	 *
+	 * First fix up the nr_uninterruptible count:
 	 */
+	if (task_contributes_to_load(p))
+		rq->nr_uninterruptible--;
 	p->state = TASK_WAKING;
 	task_rq_unlock(rq, &flags);
 

commit 8d7ac69ffaf740cdf98bdd5073c2d70a8828200e
Author: Ingo Molnar <mingo@elte.hu>
Date:   Tue Aug 18 16:45:25 2009 +0000

    Blackfin: Fix link errors with binutils 2.19 and GCC 4.3
    
    Not sure whether this has been reported/fixed before.
    
    Today I built a Blackfin tool-chain from scratch for -tip testing,
    and it triggers:
    
     arch/blackfin/kernel/vmlinux.lds:1238: undefined section `.data_a_l1' referenced in expression
    
    and:
    
     arch/blackfin/kernel/vmlinux.lds:1238: undefined section `.text_data_l1'
    referenced in expression
    
    Now i dont have any way to test this linker script, but it now at
    least builds fine after fixing what appears to be typos in those
    assert statements.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
    Signed-off-by: Mike Frysinger <vapier@gentoo.org>

diff --git a/arch/blackfin/kernel/vmlinux.lds.S b/arch/blackfin/kernel/vmlinux.lds.S
index d7ffe299b979..21ac7c26079e 100644
--- a/arch/blackfin/kernel/vmlinux.lds.S
+++ b/arch/blackfin/kernel/vmlinux.lds.S
@@ -221,7 +221,7 @@ SECTIONS
 		. = ALIGN(4);
 		__ebss_l1 = .;
 	}
-	ASSERT (SIZEOF(.data_a_l1) <= L1_DATA_A_LENGTH, "L1 data A overflow!")
+	ASSERT (SIZEOF(.data_l1) <= L1_DATA_A_LENGTH, "L1 data A overflow!")
 
 	.data_b_l1 L1_DATA_B_START : AT(LOADADDR(.data_l1) + SIZEOF(.data_l1))
 	{
@@ -262,7 +262,7 @@ SECTIONS
 		. = ALIGN(4);
 		__ebss_l2 = .;
 	}
-	ASSERT (SIZEOF(.text_data_l1) <= L2_LENGTH, "L2 overflow!")
+	ASSERT (SIZEOF(.text_data_l2) <= L2_LENGTH, "L2 overflow!")
 
 	/* Force trailing alignment of our init section so that when we
 	 * free our init memory, we don't leave behind a partial page.

commit 40d9d82c8ab8c4e2373a23a1e31dc8d84c53aa01
Merge: 983f2163e7fd b36461da2a03
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed Sep 16 21:16:37 2009 +0200

    Merge branch 'tip/tracing/core4' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-2.6-trace into tracing/core

commit 0ec04e16d08b69d8da46abbcfa3e3f2cd9738852
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed Sep 16 17:40:48 2009 +0200

    perf sched: Add 'perf sched map' scheduling event map printout
    
    This prints a textual context-switching outline of workload
    captured via perf sched record.
    
    For example, on a 16 CPU box it outputs:
    
       N1  O1  .   .   .   S1  .   .   .   B0  .  *I0  C1  .   M1  .    23002.773423 secs
       N1  O1  .  *Q0  .   S1  .   .   .   B0  .   I0  C1  .   M1  .    23002.773423 secs
       N1  O1  .   Q0  .   S1  .   .   .   B0  .  *R1  C1  .   M1  .    23002.773485 secs
       N1  O1  .   Q0  .   S1  .  *S0  .   B0  .   R1  C1  .   M1  .    23002.773478 secs
      *L0  O1  .   Q0  .   S1  .   S0  .   B0  .   R1  C1  .   M1  .    23002.773523 secs
       L0  O1  .  *.   .   S1  .   S0  .   B0  .   R1  C1  .   M1  .    23002.773531 secs
       L0  O1  .   .   .   S1  .   S0  .   B0  .   R1  C1 *T1  M1  .    23002.773547 secs T1 => irqbalance:2089
       L0  O1  .   .   .   S1  .   S0  .  *P0  .   R1  C1  T1  M1  .    23002.773549 secs
      *N1  O1  .   .   .   S1  .   S0  .   P0  .   R1  C1  T1  M1  .    23002.773566 secs
       N1  O1  .   .   .  *J0  .   S0  .   P0  .   R1  C1  T1  M1  .    23002.773571 secs
       N1  O1  .   .   .   J0  .   S0 *B0  P0  .   R1  C1  T1  M1  .    23002.773592 secs
       N1  O1  .   .   .   J0  .  *U0  B0  P0  .   R1  C1  T1  M1  .    23002.773582 secs
       N1  O1  .   .   .  *S1  .   U0  B0  P0  .   R1  C1  T1  M1  .    23002.773604 secs
       N1  O1  .   .   .   S1  .   U0  B0 *.   .   R1  C1  T1  M1  .    23002.773615 secs
       N1  O1  .   .   .   S1  .   U0  B0  .   .  *K0  C1  T1  M1  .    23002.773631 secs
       N1  O1  .  *M0  .   S1  .   U0  B0  .   .   K0  C1  T1  M1  .    23002.773624 secs
       N1  O1  .   M0  .   S1  .   U0 *.   .   .   K0  C1  T1  M1  .    23002.773644 secs
       N1  O1  .   M0  .   S1  .   U0  .   .   .  *R1  C1  T1  M1  .    23002.773662 secs
       N1  O1  .   M0  .   S1  .  *.   .   .   .   R1  C1  T1  M1  .    23002.773648 secs
       N1  O1  .  *.   .   S1  .   .   .   .   .   R1  C1  T1  M1  .    23002.773680 secs
       N1  O1  .   .   .  *L0  .   .   .   .   .   R1  C1  T1  M1  .    23002.773717 secs
      *N0  O1  .   .   .   L0  .   .   .   .   .   R1  C1  T1  M1  .    23002.773709 secs
      *N1  O1  .   .   .   L0  .   .   .   .   .   R1  C1  T1  M1  .    23002.773747 secs
    
    Columns stand for individual CPUs, from CPU0 to CPU15, and the
    two-letter shortcuts stand for tasks that are running on a CPU.
    
    '*' denotes the CPU that had the event.
    
    A dot signals an idle CPU.
    
    New tasks are assigned new two-letter shortcuts - when they occur
    first they are printed. In the above example 'T1' stood for irqbalance:
    
          T1 => irqbalance:2089
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index da8f67483ae7..f67e351b050b 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -159,8 +159,6 @@ static struct rb_root		atom_root, sorted_atom_root;
 static u64			all_runtime;
 static u64			all_count;
 
-static int read_events(void);
-
 
 static u64 get_nsecs(void)
 {
@@ -634,38 +632,6 @@ static void test_calibrations(void)
 	printf("the sleep test took %Ld nsecs\n", T1-T0);
 }
 
-static void __cmd_replay(void)
-{
-	unsigned long i;
-
-	calibrate_run_measurement_overhead();
-	calibrate_sleep_measurement_overhead();
-
-	test_calibrations();
-
-	read_events();
-
-	printf("nr_run_events:        %ld\n", nr_run_events);
-	printf("nr_sleep_events:      %ld\n", nr_sleep_events);
-	printf("nr_wakeup_events:     %ld\n", nr_wakeup_events);
-
-	if (targetless_wakeups)
-		printf("target-less wakeups:  %ld\n", targetless_wakeups);
-	if (multitarget_wakeups)
-		printf("multi-target wakeups: %ld\n", multitarget_wakeups);
-	if (nr_run_events_optimized)
-		printf("run atoms optimized: %ld\n",
-			nr_run_events_optimized);
-
-	print_task_traces();
-	add_cross_task_wakeups();
-
-	create_tasks();
-	printf("------------------------------------------------------------\n");
-	for (i = 0; i < replay_repeat; i++)
-		run_one_test();
-}
-
 static int
 process_comm_event(event_t *event, unsigned long offset, unsigned long head)
 {
@@ -1354,64 +1320,6 @@ static void sort_lat(void)
 	}
 }
 
-static void __cmd_lat(void)
-{
-	struct rb_node *next;
-
-	setup_pager();
-	read_events();
-	sort_lat();
-
-	printf("\n -----------------------------------------------------------------------------------------\n");
-	printf("  Task                  |   Runtime ms  | Switches | Average delay ms | Maximum delay ms |\n");
-	printf(" -----------------------------------------------------------------------------------------\n");
-
-	next = rb_first(&sorted_atom_root);
-
-	while (next) {
-		struct work_atoms *work_list;
-
-		work_list = rb_entry(next, struct work_atoms, node);
-		output_lat_thread(work_list);
-		next = rb_next(next);
-	}
-
-	printf(" -----------------------------------------------------------------------------------------\n");
-	printf("  TOTAL:                |%11.3f ms |%9Ld |\n",
-		(double)all_runtime/1e6, all_count);
-
-	printf(" ---------------------------------------------------\n");
-	if (nr_unordered_timestamps && nr_timestamps) {
-		printf("  INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
-			(double)nr_unordered_timestamps/(double)nr_timestamps*100.0,
-			nr_unordered_timestamps, nr_timestamps);
-	} else {
-	}
-	if (nr_lost_events && nr_events) {
-		printf("  INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
-			(double)nr_lost_events/(double)nr_events*100.0,
-			nr_lost_events, nr_events, nr_lost_chunks);
-	}
-	if (nr_state_machine_bugs && nr_timestamps) {
-		printf("  INFO: %.3f%% state machine bugs (%ld out of %ld)",
-			(double)nr_state_machine_bugs/(double)nr_timestamps*100.0,
-			nr_state_machine_bugs, nr_timestamps);
-		if (nr_lost_events)
-			printf(" (due to lost events?)");
-		printf("\n");
-	}
-	if (nr_context_switch_bugs && nr_timestamps) {
-		printf("  INFO: %.3f%% context switch bugs (%ld out of %ld)",
-			(double)nr_context_switch_bugs/(double)nr_timestamps*100.0,
-			nr_context_switch_bugs, nr_timestamps);
-		if (nr_lost_events)
-			printf(" (due to lost events?)");
-		printf("\n");
-	}
-	printf("\n");
-
-}
-
 static struct trace_sched_handler *trace_handler;
 
 static void
@@ -1431,19 +1339,106 @@ process_sched_wakeup_event(struct raw_event_sample *raw,
 	FILL_FIELD(wakeup_event, success, event, raw->data);
 	FILL_FIELD(wakeup_event, cpu, event, raw->data);
 
-	trace_handler->wakeup_event(&wakeup_event, event, cpu, timestamp, thread);
+	if (trace_handler->wakeup_event)
+		trace_handler->wakeup_event(&wakeup_event, event, cpu, timestamp, thread);
 }
 
 /*
  * Track the current task - that way we can know whether there's any
  * weird events, such as a task being switched away that is not current.
  */
+static int max_cpu = 15;
+
 static u32 curr_pid[MAX_CPUS] = { [0 ... MAX_CPUS-1] = -1 };
 
+static struct thread *curr_thread[MAX_CPUS];
+
+static char next_shortname1 = 'A';
+static char next_shortname2 = '0';
+
+static void
+map_switch_event(struct trace_switch_event *switch_event,
+		 struct event *event __used,
+		 int this_cpu,
+		 u64 timestamp,
+		 struct thread *thread __used)
+{
+	struct thread *sched_out, *sched_in;
+	int new_shortname;
+	u64 timestamp0;
+	s64 delta;
+	int cpu;
+
+	BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);
+
+	if (this_cpu > max_cpu)
+		max_cpu = this_cpu;
+
+	timestamp0 = cpu_last_switched[this_cpu];
+	cpu_last_switched[this_cpu] = timestamp;
+	if (timestamp0)
+		delta = timestamp - timestamp0;
+	else
+		delta = 0;
+
+	if (delta < 0)
+		die("hm, delta: %Ld < 0 ?\n", delta);
+
+
+	sched_out = threads__findnew(switch_event->prev_pid, &threads, &last_match);
+	sched_in = threads__findnew(switch_event->next_pid, &threads, &last_match);
+
+	curr_thread[this_cpu] = sched_in;
+
+	printf("  ");
+
+	new_shortname = 0;
+	if (!sched_in->shortname[0]) {
+		sched_in->shortname[0] = next_shortname1;
+		sched_in->shortname[1] = next_shortname2;
+
+		if (next_shortname1 < 'Z') {
+			next_shortname1++;
+		} else {
+			next_shortname1='A';
+			if (next_shortname2 < '9') {
+				next_shortname2++;
+			} else {
+				next_shortname2='0';
+			}
+		}
+		new_shortname = 1;
+	}
+
+	for (cpu = 0; cpu <= max_cpu; cpu++) {
+		if (cpu != this_cpu)
+			printf(" ");
+		else
+			printf("*");
+
+		if (curr_thread[cpu]) {
+			if (curr_thread[cpu]->pid)
+				printf("%2s ", curr_thread[cpu]->shortname);
+			else
+				printf(".  ");
+		} else
+			printf("   ");
+	}
+
+	printf("  %12.6f secs ", (double)timestamp/1e9);
+	if (new_shortname) {
+		printf("%s => %s:%d\n",
+			sched_in->shortname, sched_in->comm, sched_in->pid);
+	} else {
+		printf("\n");
+	}
+}
+
+
 static void
 process_sched_switch_event(struct raw_event_sample *raw,
 			   struct event *event,
-			   int cpu,
+			   int this_cpu,
 			   u64 timestamp __used,
 			   struct thread *thread __used)
 {
@@ -1459,17 +1454,18 @@ process_sched_switch_event(struct raw_event_sample *raw,
 	FILL_FIELD(switch_event, next_pid, event, raw->data);
 	FILL_FIELD(switch_event, next_prio, event, raw->data);
 
-	if (curr_pid[cpu] != (u32)-1) {
+	if (curr_pid[this_cpu] != (u32)-1) {
 		/*
 		 * Are we trying to switch away a PID that is
 		 * not current?
 		 */
-		if (curr_pid[cpu] != switch_event.prev_pid)
+		if (curr_pid[this_cpu] != switch_event.prev_pid)
 			nr_context_switch_bugs++;
 	}
-	curr_pid[cpu] = switch_event.next_pid;
+	if (trace_handler->switch_event)
+		trace_handler->switch_event(&switch_event, event, this_cpu, timestamp, thread);
 
-	trace_handler->switch_event(&switch_event, event, cpu, timestamp, thread);
+	curr_pid[this_cpu] = switch_event.next_pid;
 }
 
 static void
@@ -1486,7 +1482,8 @@ process_sched_runtime_event(struct raw_event_sample *raw,
 	FILL_FIELD(runtime_event, runtime, event, raw->data);
 	FILL_FIELD(runtime_event, vruntime, event, raw->data);
 
-	trace_handler->runtime_event(&runtime_event, event, cpu, timestamp, thread);
+	if (trace_handler->runtime_event)
+		trace_handler->runtime_event(&runtime_event, event, cpu, timestamp, thread);
 }
 
 static void
@@ -1505,7 +1502,8 @@ process_sched_fork_event(struct raw_event_sample *raw,
 	FILL_ARRAY(fork_event, child_comm, event, raw->data);
 	FILL_FIELD(fork_event, child_pid, event, raw->data);
 
-	trace_handler->fork_event(&fork_event, event, cpu, timestamp, thread);
+	if (trace_handler->fork_event)
+		trace_handler->fork_event(&fork_event, event, cpu, timestamp, thread);
 }
 
 static void
@@ -1748,6 +1746,116 @@ static int read_events(void)
 	return rc;
 }
 
+static void print_bad_events(void)
+{
+	if (nr_unordered_timestamps && nr_timestamps) {
+		printf("  INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
+			(double)nr_unordered_timestamps/(double)nr_timestamps*100.0,
+			nr_unordered_timestamps, nr_timestamps);
+	}
+	if (nr_lost_events && nr_events) {
+		printf("  INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
+			(double)nr_lost_events/(double)nr_events*100.0,
+			nr_lost_events, nr_events, nr_lost_chunks);
+	}
+	if (nr_state_machine_bugs && nr_timestamps) {
+		printf("  INFO: %.3f%% state machine bugs (%ld out of %ld)",
+			(double)nr_state_machine_bugs/(double)nr_timestamps*100.0,
+			nr_state_machine_bugs, nr_timestamps);
+		if (nr_lost_events)
+			printf(" (due to lost events?)");
+		printf("\n");
+	}
+	if (nr_context_switch_bugs && nr_timestamps) {
+		printf("  INFO: %.3f%% context switch bugs (%ld out of %ld)",
+			(double)nr_context_switch_bugs/(double)nr_timestamps*100.0,
+			nr_context_switch_bugs, nr_timestamps);
+		if (nr_lost_events)
+			printf(" (due to lost events?)");
+		printf("\n");
+	}
+}
+
+static void __cmd_lat(void)
+{
+	struct rb_node *next;
+
+	setup_pager();
+	read_events();
+	sort_lat();
+
+	printf("\n -----------------------------------------------------------------------------------------\n");
+	printf("  Task                  |   Runtime ms  | Switches | Average delay ms | Maximum delay ms |\n");
+	printf(" -----------------------------------------------------------------------------------------\n");
+
+	next = rb_first(&sorted_atom_root);
+
+	while (next) {
+		struct work_atoms *work_list;
+
+		work_list = rb_entry(next, struct work_atoms, node);
+		output_lat_thread(work_list);
+		next = rb_next(next);
+	}
+
+	printf(" -----------------------------------------------------------------------------------------\n");
+	printf("  TOTAL:                |%11.3f ms |%9Ld |\n",
+		(double)all_runtime/1e6, all_count);
+
+	printf(" ---------------------------------------------------\n");
+
+	print_bad_events();
+	printf("\n");
+
+}
+
+static struct trace_sched_handler map_ops  = {
+	.wakeup_event		= NULL,
+	.switch_event		= map_switch_event,
+	.runtime_event		= NULL,
+	.fork_event		= NULL,
+};
+
+static void __cmd_map(void)
+{
+	setup_pager();
+	read_events();
+	print_bad_events();
+}
+
+static void __cmd_replay(void)
+{
+	unsigned long i;
+
+	calibrate_run_measurement_overhead();
+	calibrate_sleep_measurement_overhead();
+
+	test_calibrations();
+
+	read_events();
+
+	printf("nr_run_events:        %ld\n", nr_run_events);
+	printf("nr_sleep_events:      %ld\n", nr_sleep_events);
+	printf("nr_wakeup_events:     %ld\n", nr_wakeup_events);
+
+	if (targetless_wakeups)
+		printf("target-less wakeups:  %ld\n", targetless_wakeups);
+	if (multitarget_wakeups)
+		printf("multi-target wakeups: %ld\n", multitarget_wakeups);
+	if (nr_run_events_optimized)
+		printf("run atoms optimized: %ld\n",
+			nr_run_events_optimized);
+
+	print_task_traces();
+	add_cross_task_wakeups();
+
+	create_tasks();
+	printf("------------------------------------------------------------\n");
+	for (i = 0; i < replay_repeat; i++)
+		run_one_test();
+}
+
+
 static const char * const sched_usage[] = {
 	"perf sched [<options>] {record|latency|replay|trace}",
 	NULL
@@ -1867,6 +1975,10 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
 		}
 		setup_sorting();
 		__cmd_lat();
+	} else if (!strcmp(argv[0], "map")) {
+		trace_handler = &map_ops;
+		setup_sorting();
+		__cmd_map();
 	} else if (!strncmp(argv[0], "rep", 3)) {
 		trace_handler = &replay_ops;
 		if (argc) {
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index 12c4341078f9..45efb5db0d19 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -8,7 +8,7 @@
 
 static struct thread *thread__new(pid_t pid)
 {
-	struct thread *self = malloc(sizeof(*self));
+	struct thread *self = calloc(1, sizeof(*self));
 
 	if (self != NULL) {
 		self->pid = pid;
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index 665d1f3dc977..32aea3c1c2ad 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -7,6 +7,7 @@ struct thread {
 	struct rb_node		rb_node;
 	struct list_head	maps;
 	pid_t			pid;
+	char			shortname[3];
 	char			*comm;
 };
 

commit 80ed0987f363d7eb50193df3e6f6d71451f74bc3
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed Sep 16 14:12:36 2009 +0200

    perf sched: Make idle thread and comm/pid names more consistent
    
    Peter noticed that we have 3 ways of referring to the idle thread:
    
     [idle]:0
     swapper:0
     swapper-0
    
    Standardize on 'swapper:0'.
    
    Reported-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 2d542368de3c..da8f67483ae7 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1204,13 +1204,13 @@ static void output_lat_thread(struct work_atoms *work_list)
 	/*
 	 * Ignore idle threads:
 	 */
-	if (!work_list->thread->pid)
+	if (!strcmp(work_list->thread->comm, "swapper"))
 		return;
 
 	all_runtime += work_list->total_runtime;
 	all_count += work_list->nb_atoms;
 
-	ret = printf("  %s-%d ", work_list->thread->comm, work_list->thread->pid);
+	ret = printf("  %s:%d ", work_list->thread->comm, work_list->thread->pid);
 
 	for (i = 0; i < 24 - ret; i++)
 		printf(" ");
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index 7635928ca278..12c4341078f9 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -85,7 +85,7 @@ register_idle_thread(struct rb_root *threads, struct thread **last_match)
 {
 	struct thread *thread = threads__findnew(0, threads, last_match);
 
-	if (!thread || thread__set_comm(thread, "[init]")) {
+	if (!thread || thread__set_comm(thread, "swapper")) {
 		fprintf(stderr, "problem inserting idle task.\n");
 		exit(-1);
 	}

commit c8a37751043427c6e4397a2cbfd617cb5f215c72
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed Sep 16 14:07:00 2009 +0200

    perf sched: Sanity check context switch events
    
    Use 'perf sched latency' to track the current task based on
    context-switch events, and flag the cases where there's some
    impossible transition: such as a PID being switched out that
    was not switched in.
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 1f0f9be34faa..2d542368de3c 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -119,6 +119,7 @@ static unsigned long		replay_repeat = 10;
 static unsigned long		nr_timestamps;
 static unsigned long		nr_unordered_timestamps;
 static unsigned long		nr_state_machine_bugs;
+static unsigned long		nr_context_switch_bugs;
 static unsigned long		nr_events;
 static unsigned long		nr_lost_chunks;
 static unsigned long		nr_lost_events;
@@ -1399,6 +1400,14 @@ static void __cmd_lat(void)
 			printf(" (due to lost events?)");
 		printf("\n");
 	}
+	if (nr_context_switch_bugs && nr_timestamps) {
+		printf("  INFO: %.3f%% context switch bugs (%ld out of %ld)",
+			(double)nr_context_switch_bugs/(double)nr_timestamps*100.0,
+			nr_context_switch_bugs, nr_timestamps);
+		if (nr_lost_events)
+			printf(" (due to lost events?)");
+		printf("\n");
+	}
 	printf("\n");
 
 }
@@ -1425,10 +1434,16 @@ process_sched_wakeup_event(struct raw_event_sample *raw,
 	trace_handler->wakeup_event(&wakeup_event, event, cpu, timestamp, thread);
 }
 
+/*
+ * Track the current task - that way we can know whether there's any
+ * weird events, such as a task being switched away that is not current.
+ */
+static u32 curr_pid[MAX_CPUS] = { [0 ... MAX_CPUS-1] = -1 };
+
 static void
 process_sched_switch_event(struct raw_event_sample *raw,
 			   struct event *event,
-			   int cpu __used,
+			   int cpu,
 			   u64 timestamp __used,
 			   struct thread *thread __used)
 {
@@ -1444,6 +1459,16 @@ process_sched_switch_event(struct raw_event_sample *raw,
 	FILL_FIELD(switch_event, next_pid, event, raw->data);
 	FILL_FIELD(switch_event, next_prio, event, raw->data);
 
+	if (curr_pid[cpu] != (u32)-1) {
+		/*
+		 * Are we trying to switch away a PID that is
+		 * not current?
+		 */
+		if (curr_pid[cpu] != switch_event.prev_pid)
+			nr_context_switch_bugs++;
+	}
+	curr_pid[cpu] = switch_event.next_pid;
+
 	trace_handler->switch_event(&switch_event, event, cpu, timestamp, thread);
 }
 

commit dc02bf7178c8e2cb3d442ae19027b736d51c7dd5
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed Sep 16 13:45:00 2009 +0200

    perf sched: Account for lost events, increase default buffering
    
    Output such lost event and state machine weirdness stats:
    
       TOTAL:                |  14974.910 ms |    46384 |
      ---------------------------------------------------
       INFO: 8.865% lost events (19132 out of 215819, in 8 chunks)
       INFO: 0.198% state machine bugs (49 out of 24708) (due to lost events?)
    
    And increase buffering to -m 1024 (4 MB) by default. Since we
    use output multiplexing that kind of space is needed.
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index adcb563ec4d2..1f0f9be34faa 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -117,7 +117,11 @@ static u64			run_avg;
 
 static unsigned long		replay_repeat = 10;
 static unsigned long		nr_timestamps;
-static unsigned long		unordered_timestamps;
+static unsigned long		nr_unordered_timestamps;
+static unsigned long		nr_state_machine_bugs;
+static unsigned long		nr_events;
+static unsigned long		nr_lost_chunks;
+static unsigned long		nr_lost_events;
 
 #define TASK_STATE_TO_CHAR_STR "RSDTtZX"
 
@@ -668,14 +672,14 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head)
 
 	thread = threads__findnew(event->comm.pid, &threads, &last_match);
 
-	dump_printf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
+	dump_printf("%p [%p]: perf_event_comm: %s:%d\n",
 		(void *)(offset + head),
 		(void *)(long)(event->header.size),
 		event->comm.comm, event->comm.pid);
 
 	if (thread == NULL ||
 	    thread__set_comm(thread, event->comm.comm)) {
-		dump_printf("problem processing PERF_EVENT_COMM, skipping event.\n");
+		dump_printf("problem processing perf_event_comm, skipping event.\n");
 		return -1;
 	}
 	total_comm++;
@@ -1168,14 +1172,12 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
 
 	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
 
-	if (atom->state != THREAD_SLEEPING) {
-		printf("boo2\n");
-		return;
-	}
+	if (atom->state != THREAD_SLEEPING)
+		nr_state_machine_bugs++;
 
 	nr_timestamps++;
 	if (atom->sched_out_time > timestamp) {
-		unordered_timestamps++;
+		nr_unordered_timestamps++;
 		return;
 	}
 
@@ -1214,7 +1216,7 @@ static void output_lat_thread(struct work_atoms *work_list)
 
 	avg = work_list->total_lat / work_list->nb_atoms;
 
-	printf("|%9.3f ms |%9llu | avg:%9.3f ms | max:%9.3f ms |\n",
+	printf("|%11.3f ms |%9llu | avg:%9.3f ms | max:%9.3f ms |\n",
 	      (double)work_list->total_runtime / 1e6,
 		 work_list->nb_atoms, (double)avg / 1e6,
 		 (double)work_list->max_lat / 1e6);
@@ -1359,9 +1361,9 @@ static void __cmd_lat(void)
 	read_events();
 	sort_lat();
 
-	printf("\n ---------------------------------------------------------------------------------------\n");
-	printf("  Task                  |  Runtime ms | Switches | Average delay ms | Maximum delay ms |\n");
-	printf(" ---------------------------------------------------------------------------------------\n");
+	printf("\n -----------------------------------------------------------------------------------------\n");
+	printf("  Task                  |   Runtime ms  | Switches | Average delay ms | Maximum delay ms |\n");
+	printf(" -----------------------------------------------------------------------------------------\n");
 
 	next = rb_first(&sorted_atom_root);
 
@@ -1373,18 +1375,32 @@ static void __cmd_lat(void)
 		next = rb_next(next);
 	}
 
-	printf(" ---------------------------------------------------------------------------------------\n");
-	printf("  TOTAL:                |%9.3f ms |%9Ld |",
+	printf(" -----------------------------------------------------------------------------------------\n");
+	printf("  TOTAL:                |%11.3f ms |%9Ld |\n",
 		(double)all_runtime/1e6, all_count);
 
-	if (unordered_timestamps && nr_timestamps) {
-		printf(" INFO: %.2f%% unordered events.\n",
-			(double)unordered_timestamps/(double)nr_timestamps*100.0);
+	printf(" ---------------------------------------------------\n");
+	if (nr_unordered_timestamps && nr_timestamps) {
+		printf("  INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
+			(double)nr_unordered_timestamps/(double)nr_timestamps*100.0,
+			nr_unordered_timestamps, nr_timestamps);
 	} else {
+	}
+	if (nr_lost_events && nr_events) {
+		printf("  INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
+			(double)nr_lost_events/(double)nr_events*100.0,
+			nr_lost_events, nr_events, nr_lost_chunks);
+	}
+	if (nr_state_machine_bugs && nr_timestamps) {
+		printf("  INFO: %.3f%% state machine bugs (%ld out of %ld)",
+			(double)nr_state_machine_bugs/(double)nr_timestamps*100.0,
+			nr_state_machine_bugs, nr_timestamps);
+		if (nr_lost_events)
+			printf(" (due to lost events?)");
 		printf("\n");
 	}
+	printf("\n");
 
-	printf(" -------------------------------------------------\n\n");
 }
 
 static struct trace_sched_handler *trace_handler;
@@ -1585,8 +1601,13 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
 {
 	trace_event(event);
 
+	nr_events++;
 	switch (event->header.type) {
-	case PERF_EVENT_MMAP ... PERF_EVENT_LOST:
+	case PERF_EVENT_MMAP:
+		return 0;
+	case PERF_EVENT_LOST:
+		nr_lost_chunks++;
+		nr_lost_events += event->lost.lost;
 		return 0;
 
 	case PERF_EVENT_COMM:
@@ -1768,6 +1789,7 @@ static const char *record_args[] = {
 	"-R",
 	"-M",
 	"-f",
+	"-m", "1024",
 	"-c", "1",
 	"-e", "sched:sched_switch:r",
 	"-e", "sched:sched_stat_wait:r",
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index fa2d4e91d329..2495529cae7d 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -52,7 +52,7 @@ struct lost_event {
  */
 struct read_event {
 	struct perf_event_header header;
-	u32 pid,tid;
+	u32 pid, tid;
 	u64 value;
 	u64 time_enabled;
 	u64 time_running;

commit b9183f9b99a9bd3349aefbd51d22f7e1bdc4a087
Author: Ingo Molnar <mingo@elte.hu>
Date:   Tue Sep 15 15:56:32 2009 +0200

    amd64_edac: build driver only on AMD hardware
    
    -tip testing found the following build failure (config attached):
    
    drivers/built-in.o: In function `amd64_check':
    amd64_edac.c:(.text+0x3e9491): undefined reference to `amd_decode_nb_mce'
    drivers/built-in.o: In function `amd64_init_2nd_stage':
    amd64_edac.c:(.text+0x3e9b46): undefined reference to `amd_report_gart_errors'
    amd64_edac.c:(.text+0x3e9b55): undefined reference to `amd_register_ecc_decoder'
    drivers/built-in.o: In function `amd64_nbea_store':
    amd64_edac_dbg.c:(.text+0x3ea22e): undefined reference to `amd_decode_nb_mce'
    drivers/built-in.o: In function `amd64_remove_one_instance':
    amd64_edac.c:(.devexit.text+0x3eea): undefined reference to `amd_report_gart_errors'
    amd64_edac.c:(.devexit.text+0x3ef6): undefined reference to `amd_unregister_ecc_decoder'
    
    the AMD EDAC code has a dependency on CONFIG_CPU_SUP_AMD facilities. The
    patch below solves the problem here.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
    Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 4339b1a879cd..a3ca18e2d7cf 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -59,7 +59,7 @@ config EDAC_MM_EDAC
 
 config EDAC_AMD64
 	tristate "AMD64 (Opteron, Athlon64) K8, F10h, F11h"
-	depends on EDAC_MM_EDAC && K8_NB && X86_64 && PCI
+	depends on EDAC_MM_EDAC && K8_NB && X86_64 && PCI && CPU_SUP_AMD
 	help
 	  Support for error detection and correction on the AMD 64
 	  Families of Memory Controllers (K8, F10h and F11h)

commit 51e0304ce6e55a6e59658558916b4f74da085ff0
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed Sep 16 08:54:45 2009 +0200

    sched: Implement a gentler fair-sleepers feature
    
    Add back FAIR_SLEEPERS and GENTLE_FAIR_SLEEPERS.
    
    FAIR_SLEEPERS is the old logic: credit sleepers with their sleep time.
    
    GENTLE_FAIR_SLEEPERS dampens this a bit: 50% of their sleep time gets
    credited.
    
    The hope here is to still give the benefits of fair-sleepers logic
    (quick wakeups, etc.) while not allow them to have 100% of their
    sleep time as if they were running.
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a37f311f436e..acf16a8d934b 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -711,7 +711,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
 	if (!initial) {
 		/* sleeps upto a single latency don't count. */
-		if (sched_feat(NEW_FAIR_SLEEPERS)) {
+		if (sched_feat(FAIR_SLEEPERS)) {
 			unsigned long thresh = sysctl_sched_latency;
 
 			/*
@@ -725,6 +725,13 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 					 task_of(se)->policy != SCHED_IDLE))
 				thresh = calc_delta_fair(thresh, se);
 
+			/*
+			 * Halve their sleep time's effect, to allow
+			 * for a gentler effect of sleepers:
+			 */
+			if (sched_feat(GENTLE_FAIR_SLEEPERS))
+				thresh >>= 1;
+
 			vruntime -= thresh;
 		}
 	}
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 70115c69c7a9..fd375675f834 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -3,7 +3,14 @@
  * considers the task to be running during that period. This gives it
  * a service deficit on wakeup, allowing it to run sooner.
  */
-SCHED_FEAT(NEW_FAIR_SLEEPERS, 0)
+SCHED_FEAT(FAIR_SLEEPERS, 1)
+
+/*
+ * Only give sleepers 50% of their service deficit. This allows
+ * them to run sooner, but does not allow tons of sleepers to
+ * rip the spread apart.
+ */
+SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
 
 /*
  * By not normalizing the sleep time, heavy tasks get an effective

commit fdaa45e95d2ef59a140d2fb2e487141f83f5a07c
Author: Ingo Molnar <mingo@elte.hu>
Date:   Tue Sep 15 11:00:26 2009 +0200

    slub: Fix build error in kmem_cache_open() with !CONFIG_SLUB_DEBUG
    
    This build bug:
    
     mm/slub.c: In function 'kmem_cache_open':
     mm/slub.c:2476: error: 'disable_higher_order_debug' undeclared (first use in this function)
     mm/slub.c:2476: error: (Each undeclared identifier is reported only once
     mm/slub.c:2476: error: for each function it appears in.)
    
    Triggers because there's no !CONFIG_SLUB_DEBUG definition for
    disable_higher_order_debug.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
    Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>

diff --git a/mm/slub.c b/mm/slub.c
index a5789b91d179..0a216aae227e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1071,6 +1071,8 @@ static inline unsigned long kmem_cache_flags(unsigned long objsize,
 }
 #define slub_debug 0
 
+#define disable_higher_order_debug 0
+
 static inline unsigned long slabs_node(struct kmem_cache *s, int node)
 							{ return 0; }
 static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)