Patches contributed by Eötvös Lorand University


commit f2858d8ad9858e63c87257553c5721cba5db95ae
Author: Ingo Molnar <mingo@elte.hu>
Date:   Fri Sep 11 12:12:54 2009 +0200

    perf sched: Add 'perf sched latency' and 'perf sched replay'
    
    Separate the option parsing cleanly and add two variants:
    
     - 'perf sched latency' (can be abbreviated via 'perf sched lat')
     - 'perf sched replay'  (can be abbreviated via 'perf sched rep')
    
    Also add a repeat count option to replay and add a separation
    set of options for replay.
    
    Do the sorting setup only in the latency sub-command.
    
    Display separate help screens for 'perf sched' and
    'perf sched replay -h' - i.e. further separation of the
    sub-commands.
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 10fcd49e298b..e01cc63b98cc 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -30,9 +30,6 @@ static struct thread		*last_match;
 static struct perf_header	*header;
 static u64			sample_type;
 
-static int			replay_mode;
-static int			lat_mode;
-
 static char			default_sort_order[] = "avg, max, switch, runtime";
 static char			*sort_order = default_sort_order;
 
@@ -623,9 +620,11 @@ static void test_calibrations(void)
 	printf("the sleep test took %Ld nsecs\n", T1-T0);
 }
 
+static unsigned long replay_repeat = 10;
+
 static void __cmd_replay(void)
 {
-	long nr_iterations = 10, i;
+	unsigned long i;
 
 	calibrate_run_measurement_overhead();
 	calibrate_sleep_measurement_overhead();
@@ -651,7 +650,7 @@ static void __cmd_replay(void)
 
 	create_tasks();
 	printf("------------------------------------------------------------\n");
-	for (i = 0; i < nr_iterations; i++)
+	for (i = 0; i < replay_repeat; i++)
 		run_one_test();
 }
 
@@ -1623,21 +1622,45 @@ static int read_events(void)
 }
 
 static const char * const sched_usage[] = {
-	"perf sched [<options>] <command>",
+	"perf sched [<options>] {record|latency|replay}",
 	NULL
 };
 
-static const struct option options[] = {
+static const struct option sched_options[] = {
+	OPT_BOOLEAN('v', "verbose", &verbose,
+		    "be more verbose (show symbol address, etc)"),
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
-	OPT_BOOLEAN('r', "replay", &replay_mode,
-		    "replay sched behaviour from traces"),
-	OPT_BOOLEAN('l', "latency", &lat_mode,
-		    "measure various latencies"),
+	OPT_END()
+};
+
+static const char * const latency_usage[] = {
+	"perf sched latency [<options>]",
+	NULL
+};
+
+static const struct option latency_options[] = {
 	OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
 		   "sort by key(s): runtime, switch, avg, max"),
 	OPT_BOOLEAN('v', "verbose", &verbose,
 		    "be more verbose (show symbol address, etc)"),
+	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
+		    "dump raw trace in ASCII"),
+	OPT_END()
+};
+
+static const char * const replay_usage[] = {
+	"perf sched replay [<options>]",
+	NULL
+};
+
+static const struct option replay_options[] = {
+	OPT_INTEGER('r', "repeat", &replay_repeat,
+		    "repeat the workload replay N times (-1: infinite)"),
+	OPT_BOOLEAN('v', "verbose", &verbose,
+		    "be more verbose (show symbol address, etc)"),
+	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
+		    "dump raw trace in ASCII"),
 	OPT_END()
 };
 
@@ -1649,7 +1672,7 @@ static void setup_sorting(void)
 			tok; tok = strtok_r(NULL, ", ", &tmp)) {
 		if (sort_dimension__add(tok, &sort_list) < 0) {
 			error("Unknown --sort key: `%s'", tok);
-			usage_with_options(sched_usage, options);
+			usage_with_options(latency_usage, latency_options);
 		}
 	}
 
@@ -1663,29 +1686,32 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
 	symbol__init();
 	page_size = getpagesize();
 
-	argc = parse_options(argc, argv, options, sched_usage, 0);
-	if (argc) {
-		/*
-		 * Special case: if there's an argument left then assume tha
-		 * it's a symbol filter:
-		 */
-		if (argc > 1)
-			usage_with_options(sched_usage, options);
-	}
+	argc = parse_options(argc, argv, sched_options, sched_usage,
+			     PARSE_OPT_STOP_AT_NON_OPTION);
+	if (!argc)
+		usage_with_options(sched_usage, sched_options);
 
-	if (replay_mode)
-		trace_handler = &replay_ops;
-	else if (lat_mode)
+	if (!strncmp(argv[0], "lat", 3)) {
 		trace_handler = &lat_ops;
-	else
-		usage_with_options(sched_usage, options);
-
-	setup_sorting();
-
-	if (replay_mode)
-		__cmd_replay();
-	else if (lat_mode)
+		if (argc > 1) {
+			argc = parse_options(argc, argv, latency_options, latency_usage, 0);
+			if (argc)
+				usage_with_options(latency_usage, latency_options);
+			setup_sorting();
+		}
 		__cmd_lat();
+	} else if (!strncmp(argv[0], "rep", 3)) {
+		trace_handler = &replay_ops;
+		if (argc) {
+			argc = parse_options(argc, argv, replay_options, replay_usage, 0);
+			if (argc)
+				usage_with_options(replay_usage, replay_options);
+		}
+		__cmd_replay();
+	} else {
+		usage_with_options(sched_usage, sched_options);
+	}
+
 
 	return 0;
 }

commit 3e304147cdb404ce6d1dd0e50cb19f52142bb363
Author: Ingo Molnar <mingo@elte.hu>
Date:   Sat Sep 12 10:08:34 2009 +0200

    perf sched: Output runtime and context switch totals
    
    After:
    
    -----------------------------------------------------------------------------------
     Task              |  Runtime ms | Switches | Average delay ms | Maximum delay ms |
    -----------------------------------------------------------------------------------
     make              |    0.678 ms |       13 | avg:    0.018 ms | max:    0.050 ms |
     gcc               |    0.014 ms |        2 | avg:    0.320 ms | max:    0.627 ms |
     gcc               |    0.000 ms |        2 | avg:    0.185 ms | max:    0.369 ms |
    ...
    -----------------------------------------------------------------------------------
     TOTAL:            |   21.316 ms |       63 |
    ---------------------------------------------
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index c382f530d4c6..727cc5b852c2 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1087,6 +1087,9 @@ static struct trace_sched_handler lat_ops  = {
 	.fork_event		= latency_fork_event,
 };
 
+static u64 all_runtime;
+static u64 all_count;
+
 static void output_lat_thread(struct thread_latency *lat)
 {
 	struct lat_snapshot *shot;
@@ -1111,6 +1114,9 @@ static void output_lat_thread(struct thread_latency *lat)
 		total += delta;
 	}
 
+	all_runtime += total_runtime;
+	all_count += count;
+
 	if (!count)
 		return;
 
@@ -1133,7 +1139,7 @@ static void __cmd_lat(void)
 	read_events();
 
 	printf("-----------------------------------------------------------------------------------\n");
-	printf(" Task              |  runtime ms | switches | average delay ms | maximum delay ms |\n");
+	printf(" Task              |  Runtime ms | Switches | Average delay ms | Maximum delay ms |\n");
 	printf("-----------------------------------------------------------------------------------\n");
 
 	next = rb_first(&lat_snapshot_root);
@@ -1147,6 +1153,9 @@ static void __cmd_lat(void)
 	}
 
 	printf("-----------------------------------------------------------------------------------\n");
+	printf(" TOTAL:            |%9.3f ms |%9Ld |\n",
+		(double)all_runtime/1e9, all_count);
+	printf("---------------------------------------------\n");
 }
 
 static struct trace_sched_handler *trace_handler;

commit ea92ed5a8f4e6c638efe7de2efe8a875d580ad3f
Author: Ingo Molnar <mingo@elte.hu>
Date:   Sat Sep 12 10:08:34 2009 +0200

    perf sched: Add runtime stats
    
    Extend the latency tracking structure with scheduling atom
    runtime info - and sum it up during per task display.
    
    (Also clean up a few details.)
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index a084c284e198..c382f530d4c6 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -243,8 +243,8 @@ add_sched_event_run(struct task_desc *task, nsec_t timestamp, u64 duration)
 	nr_run_events++;
 }
 
-static unsigned long targetless_wakeups;
-static unsigned long multitarget_wakeups;
+static unsigned long		targetless_wakeups;
+static unsigned long		multitarget_wakeups;
 
 static void
 add_sched_event_wakeup(struct task_desc *task, nsec_t timestamp,
@@ -485,10 +485,10 @@ static void create_tasks(void)
 	}
 }
 
-static nsec_t cpu_usage;
-static nsec_t runavg_cpu_usage;
-static nsec_t parent_cpu_usage;
-static nsec_t runavg_parent_cpu_usage;
+static nsec_t			cpu_usage;
+static nsec_t			runavg_cpu_usage;
+static nsec_t			parent_cpu_usage;
+static nsec_t			runavg_parent_cpu_usage;
 
 static void wait_for_tasks(void)
 {
@@ -858,9 +858,9 @@ replay_fork_event(struct trace_fork_event *fork_event,
 }
 
 static struct trace_sched_handler replay_ops  = {
-	.wakeup_event = replay_wakeup_event,
-	.switch_event = replay_switch_event,
-	.fork_event = replay_fork_event,
+	.wakeup_event		= replay_wakeup_event,
+	.switch_event		= replay_switch_event,
+	.fork_event		= replay_fork_event,
 };
 
 #define TASK_STATE_TO_CHAR_STR "RSDTtZX"
@@ -877,6 +877,7 @@ struct lat_snapshot {
 	enum thread_state	state;
 	u64			wake_up_time;
 	u64			sched_in_time;
+	u64			runtime;
 };
 
 struct thread_latency {
@@ -951,6 +952,7 @@ latency_fork_event(struct trace_fork_event *fork_event __used,
 	/* should insert the newcomer */
 }
 
+__used
 static char sched_out_state(struct trace_switch_event *switch_event)
 {
 	const char *str = TASK_STATE_TO_CHAR_STR;
@@ -960,17 +962,15 @@ static char sched_out_state(struct trace_switch_event *switch_event)
 
 static void
 lat_sched_out(struct thread_latency *lat,
-	     struct trace_switch_event *switch_event)
+	     struct trace_switch_event *switch_event __used, u64 delta)
 {
 	struct lat_snapshot *snapshot;
 
-	if (sched_out_state(switch_event) == 'R')
-		return;
-
 	snapshot = calloc(sizeof(*snapshot), 1);
 	if (!snapshot)
 		die("Non memory");
 
+	snapshot->runtime = delta;
 	list_add_tail(&snapshot->list, &lat->snapshot_list);
 }
 
@@ -997,16 +997,31 @@ lat_sched_in(struct thread_latency *lat, u64 timestamp)
 	snapshot->sched_in_time = timestamp;
 }
 
-
 static void
 latency_switch_event(struct trace_switch_event *switch_event,
 		     struct event *event __used,
-		     int cpu __used,
+		     int cpu,
 		     u64 timestamp,
 		     struct thread *thread __used)
 {
 	struct thread_latency *out_lat, *in_lat;
 	struct thread *sched_out, *sched_in;
+	u64 timestamp0;
+	s64 delta;
+
+	if (cpu >= MAX_CPUS || cpu < 0)
+		return;
+
+	timestamp0 = cpu_last_switched[cpu];
+	cpu_last_switched[cpu] = timestamp;
+	if (timestamp0)
+		delta = timestamp - timestamp0;
+	else
+		delta = 0;
+
+	if (delta < 0)
+		die("hm, delta: %Ld < 0 ?\n", delta);
+
 
 	sched_out = threads__findnew(switch_event->prev_pid, &threads, &last_match);
 	sched_in = threads__findnew(switch_event->next_pid, &threads, &last_match);
@@ -1028,7 +1043,7 @@ latency_switch_event(struct trace_switch_event *switch_event,
 	}
 
 	lat_sched_in(in_lat, timestamp);
-	lat_sched_out(out_lat, switch_event);
+	lat_sched_out(out_lat, switch_event, delta);
 }
 
 static void
@@ -1067,9 +1082,9 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
 }
 
 static struct trace_sched_handler lat_ops  = {
-	.wakeup_event = latency_wakeup_event,
-	.switch_event = latency_switch_event,
-	.fork_event = latency_fork_event,
+	.wakeup_event		= latency_wakeup_event,
+	.switch_event		= latency_switch_event,
+	.fork_event		= latency_fork_event,
 };
 
 static void output_lat_thread(struct thread_latency *lat)
@@ -1080,8 +1095,11 @@ static void output_lat_thread(struct thread_latency *lat)
 	int ret;
 	u64 max = 0, avg;
 	u64 total = 0, delta;
+	u64 total_runtime = 0;
 
 	list_for_each_entry(shot, &lat->snapshot_list, list) {
+		total_runtime += shot->runtime;
+
 		if (shot->state != THREAD_SCHED_IN)
 			continue;
 
@@ -1104,7 +1122,7 @@ static void output_lat_thread(struct thread_latency *lat)
 	avg = total / count;
 
 	printf("|%9.3f ms |%9d | avg:%9.3f ms | max:%9.3f ms |\n",
-		0.0, count, (double)avg/1e9, (double)max/1e9);
+		(double)total_runtime/1e9, count, (double)avg/1e9, (double)max/1e9);
 }
 
 static void __cmd_lat(void)

commit d9340c1db3f52460a8335eeb127a2728c5bba6ce
Author: Ingo Molnar <mingo@elte.hu>
Date:   Sat Sep 12 10:08:34 2009 +0200

    perf sched: Display time in milliseconds, reorganize output
    
    After:
    
    -----------------------------------------------------------------------------------
     Task              |  runtime ms | switches | average delay ms | maximum delay ms |
    -----------------------------------------------------------------------------------
     migration/0       |    0.000 ms |        1 | avg:    0.047 ms | max:    0.047 ms |
     ksoftirqd/0       |    0.000 ms |        1 | avg:    0.039 ms | max:    0.039 ms |
     migration/1       |    0.000 ms |        3 | avg:    0.013 ms | max:    0.016 ms |
     migration/3       |    0.000 ms |        2 | avg:    0.003 ms | max:    0.004 ms |
     migration/4       |    0.000 ms |        1 | avg:    0.022 ms | max:    0.022 ms |
     distccd           |    0.000 ms |        1 | avg:    0.004 ms | max:    0.004 ms |
     distccd           |    0.000 ms |        1 | avg:    0.014 ms | max:    0.014 ms |
     distccd           |    0.000 ms |        2 | avg:    0.000 ms | max:    0.000 ms |
     distccd           |    0.000 ms |        2 | avg:    0.012 ms | max:    0.019 ms |
     distccd           |    0.000 ms |        1 | avg:    0.002 ms | max:    0.002 ms |
     as                |    0.000 ms |        2 | avg:    0.019 ms | max:    0.019 ms |
     as                |    0.000 ms |        3 | avg:    0.015 ms | max:    0.017 ms |
     as                |    0.000 ms |        1 | avg:    0.009 ms | max:    0.009 ms |
     perf              |    0.000 ms |        1 | avg:    0.001 ms | max:    0.001 ms |
     gcc               |    0.000 ms |        1 | avg:    0.021 ms | max:    0.021 ms |
     run-mozilla.sh    |    0.000 ms |        2 | avg:    0.010 ms | max:    0.017 ms |
     mozilla-plugin-   |    0.000 ms |        1 | avg:    0.006 ms | max:    0.006 ms |
     gcc               |    0.000 ms |        2 | avg:    0.013 ms | max:    0.013 ms |
    -----------------------------------------------------------------------------------
    
    (The runtime ms column is not filled in yet.)
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 84699cf036ab..a084c284e198 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1096,14 +1096,15 @@ static void output_lat_thread(struct thread_latency *lat)
 	if (!count)
 		return;
 
-	ret = printf("%s", lat->thread->comm);
+	ret = printf(" %s ", lat->thread->comm);
 
-	for (i = 0; i < 25 - ret; i++)
+	for (i = 0; i < 19 - ret; i++)
 		printf(" ");
 
 	avg = total / count;
 
-	printf("%5d        %10llu       %10llu      %10llu\n", count, total, avg, max);
+	printf("|%9.3f ms |%9d | avg:%9.3f ms | max:%9.3f ms |\n",
+		0.0, count, (double)avg/1e9, (double)max/1e9);
 }
 
 static void __cmd_lat(void)
@@ -1113,11 +1114,9 @@ static void __cmd_lat(void)
 	setup_pager();
 	read_events();
 
-	printf(" Tasks");
-	printf("                     count");
-	printf("          total");
-	printf("              avg");
-	printf("            max\n\n");
+	printf("-----------------------------------------------------------------------------------\n");
+	printf(" Task              |  runtime ms | switches | average delay ms | maximum delay ms |\n");
+	printf("-----------------------------------------------------------------------------------\n");
 
 	next = rb_first(&lat_snapshot_root);
 
@@ -1128,6 +1127,8 @@ static void __cmd_lat(void)
 		output_lat_thread(lat);
 		next = rb_next(next);
 	}
+
+	printf("-----------------------------------------------------------------------------------\n");
 }
 
 static struct trace_sched_handler *trace_handler;
diff --git a/tools/perf/util/trace-event-read.c b/tools/perf/util/trace-event-read.c
index a1217a10632f..1b5c847d2c22 100644
--- a/tools/perf/util/trace-event-read.c
+++ b/tools/perf/util/trace-event-read.c
@@ -458,12 +458,13 @@ struct record *trace_read_data(int cpu)
 	return data;
 }
 
-void trace_report (void)
+void trace_report(void)
 {
 	const char *input_file = "trace.info";
 	char buf[BUFSIZ];
 	char test[] = { 23, 8, 68 };
 	char *version;
+	int show_version = 0;
 	int show_funcs = 0;
 	int show_printk = 0;
 
@@ -480,7 +481,8 @@ void trace_report (void)
 		die("not a trace file (missing tracing)");
 
 	version = read_string();
-	printf("version = %s\n", version);
+	if (show_version)
+		printf("version = %s\n", version);
 	free(version);
 
 	read_or_die(buf, 1);

commit 46f392c97f9fd772426ed3361c5179a0d44b8c3f
Author: Ingo Molnar <mingo@elte.hu>
Date:   Sat Sep 12 10:08:34 2009 +0200

    perf sched: Clean up latency and replay sub-commands
    
    - Separate the latency and the replay commands more cleanly
    
     - Use consistent naming
    
     - Display help page on 'perf sched' outlining comments,
       instead of aborting
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 4f9e943181a3..84699cf036ab 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -543,24 +543,7 @@ static void wait_for_tasks(void)
 	}
 }
 
-static int __cmd_sched(void);
-
-static void parse_trace(void)
-{
-	__cmd_sched();
-
-	printf("nr_run_events:        %ld\n", nr_run_events);
-	printf("nr_sleep_events:      %ld\n", nr_sleep_events);
-	printf("nr_wakeup_events:     %ld\n", nr_wakeup_events);
-
-	if (targetless_wakeups)
-		printf("target-less wakeups:  %ld\n", targetless_wakeups);
-	if (multitarget_wakeups)
-		printf("multi-target wakeups: %ld\n", multitarget_wakeups);
-	if (nr_run_events_optimized)
-		printf("run events optimized: %ld\n",
-			nr_run_events_optimized);
-}
+static int read_events(void);
 
 static unsigned long nr_runs;
 static nsec_t sum_runtime;
@@ -637,6 +620,38 @@ static void test_calibrations(void)
 	printf("the sleep test took %Ld nsecs\n", T1-T0);
 }
 
+static void __cmd_replay(void)
+{
+	long nr_iterations = 10, i;
+
+	calibrate_run_measurement_overhead();
+	calibrate_sleep_measurement_overhead();
+
+	test_calibrations();
+
+	read_events();
+
+	printf("nr_run_events:        %ld\n", nr_run_events);
+	printf("nr_sleep_events:      %ld\n", nr_sleep_events);
+	printf("nr_wakeup_events:     %ld\n", nr_wakeup_events);
+
+	if (targetless_wakeups)
+		printf("target-less wakeups:  %ld\n", targetless_wakeups);
+	if (multitarget_wakeups)
+		printf("multi-target wakeups: %ld\n", multitarget_wakeups);
+	if (nr_run_events_optimized)
+		printf("run events optimized: %ld\n",
+			nr_run_events_optimized);
+
+	print_task_traces();
+	add_cross_task_wakeups();
+
+	create_tasks();
+	printf("------------------------------------------------------------\n");
+	for (i = 0; i < nr_iterations; i++)
+		run_one_test();
+}
+
 static int
 process_comm_event(event_t *event, unsigned long offset, unsigned long head)
 {
@@ -1091,10 +1106,13 @@ static void output_lat_thread(struct thread_latency *lat)
 	printf("%5d        %10llu       %10llu      %10llu\n", count, total, avg, max);
 }
 
-static void output_lat_results(void)
+static void __cmd_lat(void)
 {
 	struct rb_node *next;
 
+	setup_pager();
+	read_events();
+
 	printf(" Tasks");
 	printf("                     count");
 	printf("          total");
@@ -1312,7 +1330,7 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
 	return 0;
 }
 
-static int __cmd_sched(void)
+static int read_events(void)
 {
 	int ret, rc = EXIT_FAILURE;
 	unsigned long offset = 0;
@@ -1408,8 +1426,8 @@ static int __cmd_sched(void)
 	return rc;
 }
 
-static const char * const annotate_usage[] = {
-	"perf trace [<options>] <command>",
+static const char * const sched_usage[] = {
+	"perf sched [<options>] <command>",
 	NULL
 };
 
@@ -1427,49 +1445,30 @@ static const struct option options[] = {
 
 int cmd_sched(int argc, const char **argv, const char *prefix __used)
 {
-	long nr_iterations = 10, i;
-
 	symbol__init();
 	page_size = getpagesize();
 
-	argc = parse_options(argc, argv, options, annotate_usage, 0);
+	argc = parse_options(argc, argv, options, sched_usage, 0);
 	if (argc) {
 		/*
 		 * Special case: if there's an argument left then assume tha
 		 * it's a symbol filter:
 		 */
 		if (argc > 1)
-			usage_with_options(annotate_usage, options);
+			usage_with_options(sched_usage, options);
 	}
 
-//	setup_pager();
-
 	if (replay_mode)
 		trace_handler = &replay_ops;
 	else if (lat_mode)
 		trace_handler = &lat_ops;
-	else /* We may need a default subcommand (perf trace?) */
-		die("Please select a sub command (-r)\n");
-
-	if (replay_mode) {
-		calibrate_run_measurement_overhead();
-		calibrate_sleep_measurement_overhead();
-
-		test_calibrations();
-
-		parse_trace();
-		print_task_traces();
-		add_cross_task_wakeups();
-
-		create_tasks();
-		printf("------------------------------------------------------------\n");
-		for (i = 0; i < nr_iterations; i++)
-			run_one_test();
-	} else if (lat_mode) {
-		setup_pager();
-		__cmd_sched();
-		output_lat_results();
-	}
+	else
+		usage_with_options(sched_usage, options);
+
+	if (replay_mode)
+		__cmd_replay();
+	else if (lat_mode)
+		__cmd_lat();
 
 	return 0;
 }

commit ad236fd23b6d6372dcacd549983cce051d2ccff6
Author: Ingo Molnar <mingo@elte.hu>
Date:   Fri Sep 11 12:12:54 2009 +0200

    perf sched: Tighten up the code
    
    Various small cleanups - removal of debug printks and dead
    functions, etc.
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 6ec4f51d536b..de93a2604528 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -61,29 +61,6 @@ static u64			sample_type;
 
 typedef unsigned long long nsec_t;
 
-#define printk(x...)		do { printf(x); fflush(stdout); } while (0)
-
-nsec_t prev_printk;
-
-#define __dprintk(x,y...) do {						 \
-	nsec_t __now = get_nsecs(), __delta = __now - prev_printk;	 \
-									 \
-	prev_printk = __now;						 \
-									 \
-	printf("%.3f [%Ld] [%.3f]: " x, (double)__now/1e6, __now, (double)__delta/1e6, y);\
-} while (0)
-
-#if !DEBUG
-# define dprintk(x...)	do { } while (0)
-#else
-# define dprintk(x...)	__dprintk(x)
-#endif
-
-#define __DP()		__dprintk("parent: line %d\n", __LINE__)
-#define DP()		dprintk("parent: line %d\n", __LINE__)
-#define D()		dprintk("task %ld: line %d\n", this_task->nr, __LINE__)
-
-
 static nsec_t run_measurement_overhead;
 static nsec_t sleep_measurement_overhead;
 
@@ -129,7 +106,7 @@ static void calibrate_run_measurement_overhead(void)
 	}
 	run_measurement_overhead = min_delta;
 
-	printk("run measurement overhead: %Ld nsecs\n", min_delta);
+	printf("run measurement overhead: %Ld nsecs\n", min_delta);
 }
 
 static void calibrate_sleep_measurement_overhead(void)
@@ -147,7 +124,7 @@ static void calibrate_sleep_measurement_overhead(void)
 	min_delta -= 10000;
 	sleep_measurement_overhead = min_delta;
 
-	printk("sleep measurement overhead: %Ld nsecs\n", min_delta);
+	printf("sleep measurement overhead: %Ld nsecs\n", min_delta);
 }
 
 #define COMM_LEN	20
@@ -293,7 +270,7 @@ add_sched_event_wakeup(struct task_desc *task, nsec_t timestamp,
 
 static void
 add_sched_event_sleep(struct task_desc *task, nsec_t timestamp,
-		      unsigned long uninterruptible __used)
+		      u64 task_state __used)
 {
 	struct sched_event *event = get_new_event(task, timestamp);
 
@@ -329,77 +306,13 @@ static struct task_desc *register_pid(unsigned long pid, const char *comm)
 	BUG_ON(!tasks);
 	tasks[task->nr] = task;
 
-	printk("registered task #%ld, PID %ld (%s)\n", nr_tasks, pid, comm);
+	if (verbose)
+		printf("registered task #%ld, PID %ld (%s)\n", nr_tasks, pid, comm);
 
 	return task;
 }
 
 
-static int first_trace_line = 1;
-
-static nsec_t first_timestamp;
-static nsec_t prev_timestamp;
-
-void parse_line(char *line);
-
-void parse_line(char *line)
-{
-	unsigned long param1 = 0, param2 = 0;
-	char comm[COMM_LEN], comm2[COMM_LEN];
-	unsigned long pid, pid2, timestamp0;
-	struct task_desc *task, *task2;
-	char func_str[SYM_LEN];
-	nsec_t timestamp;
-	int ret;
-
-	//"   <idle> 0     0D.s3    0us+: try_to_wake_up <events/0 9> (1 0)"
-	ret = sscanf(line, "%20s %5ld %*s %ldus%*c:"
-			   " %128s <%20s %ld> (%ld %ld)\n",
-		comm, &pid, &timestamp0,
-		func_str, comm2, &pid2, &param1, &param2);
-	dprintk("ret: %d\n", ret);
-	if (ret != 8)
-		return;
-
-	timestamp = timestamp0 * 1000LL;
-
-	if (first_trace_line) {
-		first_trace_line = 0;
-		first_timestamp = timestamp;
-	}
-
-	timestamp -= first_timestamp;
-	BUG_ON(timestamp < prev_timestamp);
-	prev_timestamp = timestamp;
-
-	dprintk("parsed: %s - %ld %Ld: %s - <%s %ld> (%ld %ld)\n",
-		comm,
-		pid,
-		timestamp,
-		func_str,
-		comm2,
-		pid2,
-		param1,
-		param2);
-
-	task = register_pid(pid, comm);
-	task2 = register_pid(pid2, comm2);
-
-	if (!strcmp(func_str, "update_curr")) {
-		dprintk("%Ld: task %ld runs for %ld nsecs\n",
-			timestamp, task->nr, param1);
-		add_sched_event_run(task, timestamp, param1);
-	} else if (!strcmp(func_str, "try_to_wake_up")) {
-		dprintk("%Ld: task %ld wakes up task %ld\n",
-			timestamp, task->nr, task2->nr);
-		add_sched_event_wakeup(task, timestamp, task2);
-	} else if (!strcmp(func_str, "deactivate_task")) {
-		dprintk("%Ld: task %ld goes to sleep (uninterruptible: %ld)\n",
-			timestamp, task->nr, param1);
-		add_sched_event_sleep(task, timestamp, param1);
-	}
-}
-
 static void print_task_traces(void)
 {
 	struct task_desc *task;
@@ -407,7 +320,7 @@ static void print_task_traces(void)
 
 	for (i = 0; i < nr_tasks; i++) {
 		task = tasks[i];
-		printk("task %6ld (%20s:%10ld), nr_events: %ld\n",
+		printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
 			task->nr, task->comm, task->pid, task->nr_events);
 	}
 }
@@ -437,40 +350,16 @@ process_sched_event(struct task_desc *this_task __used, struct sched_event *even
 	now = get_nsecs();
 	delta = start_time + event->timestamp - now;
 
-	dprintk("task %ld, event #%ld, %Ld, delta: %.3f (%Ld)\n",
-		this_task->nr, event->nr, event->timestamp,
-		(double)delta/1e6, delta);
-
-	if (0 && delta > 0) {
-		dprintk("%.3f: task %ld FIX %.3f\n",
-			(double)event->timestamp/1e6,
-			this_task->nr,
-			(double)delta/1e6);
-		sleep_nsecs(start_time + event->timestamp - now);
-		nr_sleep_corrections++;
-	}
-
 	switch (event->type) {
 		case SCHED_EVENT_RUN:
-			dprintk("%.3f: task %ld RUN for %.3f\n",
-				(double)event->timestamp/1e6,
-				this_task->nr,
-				(double)event->duration/1e6);
 			burn_nsecs(event->duration);
 			break;
 		case SCHED_EVENT_SLEEP:
-			dprintk("%.3f: task %ld %s SLEEP\n",
-				(double)event->timestamp/1e6,
-				this_task->nr, event->wait_sem ? "" : "SKIP");
 			if (event->wait_sem)
 				ret = sem_wait(event->wait_sem);
 			BUG_ON(ret);
 			break;
 		case SCHED_EVENT_WAKEUP:
-			dprintk("%.3f: task %ld WAKEUP => task %ld\n",
-				(double)event->timestamp/1e6,
-				this_task->nr,
-				event->wakee->nr);
 			if (event->wait_sem)
 				ret = sem_post(event->wait_sem);
 			BUG_ON(ret);
@@ -511,14 +400,10 @@ static nsec_t get_cpu_usage_nsec_self(void)
 	BUG_ON(!file);
 
 	while ((chars = getline(&line, &len, file)) != -1) {
-		dprintk("got line with length %zu :\n", chars);
-		dprintk("%s", line);
 		ret = sscanf(line, "se.sum_exec_runtime : %ld.%06ld\n",
 			&msecs, &nsecs);
 		if (ret == 2) {
 			total = msecs*1e6 + nsecs;
-			dprintk("total: (%ld.%06ld) %Ld\n",
-				msecs, nsecs, total);
 			break;
 		}
 	}
@@ -536,19 +421,16 @@ static void *thread_func(void *ctx)
 	unsigned long i, ret;
 	char comm2[22];
 
-	dprintk("task %ld started up.\n", this_task->nr);
 	sprintf(comm2, ":%s", this_task->comm);
 	prctl(PR_SET_NAME, comm2);
 
 again:
 	ret = sem_post(&this_task->ready_for_work);
 	BUG_ON(ret);
-	D();
 	ret = pthread_mutex_lock(&start_work_mutex);
 	BUG_ON(ret);
 	ret = pthread_mutex_unlock(&start_work_mutex);
 	BUG_ON(ret);
-	D();
 
 	cpu_usage_0 = get_cpu_usage_nsec_self();
 
@@ -560,19 +442,13 @@ static void *thread_func(void *ctx)
 	cpu_usage_1 = get_cpu_usage_nsec_self();
 	this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
 
-	dprintk("task %ld cpu usage: %0.3f msecs\n",
-		this_task->nr, (double)this_task->cpu_usage / 1e6);
-
-	D();
 	ret = sem_post(&this_task->work_done_sem);
 	BUG_ON(ret);
-	D();
 
 	ret = pthread_mutex_lock(&work_done_wait_mutex);
 	BUG_ON(ret);
 	ret = pthread_mutex_unlock(&work_done_wait_mutex);
 	BUG_ON(ret);
-	D();
 
 	goto again;
 }
@@ -614,9 +490,7 @@ static void wait_for_tasks(void)
 	struct task_desc *task;
 	unsigned long i, ret;
 
-	DP();
 	start_time = get_nsecs();
-	DP();
 	cpu_usage = 0;
 	pthread_mutex_unlock(&work_done_wait_mutex);
 
@@ -633,24 +507,6 @@ static void wait_for_tasks(void)
 
 	pthread_mutex_unlock(&start_work_mutex);
 
-#if 0
-	for (i = 0; i < nr_tasks; i++) {
-		unsigned long missed;
-
-		task = tasks[i];
-		while (task->curr_event + 1 < task->nr_events) {
-			dprintk("parent waiting for %ld (%ld != %ld)\n",
-				i, task->curr_event, task->nr_events);
-			sleep_nsecs(100000000);
-		}
-		missed = task->nr_events - 1 - task->curr_event;
-		if (missed)
-			printk("task %ld missed events: %ld\n", i, missed);
-		ret = sem_post(&task->sleep_sem);
-		BUG_ON(ret);
-	}
-#endif
-	DP();
 	for (i = 0; i < nr_tasks; i++) {
 		task = tasks[i];
 		ret = sem_wait(&task->work_done_sem);
@@ -687,16 +543,16 @@ static void parse_trace(void)
 {
 	__cmd_sched();
 
-	printk("nr_run_events:        %ld\n", nr_run_events);
-	printk("nr_sleep_events:      %ld\n", nr_sleep_events);
-	printk("nr_wakeup_events:     %ld\n", nr_wakeup_events);
+	printf("nr_run_events:        %ld\n", nr_run_events);
+	printf("nr_sleep_events:      %ld\n", nr_sleep_events);
+	printf("nr_wakeup_events:     %ld\n", nr_wakeup_events);
 
 	if (targetless_wakeups)
-		printk("target-less wakeups:  %ld\n", targetless_wakeups);
+		printf("target-less wakeups:  %ld\n", targetless_wakeups);
 	if (multitarget_wakeups)
-		printk("multi-target wakeups: %ld\n", multitarget_wakeups);
+		printf("multi-target wakeups: %ld\n", multitarget_wakeups);
 	if (nr_run_events_optimized)
-		printk("run events optimized: %ld\n",
+		printf("run events optimized: %ld\n",
 			nr_run_events_optimized);
 }
 
@@ -728,17 +584,17 @@ static void run_one_test(void)
 		run_avg = delta;
 	run_avg = (run_avg*9 + delta)/10;
 
-	printk("#%-3ld: %0.3f, ",
+	printf("#%-3ld: %0.3f, ",
 		nr_runs, (double)delta/1000000.0);
 
 #if 0
-	printk("%0.2f +- %0.2f, ",
+	printf("%0.2f +- %0.2f, ",
 		(double)avg_delta/1e6, (double)std_dev/1e6);
 #endif
-	printk("ravg: %0.2f, ",
+	printf("ravg: %0.2f, ",
 		(double)run_avg/1e6);
 
-	printk("cpu: %0.2f / %0.2f",
+	printf("cpu: %0.2f / %0.2f",
 		(double)cpu_usage/1e6, (double)runavg_cpu_usage/1e6);
 
 #if 0
@@ -746,15 +602,15 @@ static void run_one_test(void)
 	 * rusage statistics done by the parent, these are less
 	 * accurate than the sum_exec_runtime based statistics:
 	 */
-	printk(" [%0.2f / %0.2f]",
+	printf(" [%0.2f / %0.2f]",
 		(double)parent_cpu_usage/1e6,
 		(double)runavg_parent_cpu_usage/1e6);
 #endif
 
-	printk("\n");
+	printf("\n");
 
 	if (nr_sleep_corrections)
-		printk(" (%ld sleep corrections)\n", nr_sleep_corrections);
+		printf(" (%ld sleep corrections)\n", nr_sleep_corrections);
 	nr_sleep_corrections = 0;
 }
 
@@ -766,13 +622,13 @@ static void test_calibrations(void)
 	burn_nsecs(1e6);
 	T1 = get_nsecs();
 
-	printk("the run test took %Ld nsecs\n", T1-T0);
+	printf("the run test took %Ld nsecs\n", T1-T0);
 
 	T0 = get_nsecs();
 	sleep_nsecs(1e6);
 	T1 = get_nsecs();
 
-	printk("the sleep test took %Ld nsecs\n", T1-T0);
+	printf("the sleep test took %Ld nsecs\n", T1-T0);
 }
 
 static int
@@ -820,12 +676,14 @@ process_sched_wakeup_event(struct trace_wakeup_event *wakeup_event, struct event
 {
 	struct task_desc *waker, *wakee;
 
-	printf("sched_wakeup event %p\n", event);
+	if (verbose) {
+		printf("sched_wakeup event %p\n", event);
 
-	printf(" ... pid %d woke up %s/%d\n",
-		wakeup_event->common_pid,
-		wakeup_event->comm,
-		wakeup_event->pid);
+		printf(" ... pid %d woke up %s/%d\n",
+			wakeup_event->common_pid,
+			wakeup_event->comm,
+			wakeup_event->pid);
+	}
 
 	waker = register_pid(wakeup_event->common_pid, "<unknown>");
 	wakee = register_pid(wakeup_event->pid, wakeup_event->comm);
@@ -863,7 +721,9 @@ process_sched_switch_event(struct trace_switch_event *switch_event, struct event
 	u64 timestamp0;
 	s64 delta;
 
-	printf("sched_switch event %p\n", event);
+	if (verbose)
+		printf("sched_switch event %p\n", event);
+
 	if (cpu >= MAX_CPUS || cpu < 0)
 		return;
 
@@ -876,10 +736,12 @@ process_sched_switch_event(struct trace_switch_event *switch_event, struct event
 	if (delta < 0)
 		die("hm, delta: %Ld < 0 ?\n", delta);
 
-	printf(" ... switch from %s/%d to %s/%d [ran %Ld nsecs]\n",
-		switch_event->prev_comm, switch_event->prev_pid,
-		switch_event->next_comm, switch_event->next_pid,
-		delta);
+	if (verbose) {
+		printf(" ... switch from %s/%d to %s/%d [ran %Ld nsecs]\n",
+			switch_event->prev_comm, switch_event->prev_pid,
+			switch_event->next_comm, switch_event->next_pid,
+			delta);
+	}
 
 	prev = register_pid(switch_event->prev_pid, switch_event->prev_comm);
 	next = register_pid(switch_event->next_pid, switch_event->next_comm);
@@ -887,6 +749,7 @@ process_sched_switch_event(struct trace_switch_event *switch_event, struct event
 	cpu_last_switched[cpu] = timestamp;
 
 	add_sched_event_run(prev, timestamp, delta);
+	add_sched_event_sleep(prev, timestamp, switch_event->prev_state);
 }
 
 struct trace_fork_event {
@@ -908,9 +771,11 @@ static void
 process_sched_fork_event(struct trace_fork_event *fork_event, struct event *event,
 		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
 {
-	printf("sched_fork event %p\n", event);
-	printf("... parent: %s/%d\n", fork_event->parent_comm, fork_event->parent_pid);
-	printf("...  child: %s/%d\n", fork_event->child_comm, fork_event->child_pid);
+	if (verbose) {
+		printf("sched_fork event %p\n", event);
+		printf("... parent: %s/%d\n", fork_event->parent_comm, fork_event->parent_pid);
+		printf("...  child: %s/%d\n", fork_event->child_comm, fork_event->child_pid);
+	}
 	register_pid(fork_event->parent_pid, fork_event->parent_comm);
 	register_pid(fork_event->child_pid, fork_event->child_comm);
 }
@@ -918,11 +783,12 @@ process_sched_fork_event(struct trace_fork_event *fork_event, struct event *even
 static void process_sched_exit_event(struct event *event,
 		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
 {
-	printf("sched_exit event %p\n", event);
+	if (verbose)
+		printf("sched_exit event %p\n", event);
 }
 
 static void
-process_raw_event(event_t *raw_event, void *more_data,
+process_raw_event(event_t *raw_event __used, void *more_data,
 		  int cpu, u64 timestamp, struct thread *thread)
 {
 	struct {
@@ -935,14 +801,6 @@ process_raw_event(event_t *raw_event, void *more_data,
 	type = trace_parse_common_type(raw->data);
 	event = trace_find_event(type);
 
-	/*
-	 * FIXME: better resolve from pid from the struct trace_entry
-	 * field, although it should be the same than this perf
-	 * event pid
-	 */
-	printf("id %d, type: %d, event: %s\n",
-		raw_event->header.type, type, event->name);
-
 	if (!strcmp(event->name, "sched_switch"))
 		process_sched_switch_event(more_data, event, cpu, timestamp, thread);
 	if (!strcmp(event->name, "sched_wakeup"))
@@ -1197,7 +1055,7 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
 	add_cross_task_wakeups();
 
 	create_tasks();
-	printk("------------------------------------------------------------\n");
+	printf("------------------------------------------------------------\n");
 	for (i = 0; i < nr_iterations; i++)
 		run_one_test();
 

commit fbf9482911825f965829567aea8acff3bbc5279c
Author: Ingo Molnar <mingo@elte.hu>
Date:   Fri Sep 11 12:12:54 2009 +0200

    perf sched: Implement the scheduling workload replay engine
    
    Integrate the schedbench.c bits with the raw trace events
    that we get from the perf machinery, and activate the
    workload replayer/simulator.
    
    Example of a captured 'make -j' workload:
    
    $ perf sched
    
      run measurement overhead: 90 nsecs
      sleep measurement overhead: 2724743 nsecs
      the run test took 1000081 nsecs
      the sleep test took 2981111 nsecs
      version = 0.5
      ...
      nr_run_events:        70
      nr_sleep_events:      66
      nr_wakeup_events:     9
      target-less wakeups:  71
      multi-target wakeups: 47
      run events optimized: 139
      task      0 (                perf:      6607), nr_events: 2
      task      1 (                perf:      6608), nr_events: 6
      task      2 (                    :         0), nr_events: 1
      task      3 (                make:      6609), nr_events: 5
      task      4 (                  sh:      6610), nr_events: 4
      task      5 (                make:      6611), nr_events: 6
      task      6 (                  sh:      6612), nr_events: 4
      task      7 (                make:      6613), nr_events: 5
      task      8 (        migration/11:        25), nr_events: 1
      task      9 (        migration/13:        29), nr_events: 1
      task     10 (        migration/15:        33), nr_events: 1
      task     11 (         migration/9:        21), nr_events: 1
      task     12 (                  sh:      6614), nr_events: 4
      task     13 (                make:      6615), nr_events: 5
      task     14 (                  sh:      6616), nr_events: 4
      task     15 (                make:      6617), nr_events: 7
      task     16 (         migration/3:         9), nr_events: 1
      task     17 (         migration/5:        13), nr_events: 1
      task     18 (         migration/7:        17), nr_events: 1
      task     19 (         migration/1:         5), nr_events: 1
      task     20 (                  sh:      6618), nr_events: 4
      task     21 (                make:      6619), nr_events: 5
      task     22 (                  sh:      6620), nr_events: 4
      task     23 (                make:      6621), nr_events: 10
      task     24 (                  sh:      6623), nr_events: 3
      task     25 (                 gcc:      6624), nr_events: 4
      task     26 (                 gcc:      6625), nr_events: 4
      task     27 (                 gcc:      6626), nr_events: 5
      task     28 (            collect2:      6627), nr_events: 5
      task     29 (                  sh:      6622), nr_events: 1
      task     30 (                make:      6628), nr_events: 7
      task     31 (                  sh:      6630), nr_events: 4
      task     32 (                 gcc:      6631), nr_events: 4
      task     33 (                  sh:      6629), nr_events: 1
      task     34 (                 gcc:      6632), nr_events: 4
      task     35 (                 gcc:      6633), nr_events: 4
      task     36 (            collect2:      6634), nr_events: 4
      task     37 (                make:      6635), nr_events: 8
      task     38 (                  sh:      6637), nr_events: 4
      task     39 (                  sh:      6636), nr_events: 1
      task     40 (                 gcc:      6638), nr_events: 4
      task     41 (                 gcc:      6639), nr_events: 4
      task     42 (                 gcc:      6640), nr_events: 4
      task     43 (            collect2:      6641), nr_events: 4
      task     44 (                make:      6642), nr_events: 6
      task     45 (                  sh:      6643), nr_events: 5
      task     46 (                  sh:      6644), nr_events: 3
      task     47 (                  sh:      6645), nr_events: 4
      task     48 (                make:      6646), nr_events: 6
      task     49 (                  sh:      6647), nr_events: 3
      task     50 (                make:      6648), nr_events: 5
      task     51 (                  sh:      6649), nr_events: 5
      task     52 (                  sh:      6650), nr_events: 6
      task     53 (                make:      6651), nr_events: 4
      task     54 (                make:      6652), nr_events: 5
      task     55 (                make:      6653), nr_events: 4
      task     56 (                make:      6654), nr_events: 4
      task     57 (                make:      6655), nr_events: 5
      task     58 (                  sh:      6656), nr_events: 4
      task     59 (                 gcc:      6657), nr_events: 9
      task     60 (         ksoftirqd/3:        10), nr_events: 1
      task     61 (                 gcc:      6658), nr_events: 4
      task     62 (                make:      6659), nr_events: 5
      task     63 (                  sh:      6660), nr_events: 3
      task     64 (                 gcc:      6661), nr_events: 5
      task     65 (            collect2:      6662), nr_events: 4
      ------------------------------------------------------------
      #1  : 256.745, ravg: 256.74, cpu: 0.00 / 0.00
      #2  : 439.372, ravg: 275.01, cpu: 0.00 / 0.00
      #3  : 411.971, ravg: 288.70, cpu: 0.00 / 0.00
      #4  : 385.500, ravg: 298.38, cpu: 0.00 / 0.00
      #5  : 366.526, ravg: 305.20, cpu: 0.00 / 0.00
      #6  : 381.281, ravg: 312.81, cpu: 0.00 / 0.00
      #7  : 410.756, ravg: 322.60, cpu: 0.00 / 0.00
      #8  : 368.009, ravg: 327.14, cpu: 0.00 / 0.00
      #9  : 408.098, ravg: 335.24, cpu: 0.00 / 0.00
      #10 : 368.582, ravg: 338.57, cpu: 0.00 / 0.00
    
    I.e. we successfully analyzed the trace, replayed it
    via real threads and measured the replayed workload's
    scheduling properties.
    
    This is how it looked like in 'top' output:
    
       PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
      7164 mingo     20   0 1434m 8080  888 R 57.0  0.1   0:02.04 :perf
      7165 mingo     20   0 1434m 8080  888 R 41.8  0.1   0:01.52 :perf
      7228 mingo     20   0 1434m 8080  888 R 39.8  0.1   0:01.44 :gcc
      7225 mingo     20   0 1434m 8080  888 R 33.8  0.1   0:01.26 :gcc
      7202 mingo     20   0 1434m 8080  888 R 31.2  0.1   0:01.16 :sh
      7222 mingo     20   0 1434m 8080  888 R 25.2  0.1   0:00.96 :sh
      7211 mingo     20   0 1434m 8080  888 R 21.9  0.1   0:00.82 :sh
      7213 mingo     20   0 1434m 8080  888 D 19.2  0.1   0:00.74 :sh
      7194 mingo     20   0 1434m 8080  888 D 18.6  0.1   0:00.72 :make
    
    There's still various kinks in it - more patches to come.
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index c66e6a321371..6ec4f51d536b 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -57,7 +57,7 @@ static u64			sample_type;
 
 #define BUG_ON(x)	assert(!(x))
 
-#define DEBUG		1
+#define DEBUG		0
 
 typedef unsigned long long nsec_t;
 
@@ -238,15 +238,14 @@ static struct sched_event *last_event(struct task_desc *task)
 }
 
 static void
-add_sched_event_run(struct task_desc *task, nsec_t timestamp,
-		    unsigned long duration)
+add_sched_event_run(struct task_desc *task, nsec_t timestamp, u64 duration)
 {
 	struct sched_event *event, *curr_event = last_event(task);
 
 	/*
- 	 * optimize an existing RUN event by merging this one
- 	 * to it:
- 	 */
+	 * optimize an existing RUN event by merging this one
+	 * to it:
+	 */
 	if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
 		nr_run_events_optimized++;
 		curr_event->duration += duration;
@@ -376,7 +375,7 @@ void parse_line(char *line)
 	dprintk("parsed: %s - %ld %Ld: %s - <%s %ld> (%ld %ld)\n",
 		comm,
 		pid,
-		timestamp, 
+		timestamp,
 		func_str,
 		comm2,
 		pid2,
@@ -429,7 +428,7 @@ static void add_cross_task_wakeups(void)
 }
 
 static void
-process_sched_event(struct task_desc *this_task, struct sched_event *event)
+process_sched_event(struct task_desc *this_task __used, struct sched_event *event)
 {
 	int ret = 0;
 	nsec_t now;
@@ -744,9 +743,9 @@ static void run_one_test(void)
 
 #if 0
 	/*
- 	 * rusage statistics done by the parent, these are less
- 	 * accurate than the sum_exec_runtime based statistics:
- 	 */
+	 * rusage statistics done by the parent, these are less
+	 * accurate than the sum_exec_runtime based statistics:
+	 */
 	printk(" [%0.2f / %0.2f]",
 		(double)parent_cpu_usage/1e6,
 		(double)runavg_parent_cpu_usage/1e6);
@@ -798,16 +797,128 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head)
 	return 0;
 }
 
-static void process_sched_wakeup_event(struct event *event,
+struct trace_wakeup_event {
+	u32 size;
+
+	u16 common_type;
+	u8 common_flags;
+	u8 common_preempt_count;
+	u32 common_pid;
+	u32 common_tgid;
+
+	char comm[16];
+	u32 pid;
+
+	u32 prio;
+	u32 success;
+	u32 cpu;
+};
+
+static void
+process_sched_wakeup_event(struct trace_wakeup_event *wakeup_event, struct event *event,
 		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
 {
+	struct task_desc *waker, *wakee;
+
 	printf("sched_wakeup event %p\n", event);
+
+	printf(" ... pid %d woke up %s/%d\n",
+		wakeup_event->common_pid,
+		wakeup_event->comm,
+		wakeup_event->pid);
+
+	waker = register_pid(wakeup_event->common_pid, "<unknown>");
+	wakee = register_pid(wakeup_event->pid, wakeup_event->comm);
+
+	add_sched_event_wakeup(waker, timestamp, wakee);
 }
 
-static void process_sched_switch_event(struct event *event,
+struct trace_switch_event {
+	u32 size;
+
+	u16 common_type;
+	u8 common_flags;
+	u8 common_preempt_count;
+	u32 common_pid;
+	u32 common_tgid;
+
+	char prev_comm[16];
+	u32 prev_pid;
+	u32 prev_prio;
+	u64 prev_state;
+	char next_comm[16];
+	u32 next_pid;
+	u32 next_prio;
+};
+
+#define MAX_CPUS 4096
+
+unsigned long cpu_last_switched[MAX_CPUS];
+
+static void
+process_sched_switch_event(struct trace_switch_event *switch_event, struct event *event,
 		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
 {
+	struct task_desc *prev, *next;
+	u64 timestamp0;
+	s64 delta;
+
 	printf("sched_switch event %p\n", event);
+	if (cpu >= MAX_CPUS || cpu < 0)
+		return;
+
+	timestamp0 = cpu_last_switched[cpu];
+	if (timestamp0)
+		delta = timestamp - timestamp0;
+	else
+		delta = 0;
+
+	if (delta < 0)
+		die("hm, delta: %Ld < 0 ?\n", delta);
+
+	printf(" ... switch from %s/%d to %s/%d [ran %Ld nsecs]\n",
+		switch_event->prev_comm, switch_event->prev_pid,
+		switch_event->next_comm, switch_event->next_pid,
+		delta);
+
+	prev = register_pid(switch_event->prev_pid, switch_event->prev_comm);
+	next = register_pid(switch_event->next_pid, switch_event->next_comm);
+
+	cpu_last_switched[cpu] = timestamp;
+
+	add_sched_event_run(prev, timestamp, delta);
+}
+
+struct trace_fork_event {
+	u32 size;
+
+	u16 common_type;
+	u8 common_flags;
+	u8 common_preempt_count;
+	u32 common_pid;
+	u32 common_tgid;
+
+	char parent_comm[16];
+	u32 parent_pid;
+	char child_comm[16];
+	u32 child_pid;
+};
+
+static void
+process_sched_fork_event(struct trace_fork_event *fork_event, struct event *event,
+		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
+{
+	printf("sched_fork event %p\n", event);
+	printf("... parent: %s/%d\n", fork_event->parent_comm, fork_event->parent_pid);
+	printf("...  child: %s/%d\n", fork_event->child_comm, fork_event->child_pid);
+	register_pid(fork_event->parent_pid, fork_event->parent_comm);
+	register_pid(fork_event->child_pid, fork_event->child_comm);
+}
+
+static void process_sched_exit_event(struct event *event,
+		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
+{
+	printf("sched_exit event %p\n", event);
 }
 
 static void
@@ -833,9 +944,15 @@ process_raw_event(event_t *raw_event, void *more_data,
 		raw_event->header.type, type, event->name);
 
 	if (!strcmp(event->name, "sched_switch"))
-		process_sched_switch_event(event, cpu, timestamp, thread);
+		process_sched_switch_event(more_data, event, cpu, timestamp, thread);
 	if (!strcmp(event->name, "sched_wakeup"))
-		process_sched_wakeup_event(event, cpu, timestamp, thread);
+		process_sched_wakeup_event(more_data, event, cpu, timestamp, thread);
+	if (!strcmp(event->name, "sched_wakeup_new"))
+		process_sched_wakeup_event(more_data, event, cpu, timestamp, thread);
+	if (!strcmp(event->name, "sched_process_fork"))
+		process_sched_fork_event(more_data, event, cpu, timestamp, thread);
+	if (!strcmp(event->name, "sched_process_exit"))
+		process_sched_exit_event(event, cpu, timestamp, thread);
 }
 
 static int
@@ -1053,7 +1170,7 @@ static const struct option options[] = {
 
 int cmd_sched(int argc, const char **argv, const char *prefix __used)
 {
-	long nr_iterations = LONG_MAX, i;
+	long nr_iterations = 10, i;
 
 	symbol__init();
 	page_size = getpagesize();
@@ -1068,8 +1185,7 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
 			usage_with_options(annotate_usage, options);
 	}
 
-
-	setup_pager();
+//	setup_pager();
 
 	calibrate_run_measurement_overhead();
 	calibrate_sleep_measurement_overhead();

commit ec156764d424dd67283c2cd5e9f6f1b8388364ac
Author: Ingo Molnar <mingo@elte.hu>
Date:   Fri Sep 11 12:12:54 2009 +0200

    perf sched: Import schedbench.c
    
    Import the schedbench.c tool that i wrote some time ago to
    simulate scheduler behavior but never finished. It's a good
    basis for perf sched nevertheless.
    
    Most of its guts are not hooked up to the perf event loop
    yet - that will be done in the patches to come.
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 60228d9179ac..c66e6a321371 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -12,22 +12,770 @@
 #include "util/debug.h"
 
 #include "util/trace-event.h"
+#include <sys/types.h>
 
-static char		const *input_name = "perf.data";
-static int		input;
-static unsigned long	page_size;
-static unsigned long	mmap_window = 32;
+static char			const *input_name = "perf.data";
+static int			input;
+static unsigned long		page_size;
+static unsigned long		mmap_window = 32;
 
-static unsigned long	total = 0;
-static unsigned long	total_comm = 0;
+static unsigned long		total_comm = 0;
 
-static struct rb_root	threads;
-static struct thread	*last_match;
+static struct rb_root		threads;
+static struct thread		*last_match;
 
-static struct perf_header *header;
-static u64		sample_type;
+static struct perf_header	*header;
+static u64			sample_type;
 
 
+/*
+ * Scheduler benchmarks
+ */
+#include <sys/resource.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/prctl.h>
+
+#include <linux/unistd.h>
+
+#include <semaphore.h>
+#include <pthread.h>
+#include <signal.h>
+#include <values.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <fcntl.h>
+#include <time.h>
+#include <math.h>
+
+#include <stdio.h>
+
+#define PR_SET_NAME	15               /* Set process name */
+
+#define BUG_ON(x)	assert(!(x))
+
+#define DEBUG		1
+
+typedef unsigned long long nsec_t;
+
+#define printk(x...)		do { printf(x); fflush(stdout); } while (0)
+
+nsec_t prev_printk;
+
+#define __dprintk(x,y...) do {						 \
+	nsec_t __now = get_nsecs(), __delta = __now - prev_printk;	 \
+									 \
+	prev_printk = __now;						 \
+									 \
+	printf("%.3f [%Ld] [%.3f]: " x, (double)__now/1e6, __now, (double)__delta/1e6, y);\
+} while (0)
+
+#if !DEBUG
+# define dprintk(x...)	do { } while (0)
+#else
+# define dprintk(x...)	__dprintk(x)
+#endif
+
+#define __DP()		__dprintk("parent: line %d\n", __LINE__)
+#define DP()		dprintk("parent: line %d\n", __LINE__)
+#define D()		dprintk("task %ld: line %d\n", this_task->nr, __LINE__)
+
+
+static nsec_t run_measurement_overhead;
+static nsec_t sleep_measurement_overhead;
+
+static nsec_t get_nsecs(void)
+{
+	struct timespec ts;
+
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+
+	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
+}
+
+static void burn_nsecs(nsec_t nsecs)
+{
+	nsec_t T0 = get_nsecs(), T1;
+
+	do {
+		T1 = get_nsecs();
+	} while (T1 + run_measurement_overhead < T0 + nsecs);
+}
+
+static void sleep_nsecs(nsec_t nsecs)
+{
+	struct timespec ts;
+
+	ts.tv_nsec = nsecs % 999999999;
+	ts.tv_sec = nsecs / 999999999;
+
+	nanosleep(&ts, NULL);
+}
+
+static void calibrate_run_measurement_overhead(void)
+{
+	nsec_t T0, T1, delta, min_delta = 1000000000ULL;
+	int i;
+
+	for (i = 0; i < 10; i++) {
+		T0 = get_nsecs();
+		burn_nsecs(0);
+		T1 = get_nsecs();
+		delta = T1-T0;
+		min_delta = min(min_delta, delta);
+	}
+	run_measurement_overhead = min_delta;
+
+	printk("run measurement overhead: %Ld nsecs\n", min_delta);
+}
+
+static void calibrate_sleep_measurement_overhead(void)
+{
+	nsec_t T0, T1, delta, min_delta = 1000000000ULL;
+	int i;
+
+	for (i = 0; i < 10; i++) {
+		T0 = get_nsecs();
+		sleep_nsecs(10000);
+		T1 = get_nsecs();
+		delta = T1-T0;
+		min_delta = min(min_delta, delta);
+	}
+	min_delta -= 10000;
+	sleep_measurement_overhead = min_delta;
+
+	printk("sleep measurement overhead: %Ld nsecs\n", min_delta);
+}
+
+#define COMM_LEN	20
+#define SYM_LEN		129
+
+#define MAX_PID		65536
+
+static unsigned long nr_tasks;
+
+struct sched_event;
+
+struct task_desc {
+	unsigned long		nr;
+	unsigned long		pid;
+	char			comm[COMM_LEN];
+
+	unsigned long		nr_events;
+	unsigned long		curr_event;
+	struct sched_event	**events;
+
+	pthread_t		thread;
+	sem_t			sleep_sem;
+
+	sem_t			ready_for_work;
+	sem_t			work_done_sem;
+
+	nsec_t			cpu_usage;
+};
+
+enum sched_event_type {
+	SCHED_EVENT_RUN,
+	SCHED_EVENT_SLEEP,
+	SCHED_EVENT_WAKEUP,
+};
+
+struct sched_event {
+	enum sched_event_type	type;
+	nsec_t			timestamp;
+	nsec_t			duration;
+	unsigned long		nr;
+	int			specific_wait;
+	sem_t			*wait_sem;
+	struct task_desc	*wakee;
+};
+
+static struct task_desc		*pid_to_task[MAX_PID];
+
+static struct task_desc		**tasks;
+
+static pthread_mutex_t		start_work_mutex = PTHREAD_MUTEX_INITIALIZER;
+static nsec_t			start_time;
+
+static pthread_mutex_t		work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static unsigned long		nr_run_events;
+static unsigned long		nr_sleep_events;
+static unsigned long		nr_wakeup_events;
+
+static unsigned long		nr_sleep_corrections;
+static unsigned long		nr_run_events_optimized;
+
+static struct sched_event *
+get_new_event(struct task_desc *task, nsec_t timestamp)
+{
+	struct sched_event *event = calloc(1, sizeof(*event));
+	unsigned long idx = task->nr_events;
+	size_t size;
+
+	event->timestamp = timestamp;
+	event->nr = idx;
+
+	task->nr_events++;
+	size = sizeof(struct sched_event *) * task->nr_events;
+	task->events = realloc(task->events, size);
+	BUG_ON(!task->events);
+
+	task->events[idx] = event;
+
+	return event;
+}
+
+static struct sched_event *last_event(struct task_desc *task)
+{
+	if (!task->nr_events)
+		return NULL;
+
+	return task->events[task->nr_events - 1];
+}
+
+static void
+add_sched_event_run(struct task_desc *task, nsec_t timestamp,
+		    unsigned long duration)
+{
+	struct sched_event *event, *curr_event = last_event(task);
+
+	/*
+ 	 * optimize an existing RUN event by merging this one
+ 	 * to it:
+ 	 */
+	if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
+		nr_run_events_optimized++;
+		curr_event->duration += duration;
+		return;
+	}
+
+	event = get_new_event(task, timestamp);
+
+	event->type = SCHED_EVENT_RUN;
+	event->duration = duration;
+
+	nr_run_events++;
+}
+
+static unsigned long targetless_wakeups;
+static unsigned long multitarget_wakeups;
+
+static void
+add_sched_event_wakeup(struct task_desc *task, nsec_t timestamp,
+		       struct task_desc *wakee)
+{
+	struct sched_event *event, *wakee_event;
+
+	event = get_new_event(task, timestamp);
+	event->type = SCHED_EVENT_WAKEUP;
+	event->wakee = wakee;
+
+	wakee_event = last_event(wakee);
+	if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
+		targetless_wakeups++;
+		return;
+	}
+	if (wakee_event->wait_sem) {
+		multitarget_wakeups++;
+		return;
+	}
+
+	wakee_event->wait_sem = calloc(1, sizeof(*wakee_event->wait_sem));
+	sem_init(wakee_event->wait_sem, 0, 0);
+	wakee_event->specific_wait = 1;
+	event->wait_sem = wakee_event->wait_sem;
+
+	nr_wakeup_events++;
+}
+
+static void
+add_sched_event_sleep(struct task_desc *task, nsec_t timestamp,
+		      unsigned long uninterruptible __used)
+{
+	struct sched_event *event = get_new_event(task, timestamp);
+
+	event->type = SCHED_EVENT_SLEEP;
+
+	nr_sleep_events++;
+}
+
+static struct task_desc *register_pid(unsigned long pid, const char *comm)
+{
+	struct task_desc *task;
+
+	BUG_ON(pid >= MAX_PID);
+
+	task = pid_to_task[pid];
+
+	if (task)
+		return task;
+
+	task = calloc(1, sizeof(*task));
+	task->pid = pid;
+	task->nr = nr_tasks;
+	strcpy(task->comm, comm);
+	/*
+	 * every task starts in sleeping state - this gets ignored
+	 * if there's no wakeup pointing to this sleep state:
+	 */
+	add_sched_event_sleep(task, 0, 0);
+
+	pid_to_task[pid] = task;
+	nr_tasks++;
+	tasks = realloc(tasks, nr_tasks*sizeof(struct task_task *));
+	BUG_ON(!tasks);
+	tasks[task->nr] = task;
+
+	printk("registered task #%ld, PID %ld (%s)\n", nr_tasks, pid, comm);
+
+	return task;
+}
+
+
+static int first_trace_line = 1;
+
+static nsec_t first_timestamp;
+static nsec_t prev_timestamp;
+
+void parse_line(char *line);
+
+void parse_line(char *line)
+{
+	unsigned long param1 = 0, param2 = 0;
+	char comm[COMM_LEN], comm2[COMM_LEN];
+	unsigned long pid, pid2, timestamp0;
+	struct task_desc *task, *task2;
+	char func_str[SYM_LEN];
+	nsec_t timestamp;
+	int ret;
+
+	//"   <idle> 0     0D.s3    0us+: try_to_wake_up <events/0 9> (1 0)"
+	ret = sscanf(line, "%20s %5ld %*s %ldus%*c:"
+			   " %128s <%20s %ld> (%ld %ld)\n",
+		comm, &pid, &timestamp0,
+		func_str, comm2, &pid2, &param1, &param2);
+	dprintk("ret: %d\n", ret);
+	if (ret != 8)
+		return;
+
+	timestamp = timestamp0 * 1000LL;
+
+	if (first_trace_line) {
+		first_trace_line = 0;
+		first_timestamp = timestamp;
+	}
+
+	timestamp -= first_timestamp;
+	BUG_ON(timestamp < prev_timestamp);
+	prev_timestamp = timestamp;
+
+	dprintk("parsed: %s - %ld %Ld: %s - <%s %ld> (%ld %ld)\n",
+		comm,
+		pid,
+		timestamp, 
+		func_str,
+		comm2,
+		pid2,
+		param1,
+		param2);
+
+	task = register_pid(pid, comm);
+	task2 = register_pid(pid2, comm2);
+
+	if (!strcmp(func_str, "update_curr")) {
+		dprintk("%Ld: task %ld runs for %ld nsecs\n",
+			timestamp, task->nr, param1);
+		add_sched_event_run(task, timestamp, param1);
+	} else if (!strcmp(func_str, "try_to_wake_up")) {
+		dprintk("%Ld: task %ld wakes up task %ld\n",
+			timestamp, task->nr, task2->nr);
+		add_sched_event_wakeup(task, timestamp, task2);
+	} else if (!strcmp(func_str, "deactivate_task")) {
+		dprintk("%Ld: task %ld goes to sleep (uninterruptible: %ld)\n",
+			timestamp, task->nr, param1);
+		add_sched_event_sleep(task, timestamp, param1);
+	}
+}
+
+static void print_task_traces(void)
+{
+	struct task_desc *task;
+	unsigned long i;
+
+	for (i = 0; i < nr_tasks; i++) {
+		task = tasks[i];
+		printk("task %6ld (%20s:%10ld), nr_events: %ld\n",
+			task->nr, task->comm, task->pid, task->nr_events);
+	}
+}
+
+static void add_cross_task_wakeups(void)
+{
+	struct task_desc *task1, *task2;
+	unsigned long i, j;
+
+	for (i = 0; i < nr_tasks; i++) {
+		task1 = tasks[i];
+		j = i + 1;
+		if (j == nr_tasks)
+			j = 0;
+		task2 = tasks[j];
+		add_sched_event_wakeup(task1, 0, task2);
+	}
+}
+
+static void
+process_sched_event(struct task_desc *this_task, struct sched_event *event)
+{
+	int ret = 0;
+	nsec_t now;
+	long long delta;
+
+	now = get_nsecs();
+	delta = start_time + event->timestamp - now;
+
+	dprintk("task %ld, event #%ld, %Ld, delta: %.3f (%Ld)\n",
+		this_task->nr, event->nr, event->timestamp,
+		(double)delta/1e6, delta);
+
+	if (0 && delta > 0) {
+		dprintk("%.3f: task %ld FIX %.3f\n",
+			(double)event->timestamp/1e6,
+			this_task->nr,
+			(double)delta/1e6);
+		sleep_nsecs(start_time + event->timestamp - now);
+		nr_sleep_corrections++;
+	}
+
+	switch (event->type) {
+		case SCHED_EVENT_RUN:
+			dprintk("%.3f: task %ld RUN for %.3f\n",
+				(double)event->timestamp/1e6,
+				this_task->nr,
+				(double)event->duration/1e6);
+			burn_nsecs(event->duration);
+			break;
+		case SCHED_EVENT_SLEEP:
+			dprintk("%.3f: task %ld %s SLEEP\n",
+				(double)event->timestamp/1e6,
+				this_task->nr, event->wait_sem ? "" : "SKIP");
+			if (event->wait_sem)
+				ret = sem_wait(event->wait_sem);
+			BUG_ON(ret);
+			break;
+		case SCHED_EVENT_WAKEUP:
+			dprintk("%.3f: task %ld WAKEUP => task %ld\n",
+				(double)event->timestamp/1e6,
+				this_task->nr,
+				event->wakee->nr);
+			if (event->wait_sem)
+				ret = sem_post(event->wait_sem);
+			BUG_ON(ret);
+			break;
+		default:
+			BUG_ON(1);
+	}
+}
+
+static nsec_t get_cpu_usage_nsec_parent(void)
+{
+	struct rusage ru;
+	nsec_t sum;
+	int err;
+
+	err = getrusage(RUSAGE_SELF, &ru);
+	BUG_ON(err);
+
+	sum =  ru.ru_utime.tv_sec*1e9 + ru.ru_utime.tv_usec*1e3;
+	sum += ru.ru_stime.tv_sec*1e9 + ru.ru_stime.tv_usec*1e3;
+
+	return sum;
+}
+
+static nsec_t get_cpu_usage_nsec_self(void)
+{
+	char filename [] = "/proc/1234567890/sched";
+	unsigned long msecs, nsecs;
+	char *line = NULL;
+	nsec_t total = 0;
+	size_t len = 0;
+	ssize_t chars;
+	FILE *file;
+	int ret;
+
+	sprintf(filename, "/proc/%d/sched", getpid());
+	file = fopen(filename, "r");
+	BUG_ON(!file);
+
+	while ((chars = getline(&line, &len, file)) != -1) {
+		dprintk("got line with length %zu :\n", chars);
+		dprintk("%s", line);
+		ret = sscanf(line, "se.sum_exec_runtime : %ld.%06ld\n",
+			&msecs, &nsecs);
+		if (ret == 2) {
+			total = msecs*1e6 + nsecs;
+			dprintk("total: (%ld.%06ld) %Ld\n",
+				msecs, nsecs, total);
+			break;
+		}
+	}
+	if (line)
+		free(line);
+	fclose(file);
+
+	return total;
+}
+
+static void *thread_func(void *ctx)
+{
+	struct task_desc *this_task = ctx;
+	nsec_t cpu_usage_0, cpu_usage_1;
+	unsigned long i, ret;
+	char comm2[22];
+
+	dprintk("task %ld started up.\n", this_task->nr);
+	sprintf(comm2, ":%s", this_task->comm);
+	prctl(PR_SET_NAME, comm2);
+
+again:
+	ret = sem_post(&this_task->ready_for_work);
+	BUG_ON(ret);
+	D();
+	ret = pthread_mutex_lock(&start_work_mutex);
+	BUG_ON(ret);
+	ret = pthread_mutex_unlock(&start_work_mutex);
+	BUG_ON(ret);
+	D();
+
+	cpu_usage_0 = get_cpu_usage_nsec_self();
+
+	for (i = 0; i < this_task->nr_events; i++) {
+		this_task->curr_event = i;
+		process_sched_event(this_task, this_task->events[i]);
+	}
+
+	cpu_usage_1 = get_cpu_usage_nsec_self();
+	this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
+
+	dprintk("task %ld cpu usage: %0.3f msecs\n",
+		this_task->nr, (double)this_task->cpu_usage / 1e6);
+
+	D();
+	ret = sem_post(&this_task->work_done_sem);
+	BUG_ON(ret);
+	D();
+
+	ret = pthread_mutex_lock(&work_done_wait_mutex);
+	BUG_ON(ret);
+	ret = pthread_mutex_unlock(&work_done_wait_mutex);
+	BUG_ON(ret);
+	D();
+
+	goto again;
+}
+
+static void create_tasks(void)
+{
+	struct task_desc *task;
+	pthread_attr_t attr;
+	unsigned long i;
+	int err;
+
+	err = pthread_attr_init(&attr);
+	BUG_ON(err);
+	err = pthread_attr_setstacksize(&attr, (size_t)(16*1024));
+	BUG_ON(err);
+	err = pthread_mutex_lock(&start_work_mutex);
+	BUG_ON(err);
+	err = pthread_mutex_lock(&work_done_wait_mutex);
+	BUG_ON(err);
+	for (i = 0; i < nr_tasks; i++) {
+		task = tasks[i];
+		sem_init(&task->sleep_sem, 0, 0);
+		sem_init(&task->ready_for_work, 0, 0);
+		sem_init(&task->work_done_sem, 0, 0);
+		task->curr_event = 0;
+		err = pthread_create(&task->thread, &attr, thread_func, task);
+		BUG_ON(err);
+	}
+}
+
+static nsec_t cpu_usage;
+static nsec_t runavg_cpu_usage;
+static nsec_t parent_cpu_usage;
+static nsec_t runavg_parent_cpu_usage;
+
+static void wait_for_tasks(void)
+{
+	nsec_t cpu_usage_0, cpu_usage_1;
+	struct task_desc *task;
+	unsigned long i, ret;
+
+	DP();
+	start_time = get_nsecs();
+	DP();
+	cpu_usage = 0;
+	pthread_mutex_unlock(&work_done_wait_mutex);
+
+	for (i = 0; i < nr_tasks; i++) {
+		task = tasks[i];
+		ret = sem_wait(&task->ready_for_work);
+		BUG_ON(ret);
+		sem_init(&task->ready_for_work, 0, 0);
+	}
+	ret = pthread_mutex_lock(&work_done_wait_mutex);
+	BUG_ON(ret);
+
+	cpu_usage_0 = get_cpu_usage_nsec_parent();
+
+	pthread_mutex_unlock(&start_work_mutex);
+
+#if 0
+	for (i = 0; i < nr_tasks; i++) {
+		unsigned long missed;
+
+		task = tasks[i];
+		while (task->curr_event + 1 < task->nr_events) {
+			dprintk("parent waiting for %ld (%ld != %ld)\n",
+				i, task->curr_event, task->nr_events);
+			sleep_nsecs(100000000);
+		}
+		missed = task->nr_events - 1 - task->curr_event;
+		if (missed)
+			printk("task %ld missed events: %ld\n", i, missed);
+		ret = sem_post(&task->sleep_sem);
+		BUG_ON(ret);
+	}
+#endif
+	DP();
+	for (i = 0; i < nr_tasks; i++) {
+		task = tasks[i];
+		ret = sem_wait(&task->work_done_sem);
+		BUG_ON(ret);
+		sem_init(&task->work_done_sem, 0, 0);
+		cpu_usage += task->cpu_usage;
+		task->cpu_usage = 0;
+	}
+
+	cpu_usage_1 = get_cpu_usage_nsec_parent();
+	if (!runavg_cpu_usage)
+		runavg_cpu_usage = cpu_usage;
+	runavg_cpu_usage = (runavg_cpu_usage*9 + cpu_usage)/10;
+
+	parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
+	if (!runavg_parent_cpu_usage)
+		runavg_parent_cpu_usage = parent_cpu_usage;
+	runavg_parent_cpu_usage = (runavg_parent_cpu_usage*9 +
+				   parent_cpu_usage)/10;
+
+	ret = pthread_mutex_lock(&start_work_mutex);
+	BUG_ON(ret);
+
+	for (i = 0; i < nr_tasks; i++) {
+		task = tasks[i];
+		sem_init(&task->sleep_sem, 0, 0);
+		task->curr_event = 0;
+	}
+}
+
+static int __cmd_sched(void);
+
+static void parse_trace(void)
+{
+	__cmd_sched();
+
+	printk("nr_run_events:        %ld\n", nr_run_events);
+	printk("nr_sleep_events:      %ld\n", nr_sleep_events);
+	printk("nr_wakeup_events:     %ld\n", nr_wakeup_events);
+
+	if (targetless_wakeups)
+		printk("target-less wakeups:  %ld\n", targetless_wakeups);
+	if (multitarget_wakeups)
+		printk("multi-target wakeups: %ld\n", multitarget_wakeups);
+	if (nr_run_events_optimized)
+		printk("run events optimized: %ld\n",
+			nr_run_events_optimized);
+}
+
+static unsigned long nr_runs;
+static nsec_t sum_runtime;
+static nsec_t sum_fluct;
+static nsec_t run_avg;
+
+static void run_one_test(void)
+{
+	nsec_t T0, T1, delta, avg_delta, fluct, std_dev;
+
+	T0 = get_nsecs();
+	wait_for_tasks();
+	T1 = get_nsecs();
+
+	delta = T1 - T0;
+	sum_runtime += delta;
+	nr_runs++;
+
+	avg_delta = sum_runtime / nr_runs;
+	if (delta < avg_delta)
+		fluct = avg_delta - delta;
+	else
+		fluct = delta - avg_delta;
+	sum_fluct += fluct;
+	std_dev = sum_fluct / nr_runs / sqrt(nr_runs);
+	if (!run_avg)
+		run_avg = delta;
+	run_avg = (run_avg*9 + delta)/10;
+
+	printk("#%-3ld: %0.3f, ",
+		nr_runs, (double)delta/1000000.0);
+
+#if 0
+	printk("%0.2f +- %0.2f, ",
+		(double)avg_delta/1e6, (double)std_dev/1e6);
+#endif
+	printk("ravg: %0.2f, ",
+		(double)run_avg/1e6);
+
+	printk("cpu: %0.2f / %0.2f",
+		(double)cpu_usage/1e6, (double)runavg_cpu_usage/1e6);
+
+#if 0
+	/*
+ 	 * rusage statistics done by the parent, these are less
+ 	 * accurate than the sum_exec_runtime based statistics:
+ 	 */
+	printk(" [%0.2f / %0.2f]",
+		(double)parent_cpu_usage/1e6,
+		(double)runavg_parent_cpu_usage/1e6);
+#endif
+
+	printk("\n");
+
+	if (nr_sleep_corrections)
+		printk(" (%ld sleep corrections)\n", nr_sleep_corrections);
+	nr_sleep_corrections = 0;
+}
+
+static void test_calibrations(void)
+{
+	nsec_t T0, T1;
+
+	T0 = get_nsecs();
+	burn_nsecs(1e6);
+	T1 = get_nsecs();
+
+	printk("the run test took %Ld nsecs\n", T1-T0);
+
+	T0 = get_nsecs();
+	sleep_nsecs(1e6);
+	T1 = get_nsecs();
+
+	printk("the sleep test took %Ld nsecs\n", T1-T0);
+}
+
 static int
 process_comm_event(event_t *event, unsigned long offset, unsigned long head)
 {
@@ -50,6 +798,46 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head)
 	return 0;
 }
 
+static void process_sched_wakeup_event(struct event *event,
+		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
+{
+	printf("sched_wakeup event %p\n", event);
+}
+
+static void process_sched_switch_event(struct event *event,
+		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
+{
+	printf("sched_switch event %p\n", event);
+}
+
+static void
+process_raw_event(event_t *raw_event, void *more_data,
+		  int cpu, u64 timestamp, struct thread *thread)
+{
+	struct {
+		u32 size;
+		char data[0];
+	} *raw = more_data;
+	struct event *event;
+	int type;
+
+	type = trace_parse_common_type(raw->data);
+	event = trace_find_event(type);
+
+	/*
+	 * FIXME: better resolve from pid from the struct trace_entry
+	 * field, although it should be the same than this perf
+	 * event pid
+	 */
+	printf("id %d, type: %d, event: %s\n",
+		raw_event->header.type, type, event->name);
+
+	if (!strcmp(event->name, "sched_switch"))
+		process_sched_switch_event(event, cpu, timestamp, thread);
+	if (!strcmp(event->name, "sched_wakeup"))
+		process_sched_wakeup_event(event, cpu, timestamp, thread);
+}
+
 static int
 process_sample_event(event_t *event, unsigned long offset, unsigned long head)
 {
@@ -122,20 +910,8 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
 		dump_printf(" ...... dso: [hypervisor]\n");
 	}
 
-	if (sample_type & PERF_SAMPLE_RAW) {
-		struct {
-			u32 size;
-			char data[0];
-		} *raw = more_data;
-
-		/*
-		 * FIXME: better resolve from pid from the struct trace_entry
-		 * field, although it should be the same than this perf
-		 * event pid
-		 */
-		print_event(cpu, raw->data, raw->size, timestamp, thread->comm);
-	}
-	total += period;
+	if (sample_type & PERF_SAMPLE_RAW)
+		process_raw_event(event, more_data, cpu, timestamp, thread);
 
 	return 0;
 }
@@ -277,6 +1053,8 @@ static const struct option options[] = {
 
 int cmd_sched(int argc, const char **argv, const char *prefix __used)
 {
+	long nr_iterations = LONG_MAX, i;
+
 	symbol__init();
 	page_size = getpagesize();
 
@@ -293,5 +1071,19 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
 
 	setup_pager();
 
-	return __cmd_sched();
+	calibrate_run_measurement_overhead();
+	calibrate_sleep_measurement_overhead();
+
+	test_calibrations();
+
+	parse_trace();
+	print_task_traces();
+	add_cross_task_wakeups();
+
+	create_tasks();
+	printk("------------------------------------------------------------\n");
+	for (i = 0; i < nr_iterations; i++)
+		run_one_test();
+
+	return 0;
 }
diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c
index 629e602d9405..16cf2d51c4e1 100644
--- a/tools/perf/util/trace-event-parse.c
+++ b/tools/perf/util/trace-event-parse.c
@@ -1799,7 +1799,7 @@ static int get_common_info(const char *type, int *offset, int *size)
 	return 0;
 }
 
-static int parse_common_type(void *data)
+int trace_parse_common_type(void *data)
 {
 	static int type_offset;
 	static int type_size;
@@ -1832,7 +1832,7 @@ static int parse_common_pid(void *data)
 	return read_size(data + pid_offset, pid_size);
 }
 
-static struct event *find_event(int id)
+struct event *trace_find_event(int id)
 {
 	struct event *event;
 
@@ -2420,8 +2420,8 @@ get_return_for_leaf(int cpu, int cur_pid, unsigned long long cur_func,
 	int type;
 	int pid;
 
-	type = parse_common_type(next->data);
-	event = find_event(type);
+	type = trace_parse_common_type(next->data);
+	event = trace_find_event(type);
 	if (!event)
 		return NULL;
 
@@ -2502,8 +2502,8 @@ print_graph_entry_leaf(struct event *event, void *data, struct record *ret_rec)
 	int type;
 	int i;
 
-	type = parse_common_type(ret_rec->data);
-	ret_event = find_event(type);
+	type = trace_parse_common_type(ret_rec->data);
+	ret_event = trace_find_event(type);
 
 	field = find_field(ret_event, "rettime");
 	if (!field)
@@ -2696,9 +2696,9 @@ void print_event(int cpu, void *data, int size, unsigned long long nsecs,
 	nsecs -= secs * NSECS_PER_SEC;
 	usecs = nsecs / NSECS_PER_USEC;
 
-	type = parse_common_type(data);
+	type = trace_parse_common_type(data);
 
-	event = find_event(type);
+	event = trace_find_event(type);
 	if (!event)
 		die("ug! no event found for type %d", type);
 
diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h
index 420294a5773e..bc81612fd244 100644
--- a/tools/perf/util/trace-event.h
+++ b/tools/perf/util/trace-event.h
@@ -234,6 +234,8 @@ extern int header_page_data_offset;
 extern int header_page_data_size;
 
 int parse_header_page(char *buf, unsigned long size);
+int trace_parse_common_type(void *data);
+struct event *trace_find_event(int id);
 
 void read_tracing_data(struct perf_counter_attr *pattrs, int nb_counters);
 

commit 0a02ad9331dd4db56c29c60db2e99c4daaad8a48
Author: Ingo Molnar <mingo@elte.hu>
Date:   Fri Sep 11 12:12:54 2009 +0200

    perf: Add 'perf sched' tool
    
    This turn-key tool allows scheduler measurements to be
    conducted and the results be displayed numerically.
    
    First baby step towards that goal: clone the new command off of
    perf trace.
    
    Fix a few other details along the way:
    
     - add (minimal) perf trace documentation
    
     - reorder a few places
    
     - list perf trace in the mainporcelain list as well
       as it's a very useful utility.
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt
new file mode 100644
index 000000000000..056320eecb3a
--- /dev/null
+++ b/tools/perf/Documentation/perf-sched.txt
@@ -0,0 +1,25 @@
+perf-sched(1)
+==============
+
+NAME
+----
+perf-sched - Read perf.data (created by perf record) and display sched output
+
+SYNOPSIS
+--------
+[verse]
+'perf sched' [-i <file> | --input=file] symbol_name
+
+DESCRIPTION
+-----------
+This command reads the input file and displays the latencies recorded.
+
+OPTIONS
+-------
+-D::
+--dump-raw-trace=::
+        Display verbose dump of the sched data.
+
+SEE ALSO
+--------
+linkperf:perf-record[1]
diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
new file mode 100644
index 000000000000..41ed75398ca9
--- /dev/null
+++ b/tools/perf/Documentation/perf-trace.txt
@@ -0,0 +1,25 @@
+perf-trace(1)
+==============
+
+NAME
+----
+perf-trace - Read perf.data (created by perf record) and display trace output
+
+SYNOPSIS
+--------
+[verse]
+'perf trace' [-i <file> | --input=file] symbol_name
+
+DESCRIPTION
+-----------
+This command reads the input file and displays the trace recorded.
+
+OPTIONS
+-------
+-D::
+--dump-raw-trace=::
+        Display verbose dump of the trace data.
+
+SEE ALSO
+--------
+linkperf:perf-record[1]
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 9f8d207a91bf..2cb8cc3f6772 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -376,6 +376,7 @@ LIB_OBJS += util/trace-event-info.o
 
 BUILTIN_OBJS += builtin-annotate.o
 BUILTIN_OBJS += builtin-help.o
+BUILTIN_OBJS += builtin-sched.o
 BUILTIN_OBJS += builtin-list.o
 BUILTIN_OBJS += builtin-record.o
 BUILTIN_OBJS += builtin-report.o
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
new file mode 100644
index 000000000000..60228d9179ac
--- /dev/null
+++ b/tools/perf/builtin-sched.c
@@ -0,0 +1,297 @@
+#include "builtin.h"
+
+#include "util/util.h"
+#include "util/cache.h"
+#include "util/symbol.h"
+#include "util/thread.h"
+#include "util/header.h"
+
+#include "util/parse-options.h"
+
+#include "perf.h"
+#include "util/debug.h"
+
+#include "util/trace-event.h"
+
+static char		const *input_name = "perf.data";
+static int		input;
+static unsigned long	page_size;
+static unsigned long	mmap_window = 32;
+
+static unsigned long	total = 0;
+static unsigned long	total_comm = 0;
+
+static struct rb_root	threads;
+static struct thread	*last_match;
+
+static struct perf_header *header;
+static u64		sample_type;
+
+
+static int
+process_comm_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread;
+
+	thread = threads__findnew(event->comm.pid, &threads, &last_match);
+
+	dump_printf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->comm.comm, event->comm.pid);
+
+	if (thread == NULL ||
+	    thread__set_comm(thread, event->comm.comm)) {
+		dump_printf("problem processing PERF_EVENT_COMM, skipping event.\n");
+		return -1;
+	}
+	total_comm++;
+
+	return 0;
+}
+
+static int
+process_sample_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	char level;
+	int show = 0;
+	struct dso *dso = NULL;
+	struct thread *thread;
+	u64 ip = event->ip.ip;
+	u64 timestamp = -1;
+	u32 cpu = -1;
+	u64 period = 1;
+	void *more_data = event->ip.__more_data;
+	int cpumode;
+
+	thread = threads__findnew(event->ip.pid, &threads, &last_match);
+
+	if (sample_type & PERF_SAMPLE_TIME) {
+		timestamp = *(u64 *)more_data;
+		more_data += sizeof(u64);
+	}
+
+	if (sample_type & PERF_SAMPLE_CPU) {
+		cpu = *(u32 *)more_data;
+		more_data += sizeof(u32);
+		more_data += sizeof(u32); /* reserved */
+	}
+
+	if (sample_type & PERF_SAMPLE_PERIOD) {
+		period = *(u64 *)more_data;
+		more_data += sizeof(u64);
+	}
+
+	dump_printf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->header.misc,
+		event->ip.pid, event->ip.tid,
+		(void *)(long)ip,
+		(long long)period);
+
+	dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
+
+	if (thread == NULL) {
+		eprintf("problem processing %d event, skipping it.\n",
+			event->header.type);
+		return -1;
+	}
+
+	cpumode = event->header.misc & PERF_EVENT_MISC_CPUMODE_MASK;
+
+	if (cpumode == PERF_EVENT_MISC_KERNEL) {
+		show = SHOW_KERNEL;
+		level = 'k';
+
+		dso = kernel_dso;
+
+		dump_printf(" ...... dso: %s\n", dso->name);
+
+	} else if (cpumode == PERF_EVENT_MISC_USER) {
+
+		show = SHOW_USER;
+		level = '.';
+
+	} else {
+		show = SHOW_HV;
+		level = 'H';
+
+		dso = hypervisor_dso;
+
+		dump_printf(" ...... dso: [hypervisor]\n");
+	}
+
+	if (sample_type & PERF_SAMPLE_RAW) {
+		struct {
+			u32 size;
+			char data[0];
+		} *raw = more_data;
+
+		/*
+		 * FIXME: better resolve from pid from the struct trace_entry
+		 * field, although it should be the same than this perf
+		 * event pid
+		 */
+		print_event(cpu, raw->data, raw->size, timestamp, thread->comm);
+	}
+	total += period;
+
+	return 0;
+}
+
+static int
+process_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	trace_event(event);
+
+	switch (event->header.type) {
+	case PERF_EVENT_MMAP ... PERF_EVENT_LOST:
+		return 0;
+
+	case PERF_EVENT_COMM:
+		return process_comm_event(event, offset, head);
+
+	case PERF_EVENT_EXIT ... PERF_EVENT_READ:
+		return 0;
+
+	case PERF_EVENT_SAMPLE:
+		return process_sample_event(event, offset, head);
+
+	case PERF_EVENT_MAX:
+	default:
+		return -1;
+	}
+
+	return 0;
+}
+
+static int __cmd_sched(void)
+{
+	int ret, rc = EXIT_FAILURE;
+	unsigned long offset = 0;
+	unsigned long head = 0;
+	struct stat perf_stat;
+	event_t *event;
+	uint32_t size;
+	char *buf;
+
+	trace_report();
+	register_idle_thread(&threads, &last_match);
+
+	input = open(input_name, O_RDONLY);
+	if (input < 0) {
+		perror("failed to open file");
+		exit(-1);
+	}
+
+	ret = fstat(input, &perf_stat);
+	if (ret < 0) {
+		perror("failed to stat file");
+		exit(-1);
+	}
+
+	if (!perf_stat.st_size) {
+		fprintf(stderr, "zero-sized file, nothing to do!\n");
+		exit(0);
+	}
+	header = perf_header__read(input);
+	head = header->data_offset;
+	sample_type = perf_header__sample_type(header);
+
+	if (!(sample_type & PERF_SAMPLE_RAW))
+		die("No trace sample to read. Did you call perf record "
+		    "without -R?");
+
+	if (load_kernel() < 0) {
+		perror("failed to load kernel symbols");
+		return EXIT_FAILURE;
+	}
+
+remap:
+	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
+			   MAP_SHARED, input, offset);
+	if (buf == MAP_FAILED) {
+		perror("failed to mmap file");
+		exit(-1);
+	}
+
+more:
+	event = (event_t *)(buf + head);
+
+	size = event->header.size;
+	if (!size)
+		size = 8;
+
+	if (head + event->header.size >= page_size * mmap_window) {
+		unsigned long shift = page_size * (head / page_size);
+		int res;
+
+		res = munmap(buf, page_size * mmap_window);
+		assert(res == 0);
+
+		offset += shift;
+		head -= shift;
+		goto remap;
+	}
+
+	size = event->header.size;
+
+
+	if (!size || process_event(event, offset, head) < 0) {
+
+		/*
+		 * assume we lost track of the stream, check alignment, and
+		 * increment a single u64 in the hope to catch on again 'soon'.
+		 */
+
+		if (unlikely(head & 7))
+			head &= ~7ULL;
+
+		size = 8;
+	}
+
+	head += size;
+
+	if (offset + head < (unsigned long)perf_stat.st_size)
+		goto more;
+
+	rc = EXIT_SUCCESS;
+	close(input);
+
+	return rc;
+}
+
+static const char * const annotate_usage[] = {
+	"perf trace [<options>] <command>",
+	NULL
+};
+
+static const struct option options[] = {
+	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
+		    "dump raw trace in ASCII"),
+	OPT_BOOLEAN('v', "verbose", &verbose,
+		    "be more verbose (show symbol address, etc)"),
+	OPT_END()
+};
+
+int cmd_sched(int argc, const char **argv, const char *prefix __used)
+{
+	symbol__init();
+	page_size = getpagesize();
+
+	argc = parse_options(argc, argv, options, annotate_usage, 0);
+	if (argc) {
+		/*
+		 * Special case: if there's an argument left then assume tha
+		 * it's a symbol filter:
+		 */
+		if (argc > 1)
+			usage_with_options(annotate_usage, options);
+	}
+
+
+	setup_pager();
+
+	return __cmd_sched();
+}
diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h
index 3a63e41fb44e..b09cadbd76b1 100644
--- a/tools/perf/builtin.h
+++ b/tools/perf/builtin.h
@@ -16,12 +16,13 @@ extern int check_pager_config(const char *cmd);
 
 extern int cmd_annotate(int argc, const char **argv, const char *prefix);
 extern int cmd_help(int argc, const char **argv, const char *prefix);
+extern int cmd_sched(int argc, const char **argv, const char *prefix);
+extern int cmd_list(int argc, const char **argv, const char *prefix);
 extern int cmd_record(int argc, const char **argv, const char *prefix);
 extern int cmd_report(int argc, const char **argv, const char *prefix);
 extern int cmd_stat(int argc, const char **argv, const char *prefix);
 extern int cmd_top(int argc, const char **argv, const char *prefix);
-extern int cmd_version(int argc, const char **argv, const char *prefix);
-extern int cmd_list(int argc, const char **argv, const char *prefix);
 extern int cmd_trace(int argc, const char **argv, const char *prefix);
+extern int cmd_version(int argc, const char **argv, const char *prefix);
 
 #endif
diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt
index eebce30afbc0..3133c74729dd 100644
--- a/tools/perf/command-list.txt
+++ b/tools/perf/command-list.txt
@@ -4,7 +4,9 @@
 #
 perf-annotate			mainporcelain common
 perf-list			mainporcelain common
+perf-sched			mainporcelain common
 perf-record			mainporcelain common
 perf-report			mainporcelain common
 perf-stat			mainporcelain common
 perf-top			mainporcelain common
+perf-trace			mainporcelain common
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index fe4589dde950..c972d1c35489 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -293,6 +293,7 @@ static void handle_internal_command(int argc, const char **argv)
 		{ "annotate", cmd_annotate, 0 },
 		{ "version", cmd_version, 0 },
 		{ "trace", cmd_trace, 0 },
+		{ "sched", cmd_sched, 0 },
 	};
 	unsigned int i;
 	static const char ext[] = STRIP_EXTENSION;

commit e1f8450854d69f0291882804406ea1bab3ca44b4
Author: Ingo Molnar <mingo@elte.hu>
Date:   Thu Sep 10 20:52:09 2009 +0200

    sched: Fix sched::sched_stat_wait tracepoint field
    
    This weird perf trace output:
    
      cc1-9943  [001]  2802.059479616: sched_stat_wait: task: as:9944 wait: 2801938766276 [ns]
    
    Is caused by setting one component field of the delta to zero
    a bit too early. Move it to later.
    
    ( Note, this does not affect the NEW_FAIR_SLEEPERS interactivity bug,
      it's just a reporting bug in essence. )
    
    Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Nikos Chantziaras <realnc@arcor.de>
    Cc: Jens Axboe <jens.axboe@oracle.com>
    Cc: Mike Galbraith <efault@gmx.de>
    LKML-Reference: <4AA93D34.8040500@arcor.de>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 26fadb44250c..aa7f84121016 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -545,14 +545,13 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	schedstat_set(se->wait_count, se->wait_count + 1);
 	schedstat_set(se->wait_sum, se->wait_sum +
 			rq_of(cfs_rq)->clock - se->wait_start);
-	schedstat_set(se->wait_start, 0);
-
 #ifdef CONFIG_SCHEDSTATS
 	if (entity_is_task(se)) {
 		trace_sched_stat_wait(task_of(se),
 			rq_of(cfs_rq)->clock - se->wait_start);
 	}
 #endif
+	schedstat_set(se->wait_start, 0);
 }
 
 static inline void