Patches contributed by Eötvös Lorand University

<<Prev 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60[61]62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 Next>>

commit f2858d8ad9858e63c87257553c5721cba5db95ae
Author: Ingo Molnar <mingo@elte.hu>
Date:   Fri Sep 11 12:12:54 2009 +0200

    perf sched: Add 'perf sched latency' and 'perf sched replay'
    
    Separate the option parsing cleanly and add two variants:
    
     - 'perf sched latency' (can be abbreviated via 'perf sched lat')
     - 'perf sched replay'  (can be abbreviated via 'perf sched rep')
    
    Also add a repeat count option to replay and add a separation
    set of options for replay.
    
    Do the sorting setup only in the latency sub-command.
    
    Display separate help screens for 'perf sched' and
    'perf sched replay -h' - i.e. further separation of the
    sub-commands.
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 10fcd49e298b..e01cc63b98cc 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -30,9 +30,6 @@ static struct thread		*last_match;
 static struct perf_header	*header;
 static u64			sample_type;
 
-static int			replay_mode;
-static int			lat_mode;
-
 static char			default_sort_order[] = "avg, max, switch, runtime";
 static char			*sort_order = default_sort_order;
 
@@ -623,9 +620,11 @@ static void test_calibrations(void)
 	printf("the sleep test took %Ld nsecs\n", T1-T0);
 }
 
+static unsigned long replay_repeat = 10;
+
 static void __cmd_replay(void)
 {
-	long nr_iterations = 10, i;
+	unsigned long i;
 
 	calibrate_run_measurement_overhead();
 	calibrate_sleep_measurement_overhead();
@@ -651,7 +650,7 @@ static void __cmd_replay(void)
 
 	create_tasks();
 	printf("------------------------------------------------------------\n");
-	for (i = 0; i < nr_iterations; i++)
+	for (i = 0; i < replay_repeat; i++)
 		run_one_test();
 }
 
@@ -1623,21 +1622,45 @@ static int read_events(void)
 }
 
 static const char * const sched_usage[] = {
-	"perf sched [<options>] <command>",
+	"perf sched [<options>] {record|latency|replay}",
 	NULL
 };
 
-static const struct option options[] = {
+static const struct option sched_options[] = {
+	OPT_BOOLEAN('v', "verbose", &verbose,
+		    "be more verbose (show symbol address, etc)"),
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
-	OPT_BOOLEAN('r', "replay", &replay_mode,
-		    "replay sched behaviour from traces"),
-	OPT_BOOLEAN('l', "latency", &lat_mode,
-		    "measure various latencies"),
+	OPT_END()
+};
+
+static const char * const latency_usage[] = {
+	"perf sched latency [<options>]",
+	NULL
+};
+
+static const struct option latency_options[] = {
 	OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
 		   "sort by key(s): runtime, switch, avg, max"),
 	OPT_BOOLEAN('v', "verbose", &verbose,
 		    "be more verbose (show symbol address, etc)"),
+	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
+		    "dump raw trace in ASCII"),
+	OPT_END()
+};
+
+static const char * const replay_usage[] = {
+	"perf sched replay [<options>]",
+	NULL
+};
+
+static const struct option replay_options[] = {
+	OPT_INTEGER('r', "repeat", &replay_repeat,
+		    "repeat the workload replay N times (-1: infinite)"),
+	OPT_BOOLEAN('v', "verbose", &verbose,
+		    "be more verbose (show symbol address, etc)"),
+	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
+		    "dump raw trace in ASCII"),
 	OPT_END()
 };
 
@@ -1649,7 +1672,7 @@ static void setup_sorting(void)
 			tok; tok = strtok_r(NULL, ", ", &tmp)) {
 		if (sort_dimension__add(tok, &sort_list) < 0) {
 			error("Unknown --sort key: `%s'", tok);
-			usage_with_options(sched_usage, options);
+			usage_with_options(latency_usage, latency_options);
 		}
 	}
 
@@ -1663,29 +1686,32 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
 	symbol__init();
 	page_size = getpagesize();
 
-	argc = parse_options(argc, argv, options, sched_usage, 0);
-	if (argc) {
-		/*
-		 * Special case: if there's an argument left then assume tha
-		 * it's a symbol filter:
-		 */
-		if (argc > 1)
-			usage_with_options(sched_usage, options);
-	}
+	argc = parse_options(argc, argv, sched_options, sched_usage,
+			     PARSE_OPT_STOP_AT_NON_OPTION);
+	if (!argc)
+		usage_with_options(sched_usage, sched_options);
 
-	if (replay_mode)
-		trace_handler = &replay_ops;
-	else if (lat_mode)
+	if (!strncmp(argv[0], "lat", 3)) {
 		trace_handler = &lat_ops;
-	else
-		usage_with_options(sched_usage, options);
-
-	setup_sorting();
-
-	if (replay_mode)
-		__cmd_replay();
-	else if (lat_mode)
+		if (argc > 1) {
+			argc = parse_options(argc, argv, latency_options, latency_usage, 0);
+			if (argc)
+				usage_with_options(latency_usage, latency_options);
+			setup_sorting();
+		}
 		__cmd_lat();
+	} else if (!strncmp(argv[0], "rep", 3)) {
+		trace_handler = &replay_ops;
+		if (argc) {
+			argc = parse_options(argc, argv, replay_options, replay_usage, 0);
+			if (argc)
+				usage_with_options(replay_usage, replay_options);
+		}
+		__cmd_replay();
+	} else {
+		usage_with_options(sched_usage, sched_options);
+	}
+
 
 	return 0;
 }

commit 3e304147cdb404ce6d1dd0e50cb19f52142bb363
Author: Ingo Molnar <mingo@elte.hu>
Date:   Sat Sep 12 10:08:34 2009 +0200

    perf sched: Output runtime and context switch totals
    
    After:
    
    -----------------------------------------------------------------------------------
     Task              |  Runtime ms | Switches | Average delay ms | Maximum delay ms |
    -----------------------------------------------------------------------------------
     make              |    0.678 ms |       13 | avg:    0.018 ms | max:    0.050 ms |
     gcc               |    0.014 ms |        2 | avg:    0.320 ms | max:    0.627 ms |
     gcc               |    0.000 ms |        2 | avg:    0.185 ms | max:    0.369 ms |
    ...
    -----------------------------------------------------------------------------------
     TOTAL:            |   21.316 ms |       63 |
    ---------------------------------------------
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index c382f530d4c6..727cc5b852c2 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1087,6 +1087,9 @@ static struct trace_sched_handler lat_ops  = {
 	.fork_event		= latency_fork_event,
 };
 
+static u64 all_runtime;
+static u64 all_count;
+
 static void output_lat_thread(struct thread_latency *lat)
 {
 	struct lat_snapshot *shot;
@@ -1111,6 +1114,9 @@ static void output_lat_thread(struct thread_latency *lat)
 		total += delta;
 	}
 
+	all_runtime += total_runtime;
+	all_count += count;
+
 	if (!count)
 		return;
 
@@ -1133,7 +1139,7 @@ static void __cmd_lat(void)
 	read_events();
 
 	printf("-----------------------------------------------------------------------------------\n");
-	printf(" Task              |  runtime ms | switches | average delay ms | maximum delay ms |\n");
+	printf(" Task              |  Runtime ms | Switches | Average delay ms | Maximum delay ms |\n");
 	printf("-----------------------------------------------------------------------------------\n");
 
 	next = rb_first(&lat_snapshot_root);
@@ -1147,6 +1153,9 @@ static void __cmd_lat(void)
 	}
 
 	printf("-----------------------------------------------------------------------------------\n");
+	printf(" TOTAL:            |%9.3f ms |%9Ld |\n",
+		(double)all_runtime/1e9, all_count);
+	printf("---------------------------------------------\n");
 }
 
 static struct trace_sched_handler *trace_handler;

commit ea92ed5a8f4e6c638efe7de2efe8a875d580ad3f
Author: Ingo Molnar <mingo@elte.hu>
Date:   Sat Sep 12 10:08:34 2009 +0200

    perf sched: Add runtime stats
    
    Extend the latency tracking structure with scheduling atom
    runtime info - and sum it up during per task display.
    
    (Also clean up a few details.)
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index a084c284e198..c382f530d4c6 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -243,8 +243,8 @@ add_sched_event_run(struct task_desc *task, nsec_t timestamp, u64 duration)
 	nr_run_events++;
 }
 
-static unsigned long targetless_wakeups;
-static unsigned long multitarget_wakeups;
+static unsigned long		targetless_wakeups;
+static unsigned long		multitarget_wakeups;
 
 static void
 add_sched_event_wakeup(struct task_desc *task, nsec_t timestamp,
@@ -485,10 +485,10 @@ static void create_tasks(void)
 	}
 }
 
-static nsec_t cpu_usage;
-static nsec_t runavg_cpu_usage;
-static nsec_t parent_cpu_usage;
-static nsec_t runavg_parent_cpu_usage;
+static nsec_t			cpu_usage;
+static nsec_t			runavg_cpu_usage;
+static nsec_t			parent_cpu_usage;
+static nsec_t			runavg_parent_cpu_usage;
 
 static void wait_for_tasks(void)
 {
@@ -858,9 +858,9 @@ replay_fork_event(struct trace_fork_event *fork_event,
 }
 
 static struct trace_sched_handler replay_ops  = {
-	.wakeup_event = replay_wakeup_event,
-	.switch_event = replay_switch_event,
-	.fork_event = replay_fork_event,
+	.wakeup_event		= replay_wakeup_event,
+	.switch_event		= replay_switch_event,
+	.fork_event		= replay_fork_event,
 };
 
 #define TASK_STATE_TO_CHAR_STR "RSDTtZX"
@@ -877,6 +877,7 @@ struct lat_snapshot {
 	enum thread_state	state;
 	u64			wake_up_time;
 	u64			sched_in_time;
+	u64			runtime;
 };
 
 struct thread_latency {
@@ -951,6 +952,7 @@ latency_fork_event(struct trace_fork_event *fork_event __used,
 	/* should insert the newcomer */
 }
 
+__used
 static char sched_out_state(struct trace_switch_event *switch_event)
 {
 	const char *str = TASK_STATE_TO_CHAR_STR;
@@ -960,17 +962,15 @@ static char sched_out_state(struct trace_switch_event *switch_event)
 
 static void
 lat_sched_out(struct thread_latency *lat,
-	     struct trace_switch_event *switch_event)
+	     struct trace_switch_event *switch_event __used, u64 delta)
 {
 	struct lat_snapshot *snapshot;
 
-	if (sched_out_state(switch_event) == 'R')
-		return;
-
 	snapshot = calloc(sizeof(*snapshot), 1);
 	if (!snapshot)
 		die("Non memory");
 
+	snapshot->runtime = delta;
 	list_add_tail(&snapshot->list, &lat->snapshot_list);
 }
 
@@ -997,16 +997,31 @@ lat_sched_in(struct thread_latency *lat, u64 timestamp)
 	snapshot->sched_in_time = timestamp;
 }
 
-
 static void
 latency_switch_event(struct trace_switch_event *switch_event,
 		     struct event *event __used,
-		     int cpu __used,
+		     int cpu,
 		     u64 timestamp,
 		     struct thread *thread __used)
 {
 	struct thread_latency *out_lat, *in_lat;
 	struct thread *sched_out, *sched_in;
+	u64 timestamp0;
+	s64 delta;
+
+	if (cpu >= MAX_CPUS || cpu < 0)
+		return;
+
+	timestamp0 = cpu_last_switched[cpu];
+	cpu_last_switched[cpu] = timestamp;
+	if (timestamp0)
+		delta = timestamp - timestamp0;
+	else
+		delta = 0;
+
+	if (delta < 0)
+		die("hm, delta: %Ld < 0 ?\n", delta);
+
 
 	sched_out = threads__findnew(switch_event->prev_pid, &threads, &last_match);
 	sched_in = threads__findnew(switch_event->next_pid, &threads, &last_match);
@@ -1028,7 +1043,7 @@ latency_switch_event(struct trace_switch_event *switch_event,
 	}
 
 	lat_sched_in(in_lat, timestamp);
-	lat_sched_out(out_lat, switch_event);
+	lat_sched_out(out_lat, switch_event, delta);
 }
 
 static void
@@ -1067,9 +1082,9 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
 }
 
 static struct trace_sched_handler lat_ops  = {
-	.wakeup_event = latency_wakeup_event,
-	.switch_event = latency_switch_event,
-	.fork_event = latency_fork_event,
+	.wakeup_event		= latency_wakeup_event,
+	.switch_event		= latency_switch_event,
+	.fork_event		= latency_fork_event,
 };
 
 static void output_lat_thread(struct thread_latency *lat)
@@ -1080,8 +1095,11 @@ static void output_lat_thread(struct thread_latency *lat)
 	int ret;
 	u64 max = 0, avg;
 	u64 total = 0, delta;
+	u64 total_runtime = 0;
 
 	list_for_each_entry(shot, &lat->snapshot_list, list) {
+		total_runtime += shot->runtime;
+
 		if (shot->state != THREAD_SCHED_IN)
 			continue;
 
@@ -1104,7 +1122,7 @@ static void output_lat_thread(struct thread_latency *lat)
 	avg = total / count;
 
 	printf("|%9.3f ms |%9d | avg:%9.3f ms | max:%9.3f ms |\n",
-		0.0, count, (double)avg/1e9, (double)max/1e9);
+		(double)total_runtime/1e9, count, (double)avg/1e9, (double)max/1e9);
 }
 
 static void __cmd_lat(void)

commit d9340c1db3f52460a8335eeb127a2728c5bba6ce
Author: Ingo Molnar <mingo@elte.hu>
Date:   Sat Sep 12 10:08:34 2009 +0200

    perf sched: Display time in milliseconds, reorganize output
    
    After:
    
    -----------------------------------------------------------------------------------
     Task              |  runtime ms | switches | average delay ms | maximum delay ms |
    -----------------------------------------------------------------------------------
     migration/0       |    0.000 ms |        1 | avg:    0.047 ms | max:    0.047 ms |
     ksoftirqd/0       |    0.000 ms |        1 | avg:    0.039 ms | max:    0.039 ms |
     migration/1       |    0.000 ms |        3 | avg:    0.013 ms | max:    0.016 ms |
     migration/3       |    0.000 ms |        2 | avg:    0.003 ms | max:    0.004 ms |
     migration/4       |    0.000 ms |        1 | avg:    0.022 ms | max:    0.022 ms |
     distccd           |    0.000 ms |        1 | avg:    0.004 ms | max:    0.004 ms |
     distccd           |    0.000 ms |        1 | avg:    0.014 ms | max:    0.014 ms |
     distccd           |    0.000 ms |        2 | avg:    0.000 ms | max:    0.000 ms |
     distccd           |    0.000 ms |        2 | avg:    0.012 ms | max:    0.019 ms |
     distccd           |    0.000 ms |        1 | avg:    0.002 ms | max:    0.002 ms |
     as                |    0.000 ms |        2 | avg:    0.019 ms | max:    0.019 ms |
     as                |    0.000 ms |        3 | avg:    0.015 ms | max:    0.017 ms |
     as                |    0.000 ms |        1 | avg:    0.009 ms | max:    0.009 ms |
     perf              |    0.000 ms |        1 | avg:    0.001 ms | max:    0.001 ms |
     gcc               |    0.000 ms |        1 | avg:    0.021 ms | max:    0.021 ms |
     run-mozilla.sh    |    0.000 ms |        2 | avg:    0.010 ms | max:    0.017 ms |
     mozilla-plugin-   |    0.000 ms |        1 | avg:    0.006 ms | max:    0.006 ms |
     gcc               |    0.000 ms |        2 | avg:    0.013 ms | max:    0.013 ms |
    -----------------------------------------------------------------------------------
    
    (The runtime ms column is not filled in yet.)
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 84699cf036ab..a084c284e198 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1096,14 +1096,15 @@ static void output_lat_thread(struct thread_latency *lat)
 	if (!count)
 		return;
 
-	ret = printf("%s", lat->thread->comm);
+	ret = printf(" %s ", lat->thread->comm);
 
-	for (i = 0; i < 25 - ret; i++)
+	for (i = 0; i < 19 - ret; i++)
 		printf(" ");
 
 	avg = total / count;
 
-	printf("%5d        %10llu       %10llu      %10llu\n", count, total, avg, max);
+	printf("|%9.3f ms |%9d | avg:%9.3f ms | max:%9.3f ms |\n",
+		0.0, count, (double)avg/1e9, (double)max/1e9);
 }
 
 static void __cmd_lat(void)
@@ -1113,11 +1114,9 @@ static void __cmd_lat(void)
 	setup_pager();
 	read_events();
 
-	printf(" Tasks");
-	printf("                     count");
-	printf("          total");
-	printf("              avg");
-	printf("            max\n\n");
+	printf("-----------------------------------------------------------------------------------\n");
+	printf(" Task              |  runtime ms | switches | average delay ms | maximum delay ms |\n");
+	printf("-----------------------------------------------------------------------------------\n");
 
 	next = rb_first(&lat_snapshot_root);
 
@@ -1128,6 +1127,8 @@ static void __cmd_lat(void)
 		output_lat_thread(lat);
 		next = rb_next(next);
 	}
+
+	printf("-----------------------------------------------------------------------------------\n");
 }
 
 static struct trace_sched_handler *trace_handler;
diff --git a/tools/perf/util/trace-event-read.c b/tools/perf/util/trace-event-read.c
index a1217a10632f..1b5c847d2c22 100644
--- a/tools/perf/util/trace-event-read.c
+++ b/tools/perf/util/trace-event-read.c
@@ -458,12 +458,13 @@ struct record *trace_read_data(int cpu)
 	return data;
 }
 
-void trace_report (void)
+void trace_report(void)
 {
 	const char *input_file = "trace.info";
 	char buf[BUFSIZ];
 	char test[] = { 23, 8, 68 };
 	char *version;
+	int show_version = 0;
 	int show_funcs = 0;
 	int show_printk = 0;
 
@@ -480,7 +481,8 @@ void trace_report (void)
 		die("not a trace file (missing tracing)");
 
 	version = read_string();
-	printf("version = %s\n", version);
+	if (show_version)
+		printf("version = %s\n", version);
 	free(version);
 
 	read_or_die(buf, 1);

commit 46f392c97f9fd772426ed3361c5179a0d44b8c3f
Author: Ingo Molnar <mingo@elte.hu>
Date:   Sat Sep 12 10:08:34 2009 +0200

    perf sched: Clean up latency and replay sub-commands
    
    - Separate the latency and the replay commands more cleanly
    
     - Use consistent naming
    
     - Display help page on 'perf sched' outlining comments,
       instead of aborting
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 4f9e943181a3..84699cf036ab 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -543,24 +543,7 @@ static void wait_for_tasks(void)
 	}
 }
 
-static int __cmd_sched(void);
-
-static void parse_trace(void)
-{
-	__cmd_sched();
-
-	printf("nr_run_events:        %ld\n", nr_run_events);
-	printf("nr_sleep_events:      %ld\n", nr_sleep_events);
-	printf("nr_wakeup_events:     %ld\n", nr_wakeup_events);
-
-	if (targetless_wakeups)
-		printf("target-less wakeups:  %ld\n", targetless_wakeups);
-	if (multitarget_wakeups)
-		printf("multi-target wakeups: %ld\n", multitarget_wakeups);
-	if (nr_run_events_optimized)
-		printf("run events optimized: %ld\n",
-			nr_run_events_optimized);
-}
+static int read_events(void);
 
 static unsigned long nr_runs;
 static nsec_t sum_runtime;
@@ -637,6 +620,38 @@ static void test_calibrations(void)
 	printf("the sleep test took %Ld nsecs\n", T1-T0);
 }
 
+static void __cmd_replay(void)
+{
+	long nr_iterations = 10, i;
+
+	calibrate_run_measurement_overhead();
+	calibrate_sleep_measurement_overhead();
+
+	test_calibrations();
+
+	read_events();
+
+	printf("nr_run_events:        %ld\n", nr_run_events);
+	printf("nr_sleep_events:      %ld\n", nr_sleep_events);
+	printf("nr_wakeup_events:     %ld\n", nr_wakeup_events);
+
+	if (targetless_wakeups)
+		printf("target-less wakeups:  %ld\n", targetless_wakeups);
+	if (multitarget_wakeups)
+		printf("multi-target wakeups: %ld\n", multitarget_wakeups);
+	if (nr_run_events_optimized)
+		printf("run events optimized: %ld\n",
+			nr_run_events_optimized);
+
+	print_task_traces();
+	add_cross_task_wakeups();
+
+	create_tasks();
+	printf("------------------------------------------------------------\n");
+	for (i = 0; i < nr_iterations; i++)
+		run_one_test();
+}
+
 static int
 process_comm_event(event_t *event, unsigned long offset, unsigned long head)
 {
@@ -1091,10 +1106,13 @@ static void output_lat_thread(struct thread_latency *lat)
 	printf("%5d        %10llu       %10llu      %10llu\n", count, total, avg, max);
 }
 
-static void output_lat_results(void)
+static void __cmd_lat(void)
 {
 	struct rb_node *next;
 
+	setup_pager();
+	read_events();
+
 	printf(" Tasks");
 	printf("                     count");
 	printf("          total");
@@ -1312,7 +1330,7 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
 	return 0;
 }
 
-static int __cmd_sched(void)
+static int read_events(void)
 {
 	int ret, rc = EXIT_FAILURE;
 	unsigned long offset = 0;
@@ -1408,8 +1426,8 @@ static int __cmd_sched(void)
 	return rc;
 }
 
-static const char * const annotate_usage[] = {
-	"perf trace [<options>] <command>",
+static const char * const sched_usage[] = {
+	"perf sched [<options>] <command>",
 	NULL
 };
 
@@ -1427,49 +1445,30 @@ static const struct option options[] = {
 
 int cmd_sched(int argc, const char **argv, const char *prefix __used)
 {
-	long nr_iterations = 10, i;
-
 	symbol__init();
 	page_size = getpagesize();
 
-	argc = parse_options(argc, argv, options, annotate_usage, 0);
+	argc = parse_options(argc, argv, options, sched_usage, 0);
 	if (argc) {
 		/*
 		 * Special case: if there's an argument left then assume tha
 		 * it's a symbol filter:
 		 */
 		if (argc > 1)
-			usage_with_options(annotate_usage, options);
+			usage_with_options(sched_usage, options);
 	}
 
-//	setup_pager();
-
 	if (replay_mode)
 		trace_handler = &replay_ops;
 	else if (lat_mode)
 		trace_handler = &lat_ops;
-	else /* We may need a default subcommand (perf trace?) */
-		die("Please select a sub command (-r)\n");
-
-	if (replay_mode) {
-		calibrate_run_measurement_overhead();
-		calibrate_sleep_measurement_overhead();
-
-		test_calibrations();
-
-		parse_trace();
-		print_task_traces();
-		add_cross_task_wakeups();
-
-		create_tasks();
-		printf("------------------------------------------------------------\n");
-		for (i = 0; i < nr_iterations; i++)
-			run_one_test();
-	} else if (lat_mode) {
-		setup_pager();
-		__cmd_sched();
-		output_lat_results();
-	}
+	else
+		usage_with_options(sched_usage, options);
+
+	if (replay_mode)
+		__cmd_replay();
+	else if (lat_mode)
+		__cmd_lat();
 
 	return 0;
 }

commit ad236fd23b6d6372dcacd549983cce051d2ccff6
Author: Ingo Molnar <mingo@elte.hu>
Date:   Fri Sep 11 12:12:54 2009 +0200

    perf sched: Tighten up the code
    
    Various small cleanups - removal of debug printks and dead
    functions, etc.
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 6ec4f51d536b..de93a2604528 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -61,29 +61,6 @@ static u64			sample_type;
 
 typedef unsigned long long nsec_t;
 
-#define printk(x...)		do { printf(x); fflush(stdout); } while (0)
-
-nsec_t prev_printk;
-
-#define __dprintk(x,y...) do {						 \
-	nsec_t __now = get_nsecs(), __delta = __now - prev_printk;	 \
-									 \
-	prev_printk = __now;						 \
-									 \
-	printf("%.3f [%Ld] [%.3f]: " x, (double)__now/1e6, __now, (double)__delta/1e6, y);\
-} while (0)
-
-#if !DEBUG
-# define dprintk(x...)	do { } while (0)
-#else
-# define dprintk(x...)	__dprintk(x)
-#endif
-
-#define __DP()		__dprintk("parent: line %d\n", __LINE__)
-#define DP()		dprintk("parent: line %d\n", __LINE__)
-#define D()		dprintk("task %ld: line %d\n", this_task->nr, __LINE__)
-
-
 static nsec_t run_measurement_overhead;
 static nsec_t sleep_measurement_overhead;
 
@@ -129,7 +106,7 @@ static void calibrate_run_measurement_overhead(void)
 	}
 	run_measurement_overhead = min_delta;
 
-	printk("run measurement overhead: %Ld nsecs\n", min_delta);
+	printf("run measurement overhead: %Ld nsecs\n", min_delta);
 }
 
 static void calibrate_sleep_measurement_overhead(void)
@@ -147,7 +124,7 @@ static void calibrate_sleep_measurement_overhead(void)
 	min_delta -= 10000;
 	sleep_measurement_overhead = min_delta;
 
-	printk("sleep measurement overhead: %Ld nsecs\n", min_delta);
+	printf("sleep measurement overhead: %Ld nsecs\n", min_delta);
 }
 
 #define COMM_LEN	20
@@ -293,7 +270,7 @@ add_sched_event_wakeup(struct task_desc *task, nsec_t timestamp,
 
 static void
 add_sched_event_sleep(struct task_desc *task, nsec_t timestamp,
-		      unsigned long uninterruptible __used)
+		      u64 task_state __used)
 {
 	struct sched_event *event = get_new_event(task, timestamp);
 
@@ -329,77 +306,13 @@ static struct task_desc *register_pid(unsigned long pid, const char *comm)
 	BUG_ON(!tasks);
 	tasks[task->nr] = task;
 
-	printk("registered task #%ld, PID %ld (%s)\n", nr_tasks, pid, comm);
+	if (verbose)
+		printf("registered task #%ld, PID %ld (%s)\n", nr_tasks, pid, comm);
 
 	return task;
 }
 
 
-static int first_trace_line = 1;
-
-static nsec_t first_timestamp;
-static nsec_t prev_timestamp;
-
-void parse_line(char *line);
-
-void parse_line(char *line)
-{
-	unsigned long param1 = 0, param2 = 0;
-	char comm[COMM_LEN], comm2[COMM_LEN];
-	unsigned long pid, pid2, timestamp0;
-	struct task_desc *task, *task2;
-	char func_str[SYM_LEN];
-	nsec_t timestamp;
-	int ret;
-
-	//"   <idle> 0     0D.s3    0us+: try_to_wake_up <events/0 9> (1 0)"
-	ret = sscanf(line, "%20s %5ld %*s %ldus%*c:"
-			   " %128s <%20s %ld> (%ld %ld)\n",
-		comm, &pid, &timestamp0,
-		func_str, comm2, &pid2, &param1, &param2);
-	dprintk("ret: %d\n", ret);
-	if (ret != 8)
-		return;
-
-	timestamp = timestamp0 * 1000LL;
-
-	if (first_trace_line) {
-		first_trace_line = 0;
-		first_timestamp = timestamp;
-	}
-
-	timestamp -= first_timestamp;
-	BUG_ON(timestamp < prev_timestamp);
-	prev_timestamp = timestamp;
-
-	dprintk("parsed: %s - %ld %Ld: %s - <%s %ld> (%ld %ld)\n",
-		comm,
-		pid,
-		timestamp,
-		func_str,
-		comm2,
-		pid2,
-		param1,
-		param2);
-
-	task = register_pid(pid, comm);
-	task2 = register_pid(pid2, comm2);
-
-	if (!strcmp(func_str, "update_curr")) {
-		dprintk("%Ld: task %ld runs for %ld nsecs\n",
-			timestamp, task->nr, param1);
-		add_sched_event_run(task, timestamp, param1);
-	} else if (!strcmp(func_str, "try_to_wake_up")) {
-		dprintk("%Ld: task %ld wakes up task %ld\n",
-			timestamp, task->nr, task2->nr);
-		add_sched_event_wakeup(task, timestamp, task2);
-	} else if (!strcmp(func_str, "deactivate_task")) {
-		dprintk("%Ld: task %ld goes to sleep (uninterruptible: %ld)\n",
-			timestamp, task->nr, param1);
-		add_sched_event_sleep(task, timestamp, param1);
-	}
-}
-
 static void print_task_traces(void)
 {
 	struct task_desc *task;
@@ -407,7 +320,7 @@ static void print_task_traces(void)
 
 	for (i = 0; i < nr_tasks; i++) {
 		task = tasks[i];
-		printk("task %6ld (%20s:%10ld), nr_events: %ld\n",
+		printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
 			task->nr, task->comm, task->pid, task->nr_events);
 	}
 }
@@ -437,40 +350,16 @@ process_sched_event(struct task_desc *this_task __used, struct sched_event *even
 	now = get_nsecs();
 	delta = start_time + event->timestamp - now;
 
-	dprintk("task %ld, event #%ld, %Ld, delta: %.3f (%Ld)\n",
-		this_task->nr, event->nr, event->timestamp,
-		(double)delta/1e6, delta);
-
-	if (0 && delta > 0) {
-		dprintk("%.3f: task %ld FIX %.3f\n",
-			(double)event->timestamp/1e6,
-			this_task->nr,
-			(double)delta/1e6);
-		sleep_nsecs(start_time + event->timestamp - now);
-		nr_sleep_corrections++;
-	}
-
 	switch (event->type) {
 		case SCHED_EVENT_RUN:
-			dprintk("%.3f: task %ld RUN for %.3f\n",
-				(double)event->timestamp/1e6,
-				this_task->nr,
-				(double)event->duration/1e6);
 			burn_nsecs(event->duration);
 			break;
 		case SCHED_EVENT_SLEEP:
-			dprintk("%.3f: task %ld %s SLEEP\n",
-				(double)event->timestamp/1e6,
-				this_task->nr, event->wait_sem ? "" : "SKIP");
 			if (event->wait_sem)
 				ret = sem_wait(event->wait_sem);
 			BUG_ON(ret);
 			break;
 		case SCHED_EVENT_WAKEUP:
-			dprintk("%.3f: task %ld WAKEUP => task %ld\n",
-				(double)event->timestamp/1e6,
-				this_task->nr,
-				event->wakee->nr);
 			if (event->wait_sem)
 				ret = sem_post(event->wait_sem);
 			BUG_ON(ret);
@@ -511,14 +400,10 @@ static nsec_t get_cpu_usage_nsec_self(void)
 	BUG_ON(!file);
 
 	while ((chars = getline(&line, &len, file)) != -1) {
-		dprintk("got line with length %zu :\n", chars);
-		dprintk("%s", line);
 		ret = sscanf(line, "se.sum_exec_runtime : %ld.%06ld\n",
 			&msecs, &nsecs);
 		if (ret == 2) {
 			total = msecs*1e6 + nsecs;
-			dprintk("total: (%ld.%06ld) %Ld\n",
-				msecs, nsecs, total);
 			break;
 		}
 	}
@@ -536,19 +421,16 @@ static void *thread_func(void *ctx)
 	unsigned long i, ret;
 	char comm2[22];
 
-	dprintk("task %ld started up.\n", this_task->nr);
 	sprintf(comm2, ":%s", this_task->comm);
 	prctl(PR_SET_NAME, comm2);
 
 again:
 	ret = sem_post(&this_task->ready_for_work);
 	BUG_ON(ret);
-	D();
 	ret = pthread_mutex_lock(&start_work_mutex);
 	BUG_ON(ret);
 	ret = pthread_mutex_unlock(&start_work_mutex);
 	BUG_ON(ret);
-	D();
 
 	cpu_usage_0 = get_cpu_usage_nsec_self();
 
@@ -560,19 +442,13 @@ static void *thread_func(void *ctx)
 	cpu_usage_1 = get_cpu_usage_nsec_self();
 	this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
 
-	dprintk("task %ld cpu usage: %0.3f msecs\n",
-		this_task->nr, (double)this_task->cpu_usage / 1e6);
-
-	D();
 	ret = sem_post(&this_task->work_done_sem);
 	BUG_ON(ret);
-	D();
 
 	ret = pthread_mutex_lock(&work_done_wait_mutex);
 	BUG_ON(ret);
 	ret = pthread_mutex_unlock(&work_done_wait_mutex);
 	BUG_ON(ret);
-	D();
 
 	goto again;
 }
@@ -614,9 +490,7 @@ static void wait_for_tasks(void)
 	struct task_desc *task;
 	unsigned long i, ret;
 
-	DP();
 	start_time = get_nsecs();
-	DP();
 	cpu_usage = 0;
 	pthread_mutex_unlock(&work_done_wait_mutex);
 
@@ -633,24 +507,6 @@ static void wait_for_tasks(void)
 
 	pthread_mutex_unlock(&start_work_mutex);
 
-#if 0
-	for (i = 0; i < nr_tasks; i++) {
-		unsigned long missed;
-
-		task = tasks[i];
-		while (task->curr_event + 1 < task->nr_events) {
-			dprintk("parent waiting for %ld (%ld != %ld)\n",
-				i, task->curr_event, task->nr_events);
-			sleep_nsecs(100000000);
-		}
-		missed = task->nr_events - 1 - task->curr_event;
-		if (missed)
-			printk("task %ld missed events: %ld\n", i, missed);
-		ret = sem_post(&task->sleep_sem);
-		BUG_ON(ret);
-	}
-#endif
-	DP();
 	for (i = 0; i < nr_tasks; i++) {
 		task = tasks[i];
 		ret = sem_wait(&task->work_done_sem);
@@ -687,16 +543,16 @@ static void parse_trace(void)
 {
 	__cmd_sched();
 
-	printk("nr_run_events:        %ld\n", nr_run_events);
-	printk("nr_sleep_events:      %ld\n", nr_sleep_events);
-	printk("nr_wakeup_events:     %ld\n", nr_wakeup_events);
+	printf("nr_run_events:        %ld\n", nr_run_events);
+	printf("nr_sleep_events:      %ld\n", nr_sleep_events);
+	printf("nr_wakeup_events:     %ld\n", nr_wakeup_events);
 
 	if (targetless_wakeups)
-		printk("target-less wakeups:  %ld\n", targetless_wakeups);
+		printf("target-less wakeups:  %ld\n", targetless_wakeups);
 	if (multitarget_wakeups)
-		printk("multi-target wakeups: %ld\n", multitarget_wakeups);
+		printf("multi-target wakeups: %ld\n", multitarget_wakeups);
 	if (nr_run_events_optimized)
-		printk("run events optimized: %ld\n",
+		printf("run events optimized: %ld\n",
 			nr_run_events_optimized);
 }
 
@@ -728,17 +584,17 @@ static void run_one_test(void)
 		run_avg = delta;
 	run_avg = (run_avg*9 + delta)/10;
 
-	printk("#%-3ld: %0.3f, ",
+	printf("#%-3ld: %0.3f, ",
 		nr_runs, (double)delta/1000000.0);
 
 #if 0
-	printk("%0.2f +- %0.2f, ",
+	printf("%0.2f +- %0.2f, ",
 		(double)avg_delta/1e6, (double)std_dev/1e6);
 #endif
-	printk("ravg: %0.2f, ",
+	printf("ravg: %0.2f, ",
 		(double)run_avg/1e6);
 
-	printk("cpu: %0.2f / %0.2f",
+	printf("cpu: %0.2f / %0.2f",
 		(double)cpu_usage/1e6, (double)runavg_cpu_usage/1e6);
 
 #if 0
@@ -746,15 +602,15 @@ static void run_one_test(void)
 	 * rusage statistics done by the parent, these are less
 	 * accurate than the sum_exec_runtime based statistics:
 	 */
-	printk(" [%0.2f / %0.2f]",
+	printf(" [%0.2f / %0.2f]",
 		(double)parent_cpu_usage/1e6,
 		(double)runavg_parent_cpu_usage/1e6);
 #endif
 
-	printk("\n");
+	printf("\n");
 
 	if (nr_sleep_corrections)
-		printk(" (%ld sleep corrections)\n", nr_sleep_corrections);
+		printf(" (%ld sleep corrections)\n", nr_sleep_corrections);
 	nr_sleep_corrections = 0;
 }
 
@@ -766,13 +622,13 @@ static void test_calibrations(void)
 	burn_nsecs(1e6);
 	T1 = get_nsecs();
 
-	printk("the run test took %Ld nsecs\n", T1-T0);
+	printf("the run test took %Ld nsecs\n", T1-T0);
 
 	T0 = get_nsecs();
 	sleep_nsecs(1e6);
 	T1 = get_nsecs();
 
-	printk("the sleep test took %Ld nsecs\n", T1-T0);
+	printf("the sleep test took %Ld nsecs\n", T1-T0);
 }
 
 static int
@@ -820,12 +676,14 @@ process_sched_wakeup_event(struct trace_wakeup_event *wakeup_event, struct event
 {
 	struct task_desc *waker, *wakee;
 
-	printf("sched_wakeup event %p\n", event);
+	if (verbose) {
+		printf("sched_wakeup event %p\n", event);
 
-	printf(" ... pid %d woke up %s/%d\n",
-		wakeup_event->common_pid,
-		wakeup_event->comm,
-		wakeup_event->pid);
+		printf(" ... pid %d woke up %s/%d\n",
+			wakeup_event->common_pid,
+			wakeup_event->comm,
+			wakeup_event->pid);
+	}
 
 	waker = register_pid(wakeup_event->common_pid, "<unknown>");
 	wakee = register_pid(wakeup_event->pid, wakeup_event->comm);
@@ -863,7 +721,9 @@ process_sched_switch_event(struct trace_switch_event *switch_event, struct event
 	u64 timestamp0;
 	s64 delta;
 
-	printf("sched_switch event %p\n", event);
+	if (verbose)
+		printf("sched_switch event %p\n", event);
+
 	if (cpu >= MAX_CPUS || cpu < 0)
 		return;
 
@@ -876,10 +736,12 @@ process_sched_switch_event(struct trace_switch_event *switch_event, struct event
 	if (delta < 0)
 		die("hm, delta: %Ld < 0 ?\n", delta);
 
-	printf(" ... switch from %s/%d to %s/%d [ran %Ld nsecs]\n",
-		switch_event->prev_comm, switch_event->prev_pid,
-		switch_event->next_comm, switch_event->next_pid,
-		delta);
+	if (verbose) {
+		printf(" ... switch from %s/%d to %s/%d [ran %Ld nsecs]\n",
+			switch_event->prev_comm, switch_event->prev_pid,
+			switch_event->next_comm, switch_event->next_pid,
+			delta);
+	}
 
 	prev = register_pid(switch_event->prev_pid, switch_event->prev_comm);
 	next = register_pid(switch_event->next_pid, switch_event->next_comm);
@@ -887,6 +749,7 @@ process_sched_switch_event(struct trace_switch_event *switch_event, struct event
 	cpu_last_switched[cpu] = timestamp;
 
 	add_sched_event_run(prev, timestamp, delta);
+	add_sched_event_sleep(prev, timestamp, switch_event->prev_state);
 }
 
 struct trace_fork_event {
@@ -908,9 +771,11 @@ static void
 process_sched_fork_event(struct trace_fork_event *fork_event, struct event *event,
 		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
 {
-	printf("sched_fork event %p\n", event);
-	printf("... parent: %s/%d\n", fork_event->parent_comm, fork_event->parent_pid);
-	printf("...  child: %s/%d\n", fork_event->child_comm, fork_event->child_pid);
+	if (verbose) {
+		printf("sched_fork event %p\n", event);
+		printf("... parent: %s/%d\n", fork_event->parent_comm, fork_event->parent_pid);
+		printf("...  child: %s/%d\n", fork_event->child_comm, fork_event->child_pid);
+	}
 	register_pid(fork_event->parent_pid, fork_event->parent_comm);
 	register_pid(fork_event->child_pid, fork_event->child_comm);
 }
@@ -918,11 +783,12 @@ process_sched_fork_event(struct trace_fork_event *fork_event, struct event *even
 static void process_sched_exit_event(struct event *event,
 		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
 {
-	printf("sched_exit event %p\n", event);
+	if (verbose)
+		printf("sched_exit event %p\n", event);
 }
 
 static void
-process_raw_event(event_t *raw_event, void *more_data,
+process_raw_event(event_t *raw_event __used, void *more_data,
 		  int cpu, u64 timestamp, struct thread *thread)
 {
 	struct {
@@ -935,14 +801,6 @@ process_raw_event(event_t *raw_event, void *more_data,
 	type = trace_parse_common_type(raw->data);
 	event = trace_find_event(type);
 
-	/*
-	 * FIXME: better resolve from pid from the struct trace_entry
-	 * field, although it should be the same than this perf
-	 * event pid
-	 */
-	printf("id %d, type: %d, event: %s\n",
-		raw_event->header.type, type, event->name);
-
 	if (!strcmp(event->name, "sched_switch"))
 		process_sched_switch_event(more_data, event, cpu, timestamp, thread);
 	if (!strcmp(event->name, "sched_wakeup"))
@@ -1197,7 +1055,7 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
 	add_cross_task_wakeups();
 
 	create_tasks();
-	printk("------------------------------------------------------------\n");
+	printf("------------------------------------------------------------\n");
 	for (i = 0; i < nr_iterations; i++)
 		run_one_test();

commit fbf9482911825f965829567aea8acff3bbc5279c
Author: Ingo Molnar <mingo@elte.hu>
Date:   Fri Sep 11 12:12:54 2009 +0200

    perf sched: Implement the scheduling workload replay engine
    
    Integrate the schedbench.c bits with the raw trace events
    that we get from the perf machinery, and activate the
    workload replayer/simulator.
    
    Example of a captured 'make -j' workload:
    
    $ perf sched
    
      run measurement overhead: 90 nsecs
      sleep measurement overhead: 2724743 nsecs
      the run test took 1000081 nsecs
      the sleep test took 2981111 nsecs
      version = 0.5
      ...
      nr_run_events:        70
      nr_sleep_events:      66
      nr_wakeup_events:     9
      target-less wakeups:  71
      multi-target wakeups: 47
      run events optimized: 139
      task      0 (                perf:      6607), nr_events: 2
      task      1 (                perf:      6608), nr_events: 6
      task      2 (                    :         0), nr_events: 1
      task      3 (                make:      6609), nr_events: 5
      task      4 (                  sh:      6610), nr_events: 4
      task      5 (                make:      6611), nr_events: 6
      task      6 (                  sh:      6612), nr_events: 4
      task      7 (                make:      6613), nr_events: 5
      task      8 (        migration/11:        25), nr_events: 1
      task      9 (        migration/13:        29), nr_events: 1
      task     10 (        migration/15:        33), nr_events: 1
      task     11 (         migration/9:        21), nr_events: 1
      task     12 (                  sh:      6614), nr_events: 4
      task     13 (                make:      6615), nr_events: 5
      task     14 (                  sh:      6616), nr_events: 4
      task     15 (                make:      6617), nr_events: 7
      task     16 (         migration/3:         9), nr_events: 1
      task     17 (         migration/5:        13), nr_events: 1
      task     18 (         migration/7:        17), nr_events: 1
      task     19 (         migration/1:         5), nr_events: 1
      task     20 (                  sh:      6618), nr_events: 4
      task     21 (                make:      6619), nr_events: 5
      task     22 (                  sh:      6620), nr_events: 4
      task     23 (                make:      6621), nr_events: 10
      task     24 (                  sh:      6623), nr_events: 3
      task     25 (                 gcc:      6624), nr_events: 4
      task     26 (                 gcc:      6625), nr_events: 4
      task     27 (                 gcc:      6626), nr_events: 5
      task     28 (            collect2:      6627), nr_events: 5
      task     29 (                  sh:      6622), nr_events: 1
      task     30 (                make:      6628), nr_events: 7
      task     31 (                  sh:      6630), nr_events: 4
      task     32 (                 gcc:      6631), nr_events: 4
      task     33 (                  sh:      6629), nr_events: 1
      task     34 (                 gcc:      6632), nr_events: 4
      task     35 (                 gcc:      6633), nr_events: 4
      task     36 (            collect2:      6634), nr_events: 4
      task     37 (                make:      6635), nr_events: 8
      task     38 (                  sh:      6637), nr_events: 4
      task     39 (                  sh:      6636), nr_events: 1
      task     40 (                 gcc:      6638), nr_events: 4
      task     41 (                 gcc:      6639), nr_events: 4
      task     42 (                 gcc:      6640), nr_events: 4
      task     43 (            collect2:      6641), nr_events: 4
      task     44 (                make:      6642), nr_events: 6
      task     45 (                  sh:      6643), nr_events: 5
      task     46 (                  sh:      6644), nr_events: 3
      task     47 (                  sh:      6645), nr_events: 4
      task     48 (                make:      6646), nr_events: 6
      task     49 (                  sh:      6647), nr_events: 3
      task     50 (                make:      6648), nr_events: 5
      task     51 (                  sh:      6649), nr_events: 5
      task     52 (                  sh:      6650), nr_events: 6
      task     53 (                make:      6651), nr_events: 4
      task     54 (                make:      6652), nr_events: 5
      task     55 (                make:      6653), nr_events: 4
      task     56 (                make:      6654), nr_events: 4
      task     57 (                make:      6655), nr_events: 5
      task     58 (                  sh:      6656), nr_events: 4
      task     59 (                 gcc:      6657), nr_events: 9
      task     60 (         ksoftirqd/3:        10), nr_events: 1
      task     61 (                 gcc:      6658), nr_events: 4
      task     62 (                make:      6659), nr_events: 5
      task     63 (                  sh:      6660), nr_events: 3
      task     64 (                 gcc:      6661), nr_events: 5
      task     65 (            collect2:      6662), nr_events: 4
      ------------------------------------------------------------
      #1  : 256.745, ravg: 256.74, cpu: 0.00 / 0.00
      #2  : 439.372, ravg: 275.01, cpu: 0.00 / 0.00
      #3  : 411.971, ravg: 288.70, cpu: 0.00 / 0.00
      #4  : 385.500, ravg: 298.38, cpu: 0.00 / 0.00
      #5  : 366.526, ravg: 305.20, cpu: 0.00 / 0.00
      #6  : 381.281, ravg: 312.81, cpu: 0.00 / 0.00
      #7  : 410.756, ravg: 322.60, cpu: 0.00 / 0.00
      #8  : 368.009, ravg: 327.14, cpu: 0.00 / 0.00
      #9  : 408.098, ravg: 335.24, cpu: 0.00 / 0.00
      #10 : 368.582, ravg: 338.57, cpu: 0.00 / 0.00
    
    I.e. we successfully analyzed the trace, replayed it
    via real threads and measured the replayed workload's
    scheduling properties.
    
    This is how it looked like in 'top' output:
    
       PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
      7164 mingo     20   0 1434m 8080  888 R 57.0  0.1   0:02.04 :perf
      7165 mingo     20   0 1434m 8080  888 R 41.8  0.1   0:01.52 :perf
      7228 mingo     20   0 1434m 8080  888 R 39.8  0.1   0:01.44 :gcc
      7225 mingo     20   0 1434m 8080  888 R 33.8  0.1   0:01.26 :gcc
      7202 mingo     20   0 1434m 8080  888 R 31.2  0.1   0:01.16 :sh
      7222 mingo     20   0 1434m 8080  888 R 25.2  0.1   0:00.96 :sh
      7211 mingo     20   0 1434m 8080  888 R 21.9  0.1   0:00.82 :sh
      7213 mingo     20   0 1434m 8080  888 D 19.2  0.1   0:00.74 :sh
      7194 mingo     20   0 1434m 8080  888 D 18.6  0.1   0:00.72 :make
    
    There's still various kinks in it - more patches to come.
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index c66e6a321371..6ec4f51d536b 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -57,7 +57,7 @@ static u64			sample_type;
 
 #define BUG_ON(x)	assert(!(x))
 
-#define DEBUG		1
+#define DEBUG		0
 
 typedef unsigned long long nsec_t;
 
@@ -238,15 +238,14 @@ static struct sched_event *last_event(struct task_desc *task)
 }
 
 static void
-add_sched_event_run(struct task_desc *task, nsec_t timestamp,
-		    unsigned long duration)
+add_sched_event_run(struct task_desc *task, nsec_t timestamp, u64 duration)
 {
 	struct sched_event *event, *curr_event = last_event(task);
 
 	/*
- 	 * optimize an existing RUN event by merging this one
- 	 * to it:
- 	 */
+	 * optimize an existing RUN event by merging this one
+	 * to it:
+	 */
 	if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
 		nr_run_events_optimized++;
 		curr_event->duration += duration;
@@ -376,7 +375,7 @@ void parse_line(char *line)
 	dprintk("parsed: %s - %ld %Ld: %s - <%s %ld> (%ld %ld)\n",
 		comm,
 		pid,
-		timestamp, 
+		timestamp,
 		func_str,
 		comm2,
 		pid2,
@@ -429,7 +428,7 @@ static void add_cross_task_wakeups(void)
 }
 
 static void
-process_sched_event(struct task_desc *this_task, struct sched_event *event)
+process_sched_event(struct task_desc *this_task __used, struct sched_event *event)
 {
 	int ret = 0;
 	nsec_t now;
@@ -744,9 +743,9 @@ static void run_one_test(void)
 
 #if 0
 	/*
- 	 * rusage statistics done by the parent, these are less
- 	 * accurate than the sum_exec_runtime based statistics:
- 	 */
+	 * rusage statistics done by the parent, these are less
+	 * accurate than the sum_exec_runtime based statistics:
+	 */
 	printk(" [%0.2f / %0.2f]",
 		(double)parent_cpu_usage/1e6,
 		(double)runavg_parent_cpu_usage/1e6);
@@ -798,16 +797,128 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head)
 	return 0;
 }
 
-static void process_sched_wakeup_event(struct event *event,
+struct trace_wakeup_event {
+	u32 size;
+
+	u16 common_type;
+	u8 common_flags;
+	u8 common_preempt_count;
+	u32 common_pid;
+	u32 common_tgid;
+
+	char comm[16];
+	u32 pid;
+
+	u32 prio;
+	u32 success;
+	u32 cpu;
+};
+
+static void
+process_sched_wakeup_event(struct trace_wakeup_event *wakeup_event, struct event *event,
 		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
 {
+	struct task_desc *waker, *wakee;
+
 	printf("sched_wakeup event %p\n", event);
+
+	printf(" ... pid %d woke up %s/%d\n",
+		wakeup_event->common_pid,
+		wakeup_event->comm,
+		wakeup_event->pid);
+
+	waker = register_pid(wakeup_event->common_pid, "<unknown>");
+	wakee = register_pid(wakeup_event->pid, wakeup_event->comm);
+
+	add_sched_event_wakeup(waker, timestamp, wakee);
 }
 
-static void process_sched_switch_event(struct event *event,
+struct trace_switch_event {
+	u32 size;
+
+	u16 common_type;
+	u8 common_flags;
+	u8 common_preempt_count;
+	u32 common_pid;
+	u32 common_tgid;
+
+	char prev_comm[16];
+	u32 prev_pid;
+	u32 prev_prio;
+	u64 prev_state;
+	char next_comm[16];
+	u32 next_pid;
+	u32 next_prio;
+};
+
+#define MAX_CPUS 4096
+
+unsigned long cpu_last_switched[MAX_CPUS];
+
+static void
+process_sched_switch_event(struct trace_switch_event *switch_event, struct event *event,
 		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
 {
+	struct task_desc *prev, *next;
+	u64 timestamp0;
+	s64 delta;
+
 	printf("sched_switch event %p\n", event);
+	if (cpu >= MAX_CPUS || cpu < 0)
+		return;
+
+	timestamp0 = cpu_last_switched[cpu];
+	if (timestamp0)
+		delta = timestamp - timestamp0;
+	else
+		delta = 0;
+
+	if (delta < 0)
+		die("hm, delta: %Ld < 0 ?\n", delta);
+
+	printf(" ... switch from %s/%d to %s/%d [ran %Ld nsecs]\n",
+		switch_event->prev_comm, switch_event->prev_pid,
+		switch_event->next_comm, switch_event->next_pid,
+		delta);
+
+	prev = register_pid(switch_event->prev_pid, switch_event->prev_comm);
+	next = register_pid(switch_event->next_pid, switch_event->next_comm);
+
+	cpu_last_switched[cpu] = timestamp;
+
+	add_sched_event_run(prev, timestamp, delta);
+}
+
+struct trace_fork_event {
+	u32 size;
+
+	u16 common_type;
+	u8 common_flags;
+	u8 common_preempt_count;
+	u32 common_pid;
+	u32 common_tgid;
+
+	char parent_comm[16];
+	u32 parent_pid;
+	char child_comm[16];
+	u32 child_pid;
+};
+
+static void
+process_sched_fork_event(struct trace_fork_event *fork_event, struct event *event,
+		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
+{
+	printf("sched_fork event %p\n", event);
+	printf("... parent: %s/%d\n", fork_event->parent_comm, fork_event->parent_pid);
+	printf("...  child: %s/%d\n", fork_event->child_comm, fork_event->child_pid);
+	register_pid(fork_event->parent_pid, fork_event->parent_comm);
+	register_pid(fork_event->child_pid, fork_event->child_comm);
+}
+
+static void process_sched_exit_event(struct event *event,
+		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
+{
+	printf("sched_exit event %p\n", event);
 }
 
 static void
@@ -833,9 +944,15 @@ process_raw_event(event_t *raw_event, void *more_data,
 		raw_event->header.type, type, event->name);
 
 	if (!strcmp(event->name, "sched_switch"))
-		process_sched_switch_event(event, cpu, timestamp, thread);
+		process_sched_switch_event(more_data, event, cpu, timestamp, thread);
 	if (!strcmp(event->name, "sched_wakeup"))
-		process_sched_wakeup_event(event, cpu, timestamp, thread);
+		process_sched_wakeup_event(more_data, event, cpu, timestamp, thread);
+	if (!strcmp(event->name, "sched_wakeup_new"))
+		process_sched_wakeup_event(more_data, event, cpu, timestamp, thread);
+	if (!strcmp(event->name, "sched_process_fork"))
+		process_sched_fork_event(more_data, event, cpu, timestamp, thread);
+	if (!strcmp(event->name, "sched_process_exit"))
+		process_sched_exit_event(event, cpu, timestamp, thread);
 }
 
 static int
@@ -1053,7 +1170,7 @@ static const struct option options[] = {
 
 int cmd_sched(int argc, const char **argv, const char *prefix __used)
 {
-	long nr_iterations = LONG_MAX, i;
+	long nr_iterations = 10, i;
 
 	symbol__init();
 	page_size = getpagesize();
@@ -1068,8 +1185,7 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
 			usage_with_options(annotate_usage, options);
 	}
 
-
-	setup_pager();
+//	setup_pager();
 
 	calibrate_run_measurement_overhead();
 	calibrate_sleep_measurement_overhead();

commit ec156764d424dd67283c2cd5e9f6f1b8388364ac
Author: Ingo Molnar <mingo@elte.hu>
Date:   Fri Sep 11 12:12:54 2009 +0200

    perf sched: Import schedbench.c
    
    Import the schedbench.c tool that i wrote some time ago to
    simulate scheduler behavior but never finished. It's a good
    basis for perf sched nevertheless.
    
    Most of its guts are not hooked up to the perf event loop
    yet - that will be done in the patches to come.
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 60228d9179ac..c66e6a321371 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -12,22 +12,770 @@
 #include "util/debug.h"
 
 #include "util/trace-event.h"
+#include <sys/types.h>
 
-static char		const *input_name = "perf.data";
-static int		input;
-static unsigned long	page_size;
-static unsigned long	mmap_window = 32;
+static char			const *input_name = "perf.data";
+static int			input;
+static unsigned long		page_size;
+static unsigned long		mmap_window = 32;
 
-static unsigned long	total = 0;
-static unsigned long	total_comm = 0;
+static unsigned long		total_comm = 0;
 
-static struct rb_root	threads;
-static struct thread	*last_match;
+static struct rb_root		threads;
+static struct thread		*last_match;
 
-static struct perf_header *header;
-static u64		sample_type;
+static struct perf_header	*header;
+static u64			sample_type;
 
 
+/*
+ * Scheduler benchmarks
+ */
+#include <sys/resource.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/prctl.h>
+
+#include <linux/unistd.h>
+
+#include <semaphore.h>
+#include <pthread.h>
+#include <signal.h>
+#include <values.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <fcntl.h>
+#include <time.h>
+#include <math.h>
+
+#include <stdio.h>
+
+#define PR_SET_NAME	15               /* Set process name */
+
+#define BUG_ON(x)	assert(!(x))
+
+#define DEBUG		1
+
+typedef unsigned long long nsec_t;
+
+#define printk(x...)		do { printf(x); fflush(stdout); } while (0)
+
+nsec_t prev_printk;
+
+#define __dprintk(x,y...) do {						 \
+	nsec_t __now = get_nsecs(), __delta = __now - prev_printk;	 \
+									 \
+	prev_printk = __now;						 \
+									 \
+	printf("%.3f [%Ld] [%.3f]: " x, (double)__now/1e6, __now, (double)__delta/1e6, y);\
+} while (0)
+
+#if !DEBUG
+# define dprintk(x...)	do { } while (0)
+#else
+# define dprintk(x...)	__dprintk(x)
+#endif
+
+#define __DP()		__dprintk("parent: line %d\n", __LINE__)
+#define DP()		dprintk("parent: line %d\n", __LINE__)
+#define D()		dprintk("task %ld: line %d\n", this_task->nr, __LINE__)
+
+
+static nsec_t run_measurement_overhead;
+static nsec_t sleep_measurement_overhead;
+
+static nsec_t get_nsecs(void)
+{
+	struct timespec ts;
+
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+
+	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
+}
+
+static void burn_nsecs(nsec_t nsecs)
+{
+	nsec_t T0 = get_nsecs(), T1;
+
+	do {
+		T1 = get_nsecs();
+	} while (T1 + run_measurement_overhead < T0 + nsecs);
+}
+
+static void sleep_nsecs(nsec_t nsecs)
+{
+	struct timespec ts;
+
+	ts.tv_nsec = nsecs % 999999999;
+	ts.tv_sec = nsecs / 999999999;
+
+	nanosleep(&ts, NULL);
+}
+
+static void calibrate_run_measurement_overhead(void)
+{
+	nsec_t T0, T1, delta, min_delta = 1000000000ULL;
+	int i;
+
+	for (i = 0; i < 10; i++) {
+		T0 = get_nsecs();
+		burn_nsecs(0);
+		T1 = get_nsecs();
+		delta = T1-T0;
+		min_delta = min(min_delta, delta);
+	}
+	run_measurement_overhead = min_delta;
+
+	printk("run measurement overhead: %Ld nsecs\n", min_delta);
+}
+
+static void calibrate_sleep_measurement_overhead(void)
+{
+	nsec_t T0, T1, delta, min_delta = 1000000000ULL;
+	int i;
+
+	for (i = 0; i < 10; i++) {
+		T0 = get_nsecs();
+		sleep_nsecs(10000);
+		T1 = get_nsecs();
+		delta = T1-T0;
+		min_delta = min(min_delta, delta);
+	}
+	min_delta -= 10000;
+	sleep_measurement_overhead = min_delta;
+
+	printk("sleep measurement overhead: %Ld nsecs\n", min_delta);
+}
+
+#define COMM_LEN	20
+#define SYM_LEN		129
+
+#define MAX_PID		65536
+
+static unsigned long nr_tasks;
+
+struct sched_event;
+
+struct task_desc {
+	unsigned long		nr;
+	unsigned long		pid;
+	char			comm[COMM_LEN];
+
+	unsigned long		nr_events;
+	unsigned long		curr_event;
+	struct sched_event	**events;
+
+	pthread_t		thread;
+	sem_t			sleep_sem;
+
+	sem_t			ready_for_work;
+	sem_t			work_done_sem;
+
+	nsec_t			cpu_usage;
+};
+
+enum sched_event_type {
+	SCHED_EVENT_RUN,
+	SCHED_EVENT_SLEEP,
+	SCHED_EVENT_WAKEUP,
+};
+
+struct sched_event {
+	enum sched_event_type	type;
+	nsec_t			timestamp;
+	nsec_t			duration;
+	unsigned long		nr;
+	int			specific_wait;
+	sem_t			*wait_sem;
+	struct task_desc	*wakee;
+};
+
+static struct task_desc		*pid_to_task[MAX_PID];
+
+static struct task_desc		**tasks;
+
+static pthread_mutex_t		start_work_mutex = PTHREAD_MUTEX_INITIALIZER;
+static nsec_t			start_time;
+
+static pthread_mutex_t		work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static unsigned long		nr_run_events;
+static unsigned long		nr_sleep_events;
+static unsigned long		nr_wakeup_events;
+
+static unsigned long		nr_sleep_corrections;
+static unsigned long		nr_run_events_optimized;
+
+static struct sched_event *
+get_new_event(struct task_desc *task, nsec_t timestamp)
+{
+	struct sched_event *event = calloc(1, sizeof(*event));
+	unsigned long idx = task->nr_events;
+	size_t size;
+
+	event->timestamp = timestamp;
+	event->nr = idx;
+
+	task->nr_events++;
+	size = sizeof(struct sched_event *) * task->nr_events;
+	task->events = realloc(task->events, size);
+	BUG_ON(!task->events);
+
+	task->events[idx] = event;
+
+	return event;
+}
+
+static struct sched_event *last_event(struct task_desc *task)
+{
+	if (!task->nr_events)
+		return NULL;
+
+	return task->events[task->nr_events - 1];
+}
+
+static void
+add_sched_event_run(struct task_desc *task, nsec_t timestamp,
+		    unsigned long duration)
+{
+	struct sched_event *event, *curr_event = last_event(task);
+
+	/*
+ 	 * optimize an existing RUN event by merging this one
+ 	 * to it:
+ 	 */
+	if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
+		nr_run_events_optimized++;
+		curr_event->duration += duration;
+		return;
+	}
+
+	event = get_new_event(task, timestamp);
+
+	event->type = SCHED_EVENT_RUN;
+	event->duration = duration;
+
+	nr_run_events++;
+}
+
+static unsigned long targetless_wakeups;
+static unsigned long multitarget_wakeups;
+
+static void
+add_sched_event_wakeup(struct task_desc *task, nsec_t timestamp,
+		       struct task_desc *wakee)
+{
+	struct sched_event *event, *wakee_event;
+
+	event = get_new_event(task, timestamp);
+	event->type = SCHED_EVENT_WAKEUP;
+	event->wakee = wakee;
+
+	wakee_event = last_event(wakee);
+	if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
+		targetless_wakeups++;
+		return;
+	}
+	if (wakee_event->wait_sem) {
+		multitarget_wakeups++;
+		return;
+	}
+
+	wakee_event->wait_sem = calloc(1, sizeof(*wakee_event->wait_sem));
+	sem_init(wakee_event->wait_sem, 0, 0);
+	wakee_event->specific_wait = 1;
+	event->wait_sem = wakee_event->wait_sem;
+
+	nr_wakeup_events++;
+}
+
+static void
+add_sched_event_sleep(struct task_desc *task, nsec_t timestamp,
+		      unsigned long uninterruptible __used)
+{
+	struct sched_event *event = get_new_event(task, timestamp);
+
+	event->type = SCHED_EVENT_SLEEP;
+
+	nr_sleep_events++;
+}
+
+static struct task_desc *register_pid(unsigned long pid, const char *comm)
+{
+	struct task_desc *task;
+
+	BUG_ON(pid >= MAX_PID);
+
+	task = pid_to_task[pid];
+
+	if (task)
+		return task;
+
+	task = calloc(1, sizeof(*task));
+	task->pid = pid;
+	task->nr = nr_tasks;
+	strcpy(task->comm, comm);
+	/*
+	 * every task starts in sleeping state - this gets ignored
+	 * if there's no wakeup pointing to this sleep state:
+	 */
+	add_sched_event_sleep(task, 0, 0);
+
+	pid_to_task[pid] = task;
+	nr_tasks++;
+	tasks = realloc(tasks, nr_tasks*sizeof(struct task_task *));
+	BUG_ON(!tasks);
+	tasks[task->nr] = task;
+
+	printk("registered task #%ld, PID %ld (%s)\n", nr_tasks, pid, comm);
+
+	return task;
+}
+
+
+static int first_trace_line = 1;
+
+static nsec_t first_timestamp;
+static nsec_t prev_timestamp;
+
+void parse_line(char *line);
+
+void parse_line(char *line)
+{
+	unsigned long param1 = 0, param2 = 0;
+	char comm[COMM_LEN], comm2[COMM_LEN];
+	unsigned long pid, pid2, timestamp0;
+	struct task_desc *task, *task2;
+	char func_str[SYM_LEN];
+	nsec_t timestamp;
+	int ret;
+
+	//"   <idle> 0     0D.s3    0us+: try_to_wake_up <events/0 9> (1 0)"
+	ret = sscanf(line, "%20s %5ld %*s %ldus%*c:"
+			   " %128s <%20s %ld> (%ld %ld)\n",
+		comm, &pid, &timestamp0,
+		func_str, comm2, &pid2, &param1, &param2);
+	dprintk("ret: %d\n", ret);
+	if (ret != 8)
+		return;
+
+	timestamp = timestamp0 * 1000LL;
+
+	if (first_trace_line) {
+		first_trace_line = 0;
+		first_timestamp = timestamp;
+	}
+
+	timestamp -= first_timestamp;
+	BUG_ON(timestamp < prev_timestamp);
+	prev_timestamp = timestamp;
+
+	dprintk("parsed: %s - %ld %Ld: %s - <%s %ld> (%ld %ld)\n",
+		comm,
+		pid,
+		timestamp, 
+		func_str,
+		comm2,
+		pid2,
+		param1,
+		param2);
+
+	task = register_pid(pid, comm);
+	task2 = register_pid(pid2, comm2);
+
+	if (!strcmp(func_str, "update_curr")) {
+		dprintk("%Ld: task %ld runs for %ld nsecs\n",
+			timestamp, task->nr, param1);
+		add_sched_event_run(task, timestamp, param1);
+	} else if (!strcmp(func_str, "try_to_wake_up")) {
+		dprintk("%Ld: task %ld wakes up task %ld\n",
+			timestamp, task->nr, task2->nr);
+		add_sched_event_wakeup(task, timestamp, task2);
+	} else if (!strcmp(func_str, "deactivate_task")) {
+		dprintk("%Ld: task %ld goes to sleep (uninterruptible: %ld)\n",
+			timestamp, task->nr, param1);
+		add_sched_event_sleep(task, timestamp, param1);
+	}
+}
+
+static void print_task_traces(void)
+{
+	struct task_desc *task;
+	unsigned long i;
+
+	for (i = 0; i < nr_tasks; i++) {
+		task = tasks[i];
+		printk("task %6ld (%20s:%10ld), nr_events: %ld\n",
+			task->nr, task->comm, task->pid, task->nr_events);
+	}
+}
+
+static void add_cross_task_wakeups(void)
+{
+	struct task_desc *task1, *task2;
+	unsigned long i, j;
+
+	for (i = 0; i < nr_tasks; i++) {
+		task1 = tasks[i];
+		j = i + 1;
+		if (j == nr_tasks)
+			j = 0;
+		task2 = tasks[j];
+		add_sched_event_wakeup(task1, 0, task2);
+	}
+}
+
+static void
+process_sched_event(struct task_desc *this_task, struct sched_event *event)
+{
+	int ret = 0;
+	nsec_t now;
+	long long delta;
+
+	now = get_nsecs();
+	delta = start_time + event->timestamp - now;
+
+	dprintk("task %ld, event #%ld, %Ld, delta: %.3f (%Ld)\n",
+		this_task->nr, event->nr, event->timestamp,
+		(double)delta/1e6, delta);
+
+	if (0 && delta > 0) {
+		dprintk("%.3f: task %ld FIX %.3f\n",
+			(double)event->timestamp/1e6,
+			this_task->nr,
+			(double)delta/1e6);
+		sleep_nsecs(start_time + event->timestamp - now);
+		nr_sleep_corrections++;
+	}
+
+	switch (event->type) {
+		case SCHED_EVENT_RUN:
+			dprintk("%.3f: task %ld RUN for %.3f\n",
+				(double)event->timestamp/1e6,
+				this_task->nr,
+				(double)event->duration/1e6);
+			burn_nsecs(event->duration);
+			break;
+		case SCHED_EVENT_SLEEP:
+			dprintk("%.3f: task %ld %s SLEEP\n",
+				(double)event->timestamp/1e6,
+				this_task->nr, event->wait_sem ? "" : "SKIP");
+			if (event->wait_sem)
+				ret = sem_wait(event->wait_sem);
+			BUG_ON(ret);
+			break;
+		case SCHED_EVENT_WAKEUP:
+			dprintk("%.3f: task %ld WAKEUP => task %ld\n",
+				(double)event->timestamp/1e6,
+				this_task->nr,
+				event->wakee->nr);
+			if (event->wait_sem)
+				ret = sem_post(event->wait_sem);
+			BUG_ON(ret);
+			break;
+		default:
+			BUG_ON(1);
+	}
+}
+
+static nsec_t get_cpu_usage_nsec_parent(void)
+{
+	struct rusage ru;
+	nsec_t sum;
+	int err;
+
+	err = getrusage(RUSAGE_SELF, &ru);
+	BUG_ON(err);
+
+	sum =  ru.ru_utime.tv_sec*1e9 + ru.ru_utime.tv_usec*1e3;
+	sum += ru.ru_stime.tv_sec*1e9 + ru.ru_stime.tv_usec*1e3;
+
+	return sum;
+}
+
+static nsec_t get_cpu_usage_nsec_self(void)
+{
+	char filename [] = "/proc/1234567890/sched";
+	unsigned long msecs, nsecs;
+	char *line = NULL;
+	nsec_t total = 0;
+	size_t len = 0;
+	ssize_t chars;
+	FILE *file;
+	int ret;
+
+	sprintf(filename, "/proc/%d/sched", getpid());
+	file = fopen(filename, "r");
+	BUG_ON(!file);
+
+	while ((chars = getline(&line, &len, file)) != -1) {
+		dprintk("got line with length %zu :\n", chars);
+		dprintk("%s", line);
+		ret = sscanf(line, "se.sum_exec_runtime : %ld.%06ld\n",
+			&msecs, &nsecs);
+		if (ret == 2) {
+			total = msecs*1e6 + nsecs;
+			dprintk("total: (%ld.%06ld) %Ld\n",
+				msecs, nsecs, total);
+			break;
+		}
+	}
+	if (line)
+		free(line);
+	fclose(file);
+
+	return total;
+}
+
+static void *thread_func(void *ctx)
+{
+	struct task_desc *this_task = ctx;
+	nsec_t cpu_usage_0, cpu_usage_1;
+	unsigned long i, ret;
+	char comm2[22];
+
+	dprintk("task %ld started up.\n", this_task->nr);
+	sprintf(comm2, ":%s", this_task->comm);
+	prctl(PR_SET_NAME, comm2);
+
+again:
+	ret = sem_post(&this_task->ready_for_work);
+	BUG_ON(ret);
+	D();
+	ret = pthread_mutex_lock(&start_work_mutex);
+	BUG_ON(ret);
+	ret = pthread_mutex_unlock(&start_work_mutex);
+	BUG_ON(ret);
+	D();
+
+	cpu_usage_0 = get_cpu_usage_nsec_self();
+
+	for (i = 0; i < this_task->nr_events; i++) {
+		this_task->curr_event = i;
+		process_sched_event(this_task, this_task->events[i]);
+	}
+
+	cpu_usage_1 = get_cpu_usage_nsec_self();
+	this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
+
+	dprintk("task %ld cpu usage: %0.3f msecs\n",
+		this_task->nr, (double)this_task->cpu_usage / 1e6);
+
+	D();
+	ret = sem_post(&this_task->work_done_sem);
+	BUG_ON(ret);
+	D();
+
+	ret = pthread_mutex_lock(&work_done_wait_mutex);
+	BUG_ON(ret);
+	ret = pthread_mutex_unlock(&work_done_wait_mutex);
+	BUG_ON(ret);
+	D();
+
+	goto again;
+}
+
+static void create_tasks(void)
+{
+	struct task_desc *task;
+	pthread_attr_t attr;
+	unsigned long i;
+	int err;
+
+	err = pthread_attr_init(&attr);
+	BUG_ON(err);
+	err = pthread_attr_setstacksize(&attr, (size_t)(16*1024));
+	BUG_ON(err);
+	err = pthread_mutex_lock(&start_work_mutex);
+	BUG_ON(err);
+	err = pthread_mutex_lock(&work_done_wait_mutex);
+	BUG_ON(err);
+	for (i = 0; i < nr_tasks; i++) {
+		task = tasks[i];
+		sem_init(&task->sleep_sem, 0, 0);
+		sem_init(&task->ready_for_work, 0, 0);
+		sem_init(&task->work_done_sem, 0, 0);
+		task->curr_event = 0;
+		err = pthread_create(&task->thread, &attr, thread_func, task);
+		BUG_ON(err);
+	}
+}
+
+static nsec_t cpu_usage;
+static nsec_t runavg_cpu_usage;
+static nsec_t parent_cpu_usage;
+static nsec_t runavg_parent_cpu_usage;
+
+static void wait_for_tasks(void)
+{
+	nsec_t cpu_usage_0, cpu_usage_1;
+	struct task_desc *task;
+	unsigned long i, ret;
+
+	DP();
+	start_time = get_nsecs();
+	DP();
+	cpu_usage = 0;
+	pthread_mutex_unlock(&work_done_wait_mutex);
+
+	for (i = 0; i < nr_tasks; i++) {
+		task = tasks[i];
+		ret = sem_wait(&task->ready_for_work);
+		BUG_ON(ret);
+		sem_init(&task->ready_for_work, 0, 0);
+	}
+	ret = pthread_mutex_lock(&work_done_wait_mutex);
+	BUG_ON(ret);
+
+	cpu_usage_0 = get_cpu_usage_nsec_parent();
+
+	pthread_mutex_unlock(&start_work_mutex);
+
+#if 0
+	for (i = 0; i < nr_tasks; i++) {
+		unsigned long missed;
+
+		task = tasks[i];
+		while (task->curr_event + 1 < task->nr_events) {
+			dprintk("parent waiting for %ld (%ld != %ld)\n",
+				i, task->curr_event, task->nr_events);
+			sleep_nsecs(100000000);
+		}
+		missed = task->nr_events - 1 - task->curr_event;
+		if (missed)
+			printk("task %ld missed events: %ld\n", i, missed);
+		ret = sem_post(&task->sleep_sem);
+		BUG_ON(ret);
+	}
+#endif
+	DP();
+	for (i = 0; i < nr_tasks; i++) {
+		task = tasks[i];
+		ret = sem_wait(&task->work_done_sem);
+		BUG_ON(ret);
+		sem_init(&task->work_done_sem, 0, 0);
+		cpu_usage += task->cpu_usage;
+		task->cpu_usage = 0;
+	}
+
+	cpu_usage_1 = get_cpu_usage_nsec_parent();
+	if (!runavg_cpu_usage)
+		runavg_cpu_usage = cpu_usage;
+	runavg_cpu_usage = (runavg_cpu_usage*9 + cpu_usage)/10;
+
+	parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
+	if (!runavg_parent_cpu_usage)
+		runavg_parent_cpu_usage = parent_cpu_usage;
+	runavg_parent_cpu_usage = (runavg_parent_cpu_usage*9 +
+				   parent_cpu_usage)/10;
+
+	ret = pthread_mutex_lock(&start_work_mutex);
+	BUG_ON(ret);
+
+	for (i = 0; i < nr_tasks; i++) {
+		task = tasks[i];
+		sem_init(&task->sleep_sem, 0, 0);
+		task->curr_event = 0;
+	}
+}
+
+static int __cmd_sched(void);
+
+static void parse_trace(void)
+{
+	__cmd_sched();
+
+	printk("nr_run_events:        %ld\n", nr_run_events);
+	printk("nr_sleep_events:      %ld\n", nr_sleep_events);
+	printk("nr_wakeup_events:     %ld\n", nr_wakeup_events);
+
+	if (targetless_wakeups)
+		printk("target-less wakeups:  %ld\n", targetless_wakeups);
+	if (multitarget_wakeups)
+		printk("multi-target wakeups: %ld\n", multitarget_wakeups);
+	if (nr_run_events_optimized)
+		printk("run events optimized: %ld\n",
+			nr_run_events_optimized);
+}
+
+static unsigned long nr_runs;
+static nsec_t sum_runtime;
+static nsec_t sum_fluct;
+static nsec_t run_avg;
+
+static void run_one_test(void)
+{
+	nsec_t T0, T1, delta, avg_delta, fluct, std_dev;
+
+	T0 = get_nsecs();
+	wait_for_tasks();
+	T1 = get_nsecs();
+
+	delta = T1 - T0;
+	sum_runtime += delta;
+	nr_runs++;
+
+	avg_delta = sum_runtime / nr_runs;
+	if (delta < avg_delta)
+		fluct = avg_delta - delta;
+	else
+		fluct = delta - avg_delta;
+	sum_fluct += fluct;
+	std_dev = sum_fluct / nr_runs / sqrt(nr_runs);
+	if (!run_avg)
+		run_avg = delta;
+	run_avg = (run_avg*9 + delta)/10;
+
+	printk("#%-3ld: %0.3f, ",
+		nr_runs, (double)delta/1000000.0);
+
+#if 0
+	printk("%0.2f +- %0.2f, ",
+		(double)avg_delta/1e6, (double)std_dev/1e6);
+#endif
+	printk("ravg: %0.2f, ",
+		(double)run_avg/1e6);
+
+	printk("cpu: %0.2f / %0.2f",
+		(double)cpu_usage/1e6, (double)runavg_cpu_usage/1e6);
+
+#if 0
+	/*
+ 	 * rusage statistics done by the parent, these are less
+ 	 * accurate than the sum_exec_runtime based statistics:
+ 	 */
+	printk(" [%0.2f / %0.2f]",
+		(double)parent_cpu_usage/1e6,
+		(double)runavg_parent_cpu_usage/1e6);
+#endif
+
+	printk("\n");
+
+	if (nr_sleep_corrections)
+		printk(" (%ld sleep corrections)\n", nr_sleep_corrections);
+	nr_sleep_corrections = 0;
+}
+
+static void test_calibrations(void)
+{
+	nsec_t T0, T1;
+
+	T0 = get_nsecs();
+	burn_nsecs(1e6);
+	T1 = get_nsecs();
+
+	printk("the run test took %Ld nsecs\n", T1-T0);
+
+	T0 = get_nsecs();
+	sleep_nsecs(1e6);
+	T1 = get_nsecs();
+
+	printk("the sleep test took %Ld nsecs\n", T1-T0);
+}
+
 static int
 process_comm_event(event_t *event, unsigned long offset, unsigned long head)
 {
@@ -50,6 +798,46 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head)
 	return 0;
 }
 
+static void process_sched_wakeup_event(struct event *event,
+		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
+{
+	printf("sched_wakeup event %p\n", event);
+}
+
+static void process_sched_switch_event(struct event *event,
+		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
+{
+	printf("sched_switch event %p\n", event);
+}
+
+static void
+process_raw_event(event_t *raw_event, void *more_data,
+		  int cpu, u64 timestamp, struct thread *thread)
+{
+	struct {
+		u32 size;
+		char data[0];
+	} *raw = more_data;
+	struct event *event;
+	int type;
+
+	type = trace_parse_common_type(raw->data);
+	event = trace_find_event(type);
+
+	/*
+	 * FIXME: better resolve from pid from the struct trace_entry
+	 * field, although it should be the same than this perf
+	 * event pid
+	 */
+	printf("id %d, type: %d, event: %s\n",
+		raw_event->header.type, type, event->name);
+
+	if (!strcmp(event->name, "sched_switch"))
+		process_sched_switch_event(event, cpu, timestamp, thread);
+	if (!strcmp(event->name, "sched_wakeup"))
+		process_sched_wakeup_event(event, cpu, timestamp, thread);
+}
+
 static int
 process_sample_event(event_t *event, unsigned long offset, unsigned long head)
 {
@@ -122,20 +910,8 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
 		dump_printf(" ...... dso: [hypervisor]\n");
 	}
 
-	if (sample_type & PERF_SAMPLE_RAW) {
-		struct {
-			u32 size;
-			char data[0];
-		} *raw = more_data;
-
-		/*
-		 * FIXME: better resolve from pid from the struct trace_entry
-		 * field, although it should be the same than this perf
-		 * event pid
-		 */
-		print_event(cpu, raw->data, raw->size, timestamp, thread->comm);
-	}
-	total += period;
+	if (sample_type & PERF_SAMPLE_RAW)
+		process_raw_event(event, more_data, cpu, timestamp, thread);
 
 	return 0;
 }
@@ -277,6 +1053,8 @@ static const struct option options[] = {
 
 int cmd_sched(int argc, const char **argv, const char *prefix __used)
 {
+	long nr_iterations = LONG_MAX, i;
+
 	symbol__init();
 	page_size = getpagesize();
 
@@ -293,5 +1071,19 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
 
 	setup_pager();
 
-	return __cmd_sched();
+	calibrate_run_measurement_overhead();
+	calibrate_sleep_measurement_overhead();
+
+	test_calibrations();
+
+	parse_trace();
+	print_task_traces();
+	add_cross_task_wakeups();
+
+	create_tasks();
+	printk("------------------------------------------------------------\n");
+	for (i = 0; i < nr_iterations; i++)
+		run_one_test();
+
+	return 0;
 }
diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c
index 629e602d9405..16cf2d51c4e1 100644
--- a/tools/perf/util/trace-event-parse.c
+++ b/tools/perf/util/trace-event-parse.c
@@ -1799,7 +1799,7 @@ static int get_common_info(const char *type, int *offset, int *size)
 	return 0;
 }
 
-static int parse_common_type(void *data)
+int trace_parse_common_type(void *data)
 {
 	static int type_offset;
 	static int type_size;
@@ -1832,7 +1832,7 @@ static int parse_common_pid(void *data)
 	return read_size(data + pid_offset, pid_size);
 }
 
-static struct event *find_event(int id)
+struct event *trace_find_event(int id)
 {
 	struct event *event;
 
@@ -2420,8 +2420,8 @@ get_return_for_leaf(int cpu, int cur_pid, unsigned long long cur_func,
 	int type;
 	int pid;
 
-	type = parse_common_type(next->data);
-	event = find_event(type);
+	type = trace_parse_common_type(next->data);
+	event = trace_find_event(type);
 	if (!event)
 		return NULL;
 
@@ -2502,8 +2502,8 @@ print_graph_entry_leaf(struct event *event, void *data, struct record *ret_rec)
 	int type;
 	int i;
 
-	type = parse_common_type(ret_rec->data);
-	ret_event = find_event(type);
+	type = trace_parse_common_type(ret_rec->data);
+	ret_event = trace_find_event(type);
 
 	field = find_field(ret_event, "rettime");
 	if (!field)
@@ -2696,9 +2696,9 @@ void print_event(int cpu, void *data, int size, unsigned long long nsecs,
 	nsecs -= secs * NSECS_PER_SEC;
 	usecs = nsecs / NSECS_PER_USEC;
 
-	type = parse_common_type(data);
+	type = trace_parse_common_type(data);
 
-	event = find_event(type);
+	event = trace_find_event(type);
 	if (!event)
 		die("ug! no event found for type %d", type);
 
diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h
index 420294a5773e..bc81612fd244 100644
--- a/tools/perf/util/trace-event.h
+++ b/tools/perf/util/trace-event.h
@@ -234,6 +234,8 @@ extern int header_page_data_offset;
 extern int header_page_data_size;
 
 int parse_header_page(char *buf, unsigned long size);
+int trace_parse_common_type(void *data);
+struct event *trace_find_event(int id);
 
 void read_tracing_data(struct perf_counter_attr *pattrs, int nb_counters);

commit 0a02ad9331dd4db56c29c60db2e99c4daaad8a48
Author: Ingo Molnar <mingo@elte.hu>
Date:   Fri Sep 11 12:12:54 2009 +0200

    perf: Add 'perf sched' tool
    
    This turn-key tool allows scheduler measurements to be
    conducted and the results be displayed numerically.
    
    First baby step towards that goal: clone the new command off of
    perf trace.
    
    Fix a few other details along the way:
    
     - add (minimal) perf trace documentation
    
     - reorder a few places
    
     - list perf trace in the mainporcelain list as well
       as it's a very useful utility.
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt
new file mode 100644
index 000000000000..056320eecb3a
--- /dev/null
+++ b/tools/perf/Documentation/perf-sched.txt
@@ -0,0 +1,25 @@
+perf-sched(1)
+==============
+
+NAME
+----
+perf-sched - Read perf.data (created by perf record) and display sched output
+
+SYNOPSIS
+--------
+[verse]
+'perf sched' [-i <file> | --input=file] symbol_name
+
+DESCRIPTION
+-----------
+This command reads the input file and displays the latencies recorded.
+
+OPTIONS
+-------
+-D::
+--dump-raw-trace=::
+        Display verbose dump of the sched data.
+
+SEE ALSO
+--------
+linkperf:perf-record[1]
diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
new file mode 100644
index 000000000000..41ed75398ca9
--- /dev/null
+++ b/tools/perf/Documentation/perf-trace.txt
@@ -0,0 +1,25 @@
+perf-trace(1)
+==============
+
+NAME
+----
+perf-trace - Read perf.data (created by perf record) and display trace output
+
+SYNOPSIS
+--------
+[verse]
+'perf trace' [-i <file> | --input=file] symbol_name
+
+DESCRIPTION
+-----------
+This command reads the input file and displays the trace recorded.
+
+OPTIONS
+-------
+-D::
+--dump-raw-trace=::
+        Display verbose dump of the trace data.
+
+SEE ALSO
+--------
+linkperf:perf-record[1]
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 9f8d207a91bf..2cb8cc3f6772 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -376,6 +376,7 @@ LIB_OBJS += util/trace-event-info.o
 
 BUILTIN_OBJS += builtin-annotate.o
 BUILTIN_OBJS += builtin-help.o
+BUILTIN_OBJS += builtin-sched.o
 BUILTIN_OBJS += builtin-list.o
 BUILTIN_OBJS += builtin-record.o
 BUILTIN_OBJS += builtin-report.o
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
new file mode 100644
index 000000000000..60228d9179ac
--- /dev/null
+++ b/tools/perf/builtin-sched.c
@@ -0,0 +1,297 @@
+#include "builtin.h"
+
+#include "util/util.h"
+#include "util/cache.h"
+#include "util/symbol.h"
+#include "util/thread.h"
+#include "util/header.h"
+
+#include "util/parse-options.h"
+
+#include "perf.h"
+#include "util/debug.h"
+
+#include "util/trace-event.h"
+
+static char		const *input_name = "perf.data";
+static int		input;
+static unsigned long	page_size;
+static unsigned long	mmap_window = 32;
+
+static unsigned long	total = 0;
+static unsigned long	total_comm = 0;
+
+static struct rb_root	threads;
+static struct thread	*last_match;
+
+static struct perf_header *header;
+static u64		sample_type;
+
+
+static int
+process_comm_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread;
+
+	thread = threads__findnew(event->comm.pid, &threads, &last_match);
+
+	dump_printf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->comm.comm, event->comm.pid);
+
+	if (thread == NULL ||
+	    thread__set_comm(thread, event->comm.comm)) {
+		dump_printf("problem processing PERF_EVENT_COMM, skipping event.\n");
+		return -1;
+	}
+	total_comm++;
+
+	return 0;
+}
+
+static int
+process_sample_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	char level;
+	int show = 0;
+	struct dso *dso = NULL;
+	struct thread *thread;
+	u64 ip = event->ip.ip;
+	u64 timestamp = -1;
+	u32 cpu = -1;
+	u64 period = 1;
+	void *more_data = event->ip.__more_data;
+	int cpumode;
+
+	thread = threads__findnew(event->ip.pid, &threads, &last_match);
+
+	if (sample_type & PERF_SAMPLE_TIME) {
+		timestamp = *(u64 *)more_data;
+		more_data += sizeof(u64);
+	}
+
+	if (sample_type & PERF_SAMPLE_CPU) {
+		cpu = *(u32 *)more_data;
+		more_data += sizeof(u32);
+		more_data += sizeof(u32); /* reserved */
+	}
+
+	if (sample_type & PERF_SAMPLE_PERIOD) {
+		period = *(u64 *)more_data;
+		more_data += sizeof(u64);
+	}
+
+	dump_printf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->header.misc,
+		event->ip.pid, event->ip.tid,
+		(void *)(long)ip,
+		(long long)period);
+
+	dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
+
+	if (thread == NULL) {
+		eprintf("problem processing %d event, skipping it.\n",
+			event->header.type);
+		return -1;
+	}
+
+	cpumode = event->header.misc & PERF_EVENT_MISC_CPUMODE_MASK;
+
+	if (cpumode == PERF_EVENT_MISC_KERNEL) {
+		show = SHOW_KERNEL;
+		level = 'k';
+
+		dso = kernel_dso;
+
+		dump_printf(" ...... dso: %s\n", dso->name);
+
+	} else if (cpumode == PERF_EVENT_MISC_USER) {
+
+		show = SHOW_USER;
+		level = '.';
+
+	} else {
+		show = SHOW_HV;
+		level = 'H';
+
+		dso = hypervisor_dso;
+
+		dump_printf(" ...... dso: [hypervisor]\n");
+	}
+
+	if (sample_type & PERF_SAMPLE_RAW) {
+		struct {
+			u32 size;
+			char data[0];
+		} *raw = more_data;
+
+		/*
+		 * FIXME: better resolve from pid from the struct trace_entry
+		 * field, although it should be the same than this perf
+		 * event pid
+		 */
+		print_event(cpu, raw->data, raw->size, timestamp, thread->comm);
+	}
+	total += period;
+
+	return 0;
+}
+
+static int
+process_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	trace_event(event);
+
+	switch (event->header.type) {
+	case PERF_EVENT_MMAP ... PERF_EVENT_LOST:
+		return 0;
+
+	case PERF_EVENT_COMM:
+		return process_comm_event(event, offset, head);
+
+	case PERF_EVENT_EXIT ... PERF_EVENT_READ:
+		return 0;
+
+	case PERF_EVENT_SAMPLE:
+		return process_sample_event(event, offset, head);
+
+	case PERF_EVENT_MAX:
+	default:
+		return -1;
+	}
+
+	return 0;
+}
+
+static int __cmd_sched(void)
+{
+	int ret, rc = EXIT_FAILURE;
+	unsigned long offset = 0;
+	unsigned long head = 0;
+	struct stat perf_stat;
+	event_t *event;
+	uint32_t size;
+	char *buf;
+
+	trace_report();
+	register_idle_thread(&threads, &last_match);
+
+	input = open(input_name, O_RDONLY);
+	if (input < 0) {
+		perror("failed to open file");
+		exit(-1);
+	}
+
+	ret = fstat(input, &perf_stat);
+	if (ret < 0) {
+		perror("failed to stat file");
+		exit(-1);
+	}
+
+	if (!perf_stat.st_size) {
+		fprintf(stderr, "zero-sized file, nothing to do!\n");
+		exit(0);
+	}
+	header = perf_header__read(input);
+	head = header->data_offset;
+	sample_type = perf_header__sample_type(header);
+
+	if (!(sample_type & PERF_SAMPLE_RAW))
+		die("No trace sample to read. Did you call perf record "
+		    "without -R?");
+
+	if (load_kernel() < 0) {
+		perror("failed to load kernel symbols");
+		return EXIT_FAILURE;
+	}
+
+remap:
+	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
+			   MAP_SHARED, input, offset);
+	if (buf == MAP_FAILED) {
+		perror("failed to mmap file");
+		exit(-1);
+	}
+
+more:
+	event = (event_t *)(buf + head);
+
+	size = event->header.size;
+	if (!size)
+		size = 8;
+
+	if (head + event->header.size >= page_size * mmap_window) {
+		unsigned long shift = page_size * (head / page_size);
+		int res;
+
+		res = munmap(buf, page_size * mmap_window);
+		assert(res == 0);
+
+		offset += shift;
+		head -= shift;
+		goto remap;
+	}
+
+	size = event->header.size;
+
+
+	if (!size || process_event(event, offset, head) < 0) {
+
+		/*
+		 * assume we lost track of the stream, check alignment, and
+		 * increment a single u64 in the hope to catch on again 'soon'.
+		 */
+
+		if (unlikely(head & 7))
+			head &= ~7ULL;
+
+		size = 8;
+	}
+
+	head += size;
+
+	if (offset + head < (unsigned long)perf_stat.st_size)
+		goto more;
+
+	rc = EXIT_SUCCESS;
+	close(input);
+
+	return rc;
+}
+
+static const char * const annotate_usage[] = {
+	"perf trace [<options>] <command>",
+	NULL
+};
+
+static const struct option options[] = {
+	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
+		    "dump raw trace in ASCII"),
+	OPT_BOOLEAN('v', "verbose", &verbose,
+		    "be more verbose (show symbol address, etc)"),
+	OPT_END()
+};
+
+int cmd_sched(int argc, const char **argv, const char *prefix __used)
+{
+	symbol__init();
+	page_size = getpagesize();
+
+	argc = parse_options(argc, argv, options, annotate_usage, 0);
+	if (argc) {
+		/*
+		 * Special case: if there's an argument left then assume tha
+		 * it's a symbol filter:
+		 */
+		if (argc > 1)
+			usage_with_options(annotate_usage, options);
+	}
+
+
+	setup_pager();
+
+	return __cmd_sched();
+}
diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h
index 3a63e41fb44e..b09cadbd76b1 100644
--- a/tools/perf/builtin.h
+++ b/tools/perf/builtin.h
@@ -16,12 +16,13 @@ extern int check_pager_config(const char *cmd);
 
 extern int cmd_annotate(int argc, const char **argv, const char *prefix);
 extern int cmd_help(int argc, const char **argv, const char *prefix);
+extern int cmd_sched(int argc, const char **argv, const char *prefix);
+extern int cmd_list(int argc, const char **argv, const char *prefix);
 extern int cmd_record(int argc, const char **argv, const char *prefix);
 extern int cmd_report(int argc, const char **argv, const char *prefix);
 extern int cmd_stat(int argc, const char **argv, const char *prefix);
 extern int cmd_top(int argc, const char **argv, const char *prefix);
-extern int cmd_version(int argc, const char **argv, const char *prefix);
-extern int cmd_list(int argc, const char **argv, const char *prefix);
 extern int cmd_trace(int argc, const char **argv, const char *prefix);
+extern int cmd_version(int argc, const char **argv, const char *prefix);
 
 #endif
diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt
index eebce30afbc0..3133c74729dd 100644
--- a/tools/perf/command-list.txt
+++ b/tools/perf/command-list.txt
@@ -4,7 +4,9 @@
 #
 perf-annotate			mainporcelain common
 perf-list			mainporcelain common
+perf-sched			mainporcelain common
 perf-record			mainporcelain common
 perf-report			mainporcelain common
 perf-stat			mainporcelain common
 perf-top			mainporcelain common
+perf-trace			mainporcelain common
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index fe4589dde950..c972d1c35489 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -293,6 +293,7 @@ static void handle_internal_command(int argc, const char **argv)
 		{ "annotate", cmd_annotate, 0 },
 		{ "version", cmd_version, 0 },
 		{ "trace", cmd_trace, 0 },
+		{ "sched", cmd_sched, 0 },
 	};
 	unsigned int i;
 	static const char ext[] = STRIP_EXTENSION;

commit e1f8450854d69f0291882804406ea1bab3ca44b4
Author: Ingo Molnar <mingo@elte.hu>
Date:   Thu Sep 10 20:52:09 2009 +0200

    sched: Fix sched::sched_stat_wait tracepoint field
    
    This weird perf trace output:
    
      cc1-9943  [001]  2802.059479616: sched_stat_wait: task: as:9944 wait: 2801938766276 [ns]
    
    Is caused by setting one component field of the delta to zero
    a bit too early. Move it to later.
    
    ( Note, this does not affect the NEW_FAIR_SLEEPERS interactivity bug,
      it's just a reporting bug in essence. )
    
    Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Nikos Chantziaras <realnc@arcor.de>
    Cc: Jens Axboe <jens.axboe@oracle.com>
    Cc: Mike Galbraith <efault@gmx.de>
    LKML-Reference: <4AA93D34.8040500@arcor.de>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 26fadb44250c..aa7f84121016 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -545,14 +545,13 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	schedstat_set(se->wait_count, se->wait_count + 1);
 	schedstat_set(se->wait_sum, se->wait_sum +
 			rq_of(cfs_rq)->clock - se->wait_start);
-	schedstat_set(se->wait_start, 0);
-
 #ifdef CONFIG_SCHEDSTATS
 	if (entity_is_task(se)) {
 		trace_sched_stat_wait(task_of(se),
 			rq_of(cfs_rq)->clock - se->wait_start);
 	}
 #endif
+	schedstat_set(se->wait_start, 0);
 }
 
 static inline void