Patches contributed by Eötvös Loránd University


commit 983f2163e7fdf11a15e05816de243f93f07eafca
Author: Ingo Molnar <mingo@elte.hu>
Date:   Tue Sep 15 12:29:20 2009 +0200

    MAINTAINERS: Update tracing tree details
    
    Acked-by: Steven Rostedt <srostedt@redhat.com>
    Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/MAINTAINERS b/MAINTAINERS
index 8dca9d89c6c1..1505129ec5a0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2118,13 +2118,16 @@ F:	Documentation/filesystems/caching/
 F:	fs/fscache/
 F:	include/linux/fscache*.h
 
-FTRACE
+TRACING
 M:	Steven Rostedt <rostedt@goodmis.org>
+M:	Frederic Weisbecker <fweisbec@gmail.com>
+M:	Ingo Molnar <mingo@redhat.com>
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git tracing/core
 S:	Maintained
 F:	Documentation/trace/ftrace.txt
 F:	arch/*/*/*/ftrace.h
 F:	arch/*/kernel/ftrace.c
-F:	include/*/ftrace.h
+F:	include/*/ftrace.h include/trace/ include/linux/trace*.h
 F:	kernel/trace/
 
 FUJITSU FR-V (FRV) PORT

commit dca2d6ac09d9ef59ff46820d4f0c94b08a671202
Merge: d6a65dffb30d 18240904960a
Author: Ingo Molnar <mingo@elte.hu>
Date:   Tue Sep 15 12:18:15 2009 +0200

    Merge branch 'linus' into tracing/hw-breakpoints
    
    Conflicts:
            arch/x86/kernel/process_64.c
    
    Semantic conflict fixed in:
            arch/x86/kvm/x86.c
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --cc arch/x86/kernel/process_64.c
index 89c46f1259d3,ad535b683170..72edac026a78
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@@ -493,33 -489,12 +502,30 @@@ __switch_to(struct task_struct *prev_p
  		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
  		__switch_to_xtra(prev_p, next_p, tss);
  
- 	/* If the task has used fpu the last 5 timeslices, just do a full
- 	 * restore of the math state immediately to avoid the trap; the
- 	 * chances of needing FPU soon are obviously high now
- 	 *
- 	 * tsk_used_math() checks prevent calling math_state_restore(),
- 	 * which can sleep in the case of !tsk_used_math()
+ 	/*
+ 	 * Preload the FPU context, now that we've determined that the
+ 	 * task is likely to be using it. 
  	 */
- 	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
- 		math_state_restore();
+ 	if (preload_fpu)
+ 		__math_state_restore();
 +	/*
 +	 * There's a problem with moving the arch_install_thread_hw_breakpoint()
 +	 * call before current is updated.  Suppose a kernel breakpoint is
 +	 * triggered in between the two, the hw-breakpoint handler will see that
 +	 * the 'current' task does not have TIF_DEBUG flag set and will think it
 +	 * is leftover from an old task (lazy switching) and will erase it. Then
 +	 * until the next context switch, no user-breakpoints will be installed.
 +	 *
 +	 * The real problem is that it's impossible to update both current and
 +	 * physical debug registers at the same instant, so there will always be
 +	 * a window in which they disagree and a breakpoint might get triggered.
 +	 * Since we use lazy switching, we are forced to assume that a
 +	 * disagreement means that current is correct and the exception is due
 +	 * to lazy debug register switching.
 +	 */
 +	if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
 +		arch_install_thread_hw_breakpoint(next_p);
 +
  	return prev_p;
  }
  
diff --cc arch/x86/kvm/x86.c
index 3d4529011828,be451ee44249..74029f50b26a
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@@ -3312,18 -3638,17 +3638,17 @@@ static int vcpu_enter_guest(struct kvm_
  		set_debugreg(vcpu->arch.eff_db[3], 3);
  	}
  
- 	KVMTRACE_0D(VMENTRY, vcpu, entryexit);
+ 	trace_kvm_entry(vcpu->vcpu_id);
  	kvm_x86_ops->run(vcpu, kvm_run);
  
- 	if (unlikely(vcpu->arch.switch_db_regs)) {
- 		set_debugreg(0, 7);
- 		set_debugreg(vcpu->arch.host_db[0], 0);
- 		set_debugreg(vcpu->arch.host_db[1], 1);
- 		set_debugreg(vcpu->arch.host_db[2], 2);
- 		set_debugreg(vcpu->arch.host_db[3], 3);
+ 	if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) {
 -		set_debugreg(current->thread.debugreg0, 0);
 -		set_debugreg(current->thread.debugreg1, 1);
 -		set_debugreg(current->thread.debugreg2, 2);
 -		set_debugreg(current->thread.debugreg3, 3);
++		set_debugreg(current->thread.debugreg[0], 0);
++		set_debugreg(current->thread.debugreg[1], 1);
++		set_debugreg(current->thread.debugreg[2], 2);
++		set_debugreg(current->thread.debugreg[3], 3);
+ 		set_debugreg(current->thread.debugreg6, 6);
+ 		set_debugreg(current->thread.debugreg7, 7);
  	}
- 	set_debugreg(vcpu->arch.host_dr6, 6);
- 	set_debugreg(vcpu->arch.host_dr7, 7);
  
  	set_bit(KVM_REQ_KICK, &vcpu->requests);
  	local_irq_enable();

commit d11533893b31ab7806ff04bfa69ae646068610ce
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Sep 14 18:22:53 2009 +0200

    perf sched: Fix 'perf sched latency' output on 32-bit systems
    
    Before:
    
      -----------------------------------------------------------------------------------
       Task              |  Runtime ms | Switches | Average delay ms | Maximum delay ms |
      -----------------------------------------------------------------------------------
       perf              |4853313.251 ms |       10 | avg:    0.046 ms | max:    0.337 ms |
       flush-8:0         |2426659.202 ms |        5 | avg:    0.015 ms | max:    0.016 ms |
       sleep             |485331.966 ms |        1 | avg:    0.012 ms | max:    0.012 ms |
       ksoftirqd/1       |485331.320 ms |        1 | avg:    0.005 ms | max:    0.005 ms |
      -----------------------------------------------------------------------------------
       TOTAL:            |8250635.739 ms |       17 |
      ---------------------------------------------
    
    After:
    
      -----------------------------------------------------------------------------------
       Task              |  Runtime ms | Switches | Average delay ms | Maximum delay ms |
      -----------------------------------------------------------------------------------
       perf              |    0.206 ms |       10 | avg:    0.046 ms | max:    0.337 ms |
       flush-8:0         |    2.680 ms |        5 | avg:    0.015 ms | max:    0.016 ms |
       sleep             |    0.662 ms |        1 | avg:    0.012 ms | max:    0.012 ms |
       ksoftirqd/1       |    0.015 ms |        1 | avg:    0.005 ms | max:    0.005 ms |
      -----------------------------------------------------------------------------------
       TOTAL:            |    3.563 ms |       17 |
      ---------------------------------------------
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
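
The root cause: cpu_last_switched[] held nanosecond timestamps in
unsigned long, which is only 32 bits wide on 32-bit systems. The stored
value loses its high bits, so the next delta comes out as roughly the
raw timestamp itself, which is where the multi-thousand-second runtimes
in the "Before" table come from. A minimal illustration (hypothetical
value):

    /* On a 32-bit system, unsigned long is 32 bits wide: */
    unsigned long t = 5000000000ULL; /* 5 s in ns, silently truncated */
    /* t is now 705032704 (5000000000 mod 2^32); a later
     * "u64 timestamp - t" delta is inflated by 2^32 ns (~4.29 s)
     * per lost wrap. Using u64, as the patch does, keeps the
     * full value. */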

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 2ce87ef5a3e6..f856a02cd4fc 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -805,7 +805,7 @@ replay_wakeup_event(struct trace_wakeup_event *wakeup_event,
 	add_sched_event_wakeup(waker, timestamp, wakee);
 }
 
-static unsigned long cpu_last_switched[MAX_CPUS];
+static u64 cpu_last_switched[MAX_CPUS];
 
 static void
 replay_switch_event(struct trace_switch_event *switch_event,

commit ea57c4f5203d82c7844c54cdef54e972cf4e9d1f
Author: Ingo Molnar <mingo@elte.hu>
Date:   Sun Sep 13 18:15:54 2009 +0200

    perf tools: Implement counter output multiplexing
    
    Finish the -M/--multiplex option implementation:
    
     - separate it out from group_fd
    
     - correctly set it via the ioctl and don't mmap counters that
       are multiplexed
    
     - modify the perf record event loop to deal with buffer-less
       counters.
    
     - remove the -g option from perf sched record
    
     - account for unordered events in perf sched latency
    
     - (add -f to perf sched record to ease measurements)
    
     - skip idle threads (pid==0) in latency output
    
    The result is better latency output by 'perf sched latency':
    
     -----------------------------------------------------------------------------------
      Task              |  Runtime ms | Switches | Average delay ms | Maximum delay ms |
     -----------------------------------------------------------------------------------
      ksoftirqd/8       |    0.071 ms |        2 | avg:    0.458 ms | max:    0.913 ms |
      at-spi-registry   |    0.609 ms |       19 | avg:    0.013 ms | max:    0.023 ms |
      perf              |    3.316 ms |       16 | avg:    0.013 ms | max:    0.054 ms |
      Xorg              |    0.392 ms |       19 | avg:    0.011 ms | max:    0.018 ms |
      sleep             |    0.537 ms |        2 | avg:    0.009 ms | max:    0.009 ms |
     -----------------------------------------------------------------------------------
      TOTAL:            |    4.925 ms |       58 |
     ---------------------------------------------
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
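
How -M works, as a user-space sketch (descriptor names hypothetical,
error handling elided): only the first counter gets a ring buffer;
every later counter redirects its output into that buffer via the
PERF_COUNTER_IOC_SET_OUTPUT ioctl and is never mmapped itself, which
is why the event loop in the patch below only reads mmap_array entries
with a non-NULL base.

    /* first counter: poll it and mmap its ring buffer */
    void *base = mmap(NULL, (mmap_pages + 1) * page_size,
                      PROT_READ | PROT_WRITE, MAP_SHARED, fd_first, 0);

    /* later counters: no buffer of their own, just a redirect */
    ioctl(fd_other, PERF_COUNTER_IOC_SET_OUTPUT, fd_first);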

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 79f99dba5be0..5f3127e7a615 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -49,6 +49,7 @@ static int			inherit_stat			= 0;
 static int			no_samples			= 0;
 static int			sample_address			= 0;
 static int			multiplex			= 0;
+static int			multiplex_fd			= -1;
 
 static long			samples;
 static struct timeval		last_read;
@@ -471,23 +472,29 @@ static void create_counter(int counter, int cpu, pid_t pid)
 	 */
 	if (group && group_fd == -1)
 		group_fd = fd[nr_cpu][counter];
+	if (multiplex && multiplex_fd == -1)
+		multiplex_fd = fd[nr_cpu][counter];
 
-	event_array[nr_poll].fd = fd[nr_cpu][counter];
-	event_array[nr_poll].events = POLLIN;
-	nr_poll++;
-
-	mmap_array[nr_cpu][counter].counter = counter;
-	mmap_array[nr_cpu][counter].prev = 0;
-	mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
-	mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
-			PROT_READ|PROT_WRITE, MAP_SHARED, fd[nr_cpu][counter], 0);
-	if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
-		error("failed to mmap with %d (%s)\n", errno, strerror(errno));
-		exit(-1);
-	}
+	if (multiplex && fd[nr_cpu][counter] != multiplex_fd) {
+		int ret;
 
-	if (multiplex && fd[nr_cpu][counter] != group_fd)
-		ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_SET_OUTPUT, group_fd);
+		ret = ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_SET_OUTPUT, multiplex_fd);
+		assert(ret != -1);
+	} else {
+		event_array[nr_poll].fd = fd[nr_cpu][counter];
+		event_array[nr_poll].events = POLLIN;
+		nr_poll++;
+
+		mmap_array[nr_cpu][counter].counter = counter;
+		mmap_array[nr_cpu][counter].prev = 0;
+		mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
+		mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
+				PROT_READ|PROT_WRITE, MAP_SHARED, fd[nr_cpu][counter], 0);
+		if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
+			error("failed to mmap with %d (%s)\n", errno, strerror(errno));
+			exit(-1);
+		}
+	}
 
 	ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_ENABLE);
 }
@@ -618,8 +625,10 @@ static int __cmd_record(int argc, const char **argv)
 		int hits = samples;
 
 		for (i = 0; i < nr_cpu; i++) {
-			for (counter = 0; counter < nr_counters; counter++)
-				mmap_read(&mmap_array[i][counter]);
+			for (counter = 0; counter < nr_counters; counter++) {
+				if (mmap_array[i][counter].base)
+					mmap_read(&mmap_array[i][counter]);
+			}
 		}
 
 		if (hits == samples) {
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 3e003237c42f..2ce87ef5a3e6 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -116,6 +116,8 @@ static u64			sum_fluct;
 static u64			run_avg;
 
 static unsigned long		replay_repeat = 10;
+static unsigned long		nr_timestamps;
+static unsigned long		unordered_timestamps;
 
 #define TASK_STATE_TO_CHAR_STR "RSDTtZX"
 
@@ -1109,8 +1111,11 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
 	if (atom->state != THREAD_SLEEPING)
 		return;
 
-	if (atom->sched_out_time > timestamp)
+	nr_timestamps++;
+	if (atom->sched_out_time > timestamp) {
+		unordered_timestamps++;
 		return;
+	}
 
 	atom->state = THREAD_WAIT_CPU;
 	atom->wake_up_time = timestamp;
@@ -1130,6 +1135,11 @@ static void output_lat_thread(struct task_atoms *atom_list)
 
 	if (!atom_list->nb_atoms)
 		return;
+	/*
+	 * Ignore idle threads:
+	 */
+	if (!atom_list->thread->pid)
+		return;
 
 	all_runtime += atom_list->total_runtime;
 	all_count += atom_list->nb_atoms;
@@ -1301,8 +1311,16 @@ static void __cmd_lat(void)
 	}
 
 	printf("-----------------------------------------------------------------------------------\n");
-	printf(" TOTAL:            |%9.3f ms |%9Ld |\n",
+	printf(" TOTAL:            |%9.3f ms |%9Ld |",
 		(double)all_runtime/1e6, all_count);
+
+	if (unordered_timestamps && nr_timestamps) {
+		printf(" INFO: %.2f%% unordered events.\n",
+			(double)unordered_timestamps/(double)nr_timestamps*100.0);
+	} else {
+		printf("\n");
+	}
+
 	printf("---------------------------------------------\n");
 }
 
@@ -1667,12 +1685,13 @@ static const char *record_args[] = {
 	"-a",
 	"-R",
 	"-M",
-	"-g",
+	"-f",
 	"-c", "1",
 	"-e", "sched:sched_switch:r",
 	"-e", "sched:sched_stat_wait:r",
 	"-e", "sched:sched_stat_sleep:r",
 	"-e", "sched:sched_stat_iowait:r",
+	"-e", "sched:sched_stat_runtime:r",
 	"-e", "sched:sched_process_exit:r",
 	"-e", "sched:sched_process_fork:r",
 	"-e", "sched:sched_wakeup:r",
diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c
index 64d6e302751a..f6a8437141c8 100644
--- a/tools/perf/util/trace-event-parse.c
+++ b/tools/perf/util/trace-event-parse.c
@@ -2722,8 +2722,10 @@ void print_event(int cpu, void *data, int size, unsigned long long nsecs,
 	type = trace_parse_common_type(data);
 
 	event = trace_find_event(type);
-	if (!event)
-		die("ug! no event found for type %d", type);
+	if (!event) {
+		printf("ug! no event found for type %d\n", type);
+		return;
+	}
 
 	pid = parse_common_pid(data);
 

commit f977bb4937857994312fff4f9c2cad336a36a932
Author: Ingo Molnar <mingo@elte.hu>
Date:   Sun Sep 13 18:15:54 2009 +0200

    perf_counter, sched: Add sched_stat_runtime tracepoint
    
    This allows more precise tracking of how the scheduler accounts
    for (and acts upon) a task having spent N nanoseconds of CPU time.
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
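
Once merged, the new event is recorded like any other sched:*
tracepoint; a usage sketch, matching the event list that the perf
sched tooling patches in this series add:

    $ perf record -a -R -c 1 -e sched:sched_stat_runtime:r sleep 1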

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index b48f1ad7c946..4069c43f4187 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -379,6 +379,39 @@ TRACE_EVENT(sched_stat_wait,
 			(unsigned long long)__entry->delay)
 );
 
+/*
+ * Tracepoint for accounting runtime (time the task is executing
+ * on a CPU).
+ */
+TRACE_EVENT(sched_stat_runtime,
+
+	TP_PROTO(struct task_struct *tsk, u64 runtime, u64 vruntime),
+
+	TP_ARGS(tsk, runtime, vruntime),
+
+	TP_STRUCT__entry(
+		__array( char,	comm,	TASK_COMM_LEN	)
+		__field( pid_t,	pid			)
+		__field( u64,	runtime			)
+		__field( u64,	vruntime			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+		__entry->pid		= tsk->pid;
+		__entry->runtime	= runtime;
+		__entry->vruntime	= vruntime;
+	)
+	TP_perf_assign(
+		__perf_count(runtime);
+	),
+
+	TP_printk("task: %s:%d runtime: %Lu [ns], vruntime: %Lu [ns]",
+			__entry->comm, __entry->pid,
+			(unsigned long long)__entry->runtime,
+			(unsigned long long)__entry->vruntime)
+);
+
 /*
  * Tracepoint for accounting sleep time (time the task is not runnable,
  * including iowait, see below).
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index aa7f84121016..a097e909e80f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -513,6 +513,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
 	if (entity_is_task(curr)) {
 		struct task_struct *curtask = task_of(curr);
 
+		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
 		cpuacct_charge(curtask, delta_exec);
 		account_group_exec_runtime(curtask, delta_exec);
 	}

commit c13f0d3c8165e9592102687fa999da0a0d9c3724
Author: Ingo Molnar <mingo@elte.hu>
Date:   Sun Sep 13 16:51:04 2009 +0200

    perf sched: Add 'perf sched trace', improve documentation
    
    Alias 'perf sched trace' to 'perf trace', for workflow completeness.
    
    Add a bit of documentation for perf sched.
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
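
The resulting workflow, end to end (a usage sketch following the
documentation added below):

    $ perf sched record sleep 1   # record scheduling events
    $ perf sched latency          # per-task latency report
    $ perf sched trace            # detailed trace (alias of 'perf trace')
    $ perf sched replay           # re-run the workload with mockup threads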

diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt
index 056320eecb3a..1ce79198997b 100644
--- a/tools/perf/Documentation/perf-sched.txt
+++ b/tools/perf/Documentation/perf-sched.txt
@@ -3,16 +3,32 @@ perf-sched(1)
 
 NAME
 ----
-perf-sched - Read perf.data (created by perf record) and display sched output
+perf-sched - Tool to trace/measure scheduler properties (latencies)
 
 SYNOPSIS
 --------
 [verse]
-'perf sched' [-i <file> | --input=file] symbol_name
+'perf sched' {record|latency|replay|trace}
 
 DESCRIPTION
 -----------
-This command reads the input file and displays the latencies recorded.
+There's four variants of perf sched:
+
+  'perf sched record <command>' to record the scheduling events
+  of an arbitrary workload.
+
+  'perf sched latency' to report the per task scheduling latencies
+  and other scheduling properties of the workload.
+
+  'perf sched trace' to see a detailed trace of the workload that
+  was recorded.
+
+  'perf sched replay' to simulate the workload that was recorded
+  via perf sched record. (this is done by starting up mockup threads
+  that mimic the workload based on the events in the trace. These
+  threads can then replay the timings (CPU runtime and sleep patterns)
+  of the workload as it occured when it was recorded - and can repeat
+  it a number of times, measuring its performance.)
 
 OPTIONS
 -------
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index ede40c1429a8..8db0fd222f80 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1597,7 +1597,7 @@ static int read_events(void)
 }
 
 static const char * const sched_usage[] = {
-	"perf sched [<options>] {record|latency|replay}",
+	"perf sched [<options>] {record|latency|replay|trace}",
 	NULL
 };
 
@@ -1719,6 +1719,11 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
 				usage_with_options(replay_usage, replay_options);
 		}
 		__cmd_replay();
+	} else if (!strcmp(argv[0], "trace")) {
+		/*
+		 * Aliased to 'perf trace' for now:
+		 */
+		return cmd_trace(argc, argv, prefix);
 	} else {
 		usage_with_options(sched_usage, sched_options);
 	}

commit 459ec28ab404d7afcd512ce9b855959ad301605a
Author: Ingo Molnar <mingo@elte.hu>
Date:   Sun Sep 13 17:33:44 2009 +0200

    perf_counter: Allow mmap if paranoid checks are turned off
    
    Before:
    
      $ perf sched record -f sleep 1
      Error: failed to mmap with 1 (Operation not permitted)
    
    After:
    
      $ perf sched record -f sleep 1
      [ perf record: Captured and wrote 0.095 MB perf.data (~4161 samples) ]
    
    Note, this is only allowed if perfcounter_paranoid is set to
    the most permissive (non-default) value of -1.
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
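
A usage sketch, assuming the sysctl is exposed at the pre-rename path
/proc/sys/kernel/perf_counter_paranoid:

    # echo -1 > /proc/sys/kernel/perf_counter_paranoid
    $ perf sched record -f sleep 1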

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index e0d91fdf0c3c..667ab25ad3d5 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2315,7 +2315,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	lock_limit >>= PAGE_SHIFT;
 	locked = vma->vm_mm->locked_vm + extra;
 
-	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+	if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
+		!capable(CAP_IPC_LOCK)) {
 		ret = -EPERM;
 		goto unlock;
 	}

commit 1fc35b29b4098aa3bf9fc9acb4c1615d0b5dd95d
Author: Ingo Molnar <mingo@elte.hu>
Date:   Sun Sep 13 09:44:29 2009 +0200

    perf sched: Implement the 'perf sched record' subcommand
    
    Implement the 'perf sched record' subcommand that adds a
    default list of events, turns on raw sampling and system-wide
    tracing and passes off the rest of the command to perf record.
    
    This is more convenient than having to specify the events all
    the time.
    
    Before:
    
     $ perf record -a -R -e sched:sched_switch:r -e sched:sched_stat_wait:r -e sched:sched_stat_sleep:r -e sched:sched_stat_iowait:r -e sched:sched_process_exit:r -e sched:sched_process_fork:r -e sched:sched_wakeup:r -e sched:sched_migrate_task:r -c 1 sleep 1
    
    After:
    
     $ perf sched record -f sleep 1
    
    Also fix an assumption in the event string parser that strings
    passed in can be modified. (In this case they won't be, as they
    come from a read-only constant section.)
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
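
Why the in-place write is a problem, as a minimal sketch (variable
names hypothetical): the parser can now be handed string literals from
a read-only constant section, so writing a '\0' terminator into them
faults; strndup() copies the event name out instead:

    const char *evt = "sched_switch:r";  /* string literal, read-only */
    char *flags = strchr(evt, ':');
    /* old code: *flags = '\0';  writes to .rodata and faults */
    /* new code: copy the name out rather than truncate in place */
    char *evt_name = strndup(evt, flags - evt);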

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index b72544f2b964..ede40c1429a8 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1656,6 +1656,40 @@ static void setup_sorting(void)
 	sort_dimension__add((char *)"pid", &cmp_pid);
 }
 
+static const char *record_args[] = {
+	"record",
+	"-a",
+	"-R",
+	"-c", "1",
+	"-e", "sched:sched_switch:r",
+	"-e", "sched:sched_stat_wait:r",
+	"-e", "sched:sched_stat_sleep:r",
+	"-e", "sched:sched_stat_iowait:r",
+	"-e", "sched:sched_process_exit:r",
+	"-e", "sched:sched_process_fork:r",
+	"-e", "sched:sched_wakeup:r",
+	"-e", "sched:sched_migrate_task:r",
+};
+
+static int __cmd_record(int argc, const char **argv)
+{
+	unsigned int rec_argc, i, j;
+	const char **rec_argv;
+
+	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
+	rec_argv = calloc(rec_argc + 1, sizeof(char *));
+
+	for (i = 0; i < ARRAY_SIZE(record_args); i++)
+		rec_argv[i] = strdup(record_args[i]);
+
+	for (j = 1; j < (unsigned int)argc; j++, i++)
+		rec_argv[i] = argv[j];
+
+	BUG_ON(i != rec_argc);
+
+	return cmd_record(i, rec_argv, NULL);
+}
+
 int cmd_sched(int argc, const char **argv, const char *prefix __used)
 {
 	symbol__init();
@@ -1666,7 +1700,9 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
 	if (!argc)
 		usage_with_options(sched_usage, sched_options);
 
-	if (!strncmp(argv[0], "lat", 3)) {
+	if (!strncmp(argv[0], "rec", 3)) {
+		return __cmd_record(argc, argv);
+	} else if (!strncmp(argv[0], "lat", 3)) {
 		trace_handler = &lat_ops;
 		if (argc > 1) {
 			argc = parse_options(argc, argv, latency_options, latency_usage, 0);
@@ -1687,6 +1723,5 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
 		usage_with_options(sched_usage, sched_options);
 	}
 
-
 	return 0;
 }
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index d06c66cd358b..034245e46817 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -525,7 +525,8 @@ static enum event_result parse_tracepoint_event(const char **strp,
 
 	flags = strchr(evt_name, ':');
 	if (flags) {
-		*flags = '\0';
+		/* split it out: */
+		evt_name = strndup(evt_name, flags - evt_name);
 		flags++;
 	}
 

commit b5fae128e41021889777f8ead810cbd2a8b249fc
Author: Ingo Molnar <mingo@elte.hu>
Date:   Fri Sep 11 12:12:54 2009 +0200

    perf sched: Clean up PID sorting logic
    
    Use a sort list for thread atoms insertion as well, instead of
    hardcoding it to PID.
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
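
The resulting pattern (sketched from the diff below): a lookup builds
a stack-local key and reuses the same comparator chain that ordered
the tree at insertion time, so search and insert can never disagree
about ordering:

    struct task_atoms key = { .thread = thread };

    cmp = thread_lat_cmp(sort_list, &key, atoms); /* walks the sort list */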

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index cc2dbd5b50eb..b72544f2b964 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -144,7 +144,7 @@ struct task_atoms {
 	u64			total_runtime;
 };
 
-typedef int (*sort_thread_lat)(struct task_atoms *, struct task_atoms *);
+typedef int (*sort_fn_t)(struct task_atoms *, struct task_atoms *);
 
 static struct rb_root		atom_root, sorted_atom_root;
 
@@ -869,41 +869,22 @@ static struct trace_sched_handler replay_ops  = {
 	.fork_event		= replay_fork_event,
 };
 
-static struct task_atoms *
-thread_atoms_search(struct rb_root *root, struct thread *thread)
-{
-	struct rb_node *node = root->rb_node;
-
-	while (node) {
-		struct task_atoms *atoms;
-
-		atoms = container_of(node, struct task_atoms, node);
-		if (thread->pid > atoms->thread->pid)
-			node = node->rb_left;
-		else if (thread->pid < atoms->thread->pid)
-			node = node->rb_right;
-		else {
-			return atoms;
-		}
-	}
-	return NULL;
-}
-
 struct sort_dimension {
 	const char		*name;
-	sort_thread_lat		cmp;
+	sort_fn_t		cmp;
 	struct list_head	list;
 };
 
 static LIST_HEAD(cmp_pid);
 
 static int
-thread_lat_cmp(struct list_head *list, struct task_atoms *l,
-	       struct task_atoms *r)
+thread_lat_cmp(struct list_head *list, struct task_atoms *l, struct task_atoms *r)
 {
 	struct sort_dimension *sort;
 	int ret = 0;
 
+	BUG_ON(list_empty(list));
+
 	list_for_each_entry(sort, list, list) {
 		ret = sort->cmp(l, r);
 		if (ret)
@@ -913,6 +894,32 @@ thread_lat_cmp(struct list_head *list, struct task_atoms *l,
 	return ret;
 }
 
+static struct task_atoms *
+thread_atoms_search(struct rb_root *root, struct thread *thread,
+			 struct list_head *sort_list)
+{
+	struct rb_node *node = root->rb_node;
+	struct task_atoms key = { .thread = thread };
+
+	while (node) {
+		struct task_atoms *atoms;
+		int cmp;
+
+		atoms = container_of(node, struct task_atoms, node);
+
+		cmp = thread_lat_cmp(sort_list, &key, atoms);
+		if (cmp > 0)
+			node = node->rb_left;
+		else if (cmp < 0)
+			node = node->rb_right;
+		else {
+			BUG_ON(thread != atoms->thread);
+			return atoms;
+		}
+	}
+	return NULL;
+}
+
 static void
 __thread_latency_insert(struct rb_root *root, struct task_atoms *data,
 			 struct list_head *sort_list)
@@ -1049,18 +1056,18 @@ latency_switch_event(struct trace_switch_event *switch_event,
 	sched_out = threads__findnew(switch_event->prev_pid, &threads, &last_match);
 	sched_in = threads__findnew(switch_event->next_pid, &threads, &last_match);
 
-	in_atoms = thread_atoms_search(&atom_root, sched_in);
+	in_atoms = thread_atoms_search(&atom_root, sched_in, &cmp_pid);
 	if (!in_atoms) {
 		thread_atoms_insert(sched_in);
-		in_atoms = thread_atoms_search(&atom_root, sched_in);
+		in_atoms = thread_atoms_search(&atom_root, sched_in, &cmp_pid);
 		if (!in_atoms)
 			die("in-atom: Internal tree error");
 	}
 
-	out_atoms = thread_atoms_search(&atom_root, sched_out);
+	out_atoms = thread_atoms_search(&atom_root, sched_out, &cmp_pid);
 	if (!out_atoms) {
 		thread_atoms_insert(sched_out);
-		out_atoms = thread_atoms_search(&atom_root, sched_out);
+		out_atoms = thread_atoms_search(&atom_root, sched_out, &cmp_pid);
 		if (!out_atoms)
 			die("out-atom: Internal tree error");
 	}
@@ -1085,7 +1092,7 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
 		return;
 
 	wakee = threads__findnew(wakeup_event->pid, &threads, &last_match);
-	atoms = thread_atoms_search(&atom_root, wakee);
+	atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid);
 	if (!atoms) {
 		thread_atoms_insert(wakee);
 		return;
@@ -1136,7 +1143,6 @@ static void output_lat_thread(struct task_atoms *atom_list)
 
 static int pid_cmp(struct task_atoms *l, struct task_atoms *r)
 {
-
 	if (l->thread->pid < r->thread->pid)
 		return -1;
 	if (l->thread->pid > r->thread->pid)
@@ -1146,8 +1152,8 @@ static int pid_cmp(struct task_atoms *l, struct task_atoms *r)
 }
 
 static struct sort_dimension pid_sort_dimension = {
-	.name = "pid",
-	.cmp = pid_cmp,
+	.name			= "pid",
+	.cmp			= pid_cmp,
 };
 
 static int avg_cmp(struct task_atoms *l, struct task_atoms *r)
@@ -1172,8 +1178,8 @@ static int avg_cmp(struct task_atoms *l, struct task_atoms *r)
 }
 
 static struct sort_dimension avg_sort_dimension = {
-	.name 	= "avg",
-	.cmp	= avg_cmp,
+	.name			= "avg",
+	.cmp			= avg_cmp,
 };
 
 static int max_cmp(struct task_atoms *l, struct task_atoms *r)
@@ -1187,8 +1193,8 @@ static int max_cmp(struct task_atoms *l, struct task_atoms *r)
 }
 
 static struct sort_dimension max_sort_dimension = {
-	.name 	= "max",
-	.cmp	= max_cmp,
+	.name			= "max",
+	.cmp			= max_cmp,
 };
 
 static int switch_cmp(struct task_atoms *l, struct task_atoms *r)
@@ -1202,8 +1208,8 @@ static int switch_cmp(struct task_atoms *l, struct task_atoms *r)
 }
 
 static struct sort_dimension switch_sort_dimension = {
-	.name 	= "switch",
-	.cmp	= switch_cmp,
+	.name			= "switch",
+	.cmp			= switch_cmp,
 };
 
 static int runtime_cmp(struct task_atoms *l, struct task_atoms *r)
@@ -1217,8 +1223,8 @@ static int runtime_cmp(struct task_atoms *l, struct task_atoms *r)
 }
 
 static struct sort_dimension runtime_sort_dimension = {
-	.name 	= "runtime",
-	.cmp	= runtime_cmp,
+	.name			= "runtime",
+	.cmp			= runtime_cmp,
 };
 
 static struct sort_dimension *available_sorts[] = {
@@ -1666,8 +1672,8 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
 			argc = parse_options(argc, argv, latency_options, latency_usage, 0);
 			if (argc)
 				usage_with_options(latency_usage, latency_options);
-			setup_sorting();
 		}
+		setup_sorting();
 		__cmd_lat();
 	} else if (!strncmp(argv[0], "rep", 3)) {
 		trace_handler = &replay_ops;
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index 634f2809a342..665d1f3dc977 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -4,10 +4,10 @@
 #include "symbol.h"
 
 struct thread {
-	struct rb_node	 rb_node;
-	struct list_head maps;
-	pid_t		 pid;
-	char		 *comm;
+	struct rb_node		rb_node;
+	struct list_head	maps;
+	pid_t			pid;
+	char			*comm;
 };
 
 int thread__set_comm(struct thread *self, const char *comm);

commit b1ffe8f3e0c96f5527a89e24410d6b0e59b3554a
Author: Ingo Molnar <mingo@elte.hu>
Date:   Fri Sep 11 12:12:54 2009 +0200

    perf sched: Finish latency => atom rename and misc cleanups
    
     - Rename 'latency' field/variable names to the better 'atom' ones
     
     - Reduce the number of #include lines and consolidate them
     
     - Gather file scope variables at the top of the file
     
     - Remove unused bits
    
    No change in functionality.
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index e01cc63b98cc..cc2dbd5b50eb 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1,4 +1,5 @@
 #include "builtin.h"
+#include "perf.h"
 
 #include "util/util.h"
 #include "util/cache.h"
@@ -7,15 +8,16 @@
 #include "util/header.h"
 
 #include "util/parse-options.h"
+#include "util/trace-event.h"
 
-#include "perf.h"
 #include "util/debug.h"
 
-#include "util/trace-event.h"
 #include <sys/types.h>
+#include <sys/prctl.h>
 
-
-#define MAX_CPUS 4096
+#include <semaphore.h>
+#include <pthread.h>
+#include <math.h>
 
 static char			const *input_name = "perf.data";
 static int			input;
@@ -33,44 +35,126 @@ static u64			sample_type;
 static char			default_sort_order[] = "avg, max, switch, runtime";
 static char			*sort_order = default_sort_order;
 
+#define PR_SET_NAME		15               /* Set process name */
+#define MAX_CPUS		4096
 
-/*
- * Scheduler benchmarks
- */
-#include <sys/resource.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <sys/prctl.h>
+#define BUG_ON(x)		assert(!(x))
 
-#include <linux/unistd.h>
+static u64			run_measurement_overhead;
+static u64			sleep_measurement_overhead;
 
-#include <semaphore.h>
-#include <pthread.h>
-#include <signal.h>
-#include <values.h>
-#include <string.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <fcntl.h>
-#include <time.h>
-#include <math.h>
+#define COMM_LEN		20
+#define SYM_LEN			129
 
-#include <stdio.h>
+#define MAX_PID			65536
 
-#define PR_SET_NAME	15               /* Set process name */
+static unsigned long		nr_tasks;
 
-#define BUG_ON(x)	assert(!(x))
+struct sched_event;
 
-#define DEBUG		0
+struct task_desc {
+	unsigned long		nr;
+	unsigned long		pid;
+	char			comm[COMM_LEN];
 
-typedef unsigned long long nsec_t;
+	unsigned long		nr_events;
+	unsigned long		curr_event;
+	struct sched_event	**events;
+
+	pthread_t		thread;
+	sem_t			sleep_sem;
 
-static nsec_t run_measurement_overhead;
-static nsec_t sleep_measurement_overhead;
+	sem_t			ready_for_work;
+	sem_t			work_done_sem;
+
+	u64			cpu_usage;
+};
+
+enum sched_event_type {
+	SCHED_EVENT_RUN,
+	SCHED_EVENT_SLEEP,
+	SCHED_EVENT_WAKEUP,
+};
+
+struct sched_event {
+	enum sched_event_type	type;
+	u64			timestamp;
+	u64			duration;
+	unsigned long		nr;
+	int			specific_wait;
+	sem_t			*wait_sem;
+	struct task_desc	*wakee;
+};
+
+static struct task_desc		*pid_to_task[MAX_PID];
+
+static struct task_desc		**tasks;
+
+static pthread_mutex_t		start_work_mutex = PTHREAD_MUTEX_INITIALIZER;
+static u64			start_time;
+
+static pthread_mutex_t		work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER;
 
-static nsec_t get_nsecs(void)
+static unsigned long		nr_run_events;
+static unsigned long		nr_sleep_events;
+static unsigned long		nr_wakeup_events;
+
+static unsigned long		nr_sleep_corrections;
+static unsigned long		nr_run_events_optimized;
+
+static unsigned long		targetless_wakeups;
+static unsigned long		multitarget_wakeups;
+
+static u64			cpu_usage;
+static u64			runavg_cpu_usage;
+static u64			parent_cpu_usage;
+static u64			runavg_parent_cpu_usage;
+
+static unsigned long		nr_runs;
+static u64			sum_runtime;
+static u64			sum_fluct;
+static u64			run_avg;
+
+static unsigned long		replay_repeat = 10;
+
+#define TASK_STATE_TO_CHAR_STR "RSDTtZX"
+
+enum thread_state {
+	THREAD_SLEEPING = 0,
+	THREAD_WAIT_CPU,
+	THREAD_SCHED_IN,
+	THREAD_IGNORE
+};
+
+struct work_atom {
+	struct list_head	list;
+	enum thread_state	state;
+	u64			wake_up_time;
+	u64			sched_in_time;
+	u64			runtime;
+};
+
+struct task_atoms {
+	struct list_head	atom_list;
+	struct thread		*thread;
+	struct rb_node		node;
+	u64			max_lat;
+	u64			total_lat;
+	u64			nb_atoms;
+	u64			total_runtime;
+};
+
+typedef int (*sort_thread_lat)(struct task_atoms *, struct task_atoms *);
+
+static struct rb_root		atom_root, sorted_atom_root;
+
+static u64			all_runtime;
+static u64			all_count;
+
+static int read_events(void);
+
+
+static u64 get_nsecs(void)
 {
 	struct timespec ts;
 
@@ -79,16 +163,16 @@ static nsec_t get_nsecs(void)
 	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
 }
 
-static void burn_nsecs(nsec_t nsecs)
+static void burn_nsecs(u64 nsecs)
 {
-	nsec_t T0 = get_nsecs(), T1;
+	u64 T0 = get_nsecs(), T1;
 
 	do {
 		T1 = get_nsecs();
 	} while (T1 + run_measurement_overhead < T0 + nsecs);
 }
 
-static void sleep_nsecs(nsec_t nsecs)
+static void sleep_nsecs(u64 nsecs)
 {
 	struct timespec ts;
 
@@ -100,7 +184,7 @@ static void sleep_nsecs(nsec_t nsecs)
 
 static void calibrate_run_measurement_overhead(void)
 {
-	nsec_t T0, T1, delta, min_delta = 1000000000ULL;
+	u64 T0, T1, delta, min_delta = 1000000000ULL;
 	int i;
 
 	for (i = 0; i < 10; i++) {
@@ -117,7 +201,7 @@ static void calibrate_run_measurement_overhead(void)
 
 static void calibrate_sleep_measurement_overhead(void)
 {
-	nsec_t T0, T1, delta, min_delta = 1000000000ULL;
+	u64 T0, T1, delta, min_delta = 1000000000ULL;
 	int i;
 
 	for (i = 0; i < 10; i++) {
@@ -133,67 +217,8 @@ static void calibrate_sleep_measurement_overhead(void)
 	printf("sleep measurement overhead: %Ld nsecs\n", min_delta);
 }
 
-#define COMM_LEN	20
-#define SYM_LEN		129
-
-#define MAX_PID		65536
-
-static unsigned long nr_tasks;
-
-struct sched_event;
-
-struct task_desc {
-	unsigned long		nr;
-	unsigned long		pid;
-	char			comm[COMM_LEN];
-
-	unsigned long		nr_events;
-	unsigned long		curr_event;
-	struct sched_event	**events;
-
-	pthread_t		thread;
-	sem_t			sleep_sem;
-
-	sem_t			ready_for_work;
-	sem_t			work_done_sem;
-
-	nsec_t			cpu_usage;
-};
-
-enum sched_event_type {
-	SCHED_EVENT_RUN,
-	SCHED_EVENT_SLEEP,
-	SCHED_EVENT_WAKEUP,
-};
-
-struct sched_event {
-	enum sched_event_type	type;
-	nsec_t			timestamp;
-	nsec_t			duration;
-	unsigned long		nr;
-	int			specific_wait;
-	sem_t			*wait_sem;
-	struct task_desc	*wakee;
-};
-
-static struct task_desc		*pid_to_task[MAX_PID];
-
-static struct task_desc		**tasks;
-
-static pthread_mutex_t		start_work_mutex = PTHREAD_MUTEX_INITIALIZER;
-static nsec_t			start_time;
-
-static pthread_mutex_t		work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER;
-
-static unsigned long		nr_run_events;
-static unsigned long		nr_sleep_events;
-static unsigned long		nr_wakeup_events;
-
-static unsigned long		nr_sleep_corrections;
-static unsigned long		nr_run_events_optimized;
-
 static struct sched_event *
-get_new_event(struct task_desc *task, nsec_t timestamp)
+get_new_event(struct task_desc *task, u64 timestamp)
 {
 	struct sched_event *event = calloc(1, sizeof(*event));
 	unsigned long idx = task->nr_events;
@@ -221,7 +246,7 @@ static struct sched_event *last_event(struct task_desc *task)
 }
 
 static void
-add_sched_event_run(struct task_desc *task, nsec_t timestamp, u64 duration)
+add_sched_event_run(struct task_desc *task, u64 timestamp, u64 duration)
 {
 	struct sched_event *event, *curr_event = last_event(task);
 
@@ -243,11 +268,8 @@ add_sched_event_run(struct task_desc *task, nsec_t timestamp, u64 duration)
 	nr_run_events++;
 }
 
-static unsigned long		targetless_wakeups;
-static unsigned long		multitarget_wakeups;
-
 static void
-add_sched_event_wakeup(struct task_desc *task, nsec_t timestamp,
+add_sched_event_wakeup(struct task_desc *task, u64 timestamp,
 		       struct task_desc *wakee)
 {
 	struct sched_event *event, *wakee_event;
@@ -275,7 +297,7 @@ add_sched_event_wakeup(struct task_desc *task, nsec_t timestamp,
 }
 
 static void
-add_sched_event_sleep(struct task_desc *task, nsec_t timestamp,
+add_sched_event_sleep(struct task_desc *task, u64 timestamp,
 		      u64 task_state __used)
 {
 	struct sched_event *event = get_new_event(task, timestamp);
@@ -350,7 +372,7 @@ static void
 process_sched_event(struct task_desc *this_task __used, struct sched_event *event)
 {
 	int ret = 0;
-	nsec_t now;
+	u64 now;
 	long long delta;
 
 	now = get_nsecs();
@@ -375,10 +397,10 @@ process_sched_event(struct task_desc *this_task __used, struct sched_event *even
 	}
 }
 
-static nsec_t get_cpu_usage_nsec_parent(void)
+static u64 get_cpu_usage_nsec_parent(void)
 {
 	struct rusage ru;
-	nsec_t sum;
+	u64 sum;
 	int err;
 
 	err = getrusage(RUSAGE_SELF, &ru);
@@ -390,12 +412,12 @@ static nsec_t get_cpu_usage_nsec_parent(void)
 	return sum;
 }
 
-static nsec_t get_cpu_usage_nsec_self(void)
+static u64 get_cpu_usage_nsec_self(void)
 {
 	char filename [] = "/proc/1234567890/sched";
 	unsigned long msecs, nsecs;
 	char *line = NULL;
-	nsec_t total = 0;
+	u64 total = 0;
 	size_t len = 0;
 	ssize_t chars;
 	FILE *file;
@@ -423,7 +445,7 @@ static nsec_t get_cpu_usage_nsec_self(void)
 static void *thread_func(void *ctx)
 {
 	struct task_desc *this_task = ctx;
-	nsec_t cpu_usage_0, cpu_usage_1;
+	u64 cpu_usage_0, cpu_usage_1;
 	unsigned long i, ret;
 	char comm2[22];
 
@@ -485,14 +507,9 @@ static void create_tasks(void)
 	}
 }
 
-static nsec_t			cpu_usage;
-static nsec_t			runavg_cpu_usage;
-static nsec_t			parent_cpu_usage;
-static nsec_t			runavg_parent_cpu_usage;
-
 static void wait_for_tasks(void)
 {
-	nsec_t cpu_usage_0, cpu_usage_1;
+	u64 cpu_usage_0, cpu_usage_1;
 	struct task_desc *task;
 	unsigned long i, ret;
 
@@ -543,16 +560,9 @@ static void wait_for_tasks(void)
 	}
 }
 
-static int read_events(void);
-
-static unsigned long nr_runs;
-static nsec_t sum_runtime;
-static nsec_t sum_fluct;
-static nsec_t run_avg;
-
 static void run_one_test(void)
 {
-	nsec_t T0, T1, delta, avg_delta, fluct, std_dev;
+	u64 T0, T1, delta, avg_delta, fluct, std_dev;
 
 	T0 = get_nsecs();
 	wait_for_tasks();
@@ -576,10 +586,6 @@ static void run_one_test(void)
 	printf("#%-3ld: %0.3f, ",
 		nr_runs, (double)delta/1000000.0);
 
-#if 0
-	printf("%0.2f +- %0.2f, ",
-		(double)avg_delta/1e6, (double)std_dev/1e6);
-#endif
 	printf("ravg: %0.2f, ",
 		(double)run_avg/1e6);
 
@@ -605,7 +611,7 @@ static void run_one_test(void)
 
 static void test_calibrations(void)
 {
-	nsec_t T0, T1;
+	u64 T0, T1;
 
 	T0 = get_nsecs();
 	burn_nsecs(1e6);
@@ -620,8 +626,6 @@ static void test_calibrations(void)
 	printf("the sleep test took %Ld nsecs\n", T1-T0);
 }
 
-static unsigned long replay_repeat = 10;
-
 static void __cmd_replay(void)
 {
 	unsigned long i;
@@ -865,47 +869,8 @@ static struct trace_sched_handler replay_ops  = {
 	.fork_event		= replay_fork_event,
 };
 
-#define TASK_STATE_TO_CHAR_STR "RSDTtZX"
-
-enum thread_state {
-	THREAD_SLEEPING = 0,
-	THREAD_WAIT_CPU,
-	THREAD_SCHED_IN,
-	THREAD_IGNORE
-};
-
-struct work_atom {
-	struct list_head	list;
-	enum thread_state	state;
-	u64			wake_up_time;
-	u64			sched_in_time;
-	u64			runtime;
-};
-
-struct task_atoms {
-	struct list_head	snapshot_list;
-	struct thread		*thread;
-	struct rb_node		node;
-	u64			max_lat;
-	u64			total_lat;
-	u64			nb_atoms;
-	u64			total_runtime;
-};
-
-typedef int (*sort_thread_lat)(struct task_atoms *, struct task_atoms *);
-
-struct sort_dimension {
-	const char 		*name;
-	sort_thread_lat		cmp;
-	struct list_head 	list;
-};
-
-static LIST_HEAD(cmp_pid);
-
-static struct rb_root lat_snapshot_root, sorted_lat_snapshot_root;
-
 static struct task_atoms *
-thread_atom_list_search(struct rb_root *root, struct thread *thread)
+thread_atoms_search(struct rb_root *root, struct thread *thread)
 {
 	struct rb_node *node = root->rb_node;
 
@@ -924,6 +889,14 @@ thread_atom_list_search(struct rb_root *root, struct thread *thread)
 	return NULL;
 }
 
+struct sort_dimension {
+	const char		*name;
+	sort_thread_lat		cmp;
+	struct list_head	list;
+};
+
+static LIST_HEAD(cmp_pid);
+
 static int
 thread_lat_cmp(struct list_head *list, struct task_atoms *l,
 	       struct task_atoms *r)
@@ -965,16 +938,17 @@ __thread_latency_insert(struct rb_root *root, struct task_atoms *data,
 	rb_insert_color(&data->node, root);
 }
 
-static void thread_atom_list_insert(struct thread *thread)
+static void thread_atoms_insert(struct thread *thread)
 {
 	struct task_atoms *atoms;
+
 	atoms = calloc(sizeof(*atoms), 1);
 	if (!atoms)
 		die("No memory");
 
 	atoms->thread = thread;
-	INIT_LIST_HEAD(&atoms->snapshot_list);
-	__thread_latency_insert(&lat_snapshot_root, atoms, &cmp_pid);
+	INIT_LIST_HEAD(&atoms->atom_list);
+	__thread_latency_insert(&atom_root, atoms, &cmp_pid);
 }
 
 static void
@@ -1001,50 +975,49 @@ lat_sched_out(struct task_atoms *atoms,
 	      u64 delta,
 	      u64 timestamp)
 {
-	struct work_atom *snapshot;
+	struct work_atom *atom;
 
-	snapshot = calloc(sizeof(*snapshot), 1);
-	if (!snapshot)
+	atom = calloc(sizeof(*atom), 1);
+	if (!atom)
 		die("Non memory");
 
 	if (sched_out_state(switch_event) == 'R') {
-		snapshot->state = THREAD_WAIT_CPU;
-		snapshot->wake_up_time = timestamp;
+		atom->state = THREAD_WAIT_CPU;
+		atom->wake_up_time = timestamp;
 	}
 
-	snapshot->runtime = delta;
-	list_add_tail(&snapshot->list, &atoms->snapshot_list);
+	atom->runtime = delta;
+	list_add_tail(&atom->list, &atoms->atom_list);
 }
 
 static void
 lat_sched_in(struct task_atoms *atoms, u64 timestamp)
 {
-	struct work_atom *snapshot;
+	struct work_atom *atom;
 	u64 delta;
 
-	if (list_empty(&atoms->snapshot_list))
+	if (list_empty(&atoms->atom_list))
 		return;
 
-	snapshot = list_entry(atoms->snapshot_list.prev, struct work_atom,
-			      list);
+	atom = list_entry(atoms->atom_list.prev, struct work_atom, list);
 
-	if (snapshot->state != THREAD_WAIT_CPU)
+	if (atom->state != THREAD_WAIT_CPU)
 		return;
 
-	if (timestamp < snapshot->wake_up_time) {
-		snapshot->state = THREAD_IGNORE;
+	if (timestamp < atom->wake_up_time) {
+		atom->state = THREAD_IGNORE;
 		return;
 	}
 
-	snapshot->state = THREAD_SCHED_IN;
-	snapshot->sched_in_time = timestamp;
+	atom->state = THREAD_SCHED_IN;
+	atom->sched_in_time = timestamp;
 
-	delta = snapshot->sched_in_time - snapshot->wake_up_time;
+	delta = atom->sched_in_time - atom->wake_up_time;
 	atoms->total_lat += delta;
 	if (delta > atoms->max_lat)
 		atoms->max_lat = delta;
 	atoms->nb_atoms++;
-	atoms->total_runtime += snapshot->runtime;
+	atoms->total_runtime += atom->runtime;
 }
 
 static void
@@ -1076,20 +1049,20 @@ latency_switch_event(struct trace_switch_event *switch_event,
 	sched_out = threads__findnew(switch_event->prev_pid, &threads, &last_match);
 	sched_in = threads__findnew(switch_event->next_pid, &threads, &last_match);
 
-	in_atoms = thread_atom_list_search(&lat_snapshot_root, sched_in);
+	in_atoms = thread_atoms_search(&atom_root, sched_in);
 	if (!in_atoms) {
-		thread_atom_list_insert(sched_in);
-		in_atoms = thread_atom_list_search(&lat_snapshot_root, sched_in);
+		thread_atoms_insert(sched_in);
+		in_atoms = thread_atoms_search(&atom_root, sched_in);
 		if (!in_atoms)
-			die("Internal latency tree error");
+			die("in-atom: Internal tree error");
 	}
 
-	out_atoms = thread_atom_list_search(&lat_snapshot_root, sched_out);
+	out_atoms = thread_atoms_search(&atom_root, sched_out);
 	if (!out_atoms) {
-		thread_atom_list_insert(sched_out);
-		out_atoms = thread_atom_list_search(&lat_snapshot_root, sched_out);
+		thread_atoms_insert(sched_out);
+		out_atoms = thread_atoms_search(&atom_root, sched_out);
 		if (!out_atoms)
-			die("Internal latency tree error");
+			die("out-atom: Internal tree error");
 	}
 
 	lat_sched_in(in_atoms, timestamp);
@@ -1104,7 +1077,7 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
 		     struct thread *thread __used)
 {
 	struct task_atoms *atoms;
-	struct work_atom *snapshot;
+	struct work_atom *atom;
 	struct thread *wakee;
 
 	/* Note for later, it may be interesting to observe the failing cases */
@@ -1112,23 +1085,22 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
 		return;
 
 	wakee = threads__findnew(wakeup_event->pid, &threads, &last_match);
-	atoms = thread_atom_list_search(&lat_snapshot_root, wakee);
+	atoms = thread_atoms_search(&atom_root, wakee);
 	if (!atoms) {
-		thread_atom_list_insert(wakee);
+		thread_atoms_insert(wakee);
 		return;
 	}
 
-	if (list_empty(&atoms->snapshot_list))
+	if (list_empty(&atoms->atom_list))
 		return;
 
-	snapshot = list_entry(atoms->snapshot_list.prev, struct work_atom,
-			      list);
+	atom = list_entry(atoms->atom_list.prev, struct work_atom, list);
 
-	if (snapshot->state != THREAD_SLEEPING)
+	if (atom->state != THREAD_SLEEPING)
 		return;
 
-	snapshot->state = THREAD_WAIT_CPU;
-	snapshot->wake_up_time = timestamp;
+	atom->state = THREAD_WAIT_CPU;
+	atom->wake_up_time = timestamp;
 }
 
 static struct trace_sched_handler lat_ops  = {
@@ -1137,9 +1109,6 @@ static struct trace_sched_handler lat_ops  = {
 	.fork_event		= latency_fork_event,
 };
 
-static u64 all_runtime;
-static u64 all_count;
-
 static void output_lat_thread(struct task_atoms *atom_list)
 {
 	int i;
@@ -1287,13 +1256,13 @@ static void sort_lat(void)
 
 	for (;;) {
 		struct task_atoms *data;
-		node = rb_first(&lat_snapshot_root);
+		node = rb_first(&atom_root);
 		if (!node)
 			break;
 
-		rb_erase(node, &lat_snapshot_root);
+		rb_erase(node, &atom_root);
 		data = rb_entry(node, struct task_atoms, node);
-		__thread_latency_insert(&sorted_lat_snapshot_root, data, &sort_list);
+		__thread_latency_insert(&sorted_atom_root, data, &sort_list);
 	}
 }
 
@@ -1309,7 +1278,7 @@ static void __cmd_lat(void)
 	printf(" Task              |  Runtime ms | Switches | Average delay ms | Maximum delay ms |\n");
 	printf("-----------------------------------------------------------------------------------\n");
 
-	next = rb_first(&sorted_lat_snapshot_root);
+	next = rb_first(&sorted_atom_root);
 
 	while (next) {
 		struct task_atoms *atom_list;