Patches contributed by Eötvös Loránd University
commit c44d70a340554a33071339064a303ac0f1a31623
Author: Ingo Molnar <mingo@elte.hu>
Date: Sun May 17 11:24:08 2009 +0200
perf_counter: fix counter inheritance race
Context rotation should not occur while we are in the middle of
walking the counter list to inherit counters ...
[ Impact: fix occasionally incorrect perf stat results ]
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index c8c1dfc22c93..13cb2fbbf334 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -508,6 +508,7 @@ struct perf_counter_context {
int nr_counters;
int nr_active;
int is_active;
+ int rr_allowed;
struct task_struct *task;
/*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 7af16d1c480f..4d8f97375f3a 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1120,7 +1120,8 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
__perf_counter_task_sched_out(ctx);
rotate_ctx(&cpuctx->ctx);
- rotate_ctx(ctx);
+ if (ctx->rr_allowed)
+ rotate_ctx(ctx);
perf_counter_cpu_sched_in(cpuctx, cpu);
perf_counter_task_sched_in(curr, cpu);
@@ -3108,6 +3109,7 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
mutex_init(&ctx->mutex);
INIT_LIST_HEAD(&ctx->counter_list);
INIT_LIST_HEAD(&ctx->event_list);
+ ctx->rr_allowed = 1;
ctx->task = task;
}
@@ -3348,6 +3350,9 @@ void perf_counter_init_task(struct task_struct *child)
*/
mutex_lock(&parent_ctx->mutex);
+ parent_ctx->rr_allowed = 0;
+ barrier(); /* irqs */
+
/*
* We dont have to disable NMIs - we are only looking at
* the list, not manipulating it:
@@ -3361,6 +3366,9 @@ void perf_counter_init_task(struct task_struct *child)
break;
}
+ barrier(); /* irqs */
+ parent_ctx->rr_allowed = 1;
+
mutex_unlock(&parent_ctx->mutex);
}
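The pattern of the fix is worth spelling out: the inheritance walker clears a flag before traversing the context's counter list and restores it afterwards, while the tick-driven round-robin rotation checks that flag before rotating. A minimal userspace sketch of the same gate pattern (illustrative names, not the kernel API):

    #include <stdio.h>

    struct ctx {
        int rr_allowed;    /* gate: may the tick rotate the list? */
        int head;          /* index of the current front element */
        int nr;            /* number of list elements */
    };

    /* Periodic tick: rotate the round-robin list only when allowed. */
    static void tick_rotate(struct ctx *c)
    {
        if (!c->rr_allowed)
            return;        /* a walker is mid-traversal: skip */
        c->head = (c->head + 1) % c->nr;
    }

    /* Walker: freeze rotation for the duration of the traversal.
     * (The kernel patch adds barrier() around these updates so the
     * compiler cannot reorder them relative to the walk.) */
    static void walk_list(struct ctx *c)
    {
        c->rr_allowed = 0;
        for (int i = 0; i < c->nr; i++)
            printf("visiting element %d\n", (c->head + i) % c->nr);
        c->rr_allowed = 1;
    }

    int main(void)
    {
        struct ctx c = { .rr_allowed = 1, .head = 0, .nr = 3 };
        tick_rotate(&c);   /* rotates: head moves to 1 */
        walk_list(&c);     /* rotation frozen while walking */
        tick_rotate(&c);   /* allowed again: head moves to 2 */
        printf("final head: %d\n", c.head);
        return 0;
    }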
commit 33b2fb303fe7f6b08bbb32f708e67b96eaa94a7a
Author: Ingo Molnar <mingo@elte.hu>
Date: Sun May 17 11:08:41 2009 +0200
perf_counter: fix counter freeing logic
Fix counter lifetime bugs which explain the crashes reported by
Marcelo Tosatti and Arnaldo Carvalho de Melo.
The new rule is: flushing + freeing is only done for a task's
own counters, never for other tasks.
[ Impact: fix crashes/lockups with inherited counters ]
Reported-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Reported-by: Marcelo Tosatti <mtosatti@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/exit.c b/kernel/exit.c
index 73affd35e76d..f9dfedd94af0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -178,12 +178,6 @@ void release_task(struct task_struct * p)
proc_flush_task(p);
- /*
- * Flush inherited counters to the parent - before the parent
- * gets woken up by child-exit notifications.
- */
- perf_counter_exit_task(p);
-
write_lock_irq(&tasklist_lock);
tracehook_finish_release_task(p);
__exit_signal(p);
@@ -985,6 +979,13 @@ NORET_TYPE void do_exit(long code)
module_put(tsk->binfmt->module);
proc_exit_connector(tsk);
+
+ /*
+ * Flush inherited counters to the parent - before the parent
+ * gets woken up by child-exit notifications.
+ */
+ perf_counter_exit_task(tsk);
+
exit_notify(tsk, group_dead);
#ifdef CONFIG_NUMA
mpol_put(tsk->mempolicy);
@@ -1257,12 +1258,6 @@ static int wait_task_zombie(struct task_struct *p, int options,
*/
read_unlock(&tasklist_lock);
- /*
- * Flush inherited counters to the parent - before the parent
- * gets woken up by child-exit notifications.
- */
- perf_counter_exit_task(p);
-
retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
status = (p->signal->flags & SIGNAL_GROUP_EXIT)
? p->signal->group_exit_code : p->exit_code;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 59a926d04baf..7af16d1c480f 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3299,6 +3299,8 @@ void perf_counter_exit_task(struct task_struct *child)
struct perf_counter *child_counter, *tmp;
struct perf_counter_context *child_ctx;
+ WARN_ON_ONCE(child != current);
+
child_ctx = &child->perf_counter_ctx;
if (likely(!child_ctx->nr_counters))
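The added WARN_ON_ONCE() turns the new lifetime rule into an executable assertion: perf_counter_exit_task() may only run on the exiting task's own context, i.e. with child == current. A rough userspace analogue of asserting such an ownership invariant (illustrative, not kernel code):

    #include <assert.h>
    #include <pthread.h>
    #include <stdio.h>

    /* The thread allowed to flush/free this state: its owner. */
    static pthread_t resource_owner;

    /* Mirror of the new rule: flushing + freeing is only done for a
     * task's own counters, never for other tasks. */
    static void flush_and_free_own_state(void)
    {
        assert(pthread_equal(pthread_self(), resource_owner));
        printf("flushing and freeing the caller's own state\n");
    }

    int main(void)
    {
        resource_owner = pthread_self();
        flush_and_free_own_state();  /* OK: the owner calls it */
        return 0;
    }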
commit 4200efd9acda4accf24640f1e77d24fdcdb524df
Author: Ingo Molnar <mingo@elte.hu>
Date: Tue May 19 09:22:19 2009 +0200
sched: properly define the sched_group::cpumask and sched_domain::span fields
Properly document the variable-size structure tricks we are doing
wrt. struct sched_group and sched_domain, and use the field[0] GCC
extension instead of defining a VLA.
Don't use unions for this, as pointed out by Linus.
[ Impact: cleanup, un-confuse Sparse and LLVM ]
Reported-by: Jeff Garzik <jeff@garzik.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <alpine.LFD.2.01.0905180850110.3301@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index de7b3b217772..dbb1043e8656 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -839,7 +839,17 @@ struct sched_group {
*/
u32 reciprocal_cpu_power;
- unsigned long cpumask[];
+ /*
+ * The CPUs this group covers.
+ *
+ * NOTE: this field is variable length. (Allocated dynamically
+ * by attaching extra space to the end of the structure,
+ * depending on how many CPUs the kernel has booted up with)
+ *
+ * It can also be embedded into static data structures at build
+ * time. (See 'struct static_sched_group' in kernel/sched.c)
+ */
+ unsigned long cpumask[0];
};
static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
@@ -925,8 +935,17 @@ struct sched_domain {
char *name;
#endif
- /* span of all CPUs in this domain */
- unsigned long span[];
+ /*
+ * Span of all CPUs in this domain.
+ *
+ * NOTE: this field is variable length. (Allocated dynamically
+ * by attaching extra space to the end of the structure,
+ * depending on how many CPUs the kernel has booted up with)
+ *
+ * It can also be embedded into static data structures at build
+ * time. (See 'struct static_sched_domain' in kernel/sched.c)
+ */
+ unsigned long span[0];
};
static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
diff --git a/kernel/sched.c b/kernel/sched.c
index 497c09ba61e7..228acae8821f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7948,8 +7948,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
/*
* The cpus mask in sched_group and sched_domain hangs off the end.
- * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
- * for nr_cpu_ids < CONFIG_NR_CPUS.
+ *
+ * ( See the comments in include/linux/sched.h:struct sched_group
+ * and struct sched_domain. )
*/
struct static_sched_group {
struct sched_group sg;
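The mechanics behind the comment: a [0]-sized (or C99 flexible) trailing array contributes nothing to sizeof(), so the caller allocates the fixed part plus however many mask words the booted system needs, and the same struct can also sit at the front of a static wrapper that reserves full-size space at build time. A standalone sketch of the dynamic-allocation half (hypothetical struct, not the scheduler's):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct group {
        unsigned int weight;
        /* Variable length, sized at allocation time. [0] is the GCC
         * extension the patch uses; plain [] is the C99 spelling. */
        unsigned long cpumask[0];
    };

    int main(void)
    {
        int nr_words = 4;  /* enough mask words for the booted CPUs */
        struct group *g;

        /* fixed part + trailing space for the mask, in one block */
        g = malloc(sizeof(*g) + nr_words * sizeof(unsigned long));
        if (!g)
            return 1;

        g->weight = 1024;
        memset(g->cpumask, 0, nr_words * sizeof(unsigned long));
        g->cpumask[0] = 0x3;  /* CPUs 0 and 1 */

        printf("mask word 0: %#lx\n", g->cpumask[0]);
        free(g);
        return 0;
    }

The static-embedding half corresponds to struct static_sched_group in kernel/sched.c, which reserves a full-size bitmap immediately after the sg member.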
commit 1079cac0f4eb7d968395378b1625979d4c818dd6
Merge: 5872144f64b3 1406de8e11eb
Author: Ingo Molnar <mingo@elte.hu>
Date: Mon May 18 10:15:09 2009 +0200
Merge commit 'v2.6.30-rc6' into tracing/core
Merge reason: we were on an -rc4 base, sync up to -rc6
Signed-off-by: Ingo Molnar <mingo@elte.hu>
commit b68f1d2e7aa21029d73c7d453a8046e95d351740
Author: Ingo Molnar <mingo@elte.hu>
Date: Sun May 17 19:37:25 2009 +0200
perf_counter, x86: speed up the scheduling fast-path
We have to set up the LVT entry only at counter init time, not at
every switch-in time.
There's friction between NMI and non-NMI use here - we'll probably
remove the per-counter configurability of it - but until then, don't
slow things down ...
[ Impact: micro-optimization ]
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 5bfd30ab3920..c109819c2cb9 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -285,6 +285,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
return -EACCES;
hwc->nmi = 1;
}
+ perf_counters_lapic_init(hwc->nmi);
if (!hwc->irq_period)
hwc->irq_period = x86_pmu.max_period;
@@ -603,8 +604,6 @@ static int x86_pmu_enable(struct perf_counter *counter)
hwc->counter_base = x86_pmu.perfctr;
}
- perf_counters_lapic_init(hwc->nmi);
-
x86_pmu.disable(hwc, idx);
cpuc->counters[idx] = counter;
@@ -1054,7 +1053,7 @@ void __init init_hw_perf_counters(void)
pr_info("... counter mask: %016Lx\n", perf_counter_mask);
- perf_counters_lapic_init(0);
+ perf_counters_lapic_init(1);
register_die_notifier(&perf_counter_nmi_notifier);
}
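The optimization itself is the standard hoist of invariant setup out of a hot path: the LVT programming depends only on the counter's NMI flag, so it belongs in __hw_perf_counter_init() rather than in x86_pmu_enable(), which runs on every switch-in. A generic sketch of the pattern (names are illustrative, not the kernel's):

    #include <stdbool.h>
    #include <stdio.h>

    static bool lvt_programmed;

    /* One-time hardware setup; before the patch this ran on every
     * counter enable, i.e. in the scheduling fast path. */
    static void program_lvt_entry(bool nmi)
    {
        lvt_programmed = true;
        printf("LVT entry programmed (nmi=%d)\n", nmi);
    }

    /* Init-time path: do the invariant setup exactly once here. */
    static void counter_init(bool nmi)
    {
        program_lvt_entry(nmi);
    }

    /* Fast path: runs on every switch-in, now setup-free. */
    static void counter_enable(void)
    {
        if (!lvt_programmed)
            fprintf(stderr, "BUG: enable before init\n");
    }

    int main(void)
    {
        counter_init(true);
        for (int i = 0; i < 3; i++)
            counter_enable();  /* no per-switch-in setup cost */
        return 0;
    }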
commit b286e21868ea1af724a7a4802da2c8e144fa70de
Merge: ed077b58f614 1406de8e11eb
Author: Ingo Molnar <mingo@elte.hu>
Date: Mon May 18 09:12:45 2009 +0200
Merge commit 'v2.6.30-rc6' into x86/mm
Merge reason: sync up to -rc6, which has changes to mm/ that we are
going to touch in the commits to follow as well.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
commit dc3f81b129b5439ba7bac265bbc6a51a39275dae
Merge: d2517a49d555 1406de8e11eb
Author: Ingo Molnar <mingo@elte.hu>
Date: Mon May 18 07:37:44 2009 +0200
Merge commit 'v2.6.30-rc6' into perfcounters/core
Merge reason: this branch was on an -rc4 base, merge it up to -rc6
to get the latest upstream fixes.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
commit d2517a49d55536b38c7a87e5289550cfedaa4dcc
Author: Ingo Molnar <mingo@elte.hu>
Date: Sun May 17 10:04:45 2009 +0200
perf_counter, x86: fix zero irq_period counters
The quirk to irq_period unearthed a robustness problem in the
hw_counter initialization sequence: we left irq_period at 0, which
was then quirked up to 2 ... which then generated a _lot_ of
interrupts during 'perf stat' runs, slowed them down, and skewed
the counter results in general.
Initialize irq_period to the maximum instead.
[ Impact: fix perf stat results ]
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 886dcf334bc3..5bfd30ab3920 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -286,6 +286,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
hwc->nmi = 1;
}
+ if (!hwc->irq_period)
+ hwc->irq_period = x86_pmu.max_period;
+
atomic64_set(&hwc->period_left,
min(x86_pmu.max_period, hwc->irq_period));
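The defaulting pattern, in isolation: a zero period must be replaced before any later quirk can turn it into a tiny value that floods the CPU with PMU interrupts. A small standalone sketch (MAX_PERIOD is a stand-in for x86_pmu.max_period):

    #include <stdio.h>

    /* Stand-in for x86_pmu.max_period. */
    #define MAX_PERIOD (1ULL << 31)

    static unsigned long long sanitize_period(unsigned long long period)
    {
        /* A zero period would later be quirked up to a tiny value
         * and generate interrupts at a huge rate; default to the
         * maximum instead, then clamp. */
        if (!period)
            period = MAX_PERIOD;
        if (period > MAX_PERIOD)
            period = MAX_PERIOD;
        return period;
    }

    int main(void)
    {
        printf("period(0)      -> %llu\n", sanitize_period(0));
        printf("period(100000) -> %llu\n", sanitize_period(100000));
        return 0;
    }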
commit 0203026b58b4299ba7281c0b4b417207c1f05d0e
Author: Ingo Molnar <mingo@elte.hu>
Date: Sun May 17 11:24:08 2009 +0200
perf_counter: fix threaded task exit
Flushing counters in __exit_signal() with irqs disabled is not
a good idea, as perf_counter_exit_task() acquires mutexes. So
flush them before acquiring the tasklist lock.
(Note, we still need a fix for when the PID has been unhashed.)
[ Impact: fix crash with inherited counters ]
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/exit.c b/kernel/exit.c
index 16d74f13a3e7..73affd35e76d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -128,12 +128,6 @@ static void __exit_signal(struct task_struct *tsk)
sig = NULL; /* Marker for below. */
}
- /*
- * Flush inherited counters to the parent - before the parent
- * gets woken up by child-exit notifications.
- */
- perf_counter_exit_task(tsk);
-
__unhash_process(tsk);
/*
@@ -183,6 +177,13 @@ void release_task(struct task_struct * p)
atomic_dec(&__task_cred(p)->user->processes);
proc_flush_task(p);
+
+ /*
+ * Flush inherited counters to the parent - before the parent
+ * gets woken up by child-exit notifications.
+ */
+ perf_counter_exit_task(p);
+
write_lock_irq(&tasklist_lock);
tracehook_finish_release_task(p);
__exit_signal(p);
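The underlying rule: mutexes can sleep, and sleeping is forbidden while interrupts are disabled (as they are under write_lock_irq(&tasklist_lock)), so any mutex-taking work has to happen before the irq-disabling lock is taken. A schematic userspace analogue of the corrected ordering (pthreads standing in for kernel locks):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t ctx_mutex = PTHREAD_MUTEX_INITIALIZER;

    /* Analogue of perf_counter_exit_task(): takes a sleeping lock. */
    static void flush_counters(void)
    {
        pthread_mutex_lock(&ctx_mutex);  /* may block/sleep */
        printf("flushing child counters to the parent\n");
        pthread_mutex_unlock(&ctx_mutex);
    }

    int main(void)
    {
        /* Corrected ordering, as in the patch: the blocking work
         * happens first; only afterwards would the irq-disabling
         * lock (write_lock_irq(&tasklist_lock) in the kernel) be
         * taken. The bug was calling flush_counters() inside that
         * irqs-off critical section. */
        flush_counters();
        return 0;
    }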
commit 58d7e993b16b62d30f8ef27757614056fe4def11
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri May 15 11:03:23 2009 +0200
perf stat: handle Ctrl-C
Before this change, if a long-running perf stat workload was Ctrl-C-ed,
the utility exited without displaying statistics.
After the change, the Ctrl-C gets propagated into the workload (and
causes its early exit there), but perf stat itself will still continue
to run and will display counter results.
This is useful for running open-ended workloads: let them run for
a while, then Ctrl-C them to get the stats.
[ Impact: extend perf stat with new functionality ]
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index cf575c305a6c..03518d75d864 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -538,8 +538,14 @@ static void process_options(int argc, char **argv)
}
}
+static void skip_signal(int signo)
+{
+}
+
int cmd_stat(int argc, char **argv, const char *prefix)
{
+ sigset_t blocked;
+
page_size = sysconf(_SC_PAGE_SIZE);
process_options(argc, argv);
@@ -548,5 +554,15 @@ int cmd_stat(int argc, char **argv, const char *prefix)
assert(nr_cpus <= MAX_NR_CPUS);
assert(nr_cpus >= 0);
+ /*
+ * We don't want to block the signals - that would cause
+ * child tasks to inherit the blocked mask and Ctrl-C would
+ * not work. What we want is for Ctrl-C to work in the
+ * exec()-ed task, while being ignored by perf stat itself:
+ */
+ signal(SIGINT, skip_signal);
+ signal(SIGALRM, skip_signal);
+ signal(SIGABRT, skip_signal);
+
return do_perfstat(argc, argv);
}
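The subtlety here is the difference between handling a signal and blocking it: a caught handler installed with signal() reverts to SIG_DFL across exec(), so the child still dies on Ctrl-C, whereas a blocked signal mask would be inherited by the exec()-ed child and break Ctrl-C there. A standalone sketch of the same setup (illustrative, using sleep as the workload):

    #include <signal.h>
    #include <stdio.h>
    #include <sys/wait.h>
    #include <unistd.h>

    /* No-op handler: the parent survives SIGINT without blocking it. */
    static void skip_signal(int signo)
    {
        (void)signo;
    }

    int main(void)
    {
        int status;
        pid_t pid;

        /* A caught handler reverts to SIG_DFL across exec(), so the
         * child dies on Ctrl-C; a blocked mask would be inherited
         * and Ctrl-C would stop working in the child. */
        signal(SIGINT, skip_signal);

        pid = fork();
        if (pid == 0) {
            execlp("sleep", "sleep", "30", (char *)NULL);
            _exit(127);
        }

        /* On Ctrl-C the sleep dies, but this process survives and
         * can still print its summary, just like perf stat. */
        waitpid(pid, &status, 0);
        printf("child exited; parent alive to report results\n");
        return 0;
    }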