Patches contributed by Eötvös Loránd University


commit c44d70a340554a33071339064a303ac0f1a31623
Author: Ingo Molnar <mingo@elte.hu>
Date:   Sun May 17 11:24:08 2009 +0200

    perf_counter: fix counter inheritance race
    
    Context rotation should not occur while we are in the middle of
    walking the counter list to inherit counters ...
    
    [ Impact: fix occasionally incorrect perf stat results ]
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Marcelo Tosatti <mtosatti@redhat.com>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index c8c1dfc22c93..13cb2fbbf334 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -508,6 +508,7 @@ struct perf_counter_context {
 	int			nr_counters;
 	int			nr_active;
 	int			is_active;
+	int			rr_allowed;
 	struct task_struct	*task;
 
 	/*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 7af16d1c480f..4d8f97375f3a 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1120,7 +1120,8 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 	__perf_counter_task_sched_out(ctx);
 
 	rotate_ctx(&cpuctx->ctx);
-	rotate_ctx(ctx);
+	if (ctx->rr_allowed)
+		rotate_ctx(ctx);
 
 	perf_counter_cpu_sched_in(cpuctx, cpu);
 	perf_counter_task_sched_in(curr, cpu);
@@ -3108,6 +3109,7 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
 	mutex_init(&ctx->mutex);
 	INIT_LIST_HEAD(&ctx->counter_list);
 	INIT_LIST_HEAD(&ctx->event_list);
+	ctx->rr_allowed = 1;
 	ctx->task = task;
 }
 
@@ -3348,6 +3350,9 @@ void perf_counter_init_task(struct task_struct *child)
 	 */
 	mutex_lock(&parent_ctx->mutex);
 
+	parent_ctx->rr_allowed = 0;
+	barrier(); /* irqs */
+
 	/*
 	 * We dont have to disable NMIs - we are only looking at
 	 * the list, not manipulating it:
@@ -3361,6 +3366,9 @@ void perf_counter_init_task(struct task_struct *child)
 			break;
 	}
 
+	barrier(); /* irqs */
+	parent_ctx->rr_allowed = 1;
+
 	mutex_unlock(&parent_ctx->mutex);
 }
 

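The pattern this fix introduces is worth seeing in isolation: an opt-out flag that the tick-driven rotation path tests, and that the inheritance code clears for the duration of its list walk. Below is a minimal user-space sketch of the idea - the struct, field and function names are simplified stand-ins, not the kernel's own:

struct counter {
	struct counter *next;
};

struct ctx {
	int rr_allowed;			/* 0 while somebody walks the list */
	struct counter *head;
};

/* Tick path: round-robin rotation is skipped while a walk is in flight. */
static void tick_rotate(struct ctx *ctx)
{
	if (!ctx->rr_allowed)
		return;
	if (ctx->head)
		ctx->head = ctx->head->next;	/* crude stand-in for rotate_ctx() */
}

/* Inheritance path: fence off rotation around the traversal. */
static void inherit_counters(struct ctx *parent)
{
	struct counter *c;

	parent->rr_allowed = 0;
	/* the patch uses barrier() here so the compiler cannot
	 * reorder these stores relative to the walk below */
	for (c = parent->head; c; c = c->next)
		;	/* ... clone each counter into the child ... */
	parent->rr_allowed = 1;
}

Note that rr_allowed is not a general lock: in the patch it is a plain int flipped around the walk, with barrier() keeping the compiler from moving the stores across the traversal.
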
commit 33b2fb303fe7f6b08bbb32f708e67b96eaa94a7a
Author: Ingo Molnar <mingo@elte.hu>
Date:   Sun May 17 11:08:41 2009 +0200

    perf_counter: fix counter freeing logic
    
    Fix counter lifetime bugs which explain the crashes reported by
    Marcelo Tosatti and Arnaldo Carvalho de Melo.
    
    The new rule is: flushing + freeing is only done for a task's
    own counters, never for another task's.
    
    [ Impact: fix crashes/lockups with inherited counters ]
    
    Reported-by: Arnaldo Carvalho de Melo <acme@redhat.com>
    Reported-by: Marcelo Tosatti <mtosatti@redhat.com>
    Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/exit.c b/kernel/exit.c
index 73affd35e76d..f9dfedd94af0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -178,12 +178,6 @@ void release_task(struct task_struct * p)
 
 	proc_flush_task(p);
 
-	/*
-	 * Flush inherited counters to the parent - before the parent
-	 * gets woken up by child-exit notifications.
-	 */
-	perf_counter_exit_task(p);
-
 	write_lock_irq(&tasklist_lock);
 	tracehook_finish_release_task(p);
 	__exit_signal(p);
@@ -985,6 +979,13 @@ NORET_TYPE void do_exit(long code)
 		module_put(tsk->binfmt->module);
 
 	proc_exit_connector(tsk);
+
+	/*
+	 * Flush inherited counters to the parent - before the parent
+	 * gets woken up by child-exit notifications.
+	 */
+	perf_counter_exit_task(tsk);
+
 	exit_notify(tsk, group_dead);
 #ifdef CONFIG_NUMA
 	mpol_put(tsk->mempolicy);
@@ -1257,12 +1258,6 @@ static int wait_task_zombie(struct task_struct *p, int options,
 	 */
 	read_unlock(&tasklist_lock);
 
-	/*
-	 * Flush inherited counters to the parent - before the parent
-	 * gets woken up by child-exit notifications.
-	 */
-	perf_counter_exit_task(p);
-
 	retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
 	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
 		? p->signal->group_exit_code : p->exit_code;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 59a926d04baf..7af16d1c480f 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3299,6 +3299,8 @@ void perf_counter_exit_task(struct task_struct *child)
 	struct perf_counter *child_counter, *tmp;
 	struct perf_counter_context *child_ctx;
 
+	WARN_ON_ONCE(child != current);
+
 	child_ctx = &child->perf_counter_ctx;
 
 	if (likely(!child_ctx->nr_counters))

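The WARN_ON_ONCE() encodes the new rule directly: release_task() and wait_task_zombie() run in the parent's context, so the flush there operated on another task's counters, while do_exit() runs in the exiting task itself. As a generic shape, with user-space stand-ins for current and the kernel's warning macro:

#include <assert.h>
#include <pthread.h>

struct task {
	pthread_t thread;
	/* ... per-task counter context ... */
};

/*
 * Owner-only teardown: flushing + freeing is done only for the
 * task's own counters, so callers must pass the current task.
 */
static void counters_exit_task(struct task *tsk)
{
	assert(pthread_equal(tsk->thread, pthread_self()));
	/* ... flush inherited counts to the parent, then free ... */
}
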
commit 4200efd9acda4accf24640f1e77d24fdcdb524df
Author: Ingo Molnar <mingo@elte.hu>
Date:   Tue May 19 09:22:19 2009 +0200

    sched: properly define the sched_group::cpumask and sched_domain::span fields
    
    Properly document the variable-size structure tricks we are doing
    wrt. struct sched_group and sched_domain, and use the field[0] GCC
    extension instead of a C99 flexible array member.
    
    Don't use unions for this, as pointed out by Linus.
    
    [ Impact: cleanup, un-confuse Sparse and LLVM ]
    
    Reported-by: Jeff Garzik <jeff@garzik.org>
    Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
    LKML-Reference: <alpine.LFD.2.01.0905180850110.3301@localhost.localdomain>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index de7b3b217772..dbb1043e8656 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -839,7 +839,17 @@ struct sched_group {
 	 */
 	u32 reciprocal_cpu_power;
 
-	unsigned long cpumask[];
+	/*
+	 * The CPUs this group covers.
+	 *
+	 * NOTE: this field is variable length. (Allocated dynamically
+	 * by attaching extra space to the end of the structure,
+	 * depending on how many CPUs the kernel has booted up with)
+	 *
+	 * It can also be embedded into static data structures at build
+	 * time. (See 'struct static_sched_group' in kernel/sched.c)
+	 */
+	unsigned long cpumask[0];
 };
 
 static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
@@ -925,8 +935,17 @@ struct sched_domain {
 	char *name;
 #endif
 
-	/* span of all CPUs in this domain */
-	unsigned long span[];
+	/*
+	 * Span of all CPUs in this domain.
+	 *
+	 * NOTE: this field is variable length. (Allocated dynamically
+	 * by attaching extra space to the end of the structure,
+	 * depending on how many CPUs the kernel has booted up with)
+	 *
+	 * It can also be embedded into static data structures at build
+	 * time. (See 'struct static_sched_domain' in kernel/sched.c)
+	 */
+	unsigned long span[0];
 };
 
 static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
diff --git a/kernel/sched.c b/kernel/sched.c
index 497c09ba61e7..228acae8821f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7948,8 +7948,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 
 /*
  * The cpus mask in sched_group and sched_domain hangs off the end.
- * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
- * for nr_cpu_ids < CONFIG_NR_CPUS.
+ *
+ * ( See the comments in include/linux/sched.h:struct sched_group
+ *   and struct sched_domain. )
  */
 struct static_sched_group {
 	struct sched_group sg;

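The field[0] trick is easiest to see outside the scheduler: the structure's last member has zero declared size, and the real storage is attached by over-allocating. A small stand-alone sketch, using plain calloc() in place of the kernel allocators and a made-up word count:

#include <stdlib.h>

struct group {
	int weight;
	unsigned long mask[0];	/* GCC extension: storage hangs off the end */
};

static struct group *alloc_group(size_t nwords)
{
	/* header plus nwords of bitmap, as a single allocation */
	struct group *g = calloc(1, sizeof(*g) + nwords * sizeof(unsigned long));

	if (g)
		g->mask[0] = 1UL;	/* e.g. mark CPU 0 as covered */
	return g;
}

Unlike the C99 flexible array member (mask[]) it replaces, a [0]-sized array may also sit in a structure that is itself embedded in another one - which is what 'struct static_sched_group' in kernel/sched.c relies on.
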
commit 1079cac0f4eb7d968395378b1625979d4c818dd6
Merge: 5872144f64b3 1406de8e11eb
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon May 18 10:15:09 2009 +0200

    Merge commit 'v2.6.30-rc6' into tracing/core
    
    Merge reason: we were on an -rc4 base, sync up to -rc6
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

commit b68f1d2e7aa21029d73c7d453a8046e95d351740
Author: Ingo Molnar <mingo@elte.hu>
Date:   Sun May 17 19:37:25 2009 +0200

    perf_counter, x86: speed up the scheduling fast-path
    
    We have to set up the LVT entry only at counter init time, not at
    every switch-in time.
    
    There's friction between NMI and non-NMI use here - we'll probably
    remove the per-counter configurability of it - but until then, don't
    slow things down ...
    
    [ Impact: micro-optimization ]
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Marcelo Tosatti <mtosatti@redhat.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 5bfd30ab3920..c109819c2cb9 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -285,6 +285,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 			return -EACCES;
 		hwc->nmi = 1;
 	}
+	perf_counters_lapic_init(hwc->nmi);
 
 	if (!hwc->irq_period)
 		hwc->irq_period = x86_pmu.max_period;
@@ -603,8 +604,6 @@ static int x86_pmu_enable(struct perf_counter *counter)
 		hwc->counter_base = x86_pmu.perfctr;
 	}
 
-	perf_counters_lapic_init(hwc->nmi);
-
 	x86_pmu.disable(hwc, idx);
 
 	cpuc->counters[idx] = counter;
@@ -1054,7 +1053,7 @@ void __init init_hw_perf_counters(void)
 
 	pr_info("... counter mask:    %016Lx\n", perf_counter_mask);
 
-	perf_counters_lapic_init(0);
+	perf_counters_lapic_init(1);
 	register_die_notifier(&perf_counter_nmi_notifier);
 }
 

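The change is a classic hoist of invariant work out of a hot path: for a given counter, the LVT setup produces the same result on every call, so it belongs in the one-time init rather than in every enable. Schematically, with hypothetical names standing in for the kernel's:

struct counter { int nmi; };

/* stand-ins for perf_counters_lapic_init() and the PMU programming */
static void lvt_setup(int nmi)			{ (void)nmi; }
static void program_hw(struct counter *c)	{ (void)c; }

/* before: one-time setup paid on every enable (the scheduling fast path) */
static void enable_slow(struct counter *c)
{
	lvt_setup(c->nmi);	/* redundant after the first call */
	program_hw(c);
}

/* after: setup runs once at counter creation; enable stays lean */
static void counter_init(struct counter *c)
{
	lvt_setup(c->nmi);
}

static void enable_fast(struct counter *c)
{
	program_hw(c);
}
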
commit b286e21868ea1af724a7a4802da2c8e144fa70de
Merge: ed077b58f614 1406de8e11eb
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon May 18 09:12:45 2009 +0200

    Merge commit 'v2.6.30-rc6' into x86/mm
    
    Merge reason: sync up to -rc6, which has changes to mm/ that we are
                  going to touch in the commits to follow as well.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

commit dc3f81b129b5439ba7bac265bbc6a51a39275dae
Merge: d2517a49d555 1406de8e11eb
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon May 18 07:37:44 2009 +0200

    Merge commit 'v2.6.30-rc6' into perfcounters/core
    
    Merge reason: this branch was on an -rc4 base, merge it up to -rc6
                  to get the latest upstream fixes.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

commit d2517a49d55536b38c7a87e5289550cfedaa4dcc
Author: Ingo Molnar <mingo@elte.hu>
Date:   Sun May 17 10:04:45 2009 +0200

    perf_counter, x86: fix zero irq_period counters
    
    The quirk to irq_period unearthed a robustness problem we had in
    the hw_counter initialization sequence: we left irq_period at 0,
    which was then quirked up to 2 ... which then generated a _lot_ of
    interrupts during 'perf stat' runs, slowed them down and skewed
    the counter results in general.
    
    Initialize irq_period to the maximum instead.
    
    [ Impact: fix perf stat results ]
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Marcelo Tosatti <mtosatti@redhat.com>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 886dcf334bc3..5bfd30ab3920 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -286,6 +286,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		hwc->nmi = 1;
 	}
 
+	if (!hwc->irq_period)
+		hwc->irq_period = x86_pmu.max_period;
+
 	atomic64_set(&hwc->period_left,
 			min(x86_pmu.max_period, hwc->irq_period));
 

commit 0203026b58b4299ba7281c0b4b417207c1f05d0e
Author: Ingo Molnar <mingo@elte.hu>
Date:   Sun May 17 11:24:08 2009 +0200

    perf_counter: fix threaded task exit
    
    Flushing counters in __exit_signal() with irqs disabled is not
    a good idea, as perf_counter_exit_task() acquires mutexes. So
    do the flush before acquiring the tasklist lock.
    
    (Note, we still need a fix for when the PID has been unhashed.)
    
    [ Impact: fix crash with inherited counters ]
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Marcelo Tosatti <mtosatti@redhat.com>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/exit.c b/kernel/exit.c
index 16d74f13a3e7..73affd35e76d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -128,12 +128,6 @@ static void __exit_signal(struct task_struct *tsk)
 		sig = NULL; /* Marker for below. */
 	}
 
-	/*
-	 * Flush inherited counters to the parent - before the parent
-	 * gets woken up by child-exit notifications.
-	 */
-	perf_counter_exit_task(tsk);
-
 	__unhash_process(tsk);
 
 	/*
@@ -183,6 +177,13 @@ void release_task(struct task_struct * p)
 	atomic_dec(&__task_cred(p)->user->processes);
 
 	proc_flush_task(p);
+
+	/*
+	 * Flush inherited counters to the parent - before the parent
+	 * gets woken up by child-exit notifications.
+	 */
+	perf_counter_exit_task(p);
+
 	write_lock_irq(&tasklist_lock);
 	tracehook_finish_release_task(p);
 	__exit_signal(p);

commit 58d7e993b16b62d30f8ef27757614056fe4def11
Author: Ingo Molnar <mingo@elte.hu>
Date:   Fri May 15 11:03:23 2009 +0200

    perf stat: handle Ctrl-C
    
    Before this change, if a long-running perf stat workload was Ctrl-C-ed,
    the utility exited without displaying statistics.
    
    After the change, Ctrl-C is propagated into the workload (causing
    its early exit there), while perf stat itself continues to run and
    displays the counter results.
    
    This is useful for running open-ended workloads: let them run for
    a while, then Ctrl-C them to get the stats.
    
    [ Impact: extend perf stat with new functionality ]
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index cf575c305a6c..03518d75d864 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -538,8 +538,14 @@ static void process_options(int argc, char **argv)
 	}
 }
 
+static void skip_signal(int signo)
+{
+}
+
 int cmd_stat(int argc, char **argv, const char *prefix)
 {
+	sigset_t blocked;
+
 	page_size = sysconf(_SC_PAGE_SIZE);
 
 	process_options(argc, argv);
@@ -548,5 +554,15 @@ int cmd_stat(int argc, char **argv, const char *prefix)
 	assert(nr_cpus <= MAX_NR_CPUS);
 	assert(nr_cpus >= 0);
 
+	/*
+	 * We don't want to block the signals - that would cause
+	 * child tasks to inherit them and Ctrl-C would not work.
+	 * What we want is for Ctrl-C to work in the exec()-ed
+	 * task, while being ignored by perf stat itself:
+	 */
+	signal(SIGINT,  skip_signal);
+	signal(SIGALRM, skip_signal);
+	signal(SIGABRT, skip_signal);
+
 	return do_perfstat(argc, argv);
 }
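
The reason for a handler rather than a blocked mask generalizes: a blocked-signal mask survives execve() and would be inherited by the workload, while a caught handler is reset to SIG_DFL across exec - so only perf stat itself ignores the Ctrl-C. A stand-alone sketch of the same structure, with a hypothetical 'sleep 1000' standing in for the measured workload:

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

static void skip_signal(int signo)
{
	(void)signo;	/* deliberately empty: the parent survives Ctrl-C */
}

int main(void)
{
	int status;
	pid_t pid;

	signal(SIGINT, skip_signal);	/* a handler, not a blocked mask */

	pid = fork();
	if (pid < 0)
		return 1;
	if (pid == 0) {
		/* across exec the caught handler reverts to SIG_DFL,
		 * so Ctrl-C still terminates the workload normally */
		execlp("sleep", "sleep", "1000", (char *)NULL);
		_exit(127);
	}

	while (waitpid(pid, &status, 0) < 0 && errno == EINTR)
		continue;	/* interrupted by the Ctrl-C we just survived */

	printf("workload done - perf stat prints its counters here\n");
	return 0;
}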