Patches contributed by Eötvös Loránd University
commit 235c7fc7c500e4fd1700c4ad01b5612bcdc1b449
Author: Ingo Molnar <mingo@elte.hu>
Date: Sun Dec 21 14:43:25 2008 +0100
perfcounters: generalize the counter scheduler
Impact: clean up and refactor code
refactor the counter scheduler: separate out in/out functions and
introduce a counter-rotation function as well.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
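The diff below is the actual refactoring; as a rough standalone illustration of the tick path it creates (schedule out, rotate the context, schedule back in), here is a minimal userspace C sketch of the rotation step only. The rotate() helper and the plain int array are hypothetical stand-ins, not kernel code, which operates on struct perf_counter lists under ctx->lock.

#include <stdio.h>

#define NR_COUNTERS 4

/* Rotate the first entry last, as rotate_ctx() does for the counter list. */
static void rotate(int ctr[], int n)
{
        int first = ctr[0];
        int i;

        for (i = 0; i < n - 1; i++)
                ctr[i] = ctr[i + 1];
        ctr[n - 1] = first;
}

int main(void)
{
        int counters[NR_COUNTERS] = { 1, 2, 3, 4 };     /* stand-ins for counters */
        int i;

        /* tick path: sched_out(ctx); rotate_ctx(ctx); sched_in(ctx); */
        rotate(counters, NR_COUNTERS);

        for (i = 0; i < NR_COUNTERS; i++)
                printf("%d ", counters[i]);             /* prints: 2 3 4 1 */
        printf("\n");

        return 0;
}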
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 48e1dbcdc1cd..d7a79f321b1c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -111,11 +111,12 @@ static void __perf_counter_remove_from_context(void *info)
spin_lock(&ctx->lock);
if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
- counter->hw_ops->disable(counter);
counter->state = PERF_COUNTER_STATE_INACTIVE;
+ counter->hw_ops->disable(counter);
ctx->nr_active--;
cpuctx->active_oncpu--;
counter->task = NULL;
+ counter->oncpu = -1;
}
ctx->nr_counters--;
@@ -192,8 +193,36 @@ static void perf_counter_remove_from_context(struct perf_counter *counter)
spin_unlock_irq(&ctx->lock);
}
+static int
+counter_sched_in(struct perf_counter *counter,
+ struct perf_cpu_context *cpuctx,
+ struct perf_counter_context *ctx,
+ int cpu)
+{
+ if (counter->state == PERF_COUNTER_STATE_OFF)
+ return 0;
+
+ counter->state = PERF_COUNTER_STATE_ACTIVE;
+ counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
+ /*
+ * The new state must be visible before we turn it on in the hardware:
+ */
+ smp_wmb();
+
+ if (counter->hw_ops->enable(counter)) {
+ counter->state = PERF_COUNTER_STATE_INACTIVE;
+ counter->oncpu = -1;
+ return -EAGAIN;
+ }
+
+ cpuctx->active_oncpu++;
+ ctx->nr_active++;
+
+ return 0;
+}
+
/*
- * Cross CPU call to install and enable a preformance counter
+ * Cross CPU call to install and enable a performance counter
*/
static void __perf_install_in_context(void *info)
{
@@ -220,22 +249,17 @@ static void __perf_install_in_context(void *info)
* counters on a global level. NOP for non NMI based counters.
*/
perf_flags = hw_perf_save_disable();
- list_add_counter(counter, ctx);
- hw_perf_restore(perf_flags);
+ list_add_counter(counter, ctx);
ctx->nr_counters++;
- if (cpuctx->active_oncpu < perf_max_counters) {
- counter->state = PERF_COUNTER_STATE_ACTIVE;
- counter->oncpu = cpu;
- ctx->nr_active++;
- cpuctx->active_oncpu++;
- counter->hw_ops->enable(counter);
- }
+ counter_sched_in(counter, cpuctx, ctx, cpu);
if (!ctx->task && cpuctx->max_pertask)
cpuctx->max_pertask--;
+ hw_perf_restore(perf_flags);
+
spin_unlock(&ctx->lock);
curr_rq_unlock_irq_restore(&flags);
}
@@ -302,8 +326,8 @@ counter_sched_out(struct perf_counter *counter,
if (counter->state != PERF_COUNTER_STATE_ACTIVE)
return;
- counter->hw_ops->disable(counter);
counter->state = PERF_COUNTER_STATE_INACTIVE;
+ counter->hw_ops->disable(counter);
counter->oncpu = -1;
cpuctx->active_oncpu--;
@@ -326,6 +350,22 @@ group_sched_out(struct perf_counter *group_counter,
counter_sched_out(counter, cpuctx, ctx);
}
+void __perf_counter_sched_out(struct perf_counter_context *ctx,
+ struct perf_cpu_context *cpuctx)
+{
+ struct perf_counter *counter;
+
+ if (likely(!ctx->nr_counters))
+ return;
+
+ spin_lock(&ctx->lock);
+ if (ctx->nr_active) {
+ list_for_each_entry(counter, &ctx->counter_list, list_entry)
+ group_sched_out(counter, cpuctx, ctx);
+ }
+ spin_unlock(&ctx->lock);
+}
+
/*
* Called from scheduler to remove the counters of the current task,
* with interrupts disabled.
@@ -341,39 +381,18 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu)
{
struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
struct perf_counter_context *ctx = &task->perf_counter_ctx;
- struct perf_counter *counter;
if (likely(!cpuctx->task_ctx))
return;
- spin_lock(&ctx->lock);
- if (ctx->nr_active) {
- list_for_each_entry(counter, &ctx->counter_list, list_entry)
- group_sched_out(counter, cpuctx, ctx);
- }
- spin_unlock(&ctx->lock);
+ __perf_counter_sched_out(ctx, cpuctx);
+
cpuctx->task_ctx = NULL;
}
-static int
-counter_sched_in(struct perf_counter *counter,
- struct perf_cpu_context *cpuctx,
- struct perf_counter_context *ctx,
- int cpu)
+static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
{
- if (counter->state == PERF_COUNTER_STATE_OFF)
- return 0;
-
- if (counter->hw_ops->enable(counter))
- return -EAGAIN;
-
- counter->state = PERF_COUNTER_STATE_ACTIVE;
- counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
-
- cpuctx->active_oncpu++;
- ctx->nr_active++;
-
- return 0;
+ __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
}
static int
@@ -416,21 +435,10 @@ group_sched_in(struct perf_counter *group_counter,
return -EAGAIN;
}
-/*
- * Called from scheduler to add the counters of the current task
- * with interrupts disabled.
- *
- * We restore the counter value and then enable it.
- *
- * This does not protect us against NMI, but enable()
- * sets the enabled bit in the control field of counter _before_
- * accessing the counter control register. If a NMI hits, then it will
- * keep the counter running.
- */
-void perf_counter_task_sched_in(struct task_struct *task, int cpu)
+static void
+__perf_counter_sched_in(struct perf_counter_context *ctx,
+ struct perf_cpu_context *cpuctx, int cpu)
{
- struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
- struct perf_counter_context *ctx = &task->perf_counter_ctx;
struct perf_counter *counter;
if (likely(!ctx->nr_counters))
@@ -453,10 +461,35 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu)
break;
}
spin_unlock(&ctx->lock);
+}
+/*
+ * Called from scheduler to add the counters of the current task
+ * with interrupts disabled.
+ *
+ * We restore the counter value and then enable it.
+ *
+ * This does not protect us against NMI, but enable()
+ * sets the enabled bit in the control field of counter _before_
+ * accessing the counter control register. If a NMI hits, then it will
+ * keep the counter running.
+ */
+void perf_counter_task_sched_in(struct task_struct *task, int cpu)
+{
+ struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+ struct perf_counter_context *ctx = &task->perf_counter_ctx;
+
+ __perf_counter_sched_in(ctx, cpuctx, cpu);
cpuctx->task_ctx = ctx;
}
+static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
+{
+ struct perf_counter_context *ctx = &cpuctx->ctx;
+
+ __perf_counter_sched_in(ctx, cpuctx, cpu);
+}
+
int perf_counter_task_disable(void)
{
struct task_struct *curr = current;
@@ -514,6 +547,8 @@ int perf_counter_task_enable(void)
/* force the update of the task clock: */
__task_delta_exec(curr, 1);
+ perf_counter_task_sched_out(curr, cpu);
+
spin_lock(&ctx->lock);
/*
@@ -538,19 +573,18 @@ int perf_counter_task_enable(void)
return 0;
}
-void perf_counter_task_tick(struct task_struct *curr, int cpu)
+/*
+ * Round-robin a context's counters:
+ */
+static void rotate_ctx(struct perf_counter_context *ctx)
{
- struct perf_counter_context *ctx = &curr->perf_counter_ctx;
struct perf_counter *counter;
u64 perf_flags;
- if (likely(!ctx->nr_counters))
+ if (!ctx->nr_counters)
return;
- perf_counter_task_sched_out(curr, cpu);
-
spin_lock(&ctx->lock);
-
/*
* Rotate the first entry last (works just fine for group counters too):
*/
@@ -563,7 +597,24 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
hw_perf_restore(perf_flags);
spin_unlock(&ctx->lock);
+}
+
+void perf_counter_task_tick(struct task_struct *curr, int cpu)
+{
+ struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+ struct perf_counter_context *ctx = &curr->perf_counter_ctx;
+ const int rotate_percpu = 0;
+
+ if (rotate_percpu)
+ perf_counter_cpu_sched_out(cpuctx);
+ perf_counter_task_sched_out(curr, cpu);
+ if (rotate_percpu)
+ rotate_ctx(&cpuctx->ctx);
+ rotate_ctx(ctx);
+
+ if (rotate_percpu)
+ perf_counter_cpu_sched_in(cpuctx, cpu);
perf_counter_task_sched_in(curr, cpu);
}
@@ -905,8 +956,6 @@ static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
struct task_struct *curr = counter->task;
u64 delta;
- WARN_ON_ONCE(counter->task != current);
-
delta = __task_delta_exec(curr, update);
return curr->se.sum_exec_runtime + delta;
@@ -1160,6 +1209,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
counter->group_leader = group_leader;
counter->hw_ops = NULL;
+ counter->state = PERF_COUNTER_STATE_INACTIVE;
if (hw_event->disabled)
counter->state = PERF_COUNTER_STATE_OFF;
@@ -1331,35 +1381,49 @@ __perf_counter_exit_task(struct task_struct *child,
{
struct perf_counter *parent_counter;
u64 parent_val, child_val;
- unsigned long flags;
- u64 perf_flags;
/*
- * Disable and unlink this counter.
- *
- * Be careful about zapping the list - IRQ/NMI context
- * could still be processing it:
+ * If we do not self-reap then we have to wait for the
+ * child task to unschedule (it will happen for sure),
+ * so that its counter is at its final count. (This
+ * condition triggers rarely - child tasks usually get
+ * off their CPU before the parent has a chance to
+ * get this far into the reaping action)
*/
- curr_rq_lock_irq_save(&flags);
- perf_flags = hw_perf_save_disable();
-
- if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) {
+ if (child != current) {
+ wait_task_inactive(child, 0);
+ list_del_init(&child_counter->list_entry);
+ } else {
struct perf_cpu_context *cpuctx;
+ unsigned long flags;
+ u64 perf_flags;
+
+ /*
+ * Disable and unlink this counter.
+ *
+ * Be careful about zapping the list - IRQ/NMI context
+ * could still be processing it:
+ */
+ curr_rq_lock_irq_save(&flags);
+ perf_flags = hw_perf_save_disable();
cpuctx = &__get_cpu_var(perf_cpu_context);
- child_counter->hw_ops->disable(child_counter);
- child_counter->state = PERF_COUNTER_STATE_INACTIVE;
- child_counter->oncpu = -1;
+ if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) {
+ child_counter->state = PERF_COUNTER_STATE_INACTIVE;
+ child_counter->hw_ops->disable(child_counter);
+ cpuctx->active_oncpu--;
+ child_ctx->nr_active--;
+ child_counter->oncpu = -1;
+ }
- cpuctx->active_oncpu--;
- child_ctx->nr_active--;
- }
+ list_del_init(&child_counter->list_entry);
- list_del_init(&child_counter->list_entry);
+ child_ctx->nr_counters--;
- hw_perf_restore(perf_flags);
- curr_rq_unlock_irq_restore(&flags);
+ hw_perf_restore(perf_flags);
+ curr_rq_unlock_irq_restore(&flags);
+ }
parent_counter = child_counter->parent;
/*
commit 8fe91e61cdc407c7556d3cd71cf20141a25bbcea
Author: Ingo Molnar <mingo@elte.hu>
Date: Tue Dec 23 12:29:25 2008 +0100
perfcounters: remove ->nr_inherited
Impact: remove dead code
nr_inherited was not maintained correctly (not decremented) - and also
not used - remove it.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 53af11d3767b..1ea08e9f31ce 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -164,7 +164,6 @@ struct perf_counter {
struct task_struct *task;
struct file *filp;
- unsigned int nr_inherited;
struct perf_counter *parent;
/*
* Protect attach/detach:
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 2e73929a6959..48e1dbcdc1cd 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1308,7 +1308,6 @@ inherit_counter(struct perf_counter *parent_counter,
child_ctx->nr_counters++;
child_counter->parent = parent_counter;
- parent_counter->nr_inherited++;
/*
* inherit into child's child as well:
*/
commit 0dff86aa7b9ec65a6d07167b7afb050b5fc98ddc
Author: Ingo Molnar <mingo@elte.hu>
Date: Tue Dec 23 12:28:12 2008 +0100
x86, perfcounters: print out the ->used bitmask
Impact: extend debug printouts
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 74090a393a7c..f3359c2b3910 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -255,6 +255,7 @@ static int pmc_generic_enable(struct perf_counter *counter)
idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
if (idx == nr_counters_generic)
return -EAGAIN;
+
set_bit(idx, cpuc->used);
hwc->idx = idx;
}
@@ -274,6 +275,7 @@ static int pmc_generic_enable(struct perf_counter *counter)
void perf_counter_print_debug(void)
{
u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left;
+ struct cpu_hw_counters *cpuc;
int cpu, idx;
if (!nr_counters_generic)
@@ -282,6 +284,7 @@ void perf_counter_print_debug(void)
local_irq_disable();
cpu = smp_processor_id();
+ cpuc = &per_cpu(cpu_hw_counters, cpu);
rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
@@ -291,6 +294,7 @@ void perf_counter_print_debug(void)
printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl);
printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status);
printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow);
+ printk(KERN_INFO "CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used);
for (idx = 0; idx < nr_counters_generic; idx++) {
rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
commit 95cdd2e7851cce79ab839cb0b3cbe68d7911d0f1
Author: Ingo Molnar <mingo@elte.hu>
Date: Sun Dec 21 13:50:42 2008 +0100
perfcounters: enable lowlevel pmc code to schedule counters
Allow lowlevel ->enable() op to return an error if a counter can not be
added. This can be used to handle counter constraints.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
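The hunks below are the real change; to show why an error-returning ->enable() matters for scheduling, here is a small self-contained C sketch of the "all or nothing" group behaviour that group_sched_in() gains in this series. try_enable(), disable() and the slot counting are hypothetical stand-ins for the hardware ops, not kernel APIs.

#include <errno.h>
#include <stdio.h>

/* Hypothetical stand-in for ->enable(): succeeds while PMC slots remain. */
static int try_enable(int id, int free_slots)
{
        return id < free_slots ? 0 : -EAGAIN;
}

static void disable(int id)
{
        printf("  undo counter %d\n", id);
}

/* All-or-nothing group scheduling: undo the partial group on failure. */
static int group_sched_in(int nr_counters, int free_slots)
{
        int i;

        for (i = 0; i < nr_counters; i++) {
                if (try_enable(i, free_slots)) {
                        while (--i >= 0)
                                disable(i);
                        return -EAGAIN;
                }
        }
        return 0;
}

int main(void)
{
        printf("group of 3, 2 slots -> %d\n", group_sched_in(3, 2)); /* -EAGAIN */
        printf("group of 2, 2 slots -> %d\n", group_sched_in(2, 2)); /* 0 */
        return 0;
}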
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index b67557121425..74090a393a7c 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -244,7 +244,7 @@ static int fixed_mode_idx(struct hw_perf_counter *hwc)
/*
* Find a PMC slot for the freshly enabled / scheduled in counter:
*/
-static void pmc_generic_enable(struct perf_counter *counter)
+static int pmc_generic_enable(struct perf_counter *counter)
{
struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
struct hw_perf_counter *hwc = &counter->hw;
@@ -253,6 +253,8 @@ static void pmc_generic_enable(struct perf_counter *counter)
/* Try to get the previous counter again */
if (test_and_set_bit(idx, cpuc->used)) {
idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
+ if (idx == nr_counters_generic)
+ return -EAGAIN;
set_bit(idx, cpuc->used);
hwc->idx = idx;
}
@@ -265,6 +267,8 @@ static void pmc_generic_enable(struct perf_counter *counter)
__hw_perf_counter_set_period(counter, hwc, idx);
__pmc_generic_enable(counter, hwc, idx);
+
+ return 0;
}
void perf_counter_print_debug(void)
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 48f76d2e54c2..53af11d3767b 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -128,7 +128,7 @@ struct perf_counter;
* struct hw_perf_counter_ops - performance counter hw ops
*/
struct hw_perf_counter_ops {
- void (*enable) (struct perf_counter *counter);
+ int (*enable) (struct perf_counter *counter);
void (*disable) (struct perf_counter *counter);
void (*read) (struct perf_counter *counter);
};
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f1110ac1267b..2e73929a6959 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -355,21 +355,25 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu)
cpuctx->task_ctx = NULL;
}
-static void
+static int
counter_sched_in(struct perf_counter *counter,
struct perf_cpu_context *cpuctx,
struct perf_counter_context *ctx,
int cpu)
{
if (counter->state == PERF_COUNTER_STATE_OFF)
- return;
+ return 0;
+
+ if (counter->hw_ops->enable(counter))
+ return -EAGAIN;
- counter->hw_ops->enable(counter);
counter->state = PERF_COUNTER_STATE_ACTIVE;
counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
cpuctx->active_oncpu++;
ctx->nr_active++;
+
+ return 0;
}
static int
@@ -378,20 +382,38 @@ group_sched_in(struct perf_counter *group_counter,
struct perf_counter_context *ctx,
int cpu)
{
- struct perf_counter *counter;
- int was_group = 0;
+ struct perf_counter *counter, *partial_group;
+ int ret = 0;
- counter_sched_in(group_counter, cpuctx, ctx, cpu);
+ if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
+ return -EAGAIN;
/*
* Schedule in siblings as one group (if any):
*/
list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
- counter_sched_in(counter, cpuctx, ctx, cpu);
- was_group = 1;
+ if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
+ partial_group = counter;
+ goto group_error;
+ }
+ ret = -EAGAIN;
}
- return was_group;
+ return ret;
+
+group_error:
+ /*
+ * Groups can be scheduled in as one unit only, so undo any
+ * partial group before returning:
+ */
+ list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
+ if (counter == partial_group)
+ break;
+ counter_sched_out(counter, cpuctx, ctx);
+ }
+ counter_sched_out(group_counter, cpuctx, ctx);
+
+ return -EAGAIN;
}
/*
@@ -416,9 +438,6 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu)
spin_lock(&ctx->lock);
list_for_each_entry(counter, &ctx->counter_list, list_entry) {
- if (ctx->nr_active == cpuctx->max_pertask)
- break;
-
/*
* Listen to the 'cpu' scheduling filter constraint
* of counters:
@@ -856,8 +875,9 @@ static const struct file_operations perf_fops = {
.poll = perf_poll,
};
-static void cpu_clock_perf_counter_enable(struct perf_counter *counter)
+static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
{
+ return 0;
}
static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
@@ -913,11 +933,13 @@ static void task_clock_perf_counter_read(struct perf_counter *counter)
task_clock_perf_counter_update(counter, now);
}
-static void task_clock_perf_counter_enable(struct perf_counter *counter)
+static int task_clock_perf_counter_enable(struct perf_counter *counter)
{
u64 now = task_clock_perf_counter_val(counter, 0);
atomic64_set(&counter->hw.prev_count, now);
+
+ return 0;
}
static void task_clock_perf_counter_disable(struct perf_counter *counter)
@@ -960,12 +982,14 @@ static void page_faults_perf_counter_read(struct perf_counter *counter)
page_faults_perf_counter_update(counter);
}
-static void page_faults_perf_counter_enable(struct perf_counter *counter)
+static int page_faults_perf_counter_enable(struct perf_counter *counter)
{
/*
* page-faults is a per-task value already,
* so we dont have to clear it on switch-in.
*/
+
+ return 0;
}
static void page_faults_perf_counter_disable(struct perf_counter *counter)
@@ -1006,12 +1030,14 @@ static void context_switches_perf_counter_read(struct perf_counter *counter)
context_switches_perf_counter_update(counter);
}
-static void context_switches_perf_counter_enable(struct perf_counter *counter)
+static int context_switches_perf_counter_enable(struct perf_counter *counter)
{
/*
* ->nvcsw + curr->nivcsw is a per-task value already,
* so we dont have to clear it on switch-in.
*/
+
+ return 0;
}
static void context_switches_perf_counter_disable(struct perf_counter *counter)
@@ -1050,12 +1076,14 @@ static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
cpu_migrations_perf_counter_update(counter);
}
-static void cpu_migrations_perf_counter_enable(struct perf_counter *counter)
+static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
{
/*
* se.nr_migrations is a per-task value already,
* so we dont have to clear it on switch-in.
*/
+
+ return 0;
}
static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
commit 78b6084c907cea15bb40a564b974e072f5163781
Author: Ingo Molnar <mingo@elte.hu>
Date: Sun Dec 21 15:07:49 2008 +0100
perfcounters: fix init context lock
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 54fa2fa2c8e4..467cff545c30 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -116,7 +116,9 @@ extern struct group_info init_groups;
#ifdef CONFIG_PERF_COUNTERS
# define INIT_PERF_COUNTERS(tsk) \
.perf_counter_ctx.counter_list = \
- LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list),
+ LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), \
+ .perf_counter_ctx.lock = \
+ __SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock),
#else
# define INIT_PERF_COUNTERS(tsk)
#endif
commit eef6cbf5844c620d9db9be99e4908cdf92492fb9
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri Dec 19 10:20:42 2008 +0100
perfcounters: pull inherited counters
Change counter inheritance from a 'push' to a 'pull' model: instead of
child tasks pushing their final counts to the parent, reuse the wait4
infrastructure to pull counters as child tasks are exit-processed,
much like how cutime/cstime is collected.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
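As a conceptual illustration of the pull model described above (the parent folds the child's final count into its own counter while reaping the child, rather than the child pushing at exit time), here is a minimal userspace sketch under that assumption; struct counter and pull_child_count() are illustrative names, not the kernel's, and the real code does this with atomic64 operations on inherited counters.

#include <stdio.h>

struct counter {
        long long count;
};

/* Parent side of the reap path: fold the child's final count into ours. */
static void pull_child_count(struct counter *parent, const struct counter *child)
{
        parent->count += child->count;
}

int main(void)
{
        struct counter parent = { .count = 100 };
        struct counter child  = { .count =  42 };

        /* by reap time the child has unscheduled, so its count is final */
        pull_child_count(&parent, &child);
        printf("%lld\n", parent.count);         /* prints 142 */
        return 0;
}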
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 23fd8909b9e5..54fa2fa2c8e4 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -113,6 +113,14 @@ extern struct group_info init_groups;
# define CAP_INIT_BSET CAP_INIT_EFF_SET
#endif
+#ifdef CONFIG_PERF_COUNTERS
+# define INIT_PERF_COUNTERS(tsk) \
+ .perf_counter_ctx.counter_list = \
+ LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list),
+#else
+# define INIT_PERF_COUNTERS(tsk)
+#endif
+
/*
* INIT_TASK is used to set up the first task table, touch at
* your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -180,6 +188,7 @@ extern struct group_info init_groups;
INIT_IDS \
INIT_TRACE_IRQFLAGS \
INIT_LOCKDEP \
+ INIT_PERF_COUNTERS(tsk) \
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 244edfd96865..101b7eeff44c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -153,6 +153,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
+#ifdef CONFIG_PERF_COUNTERS
+ WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list));
+#endif
trace_sched_process_free(tsk);
put_task_struct(tsk);
}
@@ -922,12 +925,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
forget_original_parent(tsk);
exit_task_namespaces(tsk);
- /*
- * Flush inherited counters to the parent - before the parent
- * gets woken up by child-exit notifications.
- */
- perf_counter_exit_task(tsk);
-
write_lock_irq(&tasklist_lock);
if (group_dead)
kill_orphaned_pgrp(tsk->group_leader, NULL);
@@ -1122,12 +1119,6 @@ NORET_TYPE void do_exit(long code)
if (tsk->splice_pipe)
__free_pipe_info(tsk->splice_pipe);
- /*
- * These must happen late, after the PID is not
- * hashed anymore, but still at a point that may sleep:
- */
- perf_counter_exit_task(tsk);
-
preempt_disable();
/* causes final put_task_struct in finish_task_switch(). */
tsk->state = TASK_DEAD;
@@ -1371,6 +1362,12 @@ static int wait_task_zombie(struct task_struct *p, int options,
*/
read_unlock(&tasklist_lock);
+ /*
+ * Flush inherited counters to the parent - before the parent
+ * gets woken up by child-exit notifications.
+ */
+ perf_counter_exit_task(p);
+
retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
status = (p->signal->flags & SIGNAL_GROUP_EXIT)
? p->signal->group_exit_code : p->exit_code;
commit aa9c4c0f967fdb482ea95e8473ec3d201e6e0781
Author: Ingo Molnar <mingo@elte.hu>
Date: Wed Dec 17 14:10:57 2008 +0100
perfcounters: fix task clock counter
Impact: fix per task clock counter precision
Signed-off-by: Ingo Molnar <mingo@elte.hu>
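Before the diff, a small standalone sketch of the precision fix this commit makes: the per-task clock is read as the already-banked runtime plus the delta of the current slice that has not been accounted yet (what __task_delta_exec() returns under the runqueue lock). struct fake_task and the literal values below are illustrative only.

#include <stdint.h>
#include <stdio.h>

struct fake_task {
        uint64_t sum_exec_runtime;      /* ns already banked by the scheduler */
        uint64_t exec_start;            /* rq clock when the current slice started */
};

/* What the patch computes: banked runtime + not-yet-banked delta. */
static uint64_t task_clock_val(const struct fake_task *t, uint64_t rq_clock)
{
        uint64_t delta = rq_clock - t->exec_start;      /* __task_delta_exec() */

        return t->sum_exec_runtime + delta;
}

int main(void)
{
        struct fake_task t = { .sum_exec_runtime = 1000000, .exec_start = 5000 };

        /* 7000 - 5000 = 2000 ns not banked yet -> clock reads 1002000 ns */
        printf("%llu\n", (unsigned long long)task_clock_val(&t, 7000));
        return 0;
}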
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 4a145caeee07..1b2e3242497c 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -66,7 +66,15 @@ static inline unsigned int kstat_irqs(unsigned int irq)
return sum;
}
+
+/*
+ * Lock/unlock the current runqueue - to extract task statistics:
+ */
+extern void curr_rq_lock_irq_save(unsigned long *flags);
+extern void curr_rq_unlock_irq_restore(unsigned long *flags);
+extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update);
extern unsigned long long task_delta_exec(struct task_struct *);
+
extern void account_user_time(struct task_struct *, cputime_t);
extern void account_user_time_scaled(struct task_struct *, cputime_t);
extern void account_system_time(struct task_struct *, int, cputime_t);
diff --git a/kernel/exit.c b/kernel/exit.c
index d336c90a5f13..244edfd96865 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -922,6 +922,12 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
forget_original_parent(tsk);
exit_task_namespaces(tsk);
+ /*
+ * Flush inherited counters to the parent - before the parent
+ * gets woken up by child-exit notifications.
+ */
+ perf_counter_exit_task(tsk);
+
write_lock_irq(&tasklist_lock);
if (group_dead)
kill_orphaned_pgrp(tsk->group_leader, NULL);
@@ -1093,11 +1099,6 @@ NORET_TYPE void do_exit(long code)
mpol_put(tsk->mempolicy);
tsk->mempolicy = NULL;
#endif
- /*
- * These must happen late, after the PID is not
- * hashed anymore, but still at a point that may sleep:
- */
- perf_counter_exit_task(tsk);
#ifdef CONFIG_FUTEX
if (unlikely(!list_empty(&tsk->pi_state_list)))
exit_pi_state_list(tsk);
@@ -1121,6 +1122,12 @@ NORET_TYPE void do_exit(long code)
if (tsk->splice_pipe)
__free_pipe_info(tsk->splice_pipe);
+ /*
+ * These must happen late, after the PID is not
+ * hashed anymore, but still at a point that may sleep:
+ */
+ perf_counter_exit_task(tsk);
+
preempt_disable();
/* causes final put_task_struct in finish_task_switch(). */
tsk->state = TASK_DEAD;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 961d651aa574..f1110ac1267b 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -18,6 +18,7 @@
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
+#include <linux/kernel_stat.h>
#include <linux/perf_counter.h>
/*
@@ -106,7 +107,8 @@ static void __perf_counter_remove_from_context(void *info)
if (ctx->task && cpuctx->task_ctx != ctx)
return;
- spin_lock_irqsave(&ctx->lock, flags);
+ curr_rq_lock_irq_save(&flags);
+ spin_lock(&ctx->lock);
if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
counter->hw_ops->disable(counter);
@@ -135,7 +137,8 @@ static void __perf_counter_remove_from_context(void *info)
perf_max_counters - perf_reserved_percpu);
}
- spin_unlock_irqrestore(&ctx->lock, flags);
+ spin_unlock(&ctx->lock);
+ curr_rq_unlock_irq_restore(&flags);
}
@@ -209,7 +212,8 @@ static void __perf_install_in_context(void *info)
if (ctx->task && cpuctx->task_ctx != ctx)
return;
- spin_lock_irqsave(&ctx->lock, flags);
+ curr_rq_lock_irq_save(&flags);
+ spin_lock(&ctx->lock);
/*
* Protect the list operation against NMI by disabling the
@@ -232,7 +236,8 @@ static void __perf_install_in_context(void *info)
if (!ctx->task && cpuctx->max_pertask)
cpuctx->max_pertask--;
- spin_unlock_irqrestore(&ctx->lock, flags);
+ spin_unlock(&ctx->lock);
+ curr_rq_unlock_irq_restore(&flags);
}
/*
@@ -438,15 +443,19 @@ int perf_counter_task_disable(void)
struct task_struct *curr = current;
struct perf_counter_context *ctx = &curr->perf_counter_ctx;
struct perf_counter *counter;
+ unsigned long flags;
u64 perf_flags;
int cpu;
if (likely(!ctx->nr_counters))
return 0;
- local_irq_disable();
+ curr_rq_lock_irq_save(&flags);
cpu = smp_processor_id();
+ /* force the update of the task clock: */
+ __task_delta_exec(curr, 1);
+
perf_counter_task_sched_out(curr, cpu);
spin_lock(&ctx->lock);
@@ -463,7 +472,7 @@ int perf_counter_task_disable(void)
spin_unlock(&ctx->lock);
- local_irq_enable();
+ curr_rq_unlock_irq_restore(&flags);
return 0;
}
@@ -473,15 +482,19 @@ int perf_counter_task_enable(void)
struct task_struct *curr = current;
struct perf_counter_context *ctx = &curr->perf_counter_ctx;
struct perf_counter *counter;
+ unsigned long flags;
u64 perf_flags;
int cpu;
if (likely(!ctx->nr_counters))
return 0;
- local_irq_disable();
+ curr_rq_lock_irq_save(&flags);
cpu = smp_processor_id();
+ /* force the update of the task clock: */
+ __task_delta_exec(curr, 1);
+
spin_lock(&ctx->lock);
/*
@@ -493,6 +506,7 @@ int perf_counter_task_enable(void)
if (counter->state != PERF_COUNTER_STATE_OFF)
continue;
counter->state = PERF_COUNTER_STATE_INACTIVE;
+ counter->hw_event.disabled = 0;
}
hw_perf_restore(perf_flags);
@@ -500,7 +514,7 @@ int perf_counter_task_enable(void)
perf_counter_task_sched_in(curr, cpu);
- local_irq_enable();
+ curr_rq_unlock_irq_restore(&flags);
return 0;
}
@@ -540,8 +554,11 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
static void __read(void *info)
{
struct perf_counter *counter = info;
+ unsigned long flags;
+ curr_rq_lock_irq_save(&flags);
counter->hw_ops->read(counter);
+ curr_rq_unlock_irq_restore(&flags);
}
static u64 perf_counter_read(struct perf_counter *counter)
@@ -860,13 +877,27 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
.read = cpu_clock_perf_counter_read,
};
-static void task_clock_perf_counter_update(struct perf_counter *counter)
+/*
+ * Called from within the scheduler:
+ */
+static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
{
- u64 prev, now;
+ struct task_struct *curr = counter->task;
+ u64 delta;
+
+ WARN_ON_ONCE(counter->task != current);
+
+ delta = __task_delta_exec(curr, update);
+
+ return curr->se.sum_exec_runtime + delta;
+}
+
+static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
+{
+ u64 prev;
s64 delta;
prev = atomic64_read(&counter->hw.prev_count);
- now = current->se.sum_exec_runtime;
atomic64_set(&counter->hw.prev_count, now);
@@ -877,17 +908,23 @@ static void task_clock_perf_counter_update(struct perf_counter *counter)
static void task_clock_perf_counter_read(struct perf_counter *counter)
{
- task_clock_perf_counter_update(counter);
+ u64 now = task_clock_perf_counter_val(counter, 1);
+
+ task_clock_perf_counter_update(counter, now);
}
static void task_clock_perf_counter_enable(struct perf_counter *counter)
{
- atomic64_set(&counter->hw.prev_count, current->se.sum_exec_runtime);
+ u64 now = task_clock_perf_counter_val(counter, 0);
+
+ atomic64_set(&counter->hw.prev_count, now);
}
static void task_clock_perf_counter_disable(struct perf_counter *counter)
{
- task_clock_perf_counter_update(counter);
+ u64 now = task_clock_perf_counter_val(counter, 0);
+
+ task_clock_perf_counter_update(counter, now);
}
static const struct hw_perf_counter_ops perf_ops_task_clock = {
@@ -1267,6 +1304,7 @@ __perf_counter_exit_task(struct task_struct *child,
{
struct perf_counter *parent_counter;
u64 parent_val, child_val;
+ unsigned long flags;
u64 perf_flags;
/*
@@ -1275,7 +1313,7 @@ __perf_counter_exit_task(struct task_struct *child,
* Be careful about zapping the list - IRQ/NMI context
* could still be processing it:
*/
- local_irq_disable();
+ curr_rq_lock_irq_save(&flags);
perf_flags = hw_perf_save_disable();
if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) {
@@ -1294,7 +1332,7 @@ __perf_counter_exit_task(struct task_struct *child,
list_del_init(&child_counter->list_entry);
hw_perf_restore(perf_flags);
- local_irq_enable();
+ curr_rq_unlock_irq_restore(&flags);
parent_counter = child_counter->parent;
/*
diff --git a/kernel/sched.c b/kernel/sched.c
index 382cfdb5e38d..4d84ff4c8774 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -638,7 +638,7 @@ static inline int cpu_of(struct rq *rq)
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
-static inline void update_rq_clock(struct rq *rq)
+inline void update_rq_clock(struct rq *rq)
{
rq->clock = sched_clock_cpu(cpu_of(rq));
}
@@ -969,6 +969,26 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
}
}
+void curr_rq_lock_irq_save(unsigned long *flags)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ local_irq_save(*flags);
+ rq = cpu_rq(smp_processor_id());
+ spin_lock(&rq->lock);
+}
+
+void curr_rq_unlock_irq_restore(unsigned long *flags)
+ __releases(rq->lock)
+{
+ struct rq *rq;
+
+ rq = cpu_rq(smp_processor_id());
+ spin_unlock(&rq->lock);
+ local_irq_restore(*flags);
+}
+
void task_rq_unlock_wait(struct task_struct *p)
{
struct rq *rq = task_rq(p);
@@ -2558,7 +2578,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
fire_sched_out_preempt_notifiers(prev, next);
- perf_counter_task_sched_out(prev, cpu_of(rq));
prepare_lock_switch(rq, next);
prepare_arch_switch(next);
}
@@ -4089,6 +4108,29 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
EXPORT_PER_CPU_SYMBOL(kstat);
+/*
+ * Return any ns on the sched_clock that have not yet been banked in
+ * @p in case that task is currently running.
+ */
+unsigned long long __task_delta_exec(struct task_struct *p, int update)
+{
+ s64 delta_exec;
+ struct rq *rq;
+
+ rq = task_rq(p);
+ WARN_ON_ONCE(!runqueue_is_locked());
+ WARN_ON_ONCE(!task_current(rq, p));
+
+ if (update)
+ update_rq_clock(rq);
+
+ delta_exec = rq->clock - p->se.exec_start;
+
+ WARN_ON_ONCE(delta_exec < 0);
+
+ return delta_exec;
+}
+
/*
* Return any ns on the sched_clock that have not yet been banked in
* @p in case that task is currently running.
@@ -4316,13 +4358,13 @@ void scheduler_tick(void)
update_rq_clock(rq);
update_cpu_load(rq);
curr->sched_class->task_tick(rq, curr, 0);
+ perf_counter_task_tick(curr, cpu);
spin_unlock(&rq->lock);
#ifdef CONFIG_SMP
rq->idle_at_tick = idle_cpu(cpu);
trigger_load_balance(rq, cpu);
#endif
- perf_counter_task_tick(curr, cpu);
}
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
@@ -4512,6 +4554,7 @@ asmlinkage void __sched schedule(void)
if (likely(prev != next)) {
sched_info_switch(prev, next);
+ perf_counter_task_sched_out(prev, cpu);
rq->nr_switches++;
rq->curr = next;
commit 7671581f1666ef4b54a1c1e598c51ac44c060a9b
Author: Ingo Molnar <mingo@elte.hu>
Date: Wed Dec 17 14:20:28 2008 +0100
perfcounters: hw ops rename
Impact: rename field names
Shorten them.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 358af5266407..b67557121425 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -577,9 +577,9 @@ static void pmc_generic_read(struct perf_counter *counter)
}
static const struct hw_perf_counter_ops x86_perf_counter_ops = {
- .hw_perf_counter_enable = pmc_generic_enable,
- .hw_perf_counter_disable = pmc_generic_disable,
- .hw_perf_counter_read = pmc_generic_read,
+ .enable = pmc_generic_enable,
+ .disable = pmc_generic_disable,
+ .read = pmc_generic_read,
};
const struct hw_perf_counter_ops *
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 984da540224b..48f76d2e54c2 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -128,9 +128,9 @@ struct perf_counter;
* struct hw_perf_counter_ops - performance counter hw ops
*/
struct hw_perf_counter_ops {
- void (*hw_perf_counter_enable) (struct perf_counter *counter);
- void (*hw_perf_counter_disable) (struct perf_counter *counter);
- void (*hw_perf_counter_read) (struct perf_counter *counter);
+ void (*enable) (struct perf_counter *counter);
+ void (*disable) (struct perf_counter *counter);
+ void (*read) (struct perf_counter *counter);
};
/**
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f8a4d9a5d5d3..961d651aa574 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -109,7 +109,7 @@ static void __perf_counter_remove_from_context(void *info)
spin_lock_irqsave(&ctx->lock, flags);
if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
- counter->hw_ops->hw_perf_counter_disable(counter);
+ counter->hw_ops->disable(counter);
counter->state = PERF_COUNTER_STATE_INACTIVE;
ctx->nr_active--;
cpuctx->active_oncpu--;
@@ -226,7 +226,7 @@ static void __perf_install_in_context(void *info)
counter->oncpu = cpu;
ctx->nr_active++;
cpuctx->active_oncpu++;
- counter->hw_ops->hw_perf_counter_enable(counter);
+ counter->hw_ops->enable(counter);
}
if (!ctx->task && cpuctx->max_pertask)
@@ -297,7 +297,7 @@ counter_sched_out(struct perf_counter *counter,
if (counter->state != PERF_COUNTER_STATE_ACTIVE)
return;
- counter->hw_ops->hw_perf_counter_disable(counter);
+ counter->hw_ops->disable(counter);
counter->state = PERF_COUNTER_STATE_INACTIVE;
counter->oncpu = -1;
@@ -327,7 +327,7 @@ group_sched_out(struct perf_counter *group_counter,
*
* We stop each counter and update the counter value in counter->count.
*
- * This does not protect us against NMI, but hw_perf_counter_disable()
+ * This does not protect us against NMI, but disable()
* sets the disabled bit in the control field of counter _before_
* accessing the counter control register. If a NMI hits, then it will
* not restart the counter.
@@ -359,7 +359,7 @@ counter_sched_in(struct perf_counter *counter,
if (counter->state == PERF_COUNTER_STATE_OFF)
return;
- counter->hw_ops->hw_perf_counter_enable(counter);
+ counter->hw_ops->enable(counter);
counter->state = PERF_COUNTER_STATE_ACTIVE;
counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
@@ -395,7 +395,7 @@ group_sched_in(struct perf_counter *group_counter,
*
* We restore the counter value and then enable it.
*
- * This does not protect us against NMI, but hw_perf_counter_enable()
+ * This does not protect us against NMI, but enable()
* sets the enabled bit in the control field of counter _before_
* accessing the counter control register. If a NMI hits, then it will
* keep the counter running.
@@ -537,11 +537,11 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
/*
* Cross CPU call to read the hardware counter
*/
-static void __hw_perf_counter_read(void *info)
+static void __read(void *info)
{
struct perf_counter *counter = info;
- counter->hw_ops->hw_perf_counter_read(counter);
+ counter->hw_ops->read(counter);
}
static u64 perf_counter_read(struct perf_counter *counter)
@@ -552,7 +552,7 @@ static u64 perf_counter_read(struct perf_counter *counter)
*/
if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
smp_call_function_single(counter->oncpu,
- __hw_perf_counter_read, counter, 1);
+ __read, counter, 1);
}
return atomic64_read(&counter->count);
@@ -855,9 +855,9 @@ static void cpu_clock_perf_counter_read(struct perf_counter *counter)
}
static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
- .hw_perf_counter_enable = cpu_clock_perf_counter_enable,
- .hw_perf_counter_disable = cpu_clock_perf_counter_disable,
- .hw_perf_counter_read = cpu_clock_perf_counter_read,
+ .enable = cpu_clock_perf_counter_enable,
+ .disable = cpu_clock_perf_counter_disable,
+ .read = cpu_clock_perf_counter_read,
};
static void task_clock_perf_counter_update(struct perf_counter *counter)
@@ -891,9 +891,9 @@ static void task_clock_perf_counter_disable(struct perf_counter *counter)
}
static const struct hw_perf_counter_ops perf_ops_task_clock = {
- .hw_perf_counter_enable = task_clock_perf_counter_enable,
- .hw_perf_counter_disable = task_clock_perf_counter_disable,
- .hw_perf_counter_read = task_clock_perf_counter_read,
+ .enable = task_clock_perf_counter_enable,
+ .disable = task_clock_perf_counter_disable,
+ .read = task_clock_perf_counter_read,
};
static u64 get_page_faults(void)
@@ -937,9 +937,9 @@ static void page_faults_perf_counter_disable(struct perf_counter *counter)
}
static const struct hw_perf_counter_ops perf_ops_page_faults = {
- .hw_perf_counter_enable = page_faults_perf_counter_enable,
- .hw_perf_counter_disable = page_faults_perf_counter_disable,
- .hw_perf_counter_read = page_faults_perf_counter_read,
+ .enable = page_faults_perf_counter_enable,
+ .disable = page_faults_perf_counter_disable,
+ .read = page_faults_perf_counter_read,
};
static u64 get_context_switches(void)
@@ -983,9 +983,9 @@ static void context_switches_perf_counter_disable(struct perf_counter *counter)
}
static const struct hw_perf_counter_ops perf_ops_context_switches = {
- .hw_perf_counter_enable = context_switches_perf_counter_enable,
- .hw_perf_counter_disable = context_switches_perf_counter_disable,
- .hw_perf_counter_read = context_switches_perf_counter_read,
+ .enable = context_switches_perf_counter_enable,
+ .disable = context_switches_perf_counter_disable,
+ .read = context_switches_perf_counter_read,
};
static inline u64 get_cpu_migrations(void)
@@ -1027,9 +1027,9 @@ static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
}
static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
- .hw_perf_counter_enable = cpu_migrations_perf_counter_enable,
- .hw_perf_counter_disable = cpu_migrations_perf_counter_disable,
- .hw_perf_counter_read = cpu_migrations_perf_counter_read,
+ .enable = cpu_migrations_perf_counter_enable,
+ .disable = cpu_migrations_perf_counter_disable,
+ .read = cpu_migrations_perf_counter_read,
};
static const struct hw_perf_counter_ops *
@@ -1283,7 +1283,7 @@ __perf_counter_exit_task(struct task_struct *child,
cpuctx = &__get_cpu_var(perf_cpu_context);
- child_counter->hw_ops->hw_perf_counter_disable(child_counter);
+ child_counter->hw_ops->disable(child_counter);
child_counter->state = PERF_COUNTER_STATE_INACTIVE;
child_counter->oncpu = -1;
commit 862a1a5f346fe7e9181ea51eaae48cf2cd70f746
Author: Ingo Molnar <mingo@elte.hu>
Date: Wed Dec 17 13:09:20 2008 +0100
x86, perfcounters: refactor code for fixed-function PMCs
Impact: clean up
Signed-off-by: Ingo Molnar <mingo@elte.hu>
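A tiny sketch of the index-space layout this refactoring introduces: generic PMCs occupy bits 0..N-1 of the global mask and fixed-function PMCs start at bit X86_PMC_IDX_FIXED (32), which is how the later hunk extends perf_counter_mask. The counter counts below are example values, not probed from hardware.

#include <stdint.h>
#include <stdio.h>

#define X86_PMC_IDX_FIXED       32

int main(void)
{
        int nr_counters_generic = 2;    /* example: 2 generic PMCs */
        int nr_counters_fixed   = 3;    /* example: 3 fixed-function PMCs */
        uint64_t mask;

        mask  = (1ULL << nr_counters_generic) - 1;
        mask |= ((1ULL << nr_counters_fixed) - 1) << X86_PMC_IDX_FIXED;

        /* prints: counter mask: 0000000700000003 */
        printf("counter mask: %016llx\n", (unsigned long long)mask);
        return 0;
}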
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index 945a315e6d62..13745deb16c8 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -8,6 +8,10 @@
#define X86_PMC_MAX_GENERIC 8
#define X86_PMC_MAX_FIXED 3
+#define X86_PMC_IDX_GENERIC 0
+#define X86_PMC_IDX_FIXED 32
+#define X86_PMC_IDX_MAX 64
+
#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
@@ -54,6 +58,15 @@ union cpuid10_edx {
* Fixed-purpose performance counters:
*/
+/*
+ * All 3 fixed-mode PMCs are configured via this single MSR:
+ */
+#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d
+
+/*
+ * The counts are available in three separate MSRs:
+ */
+
/* Instr_Retired.Any: */
#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309
@@ -63,7 +76,6 @@ union cpuid10_edx {
/* CPU_CLK_Unhalted.Ref: */
#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
-
#ifdef CONFIG_PERF_COUNTERS
extern void init_hw_perf_counters(void);
extern void perf_counters_lapic_init(int nmi);
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 2fca50c45979..358af5266407 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -24,17 +24,14 @@ static bool perf_counters_initialized __read_mostly;
/*
* Number of (generic) HW counters:
*/
-static int nr_hw_counters __read_mostly;
-static u32 perf_counter_mask __read_mostly;
+static int nr_counters_generic __read_mostly;
+static u64 perf_counter_mask __read_mostly;
-static int nr_hw_counters_fixed __read_mostly;
+static int nr_counters_fixed __read_mostly;
struct cpu_hw_counters {
- struct perf_counter *generic[X86_PMC_MAX_GENERIC];
- unsigned long used[BITS_TO_LONGS(X86_PMC_MAX_GENERIC)];
-
- struct perf_counter *fixed[X86_PMC_MAX_FIXED];
- unsigned long used_fixed[BITS_TO_LONGS(X86_PMC_MAX_FIXED)];
+ struct perf_counter *counters[X86_PMC_IDX_MAX];
+ unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
};
/*
@@ -159,7 +156,7 @@ void hw_perf_enable_all(void)
if (unlikely(!perf_counters_initialized))
return;
- wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask);
}
u64 hw_perf_save_disable(void)
@@ -170,7 +167,7 @@ u64 hw_perf_save_disable(void)
return 0;
rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
- wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
return ctrl;
}
@@ -181,7 +178,7 @@ void hw_perf_restore(u64 ctrl)
if (unlikely(!perf_counters_initialized))
return;
- wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0);
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
}
EXPORT_SYMBOL_GPL(hw_perf_restore);
@@ -239,6 +236,11 @@ __pmc_generic_enable(struct perf_counter *counter,
hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
}
+static int fixed_mode_idx(struct hw_perf_counter *hwc)
+{
+ return -1;
+}
+
/*
* Find a PMC slot for the freshly enabled / scheduled in counter:
*/
@@ -250,7 +252,7 @@ static void pmc_generic_enable(struct perf_counter *counter)
/* Try to get the previous counter again */
if (test_and_set_bit(idx, cpuc->used)) {
- idx = find_first_zero_bit(cpuc->used, nr_hw_counters);
+ idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
set_bit(idx, cpuc->used);
hwc->idx = idx;
}
@@ -259,7 +261,7 @@ static void pmc_generic_enable(struct perf_counter *counter)
__pmc_generic_disable(counter, hwc, idx);
- cpuc->generic[idx] = counter;
+ cpuc->counters[idx] = counter;
__hw_perf_counter_set_period(counter, hwc, idx);
__pmc_generic_enable(counter, hwc, idx);
@@ -270,7 +272,7 @@ void perf_counter_print_debug(void)
u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left;
int cpu, idx;
- if (!nr_hw_counters)
+ if (!nr_counters_generic)
return;
local_irq_disable();
@@ -286,7 +288,7 @@ void perf_counter_print_debug(void)
printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status);
printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow);
- for (idx = 0; idx < nr_hw_counters; idx++) {
+ for (idx = 0; idx < nr_counters_generic; idx++) {
rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
rdmsrl(MSR_ARCH_PERFMON_PERFCTR0 + idx, pmc_count);
@@ -311,7 +313,7 @@ static void pmc_generic_disable(struct perf_counter *counter)
__pmc_generic_disable(counter, hwc, idx);
clear_bit(idx, cpuc->used);
- cpuc->generic[idx] = NULL;
+ cpuc->counters[idx] = NULL;
/*
* Drain the remaining delta count out of a counter
@@ -381,7 +383,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global);
/* Disable counters globally */
- wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
ack_APIC_irq();
cpuc = &per_cpu(cpu_hw_counters, cpu);
@@ -392,8 +394,8 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
again:
ack = status;
- for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) {
- struct perf_counter *counter = cpuc->generic[bit];
+ for_each_bit(bit, (unsigned long *) &status, nr_counters_generic) {
+ struct perf_counter *counter = cpuc->counters[bit];
clear_bit(bit, (unsigned long *) &status);
if (!counter)
@@ -424,7 +426,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
}
}
- wrmsr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack, 0);
+ wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
/*
* Repeat if there is more work to be done:
@@ -436,7 +438,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
/*
* Restore - do not reenable when global enable is off:
*/
- wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, saved_global, 0);
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global);
}
void smp_perf_counter_interrupt(struct pt_regs *regs)
@@ -462,8 +464,8 @@ void perf_counter_notify(struct pt_regs *regs)
cpu = smp_processor_id();
cpuc = &per_cpu(cpu_hw_counters, cpu);
- for_each_bit(bit, cpuc->used, nr_hw_counters) {
- struct perf_counter *counter = cpuc->generic[bit];
+ for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) {
+ struct perf_counter *counter = cpuc->counters[bit];
if (!counter)
continue;
@@ -540,26 +542,29 @@ void __init init_hw_perf_counters(void)
printk(KERN_INFO "... version: %d\n", eax.split.version_id);
printk(KERN_INFO "... num counters: %d\n", eax.split.num_counters);
- nr_hw_counters = eax.split.num_counters;
- if (nr_hw_counters > X86_PMC_MAX_GENERIC) {
- nr_hw_counters = X86_PMC_MAX_GENERIC;
+ nr_counters_generic = eax.split.num_counters;
+ if (nr_counters_generic > X86_PMC_MAX_GENERIC) {
+ nr_counters_generic = X86_PMC_MAX_GENERIC;
WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
- nr_hw_counters, X86_PMC_MAX_GENERIC);
+ nr_counters_generic, X86_PMC_MAX_GENERIC);
}
- perf_counter_mask = (1 << nr_hw_counters) - 1;
- perf_max_counters = nr_hw_counters;
+ perf_counter_mask = (1 << nr_counters_generic) - 1;
+ perf_max_counters = nr_counters_generic;
printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width);
printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length);
- nr_hw_counters_fixed = edx.split.num_counters_fixed;
- if (nr_hw_counters_fixed > X86_PMC_MAX_FIXED) {
- nr_hw_counters_fixed = X86_PMC_MAX_FIXED;
+ nr_counters_fixed = edx.split.num_counters_fixed;
+ if (nr_counters_fixed > X86_PMC_MAX_FIXED) {
+ nr_counters_fixed = X86_PMC_MAX_FIXED;
WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
- nr_hw_counters_fixed, X86_PMC_MAX_FIXED);
+ nr_counters_fixed, X86_PMC_MAX_FIXED);
}
- printk(KERN_INFO "... fixed counters: %d\n", nr_hw_counters_fixed);
+ printk(KERN_INFO "... fixed counters: %d\n", nr_counters_fixed);
+
+ perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED;
+ printk(KERN_INFO "... counter mask: %016Lx\n", perf_counter_mask);
perf_counters_initialized = true;
perf_counters_lapic_init(0);
commit 703e937c83bbad79075a7846e062e447c2fee6a4
Author: Ingo Molnar <mingo@elte.hu>
Date: Wed Dec 17 10:51:15 2008 +0100
perfcounters: add fixed-mode PMC enumeration
Enumerate fixed-mode PMCs based on CPUID, and feed that into the
perfcounter code.
Does not use fixed-mode PMCs yet.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
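The patch parses CPUID leaf 0xA in the kernel via union cpuid10_eax/cpuid10_edx; as a hedged userspace illustration of the same enumeration (assuming a GCC/Clang toolchain that provides <cpuid.h> and __get_cpuid()), the sketch below reads the identical bit fields.

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(0x0a, &eax, &ebx, &ecx, &edx)) {
                printf("CPUID leaf 0xA not supported\n");
                return 1;
        }

        printf("version id:       %u\n", eax & 0xff);           /* eax.split.version_id */
        printf("num generic PMCs: %u\n", (eax >> 8) & 0xff);    /* eax.split.num_counters */
        printf("num fixed PMCs:   %u\n", edx & 0x0f);           /* edx.split.num_counters_fixed */
        return 0;
}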
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index dd5a4a559e2d..945a315e6d62 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -41,6 +41,29 @@ union cpuid10_eax {
unsigned int full;
};
+union cpuid10_edx {
+ struct {
+ unsigned int num_counters_fixed:4;
+ unsigned int reserved:28;
+ } split;
+ unsigned int full;
+};
+
+
+/*
+ * Fixed-purpose performance counters:
+ */
+
+/* Instr_Retired.Any: */
+#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309
+
+/* CPU_CLK_Unhalted.Core: */
+#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a
+
+/* CPU_CLK_Unhalted.Ref: */
+#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
+
+
#ifdef CONFIG_PERF_COUNTERS
extern void init_hw_perf_counters(void);
extern void perf_counters_lapic_init(int nmi);
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index fc3af8688232..2fca50c45979 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -27,6 +27,8 @@ static bool perf_counters_initialized __read_mostly;
static int nr_hw_counters __read_mostly;
static u32 perf_counter_mask __read_mostly;
+static int nr_hw_counters_fixed __read_mostly;
+
struct cpu_hw_counters {
struct perf_counter *generic[X86_PMC_MAX_GENERIC];
unsigned long used[BITS_TO_LONGS(X86_PMC_MAX_GENERIC)];
@@ -519,8 +521,9 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
void __init init_hw_perf_counters(void)
{
union cpuid10_eax eax;
- unsigned int unused;
unsigned int ebx;
+ unsigned int unused;
+ union cpuid10_edx edx;
if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
return;
@@ -529,14 +532,14 @@ void __init init_hw_perf_counters(void)
* Check whether the Architectural PerfMon supports
* Branch Misses Retired Event or not.
*/
- cpuid(10, &(eax.full), &ebx, &unused, &unused);
+ cpuid(10, &eax.full, &ebx, &unused, &edx.full);
if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
return;
printk(KERN_INFO "Intel Performance Monitoring support detected.\n");
- printk(KERN_INFO "... version: %d\n", eax.split.version_id);
- printk(KERN_INFO "... num_counters: %d\n", eax.split.num_counters);
+ printk(KERN_INFO "... version: %d\n", eax.split.version_id);
+ printk(KERN_INFO "... num counters: %d\n", eax.split.num_counters);
nr_hw_counters = eax.split.num_counters;
if (nr_hw_counters > X86_PMC_MAX_GENERIC) {
nr_hw_counters = X86_PMC_MAX_GENERIC;
@@ -546,8 +549,16 @@ void __init init_hw_perf_counters(void)
perf_counter_mask = (1 << nr_hw_counters) - 1;
perf_max_counters = nr_hw_counters;
- printk(KERN_INFO "... bit_width: %d\n", eax.split.bit_width);
- printk(KERN_INFO "... mask_length: %d\n", eax.split.mask_length);
+ printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width);
+ printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length);
+
+ nr_hw_counters_fixed = edx.split.num_counters_fixed;
+ if (nr_hw_counters_fixed > X86_PMC_MAX_FIXED) {
+ nr_hw_counters_fixed = X86_PMC_MAX_FIXED;
+ WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
+ nr_hw_counters_fixed, X86_PMC_MAX_FIXED);
+ }
+ printk(KERN_INFO "... fixed counters: %d\n", nr_hw_counters_fixed);
perf_counters_initialized = true;