Patches contributed by Eötvös Loránd University
commit f3134de60624829a57741c1f3796847d4de165f6
Merge: e726f5f91eff 361b73d5c34f
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri Dec 12 07:40:08 2008 +0100
Merge branches 'tracing/function-graph-tracer' and 'tracing/ring-buffer' into tracing/core
commit 447557ac7ce120306b4a31d6003faef39cb1bf14
Author: Ingo Molnar <mingo@elte.hu>
Date: Thu Dec 11 20:40:18 2008 +0100
perf counters: update docs
Impact: update docs
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/Documentation/perf-counters.txt b/Documentation/perf-counters.txt
index 19033a0bb526..fddd32189a50 100644
--- a/Documentation/perf-counters.txt
+++ b/Documentation/perf-counters.txt
@@ -10,8 +10,8 @@ trigger interrupts when a threshold number of events have passed - and can
thus be used to profile the code that runs on that CPU.
The Linux Performance Counter subsystem provides an abstraction of these
-hardware capabilities. It provides per task and per CPU counters, and
-it provides event capabilities on top of those.
+hardware capabilities. It provides per task and per CPU counters, counter
+groups, and it provides event capabilities on top of those.
Performance counters are accessed via special file descriptors.
There's one file descriptor per virtual counter used.
@@ -19,12 +19,8 @@ There's one file descriptor per virtual counter used.
The special file descriptor is opened via the perf_counter_open()
system call:
- int
- perf_counter_open(u32 hw_event_type,
- u32 hw_event_period,
- u32 record_type,
- pid_t pid,
- int cpu);
+ int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
+ pid_t pid, int cpu, int group_fd);
The syscall returns the new fd. The fd can be used via the normal
VFS system calls: read() can be used to read the counter, fcntl()
@@ -33,39 +29,78 @@ can be used to set the blocking mode, etc.
Multiple counters can be kept open at a time, and the counters
can be poll()ed.
-When creating a new counter fd, 'hw_event_type' is one of:
-
- enum hw_event_types {
- PERF_COUNT_CYCLES,
- PERF_COUNT_INSTRUCTIONS,
- PERF_COUNT_CACHE_REFERENCES,
- PERF_COUNT_CACHE_MISSES,
- PERF_COUNT_BRANCH_INSTRUCTIONS,
- PERF_COUNT_BRANCH_MISSES,
- };
+When creating a new counter fd, 'perf_counter_hw_event' is:
+
+/*
+ * Hardware event to monitor via a performance monitoring counter:
+ */
+struct perf_counter_hw_event {
+ s64 type;
+
+ u64 irq_period;
+ u32 record_type;
+
+ u32 disabled : 1, /* off by default */
+ nmi : 1, /* NMI sampling */
+ raw : 1, /* raw event type */
+ __reserved_1 : 29;
+
+ u64 __reserved_2;
+};
+
+/*
+ * Generalized performance counter event types, used by the hw_event.type
+ * parameter of the sys_perf_counter_open() syscall:
+ */
+enum hw_event_types {
+ /*
+ * Common hardware events, generalized by the kernel:
+ */
+ PERF_COUNT_CYCLES = 0,
+ PERF_COUNT_INSTRUCTIONS = 1,
+ PERF_COUNT_CACHE_REFERENCES = 2,
+ PERF_COUNT_CACHE_MISSES = 3,
+ PERF_COUNT_BRANCH_INSTRUCTIONS = 4,
+ PERF_COUNT_BRANCH_MISSES = 5,
+
+ /*
+ * Special "software" counters provided by the kernel, even if
+ * the hardware does not support performance counters. These
+ * counters measure various physical and sw events of the
+ * kernel (and allow the profiling of them as well):
+ */
+ PERF_COUNT_CPU_CLOCK = -1,
+ PERF_COUNT_TASK_CLOCK = -2,
+ /*
+ * Future software events:
+ */
+ /* PERF_COUNT_PAGE_FAULTS = -3,
+ PERF_COUNT_CONTEXT_SWITCHES = -4, */
+};
These are standardized types of events that work uniformly on all CPUs
that implements Performance Counters support under Linux. If a CPU is
not able to count branch-misses, then the system call will return
-EINVAL.
-[ Note: more hw_event_types are supported as well, but they are CPU
- specific and are enumerated via /sys on a per CPU basis. Raw hw event
- types can be passed in as negative numbers. For example, to count
- "External bus cycles while bus lock signal asserted" events on Intel
- Core CPUs, pass in a -0x4064 event type value. ]
-
-The parameter 'hw_event_period' is the number of events before waking up
-a read() that is blocked on a counter fd. Zero value means a non-blocking
-counter.
+More hw_event_types are supported as well, but they are CPU
+specific and are enumerated via /sys on a per CPU basis. Raw hw event
+types can be passed in under hw_event.type if hw_event.raw is 1.
+For example, to count "External bus cycles while bus lock signal asserted"
+events on Intel Core CPUs, pass in a 0x4064 event type value and set
+hw_event.raw to 1.
'record_type' is the type of data that a read() will provide for the
counter, and it can be one of:
- enum perf_record_type {
- PERF_RECORD_SIMPLE,
- PERF_RECORD_IRQ,
- };
+/*
+ * IRQ-notification data record type:
+ */
+enum perf_counter_record_type {
+ PERF_RECORD_SIMPLE = 0,
+ PERF_RECORD_IRQ = 1,
+ PERF_RECORD_GROUP = 2,
+};
a "simple" counter is one that counts hardware events and allows
them to be read out into a u64 count value. (read() returns 8 on
@@ -76,6 +111,10 @@ the IP of the interrupted context. In this case read() will return
the 8-byte counter value, plus the Instruction Pointer address of the
interrupted context.
+The parameter 'hw_event_period' is the number of events before waking up
+a read() that is blocked on a counter fd. Zero value means a non-blocking
+counter.
+
The 'pid' parameter allows the counter to be specific to a task:
pid == 0: if the pid parameter is zero, the counter is attached to the
@@ -92,7 +131,7 @@ CPU:
cpu >= 0: the counter is restricted to a specific CPU
cpu == -1: the counter counts on all CPUs
-Note: the combination of 'pid == -1' and 'cpu == -1' is not valid.
+(Note: the combination of 'pid == -1' and 'cpu == -1' is not valid.)
A 'pid > 0' and 'cpu == -1' counter is a per task counter that counts
events of that task and 'follows' that task to whatever CPU the task
@@ -102,3 +141,7 @@ their own tasks.
A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts
all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege.
+Group counters are created by passing in a group_fd of another counter.
+Groups are scheduled at once and can be used with PERF_RECORD_GROUP
+to record multi-dimensional timestamps.
+
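As a usage illustration of the interface documented above, here is a minimal
user-space sketch. It is not part of the patch: the syscall number is assumed
to be available as __NR_perf_counter_open (architecture specific and not shown
in this series), and the struct layout simply follows the documentation text.

    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    /* Field order follows the perf_counter_hw_event layout documented above: */
    struct perf_counter_hw_event {
            int64_t         type;             /* >= 0: hw event, < 0: sw event */
            uint64_t        irq_period;       /* 0 == non-blocking counter     */
            uint32_t        record_type;      /* 0 == PERF_RECORD_SIMPLE       */
            uint32_t        disabled     : 1,
                            nmi          : 1,
                            raw          : 1,
                            __reserved_1 : 29;
            uint64_t        __reserved_2;
    };

    int main(void)
    {
            struct perf_counter_hw_event hw_event = {
                    .type = 0,                /* PERF_COUNT_CYCLES */
            };
            uint64_t count;
            int fd;

            /* __NR_perf_counter_open must be supplied for the target arch: */
            fd = syscall(__NR_perf_counter_open, &hw_event,
                         0  /* pid: current task  */,
                         -1 /* cpu: follow the task */,
                         -1 /* group_fd: no group   */);
            if (fd < 0)
                    return 1;

            /* ... the code to be measured runs here ... */

            if (read(fd, &count, sizeof(count)) == sizeof(count))
                    printf("cycles: %llu\n", (unsigned long long)count);

            close(fd);
            return 0;
    }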
commit 6a930700c8b655a9e25e42fc4adc0b225ebbcefc
Author: Ingo Molnar <mingo@elte.hu>
Date: Thu Dec 11 15:17:03 2008 +0100
perf counters: clean up state transitions
Impact: cleanup
Introduce a proper enum for the 3 states of a counter:
PERF_COUNTER_STATE_OFF = -1
PERF_COUNTER_STATE_INACTIVE = 0
PERF_COUNTER_STATE_ACTIVE = 1
and rename counter->active to counter->state and propagate the
changes everywhere.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 3e1dbebe22b9..4854cca7fffd 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -332,7 +332,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
* Then store sibling timestamps (if any):
*/
list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
- if (!counter->active) {
+ if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
/*
* When counter was not in the overflow mask, we have to
* read it from hardware. We read it as well, when it
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 97d86c293ee8..8cb095fa442c 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -127,6 +127,15 @@ struct hw_perf_counter_ops {
void (*hw_perf_counter_read) (struct perf_counter *counter);
};
+/**
+ * enum perf_counter_active_state - the states of a counter
+ */
+enum perf_counter_active_state {
+ PERF_COUNTER_STATE_OFF = -1,
+ PERF_COUNTER_STATE_INACTIVE = 0,
+ PERF_COUNTER_STATE_ACTIVE = 1,
+};
+
/**
* struct perf_counter - performance counter kernel representation:
*/
@@ -136,7 +145,7 @@ struct perf_counter {
struct perf_counter *group_leader;
const struct hw_perf_counter_ops *hw_ops;
- int active;
+ enum perf_counter_active_state state;
#if BITS_PER_LONG == 64
atomic64_t count;
#else
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 4e679b91d8bb..559130b8774d 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -167,9 +167,9 @@ static void __perf_counter_remove_from_context(void *info)
spin_lock(&ctx->lock);
- if (counter->active) {
+ if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
counter->hw_ops->hw_perf_counter_disable(counter);
- counter->active = 0;
+ counter->state = PERF_COUNTER_STATE_INACTIVE;
ctx->nr_active--;
cpuctx->active_oncpu--;
counter->task = NULL;
@@ -281,7 +281,7 @@ static void __perf_install_in_context(void *info)
if (cpuctx->active_oncpu < perf_max_counters) {
counter->hw_ops->hw_perf_counter_enable(counter);
- counter->active = 1;
+ counter->state = PERF_COUNTER_STATE_ACTIVE;
counter->oncpu = cpu;
ctx->nr_active++;
cpuctx->active_oncpu++;
@@ -328,7 +328,6 @@ perf_install_in_context(struct perf_counter_context *ctx,
spin_lock_irq(&ctx->lock);
/*
- * If the context is active and the counter has not been added
* we need to retry the smp call.
*/
if (ctx->nr_active && list_empty(&counter->list_entry)) {
@@ -353,12 +352,12 @@ counter_sched_out(struct perf_counter *counter,
struct perf_cpu_context *cpuctx,
struct perf_counter_context *ctx)
{
- if (!counter->active)
+ if (counter->state != PERF_COUNTER_STATE_ACTIVE)
return;
counter->hw_ops->hw_perf_counter_disable(counter);
- counter->active = 0;
- counter->oncpu = -1;
+ counter->state = PERF_COUNTER_STATE_INACTIVE;
+ counter->oncpu = -1;
cpuctx->active_oncpu--;
ctx->nr_active--;
@@ -415,11 +414,11 @@ counter_sched_in(struct perf_counter *counter,
struct perf_counter_context *ctx,
int cpu)
{
- if (counter->active == -1)
+ if (counter->state == PERF_COUNTER_STATE_OFF)
return;
counter->hw_ops->hw_perf_counter_enable(counter);
- counter->active = 1;
+ counter->state = PERF_COUNTER_STATE_ACTIVE;
counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
cpuctx->active_oncpu++;
@@ -506,8 +505,8 @@ int perf_counter_task_disable(void)
perf_flags = hw_perf_save_disable();
list_for_each_entry(counter, &ctx->counter_list, list_entry) {
- WARN_ON_ONCE(counter->active == 1);
- counter->active = -1;
+ WARN_ON_ONCE(counter->state == PERF_COUNTER_STATE_ACTIVE);
+ counter->state = PERF_COUNTER_STATE_OFF;
}
hw_perf_restore(perf_flags);
@@ -540,9 +539,9 @@ int perf_counter_task_enable(void)
perf_flags = hw_perf_save_disable();
list_for_each_entry(counter, &ctx->counter_list, list_entry) {
- if (counter->active != -1)
+ if (counter->state != PERF_COUNTER_STATE_OFF)
continue;
- counter->active = 0;
+ counter->state = PERF_COUNTER_STATE_INACTIVE;
}
hw_perf_restore(perf_flags);
@@ -620,7 +619,7 @@ static u64 perf_counter_read(struct perf_counter *counter)
* If counter is enabled and currently active on a CPU, update the
* value in the counter structure:
*/
- if (counter->active) {
+ if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
smp_call_function_single(counter->oncpu,
__hw_perf_counter_read, counter, 1);
}
@@ -673,7 +672,7 @@ static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
retry:
spin_lock_irq(&ctx->lock);
- if (!counter->active) {
+ if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
counter->irqdata = counter->usrdata;
counter->usrdata = oldirqdata;
spin_unlock_irq(&ctx->lock);
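The cleanup above replaces an overloaded integer with named states. For
reference, a small sketch of how the old 'counter->active' values map onto the
new enum (the mapping is inferred from the hunks above, not part of the patch):

    enum perf_counter_active_state {
            PERF_COUNTER_STATE_OFF      = -1,  /* was: active == -1 (disabled)       */
            PERF_COUNTER_STATE_INACTIVE =  0,  /* was: active ==  0 (not on the PMU) */
            PERF_COUNTER_STATE_ACTIVE   =  1,  /* was: active ==  1 (counting now)   */
    };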
commit 1d1c7ddbfab358445a542715551301b7fc363e28
Author: Ingo Molnar <mingo@elte.hu>
Date: Thu Dec 11 14:59:31 2008 +0100
perf counters: add prctl interface to disable/enable counters
Add a way for self-monitoring tasks to disable/enable counters summarily,
via a prctl:
PR_TASK_PERF_COUNTERS_DISABLE 31
PR_TASK_PERF_COUNTERS_ENABLE 32
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 30c0ec8c1ee3..97d86c293ee8 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -213,6 +213,8 @@ extern u64 hw_perf_save_disable(void);
extern void hw_perf_restore(u64 ctrl);
extern void atomic64_counter_set(struct perf_counter *counter, u64 val64);
extern u64 atomic64_counter_read(struct perf_counter *counter);
+extern int perf_counter_task_disable(void);
+extern int perf_counter_task_enable(void);
#else
static inline void
@@ -226,6 +228,8 @@ static inline void perf_counter_notify(struct pt_regs *regs) { }
static inline void perf_counter_print_debug(void) { }
static inline void hw_perf_restore(u64 ctrl) { }
static inline u64 hw_perf_save_disable(void) { return 0; }
+static inline int perf_counter_task_disable(void) { return -EINVAL; }
+static inline int perf_counter_task_enable(void) { return -EINVAL; }
#endif
#endif /* _LINUX_PERF_COUNTER_H */
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index 48d887e3c6e7..b00df4c79c63 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -85,4 +85,7 @@
#define PR_SET_TIMERSLACK 29
#define PR_GET_TIMERSLACK 30
+#define PR_TASK_PERF_COUNTERS_DISABLE 31
+#define PR_TASK_PERF_COUNTERS_ENABLE 32
+
#endif /* _LINUX_PRCTL_H */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index a0fe8474ee29..4e679b91d8bb 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -415,6 +415,9 @@ counter_sched_in(struct perf_counter *counter,
struct perf_counter_context *ctx,
int cpu)
{
+ if (counter->active == -1)
+ return;
+
counter->hw_ops->hw_perf_counter_enable(counter);
counter->active = 1;
counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
@@ -479,6 +482,79 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu)
cpuctx->task_ctx = ctx;
}
+int perf_counter_task_disable(void)
+{
+ struct task_struct *curr = current;
+ struct perf_counter_context *ctx = &curr->perf_counter_ctx;
+ struct perf_counter *counter;
+ u64 perf_flags;
+ int cpu;
+
+ if (likely(!ctx->nr_counters))
+ return 0;
+
+ local_irq_disable();
+ cpu = smp_processor_id();
+
+ perf_counter_task_sched_out(curr, cpu);
+
+ spin_lock(&ctx->lock);
+
+ /*
+ * Disable all the counters:
+ */
+ perf_flags = hw_perf_save_disable();
+
+ list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+ WARN_ON_ONCE(counter->active == 1);
+ counter->active = -1;
+ }
+ hw_perf_restore(perf_flags);
+
+ spin_unlock(&ctx->lock);
+
+ local_irq_enable();
+
+ return 0;
+}
+
+int perf_counter_task_enable(void)
+{
+ struct task_struct *curr = current;
+ struct perf_counter_context *ctx = &curr->perf_counter_ctx;
+ struct perf_counter *counter;
+ u64 perf_flags;
+ int cpu;
+
+ if (likely(!ctx->nr_counters))
+ return 0;
+
+ local_irq_disable();
+ cpu = smp_processor_id();
+
+ spin_lock(&ctx->lock);
+
+ /*
+ * Disable all the counters:
+ */
+ perf_flags = hw_perf_save_disable();
+
+ list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+ if (counter->active != -1)
+ continue;
+ counter->active = 0;
+ }
+ hw_perf_restore(perf_flags);
+
+ spin_unlock(&ctx->lock);
+
+ perf_counter_task_sched_in(curr, cpu);
+
+ local_irq_enable();
+
+ return 0;
+}
+
void perf_counter_task_tick(struct task_struct *curr, int cpu)
{
struct perf_counter_context *ctx = &curr->perf_counter_ctx;
@@ -951,13 +1027,9 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
* @cpu: target cpu
* @group_fd: group leader counter fd
*/
-asmlinkage int sys_perf_counter_open(
-
- struct perf_counter_hw_event *hw_event_uptr __user,
- pid_t pid,
- int cpu,
- int group_fd)
-
+asmlinkage int
+sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
+ pid_t pid, int cpu, int group_fd)
{
struct perf_counter *counter, *group_leader;
struct perf_counter_hw_event hw_event;
diff --git a/kernel/sys.c b/kernel/sys.c
index 31deba8f7d16..0f66633be319 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -14,6 +14,7 @@
#include <linux/prctl.h>
#include <linux/highuid.h>
#include <linux/fs.h>
+#include <linux/perf_counter.h>
#include <linux/resource.h>
#include <linux/kernel.h>
#include <linux/kexec.h>
@@ -1716,6 +1717,12 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
case PR_SET_TSC:
error = SET_TSC_CTL(arg2);
break;
+ case PR_TASK_PERF_COUNTERS_DISABLE:
+ error = perf_counter_task_disable();
+ break;
+ case PR_TASK_PERF_COUNTERS_ENABLE:
+ error = perf_counter_task_enable();
+ break;
case PR_GET_TIMERSLACK:
error = current->timer_slack_ns;
break;
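A self-monitoring task would use the new prctl pair roughly like this (a
sketch only; the PR_* values are taken from the prctl.h hunk above, and
do_unmeasured_setup() is a placeholder for code that should not be counted):

    #include <sys/prctl.h>

    #ifndef PR_TASK_PERF_COUNTERS_DISABLE
    # define PR_TASK_PERF_COUNTERS_DISABLE  31
    # define PR_TASK_PERF_COUNTERS_ENABLE   32
    #endif

    extern void do_unmeasured_setup(void);  /* placeholder */

    void measured_region(void)
    {
            /* Counters attached to this task tick here... */

            /* ...but not across this block: */
            prctl(PR_TASK_PERF_COUNTERS_DISABLE, 0, 0, 0, 0);
            do_unmeasured_setup();
            prctl(PR_TASK_PERF_COUNTERS_ENABLE, 0, 0, 0, 0);
    }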
commit bae43c9945ebeef15e7952e317efb02393d3bfc7
Author: Ingo Molnar <mingo@elte.hu>
Date: Thu Dec 11 14:03:20 2008 +0100
perf counters: implement PERF_COUNT_TASK_CLOCK
Impact: add new perf-counter type
The 'task clock' counter counts the amount of time a task is executing,
in nanoseconds. It stops ticking when a task is scheduled out, whether because
it blocks, sleeps or gets preempted.
This counter type is a Linux kernel based abstraction; it is available
even if the hardware does not support native hardware performance counters.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 68f6e3ad531f..30c0ec8c1ee3 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -50,8 +50,11 @@ enum hw_event_types {
*/
PERF_COUNT_CPU_CLOCK = -1,
PERF_COUNT_TASK_CLOCK = -2,
- PERF_COUNT_PAGE_FAULTS = -3,
- PERF_COUNT_CONTEXT_SWITCHES = -4,
+ /*
+ * Future software events:
+ */
+ /* PERF_COUNT_PAGE_FAULTS = -3,
+ PERF_COUNT_CONTEXT_SWITCHES = -4, */
};
/*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0e93fea17120..a0fe8474ee29 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -855,6 +855,25 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
.hw_perf_counter_read = cpu_clock_perf_counter_read,
};
+static void task_clock_perf_counter_enable(struct perf_counter *counter)
+{
+}
+
+static void task_clock_perf_counter_disable(struct perf_counter *counter)
+{
+}
+
+static void task_clock_perf_counter_read(struct perf_counter *counter)
+{
+ atomic64_counter_set(counter, current->se.sum_exec_runtime);
+}
+
+static const struct hw_perf_counter_ops perf_ops_task_clock = {
+ .hw_perf_counter_enable = task_clock_perf_counter_enable,
+ .hw_perf_counter_disable = task_clock_perf_counter_disable,
+ .hw_perf_counter_read = task_clock_perf_counter_read,
+};
+
static const struct hw_perf_counter_ops *
sw_perf_counter_init(struct perf_counter *counter)
{
@@ -864,6 +883,9 @@ sw_perf_counter_init(struct perf_counter *counter)
case PERF_COUNT_CPU_CLOCK:
hw_ops = &perf_ops_cpu_clock;
break;
+ case PERF_COUNT_TASK_CLOCK:
+ hw_ops = &perf_ops_task_clock;
+ break;
default:
break;
}
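With this in place, a task can read its own execution time through the same fd
interface as any hardware counter. A small sketch, assuming 'fd' was opened
with hw_event.type = PERF_COUNT_TASK_CLOCK (-2), pid = 0 and cpu = -1 as in
the earlier open() example:

    #include <stdint.h>
    #include <unistd.h>

    uint64_t measure_task_ns(int fd, void (*workload)(void))
    {
            uint64_t before = 0, after = 0;

            read(fd, &before, sizeof(before));
            workload();
            read(fd, &after, sizeof(after));

            /*
             * Nanoseconds the task actually spent executing; time spent
             * blocked, sleeping or preempted does not advance the counter.
             */
            return after - before;
    }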
commit 01b2838c4298c5e0d30b4993c195ac34dd9df61e
Author: Ingo Molnar <mingo@elte.hu>
Date: Thu Dec 11 13:45:51 2008 +0100
perf counters: consolidate hw_perf save/restore APIs
Impact: cleanup
Rename them to better match the usual IRQ disable/enable APIs:
hw_perf_disable_all() => hw_perf_save_disable()
hw_perf_restore_ctrl() => hw_perf_restore()
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 43c8e9a38b4e..3e1dbebe22b9 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -118,13 +118,13 @@ void hw_perf_enable_all(void)
wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
}
-void hw_perf_restore_ctrl(u64 ctrl)
+void hw_perf_restore(u64 ctrl)
{
wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0);
}
-EXPORT_SYMBOL_GPL(hw_perf_restore_ctrl);
+EXPORT_SYMBOL_GPL(hw_perf_restore);
-u64 hw_perf_disable_all(void)
+u64 hw_perf_save_disable(void)
{
u64 ctrl;
@@ -132,7 +132,7 @@ u64 hw_perf_disable_all(void)
wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
return ctrl;
}
-EXPORT_SYMBOL_GPL(hw_perf_disable_all);
+EXPORT_SYMBOL_GPL(hw_perf_save_disable);
static inline void
__x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx)
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index cca804e6f1dd..a3e66a33b7a2 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -270,11 +270,11 @@ static atomic_t c3_cpu_count;
/* Common C-state entry for C2, C3, .. */
static void acpi_cstate_enter(struct acpi_processor_cx *cstate)
{
- u64 pctrl;
+ u64 perf_flags;
/* Don't trace irqs off for idle */
stop_critical_timings();
- pctrl = hw_perf_disable_all();
+ perf_flags = hw_perf_save_disable();
if (cstate->entry_method == ACPI_CSTATE_FFH) {
/* Call into architectural FFH based C-state */
acpi_processor_ffh_cstate_enter(cstate);
@@ -287,7 +287,7 @@ static void acpi_cstate_enter(struct acpi_processor_cx *cstate)
gets asserted in time to freeze execution properly. */
unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
}
- hw_perf_restore_ctrl(pctrl);
+ hw_perf_restore(perf_flags);
start_critical_timings();
}
#endif /* !CONFIG_CPU_IDLE */
@@ -1433,7 +1433,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
/* Don't trace irqs off for idle */
stop_critical_timings();
- pctrl = hw_perf_disable_all();
+ pctrl = hw_perf_save_disable();
if (cx->entry_method == ACPI_CSTATE_FFH) {
/* Call into architectural FFH based C-state */
acpi_processor_ffh_cstate_enter(cx);
@@ -1448,7 +1448,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
gets asserted in time to freeze execution properly. */
unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
}
- hw_perf_restore_ctrl(pctrl);
+ hw_perf_restore(pctrl);
start_critical_timings();
}
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 9a1713a1be27..68f6e3ad531f 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -67,7 +67,7 @@ enum perf_counter_record_type {
* Hardware event to monitor via a performance monitoring counter:
*/
struct perf_counter_hw_event {
- u64 type;
+ s64 type;
u64 irq_period;
u32 record_type;
@@ -206,8 +206,8 @@ extern void perf_counter_task_tick(struct task_struct *task, int cpu);
extern void perf_counter_init_task(struct task_struct *task);
extern void perf_counter_notify(struct pt_regs *regs);
extern void perf_counter_print_debug(void);
-extern void hw_perf_restore_ctrl(u64 ctrl);
-extern u64 hw_perf_disable_all(void);
+extern u64 hw_perf_save_disable(void);
+extern void hw_perf_restore(u64 ctrl);
extern void atomic64_counter_set(struct perf_counter *counter, u64 val64);
extern u64 atomic64_counter_read(struct perf_counter *counter);
@@ -221,8 +221,8 @@ perf_counter_task_tick(struct task_struct *task, int cpu) { }
static inline void perf_counter_init_task(struct task_struct *task) { }
static inline void perf_counter_notify(struct pt_regs *regs) { }
static inline void perf_counter_print_debug(void) { }
-static inline void hw_perf_restore_ctrl(u64 ctrl) { }
-static inline u64 hw_perf_disable_all(void) { return 0; }
+static inline void hw_perf_restore(u64 ctrl) { }
+static inline u64 hw_perf_save_disable(void) { return 0; }
#endif
#endif /* _LINUX_PERF_COUNTER_H */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 506286e5ba63..0e93fea17120 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -43,8 +43,8 @@ hw_perf_counter_init(struct perf_counter *counter)
return ERR_PTR(-EINVAL);
}
-u64 __weak hw_perf_disable_all(void) { return 0; }
-void __weak hw_perf_restore_ctrl(u64 ctrl) { }
+u64 __weak hw_perf_save_disable(void) { return 0; }
+void __weak hw_perf_restore(u64 ctrl) { }
void __weak hw_perf_counter_setup(void) { }
#if BITS_PER_LONG == 64
@@ -180,9 +180,9 @@ static void __perf_counter_remove_from_context(void *info)
* Protect the list operation against NMI by disabling the
* counters on a global level. NOP for non NMI based counters.
*/
- perf_flags = hw_perf_disable_all();
+ perf_flags = hw_perf_save_disable();
list_del_counter(counter, ctx);
- hw_perf_restore_ctrl(perf_flags);
+ hw_perf_restore(perf_flags);
if (!ctx->task) {
/*
@@ -273,9 +273,9 @@ static void __perf_install_in_context(void *info)
* Protect the list operation against NMI by disabling the
* counters on a global level. NOP for non NMI based counters.
*/
- perf_flags = hw_perf_disable_all();
+ perf_flags = hw_perf_save_disable();
list_add_counter(counter, ctx);
- hw_perf_restore_ctrl(perf_flags);
+ hw_perf_restore(perf_flags);
ctx->nr_counters++;
@@ -495,13 +495,13 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
/*
* Rotate the first entry last (works just fine for group counters too):
*/
- perf_flags = hw_perf_disable_all();
+ perf_flags = hw_perf_save_disable();
list_for_each_entry(counter, &ctx->counter_list, list_entry) {
list_del(&counter->list_entry);
list_add_tail(&counter->list_entry, &ctx->counter_list);
break;
}
- hw_perf_restore_ctrl(perf_flags);
+ hw_perf_restore(perf_flags);
spin_unlock(&ctx->lock);
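The intended calling convention mirrors local_irq_save()/local_irq_restore():
the caller keeps the returned control word and hands it back on the restore
side. The pattern, as used throughout the hunks above:

    u64 perf_flags;

    /*
     * Globally disable counters (NOP for non-NMI counters) around the
     * list update, then restore the previous control state:
     */
    perf_flags = hw_perf_save_disable();
    list_del_counter(counter, ctx);
    hw_perf_restore(perf_flags);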
commit 5c92d12411dfe5f0f3d1b1c1e2f756245e6f7249
Author: Ingo Molnar <mingo@elte.hu>
Date: Thu Dec 11 13:21:10 2008 +0100
perf counters: implement PERF_COUNT_CPU_CLOCK
Impact: add new perf-counter type
The 'CPU clock' counter counts the amount of CPU clock time that is
elapsing, in nanoseconds, regardless of how much of that time the task
actually spends executing on a CPU.
This counter type is a Linux kernel based abstraction; it is available
even if the hardware does not support native hardware performance counters.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 718b635dece6..43c8e9a38b4e 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -178,35 +178,6 @@ static void x86_perf_counter_enable(struct perf_counter *counter)
__x86_perf_counter_enable(hwc, idx);
}
-#ifdef CONFIG_X86_64
-static inline void atomic64_counter_set(struct perf_counter *counter, u64 val)
-{
- atomic64_set(&counter->count, val);
-}
-
-static inline u64 atomic64_counter_read(struct perf_counter *counter)
-{
- return atomic64_read(&counter->count);
-}
-#else
-/*
- * Todo: add proper atomic64_t support to 32-bit x86:
- */
-static inline void atomic64_counter_set(struct perf_counter *counter, u64 val64)
-{
- u32 *val32 = (void *)&val64;
-
- atomic_set(counter->count32 + 0, *(val32 + 0));
- atomic_set(counter->count32 + 1, *(val32 + 1));
-}
-
-static inline u64 atomic64_counter_read(struct perf_counter *counter)
-{
- return atomic_read(counter->count32 + 0) |
- (u64) atomic_read(counter->count32 + 1) << 32;
-}
-#endif
-
static void __hw_perf_save_counter(struct perf_counter *counter,
struct hw_perf_counter *hwc, int idx)
{
@@ -309,7 +280,7 @@ static void x86_perf_counter_read(struct perf_counter *counter)
} while (offs != hwc->prev_count);
val32 = (s32) val;
- val = (s64)hwc->irq_period + (s64)val32;
+ val = (s64)hwc->irq_period + (s64)val32;
atomic64_counter_set(counter, hwc->prev_count + val);
}
@@ -573,13 +544,14 @@ void __init init_hw_perf_counters(void)
perf_counters_initialized = true;
}
-static struct hw_perf_counter_ops x86_perf_counter_ops = {
+static const struct hw_perf_counter_ops x86_perf_counter_ops = {
.hw_perf_counter_enable = x86_perf_counter_enable,
.hw_perf_counter_disable = x86_perf_counter_disable,
.hw_perf_counter_read = x86_perf_counter_read,
};
-struct hw_perf_counter_ops *hw_perf_counter_init(struct perf_counter *counter)
+const struct hw_perf_counter_ops *
+hw_perf_counter_init(struct perf_counter *counter)
{
int err;
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 27385641ecb6..9a1713a1be27 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -131,7 +131,7 @@ struct perf_counter {
struct list_head list_entry;
struct list_head sibling_list;
struct perf_counter *group_leader;
- struct hw_perf_counter_ops *hw_ops;
+ const struct hw_perf_counter_ops *hw_ops;
int active;
#if BITS_PER_LONG == 64
@@ -197,7 +197,7 @@ struct perf_cpu_context {
extern int perf_max_counters;
#ifdef CONFIG_PERF_COUNTERS
-extern struct hw_perf_counter_ops *
+extern const struct hw_perf_counter_ops *
hw_perf_counter_init(struct perf_counter *counter);
extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
@@ -208,6 +208,9 @@ extern void perf_counter_notify(struct pt_regs *regs);
extern void perf_counter_print_debug(void);
extern void hw_perf_restore_ctrl(u64 ctrl);
extern u64 hw_perf_disable_all(void);
+extern void atomic64_counter_set(struct perf_counter *counter, u64 val64);
+extern u64 atomic64_counter_read(struct perf_counter *counter);
+
#else
static inline void
perf_counter_task_sched_in(struct task_struct *task, int cpu) { }
@@ -219,7 +222,7 @@ static inline void perf_counter_init_task(struct task_struct *task) { }
static inline void perf_counter_notify(struct pt_regs *regs) { }
static inline void perf_counter_print_debug(void) { }
static inline void hw_perf_restore_ctrl(u64 ctrl) { }
-static inline u64 hw_perf_disable_all(void) { return 0; }
+static inline u64 hw_perf_disable_all(void) { return 0; }
#endif
#endif /* _LINUX_PERF_COUNTER_H */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index e6e41ca95463..506286e5ba63 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -37,15 +37,15 @@ static DEFINE_MUTEX(perf_resource_mutex);
/*
* Architecture provided APIs - weak aliases:
*/
-extern __weak struct hw_perf_counter_ops *
+extern __weak const struct hw_perf_counter_ops *
hw_perf_counter_init(struct perf_counter *counter)
{
return ERR_PTR(-EINVAL);
}
-void __weak hw_perf_disable_all(void) { }
-void __weak hw_perf_enable_all(void) { }
-void __weak hw_perf_counter_setup(void) { }
+u64 __weak hw_perf_disable_all(void) { return 0; }
+void __weak hw_perf_restore_ctrl(u64 ctrl) { }
+void __weak hw_perf_counter_setup(void) { }
#if BITS_PER_LONG == 64
@@ -58,6 +58,16 @@ static inline u64 perf_counter_read_safe(struct perf_counter *counter)
return (u64) atomic64_read(&counter->count);
}
+void atomic64_counter_set(struct perf_counter *counter, u64 val)
+{
+ atomic64_set(&counter->count, val);
+}
+
+u64 atomic64_counter_read(struct perf_counter *counter)
+{
+ return atomic64_read(&counter->count);
+}
+
#else
/*
@@ -79,6 +89,20 @@ static u64 perf_counter_read_safe(struct perf_counter *counter)
return cntl | ((u64) cnth) << 32;
}
+void atomic64_counter_set(struct perf_counter *counter, u64 val64)
+{
+ u32 *val32 = (void *)&val64;
+
+ atomic_set(counter->count32 + 0, *(val32 + 0));
+ atomic_set(counter->count32 + 1, *(val32 + 1));
+}
+
+u64 atomic64_counter_read(struct perf_counter *counter)
+{
+ return atomic_read(counter->count32 + 0) |
+ (u64) atomic_read(counter->count32 + 1) << 32;
+}
+
#endif
static void
@@ -131,6 +155,7 @@ static void __perf_counter_remove_from_context(void *info)
struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
struct perf_counter *counter = info;
struct perf_counter_context *ctx = counter->ctx;
+ u64 perf_flags;
/*
* If this is a task context, we need to check whether it is
@@ -155,9 +180,9 @@ static void __perf_counter_remove_from_context(void *info)
* Protect the list operation against NMI by disabling the
* counters on a global level. NOP for non NMI based counters.
*/
- hw_perf_disable_all();
+ perf_flags = hw_perf_disable_all();
list_del_counter(counter, ctx);
- hw_perf_enable_all();
+ hw_perf_restore_ctrl(perf_flags);
if (!ctx->task) {
/*
@@ -232,6 +257,7 @@ static void __perf_install_in_context(void *info)
struct perf_counter *counter = info;
struct perf_counter_context *ctx = counter->ctx;
int cpu = smp_processor_id();
+ u64 perf_flags;
/*
* If this is a task context, we need to check whether it is
@@ -247,9 +273,9 @@ static void __perf_install_in_context(void *info)
* Protect the list operation against NMI by disabling the
* counters on a global level. NOP for non NMI based counters.
*/
- hw_perf_disable_all();
+ perf_flags = hw_perf_disable_all();
list_add_counter(counter, ctx);
- hw_perf_enable_all();
+ hw_perf_restore_ctrl(perf_flags);
ctx->nr_counters++;
@@ -457,6 +483,7 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
{
struct perf_counter_context *ctx = &curr->perf_counter_ctx;
struct perf_counter *counter;
+ u64 perf_flags;
if (likely(!ctx->nr_counters))
return;
@@ -468,13 +495,13 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
/*
* Rotate the first entry last (works just fine for group counters too):
*/
- hw_perf_disable_all();
+ perf_flags = hw_perf_disable_all();
list_for_each_entry(counter, &ctx->counter_list, list_entry) {
list_del(&counter->list_entry);
list_add_tail(&counter->list_entry, &ctx->counter_list);
break;
}
- hw_perf_enable_all();
+ hw_perf_restore_ctrl(perf_flags);
spin_unlock(&ctx->lock);
@@ -807,6 +834,42 @@ static const struct file_operations perf_fops = {
.poll = perf_poll,
};
+static void cpu_clock_perf_counter_enable(struct perf_counter *counter)
+{
+}
+
+static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
+{
+}
+
+static void cpu_clock_perf_counter_read(struct perf_counter *counter)
+{
+ int cpu = raw_smp_processor_id();
+
+ atomic64_counter_set(counter, cpu_clock(cpu));
+}
+
+static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
+ .hw_perf_counter_enable = cpu_clock_perf_counter_enable,
+ .hw_perf_counter_disable = cpu_clock_perf_counter_disable,
+ .hw_perf_counter_read = cpu_clock_perf_counter_read,
+};
+
+static const struct hw_perf_counter_ops *
+sw_perf_counter_init(struct perf_counter *counter)
+{
+ const struct hw_perf_counter_ops *hw_ops = NULL;
+
+ switch (counter->hw_event.type) {
+ case PERF_COUNT_CPU_CLOCK:
+ hw_ops = &perf_ops_cpu_clock;
+ break;
+ default:
+ break;
+ }
+ return hw_ops;
+}
+
/*
* Allocate and initialize a counter structure
*/
@@ -815,7 +878,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
int cpu,
struct perf_counter *group_leader)
{
- struct hw_perf_counter_ops *hw_ops;
+ const struct hw_perf_counter_ops *hw_ops;
struct perf_counter *counter;
counter = kzalloc(sizeof(*counter), GFP_KERNEL);
@@ -842,7 +905,13 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
counter->group_leader = group_leader;
counter->hw_ops = NULL;
- hw_ops = hw_perf_counter_init(counter);
+ hw_ops = NULL;
+ if (!hw_event->raw && hw_event->type < 0)
+ hw_ops = sw_perf_counter_init(counter);
+ if (!hw_ops) {
+ hw_ops = hw_perf_counter_init(counter);
+ }
+
if (!hw_ops) {
kfree(counter);
return NULL;
@@ -912,7 +981,7 @@ asmlinkage int sys_perf_counter_open(
goto err_put_context;
}
- ret = -ENOMEM;
+ ret = -EINVAL;
counter = perf_counter_alloc(&hw_event, cpu, group_leader);
if (!counter)
goto err_put_context;
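The ops selection added to perf_counter_alloc() is the key dispatch point:
negative, non-raw event types are offered to the software-counter code first,
and everything else (or an unknown sw type) falls back to the hardware driver.
Condensed from the hunk above:

    const struct hw_perf_counter_ops *hw_ops = NULL;

    /*
     * Software counters use negative hw_event.type values
     * (PERF_COUNT_CPU_CLOCK == -1, ...); raw events always go
     * straight to the hardware driver:
     */
    if (!hw_event->raw && hw_event->type < 0)
            hw_ops = sw_perf_counter_init(counter);
    if (!hw_ops)
            hw_ops = hw_perf_counter_init(counter);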
commit 621a01eac89b5e2f81a4cf576568b31f40a02724
Author: Ingo Molnar <mingo@elte.hu>
Date: Thu Dec 11 12:46:46 2008 +0100
perf counters: hw driver API
Impact: restructure code, introduce hw_ops driver abstraction
Introduce this abstraction to handle counter details:
struct hw_perf_counter_ops {
void (*hw_perf_counter_enable) (struct perf_counter *counter);
void (*hw_perf_counter_disable) (struct perf_counter *counter);
void (*hw_perf_counter_read) (struct perf_counter *counter);
};
This will be useful to support asymmetric hw details, and it will also
be useful to implement "software counters". (Counters that count
kernel-managed sw events such as pagefaults, context-switches, wall-clock
time or task-local time.)
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 54b4ad0cce68..718b635dece6 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -56,7 +56,7 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
/*
* Setup the hardware configuration for a given hw_event_type
*/
-int hw_perf_counter_init(struct perf_counter *counter)
+static int __hw_perf_counter_init(struct perf_counter *counter)
{
struct perf_counter_hw_event *hw_event = &counter->hw_event;
struct hw_perf_counter *hwc = &counter->hw;
@@ -135,7 +135,7 @@ u64 hw_perf_disable_all(void)
EXPORT_SYMBOL_GPL(hw_perf_disable_all);
static inline void
-__hw_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx)
+__x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx)
{
wrmsr(hwc->config_base + idx, hwc->config, 0);
}
@@ -149,13 +149,13 @@ static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx)
wrmsr(hwc->counter_base + idx, hwc->next_count, 0);
}
-static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx)
+static void __x86_perf_counter_enable(struct hw_perf_counter *hwc, int idx)
{
wrmsr(hwc->config_base + idx,
hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
}
-void hw_perf_counter_enable(struct perf_counter *counter)
+static void x86_perf_counter_enable(struct perf_counter *counter)
{
struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
struct hw_perf_counter *hwc = &counter->hw;
@@ -170,12 +170,12 @@ void hw_perf_counter_enable(struct perf_counter *counter)
perf_counters_lapic_init(hwc->nmi);
- __hw_perf_counter_disable(hwc, idx);
+ __x86_perf_counter_disable(hwc, idx);
cpuc->counters[idx] = counter;
__hw_perf_counter_set_period(hwc, idx);
- __hw_perf_counter_enable(hwc, idx);
+ __x86_perf_counter_enable(hwc, idx);
}
#ifdef CONFIG_X86_64
@@ -282,20 +282,20 @@ void perf_counter_print_debug(void)
local_irq_enable();
}
-void hw_perf_counter_disable(struct perf_counter *counter)
+static void x86_perf_counter_disable(struct perf_counter *counter)
{
struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
struct hw_perf_counter *hwc = &counter->hw;
unsigned int idx = hwc->idx;
- __hw_perf_counter_disable(hwc, idx);
+ __x86_perf_counter_disable(hwc, idx);
clear_bit(idx, cpuc->used);
cpuc->counters[idx] = NULL;
__hw_perf_save_counter(counter, hwc, idx);
}
-void hw_perf_counter_read(struct perf_counter *counter)
+static void x86_perf_counter_read(struct perf_counter *counter)
{
struct hw_perf_counter *hwc = &counter->hw;
unsigned long addr = hwc->counter_base + hwc->idx;
@@ -342,7 +342,7 @@ static void perf_save_and_restart(struct perf_counter *counter)
__hw_perf_counter_set_period(hwc, idx);
if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE)
- __hw_perf_counter_enable(hwc, idx);
+ __x86_perf_counter_enable(hwc, idx);
}
static void
@@ -572,3 +572,20 @@ void __init init_hw_perf_counters(void)
perf_counters_initialized = true;
}
+
+static struct hw_perf_counter_ops x86_perf_counter_ops = {
+ .hw_perf_counter_enable = x86_perf_counter_enable,
+ .hw_perf_counter_disable = x86_perf_counter_disable,
+ .hw_perf_counter_read = x86_perf_counter_read,
+};
+
+struct hw_perf_counter_ops *hw_perf_counter_init(struct perf_counter *counter)
+{
+ int err;
+
+ err = __hw_perf_counter_init(counter);
+ if (err)
+ return NULL;
+
+ return &x86_perf_counter_ops;
+}
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 7af7d8965460..27385641ecb6 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -113,6 +113,17 @@ struct perf_data {
u8 data[PERF_DATA_BUFLEN];
};
+struct perf_counter;
+
+/**
+ * struct hw_perf_counter_ops - performance counter hw ops
+ */
+struct hw_perf_counter_ops {
+ void (*hw_perf_counter_enable) (struct perf_counter *counter);
+ void (*hw_perf_counter_disable) (struct perf_counter *counter);
+ void (*hw_perf_counter_read) (struct perf_counter *counter);
+};
+
/**
* struct perf_counter - performance counter kernel representation:
*/
@@ -120,6 +131,7 @@ struct perf_counter {
struct list_head list_entry;
struct list_head sibling_list;
struct perf_counter *group_leader;
+ struct hw_perf_counter_ops *hw_ops;
int active;
#if BITS_PER_LONG == 64
@@ -185,6 +197,9 @@ struct perf_cpu_context {
extern int perf_max_counters;
#ifdef CONFIG_PERF_COUNTERS
+extern struct hw_perf_counter_ops *
+hw_perf_counter_init(struct perf_counter *counter);
+
extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
extern void perf_counter_task_tick(struct task_struct *task, int cpu);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 278209c547a8..e6e41ca95463 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -37,18 +37,15 @@ static DEFINE_MUTEX(perf_resource_mutex);
/*
* Architecture provided APIs - weak aliases:
*/
-
-int __weak hw_perf_counter_init(struct perf_counter *counter)
+extern __weak struct hw_perf_counter_ops *
+hw_perf_counter_init(struct perf_counter *counter)
{
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
}
-void __weak hw_perf_counter_enable(struct perf_counter *counter) { }
-void __weak hw_perf_counter_disable(struct perf_counter *counter) { }
-void __weak hw_perf_counter_read(struct perf_counter *counter) { }
-void __weak hw_perf_disable_all(void) { }
-void __weak hw_perf_enable_all(void) { }
-void __weak hw_perf_counter_setup(void) { }
+void __weak hw_perf_disable_all(void) { }
+void __weak hw_perf_enable_all(void) { }
+void __weak hw_perf_counter_setup(void) { }
#if BITS_PER_LONG == 64
@@ -146,7 +143,7 @@ static void __perf_counter_remove_from_context(void *info)
spin_lock(&ctx->lock);
if (counter->active) {
- hw_perf_counter_disable(counter);
+ counter->hw_ops->hw_perf_counter_disable(counter);
counter->active = 0;
ctx->nr_active--;
cpuctx->active_oncpu--;
@@ -257,7 +254,7 @@ static void __perf_install_in_context(void *info)
ctx->nr_counters++;
if (cpuctx->active_oncpu < perf_max_counters) {
- hw_perf_counter_enable(counter);
+ counter->hw_ops->hw_perf_counter_enable(counter);
counter->active = 1;
counter->oncpu = cpu;
ctx->nr_active++;
@@ -333,7 +330,7 @@ counter_sched_out(struct perf_counter *counter,
if (!counter->active)
return;
- hw_perf_counter_disable(counter);
+ counter->hw_ops->hw_perf_counter_disable(counter);
counter->active = 0;
counter->oncpu = -1;
@@ -392,7 +389,7 @@ counter_sched_in(struct perf_counter *counter,
struct perf_counter_context *ctx,
int cpu)
{
- hw_perf_counter_enable(counter);
+ counter->hw_ops->hw_perf_counter_enable(counter);
counter->active = 1;
counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
@@ -509,7 +506,9 @@ void perf_counter_init_task(struct task_struct *task)
*/
static void __hw_perf_counter_read(void *info)
{
- hw_perf_counter_read(info);
+ struct perf_counter *counter = info;
+
+ counter->hw_ops->hw_perf_counter_read(counter);
}
static u64 perf_counter_read(struct perf_counter *counter)
@@ -816,8 +815,10 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
int cpu,
struct perf_counter *group_leader)
{
- struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL);
+ struct hw_perf_counter_ops *hw_ops;
+ struct perf_counter *counter;
+ counter = kzalloc(sizeof(*counter), GFP_KERNEL);
if (!counter)
return NULL;
@@ -839,6 +840,14 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
counter->hw_event = *hw_event;
counter->wakeup_pending = 0;
counter->group_leader = group_leader;
+ counter->hw_ops = NULL;
+
+ hw_ops = hw_perf_counter_init(counter);
+ if (!hw_ops) {
+ kfree(counter);
+ return NULL;
+ }
+ counter->hw_ops = hw_ops;
return counter;
}
@@ -908,10 +917,6 @@ asmlinkage int sys_perf_counter_open(
if (!counter)
goto err_put_context;
- ret = hw_perf_counter_init(counter);
- if (ret)
- goto err_free_put_context;
-
perf_install_in_context(ctx, counter, cpu);
ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
@@ -927,8 +932,6 @@ asmlinkage int sys_perf_counter_open(
mutex_lock(&counter->mutex);
perf_counter_remove_from_context(counter);
mutex_unlock(&counter->mutex);
-
-err_free_put_context:
kfree(counter);
err_put_context:
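Under this abstraction an architecture backend boils down to three callbacks
plus an init hook that returns the ops table. A skeleton of what a port would
provide (the myarch_* names are made up for illustration; the x86 code in the
hunk above is the real instance):

    static void myarch_perf_counter_enable(struct perf_counter *counter)
    {
            /* program the PMU counter backing counter->hw ... */
    }

    static void myarch_perf_counter_disable(struct perf_counter *counter)
    {
            /* stop the PMU counter and save its value ... */
    }

    static void myarch_perf_counter_read(struct perf_counter *counter)
    {
            /* copy the current hardware count into the counter ... */
    }

    static struct hw_perf_counter_ops myarch_perf_counter_ops = {
            .hw_perf_counter_enable  = myarch_perf_counter_enable,
            .hw_perf_counter_disable = myarch_perf_counter_disable,
            .hw_perf_counter_read    = myarch_perf_counter_read,
    };

    struct hw_perf_counter_ops *hw_perf_counter_init(struct perf_counter *counter)
    {
            /* validate counter->hw_event for this CPU; return NULL on failure */
            return &myarch_perf_counter_ops;
    }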
commit ccff286d85098ba5438e22aa2ea807fc1e18cf2f
Author: Ingo Molnar <mingo@elte.hu>
Date: Thu Dec 11 11:26:29 2008 +0100
perf counters: group counter, fixes
Impact: bugfix
Check that a group does not span outside the context of a CPU or a task.
Also, do not allow deep recursive hierarchies.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index fa59fe8c02d5..278209c547a8 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -107,9 +107,6 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
list_del_init(&counter->list_entry);
- if (list_empty(&counter->sibling_list))
- return;
-
/*
* If this was a group counter with sibling counters then
* upgrade the siblings to singleton counters by adding them
@@ -395,9 +392,6 @@ counter_sched_in(struct perf_counter *counter,
struct perf_counter_context *ctx,
int cpu)
{
- if (!counter->active)
- return;
-
hw_perf_counter_enable(counter);
counter->active = 1;
counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
@@ -876,32 +870,39 @@ asmlinkage int sys_perf_counter_open(
return -EFAULT;
/*
- * Look up the group leader:
+ * Get the target context (task or percpu):
+ */
+ ctx = find_get_context(pid, cpu);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ /*
+ * Look up the group leader (we will attach this counter to it):
*/
group_leader = NULL;
if (group_fd != -1) {
ret = -EINVAL;
group_file = fget_light(group_fd, &fput_needed);
if (!group_file)
- goto out_fput;
+ goto err_put_context;
if (group_file->f_op != &perf_fops)
- goto out_fput;
+ goto err_put_context;
group_leader = group_file->private_data;
/*
- * Do not allow a recursive hierarchy:
+ * Do not allow a recursive hierarchy (this new sibling
+ * becoming part of another group-sibling):
+ */
+ if (group_leader->group_leader != group_leader)
+ goto err_put_context;
+ /*
+ * Do not allow to attach to a group in a different
+ * task or CPU context:
*/
- if (group_leader->group_leader)
- goto out_fput;
+ if (group_leader->ctx != ctx)
+ goto err_put_context;
}
- /*
- * Get the target context (task or percpu):
- */
- ctx = find_get_context(pid, cpu);
- if (IS_ERR(ctx))
- return PTR_ERR(ctx);
-
ret = -ENOMEM;
counter = perf_counter_alloc(&hw_event, cpu, group_leader);
if (!counter)
commit 04289bb9891882202d7e961c4c04d2376930e9f9
Author: Ingo Molnar <mingo@elte.hu>
Date: Thu Dec 11 08:38:42 2008 +0100
perf counters: add support for group counters
Impact: add group counters
This patch adds the "counter groups" abstraction.
Groups of counters behave much like normal 'single' counters, with a
few semantic and behavioral extensions on top of that.
A counter group is created by creating a new counter with the open()
syscall's group-leader group_fd file descriptor parameter pointing
to another, already existing counter.
Groups of counters are scheduled in and out in one atomic group, and
they are also roundrobin-scheduled atomically.
Counters that are members of a group can also record events with an
(atomic) extended timestamp that extends to all members of the group,
if the record type is set to PERF_RECORD_GROUP.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
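In user-space terms a group is built by opening the leader first and then
passing its fd as group_fd for every sibling. A minimal sketch (reusing the
hypothetical __NR_perf_counter_open number and the perf_counter_hw_event
layout from the documentation patch earlier in this series):

    struct perf_counter_hw_event cycles = { .type = 0 /* PERF_COUNT_CYCLES */ };
    struct perf_counter_hw_event insns  = { .type = 1 /* PERF_COUNT_INSTRUCTIONS */ };
    int leader_fd, sibling_fd;

    /* The group leader is opened like any standalone counter: */
    leader_fd  = syscall(__NR_perf_counter_open, &cycles, 0, -1, -1);

    /* The sibling joins the leader's group via group_fd: */
    sibling_fd = syscall(__NR_perf_counter_open, &insns, 0, -1, leader_fd);

    /*
     * Both counters are now scheduled onto (and off) the PMU as one
     * atomic group; with record_type == PERF_RECORD_GROUP the leader's
     * irq data stream carries timestamps for the whole group.
     */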
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index ef1936a871aa..54b4ad0cce68 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -346,18 +346,22 @@ static void perf_save_and_restart(struct perf_counter *counter)
}
static void
-perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown)
+perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
{
- struct perf_counter_context *ctx = leader->ctx;
- struct perf_counter *counter;
+ struct perf_counter *counter, *group_leader = sibling->group_leader;
int bit;
- list_for_each_entry(counter, &ctx->counters, list) {
- if (counter->hw_event.record_type != PERF_RECORD_SIMPLE ||
- counter == leader)
- continue;
+ /*
+ * Store the counter's own timestamp first:
+ */
+ perf_store_irq_data(sibling, sibling->hw_event.type);
+ perf_store_irq_data(sibling, atomic64_counter_read(sibling));
- if (counter->active) {
+ /*
+ * Then store sibling timestamps (if any):
+ */
+ list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
+ if (!counter->active) {
/*
* When counter was not in the overflow mask, we have to
* read it from hardware. We read it as well, when it
@@ -371,8 +375,8 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown)
perf_save_and_restart(counter);
}
}
- perf_store_irq_data(leader, counter->hw_event.type);
- perf_store_irq_data(leader, atomic64_counter_read(counter));
+ perf_store_irq_data(sibling, counter->hw_event.type);
+ perf_store_irq_data(sibling, atomic64_counter_read(counter));
}
}
@@ -416,10 +420,6 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
perf_store_irq_data(counter, instruction_pointer(regs));
break;
case PERF_RECORD_GROUP:
- perf_store_irq_data(counter,
- counter->hw_event.type);
- perf_store_irq_data(counter,
- atomic64_counter_read(counter));
perf_handle_group(counter, &status, &ack);
break;
}
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index a2b4852e2d70..7af7d8965460 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -117,7 +117,10 @@ struct perf_data {
* struct perf_counter - performance counter kernel representation:
*/
struct perf_counter {
- struct list_head list;
+ struct list_head list_entry;
+ struct list_head sibling_list;
+ struct perf_counter *group_leader;
+
int active;
#if BITS_PER_LONG == 64
atomic64_t count;
@@ -158,7 +161,8 @@ struct perf_counter_context {
* Protect the list of counters:
*/
spinlock_t lock;
- struct list_head counters;
+
+ struct list_head counter_list;
int nr_counters;
int nr_active;
struct task_struct *task;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0d323ceda3a4..fa59fe8c02d5 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -10,6 +10,7 @@
#include <linux/fs.h>
#include <linux/cpu.h>
#include <linux/smp.h>
+#include <linux/file.h>
#include <linux/poll.h>
#include <linux/sysfs.h>
#include <linux/ptrace.h>
@@ -55,7 +56,7 @@ void __weak hw_perf_counter_setup(void) { }
* Read the cached counter in counter safe against cross CPU / NMI
* modifications. 64 bit version - no complications.
*/
-static inline u64 perf_read_counter_safe(struct perf_counter *counter)
+static inline u64 perf_counter_read_safe(struct perf_counter *counter)
{
return (u64) atomic64_read(&counter->count);
}
@@ -66,7 +67,7 @@ static inline u64 perf_read_counter_safe(struct perf_counter *counter)
* Read the cached counter in counter safe against cross CPU / NMI
* modifications. 32 bit version.
*/
-static u64 perf_read_counter_safe(struct perf_counter *counter)
+static u64 perf_counter_read_safe(struct perf_counter *counter)
{
u32 cntl, cnth;
@@ -83,13 +84,55 @@ static u64 perf_read_counter_safe(struct perf_counter *counter)
#endif
+static void
+list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
+{
+ struct perf_counter *group_leader = counter->group_leader;
+
+ /*
+ * Depending on whether it is a standalone or sibling counter,
+ * add it straight to the context's counter list, or to the group
+ * leader's sibling list:
+ */
+ if (counter->group_leader == counter)
+ list_add_tail(&counter->list_entry, &ctx->counter_list);
+ else
+ list_add_tail(&counter->list_entry, &group_leader->sibling_list);
+}
+
+static void
+list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
+{
+ struct perf_counter *sibling, *tmp;
+
+ list_del_init(&counter->list_entry);
+
+ if (list_empty(&counter->sibling_list))
+ return;
+
+ /*
+ * If this was a group counter with sibling counters then
+ * upgrade the siblings to singleton counters by adding them
+ * to the context list directly:
+ */
+ list_for_each_entry_safe(sibling, tmp,
+ &counter->sibling_list, list_entry) {
+
+ list_del_init(&sibling->list_entry);
+ list_add_tail(&sibling->list_entry, &ctx->counter_list);
+ WARN_ON_ONCE(!sibling->group_leader);
+ WARN_ON_ONCE(sibling->group_leader == sibling);
+ sibling->group_leader = sibling;
+ }
+}
+
/*
* Cross CPU call to remove a performance counter
*
* We disable the counter on the hardware level first. After that we
* remove it from the context list.
*/
-static void __perf_remove_from_context(void *info)
+static void __perf_counter_remove_from_context(void *info)
{
struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
struct perf_counter *counter = info;
@@ -119,7 +162,7 @@ static void __perf_remove_from_context(void *info)
* counters on a global level. NOP for non NMI based counters.
*/
hw_perf_disable_all();
- list_del_init(&counter->list);
+ list_del_counter(counter, ctx);
hw_perf_enable_all();
if (!ctx->task) {
@@ -144,7 +187,7 @@ static void __perf_remove_from_context(void *info)
* CPU counters are removed with a smp call. For task counters we only
* call when the task is on a CPU.
*/
-static void perf_remove_from_context(struct perf_counter *counter)
+static void perf_counter_remove_from_context(struct perf_counter *counter)
{
struct perf_counter_context *ctx = counter->ctx;
struct task_struct *task = ctx->task;
@@ -155,32 +198,32 @@ static void perf_remove_from_context(struct perf_counter *counter)
* the removal is always sucessful.
*/
smp_call_function_single(counter->cpu,
- __perf_remove_from_context,
+ __perf_counter_remove_from_context,
counter, 1);
return;
}
retry:
- task_oncpu_function_call(task, __perf_remove_from_context,
+ task_oncpu_function_call(task, __perf_counter_remove_from_context,
counter);
spin_lock_irq(&ctx->lock);
/*
* If the context is active we need to retry the smp call.
*/
- if (ctx->nr_active && !list_empty(&counter->list)) {
+ if (ctx->nr_active && !list_empty(&counter->list_entry)) {
spin_unlock_irq(&ctx->lock);
goto retry;
}
/*
* The lock prevents that this context is scheduled in so we
- * can remove the counter safely, if it the call above did not
+ * can remove the counter safely, if the call above did not
* succeed.
*/
- if (!list_empty(&counter->list)) {
+ if (!list_empty(&counter->list_entry)) {
ctx->nr_counters--;
- list_del_init(&counter->list);
+ list_del_counter(counter, ctx);
counter->task = NULL;
}
spin_unlock_irq(&ctx->lock);
@@ -211,7 +254,7 @@ static void __perf_install_in_context(void *info)
* counters on a global level. NOP for non NMI based counters.
*/
hw_perf_disable_all();
- list_add_tail(&counter->list, &ctx->counters);
+ list_add_counter(counter, ctx);
hw_perf_enable_all();
ctx->nr_counters++;
@@ -268,7 +311,7 @@ perf_install_in_context(struct perf_counter_context *ctx,
* If the context is active and the counter has not been added
* we need to retry the smp call.
*/
- if (ctx->nr_active && list_empty(&counter->list)) {
+ if (ctx->nr_active && list_empty(&counter->list_entry)) {
spin_unlock_irq(&ctx->lock);
goto retry;
}
@@ -278,13 +321,45 @@ perf_install_in_context(struct perf_counter_context *ctx,
* can add the counter safely, if it the call above did not
* succeed.
*/
- if (list_empty(&counter->list)) {
- list_add_tail(&counter->list, &ctx->counters);
+ if (list_empty(&counter->list_entry)) {
+ list_add_counter(counter, ctx);
ctx->nr_counters++;
}
spin_unlock_irq(&ctx->lock);
}
+static void
+counter_sched_out(struct perf_counter *counter,
+ struct perf_cpu_context *cpuctx,
+ struct perf_counter_context *ctx)
+{
+ if (!counter->active)
+ return;
+
+ hw_perf_counter_disable(counter);
+ counter->active = 0;
+ counter->oncpu = -1;
+
+ cpuctx->active_oncpu--;
+ ctx->nr_active--;
+}
+
+static void
+group_sched_out(struct perf_counter *group_counter,
+ struct perf_cpu_context *cpuctx,
+ struct perf_counter_context *ctx)
+{
+ struct perf_counter *counter;
+
+ counter_sched_out(group_counter, cpuctx, ctx);
+
+ /*
+ * Schedule out siblings (if any):
+ */
+ list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
+ counter_sched_out(counter, cpuctx, ctx);
+}
+
/*
* Called from scheduler to remove the counters of the current task,
* with interrupts disabled.
@@ -306,21 +381,48 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu)
return;
spin_lock(&ctx->lock);
- list_for_each_entry(counter, &ctx->counters, list) {
- if (!ctx->nr_active)
- break;
- if (counter->active) {
- hw_perf_counter_disable(counter);
- counter->active = 0;
- counter->oncpu = -1;
- ctx->nr_active--;
- cpuctx->active_oncpu--;
- }
+ if (ctx->nr_active) {
+ list_for_each_entry(counter, &ctx->counter_list, list_entry)
+ group_sched_out(counter, cpuctx, ctx);
}
spin_unlock(&ctx->lock);
cpuctx->task_ctx = NULL;
}
+static void
+counter_sched_in(struct perf_counter *counter,
+ struct perf_cpu_context *cpuctx,
+ struct perf_counter_context *ctx,
+ int cpu)
+{
+ if (!counter->active)
+ return;
+
+ hw_perf_counter_enable(counter);
+ counter->active = 1;
+ counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
+
+ cpuctx->active_oncpu++;
+ ctx->nr_active++;
+}
+
+static void
+group_sched_in(struct perf_counter *group_counter,
+ struct perf_cpu_context *cpuctx,
+ struct perf_counter_context *ctx,
+ int cpu)
+{
+ struct perf_counter *counter;
+
+ counter_sched_in(group_counter, cpuctx, ctx, cpu);
+
+ /*
+ * Schedule in siblings as one group (if any):
+ */
+ list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
+ counter_sched_in(counter, cpuctx, ctx, cpu);
+}
+
/*
* Called from scheduler to add the counters of the current task
* with interrupts disabled.
@@ -342,19 +444,21 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu)
return;
spin_lock(&ctx->lock);
- list_for_each_entry(counter, &ctx->counters, list) {
+ list_for_each_entry(counter, &ctx->counter_list, list_entry) {
if (ctx->nr_active == cpuctx->max_pertask)
break;
+
+ /*
+ * Listen to the 'cpu' scheduling filter constraint
+ * of counters:
+ */
if (counter->cpu != -1 && counter->cpu != cpu)
continue;
- hw_perf_counter_enable(counter);
- counter->active = 1;
- counter->oncpu = cpu;
- ctx->nr_active++;
- cpuctx->active_oncpu++;
+ group_sched_in(counter, cpuctx, ctx, cpu);
}
spin_unlock(&ctx->lock);
+
cpuctx->task_ctx = ctx;
}
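
The 'cpu' scheduling filter honoured in the loop above can be pictured with a tiny stand-alone sketch: a counter opened with cpu == -1 follows its task to whatever CPU it runs on, while a counter opened for one specific CPU is only scheduled in when the task happens to run there. The struct and helper below are illustrative only, not kernel code:

    #include <stdio.h>

    struct counter { int cpu; };    /* -1: count on any CPU */

    static int passes_cpu_filter(const struct counter *c, int this_cpu)
    {
            return c->cpu == -1 || c->cpu == this_cpu;
    }

    int main(void)
    {
            struct counter any_cpu = { -1 };
            struct counter cpu2    = {  2 };

            printf("any-cpu counter on CPU 0: %d\n", passes_cpu_filter(&any_cpu, 0)); /* 1 */
            printf("CPU 2 counter on CPU 0:  %d\n", passes_cpu_filter(&cpu2, 0));     /* 0 */
            printf("CPU 2 counter on CPU 2:  %d\n", passes_cpu_filter(&cpu2, 2));     /* 1 */
            return 0;
    }
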
@@ -371,12 +475,12 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
spin_lock(&ctx->lock);
/*
- * Rotate the first entry last:
+ * Rotate the first entry last (works just fine for group counters too):
*/
hw_perf_disable_all();
- list_for_each_entry(counter, &ctx->counters, list) {
- list_del(&counter->list);
- list_add_tail(&counter->list, &ctx->counters);
+ list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+ list_del(&counter->list_entry);
+ list_add_tail(&counter->list_entry, &ctx->counter_list);
break;
}
hw_perf_enable_all();
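
The rotation above is what gives every counter, and every counter group (only group leaders sit on ctx->counter_list, so a group moves as one unit), a regular turn at the front of the list when more counters exist than can be active at once. A toy version of the same move, with a plain array standing in for the struct list_head list, behaves like this:

    #include <stdio.h>
    #include <string.h>

    #define NR 3

    /* Move the first entry to the tail, like the list_del()/list_add_tail()
     * pair in perf_counter_task_tick(): */
    static void rotate_first_to_last(const char *list[], int n)
    {
            const char *first = list[0];

            memmove(&list[0], &list[1], (n - 1) * sizeof(list[0]));
            list[n - 1] = first;
    }

    int main(void)
    {
            const char *groups[NR] = { "group A", "group B", "group C" };
            int tick;

            for (tick = 0; tick < NR; tick++) {
                    printf("tick %d front: %s\n", tick, groups[0]);
                    rotate_first_to_last(groups, NR);
            }
            return 0;
    }
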
@@ -386,17 +490,24 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
perf_counter_task_sched_in(curr, cpu);
}
+/*
+ * Initialize the perf_counter context in a task_struct:
+ */
+static void
+__perf_counter_init_context(struct perf_counter_context *ctx,
+ struct task_struct *task)
+{
+ spin_lock_init(&ctx->lock);
+ INIT_LIST_HEAD(&ctx->counter_list);
+ ctx->nr_counters = 0;
+ ctx->task = task;
+}
/*
* Initialize the perf_counter context in task_struct
*/
void perf_counter_init_task(struct task_struct *task)
{
- struct perf_counter_context *ctx = &task->perf_counter_ctx;
-
- spin_lock_init(&ctx->lock);
- INIT_LIST_HEAD(&ctx->counters);
- ctx->nr_counters = 0;
- ctx->task = task;
+ __perf_counter_init_context(&task->perf_counter_ctx, task);
}
/*
@@ -407,7 +518,7 @@ static void __hw_perf_counter_read(void *info)
hw_perf_counter_read(info);
}
-static u64 perf_read_counter(struct perf_counter *counter)
+static u64 perf_counter_read(struct perf_counter *counter)
{
/*
* If counter is enabled and currently active on a CPU, update the
@@ -418,7 +529,7 @@ static u64 perf_read_counter(struct perf_counter *counter)
__hw_perf_counter_read, counter, 1);
}
- return perf_read_counter_safe(counter);
+ return perf_counter_read_safe(counter);
}
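
As a rough sketch of the control flow in perf_counter_read() above: if the counter is currently live, the fresh hardware value has to be fetched on the CPU it is running on (the kernel does this with smp_call_function_single()); the previously saved value is returned otherwise. Everything below, including run_on_cpu() and the value fields, is an illustrative user-space model rather than the kernel implementation:

    #include <stdio.h>

    struct counter {
            int       active;
            int       oncpu;          /* CPU the counter is running on, or -1 */
            long long hw_value;       /* stands in for the hardware register  */
            long long saved_value;    /* last value copied out of the hardware */
    };

    /* Stands in for __hw_perf_counter_read() running on counter->oncpu: */
    static void hw_read(struct counter *c)
    {
            c->saved_value = c->hw_value;
    }

    /* Stands in for smp_call_function_single(cpu, fn, info, 1): */
    static void run_on_cpu(int cpu, void (*fn)(struct counter *), struct counter *c)
    {
            (void)cpu;                /* the kernel really targets 'cpu' here */
            fn(c);
    }

    static long long counter_read(struct counter *c)
    {
            if (c->active)
                    run_on_cpu(c->oncpu, hw_read, c);
            return c->saved_value;    /* stands in for perf_counter_read_safe() */
    }

    int main(void)
    {
            struct counter c = { 1, 2, 12345, 0 };

            printf("value: %lld\n", counter_read(&c));    /* prints 12345 */
            return 0;
    }
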
/*
@@ -555,7 +666,7 @@ static int perf_release(struct inode *inode, struct file *file)
mutex_lock(&counter->mutex);
- perf_remove_from_context(counter);
+ perf_counter_remove_from_context(counter);
put_context(ctx);
mutex_unlock(&counter->mutex);
@@ -577,7 +688,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
return -EINVAL;
mutex_lock(&counter->mutex);
- cntval = perf_read_counter(counter);
+ cntval = perf_counter_read(counter);
mutex_unlock(&counter->mutex);
return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
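
From user space, the value produced by perf_read_hw() arrives as a single u64 through a normal read() on the counter file descriptor. A hedged sketch, assuming the syscall number macro __NR_perf_counter_open and a header declaring struct perf_counter_hw_event exist in the tree this patch series targets (neither is shown in this patch), and assuming pid 0 selects the calling task:

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/perf_counter.h>   /* assumed location of perf_counter_hw_event */

    int main(void)
    {
            struct perf_counter_hw_event hw_event;
            uint64_t value;
            int fd;

            memset(&hw_event, 0, sizeof(hw_event));
            /* hw_event.type etc. would be filled in per the documentation above. */

            /* pid 0: calling task (assumed), cpu -1: any CPU, group_fd -1: standalone. */
            fd = syscall(__NR_perf_counter_open, &hw_event, 0, -1, -1);
            if (fd < 0)
                    return 1;

            /* ... run the code to be measured ... */

            if (read(fd, &value, sizeof(value)) == sizeof(value))
                    printf("counter value: %llu\n", (unsigned long long)value);

            close(fd);
            return 0;
    }
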
@@ -707,15 +818,25 @@ static const struct file_operations perf_fops = {
* Allocate and initialize a counter structure
*/
static struct perf_counter *
-perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu)
+perf_counter_alloc(struct perf_counter_hw_event *hw_event,
+ int cpu,
+ struct perf_counter *group_leader)
{
struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL);
if (!counter)
return NULL;
+ /*
+ * Single counters are their own group leaders, with an
+ * empty sibling list:
+ */
+ if (!group_leader)
+ group_leader = counter;
+
mutex_init(&counter->mutex);
- INIT_LIST_HEAD(&counter->list);
+ INIT_LIST_HEAD(&counter->list_entry);
+ INIT_LIST_HEAD(&counter->sibling_list);
init_waitqueue_head(&counter->waitq);
counter->irqdata = &counter->data[0];
@@ -723,6 +844,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu)
counter->cpu = cpu;
counter->hw_event = *hw_event;
counter->wakeup_pending = 0;
+ counter->group_leader = group_leader;
return counter;
}
@@ -743,20 +865,45 @@ asmlinkage int sys_perf_counter_open(
int group_fd)
{
- struct perf_counter_context *ctx;
+ struct perf_counter *counter, *group_leader;
struct perf_counter_hw_event hw_event;
- struct perf_counter *counter;
+ struct perf_counter_context *ctx;
+ struct file *group_file = NULL;
+ int fput_needed = 0;
int ret;
if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
return -EFAULT;
+ /*
+ * Look up the group leader:
+ */
+ group_leader = NULL;
+ if (group_fd != -1) {
+ ret = -EINVAL;
+ group_file = fget_light(group_fd, &fput_needed);
+ if (!group_file)
+ goto out_fput;
+ if (group_file->f_op != &perf_fops)
+ goto out_fput;
+
+ group_leader = group_file->private_data;
+ /*
+ * Do not allow a recursive hierarchy:
+ */
+ if (group_leader->group_leader)
+ goto out_fput;
+ }
+
+ /*
+ * Get the target context (task or percpu):
+ */
ctx = find_get_context(pid, cpu);
if (IS_ERR(ctx))
return PTR_ERR(ctx);
ret = -ENOMEM;
- counter = perf_counter_alloc(&hw_event, cpu);
+ counter = perf_counter_alloc(&hw_event, cpu, group_leader);
if (!counter)
goto err_put_context;
@@ -770,11 +917,14 @@ asmlinkage int sys_perf_counter_open(
if (ret < 0)
goto err_remove_free_put_context;
+out_fput:
+ fput_light(group_file, fput_needed);
+
return ret;
err_remove_free_put_context:
mutex_lock(&counter->mutex);
- perf_remove_from_context(counter);
+ perf_counter_remove_from_context(counter);
mutex_unlock(&counter->mutex);
err_free_put_context:
@@ -783,40 +933,40 @@ asmlinkage int sys_perf_counter_open(
err_put_context:
put_context(ctx);
- return ret;
+ goto out_fput;
}
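
To show what the group_fd plumbing above enables, here is a hedged user-space sketch of building a counter group: the first counter is opened with group_fd == -1 and becomes its own group leader, the second passes the leader's fd and is linked onto the leader's sibling list, so the whole group is scheduled on and off the PMU together; siblings themselves cannot be used as group leaders (the "no recursive hierarchy" check above). As in the earlier sketch, __NR_perf_counter_open, the header location and the pid semantics are assumptions, not something this patch defines:

    #include <string.h>
    #include <unistd.h>
    #include <sys/types.h>
    #include <sys/syscall.h>
    #include <linux/perf_counter.h>   /* assumed location of perf_counter_hw_event */

    static int counter_open(struct perf_counter_hw_event *hw_event,
                            pid_t pid, int cpu, int group_fd)
    {
            return syscall(__NR_perf_counter_open, hw_event, pid, cpu, group_fd);
    }

    int main(void)
    {
            struct perf_counter_hw_event hw_event;
            int leader, sibling;

            memset(&hw_event, 0, sizeof(hw_event));  /* event details filled in as needed */

            /* group_fd == -1: this counter leads its own (initially empty) group. */
            leader = counter_open(&hw_event, 0, -1, -1);
            if (leader < 0)
                    return 1;

            /* group_fd == leader: this counter becomes a sibling of the leader. */
            sibling = counter_open(&hw_event, 0, -1, leader);
            if (sibling < 0) {
                    close(leader);
                    return 1;
            }

            /* ... run the workload, then read() each fd as in the earlier sketch ... */

            close(sibling);
            close(leader);
            return 0;
    }
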
-static void __cpuinit perf_init_cpu(int cpu)
+static void __cpuinit perf_counter_init_cpu(int cpu)
{
- struct perf_cpu_context *ctx;
+ struct perf_cpu_context *cpuctx;
- ctx = &per_cpu(perf_cpu_context, cpu);
- spin_lock_init(&ctx->ctx.lock);
- INIT_LIST_HEAD(&ctx->ctx.counters);
+ cpuctx = &per_cpu(perf_cpu_context, cpu);
+ __perf_counter_init_context(&cpuctx->ctx, NULL);
mutex_lock(&perf_resource_mutex);
- ctx->max_pertask = perf_max_counters - perf_reserved_percpu;
+ cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
mutex_unlock(&perf_resource_mutex);
+
hw_perf_counter_setup();
}
#ifdef CONFIG_HOTPLUG_CPU
-static void __perf_exit_cpu(void *info)
+static void __perf_counter_exit_cpu(void *info)
{
struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
struct perf_counter_context *ctx = &cpuctx->ctx;
struct perf_counter *counter, *tmp;
- list_for_each_entry_safe(counter, tmp, &ctx->counters, list)
- __perf_remove_from_context(counter);
+ list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
+ __perf_counter_remove_from_context(counter);
}
-static void perf_exit_cpu(int cpu)
+static void perf_counter_exit_cpu(int cpu)
{
- smp_call_function_single(cpu, __perf_exit_cpu, NULL, 1);
+ smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
}
#else
-static inline void perf_exit_cpu(int cpu) { }
+static inline void perf_counter_exit_cpu(int cpu) { }
#endif
static int __cpuinit
@@ -828,12 +978,12 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- perf_init_cpu(cpu);
+ perf_counter_init_cpu(cpu);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
- perf_exit_cpu(cpu);
+ perf_counter_exit_cpu(cpu);
break;
default: