Patches contributed by Eötvös Lorand University
commit 9f66a3810fe0d4100972db84290f3ae4a4d77025
Author: Ingo Molnar <mingo@elte.hu>
Date: Wed Dec 10 12:33:23 2008 +0100
perf counters: restructure the API
Impact: clean up new API
Thorough cleanup of the new perf counters API, we now get clean separation
of the various concepts:
- introduce perf_counter_hw_event to separate out the event source details
- move special type flags into separate attributes: PERF_COUNT_NMI,
PERF_COUNT_RAW
- extend the type to u64 and reserve it fully to the architecture in the
raw type case.
And make use of all these changes in the core and x86 perfcounters code.
Also change the syscall signature to:
asmlinkage int sys_perf_counter_open(
struct perf_counter_hw_event *hw_event_uptr __user,
pid_t pid,
int cpu,
int group_fd);
( Note that group_fd is unused for now - it's reserved for the counter
groups abstraction. )
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 30e7ebf78275..ef1936a871aa 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -58,8 +58,8 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
*/
int hw_perf_counter_init(struct perf_counter *counter)
{
+ struct perf_counter_hw_event *hw_event = &counter->hw_event;
struct hw_perf_counter *hwc = &counter->hw;
- u32 hw_event_type = counter->event.hw_event_type;
if (unlikely(!perf_counters_initialized))
return -EINVAL;
@@ -77,14 +77,14 @@ int hw_perf_counter_init(struct perf_counter *counter)
hwc->nmi = 0;
if (capable(CAP_SYS_ADMIN)) {
hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
- if (hw_event_type & PERF_COUNT_NMI)
+ if (hw_event->nmi)
hwc->nmi = 1;
}
- hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0;
- hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0;
+ hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0;
+ hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0;
- hwc->irq_period = counter->event.hw_event_period;
+ hwc->irq_period = hw_event->irq_period;
/*
* Intel PMCs cannot be accessed sanely above 32 bit width,
* so we install an artificial 1<<31 period regardless of
@@ -93,21 +93,20 @@ int hw_perf_counter_init(struct perf_counter *counter)
if (!hwc->irq_period)
hwc->irq_period = 0x7FFFFFFF;
- hwc->next_count = -((s32) hwc->irq_period);
+ hwc->next_count = -(s32)hwc->irq_period;
/*
* Raw event type provide the config in the event structure
*/
- hw_event_type &= ~PERF_COUNT_NMI;
- if (hw_event_type == PERF_COUNT_RAW) {
- hwc->config |= counter->event.hw_raw_ctrl;
+ if (hw_event->raw) {
+ hwc->config |= hw_event->type;
} else {
- if (hw_event_type >= max_intel_perfmon_events)
+ if (hw_event->type >= max_intel_perfmon_events)
return -EINVAL;
/*
* The generic map:
*/
- hwc->config |= intel_perfmon_event_map[hw_event_type];
+ hwc->config |= intel_perfmon_event_map[hw_event->type];
}
counter->wakeup_pending = 0;
@@ -354,7 +353,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown)
int bit;
list_for_each_entry(counter, &ctx->counters, list) {
- if (counter->record_type != PERF_RECORD_SIMPLE ||
+ if (counter->hw_event.record_type != PERF_RECORD_SIMPLE ||
counter == leader)
continue;
@@ -372,7 +371,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown)
perf_save_and_restart(counter);
}
}
- perf_store_irq_data(leader, counter->event.hw_event_type);
+ perf_store_irq_data(leader, counter->hw_event.type);
perf_store_irq_data(leader, atomic64_counter_read(counter));
}
}
@@ -410,7 +409,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
perf_save_and_restart(counter);
- switch (counter->record_type) {
+ switch (counter->hw_event.record_type) {
case PERF_RECORD_SIMPLE:
continue;
case PERF_RECORD_IRQ:
@@ -418,7 +417,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
break;
case PERF_RECORD_GROUP:
perf_store_irq_data(counter,
- counter->event.hw_event_type);
+ counter->hw_event.type);
perf_store_irq_data(counter,
atomic64_counter_read(counter));
perf_handle_group(counter, &status, &ack);
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 1f0017673e77..a2b4852e2d70 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -24,65 +24,93 @@
struct task_struct;
/*
- * Generalized hardware event types, used by the hw_event_type parameter
- * of the sys_perf_counter_open() syscall:
+ * User-space ABI bits:
+ */
+
+/*
+ * Generalized performance counter event types, used by the hw_event.type
+ * parameter of the sys_perf_counter_open() syscall:
*/
enum hw_event_types {
- PERF_COUNT_CYCLES,
- PERF_COUNT_INSTRUCTIONS,
- PERF_COUNT_CACHE_REFERENCES,
- PERF_COUNT_CACHE_MISSES,
- PERF_COUNT_BRANCH_INSTRUCTIONS,
- PERF_COUNT_BRANCH_MISSES,
/*
- * If this bit is set in the type, then trigger NMI sampling:
+ * Common hardware events, generalized by the kernel:
*/
- PERF_COUNT_NMI = (1 << 30),
- PERF_COUNT_RAW = (1 << 31),
+ PERF_COUNT_CYCLES = 0,
+ PERF_COUNT_INSTRUCTIONS = 1,
+ PERF_COUNT_CACHE_REFERENCES = 2,
+ PERF_COUNT_CACHE_MISSES = 3,
+ PERF_COUNT_BRANCH_INSTRUCTIONS = 4,
+ PERF_COUNT_BRANCH_MISSES = 5,
+
+ /*
+ * Special "software" counters provided by the kernel, even if
+ * the hardware does not support performance counters. These
+ * counters measure various physical and sw events of the
+ * kernel (and allow the profiling of them as well):
+ */
+ PERF_COUNT_CPU_CLOCK = -1,
+ PERF_COUNT_TASK_CLOCK = -2,
+ PERF_COUNT_PAGE_FAULTS = -3,
+ PERF_COUNT_CONTEXT_SWITCHES = -4,
};
/*
* IRQ-notification data record type:
*/
-enum perf_record_type {
- PERF_RECORD_SIMPLE,
- PERF_RECORD_IRQ,
- PERF_RECORD_GROUP,
+enum perf_counter_record_type {
+ PERF_RECORD_SIMPLE = 0,
+ PERF_RECORD_IRQ = 1,
+ PERF_RECORD_GROUP = 2,
};
-struct perf_counter_event {
- u32 hw_event_type;
- u32 hw_event_period;
- u64 hw_raw_ctrl;
+/*
+ * Hardware event to monitor via a performance monitoring counter:
+ */
+struct perf_counter_hw_event {
+ u64 type;
+
+ u64 irq_period;
+ u32 record_type;
+
+ u32 disabled : 1, /* off by default */
+ nmi : 1, /* NMI sampling */
+ raw : 1, /* raw event type */
+ __reserved_1 : 29;
+
+ u64 __reserved_2;
};
+/*
+ * Kernel-internal data types:
+ */
+
/**
- * struct hw_perf_counter - performance counter hardware details
+ * struct hw_perf_counter - performance counter hardware details:
*/
struct hw_perf_counter {
- u64 config;
- unsigned long config_base;
- unsigned long counter_base;
- int nmi;
- unsigned int idx;
- u64 prev_count;
- s32 next_count;
- u64 irq_period;
+ u64 config;
+ unsigned long config_base;
+ unsigned long counter_base;
+ int nmi;
+ unsigned int idx;
+ u64 prev_count;
+ u64 irq_period;
+ s32 next_count;
};
/*
* Hardcoded buffer length limit for now, for IRQ-fed events:
*/
-#define PERF_DATA_BUFLEN 2048
+#define PERF_DATA_BUFLEN 2048
/**
* struct perf_data - performance counter IRQ data sampling ...
*/
struct perf_data {
- int len;
- int rd_idx;
- int overrun;
- u8 data[PERF_DATA_BUFLEN];
+ int len;
+ int rd_idx;
+ int overrun;
+ u8 data[PERF_DATA_BUFLEN];
};
/**
@@ -96,7 +124,7 @@ struct perf_counter {
#else
atomic_t count32[2];
#endif
- struct perf_counter_event event;
+ struct perf_counter_hw_event hw_event;
struct hw_perf_counter hw;
struct perf_counter_context *ctx;
@@ -110,8 +138,6 @@ struct perf_counter {
int oncpu;
int cpu;
- enum perf_record_type record_type;
-
/* read() / irq related data */
wait_queue_head_t waitq;
/* optional: for NMIs */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 3ecd73d03daa..a549678b7c3c 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -54,7 +54,7 @@ struct compat_stat;
struct compat_timeval;
struct robust_list_head;
struct getcpu_cache;
-struct perf_counter_event;
+struct perf_counter_hw_event;
#include <linux/types.h>
#include <linux/aio_abi.h>
@@ -625,7 +625,11 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
-asmlinkage int
-sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type,
- pid_t pid, int cpu, int masterfd);
+
+asmlinkage int sys_perf_counter_open(
+
+ struct perf_counter_hw_event *hw_event_uptr __user,
+ pid_t pid,
+ int cpu,
+ int group_fd);
#endif
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 2557c670a3bb..0d323ceda3a4 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -669,7 +669,7 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
struct perf_counter *counter = file->private_data;
- switch (counter->record_type) {
+ switch (counter->hw_event.record_type) {
case PERF_RECORD_SIMPLE:
return perf_read_hw(counter, buf, count);
@@ -707,7 +707,7 @@ static const struct file_operations perf_fops = {
* Allocate and initialize a counter structure
*/
static struct perf_counter *
-perf_counter_alloc(struct perf_counter_event *event, int cpu, u32 record_type)
+perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu)
{
struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL);
@@ -718,31 +718,37 @@ perf_counter_alloc(struct perf_counter_event *event, int cpu, u32 record_type)
INIT_LIST_HEAD(&counter->list);
init_waitqueue_head(&counter->waitq);
- counter->irqdata = &counter->data[0];
- counter->usrdata = &counter->data[1];
- counter->cpu = cpu;
- counter->record_type = record_type;
- counter->event = *event;
- counter->wakeup_pending = 0;
+ counter->irqdata = &counter->data[0];
+ counter->usrdata = &counter->data[1];
+ counter->cpu = cpu;
+ counter->hw_event = *hw_event;
+ counter->wakeup_pending = 0;
return counter;
}
/**
- * sys_perf_task_open - open a performance counter associate it to a task
- * @hw_event_type: event type for monitoring/sampling...
+ * sys_perf_task_open - open a performance counter, associate it to a task/cpu
+ *
+ * @hw_event_uptr: event type attributes for monitoring/sampling
* @pid: target pid
+ * @cpu: target cpu
+ * @group_fd: group leader counter fd
*/
-asmlinkage int
-sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type,
- pid_t pid, int cpu, int masterfd)
+asmlinkage int sys_perf_counter_open(
+
+ struct perf_counter_hw_event *hw_event_uptr __user,
+ pid_t pid,
+ int cpu,
+ int group_fd)
+
{
struct perf_counter_context *ctx;
- struct perf_counter_event event;
+ struct perf_counter_hw_event hw_event;
struct perf_counter *counter;
int ret;
- if (copy_from_user(&event, uevent, sizeof(event)) != 0)
+ if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
return -EFAULT;
ctx = find_get_context(pid, cpu);
@@ -750,7 +756,7 @@ sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type,
return PTR_ERR(ctx);
ret = -ENOMEM;
- counter = perf_counter_alloc(&event, cpu, record_type);
+ counter = perf_counter_alloc(&hw_event, cpu);
if (!counter)
goto err_put_context;
commit 43874d238d5f208854a73c3225ca2a22833eec8b
Author: Ingo Molnar <mingo@elte.hu>
Date: Tue Dec 9 12:23:59 2008 +0100
perfcounters: consolidate global-disable codepaths
Impact: cleanup
Simplify global disable handling.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 919ec46679b2..6a93d1f04d97 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -33,7 +33,6 @@ static u32 perf_counter_mask __read_mostly;
struct cpu_hw_counters {
struct perf_counter *counters[MAX_HW_COUNTERS];
unsigned long used[BITS_TO_LONGS(MAX_HW_COUNTERS)];
- int enable_all;
};
/*
@@ -115,24 +114,13 @@ int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type)
return 0;
}
-static void __hw_perf_enable_all(void)
-{
- wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
-}
-
void hw_perf_enable_all(void)
{
- struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-
- cpuc->enable_all = 1;
- __hw_perf_enable_all();
+ wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
}
void hw_perf_disable_all(void)
{
- struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-
- cpuc->enable_all = 0;
wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
}
@@ -385,8 +373,10 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown)
static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
{
int bit, cpu = smp_processor_id();
+ u64 ack, status, saved_global;
struct cpu_hw_counters *cpuc;
- u64 ack, status;
+
+ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global);
/* Disable counters globally */
wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
@@ -445,10 +435,9 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
goto again;
out:
/*
- * Do not reenable when global enable is off:
+ * Restore - do not reenable when global enable is off:
*/
- if (cpuc->enable_all)
- __hw_perf_enable_all();
+ wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, saved_global, 0);
}
void smp_perf_counter_interrupt(struct pt_regs *regs)
commit 1e12567678054bc1d4c944ecfad17624b3e49345
Author: Ingo Molnar <mingo@elte.hu>
Date: Tue Dec 9 12:18:18 2008 +0100
perfcounters, x86: clean up debug code
Impact: cleanup
Get rid of unused debug code.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 7d528ffc2d26..919ec46679b2 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -214,13 +214,11 @@ static void __hw_perf_save_counter(struct perf_counter *counter,
{
s64 raw = -1;
s64 delta;
- int err;
/*
* Get the raw hw counter value:
*/
- err = rdmsrl_safe(hwc->counter_base + idx, &raw);
- WARN_ON_ONCE(err);
+ rdmsrl(hwc->counter_base + idx, raw);
/*
* Rebase it to zero (it started counting at -irq_period),
@@ -252,20 +250,18 @@ static void __hw_perf_save_counter(struct perf_counter *counter,
void perf_counter_print_debug(void)
{
u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count;
- int cpu, err, idx;
+ int cpu, idx;
+
+ if (!nr_hw_counters)
+ return;
local_irq_disable();
cpu = smp_processor_id();
- err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_CTRL, &ctrl);
- WARN_ON_ONCE(err);
-
- err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_STATUS, &status);
- WARN_ON_ONCE(err);
-
- err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_OVF_CTRL, &overflow);
- WARN_ON_ONCE(err);
+ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+ rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+ rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
printk(KERN_INFO "\n");
printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl);
@@ -273,11 +269,8 @@ void perf_counter_print_debug(void)
printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow);
for (idx = 0; idx < nr_hw_counters; idx++) {
- err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl);
- WARN_ON_ONCE(err);
-
- err = rdmsrl_safe(MSR_ARCH_PERFMON_PERFCTR0 + idx, &pmc_count);
- WARN_ON_ONCE(err);
+ rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
+ rdmsrl(MSR_ARCH_PERFMON_PERFCTR0 + idx, pmc_count);
next_count = per_cpu(prev_next_count[idx], cpu);
@@ -310,13 +303,11 @@ void hw_perf_counter_read(struct perf_counter *counter)
unsigned long addr = hwc->counter_base + hwc->idx;
s64 offs, val = -1LL;
s32 val32;
- int err;
/* Careful: NMI might modify the counter offset */
do {
offs = hwc->prev_count;
- err = rdmsrl_safe(addr, &val);
- WARN_ON_ONCE(err);
+ rdmsrl(addr, val);
} while (offs != hwc->prev_count);
val32 = (s32) val;
@@ -346,10 +337,8 @@ static void perf_save_and_restart(struct perf_counter *counter)
struct hw_perf_counter *hwc = &counter->hw;
int idx = hwc->idx;
u64 pmc_ctrl;
- int err;
- err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl);
- WARN_ON_ONCE(err);
+ rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
__hw_perf_save_counter(counter, hwc, idx);
__hw_perf_counter_set_period(hwc, idx);
commit 7e2ae34749edf19e76e594b9c4b2cdde1066afc5
Author: Ingo Molnar <mingo@elte.hu>
Date: Tue Dec 9 11:40:46 2008 +0100
perfcounters, x86: simplify disable/enable of counters
Impact: fix spurious missed counter wakeups
In the case of NMI events, close a race window that can occur if an NMI
hits counter code that temporarily disables+enables a counter, and the NMI
leaks into the disabled section.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 615e953208ef..7d528ffc2d26 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -136,14 +136,25 @@ void hw_perf_disable_all(void)
wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
}
+static inline void
+__hw_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx)
+{
+ wrmsr(hwc->config_base + idx, hwc->config, 0);
+}
+
static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]);
-static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx)
+static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx)
{
per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count;
wrmsr(hwc->counter_base + idx, hwc->next_count, 0);
- wrmsr(hwc->config_base + idx, hwc->config, 0);
+}
+
+static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx)
+{
+ wrmsr(hwc->config_base + idx,
+ hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
}
void hw_perf_counter_enable(struct perf_counter *counter)
@@ -161,11 +172,11 @@ void hw_perf_counter_enable(struct perf_counter *counter)
perf_counters_lapic_init(hwc->nmi);
- wrmsr(hwc->config_base + idx,
- hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
+ __hw_perf_counter_disable(hwc, idx);
cpuc->counters[idx] = counter;
- counter->hw.config |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+
+ __hw_perf_counter_set_period(hwc, idx);
__hw_perf_counter_enable(hwc, idx);
}
@@ -286,8 +297,7 @@ void hw_perf_counter_disable(struct perf_counter *counter)
struct hw_perf_counter *hwc = &counter->hw;
unsigned int idx = hwc->idx;
- counter->hw.config &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
- wrmsr(hwc->config_base + idx, hwc->config, 0);
+ __hw_perf_counter_disable(hwc, idx);
clear_bit(idx, cpuc->used);
cpuc->counters[idx] = NULL;
@@ -328,18 +338,24 @@ static void perf_store_irq_data(struct perf_counter *counter, u64 data)
}
}
+/*
+ * NMI-safe enable method:
+ */
static void perf_save_and_restart(struct perf_counter *counter)
{
struct hw_perf_counter *hwc = &counter->hw;
int idx = hwc->idx;
+ u64 pmc_ctrl;
+ int err;
- wrmsr(hwc->config_base + idx,
- hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
+ err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl);
+ WARN_ON_ONCE(err);
- if (hwc->config & ARCH_PERFMON_EVENTSEL0_ENABLE) {
- __hw_perf_save_counter(counter, hwc, idx);
+ __hw_perf_save_counter(counter, hwc, idx);
+ __hw_perf_counter_set_period(hwc, idx);
+
+ if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE)
__hw_perf_counter_enable(hwc, idx);
- }
}
static void
commit 4c59e4676dc95f6f58a2cff5390b2699fa5b5549
Author: Ingo Molnar <mingo@elte.hu>
Date: Mon Dec 8 19:38:33 2008 +0100
perfcounters: select ANON_INODES
The perfcounters subsystem depends on CONFIG_ANON_INODES facilities,
so make sure it's selected.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/init/Kconfig b/init/Kconfig
index 78bede218f10..7d147a36e968 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -741,6 +741,7 @@ config PERF_COUNTERS
bool "Kernel Performance Counters"
depends on HAVE_PERF_COUNTERS
default y
+ select ANON_INODES
help
Enable kernel support for performance counter hardware.
commit 50dd94e017ec39f85c26b6c10ed9fb2d7a7d8042
Author: Ingo Molnar <mingo@elte.hu>
Date: Mon Dec 8 18:47:51 2008 +0100
sparseirq: fix typo in !CONFIG_IO_APIC case
Impact: build fix
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index b35e94c2d6df..25d527ca1362 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -193,12 +193,9 @@ extern void probe_nr_irqs_gsi(void);
#else /* !CONFIG_X86_IO_APIC */
#define io_apic_assign_pci_irqs 0
static const int timer_through_8259 = 0;
-static inline void ioapic_init_mappings(void) { }
+static inline void ioapic_init_mappings(void) { }
-static inline int probe_nr_irqs(void)
-{
- return NR_IRQS;
-}
+static inline void probe_nr_irqs_gsi(void) { }
#endif
#endif /* _ASM_X86_IO_APIC_H */
commit e726f5f91effd8944c76475a2688093a03ba0d10
Author: Ingo Molnar <mingo@elte.hu>
Date: Mon Dec 8 16:55:53 2008 +0100
tracing/function-graph-tracer: fix 'flags' variable mismatch
this warning:
kernel/trace/trace.c: In function ‘trace_vprintk’:
kernel/trace/trace.c:3626: warning: ‘flags’ may be used uninitialized in this function
shows some confusion about irq_flags / flags use here. We already have
irq_flags so remove the extra flags variable.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0b8659bd5ad2..8ebe0070c47a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3596,9 +3596,9 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
struct ring_buffer_event *event;
struct trace_array *tr = &global_trace;
struct trace_array_cpu *data;
- struct print_entry *entry;
- unsigned long flags, irq_flags;
int cpu, len = 0, size, pc;
+ struct print_entry *entry;
+ unsigned long irq_flags;
if (tracing_disabled || tracing_selftest_running)
return 0;
@@ -3623,7 +3623,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
if (!event)
goto out_unlock;
entry = ring_buffer_event_data(event);
- tracing_generic_entry_update(&entry->ent, flags, pc);
+ tracing_generic_entry_update(&entry->ent, irq_flags, pc);
entry->ent.type = TRACE_PRINT;
entry->ip = ip;
entry->depth = depth;
commit 087052b02f42b50316c6e4d7f2d8dfba3de6fc2e
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri Oct 17 16:09:57 2008 +0200
x86: fix default_spin_lock_flags() prototype
these warnings:
arch/x86/kernel/paravirt-spinlocks.c: In function ‘default_spin_lock_flags’:
arch/x86/kernel/paravirt-spinlocks.c:12: warning: passing argument 1 of ‘__raw_spin_lock’ from incompatible pointer type
arch/x86/kernel/paravirt-spinlocks.c: At top level:
arch/x86/kernel/paravirt-spinlocks.c:11: warning: ‘default_spin_lock_flags’ defined but not used
showed that the prototype of default_spin_lock_flags() was confused about
what type spinlocks have.
the proper type on UP is raw_spinlock_t.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 0e9f1982b1dd..95777b0faa73 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -7,7 +7,8 @@
#include <asm/paravirt.h>
-static void default_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags)
+static inline void
+default_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
{
__raw_spin_lock(lock);
}
commit 87b9cf4623ad4e5fc009e48c020593dffd5d3793
Author: Ingo Molnar <mingo@elte.hu>
Date: Mon Dec 8 14:20:16 2008 +0100
x86, perfcounters: read out MSR_CORE_PERF_GLOBAL_STATUS with counters disabled
Impact: make perfcounter NMI and IRQ sequence more robust
Make __smp_perf_counter_interrupt() a bit more conservative: first disable
all counters, then read out the status. Most invocations are because there
are real events, so there's no performance impact.
Code flow gets a bit simpler as well this way.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 82440cbed0e6..615e953208ef 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -383,18 +383,16 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
struct cpu_hw_counters *cpuc;
u64 ack, status;
- rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
- if (!status) {
- ack_APIC_irq();
- return;
- }
-
/* Disable counters globally */
wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
ack_APIC_irq();
cpuc = &per_cpu(cpu_hw_counters, cpu);
+ rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+ if (!status)
+ goto out;
+
again:
ack = status;
for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) {
@@ -440,7 +438,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
if (status)
goto again;
-
+out:
/*
* Do not reenable when global enable is off:
*/
commit 241771ef016b5c0c83cd7a4372a74321c973c1e6
Author: Ingo Molnar <mingo@elte.hu>
Date: Wed Dec 3 10:39:53 2008 +0100
performance counters: x86 support
Implement performance counters for x86 Intel CPUs.
It's simplified right now: the PERFMON CPU feature is assumed,
which is available in Core2 and later Intel CPUs.
The design is flexible to be extended to more CPU types as well.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d4d4cb7629ea..f2fdc1867241 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -643,6 +643,7 @@ config X86_UP_IOAPIC
config X86_LOCAL_APIC
def_bool y
depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
+ select HAVE_PERF_COUNTERS
config X86_IO_APIC
def_bool y
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 256b00b61892..3c14ed07dc4e 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -823,7 +823,8 @@ ia32_sys_call_table:
.quad compat_sys_signalfd4
.quad sys_eventfd2
.quad sys_epoll_create1
- .quad sys_dup3 /* 330 */
+ .quad sys_dup3 /* 330 */
.quad sys_pipe2
.quad sys_inotify_init1
+ .quad sys_perf_counter_open
ia32_syscall_end:
diff --git a/arch/x86/include/asm/hardirq_32.h b/arch/x86/include/asm/hardirq_32.h
index 5ca135e72f2b..b3e475dc9338 100644
--- a/arch/x86/include/asm/hardirq_32.h
+++ b/arch/x86/include/asm/hardirq_32.h
@@ -9,6 +9,7 @@ typedef struct {
unsigned long idle_timestamp;
unsigned int __nmi_count; /* arch dependent */
unsigned int apic_timer_irqs; /* arch dependent */
+ unsigned int apic_perf_irqs; /* arch dependent */
unsigned int irq0_irqs;
unsigned int irq_resched_count;
unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 8de644b6b959..aa93e53b85ee 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -30,6 +30,8 @@
/* Interrupt handlers registered during init_IRQ */
extern void apic_timer_interrupt(void);
extern void error_interrupt(void);
+extern void perf_counter_interrupt(void);
+
extern void spurious_interrupt(void);
extern void thermal_interrupt(void);
extern void reschedule_interrupt(void);
diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h
index fa0fd068bc2e..71598a9eab61 100644
--- a/arch/x86/include/asm/intel_arch_perfmon.h
+++ b/arch/x86/include/asm/intel_arch_perfmon.h
@@ -1,22 +1,24 @@
#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H
#define _ASM_X86_INTEL_ARCH_PERFMON_H
-#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
-#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
+#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
+#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
-#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
-#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
+#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
+#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
-#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
-#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
-#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
-#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
+#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
+#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
+#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
+#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0
#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
- (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
+ (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
+
+#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6
union cpuid10_eax {
struct {
@@ -28,4 +30,12 @@ union cpuid10_eax {
unsigned int full;
};
+#ifdef CONFIG_PERF_COUNTERS
+extern void init_hw_perf_counters(void);
+extern void perf_counters_lapic_init(int nmi);
+#else
+static inline void init_hw_perf_counters(void) { }
+static inline void perf_counters_lapic_init(int nmi) { }
+#endif
+
#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 0005adb0f941..b8d277f1252f 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -86,6 +86,11 @@
*/
#define LOCAL_TIMER_VECTOR 0xef
+/*
+ * Performance monitoring interrupt vector:
+ */
+#define LOCAL_PERF_VECTOR 0xee
+
/*
* First APIC vector available to drivers: (vectors 0x30-0xee) we
* start at 0x31(0x41) to spread out vectors evenly between priority
diff --git a/arch/x86/include/asm/mach-default/entry_arch.h b/arch/x86/include/asm/mach-default/entry_arch.h
index 6b1add8e31dd..ad31e5d90e90 100644
--- a/arch/x86/include/asm/mach-default/entry_arch.h
+++ b/arch/x86/include/asm/mach-default/entry_arch.h
@@ -25,10 +25,15 @@ BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
* a much simpler SMP time architecture:
*/
#ifdef CONFIG_X86_LOCAL_APIC
+
BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
+#ifdef CONFIG_PERF_COUNTERS
+BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR)
+#endif
+
#ifdef CONFIG_X86_MCE_P4THERMAL
BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
#endif
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index 2fbfff88df37..90a8d9d4206b 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -30,6 +30,7 @@ struct x8664_pda {
short isidle;
struct mm_struct *active_mm;
unsigned apic_timer_irqs;
+ unsigned apic_perf_irqs;
unsigned irq0_irqs;
unsigned irq_resched_count;
unsigned irq_call_count;
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e44d379faad2..810bf266d134 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -80,6 +80,7 @@ struct thread_info {
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SECCOMP 8 /* secure computing */
#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
+#define TIF_PERF_COUNTERS 11 /* notify perf counter work */
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
#define TIF_IA32 17 /* 32bit process */
#define TIF_FORK 18 /* ret_from_fork */
@@ -103,6 +104,7 @@ struct thread_info {
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
+#define _TIF_PERF_COUNTERS (1 << TIF_PERF_COUNTERS)
#define _TIF_NOTSC (1 << TIF_NOTSC)
#define _TIF_IA32 (1 << TIF_IA32)
#define _TIF_FORK (1 << TIF_FORK)
@@ -135,7 +137,7 @@ struct thread_info {
/* Only used for 64 bit */
#define _TIF_DO_NOTIFY_MASK \
- (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
+ (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERF_COUNTERS|_TIF_NOTIFY_RESUME)
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW \
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index f2bba78430a4..7e47658b0a6f 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -338,6 +338,7 @@
#define __NR_dup3 330
#define __NR_pipe2 331
#define __NR_inotify_init1 332
+#define __NR_perf_counter_open 333
#ifdef __KERNEL__
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index d2e415e6666f..53025feaf88d 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -653,7 +653,8 @@ __SYSCALL(__NR_dup3, sys_dup3)
__SYSCALL(__NR_pipe2, sys_pipe2)
#define __NR_inotify_init1 294
__SYSCALL(__NR_inotify_init1, sys_inotify_init1)
-
+#define __NR_perf_counter_open 295
+__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
#ifndef __NO_STUBS
#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 16f94879b525..8ab8c1858672 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -31,6 +31,7 @@
#include <linux/dmi.h>
#include <linux/dmar.h>
+#include <asm/intel_arch_perfmon.h>
#include <asm/atomic.h>
#include <asm/smp.h>
#include <asm/mtrr.h>
@@ -1147,6 +1148,7 @@ void __cpuinit setup_local_APIC(void)
apic_write(APIC_ESR, 0);
}
#endif
+ perf_counters_lapic_init(0);
preempt_disable();
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 82ec6075c057..89e53361fe24 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -1,5 +1,5 @@
#
-# Makefile for x86-compatible CPU details and quirks
+# Makefile for x86-compatible CPU details, features and quirks
#
obj-y := intel_cacheinfo.o addon_cpuid_features.o
@@ -16,11 +16,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o
obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
-obj-$(CONFIG_X86_MCE) += mcheck/
-obj-$(CONFIG_MTRR) += mtrr/
-obj-$(CONFIG_CPU_FREQ) += cpufreq/
+obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
-obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
+obj-$(CONFIG_X86_MCE) += mcheck/
+obj-$(CONFIG_MTRR) += mtrr/
+obj-$(CONFIG_CPU_FREQ) += cpufreq/
+
+obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
quiet_cmd_mkcapflags = MKCAP $@
cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b9c9ea0217a9..4461011db47c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -17,6 +17,7 @@
#include <asm/mmu_context.h>
#include <asm/mtrr.h>
#include <asm/mce.h>
+#include <asm/intel_arch_perfmon.h>
#include <asm/pat.h>
#include <asm/asm.h>
#include <asm/numa.h>
@@ -750,6 +751,7 @@ void __init identify_boot_cpu(void)
#else
vgetcpu_set_mode();
#endif
+ init_hw_perf_counters();
}
void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
new file mode 100644
index 000000000000..82440cbed0e6
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -0,0 +1,571 @@
+/*
+ * Performance counter x86 architecture code
+ *
+ * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_counter.h>
+#include <linux/capability.h>
+#include <linux/notifier.h>
+#include <linux/hardirq.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/sched.h>
+
+#include <asm/intel_arch_perfmon.h>
+#include <asm/apic.h>
+
+static bool perf_counters_initialized __read_mostly;
+
+/*
+ * Number of (generic) HW counters:
+ */
+static int nr_hw_counters __read_mostly;
+static u32 perf_counter_mask __read_mostly;
+
+/* No support for fixed function counters yet */
+
+#define MAX_HW_COUNTERS 8
+
+struct cpu_hw_counters {
+ struct perf_counter *counters[MAX_HW_COUNTERS];
+ unsigned long used[BITS_TO_LONGS(MAX_HW_COUNTERS)];
+ int enable_all;
+};
+
+/*
+ * Intel PerfMon v3. Used on Core2 and later.
+ */
+static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
+
+const int intel_perfmon_event_map[] =
+{
+ [PERF_COUNT_CYCLES] = 0x003c,
+ [PERF_COUNT_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_CACHE_REFERENCES] = 0x4f2e,
+ [PERF_COUNT_CACHE_MISSES] = 0x412e,
+ [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4,
+ [PERF_COUNT_BRANCH_MISSES] = 0x00c5,
+};
+
+const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
+
+/*
+ * Setup the hardware configuration for a given hw_event_type
+ */
+int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type)
+{
+ struct hw_perf_counter *hwc = &counter->hw;
+
+ if (unlikely(!perf_counters_initialized))
+ return -EINVAL;
+
+ /*
+ * Count user events, and generate PMC IRQs:
+ * (keep 'enabled' bit clear for now)
+ */
+ hwc->config = ARCH_PERFMON_EVENTSEL_USR | ARCH_PERFMON_EVENTSEL_INT;
+
+ /*
+ * If privileged enough, count OS events too, and allow
+ * NMI events as well:
+ */
+ hwc->nmi = 0;
+ if (capable(CAP_SYS_ADMIN)) {
+ hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
+ if (hw_event_type & PERF_COUNT_NMI)
+ hwc->nmi = 1;
+ }
+
+ hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0;
+ hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0;
+
+ hwc->irq_period = counter->__irq_period;
+ /*
+ * Intel PMCs cannot be accessed sanely above 32 bit width,
+ * so we install an artificial 1<<31 period regardless of
+ * the generic counter period:
+ */
+ if (!hwc->irq_period)
+ hwc->irq_period = 0x7FFFFFFF;
+
+ hwc->next_count = -((s32) hwc->irq_period);
+
+ /*
+ * Negative event types mean raw encoded event+umask values:
+ */
+ if (hw_event_type < 0) {
+ counter->hw_event_type = -hw_event_type;
+ counter->hw_event_type &= ~PERF_COUNT_NMI;
+ } else {
+ hw_event_type &= ~PERF_COUNT_NMI;
+ if (hw_event_type >= max_intel_perfmon_events)
+ return -EINVAL;
+ /*
+ * The generic map:
+ */
+ counter->hw_event_type = intel_perfmon_event_map[hw_event_type];
+ }
+ hwc->config |= counter->hw_event_type;
+ counter->wakeup_pending = 0;
+
+ return 0;
+}
+
+static void __hw_perf_enable_all(void)
+{
+ wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
+}
+
+void hw_perf_enable_all(void)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+ cpuc->enable_all = 1;
+ __hw_perf_enable_all();
+}
+
+void hw_perf_disable_all(void)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+ cpuc->enable_all = 0;
+ wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
+}
+
+static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]);
+
+static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx)
+{
+ per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count;
+
+ wrmsr(hwc->counter_base + idx, hwc->next_count, 0);
+ wrmsr(hwc->config_base + idx, hwc->config, 0);
+}
+
+void hw_perf_counter_enable(struct perf_counter *counter)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ struct hw_perf_counter *hwc = &counter->hw;
+ int idx = hwc->idx;
+
+ /* Try to get the previous counter again */
+ if (test_and_set_bit(idx, cpuc->used)) {
+ idx = find_first_zero_bit(cpuc->used, nr_hw_counters);
+ set_bit(idx, cpuc->used);
+ hwc->idx = idx;
+ }
+
+ perf_counters_lapic_init(hwc->nmi);
+
+ wrmsr(hwc->config_base + idx,
+ hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
+
+ cpuc->counters[idx] = counter;
+ counter->hw.config |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+ __hw_perf_counter_enable(hwc, idx);
+}
+
+#ifdef CONFIG_X86_64
+static inline void atomic64_counter_set(struct perf_counter *counter, u64 val)
+{
+ atomic64_set(&counter->count, val);
+}
+
+static inline u64 atomic64_counter_read(struct perf_counter *counter)
+{
+ return atomic64_read(&counter->count);
+}
+#else
+/*
+ * Todo: add proper atomic64_t support to 32-bit x86:
+ */
+static inline void atomic64_counter_set(struct perf_counter *counter, u64 val64)
+{
+ u32 *val32 = (void *)&val64;
+
+ atomic_set(counter->count32 + 0, *(val32 + 0));
+ atomic_set(counter->count32 + 1, *(val32 + 1));
+}
+
+static inline u64 atomic64_counter_read(struct perf_counter *counter)
+{
+ return atomic_read(counter->count32 + 0) |
+ (u64) atomic_read(counter->count32 + 1) << 32;
+}
+#endif
+
+static void __hw_perf_save_counter(struct perf_counter *counter,
+ struct hw_perf_counter *hwc, int idx)
+{
+ s64 raw = -1;
+ s64 delta;
+ int err;
+
+ /*
+ * Get the raw hw counter value:
+ */
+ err = rdmsrl_safe(hwc->counter_base + idx, &raw);
+ WARN_ON_ONCE(err);
+
+ /*
+ * Rebase it to zero (it started counting at -irq_period),
+ * to see the delta since ->prev_count:
+ */
+ delta = (s64)hwc->irq_period + (s64)(s32)raw;
+
+ atomic64_counter_set(counter, hwc->prev_count + delta);
+
+ /*
+ * Adjust the ->prev_count offset - if we went beyond
+ * irq_period of units, then we got an IRQ and the counter
+ * was set back to -irq_period:
+ */
+ while (delta >= (s64)hwc->irq_period) {
+ hwc->prev_count += hwc->irq_period;
+ delta -= (s64)hwc->irq_period;
+ }
+
+ /*
+ * Calculate the next raw counter value we'll write into
+ * the counter at the next sched-in time:
+ */
+ delta -= (s64)hwc->irq_period;
+
+ hwc->next_count = (s32)delta;
+}
+
+void perf_counter_print_debug(void)
+{
+ u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count;
+ int cpu, err, idx;
+
+ local_irq_disable();
+
+ cpu = smp_processor_id();
+
+ err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_CTRL, &ctrl);
+ WARN_ON_ONCE(err);
+
+ err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_STATUS, &status);
+ WARN_ON_ONCE(err);
+
+ err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_OVF_CTRL, &overflow);
+ WARN_ON_ONCE(err);
+
+ printk(KERN_INFO "\n");
+ printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl);
+ printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status);
+ printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow);
+
+ for (idx = 0; idx < nr_hw_counters; idx++) {
+ err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl);
+ WARN_ON_ONCE(err);
+
+ err = rdmsrl_safe(MSR_ARCH_PERFMON_PERFCTR0 + idx, &pmc_count);
+ WARN_ON_ONCE(err);
+
+ next_count = per_cpu(prev_next_count[idx], cpu);
+
+ printk(KERN_INFO "CPU#%d: PMC%d ctrl: %016llx\n",
+ cpu, idx, pmc_ctrl);
+ printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n",
+ cpu, idx, pmc_count);
+ printk(KERN_INFO "CPU#%d: PMC%d next: %016llx\n",
+ cpu, idx, next_count);
+ }
+ local_irq_enable();
+}
+
+void hw_perf_counter_disable(struct perf_counter *counter)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ struct hw_perf_counter *hwc = &counter->hw;
+ unsigned int idx = hwc->idx;
+
+ counter->hw.config &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+ wrmsr(hwc->config_base + idx, hwc->config, 0);
+
+ clear_bit(idx, cpuc->used);
+ cpuc->counters[idx] = NULL;
+ __hw_perf_save_counter(counter, hwc, idx);
+}
+
+void hw_perf_counter_read(struct perf_counter *counter)
+{
+ struct hw_perf_counter *hwc = &counter->hw;
+ unsigned long addr = hwc->counter_base + hwc->idx;
+ s64 offs, val = -1LL;
+ s32 val32;
+ int err;
+
+ /* Careful: NMI might modify the counter offset */
+ do {
+ offs = hwc->prev_count;
+ err = rdmsrl_safe(addr, &val);
+ WARN_ON_ONCE(err);
+ } while (offs != hwc->prev_count);
+
+ val32 = (s32) val;
+ val = (s64)hwc->irq_period + (s64)val32;
+ atomic64_counter_set(counter, hwc->prev_count + val);
+}
+
+static void perf_store_irq_data(struct perf_counter *counter, u64 data)
+{
+ struct perf_data *irqdata = counter->irqdata;
+
+ if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
+ irqdata->overrun++;
+ } else {
+ u64 *p = (u64 *) &irqdata->data[irqdata->len];
+
+ *p = data;
+ irqdata->len += sizeof(u64);
+ }
+}
+
+static void perf_save_and_restart(struct perf_counter *counter)
+{
+ struct hw_perf_counter *hwc = &counter->hw;
+ int idx = hwc->idx;
+
+ wrmsr(hwc->config_base + idx,
+ hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
+
+ if (hwc->config & ARCH_PERFMON_EVENTSEL0_ENABLE) {
+ __hw_perf_save_counter(counter, hwc, idx);
+ __hw_perf_counter_enable(hwc, idx);
+ }
+}
+
+static void
+perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown)
+{
+ struct perf_counter_context *ctx = leader->ctx;
+ struct perf_counter *counter;
+ int bit;
+
+ list_for_each_entry(counter, &ctx->counters, list) {
+ if (counter->record_type != PERF_RECORD_SIMPLE ||
+ counter == leader)
+ continue;
+
+ if (counter->active) {
+ /*
+ * When counter was not in the overflow mask, we have to
+ * read it from hardware. We read it as well, when it
+ * has not been read yet and clear the bit in the
+ * status mask.
+ */
+ bit = counter->hw.idx;
+ if (!test_bit(bit, (unsigned long *) overflown) ||
+ test_bit(bit, (unsigned long *) status)) {
+ clear_bit(bit, (unsigned long *) status);
+ perf_save_and_restart(counter);
+ }
+ }
+ perf_store_irq_data(leader, counter->hw_event_type);
+ perf_store_irq_data(leader, atomic64_counter_read(counter));
+ }
+}
+
+/*
+ * This handler is triggered by the local APIC, so the APIC IRQ handling
+ * rules apply:
+ */
+static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
+{
+ int bit, cpu = smp_processor_id();
+ struct cpu_hw_counters *cpuc;
+ u64 ack, status;
+
+ rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+ if (!status) {
+ ack_APIC_irq();
+ return;
+ }
+
+ /* Disable counters globally */
+ wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
+ ack_APIC_irq();
+
+ cpuc = &per_cpu(cpu_hw_counters, cpu);
+
+again:
+ ack = status;
+ for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) {
+ struct perf_counter *counter = cpuc->counters[bit];
+
+ clear_bit(bit, (unsigned long *) &status);
+ if (!counter)
+ continue;
+
+ perf_save_and_restart(counter);
+
+ switch (counter->record_type) {
+ case PERF_RECORD_SIMPLE:
+ continue;
+ case PERF_RECORD_IRQ:
+ perf_store_irq_data(counter, instruction_pointer(regs));
+ break;
+ case PERF_RECORD_GROUP:
+ perf_store_irq_data(counter, counter->hw_event_type);
+ perf_store_irq_data(counter,
+ atomic64_counter_read(counter));
+ perf_handle_group(counter, &status, &ack);
+ break;
+ }
+ /*
+ * From NMI context we cannot call into the scheduler to
+ * do a task wakeup - but we mark these counters as
+ * wakeup_pending and initate a wakeup callback:
+ */
+ if (nmi) {
+ counter->wakeup_pending = 1;
+ set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
+ } else {
+ wake_up(&counter->waitq);
+ }
+ }
+
+ wrmsr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack, 0);
+
+ /*
+ * Repeat if there is more work to be done:
+ */
+ rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+ if (status)
+ goto again;
+
+ /*
+ * Do not reenable when global enable is off:
+ */
+ if (cpuc->enable_all)
+ __hw_perf_enable_all();
+}
+
+void smp_perf_counter_interrupt(struct pt_regs *regs)
+{
+ irq_enter();
+#ifdef CONFIG_X86_64
+ add_pda(apic_perf_irqs, 1);
+#else
+ per_cpu(irq_stat, smp_processor_id()).apic_perf_irqs++;
+#endif
+ apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
+ __smp_perf_counter_interrupt(regs, 0);
+
+ irq_exit();
+}
+
+/*
+ * This handler is triggered by NMI contexts:
+ */
+void perf_counter_notify(struct pt_regs *regs)
+{
+ struct cpu_hw_counters *cpuc;
+ unsigned long flags;
+ int bit, cpu;
+
+ local_irq_save(flags);
+ cpu = smp_processor_id();
+ cpuc = &per_cpu(cpu_hw_counters, cpu);
+
+ for_each_bit(bit, cpuc->used, nr_hw_counters) {
+ struct perf_counter *counter = cpuc->counters[bit];
+
+ if (!counter)
+ continue;
+
+ if (counter->wakeup_pending) {
+ counter->wakeup_pending = 0;
+ wake_up(&counter->waitq);
+ }
+ }
+
+ local_irq_restore(flags);
+}
+
+void __cpuinit perf_counters_lapic_init(int nmi)
+{
+ u32 apic_val;
+
+ if (!perf_counters_initialized)
+ return;
+ /*
+ * Enable the performance counter vector in the APIC LVT:
+ */
+ apic_val = apic_read(APIC_LVTERR);
+
+ apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED);
+ if (nmi)
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ else
+ apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
+ apic_write(APIC_LVTERR, apic_val);
+}
+
+static int __kprobes
+perf_counter_nmi_handler(struct notifier_block *self,
+ unsigned long cmd, void *__args)
+{
+ struct die_args *args = __args;
+ struct pt_regs *regs;
+
+ if (likely(cmd != DIE_NMI_IPI))
+ return NOTIFY_DONE;
+
+ regs = args->regs;
+
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ __smp_perf_counter_interrupt(regs, 1);
+
+ return NOTIFY_STOP;
+}
+
+static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
+ .notifier_call = perf_counter_nmi_handler
+};
+
+void __init init_hw_perf_counters(void)
+{
+ union cpuid10_eax eax;
+ unsigned int unused;
+ unsigned int ebx;
+
+ if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+ return;
+
+ /*
+ * Check whether the Architectural PerfMon supports
+ * Branch Misses Retired Event or not.
+ */
+ cpuid(10, &(eax.full), &ebx, &unused, &unused);
+ if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
+ return;
+
+ printk(KERN_INFO "Intel Performance Monitoring support detected.\n");
+
+ printk(KERN_INFO "... version: %d\n", eax.split.version_id);
+ printk(KERN_INFO "... num_counters: %d\n", eax.split.num_counters);
+ nr_hw_counters = eax.split.num_counters;
+ if (nr_hw_counters > MAX_HW_COUNTERS) {
+ nr_hw_counters = MAX_HW_COUNTERS;
+ WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
+ nr_hw_counters, MAX_HW_COUNTERS);
+ }
+ perf_counter_mask = (1 << nr_hw_counters) - 1;
+ perf_max_counters = nr_hw_counters;
+
+ printk(KERN_INFO "... bit_width: %d\n", eax.split.bit_width);
+ printk(KERN_INFO "... mask_length: %d\n", eax.split.mask_length);
+
+ perf_counters_lapic_init(0);
+ register_die_notifier(&perf_counter_nmi_notifier);
+
+ perf_counters_initialized = true;
+}
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 3194636a4293..fc013cfde307 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -984,6 +984,11 @@ apicinterrupt ERROR_APIC_VECTOR \
apicinterrupt SPURIOUS_APIC_VECTOR \
spurious_interrupt smp_spurious_interrupt
+#ifdef CONFIG_PERF_COUNTERS
+apicinterrupt LOCAL_PERF_VECTOR \
+ perf_counter_interrupt smp_perf_counter_interrupt
+#endif
+
/*
* Exception entry points.
*/
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index d1d4dc52f649..d92bc71e41a7 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -56,6 +56,10 @@ static int show_other_interrupts(struct seq_file *p)
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
seq_printf(p, " Local timer interrupts\n");
+ seq_printf(p, "CNT: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
+ seq_printf(p, " Performance counter interrupts\n");
#endif
#ifdef CONFIG_SMP
seq_printf(p, "RES: ");
@@ -160,6 +164,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
#ifdef CONFIG_X86_LOCAL_APIC
sum += irq_stats(cpu)->apic_timer_irqs;
+ sum += irq_stats(cpu)->apic_perf_irqs;
#endif
#ifdef CONFIG_SMP
sum += irq_stats(cpu)->irq_resched_count;
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 607db63044a5..6a33b5e30161 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -160,6 +160,9 @@ void __init native_init_IRQ(void)
/* IPI vectors for APIC spurious and error interrupts */
alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+# ifdef CONFIG_PERF_COUNTERS
+ alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
+# endif
#endif
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 8670b3ce626e..91d785c25ad9 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -138,6 +138,11 @@ static void __init apic_intr_init(void)
/* IPI vectors for APIC spurious and error interrupts */
alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+
+ /* Performance monitoring interrupt: */
+#ifdef CONFIG_PERF_COUNTERS
+ alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
+#endif
}
void __init native_init_IRQ(void)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index b1cc6da64208..dee553c503d3 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,7 +6,7 @@
* 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
* 2000-2002 x86-64 support by Andi Kleen
*/
-
+#include <linux/perf_counter.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/smp.h>
@@ -891,6 +891,11 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
tracehook_notify_resume(regs);
}
+ if (thread_info_flags & _TIF_PERF_COUNTERS) {
+ clear_thread_flag(TIF_PERF_COUNTERS);
+ perf_counter_notify(regs);
+ }
+
#ifdef CONFIG_X86_32
clear_thread_flag(TIF_IRET);
#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index d44395ff34c3..496726ddcea1 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -332,3 +332,4 @@ ENTRY(sys_call_table)
.long sys_dup3 /* 330 */
.long sys_pipe2
.long sys_inotify_init1
+ .long sys_perf_counter_open