Patches contributed by Eötvös Loránd University
commit 6c594c21fcb02c662f11c97be4d7d2b73060a205
Author: Ingo Molnar <mingo@elte.hu>
Date: Sun Dec 14 12:34:15 2008 +0100
perfcounters: add task migrations counter
Impact: add new feature, new sw counter
Add a counter that counts the number of cross-CPU migrations a
task is suffering.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
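For illustration (not part of the patch): once this lands, the new event can be
requested from user space like any other counter. The sketch below assumes a thin
wrapper named sys_perf_counter_open() taking the (hw_event, pid, cpu, group_fd,
flags) arguments of the kernel entry point, and assumes <linux/perf_counter.h>
is usable from user space; both are assumptions, and the timec.c tool referenced
in the counter-inheritance commit below carries its own copies of these.

    /* usage sketch only; see the assumptions stated above */
    #include <stdio.h>
    #include <string.h>
    #include <sys/types.h>
    #include <unistd.h>
    #include <linux/perf_counter.h>   /* assumed visible to user space */

    /* hypothetical thin wrapper around the new system call: */
    extern int sys_perf_counter_open(struct perf_counter_hw_event *hw_event,
                                     pid_t pid, int cpu, int group_fd,
                                     unsigned long flags);

    int main(void)
    {
            struct perf_counter_hw_event hw_event;
            unsigned long long count;
            int fd;

            memset(&hw_event, 0, sizeof(hw_event));
            hw_event.type = PERF_COUNT_CPU_MIGRATIONS;  /* -5, the new sw event */

            /* pid 0 = calling task, cpu -1 = any CPU, no group leader: */
            fd = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
            if (fd < 0)
                    return 1;

            sleep(1);                                   /* stand-in workload */

            read(fd, &count, sizeof(count));            /* 64-bit event count */
            printf("cpu-migrations: %llu\n", count);
            return 0;
    }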
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index d2a16563415f..f30486fc55d7 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -42,6 +42,8 @@ enum hw_event_types {
PERF_COUNT_BRANCH_INSTRUCTIONS = 4,
PERF_COUNT_BRANCH_MISSES = 5,
+ PERF_HW_EVENTS_MAX = 6,
+
/*
* Special "software" counters provided by the kernel, even if
* the hardware does not support performance counters. These
@@ -50,11 +52,11 @@ enum hw_event_types {
*/
PERF_COUNT_CPU_CLOCK = -1,
PERF_COUNT_TASK_CLOCK = -2,
- /*
- * Future software events:
- */
PERF_COUNT_PAGE_FAULTS = -3,
PERF_COUNT_CONTEXT_SWITCHES = -4,
+ PERF_COUNT_CPU_MIGRATIONS = -5,
+
+ PERF_SW_EVENTS_MIN = -6,
};
/*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4c530278391b..2e15be8fc792 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1014,6 +1014,8 @@ struct sched_entity {
u64 last_wakeup;
u64 avg_overlap;
+ u64 nr_migrations;
+
#ifdef CONFIG_SCHEDSTATS
u64 wait_start;
u64 wait_max;
@@ -1029,7 +1031,6 @@ struct sched_entity {
u64 exec_max;
u64 slice_max;
- u64 nr_migrations;
u64 nr_migrations_cold;
u64 nr_failed_migrations_affine;
u64 nr_failed_migrations_running;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 09287091c526..fb11e351e44e 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -936,6 +936,52 @@ static const struct hw_perf_counter_ops perf_ops_context_switches = {
.hw_perf_counter_read = context_switches_perf_counter_read,
};
+static inline u64 get_cpu_migrations(void)
+{
+ return current->se.nr_migrations;
+}
+
+static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
+{
+ u64 prev, now;
+ s64 delta;
+
+ prev = atomic64_read(&counter->hw.prev_count);
+ now = get_cpu_migrations();
+
+ atomic64_set(&counter->hw.prev_count, now);
+
+ delta = now - prev;
+ if (WARN_ON_ONCE(delta < 0))
+ delta = 0;
+
+ atomic64_add(delta, &counter->count);
+}
+
+static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
+{
+ cpu_migrations_perf_counter_update(counter);
+}
+
+static void cpu_migrations_perf_counter_enable(struct perf_counter *counter)
+{
+ /*
+ * se.nr_migrations is a per-task value already,
+ * so we dont have to clear it on switch-in.
+ */
+}
+
+static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
+{
+ cpu_migrations_perf_counter_update(counter);
+}
+
+static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
+ .hw_perf_counter_enable = cpu_migrations_perf_counter_enable,
+ .hw_perf_counter_disable = cpu_migrations_perf_counter_disable,
+ .hw_perf_counter_read = cpu_migrations_perf_counter_read,
+};
+
static const struct hw_perf_counter_ops *
sw_perf_counter_init(struct perf_counter *counter)
{
@@ -951,6 +997,9 @@ sw_perf_counter_init(struct perf_counter *counter)
case PERF_COUNT_CONTEXT_SWITCHES:
hw_ops = &perf_ops_context_switches;
break;
+ case PERF_COUNT_CPU_MIGRATIONS:
+ hw_ops = &perf_ops_cpu_migrations;
+ break;
default:
break;
}
diff --git a/kernel/sched.c b/kernel/sched.c
index 5c3f4106314e..382cfdb5e38d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1852,12 +1852,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
p->se.sleep_start -= clock_offset;
if (p->se.block_start)
p->se.block_start -= clock_offset;
+#endif
if (old_cpu != new_cpu) {
- schedstat_inc(p, se.nr_migrations);
+ p->se.nr_migrations++;
+#ifdef CONFIG_SCHEDSTATS
if (task_hot(p, old_rq->clock, NULL))
schedstat_inc(p, se.nr_forced2_migrations);
- }
#endif
+ }
p->se.vruntime -= old_cfsrq->min_vruntime -
new_cfsrq->min_vruntime;
@@ -2375,6 +2377,7 @@ static void __sched_fork(struct task_struct *p)
p->se.exec_start = 0;
p->se.sum_exec_runtime = 0;
p->se.prev_sum_exec_runtime = 0;
+ p->se.nr_migrations = 0;
p->se.last_wakeup = 0;
p->se.avg_overlap = 0;
commit 5d6a27d8a096868ae313f71f563b06074a7e34fe
Author: Ingo Molnar <mingo@elte.hu>
Date: Sun Dec 14 12:28:33 2008 +0100
perfcounters: add context switch counter
Impact: add new feature, new sw counter
Add a counter that counts the number of context-switches a task
is doing.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
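As an aside (not part of the patch): the quantity this counter accumulates,
nvcsw + nivcsw, is the same pair of per-task numbers that /proc/<pid>/status
exposes as voluntary_ctxt_switches and nonvoluntary_ctxt_switches. A small
self-contained check for the current process:

    /* prints the total number of context switches the current task has done */
    #include <stdio.h>

    int main(void)
    {
            char line[256];
            unsigned long long vol = 0, invol = 0;
            FILE *f = fopen("/proc/self/status", "r");

            if (!f)
                    return 1;
            while (fgets(line, sizeof(line), f)) {
                    sscanf(line, "voluntary_ctxt_switches: %llu", &vol);
                    sscanf(line, "nonvoluntary_ctxt_switches: %llu", &invol);
            }
            fclose(f);
            printf("context switches so far: %llu\n", vol + invol);
            return 0;
    }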
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index e5d25bf8f74e..d2a16563415f 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -53,8 +53,8 @@ enum hw_event_types {
/*
* Future software events:
*/
- /* PERF_COUNT_PAGE_FAULTS = -3,
- PERF_COUNT_CONTEXT_SWITCHES = -4, */
+ PERF_COUNT_PAGE_FAULTS = -3,
+ PERF_COUNT_CONTEXT_SWITCHES = -4,
};
/*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 1f81cde0dc43..09287091c526 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -888,6 +888,54 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = {
.hw_perf_counter_read = task_clock_perf_counter_read,
};
+static u64 get_context_switches(void)
+{
+ struct task_struct *curr = current;
+
+ return curr->nvcsw + curr->nivcsw;
+}
+
+static void context_switches_perf_counter_update(struct perf_counter *counter)
+{
+ u64 prev, now;
+ s64 delta;
+
+ prev = atomic64_read(&counter->hw.prev_count);
+ now = get_context_switches();
+
+ atomic64_set(&counter->hw.prev_count, now);
+
+ delta = now - prev;
+ if (WARN_ON_ONCE(delta < 0))
+ delta = 0;
+
+ atomic64_add(delta, &counter->count);
+}
+
+static void context_switches_perf_counter_read(struct perf_counter *counter)
+{
+ context_switches_perf_counter_update(counter);
+}
+
+static void context_switches_perf_counter_enable(struct perf_counter *counter)
+{
+ /*
+ * ->nvcsw + curr->nivcsw is a per-task value already,
+ * so we dont have to clear it on switch-in.
+ */
+}
+
+static void context_switches_perf_counter_disable(struct perf_counter *counter)
+{
+ context_switches_perf_counter_update(counter);
+}
+
+static const struct hw_perf_counter_ops perf_ops_context_switches = {
+ .hw_perf_counter_enable = context_switches_perf_counter_enable,
+ .hw_perf_counter_disable = context_switches_perf_counter_disable,
+ .hw_perf_counter_read = context_switches_perf_counter_read,
+};
+
static const struct hw_perf_counter_ops *
sw_perf_counter_init(struct perf_counter *counter)
{
@@ -900,6 +948,9 @@ sw_perf_counter_init(struct perf_counter *counter)
case PERF_COUNT_TASK_CLOCK:
hw_ops = &perf_ops_task_clock;
break;
+ case PERF_COUNT_CONTEXT_SWITCHES:
+ hw_ops = &perf_ops_context_switches;
+ break;
default:
break;
}
commit 8cb391e8786c8072367f0aeb90551903fef074ba
Author: Ingo Molnar <mingo@elte.hu>
Date: Sun Dec 14 12:22:31 2008 +0100
perfcounters: fix task clock counter
Impact: bugfix
Update the task clock counter to the new math.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
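To make the "new math" concrete (illustration, not part of the patch): the
counter now keeps a snapshot of the task's sum_exec_runtime in hw.prev_count
and accumulates deltas into ->count, instead of overwriting ->count with the
absolute runtime on every read. A minimal user-space model of the
enable/read/disable flow, with made-up numbers:

    /* model of the delta logic; names and values are illustrative only */
    #include <stdio.h>
    #include <stdint.h>

    static uint64_t prev_count, count;

    static void clock_update(uint64_t now)
    {
            int64_t delta = now - prev_count;

            prev_count = now;
            if (delta < 0)          /* cannot happen for a monotonic runtime */
                    delta = 0;
            count += delta;
    }

    int main(void)
    {
            prev_count = 1000;      /* enable:  snapshot sum_exec_runtime  */
            clock_update(1600);     /* read:    accumulates 600 ns         */
            clock_update(2100);     /* disable: accumulates another 500 ns */
            printf("%llu\n", (unsigned long long)count);    /* 1100 */
            return 0;
    }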
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f5e81dd193d1..1f81cde0dc43 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -850,17 +850,36 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
.hw_perf_counter_read = cpu_clock_perf_counter_read,
};
-static void task_clock_perf_counter_enable(struct perf_counter *counter)
+static void task_clock_perf_counter_update(struct perf_counter *counter)
{
+ u64 prev, now;
+ s64 delta;
+
+ prev = atomic64_read(&counter->hw.prev_count);
+ now = current->se.sum_exec_runtime;
+
+ atomic64_set(&counter->hw.prev_count, now);
+
+ delta = now - prev;
+ if (WARN_ON_ONCE(delta < 0))
+ delta = 0;
+
+ atomic64_add(delta, &counter->count);
}
-static void task_clock_perf_counter_disable(struct perf_counter *counter)
+static void task_clock_perf_counter_read(struct perf_counter *counter)
{
+ task_clock_perf_counter_update(counter);
}
-static void task_clock_perf_counter_read(struct perf_counter *counter)
+static void task_clock_perf_counter_enable(struct perf_counter *counter)
+{
+ atomic64_set(&counter->hw.prev_count, current->se.sum_exec_runtime);
+}
+
+static void task_clock_perf_counter_disable(struct perf_counter *counter)
{
- atomic64_set(&counter->count, current->se.sum_exec_runtime);
+ task_clock_perf_counter_update(counter);
}
static const struct hw_perf_counter_ops perf_ops_task_clock = {
commit 9b51f66dcb09ac5eb6bc68fc111d5c7a1e0131d6
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri Dec 12 13:49:45 2008 +0100
perfcounters: implement "counter inheritance"
Impact: implement new performance feature
Counter inheritance can be used to run performance counters in a workload,
transparently - and pipe back the counter results to the parent counter.
Inheritance for performance counters works the following way: when creating
a counter it can be marked with the .inherit=1 flag. Such counters are then
'inherited' by all child tasks (be they fork()-ed or clone()-ed). These
counters get inherited through exec() boundaries as well (except through
setuid boundaries).
The counter values get added back to the parent counter(s) when the child
task(s) exit - much like stime/utime statistics are gathered. So inherited
counters are ideal to gather summary statistics about an application's
behavior via shell commands, without having to modify that application.
The timec.c command utilizes counter inheritance:
http://redhat.com/~mingo/perfcounters/timec.c
Sample output:
$ ./timec -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
Performance counter stats for 'ls':
163516953 instructions
2295 cache-misses
2855182 branch-misses
Signed-off-by: Ingo Molnar <mingo@elte.hu>
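For illustration (not part of the patch): the only user-visible knob added here
is the .inherit bit. Relative to the usage sketch after the migrations-counter
commit above, the event setup would change roughly as below; PERF_COUNT_INSTRUCTIONS
is an assumed name, inferred from the hardware-event numbering (timec's -e 1 is
reported as instructions).

    struct perf_counter_hw_event hw_event;

    memset(&hw_event, 0, sizeof(hw_event));
    hw_event.type    = PERF_COUNT_INSTRUCTIONS;   /* assumed name for event 1 */
    hw_event.inherit = 1;                         /* fork()/clone()/exec() children
                                                     report their counts back */

The parent opens such a counter before forking or exec()-ing the workload and
reads it after waitpid(); by then the children's counts have been folded back
into the parent counter, which is what the timec output above relies on.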
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 72460289c654..e5d25bf8f74e 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -75,10 +75,11 @@ struct perf_counter_hw_event {
u64 irq_period;
u32 record_type;
- u32 disabled : 1, /* off by default */
- nmi : 1, /* NMI sampling */
- raw : 1, /* raw event type */
- __reserved_1 : 29;
+ u32 disabled : 1, /* off by default */
+ nmi : 1, /* NMI sampling */
+ raw : 1, /* raw event type */
+ inherit : 1, /* children inherit it */
+ __reserved_1 : 28;
u64 __reserved_2;
};
@@ -138,6 +139,8 @@ enum perf_counter_active_state {
PERF_COUNTER_STATE_ACTIVE = 1,
};
+struct file;
+
/**
* struct perf_counter - performance counter kernel representation:
*/
@@ -156,7 +159,10 @@ struct perf_counter {
struct perf_counter_context *ctx;
struct task_struct *task;
+ struct file *filp;
+ unsigned int nr_inherited;
+ struct perf_counter *parent;
/*
* Protect attach/detach:
*/
@@ -210,13 +216,16 @@ struct perf_cpu_context {
extern int perf_max_counters;
#ifdef CONFIG_PERF_COUNTERS
+extern void
+perf_counter_show(struct perf_counter *counter, char *str, int trace);
extern const struct hw_perf_counter_ops *
hw_perf_counter_init(struct perf_counter *counter);
extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
extern void perf_counter_task_tick(struct task_struct *task, int cpu);
-extern void perf_counter_init_task(struct task_struct *task);
+extern void perf_counter_init_task(struct task_struct *child);
+extern void perf_counter_exit_task(struct task_struct *child);
extern void perf_counter_notify(struct pt_regs *regs);
extern void perf_counter_print_debug(void);
extern u64 hw_perf_save_disable(void);
@@ -226,12 +235,15 @@ extern int perf_counter_task_enable(void);
#else
static inline void
+perf_counter_show(struct perf_counter *counter, char *str, int trace) { }
+static inline void
perf_counter_task_sched_in(struct task_struct *task, int cpu) { }
static inline void
perf_counter_task_sched_out(struct task_struct *task, int cpu) { }
static inline void
perf_counter_task_tick(struct task_struct *task, int cpu) { }
-static inline void perf_counter_init_task(struct task_struct *task) { }
+static inline void perf_counter_init_task(struct task_struct *child) { }
+static inline void perf_counter_exit_task(struct task_struct *child) { }
static inline void perf_counter_notify(struct pt_regs *regs) { }
static inline void perf_counter_print_debug(void) { }
static inline void hw_perf_restore(u64 ctrl) { }
diff --git a/kernel/exit.c b/kernel/exit.c
index 2d8be7ebb0f7..d336c90a5f13 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1093,11 +1093,12 @@ NORET_TYPE void do_exit(long code)
mpol_put(tsk->mempolicy);
tsk->mempolicy = NULL;
#endif
-#ifdef CONFIG_FUTEX
/*
- * This must happen late, after the PID is not
- * hashed anymore:
+ * These must happen late, after the PID is not
+ * hashed anymore, but still at a point that may sleep:
*/
+ perf_counter_exit_task(tsk);
+#ifdef CONFIG_FUTEX
if (unlikely(!list_empty(&tsk->pi_state_list)))
exit_pi_state_list(tsk);
if (unlikely(current->pi_state_cache))
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 416861ce8b27..f5e81dd193d1 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -80,8 +80,6 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
list_del_init(&sibling->list_entry);
list_add_tail(&sibling->list_entry, &ctx->counter_list);
- WARN_ON_ONCE(!sibling->group_leader);
- WARN_ON_ONCE(sibling->group_leader == sibling);
sibling->group_leader = sibling;
}
}
@@ -97,6 +95,7 @@ static void __perf_counter_remove_from_context(void *info)
struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
struct perf_counter *counter = info;
struct perf_counter_context *ctx = counter->ctx;
+ unsigned long flags;
u64 perf_flags;
/*
@@ -107,7 +106,7 @@ static void __perf_counter_remove_from_context(void *info)
if (ctx->task && cpuctx->task_ctx != ctx)
return;
- spin_lock(&ctx->lock);
+ spin_lock_irqsave(&ctx->lock, flags);
if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
counter->hw_ops->hw_perf_counter_disable(counter);
@@ -136,7 +135,7 @@ static void __perf_counter_remove_from_context(void *info)
perf_max_counters - perf_reserved_percpu);
}
- spin_unlock(&ctx->lock);
+ spin_unlock_irqrestore(&ctx->lock, flags);
}
@@ -199,6 +198,7 @@ static void __perf_install_in_context(void *info)
struct perf_counter *counter = info;
struct perf_counter_context *ctx = counter->ctx;
int cpu = smp_processor_id();
+ unsigned long flags;
u64 perf_flags;
/*
@@ -209,7 +209,7 @@ static void __perf_install_in_context(void *info)
if (ctx->task && cpuctx->task_ctx != ctx)
return;
- spin_lock(&ctx->lock);
+ spin_lock_irqsave(&ctx->lock, flags);
/*
* Protect the list operation against NMI by disabling the
@@ -232,7 +232,7 @@ static void __perf_install_in_context(void *info)
if (!ctx->task && cpuctx->max_pertask)
cpuctx->max_pertask--;
- spin_unlock(&ctx->lock);
+ spin_unlock_irqrestore(&ctx->lock, flags);
}
/*
@@ -446,10 +446,9 @@ int perf_counter_task_disable(void)
*/
perf_flags = hw_perf_save_disable();
- list_for_each_entry(counter, &ctx->counter_list, list_entry) {
- WARN_ON_ONCE(counter->state == PERF_COUNTER_STATE_ACTIVE);
+ list_for_each_entry(counter, &ctx->counter_list, list_entry)
counter->state = PERF_COUNTER_STATE_OFF;
- }
+
hw_perf_restore(perf_flags);
spin_unlock(&ctx->lock);
@@ -525,26 +524,6 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
perf_counter_task_sched_in(curr, cpu);
}
-/*
- * Initialize the perf_counter context in a task_struct:
- */
-static void
-__perf_counter_init_context(struct perf_counter_context *ctx,
- struct task_struct *task)
-{
- spin_lock_init(&ctx->lock);
- INIT_LIST_HEAD(&ctx->counter_list);
- ctx->nr_counters = 0;
- ctx->task = task;
-}
-/*
- * Initialize the perf_counter context in task_struct
- */
-void perf_counter_init_task(struct task_struct *task)
-{
- __perf_counter_init_context(&task->perf_counter_ctx, task);
-}
-
/*
* Cross CPU call to read the hardware counter
*/
@@ -663,7 +642,6 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
cpuctx = &per_cpu(perf_cpu_context, cpu);
ctx = &cpuctx->ctx;
- WARN_ON_ONCE(ctx->task);
return ctx;
}
@@ -915,12 +893,13 @@ sw_perf_counter_init(struct perf_counter *counter)
static struct perf_counter *
perf_counter_alloc(struct perf_counter_hw_event *hw_event,
int cpu,
- struct perf_counter *group_leader)
+ struct perf_counter *group_leader,
+ gfp_t gfpflags)
{
const struct hw_perf_counter_ops *hw_ops;
struct perf_counter *counter;
- counter = kzalloc(sizeof(*counter), GFP_KERNEL);
+ counter = kzalloc(sizeof(*counter), gfpflags);
if (!counter)
return NULL;
@@ -947,9 +926,8 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
hw_ops = NULL;
if (!hw_event->raw && hw_event->type < 0)
hw_ops = sw_perf_counter_init(counter);
- if (!hw_ops) {
+ if (!hw_ops)
hw_ops = hw_perf_counter_init(counter);
- }
if (!hw_ops) {
kfree(counter);
@@ -975,8 +953,10 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
struct perf_counter *counter, *group_leader;
struct perf_counter_hw_event hw_event;
struct perf_counter_context *ctx;
+ struct file *counter_file = NULL;
struct file *group_file = NULL;
int fput_needed = 0;
+ int fput_needed2 = 0;
int ret;
if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
@@ -1017,25 +997,29 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
}
ret = -EINVAL;
- counter = perf_counter_alloc(&hw_event, cpu, group_leader);
+ counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL);
if (!counter)
goto err_put_context;
- perf_install_in_context(ctx, counter, cpu);
-
ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
if (ret < 0)
- goto err_remove_free_put_context;
+ goto err_free_put_context;
+
+ counter_file = fget_light(ret, &fput_needed2);
+ if (!counter_file)
+ goto err_free_put_context;
+
+ counter->filp = counter_file;
+ perf_install_in_context(ctx, counter, cpu);
+
+ fput_light(counter_file, fput_needed2);
out_fput:
fput_light(group_file, fput_needed);
return ret;
-err_remove_free_put_context:
- mutex_lock(&counter->mutex);
- perf_counter_remove_from_context(counter);
- mutex_unlock(&counter->mutex);
+err_free_put_context:
kfree(counter);
err_put_context:
@@ -1044,6 +1028,186 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
goto out_fput;
}
+/*
+ * Initialize the perf_counter context in a task_struct:
+ */
+static void
+__perf_counter_init_context(struct perf_counter_context *ctx,
+ struct task_struct *task)
+{
+ memset(ctx, 0, sizeof(*ctx));
+ spin_lock_init(&ctx->lock);
+ INIT_LIST_HEAD(&ctx->counter_list);
+ ctx->task = task;
+}
+
+/*
+ * inherit a counter from parent task to child task:
+ */
+static int
+inherit_counter(struct perf_counter *parent_counter,
+ struct task_struct *parent,
+ struct perf_counter_context *parent_ctx,
+ struct task_struct *child,
+ struct perf_counter_context *child_ctx)
+{
+ struct perf_counter *child_counter;
+
+ child_counter = perf_counter_alloc(&parent_counter->hw_event,
+ parent_counter->cpu, NULL,
+ GFP_ATOMIC);
+ if (!child_counter)
+ return -ENOMEM;
+
+ /*
+ * Link it up in the child's context:
+ */
+ child_counter->ctx = child_ctx;
+ child_counter->task = child;
+ list_add_counter(child_counter, child_ctx);
+ child_ctx->nr_counters++;
+
+ child_counter->parent = parent_counter;
+ parent_counter->nr_inherited++;
+ /*
+ * inherit into child's child as well:
+ */
+ child_counter->hw_event.inherit = 1;
+
+ /*
+ * Get a reference to the parent filp - we will fput it
+ * when the child counter exits. This is safe to do because
+ * we are in the parent and we know that the filp still
+ * exists and has a nonzero count:
+ */
+ atomic_long_inc(&parent_counter->filp->f_count);
+
+ return 0;
+}
+
+static void
+__perf_counter_exit_task(struct task_struct *child,
+ struct perf_counter *child_counter,
+ struct perf_counter_context *child_ctx)
+{
+ struct perf_counter *parent_counter;
+ u64 parent_val, child_val;
+ u64 perf_flags;
+
+ /*
+ * Disable and unlink this counter.
+ *
+ * Be careful about zapping the list - IRQ/NMI context
+ * could still be processing it:
+ */
+ local_irq_disable();
+ perf_flags = hw_perf_save_disable();
+
+ if (child_counter->state == PERF_COUNTER_STATE_ACTIVE)
+ child_counter->hw_ops->hw_perf_counter_disable(child_counter);
+ list_del_init(&child_counter->list_entry);
+
+ hw_perf_restore(perf_flags);
+ local_irq_enable();
+
+ parent_counter = child_counter->parent;
+ /*
+ * It can happen that parent exits first, and has counters
+ * that are still around due to the child reference. These
+ * counters need to be zapped - but otherwise linger.
+ */
+ if (!parent_counter)
+ return;
+
+ parent_val = atomic64_read(&parent_counter->count);
+ child_val = atomic64_read(&child_counter->count);
+
+ /*
+ * Add back the child's count to the parent's count:
+ */
+ atomic64_add(child_val, &parent_counter->count);
+
+ fput(parent_counter->filp);
+
+ kfree(child_counter);
+}
+
+/*
+ * When a child task exist, feed back counter values to parent counters.
+ *
+ * Note: we are running in child context, but the PID is not hashed
+ * anymore so new counters will not be added.
+ */
+void perf_counter_exit_task(struct task_struct *child)
+{
+ struct perf_counter *child_counter, *tmp;
+ struct perf_counter_context *child_ctx;
+
+ child_ctx = &child->perf_counter_ctx;
+
+ if (likely(!child_ctx->nr_counters))
+ return;
+
+ list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
+ list_entry)
+ __perf_counter_exit_task(child, child_counter, child_ctx);
+}
+
+/*
+ * Initialize the perf_counter context in task_struct
+ */
+void perf_counter_init_task(struct task_struct *child)
+{
+ struct perf_counter_context *child_ctx, *parent_ctx;
+ struct perf_counter *counter, *parent_counter;
+ struct task_struct *parent = current;
+ unsigned long flags;
+
+ child_ctx = &child->perf_counter_ctx;
+ parent_ctx = &parent->perf_counter_ctx;
+
+ __perf_counter_init_context(child_ctx, child);
+
+ /*
+ * This is executed from the parent task context, so inherit
+ * counters that have been marked for cloning:
+ */
+
+ if (likely(!parent_ctx->nr_counters))
+ return;
+
+ /*
+ * Lock the parent list. No need to lock the child - not PID
+ * hashed yet and not running, so nobody can access it.
+ */
+ spin_lock_irqsave(&parent_ctx->lock, flags);
+
+ /*
+ * We dont have to disable NMIs - we are only looking at
+ * the list, not manipulating it:
+ */
+ list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
+ if (!counter->hw_event.inherit || counter->group_leader != counter)
+ continue;
+
+ /*
+ * Instead of creating recursive hierarchies of counters,
+ * we link inheritd counters back to the original parent,
+ * which has a filp for sure, which we use as the reference
+ * count:
+ */
+ parent_counter = counter;
+ if (counter->parent)
+ parent_counter = counter->parent;
+
+ if (inherit_counter(parent_counter, parent,
+ parent_ctx, child, child_ctx))
+ break;
+ }
+
+ spin_unlock_irqrestore(&parent_ctx->lock, flags);
+}
+
static void __cpuinit perf_counter_init_cpu(int cpu)
{
struct perf_cpu_context *cpuctx;
commit ee06094f8279e1312fc0a31591320cc7b6f0ab1e
Author: Ingo Molnar <mingo@elte.hu>
Date: Sat Dec 13 09:00:03 2008 +0100
perfcounters: restructure x86 counter math
Impact: restructure code
Change counter math from absolute values to clear delta logic.
We try to extract elapsed deltas from the raw hw counter - and put
that into the generic counter.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
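The heart of the restructuring is x86_perf_counter_update() below: read the raw
MSR, atomically swap in the new snapshot with cmpxchg (an NMI may be updating it
concurrently), and form the delta in 32 bits, because the PMC is programmed to
count upward from a negative offset and not all hardware sign-extends beyond the
physical counter width. A tiny self-contained model of just the clipping step,
with illustrative values; the kernel writes the subtraction with s32 casts, and
the unsigned form below yields the same result:

    /* demo of the 32-bit delta clipping used by x86_perf_counter_update() */
    #include <stdio.h>
    #include <stdint.h>

    static uint64_t delta32(uint64_t new_raw, uint64_t prev_raw)
    {
            /* same value as (u64)(u32)((s32)new_raw - (s32)prev_raw) */
            return (uint32_t)new_raw - (uint32_t)prev_raw;
    }

    int main(void)
    {
            /* counter programmed to -1000, read again 676 events later: */
            uint64_t prev = 0xfffffffffffffc18ULL;   /* sign-extended -1000 */
            uint64_t now  = 0xfffffffffffffebcULL;   /* sign-extended  -324 */

            printf("%llu\n", (unsigned long long)delta32(now, prev)); /* 676 */
            return 0;
    }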
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f2fdc1867241..fe94490bab61 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -643,7 +643,7 @@ config X86_UP_IOAPIC
config X86_LOCAL_APIC
def_bool y
depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
- select HAVE_PERF_COUNTERS
+ select HAVE_PERF_COUNTERS if (!M386 && !M486)
config X86_IO_APIC
def_bool y
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index b903f8df72bb..5afae13d8d59 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -53,6 +53,48 @@ const int intel_perfmon_event_map[] =
const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
+/*
+ * Propagate counter elapsed time into the generic counter.
+ * Can only be executed on the CPU where the counter is active.
+ * Returns the delta events processed.
+ */
+static void
+x86_perf_counter_update(struct perf_counter *counter,
+ struct hw_perf_counter *hwc, int idx)
+{
+ u64 prev_raw_count, new_raw_count, delta;
+
+ WARN_ON_ONCE(counter->state != PERF_COUNTER_STATE_ACTIVE);
+ /*
+ * Careful: an NMI might modify the previous counter value.
+ *
+ * Our tactic to handle this is to first atomically read and
+ * exchange a new raw count - then add that new-prev delta
+ * count to the generic counter atomically:
+ */
+again:
+ prev_raw_count = atomic64_read(&hwc->prev_count);
+ rdmsrl(hwc->counter_base + idx, new_raw_count);
+
+ if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+ new_raw_count) != prev_raw_count)
+ goto again;
+
+ /*
+ * Now we have the new raw value and have updated the prev
+ * timestamp already. We can now calculate the elapsed delta
+ * (counter-)time and add that to the generic counter.
+ *
+ * Careful, not all hw sign-extends above the physical width
+ * of the count, so we do that by clipping the delta to 32 bits:
+ */
+ delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count);
+ WARN_ON_ONCE((int)delta < 0);
+
+ atomic64_add(delta, &counter->count);
+ atomic64_sub(delta, &hwc->period_left);
+}
+
/*
* Setup the hardware configuration for a given hw_event_type
*/
@@ -90,10 +132,10 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
* so we install an artificial 1<<31 period regardless of
* the generic counter period:
*/
- if (!hwc->irq_period)
+ if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF)
hwc->irq_period = 0x7FFFFFFF;
- hwc->next_count = -(s32)hwc->irq_period;
+ atomic64_set(&hwc->period_left, hwc->irq_period);
/*
* Raw event type provide the config in the event structure
@@ -118,12 +160,6 @@ void hw_perf_enable_all(void)
wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
}
-void hw_perf_restore(u64 ctrl)
-{
- wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0);
-}
-EXPORT_SYMBOL_GPL(hw_perf_restore);
-
u64 hw_perf_save_disable(void)
{
u64 ctrl;
@@ -134,27 +170,74 @@ u64 hw_perf_save_disable(void)
}
EXPORT_SYMBOL_GPL(hw_perf_save_disable);
+void hw_perf_restore(u64 ctrl)
+{
+ wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0);
+}
+EXPORT_SYMBOL_GPL(hw_perf_restore);
+
static inline void
-__x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx)
+__x86_perf_counter_disable(struct perf_counter *counter,
+ struct hw_perf_counter *hwc, unsigned int idx)
{
- wrmsr(hwc->config_base + idx, hwc->config, 0);
+ int err;
+
+ err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
+ WARN_ON_ONCE(err);
}
-static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]);
+static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]);
-static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx)
+/*
+ * Set the next IRQ period, based on the hwc->period_left value.
+ * To be called with the counter disabled in hw:
+ */
+static void
+__hw_perf_counter_set_period(struct perf_counter *counter,
+ struct hw_perf_counter *hwc, int idx)
{
- per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count;
+ s32 left = atomic64_read(&hwc->period_left);
+ s32 period = hwc->irq_period;
+
+ WARN_ON_ONCE(period <= 0);
+
+ /*
+ * If we are way outside a reasoable range then just skip forward:
+ */
+ if (unlikely(left <= -period)) {
+ left = period;
+ atomic64_set(&hwc->period_left, left);
+ }
+
+ if (unlikely(left <= 0)) {
+ left += period;
+ atomic64_set(&hwc->period_left, left);
+ }
- wrmsr(hwc->counter_base + idx, hwc->next_count, 0);
+ WARN_ON_ONCE(left <= 0);
+
+ per_cpu(prev_left[idx], smp_processor_id()) = left;
+
+ /*
+ * The hw counter starts counting from this counter offset,
+ * mark it to be able to extra future deltas:
+ */
+ atomic64_set(&hwc->prev_count, (u64)(s64)-left);
+
+ wrmsr(hwc->counter_base + idx, -left, 0);
}
-static void __x86_perf_counter_enable(struct hw_perf_counter *hwc, int idx)
+static void
+__x86_perf_counter_enable(struct perf_counter *counter,
+ struct hw_perf_counter *hwc, int idx)
{
wrmsr(hwc->config_base + idx,
hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
}
+/*
+ * Find a PMC slot for the freshly enabled / scheduled in counter:
+ */
static void x86_perf_counter_enable(struct perf_counter *counter)
{
struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
@@ -170,55 +253,17 @@ static void x86_perf_counter_enable(struct perf_counter *counter)
perf_counters_lapic_init(hwc->nmi);
- __x86_perf_counter_disable(hwc, idx);
+ __x86_perf_counter_disable(counter, hwc, idx);
cpuc->counters[idx] = counter;
- __hw_perf_counter_set_period(hwc, idx);
- __x86_perf_counter_enable(hwc, idx);
-}
-
-static void __hw_perf_save_counter(struct perf_counter *counter,
- struct hw_perf_counter *hwc, int idx)
-{
- s64 raw = -1;
- s64 delta;
-
- /*
- * Get the raw hw counter value:
- */
- rdmsrl(hwc->counter_base + idx, raw);
-
- /*
- * Rebase it to zero (it started counting at -irq_period),
- * to see the delta since ->prev_count:
- */
- delta = (s64)hwc->irq_period + (s64)(s32)raw;
-
- atomic64_counter_set(counter, hwc->prev_count + delta);
-
- /*
- * Adjust the ->prev_count offset - if we went beyond
- * irq_period of units, then we got an IRQ and the counter
- * was set back to -irq_period:
- */
- while (delta >= (s64)hwc->irq_period) {
- hwc->prev_count += hwc->irq_period;
- delta -= (s64)hwc->irq_period;
- }
-
- /*
- * Calculate the next raw counter value we'll write into
- * the counter at the next sched-in time:
- */
- delta -= (s64)hwc->irq_period;
-
- hwc->next_count = (s32)delta;
+ __hw_perf_counter_set_period(counter, hwc, idx);
+ __x86_perf_counter_enable(counter, hwc, idx);
}
void perf_counter_print_debug(void)
{
- u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count;
+ u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left;
int cpu, idx;
if (!nr_hw_counters)
@@ -241,14 +286,14 @@ void perf_counter_print_debug(void)
rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
rdmsrl(MSR_ARCH_PERFMON_PERFCTR0 + idx, pmc_count);
- next_count = per_cpu(prev_next_count[idx], cpu);
+ prev_left = per_cpu(prev_left[idx], cpu);
printk(KERN_INFO "CPU#%d: PMC%d ctrl: %016llx\n",
cpu, idx, pmc_ctrl);
printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n",
cpu, idx, pmc_count);
- printk(KERN_INFO "CPU#%d: PMC%d next: %016llx\n",
- cpu, idx, next_count);
+ printk(KERN_INFO "CPU#%d: PMC%d left: %016llx\n",
+ cpu, idx, prev_left);
}
local_irq_enable();
}
@@ -259,29 +304,16 @@ static void x86_perf_counter_disable(struct perf_counter *counter)
struct hw_perf_counter *hwc = &counter->hw;
unsigned int idx = hwc->idx;
- __x86_perf_counter_disable(hwc, idx);
+ __x86_perf_counter_disable(counter, hwc, idx);
clear_bit(idx, cpuc->used);
cpuc->counters[idx] = NULL;
- __hw_perf_save_counter(counter, hwc, idx);
-}
-static void x86_perf_counter_read(struct perf_counter *counter)
-{
- struct hw_perf_counter *hwc = &counter->hw;
- unsigned long addr = hwc->counter_base + hwc->idx;
- s64 offs, val = -1LL;
- s32 val32;
-
- /* Careful: NMI might modify the counter offset */
- do {
- offs = hwc->prev_count;
- rdmsrl(addr, val);
- } while (offs != hwc->prev_count);
-
- val32 = (s32) val;
- val = (s64)hwc->irq_period + (s64)val32;
- atomic64_counter_set(counter, hwc->prev_count + val);
+ /*
+ * Drain the remaining delta count out of a counter
+ * that we are disabling:
+ */
+ x86_perf_counter_update(counter, hwc, idx);
}
static void perf_store_irq_data(struct perf_counter *counter, u64 data)
@@ -299,7 +331,8 @@ static void perf_store_irq_data(struct perf_counter *counter, u64 data)
}
/*
- * NMI-safe enable method:
+ * Save and restart an expired counter. Called by NMI contexts,
+ * so it has to be careful about preempting normal counter ops:
*/
static void perf_save_and_restart(struct perf_counter *counter)
{
@@ -309,45 +342,25 @@ static void perf_save_and_restart(struct perf_counter *counter)
rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
- __hw_perf_save_counter(counter, hwc, idx);
- __hw_perf_counter_set_period(hwc, idx);
+ x86_perf_counter_update(counter, hwc, idx);
+ __hw_perf_counter_set_period(counter, hwc, idx);
if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE)
- __x86_perf_counter_enable(hwc, idx);
+ __x86_perf_counter_enable(counter, hwc, idx);
}
static void
perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
{
struct perf_counter *counter, *group_leader = sibling->group_leader;
- int bit;
-
- /*
- * Store the counter's own timestamp first:
- */
- perf_store_irq_data(sibling, sibling->hw_event.type);
- perf_store_irq_data(sibling, atomic64_counter_read(sibling));
/*
- * Then store sibling timestamps (if any):
+ * Store sibling timestamps (if any):
*/
list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
- if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
- /*
- * When counter was not in the overflow mask, we have to
- * read it from hardware. We read it as well, when it
- * has not been read yet and clear the bit in the
- * status mask.
- */
- bit = counter->hw.idx;
- if (!test_bit(bit, (unsigned long *) overflown) ||
- test_bit(bit, (unsigned long *) status)) {
- clear_bit(bit, (unsigned long *) status);
- perf_save_and_restart(counter);
- }
- }
+ x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
perf_store_irq_data(sibling, counter->hw_event.type);
- perf_store_irq_data(sibling, atomic64_counter_read(counter));
+ perf_store_irq_data(sibling, atomic64_read(&counter->count));
}
}
@@ -540,6 +553,11 @@ void __init init_hw_perf_counters(void)
perf_counters_initialized = true;
}
+static void x86_perf_counter_read(struct perf_counter *counter)
+{
+ x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
+}
+
static const struct hw_perf_counter_ops x86_perf_counter_ops = {
.hw_perf_counter_enable = x86_perf_counter_enable,
.hw_perf_counter_disable = x86_perf_counter_disable,
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 8cb095fa442c..72460289c654 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -91,14 +91,16 @@ struct perf_counter_hw_event {
* struct hw_perf_counter - performance counter hardware details:
*/
struct hw_perf_counter {
+#ifdef CONFIG_PERF_COUNTERS
u64 config;
unsigned long config_base;
unsigned long counter_base;
int nmi;
unsigned int idx;
- u64 prev_count;
+ atomic64_t prev_count;
u64 irq_period;
- s32 next_count;
+ atomic64_t period_left;
+#endif
};
/*
@@ -140,17 +142,15 @@ enum perf_counter_active_state {
* struct perf_counter - performance counter kernel representation:
*/
struct perf_counter {
+#ifdef CONFIG_PERF_COUNTERS
struct list_head list_entry;
struct list_head sibling_list;
struct perf_counter *group_leader;
const struct hw_perf_counter_ops *hw_ops;
enum perf_counter_active_state state;
-#if BITS_PER_LONG == 64
atomic64_t count;
-#else
- atomic_t count32[2];
-#endif
+
struct perf_counter_hw_event hw_event;
struct hw_perf_counter hw;
@@ -172,6 +172,7 @@ struct perf_counter {
struct perf_data *irqdata;
struct perf_data *usrdata;
struct perf_data data[2];
+#endif
};
/**
@@ -220,8 +221,6 @@ extern void perf_counter_notify(struct pt_regs *regs);
extern void perf_counter_print_debug(void);
extern u64 hw_perf_save_disable(void);
extern void hw_perf_restore(u64 ctrl);
-extern void atomic64_counter_set(struct perf_counter *counter, u64 val64);
-extern u64 atomic64_counter_read(struct perf_counter *counter);
extern int perf_counter_task_disable(void);
extern int perf_counter_task_enable(void);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 559130b8774d..416861ce8b27 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -44,67 +44,9 @@ hw_perf_counter_init(struct perf_counter *counter)
}
u64 __weak hw_perf_save_disable(void) { return 0; }
-void __weak hw_perf_restore(u64 ctrl) { }
+void __weak hw_perf_restore(u64 ctrl) { }
void __weak hw_perf_counter_setup(void) { }
-#if BITS_PER_LONG == 64
-
-/*
- * Read the cached counter in counter safe against cross CPU / NMI
- * modifications. 64 bit version - no complications.
- */
-static inline u64 perf_counter_read_safe(struct perf_counter *counter)
-{
- return (u64) atomic64_read(&counter->count);
-}
-
-void atomic64_counter_set(struct perf_counter *counter, u64 val)
-{
- atomic64_set(&counter->count, val);
-}
-
-u64 atomic64_counter_read(struct perf_counter *counter)
-{
- return atomic64_read(&counter->count);
-}
-
-#else
-
-/*
- * Read the cached counter in counter safe against cross CPU / NMI
- * modifications. 32 bit version.
- */
-static u64 perf_counter_read_safe(struct perf_counter *counter)
-{
- u32 cntl, cnth;
-
- local_irq_disable();
- do {
- cnth = atomic_read(&counter->count32[1]);
- cntl = atomic_read(&counter->count32[0]);
- } while (cnth != atomic_read(&counter->count32[1]));
-
- local_irq_enable();
-
- return cntl | ((u64) cnth) << 32;
-}
-
-void atomic64_counter_set(struct perf_counter *counter, u64 val64)
-{
- u32 *val32 = (void *)&val64;
-
- atomic_set(counter->count32 + 0, *(val32 + 0));
- atomic_set(counter->count32 + 1, *(val32 + 1));
-}
-
-u64 atomic64_counter_read(struct perf_counter *counter)
-{
- return atomic_read(counter->count32 + 0) |
- (u64) atomic_read(counter->count32 + 1) << 32;
-}
-
-#endif
-
static void
list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
{
@@ -280,11 +222,11 @@ static void __perf_install_in_context(void *info)
ctx->nr_counters++;
if (cpuctx->active_oncpu < perf_max_counters) {
- counter->hw_ops->hw_perf_counter_enable(counter);
counter->state = PERF_COUNTER_STATE_ACTIVE;
counter->oncpu = cpu;
ctx->nr_active++;
cpuctx->active_oncpu++;
+ counter->hw_ops->hw_perf_counter_enable(counter);
}
if (!ctx->task && cpuctx->max_pertask)
@@ -624,7 +566,7 @@ static u64 perf_counter_read(struct perf_counter *counter)
__hw_perf_counter_read, counter, 1);
}
- return perf_counter_read_safe(counter);
+ return atomic64_read(&counter->count);
}
/*
@@ -921,7 +863,7 @@ static void cpu_clock_perf_counter_read(struct perf_counter *counter)
{
int cpu = raw_smp_processor_id();
- atomic64_counter_set(counter, cpu_clock(cpu));
+ atomic64_set(&counter->count, cpu_clock(cpu));
}
static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
@@ -940,7 +882,7 @@ static void task_clock_perf_counter_disable(struct perf_counter *counter)
static void task_clock_perf_counter_read(struct perf_counter *counter)
{
- atomic64_counter_set(counter, current->se.sum_exec_runtime);
+ atomic64_set(&counter->count, current->se.sum_exec_runtime);
}
static const struct hw_perf_counter_ops perf_ops_task_clock = {
commit 9b194e831fb2c322ed81a373e49620f34edc2778
Author: Ingo Molnar <mingo@elte.hu>
Date: Sun Dec 14 20:22:35 2008 +0100
x86: implement atomic64_t on 32-bit
Impact: new API
Implement the atomic64_t APIs on 32-bit as well. Will be used by
the performance counters code.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
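Since the new 32-bit implementation mirrors the atomic64_t API that 64-bit
architectures already provide, here is a brief kernel-style usage sketch (a
fragment, not a standalone program) using only the operations defined below;
this is the lock-free, cmpxchg8b-backed pattern the perf counter code depends on:

    /* illustrative fragment; assumes kernel context and the API added below */
    static atomic64_t total = ATOMIC64_INIT(0);

    static void account(u64 delta)
    {
            atomic64_add(delta, &total);            /* lock-free on 32-bit too */
    }

    static u64 snapshot_and_reset(void)
    {
            u64 old;

            do {
                    old = atomic64_read(&total);
            } while (atomic64_cmpxchg(&total, old, 0) != old);

            return old;                             /* value at the moment of reset */
    }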
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index ad5b9f6ecddf..9927e01b03c2 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -255,5 +255,223 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
#define smp_mb__before_atomic_inc() barrier()
#define smp_mb__after_atomic_inc() barrier()
+/* An 64bit atomic type */
+
+typedef struct {
+ unsigned long long counter;
+} atomic64_t;
+
+#define ATOMIC64_INIT(val) { (val) }
+
+/**
+ * atomic64_read - read atomic64 variable
+ * @v: pointer of type atomic64_t
+ *
+ * Atomically reads the value of @v.
+ * Doesn't imply a read memory barrier.
+ */
+#define __atomic64_read(ptr) ((ptr)->counter)
+
+static inline unsigned long long
+cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new)
+{
+ asm volatile(
+
+ LOCK_PREFIX "cmpxchg8b (%[ptr])\n"
+
+ : "=A" (old)
+
+ : [ptr] "D" (ptr),
+ "A" (old),
+ "b" (ll_low(new)),
+ "c" (ll_high(new))
+
+ : "memory");
+
+ return old;
+}
+
+static inline unsigned long long
+atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
+ unsigned long long new_val)
+{
+ return cmpxchg8b(&ptr->counter, old_val, new_val);
+}
+
+/**
+ * atomic64_set - set atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ * @new_val: value to assign
+ *
+ * Atomically sets the value of @ptr to @new_val.
+ */
+static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
+{
+ unsigned long long old_val;
+
+ do {
+ old_val = atomic_read(ptr);
+ } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
+}
+
+/**
+ * atomic64_read - read atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically reads the value of @ptr and returns it.
+ */
+static inline unsigned long long atomic64_read(atomic64_t *ptr)
+{
+ unsigned long long curr_val;
+
+ do {
+ curr_val = __atomic64_read(ptr);
+ } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val);
+
+ return curr_val;
+}
+
+/**
+ * atomic64_add_return - add and return
+ * @delta: integer value to add
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr and returns @delta + *@ptr
+ */
+static inline unsigned long long
+atomic64_add_return(unsigned long long delta, atomic64_t *ptr)
+{
+ unsigned long long old_val, new_val;
+
+ do {
+ old_val = atomic_read(ptr);
+ new_val = old_val + delta;
+
+ } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
+
+ return new_val;
+}
+
+static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr)
+{
+ return atomic64_add_return(-delta, ptr);
+}
+
+static inline long atomic64_inc_return(atomic64_t *ptr)
+{
+ return atomic64_add_return(1, ptr);
+}
+
+static inline long atomic64_dec_return(atomic64_t *ptr)
+{
+ return atomic64_sub_return(1, ptr);
+}
+
+/**
+ * atomic64_add - add integer to atomic64 variable
+ * @delta: integer value to add
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr.
+ */
+static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr)
+{
+ atomic64_add_return(delta, ptr);
+}
+
+/**
+ * atomic64_sub - subtract the atomic64 variable
+ * @delta: integer value to subtract
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically subtracts @delta from @ptr.
+ */
+static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr)
+{
+ atomic64_add(-delta, ptr);
+}
+
+/**
+ * atomic64_sub_and_test - subtract value from variable and test result
+ * @delta: integer value to subtract
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically subtracts @delta from @ptr and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int
+atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr)
+{
+ unsigned long long old_val = atomic64_sub_return(delta, ptr);
+
+ return old_val == 0;
+}
+
+/**
+ * atomic64_inc - increment atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically increments @ptr by 1.
+ */
+static inline void atomic64_inc(atomic64_t *ptr)
+{
+ atomic64_add(1, ptr);
+}
+
+/**
+ * atomic64_dec - decrement atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically decrements @ptr by 1.
+ */
+static inline void atomic64_dec(atomic64_t *ptr)
+{
+ atomic64_sub(1, ptr);
+}
+
+/**
+ * atomic64_dec_and_test - decrement and test
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically decrements @ptr by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+static inline int atomic64_dec_and_test(atomic64_t *ptr)
+{
+ return atomic64_sub_and_test(1, ptr);
+}
+
+/**
+ * atomic64_inc_and_test - increment and test
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically increments @ptr by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int atomic64_inc_and_test(atomic64_t *ptr)
+{
+ return atomic64_sub_and_test(-1, ptr);
+}
+
+/**
+ * atomic64_add_negative - add and test if negative
+ * @delta: integer value to add
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr and returns true
+ * if the result is negative, or false when
+ * result is greater than or equal to zero.
+ */
+static inline int
+atomic64_add_negative(unsigned long long delta, atomic64_t *ptr)
+{
+ long long old_val = atomic64_add_return(delta, ptr);
+
+ return old_val < 0;
+}
+
#include <asm-generic/atomic.h>
#endif /* _ASM_X86_ATOMIC_32_H */
commit 8299608f140ae321e4eb5d1306184265d2b9511e
Merge: 45ab6b0c76d0 30cb367ea2be 69b88afa8d11 8daa19051e1c
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri Dec 12 13:49:17 2008 +0100
Merge branches 'irq/sparseirq', 'x86/quirks' and 'x86/reboot' into cpus4096
We merge the irq/sparseirq, x86/quirks and x86/reboot trees into the
cpus4096 tree because the io-apic changes in the sparseirq tree
conflict with the cpumask changes in the cpumask tree, and we
want to resolve those.
diff --cc arch/x86/kernel/process.c
index c27af49a4ede,c622772744d8,c622772744d8,a4da7c4b3129..95d811a9594f
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@@@@ -7,8 -7,7 -7,7 -7,8 +7,9 @@@@@
#include <linux/module.h>
#include <linux/pm.h>
#include <linux/clockchips.h>
+++#include <linux/ftrace.h>
#include <asm/system.h>
+++ #include <asm/apic.h>
unsigned long idle_halt;
EXPORT_SYMBOL(idle_halt);
commit 45ab6b0c76d0e4cce5bd608ccf97b0f6b20f18df
Merge: 81444a799550 d65bd5ecb2bd
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri Dec 12 13:48:57 2008 +0100
Merge branch 'sched/core' into cpus4096
Conflicts:
include/linux/ftrace.h
kernel/sched.c
diff --cc include/linux/ftrace.h
index 11cac81eed08,9c5bc6be2b09..985b28dc2ba9
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@@ -6,9 -6,8 +6,10 @@@
#include <linux/ktime.h>
#include <linux/init.h>
#include <linux/types.h>
+ #include <linux/module.h>
#include <linux/kallsyms.h>
+#include <linux/bitops.h>
+#include <linux/sched.h>
#ifdef CONFIG_FUNCTION_TRACER
diff --cc kernel/sched.c
index 4ed9f588faa6,ad7b93be5691..e00c92d22655
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@@ -3676,12 -3685,9 +3685,12 @@@ out_balanced
static void idle_balance(int this_cpu, struct rq *this_rq)
{
struct sched_domain *sd;
- int pulled_task = -1;
+ int pulled_task = 0;
unsigned long next_balance = jiffies + HZ;
- cpumask_t tmpmask;
+ cpumask_var_t tmpmask;
+
+ if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
+ return;
for_each_domain(this_cpu, sd) {
unsigned long interval;
@@@ -7711,13 -7670,19 +7724,19 @@@ static struct sched_domain_attr *dattr_
/*
* Special case: If a kmalloc of a doms_cur partition (array of
- * cpumask_t) fails, then fallback to a single sched domain,
- * as determined by the single cpumask_t fallback_doms.
+ * cpumask) fails, then fallback to a single sched domain,
+ * as determined by the single cpumask fallback_doms.
*/
-static cpumask_t fallback_doms;
+static cpumask_var_t fallback_doms;
- void __attribute__((weak)) arch_update_cpu_topology(void)
+ /*
+ * arch_update_cpu_topology lets virtualized architectures update the
+ * cpu core maps. It is supposed to return 1 if the topology changed
+ * or 0 if it stayed the same.
+ */
+ int __attribute__((weak)) arch_update_cpu_topology(void)
{
+ return 0;
}
/*
@@@ -7821,8 -7788,8 +7844,8 @@@ void partition_sched_domains(int ndoms_
/* Destroy deleted domains */
for (i = 0; i < ndoms_cur; i++) {
- for (j = 0; j < n; j++) {
+ for (j = 0; j < n && !new_topology; j++) {
- if (cpus_equal(doms_cur[i], doms_new[j])
+ if (cpumask_equal(&doms_cur[i], &doms_new[j])
&& dattrs_equal(dattr_cur, i, dattr_new, j))
goto match1;
}
@@@ -7841,8 -7808,8 +7864,8 @@@ match1
/* Build new domains */
for (i = 0; i < ndoms_new; i++) {
- for (j = 0; j < ndoms_cur; j++) {
+ for (j = 0; j < ndoms_cur && !new_topology; j++) {
- if (cpus_equal(doms_new[i], doms_cur[j])
+ if (cpumask_equal(&doms_new[i], &doms_cur[j])
&& dattrs_equal(dattr_new, i, dattr_cur, j))
goto match2;
}
commit 81444a799550214f549caf579cf65a0ca55e70b7
Merge: a64d31baed10 da485e0cb167
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri Dec 12 12:43:05 2008 +0100
Merge branch 'tracing/fastboot' into cpus4096
diff --cc kernel/Makefile
index 010ccb311166,19fad003b19d..6a212b842d86
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@@ -19,11 -19,8 +19,7 @@@ CFLAGS_REMOVE_mutex-debug.o = -p
CFLAGS_REMOVE_rtmutex-debug.o = -pg
CFLAGS_REMOVE_cgroup-debug.o = -pg
CFLAGS_REMOVE_sched_clock.o = -pg
-CFLAGS_REMOVE_sched.o = -pg
endif
- ifdef CONFIG_FUNCTION_RET_TRACER
- CFLAGS_REMOVE_extable.o = -pg # For __kernel_text_address()
- CFLAGS_REMOVE_module.o = -pg # For __module_text_address()
- endif
obj-$(CONFIG_FREEZER) += freezer.o
obj-$(CONFIG_PROFILING) += profile.o
commit 30cb367ea2be76bf71dbd275f38d0fd3b6f4142b
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri Dec 12 12:19:57 2008 +0100
sparse irqs: add irqnr.h to the user headers list
Impact: fix build error
/home/mingo/tip/usr/include/linux/random.h:11: included file
'linux/irqnr.h' is not exported
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index e531783e5d78..95ac82340c3b 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -313,6 +313,7 @@ unifdef-y += ptrace.h
unifdef-y += qnx4_fs.h
unifdef-y += quota.h
unifdef-y += random.h
+unifdef-y += irqnr.h
unifdef-y += reboot.h
unifdef-y += reiserfs_fs.h
unifdef-y += reiserfs_xattr.h