Patches contributed by Eötvös Lorand University
commit 3085354de635179d70c240e6d942bcbd1d93056c
Author: Ingo Molnar <mingo@elte.hu>
Date: Thu Mar 27 21:29:09 2008 +0100
x86: prefetch fix #2
Linus noticed a second bug and an uncleanliness:
- we'd return on any instruction fetch fault
- we'd use both the value of 16 and the PF_INSTR symbol which are
the same and make no sense
the cleanup nicely unifies this piece of logic.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index c0c82bc143c9..ec08d8389850 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -91,13 +91,10 @@ static int is_prefetch(struct pt_regs *regs, unsigned long addr,
int prefetch = 0;
unsigned char *max_instr;
-#ifdef CONFIG_X86_32
- /* Catch an obscure case of prefetch inside an NX page: */
- if ((__supported_pte_mask & _PAGE_NX) && (error_code & 16))
- return 0;
-#endif
-
- /* If it was a exec fault on NX page, ignore */
+ /*
+ * If it was a exec (instruction fetch) fault on NX page, then
+ * do not ignore the fault:
+ */
if (error_code & PF_INSTR)
return 0;
commit bc713dcf35c427ae8377fb9a4d1b7f891054ce13
Author: Ingo Molnar <mingo@elte.hu>
Date: Thu Mar 27 15:58:28 2008 +0100
x86: fix prefetch workaround
some early Athlon XP's and Opterons generate bogus faults on prefetch
instructions. The workaround for this regressed over .24 - reinstate it.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index fdc667422df9..c0c82bc143c9 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -92,7 +92,8 @@ static int is_prefetch(struct pt_regs *regs, unsigned long addr,
unsigned char *max_instr;
#ifdef CONFIG_X86_32
- if (!(__supported_pte_mask & _PAGE_NX))
+ /* Catch an obscure case of prefetch inside an NX page: */
+ if ((__supported_pte_mask & _PAGE_NX) && (error_code & 16))
return 0;
#endif
commit 3c274c2909e17aa0afeded4cd4520b7357357ca0
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri Mar 21 10:06:32 2008 +0100
x86: add dmi quirk for io_delay
reported by mereandor@gmail.com, in:
http://bugzilla.kernel.org/show_bug.cgi?id=6307
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
index c706a3061553..5921e5f0a640 100644
--- a/arch/x86/kernel/io_delay.c
+++ b/arch/x86/kernel/io_delay.c
@@ -76,6 +76,14 @@ static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = {
DMI_MATCH(DMI_BOARD_NAME, "30B9")
}
},
+ {
+ .callback = dmi_io_delay_0xed_port,
+ .ident = "HP Pavilion dv6000",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
+ DMI_MATCH(DMI_BOARD_NAME, "30B8")
+ }
+ },
{
.callback = dmi_io_delay_0xed_port,
.ident = "HP Pavilion tx1000",
commit 33b0c4217dcd67b788318c3192a2912b530e4eef
Author: Ingo Molnar <mingo@elte.hu>
Date: Sun Mar 16 11:14:30 2008 +0100
sched: tune multi-core idle balancing
WAKE_IDLE is too agressive on multi-core CPUs with the new
wake-affine code, keep it on for SMT/HT balancing alone
(where there's no cache affinity at all between logical CPUs).
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 2352f46160d3..2d8dac8799cf 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -138,7 +138,6 @@
| SD_BALANCE_FORK \
| SD_BALANCE_EXEC \
| SD_WAKE_AFFINE \
- | SD_WAKE_IDLE \
| SD_SHARE_PKG_RESOURCES\
| BALANCE_FOR_MC_POWER, \
.last_balance = jiffies, \
commit 74e3cd7f480ae1888b7cd196bf8125a1d3bfee05
Author: Ingo Molnar <mingo@elte.hu>
Date: Tue Mar 18 18:47:57 2008 +0100
sched: retune wake granularity
reduce wake-up granularity for better interactivity.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 87c9d3a2aafa..b85cac4b5e25 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -73,13 +73,13 @@ unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
/*
* SCHED_OTHER wake-up granularity.
- * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
*
* This option delays the preemption effects of decoupled workloads
* and reduces their over-scheduling. Synchronous workloads will still
* have immediate wakeup/sleep latencies.
*/
-unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
commit f540a6080a092e2ab69fd146c308022db7347b0a
Author: Ingo Molnar <mingo@elte.hu>
Date: Sat Mar 15 17:10:34 2008 +0100
sched: wakeup-buddy tasks are cache-hot
Wakeup-buddy tasks are cache-hot - this makes it a bit harder
for the load-balancer to tear them apart. (but it's still possible,
if the load is sufficiently assymetric)
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/sched.c b/kernel/sched.c
index adbd475cfd25..3f7c5eb254e2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1396,6 +1396,12 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
{
s64 delta;
+ /*
+ * Buddy candidates are cache hot:
+ */
+ if (&p->se == cfs_rq_of(&p->se)->next)
+ return 1;
+
if (p->sched_class != &fair_sched_class)
return 0;
commit 4ae7d5cefd4aa3560e359a3b0f03e12adc8b5c86
Author: Ingo Molnar <mingo@elte.hu>
Date: Wed Mar 19 01:42:00 2008 +0100
sched: improve affine wakeups
improve affine wakeups. Maintain the 'overlap' metric based on CFS's
sum_exec_runtime - which means the amount of time a task executes
after it wakes up some other task.
Use the 'overlap' for the wakeup decisions: if the 'overlap' is short,
it means there's strong workload coupling between this task and the
woken up task. If the 'overlap' is large then the workload is decoupled
and the scheduler will move them to separate CPUs more easily.
( Also slightly move the preempt_check within try_to_wake_up() - this has
no effect on functionality but allows 'early wakeups' (for still-on-rq
tasks) to be correctly accounted as well.)
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 11d8e9a74eff..3625fcaf5d0f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -929,6 +929,9 @@ struct sched_entity {
u64 vruntime;
u64 prev_sum_exec_runtime;
+ u64 last_wakeup;
+ u64 avg_overlap;
+
#ifdef CONFIG_SCHEDSTATS
u64 wait_start;
u64 wait_max;
diff --git a/kernel/sched.c b/kernel/sched.c
index d1ad69b270ca..adbd475cfd25 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1855,10 +1855,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
schedstat_inc(p, se.nr_wakeups_remote);
update_rq_clock(rq);
activate_task(rq, p, 1);
- check_preempt_curr(rq, p);
success = 1;
out_running:
+ check_preempt_curr(rq, p);
+
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
if (p->sched_class->task_wake_up)
@@ -1892,6 +1893,8 @@ static void __sched_fork(struct task_struct *p)
p->se.exec_start = 0;
p->se.sum_exec_runtime = 0;
p->se.prev_sum_exec_runtime = 0;
+ p->se.last_wakeup = 0;
+ p->se.avg_overlap = 0;
#ifdef CONFIG_SCHEDSTATS
p->se.wait_start = 0;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 4b5e24cf2f4a..ef358ba07683 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -288,6 +288,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
PN(se.exec_start);
PN(se.vruntime);
PN(se.sum_exec_runtime);
+ PN(se.avg_overlap);
nr_switches = p->nvcsw + p->nivcsw;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index b5a357396b49..87c9d3a2aafa 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -556,6 +556,21 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
account_entity_enqueue(cfs_rq, se);
}
+static void update_avg(u64 *avg, u64 sample)
+{
+ s64 diff = sample - *avg;
+ *avg += diff >> 3;
+}
+
+static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ if (!se->last_wakeup)
+ return;
+
+ update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup);
+ se->last_wakeup = 0;
+}
+
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
{
@@ -566,6 +581,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
update_stats_dequeue(cfs_rq, se);
if (sleep) {
+ update_avg_stats(cfs_rq, se);
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
struct task_struct *tsk = task_of(se);
@@ -981,12 +997,15 @@ static inline int wake_idle(int cpu, struct task_struct *p)
#ifdef CONFIG_SMP
+static const struct sched_class fair_sched_class;
+
static int
-wake_affine(struct rq *rq, struct sched_domain *this_sd, struct task_struct *p,
- int prev_cpu, int this_cpu, int sync, int idx,
- unsigned long load, unsigned long this_load,
+wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
+ struct task_struct *p, int prev_cpu, int this_cpu, int sync,
+ int idx, unsigned long load, unsigned long this_load,
unsigned int imbalance)
{
+ struct task_struct *curr = this_rq->curr;
unsigned long tl = this_load;
unsigned long tl_per_task;
@@ -994,10 +1013,15 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct task_struct *p,
return 0;
/*
- * Attract cache-cold tasks on sync wakeups:
+ * If the currently running task will sleep within
+ * a reasonable amount of time then attract this newly
+ * woken task:
*/
- if (sync && !task_hot(p, rq->clock, this_sd))
- return 1;
+ if (sync && curr->sched_class == &fair_sched_class) {
+ if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
+ p->se.avg_overlap < sysctl_sched_migration_cost)
+ return 1;
+ }
schedstat_inc(p, se.nr_wakeups_affine_attempts);
tl_per_task = cpu_avg_load_per_task(this_cpu);
@@ -1030,18 +1054,16 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
struct sched_domain *sd, *this_sd = NULL;
int prev_cpu, this_cpu, new_cpu;
unsigned long load, this_load;
+ struct rq *rq, *this_rq;
unsigned int imbalance;
- struct rq *rq;
int idx;
prev_cpu = task_cpu(p);
rq = task_rq(p);
this_cpu = smp_processor_id();
+ this_rq = cpu_rq(this_cpu);
new_cpu = prev_cpu;
- if (prev_cpu == this_cpu)
- goto out;
-
/*
* 'this_sd' is the first domain that both
* this_cpu and prev_cpu are present in:
@@ -1069,11 +1091,12 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
load = source_load(prev_cpu, idx);
this_load = target_load(this_cpu, idx);
- if (wake_affine(rq, this_sd, p, prev_cpu, this_cpu, sync, idx,
- load, this_load, imbalance)) {
- new_cpu = this_cpu;
+ if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
+ load, this_load, imbalance))
+ return this_cpu;
+
+ if (prev_cpu == this_cpu)
goto out;
- }
/*
* Start passive balancing when half the imbalance_pct
@@ -1083,8 +1106,7 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
if (imbalance*this_load <= 100*load) {
schedstat_inc(this_sd, ttwu_move_balance);
schedstat_inc(p, se.nr_wakeups_passive);
- new_cpu = this_cpu;
- goto out;
+ return this_cpu;
}
}
@@ -1111,6 +1133,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
return;
}
+ se->last_wakeup = se->sum_exec_runtime;
+ if (unlikely(se == pse))
+ return;
+
cfs_rq_of(pse)->next = pse;
/*
commit 6f3d09291b4982991680b61763b2541e53e2a95f
Author: Ingo Molnar <mingo@elte.hu>
Date: Wed Mar 19 01:44:24 2008 +0100
sched, net: socket wakeups are sync
'sync' wakeups are a hint towards the scheduler that (certain)
networking related wakeups likely create coupling between tasks.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/net/core/sock.c b/net/core/sock.c
index 09cb3a74de7f..2654c147c004 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1621,7 +1621,7 @@ static void sock_def_readable(struct sock *sk, int len)
{
read_lock(&sk->sk_callback_lock);
if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
- wake_up_interruptible(sk->sk_sleep);
+ wake_up_interruptible_sync(sk->sk_sleep);
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
read_unlock(&sk->sk_callback_lock);
}
@@ -1635,7 +1635,7 @@ static void sock_def_write_space(struct sock *sk)
*/
if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
- wake_up_interruptible(sk->sk_sleep);
+ wake_up_interruptible_sync(sk->sk_sleep);
/* Should agree with poll, otherwise some programs break */
if (sock_writeable(sk))
commit f48273860edfca2306236d0f0de609aab3f773d4
Author: Ingo Molnar <mingo@elte.hu>
Date: Sun Mar 16 21:21:47 2008 +0100
sched: clean up wakeup balancing, code flow
Clean up the code flow. No code changed:
kernel/sched.o:
text data bss dec hex filename
42521 2858 232 45611 b22b sched.o.before
42521 2858 232 45611 b22b sched.o.after
md5:
09b31c44e9aff8666f72773dc433e2df sched.o.before.asm
09b31c44e9aff8666f72773dc433e2df sched.o.after.asm
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 2d2be02b8e3b..b5a357396b49 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1040,7 +1040,7 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
new_cpu = prev_cpu;
if (prev_cpu == this_cpu)
- goto out_set_cpu;
+ goto out;
/*
* 'this_sd' is the first domain that both
@@ -1054,13 +1054,13 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
}
if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
- goto out_set_cpu;
+ goto out;
/*
* Check for affine wakeup and passive balancing possibilities.
*/
if (!this_sd)
- goto out_keep_cpu;
+ goto out;
idx = this_sd->wake_idx;
@@ -1069,11 +1069,11 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
load = source_load(prev_cpu, idx);
this_load = target_load(this_cpu, idx);
- new_cpu = this_cpu; /* Wake to this CPU if we can */
-
if (wake_affine(rq, this_sd, p, prev_cpu, this_cpu, sync, idx,
- load, this_load, imbalance))
- goto out_set_cpu;
+ load, this_load, imbalance)) {
+ new_cpu = this_cpu;
+ goto out;
+ }
/*
* Start passive balancing when half the imbalance_pct
@@ -1083,17 +1083,12 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
if (imbalance*this_load <= 100*load) {
schedstat_inc(this_sd, ttwu_move_balance);
schedstat_inc(p, se.nr_wakeups_passive);
- goto out_set_cpu;
+ new_cpu = this_cpu;
+ goto out;
}
}
-out_keep_cpu:
- /*
- * Could not wake to this_cpu.
- * Wake to the previous cpu instead:
- */
- new_cpu = prev_cpu;
-out_set_cpu:
+out:
return wake_idle(new_cpu, p);
}
#endif /* CONFIG_SMP */
commit ac192d3921a14e2c9080799e16959b4bd56f49d6
Author: Ingo Molnar <mingo@elte.hu>
Date: Sun Mar 16 20:56:26 2008 +0100
sched: clean up wakeup balancing, rename variables
rename 'cpu' to 'prev_cpu'. No code changed:
kernel/sched.o:
text data bss dec hex filename
42521 2858 232 45611 b22b sched.o.before
42521 2858 232 45611 b22b sched.o.after
md5:
09b31c44e9aff8666f72773dc433e2df sched.o.before.asm
09b31c44e9aff8666f72773dc433e2df sched.o.after.asm
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 70679b266693..2d2be02b8e3b 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -983,7 +983,7 @@ static inline int wake_idle(int cpu, struct task_struct *p)
static int
wake_affine(struct rq *rq, struct sched_domain *this_sd, struct task_struct *p,
- int cpu, int this_cpu, int sync, int idx,
+ int prev_cpu, int this_cpu, int sync, int idx,
unsigned long load, unsigned long this_load,
unsigned int imbalance)
{
@@ -1010,7 +1010,7 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct task_struct *p,
if (sync)
tl -= current->se.load.weight;
- if ((tl <= load && tl + target_load(cpu, idx) <= tl_per_task) ||
+ if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) ||
100*(tl + p->se.load.weight) <= imbalance*load) {
/*
* This domain has SD_WAKE_AFFINE and
@@ -1028,22 +1028,26 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct task_struct *p,
static int select_task_rq_fair(struct task_struct *p, int sync)
{
struct sched_domain *sd, *this_sd = NULL;
+ int prev_cpu, this_cpu, new_cpu;
unsigned long load, this_load;
- int cpu, this_cpu, new_cpu;
unsigned int imbalance;
struct rq *rq;
int idx;
- cpu = task_cpu(p);
- rq = task_rq(p);
- this_cpu = smp_processor_id();
- new_cpu = cpu;
+ prev_cpu = task_cpu(p);
+ rq = task_rq(p);
+ this_cpu = smp_processor_id();
+ new_cpu = prev_cpu;
- if (cpu == this_cpu)
+ if (prev_cpu == this_cpu)
goto out_set_cpu;
+ /*
+ * 'this_sd' is the first domain that both
+ * this_cpu and prev_cpu are present in:
+ */
for_each_domain(this_cpu, sd) {
- if (cpu_isset(cpu, sd->span)) {
+ if (cpu_isset(prev_cpu, sd->span)) {
this_sd = sd;
break;
}
@@ -1062,12 +1066,12 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
- load = source_load(cpu, idx);
+ load = source_load(prev_cpu, idx);
this_load = target_load(this_cpu, idx);
new_cpu = this_cpu; /* Wake to this CPU if we can */
- if (wake_affine(rq, this_sd, p, cpu, this_cpu, sync, idx,
+ if (wake_affine(rq, this_sd, p, prev_cpu, this_cpu, sync, idx,
load, this_load, imbalance))
goto out_set_cpu;
@@ -1084,7 +1088,11 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
}
out_keep_cpu:
- new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
+ /*
+ * Could not wake to this_cpu.
+ * Wake to the previous cpu instead:
+ */
+ new_cpu = prev_cpu;
out_set_cpu:
return wake_idle(new_cpu, p);
}