Patches contributed by Eötvös Loránd University
commit 098fb9db2c74cfd6ffdbf61eb026a0c21abc5f75
Author: Ingo Molnar <mingo@elte.hu>
Date: Sun Mar 16 20:36:10 2008 +0100
sched: clean up wakeup balancing, move wake_affine()
split out the affine-wakeup bits.
No code changed:
kernel/sched.o:
text data bss dec hex filename
42521 2858 232 45611 b22b sched.o.before
42521 2858 232 45611 b22b sched.o.after
md5:
9d76738f1272aa82f0b7affd2f51df6b sched.o.before.asm
09b31c44e9aff8666f72773dc433e2df sched.o.after.asm
(the md5s changed because stack slots changed and gcc scheduled some
registers in a different order - but otherwise the before and after
assembly is instruction-for-instruction equivalent.)
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f2cc59080efa..70679b266693 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -980,12 +980,59 @@ static inline int wake_idle(int cpu, struct task_struct *p)
#endif
#ifdef CONFIG_SMP
+
+static int
+wake_affine(struct rq *rq, struct sched_domain *this_sd, struct task_struct *p,
+ int cpu, int this_cpu, int sync, int idx,
+ unsigned long load, unsigned long this_load,
+ unsigned int imbalance)
+{
+ unsigned long tl = this_load;
+ unsigned long tl_per_task;
+
+ if (!(this_sd->flags & SD_WAKE_AFFINE))
+ return 0;
+
+ /*
+ * Attract cache-cold tasks on sync wakeups:
+ */
+ if (sync && !task_hot(p, rq->clock, this_sd))
+ return 1;
+
+ schedstat_inc(p, se.nr_wakeups_affine_attempts);
+ tl_per_task = cpu_avg_load_per_task(this_cpu);
+
+ /*
+ * If sync wakeup then subtract the (maximum possible)
+ * effect of the currently running task from the load
+ * of the current CPU:
+ */
+ if (sync)
+ tl -= current->se.load.weight;
+
+ if ((tl <= load && tl + target_load(cpu, idx) <= tl_per_task) ||
+ 100*(tl + p->se.load.weight) <= imbalance*load) {
+ /*
+ * This domain has SD_WAKE_AFFINE and
+ * p is cache cold in this domain, and
+ * there is no bad imbalance.
+ */
+ schedstat_inc(this_sd, ttwu_move_affine);
+ schedstat_inc(p, se.nr_wakeups_affine);
+
+ return 1;
+ }
+ return 0;
+}
+
static int select_task_rq_fair(struct task_struct *p, int sync)
{
- int cpu, this_cpu;
- struct rq *rq;
struct sched_domain *sd, *this_sd = NULL;
- int new_cpu;
+ unsigned long load, this_load;
+ int cpu, this_cpu, new_cpu;
+ unsigned int imbalance;
+ struct rq *rq;
+ int idx;
cpu = task_cpu(p);
rq = task_rq(p);
@@ -1008,66 +1055,35 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
/*
* Check for affine wakeup and passive balancing possibilities.
*/
- if (this_sd) {
- int idx = this_sd->wake_idx;
- unsigned int imbalance;
- unsigned long load, this_load;
-
- imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
-
- load = source_load(cpu, idx);
- this_load = target_load(this_cpu, idx);
-
- new_cpu = this_cpu; /* Wake to this CPU if we can */
-
- if (this_sd->flags & SD_WAKE_AFFINE) {
- unsigned long tl = this_load;
- unsigned long tl_per_task;
-
- /*
- * Attract cache-cold tasks on sync wakeups:
- */
- if (sync && !task_hot(p, rq->clock, this_sd))
- goto out_set_cpu;
-
- schedstat_inc(p, se.nr_wakeups_affine_attempts);
- tl_per_task = cpu_avg_load_per_task(this_cpu);
-
- /*
- * If sync wakeup then subtract the (maximum possible)
- * effect of the currently running task from the load
- * of the current CPU:
- */
- if (sync)
- tl -= current->se.load.weight;
-
- if ((tl <= load &&
- tl + target_load(cpu, idx) <= tl_per_task) ||
- 100*(tl + p->se.load.weight) <= imbalance*load) {
- /*
- * This domain has SD_WAKE_AFFINE and
- * p is cache cold in this domain, and
- * there is no bad imbalance.
- */
- schedstat_inc(this_sd, ttwu_move_affine);
- schedstat_inc(p, se.nr_wakeups_affine);
- goto out_set_cpu;
- }
- }
+ if (!this_sd)
+ goto out_keep_cpu;
- /*
- * Start passive balancing when half the imbalance_pct
- * limit is reached.
- */
- if (this_sd->flags & SD_WAKE_BALANCE) {
- if (imbalance*this_load <= 100*load) {
- schedstat_inc(this_sd, ttwu_move_balance);
- schedstat_inc(p, se.nr_wakeups_passive);
- goto out_set_cpu;
- }
+ idx = this_sd->wake_idx;
+
+ imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
+
+ load = source_load(cpu, idx);
+ this_load = target_load(this_cpu, idx);
+
+ new_cpu = this_cpu; /* Wake to this CPU if we can */
+
+ if (wake_affine(rq, this_sd, p, cpu, this_cpu, sync, idx,
+ load, this_load, imbalance))
+ goto out_set_cpu;
+
+ /*
+ * Start passive balancing when half the imbalance_pct
+ * limit is reached.
+ */
+ if (this_sd->flags & SD_WAKE_BALANCE) {
+ if (imbalance*this_load <= 100*load) {
+ schedstat_inc(this_sd, ttwu_move_balance);
+ schedstat_inc(p, se.nr_wakeups_passive);
+ goto out_set_cpu;
}
}
+out_keep_cpu:
new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
out_set_cpu:
return wake_idle(new_cpu, p);
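For readers following the logic, here is a minimal user-space sketch of the two tests the new wake_affine() helper encapsulates. The function name affine_ok and all load figures are hypothetical; in the kernel the inputs come from source_load()/target_load(), cpu_avg_load_per_task() and p->se.load.weight:

#include <stdio.h>

static int affine_ok(unsigned long tl, unsigned long load,
		     unsigned long target, unsigned long tl_per_task,
		     unsigned long p_weight, unsigned int imbalance)
{
	/* the waking CPU is no more loaded than the task's old CPU,
	 * and adding the task keeps it within the per-task average... */
	if (tl <= load && tl + target <= tl_per_task)
		return 1;
	/* ...or pulling the task stays within the domain's imbalance
	 * tolerance (imbalance is a percentage, e.g. 112 for 12%) */
	if (100 * (tl + p_weight) <= (unsigned long)imbalance * load)
		return 1;
	return 0;
}

int main(void)
{
	/* hypothetical loads in nice-0 weight units */
	printf("%d\n", affine_ok(1024, 2048, 1024, 3072, 1024, 112));
	return 0;
}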
commit 6a6029b8cefe0ca7e82f27f3904dbedba3de4e06
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri Mar 14 22:17:08 2008 +0100
sched: simplify sched_slice()
Use the existing calc_delta_mine() calculation for sched_slice(). This
saves a divide and simplifies the code because we share it with the
other users that divide by cfs_rq->load.
It also improves code size:
text data bss dec hex filename
42659 2740 144 45543 b1e7 sched.o.before
42093 2740 144 44977 afb1 sched.o.after
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 31aa1b9fa762..f2cc59080efa 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -283,12 +283,8 @@ static u64 __sched_period(unsigned long nr_running)
*/
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u64 slice = __sched_period(cfs_rq->nr_running);
-
- slice *= se->load.weight;
- do_div(slice, cfs_rq->load.weight);
-
- return slice;
+ return calc_delta_mine(__sched_period(cfs_rq->nr_running),
+ se->load.weight, &cfs_rq->load);
}
/*
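Why this is an equivalent transformation: sched_slice() computes period * se->load.weight / cfs_rq->load.weight, and calc_delta_mine() produces the same ratio by multiplying with a cached fixed-point inverse. A hedged user-space sketch follows; the constant, helper names and the single-shift form are illustrative (the kernel splits the shift to avoid 64-bit overflow):

#include <stdint.h>
#include <stdio.h>

#define WMULT_SHIFT 32	/* illustrative fixed-point precision */

/* Old form: an explicit 64-bit divide on every call. */
static uint64_t slice_by_div(uint64_t period, uint64_t weight, uint64_t total)
{
	return period * weight / total;
}

/* New form: multiply by a cached inverse; the divide happens only
 * when the inverse is (re)computed, not once per call. */
static uint64_t slice_by_inv(uint64_t period, uint64_t weight, uint64_t total)
{
	uint64_t inv = (1ULL << WMULT_SHIFT) / total;	/* cached per load change */
	return (uint64_t)(((__uint128_t)(period * weight) * inv) >> WMULT_SHIFT);
}

int main(void)
{
	/* hypothetical: 20ms period (in ns), task weight 1024, queue weight 3072 */
	printf("%llu %llu\n",
	       (unsigned long long)slice_by_div(20000000ULL, 1024, 3072),
	       (unsigned long long)slice_by_inv(20000000ULL, 1024, 3072));
	/* the two agree up to fixed-point rounding */
	return 0;
}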
commit e22ecef1d2658ba54ed7d3fdb5d60829fb434c23
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri Mar 14 22:16:08 2008 +0100
sched: fix fair sleepers
Fair sleepers need to scale their latency target down by runqueue
weight. Otherwise busy systems will gain an ever-larger sleep bonus.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 31c4a2988b64..31aa1b9fa762 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -528,8 +528,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
if (!initial) {
/* sleeps upto a single latency don't count. */
- if (sched_feat(NEW_FAIR_SLEEPERS))
- vruntime -= sysctl_sched_latency;
+ if (sched_feat(NEW_FAIR_SLEEPERS)) {
+ vruntime -= calc_delta_fair(sysctl_sched_latency,
+ &cfs_rq->load);
+ }
/* ensure we never gain time by being placed backwards. */
vruntime = max_vruntime(se->vruntime, vruntime);
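The effect of the change in numbers, as a sketch: for this call site calc_delta_fair() boils down to latency * NICE_0_LOAD / cfs_rq->load.weight, so heavily loaded queues hand out proportionally smaller sleeper credit. Helper name and the NICE_0_LOAD value of 1024 are assumptions here:

#include <stdio.h>

static unsigned long long sleeper_bonus(unsigned long long latency_ns,
					unsigned long long rq_weight)
{
	const unsigned long long NICE_0_LOAD = 1024;	/* assumed nice-0 weight */
	return latency_ns * NICE_0_LOAD / rq_weight;	/* what calc_delta_fair() computes */
}

int main(void)
{
	/* one nice-0 task: the full 20ms bonus; four of them: 5ms each */
	printf("%llu\n", sleeper_bonus(20000000ULL, 1024));
	printf("%llu\n", sleeper_bonus(20000000ULL, 4096));
	return 0;
}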
commit 27d117266097101dcf79c4576903cdcdd0eabffc
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri Mar 14 22:20:01 2008 +0100
sched: fix calc_delta_mine()
lw->weight can be 0 for a short time during bootup.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
diff --git a/kernel/sched.c b/kernel/sched.c
index 3a4ba3dc0f49..6b06f23261c0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1084,7 +1084,7 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
u64 tmp;
if (unlikely(!lw->inv_weight))
- lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1;
+ lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1);
tmp = (u64)delta_exec * weight;
/*
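A standalone sketch of the failure and the fix, with an illustrative fixed-point constant: the old expression divided by lw->weight itself, so a transiently zero weight during bootup faulted, while dividing by (weight + 1) is always safe and differs negligibly for real weights:

#include <stdio.h>

#define WMULT_CONST (1ULL << 32)	/* illustrative fixed-point scale */

/* Before: (WMULT_CONST - weight/2) / weight + 1 -- division by zero
 * when weight == 0. After: the +1 moves inside the divisor. */
static unsigned long long inv_weight(unsigned long long weight)
{
	return (WMULT_CONST - weight / 2) / (weight + 1);
}

int main(void)
{
	printf("%llu\n", inv_weight(0));	/* safe: no division by zero */
	printf("%llu\n", inv_weight(1024));	/* roughly 2^32 / 1024 */
	return 0;
}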
commit e89996ae3f9e88d4fd75751a15c10b19d197e702
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri Mar 14 23:48:28 2008 +0100
sched: fix update_load_add()/sub()
Clear the cached inverse value when updating load. This is needed for
calc_delta_mine() to work correctly when using the rq load.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
diff --git a/kernel/sched.c b/kernel/sched.c
index 9df9ba73cb7a..3a4ba3dc0f49 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1108,11 +1108,13 @@ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
lw->weight += inc;
+ lw->inv_weight = 0;
}
static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
lw->weight -= dec;
+ lw->inv_weight = 0;
}
/*
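A minimal sketch of the staleness bug being fixed, with simplified types and a simplified calc() standing in for calc_delta_mine(): the inverse is cached lazily, so without the reset a weight change keeps scaling by the old weight's inverse:

#include <stdio.h>

struct load_weight {
	unsigned long long weight;
	unsigned long long inv_weight;	/* cached ~2^32/weight; 0 means "recompute" */
};

static unsigned long long calc(unsigned long long delta, struct load_weight *lw)
{
	if (!lw->inv_weight)			/* lazily cache the inverse */
		lw->inv_weight = (1ULL << 32) / (lw->weight + 1);
	return (delta * lw->inv_weight) >> 32;
}

int main(void)
{
	struct load_weight lw = { .weight = 1024 };

	calc(1 << 20, &lw);		/* caches the inverse for weight 1024 */
	lw.weight += 1024;		/* without zeroing inv_weight ... */
	printf("%llu\n", calc(1 << 20, &lw)); /* ~1022: still scaled as if weight were 1024 */
	lw.inv_weight = 0;		/* the fix: invalidate on every update */
	printf("%llu\n", calc(1 << 20, &lw)); /* ~511: correct for weight 2048 */
	return 0;
}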
commit 9a46d7e5b63903a70cd96c2c1391a7a26a8dbec9
Author: Ingo Molnar <mingo@elte.hu>
Date: Tue Feb 26 09:30:32 2008 +0100
x86: ioremap, remove WARN_ON()
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index ac3c959e271d..8fe576baa148 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -134,8 +134,6 @@ static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
return NULL;
}
- WARN_ON_ONCE(page_is_ram(pfn));
-
switch (mode) {
case IOR_MODE_UNCACHED:
default:
commit f5dbb55b995b77d396fe2204495a0af3e24d28c2
Author: Ingo Molnar <mingo@elte.hu>
Date: Mon Mar 10 18:04:34 2008 +0100
fix BIOS PCI config cycle buglet causing ACPI boot regression
I figured out another ACPI-related regression today.
randconfig testing triggered an early boot-time hang on a laptop of mine
(32-bit x86, config attached) - the screen was scrolling ACPI AML
exceptions [with no serial port and no early debugging available].
v2.6.24 works fine on that laptop with the same .config, so after a few
hours of bisection (had to restart it 3 times - other regressions
interacted), it homed in on this commit:
| 10270d4838bdc493781f5a1cf2e90e9c34c9142f is first bad commit
|
| Author: Linus Torvalds <torvalds@woody.linux-foundation.org>
| Date: Wed Feb 13 09:56:14 2008 -0800
|
| acpi: fix acpi_os_read_pci_configuration() misuse of raw_pci_read()
Reverting this commit on top of -rc5 gave a correctly booting kernel.
But this commit fixes a real bug, so the real question is: why did it
break the bootup?
After quite some head-scratching, the following change stood out:
- pci_id->bus = tu8;
+ pci_id->bus = val;
pci_id->bus is defined as u16:
struct acpi_pci_id {
u16 segment;
u16 bus;
...
and 'tu8' changed from u8 to u32. So previously we'd unconditionally
mask the return value of acpi_os_read_pci_configuration()
(raw_pci_read()) to 8 bits, but now we just trust whatever comes back
from the PCI access routines and only crop it to 16 bits.
But if the high 8 bits of that result contain any noise then we'll
write that into ACPI's PCI ID descriptor and confuse the heck out of the
rest of ACPI.
So let's check the PCI-BIOS code on that theory. We have this codepath
for 8-bit accesses (arch/x86/pci/pcbios.c:pci_bios_read()):
switch (len) {
case 1:
__asm__("lcall *(%%esi); cld\n\t"
"jc 1f\n\t"
"xor %%ah, %%ah\n"
"1:"
: "=c" (*value),
"=a" (result)
: "1" (PCIBIOS_READ_CONFIG_BYTE),
"b" (bx),
"D" ((long)reg),
"S" (&pci_indirect));
Aha! The "=a" output constraint puts the full 32 bits of EAX into
*value. But if the BIOS's routines set any of the high bits to nonzero,
we'll return a value with more bits set than intended.
The other, more common PCI access methods (v1 and v2 PCI reads) clear
out the high bits already, for example pci_conf1_read() does:
switch (len) {
case 1:
*value = inb(0xCFC + (reg & 3));
which explicitly converts the return byte up to 32 bits and zero-extends
it.
So zero-extending the result in the PCI-BIOS read routine fixes the
regression on my laptop. (It might fix some other long-standing issues
we had with PCI-BIOS during the past decade...) Both 8-bit and 16-bit
accesses were buggy.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 10ac8c316c46..2f7109ac4c15 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -198,6 +198,11 @@ static int pci_bios_read(unsigned int seg, unsigned int bus,
"b" (bx),
"D" ((long)reg),
"S" (&pci_indirect));
+ /*
+ * Zero-extend the result beyond 8 bits, do not trust the
+ * BIOS having done it:
+ */
+ *value &= 0xff;
break;
case 2:
__asm__("lcall *(%%esi); cld\n\t"
@@ -210,6 +215,11 @@ static int pci_bios_read(unsigned int seg, unsigned int bus,
"b" (bx),
"D" ((long)reg),
"S" (&pci_indirect));
+ /*
+ * Zero-extend the result beyond 16 bits, do not trust the
+ * BIOS having done it:
+ */
+ *value &= 0xffff;
break;
case 4:
__asm__("lcall *(%%esi); cld\n\t"
commit 80d38f9a7871d9bafc3f244dabe48b41a58de705
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri Mar 7 10:47:43 2008 +0100
drivers/char/esp.c: fix bootup lockup
randconfig testing found a bootup lockup in drivers/char/esp.c because
of a spinlock that wasn't correctly initialized.
I'm not sure why it became more prominent in 2.6.25-rc4; the bug seems
rather old, and I've been doing allyesconfig bootups for ages with
CONFIG_ESP enabled.
This fixes this bootup lockup:
PM: Adding info for No Bus:ttyP63
ttyP32 at 0x0240 (irq = 0) is an ESP primary port
BUG: spinlock lockup on CPU#0, swapper/1, f56dd004
Pid: 1, comm: swapper Not tainted 2.6.25-rc4-sched-devel.git-x86-latest.git #402
[<c03ac6f4>] _raw_spin_lock+0x134/0x140
[<c08649be>] _spin_lock_irqsave+0x5e/0x80
[<c0b9fbfe>] ? espserial_init+0x2be/0x6e0
[<c0b9fbfe>] espserial_init+0x2be/0x6e0
[<c0b877a3>] kernel_init+0x83/0x260
[<c0b9f940>] ? espserial_init+0x0/0x6e0
[<c010416a>] ? restore_nocheck_notrace+0x0/0xe
[<c0b87720>] ? kernel_init+0x0/0x260
[<c0b87720>] ? kernel_init+0x0/0x260
[<c0104507>] kernel_thread_helper+0x7/0x10
=======================
kzalloc() is not the way to initialize spinlocks anymore.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/drivers/char/esp.c b/drivers/char/esp.c
index c01e26d9ee5e..f3fe62067344 100644
--- a/drivers/char/esp.c
+++ b/drivers/char/esp.c
@@ -2484,6 +2484,7 @@ static int __init espserial_init(void)
return 0;
}
+ spin_lock_init(&info->lock);
/* rx_trigger, tx_trigger are needed by autoconfig */
info->config.rx_trigger = rx_trigger;
info->config.tx_trigger = tx_trigger;
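The pattern the one-liner restores, as a hedged sketch (struct my_port and the allocation helper are hypothetical, not the driver's real names): kzalloc() returns all-zero memory, but an all-zero pattern is not a validly initialized spinlock once debug or lockdep variants carry magic and owner fields, so every embedded lock needs an explicit spin_lock_init():

#include <linux/slab.h>
#include <linux/spinlock.h>

struct my_port {			/* hypothetical driver state */
	spinlock_t lock;
	int rx_trigger;
	int tx_trigger;
};

static struct my_port *my_port_alloc(void)
{
	struct my_port *info = kzalloc(sizeof(*info), GFP_KERNEL);

	if (!info)
		return NULL;
	/* kzalloc() zeroes the struct, but that does not initialize
	 * the spinlock: debug builds expect state set up by
	 * spin_lock_init(), and taking an uninitialized lock shows up
	 * exactly as the "spinlock lockup" splat above. */
	spin_lock_init(&info->lock);
	return info;
}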
commit 7432d149fda8ce9ead9df91e577b83ce52ad5f65
Author: Ingo Molnar <mingo@elte.hu>
Date: Thu Mar 6 18:29:43 2008 +0100
x86: re-add reboot fixups
Jan Beulich noticed that the reboot fixups went missing during
reboot.c unification.
(commit 4d022e35fd7e07c522c7863fee6f07e53cf3fc14)
Geode and a few other rare boards with special reboot quirks are
affected.
Reported-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 7fd6ac43e4a1..55ceb8cdef75 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -326,6 +326,10 @@ static inline void kb_wait(void)
}
}
+void __attribute__((weak)) mach_reboot_fixups(void)
+{
+}
+
static void native_machine_emergency_restart(void)
{
int i;
@@ -337,6 +341,8 @@ static void native_machine_emergency_restart(void)
/* Could also try the reset bit in the Hammer NB */
switch (reboot_type) {
case BOOT_KBD:
+ mach_reboot_fixups(); /* for board specific fixups */
+
for (i = 0; i < 10; i++) {
kb_wait();
udelay(50);
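The mechanism relied on here: __attribute__((weak)) provides a default no-op, and any object file that defines a strong symbol of the same name silently replaces it at link time, so boards without quirks pay nothing. A minimal sketch, with the overriding file shown in a comment since it lives in a separate translation unit:

/* reboot.c -- the weak default; links in when nothing overrides it */
void __attribute__((weak)) mach_reboot_fixups(void)
{
	/* no board-specific quirks by default */
}

void emergency_restart_demo(void)
{
	mach_reboot_fixups();	/* board hook runs before the generic
				   keyboard-controller reset attempts */
}

/* geode.c (separate object file, hypothetical) -- a strong definition
 * with the same name wins at link time:
 *
 *	void mach_reboot_fixups(void)
 *	{
 *		// write the board's reset register here
 *	}
 */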
commit d47846c5866b7d98a1173c86a39d810a06647329
Author: Ingo Molnar <mingo@elte.hu>
Date: Tue Mar 4 14:54:47 2008 +0100
sysfs: CONFIG_SYSFS_DEPRECATED fix
CONFIG_SYSFS_DEPRECATED=y changed its meaning recently and causes
regressions in working setups that had SYSFS_DEPRECATED disabled.
So rename it to SYSFS_DEPRECATED_V2, so that testers pick up the new
default via 'make oldconfig' even if their old .configs disabled
CONFIG_SYSFS_DEPRECATED.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
diff --git a/init/Kconfig b/init/Kconfig
index e6606e6e99e4..98ebf3725412 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -367,9 +367,13 @@ config RESOURCE_COUNTERS
depends on CGROUPS
config SYSFS_DEPRECATED
+ bool
+
+config SYSFS_DEPRECATED_V2
bool "Create deprecated sysfs files"
depends on SYSFS
default y
+ select SYSFS_DEPRECATED
help
This option creates deprecated symlinks such as the
"device"-link, the <subsystem>:<name>-link, and the