Patches contributed by Eötvös Lorand University
commit 8018c27b26af56af18eb8b2dc600eba825792d8f
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri Dec 29 16:50:01 2006 -0800
[PATCH] kvm: fix GFP_KERNEL allocation in atomic section in kvm_dev_ioctl_create_vcpu()
fix a GFP_KERNEL allocation in atomic section: kvm_dev_ioctl_create_vcpu()
called kvm_mmu_init(), which calls alloc_pages(), while holding the vcpu.
The fix is to set up the MMU state in two phases: kvm_mmu_create() and
kvm_mmu_setup().
(NOTE: free_vcpus does a kvm_mmu_destroy() call so there's no need for any
extra teardown branch on allocation/init failure here.)
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Avi Kivity <avi@qumranet.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index 2670219a9264..100df6f38d92 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -319,7 +319,8 @@ int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module);
void kvm_exit_arch(void);
void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
-int kvm_mmu_init(struct kvm_vcpu *vcpu);
+int kvm_mmu_create(struct kvm_vcpu *vcpu);
+int kvm_mmu_setup(struct kvm_vcpu *vcpu);
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index 973544553cba..ce7fe640f18d 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -522,12 +522,14 @@ static int kvm_dev_ioctl_create_vcpu(struct kvm *kvm, int n)
if (r < 0)
goto out_free_vcpus;
- kvm_arch_ops->vcpu_load(vcpu);
+ r = kvm_mmu_create(vcpu);
+ if (r < 0)
+ goto out_free_vcpus;
- r = kvm_arch_ops->vcpu_setup(vcpu);
+ kvm_arch_ops->vcpu_load(vcpu);
+ r = kvm_mmu_setup(vcpu);
if (r >= 0)
- r = kvm_mmu_init(vcpu);
-
+ r = kvm_arch_ops->vcpu_setup(vcpu);
vcpu_put(vcpu);
if (r < 0)
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
index 85887fcd584f..790423c5f23d 100644
--- a/drivers/kvm/mmu.c
+++ b/drivers/kvm/mmu.c
@@ -639,28 +639,22 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
return -ENOMEM;
}
-int kvm_mmu_init(struct kvm_vcpu *vcpu)
+int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
- int r;
-
ASSERT(vcpu);
ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
ASSERT(list_empty(&vcpu->free_pages));
- r = alloc_mmu_pages(vcpu);
- if (r)
- goto out;
-
- r = init_kvm_mmu(vcpu);
- if (r)
- goto out_free_pages;
+ return alloc_mmu_pages(vcpu);
+}
- return 0;
+int kvm_mmu_setup(struct kvm_vcpu *vcpu)
+{
+ ASSERT(vcpu);
+ ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
+ ASSERT(!list_empty(&vcpu->free_pages));
-out_free_pages:
- free_mmu_pages(vcpu);
-out:
- return r;
+ return init_kvm_mmu(vcpu);
}
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
commit 9414232fa0cc28e2f51b8c76d260f2748f7953fc
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri Dec 29 16:48:13 2006 -0800
[PATCH] sched: fix cond_resched_softirq() offset
Remove the __resched_legal() check: it is conceptually broken. The biggest
problem it had is that it can mask buggy cond_resched() calls. A
cond_resched() call is only legal if we are not in an atomic context, with
two narrow exceptions:
- if the system is booting
- a reacquire_kernel_lock() down() done while PREEMPT_ACTIVE is set
But __resched_legal() hid this and just silently returned whenever
these primitives were called from invalid contexts. (Same goes for
cond_resched_locked() and cond_resched_softirq()).
Furthermore, the __resched_legal(0) call was buggy in that it caused
unnecessarily long softirq latencies via cond_resched_softirq(). (which is
only called from softirq-off sections, hence the code did nothing.)
The fix is to resurrect the efficiency of the might_sleep checks and to
only allow the narrow exceptions.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/kernel/sched.c b/kernel/sched.c
index b515e3caad7f..3df33da0dafc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4617,17 +4617,6 @@ asmlinkage long sys_sched_yield(void)
return 0;
}
-static inline int __resched_legal(int expected_preempt_count)
-{
-#ifdef CONFIG_PREEMPT
- if (unlikely(preempt_count() != expected_preempt_count))
- return 0;
-#endif
- if (unlikely(system_state != SYSTEM_RUNNING))
- return 0;
- return 1;
-}
-
static void __cond_resched(void)
{
#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
@@ -4647,7 +4636,8 @@ static void __cond_resched(void)
int __sched cond_resched(void)
{
- if (need_resched() && __resched_legal(0)) {
+ if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
+ system_state == SYSTEM_RUNNING) {
__cond_resched();
return 1;
}
@@ -4673,7 +4663,7 @@ int cond_resched_lock(spinlock_t *lock)
ret = 1;
spin_lock(lock);
}
- if (need_resched() && __resched_legal(1)) {
+ if (need_resched() && system_state == SYSTEM_RUNNING) {
spin_release(&lock->dep_map, 1, _THIS_IP_);
_raw_spin_unlock(lock);
preempt_enable_no_resched();
@@ -4689,7 +4679,7 @@ int __sched cond_resched_softirq(void)
{
BUG_ON(!in_softirq());
- if (need_resched() && __resched_legal(0)) {
+ if (need_resched() && system_state == SYSTEM_RUNNING) {
raw_local_irq_disable();
_local_bh_enable();
raw_local_irq_enable();
commit e4e6bdbb426d1ecd9e4587f22115f8d0d426d21f
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri Dec 29 16:47:14 2006 -0800
[PATCH] rcu: rcutorture suspend fix
Fix suspend hang: rcutorture threads need to be nofreeze.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index c52f981ea008..482b11ff65cb 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -522,6 +522,7 @@ rcu_torture_writer(void *arg)
VERBOSE_PRINTK_STRING("rcu_torture_writer task started");
set_user_nice(current, 19);
+ current->flags |= PF_NOFREEZE;
do {
schedule_timeout_uninterruptible(1);
@@ -561,6 +562,7 @@ rcu_torture_fakewriter(void *arg)
VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started");
set_user_nice(current, 19);
+ current->flags |= PF_NOFREEZE;
do {
schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
@@ -591,6 +593,7 @@ rcu_torture_reader(void *arg)
VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
set_user_nice(current, 19);
+ current->flags |= PF_NOFREEZE;
do {
idx = cur_ops->readlock();
commit 52e88f5d4a6b06f3a945728dd3bc403632afe069
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri Dec 29 16:47:10 2006 -0800
[PATCH] change WARN_ON back to "BUG: at ..."
WARN_ON() ever triggering is a kernel bug. Do not try to paper over this
fact by suggesting to the user that this is 'only' a warning, as the
following recent commit does:
commit 30e25b71e725b150585e17888b130e3324f8cf7c
Author: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Fri Dec 8 02:36:24 2006 -0800
[PATCH] Fix generic WARN_ON message
A warning is a warning, not a BUG.
( it might make sense to rename BUG() to CRASH() and BUG_ON() to
CRASH_ON(), but that does not change the fact that WARN_ON()
signals a kernel bug. )
i and others objected to this change during lkml review:
http://marc.theaimsgroup.com/?l=linux-kernel&m=116115160710533&w=2
still the change slipped upstream - grumble :)
Also, use the standard "BUG: " format to make it easier to grep logs and
to make it easier to google for kernel bugs.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index a06eecd48292..14fae1fa87df 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -35,7 +35,7 @@ struct bug_entry {
#define WARN_ON(condition) ({ \
typeof(condition) __ret_warn_on = (condition); \
if (unlikely(__ret_warn_on)) { \
- printk("WARNING at %s:%d %s()\n", __FILE__, \
+ printk("BUG: at %s:%d %s()\n", __FILE__, \
__LINE__, __FUNCTION__); \
dump_stack(); \
} \
commit e1d9fd2e3d33b2fec3207171ec8ca6e71d5c81c7
Author: Ingo Molnar <mingo@elte.hu>
Date: Sat Dec 23 16:55:29 2006 +0100
[PATCH] suspend: fix suspend on single-CPU systems
Clark Williams reported that suspend doesn't work on his laptop on
2.6.20-rc1-rt kernels. The bug was introduced by the following cleanup
commit:
commit 112cecb2cc0e7341db92281ba04b26c41bb8146d
Author: Siddha, Suresh B <suresh.b.siddha@intel.com>
Date: Wed Dec 6 20:34:31 2006 -0800
[PATCH] suspend: don't change cpus_allowed for task initiating the suspend
because with this change 'error' is not initialized to 0 anymore, if
there are no other online CPUs. (i.e. if the system is single-CPU).
the fix is to initialize it to 0. The really weird thing is that my
version of gcc does not warn about this non-initialized variable
situation ...
(also fix the kernel printk in the error branch, it was missing a
newline)
Reported-by: Clark Williams <williams@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 9124669f4586..241064a32241 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -258,7 +258,7 @@ static cpumask_t frozen_cpus;
int disable_nonboot_cpus(void)
{
- int cpu, first_cpu, error;
+ int cpu, first_cpu, error = 0;
mutex_lock(&cpu_add_remove_lock);
first_cpu = first_cpu(cpu_present_map);
@@ -294,7 +294,7 @@ int disable_nonboot_cpus(void)
/* Make sure the CPUs won't be enabled by someone else */
cpu_hotplug_disabled = 1;
} else {
- printk(KERN_ERR "Non-boot CPUs are not disabled");
+ printk(KERN_ERR "Non-boot CPUs are not disabled\n");
}
out:
mutex_unlock(&cpu_add_remove_lock);
commit 0888f06ac99f993df2bb4c479f5b9306dafe154f
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri Dec 22 01:11:56 2006 -0800
[PATCH] sched: fix bad missed wakeups in the i386, x86_64, ia64, ACPI and APM idle code
Fernando Lopez-Lezcano reported frequent scheduling latencies and audio
xruns starting at the 2.6.18-rt kernel, and those problems persisted all
until current -rt kernels. The latencies were serious and unjustified by
system load, often in the milliseconds range.
After a patient and heroic multi-month effort of Fernando, where he
tested dozens of kernels, tried various configs, boot options,
test-patches of mine and provided latency traces of those incidents, the
following 'smoking gun' trace was captured by him:
_------=> CPU#
/ _-----=> irqs-off
| / _----=> need-resched
|| / _---=> hardirq/softirq
||| / _--=> preempt-depth
|||| /
||||| delay
cmd pid ||||| time | caller
\ / ||||| \ | /
IRQ_19-1479 1D..1 0us : __trace_start_sched_wakeup (try_to_wake_up)
IRQ_19-1479 1D..1 0us : __trace_start_sched_wakeup <<...>-5856> (37 0)
IRQ_19-1479 1D..1 0us : __trace_start_sched_wakeup (c01262ba 0 0)
IRQ_19-1479 1D..1 0us : resched_task (try_to_wake_up)
IRQ_19-1479 1D..1 0us : __spin_unlock_irqrestore (try_to_wake_up)
...
<idle>-0 1...1 11us!: default_idle (cpu_idle)
...
<idle>-0 0Dn.1 602us : smp_apic_timer_interrupt (c0103baf 1 0)
...
<...>-5856 0D..2 618us : __switch_to (__schedule)
<...>-5856 0D..2 618us : __schedule <<idle>-0> (20 162)
<...>-5856 0D..2 619us : __spin_unlock_irq (__schedule)
<...>-5856 0...1 619us : trace_stop_sched_switched (__schedule)
<...>-5856 0D..1 619us : trace_stop_sched_switched <<...>-5856> (37 0)
what is visible in this trace is that CPU#1 ran try_to_wake_up() for
PID:5856, it placed PID:5856 on CPU#0's runqueue and ran resched_task()
for CPU#0. But it decided not to send an IPI to that CPU - due to
TS_POLLING. But CPU#0 never woke up after its NEED_RESCHED bit was set,
and only rescheduled to PID:5856 upon the next lapic timer IRQ. The
result was a 600+ usecs latency and a missed wakeup!
the bug turned out to be an idle-wakeup bug introduced into the mainline
kernel this summer via an optimization in the x86_64 tree:
commit 495ab9c045e1b0e5c82951b762257fe1c9d81564
Author: Andi Kleen <ak@suse.de>
Date: Mon Jun 26 13:59:11 2006 +0200
[PATCH] i386/x86-64/ia64: Move polling flag into thread_info_status
During some profiling I noticed that default_idle causes a lot of
memory traffic. I think that is caused by the atomic operations
to clear/set the polling flag in thread_info. There is actually
no reason to make this atomic - only the idle thread does it
to itself, other CPUs only read it. So I moved it into ti->status.
the problem is this type of change:
if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
- clear_thread_flag(TIF_POLLING_NRFLAG);
+ current_thread_info()->status &= ~TS_POLLING;
smp_mb__after_clear_bit();
while (!need_resched()) {
local_irq_disable();
this changes clear_thread_flag() to an explicit clearing of TS_POLLING.
clear_thread_flag() is defined as:
clear_bit(flag, &ti->flags);
and clear_bit() is a LOCK-ed atomic instruction on all x86 platforms:
static inline void clear_bit(int nr, volatile unsigned long * addr)
{
__asm__ __volatile__( LOCK_PREFIX
"btrl %1,%0"
hence smp_mb__after_clear_bit() is defined as a simple compile barrier:
#define smp_mb__after_clear_bit() barrier()
but the explicit TS_POLLING clearing introduced by the patch:
+ current_thread_info()->status &= ~TS_POLLING;
is not an atomic op! So the clearing of the TS_POLLING bit is freely
reorderable with the reading of the NEED_RESCHED bit - and both now
reside in different memory addresses.
CPU idle wakeup very much depends on ordered memory ops, the clearing of
the TS_POLLING flag must always be done before we test need_resched()
and hit the idle instruction(s). [Symmetrically, the wakeup code needs
to set NEED_RESCHED before it tests the TS_POLLING flag, so memory
ordering is paramount.]
Fernando's dual-core Athlon64 system has a sufficiently advanced memory
ordering model so that it triggered this scenario very often.
( And it also turned out that the reason why these latencies never
triggered on my testsystems is that i routinely use idle=poll, which
was the only idle variant not affected by this bug. )
The fix is to change the smp_mb__after_clear_bit() to an smp_mb(), to
act as an absolute barrier between the TS_POLLING write and the
NEED_RESCHED read. This affects almost all idling methods (default,
ACPI, APM), on all 3 x86 architectures: i386, x86_64, ia64.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Tested-by: Fernando Lopez-Lezcano <nando@ccrma.Stanford.EDU>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index b75cff25de4b..199016927541 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -785,7 +785,11 @@ static int apm_do_idle(void)
polling = !!(current_thread_info()->status & TS_POLLING);
if (polling) {
current_thread_info()->status &= ~TS_POLLING;
- smp_mb__after_clear_bit();
+ /*
+ * TS_POLLING-cleared state must be visible before we
+ * test NEED_RESCHED:
+ */
+ smp_mb();
}
if (!need_resched()) {
idled = 1;
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 99308510a17c..c641056233a6 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -102,7 +102,12 @@ void default_idle(void)
{
if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
current_thread_info()->status &= ~TS_POLLING;
- smp_mb__after_clear_bit();
+ /*
+ * TS_POLLING-cleared state must be visible before we
+ * test NEED_RESCHED:
+ */
+ smp_mb();
+
local_irq_disable();
if (!need_resched())
safe_halt(); /* enables interrupts racelessly */
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index 51922b98086a..17685abaf496 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -268,10 +268,16 @@ cpu_idle (void)
/* endless idle loop with no priority at all */
while (1) {
- if (can_do_pal_halt)
+ if (can_do_pal_halt) {
current_thread_info()->status &= ~TS_POLLING;
- else
+ /*
+ * TS_POLLING-cleared state must be visible before we
+ * test NEED_RESCHED:
+ */
+ smp_mb();
+ } else {
current_thread_info()->status |= TS_POLLING;
+ }
if (!need_resched()) {
void (*idle)(void);
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index a418ee4c8c62..cbbc6adc1a92 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -109,7 +109,11 @@ void exit_idle(void)
static void default_idle(void)
{
current_thread_info()->status &= ~TS_POLLING;
- smp_mb__after_clear_bit();
+ /*
+ * TS_POLLING-cleared state must be visible before we
+ * test NEED_RESCHED:
+ */
+ smp_mb();
local_irq_disable();
if (!need_resched()) {
/* Enables interrupts one instruction before HLT.
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 65b3f056ad89..6dac6050bb5a 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -211,7 +211,11 @@ acpi_processor_power_activate(struct acpi_processor *pr,
static void acpi_safe_halt(void)
{
current_thread_info()->status &= ~TS_POLLING;
- smp_mb__after_clear_bit();
+ /*
+ * TS_POLLING-cleared state must be visible before we
+ * test NEED_RESCHED:
+ */
+ smp_mb();
if (!need_resched())
safe_halt();
current_thread_info()->status |= TS_POLLING;
@@ -345,7 +349,11 @@ static void acpi_processor_idle(void)
*/
if (cx->type == ACPI_STATE_C2 || cx->type == ACPI_STATE_C3) {
current_thread_info()->status &= ~TS_POLLING;
- smp_mb__after_clear_bit();
+ /*
+ * TS_POLLING-cleared state must be visible before we
+ * test NEED_RESCHED:
+ */
+ smp_mb();
if (need_resched()) {
current_thread_info()->status |= TS_POLLING;
local_irq_enable();
commit 9127d4b1d9b2e8fba8e7fbc7f88ea93e5eb01396
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri Dec 22 01:08:52 2006 -0800
[PATCH] lock debugging: fix DEBUG_LOCKS_WARN_ON() & debug_locks_silent
Matthew Wilcox noticed that the debug_locks_silent use should be inverted
in DEBUG_LOCKS_WARN_ON(). This bug was causing spurious stacktraces and
incorrect failures in the locking self-test on the parisc kernel.
Bug-found-by: Matthew Wilcox <matthew@wil.cx>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h
index a1c10b0c4cf0..1678a5de7013 100644
--- a/include/linux/debug_locks.h
+++ b/include/linux/debug_locks.h
@@ -24,7 +24,7 @@ extern int debug_locks_off(void);
int __ret = 0; \
\
if (unlikely(c)) { \
- if (debug_locks_silent || debug_locks_off()) \
+ if (debug_locks_off() && !debug_locks_silent) \
WARN_ON(1); \
__ret = 1; \
} \
commit 9bfb18392ef586467277fa25d8f3a7a93611f6df
Author: Ingo Molnar <mingo@elte.hu>
Date: Mon Dec 18 20:05:09 2006 +0100
[PATCH] workqueue: fix schedule_on_each_cpu()
fix the schedule_on_each_cpu() implementation: __queue_work() is now
stricter, hence set the work-pending bit before passing in the new work.
(found in the -rt tree, using Peter Zijlstra's files-lock scalability
patchset)
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 742cbbe49bdc..180a8ce11535 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -637,9 +637,11 @@ int schedule_on_each_cpu(work_func_t func)
mutex_lock(&workqueue_mutex);
for_each_online_cpu(cpu) {
- INIT_WORK(per_cpu_ptr(works, cpu), func);
- __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
- per_cpu_ptr(works, cpu));
+ struct work_struct *work = per_cpu_ptr(works, cpu);
+
+ INIT_WORK(work, func);
+ set_bit(WORK_STRUCT_PENDING, work_data_bits(work));
+ __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
}
mutex_unlock(&workqueue_mutex);
flush_workqueue(keventd_wq);
commit 136f1e7a8cb7d17ff91706518549697071640ae4
Author: Ingo Molnar <mingo@elte.hu>
Date: Wed Dec 20 11:53:32 2006 +0100
[PATCH] x86_64: fix boot time hang in detect_calgary()
if CONFIG_CALGARY_IOMMU is built into the kernel via
CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT, or is enabled via the
iommu=calgary boot option, then the detect_calgary() function runs to
detect the presence of a Calgary IOMMU.
detect_calgary() first searches the BIOS EBDA area for a "rio_table_hdr"
BIOS table. It has this parsing algorithm for the EBDA:
while (offset) {
...
/* The next offset is stored in the 1st word. 0 means no more */
offset = *((unsigned short *)(ptr + offset));
}
got that? Let's repeat it slowly: we've got a BIOS-supplied data
structure, plus Linux kernel code that will only break out of an
infinite parsing loop once the BIOS gives a zero offset. Ok?
Translation: what an excellent opportunity for BIOS writers to lock up
the Linux boot process in an utterly hard to debug place! Indeed the
BIOS jumped on that opportunity on my box, which has the following EBDA
chaining layout:
384, 65282, 65535, 65535, 65535, 65535, 65535, 65535 ...
see the pattern? So my, definitely non-Calgary system happily locks up
in detect_calgary()!
the patch below fixes the boot hang by trusting the BIOS-supplied data
structure a bit less: the parser always has to make forward progress,
and if it doesn't, we break out of the loop and i get the expected kernel
message:
Calgary: Unable to locate Rio Grande Table in EBDA - bailing!
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Muli Ben-Yehuda <muli@il.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index 3215675ab128..87d90cb68a74 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -1052,7 +1052,7 @@ void __init detect_calgary(void)
void *tbl;
int calgary_found = 0;
unsigned long ptr;
- int offset;
+ unsigned int offset, prev_offset;
int ret;
/*
@@ -1071,15 +1071,20 @@ void __init detect_calgary(void)
ptr = (unsigned long)phys_to_virt(get_bios_ebda());
rio_table_hdr = NULL;
+ prev_offset = 0;
offset = 0x180;
- while (offset) {
+ /*
+ * The next offset is stored in the 1st word.
+ * Only parse up until the offset increases:
+ */
+ while (offset > prev_offset) {
/* The block id is stored in the 2nd word */
if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){
/* set the pointer past the offset & block id */
rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
break;
}
- /* The next offset is stored in the 1st word. 0 means no more */
+ prev_offset = offset;
offset = *((unsigned short *)(ptr + offset));
}
if (!rio_table_hdr) {
commit a9622f6219ce58faba1417743bf3078501eb3434
Author: Ingo Molnar <mingo@elte.hu>
Date: Wed Dec 20 11:28:46 2006 +0100
[PATCH] x86_64: fix boot hang caused by CALGARY_IOMMU_ENABLED_BY_DEFAULT
one of my boxes didn't boot the 2.6.20-rc1-rt0 kernel rpm, it hung during
early bootup. After an hour or two of happy debugging i narrowed it down
to the CALGARY_IOMMU_ENABLED_BY_DEFAULT option, which was freshly added
to 2.6.20 via the x86_64 tree and /enabled by default/.
commit bff6547bb6a4e82c399d74e7fba78b12d2f162ed claims:
[PATCH] Calgary: allow compiling Calgary in but not using it by default
This patch makes it possible to compile Calgary in but not use it by
default. In this mode, use 'iommu=calgary' to activate it.
but the change does not actually practice it:
config CALGARY_IOMMU_ENABLED_BY_DEFAULT
bool "Should Calgary be enabled by default?"
default y
depends on CALGARY_IOMMU
help
Should Calgary be enabled by default? if you choose 'y', Calgary
will be used (if it exists). If you choose 'n', Calgary will not be
used even if it exists. If you choose 'n' and would like to use
Calgary anyway, pass 'iommu=calgary' on the kernel command line.
If unsure, say Y.
it's both 'default y', and says "If unsure, say Y". Clearly not a typo.
disabling this option makes my box boot again. The patch below fixes the
Kconfig entry. Grumble.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index d4275537b25b..ef6672455695 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -480,14 +480,13 @@ config CALGARY_IOMMU
config CALGARY_IOMMU_ENABLED_BY_DEFAULT
bool "Should Calgary be enabled by default?"
- default y
depends on CALGARY_IOMMU
help
- Should Calgary be enabled by default? if you choose 'y', Calgary
+ Should Calgary be enabled by default? If you choose 'y', Calgary
will be used (if it exists). If you choose 'n', Calgary will not be
used even if it exists. If you choose 'n' and would like to use
Calgary anyway, pass 'iommu=calgary' on the kernel command line.
- If unsure, say Y.
+ If unsure, say N.
# need this always selected by IOMMU for the VIA workaround
config SWIOTLB