Patches contributed by Eötvös Loránd University
commit 82a1fcb90287052aabfa235e7ffc693ea003fe69
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri Jan 25 21:08:02 2008 +0100
softlockup: automatically detect hung TASK_UNINTERRUPTIBLE tasks
this patch extends the soft-lockup detector to automatically
detect hung TASK_UNINTERRUPTIBLE tasks. Such hung tasks are
printed the following way:
------------------>
INFO: task prctl:3042 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message
prctl D fd5e3793 0 3042 2997
f6050f38 00000046 00000001 fd5e3793 00000009 c06d8264 c06dae80 00000286
f6050f40 f6050f00 f7d34d90 f7d34fc8 c1e1be80 00000001 f6050000 00000000
f7e92d00 00000286 f6050f18 c0489d1a f6050f40 00006605 00000000 c0133a5b
Call Trace:
[<c04883a5>] schedule_timeout+0x6d/0x8b
[<c04883d8>] schedule_timeout_uninterruptible+0x15/0x17
[<c0133a76>] msleep+0x10/0x16
[<c0138974>] sys_prctl+0x30/0x1e2
[<c0104c52>] sysenter_past_esp+0x5f/0xa5
=======================
2 locks held by prctl/3042:
#0: (&sb->s_type->i_mutex_key#5){--..}, at: [<c0197d11>] do_fsync+0x38/0x7a
#1: (jbd_handle){--..}, at: [<c01ca3d2>] journal_start+0xc7/0xe9
<------------------
the current default timeout is 120 seconds. Such messages are printed
up to 10 times per bootup. If the system has crashed already then the
messages are not printed.
if lockdep is enabled then all held locks are printed as well.
this feature is a natural extension to the softlockup-detector (kernel
locked up without scheduling) and to the NMI watchdog (kernel locked up
with IRQs disabled).
[ Gautham R Shenoy <ego@in.ibm.com>: CPU hotplug fixes. ]
[ Andrew Morton <akpm@linux-foundation.org>: build warning fix. ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h
index 1678a5de7013..f4a5871767f5 100644
--- a/include/linux/debug_locks.h
+++ b/include/linux/debug_locks.h
@@ -47,6 +47,7 @@ struct task_struct;
#ifdef CONFIG_LOCKDEP
extern void debug_show_all_locks(void);
+extern void __debug_show_held_locks(struct task_struct *task);
extern void debug_show_held_locks(struct task_struct *task);
extern void debug_check_no_locks_freed(const void *from, unsigned long len);
extern void debug_check_no_locks_held(struct task_struct *task);
@@ -55,6 +56,10 @@ static inline void debug_show_all_locks(void)
{
}
+static inline void __debug_show_held_locks(struct task_struct *task)
+{
+}
+
static inline void debug_show_held_locks(struct task_struct *task)
{
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 288245f83bd4..0846f1f9e196 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -258,12 +258,17 @@ extern void account_process_tick(struct task_struct *task, int user);
extern void update_process_times(int user);
extern void scheduler_tick(void);
+extern void sched_show_task(struct task_struct *p);
+
#ifdef CONFIG_DETECT_SOFTLOCKUP
extern void softlockup_tick(void);
extern void spawn_softlockup_task(void);
extern void touch_softlockup_watchdog(void);
extern void touch_all_softlockup_watchdogs(void);
extern int softlockup_thresh;
+extern unsigned long sysctl_hung_task_check_count;
+extern unsigned long sysctl_hung_task_timeout_secs;
+extern long sysctl_hung_task_warnings;
#else
static inline void softlockup_tick(void)
{
@@ -1041,6 +1046,11 @@ struct task_struct {
/* ipc stuff */
struct sysv_sem sysvsem;
#endif
+#ifdef CONFIG_DETECT_SOFTLOCKUP
+/* hung task detection */
+ unsigned long last_switch_timestamp;
+ unsigned long last_switch_count;
+#endif
/* CPU-specific state of this task */
struct thread_struct thread;
/* filesystem information */
diff --git a/kernel/fork.c b/kernel/fork.c
index 8dd8ff281009..09c0b90a69cc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1059,6 +1059,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->prev_utime = cputime_zero;
p->prev_stime = cputime_zero;
+#ifdef CONFIG_DETECT_SOFTLOCKUP
+ p->last_switch_count = 0;
+ p->last_switch_timestamp = 0;
+#endif
+
#ifdef CONFIG_TASK_XACCT
p->rchar = 0; /* I/O counter: bytes read */
p->wchar = 0; /* I/O counter: bytes written */
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index e2c07ece367d..3574379f4d62 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3206,7 +3206,11 @@ void debug_show_all_locks(void)
EXPORT_SYMBOL_GPL(debug_show_all_locks);
-void debug_show_held_locks(struct task_struct *task)
+/*
+ * Careful: only use this function if you are sure that
+ * the task cannot run in parallel!
+ */
+void __debug_show_held_locks(struct task_struct *task)
{
if (unlikely(!debug_locks)) {
printk("INFO: lockdep is turned off.\n");
@@ -3214,6 +3218,12 @@ void debug_show_held_locks(struct task_struct *task)
}
lockdep_print_held_locks(task);
}
+EXPORT_SYMBOL_GPL(__debug_show_held_locks);
+
+void debug_show_held_locks(struct task_struct *task)
+{
+ __debug_show_held_locks(task);
+}
EXPORT_SYMBOL_GPL(debug_show_held_locks);
diff --git a/kernel/sched.c b/kernel/sched.c
index c0e2db683e29..5b3d46574eeb 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4945,7 +4945,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
static const char stat_nam[] = "RSDTtZX";
-static void show_task(struct task_struct *p)
+void sched_show_task(struct task_struct *p)
{
unsigned long free = 0;
unsigned state;
@@ -4998,7 +4998,7 @@ void show_state_filter(unsigned long state_filter)
*/
touch_nmi_watchdog();
if (!state_filter || (p->state & state_filter))
- show_task(p);
+ sched_show_task(p);
} while_each_thread(g, p);
touch_all_softlockup_watchdogs();
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 11df812263c8..02f0ad534441 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -8,6 +8,7 @@
*/
#include <linux/mm.h>
#include <linux/cpu.h>
+#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/freezer.h>
@@ -24,7 +25,7 @@ static DEFINE_PER_CPU(unsigned long, print_timestamp);
static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
static int did_panic;
-int softlockup_thresh = 10;
+int softlockup_thresh = 60;
static int
softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
@@ -45,7 +46,7 @@ static struct notifier_block panic_block = {
*/
static unsigned long get_timestamp(int this_cpu)
{
- return cpu_clock(this_cpu) >> 30; /* 2^30 ~= 10^9 */
+ return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
}
void touch_softlockup_watchdog(void)
@@ -100,11 +101,7 @@ void softlockup_tick(void)
now = get_timestamp(this_cpu);
- /* Wake up the high-prio watchdog task every second: */
- if (now > (touch_timestamp + 1))
- wake_up_process(per_cpu(watchdog_task, this_cpu));
-
- /* Warn about unreasonable 10+ seconds delays: */
+ /* Warn about unreasonable delays: */
if (now <= (touch_timestamp + softlockup_thresh))
return;
@@ -121,12 +118,94 @@ void softlockup_tick(void)
spin_unlock(&print_lock);
}
+/*
+ * Have a reasonable limit on the number of tasks checked:
+ */
+unsigned long sysctl_hung_task_check_count = 1024;
+
+/*
+ * Zero means infinite timeout - no checking done:
+ */
+unsigned long sysctl_hung_task_timeout_secs = 120;
+
+long sysctl_hung_task_warnings = 10;
+
+/*
+ * Only do the hung-tasks check on one CPU:
+ */
+static int check_cpu __read_mostly = -1;
+
+static void check_hung_task(struct task_struct *t, unsigned long now)
+{
+ unsigned long switch_count = t->nvcsw + t->nivcsw;
+
+ if (t->flags & PF_FROZEN)
+ return;
+
+ if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
+ t->last_switch_count = switch_count;
+ t->last_switch_timestamp = now;
+ return;
+ }
+ if ((long)(now - t->last_switch_timestamp) <
+ sysctl_hung_task_timeout_secs)
+ return;
+ if (sysctl_hung_task_warnings < 0)
+ return;
+ sysctl_hung_task_warnings--;
+
+ /*
+ * Ok, the task did not get scheduled for more than 2 minutes,
+ * complain:
+ */
+ printk(KERN_ERR "INFO: task %s:%d blocked for more than "
+ "%ld seconds.\n", t->comm, t->pid,
+ sysctl_hung_task_timeout_secs);
+ printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
+ " disables this message.\n");
+ sched_show_task(t);
+ __debug_show_held_locks(t);
+
+ t->last_switch_timestamp = now;
+ touch_nmi_watchdog();
+}
+
+/*
+ * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
+ * a really long time (120 seconds). If that happens, print out
+ * a warning.
+ */
+static void check_hung_uninterruptible_tasks(int this_cpu)
+{
+ int max_count = sysctl_hung_task_check_count;
+ unsigned long now = get_timestamp(this_cpu);
+ struct task_struct *g, *t;
+
+ /*
+ * If the system crashed already then all bets are off,
+ * do not report extra hung tasks:
+ */
+ if ((tainted & TAINT_DIE) || did_panic)
+ return;
+
+ read_lock(&tasklist_lock);
+ do_each_thread(g, t) {
+ if (!--max_count)
+ break;
+ if (t->state & TASK_UNINTERRUPTIBLE)
+ check_hung_task(t, now);
+ } while_each_thread(g, t);
+
+ read_unlock(&tasklist_lock);
+}
+
/*
* The watchdog thread - runs every second and touches the timestamp.
*/
static int watchdog(void *__bind_cpu)
{
struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+ int this_cpu = (long)__bind_cpu;
sched_setscheduler(current, SCHED_FIFO, ¶m);
@@ -135,13 +214,18 @@ static int watchdog(void *__bind_cpu)
/*
* Run briefly once per second to reset the softlockup timestamp.
- * If this gets delayed for more than 10 seconds then the
+ * If this gets delayed for more than 60 seconds then the
* debug-printout triggers in softlockup_tick().
*/
while (!kthread_should_stop()) {
- set_current_state(TASK_INTERRUPTIBLE);
touch_softlockup_watchdog();
- schedule();
+ msleep_interruptible(10000);
+
+ if (this_cpu != check_cpu)
+ continue;
+
+ if (sysctl_hung_task_timeout_secs)
+ check_hung_uninterruptible_tasks(this_cpu);
}
return 0;
@@ -171,6 +255,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
+ check_cpu = any_online_cpu(cpu_online_map);
wake_up_process(per_cpu(watchdog_task, hotcpu));
break;
#ifdef CONFIG_HOTPLUG_CPU
@@ -181,6 +266,15 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
/* Unbind so it can run. Fall thru. */
kthread_bind(per_cpu(watchdog_task, hotcpu),
any_online_cpu(cpu_online_map));
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ if (hotcpu == check_cpu) {
+ cpumask_t temp_cpu_online_map = cpu_online_map;
+
+ cpu_clear(hotcpu, temp_cpu_online_map);
+ check_cpu = any_online_cpu(temp_cpu_online_map);
+ }
+ break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
p = per_cpu(watchdog_task, hotcpu);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c95f3ed34474..96f31c1bc4f0 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -753,6 +753,33 @@ static struct ctl_table kern_table[] = {
.extra1 = &one,
.extra2 = &sixty,
},
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "hung_task_check_count",
+ .data = &sysctl_hung_task_check_count,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "hung_task_timeout_secs",
+ .data = &sysctl_hung_task_timeout_secs,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "hung_task_warnings",
+ .data = &sysctl_hung_task_warnings,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ },
#endif
#ifdef CONFIG_COMPAT
{
commit d0d23b5432fe61229dd3641c5e94d4130bc4e61b
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri Jan 25 21:08:02 2008 +0100
cpu-hotplug: fix build on !CONFIG_SMP
fix build on !CONFIG_SMP.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 3a3ff1c5cbef..0be8d65bc3c8 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -71,19 +71,25 @@ static inline void unregister_cpu_notifier(struct notifier_block *nb)
int cpu_up(unsigned int cpu);
+extern void cpu_hotplug_init(void);
+
#else
static inline int register_cpu_notifier(struct notifier_block *nb)
{
return 0;
}
+
static inline void unregister_cpu_notifier(struct notifier_block *nb)
{
}
+static inline void cpu_hotplug_init(void)
+{
+}
+
#endif /* CONFIG_SMP */
extern struct sysdev_class cpu_sysdev_class;
-extern void cpu_hotplug_init(void);
extern void cpu_maps_update_begin(void);
extern void cpu_maps_update_done(void);
commit 86faf39d0fc04272b05fab1db6d683f3ac7199d1
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri Jan 25 21:07:59 2008 +0100
sched: remove printk_clock references from ia64
remove remaining printk_clock references from ia64.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index 4ac2b1f1bd3b..86028c69861e 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -71,8 +71,6 @@ unsigned long __per_cpu_offset[NR_CPUS];
EXPORT_SYMBOL(__per_cpu_offset);
#endif
-extern void ia64_setup_printk_clock(void);
-
DEFINE_PER_CPU(struct cpuinfo_ia64, cpu_info);
DEFINE_PER_CPU(unsigned long, local_per_cpu_offset);
unsigned long ia64_cycles_per_usec;
@@ -507,8 +505,6 @@ setup_arch (char **cmdline_p)
/* process SAL system table: */
ia64_sal_init(__va(efi.sal_systab));
- ia64_setup_printk_clock();
-
#ifdef CONFIG_SMP
cpu_physical_id(0) = hard_smp_processor_id();
#endif
diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c
index 1f38a3a68390..bb1d24929640 100644
--- a/arch/ia64/sn/kernel/setup.c
+++ b/arch/ia64/sn/kernel/setup.c
@@ -64,7 +64,6 @@ extern void sn_timer_init(void);
extern unsigned long last_time_offset;
extern void (*ia64_mark_idle) (int);
extern void snidle(int);
-extern unsigned long long (*ia64_printk_clock)(void);
unsigned long sn_rtc_cycles_per_second;
EXPORT_SYMBOL(sn_rtc_cycles_per_second);
@@ -360,14 +359,6 @@ sn_scan_pcdp(void)
static unsigned long sn2_rtc_initial;
-static unsigned long long ia64_sn2_printk_clock(void)
-{
- unsigned long rtc_now = rtc_time();
-
- return (rtc_now - sn2_rtc_initial) *
- (1000000000 / sn_rtc_cycles_per_second);
-}
-
/**
* sn_setup - SN platform setup routine
* @cmdline_p: kernel command line
@@ -468,8 +459,6 @@ void __init sn_setup(char **cmdline_p)
platform_intr_list[ACPI_INTERRUPT_CPEI] = IA64_CPE_VECTOR;
- ia64_printk_clock = ia64_sn2_printk_clock;
-
printk("SGI SAL version %x.%02x\n", version >> 8, version & 0x00FF);
/*
commit b842271fbb9c8b5fd0e1c3e1895a3b67ba5bcc54
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri Jan 25 21:07:59 2008 +0100
sched: remove printk_clock()
printk_clock() is obsolete - it has been replaced with cpu_clock().
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c
index f6f3689a86ee..e59b5b84168d 100644
--- a/arch/arm/kernel/time.c
+++ b/arch/arm/kernel/time.c
@@ -79,17 +79,6 @@ static unsigned long dummy_gettimeoffset(void)
}
#endif
-/*
- * An implementation of printk_clock() independent from
- * sched_clock(). This avoids non-bootable kernels when
- * printk_clock is enabled.
- */
-unsigned long long printk_clock(void)
-{
- return (unsigned long long)(jiffies - INITIAL_JIFFIES) *
- (1000000000 / HZ);
-}
-
static unsigned long next_rtc_update;
/*
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 2bb84214e5f1..3ab042720970 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -344,33 +344,6 @@ udelay (unsigned long usecs)
}
EXPORT_SYMBOL(udelay);
-static unsigned long long ia64_itc_printk_clock(void)
-{
- if (ia64_get_kr(IA64_KR_PER_CPU_DATA))
- return sched_clock();
- return 0;
-}
-
-static unsigned long long ia64_default_printk_clock(void)
-{
- return (unsigned long long)(jiffies_64 - INITIAL_JIFFIES) *
- (1000000000/HZ);
-}
-
-unsigned long long (*ia64_printk_clock)(void) = &ia64_default_printk_clock;
-
-unsigned long long printk_clock(void)
-{
- return ia64_printk_clock();
-}
-
-void __init
-ia64_setup_printk_clock(void)
-{
- if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT))
- ia64_printk_clock = ia64_itc_printk_clock;
-}
-
/* IA64 doesn't cache the timezone */
void update_vsyscall_tz(void)
{
diff --git a/kernel/printk.c b/kernel/printk.c
index 5f9d053699f9..3b7c968d0ef9 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -573,11 +573,6 @@ static int __init printk_time_setup(char *str)
__setup("time", printk_time_setup);
-__attribute__((weak)) unsigned long long printk_clock(void)
-{
- return sched_clock();
-}
-
/* Check if we have any console registered that can be called early in boot. */
static int have_callable_console(void)
{
commit d713f519332e029d43eca8462629314eee1ded86
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri Jan 25 21:07:58 2008 +0100
sched: fix CONFIG_PRINT_TIME's reliance on sched_clock()
Stefano Brivio reported weird printk timestamp behavior during
CPU frequency changes:
http://bugzilla.kernel.org/show_bug.cgi?id=9475
fix CONFIG_PRINT_TIME's reliance on sched_clock() and use cpu_clock()
instead.
Reported-and-bisected-by: Stefano Brivio <stefano.brivio@polimi.it>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/printk.c b/kernel/printk.c
index e6c1f36d8c3a..5f9d053699f9 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -707,7 +707,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
loglev_char = default_message_loglevel
+ '0';
}
- t = printk_clock();
+ t = cpu_clock(printk_cpu);
nanosec_rem = do_div(t, 1000000000);
tlen = sprintf(tbuf,
"<%c>[%5lu.%06lu] ",
commit 32a76006683f7b28ae3cc491da37716e002f198e
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri Jan 25 21:07:58 2008 +0100
printk: make printk more robust by not allowing recursion
make printk more robust by allowing recursion only if there's a crash
going on. Also add recursion detection.
I've tested it with an artificially injected printk recursion - instead
of a lockup or spontaneous reboot or other crash, the output was a well
controlled:
[ 41.057335] SysRq : <2>BUG: recent printk recursion!
[ 41.057335] loglevel0-8 reBoot Crashdump show-all-locks(D) tErm Full kIll saK showMem Nice powerOff showPc show-all-timers(Q) unRaw Sync showTasks Unmount shoW-blocked-tasks
also do all this printk-debug logic with irqs disabled.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Reviewed-by: Nick Piggin <npiggin@suse.de>
diff --git a/kernel/printk.c b/kernel/printk.c
index 89011bf8c106..e6c1f36d8c3a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -628,30 +628,57 @@ asmlinkage int printk(const char *fmt, ...)
/* cpu currently holding logbuf_lock */
static volatile unsigned int printk_cpu = UINT_MAX;
+const char printk_recursion_bug_msg [] =
+ KERN_CRIT "BUG: recent printk recursion!\n";
+static int printk_recursion_bug;
+
asmlinkage int vprintk(const char *fmt, va_list args)
{
+ static int log_level_unknown = 1;
+ static char printk_buf[1024];
+
unsigned long flags;
- int printed_len;
+ int printed_len = 0;
+ int this_cpu;
char *p;
- static char printk_buf[1024];
- static int log_level_unknown = 1;
boot_delay_msec();
preempt_disable();
- if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id())
- /* If a crash is occurring during printk() on this CPU,
- * make sure we can't deadlock */
- zap_locks();
-
/* This stops the holder of console_sem just where we want him */
raw_local_irq_save(flags);
+ this_cpu = smp_processor_id();
+
+ /*
+ * Ouch, printk recursed into itself!
+ */
+ if (unlikely(printk_cpu == this_cpu)) {
+ /*
+ * If a crash is occurring during printk() on this CPU,
+ * then try to get the crash message out but make sure
+ * we can't deadlock. Otherwise just return to avoid the
+ * recursion and return - but flag the recursion so that
+ * it can be printed at the next appropriate moment:
+ */
+ if (!oops_in_progress) {
+ printk_recursion_bug = 1;
+ goto out_restore_irqs;
+ }
+ zap_locks();
+ }
+
lockdep_off();
spin_lock(&logbuf_lock);
- printk_cpu = smp_processor_id();
+ printk_cpu = this_cpu;
+ if (printk_recursion_bug) {
+ printk_recursion_bug = 0;
+ strcpy(printk_buf, printk_recursion_bug_msg);
+ printed_len = sizeof(printk_recursion_bug_msg);
+ }
/* Emit the output into the temporary buffer */
- printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args);
+ printed_len += vscnprintf(printk_buf + printed_len,
+ sizeof(printk_buf), fmt, args);
/*
* Copy the output into log_buf. If the caller didn't provide
@@ -744,6 +771,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
printk_cpu = UINT_MAX;
spin_unlock(&logbuf_lock);
lockdep_on();
+out_restore_irqs:
raw_local_irq_restore(flags);
}
commit c61935fd0e7f087a643827b4bf5ef646963c10fa
Author: Ingo Molnar <mingo@elte.hu>
Date: Tue Jan 22 11:24:58 2008 +0100
sched: group scheduler, set uid share fix
setting cpu share to 1 causes hangs, as reported in:
http://bugzilla.kernel.org/show_bug.cgi?id=9779
as the default share is 1024, the values of 0 and 1 can indeed
cause problems. Limit it to 2 or higher values.
These values can only be set by the root user - but still it
makes sense to protect against nonsensical values.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/sched.c b/kernel/sched.c
index 37cf07aa4164..e76b11ca6df3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7153,6 +7153,14 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
int i;
+ /*
+ * A weight of 0 or 1 can cause arithmetics problems.
+ * (The default weight is 1024 - so there's no practical
+ * limitation from this.)
+ */
+ if (shares < 2)
+ shares = 2;
+
spin_lock(&tg->lock);
if (tg->shares == shares)
goto done;
commit 23be8c7ddf4fd31a14579a2109c89845f7a0fbb6
Author: Ingo Molnar <mingo@elte.hu>
Date: Tue Jan 15 16:44:37 2008 +0100
x86: fix boot crash on HIGHMEM4G && SPARSEMEM
Denys Fedoryshchenko reported a bootup crash when he upgraded
his system from 3GB to 4GB RAM:
http://lkml.org/lkml/2008/1/7/9
the bug is due to HIGHMEM4G && SPARSEMEM kernels making pfn_to_page()
return an invalid pointer when the pfn is in a memory hole. The
256 MB PCI aperture at the end of RAM was not mapped by sparsemem,
and hence the pfn was not valid. But set_highmem_pages_init() iterated
this range without checking the pfn's validity first.
this bug was probably present in the sparsemem code ever since sparsemem
was introduced in v2.6.13. It was masked due to HIGHMEM64G using
larger memory regions in sparsemem_32.h:
#ifdef CONFIG_X86_PAE
#define SECTION_SIZE_BITS 30
#define MAX_PHYSADDR_BITS 36
#define MAX_PHYSMEM_BITS 36
#else
#define SECTION_SIZE_BITS 26
#define MAX_PHYSADDR_BITS 32
#define MAX_PHYSMEM_BITS 32
#endif
which creates 1GB sparsemem regions instead of 64MB sparsemem regions.
So in practice we only ever created true sparsemem holes on x86 with
HIGHMEM4G - but that was rarely used by distros.
( btw., we could probably save 2MB of mem_map[]s on X86_PAE if we reduced
the sparsemem region size to 256 MB. )
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index c7d19471261d..3c76d194fd2c 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -321,8 +321,13 @@ extern void set_highmem_pages_init(int);
static void __init set_highmem_pages_init(int bad_ppro)
{
int pfn;
- for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
- add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
+ for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) {
+ /*
+ * Holes under sparsemem might not have no mem_map[]:
+ */
+ if (pfn_valid(pfn))
+ add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
+ }
totalram_pages += totalhigh_pages;
}
#endif /* CONFIG_FLATMEM */
commit 2997c8c4a0b179e8b834a7f30ba4323f2c60ccf4
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri Jan 11 13:35:54 2008 +0100
block: fix blktrace timestamps
David Dillow reported broken blktrace timestamps. The reason
is cpu_clock() which is not a global time source.
Fix blktrace timestamps by using ktime_get() like the networking
code does for packet timestamps. This also removes a whole lot
of complexity from blktrace.c and shrinks the code by 500 bytes:
text data bss dec hex filename
2888 124 44 3056 bf0 blktrace.o.before
2390 116 44 2550 9f6 blktrace.o.after
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
diff --git a/block/blktrace.c b/block/blktrace.c
index 498a0a54a6aa..7471621d4ded 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -25,7 +25,6 @@
#include <linux/time.h>
#include <asm/uaccess.h>
-static DEFINE_PER_CPU(unsigned long long, blk_trace_cpu_offset) = { 0, };
static unsigned int blktrace_seq __read_mostly = 1;
/*
@@ -41,7 +40,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
const int cpu = smp_processor_id();
t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
- t->time = cpu_clock(cpu) - per_cpu(blk_trace_cpu_offset, cpu);
+ t->time = ktime_to_ns(ktime_get());
t->device = bt->dev;
t->action = action;
t->pid = pid;
@@ -159,7 +158,7 @@ void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
t->sequence = ++(*sequence);
- t->time = cpu_clock(cpu) - per_cpu(blk_trace_cpu_offset, cpu);
+ t->time = ktime_to_ns(ktime_get());
t->sector = sector;
t->bytes = bytes;
t->action = what;
@@ -506,73 +505,9 @@ void blk_trace_shutdown(struct request_queue *q)
}
}
-/*
- * Average offset over two calls to cpu_clock() with a gettimeofday()
- * in the middle
- */
-static void blk_check_time(unsigned long long *t, int this_cpu)
-{
- unsigned long long a, b;
- struct timeval tv;
-
- a = cpu_clock(this_cpu);
- do_gettimeofday(&tv);
- b = cpu_clock(this_cpu);
-
- *t = tv.tv_sec * 1000000000 + tv.tv_usec * 1000;
- *t -= (a + b) / 2;
-}
-
-/*
- * calibrate our inter-CPU timings
- */
-static void blk_trace_check_cpu_time(void *data)
-{
- unsigned long long *t;
- int this_cpu = get_cpu();
-
- t = &per_cpu(blk_trace_cpu_offset, this_cpu);
-
- /*
- * Just call it twice, hopefully the second call will be cache hot
- * and a little more precise
- */
- blk_check_time(t, this_cpu);
- blk_check_time(t, this_cpu);
-
- put_cpu();
-}
-
-static void blk_trace_set_ht_offsets(void)
-{
-#if defined(CONFIG_SCHED_SMT)
- int cpu, i;
-
- /*
- * now make sure HT siblings have the same time offset
- */
- preempt_disable();
- for_each_online_cpu(cpu) {
- unsigned long long *cpu_off, *sibling_off;
-
- for_each_cpu_mask(i, per_cpu(cpu_sibling_map, cpu)) {
- if (i == cpu)
- continue;
-
- cpu_off = &per_cpu(blk_trace_cpu_offset, cpu);
- sibling_off = &per_cpu(blk_trace_cpu_offset, i);
- *sibling_off = *cpu_off;
- }
- }
- preempt_enable();
-#endif
-}
-
static __init int blk_trace_init(void)
{
mutex_init(&blk_tree_mutex);
- on_each_cpu(blk_trace_check_cpu_time, NULL, 1, 1);
- blk_trace_set_ht_offsets();
return 0;
}
commit a263898f628dd21e59210b547986c154788f628e
Author: Ingo Molnar <mingo@elte.hu>
Date: Sun Dec 30 11:58:17 2007 +0100
CPU hotplug: fix cpu_is_offline() on !CONFIG_HOTPLUG_CPU
make randconfig bootup testing found that the cpufreq code
crashes on bootup, if the powernow-k8 driver is enabled and
if maxcpus=1 passed on the boot line to a !CONFIG_HOTPLUG_CPU
kernel.
First lockdep found out that there's an inconsistent unlock
sequence:
=====================================
[ BUG: bad unlock balance detected! ]
-------------------------------------
swapper/1 is trying to release lock (&per_cpu(cpu_policy_rwsem, cpu)) at:
[<ffffffff806ffd8e>] unlock_policy_rwsem_write+0x3c/0x42
but there are no more locks to release!
Call Trace:
[<ffffffff806ffd8e>] unlock_policy_rwsem_write+0x3c/0x42
[<ffffffff80251c29>] print_unlock_inbalance_bug+0x104/0x12c
[<ffffffff80252f3a>] mark_held_locks+0x56/0x94
[<ffffffff806ffd8e>] unlock_policy_rwsem_write+0x3c/0x42
[<ffffffff807008b6>] cpufreq_add_dev+0x2a8/0x5c4
...
then shortly afterwards the cpufreq code crashed on an assert:
------------[ cut here ]------------
kernel BUG at drivers/cpufreq/cpufreq.c:1068!
invalid opcode: 0000 [1] SMP
[...]
Call Trace:
[<ffffffff805145d6>] sysdev_driver_unregister+0x5b/0x91
[<ffffffff806ff520>] cpufreq_register_driver+0x15d/0x1a2
[<ffffffff80cc0596>] powernowk8_init+0x86/0x94
[...]
---[ end trace 1e9219be2b4431de ]---
the bug was caused by maxcpus=1 bootup, which brought up the
secondary core as !cpu_online() but also as !cpu_is_offline(),
since cpu_is_offline() on !CONFIG_HOTPLUG_CPU is always 0 (include/linux/cpu.h):
/* CPUs don't go offline once they're online w/o CONFIG_HOTPLUG_CPU */
static inline int cpu_is_offline(int cpu) { return 0; }
but the cpufreq code uses cpu_online() and cpu_is_offline() in
a mixed way - the low-level drivers use cpu_online(), while
the cpufreq core uses cpu_is_offline(). This opened up the
possibility to add the non-initialized sysdev device of the
secondary core:
cpufreq-core: trying to register driver powernow-k8
cpufreq-core: adding CPU 0
powernow-k8: BIOS error - no PSB or ACPI _PSS objects
cpufreq-core: initialization failed
cpufreq-core: adding CPU 1
cpufreq-core: initialization failed
which then blew up. The fix is to make cpu_is_offline() always
the negation of cpu_online(). With that fix applied the kernel
boots up fine without crashing:
Calling initcall 0xffffffff80cc0510: powernowk8_init+0x0/0x94()
powernow-k8: Found 1 AMD Athlon(tm) 64 X2 Dual Core Processor 3800+ processors (1 cpu cores) (version 2.20.00)
powernow-k8: BIOS error - no PSB or ACPI _PSS objects
initcall 0xffffffff80cc0510: powernowk8_init+0x0/0x94() returned -19.
initcall 0xffffffff80cc0510 ran for 19 msecs: powernowk8_init+0x0/0x94()
Calling initcall 0xffffffff80cc328f: init_lapic_nmi_sysfs+0x0/0x39()
We could fix this by making CPU enumeration aware of max_cpus, but that
would be more fragile IMO, and the cpu_online(cpu) != cpu_is_offline(cpu)
possibility was quite confusing and a continuous source of bugs too.
Most distributions have kernels with CPU hotplug enabled, so this bug
remained hidden for a long time.
Bug forensics:
The broken cpu_is_offline() API variant was introduced via:
commit a59d2e4e6977e7b94e003c96a41f07e96cddc340
Author: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon Mar 8 06:06:03 2004 -0800
[PATCH] minor cleanups for hotplug CPUs
( this predates linux-2.6.git, this commit is available from Thomas's
historic git tree. )
Then 1.5 years later the cpufreq code made use of it:
commit c32b6b8e524d2c337767d312814484d9289550cf
Author: Ashok Raj <ashok.raj@intel.com>
Date: Sun Oct 30 14:59:54 2005 -0800
[PATCH] create and destroy cpufreq sysfs entries based on cpu notifiers
+ if (cpu_is_offline(cpu))
+ return 0;
which is a correct use of the subtly broken new API. v2.6.15 then
shipped with this bug included.
then it took two more years for random-kernel qa to hit it.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index b79c57569367..92f2029a34f3 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -107,7 +107,6 @@ extern void unlock_cpu_hotplug(void);
#define register_hotcpu_notifier(nb) register_cpu_notifier(nb)
#define unregister_hotcpu_notifier(nb) unregister_cpu_notifier(nb)
int cpu_down(unsigned int cpu);
-#define cpu_is_offline(cpu) unlikely(!cpu_online(cpu))
#else /* CONFIG_HOTPLUG_CPU */
@@ -122,9 +121,6 @@ static inline void cpuhotplug_mutex_unlock(struct mutex *cpu_hp_mutex)
/* These aren't inline functions due to a GCC bug. */
#define register_hotcpu_notifier(nb) ({ (void)(nb); 0; })
#define unregister_hotcpu_notifier(nb) ({ (void)(nb); })
-
-/* CPUs don't go offline once they're online w/o CONFIG_HOTPLUG_CPU */
-static inline int cpu_is_offline(int cpu) { return 0; }
#endif /* CONFIG_HOTPLUG_CPU */
#ifdef CONFIG_PM_SLEEP_SMP
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 23f55140ccd5..85bd790c201e 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -397,6 +397,8 @@ extern cpumask_t cpu_present_map;
#define cpu_present(cpu) ((cpu) == 0)
#endif
+#define cpu_is_offline(cpu) unlikely(!cpu_online(cpu))
+
#ifdef CONFIG_SMP
extern int nr_cpu_ids;
#define any_online_cpu(mask) __any_online_cpu(&(mask))