Patches contributed by Eötvös Lorand University
commit 8a6c160a2a13d82c75a50af7282b906cce948df5
Author: Ingo Molnar <mingo@elte.hu>
Date: Wed Apr 30 22:13:44 2008 +0200
x86: redo thread_info.h change
redo Roland's "signals: x86 TS_RESTORE_SIGMASK" ontop of the unified
thread_info.h file.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/include/asm-x86/thread_info.h b/include/asm-x86/thread_info.h
index 348f0e0faa3b..74481b72ae0f 100644
--- a/include/asm-x86/thread_info.h
+++ b/include/asm-x86/thread_info.h
@@ -80,7 +80,6 @@ struct thread_info {
#endif
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SECCOMP 8 /* secure computing */
-#define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal */
#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
#define TIF_HRTICK_RESCHED 11 /* reprogram hrtick timer */
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
@@ -108,7 +107,6 @@ struct thread_info {
#endif
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
-#define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK)
#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
#define _TIF_HRTICK_RESCHED (1 << TIF_HRTICK_RESCHED)
#define _TIF_NOTSC (1 << TIF_NOTSC)
@@ -237,9 +235,20 @@ static inline struct thread_info *stack_thread_info(void)
#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/
#define TS_POLLING 0x0004 /* true if in idle loop
and not sleeping */
+#define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */
#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING)
+#ifndef __ASSEMBLY__
+#define HAVE_SET_RESTORE_SIGMASK 1
+static inline void set_restore_sigmask(void)
+{
+ struct thread_info *ti = current_thread_info();
+ ti->status |= TS_RESTORE_SIGMASK;
+ set_bit(TIF_SIGPENDING, &ti->flags);
+}
+#endif /* !__ASSEMBLY__ */
+
#ifndef __ASSEMBLY__
extern void arch_task_cache_init(void);
extern void free_thread_info(struct thread_info *ti);
commit 1c7d06d419dbe82c76fbb4d3e1fa61b2da2dc00b
Author: Ingo Molnar <mingo@elte.hu>
Date: Wed Apr 30 22:12:05 2008 +0200
revert: thread_info.h change
temporarily revert parts of "signals: x86 TS_RESTORE_SIGMASK".
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/include/asm-x86/thread_info_32.h b/include/asm-x86/thread_info_32.h
index b6338829d1a8..531859962096 100644
--- a/include/asm-x86/thread_info_32.h
+++ b/include/asm-x86/thread_info_32.h
@@ -131,6 +131,7 @@ static inline struct thread_info *current_thread_info(void)
#define TIF_SYSCALL_EMU 5 /* syscall emulation active */
#define TIF_SYSCALL_AUDIT 6 /* syscall auditing active */
#define TIF_SECCOMP 7 /* secure computing */
+#define TIF_RESTORE_SIGMASK 8 /* restore signal mask in do_signal() */
#define TIF_HRTICK_RESCHED 9 /* reprogram hrtick timer */
#define TIF_MEMDIE 16
#define TIF_DEBUG 17 /* uses debug registers */
@@ -150,6 +151,7 @@ static inline struct thread_info *current_thread_info(void)
#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
+#define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK)
#define _TIF_HRTICK_RESCHED (1 << TIF_HRTICK_RESCHED)
#define _TIF_DEBUG (1 << TIF_DEBUG)
#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
@@ -186,20 +188,9 @@ static inline struct thread_info *current_thread_info(void)
this quantum (SMP) */
#define TS_POLLING 0x0002 /* True if in idle loop
and not sleeping */
-#define TS_RESTORE_SIGMASK 0x0004 /* restore signal mask in do_signal() */
#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING)
-#ifndef __ASSEMBLY__
-#define HAVE_SET_RESTORE_SIGMASK 1
-static inline void set_restore_sigmask(void)
-{
- struct thread_info *ti = current_thread_info();
- ti->status |= TS_RESTORE_SIGMASK;
- set_bit(TIF_SIGPENDING, &ti->flags);
-}
-#endif /* !__ASSEMBLY__ */
-
#endif /* __KERNEL__ */
#endif /* _ASM_THREAD_INFO_H */
diff --git a/include/asm-x86/thread_info_64.h b/include/asm-x86/thread_info_64.h
index cb69f70abba1..ed664e874dec 100644
--- a/include/asm-x86/thread_info_64.h
+++ b/include/asm-x86/thread_info_64.h
@@ -109,6 +109,7 @@ static inline struct thread_info *stack_thread_info(void)
#define TIF_IRET 5 /* force IRET */
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SECCOMP 8 /* secure computing */
+#define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal */
#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
#define TIF_HRTICK_RESCHED 11 /* reprogram hrtick timer */
/* 16 free */
@@ -132,6 +133,7 @@ static inline struct thread_info *stack_thread_info(void)
#define _TIF_IRET (1 << TIF_IRET)
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
+#define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK)
#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
#define _TIF_HRTICK_RESCHED (1 << TIF_HRTICK_RESCHED)
#define _TIF_IA32 (1 << TIF_IA32)
@@ -176,20 +178,9 @@ static inline struct thread_info *stack_thread_info(void)
#define TS_COMPAT 0x0002 /* 32bit syscall active */
#define TS_POLLING 0x0004 /* true if in idle loop
and not sleeping */
-#define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */
#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING)
-#ifndef __ASSEMBLY__
-#define HAVE_SET_RESTORE_SIGMASK 1
-static inline void set_restore_sigmask(void)
-{
- struct thread_info *ti = current_thread_info();
- ti->status |= TS_RESTORE_SIGMASK;
- set_bit(TIF_SIGPENDING, &ti->flags);
-}
-#endif /* !__ASSEMBLY__ */
-
#endif /* __KERNEL__ */
#endif /* _ASM_THREAD_INFO_H */
commit 82fd866701881623d69fe280dbac06ddff1fdef9
Author: Ingo Molnar <mingo@elte.hu>
Date: Thu May 1 03:46:22 2008 +0200
x86: rdc: leds build/config fix
select NEW_LEDS for now until the Kconfig dependencies have been
fixed.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 42109f119df7..fe361ae7ef2f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -335,6 +335,7 @@ config X86_RDC321X
select GENERIC_GPIO
select LEDS_CLASS
select LEDS_GPIO
+ select NEW_LEDS
help
This option is needed for RDC R-321x system-on-chip, also known
as R-8610-(G).
commit f7c83a0aaa772f8d0189fa197d77c762caaa367a
Author: Ingo Molnar <mingo@elte.hu>
Date: Wed Apr 30 09:48:07 2008 +0200
Fix drivers/media build for modular builds
Fix allmodconfig build bug introduced in latest -git by commit
7c91f0624a9 ("V4L/DVB(7767): Move tuners to common/tuners"):
LD kernel/built-in.o
LD drivers/built-in.o
ld: drivers/media/built-in.o: No such file: No such file or directory
which happens if all media drivers are modular:
http://redhat.com/~mingo/misc/config-Wed_Apr_30_09_24_48_CEST_2008.bad
In that case there's no obj-y rule connecting all the built-in.o files and
the link tree breaks.
The fix is to add a guaranteed obj-y rule for the core vmlinux to build.
(which results in an empty object file if all media drivers are modular)
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/drivers/media/Makefile b/drivers/media/Makefile
index 73f742c7e818..cc11c4c0e7e7 100644
--- a/drivers/media/Makefile
+++ b/drivers/media/Makefile
@@ -2,6 +2,8 @@
# Makefile for the kernel multimedia device drivers.
#
+obj-y := common/
+
obj-$(CONFIG_VIDEO_MEDIA) += common/
# Since hybrid devices are here, should be compiled if DVB and/or V4L
commit bf726eab3711cf192405d21688a4b21e07b6188a
Author: Ingo Molnar <mingo@elte.hu>
Date: Thu May 8 11:53:48 2008 +0200
semaphore: fix
Yanmin Zhang reported:
| Comparing with kernel 2.6.25, AIM7 (use tmpfs) has more th
| regression under 2.6.26-rc1 on my 8-core stoakley, 16-core tigerton,
| and Itanium Montecito. Bisect located the patch below:
|
| 64ac24e738823161693bf791f87adc802cf529ff is first bad commit
| commit 64ac24e738823161693bf791f87adc802cf529ff
| Author: Matthew Wilcox <matthew@wil.cx>
| Date: Fri Mar 7 21:55:58 2008 -0500
|
| Generic semaphore implementation
|
| After I manually reverted the patch against 2.6.26-rc1 while fixing
| lots of conflicts/errors, aim7 regression became less than 2%.
i reproduced the AIM7 workload and can confirm Yanmin's findings that
-.26-rc1 regresses over .25 - by over 67% here.
Looking at the workload i found and fixed what i believe to be the real
bug causing the AIM7 regression: it was inefficient wakeup / scheduling
/ locking behavior of the new generic semaphore code, causing suboptimal
performance.
The problem comes from the following code. The new semaphore code does
this on down():
spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
else
__down(sem);
spin_unlock_irqrestore(&sem->lock, flags);
and this on up():
spin_lock_irqsave(&sem->lock, flags);
if (likely(list_empty(&sem->wait_list)))
sem->count++;
else
__up(sem);
spin_unlock_irqrestore(&sem->lock, flags);
where __up() does:
list_del(&waiter->list);
waiter->up = 1;
wake_up_process(waiter->task);
and where __down() does this in essence:
list_add_tail(&waiter.list, &sem->wait_list);
waiter.task = task;
waiter.up = 0;
for (;;) {
[...]
spin_unlock_irq(&sem->lock);
timeout = schedule_timeout(timeout);
spin_lock_irq(&sem->lock);
if (waiter.up)
return 0;
}
the fastpath looks good and obvious, but note the following property of
the contended path: if there's a task on the ->wait_list, the up() of
the current owner will "pass over" ownership to that waiting task, in a
wake-one manner, via the waiter->up flag and by removing the waiter from
the wait list.
That is all and fine in principle, but as implemented in
kernel/semaphore.c it also creates a nasty, hidden source of contention!
The contention comes from the following property of the new semaphore
code: the new owner owns the semaphore exclusively, even if it is not
running yet.
So if the old owner, even if just a few instructions later, does a
down() [lock_kernel()] again, it will be blocked and will have to wait
on the new owner to eventually be scheduled (possibly on another CPU)!
Or if another task gets to lock_kernel() sooner than the "new owner"
scheduled, it will be blocked unnecessarily and for a very long time
when there are 2000 tasks running.
I.e. the implementation of the new semaphores code does wake-one and
lock ownership in a very restrictive way - it does not allow
opportunistic re-locking of the lock at all and keeps the scheduler from
picking task order intelligently.
This kind of scheduling, with 2000 AIM7 processes running, creates awful
cross-scheduling between those 2000 tasks, causes reduced parallelism, a
throttled runqueue length and a lot of idle time. With increasing number
of CPUs it causes an exponentially worse behavior in AIM7, as the chance
for a newly woken new-owner task to actually run anytime soon is less
and less likely.
Note that it takes just a tiny bit of contention for the 'new-semaphore
catastrophy' to happen: the wakeup latencies get added to whatever small
contention there is, and quickly snowball out of control!
I believe Yanmin's findings and numbers support this analysis too.
The best fix for this problem is to use the same scheduling logic that
the kernel/mutex.c code uses: keep the wake-one behavior (that is OK and
wanted because we do not want to over-schedule), but also allow
opportunistic locking of the lock even if a wakee is already "in
flight".
The patch below implements this new logic. With this patch applied the
AIM7 regression is largely fixed on my quad testbox:
# v2.6.25 vanilla:
..................
Tasks Jobs/Min JTI Real CPU Jobs/sec/task
2000 56096.4 91 207.5 789.7 0.4675
2000 55894.4 94 208.2 792.7 0.4658
# v2.6.26-rc1-166-gc0a1811 vanilla:
...................................
Tasks Jobs/Min JTI Real CPU Jobs/sec/task
2000 33230.6 83 350.3 784.5 0.2769
2000 31778.1 86 366.3 783.6 0.2648
# v2.6.26-rc1-166-gc0a1811 + semaphore-speedup:
...............................................
Tasks Jobs/Min JTI Real CPU Jobs/sec/task
2000 55707.1 92 209.0 795.6 0.4642
2000 55704.4 96 209.0 796.0 0.4642
i.e. a 67% speedup. We are now back to within 1% of the v2.6.25
performance levels and have zero idle time during the test, as expected.
Btw., interactivity also improved dramatically with the fix - for
example console-switching became almost instantaneous during this
workload (which after all is running 2000 tasks at once!), without the
patch it was stuck for a minute at times.
There's another nice side-effect of this speedup patch, the new generic
semaphore code got even smaller:
text data bss dec hex filename
1241 0 0 1241 4d9 semaphore.o.before
1207 0 0 1207 4b7 semaphore.o.after
(because the waiter.up complication got removed.)
Longer-term we should look into using the mutex code for the generic
semaphore code as well - but i's not easy due to legacies and it's
outside of the scope of v2.6.26 and outside the scope of this patch as
well.
Bisected-by: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index 5c2942e768cd..5e41217239e8 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -54,10 +54,9 @@ void down(struct semaphore *sem)
unsigned long flags;
spin_lock_irqsave(&sem->lock, flags);
- if (likely(sem->count > 0))
- sem->count--;
- else
+ if (unlikely(!sem->count))
__down(sem);
+ sem->count--;
spin_unlock_irqrestore(&sem->lock, flags);
}
EXPORT_SYMBOL(down);
@@ -77,10 +76,10 @@ int down_interruptible(struct semaphore *sem)
int result = 0;
spin_lock_irqsave(&sem->lock, flags);
- if (likely(sem->count > 0))
- sem->count--;
- else
+ if (unlikely(!sem->count))
result = __down_interruptible(sem);
+ if (!result)
+ sem->count--;
spin_unlock_irqrestore(&sem->lock, flags);
return result;
@@ -103,10 +102,10 @@ int down_killable(struct semaphore *sem)
int result = 0;
spin_lock_irqsave(&sem->lock, flags);
- if (likely(sem->count > 0))
- sem->count--;
- else
+ if (unlikely(!sem->count))
result = __down_killable(sem);
+ if (!result)
+ sem->count--;
spin_unlock_irqrestore(&sem->lock, flags);
return result;
@@ -157,10 +156,10 @@ int down_timeout(struct semaphore *sem, long jiffies)
int result = 0;
spin_lock_irqsave(&sem->lock, flags);
- if (likely(sem->count > 0))
- sem->count--;
- else
+ if (unlikely(!sem->count))
result = __down_timeout(sem, jiffies);
+ if (!result)
+ sem->count--;
spin_unlock_irqrestore(&sem->lock, flags);
return result;
@@ -179,9 +178,8 @@ void up(struct semaphore *sem)
unsigned long flags;
spin_lock_irqsave(&sem->lock, flags);
- if (likely(list_empty(&sem->wait_list)))
- sem->count++;
- else
+ sem->count++;
+ if (unlikely(!list_empty(&sem->wait_list)))
__up(sem);
spin_unlock_irqrestore(&sem->lock, flags);
}
@@ -192,7 +190,6 @@ EXPORT_SYMBOL(up);
struct semaphore_waiter {
struct list_head list;
struct task_struct *task;
- int up;
};
/*
@@ -205,33 +202,34 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
{
struct task_struct *task = current;
struct semaphore_waiter waiter;
+ int ret = 0;
- list_add_tail(&waiter.list, &sem->wait_list);
waiter.task = task;
- waiter.up = 0;
+ list_add_tail(&waiter.list, &sem->wait_list);
for (;;) {
- if (state == TASK_INTERRUPTIBLE && signal_pending(task))
- goto interrupted;
- if (state == TASK_KILLABLE && fatal_signal_pending(task))
- goto interrupted;
- if (timeout <= 0)
- goto timed_out;
+ if (state == TASK_INTERRUPTIBLE && signal_pending(task)) {
+ ret = -EINTR;
+ break;
+ }
+ if (state == TASK_KILLABLE && fatal_signal_pending(task)) {
+ ret = -EINTR;
+ break;
+ }
+ if (timeout <= 0) {
+ ret = -ETIME;
+ break;
+ }
__set_task_state(task, state);
spin_unlock_irq(&sem->lock);
timeout = schedule_timeout(timeout);
spin_lock_irq(&sem->lock);
- if (waiter.up)
- return 0;
+ if (sem->count > 0)
+ break;
}
- timed_out:
- list_del(&waiter.list);
- return -ETIME;
-
- interrupted:
list_del(&waiter.list);
- return -EINTR;
+ return ret;
}
static noinline void __sched __down(struct semaphore *sem)
@@ -258,7 +256,5 @@ static noinline void __sched __up(struct semaphore *sem)
{
struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
struct semaphore_waiter, list);
- list_del(&waiter->list);
- waiter->up = 1;
wake_up_process(waiter->task);
}
commit a5574cf65b5f03ce9ade3918764fe22e5e2371e3
Author: Ingo Molnar <mingo@elte.hu>
Date: Mon May 5 23:19:50 2008 +0200
sched, x86: add HAVE_UNSTABLE_SCHED_CLOCK
add the HAVE_UNSTABLE_SCHED_CLOCK, for architectures to select.
the next change utilizes it.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 845ea2b2d487..bbcafaa160c0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -18,6 +18,7 @@ config X86_64
### Arch settings
config X86
def_bool y
+ select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_IDE
select HAVE_OPROFILE
select HAVE_KPROBES
diff --git a/init/Kconfig b/init/Kconfig
index f0e62e5ce0dc..fa42e6b549d3 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -316,6 +316,12 @@ config CPUSETS
Say N if unsure.
+#
+# Architectures with an unreliable sched_clock() should select this:
+#
+config HAVE_UNSTABLE_SCHED_CLOCK
+ bool
+
config GROUP_SCHED
bool "Group CPU scheduler"
default y
commit dfbf4a1bc319f0f9a31e39b2da1fa5c55e85af89
Author: Ingo Molnar <mingo@elte.hu>
Date: Wed Apr 23 09:24:06 2008 +0200
sched: fix cpu clock
David Miller pointed it out that nothing in cpu_clock() sets
prev_cpu_time. This caused __sync_cpu_clock() to be called
all the time - against the intention of this code.
The result was that in practice we hit a global spinlock every
time cpu_clock() is called - which - even though cpu_clock()
is used for tracing and debugging, is suboptimal.
While at it, also:
- move the irq disabling to the outest layer,
this should make cpu_clock() warp-free when called with irqs
enabled.
- use long long instead of cycles_t - for platforms where cycles_t
is 32-bit.
Reported-by: David Miller <davem@davemloft.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/sched.c b/kernel/sched.c
index f98f75f3c708..9457106b18af 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -910,11 +910,14 @@ static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
static DEFINE_SPINLOCK(time_sync_lock);
static unsigned long long prev_global_time;
-static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
+static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
{
- unsigned long flags;
-
- spin_lock_irqsave(&time_sync_lock, flags);
+ /*
+ * We want this inlined, to not get tracer function calls
+ * in this critical section:
+ */
+ spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_);
+ __raw_spin_lock(&time_sync_lock.raw_lock);
if (time < prev_global_time) {
per_cpu(time_offset, cpu) += prev_global_time - time;
@@ -923,7 +926,8 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
prev_global_time = time;
}
- spin_unlock_irqrestore(&time_sync_lock, flags);
+ __raw_spin_unlock(&time_sync_lock.raw_lock);
+ spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_);
return time;
}
@@ -931,7 +935,6 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
static unsigned long long __cpu_clock(int cpu)
{
unsigned long long now;
- unsigned long flags;
struct rq *rq;
/*
@@ -941,11 +944,9 @@ static unsigned long long __cpu_clock(int cpu)
if (unlikely(!scheduler_running))
return 0;
- local_irq_save(flags);
rq = cpu_rq(cpu);
update_rq_clock(rq);
now = rq->clock;
- local_irq_restore(flags);
return now;
}
@@ -957,13 +958,18 @@ static unsigned long long __cpu_clock(int cpu)
unsigned long long cpu_clock(int cpu)
{
unsigned long long prev_cpu_time, time, delta_time;
+ unsigned long flags;
+ local_irq_save(flags);
prev_cpu_time = per_cpu(prev_cpu_time, cpu);
time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
delta_time = time-prev_cpu_time;
- if (unlikely(delta_time > time_sync_thresh))
+ if (unlikely(delta_time > time_sync_thresh)) {
time = __sync_cpu_clock(time, cpu);
+ per_cpu(prev_cpu_time, cpu) = time;
+ }
+ local_irq_restore(flags);
return time;
}
commit 690229a0912ca2fef8b542fe4d8b73acfcdc6e24
Author: Ingo Molnar <mingo@elte.hu>
Date: Wed Apr 23 09:31:35 2008 +0200
sched: make clock sync tunable by architecture code
make time_sync_thresh tunable to architecture code.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 698b5a4d25a7..54c9ca26b7d8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -158,6 +158,8 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
}
#endif
+extern unsigned long long time_sync_thresh;
+
/*
* Task state bitmask. NOTE! These bits are also
* encoded in fs/proc/array.c: get_task_state().
diff --git a/kernel/sched.c b/kernel/sched.c
index 3ac3d7af04a1..8f433fedfcb3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -899,7 +899,7 @@ static inline u64 global_rt_runtime(void)
return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}
-static const unsigned long long time_sync_thresh = 100000;
+unsigned long long time_sync_thresh = 100000;
static DEFINE_PER_CPU(unsigned long long, time_offset);
static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
commit 733a0771df46af942b8355cd8bb15780106b4353
Author: Ingo Molnar <mingo@elte.hu>
Date: Mon Apr 28 14:05:18 2008 +0200
sched: remove old sched doc
Fabio Checconi noticed that Documentation/scheduler/sched-design.txt was
a stale copy of the old scheduler. Remove it.
Reported-by: Fabio Checconi <fabio@gandalf.sssup.it>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/Documentation/scheduler/sched-design.txt b/Documentation/scheduler/sched-design.txt
deleted file mode 100644
index 1605bf0cba8b..000000000000
--- a/Documentation/scheduler/sched-design.txt
+++ /dev/null
@@ -1,165 +0,0 @@
- Goals, Design and Implementation of the
- new ultra-scalable O(1) scheduler
-
-
- This is an edited version of an email Ingo Molnar sent to
- lkml on 4 Jan 2002. It describes the goals, design, and
- implementation of Ingo's new ultra-scalable O(1) scheduler.
- Last Updated: 18 April 2002.
-
-
-Goal
-====
-
-The main goal of the new scheduler is to keep all the good things we know
-and love about the current Linux scheduler:
-
- - good interactive performance even during high load: if the user
- types or clicks then the system must react instantly and must execute
- the user tasks smoothly, even during considerable background load.
-
- - good scheduling/wakeup performance with 1-2 runnable processes.
-
- - fairness: no process should stay without any timeslice for any
- unreasonable amount of time. No process should get an unjustly high
- amount of CPU time.
-
- - priorities: less important tasks can be started with lower priority,
- more important tasks with higher priority.
-
- - SMP efficiency: no CPU should stay idle if there is work to do.
-
- - SMP affinity: processes which run on one CPU should stay affine to
- that CPU. Processes should not bounce between CPUs too frequently.
-
- - plus additional scheduler features: RT scheduling, CPU binding.
-
-and the goal is also to add a few new things:
-
- - fully O(1) scheduling. Are you tired of the recalculation loop
- blowing the L1 cache away every now and then? Do you think the goodness
- loop is taking a bit too long to finish if there are lots of runnable
- processes? This new scheduler takes no prisoners: wakeup(), schedule(),
- the timer interrupt are all O(1) algorithms. There is no recalculation
- loop. There is no goodness loop either.
-
- - 'perfect' SMP scalability. With the new scheduler there is no 'big'
- runqueue_lock anymore - it's all per-CPU runqueues and locks - two
- tasks on two separate CPUs can wake up, schedule and context-switch
- completely in parallel, without any interlocking. All
- scheduling-relevant data is structured for maximum scalability.
-
- - better SMP affinity. The old scheduler has a particular weakness that
- causes the random bouncing of tasks between CPUs if/when higher
- priority/interactive tasks, this was observed and reported by many
- people. The reason is that the timeslice recalculation loop first needs
- every currently running task to consume its timeslice. But when this
- happens on eg. an 8-way system, then this property starves an
- increasing number of CPUs from executing any process. Once the last
- task that has a timeslice left has finished using up that timeslice,
- the recalculation loop is triggered and other CPUs can start executing
- tasks again - after having idled around for a number of timer ticks.
- The more CPUs, the worse this effect.
-
- Furthermore, this same effect causes the bouncing effect as well:
- whenever there is such a 'timeslice squeeze' of the global runqueue,
- idle processors start executing tasks which are not affine to that CPU.
- (because the affine tasks have finished off their timeslices already.)
-
- The new scheduler solves this problem by distributing timeslices on a
- per-CPU basis, without having any global synchronization or
- recalculation.
-
- - batch scheduling. A significant proportion of computing-intensive tasks
- benefit from batch-scheduling, where timeslices are long and processes
- are roundrobin scheduled. The new scheduler does such batch-scheduling
- of the lowest priority tasks - so nice +19 jobs will get
- 'batch-scheduled' automatically. With this scheduler, nice +19 jobs are
- in essence SCHED_IDLE, from an interactiveness point of view.
-
- - handle extreme loads more smoothly, without breakdown and scheduling
- storms.
-
- - O(1) RT scheduling. For those RT folks who are paranoid about the
- O(nr_running) property of the goodness loop and the recalculation loop.
-
- - run fork()ed children before the parent. Andrea has pointed out the
- advantages of this a few months ago, but patches for this feature
- do not work with the old scheduler as well as they should,
- because idle processes often steal the new child before the fork()ing
- CPU gets to execute it.
-
-
-Design
-======
-
-The core of the new scheduler contains the following mechanisms:
-
- - *two* priority-ordered 'priority arrays' per CPU. There is an 'active'
- array and an 'expired' array. The active array contains all tasks that
- are affine to this CPU and have timeslices left. The expired array
- contains all tasks which have used up their timeslices - but this array
- is kept sorted as well. The active and expired array is not accessed
- directly, it's accessed through two pointers in the per-CPU runqueue
- structure. If all active tasks are used up then we 'switch' the two
- pointers and from now on the ready-to-go (former-) expired array is the
- active array - and the empty active array serves as the new collector
- for expired tasks.
-
- - there is a 64-bit bitmap cache for array indices. Finding the highest
- priority task is thus a matter of two x86 BSFL bit-search instructions.
-
-the split-array solution enables us to have an arbitrary number of active
-and expired tasks, and the recalculation of timeslices can be done
-immediately when the timeslice expires. Because the arrays are always
-access through the pointers in the runqueue, switching the two arrays can
-be done very quickly.
-
-this is a hybride priority-list approach coupled with roundrobin
-scheduling and the array-switch method of distributing timeslices.
-
- - there is a per-task 'load estimator'.
-
-one of the toughest things to get right is good interactive feel during
-heavy system load. While playing with various scheduler variants i found
-that the best interactive feel is achieved not by 'boosting' interactive
-tasks, but by 'punishing' tasks that want to use more CPU time than there
-is available. This method is also much easier to do in an O(1) fashion.
-
-to establish the actual 'load' the task contributes to the system, a
-complex-looking but pretty accurate method is used: there is a 4-entry
-'history' ringbuffer of the task's activities during the last 4 seconds.
-This ringbuffer is operated without much overhead. The entries tell the
-scheduler a pretty accurate load-history of the task: has it used up more
-CPU time or less during the past N seconds. [the size '4' and the interval
-of 4x 1 seconds was found by lots of experimentation - this part is
-flexible and can be changed in both directions.]
-
-the penalty a task gets for generating more load than the CPU can handle
-is a priority decrease - there is a maximum amount to this penalty
-relative to their static priority, so even fully CPU-bound tasks will
-observe each other's priorities, and will share the CPU accordingly.
-
-the SMP load-balancer can be extended/switched with additional parallel
-computing and cache hierarchy concepts: NUMA scheduling, multi-core CPUs
-can be supported easily by changing the load-balancer. Right now it's
-tuned for my SMP systems.
-
-i skipped the prev->mm == next->mm advantage - no workload i know of shows
-any sensitivity to this. It can be added back by sacrificing O(1)
-schedule() [the current and one-lower priority list can be searched for a
-that->mm == current->mm condition], but costs a fair number of cycles
-during a number of important workloads, so i wanted to avoid this as much
-as possible.
-
-- the SMP idle-task startup code was still racy and the new scheduler
-triggered this. So i streamlined the idle-setup code a bit. We do not call
-into schedule() before all processors have started up fully and all idle
-threads are in place.
-
-- the patch also cleans up a number of aspects of sched.c - moves code
-into other areas of the kernel where it's appropriate, and simplifies
-certain code paths and data constructs. As a result, the new scheduler's
-code is smaller than the old one.
-
- Ingo
commit 7a1aa309f21ea2f6c31f364341e4027ecf4e79bc
Author: Ingo Molnar <mingo@elte.hu>
Date: Mon May 5 01:06:54 2008 -0700
irda: fix !PNP support for drivers/net/irda/smsc-ircc2.c
x86.git testing found this build bug on v2.6.26-rc1:
ERROR: "pnp_get_resource" [drivers/net/irda/smsc-ircc2.ko] undefined!
make[1]: *** [__modpost] Error 1
make: *** [modules] Error 2
the driver did not anticipate the case of !CONFIG_PNP which is rare but
still possible. Instead of restricting the driver to PNP-only in the
Kconfig space, add the (trivial) dummy struct pnp_driver - this is that
other drivers use in the !PNP case too.
The driver itself can in theory be initialized on !PNP too in certain
cases, via smsc_ircc_legacy_probe().
Patch only minimally build tested, i dont have this hardware.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/drivers/net/irda/smsc-ircc2.c b/drivers/net/irda/smsc-ircc2.c
index 1f26da761e9f..cfe0194fef71 100644
--- a/drivers/net/irda/smsc-ircc2.c
+++ b/drivers/net/irda/smsc-ircc2.c
@@ -376,6 +376,7 @@ MODULE_DEVICE_TABLE(pnp, smsc_ircc_pnp_table);
static int pnp_driver_registered;
+#ifdef CONFIG_PNP
static int __init smsc_ircc_pnp_probe(struct pnp_dev *dev,
const struct pnp_device_id *dev_id)
{
@@ -402,7 +403,9 @@ static struct pnp_driver smsc_ircc_pnp_driver = {
.id_table = smsc_ircc_pnp_table,
.probe = smsc_ircc_pnp_probe,
};
-
+#else /* CONFIG_PNP */
+static struct pnp_driver smsc_ircc_pnp_driver;
+#endif
/*******************************************************************************
*