Patches contributed by Eötvös Loránd University


commit 8a6c160a2a13d82c75a50af7282b906cce948df5
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed Apr 30 22:13:44 2008 +0200

    x86: redo thread_info.h change
    
    redo Roland's "signals: x86 TS_RESTORE_SIGMASK" on top of the unified
    thread_info.h file.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

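For context, a sigsuspend-style caller would use the helper added below roughly as follows; this is a condensed, hypothetical sketch modelled on sys_rt_sigsuspend(), not part of the patch:

	#include <linux/sched.h>
	#include <linux/signal.h>
	#include <linux/thread_info.h>

	/* illustrative only: save the old mask, install the temporary one,
	 * sleep, then ask the signal-delivery path to restore the old mask */
	static long example_rt_sigsuspend(sigset_t *newset)
	{
		current->saved_sigmask = current->blocked;
		sigprocmask(SIG_SETMASK, newset, NULL);

		set_current_state(TASK_INTERRUPTIBLE);
		schedule();

		/* with this patch the "restore sigmask" bit lives in
		 * ti->status (TS_RESTORE_SIGMASK), not in ti->flags */
		set_restore_sigmask();
		return -ERESTARTNOHAND;
	}
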
diff --git a/include/asm-x86/thread_info.h b/include/asm-x86/thread_info.h
index 348f0e0faa3b..74481b72ae0f 100644
--- a/include/asm-x86/thread_info.h
+++ b/include/asm-x86/thread_info.h
@@ -80,7 +80,6 @@ struct thread_info {
 #endif
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
-#define TIF_RESTORE_SIGMASK	9	/* restore signal mask in do_signal */
 #define TIF_MCE_NOTIFY		10	/* notify userspace of an MCE */
 #define TIF_HRTICK_RESCHED	11	/* reprogram hrtick timer */
 #define TIF_NOTSC		16	/* TSC is not accessible in userland */
@@ -108,7 +107,6 @@ struct thread_info {
 #endif
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
-#define _TIF_RESTORE_SIGMASK	(1 << TIF_RESTORE_SIGMASK)
 #define _TIF_MCE_NOTIFY		(1 << TIF_MCE_NOTIFY)
 #define _TIF_HRTICK_RESCHED	(1 << TIF_HRTICK_RESCHED)
 #define _TIF_NOTSC		(1 << TIF_NOTSC)
@@ -237,9 +235,20 @@ static inline struct thread_info *stack_thread_info(void)
 #define TS_COMPAT		0x0002	/* 32bit syscall active (64BIT)*/
 #define TS_POLLING		0x0004	/* true if in idle loop
 					   and not sleeping */
+#define TS_RESTORE_SIGMASK	0x0008	/* restore signal mask in do_signal() */
 
 #define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING)
 
+#ifndef __ASSEMBLY__
+#define HAVE_SET_RESTORE_SIGMASK	1
+static inline void set_restore_sigmask(void)
+{
+	struct thread_info *ti = current_thread_info();
+	ti->status |= TS_RESTORE_SIGMASK;
+	set_bit(TIF_SIGPENDING, &ti->flags);
+}
+#endif	/* !__ASSEMBLY__ */
+
 #ifndef __ASSEMBLY__
 extern void arch_task_cache_init(void);
 extern void free_thread_info(struct thread_info *ti);

commit 1c7d06d419dbe82c76fbb4d3e1fa61b2da2dc00b
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed Apr 30 22:12:05 2008 +0200

    revert: thread_info.h change
    
    temporarily revert parts of "signals: x86 TS_RESTORE_SIGMASK".
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/asm-x86/thread_info_32.h b/include/asm-x86/thread_info_32.h
index b6338829d1a8..531859962096 100644
--- a/include/asm-x86/thread_info_32.h
+++ b/include/asm-x86/thread_info_32.h
@@ -131,6 +131,7 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_SYSCALL_EMU		5	/* syscall emulation active */
 #define TIF_SYSCALL_AUDIT	6	/* syscall auditing active */
 #define TIF_SECCOMP		7	/* secure computing */
+#define TIF_RESTORE_SIGMASK	8	/* restore signal mask in do_signal() */
 #define TIF_HRTICK_RESCHED	9	/* reprogram hrtick timer */
 #define TIF_MEMDIE		16
 #define TIF_DEBUG		17	/* uses debug registers */
@@ -150,6 +151,7 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_SYSCALL_EMU	(1 << TIF_SYSCALL_EMU)
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
+#define _TIF_RESTORE_SIGMASK	(1 << TIF_RESTORE_SIGMASK)
 #define _TIF_HRTICK_RESCHED	(1 << TIF_HRTICK_RESCHED)
 #define _TIF_DEBUG		(1 << TIF_DEBUG)
 #define _TIF_IO_BITMAP		(1 << TIF_IO_BITMAP)
@@ -186,20 +188,9 @@ static inline struct thread_info *current_thread_info(void)
 					   this quantum (SMP) */
 #define TS_POLLING		0x0002	/* True if in idle loop
 					   and not sleeping */
-#define TS_RESTORE_SIGMASK	0x0004	/* restore signal mask in do_signal() */
 
 #define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING)
 
-#ifndef __ASSEMBLY__
-#define HAVE_SET_RESTORE_SIGMASK	1
-static inline void set_restore_sigmask(void)
-{
-	struct thread_info *ti = current_thread_info();
-	ti->status |= TS_RESTORE_SIGMASK;
-	set_bit(TIF_SIGPENDING, &ti->flags);
-}
-#endif	/* !__ASSEMBLY__ */
-
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_THREAD_INFO_H */
diff --git a/include/asm-x86/thread_info_64.h b/include/asm-x86/thread_info_64.h
index cb69f70abba1..ed664e874dec 100644
--- a/include/asm-x86/thread_info_64.h
+++ b/include/asm-x86/thread_info_64.h
@@ -109,6 +109,7 @@ static inline struct thread_info *stack_thread_info(void)
 #define TIF_IRET		5	/* force IRET */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
+#define TIF_RESTORE_SIGMASK	9	/* restore signal mask in do_signal */
 #define TIF_MCE_NOTIFY		10	/* notify userspace of an MCE */
 #define TIF_HRTICK_RESCHED	11	/* reprogram hrtick timer */
 /* 16 free */
@@ -132,6 +133,7 @@ static inline struct thread_info *stack_thread_info(void)
 #define _TIF_IRET		(1 << TIF_IRET)
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
+#define _TIF_RESTORE_SIGMASK	(1 << TIF_RESTORE_SIGMASK)
 #define _TIF_MCE_NOTIFY		(1 << TIF_MCE_NOTIFY)
 #define _TIF_HRTICK_RESCHED	(1 << TIF_HRTICK_RESCHED)
 #define _TIF_IA32		(1 << TIF_IA32)
@@ -176,20 +178,9 @@ static inline struct thread_info *stack_thread_info(void)
 #define TS_COMPAT		0x0002	/* 32bit syscall active */
 #define TS_POLLING		0x0004	/* true if in idle loop
 					   and not sleeping */
-#define TS_RESTORE_SIGMASK	0x0008	/* restore signal mask in do_signal() */
 
 #define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING)
 
-#ifndef __ASSEMBLY__
-#define HAVE_SET_RESTORE_SIGMASK	1
-static inline void set_restore_sigmask(void)
-{
-	struct thread_info *ti = current_thread_info();
-	ti->status |= TS_RESTORE_SIGMASK;
-	set_bit(TIF_SIGPENDING, &ti->flags);
-}
-#endif	/* !__ASSEMBLY__ */
-
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_THREAD_INFO_H */

commit 82fd866701881623d69fe280dbac06ddff1fdef9
Author: Ingo Molnar <mingo@elte.hu>
Date:   Thu May 1 03:46:22 2008 +0200

    x86: rdc: leds build/config fix
    
    select NEW_LEDS for now until the Kconfig dependencies have been
    fixed.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
    Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 42109f119df7..fe361ae7ef2f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -335,6 +335,7 @@ config X86_RDC321X
 	select GENERIC_GPIO
 	select LEDS_CLASS
 	select LEDS_GPIO
+	select NEW_LEDS
 	help
 	  This option is needed for RDC R-321x system-on-chip, also known
 	  as R-8610-(G).

commit f7c83a0aaa772f8d0189fa197d77c762caaa367a
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed Apr 30 09:48:07 2008 +0200

    Fix drivers/media build for modular builds
    
    Fix allmodconfig build bug introduced in latest -git by commit
    7c91f0624a9 ("V4L/DVB(7767): Move tuners to common/tuners"):
    
      LD      kernel/built-in.o
      LD      drivers/built-in.o
      ld: drivers/media/built-in.o: No such file: No such file or directory
    
    which happens if all media drivers are modular:
    
      http://redhat.com/~mingo/misc/config-Wed_Apr_30_09_24_48_CEST_2008.bad
    
    In that case there's no obj-y rule connecting all the built-in.o files and
    the link tree breaks.
    
    The fix is to add a guaranteed obj-y rule for the core vmlinux to build.
    (which results in an empty object file if all media drivers are modular)
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
    Acked-by: Sam Ravnborg <sam@ravnborg.org>
    Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/media/Makefile b/drivers/media/Makefile
index 73f742c7e818..cc11c4c0e7e7 100644
--- a/drivers/media/Makefile
+++ b/drivers/media/Makefile
@@ -2,6 +2,8 @@
 # Makefile for the kernel multimedia device drivers.
 #
 
+obj-y := common/
+
 obj-$(CONFIG_VIDEO_MEDIA) += common/
 
 # Since hybrid devices are here, should be compiled if DVB and/or V4L

commit bf726eab3711cf192405d21688a4b21e07b6188a
Author: Ingo Molnar <mingo@elte.hu>
Date:   Thu May 8 11:53:48 2008 +0200

    semaphore: fix
    
    Yanmin Zhang reported:
    
    | Comparing with kernel 2.6.25, AIM7 (use tmpfs) has a sizeable
    | regression under 2.6.26-rc1 on my 8-core stoakley, 16-core tigerton,
    | and Itanium Montecito. Bisect located the patch below:
    |
    | 64ac24e738823161693bf791f87adc802cf529ff is first bad commit
    | commit 64ac24e738823161693bf791f87adc802cf529ff
    | Author: Matthew Wilcox <matthew@wil.cx>
    | Date:   Fri Mar 7 21:55:58 2008 -0500
    |
    |     Generic semaphore implementation
    |
    | After I manually reverted the patch against 2.6.26-rc1 while fixing
    | lots of conflicts/errors, aim7 regression became less than 2%.
    
    i reproduced the AIM7 workload and can confirm Yanmin's findings that
    v2.6.26-rc1 regresses over v2.6.25 - by over 67% here.
    
    Looking at the workload i found and fixed what i believe to be the real
    bug causing the AIM7 regression: it was inefficient wakeup / scheduling
    / locking behavior of the new generic semaphore code, causing suboptimal
    performance.
    
    The problem comes from the following code. The new semaphore code does
    this on down():
    
            spin_lock_irqsave(&sem->lock, flags);
            if (likely(sem->count > 0))
                    sem->count--;
            else
                    __down(sem);
            spin_unlock_irqrestore(&sem->lock, flags);
    
    and this on up():
    
            spin_lock_irqsave(&sem->lock, flags);
            if (likely(list_empty(&sem->wait_list)))
                    sem->count++;
            else
                    __up(sem);
            spin_unlock_irqrestore(&sem->lock, flags);
    
    where __up() does:
    
            list_del(&waiter->list);
            waiter->up = 1;
            wake_up_process(waiter->task);
    
    and where __down() does this in essence:
    
            list_add_tail(&waiter.list, &sem->wait_list);
            waiter.task = task;
            waiter.up = 0;
            for (;;) {
                    [...]
                    spin_unlock_irq(&sem->lock);
                    timeout = schedule_timeout(timeout);
                    spin_lock_irq(&sem->lock);
                    if (waiter.up)
                            return 0;
            }
    
    the fastpath looks good and obvious, but note the following property of
    the contended path: if there's a task on the ->wait_list, the up() of
    the current owner will "pass over" ownership to that waiting task, in a
    wake-one manner, via the waiter->up flag and by removing the waiter from
    the wait list.
    
    That is all fine in principle, but as implemented in
    kernel/semaphore.c it also creates a nasty, hidden source of contention!
    
    The contention comes from the following property of the new semaphore
    code: the new owner owns the semaphore exclusively, even if it is not
    running yet.
    
    So if the old owner, even if just a few instructions later, does a
    down() [lock_kernel()] again, it will be blocked and will have to wait
    on the new owner to eventually be scheduled (possibly on another CPU)!
    Or if another task gets to lock_kernel() sooner than the "new owner"
    is scheduled, it will be blocked unnecessarily and for a very long time
    when there are 2000 tasks running.
    
    I.e. the implementation of the new semaphore code does wake-one and
    lock ownership in a very restrictive way - it does not allow
    opportunistic re-locking of the lock at all and keeps the scheduler from
    picking task order intelligently.
    
    This kind of scheduling, with 2000 AIM7 processes running, creates awful
    cross-scheduling between those 2000 tasks, causes reduced parallelism, a
    throttled runqueue length and a lot of idle time. With increasing number
    of CPUs it causes an exponentially worse behavior in AIM7, as the chance
    for a newly woken new-owner task to actually run anytime soon is less
    and less likely.
    
    Note that it takes just a tiny bit of contention for the 'new-semaphore
    catastrophe' to happen: the wakeup latencies get added to whatever small
    contention there is, and quickly snowball out of control!
    
    I believe Yanmin's findings and numbers support this analysis too.
    
    The best fix for this problem is to use the same scheduling logic that
    the kernel/mutex.c code uses: keep the wake-one behavior (that is OK and
    wanted because we do not want to over-schedule), but also allow
    opportunistic locking of the lock even if a wakee is already "in
    flight".
    
    The patch below implements this new logic. With this patch applied the
    AIM7 regression is largely fixed on my quad testbox:
    
      # v2.6.25 vanilla:
      ..................
      Tasks   Jobs/Min        JTI     Real    CPU     Jobs/sec/task
      2000    56096.4         91      207.5   789.7   0.4675
      2000    55894.4         94      208.2   792.7   0.4658
    
      # v2.6.26-rc1-166-gc0a1811 vanilla:
      ...................................
      Tasks   Jobs/Min        JTI     Real    CPU     Jobs/sec/task
      2000    33230.6         83      350.3   784.5   0.2769
      2000    31778.1         86      366.3   783.6   0.2648
    
      # v2.6.26-rc1-166-gc0a1811 + semaphore-speedup:
      ...............................................
      Tasks   Jobs/Min        JTI     Real    CPU     Jobs/sec/task
      2000    55707.1         92      209.0   795.6   0.4642
      2000    55704.4         96      209.0   796.0   0.4642
    
    i.e. a 67% speedup. We are now back to within 1% of the v2.6.25
    performance levels and have zero idle time during the test, as expected.
    
    Btw., interactivity also improved dramatically with the fix - for
    example console-switching became almost instantaneous during this
    workload (which after all is running 2000 tasks at once!), without the
    patch it was stuck for a minute at times.
    
    There's another nice side-effect of this speedup patch, the new generic
    semaphore code got even smaller:
    
       text    data     bss     dec     hex filename
       1241       0       0    1241     4d9 semaphore.o.before
       1207       0       0    1207     4b7 semaphore.o.after
    
    (because the waiter.up complication got removed.)
    
    Longer-term we should look into using the mutex code for the generic
    semaphore code as well - but it's not easy due to legacies and it's
    outside of the scope of v2.6.26 and outside the scope of this patch as
    well.
    
    Bisected-by: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

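Condensed from the diff below, the patched fast and slow paths of down()/up() amount to:

	/* down(): the count is consumed only by the task that actually
	 * proceeds, so a task arriving while the wakee is still asleep can
	 * opportunistically take the semaphore */
	spin_lock_irqsave(&sem->lock, flags);
	if (unlikely(!sem->count))
		__down(sem);		/* sleeps until sem->count > 0 */
	sem->count--;
	spin_unlock_irqrestore(&sem->lock, flags);

	/* up(): always publish the count, then wake one waiter if any */
	spin_lock_irqsave(&sem->lock, flags);
	sem->count++;
	if (unlikely(!list_empty(&sem->wait_list)))
		__up(sem);		/* wake_up_process() on the head waiter */
	spin_unlock_irqrestore(&sem->lock, flags);
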
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index 5c2942e768cd..5e41217239e8 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -54,10 +54,9 @@ void down(struct semaphore *sem)
 	unsigned long flags;
 
 	spin_lock_irqsave(&sem->lock, flags);
-	if (likely(sem->count > 0))
-		sem->count--;
-	else
+	if (unlikely(!sem->count))
 		__down(sem);
+	sem->count--;
 	spin_unlock_irqrestore(&sem->lock, flags);
 }
 EXPORT_SYMBOL(down);
@@ -77,10 +76,10 @@ int down_interruptible(struct semaphore *sem)
 	int result = 0;
 
 	spin_lock_irqsave(&sem->lock, flags);
-	if (likely(sem->count > 0))
-		sem->count--;
-	else
+	if (unlikely(!sem->count))
 		result = __down_interruptible(sem);
+	if (!result)
+		sem->count--;
 	spin_unlock_irqrestore(&sem->lock, flags);
 
 	return result;
@@ -103,10 +102,10 @@ int down_killable(struct semaphore *sem)
 	int result = 0;
 
 	spin_lock_irqsave(&sem->lock, flags);
-	if (likely(sem->count > 0))
-		sem->count--;
-	else
+	if (unlikely(!sem->count))
 		result = __down_killable(sem);
+	if (!result)
+		sem->count--;
 	spin_unlock_irqrestore(&sem->lock, flags);
 
 	return result;
@@ -157,10 +156,10 @@ int down_timeout(struct semaphore *sem, long jiffies)
 	int result = 0;
 
 	spin_lock_irqsave(&sem->lock, flags);
-	if (likely(sem->count > 0))
-		sem->count--;
-	else
+	if (unlikely(!sem->count))
 		result = __down_timeout(sem, jiffies);
+	if (!result)
+		sem->count--;
 	spin_unlock_irqrestore(&sem->lock, flags);
 
 	return result;
@@ -179,9 +178,8 @@ void up(struct semaphore *sem)
 	unsigned long flags;
 
 	spin_lock_irqsave(&sem->lock, flags);
-	if (likely(list_empty(&sem->wait_list)))
-		sem->count++;
-	else
+	sem->count++;
+	if (unlikely(!list_empty(&sem->wait_list)))
 		__up(sem);
 	spin_unlock_irqrestore(&sem->lock, flags);
 }
@@ -192,7 +190,6 @@ EXPORT_SYMBOL(up);
 struct semaphore_waiter {
 	struct list_head list;
 	struct task_struct *task;
-	int up;
 };
 
 /*
@@ -205,33 +202,34 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
 {
 	struct task_struct *task = current;
 	struct semaphore_waiter waiter;
+	int ret = 0;
 
-	list_add_tail(&waiter.list, &sem->wait_list);
 	waiter.task = task;
-	waiter.up = 0;
+	list_add_tail(&waiter.list, &sem->wait_list);
 
 	for (;;) {
-		if (state == TASK_INTERRUPTIBLE && signal_pending(task))
-			goto interrupted;
-		if (state == TASK_KILLABLE && fatal_signal_pending(task))
-			goto interrupted;
-		if (timeout <= 0)
-			goto timed_out;
+		if (state == TASK_INTERRUPTIBLE && signal_pending(task)) {
+			ret = -EINTR;
+			break;
+		}
+		if (state == TASK_KILLABLE && fatal_signal_pending(task)) {
+			ret = -EINTR;
+			break;
+		}
+		if (timeout <= 0) {
+			ret = -ETIME;
+			break;
+		}
 		__set_task_state(task, state);
 		spin_unlock_irq(&sem->lock);
 		timeout = schedule_timeout(timeout);
 		spin_lock_irq(&sem->lock);
-		if (waiter.up)
-			return 0;
+		if (sem->count > 0)
+			break;
 	}
 
- timed_out:
-	list_del(&waiter.list);
-	return -ETIME;
-
- interrupted:
 	list_del(&waiter.list);
-	return -EINTR;
+	return ret;
 }
 
 static noinline void __sched __down(struct semaphore *sem)
@@ -258,7 +256,5 @@ static noinline void __sched __up(struct semaphore *sem)
 {
 	struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
 						struct semaphore_waiter, list);
-	list_del(&waiter->list);
-	waiter->up = 1;
 	wake_up_process(waiter->task);
 }

commit a5574cf65b5f03ce9ade3918764fe22e5e2371e3
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon May 5 23:19:50 2008 +0200

    sched, x86: add HAVE_UNSTABLE_SCHED_CLOCK
    
    add the HAVE_UNSTABLE_SCHED_CLOCK Kconfig option, for architectures to select.
    
    the next change utilizes it.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 845ea2b2d487..bbcafaa160c0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -18,6 +18,7 @@ config X86_64
 ### Arch settings
 config X86
 	def_bool y
+	select HAVE_UNSTABLE_SCHED_CLOCK
 	select HAVE_IDE
 	select HAVE_OPROFILE
 	select HAVE_KPROBES
diff --git a/init/Kconfig b/init/Kconfig
index f0e62e5ce0dc..fa42e6b549d3 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -316,6 +316,12 @@ config CPUSETS
 
 	  Say N if unsure.
 
+#
+# Architectures with an unreliable sched_clock() should select this:
+#
+config HAVE_UNSTABLE_SCHED_CLOCK
+	bool
+
 config GROUP_SCHED
 	bool "Group CPU scheduler"
 	default y

commit dfbf4a1bc319f0f9a31e39b2da1fa5c55e85af89
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed Apr 23 09:24:06 2008 +0200

    sched: fix cpu clock
    
    David Miller pointed out that nothing in cpu_clock() sets
    prev_cpu_time. This caused __sync_cpu_clock() to be called
    all the time - against the intention of this code.
    
    The result was that in practice we hit a global spinlock every
    time cpu_clock() is called - which, even though cpu_clock()
    is used for tracing and debugging, is suboptimal.
    
    While at it, also:
    
    - move the irq disabling to the outermost layer,
      this should make cpu_clock() warp-free when called with irqs
      enabled.
    
    - use long long instead of cycles_t - for platforms where cycles_t
      is 32-bit.
    
    Reported-by: David Miller <davem@davemloft.net>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

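Condensed, cpu_clock() after the diff below reads as follows (local variable names shortened here for brevity):

	unsigned long long cpu_clock(int cpu)
	{
		unsigned long long prev, time, delta;
		unsigned long flags;

		/* irqs are disabled across the whole read, keeping it warp-free */
		local_irq_save(flags);

		prev  = per_cpu(prev_cpu_time, cpu);
		time  = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
		delta = time - prev;

		if (unlikely(delta > time_sync_thresh)) {
			time = __sync_cpu_clock(time, cpu);
			/* the previously missing update: without it every call
			 * ended up taking the global time_sync_lock */
			per_cpu(prev_cpu_time, cpu) = time;
		}

		local_irq_restore(flags);
		return time;
	}
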
diff --git a/kernel/sched.c b/kernel/sched.c
index f98f75f3c708..9457106b18af 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -910,11 +910,14 @@ static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
 static DEFINE_SPINLOCK(time_sync_lock);
 static unsigned long long prev_global_time;
 
-static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
+static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&time_sync_lock, flags);
+	/*
+	 * We want this inlined, to not get tracer function calls
+	 * in this critical section:
+	 */
+	spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_);
+	__raw_spin_lock(&time_sync_lock.raw_lock);
 
 	if (time < prev_global_time) {
 		per_cpu(time_offset, cpu) += prev_global_time - time;
@@ -923,7 +926,8 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
 		prev_global_time = time;
 	}
 
-	spin_unlock_irqrestore(&time_sync_lock, flags);
+	__raw_spin_unlock(&time_sync_lock.raw_lock);
+	spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_);
 
 	return time;
 }
@@ -931,7 +935,6 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
 static unsigned long long __cpu_clock(int cpu)
 {
 	unsigned long long now;
-	unsigned long flags;
 	struct rq *rq;
 
 	/*
@@ -941,11 +944,9 @@ static unsigned long long __cpu_clock(int cpu)
 	if (unlikely(!scheduler_running))
 		return 0;
 
-	local_irq_save(flags);
 	rq = cpu_rq(cpu);
 	update_rq_clock(rq);
 	now = rq->clock;
-	local_irq_restore(flags);
 
 	return now;
 }
@@ -957,13 +958,18 @@ static unsigned long long __cpu_clock(int cpu)
 unsigned long long cpu_clock(int cpu)
 {
 	unsigned long long prev_cpu_time, time, delta_time;
+	unsigned long flags;
 
+	local_irq_save(flags);
 	prev_cpu_time = per_cpu(prev_cpu_time, cpu);
 	time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
 	delta_time = time-prev_cpu_time;
 
-	if (unlikely(delta_time > time_sync_thresh))
+	if (unlikely(delta_time > time_sync_thresh)) {
 		time = __sync_cpu_clock(time, cpu);
+		per_cpu(prev_cpu_time, cpu) = time;
+	}
+	local_irq_restore(flags);
 
 	return time;
 }

commit 690229a0912ca2fef8b542fe4d8b73acfcdc6e24
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed Apr 23 09:31:35 2008 +0200

    sched: make clock sync tunable by architecture code
    
    make time_sync_thresh tunable by architecture code.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

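With the symbol exported through sched.h, architecture setup code can simply assign to it. A minimal sketch follows; the hook name is hypothetical and not something this patch adds:

	#include <linux/init.h>
	#include <linux/sched.h>

	/* hypothetical arch hook: raise the threshold on a platform whose
	 * per-CPU clocks drift slowly, so cpu_clock() has to take the
	 * global time_sync_lock less often (the generic default is 100000) */
	void __init example_arch_tune_sched_clock(void)
	{
		time_sync_thresh = 1000000;
	}
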
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 698b5a4d25a7..54c9ca26b7d8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -158,6 +158,8 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 }
 #endif
 
+extern unsigned long long time_sync_thresh;
+
 /*
  * Task state bitmask. NOTE! These bits are also
  * encoded in fs/proc/array.c: get_task_state().
diff --git a/kernel/sched.c b/kernel/sched.c
index 3ac3d7af04a1..8f433fedfcb3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -899,7 +899,7 @@ static inline u64 global_rt_runtime(void)
 	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
 }
 
-static const unsigned long long time_sync_thresh = 100000;
+unsigned long long time_sync_thresh = 100000;
 
 static DEFINE_PER_CPU(unsigned long long, time_offset);
 static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);

commit 733a0771df46af942b8355cd8bb15780106b4353
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Apr 28 14:05:18 2008 +0200

    sched: remove old sched doc
    
    Fabio Checconi noticed that Documentation/scheduler/sched-design.txt was
    a stale description of the old scheduler. Remove it.
    
    Reported-by: Fabio Checconi <fabio@gandalf.sssup.it>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/scheduler/sched-design.txt b/Documentation/scheduler/sched-design.txt
deleted file mode 100644
index 1605bf0cba8b..000000000000
--- a/Documentation/scheduler/sched-design.txt
+++ /dev/null
@@ -1,165 +0,0 @@
-		   Goals, Design and Implementation of the
-		      new ultra-scalable O(1) scheduler
-
-
-  This is an edited version of an email Ingo Molnar sent to
-  lkml on 4 Jan 2002.  It describes the goals, design, and
-  implementation of Ingo's new ultra-scalable O(1) scheduler.
-  Last Updated: 18 April 2002.
-
-
-Goal
-====
-
-The main goal of the new scheduler is to keep all the good things we know
-and love about the current Linux scheduler:
-
- - good interactive performance even during high load: if the user
-   types or clicks then the system must react instantly and must execute
-   the user tasks smoothly, even during considerable background load.
-
- - good scheduling/wakeup performance with 1-2 runnable processes.
-
- - fairness: no process should stay without any timeslice for any
-   unreasonable amount of time. No process should get an unjustly high
-   amount of CPU time.
-
- - priorities: less important tasks can be started with lower priority,
-   more important tasks with higher priority.
-
- - SMP efficiency: no CPU should stay idle if there is work to do.
-
- - SMP affinity: processes which run on one CPU should stay affine to
-   that CPU. Processes should not bounce between CPUs too frequently.
-
- - plus additional scheduler features: RT scheduling, CPU binding.
-
-and the goal is also to add a few new things:
-
- - fully O(1) scheduling. Are you tired of the recalculation loop
-   blowing the L1 cache away every now and then? Do you think the goodness
-   loop is taking a bit too long to finish if there are lots of runnable
-   processes? This new scheduler takes no prisoners: wakeup(), schedule(),
-   the timer interrupt are all O(1) algorithms. There is no recalculation
-   loop. There is no goodness loop either.
-
- - 'perfect' SMP scalability. With the new scheduler there is no 'big'
-   runqueue_lock anymore - it's all per-CPU runqueues and locks - two
-   tasks on two separate CPUs can wake up, schedule and context-switch
-   completely in parallel, without any interlocking. All
-   scheduling-relevant data is structured for maximum scalability.
-
- - better SMP affinity. The old scheduler has a particular weakness that
-   causes the random bouncing of tasks between CPUs if/when higher
-   priority/interactive tasks, this was observed and reported by many
-   people. The reason is that the timeslice recalculation loop first needs
-   every currently running task to consume its timeslice. But when this
-   happens on eg. an 8-way system, then this property starves an
-   increasing number of CPUs from executing any process. Once the last
-   task that has a timeslice left has finished using up that timeslice,
-   the recalculation loop is triggered and other CPUs can start executing
-   tasks again - after having idled around for a number of timer ticks.
-   The more CPUs, the worse this effect.
-
-   Furthermore, this same effect causes the bouncing effect as well:
-   whenever there is such a 'timeslice squeeze' of the global runqueue,
-   idle processors start executing tasks which are not affine to that CPU.
-   (because the affine tasks have finished off their timeslices already.)
-
-   The new scheduler solves this problem by distributing timeslices on a
-   per-CPU basis, without having any global synchronization or
-   recalculation.
-
- - batch scheduling. A significant proportion of computing-intensive tasks
-   benefit from batch-scheduling, where timeslices are long and processes
-   are roundrobin scheduled. The new scheduler does such batch-scheduling
-   of the lowest priority tasks - so nice +19 jobs will get
-   'batch-scheduled' automatically. With this scheduler, nice +19 jobs are
-   in essence SCHED_IDLE, from an interactiveness point of view.
-
- - handle extreme loads more smoothly, without breakdown and scheduling
-   storms.
-
- - O(1) RT scheduling. For those RT folks who are paranoid about the
-   O(nr_running) property of the goodness loop and the recalculation loop.
-
- - run fork()ed children before the parent. Andrea has pointed out the
-   advantages of this a few months ago, but patches for this feature
-   do not work with the old scheduler as well as they should,
-   because idle processes often steal the new child before the fork()ing
-   CPU gets to execute it.
-
-
-Design
-======
-
-The core of the new scheduler contains the following mechanisms:
-
- - *two* priority-ordered 'priority arrays' per CPU. There is an 'active'
-   array and an 'expired' array. The active array contains all tasks that
-   are affine to this CPU and have timeslices left. The expired array
-   contains all tasks which have used up their timeslices - but this array
-   is kept sorted as well. The active and expired array is not accessed
-   directly, it's accessed through two pointers in the per-CPU runqueue
-   structure. If all active tasks are used up then we 'switch' the two
-   pointers and from now on the ready-to-go (former-) expired array is the
-   active array - and the empty active array serves as the new collector
-   for expired tasks.
-
- - there is a 64-bit bitmap cache for array indices. Finding the highest
-   priority task is thus a matter of two x86 BSFL bit-search instructions.
-
-the split-array solution enables us to have an arbitrary number of active
-and expired tasks, and the recalculation of timeslices can be done
-immediately when the timeslice expires. Because the arrays are always
-access through the pointers in the runqueue, switching the two arrays can
-be done very quickly.
-
-this is a hybride priority-list approach coupled with roundrobin
-scheduling and the array-switch method of distributing timeslices.
-
- - there is a per-task 'load estimator'.
-
-one of the toughest things to get right is good interactive feel during
-heavy system load. While playing with various scheduler variants i found
-that the best interactive feel is achieved not by 'boosting' interactive
-tasks, but by 'punishing' tasks that want to use more CPU time than there
-is available. This method is also much easier to do in an O(1) fashion.
-
-to establish the actual 'load' the task contributes to the system, a
-complex-looking but pretty accurate method is used: there is a 4-entry
-'history' ringbuffer of the task's activities during the last 4 seconds.
-This ringbuffer is operated without much overhead. The entries tell the
-scheduler a pretty accurate load-history of the task: has it used up more
-CPU time or less during the past N seconds. [the size '4' and the interval
-of 4x 1 seconds was found by lots of experimentation - this part is
-flexible and can be changed in both directions.]
-
-the penalty a task gets for generating more load than the CPU can handle
-is a priority decrease - there is a maximum amount to this penalty
-relative to their static priority, so even fully CPU-bound tasks will
-observe each other's priorities, and will share the CPU accordingly.
-
-the SMP load-balancer can be extended/switched with additional parallel
-computing and cache hierarchy concepts: NUMA scheduling, multi-core CPUs
-can be supported easily by changing the load-balancer. Right now it's
-tuned for my SMP systems.
-
-i skipped the prev->mm == next->mm advantage - no workload i know of shows
-any sensitivity to this. It can be added back by sacrificing O(1)
-schedule() [the current and one-lower priority list can be searched for a
-that->mm == current->mm condition], but costs a fair number of cycles
-during a number of important workloads, so i wanted to avoid this as much
-as possible.
-
-- the SMP idle-task startup code was still racy and the new scheduler
-triggered this. So i streamlined the idle-setup code a bit. We do not call
-into schedule() before all processors have started up fully and all idle
-threads are in place.
-
-- the patch also cleans up a number of aspects of sched.c - moves code
-into other areas of the kernel where it's appropriate, and simplifies
-certain code paths and data constructs. As a result, the new scheduler's
-code is smaller than the old one.
-
-	Ingo

commit 7a1aa309f21ea2f6c31f364341e4027ecf4e79bc
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon May 5 01:06:54 2008 -0700

    irda: fix !PNP support for drivers/net/irda/smsc-ircc2.c
    
    x86.git testing found this build bug on v2.6.26-rc1:
    
      ERROR: "pnp_get_resource" [drivers/net/irda/smsc-ircc2.ko] undefined!
      make[1]: *** [__modpost] Error 1
      make: *** [modules] Error 2
    
    the driver did not anticipate the case of !CONFIG_PNP which is rare but
    still possible. Instead of restricting the driver to PNP-only in the
    Kconfig space, add the (trivial) dummy struct pnp_driver - this is what
    other drivers use in the !PNP case too.
    
    The driver itself can in theory be initialized on !PNP too in certain
    cases, via smsc_ircc_legacy_probe().
    
    Patch only minimally build tested, i don't have this hardware.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
    Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/irda/smsc-ircc2.c b/drivers/net/irda/smsc-ircc2.c
index 1f26da761e9f..cfe0194fef71 100644
--- a/drivers/net/irda/smsc-ircc2.c
+++ b/drivers/net/irda/smsc-ircc2.c
@@ -376,6 +376,7 @@ MODULE_DEVICE_TABLE(pnp, smsc_ircc_pnp_table);
 
 static int pnp_driver_registered;
 
+#ifdef CONFIG_PNP
 static int __init smsc_ircc_pnp_probe(struct pnp_dev *dev,
 				      const struct pnp_device_id *dev_id)
 {
@@ -402,7 +403,9 @@ static struct pnp_driver smsc_ircc_pnp_driver = {
 	.id_table	= smsc_ircc_pnp_table,
 	.probe		= smsc_ircc_pnp_probe,
 };
-
+#else /* CONFIG_PNP */
+static struct pnp_driver smsc_ircc_pnp_driver;
+#endif
 
 /*******************************************************************************
  *