Patches contributed by Eötvös Loránd University


commit 8a6c160a2a13d82c75a50af7282b906cce948df5
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed Apr 30 22:13:44 2008 +0200

    x86: redo thread_info.h change
    
    redo Roland's "signals: x86 TS_RESTORE_SIGMASK" on top of the unified
    thread_info.h file.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

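For context, a sigsuspend-style caller would use the helper added below roughly as follows; this is a condensed, hypothetical sketch modelled on sys_rt_sigsuspend(), not part of the patch:

	#include <linux/sched.h>
	#include <linux/signal.h>
	#include <linux/thread_info.h>

	/* illustrative only: save the old mask, install the temporary one,
	 * sleep, then ask the signal-delivery path to restore the old mask */
	static long example_rt_sigsuspend(sigset_t *newset)
	{
		current->saved_sigmask = current->blocked;
		sigprocmask(SIG_SETMASK, newset, NULL);

		set_current_state(TASK_INTERRUPTIBLE);
		schedule();

		/* with this patch the "restore sigmask" bit lives in
		 * ti->status (TS_RESTORE_SIGMASK), not in ti->flags */
		set_restore_sigmask();
		return -ERESTARTNOHAND;
	}
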
diff --git a/include/asm-x86/thread_info.h b/include/asm-x86/thread_info.h
index 348f0e0faa3b..74481b72ae0f 100644
--- a/include/asm-x86/thread_info.h
+++ b/include/asm-x86/thread_info.h
@@ -80,7 +80,6 @@ struct thread_info {
 #endif
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
-#define TIF_RESTORE_SIGMASK	9	/* restore signal mask in do_signal */
 #define TIF_MCE_NOTIFY		10	/* notify userspace of an MCE */
 #define TIF_HRTICK_RESCHED	11	/* reprogram hrtick timer */
 #define TIF_NOTSC		16	/* TSC is not accessible in userland */
@@ -108,7 +107,6 @@ struct thread_info {
 #endif
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
-#define _TIF_RESTORE_SIGMASK	(1 << TIF_RESTORE_SIGMASK)
 #define _TIF_MCE_NOTIFY		(1 << TIF_MCE_NOTIFY)
 #define _TIF_HRTICK_RESCHED	(1 << TIF_HRTICK_RESCHED)
 #define _TIF_NOTSC		(1 << TIF_NOTSC)
@@ -237,9 +235,20 @@ static inline struct thread_info *stack_thread_info(void)
 #define TS_COMPAT		0x0002	/* 32bit syscall active (64BIT)*/
 #define TS_POLLING		0x0004	/* true if in idle loop
 					   and not sleeping */
+#define TS_RESTORE_SIGMASK	0x0008	/* restore signal mask in do_signal() */
 
 #define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING)
 
+#ifndef __ASSEMBLY__
+#define HAVE_SET_RESTORE_SIGMASK	1
+static inline void set_restore_sigmask(void)
+{
+	struct thread_info *ti = current_thread_info();
+	ti->status |= TS_RESTORE_SIGMASK;
+	set_bit(TIF_SIGPENDING, &ti->flags);
+}
+#endif	/* !__ASSEMBLY__ */
+
 #ifndef __ASSEMBLY__
 extern void arch_task_cache_init(void);
 extern void free_thread_info(struct thread_info *ti);

commit 1c7d06d419dbe82c76fbb4d3e1fa61b2da2dc00b
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed Apr 30 22:12:05 2008 +0200

    revert: thread_info.h change
    
    temporarily revert parts of "signals: x86 TS_RESTORE_SIGMASK".
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/asm-x86/thread_info_32.h b/include/asm-x86/thread_info_32.h
index b6338829d1a8..531859962096 100644
--- a/include/asm-x86/thread_info_32.h
+++ b/include/asm-x86/thread_info_32.h
@@ -131,6 +131,7 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_SYSCALL_EMU		5	/* syscall emulation active */
 #define TIF_SYSCALL_AUDIT	6	/* syscall auditing active */
 #define TIF_SECCOMP		7	/* secure computing */
+#define TIF_RESTORE_SIGMASK	8	/* restore signal mask in do_signal() */
 #define TIF_HRTICK_RESCHED	9	/* reprogram hrtick timer */
 #define TIF_MEMDIE		16
 #define TIF_DEBUG		17	/* uses debug registers */
@@ -150,6 +151,7 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_SYSCALL_EMU	(1 << TIF_SYSCALL_EMU)
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
+#define _TIF_RESTORE_SIGMASK	(1 << TIF_RESTORE_SIGMASK)
 #define _TIF_HRTICK_RESCHED	(1 << TIF_HRTICK_RESCHED)
 #define _TIF_DEBUG		(1 << TIF_DEBUG)
 #define _TIF_IO_BITMAP		(1 << TIF_IO_BITMAP)
@@ -186,20 +188,9 @@ static inline struct thread_info *current_thread_info(void)
 					   this quantum (SMP) */
 #define TS_POLLING		0x0002	/* True if in idle loop
 					   and not sleeping */
-#define TS_RESTORE_SIGMASK	0x0004	/* restore signal mask in do_signal() */
 
 #define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING)
 
-#ifndef __ASSEMBLY__
-#define HAVE_SET_RESTORE_SIGMASK	1
-static inline void set_restore_sigmask(void)
-{
-	struct thread_info *ti = current_thread_info();
-	ti->status |= TS_RESTORE_SIGMASK;
-	set_bit(TIF_SIGPENDING, &ti->flags);
-}
-#endif	/* !__ASSEMBLY__ */
-
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_THREAD_INFO_H */
diff --git a/include/asm-x86/thread_info_64.h b/include/asm-x86/thread_info_64.h
index cb69f70abba1..ed664e874dec 100644
--- a/include/asm-x86/thread_info_64.h
+++ b/include/asm-x86/thread_info_64.h
@@ -109,6 +109,7 @@ static inline struct thread_info *stack_thread_info(void)
 #define TIF_IRET		5	/* force IRET */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
+#define TIF_RESTORE_SIGMASK	9	/* restore signal mask in do_signal */
 #define TIF_MCE_NOTIFY		10	/* notify userspace of an MCE */
 #define TIF_HRTICK_RESCHED	11	/* reprogram hrtick timer */
 /* 16 free */
@@ -132,6 +133,7 @@ static inline struct thread_info *stack_thread_info(void)
 #define _TIF_IRET		(1 << TIF_IRET)
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
+#define _TIF_RESTORE_SIGMASK	(1 << TIF_RESTORE_SIGMASK)
 #define _TIF_MCE_NOTIFY		(1 << TIF_MCE_NOTIFY)
 #define _TIF_HRTICK_RESCHED	(1 << TIF_HRTICK_RESCHED)
 #define _TIF_IA32		(1 << TIF_IA32)
@@ -176,20 +178,9 @@ static inline struct thread_info *stack_thread_info(void)
 #define TS_COMPAT		0x0002	/* 32bit syscall active */
 #define TS_POLLING		0x0004	/* true if in idle loop
 					   and not sleeping */
-#define TS_RESTORE_SIGMASK	0x0008	/* restore signal mask in do_signal() */
 
 #define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING)
 
-#ifndef __ASSEMBLY__
-#define HAVE_SET_RESTORE_SIGMASK	1
-static inline void set_restore_sigmask(void)
-{
-	struct thread_info *ti = current_thread_info();
-	ti->status |= TS_RESTORE_SIGMASK;
-	set_bit(TIF_SIGPENDING, &ti->flags);
-}
-#endif	/* !__ASSEMBLY__ */
-
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_THREAD_INFO_H */

commit 82fd866701881623d69fe280dbac06ddff1fdef9
Author: Ingo Molnar <mingo@elte.hu>
Date:   Thu May 1 03:46:22 2008 +0200

    x86: rdc: leds build/config fix
    
    select NEW_LEDS for now until the Kconfig dependencies have been
    fixed.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
    Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 42109f119df7..fe361ae7ef2f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -335,6 +335,7 @@ config X86_RDC321X
 	select GENERIC_GPIO
 	select LEDS_CLASS
 	select LEDS_GPIO
+	select NEW_LEDS
 	help
 	  This option is needed for RDC R-321x system-on-chip, also known
 	  as R-8610-(G).

commit f7c83a0aaa772f8d0189fa197d77c762caaa367a
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed Apr 30 09:48:07 2008 +0200

    Fix drivers/media build for modular builds
    
    Fix allmodconfig build bug introduced in latest -git by commit
    7c91f0624a9 ("V4L/DVB(7767): Move tuners to common/tuners"):
    
      LD      kernel/built-in.o
      LD      drivers/built-in.o
      ld: drivers/media/built-in.o: No such file: No such file or directory
    
    which happens if all media drivers are modular:
    
      http://redhat.com/~mingo/misc/config-Wed_Apr_30_09_24_48_CEST_2008.bad
    
    In that case there's no obj-y rule connecting all the built-in.o files and
    the link tree breaks.
    
    The fix is to add a guaranteed obj-y rule for the core vmlinux to build.
    (which results in an empty object file if all media drivers are modular)
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
    Acked-by: Sam Ravnborg <sam@ravnborg.org>
    Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/media/Makefile b/drivers/media/Makefile
index 73f742c7e818..cc11c4c0e7e7 100644
--- a/drivers/media/Makefile
+++ b/drivers/media/Makefile
@@ -2,6 +2,8 @@
 # Makefile for the kernel multimedia device drivers.
 #
 
+obj-y := common/
+
 obj-$(CONFIG_VIDEO_MEDIA) += common/
 
 # Since hybrid devices are here, should be compiled if DVB and/or V4L

commit bf726eab3711cf192405d21688a4b21e07b6188a
Author: Ingo Molnar <mingo@elte.hu>
Date:   Thu May 8 11:53:48 2008 +0200

    semaphore: fix
    
    Yanmin Zhang reported:
    
    | Comparing with kernel 2.6.25, AIM7 (use tmpfs) has a sizeable
    | regression under 2.6.26-rc1 on my 8-core stoakley, 16-core tigerton,
    | and Itanium Montecito. Bisect located the patch below:
    |
    | 64ac24e738823161693bf791f87adc802cf529ff is first bad commit
    | commit 64ac24e738823161693bf791f87adc802cf529ff
    | Author: Matthew Wilcox <matthew@wil.cx>
    | Date:   Fri Mar 7 21:55:58 2008 -0500
    |
    |     Generic semaphore implementation
    |
    | After I manually reverted the patch against 2.6.26-rc1 while fixing
    | lots of conflicts/errors, aim7 regression became less than 2%.
    
    i reproduced the AIM7 workload and can confirm Yanmin's findings that
    v2.6.26-rc1 regresses over v2.6.25 - by over 67% here.
    
    Looking at the workload i found and fixed what i believe to be the real
    bug causing the AIM7 regression: it was inefficient wakeup / scheduling
    / locking behavior of the new generic semaphore code, causing suboptimal
    performance.
    
    The problem comes from the following code. The new semaphore code does
    this on down():
    
            spin_lock_irqsave(&sem->lock, flags);
            if (likely(sem->count > 0))
                    sem->count--;
            else
                    __down(sem);
            spin_unlock_irqrestore(&sem->lock, flags);
    
    and this on up():
    
            spin_lock_irqsave(&sem->lock, flags);
            if (likely(list_empty(&sem->wait_list)))
                    sem->count++;
            else
                    __up(sem);
            spin_unlock_irqrestore(&sem->lock, flags);
    
    where __up() does:
    
            list_del(&waiter->list);
            waiter->up = 1;
            wake_up_process(waiter->task);
    
    and where __down() does this in essence:
    
            list_add_tail(&waiter.list, &sem->wait_list);
            waiter.task = task;
            waiter.up = 0;
            for (;;) {
                    [...]
                    spin_unlock_irq(&sem->lock);
                    timeout = schedule_timeout(timeout);
                    spin_lock_irq(&sem->lock);
                    if (waiter.up)
                            return 0;
            }
    
    the fastpath looks good and obvious, but note the following property of
    the contended path: if there's a task on the ->wait_list, the up() of
    the current owner will "pass over" ownership to that waiting task, in a
    wake-one manner, via the waiter->up flag and by removing the waiter from
    the wait list.
    
    That is all fine in principle, but as implemented in
    kernel/semaphore.c it also creates a nasty, hidden source of contention!
    
    The contention comes from the following property of the new semaphore
    code: the new owner owns the semaphore exclusively, even if it is not
    running yet.
    
    So if the old owner, even if just a few instructions later, does a
    down() [lock_kernel()] again, it will be blocked and will have to wait
    on the new owner to eventually be scheduled (possibly on another CPU)!
    Or if another task gets to lock_kernel() sooner than the "new owner"
    is scheduled, it will be blocked unnecessarily and for a very long time
    when there are 2000 tasks running.
    
    I.e. the implementation of the new semaphore code does wake-one and
    lock ownership in a very restrictive way - it does not allow
    opportunistic re-locking of the lock at all and keeps the scheduler from
    picking task order intelligently.
    
    This kind of scheduling, with 2000 AIM7 processes running, creates awful
    cross-scheduling between those 2000 tasks, causes reduced parallelism, a
    throttled runqueue length and a lot of idle time. With increasing number
    of CPUs it causes an exponentially worse behavior in AIM7, as the chance
    for a newly woken new-owner task to actually run anytime soon is less
    and less likely.
    
    Note that it takes just a tiny bit of contention for the 'new-semaphore
    catastrophe' to happen: the wakeup latencies get added to whatever small
    contention there is, and quickly snowball out of control!
    
    I believe Yanmin's findings and numbers support this analysis too.
    
    The best fix for this problem is to use the same scheduling logic that
    the kernel/mutex.c code uses: keep the wake-one behavior (that is OK and
    wanted because we do not want to over-schedule), but also allow
    opportunistic locking of the lock even if a wakee is already "in
    flight".
    
    The patch below implements this new logic. With this patch applied the
    AIM7 regression is largely fixed on my quad testbox:
    
      # v2.6.25 vanilla:
      ..................
      Tasks   Jobs/Min        JTI     Real    CPU     Jobs/sec/task
      2000    56096.4         91      207.5   789.7   0.4675
      2000    55894.4         94      208.2   792.7   0.4658
    
      # v2.6.26-rc1-166-gc0a1811 vanilla:
      ...................................
      Tasks   Jobs/Min        JTI     Real    CPU     Jobs/sec/task
      2000    33230.6         83      350.3   784.5   0.2769
      2000    31778.1         86      366.3   783.6   0.2648
    
      # v2.6.26-rc1-166-gc0a1811 + semaphore-speedup:
      ...............................................
      Tasks   Jobs/Min        JTI     Real    CPU     Jobs/sec/task
      2000    55707.1         92      209.0   795.6   0.4642
      2000    55704.4         96      209.0   796.0   0.4642
    
    i.e. a 67% speedup. We are now back to within 1% of the v2.6.25
    performance levels and have zero idle time during the test, as expected.
    
    Btw., interactivity also improved dramatically with the fix - for
    example console-switching became almost instantaneous during this
    workload (which after all is running 2000 tasks at once!), without the
    patch it was stuck for a minute at times.
    
    There's another nice side-effect of this speedup patch, the new generic
    semaphore code got even smaller:
    
       text    data     bss     dec     hex filename
       1241       0       0    1241     4d9 semaphore.o.before
       1207       0       0    1207     4b7 semaphore.o.after
    
    (because the waiter.up complication got removed.)
    
    Longer-term we should look into using the mutex code for the generic
    semaphore code as well - but it's not easy due to legacies and it's
    outside of the scope of v2.6.26 and outside the scope of this patch as
    well.
    
    Bisected-by: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

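Condensed from the diff below, the patched fast and slow paths of down()/up() amount to:

	/* down(): the count is consumed only by the task that actually
	 * proceeds, so a task arriving while the wakee is still asleep can
	 * opportunistically take the semaphore */
	spin_lock_irqsave(&sem->lock, flags);
	if (unlikely(!sem->count))
		__down(sem);		/* sleeps until sem->count > 0 */
	sem->count--;
	spin_unlock_irqrestore(&sem->lock, flags);

	/* up(): always publish the count, then wake one waiter if any */
	spin_lock_irqsave(&sem->lock, flags);
	sem->count++;
	if (unlikely(!list_empty(&sem->wait_list)))
		__up(sem);		/* wake_up_process() on the head waiter */
	spin_unlock_irqrestore(&sem->lock, flags);
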
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index 5c2942e768cd..5e41217239e8 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -54,10 +54,9 @@ void down(struct semaphore *sem)
 	unsigned long flags;
 
 	spin_lock_irqsave(&sem->lock, flags);
-	if (likely(sem->count > 0))
-		sem->count--;
-	else
+	if (unlikely(!sem->count))
 		__down(sem);
+	sem->count--;
 	spin_unlock_irqrestore(&sem->lock, flags);
 }
 EXPORT_SYMBOL(down);
@@ -77,10 +76,10 @@ int down_interruptible(struct semaphore *sem)
 	int result = 0;
 
 	spin_lock_irqsave(&sem->lock, flags);
-	if (likely(sem->count > 0))
-		sem->count--;
-	else
+	if (unlikely(!sem->count))
 		result = __down_interruptible(sem);
+	if (!result)
+		sem->count--;
 	spin_unlock_irqrestore(&sem->lock, flags);
 
 	return result;
@@ -103,10 +102,10 @@ int down_killable(struct semaphore *sem)
 	int result = 0;
 
 	spin_lock_irqsave(&sem->lock, flags);
-	if (likely(sem->count > 0))
-		sem->count--;
-	else
+	if (unlikely(!sem->count))
 		result = __down_killable(sem);
+	if (!result)
+		sem->count--;
 	spin_unlock_irqrestore(&sem->lock, flags);
 
 	return result;
@@ -157,10 +156,10 @@ int down_timeout(struct semaphore *sem, long jiffies)
 	int result = 0;
 
 	spin_lock_irqsave(&sem->lock, flags);
-	if (likely(sem->count > 0))
-		sem->count--;
-	else
+	if (unlikely(!sem->count))
 		result = __down_timeout(sem, jiffies);
+	if (!result)
+		sem->count--;
 	spin_unlock_irqrestore(&sem->lock, flags);
 
 	return result;
@@ -179,9 +178,8 @@ void up(struct semaphore *sem)
 	unsigned long flags;
 
 	spin_lock_irqsave(&sem->lock, flags);
-	if (likely(list_empty(&sem->wait_list)))
-		sem->count++;
-	else
+	sem->count++;
+	if (unlikely(!list_empty(&sem->wait_list)))
 		__up(sem);
 	spin_unlock_irqrestore(&sem->lock, flags);
 }
@@ -192,7 +190,6 @@ EXPORT_SYMBOL(up);
 struct semaphore_waiter {
 	struct list_head list;
 	struct task_struct *task;
-	int up;
 };
 
 /*
@@ -205,33 +202,34 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
 {
 	struct task_struct *task = current;
 	struct semaphore_waiter waiter;
+	int ret = 0;
 
-	list_add_tail(&waiter.list, &sem->wait_list);
 	waiter.task = task;
-	waiter.up = 0;
+	list_add_tail(&waiter.list, &sem->wait_list);
 
 	for (;;) {
-		if (state == TASK_INTERRUPTIBLE && signal_pending(task))
-			goto interrupted;
-		if (state == TASK_KILLABLE && fatal_signal_pending(task))
-			goto interrupted;
-		if (timeout <= 0)
-			goto timed_out;
+		if (state == TASK_INTERRUPTIBLE && signal_pending(task)) {
+			ret = -EINTR;
+			break;
+		}
+		if (state == TASK_KILLABLE && fatal_signal_pending(task)) {
+			ret = -EINTR;
+			break;
+		}
+		if (timeout <= 0) {
+			ret = -ETIME;
+			break;
+		}
 		__set_task_state(task, state);
 		spin_unlock_irq(&sem->lock);
 		timeout = schedule_timeout(timeout);
 		spin_lock_irq(&sem->lock);
-		if (waiter.up)
-			return 0;
+		if (sem->count > 0)
+			break;
 	}
 
- timed_out:
-	list_del(&waiter.list);
-	return -ETIME;
-
- interrupted:
 	list_del(&waiter.list);
-	return -EINTR;
+	return ret;
 }
 
 static noinline void __sched __down(struct semaphore *sem)
@@ -258,7 +256,5 @@ static noinline void __sched __up(struct semaphore *sem)
 {
 	struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
 						struct semaphore_waiter, list);
-	list_del(&waiter->list);
-	waiter->up = 1;
 	wake_up_process(waiter->task);
 }

commit a5574cf65b5f03ce9ade3918764fe22e5e2371e3
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon May 5 23:19:50 2008 +0200

    sched, x86: add HAVE_UNSTABLE_SCHED_CLOCK
    
    add the HAVE_UNSTABLE_SCHED_CLOCK Kconfig option, for architectures to select.
    
    the next change utilizes it.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 845ea2b2d487..bbcafaa160c0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -18,6 +18,7 @@ config X86_64
 ### Arch settings
 config X86
 	def_bool y
+	select HAVE_UNSTABLE_SCHED_CLOCK
 	select HAVE_IDE
 	select HAVE_OPROFILE
 	select HAVE_KPROBES
diff --git a/init/Kconfig b/init/Kconfig
index f0e62e5ce0dc..fa42e6b549d3 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -316,6 +316,12 @@ config CPUSETS
 
 	  Say N if unsure.
 
+#
+# Architectures with an unreliable sched_clock() should select this:
+#
+config HAVE_UNSTABLE_SCHED_CLOCK
+	bool
+
 config GROUP_SCHED
 	bool "Group CPU scheduler"
 	default y

commit dfbf4a1bc319f0f9a31e39b2da1fa5c55e85af89
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed Apr 23 09:24:06 2008 +0200

    sched: fix cpu clock
    
    David Miller pointed out that nothing in cpu_clock() sets
    prev_cpu_time. This caused __sync_cpu_clock() to be called
    all the time - against the intention of this code.
    
    The result was that in practice we hit a global spinlock every
    time cpu_clock() is called - which, even though cpu_clock()
    is used for tracing and debugging, is suboptimal.
    
    While at it, also:
    
    - move the irq disabling to the outermost layer,
      this should make cpu_clock() warp-free when called with irqs
      enabled.
    
    - use long long instead of cycles_t - for platforms where cycles_t
      is 32-bit.
    
    Reported-by: David Miller <davem@davemloft.net>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

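Condensed, cpu_clock() after the diff below reads as follows (local variable names shortened here for brevity):

	unsigned long long cpu_clock(int cpu)
	{
		unsigned long long prev, time, delta;
		unsigned long flags;

		/* irqs are disabled across the whole read, keeping it warp-free */
		local_irq_save(flags);

		prev  = per_cpu(prev_cpu_time, cpu);
		time  = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
		delta = time - prev;

		if (unlikely(delta > time_sync_thresh)) {
			time = __sync_cpu_clock(time, cpu);
			/* the previously missing update: without it every call
			 * ended up taking the global time_sync_lock */
			per_cpu(prev_cpu_time, cpu) = time;
		}

		local_irq_restore(flags);
		return time;
	}
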
diff --git a/kernel/sched.c b/kernel/sched.c
index f98f75f3c708..9457106b18af 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -910,11 +910,14 @@ static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
 static DEFINE_SPINLOCK(time_sync_lock);
 static unsigned long long prev_global_time;
 
-static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
+static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&time_sync_lock, flags);
+	/*
+	 * We want this inlined, to not get tracer function calls
+	 * in this critical section:
+	 */
+	spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_);
+	__raw_spin_lock(&time_sync_lock.raw_lock);
 
 	if (time < prev_global_time) {
 		per_cpu(time_offset, cpu) += prev_global_time - time;
@@ -923,7 +926,8 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
 		prev_global_time = time;
 	}
 
-	spin_unlock_irqrestore(&time_sync_lock, flags);
+	__raw_spin_unlock(&time_sync_lock.raw_lock);
+	spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_);
 
 	return time;
 }
@@ -931,7 +935,6 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
 static unsigned long long __cpu_clock(int cpu)
 {
 	unsigned long long now;
-	unsigned long flags;
 	struct rq *rq;
 
 	/*
@@ -941,11 +944,9 @@ static unsigned long long __cpu_clock(int cpu)
 	if (unlikely(!scheduler_running))
 		return 0;
 
-	local_irq_save(flags);
 	rq = cpu_rq(cpu);
 	update_rq_clock(rq);
 	now = rq->clock;
-	local_irq_restore(flags);
 
 	return now;
 }
@@ -957,13 +958,18 @@ static unsigned long long __cpu_clock(int cpu)
 unsigned long long cpu_clock(int cpu)
 {
 	unsigned long long prev_cpu_time, time, delta_time;
+	unsigned long flags;
 
+	local_irq_save(flags);
 	prev_cpu_time = per_cpu(prev_cpu_time, cpu);
 	time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
 	delta_time = time-prev_cpu_time;
 
-	if (unlikely(delta_time > time_sync_thresh))
+	if (unlikely(delta_time > time_sync_thresh)) {
 		time = __sync_cpu_clock(time, cpu);
+		per_cpu(prev_cpu_time, cpu) = time;
+	}
+	local_irq_restore(flags);
 
 	return time;
 }

commit 690229a0912ca2fef8b542fe4d8b73acfcdc6e24
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed Apr 23 09:31:35 2008 +0200

    sched: make clock sync tunable by architecture code
    
    make time_sync_thresh tunable by architecture code.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

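With the symbol exported through sched.h, architecture setup code can simply assign to it. A minimal sketch follows; the hook name is hypothetical and not something this patch adds:

	#include <linux/init.h>
	#include <linux/sched.h>

	/* hypothetical arch hook: raise the threshold on a platform whose
	 * per-CPU clocks drift slowly, so cpu_clock() has to take the
	 * global time_sync_lock less often (the generic default is 100000) */
	void __init example_arch_tune_sched_clock(void)
	{
		time_sync_thresh = 1000000;
	}
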
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 698b5a4d25a7..54c9ca26b7d8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -158,6 +158,8 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 }
 #endif
 
+extern unsigned long long time_sync_thresh;
+
 /*
  * Task state bitmask. NOTE! These bits are also
  * encoded in fs/proc/array.c: get_task_state().
diff --git a/kernel/sched.c b/kernel/sched.c
index 3ac3d7af04a1..8f433fedfcb3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -899,7 +899,7 @@ static inline u64 global_rt_runtime(void)
 	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
 }
 
-static const unsigned long long time_sync_thresh = 100000;
+unsigned long long time_sync_thresh = 100000;
 
 static DEFINE_PER_CPU(unsigned long long, time_offset);
 static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);

commit 733a0771df46af942b8355cd8bb15780106b4353
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Apr 28 14:05:18 2008 +0200

    sched: remove old sched doc
    
    Fabio Checconi noticed that Documentation/scheduler/sched-design.txt was
    a stale description of the old scheduler. Remove it.
    
    Reported-by: Fabio Checconi <fabio@gandalf.sssup.it>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/scheduler/sched-design.txt b/Documentation/scheduler/sched-design.txt
deleted file mode 100644
index 1605bf0cba8b..000000000000
--- a/Documentation/scheduler/sched-design.txt
+++ /dev/null
@@ -1,165 +0,0 @@
-		   Goals, Design and Implementation of the
-		      new ultra-scalable O(1) scheduler
-
-
-  This is an edited version of an email Ingo Molnar sent to
-  lkml on 4 Jan 2002.  It describes the goals, design, and
-  implementation of Ingo's new ultra-scalable O(1) scheduler.
-  Last Updated: 18 April 2002.
-
-
-Goal
-====
-
-The main goal of the new scheduler is to keep all the good things we know
-and love about the current Linux scheduler:
-
- - good interactive performance even during high load: if the user
-   types or clicks then the system must react instantly and must execute
-   the user tasks smoothly, even during considerable background load.
-
- - good scheduling/wakeup performance with 1-2 runnable processes.
-
- - fairness: no process should stay without any timeslice for any
-   unreasonable amount of time. No process should get an unjustly high
-   amount of CPU time.
-
- - priorities: less important tasks can be started with lower priority,
-   more important tasks with higher priority.
-
- - SMP efficiency: no CPU should stay idle if there is work to do.
-
- - SMP affinity: processes which run on one CPU should stay affine to
-   that CPU. Processes should not bounce between CPUs too frequently.
-
- - plus additional scheduler features: RT scheduling, CPU binding.
-
-and the goal is also to add a few new things:
-
- - fully O(1) scheduling. Are you tired of the recalculation loop
-   blowing the L1 cache away every now and then? Do you think the goodness
-   loop is taking a bit too long to finish if there are lots of runnable
-   processes? This new scheduler takes no prisoners: wakeup(), schedule(),
-   the timer interrupt are all O(1) algorithms. There is no recalculation
-   loop. There is no goodness loop either.
-
- - 'perfect' SMP scalability. With the new scheduler there is no 'big'
-   runqueue_lock anymore - it's all per-CPU runqueues and locks - two
-   tasks on two separate CPUs can wake up, schedule and context-switch
-   completely in parallel, without any interlocking. All
-   scheduling-relevant data is structured for maximum scalability.
-
- - better SMP affinity. The old scheduler has a particular weakness that
-   causes the random bouncing of tasks between CPUs if/when higher
-   priority/interactive tasks, this was observed and reported by many
-   people. The reason is that the timeslice recalculation loop first needs
-   every currently running task to consume its timeslice. But when this
-   happens on eg. an 8-way system, then this property starves an
-   increasing number of CPUs from executing any process. Once the last
-   task that has a timeslice left has finished using up that timeslice,
-   the recalculation loop is triggered and other CPUs can start executing
-   tasks again - after having idled around for a number of timer ticks.
-   The more CPUs, the worse this effect.
-
-   Furthermore, this same effect causes the bouncing effect as well:
-   whenever there is such a 'timeslice squeeze' of the global runqueue,
-   idle processors start executing tasks which are not affine to that CPU.
-   (because the affine tasks have finished off their timeslices already.)
-
-   The new scheduler solves this problem by distributing timeslices on a
-   per-CPU basis, without having any global synchronization or
-   recalculation.
-
- - batch scheduling. A significant proportion of computing-intensive tasks
-   benefit from batch-scheduling, where timeslices are long and processes
-   are roundrobin scheduled. The new scheduler does such batch-scheduling
-   of the lowest priority tasks - so nice +19 jobs will get
-   'batch-scheduled' automatically. With this scheduler, nice +19 jobs are
-   in essence SCHED_IDLE, from an interactiveness point of view.
-
- - handle extreme loads more smoothly, without breakdown and scheduling
-   storms.
-
- - O(1) RT scheduling. For those RT folks who are paranoid about the
-   O(nr_running) property of the goodness loop and the recalculation loop.
-
- - run fork()ed children before the parent. Andrea has pointed out the
-   advantages of this a few months ago, but patches for this feature
-   do not work with the old scheduler as well as they should,
-   because idle processes often steal the new child before the fork()ing
-   CPU gets to execute it.
-
-
-Design
-======
-
-The core of the new scheduler contains the following mechanisms:
-
- - *two* priority-ordered 'priority arrays' per CPU. There is an 'active'
-   array and an 'expired' array. The active array contains all tasks that
-   are affine to this CPU and have timeslices left. The expired array
-   contains all tasks which have used up their timeslices - but this array
-   is kept sorted as well. The active and expired array is not accessed
-   directly, it's accessed through two pointers in the per-CPU runqueue
-   structure. If all active tasks are used up then we 'switch' the two
-   pointers and from now on the ready-to-go (former-) expired array is the
-   active array - and the empty active array serves as the new collector
-   for expired tasks.
-
- - there is a 64-bit bitmap cache for array indices. Finding the highest
-   priority task is thus a matter of two x86 BSFL bit-search instructions.
-
-the split-array solution enables us to have an arbitrary number of active
-and expired tasks, and the recalculation of timeslices can be done
-immediately when the timeslice expires. Because the arrays are always
-access through the pointers in the runqueue, switching the two arrays can
-be done very quickly.
-
-this is a hybride priority-list approach coupled with roundrobin
-scheduling and the array-switch method of distributing timeslices.
-
- - there is a per-task 'load estimator'.
-
-one of the toughest things to get right is good interactive feel during
-heavy system load. While playing with various scheduler variants i found
-that the best interactive feel is achieved not by 'boosting' interactive
-tasks, but by 'punishing' tasks that want to use more CPU time than there
-is available. This method is also much easier to do in an O(1) fashion.
-
-to establish the actual 'load' the task contributes to the system, a
-complex-looking but pretty accurate method is used: there is a 4-entry
-'history' ringbuffer of the task's activities during the last 4 seconds.
-This ringbuffer is operated without much overhead. The entries tell the
-scheduler a pretty accurate load-history of the task: has it used up more
-CPU time or less during the past N seconds. [the size '4' and the interval
-of 4x 1 seconds was found by lots of experimentation - this part is
-flexible and can be changed in both directions.]
-
-the penalty a task gets for generating more load than the CPU can handle
-is a priority decrease - there is a maximum amount to this penalty
-relative to their static priority, so even fully CPU-bound tasks will
-observe each other's priorities, and will share the CPU accordingly.
-
-the SMP load-balancer can be extended/switched with additional parallel
-computing and cache hierarchy concepts: NUMA scheduling, multi-core CPUs
-can be supported easily by changing the load-balancer. Right now it's
-tuned for my SMP systems.
-
-i skipped the prev->mm == next->mm advantage - no workload i know of shows
-any sensitivity to this. It can be added back by sacrificing O(1)
-schedule() [the current and one-lower priority list can be searched for a
-that->mm == current->mm condition], but costs a fair number of cycles
-during a number of important workloads, so i wanted to avoid this as much
-as possible.
-
-- the SMP idle-task startup code was still racy and the new scheduler
-triggered this. So i streamlined the idle-setup code a bit. We do not call
-into schedule() before all processors have started up fully and all idle
-threads are in place.
-
-- the patch also cleans up a number of aspects of sched.c - moves code
-into other areas of the kernel where it's appropriate, and simplifies
-certain code paths and data constructs. As a result, the new scheduler's
-code is smaller than the old one.
-
-	Ingo

commit 7a1aa309f21ea2f6c31f364341e4027ecf4e79bc
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon May 5 01:06:54 2008 -0700

    irda: fix !PNP support for drivers/net/irda/smsc-ircc2.c
    
    x86.git testing found this build bug on v2.6.26-rc1:
    
      ERROR: "pnp_get_resource" [drivers/net/irda/smsc-ircc2.ko] undefined!
      make[1]: *** [__modpost] Error 1
      make: *** [modules] Error 2
    
    the driver did not anticipate the case of !CONFIG_PNP which is rare but
    still possible. Instead of restricting the driver to PNP-only in the
    Kconfig space, add the (trivial) dummy struct pnp_driver - this is what
    other drivers use in the !PNP case too.
    
    The driver itself can in theory be initialized on !PNP too in certain
    cases, via smsc_ircc_legacy_probe().
    
    Patch only minimally build tested, i don't have this hardware.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
    Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/irda/smsc-ircc2.c b/drivers/net/irda/smsc-ircc2.c
index 1f26da761e9f..cfe0194fef71 100644
--- a/drivers/net/irda/smsc-ircc2.c
+++ b/drivers/net/irda/smsc-ircc2.c
@@ -376,6 +376,7 @@ MODULE_DEVICE_TABLE(pnp, smsc_ircc_pnp_table);
 
 static int pnp_driver_registered;
 
+#ifdef CONFIG_PNP
 static int __init smsc_ircc_pnp_probe(struct pnp_dev *dev,
 				      const struct pnp_device_id *dev_id)
 {
@@ -402,7 +403,9 @@ static struct pnp_driver smsc_ircc_pnp_driver = {
 	.id_table	= smsc_ircc_pnp_table,
 	.probe		= smsc_ircc_pnp_probe,
 };
-
+#else /* CONFIG_PNP */
+static struct pnp_driver smsc_ircc_pnp_driver;
+#endif
 
 /*******************************************************************************
  *