Patches contributed by Eötvös Loránd University


commit 902955fc13259dcec1321d45251a477977fcba39
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Mar 3 13:53:58 2008 +0100

    x86: revert "x86: fix pmd_bad and pud_bad to support huge pages"
    
    revert commit cded932b75ab0a5f9181ee3da34a0a488d1a14fd,
    "x86: fix pmd_bad and pud_bad to support huge pages", as it causes
    a bootup hang, reported and bisected by Arjan van de Ven.
    
    Bisected-by: Arjan van de Ven <arjan@linux.intel.com>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/asm-x86/pgtable_32.h b/include/asm-x86/pgtable_32.h
index b478efa971e0..a842c7222b1e 100644
--- a/include/asm-x86/pgtable_32.h
+++ b/include/asm-x86/pgtable_32.h
@@ -91,9 +91,7 @@ extern unsigned long pg0[];
 /* To avoid harmful races, pmd_none(x) should check only the lower when PAE */
 #define pmd_none(x)	(!(unsigned long)pmd_val(x))
 #define pmd_present(x)	(pmd_val(x) & _PAGE_PRESENT)
-#define	pmd_bad(x)	((pmd_val(x) \
-			  & ~(PAGE_MASK | _PAGE_USER | _PAGE_PSE | _PAGE_NX)) \
-			 != _KERNPG_TABLE)
+#define	pmd_bad(x)	((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
 
 
 #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
diff --git a/include/asm-x86/pgtable_64.h b/include/asm-x86/pgtable_64.h
index 0a9258333cbd..0a0b77bc736a 100644
--- a/include/asm-x86/pgtable_64.h
+++ b/include/asm-x86/pgtable_64.h
@@ -153,14 +153,12 @@ static inline unsigned long pgd_bad(pgd_t pgd)
 
 static inline unsigned long pud_bad(pud_t pud)
 {
-	return pud_val(pud) &
-		~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER | _PAGE_PSE | _PAGE_NX);
+	return pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
 }
 
 static inline unsigned long pmd_bad(pmd_t pmd)
 {
-	return pmd_val(pmd) &
-		~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER | _PAGE_PSE | _PAGE_NX);
+	return pmd_val(pmd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
 }
 
 #define pte_none(x)	(!pte_val(x))

commit b4ef95de00be4c2c30feccf607a45093c8c118b7
Author: Ingo Molnar <mingo@elte.hu>
Date:   Tue Feb 26 09:40:27 2008 +0100

    x86: disable BTS ptrace extensions for now
    
    revert the BTS ptrace extension for now.
    
    based on general objections from Roland McGrath:
    
        http://lkml.org/lkml/2008/2/21/323
    
    we'll let the BTS functionality cook some more and re-enable
    it in v2.6.26. We'll leave the dead code in place to help its
    further development.
    
    (X86_BTS is not defined at the moment)
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
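
The disabling here is purely a compile-time gate: every BTS call site is wrapped in #ifdef X86_BTS, and since X86_BTS is defined nowhere, the preprocessor drops the code while the source stays in the tree for further development. A minimal user-space sketch of the same pattern (FEATURE_FOO and the helpers below are hypothetical, not kernel names):

    /* Sketch of compile-time feature gating, as used for X86_BTS above.
     * FEATURE_FOO is never #defined, so the guarded block compiles to
     * nothing but remains in the source. */
    #include <stdio.h>

    void feature_foo_hook(const char *when)
    {
            printf("feature hook: %s\n", when);
    }

    void context_switch_stub(void)
    {
    #ifdef FEATURE_FOO
            /* dead until someone builds with -DFEATURE_FOO */
            feature_foo_hook("task departs");
            feature_foo_hook("task arrives");
    #endif
    }

    int main(void)
    {
            context_switch_stub();
            return 0;
    }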

diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index a7d50a547dc2..be3c7a299f02 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -603,11 +603,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 	}
 #endif
 
+#ifdef X86_BTS
 	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
 		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
 
 	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
 		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
+#endif
 
 
 	if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 43f287744f9f..3baf9b9f4c87 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -604,11 +604,13 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
 		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
 	}
 
+#ifdef X86_BTS
 	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
 		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
 
 	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
 		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
+#endif
 }
 
 /*
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index d862e396b099..f41fdc98efb1 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -544,6 +544,8 @@ static int ptrace_set_debugreg(struct task_struct *child,
 	return 0;
 }
 
+#ifdef X86_BTS
+
 static int ptrace_bts_get_size(struct task_struct *child)
 {
 	if (!child->thread.ds_area_msr)
@@ -826,6 +828,7 @@ void ptrace_bts_take_timestamp(struct task_struct *tsk,
 
 	ptrace_bts_write_record(tsk, &rec);
 }
+#endif /* X86_BTS */
 
 /*
  * Called by kernel/ptrace.c when detaching..
@@ -839,7 +842,9 @@ void ptrace_disable(struct task_struct *child)
 	clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
 #endif
 	if (child->thread.ds_area_msr) {
+#ifdef X86_BTS
 		ptrace_bts_realloc(child, 0, 0);
+#endif
 		child->thread.debugctlmsr &= ~ds_debugctl_mask();
 		if (!child->thread.debugctlmsr)
 			clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
@@ -961,6 +966,10 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		break;
 #endif
 
+	/*
+	 * These bits need more cooking - not enabled yet:
+	 */
+#ifdef X86_BTS
 	case PTRACE_BTS_CONFIG:
 		ret = ptrace_bts_config
 			(child, data, (struct ptrace_bts_config __user *)addr);
@@ -988,6 +997,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		ret = ptrace_bts_drain
 			(child, data, (struct bts_struct __user *) addr);
 		break;
+#endif
 
 	default:
 		ret = ptrace_request(child, request, addr, data);
@@ -1226,12 +1236,14 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
 	case PTRACE_SETOPTIONS:
 	case PTRACE_SET_THREAD_AREA:
 	case PTRACE_GET_THREAD_AREA:
+#ifdef X86_BTS
 	case PTRACE_BTS_CONFIG:
 	case PTRACE_BTS_STATUS:
 	case PTRACE_BTS_SIZE:
 	case PTRACE_BTS_GET:
 	case PTRACE_BTS_CLEAR:
 	case PTRACE_BTS_DRAIN:
+#endif
 		return sys_ptrace(request, pid, addr, data);
 
 	default:

commit 757265b8c57bb8fd91785d3d1a87fb483c86c9c2
Author: Ingo Molnar <mingo@elte.hu>
Date:   Thu Feb 28 20:19:06 2008 +0100

    x86: delay the export removal of init_mm
    
    delay the removal of this symbol export by one more kernel release,
    giving external modules such as VirtualBox a chance to stop using it.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index ba899ff2a8f9..c1d1fd0c299b 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -316,3 +316,15 @@ Why:	Largely unmaintained and almost entirely unused.  File system
 	is largely pointless as without a lot of work only the most
 	trivial of Solaris binaries can work with the emulation code.
 Who:	David S. Miller <davem@davemloft.net>
+
+---------------------------
+
+What:	init_mm export
+When:	2.6.26
+Why:	Not used in-tree. The current out-of-tree users used it to
+	work around problems in the CPA code which should be resolved
+	by now. One usecase was described to provide verification code
+	of the CPA operation. That's a good idea in general, but such
+	code / infrastructure should be in the kernel and not in some
+	out-of-tree driver.
+Who:	Thomas Gleixner <tglx@linutronix.de>
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index 5b3ce7934363..3d01e47777db 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -15,6 +15,7 @@ static struct files_struct init_files = INIT_FILES;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
+EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */
 
 /*
  * Initial thread structure.

commit b16bf712f491808a8c926dd481c696fe7d73ee5a
Author: Ingo Molnar <mingo@elte.hu>
Date:   Thu Feb 28 14:02:08 2008 +0100

    x86: fix leak in ioremap_page_range() failure
    
    Jan Beulich noticed during code review that if a driver's ioremap()
    fails (say due to -ENOMEM), we leak the struct vm_struct.
    
    Free it properly.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
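
What the one-line fix relies on: remove_vm_area() only unlinks the region and hands the descriptor back to the caller, so the old error path dropped it on the floor, while free_vm_area() also frees it. A self-contained user-space sketch of the same shape (unlink_area, free_area and map_something are illustrative stand-ins, not the mm/vmalloc.c code):

    #include <stdlib.h>
    #include <stdio.h>

    struct area {
            void *addr;
    };

    /* like remove_vm_area(): unlink only, the descriptor stays allocated */
    static void unlink_area(struct area *a)
    {
            (void)a;        /* in the kernel: take it off the vmalloc list */
    }

    /* like free_vm_area(): unlink *and* release the descriptor */
    static void free_area(struct area *a)
    {
            unlink_area(a);
            free(a);        /* the step the old error path was missing */
    }

    static void *map_something(int fail)
    {
            struct area *a = calloc(1, sizeof(*a));

            if (!a)
                    return NULL;
            if (fail) {
                    free_area(a);   /* old code only unlinked: leak */
                    return NULL;
            }
            return a;
    }

    int main(void)
    {
            if (!map_something(1))
                    printf("error path taken, descriptor freed\n");
            return 0;
    }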

diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 882328efc3db..ac3c959e271d 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -162,7 +162,7 @@ static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
 	area->phys_addr = phys_addr;
 	vaddr = (unsigned long) area->addr;
 	if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) {
-		remove_vm_area((void *)(vaddr & PAGE_MASK));
+		free_vm_area(area);
 		return NULL;
 	}
 

commit 5d119b2c9a490e2d647eae134211b32a18a04c7d
Author: Ingo Molnar <mingo@elte.hu>
Date:   Tue Feb 26 12:55:57 2008 +0100

    x86: fix execve with -fstack-protect
    
    pointed out by pageexec@freemail.hu:
    
    > what happens here is that gcc treats the argument area as owned by the
    > callee, not the caller and is allowed to do certain tricks. for ssp it
    > will make a copy of the struct passed by value into the local variable
    > area and pass *its* address down, and it won't copy it back into the
    > original instance stored in the argument area.
    >
    > so once sys_execve returns, the pt_regs passed by value hasn't at all
    > changed and its default content will cause a nice double fault (FWIW,
    > this part took me the longest to debug, being down with cold didn't
    > help it either ;).
    
    To fix this we pass in pt_regs by pointer.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
    Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
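
The quoted analysis boils down to an ABI detail: the old code passed pt_regs by value and relied on the callee's writes landing in the caller-visible argument slot, which -fstack-protector breaks by working on a local copy. In portable C a by-value struct is a callee-private copy anyway, so only a pointer guarantees that a write reaches the caller, which is exactly what the patch switches to. A small user-space illustration (struct regs is a stand-in for pt_regs, not the kernel type):

    /* By-value vs by-pointer struct arguments. The by-value write never
     * reaches the caller's instance; the by-pointer write always does. */
    #include <stdio.h>

    struct regs {
            unsigned long ax;
    };

    /* callee writes to its own copy -- the caller's instance is untouched */
    void set_ax_byval(struct regs r)
    {
            r.ax = 0xdeadbeef;
    }

    /* callee writes through the pointer -- the change is visible to the caller */
    void set_ax_byptr(struct regs *r)
    {
            r->ax = 0xdeadbeef;
    }

    int main(void)
    {
            struct regs r = { .ax = 0 };

            set_ax_byval(r);
            printf("by value:   ax=%#lx\n", r.ax);   /* still 0 */

            set_ax_byptr(&r);
            printf("by pointer: ax=%#lx\n", r.ax);   /* 0xdeadbeef */
            return 0;
    }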

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 2ad9a1bc6a73..c20c9e7e08dd 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -453,6 +453,7 @@ ENTRY(stub_execve)
 	CFI_REGISTER rip, r11
 	SAVE_REST
 	FIXUP_TOP_OF_STACK %r11
+	movq %rsp, %rcx
 	call sys_execve
 	RESTORE_TOP_OF_STACK %r11
 	movq %rax,RAX(%rsp)
@@ -1036,15 +1037,16 @@ ENDPROC(child_rip)
  *	rdi: name, rsi: argv, rdx: envp
  *
  * We want to fallback into:
- *	extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
+ *	extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs)
  *
  * do_sys_execve asm fallback arguments:
- *	rdi: name, rsi: argv, rdx: envp, fake frame on the stack
+ *	rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
  */
 ENTRY(kernel_execve)
 	CFI_STARTPROC
 	FAKE_STACK_FRAME $0
 	SAVE_ALL	
+	movq %rsp,%rcx
 	call sys_execve
 	movq %rax, RAX(%rsp)	
 	RESTORE_REST
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b0cc8f0136d8..43f287744f9f 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -730,16 +730,16 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
  */
 asmlinkage
 long sys_execve(char __user *name, char __user * __user *argv,
-		char __user * __user *envp, struct pt_regs regs)
+		char __user * __user *envp, struct pt_regs *regs)
 {
 	long error;
 	char * filename;
 
 	filename = getname(name);
 	error = PTR_ERR(filename);
-	if (IS_ERR(filename)) 
+	if (IS_ERR(filename))
 		return error;
-	error = do_execve(filename, argv, envp, &regs); 
+	error = do_execve(filename, argv, envp, regs);
 	putname(filename);
 	return error;
 }

commit d4afe414189b098d56bcd24280c018aa2ac9a990
Author: Ingo Molnar <mingo@elte.hu>
Date:   Thu Feb 21 13:39:30 2008 +0100

    x86: rename KERNEL_TEXT_SIZE => KERNEL_IMAGE_SIZE
    
    The KERNEL_TEXT_SIZE constant was misnamed: we map not only the kernel
    text but also the data, bss and init sections.
    
    That name led me down the wrong path with the KERNEL_TEXT_SIZE regression:
    I knew how much _text_ my images have and I knew about the 40 MB "text"
    limit, so I wrongly assumed I was on the safe side with my 29 MB of text,
    while the total image size was slightly above 40 MB.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b037b15be7f8..a007454133a3 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -393,7 +393,7 @@ NEXT_PAGE(level2_kernel_pgt)
 	 *  too.)
 	 */
 	PMDS(0, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL,
-		KERNEL_TEXT_SIZE/PMD_SIZE)
+		KERNEL_IMAGE_SIZE/PMD_SIZE)
 
 NEXT_PAGE(level2_spare_pgt)
 	.fill   512, 8, 0
diff --git a/include/asm-x86/page_64.h b/include/asm-x86/page_64.h
index 3e2e3ca63048..143546073b95 100644
--- a/include/asm-x86/page_64.h
+++ b/include/asm-x86/page_64.h
@@ -51,8 +51,8 @@
  * Kernel image size is limited to 128 MB (see level2_kernel_pgt in
  * arch/x86/kernel/head_64.S), and it is mapped here:
  */
-#define KERNEL_TEXT_SIZE	(128*1024*1024)
-#define KERNEL_TEXT_START	_AC(0xffffffff80000000, UL)
+#define KERNEL_IMAGE_SIZE	(128*1024*1024)
+#define KERNEL_IMAGE_START	_AC(0xffffffff80000000, UL)
 
 #ifndef __ASSEMBLY__
 void clear_page(void *page);

commit 88f3aec7afd9ae3e6f6d221801996b69aad1e3a4
Author: Ingo Molnar <mingo@elte.hu>
Date:   Thu Feb 21 11:04:11 2008 +0100

    x86: fix spontaneous reboot with allyesconfig bzImage
    
    recently the 64-bit allyesconfig bzImage kernel started spontaneously
    rebooting during early bootup.
    
    after a few fun hours spent with early init debugging, it turns out
    that we've got this rather annoying limit on the size of the kernel
    image:
    
          #define KERNEL_TEXT_SIZE  (40*1024*1024)
    
    a limit which my vmlinux just happened to exceed:
    
           text           data       bss        dec       hex   filename
       29703744        4222751   8646224   42572719   2899baf   vmlinux
    
    40 MB is 41943040 bytes, so my 42572719-byte vmlinux was just 1.5% above this limit :-/
    
    So it happily crashed right in head_64.S, which - as we all know - is
    the most debuggable code in the whole architecture ;-)
    
    So increase the limit to allow kernel images of up to 128 MB to be mapped
    (should anyone be that crazy or lazy).
    
    We have a full 4K of pagetable (level2_kernel_pgt) allocated for these
    mappings already, so there's no RAM overhead and the limit was rather
    pointless and arbitrary.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
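
For reference, the arithmetic behind the message: 40*1024*1024 = 41943040 bytes, the quoted vmlinux is 42572719 bytes (about 1.5% over), and the new 128 MB limit needs 128 MB / 2 MB-per-PMD = 64 of the 512 entries that the already-allocated 4 KB level2_kernel_pgt page provides. A tiny stand-alone check of those numbers (plain C, constants copied from the message, no kernel headers):

    #include <stdio.h>

    #define OLD_LIMIT   (40UL  * 1024 * 1024)     /* 41943040 bytes  */
    #define NEW_LIMIT   (128UL * 1024 * 1024)     /* 134217728 bytes */
    #define VMLINUX_DEC 42572719UL                /* "dec" column of size(1) above */
    #define PMD_SIZE    (2UL * 1024 * 1024)       /* one large-page PMD entry */

    int main(void)
    {
            printf("old limit: %lu bytes, vmlinux: %lu bytes (%.1f%% over)\n",
                   OLD_LIMIT, VMLINUX_DEC,
                   100.0 * ((double)VMLINUX_DEC / OLD_LIMIT - 1.0));
            printf("new limit uses %lu of the 512 PMD entries in the 4K "
                   "level2_kernel_pgt page\n", NEW_LIMIT / PMD_SIZE);
            return 0;
    }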

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index eb415043a929..b037b15be7f8 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -379,18 +379,24 @@ NEXT_PAGE(level2_ident_pgt)
 	/* Since I easily can, map the first 1G.
 	 * Don't set NX because code runs from these pages.
 	 */
-	PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD)
+	PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD)
 
 NEXT_PAGE(level2_kernel_pgt)
-	/* 40MB kernel mapping. The kernel code cannot be bigger than that.
-	   When you change this change KERNEL_TEXT_SIZE in page.h too. */
-	/* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */
-	PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, KERNEL_TEXT_SIZE/PMD_SIZE)
-	/* Module mapping starts here */
-	.fill	(PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0
+	/*
+	 * 128 MB kernel mapping. We spend a full page on this pagetable
+	 * anyway.
+	 *
+	 * The kernel code+data+bss must not be bigger than that.
+	 *
+	 * (NOTE: at +128MB starts the module area, see MODULES_VADDR.
+	 *  If you want to increase this then increase MODULES_VADDR
+	 *  too.)
+	 */
+	PMDS(0, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL,
+		KERNEL_TEXT_SIZE/PMD_SIZE)
 
 NEXT_PAGE(level2_spare_pgt)
-	.fill   512,8,0
+	.fill   512, 8, 0
 
 #undef PMDS
 #undef NEXT_PAGE
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 6dbb0035c57a..a02a14f0f324 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -172,8 +172,9 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
 }
 
 /*
- * The head.S code sets up the kernel high mapping from:
- * __START_KERNEL_map to __START_KERNEL_map + KERNEL_TEXT_SIZE
+ * The head.S code sets up the kernel high mapping:
+ *
+ *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
  *
  * phys_addr holds the negative offset to the kernel, which is added
  * to the compile time generated pmds. This results in invalid pmds up
diff --git a/include/asm-x86/page_64.h b/include/asm-x86/page_64.h
index f7393bc516ef..3e2e3ca63048 100644
--- a/include/asm-x86/page_64.h
+++ b/include/asm-x86/page_64.h
@@ -47,8 +47,12 @@
 #define __PHYSICAL_MASK_SHIFT	46
 #define __VIRTUAL_MASK_SHIFT	48
 
-#define KERNEL_TEXT_SIZE  (40*1024*1024)
-#define KERNEL_TEXT_START _AC(0xffffffff80000000, UL)
+/*
+ * Kernel image size is limited to 128 MB (see level2_kernel_pgt in
+ * arch/x86/kernel/head_64.S), and it is mapped here:
+ */
+#define KERNEL_TEXT_SIZE	(128*1024*1024)
+#define KERNEL_TEXT_START	_AC(0xffffffff80000000, UL)
 
 #ifndef __ASSEMBLY__
 void clear_page(void *page);

commit 92cb54a37a42a41cfb2ef7f1478bfa4395198258
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed Feb 13 14:37:52 2008 +0100

    x86: make DEBUG_PAGEALLOC and CPA more robust
    
    Use PF_MEMALLOC to prevent recursive calls in the DEBUG_PAGEALLOC
    case. This makes the code simpler and more robust against allocation
    failures.
    
    This fixes the following fallback to non-mmconfig:
    
       http://lkml.org/lkml/2008/2/20/551
       http://bugzilla.kernel.org/show_bug.cgi?id=10083
    
    Also, for DEBUG_PAGEALLOC=n reduce the pool size to one page.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
    Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
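
The key mechanism in this patch is the PF_MEMALLOC test at the top of cpa_fill_pool(): with DEBUG_PAGEALLOC, the page allocator can call back into the CPA code, so the function refuses to re-enter itself and marks the task for the duration of its own allocations. A self-contained sketch of that guard pattern (TASK_IN_REFILL and the helpers are hypothetical, standing in for PF_MEMALLOC and the real pool code):

    /* A per-task flag is tested on entry and set for the duration of the
     * work, so a re-entrant call becomes a no-op instead of recursing. */
    #include <stdio.h>

    #define TASK_IN_REFILL 0x1

    static unsigned int current_flags;      /* stand-in for current->flags */
    static int pool_pages, pool_size = 4;

    static void allocate_one_page(void);

    static void fill_pool(void)
    {
            if (current_flags & TASK_IN_REFILL)
                    return;                         /* re-entered: bail out */
            current_flags |= TASK_IN_REFILL;

            while (pool_pages < pool_size)
                    allocate_one_page();

            current_flags &= ~TASK_IN_REFILL;
    }

    static void allocate_one_page(void)
    {
            pool_pages++;
            fill_pool();    /* pretend the allocator re-entered us; guarded */
    }

    int main(void)
    {
            fill_pool();
            printf("pool filled with %d pages, no runaway recursion\n", pool_pages);
            return 0;
    }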

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 464d8fc21ce6..14e48b5a94ba 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -44,6 +44,12 @@ static inline unsigned long highmap_end_pfn(void)
 
 #endif
 
+#ifdef CONFIG_DEBUG_PAGEALLOC
+# define debug_pagealloc 1
+#else
+# define debug_pagealloc 0
+#endif
+
 static inline int
 within(unsigned long addr, unsigned long start, unsigned long end)
 {
@@ -355,45 +361,48 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
 
 static LIST_HEAD(page_pool);
 static unsigned long pool_size, pool_pages, pool_low;
-static unsigned long pool_used, pool_failed, pool_refill;
+static unsigned long pool_used, pool_failed;
 
-static void cpa_fill_pool(void)
+static void cpa_fill_pool(struct page **ret)
 {
-	struct page *p;
 	gfp_t gfp = GFP_KERNEL;
+	unsigned long flags;
+	struct page *p;
 
-	/* Do not allocate from interrupt context */
-	if (in_irq() || irqs_disabled())
-		return;
 	/*
-	 * Check unlocked. I does not matter when we have one more
-	 * page in the pool. The bit lock avoids recursive pool
-	 * allocations:
+	 * Avoid recursion (on debug-pagealloc) and also signal
+	 * our priority to get to these pagetables:
 	 */
-	if (pool_pages >= pool_size || test_and_set_bit_lock(0, &pool_refill))
+	if (current->flags & PF_MEMALLOC)
 		return;
+	current->flags |= PF_MEMALLOC;
 
-#ifdef CONFIG_DEBUG_PAGEALLOC
 	/*
-	 * We could do:
-	 * gfp = in_atomic() ? GFP_ATOMIC : GFP_KERNEL;
-	 * but this fails on !PREEMPT kernels
+	 * Allocate atomically from atomic contexts:
 	 */
-	gfp =  GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
-#endif
+	if (in_atomic() || irqs_disabled() || debug_pagealloc)
+		gfp =  GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
 
-	while (pool_pages < pool_size) {
+	while (pool_pages < pool_size || (ret && !*ret)) {
 		p = alloc_pages(gfp, 0);
 		if (!p) {
 			pool_failed++;
 			break;
 		}
-		spin_lock_irq(&pgd_lock);
+		/*
+		 * If the call site needs a page right now, provide it:
+		 */
+		if (ret && !*ret) {
+			*ret = p;
+			continue;
+		}
+		spin_lock_irqsave(&pgd_lock, flags);
 		list_add(&p->lru, &page_pool);
 		pool_pages++;
-		spin_unlock_irq(&pgd_lock);
+		spin_unlock_irqrestore(&pgd_lock, flags);
 	}
-	clear_bit_unlock(0, &pool_refill);
+
+	current->flags &= ~PF_MEMALLOC;
 }
 
 #define SHIFT_MB		(20 - PAGE_SHIFT)
@@ -414,11 +423,15 @@ void __init cpa_init(void)
 	 * GiB. Shift MiB to Gib and multiply the result by
 	 * POOL_PAGES_PER_GB:
 	 */
-	gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
-	pool_size = POOL_PAGES_PER_GB * gb;
+	if (debug_pagealloc) {
+		gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
+		pool_size = POOL_PAGES_PER_GB * gb;
+	} else {
+		pool_size = 1;
+	}
 	pool_low = pool_size;
 
-	cpa_fill_pool();
+	cpa_fill_pool(NULL);
 	printk(KERN_DEBUG
 	       "CPA: page pool initialized %lu of %lu pages preallocated\n",
 	       pool_pages, pool_size);
@@ -440,16 +453,20 @@ static int split_large_page(pte_t *kpte, unsigned long address)
 	spin_lock_irqsave(&pgd_lock, flags);
 	if (list_empty(&page_pool)) {
 		spin_unlock_irqrestore(&pgd_lock, flags);
-		return -ENOMEM;
+		base = NULL;
+		cpa_fill_pool(&base);
+		if (!base)
+			return -ENOMEM;
+		spin_lock_irqsave(&pgd_lock, flags);
+	} else {
+		base = list_first_entry(&page_pool, struct page, lru);
+		list_del(&base->lru);
+		pool_pages--;
+
+		if (pool_pages < pool_low)
+			pool_low = pool_pages;
 	}
 
-	base = list_first_entry(&page_pool, struct page, lru);
-	list_del(&base->lru);
-	pool_pages--;
-
-	if (pool_pages < pool_low)
-		pool_low = pool_pages;
-
 	/*
 	 * Check for races, another CPU might have split this page
 	 * up for us already:
@@ -734,7 +751,8 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
 		cpa_flush_all(cache);
 
 out:
-	cpa_fill_pool();
+	cpa_fill_pool(NULL);
+
 	return ret;
 }
 
@@ -897,7 +915,7 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
 	 * Try to refill the page pool here. We can do this only after
 	 * the tlb flush.
 	 */
-	cpa_fill_pool();
+	cpa_fill_pool(NULL);
 }
 
 #ifdef CONFIG_HIBERNATION

commit 7eee3e677d6e2e9007afcd7d79b0715525aa552e
Author: Ingo Molnar <mingo@elte.hu>
Date:   Fri Feb 22 10:32:21 2008 +0100

    sched: clean up __pick_last_entity() a bit
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 7abad50d935f..c8e6492c5925 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -202,14 +202,12 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
 
 static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 {
-	struct rb_node *last;
-	struct sched_entity *se;
+	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
 
-	last = rb_last(&cfs_rq->tasks_timeline);
 	if (!last)
 		return NULL;
-	se = rb_entry(last, struct sched_entity, run_node);
-	return se;
+
+	return rb_entry(last, struct sched_entity, run_node);
 }
 
 /**************************************************************

commit 6892b75e60557a48c01d57ba320419a9e2ce9846
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed Feb 13 14:02:36 2008 +0100

    sched: make early bootup sched_clock() use safer
    
    do not call sched_clock() too early. Not only might rq->idle
    not be set up - but pure per-cpu data might not be accessible
    either.
    
    this solves an ia64 early bootup hang with CONFIG_PRINTK_TIME=y.
    
    Tested-by: Tony Luck <tony.luck@gmail.com>
    Acked-by: Tony Luck <tony.luck@gmail.com>
    Acked-by: David S. Miller <davem@davemloft.net>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
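
The fix is a plain "not initialized yet" guard: a flag that stays zero until sched_init() completes, checked with an early return in cpu_clock() so very early callers get 0 instead of touching per-cpu state that does not exist yet. A user-space sketch of the same shape (subsystem_running and the clock helpers are hypothetical names, not the scheduler's):

    /* Report a dummy value until initialization has completed. */
    #include <stdio.h>

    static int subsystem_running;   /* 0 until init finishes */

    static unsigned long long real_clock(void)
    {
            /* in the kernel this reads per-cpu state that may not exist yet */
            return 123456789ULL;
    }

    unsigned long long safe_clock(void)
    {
            if (!subsystem_running)
                    return 0;       /* too early: don't touch per-cpu data */
            return real_clock();
    }

    static void subsystem_init(void)
    {
            /* ... set up per-cpu state ... */
            subsystem_running = 1;
    }

    int main(void)
    {
            printf("before init: %llu\n", safe_clock());  /* 0 */
            subsystem_init();
            printf("after init:  %llu\n", safe_clock());  /* real value */
            return 0;
    }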

diff --git a/kernel/sched.c b/kernel/sched.c
index b387a8de26a5..7286ccb01082 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -668,6 +668,8 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
  */
 unsigned int sysctl_sched_rt_period = 1000000;
 
+static __read_mostly int scheduler_running;
+
 /*
  * part of the period that we allow rt tasks to run in us.
  * default: 0.95s
@@ -689,14 +691,16 @@ unsigned long long cpu_clock(int cpu)
 	unsigned long flags;
 	struct rq *rq;
 
-	local_irq_save(flags);
-	rq = cpu_rq(cpu);
 	/*
 	 * Only call sched_clock() if the scheduler has already been
 	 * initialized (some code might call cpu_clock() very early):
 	 */
-	if (rq->idle)
-		update_rq_clock(rq);
+	if (unlikely(!scheduler_running))
+		return 0;
+
+	local_irq_save(flags);
+	rq = cpu_rq(cpu);
+	update_rq_clock(rq);
 	now = rq->clock;
 	local_irq_restore(flags);
 
@@ -7284,6 +7288,8 @@ void __init sched_init(void)
 	 * During early bootup we pretend to be a normal task:
 	 */
 	current->sched_class = &fair_sched_class;
+
+	scheduler_running = 1;
 }
 
 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP