Patches contributed by Eötvös Loránd University


commit 7c178a26d3e753d2a4346d3e4b8aa549d387f698
Author: Ingo Molnar <mingo@elte.hu>
Date:   Fri Feb 20 23:27:29 2009 +0100

    x86, mm: fault.c, remove #ifdef from fault_in_kernel_space()
    
    Impact: cleanup
    
    Remove an #ifdef from fault_in_kernel_space() by making
    use of the new TASK_SIZE_MAX symbol, which is now available
    on 32-bit too.
    
    Cc: Linus Torvalds <torvalds@linux-foundation.org>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 6fa9f175cba3..f195691ec26e 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -960,11 +960,7 @@ access_error(unsigned long error_code, int write, struct vm_area_struct *vma)
 
 static int fault_in_kernel_space(unsigned long address)
 {
-#ifdef CONFIG_X86_32
-	return address >= TASK_SIZE;
-#else
 	return address >= TASK_SIZE_MAX;
-#endif
 }
 
 /*

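For context, fault_in_kernel_space() is the predicate do_page_fault() uses to
decide whether a faulting address can be a kernel/vmalloc fault at all. A
minimal user-space sketch of the now-unified predicate (stand-in constants,
not the real kernel headers; build with or without -DCONFIG_X86_32):

    #include <stdio.h>

    #ifdef CONFIG_X86_32
    # define TASK_SIZE_MAX	0xC0000000UL		/* PAGE_OFFSET stand-in */
    #else
    # define TASK_SIZE_MAX	((1UL << 47) - 4096)	/* 47 bits minus one guard page */
    #endif

    static int fault_in_kernel_space(unsigned long address)
    {
    	return address >= TASK_SIZE_MAX;	/* no #ifdef needed any more */
    }

    int main(void)
    {
    	printf("user:   %d\n", fault_in_kernel_space(0x08048000UL));
    	printf("kernel: %d\n", fault_in_kernel_space(~0UL));
    	return 0;
    }
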
commit d951734654f76a370a89b4e531af9b765bd13541
Author: Ingo Molnar <mingo@elte.hu>
Date:   Fri Feb 20 23:32:28 2009 +0100

    x86, mm: rename TASK_SIZE64 => TASK_SIZE_MAX
    
    Impact: cleanup
    
    Rename TASK_SIZE64 to TASK_SIZE_MAX, and provide the
    define on 32-bit too (mapped to TASK_SIZE).
    
    This allows 32-bit code to make use of the (former)
    TASK_SIZE64 symbol as well, in a clean way.
    
    Cc: Linus Torvalds <torvalds@linux-foundation.org>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 72914d0315e9..c7a98f738210 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -861,6 +861,7 @@ static inline void spin_lock_prefetch(const void *x)
  * User space process size: 3GB (default).
  */
 #define TASK_SIZE		PAGE_OFFSET
+#define TASK_SIZE_MAX		TASK_SIZE
 #define STACK_TOP		TASK_SIZE
 #define STACK_TOP_MAX		STACK_TOP
 
@@ -920,7 +921,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
 /*
  * User space process size. 47bits minus one guard page.
  */
-#define TASK_SIZE64	((1UL << 47) - PAGE_SIZE)
+#define TASK_SIZE_MAX	((1UL << 47) - PAGE_SIZE)
 
 /* This decides where the kernel will search for a free chunk of vm
  * space during mmap's.
@@ -929,12 +930,12 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
 					0xc0000000 : 0xFFFFe000)
 
 #define TASK_SIZE		(test_thread_flag(TIF_IA32) ? \
-					IA32_PAGE_OFFSET : TASK_SIZE64)
+					IA32_PAGE_OFFSET : TASK_SIZE_MAX)
 #define TASK_SIZE_OF(child)	((test_tsk_thread_flag(child, TIF_IA32)) ? \
-					IA32_PAGE_OFFSET : TASK_SIZE64)
+					IA32_PAGE_OFFSET : TASK_SIZE_MAX)
 
 #define STACK_TOP		TASK_SIZE
-#define STACK_TOP_MAX		TASK_SIZE64
+#define STACK_TOP_MAX		TASK_SIZE_MAX
 
 #define INIT_THREAD  { \
 	.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index d2f7cd5b2c83..fb2159a5c817 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -268,7 +268,7 @@ static unsigned long debugreg_addr_limit(struct task_struct *task)
 	if (test_tsk_thread_flag(task, TIF_IA32))
 		return IA32_PAGE_OFFSET - 3;
 #endif
-	return TASK_SIZE64 - 7;
+	return TASK_SIZE_MAX - 7;
 }
 
 #endif	/* CONFIG_X86_32 */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 9c2dc5d79531..6fa9f175cba3 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -963,7 +963,7 @@ static int fault_in_kernel_space(unsigned long address)
 #ifdef CONFIG_X86_32
 	return address >= TASK_SIZE;
 #else
-	return address >= TASK_SIZE64;
+	return address >= TASK_SIZE_MAX;
 #endif
 }
 
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 9c98cc6ba978..7133cdf9098b 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -85,8 +85,8 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
 	unsigned long addr, end;
 	unsigned offset;
 	end = (start + PMD_SIZE - 1) & PMD_MASK;
-	if (end >= TASK_SIZE64)
-		end = TASK_SIZE64;
+	if (end >= TASK_SIZE_MAX)
+		end = TASK_SIZE_MAX;
 	end -= len;
 	/* This loses some more bits than a modulo, but is cheaper */
 	offset = get_random_int() & (PTRS_PER_PTE - 1);

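One subtlety worth spelling out: on 64-bit, TASK_SIZE is a per-task expression
(an IA32 compat task sees IA32_PAGE_OFFSET), while TASK_SIZE_MAX is a
compile-time constant. A predicate like fault_in_kernel_space() needs the
constant upper bound, because an address above a compat task's TASK_SIZE but
below TASK_SIZE_MAX is still a user address, not a kernel one. Schematically,
from the 64-bit definitions in the diff above:

    /* constant: the architectural upper bound of user space */
    #define TASK_SIZE_MAX	((1UL << 47) - PAGE_SIZE)

    /* per-task: depends on the TIF_IA32 flag of the running thread */
    #define TASK_SIZE	(test_thread_flag(TIF_IA32) ? \
    				IA32_PAGE_OFFSET : TASK_SIZE_MAX)
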
commit c3731c68668325abddee8665018c74c7156a57be
Author: Ingo Molnar <mingo@elte.hu>
Date:   Fri Feb 20 23:22:34 2009 +0100

    x86, mm: fault.c, remove #ifdef from do_page_fault()
    
    Impact: cleanup
    
    do_page_fault() has this ugly #ifdef in its prototype:
    
      #ifdef CONFIG_X86_64
      asmlinkage
      #endif
      void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
    
    Replace it with 'dotraplinkage' which maps to exactly the above
    construct: nothing on 32-bit and asmlinkage on 64-bit.
    
    Cc: Linus Torvalds <torvalds@linux-foundation.org>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8fe2dd254df0..9c2dc5d79531 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -972,10 +972,8 @@ static int fault_in_kernel_space(unsigned long address)
  * and the problem, and then passes it off to one of the appropriate
  * routines.
  */
-#ifdef CONFIG_X86_64
-asmlinkage
-#endif
-void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
+dotraplinkage void __kprobes
+do_page_fault(struct pt_regs *regs, unsigned long error_code)
 {
 	struct vm_area_struct *vma;
 	struct task_struct *tsk;

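For reference, 'dotraplinkage' is defined in arch/x86/include/asm/traps.h; per
the changelog above it expands to exactly the removed construct, roughly:

    /* sketch of the dotraplinkage definition (per the commit message) */
    #ifdef CONFIG_X86_32
    # define dotraplinkage				/* nothing on 32-bit */
    #else
    # define dotraplinkage	asmlinkage		/* asmlinkage on 64-bit */
    #endif
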
commit 1cc99544dde9e48602979f16b9309fade6e93051
Author: Ingo Molnar <mingo@elte.hu>
Date:   Fri Feb 20 23:07:48 2009 +0100

    x86, mm: fault.c, unify oops handling
    
    Impact: add oops-recursion check to 32-bit
    
    Unify the oops state machine, adopting the 64-bit version. It is
    slightly more careful, in that it does a recursion check in
    oops_begin() and is thus more likely to show the relevant
    oops.
    
    It also means that 32-bit will print one more line at the
    end of pagefault-triggered oopses:
    
            printk(KERN_EMERG "CR2: %016lx\n", address);
    
    Which is generally good information to be seen in partial-dump
    digital-camera jpegs ;-)
    
    The downside is the somewhat more complex critical path. Both
    variants have meanwhile been tested well by kernel developers
    crashing their boxes, so I don't think this is a practical worry.
    
    This removes 3 ugly #ifdefs from no_context() and makes the
    function a lot nicer to read.
    
    Cc: Linus Torvalds <torvalds@linux-foundation.org>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index ebfaca3bbb12..8fe2dd254df0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -659,11 +659,8 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 {
 	struct task_struct *tsk = current;
 	unsigned long *stackend;
-
-#ifdef CONFIG_X86_64
 	unsigned long flags;
 	int sig;
-#endif
 
 	/* Are we prepared to handle this kernel fault? */
 	if (fixup_exception(regs))
@@ -690,11 +687,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 	 * Oops. The kernel tried to access some bad page. We'll have to
 	 * terminate things with extreme prejudice:
 	 */
-#ifdef CONFIG_X86_32
-	bust_spinlocks(1);
-#else
 	flags = oops_begin();
-#endif
 
 	show_fault_oops(regs, error_code, address);
 
@@ -702,15 +695,10 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 	if (*stackend != STACK_END_MAGIC)
 		printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
 
-	tsk->thread.cr2 = address;
-	tsk->thread.trap_no = 14;
-	tsk->thread.error_code = error_code;
+	tsk->thread.cr2		= address;
+	tsk->thread.trap_no	= 14;
+	tsk->thread.error_code	= error_code;
 
-#ifdef CONFIG_X86_32
-	die("Oops", regs, error_code);
-	bust_spinlocks(0);
-	do_exit(SIGKILL);
-#else
 	sig = SIGKILL;
 	if (__die("Oops", regs, error_code))
 		sig = 0;
@@ -719,7 +707,6 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 	printk(KERN_EMERG "CR2: %016lx\n", address);
 
 	oops_end(flags, regs, sig);
-#endif
 }
 
 /*

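A conceptual sketch of the recursion guard that oops_begin() provides, which
the unified no_context() now inherits on 32-bit (simplified, not the verbatim
dumpstack code):

    static DEFINE_SPINLOCK(die_lock);
    static int die_owner = -1;		/* CPU currently printing an oops */

    unsigned long oops_begin(void)
    {
    	unsigned long flags;
    	int cpu;

    	local_irq_save(flags);
    	cpu = smp_processor_id();
    	if (!spin_trylock(&die_lock)) {
    		if (cpu != die_owner)
    			spin_lock(&die_lock);	/* another CPU is oopsing: wait */
    		/*
    		 * else: we faulted again while oopsing - carry on, so the
    		 * first (most relevant) oops still reaches the console
    		 */
    	}
    	die_owner = cpu;
    	bust_spinlocks(1);
    	return flags;
    }
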
commit 8f7661496cece8320137d5e26808825498fd2b26
Author: Ingo Molnar <mingo@elte.hu>
Date:   Fri Feb 20 23:00:29 2009 +0100

    x86, mm: fault.c, unify oops printing
    
    Impact: refine/extend page fault related oops printing on 64-bit
    
     - honor the pause_on_oops logic on 64-bit too
     - print out NX fault warnings on 64-bit as well
     - factor out the NX fault message to make it git-greppable and readable
    
    Note that this means we do the PF_INSTR check on 32-bit non-PAE
    as well, where it should not occur ... normally. Cannot hurt.
    
    Cc: Linus Torvalds <torvalds@linux-foundation.org>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 4ce62fb80da7..ebfaca3bbb12 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -595,28 +595,24 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
 	return 0;
 }
 
+static const char nx_warning[] = KERN_CRIT
+"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";
+
 static void
 show_fault_oops(struct pt_regs *regs, unsigned long error_code,
 		unsigned long address)
 {
-#ifdef CONFIG_X86_32
 	if (!oops_may_print())
 		return;
-#endif
 
-#ifdef CONFIG_X86_PAE
 	if (error_code & PF_INSTR) {
 		unsigned int level;
 
 		pte_t *pte = lookup_address(address, &level);
 
-		if (pte && pte_present(*pte) && !pte_exec(*pte)) {
-			printk(KERN_CRIT "kernel tried to execute "
-				"NX-protected page - exploit attempt? "
-				"(uid: %d)\n", current_uid());
-		}
+		if (pte && pte_present(*pte) && !pte_exec(*pte))
+			printk(nx_warning, current_uid());
 	}
-#endif
 
 	printk(KERN_ALERT "BUG: unable to handle kernel ");
 	if (address < PAGE_SIZE)

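The factored-out message relies on plain C string-literal concatenation: the
KERN_CRIT log-level marker is itself a short string literal, so it fuses with
the format string at compile time, and the call site shrinks to a bare
printk(nx_warning, current_uid()). A user-space illustration of the same
mechanism (the "<2>" value of KERN_CRIT matches the kernels of this era):

    #include <stdio.h>

    #define KERN_CRIT "<2>"		/* log-level prefix, a string literal */

    static const char nx_warning[] = KERN_CRIT
    "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";

    int main(void)
    {
    	/* stands in for printk(nx_warning, current_uid()) */
    	printf(nx_warning, 1000);
    	return 0;
    }
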
commit f2f13a8535174dbb813a0607a9d4737cfba98f6c
Author: Ingo Molnar <mingo@elte.hu>
Date:   Fri Feb 20 22:50:24 2009 +0100

    x86, mm: fault.c, reorder functions
    
    Impact: cleanup
    
    Avoid a couple more #ifdefs by moving fundamentally non-unifiable
    functions into a single #ifdef 32-bit / #else / #endif block in
    fault.c: vmalloc*(), dump_pagetable(), check_v8086_mode().
    
    No code changed:
    
       text    data     bss     dec     hex filename
       4618      32      24    4674    1242 fault.o.before
       4618      32      24    4674    1242 fault.o.after
    
    Cc: Linus Torvalds <torvalds@linux-foundation.org>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 379beaec6caa..4ce62fb80da7 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -191,18 +191,124 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address,
 	force_sig_info(si_signo, &info, tsk);
 }
 
-#ifdef CONFIG_X86_64
-static int bad_address(void *p)
+DEFINE_SPINLOCK(pgd_lock);
+LIST_HEAD(pgd_list);
+
+#ifdef CONFIG_X86_32
+static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
 {
-	unsigned long dummy;
+	unsigned index = pgd_index(address);
+	pgd_t *pgd_k;
+	pud_t *pud, *pud_k;
+	pmd_t *pmd, *pmd_k;
 
-	return probe_kernel_address((unsigned long *)p, dummy);
+	pgd += index;
+	pgd_k = init_mm.pgd + index;
+
+	if (!pgd_present(*pgd_k))
+		return NULL;
+
+	/*
+	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
+	 * and redundant with the set_pmd() on non-PAE. As would
+	 * set_pud.
+	 */
+	pud = pud_offset(pgd, address);
+	pud_k = pud_offset(pgd_k, address);
+	if (!pud_present(*pud_k))
+		return NULL;
+
+	pmd = pmd_offset(pud, address);
+	pmd_k = pmd_offset(pud_k, address);
+	if (!pmd_present(*pmd_k))
+		return NULL;
+
+	if (!pmd_present(*pmd)) {
+		set_pmd(pmd, *pmd_k);
+		arch_flush_lazy_mmu_mode();
+	} else {
+		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
+	}
+
+	return pmd_k;
+}
+
+void vmalloc_sync_all(void)
+{
+	unsigned long address;
+
+	if (SHARED_KERNEL_PMD)
+		return;
+
+	for (address = VMALLOC_START & PMD_MASK;
+	     address >= TASK_SIZE && address < FIXADDR_TOP;
+	     address += PMD_SIZE) {
+
+		unsigned long flags;
+		struct page *page;
+
+		spin_lock_irqsave(&pgd_lock, flags);
+		list_for_each_entry(page, &pgd_list, lru) {
+			if (!vmalloc_sync_one(page_address(page), address))
+				break;
+		}
+		spin_unlock_irqrestore(&pgd_lock, flags);
+	}
+}
+
+/*
+ * 32-bit:
+ *
+ *   Handle a fault on the vmalloc or module mapping area
+ */
+static noinline int vmalloc_fault(unsigned long address)
+{
+	unsigned long pgd_paddr;
+	pmd_t *pmd_k;
+	pte_t *pte_k;
+
+	/* Make sure we are in vmalloc area: */
+	if (!(address >= VMALLOC_START && address < VMALLOC_END))
+		return -1;
+
+	/*
+	 * Synchronize this task's top level page-table
+	 * with the 'reference' page table.
+	 *
+	 * Do _not_ use "current" here. We might be inside
+	 * an interrupt in the middle of a task switch..
+	 */
+	pgd_paddr = read_cr3();
+	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
+	if (!pmd_k)
+		return -1;
+
+	pte_k = pte_offset_kernel(pmd_k, address);
+	if (!pte_present(*pte_k))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Did it hit the DOS screen memory VA from vm86 mode?
+ */
+static inline void
+check_v8086_mode(struct pt_regs *regs, unsigned long address,
+		 struct task_struct *tsk)
+{
+	unsigned long bit;
+
+	if (!v8086_mode(regs))
+		return;
+
+	bit = (address - 0xA0000) >> PAGE_SHIFT;
+	if (bit < 32)
+		tsk->thread.screen_bitmap |= 1 << bit;
 }
-#endif
 
 static void dump_pagetable(unsigned long address)
 {
-#ifdef CONFIG_X86_32
 	__typeof__(pte_val(__pte(0))) page;
 
 	page = read_cr3();
@@ -239,7 +345,132 @@ static void dump_pagetable(unsigned long address)
 	}
 
 	printk("\n");
-#else /* CONFIG_X86_64 */
+}
+
+#else /* CONFIG_X86_64: */
+
+void vmalloc_sync_all(void)
+{
+	unsigned long address;
+
+	for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
+	     address += PGDIR_SIZE) {
+
+		const pgd_t *pgd_ref = pgd_offset_k(address);
+		unsigned long flags;
+		struct page *page;
+
+		if (pgd_none(*pgd_ref))
+			continue;
+
+		spin_lock_irqsave(&pgd_lock, flags);
+		list_for_each_entry(page, &pgd_list, lru) {
+			pgd_t *pgd;
+			pgd = (pgd_t *)page_address(page) + pgd_index(address);
+			if (pgd_none(*pgd))
+				set_pgd(pgd, *pgd_ref);
+			else
+				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+		}
+		spin_unlock_irqrestore(&pgd_lock, flags);
+	}
+}
+
+/*
+ * 64-bit:
+ *
+ *   Handle a fault on the vmalloc area
+ *
+ * This assumes no large pages in there.
+ */
+static noinline int vmalloc_fault(unsigned long address)
+{
+	pgd_t *pgd, *pgd_ref;
+	pud_t *pud, *pud_ref;
+	pmd_t *pmd, *pmd_ref;
+	pte_t *pte, *pte_ref;
+
+	/* Make sure we are in vmalloc area: */
+	if (!(address >= VMALLOC_START && address < VMALLOC_END))
+		return -1;
+
+	/*
+	 * Copy kernel mappings over when needed. This can also
+	 * happen within a race in page table update. In the later
+	 * case just flush:
+	 */
+	pgd = pgd_offset(current->active_mm, address);
+	pgd_ref = pgd_offset_k(address);
+	if (pgd_none(*pgd_ref))
+		return -1;
+
+	if (pgd_none(*pgd))
+		set_pgd(pgd, *pgd_ref);
+	else
+		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+
+	/*
+	 * Below here mismatches are bugs because these lower tables
+	 * are shared:
+	 */
+
+	pud = pud_offset(pgd, address);
+	pud_ref = pud_offset(pgd_ref, address);
+	if (pud_none(*pud_ref))
+		return -1;
+
+	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
+		BUG();
+
+	pmd = pmd_offset(pud, address);
+	pmd_ref = pmd_offset(pud_ref, address);
+	if (pmd_none(*pmd_ref))
+		return -1;
+
+	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
+		BUG();
+
+	pte_ref = pte_offset_kernel(pmd_ref, address);
+	if (!pte_present(*pte_ref))
+		return -1;
+
+	pte = pte_offset_kernel(pmd, address);
+
+	/*
+	 * Don't use pte_page here, because the mappings can point
+	 * outside mem_map, and the NUMA hash lookup cannot handle
+	 * that:
+	 */
+	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
+		BUG();
+
+	return 0;
+}
+
+static const char errata93_warning[] =
+KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
+KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
+KERN_ERR "******* Please consider a BIOS update.\n"
+KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
+
+/*
+ * No vm86 mode in 64-bit mode:
+ */
+static inline void
+check_v8086_mode(struct pt_regs *regs, unsigned long address,
+		 struct task_struct *tsk)
+{
+}
+
+static int bad_address(void *p)
+{
+	unsigned long dummy;
+
+	return probe_kernel_address((unsigned long *)p, dummy);
+}
+
+static void dump_pagetable(unsigned long address)
+{
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
@@ -284,83 +515,9 @@ static void dump_pagetable(unsigned long address)
 	return;
 bad:
 	printk("BAD\n");
-#endif
-}
-
-#ifdef CONFIG_X86_32
-static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
-{
-	unsigned index = pgd_index(address);
-	pgd_t *pgd_k;
-	pud_t *pud, *pud_k;
-	pmd_t *pmd, *pmd_k;
-
-	pgd += index;
-	pgd_k = init_mm.pgd + index;
-
-	if (!pgd_present(*pgd_k))
-		return NULL;
-
-	/*
-	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
-	 * and redundant with the set_pmd() on non-PAE. As would
-	 * set_pud.
-	 */
-	pud = pud_offset(pgd, address);
-	pud_k = pud_offset(pgd_k, address);
-	if (!pud_present(*pud_k))
-		return NULL;
-
-	pmd = pmd_offset(pud, address);
-	pmd_k = pmd_offset(pud_k, address);
-	if (!pmd_present(*pmd_k))
-		return NULL;
-
-	if (!pmd_present(*pmd)) {
-		set_pmd(pmd, *pmd_k);
-		arch_flush_lazy_mmu_mode();
-	} else {
-		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
-	}
-
-	return pmd_k;
-}
-
-/*
- * Did it hit the DOS screen memory VA from vm86 mode?
- */
-static inline void
-check_v8086_mode(struct pt_regs *regs, unsigned long address,
-		 struct task_struct *tsk)
-{
-	unsigned long bit;
-
-	if (!v8086_mode(regs))
-		return;
-
-	bit = (address - 0xA0000) >> PAGE_SHIFT;
-	if (bit < 32)
-		tsk->thread.screen_bitmap |= 1 << bit;
-}
-
-#else /* CONFIG_X86_64: */
-
-static const char errata93_warning[] =
-KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
-KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
-KERN_ERR "******* Please consider a BIOS update.\n"
-KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
-
-/*
- * No vm86 mode in 64-bit mode:
- */
-static inline void
-check_v8086_mode(struct pt_regs *regs, unsigned long address,
-		 struct task_struct *tsk)
-{
 }
 
-#endif
+#endif /* CONFIG_X86_64 */
 
 /*
  * Workaround for K8 erratum #93 & buggy BIOS.
@@ -795,109 +952,6 @@ spurious_fault(unsigned long error_code, unsigned long address)
 	return ret;
 }
 
-/*
- * 32-bit:
- *
- *   Handle a fault on the vmalloc or module mapping area
- *
- * 64-bit:
- *
- *   Handle a fault on the vmalloc area
- *
- * This assumes no large pages in there.
- */
-static noinline int vmalloc_fault(unsigned long address)
-{
-#ifdef CONFIG_X86_32
-	unsigned long pgd_paddr;
-	pmd_t *pmd_k;
-	pte_t *pte_k;
-
-	/* Make sure we are in vmalloc area: */
-	if (!(address >= VMALLOC_START && address < VMALLOC_END))
-		return -1;
-
-	/*
-	 * Synchronize this task's top level page-table
-	 * with the 'reference' page table.
-	 *
-	 * Do _not_ use "current" here. We might be inside
-	 * an interrupt in the middle of a task switch..
-	 */
-	pgd_paddr = read_cr3();
-	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
-	if (!pmd_k)
-		return -1;
-
-	pte_k = pte_offset_kernel(pmd_k, address);
-	if (!pte_present(*pte_k))
-		return -1;
-
-	return 0;
-#else
-	pgd_t *pgd, *pgd_ref;
-	pud_t *pud, *pud_ref;
-	pmd_t *pmd, *pmd_ref;
-	pte_t *pte, *pte_ref;
-
-	/* Make sure we are in vmalloc area: */
-	if (!(address >= VMALLOC_START && address < VMALLOC_END))
-		return -1;
-
-	/*
-	 * Copy kernel mappings over when needed. This can also
-	 * happen within a race in page table update. In the later
-	 * case just flush:
-	 */
-	pgd = pgd_offset(current->active_mm, address);
-	pgd_ref = pgd_offset_k(address);
-	if (pgd_none(*pgd_ref))
-		return -1;
-
-	if (pgd_none(*pgd))
-		set_pgd(pgd, *pgd_ref);
-	else
-		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
-
-	/*
-	 * Below here mismatches are bugs because these lower tables
-	 * are shared:
-	 */
-
-	pud = pud_offset(pgd, address);
-	pud_ref = pud_offset(pgd_ref, address);
-	if (pud_none(*pud_ref))
-		return -1;
-
-	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
-		BUG();
-
-	pmd = pmd_offset(pud, address);
-	pmd_ref = pmd_offset(pud_ref, address);
-	if (pmd_none(*pmd_ref))
-		return -1;
-
-	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
-		BUG();
-
-	pte_ref = pte_offset_kernel(pmd_ref, address);
-	if (!pte_present(*pte_ref))
-		return -1;
-
-	pte = pte_offset_kernel(pmd, address);
-
-	/*
-	 * Don't use pte_page here, because the mappings can point
-	 * outside mem_map, and the NUMA hash lookup cannot handle
-	 * that:
-	 */
-	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
-		BUG();
-
-	return 0;
-#endif
-}
-
 int show_unhandled_signals = 1;
 
 static inline int
@@ -1115,53 +1169,3 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 
 	up_read(&mm->mmap_sem);
 }
-
-DEFINE_SPINLOCK(pgd_lock);
-LIST_HEAD(pgd_list);
-
-void vmalloc_sync_all(void)
-{
-	unsigned long address;
-
-#ifdef CONFIG_X86_32
-	if (SHARED_KERNEL_PMD)
-		return;
-
-	for (address = VMALLOC_START & PMD_MASK;
-	     address >= TASK_SIZE && address < FIXADDR_TOP;
-	     address += PMD_SIZE) {
-
-		unsigned long flags;
-		struct page *page;
-
-		spin_lock_irqsave(&pgd_lock, flags);
-		list_for_each_entry(page, &pgd_list, lru) {
-			if (!vmalloc_sync_one(page_address(page), address))
-				break;
-		}
-		spin_unlock_irqrestore(&pgd_lock, flags);
-	}
-#else /* CONFIG_X86_64 */
-	for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
-	     address += PGDIR_SIZE) {
-
-		const pgd_t *pgd_ref = pgd_offset_k(address);
-		unsigned long flags;
-		struct page *page;
-
-		if (pgd_none(*pgd_ref))
-			continue;
-
-		spin_lock_irqsave(&pgd_lock, flags);
-		list_for_each_entry(page, &pgd_list, lru) {
-			pgd_t *pgd;
-			pgd = (pgd_t *)page_address(page) + pgd_index(address);
-			if (pgd_none(*pgd))
-				set_pgd(pgd, *pgd_ref);
-			else
-				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
-		}
-		spin_unlock_irqrestore(&pgd_lock, flags);
-	}
-#endif
-}

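The resulting fault.c layout, as an outline (per-arch halves only; function
bodies elided):

    #ifdef CONFIG_X86_32
    static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address);
    void vmalloc_sync_all(void);
    static noinline int vmalloc_fault(unsigned long address);
    static inline void check_v8086_mode(...);
    static void dump_pagetable(unsigned long address);
    #else /* CONFIG_X86_64: */
    void vmalloc_sync_all(void);
    static noinline int vmalloc_fault(unsigned long address);
    static inline void check_v8086_mode(...);	/* empty: no vm86 on 64-bit */
    static int bad_address(void *p);
    static void dump_pagetable(unsigned long address);
    #endif /* CONFIG_X86_64 */

All remaining fault-handling logic outside this one block is shared between
the two configurations.
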
commit b18018126f422f5b706fd750373425e10e84b486
Author: Ingo Molnar <mingo@elte.hu>
Date:   Fri Feb 20 22:42:57 2009 +0100

    x86, mm, kprobes: fault.c, simplify notify_page_fault()
    
    Impact: cleanup
    
    Remove an #ifdef from notify_page_fault(). The function still
    compiles to nothing in the !CONFIG_KPROBES case.
    
    Introduce kprobes_built_in() and kprobe_fault_handler() helpers
    to allow this - they return 0 if !CONFIG_KPROBES.
    
    No code changed:
    
       text    data     bss     dec     hex filename
       4618      32      24    4674    1242 fault.o.before
       4618      32      24    4674    1242 fault.o.after
    
    Cc: Masami Hiramatsu <mhiramat@redhat.com>
    Cc: Andrew Morton <akpm@linux-foundation.org>
    Cc: Linus Torvalds <torvalds@linux-foundation.org>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index fe99af4b86d9..379beaec6caa 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -68,11 +68,10 @@ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
 
 static inline int notify_page_fault(struct pt_regs *regs)
 {
-#ifdef CONFIG_KPROBES
 	int ret = 0;
 
 	/* kprobe_running() needs smp_processor_id() */
-	if (!user_mode_vm(regs)) {
+	if (kprobes_built_in() && !user_mode_vm(regs)) {
 		preempt_disable();
 		if (kprobe_running() && kprobe_fault_handler(regs, 14))
 			ret = 1;
@@ -80,9 +79,6 @@ static inline int notify_page_fault(struct pt_regs *regs)
 	}
 
 	return ret;
-#else
-	return 0;
-#endif
 }
 
 /*
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 32851eef48f0..2ec6cc14a114 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -182,6 +182,14 @@ struct kprobe_blackpoint {
 DECLARE_PER_CPU(struct kprobe *, current_kprobe);
 DECLARE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
 
+/*
+ * For #ifdef avoidance:
+ */
+static inline int kprobes_built_in(void)
+{
+	return 1;
+}
+
 #ifdef CONFIG_KRETPROBES
 extern void arch_prepare_kretprobe(struct kretprobe_instance *ri,
 				   struct pt_regs *regs);
@@ -271,8 +279,16 @@ void unregister_kretprobes(struct kretprobe **rps, int num);
 void kprobe_flush_task(struct task_struct *tk);
 void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head);
 
-#else /* CONFIG_KPROBES */
+#else /* !CONFIG_KPROBES: */
 
+static inline int kprobes_built_in(void)
+{
+	return 0;
+}
+static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+{
+	return 0;
+}
 static inline struct kprobe *get_kprobe(void *addr)
 {
 	return NULL;
@@ -329,5 +345,5 @@ static inline void unregister_kretprobes(struct kretprobe **rps, int num)
 static inline void kprobe_flush_task(struct task_struct *tk)
 {
 }
-#endif				/* CONFIG_KPROBES */
-#endif				/* _LINUX_KPROBES_H */
+#endif /* CONFIG_KPROBES */
+#endif /* _LINUX_KPROBES_H */

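The pattern depends on constant folding: under !CONFIG_KPROBES the inline
helper returns the constant 0, so the && chain is statically false and the
compiler discards the whole branch - which is why the object code stays
byte-for-byte identical, as the size comparison above shows. A self-contained
illustration (stand-in helpers, not the kernel headers):

    #include <stdio.h>

    #ifdef CONFIG_KPROBES
    static inline int kprobes_built_in(void) { return 1; }
    #else
    static inline int kprobes_built_in(void) { return 0; }
    #endif

    static int notify_page_fault(void)
    {
    	int ret = 0;

    	/* with !CONFIG_KPROBES this whole block is dead code and is elided */
    	if (kprobes_built_in())
    		ret = 1;

    	return ret;
    }

    int main(void)
    {
    	printf("%d\n", notify_page_fault());
    	return 0;
    }
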
commit b814d41f0987c7648d7ed07471258101c95c026b
Author: Ingo Molnar <mingo@elte.hu>
Date:   Fri Feb 20 22:32:10 2009 +0100

    x86, mm: fault.c, simplify kmmio_fault()
    
    Impact: cleanup
    
    Remove an #ifdef from kmmio_fault() - we can do this by
    providing default implementations for is_kmmio_active()
    and kmmio_handler(). The compiler optimizes it all away
    in the !CONFIG_MMIOTRACE case.
    
    Also, while at it, clean up mmiotrace.h a bit:
    
     - standard header guards
     - standard vertical spaces for structure definitions
    
    No code changed (both with mmiotrace on and off in the config):
    
       text    data     bss     dec     hex filename
       2947      12      12    2971     b9b fault.o.before
       2947      12      12    2971     b9b fault.o.after
    
    Cc: Pekka Paalanen <pq@iki.fi>
    Cc: Linus Torvalds <torvalds@linux-foundation.org>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 3e3661462739..fe99af4b86d9 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -55,13 +55,14 @@ enum x86_pf_error_code {
 	PF_INSTR	=		1 << 4,
 };
 
+/*
+ * (returns 0 if mmiotrace is disabled)
+ */
 static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
 {
-#ifdef CONFIG_MMIOTRACE
 	if (unlikely(is_kmmio_active()))
 		if (kmmio_handler(regs, addr) == 1)
 			return -1;
-#endif
 	return 0;
 }
 
diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h
index 139d7c88d9c9..3d1b7bde1283 100644
--- a/include/linux/mmiotrace.h
+++ b/include/linux/mmiotrace.h
@@ -1,5 +1,5 @@
-#ifndef MMIOTRACE_H
-#define MMIOTRACE_H
+#ifndef _LINUX_MMIOTRACE_H
+#define _LINUX_MMIOTRACE_H
 
 #include <linux/types.h>
 #include <linux/list.h>
@@ -13,28 +13,34 @@ typedef void (*kmmio_post_handler_t)(struct kmmio_probe *,
 				unsigned long condition, struct pt_regs *);
 
 struct kmmio_probe {
-	struct list_head list; /* kmmio internal list */
-	unsigned long addr; /* start location of the probe point */
-	unsigned long len; /* length of the probe region */
-	kmmio_pre_handler_t pre_handler; /* Called before addr is executed. */
-	kmmio_post_handler_t post_handler; /* Called after addr is executed */
-	void *private;
+	/* kmmio internal list: */
+	struct list_head	list;
+	/* start location of the probe point: */
+	unsigned long		addr;
+	/* length of the probe region: */
+	unsigned long		len;
+	/* Called before addr is executed: */
+	kmmio_pre_handler_t	pre_handler;
+	/* Called after addr is executed: */
+	kmmio_post_handler_t	post_handler;
+	void			*private;
 };
 
+extern unsigned int kmmio_count;
+
+extern int register_kmmio_probe(struct kmmio_probe *p);
+extern void unregister_kmmio_probe(struct kmmio_probe *p);
+
+#ifdef CONFIG_MMIOTRACE
 /* kmmio is active by some kmmio_probes? */
 static inline int is_kmmio_active(void)
 {
-	extern unsigned int kmmio_count;
 	return kmmio_count;
 }
 
-extern int register_kmmio_probe(struct kmmio_probe *p);
-extern void unregister_kmmio_probe(struct kmmio_probe *p);
-
 /* Called from page fault handler. */
 extern int kmmio_handler(struct pt_regs *regs, unsigned long addr);
 
-#ifdef CONFIG_MMIOTRACE
 /* Called from ioremap.c */
 extern void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
 							void __iomem *addr);
@@ -43,7 +49,17 @@ extern void mmiotrace_iounmap(volatile void __iomem *addr);
 /* For anyone to insert markers. Remember trailing newline. */
 extern int mmiotrace_printk(const char *fmt, ...)
 				__attribute__ ((format (printf, 1, 2)));
-#else
+#else /* !CONFIG_MMIOTRACE: */
+static inline int is_kmmio_active(void)
+{
+	return 0;
+}
+
+static inline int kmmio_handler(struct pt_regs *regs, unsigned long addr)
+{
+	return 0;
+}
+
 static inline void mmiotrace_ioremap(resource_size_t offset,
 					unsigned long size, void __iomem *addr)
 {
@@ -63,28 +79,28 @@ static inline int mmiotrace_printk(const char *fmt, ...)
 #endif /* CONFIG_MMIOTRACE */
 
 enum mm_io_opcode {
-	MMIO_READ = 0x1,     /* struct mmiotrace_rw */
-	MMIO_WRITE = 0x2,    /* struct mmiotrace_rw */
-	MMIO_PROBE = 0x3,    /* struct mmiotrace_map */
-	MMIO_UNPROBE = 0x4,  /* struct mmiotrace_map */
-	MMIO_UNKNOWN_OP = 0x5, /* struct mmiotrace_rw */
+	MMIO_READ	= 0x1,	/* struct mmiotrace_rw */
+	MMIO_WRITE	= 0x2,	/* struct mmiotrace_rw */
+	MMIO_PROBE	= 0x3,	/* struct mmiotrace_map */
+	MMIO_UNPROBE	= 0x4,	/* struct mmiotrace_map */
+	MMIO_UNKNOWN_OP = 0x5,	/* struct mmiotrace_rw */
 };
 
 struct mmiotrace_rw {
-	resource_size_t phys;	/* PCI address of register */
-	unsigned long value;
-	unsigned long pc;	/* optional program counter */
-	int map_id;
-	unsigned char opcode;	/* one of MMIO_{READ,WRITE,UNKNOWN_OP} */
-	unsigned char width;	/* size of register access in bytes */
+	resource_size_t	phys;	/* PCI address of register */
+	unsigned long	value;
+	unsigned long	pc;	/* optional program counter */
+	int		map_id;
+	unsigned char	opcode;	/* one of MMIO_{READ,WRITE,UNKNOWN_OP} */
+	unsigned char	width;	/* size of register access in bytes */
 };
 
 struct mmiotrace_map {
-	resource_size_t phys;	/* base address in PCI space */
-	unsigned long virt;	/* base virtual address */
-	unsigned long len;	/* mapping size */
-	int map_id;
-	unsigned char opcode;	/* MMIO_PROBE or MMIO_UNPROBE */
+	resource_size_t	phys;	/* base address in PCI space */
+	unsigned long	virt;	/* base virtual address */
+	unsigned long	len;	/* mapping size */
+	int		map_id;
+	unsigned char	opcode;	/* MMIO_PROBE or MMIO_UNPROBE */
 };
 
 /* in kernel/trace/trace_mmiotrace.c */
@@ -94,4 +110,4 @@ extern void mmio_trace_rw(struct mmiotrace_rw *rw);
 extern void mmio_trace_mapping(struct mmiotrace_map *map);
 extern int mmio_trace_printk(const char *fmt, va_list args);
 
-#endif /* MMIOTRACE_H */
+#endif /* _LINUX_MMIOTRACE_H */

commit 121d5d0a7e5808fbcfda484efd7ba840ac93450f
Author: Ingo Molnar <mingo@elte.hu>
Date:   Fri Feb 20 22:18:08 2009 +0100

    x86, mm: fault.c, enable PF_RSVD checks on 32-bit too
    
    Impact: improve page fault handling robustness
    
    The 'PF_RSVD' flag (bit 3) of the page-fault error_code is a
    relatively recent addition to x86 CPUs, so the 32-bit
    do_page_fault() implementation never had it. This flag gets set
    when the CPU detects nonzero values in any reserved bits of the
    page directory entries.
    
    Extend the existing 64-bit check for PF_RSVD in do_page_fault()
    to 32-bit too. If we detect such a fault then we print a more
    informative oops and dump the pagetables.
    
    This unifies the code some more, removes an ugly #ifdef and improves
    the 32-bit page fault code robustness a bit. It slightly increases
    the 32-bit kernel text size.
    
    Cc: Linus Torvalds <torvalds@linux-foundation.org>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 7dc0615c3cfe..3e3661462739 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -477,7 +477,6 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
 	dump_pagetable(address);
 }
 
-#ifdef CONFIG_X86_64
 static noinline void
 pgtable_bad(struct pt_regs *regs, unsigned long error_code,
 	    unsigned long address)
@@ -503,7 +502,6 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code,
 
 	oops_end(flags, regs, sig);
 }
-#endif
 
 static noinline void
 no_context(struct pt_regs *regs, unsigned long error_code,
@@ -1015,10 +1013,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 			local_irq_enable();
 	}
 
-#ifdef CONFIG_X86_64
 	if (unlikely(error_code & PF_RSVD))
 		pgtable_bad(regs, error_code, address);
-#endif
 
 	/*
 	 * If we're in an interrupt, have no user context or are running

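For reference, the full set of page-fault error_code bits as fault.c
enumerates them in this era (the PF_INSTR tail of this enum is visible in the
kmmio_fault() hunk further above):

    enum x86_pf_error_code {
    	PF_PROT		=	1 << 0,	/* 0: no page found, 1: protection fault */
    	PF_WRITE	=	1 << 1,	/* 0: read access, 1: write access */
    	PF_USER		=	1 << 2,	/* 0: kernel-mode, 1: user-mode access */
    	PF_RSVD		=	1 << 3,	/* reserved bit set in a paging entry */
    	PF_INSTR	=	1 << 4,	/* fault was an instruction fetch */
    };
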
commit 8c938f9fae887f6e180bf802aa1c33cf74712aff
Author: Ingo Molnar <mingo@elte.hu>
Date:   Fri Feb 20 22:12:18 2009 +0100

    x86, mm: fault.c, factor out the vm86 fault check
    
    Impact: cleanup
    
    Instead of an ugly, open-coded, #ifdef-ed vm86-related legacy check
    in do_page_fault(), put it into the check_v8086_mode() helper
    function and merge it with an existing #ifdef.
    
    Also, simplify the code flow a tiny bit in the helper.
    
    No code changed:
    
    arch/x86/mm/fault.o:
    
       text    data     bss     dec     hex filename
       2711      12      12    2735     aaf fault.o.before
       2711      12      12    2735     aaf fault.o.after
    
    Cc: Linus Torvalds <torvalds@linux-foundation.org>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 72973c7682ba..7dc0615c3cfe 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -328,14 +328,41 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
 
 	return pmd_k;
 }
-#endif
 
-#ifdef CONFIG_X86_64
+/*
+ * Did it hit the DOS screen memory VA from vm86 mode?
+ */
+static inline void
+check_v8086_mode(struct pt_regs *regs, unsigned long address,
+		 struct task_struct *tsk)
+{
+	unsigned long bit;
+
+	if (!v8086_mode(regs))
+		return;
+
+	bit = (address - 0xA0000) >> PAGE_SHIFT;
+	if (bit < 32)
+		tsk->thread.screen_bitmap |= 1 << bit;
+}
+
+#else /* CONFIG_X86_64: */
+
 static const char errata93_warning[] =
 KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
 KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
 KERN_ERR "******* Please consider a BIOS update.\n"
 KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
+
+/*
+ * No vm86 mode in 64-bit mode:
+ */
+static inline void
+check_v8086_mode(struct pt_regs *regs, unsigned long address,
+		 struct task_struct *tsk)
+{
+}
+
 #endif
 
 /*
@@ -1091,16 +1118,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	else
 		tsk->min_flt++;
 
-#ifdef CONFIG_X86_32
-	/*
-	 * Did it hit the DOS screen memory VA from vm86 mode?
-	 */
-	if (v8086_mode(regs)) {
-		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
-		if (bit < 32)
-			tsk->thread.screen_bitmap |= 1 << bit;
-	}
-#endif
+	check_v8086_mode(regs, address, tsk);
+
 	up_read(&mm->mmap_sem);
 }
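
A worked example of the screen-bitmap arithmetic in check_v8086_mode(): with
4 KB pages (PAGE_SHIFT == 12), the 32 tracked pages cover the VGA window
0xA0000-0xBFFFF, and a vm86-mode fault at 0xA2000 lands two pages in:

    #include <stdio.h>

    int main(void)
    {
    	unsigned long address = 0xA2000;	/* two pages into the VGA window */
    	unsigned long bit = (address - 0xA0000) >> 12;	/* PAGE_SHIFT == 12 */
    	unsigned long screen_bitmap = 0;

    	if (bit < 32)		/* 32 x 4 KB pages = 0xA0000..0xBFFFF */
    		screen_bitmap |= 1UL << bit;

    	/* prints: bit=2 bitmap=0x4 */
    	printf("bit=%lu bitmap=%#lx\n", bit, screen_bitmap);
    	return 0;
    }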