Patches contributed by Eötvös Lorand University


commit 813400060fe2824163cabd9afed4e476e7ce282e
Merge: 1bf7b31efa0c fe955e5c793a
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed Jun 17 18:21:41 2009 +0200

    Merge branch 'x86/urgent' into x86/mce3
    
    Conflicts:
            arch/x86/kernel/cpu/mcheck/mce_intel.c
    
    Merge reason: merge with an urgent-branch MCE fix.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

commit b0a5b83ee0fce9dbf8ff5fe1f8c9ae7dfafe458c
Author: Ingo Molnar <mingo@elte.hu>
Date:   Tue Jun 16 16:11:14 2009 +0200

    dma-debug: Put all hash-chain locks into the same lock class
    
    Alan Cox reported that lockdep runs out of its stack-trace entries
    with certain configs:
    
     BUG: MAX_STACK_TRACE_ENTRIES too low
    
    This happens because there are 1024 hash buckets, each with a
    separate lock. Lockdep puts each lock into a separate lock class and
    tracks them independently.
    
    But in reality we never take more than one of the buckets, so they
    really belong into a single lock-class. Annotate the hash-bucket lock
    init accordingly.
    
    [ Impact: reduce the lockdep footprint of dma-debug ]
    
    Reported-by: Alan Cox <alan@linux.intel.com>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
    Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index a9b6b5c9e091..c9187fed0b93 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -716,7 +716,7 @@ void dma_debug_init(u32 num_entries)
 
 	for (i = 0; i < HASH_SIZE; ++i) {
 		INIT_LIST_HEAD(&dma_entry_hash[i].list);
-		dma_entry_hash[i].lock = SPIN_LOCK_UNLOCKED;
+		spin_lock_init(&dma_entry_hash[i].lock);
 	}
 
 	if (dma_debug_fs_init() != 0) {

commit a3d06cc6aa3e765dc2bf98626f87272dcf641dca
Merge: 0990b1c65729 65795efbd380
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed Jun 17 13:06:17 2009 +0200

    Merge branch 'linus' into perfcounters/core
    
    Conflicts:
            arch/x86/include/asm/kmap_types.h
            include/linux/mm.h
    
            include/asm-generic/kmap_types.h
    
    Merge reason: We crossed changes with kmap_types.h cleanups in mainline.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --cc include/asm-generic/kmap_types.h
index 58c33055c304,54e8b3d956b7..eddbce0f9fb9
--- a/include/asm-generic/kmap_types.h
+++ b/include/asm-generic/kmap_types.h
@@@ -24,7 -24,7 +24,10 @@@ D(12)	KM_SOFTIRQ1
  D(13)	KM_SYNC_ICACHE,
  D(14)	KM_SYNC_DCACHE,
  D(15)	KM_UML_USERCOPY, /* UML specific, for copy_*_user - used in do_op_one_page */
--D(16)	KM_TYPE_NR
++D(16)	KM_IRQ_PTE,
++D(17)	KM_NMI,
++D(18)	KM_NMI_PTE,
++D(19)	KM_TYPE_NR
  };
  
  #undef D
diff --cc include/linux/mm.h
index b457bc047ab1,d88d6fc530ad..cf260d848eb9
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@@ -849,25 -853,6 +853,12 @@@ extern int mprotect_fixup(struct vm_are
  			  struct vm_area_struct **pprev, unsigned long start,
  			  unsigned long end, unsigned long newflags);
  
- /*
-  * get_user_pages_fast provides equivalent functionality to get_user_pages,
-  * operating on current and current->mm (force=0 and doesn't return any vmas).
-  *
-  * get_user_pages_fast may take mmap_sem and page tables, so no assumptions
-  * can be made about locking. get_user_pages_fast is to be implemented in a
-  * way that is advantageous (vs get_user_pages()) when the user memory area is
-  * already faulted in and present in ptes. However if the pages have to be
-  * faulted in, it may turn out to be slightly slower).
-  */
- int get_user_pages_fast(unsigned long start, int nr_pages, int write,
- 			struct page **pages);
- 
 +/*
 + * doesn't attempt to fault and will return short.
 + */
 +int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
 +			  struct page **pages);
 +
  /*
   * A callback you can register to apply pressure to ageable caches.
   *

commit eadb8a091b27a840de7450f84ecff5ef13476424
Merge: 73874005cd88 65795efbd380
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed Jun 17 12:52:15 2009 +0200

    Merge branch 'linus' into tracing/hw-breakpoints
    
    Conflicts:
            arch/x86/Kconfig
            arch/x86/kernel/traps.c
            arch/x86/power/cpu.c
            arch/x86/power/cpu_32.c
            kernel/Makefile
    
    Semantic conflict:
            arch/x86/kernel/hw_breakpoint.c
    
    Merge reason: Resolve the conflicts, move from put_cpu_no_sched() to
                  put_cpu() in arch/x86/kernel/hw_breakpoint.c.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --cc arch/x86/Kconfig
index 3033375ed6bc,cf42fc305419..52421d52f21e
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@@ -46,7 -46,12 +46,13 @@@ config X8
  	select HAVE_KERNEL_GZIP
  	select HAVE_KERNEL_BZIP2
  	select HAVE_KERNEL_LZMA
 +	select HAVE_HW_BREAKPOINT
+ 	select HAVE_ARCH_KMEMCHECK
+ 
+ config OUTPUT_FORMAT
+ 	string
+ 	default "elf32-i386" if X86_32
+ 	default "elf64-x86-64" if X86_64
  
  config ARCH_DEFCONFIG
  	string
diff --cc arch/x86/include/asm/processor.h
index 448b34a8e393,c7768269b1cf..2b03f700d3f2
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@@ -428,15 -425,20 +426,19 @@@ struct thread_struct 
  	unsigned short		fsindex;
  	unsigned short		gsindex;
  #endif
+ #ifdef CONFIG_X86_32
  	unsigned long		ip;
+ #endif
+ #ifdef CONFIG_X86_64
  	unsigned long		fs;
+ #endif
  	unsigned long		gs;
  	/* Hardware debugging registers: */
 -	unsigned long		debugreg0;
 -	unsigned long		debugreg1;
 -	unsigned long		debugreg2;
 -	unsigned long		debugreg3;
 +	unsigned long		debugreg[HBP_NUM];
  	unsigned long		debugreg6;
  	unsigned long		debugreg7;
 +	/* Hardware breakpoint info */
 +	struct hw_breakpoint	*hbp[HBP_NUM];
  	/* Fault info: */
  	unsigned long		cr2;
  	unsigned long		trap_no;
diff --cc arch/x86/kernel/hw_breakpoint.c
index 69451473dbd2,000000000000..51d959528b1d
mode 100644,000000..100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@@ -1,391 -1,0 +1,391 @@@
 +/*
 + * This program is free software; you can redistribute it and/or modify
 + * it under the terms of the GNU General Public License as published by
 + * the Free Software Foundation; either version 2 of the License, or
 + * (at your option) any later version.
 + *
 + * This program is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 + * GNU General Public License for more details.
 + *
 + * You should have received a copy of the GNU General Public License
 + * along with this program; if not, write to the Free Software
 + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 + *
 + * Copyright (C) 2007 Alan Stern
 + * Copyright (C) 2009 IBM Corporation
 + */
 +
 +/*
 + * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
 + * using the CPU's debug registers.
 + */
 +
 +#include <linux/irqflags.h>
 +#include <linux/notifier.h>
 +#include <linux/kallsyms.h>
 +#include <linux/kprobes.h>
 +#include <linux/percpu.h>
 +#include <linux/kdebug.h>
 +#include <linux/kernel.h>
 +#include <linux/module.h>
 +#include <linux/sched.h>
 +#include <linux/init.h>
 +#include <linux/smp.h>
 +
 +#include <asm/hw_breakpoint.h>
 +#include <asm/processor.h>
 +#include <asm/debugreg.h>
 +
 +/* Unmasked kernel DR7 value */
 +static unsigned long kdr7;
 +
 +/*
 + * Masks for the bits corresponding to registers DR0 - DR3 in DR7 register.
 + * Used to clear and verify the status of bits corresponding to DR0 - DR3
 + */
 +static const unsigned long	dr7_masks[HBP_NUM] = {
 +	0x000f0003,	/* LEN0, R/W0, G0, L0 */
 +	0x00f0000c,	/* LEN1, R/W1, G1, L1 */
 +	0x0f000030,	/* LEN2, R/W2, G2, L2 */
 +	0xf00000c0	/* LEN3, R/W3, G3, L3 */
 +};
 +
 +
 +/*
 + * Encode the length, type, Exact, and Enable bits for a particular breakpoint
 + * as stored in debug register 7.
 + */
 +static unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
 +{
 +	unsigned long bp_info;
 +
 +	bp_info = (len | type) & 0xf;
 +	bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
 +	bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)) |
 +				DR_GLOBAL_SLOWDOWN;
 +	return bp_info;
 +}
 +
 +void arch_update_kernel_hw_breakpoint(void *unused)
 +{
 +	struct hw_breakpoint *bp;
 +	int i, cpu = get_cpu();
 +	unsigned long temp_kdr7 = 0;
 +
 +	/* Don't allow debug exceptions while we update the registers */
 +	set_debugreg(0UL, 7);
 +
 +	for (i = hbp_kernel_pos; i < HBP_NUM; i++) {
 +		per_cpu(this_hbp_kernel[i], cpu) = bp = hbp_kernel[i];
 +		if (bp) {
 +			temp_kdr7 |= encode_dr7(i, bp->info.len, bp->info.type);
 +			set_debugreg(bp->info.address, i);
 +		}
 +	}
 +
 +	/* No need to set DR6. Update the debug registers with kernel-space
 +	 * breakpoint values from kdr7 and user-space requests from the
 +	 * current process
 +	 */
 +	kdr7 = temp_kdr7;
 +	set_debugreg(kdr7 | current->thread.debugreg7, 7);
- 	put_cpu_no_resched();
++	put_cpu();
 +}
 +
 +/*
 + * Install the thread breakpoints in their debug registers.
 + */
 +void arch_install_thread_hw_breakpoint(struct task_struct *tsk)
 +{
 +	struct thread_struct *thread = &(tsk->thread);
 +
 +	switch (hbp_kernel_pos) {
 +	case 4:
 +		set_debugreg(thread->debugreg[3], 3);
 +	case 3:
 +		set_debugreg(thread->debugreg[2], 2);
 +	case 2:
 +		set_debugreg(thread->debugreg[1], 1);
 +	case 1:
 +		set_debugreg(thread->debugreg[0], 0);
 +	default:
 +		break;
 +	}
 +
 +	/* No need to set DR6 */
 +	set_debugreg((kdr7 | thread->debugreg7), 7);
 +}
 +
 +/*
 + * Install the debug register values for just the kernel, no thread.
 + */
 +void arch_uninstall_thread_hw_breakpoint()
 +{
 +	/* Clear the user-space portion of debugreg7 by setting only kdr7 */
 +	set_debugreg(kdr7, 7);
 +
 +}
 +
 +static int get_hbp_len(u8 hbp_len)
 +{
 +	unsigned int len_in_bytes = 0;
 +
 +	switch (hbp_len) {
 +	case HW_BREAKPOINT_LEN_1:
 +		len_in_bytes = 1;
 +		break;
 +	case HW_BREAKPOINT_LEN_2:
 +		len_in_bytes = 2;
 +		break;
 +	case HW_BREAKPOINT_LEN_4:
 +		len_in_bytes = 4;
 +		break;
 +#ifdef CONFIG_X86_64
 +	case HW_BREAKPOINT_LEN_8:
 +		len_in_bytes = 8;
 +		break;
 +#endif
 +	}
 +	return len_in_bytes;
 +}
 +
 +/*
 + * Check for virtual address in user space.
 + */
 +int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
 +{
 +	unsigned int len;
 +
 +	len = get_hbp_len(hbp_len);
 +
 +	return (va <= TASK_SIZE - len);
 +}
 +
 +/*
 + * Check for virtual address in kernel space.
 + */
 +int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
 +{
 +	unsigned int len;
 +
 +	len = get_hbp_len(hbp_len);
 +
 +	return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
 +}
 +
 +/*
 + * Store a breakpoint's encoded address, length, and type.
 + */
 +static int arch_store_info(struct hw_breakpoint *bp, struct task_struct *tsk)
 +{
 +	/*
 +	 * User-space requests will always have the address field populated
 +	 * Symbol names from user-space are rejected
 +	 */
 +	if (tsk && bp->info.name)
 +		return -EINVAL;
 +	/*
 +	 * For kernel-addresses, either the address or symbol name can be
 +	 * specified.
 +	 */
 +	if (bp->info.name)
 +		bp->info.address = (unsigned long)
 +					kallsyms_lookup_name(bp->info.name);
 +	if (bp->info.address)
 +		return 0;
 +	return -EINVAL;
 +}
 +
 +/*
 + * Validate the arch-specific HW Breakpoint register settings
 + */
 +int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
 +						struct task_struct *tsk)
 +{
 +	unsigned int align;
 +	int ret = -EINVAL;
 +
 +	switch (bp->info.type) {
 +	/*
 +	 * Ptrace-refactoring code
 +	 * For now, we'll allow instruction breakpoint only for user-space
 +	 * addresses
 +	 */
 +	case HW_BREAKPOINT_EXECUTE:
 +		if ((!arch_check_va_in_userspace(bp->info.address,
 +							bp->info.len)) &&
 +			bp->info.len != HW_BREAKPOINT_LEN_EXECUTE)
 +			return ret;
 +		break;
 +	case HW_BREAKPOINT_WRITE:
 +		break;
 +	case HW_BREAKPOINT_RW:
 +		break;
 +	default:
 +		return ret;
 +	}
 +
 +	switch (bp->info.len) {
 +	case HW_BREAKPOINT_LEN_1:
 +		align = 0;
 +		break;
 +	case HW_BREAKPOINT_LEN_2:
 +		align = 1;
 +		break;
 +	case HW_BREAKPOINT_LEN_4:
 +		align = 3;
 +		break;
 +#ifdef CONFIG_X86_64
 +	case HW_BREAKPOINT_LEN_8:
 +		align = 7;
 +		break;
 +#endif
 +	default:
 +		return ret;
 +	}
 +
 +	if (bp->triggered)
 +		ret = arch_store_info(bp, tsk);
 +
 +	if (ret < 0)
 +		return ret;
 +	/*
 +	 * Check that the low-order bits of the address are appropriate
 +	 * for the alignment implied by len.
 +	 */
 +	if (bp->info.address & align)
 +		return -EINVAL;
 +
 +	/* Check that the virtual address is in the proper range */
 +	if (tsk) {
 +		if (!arch_check_va_in_userspace(bp->info.address, bp->info.len))
 +			return -EFAULT;
 +	} else {
 +		if (!arch_check_va_in_kernelspace(bp->info.address,
 +								bp->info.len))
 +			return -EFAULT;
 +	}
 +	return 0;
 +}
 +
 +void arch_update_user_hw_breakpoint(int pos, struct task_struct *tsk)
 +{
 +	struct thread_struct *thread = &(tsk->thread);
 +	struct hw_breakpoint *bp = thread->hbp[pos];
 +
 +	thread->debugreg7 &= ~dr7_masks[pos];
 +	if (bp) {
 +		thread->debugreg[pos] = bp->info.address;
 +		thread->debugreg7 |= encode_dr7(pos, bp->info.len,
 +							bp->info.type);
 +	} else
 +		thread->debugreg[pos] = 0;
 +}
 +
 +void arch_flush_thread_hw_breakpoint(struct task_struct *tsk)
 +{
 +	int i;
 +	struct thread_struct *thread = &(tsk->thread);
 +
 +	thread->debugreg7 = 0;
 +	for (i = 0; i < HBP_NUM; i++)
 +		thread->debugreg[i] = 0;
 +}
 +
 +/*
 + * Handle debug exception notifications.
 + *
 + * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below.
 + *
 + * NOTIFY_DONE returned if one of the following conditions is true.
 + * i) When the causative address is from user-space and the exception
 + * is a valid one, i.e. not triggered as a result of lazy debug register
 + * switching
 + * ii) When there are more bits than trap<n> set in DR6 register (such
 + * as BD, BS or BT) indicating that more than one debug condition is
 + * met and requires some more action in do_debug().
 + *
 + * NOTIFY_STOP returned for all other cases
 + *
 + */
 +int __kprobes hw_breakpoint_handler(struct die_args *args)
 +{
 +	int i, cpu, rc = NOTIFY_STOP;
 +	struct hw_breakpoint *bp;
 +	unsigned long dr7, dr6;
 +	unsigned long *dr6_p;
 +
 +	/* The DR6 value is pointed by args->err */
 +	dr6_p = (unsigned long *)ERR_PTR(args->err);
 +	dr6 = *dr6_p;
 +
 +	/* Do an early return if no trap bits are set in DR6 */
 +	if ((dr6 & DR_TRAP_BITS) == 0)
 +		return NOTIFY_DONE;
 +
 +	/* Lazy debug register switching */
 +	if (!test_tsk_thread_flag(current, TIF_DEBUG))
 +		arch_uninstall_thread_hw_breakpoint();
 +
 +	get_debugreg(dr7, 7);
 +	/* Disable breakpoints during exception handling */
 +	set_debugreg(0UL, 7);
 +	/*
 +	 * Assert that local interrupts are disabled
 +	 * Reset the DRn bits in the virtualized register value.
 +	 * The ptrace trigger routine will add in whatever is needed.
 +	 */
 +	current->thread.debugreg6 &= ~DR_TRAP_BITS;
 +	cpu = get_cpu();
 +
 +	/* Handle all the breakpoints that were triggered */
 +	for (i = 0; i < HBP_NUM; ++i) {
 +		if (likely(!(dr6 & (DR_TRAP0 << i))))
 +			continue;
 +		/*
 +		 * Find the corresponding hw_breakpoint structure and
 +		 * invoke its triggered callback.
 +		 */
 +		if (i >= hbp_kernel_pos)
 +			bp = per_cpu(this_hbp_kernel[i], cpu);
 +		else {
 +			bp = current->thread.hbp[i];
 +			if (bp)
 +				rc = NOTIFY_DONE;
 +		}
 +		/*
 +		 * Reset the 'i'th TRAP bit in dr6 to denote completion of
 +		 * exception handling
 +		 */
 +		(*dr6_p) &= ~(DR_TRAP0 << i);
 +		/*
 +		 * bp can be NULL due to lazy debug register switching
 +		 * or due to the delay between updates of hbp_kernel_pos
 +		 * and this_hbp_kernel.
 +		 */
 +		if (!bp)
 +			continue;
 +
 +		(bp->triggered)(bp, args->regs);
 +	}
 +	if (dr6 & (~DR_TRAP_BITS))
 +		rc = NOTIFY_DONE;
 +
 +	set_debugreg(dr7, 7);
- 	put_cpu_no_resched();
++	put_cpu();
 +	return rc;
 +}
 +
 +/*
 + * Handle debug exception notifications.
 + */
 +int __kprobes hw_breakpoint_exceptions_notify(
 +		struct notifier_block *unused, unsigned long val, void *data)
 +{
 +	if (val != DIE_DEBUG)
 +		return NOTIFY_DONE;
 +
 +	return hw_breakpoint_handler(data);
 +}
diff --cc arch/x86/kernel/traps.c
index 124a4d5a95b2,5f935f0d5861..286d64eba31b
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@@ -529,13 -530,15 +530,17 @@@ asmlinkage __kprobes struct pt_regs *sy
  dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
  {
  	struct task_struct *tsk = current;
 -	unsigned long condition;
 +	unsigned long dr6;
  	int si_code;
  
 -	get_debugreg(condition, 6);
 +	get_debugreg(dr6, 6);
  
+ 	/* Catch kmemcheck conditions first of all! */
 -	if (condition & DR_STEP && kmemcheck_trap(regs))
++	if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
+ 		return;
+ 
 +	/* DR6 may or may not be cleared by the CPU */
 +	set_debugreg(0, 6);
  	/*
  	 * The processor cleared BTF, so don't mark that we need it set.
  	 */
diff --cc arch/x86/power/cpu.c
index 46866a13a93a,d277ef1eea51..394cbb88987c
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@@ -8,19 -8,28 +8,29 @@@
   * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
   */
  
- #include <linux/smp.h>
  #include <linux/suspend.h>
- #include <asm/proto.h>
- #include <asm/page.h>
+ #include <linux/smp.h>
+ 
  #include <asm/pgtable.h>
+ #include <asm/proto.h>
  #include <asm/mtrr.h>
+ #include <asm/page.h>
+ #include <asm/mce.h>
  #include <asm/xcr.h>
  #include <asm/suspend.h>
 +#include <asm/debugreg.h>
  
- static void fix_processor_context(void);
+ #ifdef CONFIG_X86_32
+ static struct saved_context saved_context;
  
+ unsigned long saved_context_ebx;
+ unsigned long saved_context_esp, saved_context_ebp;
+ unsigned long saved_context_esi, saved_context_edi;
+ unsigned long saved_context_eflags;
+ #else
+ /* CONFIG_X86_64 */
  struct saved_context saved_context;
+ #endif
  
  /**
   *	__save_processor_state - save CPU registers before creating a
@@@ -69,12 -97,16 +98,17 @@@ static void __save_processor_state(stru
  	ctxt->cr0 = read_cr0();
  	ctxt->cr2 = read_cr2();
  	ctxt->cr3 = read_cr3();
+ #ifdef CONFIG_X86_32
+ 	ctxt->cr4 = read_cr4_safe();
+ #else
+ /* CONFIG_X86_64 */
  	ctxt->cr4 = read_cr4();
  	ctxt->cr8 = read_cr8();
+ #endif
 +	hw_breakpoint_disable();
  }
  
+ /* Needed by apm.c */
  void save_processor_state(void)
  {
  	__save_processor_state(&saved_context);
@@@ -88,6 -123,52 +125,32 @@@ static void do_fpu_end(void
  	kernel_fpu_end();
  }
  
+ static void fix_processor_context(void)
+ {
+ 	int cpu = smp_processor_id();
+ 	struct tss_struct *t = &per_cpu(init_tss, cpu);
+ 
+ 	set_tss_desc(cpu, t);	/*
+ 				 * This just modifies memory; should not be
+ 				 * necessary. But... This is necessary, because
+ 				 * 386 hardware has concept of busy TSS or some
+ 				 * similar stupidity.
+ 				 */
+ 
+ #ifdef CONFIG_X86_64
+ 	get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9;
+ 
+ 	syscall_init();				/* This sets MSR_*STAR and related */
+ #endif
+ 	load_TR_desc();				/* This does ltr */
+ 	load_LDT(&current->active_mm->context);	/* This does lldt */
+ 
+ 	/*
+ 	 * Now maybe reload the debug registers
+ 	 */
 -	if (current->thread.debugreg7) {
 -#ifdef CONFIG_X86_32
 -		set_debugreg(current->thread.debugreg0, 0);
 -		set_debugreg(current->thread.debugreg1, 1);
 -		set_debugreg(current->thread.debugreg2, 2);
 -		set_debugreg(current->thread.debugreg3, 3);
 -		/* no 4 and 5 */
 -		set_debugreg(current->thread.debugreg6, 6);
 -		set_debugreg(current->thread.debugreg7, 7);
 -#else
 -		/* CONFIG_X86_64 */
 -		loaddebug(&current->thread, 0);
 -		loaddebug(&current->thread, 1);
 -		loaddebug(&current->thread, 2);
 -		loaddebug(&current->thread, 3);
 -		/* no 4 and 5 */
 -		loaddebug(&current->thread, 6);
 -		loaddebug(&current->thread, 7);
 -#endif
 -	}
 -
++	load_debug_registers();
+ }
+ 
  /**
   *	__restore_processor_state - restore the contents of CPU registers saved
   *		by __save_processor_state()
diff --cc kernel/Makefile
index 18ad1110b226,9df4501cb921..f88decb1b445
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@@ -96,7 -97,7 +97,8 @@@ obj-$(CONFIG_TRACING) += trace
  obj-$(CONFIG_X86_DS) += trace/
  obj-$(CONFIG_SMP) += sched_cpupri.o
  obj-$(CONFIG_SLOW_WORK) += slow-work.o
 +obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
+ obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
  
  ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
  # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is

commit cc4949e1fdade5d063e9f8783cf0e2cc92041ce5
Merge: 28b4868820a5 300df7dc89cc
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed Jun 17 08:59:01 2009 +0200

    Merge branch 'linus' into x86/urgent
    
    Merge reason: pull in latest to fix a bug in it.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

commit 8a4a6182fd43c46ed8c12e26b4669854bcad300a
Merge: 5dfaf90f8052 6a047d8b9efc
Author: Ingo Molnar <mingo@elte.hu>
Date:   Tue Jun 16 11:51:24 2009 +0200

    Merge branch 'amd-iommu/fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/linux-2.6-iommu into x86/urgent

commit 5dfaf90f8052327c92fbe3c470a2e6634be296c0
Author: Ingo Molnar <mingo@elte.hu>
Date:   Tue Jun 16 10:23:32 2009 +0200

    x86: mm: Read cr2 before prefetching the mmap_lock
    
    Prefetch instructions can generate spurious faults on certain
    models of older CPUs. The faults themselves cannot be stopped
    and they can occur pretty much anywhere - so the way we solve
    them is that we detect certain patterns and ignore the fault.
    
    There is one small path of code where we must not take faults
    though: the #PF handler execution leading up to the reading
    of the CR2 (the faulting address). If we take a fault there
    then we destroy the CR2 value (with that of the prefetching
    instruction's) and possibly mishandle user-space or
    kernel-space pagefaults.
    
    It turns out that in current upstream we do exactly that:
    
            prefetchw(&mm->mmap_sem);
    
            /* Get the faulting address: */
            address = read_cr2();
    
    This is not good.
    
    So turn around the order: first read the cr2 then prefetch
    the lock address. Reading cr2 is plenty fast (2 cycles) so
    delaying the prefetch by this amount shouldn't be a big issue
    performance-wise.
    
    [ And this might explain a mystery fault.c warning that sometimes
      occurs on an old AMD/Sempron-based test system I have -
      which does have such prefetch problems. ]
    
    Cc: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
    Cc: Linus Torvalds <torvalds@linux-foundation.org>
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Nick Piggin <npiggin@suse.de>
    Cc: Pekka Enberg <penberg@cs.helsinki.fi>
    Cc: Vegard Nossum <vegard.nossum@gmail.com>
    Cc: Jeremy Fitzhardinge <jeremy@goop.org>
    Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
    LKML-Reference: <20090616030522.GA22162@Krystal>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index c6acc6326374..0482fa649738 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -951,11 +951,11 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	tsk = current;
 	mm = tsk->mm;
 
-	prefetchw(&mm->mmap_sem);
-
 	/* Get the faulting address: */
 	address = read_cr2();
 
+	prefetchw(&mm->mmap_sem);
+
 	if (unlikely(kmmio_fault(regs, address)))
 		return;
 

commit e2eae0f5605b90a0838608043c21050b08b6dd95
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Jun 15 16:15:19 2009 +0200

    perf report: Fix 32-bit printf format
    
    Yong Wang reported the following compiler warning:
    
     builtin-report.c: In function 'process_overflow_event':
     builtin-report.c:984: error: cast to pointer from integer of different size
    
    Which happens because we try to print ->ips[] out with a limited
    format, losing the high 32 bits. Print it out using %016Lx instead.
    
    Reported-by: Yong Wang <yong.y.wang@linux.intel.com>
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 1e2f5dde312c..f86bb07c0e84 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -982,7 +982,7 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 				chain->nr);
 
 			for (i = 0; i < chain->nr; i++)
-				dprintf("..... %2d: %p\n", i, (void *)chain->ips[i]);
+				dprintf("..... %2d: %016Lx\n", i, chain->ips[i]);
 		}
 		if (collapse_syscalls) {
 			/*

commit 3dfabc74c65904c9e6cf952391312d16ea772ef5
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Jun 15 11:24:38 2009 +0200

    perf report: Add per system call overhead histogram
    
    Take advantage of call-graph percounter sampling/recording to
    display a non-trivial histogram: the true, collapsed/summarized
    cost measurement, on a per system call total overhead basis:
    
     aldebaran:~/linux/linux/tools/perf> ./perf record -g -a -f ~/hackbench 10
     aldebaran:~/linux/linux/tools/perf> ./perf report -s symbol --syscalls | head -10
     #
     # (3536 samples)
     #
     # Overhead  Symbol
     # ........  ......
     #
         40.75%  [k] sys_write
         40.21%  [k] sys_read
          4.44%  [k] do_nmi
     ...
    
    This is done by accounting each (reliable) call-chain that chains back
    to a given system call to that system call function.
    
    [ So in the above example we can see that hackbench spends about 40% of
      its total time somewhere in sys_write() and 40% somewhere in
      sys_read(), the rest of the time is spent in user-space. The time
      is not spent in sys_write() _itself_ but in one of its many child
      functions. ]
    
    Or, a recording of a (source files are already in the page-cache) kernel build:
    
     $ perf record -g -m 512 -f -- make -j32 kernel
     $ perf report -s s --syscalls | grep '\[k\]' | grep -v nmi
    
         4.14%  [k] do_page_fault
         1.20%  [k] sys_write
         1.10%  [k] sys_open
         0.63%  [k] sys_exit_group
         0.48%  [k] smp_apic_timer_interrupt
         0.37%  [k] sys_read
         0.37%  [k] sys_execve
         0.20%  [k] sys_mmap
         0.18%  [k] sys_close
         0.14%  [k] sys_munmap
         0.13%  [k] sys_poll
         0.09%  [k] sys_newstat
         0.07%  [k] sys_clone
         0.06%  [k] sys_newfstat
         0.05%  [k] sys_access
         0.05%  [k] schedule
    
    Shows the true total cost of each syscall variant that gets used
    during a kernel build. This profile reveals it that pagefaults are
    the costliest, followed by read()/write().
    
    An interesting detail: timer interrupts cost 0.5% - or 0.5 seconds
    per 100 seconds of kernel build-time. (this was done with HZ=1000)
    
    The summary is done in 'perf report', i.e. in the post-processing
    stage - so once we have a good call-graph recording, this type of
    non-trivial high-level analysis becomes possible.
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Linus Torvalds <torvalds@linux-foundation.org>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    Cc: Pekka Enberg <penberg@cs.helsinki.fi>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index aebba5659345..1e2f5dde312c 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -40,6 +40,7 @@ static int		dump_trace = 0;
 
 static int		verbose;
 static int		full_paths;
+static int		collapse_syscalls;
 
 static unsigned long	page_size;
 static unsigned long	mmap_window = 32;
@@ -983,6 +984,15 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 			for (i = 0; i < chain->nr; i++)
 				dprintf("..... %2d: %p\n", i, (void *)chain->ips[i]);
 		}
+		if (collapse_syscalls) {
+			/*
+			 * Find the all-but-last kernel entry
+			 * amongst the call-chains - to get
+			 * to the level of system calls:
+			 */
+			if (chain->kernel >= 2)
+				ip = chain->ips[chain->kernel-2];
+		}
 	}
 
 	dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
@@ -1343,6 +1353,8 @@ static const struct option options[] = {
 		   "sort by key(s): pid, comm, dso, symbol. Default: pid,symbol"),
 	OPT_BOOLEAN('P', "full-paths", &full_paths,
 		    "Don't shorten the pathnames taking into account the cwd"),
+	OPT_BOOLEAN('S', "syscalls", &collapse_syscalls,
+		    "show per syscall summary overhead, using call graph"),
 	OPT_END()
 };
 

commit 613d8602292165f86ba1969784fea01a06d55900
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Jun 15 08:17:12 2009 +0200

    perf record: Fix fast task-exit race
    
    Recording with -a (or with -p) can race with tasks going away:
    
       couldn't open /proc/8440/maps
    
    Causing an early exit() and no recording done.
    
    Do not abort the recording session - instead just skip that task.
    
    Also, only print the warnings under -v.
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index a177a591b52c..e1dfef24887f 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -202,8 +202,12 @@ static void pid_synthesize_comm_event(pid_t pid, int full)
 
 	fd = open(filename, O_RDONLY);
 	if (fd < 0) {
-		fprintf(stderr, "couldn't open %s\n", filename);
-		exit(EXIT_FAILURE);
+		/*
+		 * We raced with a task exiting - just return:
+		 */
+		if (verbose)
+			fprintf(stderr, "couldn't open %s\n", filename);
+		return;
 	}
 	if (read(fd, bf, sizeof(bf)) < 0) {
 		fprintf(stderr, "couldn't read %s\n", filename);
@@ -273,8 +277,12 @@ static void pid_synthesize_mmap_samples(pid_t pid)
 
 	fp = fopen(filename, "r");
 	if (fp == NULL) {
-		fprintf(stderr, "couldn't open %s\n", filename);
-		exit(EXIT_FAILURE);
+		/*
+		 * We raced with a task exiting - just return:
+		 */
+		if (verbose)
+			fprintf(stderr, "couldn't open %s\n", filename);
+		return;
 	}
 	while (1) {
 		char bf[BUFSIZ], *pbf = bf;