Patches contributed by Eötvös Lorand University


commit 813400060fe2824163cabd9afed4e476e7ce282e
Merge: 1bf7b31efa0c fe955e5c793a
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed Jun 17 18:21:41 2009 +0200

    Merge branch 'x86/urgent' into x86/mce3
    
    Conflicts:
            arch/x86/kernel/cpu/mcheck/mce_intel.c
    
    Merge reason: merge with an urgent-branch MCE fix.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

commit b0a5b83ee0fce9dbf8ff5fe1f8c9ae7dfafe458c
Author: Ingo Molnar <mingo@elte.hu>
Date:   Tue Jun 16 16:11:14 2009 +0200

    dma-debug: Put all hash-chain locks into the same lock class
    
    Alan Cox reported that lockdep runs out of its stack-trace entries
    with certain configs:
    
     BUG: MAX_STACK_TRACE_ENTRIES too low
    
    This happens because there are 1024 hash buckets, each with a
    separate lock. Lockdep puts each lock into a separate lock class and
    tracks them independently.
    
    But in reality we never take more than one of the buckets, so they
    really belong into a single lock-class. Annotate the hash-bucket lock
    init accordingly.
    
    [ Impact: reduce the lockdep footprint of dma-debug ]
    
    Reported-by: Alan Cox <alan@linux.intel.com>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
    Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index a9b6b5c9e091..c9187fed0b93 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -716,7 +716,7 @@ void dma_debug_init(u32 num_entries)
 
 	for (i = 0; i < HASH_SIZE; ++i) {
 		INIT_LIST_HEAD(&dma_entry_hash[i].list);
-		dma_entry_hash[i].lock = SPIN_LOCK_UNLOCKED;
+		spin_lock_init(&dma_entry_hash[i].lock);
 	}
 
 	if (dma_debug_fs_init() != 0) {

commit a3d06cc6aa3e765dc2bf98626f87272dcf641dca
Merge: 0990b1c65729 65795efbd380
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed Jun 17 13:06:17 2009 +0200

    Merge branch 'linus' into perfcounters/core
    
    Conflicts:
            arch/x86/include/asm/kmap_types.h
            include/linux/mm.h
    
            include/asm-generic/kmap_types.h
    
    Merge reason: We crossed changes with kmap_types.h cleanups in mainline.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --cc include/asm-generic/kmap_types.h
index 58c33055c304,54e8b3d956b7..eddbce0f9fb9
--- a/include/asm-generic/kmap_types.h
+++ b/include/asm-generic/kmap_types.h
@@@ -24,7 -24,7 +24,10 @@@ D(12)	KM_SOFTIRQ1
  D(13)	KM_SYNC_ICACHE,
  D(14)	KM_SYNC_DCACHE,
  D(15)	KM_UML_USERCOPY, /* UML specific, for copy_*_user - used in do_op_one_page */
--D(16)	KM_TYPE_NR
++D(16)	KM_IRQ_PTE,
++D(17)	KM_NMI,
++D(18)	KM_NMI_PTE,
++D(19)	KM_TYPE_NR
  };
  
  #undef D
diff --cc include/linux/mm.h
index b457bc047ab1,d88d6fc530ad..cf260d848eb9
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@@ -849,25 -853,6 +853,12 @@@ extern int mprotect_fixup(struct vm_are
  			  struct vm_area_struct **pprev, unsigned long start,
  			  unsigned long end, unsigned long newflags);
  
- /*
-  * get_user_pages_fast provides equivalent functionality to get_user_pages,
-  * operating on current and current->mm (force=0 and doesn't return any vmas).
-  *
-  * get_user_pages_fast may take mmap_sem and page tables, so no assumptions
-  * can be made about locking. get_user_pages_fast is to be implemented in a
-  * way that is advantageous (vs get_user_pages()) when the user memory area is
-  * already faulted in and present in ptes. However if the pages have to be
-  * faulted in, it may turn out to be slightly slower).
-  */
- int get_user_pages_fast(unsigned long start, int nr_pages, int write,
- 			struct page **pages);
- 
 +/*
 + * doesn't attempt to fault and will return short.
 + */
 +int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
 +			  struct page **pages);
 +
  /*
   * A callback you can register to apply pressure to ageable caches.
   *

commit eadb8a091b27a840de7450f84ecff5ef13476424
Merge: 73874005cd88 65795efbd380
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed Jun 17 12:52:15 2009 +0200

    Merge branch 'linus' into tracing/hw-breakpoints
    
    Conflicts:
            arch/x86/Kconfig
            arch/x86/kernel/traps.c
            arch/x86/power/cpu.c
            arch/x86/power/cpu_32.c
            kernel/Makefile
    
    Semantic conflict:
            arch/x86/kernel/hw_breakpoint.c
    
    Merge reason: Resolve the conflicts, move from put_cpu_no_sched() to
                  put_cpu() in arch/x86/kernel/hw_breakpoint.c.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --cc arch/x86/Kconfig
index 3033375ed6bc,cf42fc305419..52421d52f21e
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@@ -46,7 -46,12 +46,13 @@@ config X8
  	select HAVE_KERNEL_GZIP
  	select HAVE_KERNEL_BZIP2
  	select HAVE_KERNEL_LZMA
 +	select HAVE_HW_BREAKPOINT
+ 	select HAVE_ARCH_KMEMCHECK
+ 
+ config OUTPUT_FORMAT
+ 	string
+ 	default "elf32-i386" if X86_32
+ 	default "elf64-x86-64" if X86_64
  
  config ARCH_DEFCONFIG
  	string
diff --cc arch/x86/include/asm/processor.h
index 448b34a8e393,c7768269b1cf..2b03f700d3f2
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@@ -428,15 -425,20 +426,19 @@@ struct thread_struct 
  	unsigned short		fsindex;
  	unsigned short		gsindex;
  #endif
+ #ifdef CONFIG_X86_32
  	unsigned long		ip;
+ #endif
+ #ifdef CONFIG_X86_64
  	unsigned long		fs;
+ #endif
  	unsigned long		gs;
  	/* Hardware debugging registers: */
 -	unsigned long		debugreg0;
 -	unsigned long		debugreg1;
 -	unsigned long		debugreg2;
 -	unsigned long		debugreg3;
 +	unsigned long		debugreg[HBP_NUM];
  	unsigned long		debugreg6;
  	unsigned long		debugreg7;
 +	/* Hardware breakpoint info */
 +	struct hw_breakpoint	*hbp[HBP_NUM];
  	/* Fault info: */
  	unsigned long		cr2;
  	unsigned long		trap_no;
diff --cc arch/x86/kernel/hw_breakpoint.c
index 69451473dbd2,000000000000..51d959528b1d
mode 100644,000000..100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@@ -1,391 -1,0 +1,391 @@@
 +/*
 + * This program is free software; you can redistribute it and/or modify
 + * it under the terms of the GNU General Public License as published by
 + * the Free Software Foundation; either version 2 of the License, or
 + * (at your option) any later version.
 + *
 + * This program is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 + * GNU General Public License for more details.
 + *
 + * You should have received a copy of the GNU General Public License
 + * along with this program; if not, write to the Free Software
 + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 + *
 + * Copyright (C) 2007 Alan Stern
 + * Copyright (C) 2009 IBM Corporation
 + */
 +
 +/*
 + * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
 + * using the CPU's debug registers.
 + */
 +
 +#include <linux/irqflags.h>
 +#include <linux/notifier.h>
 +#include <linux/kallsyms.h>
 +#include <linux/kprobes.h>
 +#include <linux/percpu.h>
 +#include <linux/kdebug.h>
 +#include <linux/kernel.h>
 +#include <linux/module.h>
 +#include <linux/sched.h>
 +#include <linux/init.h>
 +#include <linux/smp.h>
 +
 +#include <asm/hw_breakpoint.h>
 +#include <asm/processor.h>
 +#include <asm/debugreg.h>
 +
 +/* Unmasked kernel DR7 value */
 +static unsigned long kdr7;
 +
 +/*
 + * Masks for the bits corresponding to registers DR0 - DR3 in DR7 register.
 + * Used to clear and verify the status of bits corresponding to DR0 - DR3
 + */
 +static const unsigned long	dr7_masks[HBP_NUM] = {
 +	0x000f0003,	/* LEN0, R/W0, G0, L0 */
 +	0x00f0000c,	/* LEN1, R/W1, G1, L1 */
 +	0x0f000030,	/* LEN2, R/W2, G2, L2 */
 +	0xf00000c0	/* LEN3, R/W3, G3, L3 */
 +};
 +
 +
 +/*
 + * Encode the length, type, Exact, and Enable bits for a particular breakpoint
 + * as stored in debug register 7.
 + */
 +static unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
 +{
 +	unsigned long bp_info;
 +
 +	bp_info = (len | type) & 0xf;
 +	bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
 +	bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)) |
 +				DR_GLOBAL_SLOWDOWN;
 +	return bp_info;
 +}
 +
 +void arch_update_kernel_hw_breakpoint(void *unused)
 +{
 +	struct hw_breakpoint *bp;
 +	int i, cpu = get_cpu();
 +	unsigned long temp_kdr7 = 0;
 +
 +	/* Don't allow debug exceptions while we update the registers */
 +	set_debugreg(0UL, 7);
 +
 +	for (i = hbp_kernel_pos; i < HBP_NUM; i++) {
 +		per_cpu(this_hbp_kernel[i], cpu) = bp = hbp_kernel[i];
 +		if (bp) {
 +			temp_kdr7 |= encode_dr7(i, bp->info.len, bp->info.type);
 +			set_debugreg(bp->info.address, i);
 +		}
 +	}
 +
 +	/* No need to set DR6. Update the debug registers with kernel-space
 +	 * breakpoint values from kdr7 and user-space requests from the
 +	 * current process
 +	 */
 +	kdr7 = temp_kdr7;
 +	set_debugreg(kdr7 | current->thread.debugreg7, 7);
- 	put_cpu_no_resched();
++	put_cpu();
 +}
 +
 +/*
 + * Install the thread breakpoints in their debug registers.
 + */
 +void arch_install_thread_hw_breakpoint(struct task_struct *tsk)
 +{
 +	struct thread_struct *thread = &(tsk->thread);
 +
 +	switch (hbp_kernel_pos) {
 +	case 4:
 +		set_debugreg(thread->debugreg[3], 3);
 +	case 3:
 +		set_debugreg(thread->debugreg[2], 2);
 +	case 2:
 +		set_debugreg(thread->debugreg[1], 1);
 +	case 1:
 +		set_debugreg(thread->debugreg[0], 0);
 +	default:
 +		break;
 +	}
 +
 +	/* No need to set DR6 */
 +	set_debugreg((kdr7 | thread->debugreg7), 7);
 +}
 +
 +/*
 + * Install the debug register values for just the kernel, no thread.
 + */
 +void arch_uninstall_thread_hw_breakpoint()
 +{
 +	/* Clear the user-space portion of debugreg7 by setting only kdr7 */
 +	set_debugreg(kdr7, 7);
 +
 +}
 +
 +static int get_hbp_len(u8 hbp_len)
 +{
 +	unsigned int len_in_bytes = 0;
 +
 +	switch (hbp_len) {
 +	case HW_BREAKPOINT_LEN_1:
 +		len_in_bytes = 1;
 +		break;
 +	case HW_BREAKPOINT_LEN_2:
 +		len_in_bytes = 2;
 +		break;
 +	case HW_BREAKPOINT_LEN_4:
 +		len_in_bytes = 4;
 +		break;
 +#ifdef CONFIG_X86_64
 +	case HW_BREAKPOINT_LEN_8:
 +		len_in_bytes = 8;
 +		break;
 +#endif
 +	}
 +	return len_in_bytes;
 +}
 +
 +/*
 + * Check for virtual address in user space.
 + */
 +int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
 +{
 +	unsigned int len;
 +
 +	len = get_hbp_len(hbp_len);
 +
 +	return (va <= TASK_SIZE - len);
 +}
 +
 +/*
 + * Check for virtual address in kernel space.
 + */
 +int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
 +{
 +	unsigned int len;
 +
 +	len = get_hbp_len(hbp_len);
 +
 +	return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
 +}
 +
 +/*
 + * Store a breakpoint's encoded address, length, and type.
 + */
 +static int arch_store_info(struct hw_breakpoint *bp, struct task_struct *tsk)
 +{
 +	/*
 +	 * User-space requests will always have the address field populated
 +	 * Symbol names from user-space are rejected
 +	 */
 +	if (tsk && bp->info.name)
 +		return -EINVAL;
 +	/*
 +	 * For kernel-addresses, either the address or symbol name can be
 +	 * specified.
 +	 */
 +	if (bp->info.name)
 +		bp->info.address = (unsigned long)
 +					kallsyms_lookup_name(bp->info.name);
 +	if (bp->info.address)
 +		return 0;
 +	return -EINVAL;
 +}
 +
 +/*
 + * Validate the arch-specific HW Breakpoint register settings
 + */
 +int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
 +						struct task_struct *tsk)
 +{
 +	unsigned int align;
 +	int ret = -EINVAL;
 +
 +	switch (bp->info.type) {
 +	/*
 +	 * Ptrace-refactoring code
 +	 * For now, we'll allow instruction breakpoint only for user-space
 +	 * addresses
 +	 */
 +	case HW_BREAKPOINT_EXECUTE:
 +		if ((!arch_check_va_in_userspace(bp->info.address,
 +							bp->info.len)) &&
 +			bp->info.len != HW_BREAKPOINT_LEN_EXECUTE)
 +			return ret;
 +		break;
 +	case HW_BREAKPOINT_WRITE:
 +		break;
 +	case HW_BREAKPOINT_RW:
 +		break;
 +	default:
 +		return ret;
 +	}
 +
 +	switch (bp->info.len) {
 +	case HW_BREAKPOINT_LEN_1:
 +		align = 0;
 +		break;
 +	case HW_BREAKPOINT_LEN_2:
 +		align = 1;
 +		break;
 +	case HW_BREAKPOINT_LEN_4:
 +		align = 3;
 +		break;
 +#ifdef CONFIG_X86_64
 +	case HW_BREAKPOINT_LEN_8:
 +		align = 7;
 +		break;
 +#endif
 +	default:
 +		return ret;
 +	}
 +
 +	if (bp->triggered)
 +		ret = arch_store_info(bp, tsk);
 +
 +	if (ret < 0)
 +		return ret;
 +	/*
 +	 * Check that the low-order bits of the address are appropriate
 +	 * for the alignment implied by len.
 +	 */
 +	if (bp->info.address & align)
 +		return -EINVAL;
 +
 +	/* Check that the virtual address is in the proper range */
 +	if (tsk) {
 +		if (!arch_check_va_in_userspace(bp->info.address, bp->info.len))
 +			return -EFAULT;
 +	} else {
 +		if (!arch_check_va_in_kernelspace(bp->info.address,
 +								bp->info.len))
 +			return -EFAULT;
 +	}
 +	return 0;
 +}
 +
 +void arch_update_user_hw_breakpoint(int pos, struct task_struct *tsk)
 +{
 +	struct thread_struct *thread = &(tsk->thread);
 +	struct hw_breakpoint *bp = thread->hbp[pos];
 +
 +	thread->debugreg7 &= ~dr7_masks[pos];
 +	if (bp) {
 +		thread->debugreg[pos] = bp->info.address;
 +		thread->debugreg7 |= encode_dr7(pos, bp->info.len,
 +							bp->info.type);
 +	} else
 +		thread->debugreg[pos] = 0;
 +}
 +
 +void arch_flush_thread_hw_breakpoint(struct task_struct *tsk)
 +{
 +	int i;
 +	struct thread_struct *thread = &(tsk->thread);
 +
 +	thread->debugreg7 = 0;
 +	for (i = 0; i < HBP_NUM; i++)
 +		thread->debugreg[i] = 0;
 +}
 +
 +/*
 + * Handle debug exception notifications.
 + *
 + * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below.
 + *
 + * NOTIFY_DONE returned if one of the following conditions is true.
 + * i) When the causative address is from user-space and the exception
 + * is a valid one, i.e. not triggered as a result of lazy debug register
 + * switching
 + * ii) When there are more bits than trap<n> set in DR6 register (such
 + * as BD, BS or BT) indicating that more than one debug condition is
 + * met and requires some more action in do_debug().
 + *
 + * NOTIFY_STOP returned for all other cases
 + *
 + */
 +int __kprobes hw_breakpoint_handler(struct die_args *args)
 +{
 +	int i, cpu, rc = NOTIFY_STOP;
 +	struct hw_breakpoint *bp;
 +	unsigned long dr7, dr6;
 +	unsigned long *dr6_p;
 +
 +	/* The DR6 value is pointed by args->err */
 +	dr6_p = (unsigned long *)ERR_PTR(args->err);
 +	dr6 = *dr6_p;
 +
 +	/* Do an early return if no trap bits are set in DR6 */
 +	if ((dr6 & DR_TRAP_BITS) == 0)
 +		return NOTIFY_DONE;
 +
 +	/* Lazy debug register switching */
 +	if (!test_tsk_thread_flag(current, TIF_DEBUG))
 +		arch_uninstall_thread_hw_breakpoint();
 +
 +	get_debugreg(dr7, 7);
 +	/* Disable breakpoints during exception handling */
 +	set_debugreg(0UL, 7);
 +	/*
 +	 * Assert that local interrupts are disabled
 +	 * Reset the DRn bits in the virtualized register value.
 +	 * The ptrace trigger routine will add in whatever is needed.
 +	 */
 +	current->thread.debugreg6 &= ~DR_TRAP_BITS;
 +	cpu = get_cpu();
 +
 +	/* Handle all the breakpoints that were triggered */
 +	for (i = 0; i < HBP_NUM; ++i) {
 +		if (likely(!(dr6 & (DR_TRAP0 << i))))
 +			continue;
 +		/*
 +		 * Find the corresponding hw_breakpoint structure and
 +		 * invoke its triggered callback.
 +		 */
 +		if (i >= hbp_kernel_pos)
 +			bp = per_cpu(this_hbp_kernel[i], cpu);
 +		else {
 +			bp = current->thread.hbp[i];
 +			if (bp)
 +				rc = NOTIFY_DONE;
 +		}
 +		/*
 +		 * Reset the 'i'th TRAP bit in dr6 to denote completion of
 +		 * exception handling
 +		 */
 +		(*dr6_p) &= ~(DR_TRAP0 << i);
 +		/*
 +		 * bp can be NULL due to lazy debug register switching
 +		 * or due to the delay between updates of hbp_kernel_pos
 +		 * and this_hbp_kernel.
 +		 */
 +		if (!bp)
 +			continue;
 +
 +		(bp->triggered)(bp, args->regs);
 +	}
 +	if (dr6 & (~DR_TRAP_BITS))
 +		rc = NOTIFY_DONE;
 +
 +	set_debugreg(dr7, 7);
- 	put_cpu_no_resched();
++	put_cpu();
 +	return rc;
 +}
 +
 +/*
 + * Handle debug exception notifications.
 + */
 +int __kprobes hw_breakpoint_exceptions_notify(
 +		struct notifier_block *unused, unsigned long val, void *data)
 +{
 +	if (val != DIE_DEBUG)
 +		return NOTIFY_DONE;
 +
 +	return hw_breakpoint_handler(data);
 +}
diff --cc arch/x86/kernel/traps.c
index 124a4d5a95b2,5f935f0d5861..286d64eba31b
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@@ -529,13 -530,15 +530,17 @@@ asmlinkage __kprobes struct pt_regs *sy
  dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
  {
  	struct task_struct *tsk = current;
 -	unsigned long condition;
 +	unsigned long dr6;
  	int si_code;
  
 -	get_debugreg(condition, 6);
 +	get_debugreg(dr6, 6);
  
+ 	/* Catch kmemcheck conditions first of all! */
 -	if (condition & DR_STEP && kmemcheck_trap(regs))
++	if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
+ 		return;
+ 
 +	/* DR6 may or may not be cleared by the CPU */
 +	set_debugreg(0, 6);
  	/*
  	 * The processor cleared BTF, so don't mark that we need it set.
  	 */
diff --cc arch/x86/power/cpu.c
index 46866a13a93a,d277ef1eea51..394cbb88987c
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@@ -8,19 -8,28 +8,29 @@@
   * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
   */
  
- #include <linux/smp.h>
  #include <linux/suspend.h>
- #include <asm/proto.h>
- #include <asm/page.h>
+ #include <linux/smp.h>
+ 
  #include <asm/pgtable.h>
+ #include <asm/proto.h>
  #include <asm/mtrr.h>
+ #include <asm/page.h>
+ #include <asm/mce.h>
  #include <asm/xcr.h>
  #include <asm/suspend.h>
 +#include <asm/debugreg.h>
  
- static void fix_processor_context(void);
+ #ifdef CONFIG_X86_32
+ static struct saved_context saved_context;
  
+ unsigned long saved_context_ebx;
+ unsigned long saved_context_esp, saved_context_ebp;
+ unsigned long saved_context_esi, saved_context_edi;
+ unsigned long saved_context_eflags;
+ #else
+ /* CONFIG_X86_64 */
  struct saved_context saved_context;
+ #endif
  
  /**
   *	__save_processor_state - save CPU registers before creating a
@@@ -69,12 -97,16 +98,17 @@@ static void __save_processor_state(stru
  	ctxt->cr0 = read_cr0();
  	ctxt->cr2 = read_cr2();
  	ctxt->cr3 = read_cr3();
+ #ifdef CONFIG_X86_32
+ 	ctxt->cr4 = read_cr4_safe();
+ #else
+ /* CONFIG_X86_64 */
  	ctxt->cr4 = read_cr4();
  	ctxt->cr8 = read_cr8();
+ #endif
 +	hw_breakpoint_disable();
  }
  
+ /* Needed by apm.c */
  void save_processor_state(void)
  {
  	__save_processor_state(&saved_context);
@@@ -88,6 -123,52 +125,32 @@@ static void do_fpu_end(void
  	kernel_fpu_end();
  }
  
+ static void fix_processor_context(void)
+ {
+ 	int cpu = smp_processor_id();
+ 	struct tss_struct *t = &per_cpu(init_tss, cpu);
+ 
+ 	set_tss_desc(cpu, t);	/*
+ 				 * This just modifies memory; should not be
+ 				 * necessary. But... This is necessary, because
+ 				 * 386 hardware has concept of busy TSS or some
+ 				 * similar stupidity.
+ 				 */
+ 
+ #ifdef CONFIG_X86_64
+ 	get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9;
+ 
+ 	syscall_init();				/* This sets MSR_*STAR and related */
+ #endif
+ 	load_TR_desc();				/* This does ltr */
+ 	load_LDT(&current->active_mm->context);	/* This does lldt */
+ 
+ 	/*
+ 	 * Now maybe reload the debug registers
+ 	 */
 -	if (current->thread.debugreg7) {
 -#ifdef CONFIG_X86_32
 -		set_debugreg(current->thread.debugreg0, 0);
 -		set_debugreg(current->thread.debugreg1, 1);
 -		set_debugreg(current->thread.debugreg2, 2);
 -		set_debugreg(current->thread.debugreg3, 3);
 -		/* no 4 and 5 */
 -		set_debugreg(current->thread.debugreg6, 6);
 -		set_debugreg(current->thread.debugreg7, 7);
 -#else
 -		/* CONFIG_X86_64 */
 -		loaddebug(&current->thread, 0);
 -		loaddebug(&current->thread, 1);
 -		loaddebug(&current->thread, 2);
 -		loaddebug(&current->thread, 3);
 -		/* no 4 and 5 */
 -		loaddebug(&current->thread, 6);
 -		loaddebug(&current->thread, 7);
 -#endif
 -	}
 -
++	load_debug_registers();
+ }
+ 
  /**
   *	__restore_processor_state - restore the contents of CPU registers saved
   *		by __save_processor_state()
diff --cc kernel/Makefile
index 18ad1110b226,9df4501cb921..f88decb1b445
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@@ -96,7 -97,7 +97,8 @@@ obj-$(CONFIG_TRACING) += trace
  obj-$(CONFIG_X86_DS) += trace/
  obj-$(CONFIG_SMP) += sched_cpupri.o
  obj-$(CONFIG_SLOW_WORK) += slow-work.o
 +obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
+ obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
  
  ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
  # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is

commit cc4949e1fdade5d063e9f8783cf0e2cc92041ce5
Merge: 28b4868820a5 300df7dc89cc
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed Jun 17 08:59:01 2009 +0200

    Merge branch 'linus' into x86/urgent
    
    Merge reason: pull in latest to fix a bug in it.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

commit 8a4a6182fd43c46ed8c12e26b4669854bcad300a
Merge: 5dfaf90f8052 6a047d8b9efc
Author: Ingo Molnar <mingo@elte.hu>
Date:   Tue Jun 16 11:51:24 2009 +0200

    Merge branch 'amd-iommu/fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/linux-2.6-iommu into x86/urgent

commit 5dfaf90f8052327c92fbe3c470a2e6634be296c0
Author: Ingo Molnar <mingo@elte.hu>
Date:   Tue Jun 16 10:23:32 2009 +0200

    x86: mm: Read cr2 before prefetching the mmap_lock
    
    Prefetch instructions can generate spurious faults on certain
    models of older CPUs. The faults themselves cannot be stopped
    and they can occur pretty much anywhere - so the way we solve
    them is that we detect certain patterns and ignore the fault.
    
    There is one small path of code where we must not take faults
    though: the #PF handler execution leading up to the reading
    of the CR2 (the faulting address). If we take a fault there
    then we destroy the CR2 value (with that of the prefetching
    instruction's) and possibly mishandle user-space or
    kernel-space pagefaults.
    
    It turns out that in current upstream we do exactly that:
    
            prefetchw(&mm->mmap_sem);
    
            /* Get the faulting address: */
            address = read_cr2();
    
    This is not good.
    
    So turn around the order: first read the cr2 then prefetch
    the lock address. Reading cr2 is plenty fast (2 cycles) so
    delaying the prefetch by this amount shouldn't be a big issue
    performance-wise.
    
    [ And this might explain a mystery fault.c warning that sometimes
      occurs on an old AMD/Sempron-based test system I have -
      which does have such prefetch problems. ]
    
    Cc: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
    Cc: Linus Torvalds <torvalds@linux-foundation.org>
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Nick Piggin <npiggin@suse.de>
    Cc: Pekka Enberg <penberg@cs.helsinki.fi>
    Cc: Vegard Nossum <vegard.nossum@gmail.com>
    Cc: Jeremy Fitzhardinge <jeremy@goop.org>
    Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
    LKML-Reference: <20090616030522.GA22162@Krystal>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index c6acc6326374..0482fa649738 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -951,11 +951,11 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	tsk = current;
 	mm = tsk->mm;
 
-	prefetchw(&mm->mmap_sem);
-
 	/* Get the faulting address: */
 	address = read_cr2();
 
+	prefetchw(&mm->mmap_sem);
+
 	if (unlikely(kmmio_fault(regs, address)))
 		return;
 

commit e2eae0f5605b90a0838608043c21050b08b6dd95
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Jun 15 16:15:19 2009 +0200

    perf report: Fix 32-bit printf format
    
    Yong Wang reported the following compiler warning:
    
     builtin-report.c: In function 'process_overflow_event':
     builtin-report.c:984: error: cast to pointer from integer of different size
    
    Which happens because we try to print ->ips[] out with a limited
    format, losing the high 32 bits. Print it out using %016Lx instead.
    
    Reported-by: Yong Wang <yong.y.wang@linux.intel.com>
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 1e2f5dde312c..f86bb07c0e84 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -982,7 +982,7 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 				chain->nr);
 
 			for (i = 0; i < chain->nr; i++)
-				dprintf("..... %2d: %p\n", i, (void *)chain->ips[i]);
+				dprintf("..... %2d: %016Lx\n", i, chain->ips[i]);
 		}
 		if (collapse_syscalls) {
 			/*

commit 3dfabc74c65904c9e6cf952391312d16ea772ef5
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Jun 15 11:24:38 2009 +0200

    perf report: Add per system call overhead histogram
    
    Take advantage of call-graph percounter sampling/recording to
    display a non-trivial histogram: the true, collapsed/summarized
    cost measurement, on a per system call total overhead basis:
    
     aldebaran:~/linux/linux/tools/perf> ./perf record -g -a -f ~/hackbench 10
     aldebaran:~/linux/linux/tools/perf> ./perf report -s symbol --syscalls | head -10
     #
     # (3536 samples)
     #
     # Overhead  Symbol
     # ........  ......
     #
         40.75%  [k] sys_write
         40.21%  [k] sys_read
          4.44%  [k] do_nmi
     ...
    
    This is done by accounting each (reliable) call-chain that chains back
    to a given system call to that system call function.
    
    [ So in the above example we can see that hackbench spends about 40% of
      its total time somewhere in sys_write() and 40% somewhere in
      sys_read(), the rest of the time is spent in user-space. The time
      is not spent in sys_write() _itself_ but in one of its many child
      functions. ]
    
    Or, a recording of a (source files are already in the page-cache) kernel build:
    
     $ perf record -g -m 512 -f -- make -j32 kernel
     $ perf report -s s --syscalls | grep '\[k\]' | grep -v nmi
    
         4.14%  [k] do_page_fault
         1.20%  [k] sys_write
         1.10%  [k] sys_open
         0.63%  [k] sys_exit_group
         0.48%  [k] smp_apic_timer_interrupt
         0.37%  [k] sys_read
         0.37%  [k] sys_execve
         0.20%  [k] sys_mmap
         0.18%  [k] sys_close
         0.14%  [k] sys_munmap
         0.13%  [k] sys_poll
         0.09%  [k] sys_newstat
         0.07%  [k] sys_clone
         0.06%  [k] sys_newfstat
         0.05%  [k] sys_access
         0.05%  [k] schedule
    
    Shows the true total cost of each syscall variant that gets used
    during a kernel build. This profile reveals it that pagefaults are
    the costliest, followed by read()/write().
    
    An interesting detail: timer interrupts cost 0.5% - or 0.5 seconds
    per 100 seconds of kernel build-time. (this was done with HZ=1000)
    
    The summary is done in 'perf report', i.e. in the post-processing
    stage - so once we have a good call-graph recording, this type of
    non-trivial high-level analysis becomes possible.
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    Cc: Linus Torvalds <torvalds@linux-foundation.org>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    Cc: Pekka Enberg <penberg@cs.helsinki.fi>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index aebba5659345..1e2f5dde312c 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -40,6 +40,7 @@ static int		dump_trace = 0;
 
 static int		verbose;
 static int		full_paths;
+static int		collapse_syscalls;
 
 static unsigned long	page_size;
 static unsigned long	mmap_window = 32;
@@ -983,6 +984,15 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 			for (i = 0; i < chain->nr; i++)
 				dprintf("..... %2d: %p\n", i, (void *)chain->ips[i]);
 		}
+		if (collapse_syscalls) {
+			/*
+			 * Find the all-but-last kernel entry
+			 * amongst the call-chains - to get
+			 * to the level of system calls:
+			 */
+			if (chain->kernel >= 2)
+				ip = chain->ips[chain->kernel-2];
+		}
 	}
 
 	dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
@@ -1343,6 +1353,8 @@ static const struct option options[] = {
 		   "sort by key(s): pid, comm, dso, symbol. Default: pid,symbol"),
 	OPT_BOOLEAN('P', "full-paths", &full_paths,
 		    "Don't shorten the pathnames taking into account the cwd"),
+	OPT_BOOLEAN('S', "syscalls", &collapse_syscalls,
+		    "show per syscall summary overhead, using call graph"),
 	OPT_END()
 };
 

commit 613d8602292165f86ba1969784fea01a06d55900
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Jun 15 08:17:12 2009 +0200

    perf record: Fix fast task-exit race
    
    Recording with -a (or with -p) can race with tasks going away:
    
       couldn't open /proc/8440/maps
    
    Causing an early exit() and no recording done.
    
    Do not abort the recording session - instead just skip that task.
    
    Also, only print the warnings under -v.
    
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Galbraith <efault@gmx.de>
    Cc: Paul Mackerras <paulus@samba.org>
    Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
    LKML-Reference: <new-submission>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index a177a591b52c..e1dfef24887f 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -202,8 +202,12 @@ static void pid_synthesize_comm_event(pid_t pid, int full)
 
 	fd = open(filename, O_RDONLY);
 	if (fd < 0) {
-		fprintf(stderr, "couldn't open %s\n", filename);
-		exit(EXIT_FAILURE);
+		/*
+		 * We raced with a task exiting - just return:
+		 */
+		if (verbose)
+			fprintf(stderr, "couldn't open %s\n", filename);
+		return;
 	}
 	if (read(fd, bf, sizeof(bf)) < 0) {
 		fprintf(stderr, "couldn't read %s\n", filename);
@@ -273,8 +277,12 @@ static void pid_synthesize_mmap_samples(pid_t pid)
 
 	fp = fopen(filename, "r");
 	if (fp == NULL) {
-		fprintf(stderr, "couldn't open %s\n", filename);
-		exit(EXIT_FAILURE);
+		/*
+		 * We raced with a task exiting - just return:
+		 */
+		if (verbose)
+			fprintf(stderr, "couldn't open %s\n", filename);
+		return;
 	}
 	while (1) {
 		char bf[BUFSIZ], *pbf = bf;