Patches contributed by Eötvös Loránd University
commit 813400060fe2824163cabd9afed4e476e7ce282e
Merge: 1bf7b31efa0c fe955e5c793a
Author: Ingo Molnar <mingo@elte.hu>
Date: Wed Jun 17 18:21:41 2009 +0200
Merge branch 'x86/urgent' into x86/mce3
Conflicts:
arch/x86/kernel/cpu/mcheck/mce_intel.c
Merge reason: merge with an urgent-branch MCE fix.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
commit b0a5b83ee0fce9dbf8ff5fe1f8c9ae7dfafe458c
Author: Ingo Molnar <mingo@elte.hu>
Date: Tue Jun 16 16:11:14 2009 +0200
dma-debug: Put all hash-chain locks into the same lock class
Alan Cox reported that lockdep runs out of its stack-trace entries
with certain configs:
BUG: MAX_STACK_TRACE_ENTRIES too low
This happens because there are 1024 hash buckets, each with a
separate lock. Lockdep puts each lock into a separate lock class and
tracks them independently.
But in reality we never take more than one of the buckets, so they
really belong in a single lock class. Annotate the hash-bucket lock
initialization accordingly.
[ Impact: reduce the lockdep footprint of dma-debug ]
Reported-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index a9b6b5c9e091..c9187fed0b93 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -716,7 +716,7 @@ void dma_debug_init(u32 num_entries)
for (i = 0; i < HASH_SIZE; ++i) {
INIT_LIST_HEAD(&dma_entry_hash[i].list);
- dma_entry_hash[i].lock = SPIN_LOCK_UNLOCKED;
+ spin_lock_init(&dma_entry_hash[i].lock);
}
if (dma_debug_fs_init() != 0) {
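[ A note on why this works: lockdep keys a lock's class by the static
  key that the spin_lock_init() macro instantiates at its call site,
  so all locks initialized from the same line of code share one
  class. A minimal sketch of the pattern, with the hash_bucket layout
  abbreviated:

      static struct hash_bucket {
          struct list_head list;
          spinlock_t lock;
      } dma_entry_hash[HASH_SIZE];

      static void hash_init(void)
      {
          int i;

          for (i = 0; i < HASH_SIZE; ++i) {
              INIT_LIST_HEAD(&dma_entry_hash[i].list);
              /*
               * One call site => one lock class: lockdep now
               * tracks a single class instead of 1024 of them,
               * keeping its stack-trace table small.
               */
              spin_lock_init(&dma_entry_hash[i].lock);
          }
      }
  ]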
commit a3d06cc6aa3e765dc2bf98626f87272dcf641dca
Merge: 0990b1c65729 65795efbd380
Author: Ingo Molnar <mingo@elte.hu>
Date: Wed Jun 17 13:06:17 2009 +0200
Merge branch 'linus' into perfcounters/core
Conflicts:
arch/x86/include/asm/kmap_types.h
include/linux/mm.h
include/asm-generic/kmap_types.h
Merge reason: We crossed changes with kmap_types.h cleanups in mainline.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --cc include/asm-generic/kmap_types.h
index 58c33055c304,54e8b3d956b7..eddbce0f9fb9
--- a/include/asm-generic/kmap_types.h
+++ b/include/asm-generic/kmap_types.h
@@@ -24,7 -24,7 +24,10 @@@ D(12) KM_SOFTIRQ1
D(13) KM_SYNC_ICACHE,
D(14) KM_SYNC_DCACHE,
D(15) KM_UML_USERCOPY, /* UML specific, for copy_*_user - used in do_op_one_page */
--D(16) KM_TYPE_NR
++D(16) KM_IRQ_PTE,
++D(17) KM_NMI,
++D(18) KM_NMI_PTE,
++D(19) KM_TYPE_NR
};
#undef D
diff --cc include/linux/mm.h
index b457bc047ab1,d88d6fc530ad..cf260d848eb9
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@@ -849,25 -853,6 +853,12 @@@ extern int mprotect_fixup(struct vm_are
struct vm_area_struct **pprev, unsigned long start,
unsigned long end, unsigned long newflags);
- /*
- * get_user_pages_fast provides equivalent functionality to get_user_pages,
- * operating on current and current->mm (force=0 and doesn't return any vmas).
- *
- * get_user_pages_fast may take mmap_sem and page tables, so no assumptions
- * can be made about locking. get_user_pages_fast is to be implemented in a
- * way that is advantageous (vs get_user_pages()) when the user memory area is
- * already faulted in and present in ptes. However if the pages have to be
- * faulted in, it may turn out to be slightly slower).
- */
- int get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages);
-
+/*
+ * doesn't attempt to fault and will return short.
+ */
+int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
+ struct page **pages);
+
/*
* A callback you can register to apply pressure to ageable caches.
*
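[ Context for the hunk above: __get_user_pages_fast() never faults
  pages in, so it is safe in contexts that cannot sleep (e.g. NMI
  call-chain walks in perfcounters) and may return fewer pages than
  requested. A usage sketch against the prototypes shown above
  (pin_resident_page() is a hypothetical helper):

      #include <linux/mm.h>

      /* Try to pin one already-resident user page, never sleeping. */
      static struct page *pin_resident_page(unsigned long uaddr, int write)
      {
          struct page *page;

          /* Returns the number of pages pinned; 0 if not present. */
          if (__get_user_pages_fast(uaddr, 1, write, &page) != 1)
              return NULL;

          return page;
      }
  ]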
commit eadb8a091b27a840de7450f84ecff5ef13476424
Merge: 73874005cd88 65795efbd380
Author: Ingo Molnar <mingo@elte.hu>
Date: Wed Jun 17 12:52:15 2009 +0200
Merge branch 'linus' into tracing/hw-breakpoints
Conflicts:
arch/x86/Kconfig
arch/x86/kernel/traps.c
arch/x86/power/cpu.c
arch/x86/power/cpu_32.c
kernel/Makefile
Semantic conflict:
arch/x86/kernel/hw_breakpoint.c
Merge reason: Resolve the conflicts, move from put_cpu_no_resched() to
put_cpu() in arch/x86/kernel/hw_breakpoint.c.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --cc arch/x86/Kconfig
index 3033375ed6bc,cf42fc305419..52421d52f21e
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@@ -46,7 -46,12 +46,13 @@@ config X8
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_BZIP2
select HAVE_KERNEL_LZMA
+ select HAVE_HW_BREAKPOINT
+ select HAVE_ARCH_KMEMCHECK
+
+ config OUTPUT_FORMAT
+ string
+ default "elf32-i386" if X86_32
+ default "elf64-x86-64" if X86_64
config ARCH_DEFCONFIG
string
diff --cc arch/x86/include/asm/processor.h
index 448b34a8e393,c7768269b1cf..2b03f700d3f2
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@@ -428,15 -425,20 +426,19 @@@ struct thread_struct
unsigned short fsindex;
unsigned short gsindex;
#endif
+ #ifdef CONFIG_X86_32
unsigned long ip;
+ #endif
+ #ifdef CONFIG_X86_64
unsigned long fs;
+ #endif
unsigned long gs;
/* Hardware debugging registers: */
- unsigned long debugreg0;
- unsigned long debugreg1;
- unsigned long debugreg2;
- unsigned long debugreg3;
+ unsigned long debugreg[HBP_NUM];
unsigned long debugreg6;
unsigned long debugreg7;
+ /* Hardware breakpoint info */
+ struct hw_breakpoint *hbp[HBP_NUM];
/* Fault info: */
unsigned long cr2;
unsigned long trap_no;
diff --cc arch/x86/kernel/hw_breakpoint.c
index 69451473dbd2,000000000000..51d959528b1d
mode 100644,000000..100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@@ -1,391 -1,0 +1,391 @@@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) 2009 IBM Corporation
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ */
+
+#include <linux/irqflags.h>
+#include <linux/notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/percpu.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+#include <asm/debugreg.h>
+
+/* Unmasked kernel DR7 value */
+static unsigned long kdr7;
+
+/*
+ * Masks for the bits corresponding to registers DR0 - DR3 in DR7 register.
+ * Used to clear and verify the status of bits corresponding to DR0 - DR3
+ */
+static const unsigned long dr7_masks[HBP_NUM] = {
+ 0x000f0003, /* LEN0, R/W0, G0, L0 */
+ 0x00f0000c, /* LEN1, R/W1, G1, L1 */
+ 0x0f000030, /* LEN2, R/W2, G2, L2 */
+ 0xf00000c0 /* LEN3, R/W3, G3, L3 */
+};
+
+
+/*
+ * Encode the length, type, Exact, and Enable bits for a particular breakpoint
+ * as stored in debug register 7.
+ */
+static unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
+{
+ unsigned long bp_info;
+
+ bp_info = (len | type) & 0xf;
+ bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
+ bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)) |
+ DR_GLOBAL_SLOWDOWN;
+ return bp_info;
+}
+
+void arch_update_kernel_hw_breakpoint(void *unused)
+{
+ struct hw_breakpoint *bp;
+ int i, cpu = get_cpu();
+ unsigned long temp_kdr7 = 0;
+
+ /* Don't allow debug exceptions while we update the registers */
+ set_debugreg(0UL, 7);
+
+ for (i = hbp_kernel_pos; i < HBP_NUM; i++) {
+ per_cpu(this_hbp_kernel[i], cpu) = bp = hbp_kernel[i];
+ if (bp) {
+ temp_kdr7 |= encode_dr7(i, bp->info.len, bp->info.type);
+ set_debugreg(bp->info.address, i);
+ }
+ }
+
+ /* No need to set DR6. Update the debug registers with kernel-space
+ * breakpoint values from kdr7 and user-space requests from the
+ * current process
+ */
+ kdr7 = temp_kdr7;
+ set_debugreg(kdr7 | current->thread.debugreg7, 7);
- put_cpu_no_resched();
++ put_cpu();
+}
+
+/*
+ * Install the thread breakpoints in their debug registers.
+ */
+void arch_install_thread_hw_breakpoint(struct task_struct *tsk)
+{
+ struct thread_struct *thread = &(tsk->thread);
+
+ switch (hbp_kernel_pos) {
+ case 4:
+ set_debugreg(thread->debugreg[3], 3);
+ case 3:
+ set_debugreg(thread->debugreg[2], 2);
+ case 2:
+ set_debugreg(thread->debugreg[1], 1);
+ case 1:
+ set_debugreg(thread->debugreg[0], 0);
+ default:
+ break;
+ }
+
+ /* No need to set DR6 */
+ set_debugreg((kdr7 | thread->debugreg7), 7);
+}
+
+/*
+ * Install the debug register values for just the kernel, no thread.
+ */
+void arch_uninstall_thread_hw_breakpoint()
+{
+ /* Clear the user-space portion of debugreg7 by setting only kdr7 */
+ set_debugreg(kdr7, 7);
+
+}
+
+static int get_hbp_len(u8 hbp_len)
+{
+ unsigned int len_in_bytes = 0;
+
+ switch (hbp_len) {
+ case HW_BREAKPOINT_LEN_1:
+ len_in_bytes = 1;
+ break;
+ case HW_BREAKPOINT_LEN_2:
+ len_in_bytes = 2;
+ break;
+ case HW_BREAKPOINT_LEN_4:
+ len_in_bytes = 4;
+ break;
+#ifdef CONFIG_X86_64
+ case HW_BREAKPOINT_LEN_8:
+ len_in_bytes = 8;
+ break;
+#endif
+ }
+ return len_in_bytes;
+}
+
+/*
+ * Check for virtual address in user space.
+ */
+int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
+{
+ unsigned int len;
+
+ len = get_hbp_len(hbp_len);
+
+ return (va <= TASK_SIZE - len);
+}
+
+/*
+ * Check for virtual address in kernel space.
+ */
+int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
+{
+ unsigned int len;
+
+ len = get_hbp_len(hbp_len);
+
+ return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
+}
+
+/*
+ * Store a breakpoint's encoded address, length, and type.
+ */
+static int arch_store_info(struct hw_breakpoint *bp, struct task_struct *tsk)
+{
+ /*
+ * User-space requests will always have the address field populated
+ * Symbol names from user-space are rejected
+ */
+ if (tsk && bp->info.name)
+ return -EINVAL;
+ /*
+ * For kernel-addresses, either the address or symbol name can be
+ * specified.
+ */
+ if (bp->info.name)
+ bp->info.address = (unsigned long)
+ kallsyms_lookup_name(bp->info.name);
+ if (bp->info.address)
+ return 0;
+ return -EINVAL;
+}
+
+/*
+ * Validate the arch-specific HW Breakpoint register settings
+ */
+int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
+ struct task_struct *tsk)
+{
+ unsigned int align;
+ int ret = -EINVAL;
+
+ switch (bp->info.type) {
+ /*
+ * Ptrace-refactoring code
+ * For now, we'll allow instruction breakpoint only for user-space
+ * addresses
+ */
+ case HW_BREAKPOINT_EXECUTE:
+ if ((!arch_check_va_in_userspace(bp->info.address,
+ bp->info.len)) &&
+ bp->info.len != HW_BREAKPOINT_LEN_EXECUTE)
+ return ret;
+ break;
+ case HW_BREAKPOINT_WRITE:
+ break;
+ case HW_BREAKPOINT_RW:
+ break;
+ default:
+ return ret;
+ }
+
+ switch (bp->info.len) {
+ case HW_BREAKPOINT_LEN_1:
+ align = 0;
+ break;
+ case HW_BREAKPOINT_LEN_2:
+ align = 1;
+ break;
+ case HW_BREAKPOINT_LEN_4:
+ align = 3;
+ break;
+#ifdef CONFIG_X86_64
+ case HW_BREAKPOINT_LEN_8:
+ align = 7;
+ break;
+#endif
+ default:
+ return ret;
+ }
+
+ if (bp->triggered)
+ ret = arch_store_info(bp, tsk);
+
+ if (ret < 0)
+ return ret;
+ /*
+ * Check that the low-order bits of the address are appropriate
+ * for the alignment implied by len.
+ */
+ if (bp->info.address & align)
+ return -EINVAL;
+
+ /* Check that the virtual address is in the proper range */
+ if (tsk) {
+ if (!arch_check_va_in_userspace(bp->info.address, bp->info.len))
+ return -EFAULT;
+ } else {
+ if (!arch_check_va_in_kernelspace(bp->info.address,
+ bp->info.len))
+ return -EFAULT;
+ }
+ return 0;
+}
+
+void arch_update_user_hw_breakpoint(int pos, struct task_struct *tsk)
+{
+ struct thread_struct *thread = &(tsk->thread);
+ struct hw_breakpoint *bp = thread->hbp[pos];
+
+ thread->debugreg7 &= ~dr7_masks[pos];
+ if (bp) {
+ thread->debugreg[pos] = bp->info.address;
+ thread->debugreg7 |= encode_dr7(pos, bp->info.len,
+ bp->info.type);
+ } else
+ thread->debugreg[pos] = 0;
+}
+
+void arch_flush_thread_hw_breakpoint(struct task_struct *tsk)
+{
+ int i;
+ struct thread_struct *thread = &(tsk->thread);
+
+ thread->debugreg7 = 0;
+ for (i = 0; i < HBP_NUM; i++)
+ thread->debugreg[i] = 0;
+}
+
+/*
+ * Handle debug exception notifications.
+ *
+ * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below.
+ *
+ * NOTIFY_DONE returned if one of the following conditions is true.
+ * i) When the causative address is from user-space and the exception
+ * is a valid one, i.e. not triggered as a result of lazy debug register
+ * switching
+ * ii) When there are more bits than trap<n> set in DR6 register (such
+ * as BD, BS or BT) indicating that more than one debug condition is
+ * met and requires some more action in do_debug().
+ *
+ * NOTIFY_STOP returned for all other cases
+ *
+ */
+int __kprobes hw_breakpoint_handler(struct die_args *args)
+{
+ int i, cpu, rc = NOTIFY_STOP;
+ struct hw_breakpoint *bp;
+ unsigned long dr7, dr6;
+ unsigned long *dr6_p;
+
+ /* The DR6 value is pointed by args->err */
+ dr6_p = (unsigned long *)ERR_PTR(args->err);
+ dr6 = *dr6_p;
+
+ /* Do an early return if no trap bits are set in DR6 */
+ if ((dr6 & DR_TRAP_BITS) == 0)
+ return NOTIFY_DONE;
+
+ /* Lazy debug register switching */
+ if (!test_tsk_thread_flag(current, TIF_DEBUG))
+ arch_uninstall_thread_hw_breakpoint();
+
+ get_debugreg(dr7, 7);
+ /* Disable breakpoints during exception handling */
+ set_debugreg(0UL, 7);
+ /*
+ * Assert that local interrupts are disabled
+ * Reset the DRn bits in the virtualized register value.
+ * The ptrace trigger routine will add in whatever is needed.
+ */
+ current->thread.debugreg6 &= ~DR_TRAP_BITS;
+ cpu = get_cpu();
+
+ /* Handle all the breakpoints that were triggered */
+ for (i = 0; i < HBP_NUM; ++i) {
+ if (likely(!(dr6 & (DR_TRAP0 << i))))
+ continue;
+ /*
+ * Find the corresponding hw_breakpoint structure and
+ * invoke its triggered callback.
+ */
+ if (i >= hbp_kernel_pos)
+ bp = per_cpu(this_hbp_kernel[i], cpu);
+ else {
+ bp = current->thread.hbp[i];
+ if (bp)
+ rc = NOTIFY_DONE;
+ }
+ /*
+ * Reset the 'i'th TRAP bit in dr6 to denote completion of
+ * exception handling
+ */
+ (*dr6_p) &= ~(DR_TRAP0 << i);
+ /*
+ * bp can be NULL due to lazy debug register switching
+ * or due to the delay between updates of hbp_kernel_pos
+ * and this_hbp_kernel.
+ */
+ if (!bp)
+ continue;
+
+ (bp->triggered)(bp, args->regs);
+ }
+ if (dr6 & (~DR_TRAP_BITS))
+ rc = NOTIFY_DONE;
+
+ set_debugreg(dr7, 7);
- put_cpu_no_resched();
++ put_cpu();
+ return rc;
+}
+
+/*
+ * Handle debug exception notifications.
+ */
+int __kprobes hw_breakpoint_exceptions_notify(
+ struct notifier_block *unused, unsigned long val, void *data)
+{
+ if (val != DIE_DEBUG)
+ return NOTIFY_DONE;
+
+ return hw_breakpoint_handler(data);
+}
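[ The DR7 layout that encode_dr7() relies on is architectural: each
  breakpoint register DRn owns a type/length nibble at bit 16 + 4*n
  and a global-enable bit at 1 + 2*n, plus the shared GE "slowdown"
  bit 9. A stand-alone worked example for a 4-byte write breakpoint
  in DR0, using raw bit positions instead of the DR_* macros:

      #include <stdio.h>

      int main(void)
      {
          unsigned long dr7 = 0;
          unsigned int drnum = 0;

          /* R/W0 = 01b (break on data writes), LEN0 = 11b (4 bytes). */
          dr7 |= (0x1UL | (0x3UL << 2)) << (16 + drnum * 4);

          /* G0 (global enable) at bit 1 + 2*drnum, plus GE (bit 9). */
          dr7 |= (0x2UL << (drnum * 2)) | 0x200UL;

          printf("dr7 = %#lx\n", dr7);    /* prints dr7 = 0xd0202 */

          return 0;
      }
  ]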
diff --cc arch/x86/kernel/traps.c
index 124a4d5a95b2,5f935f0d5861..286d64eba31b
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@@ -529,13 -530,15 +530,17 @@@ asmlinkage __kprobes struct pt_regs *sy
dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
{
struct task_struct *tsk = current;
- unsigned long condition;
+ unsigned long dr6;
int si_code;
- get_debugreg(condition, 6);
+ get_debugreg(dr6, 6);
+ /* Catch kmemcheck conditions first of all! */
- if (condition & DR_STEP && kmemcheck_trap(regs))
++ if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
+ return;
+
+ /* DR6 may or may not be cleared by the CPU */
+ set_debugreg(0, 6);
/*
* The processor cleared BTF, so don't mark that we need it set.
*/
diff --cc arch/x86/power/cpu.c
index 46866a13a93a,d277ef1eea51..394cbb88987c
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@@ -8,19 -8,28 +8,29 @@@
* Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
*/
- #include <linux/smp.h>
#include <linux/suspend.h>
- #include <asm/proto.h>
- #include <asm/page.h>
+ #include <linux/smp.h>
+
#include <asm/pgtable.h>
+ #include <asm/proto.h>
#include <asm/mtrr.h>
+ #include <asm/page.h>
+ #include <asm/mce.h>
#include <asm/xcr.h>
#include <asm/suspend.h>
+#include <asm/debugreg.h>
- static void fix_processor_context(void);
+ #ifdef CONFIG_X86_32
+ static struct saved_context saved_context;
+ unsigned long saved_context_ebx;
+ unsigned long saved_context_esp, saved_context_ebp;
+ unsigned long saved_context_esi, saved_context_edi;
+ unsigned long saved_context_eflags;
+ #else
+ /* CONFIG_X86_64 */
struct saved_context saved_context;
+ #endif
/**
* __save_processor_state - save CPU registers before creating a
@@@ -69,12 -97,16 +98,17 @@@ static void __save_processor_state(stru
ctxt->cr0 = read_cr0();
ctxt->cr2 = read_cr2();
ctxt->cr3 = read_cr3();
+ #ifdef CONFIG_X86_32
+ ctxt->cr4 = read_cr4_safe();
+ #else
+ /* CONFIG_X86_64 */
ctxt->cr4 = read_cr4();
ctxt->cr8 = read_cr8();
+ #endif
+ hw_breakpoint_disable();
}
+ /* Needed by apm.c */
void save_processor_state(void)
{
__save_processor_state(&saved_context);
@@@ -88,6 -123,52 +125,32 @@@ static void do_fpu_end(void
kernel_fpu_end();
}
+ static void fix_processor_context(void)
+ {
+ int cpu = smp_processor_id();
+ struct tss_struct *t = &per_cpu(init_tss, cpu);
+
+ set_tss_desc(cpu, t); /*
+ * This just modifies memory; should not be
+ * necessary. But... This is necessary, because
+ * 386 hardware has concept of busy TSS or some
+ * similar stupidity.
+ */
+
+ #ifdef CONFIG_X86_64
+ get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9;
+
+ syscall_init(); /* This sets MSR_*STAR and related */
+ #endif
+ load_TR_desc(); /* This does ltr */
+ load_LDT(&current->active_mm->context); /* This does lldt */
+
+ /*
+ * Now maybe reload the debug registers
+ */
- if (current->thread.debugreg7) {
-#ifdef CONFIG_X86_32
- set_debugreg(current->thread.debugreg0, 0);
- set_debugreg(current->thread.debugreg1, 1);
- set_debugreg(current->thread.debugreg2, 2);
- set_debugreg(current->thread.debugreg3, 3);
- /* no 4 and 5 */
- set_debugreg(current->thread.debugreg6, 6);
- set_debugreg(current->thread.debugreg7, 7);
-#else
- /* CONFIG_X86_64 */
- loaddebug(&current->thread, 0);
- loaddebug(&current->thread, 1);
- loaddebug(&current->thread, 2);
- loaddebug(&current->thread, 3);
- /* no 4 and 5 */
- loaddebug(&current->thread, 6);
- loaddebug(&current->thread, 7);
-#endif
- }
-
++ load_debug_registers();
+ }
+
/**
* __restore_processor_state - restore the contents of CPU registers saved
* by __save_processor_state()
diff --cc kernel/Makefile
index 18ad1110b226,9df4501cb921..f88decb1b445
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@@ -96,7 -97,7 +97,8 @@@ obj-$(CONFIG_TRACING) += trace
obj-$(CONFIG_X86_DS) += trace/
obj-$(CONFIG_SMP) += sched_cpupri.o
obj-$(CONFIG_SLOW_WORK) += slow-work.o
+obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
+ obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
commit cc4949e1fdade5d063e9f8783cf0e2cc92041ce5
Merge: 28b4868820a5 300df7dc89cc
Author: Ingo Molnar <mingo@elte.hu>
Date: Wed Jun 17 08:59:01 2009 +0200
Merge branch 'linus' into x86/urgent
Merge reason: pull in latest to fix a bug in it.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
commit 8a4a6182fd43c46ed8c12e26b4669854bcad300a
Merge: 5dfaf90f8052 6a047d8b9efc
Author: Ingo Molnar <mingo@elte.hu>
Date: Tue Jun 16 11:51:24 2009 +0200
Merge branch 'amd-iommu/fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/linux-2.6-iommu into x86/urgent
commit 5dfaf90f8052327c92fbe3c470a2e6634be296c0
Author: Ingo Molnar <mingo@elte.hu>
Date: Tue Jun 16 10:23:32 2009 +0200
x86: mm: Read cr2 before prefetching the mmap_lock
Prefetch instructions can generate spurious faults on certain
models of older CPUs. The faults themselves cannot be stopped
and they can occur pretty much anywhere - so we handle them
by detecting certain patterns and ignoring the fault.
There is one small path of code where we must not take faults
though: the #PF handler execution leading up to the reading
of the CR2 (the faulting address). If we take a fault there
then we destroy the CR2 value (with that of the prefetching
instruction) and possibly mishandle user-space or
kernel-space pagefaults.
It turns out that in current upstream we do exactly that:
prefetchw(&mm->mmap_sem);
/* Get the faulting address: */
address = read_cr2();
This is not good.
So turn the order around: first read cr2, then prefetch
the lock address. Reading cr2 is plenty fast (2 cycles) so
delaying the prefetch by this amount shouldn't be a big issue
performance-wise.
[ And this might explain a mystery fault.c warning that sometimes
occurs on an old AMD/Sempron based test-system I have -
which does have such prefetch problems. ]
Cc: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
LKML-Reference: <20090616030522.GA22162@Krystal>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index c6acc6326374..0482fa649738 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -951,11 +951,11 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
tsk = current;
mm = tsk->mm;
- prefetchw(&mm->mmap_sem);
-
/* Get the faulting address: */
address = read_cr2();
+ prefetchw(&mm->mmap_sem);
+
if (unlikely(kmmio_fault(regs, address)))
return;
commit e2eae0f5605b90a0838608043c21050b08b6dd95
Author: Ingo Molnar <mingo@elte.hu>
Date: Mon Jun 15 16:15:19 2009 +0200
perf report: Fix 32-bit printf format
Yong Wang reported the following compiler warning:
builtin-report.c: In function 'process_overflow_event':
builtin-report.c:984: error: cast to pointer from integer of different size
This happens because we try to print ->ips[] out with a limited
format, losing the high 32 bits. Print it out using %016Lx instead.
Reported-by: Yong Wang <yong.y.wang@linux.intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 1e2f5dde312c..f86bb07c0e84 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -982,7 +982,7 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
chain->nr);
for (i = 0; i < chain->nr; i++)
- dprintf("..... %2d: %p\n", i, (void *)chain->ips[i]);
+ dprintf("..... %2d: %016Lx\n", i, chain->ips[i]);
}
if (collapse_syscalls) {
/*
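[ The bug class is easy to reproduce on a 32-bit build: casting a
  u64 to a pointer narrows it to 32 bits, which is exactly what the
  compiler warned about. A stand-alone sketch:

      #include <stdio.h>

      typedef unsigned long long u64;

      int main(void)
      {
          u64 ip = 0xffffffff81234567ULL;   /* a 64-bit kernel address */

          /* What the old %p path did: on 32-bit, the narrowing cast
           * drops the high half of the address. */
          printf("%p\n", (void *)(unsigned long)ip);

          /* The fix: print the full value with a 64-bit format. */
          printf("%016llx\n", ip);

          return 0;
      }
  ]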
commit 3dfabc74c65904c9e6cf952391312d16ea772ef5
Author: Ingo Molnar <mingo@elte.hu>
Date: Mon Jun 15 11:24:38 2009 +0200
perf report: Add per system call overhead histogram
Take advantage of call-graph per-counter sampling/recording to
display a non-trivial histogram: the true, collapsed/summarized
cost measurement, on a per system call total overhead basis:
aldebaran:~/linux/linux/tools/perf> ./perf record -g -a -f ~/hackbench 10
aldebaran:~/linux/linux/tools/perf> ./perf report -s symbol --syscalls | head -10
#
# (3536 samples)
#
# Overhead Symbol
# ........ ......
#
40.75% [k] sys_write
40.21% [k] sys_read
4.44% [k] do_nmi
...
This is done by accounting each (reliable) call-chain that chains back
to a given system call to that system call function.
[ So in the above example we can see that hackbench spends about 40% of
its total time somewhere in sys_write() and 40% somewhere in
sys_read(), the rest of the time is spent in user-space. The time
is not spent in sys_write() _itself_ but in one of its many child
functions. ]
Or, a recording of a (source files are already in the page-cache) kernel build:
$ perf record -g -m 512 -f -- make -j32 kernel
$ perf report -s s --syscalls | grep '\[k\]' | grep -v nmi
4.14% [k] do_page_fault
1.20% [k] sys_write
1.10% [k] sys_open
0.63% [k] sys_exit_group
0.48% [k] smp_apic_timer_interrupt
0.37% [k] sys_read
0.37% [k] sys_execve
0.20% [k] sys_mmap
0.18% [k] sys_close
0.14% [k] sys_munmap
0.13% [k] sys_poll
0.09% [k] sys_newstat
0.07% [k] sys_clone
0.06% [k] sys_newfstat
0.05% [k] sys_access
0.05% [k] schedule
Shows the true total cost of each syscall variant that gets used
during a kernel build. This profile reveals that pagefaults are
the costliest, followed by read()/write().
An interesting detail: timer interrupts cost 0.5% - or 0.5 seconds
per 100 seconds of kernel build-time. (this was done with HZ=1000)
The summary is done in 'perf report', i.e. in the post-processing
stage - so once we have a good call-graph recording, this type of
non-trivial high-level analysis becomes possible.
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index aebba5659345..1e2f5dde312c 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -40,6 +40,7 @@ static int dump_trace = 0;
static int verbose;
static int full_paths;
+static int collapse_syscalls;
static unsigned long page_size;
static unsigned long mmap_window = 32;
@@ -983,6 +984,15 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
for (i = 0; i < chain->nr; i++)
dprintf("..... %2d: %p\n", i, (void *)chain->ips[i]);
}
+ if (collapse_syscalls) {
+ /*
+ * Find the all-but-last kernel entry
+ * amongst the call-chains - to get
+ * to the level of system calls:
+ */
+ if (chain->kernel >= 2)
+ ip = chain->ips[chain->kernel-2];
+ }
}
dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
@@ -1343,6 +1353,8 @@ static const struct option options[] = {
"sort by key(s): pid, comm, dso, symbol. Default: pid,symbol"),
OPT_BOOLEAN('P', "full-paths", &full_paths,
"Don't shorten the pathnames taking into account the cwd"),
+ OPT_BOOLEAN('S', "syscalls", &collapse_syscalls,
+ "show per syscall summary overhead, using call graph"),
OPT_END()
};
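[ The collapse rule above depends on the call-chain layout: kernel
  frames come first in ips[], innermost first, with chain->kernel
  counting them - so ips[kernel-1] is the syscall entry stub and
  ips[kernel-2] is the system call function itself. A self-contained
  sketch of the rule over a simplified, hypothetical chain layout:

      #include <stdio.h>

      struct callchain {
          unsigned long nr;        /* total entries in ips[] */
          unsigned long kernel;    /* leading kernel entries */
          unsigned long ips[8];    /* innermost frame first */
      };

      /* Collapse a sample to its originating system call; fall back
       * to the raw sample ip when the chain is too shallow. */
      static unsigned long collapse_to_syscall(const struct callchain *c,
                                               unsigned long ip)
      {
          if (c->kernel >= 2)
              return c->ips[c->kernel - 2];

          return ip;
      }

      int main(void)
      {
          struct callchain c = {
              .nr     = 4,
              .kernel = 3,
              /* callee, sys_write, entry stub, then a user frame: */
              .ips    = { 0xc0112200, 0xc0110000, 0xc0100010, 0x400123 },
          };

          /* Prints 0xc0110000 - the sys_write frame. */
          printf("%#lx\n", collapse_to_syscall(&c, c.ips[0]));

          return 0;
      }
  ]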
commit 613d8602292165f86ba1969784fea01a06d55900
Author: Ingo Molnar <mingo@elte.hu>
Date: Mon Jun 15 08:17:12 2009 +0200
perf record: Fix fast task-exit race
Recording with -a (or with -p) can race with tasks going away:
couldn't open /proc/8440/maps
Causing an early exit() and no recording done.
Do not abort the recording session - instead just skip that task.
Also, only print the warnings under -v.
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index a177a591b52c..e1dfef24887f 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -202,8 +202,12 @@ static void pid_synthesize_comm_event(pid_t pid, int full)
fd = open(filename, O_RDONLY);
if (fd < 0) {
- fprintf(stderr, "couldn't open %s\n", filename);
- exit(EXIT_FAILURE);
+ /*
+ * We raced with a task exiting - just return:
+ */
+ if (verbose)
+ fprintf(stderr, "couldn't open %s\n", filename);
+ return;
}
if (read(fd, bf, sizeof(bf)) < 0) {
fprintf(stderr, "couldn't read %s\n", filename);
@@ -273,8 +277,12 @@ static void pid_synthesize_mmap_samples(pid_t pid)
fp = fopen(filename, "r");
if (fp == NULL) {
- fprintf(stderr, "couldn't open %s\n", filename);
- exit(EXIT_FAILURE);
+ /*
+ * We raced with a task exiting - just return:
+ */
+ if (verbose)
+ fprintf(stderr, "couldn't open %s\n", filename);
+ return;
}
while (1) {
char bf[BUFSIZ], *pbf = bf;