From b6ab4afee4ed56d0f69df59485585cff828c327d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 30 Dec 2008 09:41:04 +0100 Subject: tracing, kvm: change MARKERS to select instead of depends on Impact: build fix fix: kernel/trace/Kconfig:42:error: found recursive dependency: TRACING -> TRACEPOINTS -> MARKERS -> KVM_TRACE -> RELAY -> KMEMTRACE -> TRACING markers is a facility that should be selected - not depended on by an interactive Kconfig entry. Signed-off-by: Ingo Molnar --- arch/x86/kvm/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index b81125f0bdee..c7da3683f4c5 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -55,7 +55,8 @@ config KVM_AMD config KVM_TRACE bool "KVM trace support" - depends on KVM && MARKERS && SYSFS + depends on KVM && SYSFS + select MARKERS select RELAY select DEBUG_FS default n -- cgit 1.4.1 From fe6f90e57fd31af8daca534ea01db2e5666c15da Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Sat, 3 Jan 2009 21:23:51 +0200 Subject: trace: mmiotrace to the tracer menu in Kconfig Impact: cosmetic change in Kconfig menu layout This patch was originally suggested by Peter Zijlstra, but seems it was forgotten. CONFIG_MMIOTRACE and CONFIG_MMIOTRACE_TEST were selectable directly under the Kernel hacking / debugging menu in the kernel configuration system. They were present only for x86 and x86_64. Other tracers that use the ftrace tracing framework are in their own sub-menu. This patch moves the mmiotrace configuration options there. Since the Kconfig file, where the tracer menu is, is not architecture specific, HAVE_MMIOTRACE_SUPPORT is introduced and provided only by x86/x86_64. CONFIG_MMIOTRACE now depends on it. Signed-off-by: Pekka Paalanen Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 24 ++---------------------- kernel/trace/Kconfig | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+), 22 deletions(-) (limited to 'arch') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 10d6cc3fd052..e1983fa025d2 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -174,28 +174,8 @@ config IOMMU_LEAK Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. -config MMIOTRACE - bool "Memory mapped IO tracing" - depends on DEBUG_KERNEL && PCI - select TRACING - help - Mmiotrace traces Memory Mapped I/O access and is meant for - debugging and reverse engineering. It is called from the ioremap - implementation and works via page faults. Tracing is disabled by - default and can be enabled at run-time. - - See Documentation/tracers/mmiotrace.txt. - If you are not helping to develop drivers, say N. - -config MMIOTRACE_TEST - tristate "Test module for mmiotrace" - depends on MMIOTRACE && m - help - This is a dumb module for testing mmiotrace. It is very dangerous - as it will write garbage to IO memory starting at a given address. - However, it should be safe to use on e.g. unused portion of VRAM. - - Say N, unless you absolutely know what you are doing. +config HAVE_MMIOTRACE_SUPPORT + def_bool y # # IO delay types: diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1c0b7504cab3..944239296f13 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -323,4 +323,27 @@ config FTRACE_STARTUP_TEST functioning properly. It will do tests on all the configured tracers of ftrace. +config MMIOTRACE + bool "Memory mapped IO tracing" + depends on HAVE_MMIOTRACE_SUPPORT && DEBUG_KERNEL && PCI + select TRACING + help + Mmiotrace traces Memory Mapped I/O access and is meant for + debugging and reverse engineering. It is called from the ioremap + implementation and works via page faults. Tracing is disabled by + default and can be enabled at run-time. + + See Documentation/tracers/mmiotrace.txt. + If you are not helping to develop drivers, say N. + +config MMIOTRACE_TEST + tristate "Test module for mmiotrace" + depends on MMIOTRACE && m + help + This is a dumb module for testing mmiotrace. It is very dangerous + as it will write garbage to IO memory starting at a given address. + However, it should be safe to use on e.g. unused portion of VRAM. + + Say N, unless you absolutely know what you are doing. + endmenu -- cgit 1.4.1 From d3e75ff14bc1453c4762428395aac9953a023efc Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 9 Jan 2009 11:29:46 +0800 Subject: ftrace, ia64: IA64 static ftrace support IA64 ftrace suppport. In IA64, below code will be added in each function if -pg is enabled. alloc r40=ar.pfs,12,8,0 mov r43=r0;; mov r42=b0 mov r41=r1 nop.i 0x0 br.call.sptk.many b0 = _mcount;; Signed-off-by: Shaohua Li Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/ia64/Kconfig | 1 + arch/ia64/include/asm/ftrace.h | 15 +++++++++++++ arch/ia64/kernel/entry.S | 49 ++++++++++++++++++++++++++++++++++++++++++ arch/ia64/kernel/ia64_ksyms.c | 6 ++++++ 4 files changed, 71 insertions(+) create mode 100644 arch/ia64/include/asm/ftrace.h (limited to 'arch') diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 3d31636cbafb..b992ba447c41 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -21,6 +21,7 @@ config IA64 select HAVE_OPROFILE select HAVE_KPROBES select HAVE_KRETPROBES + select HAVE_FUNCTION_TRACER select HAVE_DMA_ATTRS select HAVE_KVM select HAVE_ARCH_TRACEHOOK diff --git a/arch/ia64/include/asm/ftrace.h b/arch/ia64/include/asm/ftrace.h new file mode 100644 index 000000000000..48694b3ba5a8 --- /dev/null +++ b/arch/ia64/include/asm/ftrace.h @@ -0,0 +1,15 @@ +#ifndef _ASM_IA64_FTRACE_H +#define _ASM_IA64_FTRACE_H + +#ifdef CONFIG_FUNCTION_TRACER +#define MCOUNT_INSN_SIZE 32 /* sizeof mcount call */ + +#ifndef __ASSEMBLY__ +extern void _mcount(unsigned long pfs, unsigned long r1, unsigned long b0, unsigned long r0); +#define mcount _mcount + +#endif + +#endif /* CONFIG_FUNCTION_TRACER */ + +#endif /* _ASM_IA64_FTRACE_H */ diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index d435f4a7a96c..c2f7d798e2a5 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -47,6 +47,7 @@ #include #include #include +#include #include "minstate.h" @@ -1404,6 +1405,54 @@ GLOBAL_ENTRY(unw_init_running) br.ret.sptk.many rp END(unw_init_running) +#ifdef CONFIG_FUNCTION_TRACER +GLOBAL_ENTRY(_mcount) + movl r2 = ftrace_stub + movl r3 = ftrace_trace_function;; + ld8 r3 = [r3];; + ld8 r3 = [r3];; + cmp.eq p7,p0 = r2, r3 +(p7) br.sptk.many ftrace_stub + ;; + + alloc loc0 = ar.pfs, 4, 4, 2, 0 + ;; + mov loc1 = b0 + mov out0 = b0 + mov loc2 = r8 + mov loc3 = r15 + ;; + adds out0 = -MCOUNT_INSN_SIZE, out0 + mov out1 = in2 + mov b6 = r3 + + br.call.sptk.many b0 = b6 + ;; + mov ar.pfs = loc0 + mov b0 = loc1 + mov r8 = loc2 + mov r15 = loc3 + br ftrace_stub + ;; +END(_mcount) + +GLOBAL_ENTRY(ftrace_stub) + mov r3 = b0 + movl r2 = _mcount_ret_helper + ;; + mov b6 = r2 + mov b7 = r3 + br.ret.sptk.many b6 + +_mcount_ret_helper: + mov b0 = r42 + mov r1 = r41 + mov ar.pfs = r40 + br b7 +END(ftrace_stub) + +#endif /* CONFIG_FUNCTION_TRACER */ + .rodata .align 8 .globl sys_call_table diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c index 6da1f20d7372..2d311864e359 100644 --- a/arch/ia64/kernel/ia64_ksyms.c +++ b/arch/ia64/kernel/ia64_ksyms.c @@ -112,3 +112,9 @@ EXPORT_SYMBOL_GPL(esi_call_phys); #endif extern char ia64_ivt[]; EXPORT_SYMBOL(ia64_ivt); + +#include +#ifdef CONFIG_FUNCTION_TRACER +/* mcount is defined in assembly */ +EXPORT_SYMBOL(_mcount); +#endif -- cgit 1.4.1 From a14a07b8018b714e03a39ff2180c66e307ef4238 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 9 Jan 2009 11:29:49 +0800 Subject: ftrace, ia64: IA64 dynamic ftrace support IA64 dynamic ftrace support. The original _mcount stub for each function is like: alloc r40=ar.pfs,12,8,0 mov r43=r0;; mov r42=b0 mov r41=r1 nop.i 0x0 br.call.sptk.many b0 = _mcount;; The patch convert it to below for nop: [MII] nop.m 0x0 mov r3=ip nop.i 0x0 [MLX] nop.m 0x0 nop.x 0x0;; This isn't completely nop, as there is one instuction 'mov r3=ip', but it should be light and harmless for code follow it. And below is for call [MII] nop.m 0x0 mov r3=ip nop.i 0x0 [MLX] nop.m 0x0 brl.many .;; In this way, only one instruction is changed to convert code between nop and call. This should meet dyn-ftrace's requirement. But this requires CPU support brl instruction, so dyn-ftrace isn't supported for old Itanium system. Assume there are quite few such old system running. Signed-off-by: Shaohua Li Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/ia64/Kconfig | 2 + arch/ia64/include/asm/ftrace.h | 13 +++ arch/ia64/kernel/Makefile | 5 + arch/ia64/kernel/entry.S | 51 ++++++++++ arch/ia64/kernel/ftrace.c | 206 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 277 insertions(+) create mode 100644 arch/ia64/kernel/ftrace.c (limited to 'arch') diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index b992ba447c41..e20c1d45930a 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -21,6 +21,8 @@ config IA64 select HAVE_OPROFILE select HAVE_KPROBES select HAVE_KRETPROBES + select HAVE_FTRACE_MCOUNT_RECORD + select HAVE_DYNAMIC_FTRACE if (!ITANIUM) select HAVE_FUNCTION_TRACER select HAVE_DMA_ATTRS select HAVE_KVM diff --git a/arch/ia64/include/asm/ftrace.h b/arch/ia64/include/asm/ftrace.h index 48694b3ba5a8..d20db3c2a656 100644 --- a/arch/ia64/include/asm/ftrace.h +++ b/arch/ia64/include/asm/ftrace.h @@ -8,6 +8,19 @@ extern void _mcount(unsigned long pfs, unsigned long r1, unsigned long b0, unsigned long r0); #define mcount _mcount +#include +/* In IA64, MCOUNT_ADDR is set in link time, so it's not a constant at compile time */ +#define MCOUNT_ADDR (((struct fnptr *)mcount)->ip) +#define FTRACE_ADDR (((struct fnptr *)ftrace_caller)->ip) + +static inline unsigned long ftrace_call_adjust(unsigned long addr) +{ + /* second bundle, insn 2 */ + return addr - 0x12; +} + +struct dyn_arch_ftrace { +}; #endif #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index c381ea954892..ab6e7ec0bba3 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -2,6 +2,10 @@ # Makefile for the linux kernel. # +ifdef CONFIG_DYNAMIC_FTRACE +CFLAGS_REMOVE_ftrace.o = -pg +endif + extra-y := head.o init_task.o vmlinux.lds obj-y := acpi.o entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o \ @@ -28,6 +32,7 @@ obj-$(CONFIG_IA64_CYCLONE) += cyclone.o obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o obj-$(CONFIG_KPROBES) += kprobes.o jprobes.o +obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_IA64_UNCACHED_ALLOCATOR) += uncached.o diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index c2f7d798e2a5..e0be92a6abb0 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -1406,6 +1406,56 @@ GLOBAL_ENTRY(unw_init_running) END(unw_init_running) #ifdef CONFIG_FUNCTION_TRACER +#ifdef CONFIG_DYNAMIC_FTRACE +GLOBAL_ENTRY(_mcount) + br ftrace_stub +END(_mcount) + +.here: + br.ret.sptk.many b0 + +GLOBAL_ENTRY(ftrace_caller) + alloc out0 = ar.pfs, 8, 0, 4, 0 + mov out3 = r0 + ;; + mov out2 = b0 + add r3 = 0x20, r3 + mov out1 = r1; + br.call.sptk.many b0 = ftrace_patch_gp + //this might be called from module, so we must patch gp +ftrace_patch_gp: + movl gp=__gp + mov b0 = r3 + ;; +.global ftrace_call; +ftrace_call: +{ + .mlx + nop.m 0x0 + movl r3 = .here;; +} + alloc loc0 = ar.pfs, 4, 4, 2, 0 + ;; + mov loc1 = b0 + mov out0 = b0 + mov loc2 = r8 + mov loc3 = r15 + ;; + adds out0 = -MCOUNT_INSN_SIZE, out0 + mov out1 = in2 + mov b6 = r3 + + br.call.sptk.many b0 = b6 + ;; + mov ar.pfs = loc0 + mov b0 = loc1 + mov r8 = loc2 + mov r15 = loc3 + br ftrace_stub + ;; +END(ftrace_caller) + +#else GLOBAL_ENTRY(_mcount) movl r2 = ftrace_stub movl r3 = ftrace_trace_function;; @@ -1435,6 +1485,7 @@ GLOBAL_ENTRY(_mcount) br ftrace_stub ;; END(_mcount) +#endif GLOBAL_ENTRY(ftrace_stub) mov r3 = b0 diff --git a/arch/ia64/kernel/ftrace.c b/arch/ia64/kernel/ftrace.c new file mode 100644 index 000000000000..7fc8c961b1f7 --- /dev/null +++ b/arch/ia64/kernel/ftrace.c @@ -0,0 +1,206 @@ +/* + * Dynamic function tracing support. + * + * Copyright (C) 2008 Shaohua Li + * + * For licencing details, see COPYING. + * + * Defines low-level handling of mcount calls when the kernel + * is compiled with the -pg flag. When using dynamic ftrace, the + * mcount call-sites get patched lazily with NOP till they are + * enabled. All code mutation routines here take effect atomically. + */ + +#include +#include + +#include +#include + +/* In IA64, each function will be added below two bundles with -pg option */ +static unsigned char __attribute__((aligned(8))) +ftrace_orig_code[MCOUNT_INSN_SIZE] = { + 0x02, 0x40, 0x31, 0x10, 0x80, 0x05, /* alloc r40=ar.pfs,12,8,0 */ + 0xb0, 0x02, 0x00, 0x00, 0x42, 0x40, /* mov r43=r0;; */ + 0x05, 0x00, 0xc4, 0x00, /* mov r42=b0 */ + 0x11, 0x48, 0x01, 0x02, 0x00, 0x21, /* mov r41=r1 */ + 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* nop.i 0x0 */ + 0x08, 0x00, 0x00, 0x50 /* br.call.sptk.many b0 = _mcount;; */ +}; + +struct ftrace_orig_insn { + u64 dummy1, dummy2, dummy3; + u64 dummy4:64-41+13; + u64 imm20:20; + u64 dummy5:3; + u64 sign:1; + u64 dummy6:4; +}; + +/* mcount stub will be converted below for nop */ +static unsigned char ftrace_nop_code[MCOUNT_INSN_SIZE] = { + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MII] nop.m 0x0 */ + 0x30, 0x00, 0x00, 0x60, 0x00, 0x00, /* mov r3=ip */ + 0x00, 0x00, 0x04, 0x00, /* nop.i 0x0 */ + 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0x0 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* nop.x 0x0;; */ + 0x00, 0x00, 0x04, 0x00 +}; + +static unsigned char *ftrace_nop_replace(void) +{ + return ftrace_nop_code; +} + +/* + * mcount stub will be converted below for call + * Note: Just the last instruction is changed against nop + * */ +static unsigned char __attribute__((aligned(8))) +ftrace_call_code[MCOUNT_INSN_SIZE] = { + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MII] nop.m 0x0 */ + 0x30, 0x00, 0x00, 0x60, 0x00, 0x00, /* mov r3=ip */ + 0x00, 0x00, 0x04, 0x00, /* nop.i 0x0 */ + 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0x0 */ + 0xff, 0xff, 0xff, 0xff, 0x7f, 0x00, /* brl.many .;;*/ + 0xf8, 0xff, 0xff, 0xc8 +}; + +struct ftrace_call_insn { + u64 dummy1, dummy2; + u64 dummy3:48; + u64 imm39_l:16; + u64 imm39_h:23; + u64 dummy4:13; + u64 imm20:20; + u64 dummy5:3; + u64 i:1; + u64 dummy6:4; +}; + +static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) +{ + struct ftrace_call_insn *code = (void *)ftrace_call_code; + unsigned long offset = addr - (ip + 0x10); + + code->imm39_l = offset >> 24; + code->imm39_h = offset >> 40; + code->imm20 = offset >> 4; + code->i = offset >> 63; + return ftrace_call_code; +} + +static int +ftrace_modify_code(unsigned long ip, unsigned char *old_code, + unsigned char *new_code, int do_check) +{ + unsigned char replaced[MCOUNT_INSN_SIZE]; + + /* + * Note: Due to modules and __init, code can + * disappear and change, we need to protect against faulting + * as well as code changing. We do this by using the + * probe_kernel_* functions. + * + * No real locking needed, this code is run through + * kstop_machine, or before SMP starts. + */ + + if (!do_check) + goto skip_check; + + /* read the text we want to modify */ + if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + /* Make sure it is what we expect it to be */ + if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0) + return -EINVAL; + +skip_check: + /* replace the text with the new text */ + if (probe_kernel_write(((void *)ip), new_code, MCOUNT_INSN_SIZE)) + return -EPERM; + flush_icache_range(ip, ip + MCOUNT_INSN_SIZE); + + return 0; +} + +static int ftrace_make_nop_check(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned char __attribute__((aligned(8))) replaced[MCOUNT_INSN_SIZE]; + unsigned long ip = rec->ip; + + if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + if (rec->flags & FTRACE_FL_CONVERTED) { + struct ftrace_call_insn *call_insn, *tmp_call; + + call_insn = (void *)ftrace_call_code; + tmp_call = (void *)replaced; + call_insn->imm39_l = tmp_call->imm39_l; + call_insn->imm39_h = tmp_call->imm39_h; + call_insn->imm20 = tmp_call->imm20; + call_insn->i = tmp_call->i; + if (memcmp(replaced, ftrace_call_code, MCOUNT_INSN_SIZE) != 0) + return -EINVAL; + return 0; + } else { + struct ftrace_orig_insn *call_insn, *tmp_call; + + call_insn = (void *)ftrace_orig_code; + tmp_call = (void *)replaced; + call_insn->sign = tmp_call->sign; + call_insn->imm20 = tmp_call->imm20; + if (memcmp(replaced, ftrace_orig_code, MCOUNT_INSN_SIZE) != 0) + return -EINVAL; + return 0; + } +} + +int ftrace_make_nop(struct module *mod, + struct dyn_ftrace *rec, unsigned long addr) +{ + int ret; + char *new; + + ret = ftrace_make_nop_check(rec, addr); + if (ret) + return ret; + new = ftrace_nop_replace(); + return ftrace_modify_code(rec->ip, NULL, new, 0); +} + +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip = rec->ip; + unsigned char *old, *new; + + old= ftrace_nop_replace(); + new = ftrace_call_replace(ip, addr); + return ftrace_modify_code(ip, old, new, 1); +} + +/* in IA64, _mcount can't directly call ftrace_stub. Only jump is ok */ +int ftrace_update_ftrace_func(ftrace_func_t func) +{ + unsigned long ip; + unsigned long addr = ((struct fnptr *)ftrace_call)->ip; + + if (func == ftrace_stub) + return 0; + ip = ((struct fnptr *)func)->ip; + + ia64_patch_imm64(addr + 2, ip); + + flush_icache_range(addr, addr + 16); + return 0; +} + +/* run from kstop_machine */ +int __init ftrace_dyn_arch_init(void *data) +{ + *(unsigned long *)data = 0; + + return 0; +} -- cgit 1.4.1 From b1818748b0cf9427e48acf9713295e829a0d715f Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Mon, 19 Jan 2009 10:31:01 +0100 Subject: x86, ftrace, hw-branch-tracer: dump trace on oops Dump the branch trace on an oops (based on ftrace_dump_on_oops). Signed-off-by: Markus Metzger Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 6 ++++++ include/linux/ftrace.h | 13 +++++++++++++ kernel/trace/trace.h | 1 - kernel/trace/trace_hw_branches.c | 29 ++++++++++++++++++++++------- 4 files changed, 41 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 6b1f6f6f8661..077c9ea655fc 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -14,6 +14,7 @@ #include #include #include +#include #include @@ -195,6 +196,11 @@ unsigned __kprobes long oops_begin(void) int cpu; unsigned long flags; + /* notify the hw-branch tracer so it may disable tracing and + add the last trace to the trace buffer - + the earlier this happens, the more useful the trace. */ + trace_hw_branch_oops(); + oops_enter(); /* racy, but better than risking deadlock. */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 054721487574..9f7880d87c39 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -496,4 +496,17 @@ static inline int test_tsk_trace_graph(struct task_struct *tsk) #endif /* CONFIG_TRACING */ + +#ifdef CONFIG_HW_BRANCH_TRACER + +void trace_hw_branch(u64 from, u64 to); +void trace_hw_branch_oops(void); + +#else /* CONFIG_HW_BRANCH_TRACER */ + +static inline void trace_hw_branch(u64 from, u64 to) {} +static inline void trace_hw_branch_oops(void) {} + +#endif /* CONFIG_HW_BRANCH_TRACER */ + #endif /* _LINUX_FTRACE_H */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 54b72781e920..b96037d970df 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -438,7 +438,6 @@ void trace_function(struct trace_array *tr, void trace_graph_return(struct ftrace_graph_ret *trace); int trace_graph_entry(struct ftrace_graph_ent *trace); -void trace_hw_branch(struct trace_array *tr, u64 from, u64 to); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 398195397c75..e56df2c7d679 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -40,6 +40,7 @@ static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); #define this_buffer per_cpu(buffer, smp_processor_id()) static int __read_mostly trace_hw_branches_enabled; +static struct trace_array *hw_branch_trace __read_mostly; /* @@ -128,6 +129,8 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = { static int bts_trace_init(struct trace_array *tr) { + hw_branch_trace = tr; + register_hotcpu_notifier(&bts_hotcpu_notifier); tracing_reset_online_cpus(tr); bts_trace_start(tr); @@ -170,8 +173,9 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) return TRACE_TYPE_UNHANDLED; } -void trace_hw_branch(struct trace_array *tr, u64 from, u64 to) +void trace_hw_branch(u64 from, u64 to) { + struct trace_array *tr = hw_branch_trace; struct ring_buffer_event *event; struct hw_branch_entry *entry; unsigned long irq1, irq2; @@ -204,8 +208,7 @@ void trace_hw_branch(struct trace_array *tr, u64 from, u64 to) local_irq_restore(irq1); } -static void trace_bts_at(struct trace_array *tr, - const struct bts_trace *trace, void *at) +static void trace_bts_at(const struct bts_trace *trace, void *at) { struct bts_struct bts; int err = 0; @@ -220,7 +223,7 @@ static void trace_bts_at(struct trace_array *tr, switch (bts.qualifier) { case BTS_BRANCH: - trace_hw_branch(tr, bts.variant.lbr.from, bts.variant.lbr.to); + trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to); break; } } @@ -236,12 +239,15 @@ static void trace_bts_cpu(void *arg) const struct bts_trace *trace; unsigned char *at; - if (!this_tracer) + if (unlikely(!tr)) return; if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled))) return; + if (unlikely(!this_tracer)) + return; + ds_suspend_bts(this_tracer); trace = ds_read_bts(this_tracer); if (!trace) @@ -249,11 +255,11 @@ static void trace_bts_cpu(void *arg) for (at = trace->ds.top; (void *)at < trace->ds.end; at += trace->ds.size) - trace_bts_at(tr, trace, at); + trace_bts_at(trace, at); for (at = trace->ds.begin; (void *)at < trace->ds.top; at += trace->ds.size) - trace_bts_at(tr, trace, at); + trace_bts_at(trace, at); out: ds_resume_bts(this_tracer); @@ -268,6 +274,15 @@ static void trace_bts_prepare(struct trace_iterator *iter) mutex_unlock(&bts_tracer_mutex); } +void trace_hw_branch_oops(void) +{ + mutex_lock(&bts_tracer_mutex); + + trace_bts_cpu(hw_branch_trace); + + mutex_unlock(&bts_tracer_mutex); +} + struct tracer bts_tracer __read_mostly = { .name = "hw-branch-tracer", -- cgit 1.4.1 From ce5e5540c0e839781e7cd134517d5d2e9e819636 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Mon, 19 Jan 2009 10:38:35 +0100 Subject: x86, ds, bts: cleanup DS configuration Cleanup the cpuid check for DS configuration. Signed-off-by: Markus Metzger Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index da91701a2348..169a120587be 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -15,8 +15,8 @@ * - buffer allocation (memory accounting) * * - * Copyright (C) 2007-2008 Intel Corporation. - * Markus Metzger , 2007-2008 + * Copyright (C) 2007-2009 Intel Corporation. + * Markus Metzger , 2007-2009 */ @@ -890,7 +890,7 @@ int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value) } static const struct ds_configuration ds_cfg_netburst = { - .name = "netburst", + .name = "Netburst", .ctl[dsf_bts] = (1 << 2) | (1 << 3), .ctl[dsf_bts_kernel] = (1 << 5), .ctl[dsf_bts_user] = (1 << 6), @@ -904,7 +904,7 @@ static const struct ds_configuration ds_cfg_netburst = { #endif }; static const struct ds_configuration ds_cfg_pentium_m = { - .name = "pentium m", + .name = "Pentium M", .ctl[dsf_bts] = (1 << 6) | (1 << 7), .sizeof_field = sizeof(long), @@ -915,8 +915,8 @@ static const struct ds_configuration ds_cfg_pentium_m = { .sizeof_rec[ds_pebs] = sizeof(long) * 18, #endif }; -static const struct ds_configuration ds_cfg_core2 = { - .name = "core 2", +static const struct ds_configuration ds_cfg_core2_atom = { + .name = "Core 2/Atom", .ctl[dsf_bts] = (1 << 6) | (1 << 7), .ctl[dsf_bts_kernel] = (1 << 9), .ctl[dsf_bts_user] = (1 << 10), @@ -949,19 +949,22 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) switch (c->x86) { case 0x6: switch (c->x86_model) { - case 0 ... 0xC: - /* sorry, don't know about them */ - break; - case 0xD: - case 0xE: /* Pentium M */ + case 0x9: + case 0xd: /* Pentium M */ ds_configure(&ds_cfg_pentium_m); break; - default: /* Core2, Atom, ... */ - ds_configure(&ds_cfg_core2); + case 0xf: + case 0x17: /* Core2 */ + case 0x1c: /* Atom */ + ds_configure(&ds_cfg_core2_atom); + break; + case 0x1a: /* i7 */ + default: + /* sorry, don't know about them */ break; } break; - case 0xF: + case 0xf: switch (c->x86_model) { case 0x0: case 0x1: -- cgit 1.4.1 From 890252823766e562301e61340f3187a14033d045 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 26 Jan 2009 18:28:02 +0300 Subject: x86: ftrace - simplify wait_for_nmi Get rid of 'waited' stack variable. Signed-off-by: Cyrill Gorcunov Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 1b43086b097a..4d33224c055f 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -133,15 +133,14 @@ void ftrace_nmi_exit(void) static void wait_for_nmi(void) { - int waited = 0; + if (!atomic_read(&in_nmi)) + return; - while (atomic_read(&in_nmi)) { - waited = 1; + do { cpu_relax(); - } + } while(atomic_read(&in_nmi)); - if (waited) - nmi_wait_count++; + nmi_wait_count++; } static int -- cgit 1.4.1 From 78d904b46a72fcf15ea6a39672bbef92953876b5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 Feb 2009 18:43:07 -0500 Subject: ring-buffer: add NMI protection for spinlocks Impact: prevent deadlock in NMI The ring buffers are not yet totally lockless with writing to the buffer. When a writer crosses a page, it grabs a per cpu spinlock to protect against a reader. The spinlocks taken by a writer are not to protect against other writers, since a writer can only write to its own per cpu buffer. The spinlocks protect against readers that can touch any cpu buffer. The writers are made to be reentrant with the spinlocks disabling interrupts. The problem arises when an NMI writes to the buffer, and that write crosses a page boundary. If it grabs a spinlock, it can be racing with another writer (since disabling interrupts does not protect against NMIs) or with a reader on the same CPU. Luckily, most of the users are not reentrant and protects against this issue. But if a user of the ring buffer becomes reentrant (which is what the ring buffers do allow), if the NMI also writes to the ring buffer then we risk the chance of a deadlock. This patch moves the ftrace_nmi_enter called by nmi_enter() to the ring buffer code. It replaces the current ftrace_nmi_enter that is used by arch specific code to arch_ftrace_nmi_enter and updates the Kconfig to handle it. When an NMI is called, it will set a per cpu variable in the ring buffer code and will clear it when the NMI exits. If a write to the ring buffer crosses page boundaries inside an NMI, a trylock is used on the spin lock instead. If the spinlock fails to be acquired, then the entry is discarded. This bug appeared in the ftrace work in the RT tree, where event tracing is reentrant. This workaround solved the deadlocks that appeared there. Signed-off-by: Steven Rostedt --- arch/x86/Kconfig | 1 + arch/x86/kernel/ftrace.c | 8 ++++---- include/linux/ftrace_irq.h | 10 +++++++++- kernel/trace/Kconfig | 8 ++++++++ kernel/trace/ring_buffer.c | 48 ++++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 68 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 73f7fe8fd4d1..a6be725cb049 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -34,6 +34,7 @@ config X86 select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACE_MCOUNT_TEST + select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE || FUNCTION_GRAPH_TRACER select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER select HAVE_ARCH_TRACEHOOK diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 4d33224c055f..4c683587055b 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -113,7 +113,7 @@ static void ftrace_mod_code(void) MCOUNT_INSN_SIZE); } -void ftrace_nmi_enter(void) +void arch_ftrace_nmi_enter(void) { atomic_inc(&in_nmi); /* Must have in_nmi seen before reading write flag */ @@ -124,7 +124,7 @@ void ftrace_nmi_enter(void) } } -void ftrace_nmi_exit(void) +void arch_ftrace_nmi_exit(void) { /* Finish all executions before clearing in_nmi */ smp_wmb(); @@ -376,12 +376,12 @@ int ftrace_disable_ftrace_graph_caller(void) */ static atomic_t in_nmi; -void ftrace_nmi_enter(void) +void arch_ftrace_nmi_enter(void) { atomic_inc(&in_nmi); } -void ftrace_nmi_exit(void) +void arch_ftrace_nmi_exit(void) { atomic_dec(&in_nmi); } diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h index 366a054d0b05..29de6779a963 100644 --- a/include/linux/ftrace_irq.h +++ b/include/linux/ftrace_irq.h @@ -2,7 +2,15 @@ #define _LINUX_FTRACE_IRQ_H -#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_GRAPH_TRACER) +#ifdef CONFIG_FTRACE_NMI_ENTER +extern void arch_ftrace_nmi_enter(void); +extern void arch_ftrace_nmi_exit(void); +#else +static inline void arch_ftrace_nmi_enter(void) { } +static inline void arch_ftrace_nmi_exit(void) { } +#endif + +#ifdef CONFIG_RING_BUFFER extern void ftrace_nmi_enter(void); extern void ftrace_nmi_exit(void); #else diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 28f2644484d9..25131a5d5e4f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -9,6 +9,9 @@ config USER_STACKTRACE_SUPPORT config NOP_TRACER bool +config HAVE_FTRACE_NMI_ENTER + bool + config HAVE_FUNCTION_TRACER bool @@ -37,6 +40,11 @@ config TRACER_MAX_TRACE config RING_BUFFER bool +config FTRACE_NMI_ENTER + bool + depends on HAVE_FTRACE_NMI_ENTER + default y + config TRACING bool select DEBUG_FS diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b36d7374ceef..a60a6a852f42 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4,6 +4,7 @@ * Copyright (C) 2008 Steven Rostedt */ #include +#include #include #include #include @@ -18,6 +19,35 @@ #include "trace.h" +/* + * Since the write to the buffer is still not fully lockless, + * we must be careful with NMIs. The locks in the writers + * are taken when a write crosses to a new page. The locks + * protect against races with the readers (this will soon + * be fixed with a lockless solution). + * + * Because we can not protect against NMIs, and we want to + * keep traces reentrant, we need to manage what happens + * when we are in an NMI. + */ +static DEFINE_PER_CPU(int, rb_in_nmi); + +void ftrace_nmi_enter(void) +{ + __get_cpu_var(rb_in_nmi)++; + /* call arch specific handler too */ + arch_ftrace_nmi_enter(); +} + +void ftrace_nmi_exit(void) +{ + arch_ftrace_nmi_exit(); + __get_cpu_var(rb_in_nmi)--; + /* NMIs are not recursive */ + WARN_ON_ONCE(__get_cpu_var(rb_in_nmi)); +} + + /* * A fast way to enable or disable all ring buffers is to * call tracing_on or tracing_off. Turning off the ring buffers @@ -982,6 +1012,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer *buffer = cpu_buffer->buffer; struct ring_buffer_event *event; unsigned long flags; + bool lock_taken = false; commit_page = cpu_buffer->commit_page; /* we just need to protect against interrupts */ @@ -995,7 +1026,19 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *next_page = tail_page; local_irq_save(flags); - __raw_spin_lock(&cpu_buffer->lock); + /* + * NMIs can happen after we take the lock. + * If we are in an NMI, only take the lock + * if it is not already taken. Otherwise + * simply fail. + */ + if (unlikely(__get_cpu_var(rb_in_nmi))) { + if (!__raw_spin_trylock(&cpu_buffer->lock)) + goto out_unlock; + } else + __raw_spin_lock(&cpu_buffer->lock); + + lock_taken = true; rb_inc_page(cpu_buffer, &next_page); @@ -1097,7 +1140,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (tail <= BUF_PAGE_SIZE) local_set(&tail_page->write, tail); - __raw_spin_unlock(&cpu_buffer->lock); + if (likely(lock_taken)) + __raw_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); return NULL; } -- cgit 1.4.1 From 4e6ea1440c67de32d7c89aacf233472dfc3bce82 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 Feb 2009 22:30:07 -0500 Subject: ftrace, x86: rename in_nmi variable Impact: clean up The in_nmi variable in x86 arch ftrace.c is a misnomer. Andrew Morton pointed out that the in_nmi variable is incremented by all CPUS. It can be set when another CPU is running an NMI. Since this is actually intentional, the fix is to rename it to what it really is: "nmi_running" Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 4c683587055b..e3fad2ef622c 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -82,7 +82,7 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) * are the same as what exists. */ -static atomic_t in_nmi = ATOMIC_INIT(0); +static atomic_t nmi_running = ATOMIC_INIT(0); static int mod_code_status; /* holds return value of text write */ static int mod_code_write; /* set when NMI should do the write */ static void *mod_code_ip; /* holds the IP to write to */ @@ -115,8 +115,8 @@ static void ftrace_mod_code(void) void arch_ftrace_nmi_enter(void) { - atomic_inc(&in_nmi); - /* Must have in_nmi seen before reading write flag */ + atomic_inc(&nmi_running); + /* Must have nmi_running seen before reading write flag */ smp_mb(); if (mod_code_write) { ftrace_mod_code(); @@ -126,19 +126,19 @@ void arch_ftrace_nmi_enter(void) void arch_ftrace_nmi_exit(void) { - /* Finish all executions before clearing in_nmi */ + /* Finish all executions before clearing nmi_running */ smp_wmb(); - atomic_dec(&in_nmi); + atomic_dec(&nmi_running); } static void wait_for_nmi(void) { - if (!atomic_read(&in_nmi)) + if (!atomic_read(&nmi_running)) return; do { cpu_relax(); - } while(atomic_read(&in_nmi)); + } while (atomic_read(&nmi_running)); nmi_wait_count++; } @@ -374,16 +374,16 @@ int ftrace_disable_ftrace_graph_caller(void) * this page for dynamic ftrace. They have been * simplified to ignore all traces in NMI context. */ -static atomic_t in_nmi; +static atomic_t nmi_running; void arch_ftrace_nmi_enter(void) { - atomic_inc(&in_nmi); + atomic_inc(&nmi_running); } void arch_ftrace_nmi_exit(void) { - atomic_dec(&in_nmi); + atomic_dec(&nmi_running); } #endif /* !CONFIG_DYNAMIC_FTRACE */ @@ -475,7 +475,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) &return_to_handler; /* Nmi's are currently unsupported */ - if (unlikely(atomic_read(&in_nmi))) + if (unlikely(atomic_read(&nmi_running))) return; if (unlikely(atomic_read(¤t->tracing_graph_pause))) -- cgit 1.4.1 From 9a5fd902273d01170fd033691bd70b142baa7309 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 6 Feb 2009 01:14:26 -0500 Subject: ftrace: change function graph tracer to use new in_nmi The function graph tracer piggy backed onto the dynamic ftracer to use the in_nmi custom code for dynamic tracing. The problem was (as Andrew Morton pointed out) it really only wanted to bail out if the context of the current CPU was in NMI context. But the dynamic ftrace in_nmi custom code was true if _any_ CPU happened to be in NMI context. Now that we have a generic in_nmi interface, this patch changes the function graph code to use it instead of the dynamic ftarce custom code. Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- arch/x86/Kconfig | 2 +- arch/x86/kernel/ftrace.c | 21 +-------------------- 2 files changed, 2 insertions(+), 21 deletions(-) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a6be725cb049..2cf7bbcaed4e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -34,7 +34,7 @@ config X86 select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACE_MCOUNT_TEST - select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE || FUNCTION_GRAPH_TRACER + select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER select HAVE_ARCH_TRACEHOOK diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index e3fad2ef622c..918073c6681b 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -367,25 +367,6 @@ int ftrace_disable_ftrace_graph_caller(void) return ftrace_mod_jmp(ip, old_offset, new_offset); } -#else /* CONFIG_DYNAMIC_FTRACE */ - -/* - * These functions are picked from those used on - * this page for dynamic ftrace. They have been - * simplified to ignore all traces in NMI context. - */ -static atomic_t nmi_running; - -void arch_ftrace_nmi_enter(void) -{ - atomic_inc(&nmi_running); -} - -void arch_ftrace_nmi_exit(void) -{ - atomic_dec(&nmi_running); -} - #endif /* !CONFIG_DYNAMIC_FTRACE */ /* Add a function return address to the trace stack on thread info.*/ @@ -475,7 +456,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) &return_to_handler; /* Nmi's are currently unsupported */ - if (unlikely(atomic_read(&nmi_running))) + if (unlikely(in_nmi())) return; if (unlikely(atomic_read(¤t->tracing_graph_pause))) -- cgit 1.4.1 From a81bd80a0b0a405dc0483e2c428332d69da2c79f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 6 Feb 2009 01:45:16 -0500 Subject: ring-buffer: use generic version of in_nmi Impact: clean up Now that a generic in_nmi is available, this patch removes the special code in the ring_buffer and implements the in_nmi generic version instead. With this change, I was also able to rename the "arch_ftrace_nmi_enter" back to "ftrace_nmi_enter" and remove the code from the ring buffer. Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 4 ++-- include/linux/ftrace_irq.h | 8 -------- kernel/trace/ring_buffer.c | 43 +++++++++++++------------------------------ 3 files changed, 15 insertions(+), 40 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 918073c6681b..d74d75e0952d 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -113,7 +113,7 @@ static void ftrace_mod_code(void) MCOUNT_INSN_SIZE); } -void arch_ftrace_nmi_enter(void) +void ftrace_nmi_enter(void) { atomic_inc(&nmi_running); /* Must have nmi_running seen before reading write flag */ @@ -124,7 +124,7 @@ void arch_ftrace_nmi_enter(void) } } -void arch_ftrace_nmi_exit(void) +void ftrace_nmi_exit(void) { /* Finish all executions before clearing nmi_running */ smp_wmb(); diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h index 29de6779a963..dca7bf8cffe2 100644 --- a/include/linux/ftrace_irq.h +++ b/include/linux/ftrace_irq.h @@ -3,14 +3,6 @@ #ifdef CONFIG_FTRACE_NMI_ENTER -extern void arch_ftrace_nmi_enter(void); -extern void arch_ftrace_nmi_exit(void); -#else -static inline void arch_ftrace_nmi_enter(void) { } -static inline void arch_ftrace_nmi_exit(void) { } -#endif - -#ifdef CONFIG_RING_BUFFER extern void ftrace_nmi_enter(void); extern void ftrace_nmi_exit(void); #else diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a60a6a852f42..5ee344417cd5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -19,35 +20,6 @@ #include "trace.h" -/* - * Since the write to the buffer is still not fully lockless, - * we must be careful with NMIs. The locks in the writers - * are taken when a write crosses to a new page. The locks - * protect against races with the readers (this will soon - * be fixed with a lockless solution). - * - * Because we can not protect against NMIs, and we want to - * keep traces reentrant, we need to manage what happens - * when we are in an NMI. - */ -static DEFINE_PER_CPU(int, rb_in_nmi); - -void ftrace_nmi_enter(void) -{ - __get_cpu_var(rb_in_nmi)++; - /* call arch specific handler too */ - arch_ftrace_nmi_enter(); -} - -void ftrace_nmi_exit(void) -{ - arch_ftrace_nmi_exit(); - __get_cpu_var(rb_in_nmi)--; - /* NMIs are not recursive */ - WARN_ON_ONCE(__get_cpu_var(rb_in_nmi)); -} - - /* * A fast way to enable or disable all ring buffers is to * call tracing_on or tracing_off. Turning off the ring buffers @@ -1027,12 +999,23 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, local_irq_save(flags); /* + * Since the write to the buffer is still not + * fully lockless, we must be careful with NMIs. + * The locks in the writers are taken when a write + * crosses to a new page. The locks protect against + * races with the readers (this will soon be fixed + * with a lockless solution). + * + * Because we can not protect against NMIs, and we + * want to keep traces reentrant, we need to manage + * what happens when we are in an NMI. + * * NMIs can happen after we take the lock. * If we are in an NMI, only take the lock * if it is not already taken. Otherwise * simply fail. */ - if (unlikely(__get_cpu_var(rb_in_nmi))) { + if (unlikely(in_nmi())) { if (!__raw_spin_trylock(&cpu_buffer->lock)) goto out_unlock; } else -- cgit 1.4.1 From 1292211058aaf872eeb2a0e2677d237916b4501f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 7 Feb 2009 22:16:12 +0100 Subject: tracing/power: move the power trace headers to a dedicated file Impact: cleanup Move the power tracer headers to trace/power.h to keep ftrace.h and power bits more easy to maintain as separated topics. Signed-off-by: Frederic Weisbecker Cc: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 2 +- arch/x86/kernel/process.c | 2 +- include/linux/ftrace.h | 30 ------------------------- include/trace/power.h | 35 ++++++++++++++++++++++++++++++ kernel/trace/trace.h | 1 + kernel/trace/trace_power.c | 2 +- 6 files changed, 39 insertions(+), 33 deletions(-) create mode 100644 include/trace/power.h (limited to 'arch') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 4b1c319d30c3..7ed925edf4d2 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e68bb9e30864..026819ffcb0c 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 5e302d636fc2..106b7909d500 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -339,36 +339,6 @@ ftrace_init_module(struct module *mod, unsigned long *start, unsigned long *end) { } #endif -enum { - POWER_NONE = 0, - POWER_CSTATE = 1, - POWER_PSTATE = 2, -}; - -struct power_trace { -#ifdef CONFIG_POWER_TRACER - ktime_t stamp; - ktime_t end; - int type; - int state; -#endif -}; - -#ifdef CONFIG_POWER_TRACER -extern void trace_power_start(struct power_trace *it, unsigned int type, - unsigned int state); -extern void trace_power_mark(struct power_trace *it, unsigned int type, - unsigned int state); -extern void trace_power_end(struct power_trace *it); -#else -static inline void trace_power_start(struct power_trace *it, unsigned int type, - unsigned int state) { } -static inline void trace_power_mark(struct power_trace *it, unsigned int type, - unsigned int state) { } -static inline void trace_power_end(struct power_trace *it) { } -#endif - - /* * Structure that defines an entry function trace. */ diff --git a/include/trace/power.h b/include/trace/power.h new file mode 100644 index 000000000000..c7cefbcdaea4 --- /dev/null +++ b/include/trace/power.h @@ -0,0 +1,35 @@ +#ifndef _TRACE_POWER_H +#define _TRACE_POWER_H + +#include + +enum { + POWER_NONE = 0, + POWER_CSTATE = 1, + POWER_PSTATE = 2, +}; + +struct power_trace { +#ifdef CONFIG_POWER_TRACER + ktime_t stamp; + ktime_t end; + int type; + int state; +#endif +}; + +#ifdef CONFIG_POWER_TRACER +extern void trace_power_start(struct power_trace *it, unsigned int type, + unsigned int state); +extern void trace_power_mark(struct power_trace *it, unsigned int type, + unsigned int state); +extern void trace_power_end(struct power_trace *it); +#else +static inline void trace_power_start(struct power_trace *it, unsigned int type, + unsigned int state) { } +static inline void trace_power_mark(struct power_trace *it, unsigned int type, + unsigned int state) { } +static inline void trace_power_end(struct power_trace *it) { } +#endif + +#endif /* _TRACE_POWER_H */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a011ec062225..1ecfb9d2b365 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -10,6 +10,7 @@ #include #include #include +#include enum trace_type { __TRACE_FIRST_TYPE = 0, diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index bfc21f8079ab..b1d0d087d3a6 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -11,7 +11,7 @@ #include #include -#include +#include #include #include -- cgit 1.4.1 From 3861a17bcc0af815f684c6178bc9ec2d790c350e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 8 Feb 2009 00:04:02 +0100 Subject: tracing/function-graph-tracer: drop the kernel_text_address check When the function graph tracer picks a return address, it ensures this address is really a kernel text one by calling __kernel_text_address() Actually this path has never been taken.Its role was more likely to debug the tracer on the beginning of its development but this function is wasteful since it is called for every traced function. The fault check is already sufficient. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 7 ------- kernel/extable.c | 4 ++-- kernel/module.c | 2 +- 3 files changed, 3 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index d74d75e0952d..18828aee8781 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -491,13 +491,6 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) return; } - if (unlikely(!__kernel_text_address(old))) { - ftrace_graph_stop(); - *parent = old; - WARN_ON(1); - return; - } - calltime = cpu_clock(raw_smp_processor_id()); if (push_return_trace(old, calltime, diff --git a/kernel/extable.c b/kernel/extable.c index e136ed8d82ba..0df6253730be 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -41,7 +41,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) return e; } -__notrace_funcgraph int core_kernel_text(unsigned long addr) +int core_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) @@ -54,7 +54,7 @@ __notrace_funcgraph int core_kernel_text(unsigned long addr) return 0; } -__notrace_funcgraph int __kernel_text_address(unsigned long addr) +int __kernel_text_address(unsigned long addr) { if (core_kernel_text(addr)) return 1; diff --git a/kernel/module.c b/kernel/module.c index ba22484a987e..22d7379709da 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2735,7 +2735,7 @@ int is_module_address(unsigned long addr) /* Is this a valid kernel address? */ -__notrace_funcgraph struct module *__module_text_address(unsigned long addr) +struct module *__module_text_address(unsigned long addr) { struct module *mod; -- cgit 1.4.1 From 966657883fdc3a2883a5e641ca4ec8f79ffb8ecd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Feb 2009 11:53:23 -0500 Subject: tracing, x86: fix constraint for parent variable The constraint used for retrieving and restoring the parent function pointer is incorrect. The parent variable is a pointer, and the address of the pointer is modified by the asm statement and not the pointer itself. It is incorrect to pass it in as an output constraint since the asm will never update the pointer. Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 18828aee8781..370bafaa43a3 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -468,8 +468,8 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) * ignore such a protection. */ asm volatile( - "1: " _ASM_MOV " (%[parent_old]), %[old]\n" - "2: " _ASM_MOV " %[return_hooker], (%[parent_replaced])\n" + "1: " _ASM_MOV " (%[parent]), %[old]\n" + "2: " _ASM_MOV " %[return_hooker], (%[parent])\n" " movl $0, %[faulted]\n" ".section .fixup, \"ax\"\n" @@ -479,9 +479,8 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) _ASM_EXTABLE(1b, 3b) _ASM_EXTABLE(2b, 3b) - : [parent_replaced] "=r" (parent), [old] "=r" (old), - [faulted] "=r" (faulted) - : [parent_old] "0" (parent), [return_hooker] "r" (return_hooker) + : [old] "=r" (old), [faulted] "=r" (faulted) + : [parent] "r" (parent), [return_hooker] "r" (return_hooker) : "memory" ); -- cgit 1.4.1 From 5a5fb7dbe88dd57dc2bef0f3be9da991e789612d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Feb 2009 10:53:37 -0500 Subject: preempt-count: force hardirq-count to max of 10 To add a bit in the preempt_count to be set when in NMI context, we found that some archs did not have enough bits to spare. This is due to the hardirq_count being a mask that can hold NR_IRQS. Some archs allow for over 16000 IRQs, and that would require a mask of 14 bits. The sofitrq mask is 8 bits and the preempt disable mask is also 8 bits. The PREEMP_ACTIVE bit is bit 30, and bit 31 would make the preempt_count (which is type int) a negative number. A negative preempt_count is a sign of failure. Add them up 14+8+8+1+1 you get 32 bits. No room for the NMI bit. But the hardirq_count is to track the number of nested IRQs, not the number of total IRQs. This originally took the paranoid approach of setting the max nesting to NR_IRQS. But when we have archs with over 1000 IRQs, it is not practical to think they will ever all nest on a single CPU. Not to mention that this would most definitely cause a stack overflow. This patch sets a max of 10 bits to be used for IRQ nesting. I did a 'git grep HARDIRQ' to examine all users of HARDIRQ_BITS and HARDIRQ_MASK, and found that making it a max of 10 would not hurt anyone. I did find that the m68k expected it to be 8 bits, so I allow for the archs to set the number to be less than 10. I removed the setting of HARDIRQ_BITS from the archs that set it to more than 10. This includes ALPHA, ia64 and avr32. This will always allow room for the NMI bit, and if we need to allow for NMI nesting, we have 4 bits to play with. Signed-off-by: Steven Rostedt --- arch/alpha/include/asm/hardirq.h | 13 ----------- arch/avr32/include/asm/hardirq.h | 11 --------- arch/ia64/include/asm/hardirq.h | 10 --------- include/linux/hardirq.h | 48 ++++++++++++++++++++-------------------- 4 files changed, 24 insertions(+), 58 deletions(-) (limited to 'arch') diff --git a/arch/alpha/include/asm/hardirq.h b/arch/alpha/include/asm/hardirq.h index d953e234daa8..88971460fa6c 100644 --- a/arch/alpha/include/asm/hardirq.h +++ b/arch/alpha/include/asm/hardirq.h @@ -14,17 +14,4 @@ typedef struct { void ack_bad_irq(unsigned int irq); -#define HARDIRQ_BITS 12 - -/* - * The hardirq mask has to be large enough to have - * space for potentially nestable IRQ sources in the system - * to nest on a single CPU. On Alpha, interrupts are masked at the CPU - * by IPL as well as at the system level. We only have 8 IPLs (UNIX PALcode) - * so we really only have 8 nestable IRQs, but allow some overhead - */ -#if (1 << HARDIRQ_BITS) < 16 -#error HARDIRQ_BITS is too low! -#endif - #endif /* _ALPHA_HARDIRQ_H */ diff --git a/arch/avr32/include/asm/hardirq.h b/arch/avr32/include/asm/hardirq.h index 267354356f60..015bc75ea798 100644 --- a/arch/avr32/include/asm/hardirq.h +++ b/arch/avr32/include/asm/hardirq.h @@ -20,15 +20,4 @@ void ack_bad_irq(unsigned int irq); #endif /* __ASSEMBLY__ */ -#define HARDIRQ_BITS 12 - -/* - * The hardirq mask has to be large enough to have - * space for potentially all IRQ sources in the system - * nesting on a single CPU: - */ -#if (1 << HARDIRQ_BITS) < NR_IRQS -# error HARDIRQ_BITS is too low! -#endif - #endif /* __ASM_AVR32_HARDIRQ_H */ diff --git a/arch/ia64/include/asm/hardirq.h b/arch/ia64/include/asm/hardirq.h index 140e495b8e0e..d514cd9edb49 100644 --- a/arch/ia64/include/asm/hardirq.h +++ b/arch/ia64/include/asm/hardirq.h @@ -20,16 +20,6 @@ #define local_softirq_pending() (local_cpu_data->softirq_pending) -#define HARDIRQ_BITS 14 - -/* - * The hardirq mask has to be large enough to have space for potentially all IRQ sources - * in the system nesting on a single CPU: - */ -#if (1 << HARDIRQ_BITS) < NR_IRQS -# error HARDIRQ_BITS is too low! -#endif - extern void __iomem *ipi_base_addr; void ack_bad_irq(unsigned int irq); diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index f3cf86e1465b..9841221f53f2 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -15,61 +15,61 @@ * - bits 0-7 are the preemption count (max preemption depth: 256) * - bits 8-15 are the softirq count (max # of softirqs: 256) * - * The hardirq count can be overridden per architecture, the default is: + * The hardirq count can in theory reach the same as NR_IRQS. + * In reality, the number of nested IRQS is limited to the stack + * size as well. For archs with over 1000 IRQS it is not practical + * to expect that they will all nest. We give a max of 10 bits for + * hardirq nesting. An arch may choose to give less than 10 bits. + * m68k expects it to be 8. * - * - bits 16-27 are the hardirq count (max # of hardirqs: 4096) - * - ( bit 28 is the PREEMPT_ACTIVE flag. ) + * - bits 16-25 are the hardirq count (max # of nested hardirqs: 1024) + * - bit 26 is the NMI_MASK + * - bit 28 is the PREEMPT_ACTIVE flag * * PREEMPT_MASK: 0x000000ff * SOFTIRQ_MASK: 0x0000ff00 - * HARDIRQ_MASK: 0x0fff0000 + * HARDIRQ_MASK: 0x03ff0000 + * NMI_MASK: 0x04000000 */ #define PREEMPT_BITS 8 #define SOFTIRQ_BITS 8 +#define NMI_BITS 1 -#ifndef HARDIRQ_BITS -#define HARDIRQ_BITS 12 +#define MAX_HARDIRQ_BITS 10 -#ifndef MAX_HARDIRQS_PER_CPU -#define MAX_HARDIRQS_PER_CPU NR_IRQS +#ifndef HARDIRQ_BITS +# define HARDIRQ_BITS MAX_HARDIRQ_BITS #endif -/* - * The hardirq mask has to be large enough to have space for potentially - * all IRQ sources in the system nesting on a single CPU. - */ -#if (1 << HARDIRQ_BITS) < MAX_HARDIRQS_PER_CPU -# error HARDIRQ_BITS is too low! -#endif +#if HARDIRQ_BITS > MAX_HARDIRQ_BITS +#error HARDIRQ_BITS too high! #endif #define PREEMPT_SHIFT 0 #define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) #define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS) +#define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS) #define __IRQ_MASK(x) ((1UL << (x))-1) #define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT) #define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT) #define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) +#define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT) #define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT) #define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT) #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) +#define NMI_OFFSET (1UL << NMI_SHIFT) -#if PREEMPT_ACTIVE < (1 << (HARDIRQ_SHIFT + HARDIRQ_BITS)) +#if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS)) #error PREEMPT_ACTIVE is too low! #endif -#define NMI_OFFSET (PREEMPT_ACTIVE << 1) - -#if NMI_OFFSET >= 0x80000000 -#error PREEMPT_ACTIVE too high! -#endif - #define hardirq_count() (preempt_count() & HARDIRQ_MASK) #define softirq_count() (preempt_count() & SOFTIRQ_MASK) -#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK)) +#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \ + | NMI_MASK)) /* * Are we doing bottom half or hardware interrupt processing? @@ -82,7 +82,7 @@ /* * Are we in NMI context? */ -#define in_nmi() (preempt_count() & NMI_OFFSET) +#define in_nmi() (preempt_count() & NMI_MASK) #if defined(CONFIG_PREEMPT) # define PREEMPT_INATOMIC_BASE kernel_locked() -- cgit 1.4.1 From b5f9fd0f8a05c9bafb91a9a85b9110938d8e585b Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Wed, 11 Feb 2009 13:57:25 -0500 Subject: tracing: convert c/p state power tracer to use tracepoints Convert the c/p state "power" tracer to use tracepoints. Avoids a function call when the tracer is disabled. Signed-off-by: Jason Baron Acked-by: Ingo Molnar Signed-off-by: Steven Rostedt --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 2 + arch/x86/kernel/process.c | 3 + include/trace/power.h | 25 ++--- kernel/trace/trace_power.c | 173 +++++++++++++++++------------ 4 files changed, 119 insertions(+), 84 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 7ed925edf4d2..c5d737cdb365 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -70,6 +70,8 @@ struct acpi_cpufreq_data { static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); +DEFINE_TRACE(power_mark); + /* acpi_perf_data is a pointer to percpu data. */ static struct acpi_processor_performance *acpi_perf_data; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 026819ffcb0c..e0d0fd7ab514 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -19,6 +19,9 @@ EXPORT_SYMBOL(idle_nomwait); struct kmem_cache *task_xstate_cachep; +DEFINE_TRACE(power_start); +DEFINE_TRACE(power_end); + int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { *dst = *src; diff --git a/include/trace/power.h b/include/trace/power.h index c7cefbcdaea4..2c733e58e89c 100644 --- a/include/trace/power.h +++ b/include/trace/power.h @@ -2,6 +2,7 @@ #define _TRACE_POWER_H #include +#include enum { POWER_NONE = 0, @@ -18,18 +19,16 @@ struct power_trace { #endif }; -#ifdef CONFIG_POWER_TRACER -extern void trace_power_start(struct power_trace *it, unsigned int type, - unsigned int state); -extern void trace_power_mark(struct power_trace *it, unsigned int type, - unsigned int state); -extern void trace_power_end(struct power_trace *it); -#else -static inline void trace_power_start(struct power_trace *it, unsigned int type, - unsigned int state) { } -static inline void trace_power_mark(struct power_trace *it, unsigned int type, - unsigned int state) { } -static inline void trace_power_end(struct power_trace *it) { } -#endif +DECLARE_TRACE(power_start, + TPPROTO(struct power_trace *it, unsigned int type, unsigned int state), + TPARGS(it, type, state)); + +DECLARE_TRACE(power_mark, + TPPROTO(struct power_trace *it, unsigned int type, unsigned int state), + TPARGS(it, type, state)); + +DECLARE_TRACE(power_end, + TPPROTO(struct power_trace *it), + TPARGS(it)); #endif /* _TRACE_POWER_H */ diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index b1d0d087d3a6..91ce672fb037 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -21,15 +21,116 @@ static struct trace_array *power_trace; static int __read_mostly trace_power_enabled; +static void probe_power_start(struct power_trace *it, unsigned int type, + unsigned int level) +{ + if (!trace_power_enabled) + return; + + memset(it, 0, sizeof(struct power_trace)); + it->state = level; + it->type = type; + it->stamp = ktime_get(); +} + + +static void probe_power_end(struct power_trace *it) +{ + struct ring_buffer_event *event; + struct trace_power *entry; + struct trace_array_cpu *data; + struct trace_array *tr = power_trace; + + if (!trace_power_enabled) + return; + + preempt_disable(); + it->end = ktime_get(); + data = tr->data[smp_processor_id()]; + + event = trace_buffer_lock_reserve(tr, TRACE_POWER, + sizeof(*entry), 0, 0); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + entry->state_data = *it; + trace_buffer_unlock_commit(tr, event, 0, 0); + out: + preempt_enable(); +} + +static void probe_power_mark(struct power_trace *it, unsigned int type, + unsigned int level) +{ + struct ring_buffer_event *event; + struct trace_power *entry; + struct trace_array_cpu *data; + struct trace_array *tr = power_trace; + + if (!trace_power_enabled) + return; + + memset(it, 0, sizeof(struct power_trace)); + it->state = level; + it->type = type; + it->stamp = ktime_get(); + preempt_disable(); + it->end = it->stamp; + data = tr->data[smp_processor_id()]; + + event = trace_buffer_lock_reserve(tr, TRACE_POWER, + sizeof(*entry), 0, 0); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + entry->state_data = *it; + trace_buffer_unlock_commit(tr, event, 0, 0); + out: + preempt_enable(); +} + +static int tracing_power_register(void) +{ + int ret; + + ret = register_trace_power_start(probe_power_start); + if (ret) { + pr_info("power trace: Couldn't activate tracepoint" + " probe to trace_power_start\n"); + return ret; + } + ret = register_trace_power_end(probe_power_end); + if (ret) { + pr_info("power trace: Couldn't activate tracepoint" + " probe to trace_power_end\n"); + goto fail_start; + } + ret = register_trace_power_mark(probe_power_mark); + if (ret) { + pr_info("power trace: Couldn't activate tracepoint" + " probe to trace_power_mark\n"); + goto fail_end; + } + return ret; +fail_end: + unregister_trace_power_end(probe_power_end); +fail_start: + unregister_trace_power_start(probe_power_start); + return ret; +} static void start_power_trace(struct trace_array *tr) { trace_power_enabled = 1; + tracing_power_register(); } static void stop_power_trace(struct trace_array *tr) { trace_power_enabled = 0; + unregister_trace_power_start(probe_power_start); + unregister_trace_power_end(probe_power_end); + unregister_trace_power_mark(probe_power_mark); } @@ -39,6 +140,7 @@ static int power_trace_init(struct trace_array *tr) power_trace = tr; trace_power_enabled = 1; + tracing_power_register(); for_each_cpu(cpu, cpu_possible_mask) tracing_reset(tr, cpu); @@ -95,74 +197,3 @@ static int init_power_trace(void) return register_tracer(&power_tracer); } device_initcall(init_power_trace); - -void trace_power_start(struct power_trace *it, unsigned int type, - unsigned int level) -{ - if (!trace_power_enabled) - return; - - memset(it, 0, sizeof(struct power_trace)); - it->state = level; - it->type = type; - it->stamp = ktime_get(); -} -EXPORT_SYMBOL_GPL(trace_power_start); - - -void trace_power_end(struct power_trace *it) -{ - struct ring_buffer_event *event; - struct trace_power *entry; - struct trace_array_cpu *data; - struct trace_array *tr = power_trace; - - if (!trace_power_enabled) - return; - - preempt_disable(); - it->end = ktime_get(); - data = tr->data[smp_processor_id()]; - - event = trace_buffer_lock_reserve(tr, TRACE_POWER, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->state_data = *it; - trace_buffer_unlock_commit(tr, event, 0, 0); - out: - preempt_enable(); -} -EXPORT_SYMBOL_GPL(trace_power_end); - -void trace_power_mark(struct power_trace *it, unsigned int type, - unsigned int level) -{ - struct ring_buffer_event *event; - struct trace_power *entry; - struct trace_array_cpu *data; - struct trace_array *tr = power_trace; - - if (!trace_power_enabled) - return; - - memset(it, 0, sizeof(struct power_trace)); - it->state = level; - it->type = type; - it->stamp = ktime_get(); - preempt_disable(); - it->end = it->stamp; - data = tr->data[smp_processor_id()]; - - event = trace_buffer_lock_reserve(tr, TRACE_POWER, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->state_data = *it; - trace_buffer_unlock_commit(tr, event, 0, 0); - out: - preempt_enable(); -} -EXPORT_SYMBOL_GPL(trace_power_mark); -- cgit 1.4.1 From 16239630974516a8879a3695ee9b4dc661f79f96 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Feb 2009 17:57:30 -0500 Subject: ftrace, x86: make kernel text writable only for conversions Impact: keep kernel text read only Because dynamic ftrace converts the calls to mcount into and out of nops at run time, we needed to always keep the kernel text writable. But this defeats the point of CONFIG_DEBUG_RODATA. This patch converts the kernel code to writable before ftrace modifies the text, and converts it back to read only afterward. The kernel text is converted to read/write, stop_machine is called to modify the code, then the kernel text is converted back to read only. The original version used SYSTEM_STATE to determine when it was OK or not to change the code to rw or ro. Andrew Morton pointed out that using SYSTEM_STATE is a bad idea since there is no guarantee to what its state will actually be. Instead, I moved the check into the set_kernel_text_* functions themselves, and use a local variable to determine when it is OK to change the kernel text RW permissions. [ Update: Ingo Molnar suggested moving the prototypes to cacheflush.h ] Reviewed-by: Andrew Morton Signed-off-by: Steven Rostedt --- arch/x86/include/asm/cacheflush.h | 5 +++++ arch/x86/kernel/ftrace.c | 13 +++++++++++++ arch/x86/mm/init_32.c | 35 ++++++++++++++++++++++++++++++++--- arch/x86/mm/init_64.c | 37 ++++++++++++++++++++++++++++++++----- 4 files changed, 82 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 2f8466540fb5..6145063cfe0e 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -104,6 +104,11 @@ void clflush_cache_range(void *addr, unsigned int size); #ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void); extern const int rodata_test_data; +void set_kernel_text_rw(void); +void set_kernel_text_ro(void); +#else +static inline void set_kernel_text_rw(void) { } +static inline void set_kernel_text_ro(void) { } #endif #ifdef CONFIG_DEBUG_RODATA_TEST diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 231bdd3c5b1c..77857d4f7d0f 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -26,6 +27,18 @@ #ifdef CONFIG_DYNAMIC_FTRACE +int ftrace_arch_code_modify_prepare(void) +{ + set_kernel_text_rw(); + return 0; +} + +int ftrace_arch_code_modify_post_process(void) +{ + set_kernel_text_ro(); + return 0; +} + union ftrace_code_union { char code[MCOUNT_INSN_SIZE]; struct { diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 2cef05074413..3eb2ed188a4c 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -1155,17 +1155,47 @@ static noinline int do_test_wp_bit(void) const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); +static int kernel_set_to_readonly; + +void set_kernel_text_rw(void) +{ + unsigned long start = PFN_ALIGN(_text); + unsigned long size = PFN_ALIGN(_etext) - start; + + if (!kernel_set_to_readonly) + return; + + pr_debug("Set kernel text: %lx - %lx for read write\n", + start, start+size); + + set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT); +} + +void set_kernel_text_ro(void) +{ + unsigned long start = PFN_ALIGN(_text); + unsigned long size = PFN_ALIGN(_etext) - start; + + if (!kernel_set_to_readonly) + return; + + pr_debug("Set kernel text: %lx - %lx for read only\n", + start, start+size); + + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); +} + void mark_rodata_ro(void) { unsigned long start = PFN_ALIGN(_text); unsigned long size = PFN_ALIGN(_etext) - start; -#ifndef CONFIG_DYNAMIC_FTRACE - /* Dynamic tracing modifies the kernel text section */ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); printk(KERN_INFO "Write protecting the kernel text: %luk\n", size >> 10); + kernel_set_to_readonly = 1; + #ifdef CONFIG_CPA_DEBUG printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", start, start+size); @@ -1174,7 +1204,6 @@ void mark_rodata_ro(void) printk(KERN_INFO "Testing CPA: write protecting again\n"); set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); #endif -#endif /* CONFIG_DYNAMIC_FTRACE */ start += size; size = (unsigned long)__end_rodata - start; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index e6d36b490250..63fdc531601d 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -986,21 +986,48 @@ void free_initmem(void) const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); +static int kernel_set_to_readonly; + +void set_kernel_text_rw(void) +{ + unsigned long start = PFN_ALIGN(_stext); + unsigned long end = PFN_ALIGN(__start_rodata); + + if (!kernel_set_to_readonly) + return; + + pr_debug("Set kernel text: %lx - %lx for read write\n", + start, end); + + set_memory_rw(start, (end - start) >> PAGE_SHIFT); +} + +void set_kernel_text_ro(void) +{ + unsigned long start = PFN_ALIGN(_stext); + unsigned long end = PFN_ALIGN(__start_rodata); + + if (!kernel_set_to_readonly) + return; + + pr_debug("Set kernel text: %lx - %lx for read only\n", + start, end); + + set_memory_ro(start, (end - start) >> PAGE_SHIFT); +} + void mark_rodata_ro(void) { unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); unsigned long rodata_start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; -#ifdef CONFIG_DYNAMIC_FTRACE - /* Dynamic tracing modifies the kernel text section */ - start = rodata_start; -#endif - printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); set_memory_ro(start, (end - start) >> PAGE_SHIFT); + kernel_set_to_readonly = 1; + /* * The rodata section (but not the kernel text!) should also be * not-executable. -- cgit 1.4.1 From 90c7ac49aa819feb9433b5310089fca6399881c0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Feb 2009 13:32:57 -0500 Subject: ftrace: immediately stop code modification if failure is detected Impact: fix to prevent NMI lockup If the page fault handler produces a WARN_ON in the modifying of text, and the system is setup to have a high frequency of NMIs, we can lock up the system on a failure to modify code. The modifying of code with NMIs allows all NMIs to modify the code if it is about to run. This prevents a modifier on one CPU from modifying code running in NMI context on another CPU. The modifying is done through stop_machine, so only NMIs must be considered. But if the write causes the page fault handler to produce a warning, the print can slow it down enough that as soon as it is done it will take another NMI before going back to the process context. The new NMI will perform the write again causing another print and this will hang the box. This patch turns off the writing as soon as a failure is detected and does not wait for it to be turned off by the process context. This will keep NMIs from getting stuck in this back and forth of print outs. Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 77857d4f7d0f..c56d73894322 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -124,6 +124,10 @@ static void ftrace_mod_code(void) */ mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode, MCOUNT_INSN_SIZE); + + /* if we fail, then kill any new writers */ + if (mod_code_status) + mod_code_write = 0; } void ftrace_nmi_enter(void) -- cgit 1.4.1 From 499aa86dcbc3c4daf7d2c59c5c30e1a78220fbc1 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Tue, 24 Feb 2009 14:12:34 +0100 Subject: x86, ptrace: remove CONFIG guards around declarations Remove unnecessary CONFIG guards around type declarations and macro definitions. Reported-by: Cyrill Gorcunov Signed-off-by: Markus Metzger Cc: markus.t.metzger@gmail.com Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ptrace-abi.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/ptrace-abi.h b/arch/x86/include/asm/ptrace-abi.h index 8e0f8d199e05..86723035a515 100644 --- a/arch/x86/include/asm/ptrace-abi.h +++ b/arch/x86/include/asm/ptrace-abi.h @@ -80,8 +80,6 @@ #define PTRACE_SINGLEBLOCK 33 /* resume execution until next branch */ -#ifdef CONFIG_X86_PTRACE_BTS - #ifndef __ASSEMBLY__ #include @@ -140,6 +138,5 @@ struct ptrace_bts_config { BTS records are read from oldest to newest. Returns number of BTS records drained. */ -#endif /* CONFIG_X86_PTRACE_BTS */ #endif /* _ASM_X86_PTRACE_ABI_H */ -- cgit 1.4.1 From c79a61f55773d2519fd0525bf58385f7d20752d3 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-Koenig Date: Fri, 27 Feb 2009 21:30:03 +0100 Subject: tracing: make CALLER_ADDRx overwriteable The current definition of CALLER_ADDRx isn't suitable for all platforms. E.g. for ARM __builtin_return_address(N) doesn't work for N > 0 and AFAIK for powerpc there are no frame pointers needed to have a working __builtin_return_address. This patch allows defining the CALLER_ADDRx macros in and let these take precedence. Because now is included unconditionally in all archs that don't already had this include get an empty one for free. Signed-off-by: Uwe Kleine-Koenig Cc: Peter Zijlstra Cc: Ingo Molnar Reviewed-by: KOSAKI Motohiro Signed-off-by: Steven Rostedt --- arch/alpha/include/asm/ftrace.h | 1 + arch/avr32/include/asm/ftrace.h | 1 + arch/blackfin/include/asm/ftrace.h | 1 + arch/cris/include/asm/ftrace.h | 1 + arch/h8300/include/asm/ftrace.h | 1 + arch/m68k/include/asm/ftrace.h | 1 + arch/mips/include/asm/ftrace.h | 1 + arch/parisc/include/asm/ftrace.h | 1 + arch/um/include/asm/ftrace.h | 1 + arch/xtensa/include/asm/ftrace.h | 1 + include/asm-frv/ftrace.h | 1 + include/asm-m32r/ftrace.h | 1 + include/asm-mn10300/ftrace.h | 1 + include/linux/ftrace.h | 41 +++++++++++++++++++------------------- 14 files changed, 34 insertions(+), 20 deletions(-) create mode 100644 arch/alpha/include/asm/ftrace.h create mode 100644 arch/avr32/include/asm/ftrace.h create mode 100644 arch/blackfin/include/asm/ftrace.h create mode 100644 arch/cris/include/asm/ftrace.h create mode 100644 arch/h8300/include/asm/ftrace.h create mode 100644 arch/m68k/include/asm/ftrace.h create mode 100644 arch/mips/include/asm/ftrace.h create mode 100644 arch/parisc/include/asm/ftrace.h create mode 100644 arch/um/include/asm/ftrace.h create mode 100644 arch/xtensa/include/asm/ftrace.h create mode 100644 include/asm-frv/ftrace.h create mode 100644 include/asm-m32r/ftrace.h create mode 100644 include/asm-mn10300/ftrace.h (limited to 'arch') diff --git a/arch/alpha/include/asm/ftrace.h b/arch/alpha/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/alpha/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/avr32/include/asm/ftrace.h b/arch/avr32/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/avr32/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/blackfin/include/asm/ftrace.h b/arch/blackfin/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/blackfin/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/cris/include/asm/ftrace.h b/arch/cris/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/cris/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/h8300/include/asm/ftrace.h b/arch/h8300/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/h8300/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/m68k/include/asm/ftrace.h b/arch/m68k/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/m68k/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/mips/include/asm/ftrace.h b/arch/mips/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/mips/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/parisc/include/asm/ftrace.h b/arch/parisc/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/parisc/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/um/include/asm/ftrace.h b/arch/um/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/um/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/xtensa/include/asm/ftrace.h b/arch/xtensa/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/xtensa/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/include/asm-frv/ftrace.h b/include/asm-frv/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/include/asm-frv/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/include/asm-m32r/ftrace.h b/include/asm-m32r/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/include/asm-m32r/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/include/asm-mn10300/ftrace.h b/include/asm-mn10300/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/include/asm-mn10300/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 847bb3c48dd0..1f69ac7c1587 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -11,6 +11,8 @@ #include #include +#include + #ifdef CONFIG_FUNCTION_TRACER extern int ftrace_enabled; @@ -103,8 +105,6 @@ struct ftrace_func_command { }; #ifdef CONFIG_DYNAMIC_FTRACE -/* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ -#include int ftrace_arch_code_modify_prepare(void); int ftrace_arch_code_modify_post_process(void); @@ -282,24 +282,25 @@ static inline void __ftrace_enabled_restore(int enabled) #endif } -#ifdef CONFIG_FRAME_POINTER -/* TODO: need to fix this for ARM */ -# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) -# define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1)) -# define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2)) -# define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3)) -# define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4)) -# define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5)) -# define CALLER_ADDR6 ((unsigned long)__builtin_return_address(6)) -#else -# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) -# define CALLER_ADDR1 0UL -# define CALLER_ADDR2 0UL -# define CALLER_ADDR3 0UL -# define CALLER_ADDR4 0UL -# define CALLER_ADDR5 0UL -# define CALLER_ADDR6 0UL -#endif +#ifndef HAVE_ARCH_CALLER_ADDR +# ifdef CONFIG_FRAME_POINTER +# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +# define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1)) +# define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2)) +# define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3)) +# define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4)) +# define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5)) +# define CALLER_ADDR6 ((unsigned long)__builtin_return_address(6)) +# else +# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +# define CALLER_ADDR1 0UL +# define CALLER_ADDR2 0UL +# define CALLER_ADDR3 0UL +# define CALLER_ADDR4 0UL +# define CALLER_ADDR5 0UL +# define CALLER_ADDR6 0UL +# endif +#endif /* ifndef HAVE_ARCH_CALLER_ADDR */ #ifdef CONFIG_IRQSOFF_TRACER extern void time_hardirqs_on(unsigned long a0, unsigned long a1); -- cgit 1.4.1 From 0012693ad4f636c720fed3802027f9427962f540 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 5 Mar 2009 01:49:22 +0100 Subject: tracing/function-graph-tracer: use the more lightweight local clock Impact: decrease hangs risks with the graph tracer on slow systems Since the function graph tracer can spend too much time on timer interrupts, it's better now to use the more lightweight local clock. Anyway, the function graph traces are more reliable on a per cpu trace. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Peter Zijlstra LKML-Reference: <49af243d.06e9300a.53ad.ffff840c@mx.google.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 2 +- include/linux/ftrace.h | 13 +++++++------ kernel/trace/trace_functions_graph.c | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 3925ec0184b1..a85da1764b1c 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -436,7 +436,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) return; } - calltime = cpu_clock(raw_smp_processor_id()); + calltime = trace_clock_local(); if (ftrace_push_return_trace(old, calltime, self_addr, &trace.depth) == -EBUSY) { diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1f69ac7c1587..6ea62acbe4b6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1,15 +1,16 @@ #ifndef _LINUX_FTRACE_H #define _LINUX_FTRACE_H -#include -#include -#include -#include -#include -#include +#include #include +#include #include +#include +#include #include +#include +#include +#include #include diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index c009553a8e81..e527f2f66c73 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -112,7 +112,7 @@ unsigned long ftrace_return_to_handler(void) unsigned long ret; ftrace_pop_return_trace(&trace, &ret); - trace.rettime = cpu_clock(raw_smp_processor_id()); + trace.rettime = trace_clock_local(); ftrace_graph_return(&trace); if (unlikely(!ret)) { -- cgit 1.4.1 From 40ada30f9621fbd831ac2437b9a2a399aad34b00 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Mar 2009 21:19:55 +0100 Subject: tracing: clean up menu Clean up menu structure, introduce TRACING_SUPPORT switch that signals whether an architecture supports various instrumentation mechanisms. Signed-off-by: Ingo Molnar --- arch/Kconfig | 1 + kernel/trace/Kconfig | 29 ++++++++++++++++------------- 2 files changed, 17 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/Kconfig b/arch/Kconfig index 550dab22daa1..a092dc77c24d 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -6,6 +6,7 @@ config OPROFILE tristate "OProfile system profiling (EXPERIMENTAL)" depends on PROFILING depends on HAVE_OPROFILE + depends on TRACING_SUPPORT select TRACING select RING_BUFFER help diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 999c6a2485df..5d733da5345a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -53,12 +53,22 @@ config TRACING select TRACEPOINTS select NOP_TRACER +# +# Minimum requirements an architecture has to meet for us to +# be able to offer generic tracing facilities: +# +config TRACING_SUPPORT + bool + depends on TRACE_IRQFLAGS_SUPPORT + depends on STACKTRACE_SUPPORT + +if TRACING_SUPPORT + menu "Tracers" config FUNCTION_TRACER bool "Kernel Function Tracer" depends on HAVE_FUNCTION_TRACER - depends on DEBUG_KERNEL select FRAME_POINTER select KALLSYMS select TRACING @@ -91,7 +101,6 @@ config IRQSOFF_TRACER default n depends on TRACE_IRQFLAGS_SUPPORT depends on GENERIC_TIME - depends on DEBUG_KERNEL select TRACE_IRQFLAGS select TRACING select TRACER_MAX_TRACE @@ -114,7 +123,6 @@ config PREEMPT_TRACER default n depends on GENERIC_TIME depends on PREEMPT - depends on DEBUG_KERNEL select TRACING select TRACER_MAX_TRACE help @@ -142,7 +150,6 @@ config SYSPROF_TRACER config SCHED_TRACER bool "Scheduling Latency Tracer" - depends on DEBUG_KERNEL select TRACING select CONTEXT_SWITCH_TRACER select TRACER_MAX_TRACE @@ -152,7 +159,6 @@ config SCHED_TRACER config CONTEXT_SWITCH_TRACER bool "Trace process context switches" - depends on DEBUG_KERNEL select TRACING select MARKERS help @@ -161,7 +167,6 @@ config CONTEXT_SWITCH_TRACER config EVENT_TRACER bool "Trace various events in the kernel" - depends on DEBUG_KERNEL select TRACING help This tracer hooks to various trace points in the kernel @@ -170,7 +175,6 @@ config EVENT_TRACER config BOOT_TRACER bool "Trace boot initcalls" - depends on DEBUG_KERNEL select TRACING select CONTEXT_SWITCH_TRACER help @@ -188,7 +192,6 @@ config BOOT_TRACER config TRACE_BRANCH_PROFILING bool "Trace likely/unlikely profiler" - depends on DEBUG_KERNEL select TRACING help This tracer profiles all the the likely and unlikely macros @@ -241,7 +244,6 @@ config BRANCH_TRACER config POWER_TRACER bool "Trace power consumption behavior" - depends on DEBUG_KERNEL depends on X86 select TRACING help @@ -253,7 +255,6 @@ config POWER_TRACER config STACK_TRACER bool "Trace max stack" depends on HAVE_FUNCTION_TRACER - depends on DEBUG_KERNEL select FUNCTION_TRACER select STACKTRACE select KALLSYMS @@ -343,7 +344,6 @@ config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FUNCTION_TRACER depends on HAVE_DYNAMIC_FTRACE - depends on DEBUG_KERNEL default y help This option will modify all the calls to ftrace dynamically @@ -369,7 +369,7 @@ config FTRACE_SELFTEST config FTRACE_STARTUP_TEST bool "Perform a startup test on ftrace" - depends on TRACING && DEBUG_KERNEL + depends on TRACING select FTRACE_SELFTEST help This option performs a series of startup tests on ftrace. On bootup @@ -379,7 +379,7 @@ config FTRACE_STARTUP_TEST config MMIOTRACE bool "Memory mapped IO tracing" - depends on HAVE_MMIOTRACE_SUPPORT && DEBUG_KERNEL && PCI + depends on HAVE_MMIOTRACE_SUPPORT && PCI select TRACING help Mmiotrace traces Memory Mapped I/O access and is meant for @@ -401,3 +401,6 @@ config MMIOTRACE_TEST Say N, unless you absolutely know what you are doing. endmenu + +endif # TRACING_SUPPORT + -- cgit 1.4.1 From 3945dab45aa8c89014893bfa8eb1e1661a409cef Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 6 Mar 2009 10:37:22 -0500 Subject: tracing, Text Edit Lock - SMP alternatives support Use the mutual exclusion provided by the text edit lock in alternatives code. Since alternative_smp_* will be called from module init code, etc, we'd better protect it from other subsystems. Signed-off-by: Masami Hiramatsu LKML-Reference: <49B14332.9030109@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 4c80f1557433..092a7b8be68d 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -226,6 +227,7 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) { u8 **ptr; + mutex_lock(&text_mutex); for (ptr = start; ptr < end; ptr++) { if (*ptr < text) continue; @@ -234,6 +236,7 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) /* turn DS segment override prefix into lock prefix */ text_poke(*ptr, ((unsigned char []){0xf0}), 1); }; + mutex_unlock(&text_mutex); } static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) @@ -243,6 +246,7 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end if (noreplace_smp) return; + mutex_lock(&text_mutex); for (ptr = start; ptr < end; ptr++) { if (*ptr < text) continue; @@ -251,6 +255,7 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end /* turn lock prefix into DS segment override prefix */ text_poke(*ptr, ((unsigned char []){0x3E}), 1); }; + mutex_unlock(&text_mutex); } struct smp_alt_module { -- cgit 1.4.1 From 78ff7fae04554b49d29226ed12536268c2500d1f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 6 Mar 2009 10:37:54 -0500 Subject: x86: implement atomic text_poke() via fixmap Use fixmaps instead of vmap/vunmap in text_poke() for avoiding page allocation and delayed unmapping. At the result of above change, text_poke() becomes atomic and can be called from stop_machine() etc. Signed-off-by: Masami Hiramatsu Acked-by: Mathieu Desnoyers LKML-Reference: <49B14352.2040705@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fixmap.h | 2 ++ arch/x86/kernel/alternative.c | 24 +++++++++++++++--------- 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 63a79c77d220..81937a5dc77c 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -111,6 +111,8 @@ enum fixed_addresses { #ifdef CONFIG_PARAVIRT FIX_PARAVIRT_BOOTMAP, #endif + FIX_TEXT_POKE0, /* reserve 2 pages for text_poke() */ + FIX_TEXT_POKE1, __end_of_permanent_fixed_addresses, #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT FIX_OHCI1394_BASE, diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 092a7b8be68d..2d903b760ddb 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -13,7 +13,9 @@ #include #include #include +#include #include +#include #define MAX_PATCH_LEN (255-1) @@ -505,15 +507,16 @@ void *text_poke_early(void *addr, const void *opcode, size_t len) * It means the size must be writable atomically and the address must be aligned * in a way that permits an atomic write. It also makes sure we fit on a single * page. + * + * Note: Must be called under text_mutex. */ void *__kprobes text_poke(void *addr, const void *opcode, size_t len) { + unsigned long flags; char *vaddr; - int nr_pages = 2; struct page *pages[2]; int i; - might_sleep(); if (!core_kernel_text((unsigned long)addr)) { pages[0] = vmalloc_to_page(addr); pages[1] = vmalloc_to_page(addr + PAGE_SIZE); @@ -523,14 +526,17 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len) pages[1] = virt_to_page(addr + PAGE_SIZE); } BUG_ON(!pages[0]); - if (!pages[1]) - nr_pages = 1; - vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); - BUG_ON(!vaddr); - local_irq_disable(); + set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0])); + if (pages[1]) + set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1])); + vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0); + local_irq_save(flags); memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len); - local_irq_enable(); - vunmap(vaddr); + local_irq_restore(flags); + clear_fixmap(FIX_TEXT_POKE0); + if (pages[1]) + clear_fixmap(FIX_TEXT_POKE1); + local_flush_tlb(); sync_core(); /* Could also do a CLFLUSH here to speed up CPU recovery; but that causes hangs on some VIA CPUs. */ -- cgit 1.4.1 From 7cf49427042400d40bdc80b5c3399b6b5945afa8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 9 Mar 2009 12:40:40 -0400 Subject: x86: expand irq-off region in text_poke() Expand irq-off region to cover fixmap using code and cache synchronizing. Signed-off-by: Masami Hiramatsu LKML-Reference: <49B54688.8090403@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 2d903b760ddb..f57658702571 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -526,13 +526,12 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len) pages[1] = virt_to_page(addr + PAGE_SIZE); } BUG_ON(!pages[0]); + local_irq_save(flags); set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0])); if (pages[1]) set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1])); vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0); - local_irq_save(flags); memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len); - local_irq_restore(flags); clear_fixmap(FIX_TEXT_POKE0); if (pages[1]) clear_fixmap(FIX_TEXT_POKE1); @@ -542,5 +541,6 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len) that causes hangs on some VIA CPUs. */ for (i = 0; i < len; i++) BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]); + local_irq_restore(flags); return addr; } -- cgit 1.4.1 From 1b3fa2ce64363c289b3b14723cca7290bf91cfce Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 7 Mar 2009 05:53:00 +0100 Subject: tracing/x86: basic implementation of syscall tracing for x86 Provide the x86 trace callbacks to trace syscalls. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Cc: Lai Jiangshan LKML-Reference: <1236401580-5758-3-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/include/asm/thread_info.h | 9 ++++++--- arch/x86/kernel/ptrace.c | 7 +++++++ 3 files changed, 14 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index bdcee12c25ab..b0a638b4199a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -35,6 +35,7 @@ config X86 select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE + select HAVE_FTRACE_SYSCALLS select HAVE_KVM select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index df9d5f78385e..8820a73ae090 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -94,6 +94,7 @@ struct thread_info { #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ +#define TIF_SYSCALL_FTRACE 27 /* for ftrace syscall instrumentation */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -115,15 +116,17 @@ struct thread_info { #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) #define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) +#define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE) /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | \ + (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_FTRACE | \ _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP) /* work to do in syscall_trace_leave() */ #define _TIF_WORK_SYSCALL_EXIT \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP) + (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \ + _TIF_SYSCALL_FTRACE) /* work to do on interrupt/exception return */ #define _TIF_WORK_MASK \ @@ -132,7 +135,7 @@ struct thread_info { _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU)) /* work to do on any return to user space */ -#define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP) +#define _TIF_ALLWORK_MASK ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_FTRACE) /* Only used for 64 bit */ #define _TIF_DO_NOTIFY_MASK \ diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 3d9672e59c16..99749d6e87a8 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -1416,6 +1417,9 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs) tracehook_report_syscall_entry(regs)) ret = -1L; + if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) + ftrace_syscall_enter(regs); + if (unlikely(current->audit_context)) { if (IS_IA32) audit_syscall_entry(AUDIT_ARCH_I386, @@ -1439,6 +1443,9 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs) if (unlikely(current->audit_context)) audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); + if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) + ftrace_syscall_exit(regs); + if (test_thread_flag(TIF_SYSCALL_TRACE)) tracehook_report_syscall_exit(regs, 0); -- cgit 1.4.1 From f58ba100678f421bdcb000a3c71793f432dfab93 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Mar 2009 15:42:12 +0100 Subject: tracing/syscalls: support for syscalls tracing on x86 Extend x86 architecture syscall tracing support with syscall metadata table details. (The upcoming core syscall tracing modifications rely on this.) Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <1236955332-10133-3-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ftrace.h | 7 +++++ arch/x86/kernel/ftrace.c | 63 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index db24c2278be0..bd2c6511c887 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -28,6 +28,13 @@ #endif +/* FIXME: I don't want to stay hardcoded */ +#ifdef CONFIG_X86_64 +# define FTRACE_SYSCALL_MAX 296 +#else +# define FTRACE_SYSCALL_MAX 333 +#endif + #ifdef CONFIG_FUNCTION_TRACER #define MCOUNT_ADDR ((long)(mcount)) #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index a85da1764b1c..1d0d7f42efe3 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -453,3 +453,66 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) } } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +#ifdef CONFIG_FTRACE_SYSCALLS + +extern unsigned long __start_syscalls_metadata[]; +extern unsigned long __stop_syscalls_metadata[]; +extern unsigned long *sys_call_table; + +static struct syscall_metadata **syscalls_metadata; + +static struct syscall_metadata *find_syscall_meta(unsigned long *syscall) +{ + struct syscall_metadata *start; + struct syscall_metadata *stop; + char str[KSYM_SYMBOL_LEN]; + + + start = (struct syscall_metadata *)__start_syscalls_metadata; + stop = (struct syscall_metadata *)__stop_syscalls_metadata; + kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str); + + for ( ; start < stop; start++) { + if (start->name && !strcmp(start->name, str)) + return start; + } + return NULL; +} + +struct syscall_metadata *syscall_nr_to_meta(int nr) +{ + if (!syscalls_metadata || nr >= FTRACE_SYSCALL_MAX || nr < 0) + return NULL; + + return syscalls_metadata[nr]; +} + +void arch_init_ftrace_syscalls(void) +{ + int i; + struct syscall_metadata *meta; + unsigned long **psys_syscall_table = &sys_call_table; + static atomic_t refs; + + if (atomic_inc_return(&refs) != 1) + goto end; + + syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * + FTRACE_SYSCALL_MAX, GFP_KERNEL); + if (!syscalls_metadata) { + WARN_ON(1); + return; + } + + for (i = 0; i < FTRACE_SYSCALL_MAX; i++) { + meta = find_syscall_meta(psys_syscall_table[i]); + syscalls_metadata[i] = meta; + } + return; + + /* Paranoid: avoid overflow */ +end: + atomic_dec(&refs); +} +#endif -- cgit 1.4.1 From ccd50dfd92ea2c4ba9e39531ac55db53393e783e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Mar 2009 17:02:17 +0100 Subject: tracing/syscalls: support for syscalls tracing on x86, fix Impact: build fix kernel/built-in.o: In function `ftrace_syscall_exit': (.text+0x76667): undefined reference to `syscall_nr_to_meta' ftrace.o is built: obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o But now a CONFIG_FTRACE_SYSCALLS dependency is needed too. Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: Lai Jiangshan LKML-Reference: <1236401580-5758-3-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 339ce35648e6..84000eb931ff 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -66,7 +66,8 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o -obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o +obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o +obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o -- cgit 1.4.1 From e9d9df44736d116726f4596f7e2f9ce2764ffc0a Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 18 Mar 2009 16:42:57 +0800 Subject: ftrace: protect running nmi (V3) When I review the sensitive code ftrace_nmi_enter(), I found the atomic variable nmi_running does protect NMI VS do_ftrace_mod_code(), but it can not protects NMI(entered nmi) VS NMI(ftrace_nmi_enter()). cpu#1 | cpu#2 | cpu#3 ftrace_nmi_enter() | do_ftrace_mod_code() | not modify | | ------------------------|-----------------------|-- executing | set mod_code_write = 1| executing --|-----------------------|-------------------- executing | | ftrace_nmi_enter() executing | | do modify ------------------------|-----------------------|----------------- ftrace_nmi_exit() | | cpu#3 may be being modified the code which is still being executed on cpu#1, it will have undefined results and possibly take a GPF, this patch prevents it occurred. Signed-off-by: Lai Jiangshan LKML-Reference: <49C0B411.30003@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 63 ++++++++++++++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 23 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 1d0d7f42efe3..57b33edb7ce3 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -79,11 +79,11 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) * * 1) Put the instruction pointer into the IP buffer * and the new code into the "code" buffer. - * 2) Set a flag that says we are modifying code - * 3) Wait for any running NMIs to finish. - * 4) Write the code - * 5) clear the flag. - * 6) Wait for any running NMIs to finish. + * 2) Wait for any running NMIs to finish and set a flag that says + * we are modifying code, it is done in an atomic operation. + * 3) Write the code + * 4) clear the flag. + * 5) Wait for any running NMIs to finish. * * If an NMI is executed, the first thing it does is to call * "ftrace_nmi_enter". This will check if the flag is set to write @@ -95,9 +95,9 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) * are the same as what exists. */ +#define MOD_CODE_WRITE_FLAG (1 << 31) /* set when NMI should do the write */ static atomic_t nmi_running = ATOMIC_INIT(0); static int mod_code_status; /* holds return value of text write */ -static int mod_code_write; /* set when NMI should do the write */ static void *mod_code_ip; /* holds the IP to write to */ static void *mod_code_newcode; /* holds the text to write to the IP */ @@ -114,6 +114,20 @@ int ftrace_arch_read_dyn_info(char *buf, int size) return r; } +static void clear_mod_flag(void) +{ + int old = atomic_read(&nmi_running); + + for (;;) { + int new = old & ~MOD_CODE_WRITE_FLAG; + + if (old == new) + break; + + old = atomic_cmpxchg(&nmi_running, old, new); + } +} + static void ftrace_mod_code(void) { /* @@ -127,27 +141,39 @@ static void ftrace_mod_code(void) /* if we fail, then kill any new writers */ if (mod_code_status) - mod_code_write = 0; + clear_mod_flag(); } void ftrace_nmi_enter(void) { - atomic_inc(&nmi_running); - /* Must have nmi_running seen before reading write flag */ - smp_mb(); - if (mod_code_write) { + if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { + smp_rmb(); ftrace_mod_code(); atomic_inc(&nmi_update_count); } + /* Must have previous changes seen before executions */ + smp_mb(); } void ftrace_nmi_exit(void) { /* Finish all executions before clearing nmi_running */ - smp_wmb(); + smp_mb(); atomic_dec(&nmi_running); } +static void wait_for_nmi_and_set_mod_flag(void) +{ + if (!atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG)) + return; + + do { + cpu_relax(); + } while (atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG)); + + nmi_wait_count++; +} + static void wait_for_nmi(void) { if (!atomic_read(&nmi_running)) @@ -167,14 +193,9 @@ do_ftrace_mod_code(unsigned long ip, void *new_code) mod_code_newcode = new_code; /* The buffers need to be visible before we let NMIs write them */ - smp_wmb(); - - mod_code_write = 1; - - /* Make sure write bit is visible before we wait on NMIs */ smp_mb(); - wait_for_nmi(); + wait_for_nmi_and_set_mod_flag(); /* Make sure all running NMIs have finished before we write the code */ smp_mb(); @@ -182,13 +203,9 @@ do_ftrace_mod_code(unsigned long ip, void *new_code) ftrace_mod_code(); /* Make sure the write happens before clearing the bit */ - smp_wmb(); - - mod_code_write = 0; - - /* make sure NMIs see the cleared bit */ smp_mb(); + clear_mod_flag(); wait_for_nmi(); return mod_code_status; -- cgit 1.4.1 From 5d1a03dc541dc6672e60e57249ed22f40654ca47 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 23 Mar 2009 23:38:49 -0400 Subject: function-graph: moved the timestamp from arch to generic code This patch move the timestamp from happening in the arch specific code into the general code. This allows for better control by the tracer to time manipulation. Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 6 +----- include/linux/ftrace.h | 3 +-- kernel/trace/trace_functions_graph.c | 8 +++++--- 3 files changed, 7 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 57b33edb7ce3..61df77532120 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -410,7 +410,6 @@ int ftrace_disable_ftrace_graph_caller(void) void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) { unsigned long old; - unsigned long long calltime; int faulted; struct ftrace_graph_ent trace; unsigned long return_hooker = (unsigned long) @@ -453,10 +452,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) return; } - calltime = trace_clock_local(); - - if (ftrace_push_return_trace(old, calltime, - self_addr, &trace.depth) == -EBUSY) { + if (ftrace_push_return_trace(old, self_addr, &trace.depth) == -EBUSY) { *parent = old; return; } diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index db3fed630db3..1141248c84ee 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -369,8 +369,7 @@ struct ftrace_ret_stack { extern void return_to_handler(void); extern int -ftrace_push_return_trace(unsigned long ret, unsigned long long time, - unsigned long func, int *depth); +ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth); extern void ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index e876816fa8e7..d28687e7b3a7 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -57,9 +57,9 @@ static struct tracer_flags tracer_flags = { /* Add a function return address to the trace stack on thread info.*/ int -ftrace_push_return_trace(unsigned long ret, unsigned long long time, - unsigned long func, int *depth) +ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) { + unsigned long long calltime; int index; if (!current->ret_stack) @@ -71,11 +71,13 @@ ftrace_push_return_trace(unsigned long ret, unsigned long long time, return -EBUSY; } + calltime = trace_clock_local(); + index = ++current->curr_ret_stack; barrier(); current->ret_stack[index].ret = ret; current->ret_stack[index].func = func; - current->ret_stack[index].calltime = time; + current->ret_stack[index].calltime = calltime; *depth = index; return 0; -- cgit 1.4.1 From fee039a1d05c6e0f71b0fe270d847742a02d56c4 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 23 Mar 2009 10:14:52 -0400 Subject: x86: kretprobe-booster interrupt emulation code fix Fix interrupt emulation code in kretprobe-booster according to pt_regs update (es/ds change and gs adding). This issue has been reported on systemtap-bugzilla: http://sources.redhat.com/bugzilla/show_bug.cgi?id=9965 | On a -tip kernel on x86_32, kretprobe_example (from samples) triggers the | following backtrace when its retprobing a class of functions that cause a | copy_from/to_user(). | | BUG: sleeping function called from invalid context at mm/memory.c:3196 | in_atomic(): 0, irqs_disabled(): 1, pid: 2286, name: cat Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Tested-by: Bharata B Rao Cc: systemtap-ml LKML-Reference: <49C7995C.2010601@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 4558dd3918cf..759095d53a06 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -638,13 +638,13 @@ static void __used __kprobes kretprobe_trampoline_holder(void) #else " pushf\n" /* - * Skip cs, ip, orig_ax. + * Skip cs, ip, orig_ax and gs. * trampoline_handler() will plug in these values */ - " subl $12, %esp\n" + " subl $16, %esp\n" " pushl %fs\n" - " pushl %ds\n" " pushl %es\n" + " pushl %ds\n" " pushl %eax\n" " pushl %ebp\n" " pushl %edi\n" @@ -655,10 +655,10 @@ static void __used __kprobes kretprobe_trampoline_holder(void) " movl %esp, %eax\n" " call trampoline_handler\n" /* Move flags to cs */ - " movl 52(%esp), %edx\n" - " movl %edx, 48(%esp)\n" + " movl 56(%esp), %edx\n" + " movl %edx, 52(%esp)\n" /* Replace saved flags with true return address. */ - " movl %eax, 52(%esp)\n" + " movl %eax, 56(%esp)\n" " popl %ebx\n" " popl %ecx\n" " popl %edx\n" @@ -666,8 +666,8 @@ static void __used __kprobes kretprobe_trampoline_holder(void) " popl %edi\n" " popl %ebp\n" " popl %eax\n" - /* Skip ip, orig_ax, es, ds, fs */ - " addl $20, %esp\n" + /* Skip ds, es, fs, gs, orig_ax and ip */ + " addl $24, %esp\n" " popf\n" #endif " ret\n"); @@ -691,6 +691,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) regs->cs = __KERNEL_CS; #else regs->cs = __KERNEL_CS | get_kernel_rpl(); + regs->gs = 0; #endif regs->ip = trampoline_address; regs->orig_ax = ~0UL; -- cgit 1.4.1 From a095bdbb136f7bed96b7adf5aa1dd27bb2f839bf Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Fri, 27 Mar 2009 23:08:34 +1100 Subject: tracing, powerpc: fix powerpc tree and tracing tree interaction Today's linux-next build (powerpc allyesconfig) failed like this: arch/powerpc/kernel/ftrace.c: In function 'prepare_ftrace_return': arch/powerpc/kernel/ftrace.c:612: warning: passing argument 3 of 'ftrace_push_return_trace' makes pointer from integer without a cast arch/powerpc/kernel/ftrace.c:612: error: too many arguments to function 'ftrace_push_return_trace' Caused by commit 5d1a03dc541dc6672e60e57249ed22f40654ca47 ("function-graph: moved the timestamp from arch to generic code") from the tracing tree which (removed an argument from ftrace_push_return_trace()) interacting with commit 6794c78243bfda020ab184d6d578944f8e90d26c ("powerpc64: port of the function graph tracer") from the powerpc tree. Signed-off-by: Stephen Rothwell Cc: Steven Rostedt Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: LKML-Reference: <20090327230834.93d0221d.sfr@canb.auug.org.au> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/ftrace.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index 5b5d16b2cac8..5455943f16aa 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -557,7 +557,6 @@ extern void mod_return_to_handler(void); void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) { unsigned long old; - unsigned long long calltime; int faulted; struct ftrace_graph_ent trace; unsigned long return_hooker = (unsigned long)&return_to_handler; @@ -606,10 +605,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) return; } - calltime = cpu_clock(raw_smp_processor_id()); - - if (ftrace_push_return_trace(old, calltime, - self_addr, &trace.depth) == -EBUSY) { + if (ftrace_push_return_trace(old, self_addr, &trace.depth) == -EBUSY) { *parent = old; return; } -- cgit 1.4.1