From 140fe3b1ab9c082182ef13359fab4ddba95c24c3 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 21 Jun 2011 10:35:55 +0800 Subject: jump_label: Fix jump_label update for modules The jump labels entries for modules do not stop at __stop__jump_table, but after mod->jump_entries + mod_num_jump_entries. By checking the wrong end point, module trace events never get enabled. Cc: Ingo Molnar Acked-by: Jason Baron Tested-by: Avi Kivity Tested-by: Johannes Berg Signed-off-by: Xiao Guangrong Link: http://lkml.kernel.org/r/4E00038B.2060404@cn.fujitsu.com Signed-off-by: Steven Rostedt --- kernel/jump_label.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/kernel/jump_label.c b/kernel/jump_label.c index fa27e750dbc0..a8ce45097f3d 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -375,15 +375,19 @@ int jump_label_text_reserved(void *start, void *end) static void jump_label_update(struct jump_label_key *key, int enable) { - struct jump_entry *entry = key->entries; - - /* if there are no users, entry can be NULL */ - if (entry) - __jump_label_update(key, entry, __stop___jump_table, enable); + struct jump_entry *entry = key->entries, *stop = __stop___jump_table; #ifdef CONFIG_MODULES + struct module *mod = __module_address((jump_label_t)key); + __jump_label_mod_update(key, enable); + + if (mod) + stop = mod->jump_entries + mod->num_jump_entries; #endif + /* if there are no users, entry can be NULL */ + if (entry) + __jump_label_update(key, entry, stop, enable); } #endif -- cgit 1.4.1 From e9dbfae53eeb9fc3d4bb7da3df87fa9875f5da02 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 Jul 2011 11:36:06 -0400 Subject: tracing: Fix bug when reading system filters on module removal The event system is freed when its nr_events is set to zero. This happens when a module created an event system and then later the module is removed. Modules may share systems, so the system is allocated when it is created and freed when the modules are unloaded and all the events under the system are removed (nr_events set to zero). The problem arises when a task opened the "filter" file for the system. If the module is unloaded and it removed the last event for that system, the system structure is freed. If the task that opened the filter file accesses the "filter" file after the system has been freed, the system will access an invalid pointer. By adding a ref_count, and using it to keep track of what is using the event system, we can free it after all users are finished with the event system. Cc: Reported-by: Johannes Berg Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 1 + kernel/trace/trace_events.c | 86 +++++++++++++++++++++++++++++++++----- kernel/trace/trace_events_filter.c | 6 +++ 3 files changed, 82 insertions(+), 11 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 229f8591f61d..f8074072d111 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -677,6 +677,7 @@ struct event_subsystem { struct dentry *entry; struct event_filter *filter; int nr_events; + int ref_count; }; #define FILTER_PRED_INVALID ((unsigned short)-1) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 686ec399f2a8..ffc5b2884af1 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -244,6 +244,35 @@ static void ftrace_clear_events(void) mutex_unlock(&event_mutex); } +static void __put_system(struct event_subsystem *system) +{ + struct event_filter *filter = system->filter; + + WARN_ON_ONCE(system->ref_count == 0); + if (--system->ref_count) + return; + + if (filter) { + kfree(filter->filter_string); + kfree(filter); + } + kfree(system->name); + kfree(system); +} + +static void __get_system(struct event_subsystem *system) +{ + WARN_ON_ONCE(system->ref_count == 0); + system->ref_count++; +} + +static void put_system(struct event_subsystem *system) +{ + mutex_lock(&event_mutex); + __put_system(system); + mutex_unlock(&event_mutex); +} + /* * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. */ @@ -826,6 +855,47 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } +static LIST_HEAD(event_subsystems); + +static int subsystem_open(struct inode *inode, struct file *filp) +{ + struct event_subsystem *system = NULL; + int ret; + + /* Make sure the system still exists */ + mutex_lock(&event_mutex); + list_for_each_entry(system, &event_subsystems, list) { + if (system == inode->i_private) { + /* Don't open systems with no events */ + if (!system->nr_events) { + system = NULL; + break; + } + __get_system(system); + break; + } + } + mutex_unlock(&event_mutex); + + if (system != inode->i_private) + return -ENODEV; + + ret = tracing_open_generic(inode, filp); + if (ret < 0) + put_system(system); + + return ret; +} + +static int subsystem_release(struct inode *inode, struct file *file) +{ + struct event_subsystem *system = inode->i_private; + + put_system(system); + + return 0; +} + static ssize_t subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -963,10 +1033,11 @@ static const struct file_operations ftrace_event_filter_fops = { }; static const struct file_operations ftrace_subsystem_filter_fops = { - .open = tracing_open_generic, + .open = subsystem_open, .read = subsystem_filter_read, .write = subsystem_filter_write, .llseek = default_llseek, + .release = subsystem_release, }; static const struct file_operations ftrace_system_enable_fops = { @@ -1002,8 +1073,6 @@ static struct dentry *event_trace_events_dir(void) return d_events; } -static LIST_HEAD(event_subsystems); - static struct dentry * event_subsystem_dir(const char *name, struct dentry *d_events) { @@ -1013,6 +1082,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) /* First see if we did not already create this dir */ list_for_each_entry(system, &event_subsystems, list) { if (strcmp(system->name, name) == 0) { + __get_system(system); system->nr_events++; return system->entry; } @@ -1035,6 +1105,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) } system->nr_events = 1; + system->ref_count = 1; system->name = kstrdup(name, GFP_KERNEL); if (!system->name) { debugfs_remove(system->entry); @@ -1184,16 +1255,9 @@ static void remove_subsystem_dir(const char *name) list_for_each_entry(system, &event_subsystems, list) { if (strcmp(system->name, name) == 0) { if (!--system->nr_events) { - struct event_filter *filter = system->filter; - debugfs_remove_recursive(system->entry); list_del(&system->list); - if (filter) { - kfree(filter->filter_string); - kfree(filter); - } - kfree(system->name); - kfree(system); + __put_system(system); } break; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 8008ddcfbf20..256764ecccd6 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1886,6 +1886,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system, mutex_lock(&event_mutex); + /* Make sure the system still has events */ + if (!system->nr_events) { + err = -ENODEV; + goto out_unlock; + } + if (!strcmp(strstrip(filter_string), "0")) { filter_free_subsystem_preds(system); remove_filter_string(system->filter); -- cgit 1.4.1 From 40ee4dffff061399eb9358e0c8fcfbaf8de4c8fe Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 Jul 2011 14:32:51 -0400 Subject: tracing: Have "enable" file use refcounts like the "filter" file The "enable" file for the event system can be removed when a module is unloaded and the event system only has events from that module. As the event system nr_events count goes to zero, it may be freed if its ref_count is also set to zero. Like the "filter" file, the "enable" file may be opened by a task and referenced later, after a module has been unloaded and the events for that event system have been removed. Although the "filter" file referenced the event system structure, the "enable" file only references a pointer to the event system name. Since the name is freed when the event system is removed, it is possible that an access to the "enable" file may reference a freed pointer. Update the "enable" file to use the subsystem_open() routine that the "filter" file uses, to keep a reference to the event system structure while the "enable" file is opened. Cc: Reported-by: Johannes Berg Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ffc5b2884af1..3e2a7c91c548 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -557,7 +557,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { const char set_to_char[4] = { '?', '0', '1', 'X' }; - const char *system = filp->private_data; + struct event_subsystem *system = filp->private_data; struct ftrace_event_call *call; char buf[2]; int set = 0; @@ -568,7 +568,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, if (!call->name || !call->class || !call->class->reg) continue; - if (system && strcmp(call->class->system, system) != 0) + if (system && strcmp(call->class->system, system->name) != 0) continue; /* @@ -598,7 +598,8 @@ static ssize_t system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - const char *system = filp->private_data; + struct event_subsystem *system = filp->private_data; + const char *name = NULL; unsigned long val; char buf[64]; ssize_t ret; @@ -622,7 +623,14 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (val != 0 && val != 1) return -EINVAL; - ret = __ftrace_set_clr_event(NULL, system, NULL, val); + /* + * Opening of "enable" adds a ref count to system, + * so the name is safe to use. + */ + if (system) + name = system->name; + + ret = __ftrace_set_clr_event(NULL, name, NULL, val); if (ret) goto out; @@ -862,6 +870,9 @@ static int subsystem_open(struct inode *inode, struct file *filp) struct event_subsystem *system = NULL; int ret; + if (!inode->i_private) + goto skip_search; + /* Make sure the system still exists */ mutex_lock(&event_mutex); list_for_each_entry(system, &event_subsystems, list) { @@ -880,8 +891,9 @@ static int subsystem_open(struct inode *inode, struct file *filp) if (system != inode->i_private) return -ENODEV; + skip_search: ret = tracing_open_generic(inode, filp); - if (ret < 0) + if (ret < 0 && system) put_system(system); return ret; @@ -891,7 +903,8 @@ static int subsystem_release(struct inode *inode, struct file *file) { struct event_subsystem *system = inode->i_private; - put_system(system); + if (system) + put_system(system); return 0; } @@ -1041,10 +1054,11 @@ static const struct file_operations ftrace_subsystem_filter_fops = { }; static const struct file_operations ftrace_system_enable_fops = { - .open = tracing_open_generic, + .open = subsystem_open, .read = system_enable_read, .write = system_enable_write, .llseek = default_llseek, + .release = subsystem_release, }; static const struct file_operations ftrace_show_header_fops = { @@ -1133,8 +1147,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) "'%s/filter' entry\n", name); } - trace_create_file("enable", 0644, system->entry, - (void *)system->name, + trace_create_file("enable", 0644, system->entry, system, &ftrace_system_enable_fops); return system->entry; -- cgit 1.4.1 From 43dd61c9a09bd413e837df829e6bfb42159be52a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 Jul 2011 11:09:22 -0400 Subject: ftrace: Fix regression of :mod:module function enabling The new code that allows different utilities to pick and choose what functions they trace broke the :mod: hook that allows users to trace only functions of a particular module. The reason is that the :mod: hook bypasses the hash that is setup to allow individual users to trace their own functions and uses the global hash directly. But if the global hash has not been set up, it will cause a bug: echo '*:mod:radeon' > /sys/kernel/debug/set_ftrace_filter produces: [drm:drm_mode_getfb] *ERROR* invalid framebuffer id [drm:radeon_crtc_page_flip] *ERROR* failed to reserve new rbo buffer before flip BUG: unable to handle kernel paging request at ffffffff8160ec90 IP: [] add_hash_entry+0x66/0xd0 PGD 1a05067 PUD 1a09063 PMD 80000000016001e1 Oops: 0003 [#1] SMP Jul 7 04:02:28 phyllis kernel: [55303.858604] CPU 1 Modules linked in: cryptd aes_x86_64 aes_generic binfmt_misc rfcomm bnep ip6table_filter hid radeon r8169 ahci libahci mii ttm drm_kms_helper drm video i2c_algo_bit intel_agp intel_gtt Pid: 10344, comm: bash Tainted: G WC 3.0.0-rc5 #1 Dell Inc. Inspiron N5010/0YXXJJ RIP: 0010:[] [] add_hash_entry+0x66/0xd0 RSP: 0018:ffff88003a96bda8 EFLAGS: 00010246 RAX: ffff8801301735c0 RBX: ffffffff8160ec80 RCX: 0000000000306ee0 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff880137c92940 RBP: ffff88003a96bdb8 R08: ffff880137c95680 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000000 R12: ffffffff81c9df78 R13: ffff8801153d1000 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f329c18a700(0000) GS:ffff880137c80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffff8160ec90 CR3: 000000003002b000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process bash (pid: 10344, threadinfo ffff88003a96a000, task ffff88012fcfc470) Stack: 0000000000000fd0 00000000000000fc ffff88003a96be38 ffffffff810d92f5 ffff88011c4c4e00 ffff880000000000 000000000b69f4d0 ffffffff8160ec80 ffff8800300e6f06 0000000081130295 0000000000000282 ffff8800300e6f00 Call Trace: [] match_records+0x155/0x1b0 [] ftrace_mod_callback+0xbc/0x100 [] ftrace_regex_write+0x16f/0x210 [] ftrace_filter_write+0xf/0x20 [] vfs_write+0xc8/0x190 [] sys_write+0x51/0x90 [] system_call_fastpath+0x16/0x1b Code: 48 8b 33 31 d2 48 85 f6 75 33 49 89 d4 4c 03 63 08 49 8b 14 24 48 85 d2 48 89 10 74 04 48 89 42 08 49 89 04 24 4c 89 60 08 31 d2 RIP [] add_hash_entry+0x66/0xd0 RSP CR2: ffffffff8160ec90 ---[ end trace a5d031828efdd88e ]--- Reported-by: Brian Marete Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 3 ++- kernel/trace/ftrace.c | 12 +++--------- kernel/trace/trace_functions.c | 3 ++- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 9d88e1cb5dbb..ed0eb5254d1c 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -123,7 +123,8 @@ stack_trace_sysctl(struct ctl_table *table, int write, struct ftrace_func_command { struct list_head list; char *name; - int (*func)(char *func, char *cmd, + int (*func)(struct ftrace_hash *hash, + char *func, char *cmd, char *params, int enable); }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 908038f57440..1c4c0b087e1d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2407,10 +2407,9 @@ ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod) */ static int -ftrace_mod_callback(char *func, char *cmd, char *param, int enable) +ftrace_mod_callback(struct ftrace_hash *hash, + char *func, char *cmd, char *param, int enable) { - struct ftrace_ops *ops = &global_ops; - struct ftrace_hash *hash; char *mod; int ret = -EINVAL; @@ -2430,11 +2429,6 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable) if (!strlen(mod)) return ret; - if (enable) - hash = ops->filter_hash; - else - hash = ops->notrace_hash; - ret = ftrace_match_module_records(hash, func, mod); if (!ret) ret = -EINVAL; @@ -2760,7 +2754,7 @@ static int ftrace_process_regex(struct ftrace_hash *hash, mutex_lock(&ftrace_cmd_mutex); list_for_each_entry(p, &ftrace_commands, list) { if (strcmp(p->name, command) == 0) { - ret = p->func(func, command, next, enable); + ret = p->func(hash, func, command, next, enable); goto out_unlock; } } diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 8d0e1cc4e974..c7b0c6a7db09 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -324,7 +324,8 @@ ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) } static int -ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) +ftrace_trace_onoff_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) { struct ftrace_probe_ops *ops; void *count = (void *)-1; -- cgit 1.4.1 From 04da85b86188f224cc9b391b5bdd92a3ba20ffcf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 11 Jul 2011 10:12:59 -0400 Subject: ftrace: Fix warning when CONFIG_FUNCTION_TRACER is not defined The struct ftrace_hash was declared within CONFIG_FUNCTION_TRACER but was referenced outside of it. Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ed0eb5254d1c..f0c0e8a47ae6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -19,6 +19,8 @@ #include +struct ftrace_hash; + #ifdef CONFIG_FUNCTION_TRACER extern int ftrace_enabled; @@ -29,8 +31,6 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip); -struct ftrace_hash; - enum { FTRACE_OPS_FL_ENABLED = 1 << 0, FTRACE_OPS_FL_GLOBAL = 1 << 1, -- cgit 1.4.1 From f7bc8b61f65726ff98f52e286b28e294499d7a08 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Jul 2011 23:02:27 -0400 Subject: ftrace: Fix regression where ftrace breaks when modules are loaded Enabling function tracer to trace all functions, then load a module and then disable function tracing will cause ftrace to fail. This can also happen by enabling function tracing on the command line: ftrace=function and during boot up, modules are loaded, then you disable function tracing with 'echo nop > current_tracer' you will trigger a bug in ftrace that will shut itself down. The reason is, the new ftrace code keeps ref counts of all ftrace_ops that are registered for tracing. When one or more ftrace_ops are registered, all the records that represent the functions that the ftrace_ops will trace have a ref count incremented. If this ref count is not zero, when the code modification runs, that function will be enabled for tracing. If the ref count is zero, that function will be disabled from tracing. To make sure the accounting was working, FTRACE_WARN_ON()s were added to updating of the ref counts. If the ref count hits its max (> 2^30 ftrace_ops added), or if the ref count goes below zero, a FTRACE_WARN_ON() is triggered which disables all modification of code. Since it is common for ftrace_ops to trace all functions in the kernel, instead of creating > 20,000 hash items for the ftrace_ops, the hash count is just set to zero, and it represents that the ftrace_ops is to trace all functions. This is where the issues arrise. If you enable function tracing to trace all functions, and then add a module, the modules function records do not get the ref count updated. When the function tracer is disabled, all function records ref counts are subtracted. Since the modules never had their ref counts incremented, they go below zero and the FTRACE_WARN_ON() is triggered. The solution to this is rather simple. When modules are loaded, and their functions are added to the the ftrace pool, look to see if any ftrace_ops are registered that trace all functions. And for those, update the ref count for the module function records. Reported-by: Thomas Gleixner Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1c4c0b087e1d..ef9271b69b4f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1744,10 +1744,36 @@ static cycle_t ftrace_update_time; static unsigned long ftrace_update_cnt; unsigned long ftrace_update_tot_cnt; +static int ops_traces_mod(struct ftrace_ops *ops) +{ + struct ftrace_hash *hash; + + hash = ops->filter_hash; + return !!(!hash || !hash->count); +} + static int ftrace_update_code(struct module *mod) { struct dyn_ftrace *p; cycle_t start, stop; + unsigned long ref = 0; + + /* + * When adding a module, we need to check if tracers are + * currently enabled and if they are set to trace all functions. + * If they are, we need to enable the module functions as well + * as update the reference counts for those function records. + */ + if (mod) { + struct ftrace_ops *ops; + + for (ops = ftrace_ops_list; + ops != &ftrace_list_end; ops = ops->next) { + if (ops->flags & FTRACE_OPS_FL_ENABLED && + ops_traces_mod(ops)) + ref++; + } + } start = ftrace_now(raw_smp_processor_id()); ftrace_update_cnt = 0; @@ -1760,7 +1786,7 @@ static int ftrace_update_code(struct module *mod) p = ftrace_new_addrs; ftrace_new_addrs = p->newlist; - p->flags = 0L; + p->flags = ref; /* * Do the initial record conversion from mcount jump @@ -1783,7 +1809,7 @@ static int ftrace_update_code(struct module *mod) * conversion puts the module to the correct state, thus * passing the ftrace_make_call check. */ - if (ftrace_start_up) { + if (ftrace_start_up && ref) { int failed = __ftrace_replace_code(p, 1); if (failed) { ftrace_bug(failed, p->ip); -- cgit 1.4.1