From 7440172974e85b1828bdd84ac6b23b5bcad9c5eb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Nov 2018 18:44:52 -0800 Subject: tracing: Replace synchronize_sched() and call_rcu_sched() Now that synchronize_rcu() waits for preempt-disable regions of code as well as RCU read-side critical sections, synchronize_sched() can be replaced by synchronize_rcu(). Similarly, call_rcu_sched() can be replaced by call_rcu(). This commit therefore makes these changes. Signed-off-by: Paul E. McKenney Cc: Ingo Molnar Cc: Acked-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 24 ++++++++++++------------ kernel/trace/ring_buffer.c | 12 ++++++------ kernel/trace/trace.c | 10 +++++----- kernel/trace/trace_events_filter.c | 4 ++-- kernel/trace/trace_kprobe.c | 2 +- 5 files changed, 26 insertions(+), 26 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f536f601bd46..5b4f73e4fd56 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -173,7 +173,7 @@ static void ftrace_sync(struct work_struct *work) { /* * This function is just a stub to implement a hard force - * of synchronize_sched(). This requires synchronizing + * of synchronize_rcu(). This requires synchronizing * tasks even in userspace and idle. * * Yes, function tracing is rude. @@ -934,7 +934,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, ftrace_profile_enabled = 0; /* * unregister_ftrace_profiler calls stop_machine - * so this acts like an synchronize_sched. + * so this acts like an synchronize_rcu. */ unregister_ftrace_profiler(); } @@ -1086,7 +1086,7 @@ struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr) /* * Some of the ops may be dynamically allocated, - * they are freed after a synchronize_sched(). + * they are freed after a synchronize_rcu(). */ preempt_disable_notrace(); @@ -1286,7 +1286,7 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) { if (!hash || hash == EMPTY_HASH) return; - call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); + call_rcu(&hash->rcu, __free_ftrace_hash_rcu); } void ftrace_free_filter(struct ftrace_ops *ops) @@ -1501,7 +1501,7 @@ static bool hash_contains_ip(unsigned long ip, * the ip is not in the ops->notrace_hash. * * This needs to be called with preemption disabled as - * the hashes are freed with call_rcu_sched(). + * the hashes are freed with call_rcu(). */ static int ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) @@ -4496,7 +4496,7 @@ unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, if (ftrace_enabled && !ftrace_hash_empty(hash)) ftrace_run_modify_code(&probe->ops, FTRACE_UPDATE_CALLS, &old_hash_ops); - synchronize_sched(); + synchronize_rcu(); hlist_for_each_entry_safe(entry, tmp, &hhd, hlist) { hlist_del(&entry->hlist); @@ -5314,7 +5314,7 @@ ftrace_graph_release(struct inode *inode, struct file *file) mutex_unlock(&graph_lock); /* Wait till all users are no longer using the old hash */ - synchronize_sched(); + synchronize_rcu(); free_ftrace_hash(old_hash); } @@ -5707,7 +5707,7 @@ void ftrace_release_mod(struct module *mod) list_for_each_entry_safe(mod_map, n, &ftrace_mod_maps, list) { if (mod_map->mod == mod) { list_del_rcu(&mod_map->list); - call_rcu_sched(&mod_map->rcu, ftrace_free_mod_map); + call_rcu(&mod_map->rcu, ftrace_free_mod_map); break; } } @@ -5927,7 +5927,7 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, struct ftrace_mod_map *mod_map; const char *ret = NULL; - /* mod_map is freed via call_rcu_sched() */ + /* mod_map is freed via call_rcu() */ preempt_disable(); list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) { ret = ftrace_func_address_lookup(mod_map, addr, size, off, sym); @@ -6262,7 +6262,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, /* * Some of the ops may be dynamically allocated, - * they must be freed after a synchronize_sched(). + * they must be freed after a synchronize_rcu(). */ preempt_disable_notrace(); @@ -6433,7 +6433,7 @@ static void clear_ftrace_pids(struct trace_array *tr) rcu_assign_pointer(tr->function_pids, NULL); /* Wait till all users are no longer using pid filtering */ - synchronize_sched(); + synchronize_rcu(); trace_free_pid_list(pid_list); } @@ -6580,7 +6580,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf, rcu_assign_pointer(tr->function_pids, pid_list); if (filtered_pids) { - synchronize_sched(); + synchronize_rcu(); trace_free_pid_list(filtered_pids); } else if (pid_list) { /* Register a probe to set whether to ignore the tracing of a task */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 65bd4616220d..4f3247a53259 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1834,7 +1834,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, * There could have been a race between checking * record_disable and incrementing it. */ - synchronize_sched(); + synchronize_rcu(); for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; rb_check_pages(cpu_buffer); @@ -3151,7 +3151,7 @@ static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) * This prevents all writes to the buffer. Any attempt to write * to the buffer after this will fail and return NULL. * - * The caller should call synchronize_sched() after this. + * The caller should call synchronize_rcu() after this. */ void ring_buffer_record_disable(struct ring_buffer *buffer) { @@ -3253,7 +3253,7 @@ bool ring_buffer_record_is_set_on(struct ring_buffer *buffer) * This prevents all writes to the buffer. Any attempt to write * to the buffer after this will fail and return NULL. * - * The caller should call synchronize_sched() after this. + * The caller should call synchronize_rcu() after this. */ void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu) { @@ -4191,7 +4191,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_prepare); void ring_buffer_read_prepare_sync(void) { - synchronize_sched(); + synchronize_rcu(); } EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync); @@ -4363,7 +4363,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) atomic_inc(&cpu_buffer->record_disabled); /* Make sure all commits have finished */ - synchronize_sched(); + synchronize_rcu(); raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); @@ -4496,7 +4496,7 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, goto out; /* - * We can't do a synchronize_sched here because this + * We can't do a synchronize_rcu here because this * function can be called in atomic context. * Normally this will be called from the same CPU as cpu. * If not it's up to the caller to protect this. diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ff1c4b20cd0a..51612b4a603f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1681,7 +1681,7 @@ void tracing_reset(struct trace_buffer *buf, int cpu) ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ - synchronize_sched(); + synchronize_rcu(); ring_buffer_reset_cpu(buffer, cpu); ring_buffer_record_enable(buffer); @@ -1698,7 +1698,7 @@ void tracing_reset_online_cpus(struct trace_buffer *buf) ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ - synchronize_sched(); + synchronize_rcu(); buf->time_start = buffer_ftrace_now(buf, buf->cpu); @@ -2250,7 +2250,7 @@ void trace_buffered_event_disable(void) preempt_enable(); /* Wait for all current users to finish */ - synchronize_sched(); + synchronize_rcu(); for_each_tracing_cpu(cpu) { free_page((unsigned long)per_cpu(trace_buffered_event, cpu)); @@ -5398,7 +5398,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) if (tr->current_trace->reset) tr->current_trace->reset(tr); - /* Current trace needs to be nop_trace before synchronize_sched */ + /* Current trace needs to be nop_trace before synchronize_rcu */ tr->current_trace = &nop_trace; #ifdef CONFIG_TRACER_MAX_TRACE @@ -5412,7 +5412,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) * The update_max_tr is called from interrupts disabled * so a synchronized_sched() is sufficient. */ - synchronize_sched(); + synchronize_rcu(); free_snapshot(tr); } #endif diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 84a65173b1e9..35f3aa55be85 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1614,7 +1614,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir, /* * The calls can still be using the old filters. - * Do a synchronize_sched() and to ensure all calls are + * Do a synchronize_rcu() and to ensure all calls are * done with them before we free them. */ tracepoint_synchronize_unregister(); @@ -1845,7 +1845,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, if (filter) { /* * No event actually uses the system filter - * we can free it without synchronize_sched(). + * we can free it without synchronize_rcu(). */ __free_filter(system->filter); system->filter = filter; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index fec67188c4d2..adc153ab51c0 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -333,7 +333,7 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) * event_call related objects, which will be accessed in * the kprobe_trace_func/kretprobe_trace_func. */ - synchronize_sched(); + synchronize_rcu(); kfree(link); /* Ignored if link == NULL */ } -- cgit v1.2.3 From c43ac4a5301986c015137bb89568979f9b3264ca Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 27 Nov 2018 09:36:37 -0500 Subject: tracing: Do not line wrap short line in function_graph_enter() Commit 588ca1786f2dd ("function_graph: Use new curr_ret_depth to manage depth instead of curr_ret_stack") removed a parameter from the call ftrace_push_return_trace() that made it so that the entire call was under 80 characters, but it did not remove the line break. There's no reason to break that line up, so make it a single line. Link: http://lkml.kernel.org/r/20181122100322.GN2131@hirez.programming.kicks-ass.net Reported-by: Peter Zijlstra Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_functions_graph.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 086af4f5c3e8..0d235e44d08e 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -188,8 +188,7 @@ int function_graph_enter(unsigned long ret, unsigned long func, trace.func = func; trace.depth = ++current->curr_ret_depth; - if (ftrace_push_return_trace(ret, func, - frame_pointer, retp)) + if (ftrace_push_return_trace(ret, func, frame_pointer, retp)) goto out; /* Only trace if the calling function expects to */ -- cgit v1.2.3 From d864a3ca883095aa12575b84841ebd52b3d808fa Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 12 Nov 2018 15:21:22 -0500 Subject: fgraph: Create a fgraph.c file to store function graph infrastructure As the function graph infrastructure can be used by thing other than tracing, moving the code to its own file out of the trace_functions_graph.c code makes more sense. The fgraph.c file will only contain the infrastructure required to hook into functions and their return code. Reviewed-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Makefile | 1 + kernel/trace/fgraph.c | 232 +++++++++++++++++++++++++++++++++++ kernel/trace/trace_functions_graph.c | 220 --------------------------------- 3 files changed, 233 insertions(+), 220 deletions(-) create mode 100644 kernel/trace/fgraph.c (limited to 'kernel/trace') diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index f81dadbc7c4a..c7ade7965464 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -57,6 +57,7 @@ obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o +obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += fgraph.o ifeq ($(CONFIG_BLOCK),y) obj-$(CONFIG_EVENT_TRACING) += blktrace.o endif diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c new file mode 100644 index 000000000000..5ad9c0e88b80 --- /dev/null +++ b/kernel/trace/fgraph.c @@ -0,0 +1,232 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Infrastructure to took into function calls and returns. + * Copyright (c) 2008-2009 Frederic Weisbecker + * Mostly borrowed from function tracer which + * is Copyright (c) Steven Rostedt + * + * Highly modified by Steven Rostedt (VMware). + */ +#include + +#include "trace.h" + +static bool kill_ftrace_graph; + +/** + * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called + * + * ftrace_graph_stop() is called when a severe error is detected in + * the function graph tracing. This function is called by the critical + * paths of function graph to keep those paths from doing any more harm. + */ +bool ftrace_graph_is_dead(void) +{ + return kill_ftrace_graph; +} + +/** + * ftrace_graph_stop - set to permanently disable function graph tracincg + * + * In case of an error int function graph tracing, this is called + * to try to keep function graph tracing from causing any more harm. + * Usually this is pretty severe and this is called to try to at least + * get a warning out to the user. + */ +void ftrace_graph_stop(void) +{ + kill_ftrace_graph = true; +} + +/* Add a function return address to the trace stack on thread info.*/ +static int +ftrace_push_return_trace(unsigned long ret, unsigned long func, + unsigned long frame_pointer, unsigned long *retp) +{ + unsigned long long calltime; + int index; + + if (unlikely(ftrace_graph_is_dead())) + return -EBUSY; + + if (!current->ret_stack) + return -EBUSY; + + /* + * We must make sure the ret_stack is tested before we read + * anything else. + */ + smp_rmb(); + + /* The return trace stack is full */ + if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { + atomic_inc(¤t->trace_overrun); + return -EBUSY; + } + + /* + * The curr_ret_stack is an index to ftrace return stack of + * current task. Its value should be in [0, FTRACE_RETFUNC_ + * DEPTH) when the function graph tracer is used. To support + * filtering out specific functions, it makes the index + * negative by subtracting huge value (FTRACE_NOTRACE_DEPTH) + * so when it sees a negative index the ftrace will ignore + * the record. And the index gets recovered when returning + * from the filtered function by adding the FTRACE_NOTRACE_ + * DEPTH and then it'll continue to record functions normally. + * + * The curr_ret_stack is initialized to -1 and get increased + * in this function. So it can be less than -1 only if it was + * filtered out via ftrace_graph_notrace_addr() which can be + * set from set_graph_notrace file in tracefs by user. + */ + if (current->curr_ret_stack < -1) + return -EBUSY; + + calltime = trace_clock_local(); + + index = ++current->curr_ret_stack; + if (ftrace_graph_notrace_addr(func)) + current->curr_ret_stack -= FTRACE_NOTRACE_DEPTH; + barrier(); + current->ret_stack[index].ret = ret; + current->ret_stack[index].func = func; + current->ret_stack[index].calltime = calltime; +#ifdef HAVE_FUNCTION_GRAPH_FP_TEST + current->ret_stack[index].fp = frame_pointer; +#endif +#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR + current->ret_stack[index].retp = retp; +#endif + return 0; +} + +int function_graph_enter(unsigned long ret, unsigned long func, + unsigned long frame_pointer, unsigned long *retp) +{ + struct ftrace_graph_ent trace; + + trace.func = func; + trace.depth = ++current->curr_ret_depth; + + if (ftrace_push_return_trace(ret, func, frame_pointer, retp)) + goto out; + + /* Only trace if the calling function expects to */ + if (!ftrace_graph_entry(&trace)) + goto out_ret; + + return 0; + out_ret: + current->curr_ret_stack--; + out: + current->curr_ret_depth--; + return -EBUSY; +} + +/* Retrieve a function return address to the trace stack on thread info.*/ +static void +ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, + unsigned long frame_pointer) +{ + int index; + + index = current->curr_ret_stack; + + /* + * A negative index here means that it's just returned from a + * notrace'd function. Recover index to get an original + * return address. See ftrace_push_return_trace(). + * + * TODO: Need to check whether the stack gets corrupted. + */ + if (index < 0) + index += FTRACE_NOTRACE_DEPTH; + + if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) { + ftrace_graph_stop(); + WARN_ON(1); + /* Might as well panic, otherwise we have no where to go */ + *ret = (unsigned long)panic; + return; + } + +#ifdef HAVE_FUNCTION_GRAPH_FP_TEST + /* + * The arch may choose to record the frame pointer used + * and check it here to make sure that it is what we expect it + * to be. If gcc does not set the place holder of the return + * address in the frame pointer, and does a copy instead, then + * the function graph trace will fail. This test detects this + * case. + * + * Currently, x86_32 with optimize for size (-Os) makes the latest + * gcc do the above. + * + * Note, -mfentry does not use frame pointers, and this test + * is not needed if CC_USING_FENTRY is set. + */ + if (unlikely(current->ret_stack[index].fp != frame_pointer)) { + ftrace_graph_stop(); + WARN(1, "Bad frame pointer: expected %lx, received %lx\n" + " from func %ps return to %lx\n", + current->ret_stack[index].fp, + frame_pointer, + (void *)current->ret_stack[index].func, + current->ret_stack[index].ret); + *ret = (unsigned long)panic; + return; + } +#endif + + *ret = current->ret_stack[index].ret; + trace->func = current->ret_stack[index].func; + trace->calltime = current->ret_stack[index].calltime; + trace->overrun = atomic_read(¤t->trace_overrun); + trace->depth = current->curr_ret_depth--; + /* + * We still want to trace interrupts coming in if + * max_depth is set to 1. Make sure the decrement is + * seen before ftrace_graph_return. + */ + barrier(); +} + +/* + * Send the trace to the ring-buffer. + * @return the original return address. + */ +unsigned long ftrace_return_to_handler(unsigned long frame_pointer) +{ + struct ftrace_graph_ret trace; + unsigned long ret; + + ftrace_pop_return_trace(&trace, &ret, frame_pointer); + trace.rettime = trace_clock_local(); + ftrace_graph_return(&trace); + /* + * The ftrace_graph_return() may still access the current + * ret_stack structure, we need to make sure the update of + * curr_ret_stack is after that. + */ + barrier(); + current->curr_ret_stack--; + /* + * The curr_ret_stack can be less than -1 only if it was + * filtered out and it's about to return from the function. + * Recover the index and continue to trace normal functions. + */ + if (current->curr_ret_stack < -1) { + current->curr_ret_stack += FTRACE_NOTRACE_DEPTH; + return ret; + } + + if (unlikely(!ret)) { + ftrace_graph_stop(); + WARN_ON(1); + /* Might as well panic. What else to do? */ + ret = (unsigned long)panic; + } + + return ret; +} diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 0d235e44d08e..b846d82c2f95 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -16,33 +16,6 @@ #include "trace.h" #include "trace_output.h" -static bool kill_ftrace_graph; - -/** - * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called - * - * ftrace_graph_stop() is called when a severe error is detected in - * the function graph tracing. This function is called by the critical - * paths of function graph to keep those paths from doing any more harm. - */ -bool ftrace_graph_is_dead(void) -{ - return kill_ftrace_graph; -} - -/** - * ftrace_graph_stop - set to permanently disable function graph tracincg - * - * In case of an error int function graph tracing, this is called - * to try to keep function graph tracing from causing any more harm. - * Usually this is pretty severe and this is called to try to at least - * get a warning out to the user. - */ -void ftrace_graph_stop(void) -{ - kill_ftrace_graph = true; -} - /* When set, irq functions will be ignored */ static int ftrace_graph_skip_irqs; @@ -117,199 +90,6 @@ static void print_graph_duration(struct trace_array *tr, unsigned long long duration, struct trace_seq *s, u32 flags); -/* Add a function return address to the trace stack on thread info.*/ -static int -ftrace_push_return_trace(unsigned long ret, unsigned long func, - unsigned long frame_pointer, unsigned long *retp) -{ - unsigned long long calltime; - int index; - - if (unlikely(ftrace_graph_is_dead())) - return -EBUSY; - - if (!current->ret_stack) - return -EBUSY; - - /* - * We must make sure the ret_stack is tested before we read - * anything else. - */ - smp_rmb(); - - /* The return trace stack is full */ - if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { - atomic_inc(¤t->trace_overrun); - return -EBUSY; - } - - /* - * The curr_ret_stack is an index to ftrace return stack of - * current task. Its value should be in [0, FTRACE_RETFUNC_ - * DEPTH) when the function graph tracer is used. To support - * filtering out specific functions, it makes the index - * negative by subtracting huge value (FTRACE_NOTRACE_DEPTH) - * so when it sees a negative index the ftrace will ignore - * the record. And the index gets recovered when returning - * from the filtered function by adding the FTRACE_NOTRACE_ - * DEPTH and then it'll continue to record functions normally. - * - * The curr_ret_stack is initialized to -1 and get increased - * in this function. So it can be less than -1 only if it was - * filtered out via ftrace_graph_notrace_addr() which can be - * set from set_graph_notrace file in tracefs by user. - */ - if (current->curr_ret_stack < -1) - return -EBUSY; - - calltime = trace_clock_local(); - - index = ++current->curr_ret_stack; - if (ftrace_graph_notrace_addr(func)) - current->curr_ret_stack -= FTRACE_NOTRACE_DEPTH; - barrier(); - current->ret_stack[index].ret = ret; - current->ret_stack[index].func = func; - current->ret_stack[index].calltime = calltime; -#ifdef HAVE_FUNCTION_GRAPH_FP_TEST - current->ret_stack[index].fp = frame_pointer; -#endif -#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR - current->ret_stack[index].retp = retp; -#endif - return 0; -} - -int function_graph_enter(unsigned long ret, unsigned long func, - unsigned long frame_pointer, unsigned long *retp) -{ - struct ftrace_graph_ent trace; - - trace.func = func; - trace.depth = ++current->curr_ret_depth; - - if (ftrace_push_return_trace(ret, func, frame_pointer, retp)) - goto out; - - /* Only trace if the calling function expects to */ - if (!ftrace_graph_entry(&trace)) - goto out_ret; - - return 0; - out_ret: - current->curr_ret_stack--; - out: - current->curr_ret_depth--; - return -EBUSY; -} - -/* Retrieve a function return address to the trace stack on thread info.*/ -static void -ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, - unsigned long frame_pointer) -{ - int index; - - index = current->curr_ret_stack; - - /* - * A negative index here means that it's just returned from a - * notrace'd function. Recover index to get an original - * return address. See ftrace_push_return_trace(). - * - * TODO: Need to check whether the stack gets corrupted. - */ - if (index < 0) - index += FTRACE_NOTRACE_DEPTH; - - if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) { - ftrace_graph_stop(); - WARN_ON(1); - /* Might as well panic, otherwise we have no where to go */ - *ret = (unsigned long)panic; - return; - } - -#ifdef HAVE_FUNCTION_GRAPH_FP_TEST - /* - * The arch may choose to record the frame pointer used - * and check it here to make sure that it is what we expect it - * to be. If gcc does not set the place holder of the return - * address in the frame pointer, and does a copy instead, then - * the function graph trace will fail. This test detects this - * case. - * - * Currently, x86_32 with optimize for size (-Os) makes the latest - * gcc do the above. - * - * Note, -mfentry does not use frame pointers, and this test - * is not needed if CC_USING_FENTRY is set. - */ - if (unlikely(current->ret_stack[index].fp != frame_pointer)) { - ftrace_graph_stop(); - WARN(1, "Bad frame pointer: expected %lx, received %lx\n" - " from func %ps return to %lx\n", - current->ret_stack[index].fp, - frame_pointer, - (void *)current->ret_stack[index].func, - current->ret_stack[index].ret); - *ret = (unsigned long)panic; - return; - } -#endif - - *ret = current->ret_stack[index].ret; - trace->func = current->ret_stack[index].func; - trace->calltime = current->ret_stack[index].calltime; - trace->overrun = atomic_read(¤t->trace_overrun); - trace->depth = current->curr_ret_depth--; - /* - * We still want to trace interrupts coming in if - * max_depth is set to 1. Make sure the decrement is - * seen before ftrace_graph_return. - */ - barrier(); -} - -/* - * Send the trace to the ring-buffer. - * @return the original return address. - */ -unsigned long ftrace_return_to_handler(unsigned long frame_pointer) -{ - struct ftrace_graph_ret trace; - unsigned long ret; - - ftrace_pop_return_trace(&trace, &ret, frame_pointer); - trace.rettime = trace_clock_local(); - ftrace_graph_return(&trace); - /* - * The ftrace_graph_return() may still access the current - * ret_stack structure, we need to make sure the update of - * curr_ret_stack is after that. - */ - barrier(); - current->curr_ret_stack--; - /* - * The curr_ret_stack can be less than -1 only if it was - * filtered out and it's about to return from the function. - * Recover the index and continue to trace normal functions. - */ - if (current->curr_ret_stack < -1) { - current->curr_ret_stack += FTRACE_NOTRACE_DEPTH; - return ret; - } - - if (unlikely(!ret)) { - ftrace_graph_stop(); - WARN_ON(1); - /* Might as well panic. What else to do? */ - ret = (unsigned long)panic; - } - - return ret; -} - /** * ftrace_graph_ret_addr - convert a potentially modified stack return address * to its original value -- cgit v1.2.3 From 9cd2992f2d6c8df54c5b937d5d1f8a23b684cc1d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 14 Nov 2018 13:14:58 -0500 Subject: fgraph: Have set_graph_notrace only affect function_graph tracer In order to make the function graph infrastructure more generic, there can not be code specific for the function_graph tracer in the generic code. This includes the set_graph_notrace logic, that stops all graph calls when a function in the set_graph_notrace is hit. By using the trace_recursion mask, we can use a bit in the current task_struct to implement the notrace code, and move the logic out of fgraph.c and into trace_functions_graph.c and keeps it affecting only the tracer and not all call graph callbacks. Acked-by: Namhyung Kim Reviewed-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/fgraph.c | 21 --------------------- kernel/trace/trace.h | 7 +++++++ kernel/trace/trace_functions_graph.c | 22 ++++++++++++++++++++++ 3 files changed, 29 insertions(+), 21 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 5ad9c0e88b80..e852b69c0e64 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -64,30 +64,9 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, return -EBUSY; } - /* - * The curr_ret_stack is an index to ftrace return stack of - * current task. Its value should be in [0, FTRACE_RETFUNC_ - * DEPTH) when the function graph tracer is used. To support - * filtering out specific functions, it makes the index - * negative by subtracting huge value (FTRACE_NOTRACE_DEPTH) - * so when it sees a negative index the ftrace will ignore - * the record. And the index gets recovered when returning - * from the filtered function by adding the FTRACE_NOTRACE_ - * DEPTH and then it'll continue to record functions normally. - * - * The curr_ret_stack is initialized to -1 and get increased - * in this function. So it can be less than -1 only if it was - * filtered out via ftrace_graph_notrace_addr() which can be - * set from set_graph_notrace file in tracefs by user. - */ - if (current->curr_ret_stack < -1) - return -EBUSY; - calltime = trace_clock_local(); index = ++current->curr_ret_stack; - if (ftrace_graph_notrace_addr(func)) - current->curr_ret_stack -= FTRACE_NOTRACE_DEPTH; barrier(); current->ret_stack[index].ret = ret; current->ret_stack[index].func = func; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 447bd96ee658..f67060a75f38 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -534,6 +534,13 @@ enum { TRACE_GRAPH_DEPTH_START_BIT, TRACE_GRAPH_DEPTH_END_BIT, + + /* + * To implement set_graph_notrace, if this bit is set, we ignore + * function graph tracing of called functions, until the return + * function is called to clear it. + */ + TRACE_GRAPH_NOTRACE_BIT, }; #define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b846d82c2f95..ecf543df943b 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -188,6 +188,18 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) int cpu; int pc; + if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) + return 0; + + if (ftrace_graph_notrace_addr(trace->func)) { + trace_recursion_set(TRACE_GRAPH_NOTRACE_BIT); + /* + * Need to return 1 to have the return called + * that will clear the NOTRACE bit. + */ + return 1; + } + if (!ftrace_trace_task(tr)) return 0; @@ -290,6 +302,11 @@ void trace_graph_return(struct ftrace_graph_ret *trace) ftrace_graph_addr_finish(trace); + if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) { + trace_recursion_clear(TRACE_GRAPH_NOTRACE_BIT); + return; + } + local_irq_save(flags); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->trace_buffer.data, cpu); @@ -315,6 +332,11 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace) { ftrace_graph_addr_finish(trace); + if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) { + trace_recursion_clear(TRACE_GRAPH_NOTRACE_BIT); + return; + } + if (tracing_thresh && (trace->rettime - trace->calltime < tracing_thresh)) return; -- cgit v1.2.3 From db6638d7d177a8bc74c9e539e2e0d7d061c767b1 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Wed, 5 Dec 2018 12:10:35 -0500 Subject: blkcg: remove bio->bi_css and instead use bio->bi_blkg Prior patches ensured that any bio that interacts with a request_queue is properly associated with a blkg. This makes bio->bi_css unnecessary as blkg maintains a reference to blkcg already. This removes the bio field bi_css and transfers corresponding uses to access via bi_blkg. Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2868d85f1fb1..fac0ddf8a8e2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -764,9 +764,9 @@ blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) return NULL; - if (!bio->bi_css) + if (!bio->bi_blkg) return NULL; - return cgroup_get_kernfs_id(bio->bi_css->cgroup); + return cgroup_get_kernfs_id(bio_blkcg(bio)->css.cgroup); } #else static union kernfs_node_id * -- cgit v1.2.3 From 761efe8a94cfcd0a3dd90f2008411550f3520b63 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 18 Nov 2018 18:44:04 -0500 Subject: function_graph: Remove the use of FTRACE_NOTRACE_DEPTH The curr_ret_stack is no longer set to a negative value when a function is not to be traced by the function graph tracer. Remove the usage of FTRACE_NOTRACE_DEPTH, as it is no longer needed. Reviewed-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/fgraph.c | 19 ------------------- kernel/trace/trace_functions_graph.c | 11 ----------- 2 files changed, 30 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index e852b69c0e64..de887a983ac7 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -112,16 +112,6 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, index = current->curr_ret_stack; - /* - * A negative index here means that it's just returned from a - * notrace'd function. Recover index to get an original - * return address. See ftrace_push_return_trace(). - * - * TODO: Need to check whether the stack gets corrupted. - */ - if (index < 0) - index += FTRACE_NOTRACE_DEPTH; - if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) { ftrace_graph_stop(); WARN_ON(1); @@ -190,15 +180,6 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) */ barrier(); current->curr_ret_stack--; - /* - * The curr_ret_stack can be less than -1 only if it was - * filtered out and it's about to return from the function. - * Recover the index and continue to trace normal functions. - */ - if (current->curr_ret_stack < -1) { - current->curr_ret_stack += FTRACE_NOTRACE_DEPTH; - return ret; - } if (unlikely(!ret)) { ftrace_graph_stop(); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index ecf543df943b..eaf9b1629956 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -115,9 +115,6 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, if (ret != (unsigned long)return_to_handler) return ret; - if (index < -1) - index += FTRACE_NOTRACE_DEPTH; - if (index < 0) return ret; @@ -675,10 +672,6 @@ print_graph_entry_leaf(struct trace_iterator *iter, cpu_data = per_cpu_ptr(data->cpu_data, cpu); - /* If a graph tracer ignored set_graph_notrace */ - if (call->depth < -1) - call->depth += FTRACE_NOTRACE_DEPTH; - /* * Comments display at + 1 to depth. Since * this is a leaf function, keep the comments @@ -721,10 +714,6 @@ print_graph_entry_nested(struct trace_iterator *iter, struct fgraph_cpu_data *cpu_data; int cpu = iter->cpu; - /* If a graph tracer ignored set_graph_notrace */ - if (call->depth < -1) - call->depth += FTRACE_NOTRACE_DEPTH; - cpu_data = per_cpu_ptr(data->cpu_data, cpu); cpu_data->depth = call->depth; -- cgit v1.2.3 From 3306fc4aff464f9c08c8899695a218f4b1125d4a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 15 Nov 2018 12:32:38 -0500 Subject: ftrace: Create new ftrace_internal.h header In order to move function graph infrastructure into its own file (fgraph.h) it needs to access various functions and variables in ftrace.c that are currently static. Create a new file called ftrace-internal.h that holds the function prototypes and the extern declarations of the variables needed by fgraph.c as well, and make them global in ftrace.c such that they can be used outside that file. Reviewed-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 76 ++++++++---------------------------------- kernel/trace/ftrace_internal.h | 75 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+), 62 deletions(-) create mode 100644 kernel/trace/ftrace_internal.h (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 77734451cb05..52c89428b0db 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -40,6 +40,7 @@ #include #include +#include "ftrace_internal.h" #include "trace_output.h" #include "trace_stat.h" @@ -77,7 +78,7 @@ #define ASSIGN_OPS_HASH(opsname, val) #endif -static struct ftrace_ops ftrace_list_end __read_mostly = { +struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, INIT_OPS_HASH(ftrace_list_end) @@ -112,11 +113,11 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops); */ static int ftrace_disabled __read_mostly; -static DEFINE_MUTEX(ftrace_lock); +DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end; +struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; -static struct ftrace_ops global_ops; +struct ftrace_ops global_ops; #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, @@ -127,26 +128,6 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); #define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) #endif -/* - * Traverse the ftrace_global_list, invoking all entries. The reason that we - * can use rcu_dereference_raw_notrace() is that elements removed from this list - * are simply leaked, so there is no need to interact with a grace-period - * mechanism. The rcu_dereference_raw_notrace() calls are needed to handle - * concurrent insertions into the ftrace_global_list. - * - * Silly Alpha and silly pointer-speculation compiler optimizations! - */ -#define do_for_each_ftrace_op(op, list) \ - op = rcu_dereference_raw_notrace(list); \ - do - -/* - * Optimized for just a single item in the list (as that is the normal case). - */ -#define while_for_each_ftrace_op(op) \ - while (likely(op = rcu_dereference_raw_notrace((op)->next)) && \ - unlikely((op) != &ftrace_list_end)) - static inline void ftrace_ops_init(struct ftrace_ops *ops) { #ifdef CONFIG_DYNAMIC_FTRACE @@ -187,17 +168,11 @@ static void ftrace_sync_ipi(void *data) } #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static void update_function_graph_func(void); - /* Both enabled by default (can be cleared by function_graph tracer flags */ static bool fgraph_sleep_time = true; static bool fgraph_graph_time = true; - -#else -static inline void update_function_graph_func(void) { } #endif - static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops) { /* @@ -334,7 +309,7 @@ static int remove_ftrace_ops(struct ftrace_ops __rcu **list, static void ftrace_update_trampoline(struct ftrace_ops *ops); -static int __register_ftrace_function(struct ftrace_ops *ops) +int __register_ftrace_function(struct ftrace_ops *ops) { if (ops->flags & FTRACE_OPS_FL_DELETED) return -EINVAL; @@ -375,7 +350,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) return 0; } -static int __unregister_ftrace_function(struct ftrace_ops *ops) +int __unregister_ftrace_function(struct ftrace_ops *ops) { int ret; @@ -1022,9 +997,7 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) #endif /* CONFIG_FUNCTION_PROFILER */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int ftrace_graph_active; -#else -# define ftrace_graph_active 0 +int ftrace_graph_active; #endif #ifdef CONFIG_DYNAMIC_FTRACE @@ -1067,7 +1040,7 @@ static const struct ftrace_hash empty_hash = { }; #define EMPTY_HASH ((struct ftrace_hash *)&empty_hash) -static struct ftrace_ops global_ops = { +struct ftrace_ops global_ops = { .func = ftrace_stub, .local_hash.notrace_hash = EMPTY_HASH, .local_hash.filter_hash = EMPTY_HASH, @@ -1503,7 +1476,7 @@ static bool hash_contains_ip(unsigned long ip, * This needs to be called with preemption disabled as * the hashes are freed with call_rcu_sched(). */ -static int +int ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) { struct ftrace_ops_hash hash; @@ -2682,7 +2655,7 @@ static void ftrace_startup_all(int command) update_all_ops = false; } -static int ftrace_startup(struct ftrace_ops *ops, int command) +int ftrace_startup(struct ftrace_ops *ops, int command) { int ret; @@ -2724,7 +2697,7 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) return 0; } -static int ftrace_shutdown(struct ftrace_ops *ops, int command) +int ftrace_shutdown(struct ftrace_ops *ops, int command) { int ret; @@ -6177,7 +6150,7 @@ void ftrace_init_trace_array(struct trace_array *tr) } #else -static struct ftrace_ops global_ops = { +struct ftrace_ops global_ops = { .func = ftrace_stub, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED | @@ -6194,31 +6167,10 @@ core_initcall(ftrace_nodyn_init); static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; } static inline void ftrace_startup_enable(int command) { } static inline void ftrace_startup_all(int command) { } -/* Keep as macros so we do not need to define the commands */ -# define ftrace_startup(ops, command) \ - ({ \ - int ___ret = __register_ftrace_function(ops); \ - if (!___ret) \ - (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ - ___ret; \ - }) -# define ftrace_shutdown(ops, command) \ - ({ \ - int ___ret = __unregister_ftrace_function(ops); \ - if (!___ret) \ - (ops)->flags &= ~FTRACE_OPS_FL_ENABLED; \ - ___ret; \ - }) # define ftrace_startup_sysctl() do { } while (0) # define ftrace_shutdown_sysctl() do { } while (0) -static inline int -ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) -{ - return 1; -} - static void ftrace_update_trampoline(struct ftrace_ops *ops) { } @@ -6930,7 +6882,7 @@ static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) * function against the global ops, and not just trace any function * that any ftrace_ops registered. */ -static void update_function_graph_func(void) +void update_function_graph_func(void) { struct ftrace_ops *op; bool do_test = false; diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h new file mode 100644 index 000000000000..0515a2096f90 --- /dev/null +++ b/kernel/trace/ftrace_internal.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_KERNEL_FTRACE_INTERNAL_H +#define _LINUX_KERNEL_FTRACE_INTERNAL_H + +#ifdef CONFIG_FUNCTION_TRACER + +/* + * Traverse the ftrace_global_list, invoking all entries. The reason that we + * can use rcu_dereference_raw_notrace() is that elements removed from this list + * are simply leaked, so there is no need to interact with a grace-period + * mechanism. The rcu_dereference_raw_notrace() calls are needed to handle + * concurrent insertions into the ftrace_global_list. + * + * Silly Alpha and silly pointer-speculation compiler optimizations! + */ +#define do_for_each_ftrace_op(op, list) \ + op = rcu_dereference_raw_notrace(list); \ + do + +/* + * Optimized for just a single item in the list (as that is the normal case). + */ +#define while_for_each_ftrace_op(op) \ + while (likely(op = rcu_dereference_raw_notrace((op)->next)) && \ + unlikely((op) != &ftrace_list_end)) + +extern struct ftrace_ops __rcu *ftrace_ops_list; +extern struct ftrace_ops ftrace_list_end; +extern struct mutex ftrace_lock; +extern struct ftrace_ops global_ops; + +#ifdef CONFIG_DYNAMIC_FTRACE + +int ftrace_startup(struct ftrace_ops *ops, int command); +int ftrace_shutdown(struct ftrace_ops *ops, int command); +int ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs); + +#else /* !CONFIG_DYNAMIC_FTRACE */ + +int __register_ftrace_function(struct ftrace_ops *ops); +int __unregister_ftrace_function(struct ftrace_ops *ops); +/* Keep as macros so we do not need to define the commands */ +# define ftrace_startup(ops, command) \ + ({ \ + int ___ret = __register_ftrace_function(ops); \ + if (!___ret) \ + (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ + ___ret; \ + }) +# define ftrace_shutdown(ops, command) \ + ({ \ + int ___ret = __unregister_ftrace_function(ops); \ + if (!___ret) \ + (ops)->flags &= ~FTRACE_OPS_FL_ENABLED; \ + ___ret; \ + }) +static inline int +ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) +{ + return 1; +} +#endif /* CONFIG_DYNAMIC_FTRACE */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +extern int ftrace_graph_active; +void update_function_graph_func(void); +#else /* !CONFIG_FUNCTION_GRAPH_TRACER */ +# define ftrace_graph_active 0 +static inline void update_function_graph_func(void) { } +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +#else /* !CONFIG_FUNCTION_TRACER */ +#endif /* CONFIG_FUNCTION_TRACER */ + +#endif -- cgit v1.2.3 From c8dd0f45874547e6e77bab03d71feb16c4cb98a8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 23 Nov 2018 13:06:07 -0500 Subject: function_graph: Do not expose the graph_time option when profiler is not configured When the function profiler is not configured, the "graph_time" option is meaningless, as the function profiler is the only thing that makes use of it. Do not expose it if the profiler is not configured. Link: http://lkml.kernel.org/r/20181123061133.GA195223@google.com Reported-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 5 +++++ kernel/trace/trace_functions_graph.c | 4 ++++ 2 files changed, 9 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f67060a75f38..ab16eca76e59 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -862,7 +862,12 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) extern void ftrace_graph_sleep_time_control(bool enable); + +#ifdef CONFIG_FUNCTION_PROFILER extern void ftrace_graph_graph_time_control(bool enable); +#else +static inline void ftrace_graph_graph_time_control(bool enable) { } +#endif extern enum print_line_t print_graph_function_flags(struct trace_iterator *iter, u32 flags); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index eaf9b1629956..855c13c61e77 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -60,8 +60,12 @@ static struct tracer_opt trace_opts[] = { { TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) }, /* Include sleep time (scheduled out) between entry and return */ { TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) }, + +#ifdef CONFIG_FUNCTION_PROFILER /* Include time within nested functions */ { TRACER_OPT(graph-time, TRACE_GRAPH_GRAPH_TIME) }, +#endif + { } /* Empty entry */ }; -- cgit v1.2.3 From e73e679f656e678b0e7f8961094201f3544f4541 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 15 Nov 2018 12:35:13 -0500 Subject: fgraph: Move function graph specific code into fgraph.c To make the function graph infrastructure more managable, the code needs to be in its own file (fgraph.c). Move the code that is specific for managing the function graph infrastructure out of ftrace.c and into fgraph.c Reviewed-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/fgraph.c | 360 +++++++++++++++++++++++++++++++++++++++++++++++- kernel/trace/ftrace.c | 368 +------------------------------------------------- 2 files changed, 366 insertions(+), 362 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index de887a983ac7..374f3e42e29e 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -7,11 +7,27 @@ * * Highly modified by Steven Rostedt (VMware). */ +#include #include +#include -#include "trace.h" +#include + +#include "ftrace_internal.h" + +#ifdef CONFIG_DYNAMIC_FTRACE +#define ASSIGN_OPS_HASH(opsname, val) \ + .func_hash = val, \ + .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), +#else +#define ASSIGN_OPS_HASH(opsname, val) +#endif static bool kill_ftrace_graph; +int ftrace_graph_active; + +/* Both enabled by default (can be cleared by function_graph tracer flags */ +static bool fgraph_sleep_time = true; /** * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called @@ -161,6 +177,31 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, barrier(); } +/* + * Hibernation protection. + * The state of the current task is too much unstable during + * suspend/restore to disk. We want to protect against that. + */ +static int +ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state, + void *unused) +{ + switch (state) { + case PM_HIBERNATION_PREPARE: + pause_graph_tracing(); + break; + + case PM_POST_HIBERNATION: + unpause_graph_tracing(); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block ftrace_suspend_notifier = { + .notifier_call = ftrace_suspend_notifier_call, +}; + /* * Send the trace to the ring-buffer. * @return the original return address. @@ -190,3 +231,320 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) return ret; } + +static struct ftrace_ops graph_ops = { + .func = ftrace_stub, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | + FTRACE_OPS_FL_INITIALIZED | + FTRACE_OPS_FL_PID | + FTRACE_OPS_FL_STUB, +#ifdef FTRACE_GRAPH_TRAMP_ADDR + .trampoline = FTRACE_GRAPH_TRAMP_ADDR, + /* trampoline_size is only needed for dynamically allocated tramps */ +#endif + ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) +}; + +void ftrace_graph_sleep_time_control(bool enable) +{ + fgraph_sleep_time = enable; +} + +int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) +{ + return 0; +} + +/* The callbacks that hook a function */ +trace_func_graph_ret_t ftrace_graph_return = + (trace_func_graph_ret_t)ftrace_stub; +trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; +static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub; + +/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ +static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) +{ + int i; + int ret = 0; + int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE; + struct task_struct *g, *t; + + for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) { + ret_stack_list[i] = + kmalloc_array(FTRACE_RETFUNC_DEPTH, + sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!ret_stack_list[i]) { + start = 0; + end = i; + ret = -ENOMEM; + goto free; + } + } + + read_lock(&tasklist_lock); + do_each_thread(g, t) { + if (start == end) { + ret = -EAGAIN; + goto unlock; + } + + if (t->ret_stack == NULL) { + atomic_set(&t->tracing_graph_pause, 0); + atomic_set(&t->trace_overrun, 0); + t->curr_ret_stack = -1; + t->curr_ret_depth = -1; + /* Make sure the tasks see the -1 first: */ + smp_wmb(); + t->ret_stack = ret_stack_list[start++]; + } + } while_each_thread(g, t); + +unlock: + read_unlock(&tasklist_lock); +free: + for (i = start; i < end; i++) + kfree(ret_stack_list[i]); + return ret; +} + +static void +ftrace_graph_probe_sched_switch(void *ignore, bool preempt, + struct task_struct *prev, struct task_struct *next) +{ + unsigned long long timestamp; + int index; + + /* + * Does the user want to count the time a function was asleep. + * If so, do not update the time stamps. + */ + if (fgraph_sleep_time) + return; + + timestamp = trace_clock_local(); + + prev->ftrace_timestamp = timestamp; + + /* only process tasks that we timestamped */ + if (!next->ftrace_timestamp) + return; + + /* + * Update all the counters in next to make up for the + * time next was sleeping. + */ + timestamp -= next->ftrace_timestamp; + + for (index = next->curr_ret_stack; index >= 0; index--) + next->ret_stack[index].calltime += timestamp; +} + +static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) +{ + if (!ftrace_ops_test(&global_ops, trace->func, NULL)) + return 0; + return __ftrace_graph_entry(trace); +} + +/* + * The function graph tracer should only trace the functions defined + * by set_ftrace_filter and set_ftrace_notrace. If another function + * tracer ops is registered, the graph tracer requires testing the + * function against the global ops, and not just trace any function + * that any ftrace_ops registered. + */ +void update_function_graph_func(void) +{ + struct ftrace_ops *op; + bool do_test = false; + + /* + * The graph and global ops share the same set of functions + * to test. If any other ops is on the list, then + * the graph tracing needs to test if its the function + * it should call. + */ + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (op != &global_ops && op != &graph_ops && + op != &ftrace_list_end) { + do_test = true; + /* in double loop, break out with goto */ + goto out; + } + } while_for_each_ftrace_op(op); + out: + if (do_test) + ftrace_graph_entry = ftrace_graph_entry_test; + else + ftrace_graph_entry = __ftrace_graph_entry; +} + +static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack); + +static void +graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) +{ + atomic_set(&t->tracing_graph_pause, 0); + atomic_set(&t->trace_overrun, 0); + t->ftrace_timestamp = 0; + /* make curr_ret_stack visible before we add the ret_stack */ + smp_wmb(); + t->ret_stack = ret_stack; +} + +/* + * Allocate a return stack for the idle task. May be the first + * time through, or it may be done by CPU hotplug online. + */ +void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) +{ + t->curr_ret_stack = -1; + t->curr_ret_depth = -1; + /* + * The idle task has no parent, it either has its own + * stack or no stack at all. + */ + if (t->ret_stack) + WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu)); + + if (ftrace_graph_active) { + struct ftrace_ret_stack *ret_stack; + + ret_stack = per_cpu(idle_ret_stack, cpu); + if (!ret_stack) { + ret_stack = + kmalloc_array(FTRACE_RETFUNC_DEPTH, + sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!ret_stack) + return; + per_cpu(idle_ret_stack, cpu) = ret_stack; + } + graph_init_task(t, ret_stack); + } +} + +/* Allocate a return stack for newly created task */ +void ftrace_graph_init_task(struct task_struct *t) +{ + /* Make sure we do not use the parent ret_stack */ + t->ret_stack = NULL; + t->curr_ret_stack = -1; + t->curr_ret_depth = -1; + + if (ftrace_graph_active) { + struct ftrace_ret_stack *ret_stack; + + ret_stack = kmalloc_array(FTRACE_RETFUNC_DEPTH, + sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!ret_stack) + return; + graph_init_task(t, ret_stack); + } +} + +void ftrace_graph_exit_task(struct task_struct *t) +{ + struct ftrace_ret_stack *ret_stack = t->ret_stack; + + t->ret_stack = NULL; + /* NULL must become visible to IRQs before we free it: */ + barrier(); + + kfree(ret_stack); +} + +/* Allocate a return stack for each task */ +static int start_graph_tracing(void) +{ + struct ftrace_ret_stack **ret_stack_list; + int ret, cpu; + + ret_stack_list = kmalloc_array(FTRACE_RETSTACK_ALLOC_SIZE, + sizeof(struct ftrace_ret_stack *), + GFP_KERNEL); + + if (!ret_stack_list) + return -ENOMEM; + + /* The cpu_boot init_task->ret_stack will never be freed */ + for_each_online_cpu(cpu) { + if (!idle_task(cpu)->ret_stack) + ftrace_graph_init_idle_task(idle_task(cpu), cpu); + } + + do { + ret = alloc_retstack_tasklist(ret_stack_list); + } while (ret == -EAGAIN); + + if (!ret) { + ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); + if (ret) + pr_info("ftrace_graph: Couldn't activate tracepoint" + " probe to kernel_sched_switch\n"); + } + + kfree(ret_stack_list); + return ret; +} + +int register_ftrace_graph(trace_func_graph_ret_t retfunc, + trace_func_graph_ent_t entryfunc) +{ + int ret = 0; + + mutex_lock(&ftrace_lock); + + /* we currently allow only one tracer registered at a time */ + if (ftrace_graph_active) { + ret = -EBUSY; + goto out; + } + + register_pm_notifier(&ftrace_suspend_notifier); + + ftrace_graph_active++; + ret = start_graph_tracing(); + if (ret) { + ftrace_graph_active--; + goto out; + } + + ftrace_graph_return = retfunc; + + /* + * Update the indirect function to the entryfunc, and the + * function that gets called to the entry_test first. Then + * call the update fgraph entry function to determine if + * the entryfunc should be called directly or not. + */ + __ftrace_graph_entry = entryfunc; + ftrace_graph_entry = ftrace_graph_entry_test; + update_function_graph_func(); + + ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET); +out: + mutex_unlock(&ftrace_lock); + return ret; +} + +void unregister_ftrace_graph(void) +{ + mutex_lock(&ftrace_lock); + + if (unlikely(!ftrace_graph_active)) + goto out; + + ftrace_graph_active--; + ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; + ftrace_graph_entry = ftrace_graph_entry_stub; + __ftrace_graph_entry = ftrace_graph_entry_stub; + ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET); + unregister_pm_notifier(&ftrace_suspend_notifier); + unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); + + out: + mutex_unlock(&ftrace_lock); +} diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 52c89428b0db..c53533b833cf 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -167,12 +166,6 @@ static void ftrace_sync_ipi(void *data) smp_rmb(); } -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -/* Both enabled by default (can be cleared by function_graph tracer flags */ -static bool fgraph_sleep_time = true; -static bool fgraph_graph_time = true; -#endif - static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops) { /* @@ -790,6 +783,13 @@ function_profile_call(unsigned long ip, unsigned long parent_ip, } #ifdef CONFIG_FUNCTION_GRAPH_TRACER +static bool fgraph_graph_time = true; + +void ftrace_graph_graph_time_control(bool enable) +{ + fgraph_graph_time = enable; +} + static int profile_graph_entry(struct ftrace_graph_ent *trace) { int index = current->curr_ret_stack; @@ -996,10 +996,6 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) } #endif /* CONFIG_FUNCTION_PROFILER */ -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -int ftrace_graph_active; -#endif - #ifdef CONFIG_DYNAMIC_FTRACE static struct ftrace_ops *removed_ops; @@ -6697,353 +6693,3 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, mutex_unlock(&ftrace_lock); return ret; } - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - -static struct ftrace_ops graph_ops = { - .func = ftrace_stub, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | - FTRACE_OPS_FL_INITIALIZED | - FTRACE_OPS_FL_PID | - FTRACE_OPS_FL_STUB, -#ifdef FTRACE_GRAPH_TRAMP_ADDR - .trampoline = FTRACE_GRAPH_TRAMP_ADDR, - /* trampoline_size is only needed for dynamically allocated tramps */ -#endif - ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) -}; - -void ftrace_graph_sleep_time_control(bool enable) -{ - fgraph_sleep_time = enable; -} - -void ftrace_graph_graph_time_control(bool enable) -{ - fgraph_graph_time = enable; -} - -int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) -{ - return 0; -} - -/* The callbacks that hook a function */ -trace_func_graph_ret_t ftrace_graph_return = - (trace_func_graph_ret_t)ftrace_stub; -trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; -static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub; - -/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ -static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) -{ - int i; - int ret = 0; - int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE; - struct task_struct *g, *t; - - for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) { - ret_stack_list[i] = - kmalloc_array(FTRACE_RETFUNC_DEPTH, - sizeof(struct ftrace_ret_stack), - GFP_KERNEL); - if (!ret_stack_list[i]) { - start = 0; - end = i; - ret = -ENOMEM; - goto free; - } - } - - read_lock(&tasklist_lock); - do_each_thread(g, t) { - if (start == end) { - ret = -EAGAIN; - goto unlock; - } - - if (t->ret_stack == NULL) { - atomic_set(&t->tracing_graph_pause, 0); - atomic_set(&t->trace_overrun, 0); - t->curr_ret_stack = -1; - t->curr_ret_depth = -1; - /* Make sure the tasks see the -1 first: */ - smp_wmb(); - t->ret_stack = ret_stack_list[start++]; - } - } while_each_thread(g, t); - -unlock: - read_unlock(&tasklist_lock); -free: - for (i = start; i < end; i++) - kfree(ret_stack_list[i]); - return ret; -} - -static void -ftrace_graph_probe_sched_switch(void *ignore, bool preempt, - struct task_struct *prev, struct task_struct *next) -{ - unsigned long long timestamp; - int index; - - /* - * Does the user want to count the time a function was asleep. - * If so, do not update the time stamps. - */ - if (fgraph_sleep_time) - return; - - timestamp = trace_clock_local(); - - prev->ftrace_timestamp = timestamp; - - /* only process tasks that we timestamped */ - if (!next->ftrace_timestamp) - return; - - /* - * Update all the counters in next to make up for the - * time next was sleeping. - */ - timestamp -= next->ftrace_timestamp; - - for (index = next->curr_ret_stack; index >= 0; index--) - next->ret_stack[index].calltime += timestamp; -} - -/* Allocate a return stack for each task */ -static int start_graph_tracing(void) -{ - struct ftrace_ret_stack **ret_stack_list; - int ret, cpu; - - ret_stack_list = kmalloc_array(FTRACE_RETSTACK_ALLOC_SIZE, - sizeof(struct ftrace_ret_stack *), - GFP_KERNEL); - - if (!ret_stack_list) - return -ENOMEM; - - /* The cpu_boot init_task->ret_stack will never be freed */ - for_each_online_cpu(cpu) { - if (!idle_task(cpu)->ret_stack) - ftrace_graph_init_idle_task(idle_task(cpu), cpu); - } - - do { - ret = alloc_retstack_tasklist(ret_stack_list); - } while (ret == -EAGAIN); - - if (!ret) { - ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); - if (ret) - pr_info("ftrace_graph: Couldn't activate tracepoint" - " probe to kernel_sched_switch\n"); - } - - kfree(ret_stack_list); - return ret; -} - -/* - * Hibernation protection. - * The state of the current task is too much unstable during - * suspend/restore to disk. We want to protect against that. - */ -static int -ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state, - void *unused) -{ - switch (state) { - case PM_HIBERNATION_PREPARE: - pause_graph_tracing(); - break; - - case PM_POST_HIBERNATION: - unpause_graph_tracing(); - break; - } - return NOTIFY_DONE; -} - -static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) -{ - if (!ftrace_ops_test(&global_ops, trace->func, NULL)) - return 0; - return __ftrace_graph_entry(trace); -} - -/* - * The function graph tracer should only trace the functions defined - * by set_ftrace_filter and set_ftrace_notrace. If another function - * tracer ops is registered, the graph tracer requires testing the - * function against the global ops, and not just trace any function - * that any ftrace_ops registered. - */ -void update_function_graph_func(void) -{ - struct ftrace_ops *op; - bool do_test = false; - - /* - * The graph and global ops share the same set of functions - * to test. If any other ops is on the list, then - * the graph tracing needs to test if its the function - * it should call. - */ - do_for_each_ftrace_op(op, ftrace_ops_list) { - if (op != &global_ops && op != &graph_ops && - op != &ftrace_list_end) { - do_test = true; - /* in double loop, break out with goto */ - goto out; - } - } while_for_each_ftrace_op(op); - out: - if (do_test) - ftrace_graph_entry = ftrace_graph_entry_test; - else - ftrace_graph_entry = __ftrace_graph_entry; -} - -static struct notifier_block ftrace_suspend_notifier = { - .notifier_call = ftrace_suspend_notifier_call, -}; - -int register_ftrace_graph(trace_func_graph_ret_t retfunc, - trace_func_graph_ent_t entryfunc) -{ - int ret = 0; - - mutex_lock(&ftrace_lock); - - /* we currently allow only one tracer registered at a time */ - if (ftrace_graph_active) { - ret = -EBUSY; - goto out; - } - - register_pm_notifier(&ftrace_suspend_notifier); - - ftrace_graph_active++; - ret = start_graph_tracing(); - if (ret) { - ftrace_graph_active--; - goto out; - } - - ftrace_graph_return = retfunc; - - /* - * Update the indirect function to the entryfunc, and the - * function that gets called to the entry_test first. Then - * call the update fgraph entry function to determine if - * the entryfunc should be called directly or not. - */ - __ftrace_graph_entry = entryfunc; - ftrace_graph_entry = ftrace_graph_entry_test; - update_function_graph_func(); - - ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET); -out: - mutex_unlock(&ftrace_lock); - return ret; -} - -void unregister_ftrace_graph(void) -{ - mutex_lock(&ftrace_lock); - - if (unlikely(!ftrace_graph_active)) - goto out; - - ftrace_graph_active--; - ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; - ftrace_graph_entry = ftrace_graph_entry_stub; - __ftrace_graph_entry = ftrace_graph_entry_stub; - ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET); - unregister_pm_notifier(&ftrace_suspend_notifier); - unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); - - out: - mutex_unlock(&ftrace_lock); -} - -static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack); - -static void -graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) -{ - atomic_set(&t->tracing_graph_pause, 0); - atomic_set(&t->trace_overrun, 0); - t->ftrace_timestamp = 0; - /* make curr_ret_stack visible before we add the ret_stack */ - smp_wmb(); - t->ret_stack = ret_stack; -} - -/* - * Allocate a return stack for the idle task. May be the first - * time through, or it may be done by CPU hotplug online. - */ -void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) -{ - t->curr_ret_stack = -1; - t->curr_ret_depth = -1; - /* - * The idle task has no parent, it either has its own - * stack or no stack at all. - */ - if (t->ret_stack) - WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu)); - - if (ftrace_graph_active) { - struct ftrace_ret_stack *ret_stack; - - ret_stack = per_cpu(idle_ret_stack, cpu); - if (!ret_stack) { - ret_stack = - kmalloc_array(FTRACE_RETFUNC_DEPTH, - sizeof(struct ftrace_ret_stack), - GFP_KERNEL); - if (!ret_stack) - return; - per_cpu(idle_ret_stack, cpu) = ret_stack; - } - graph_init_task(t, ret_stack); - } -} - -/* Allocate a return stack for newly created task */ -void ftrace_graph_init_task(struct task_struct *t) -{ - /* Make sure we do not use the parent ret_stack */ - t->ret_stack = NULL; - t->curr_ret_stack = -1; - t->curr_ret_depth = -1; - - if (ftrace_graph_active) { - struct ftrace_ret_stack *ret_stack; - - ret_stack = kmalloc_array(FTRACE_RETFUNC_DEPTH, - sizeof(struct ftrace_ret_stack), - GFP_KERNEL); - if (!ret_stack) - return; - graph_init_task(t, ret_stack); - } -} - -void ftrace_graph_exit_task(struct task_struct *t) -{ - struct ftrace_ret_stack *ret_stack = t->ret_stack; - - t->ret_stack = NULL; - /* NULL must become visible to IRQs before we free it: */ - barrier(); - - kfree(ret_stack); -} -#endif -- cgit v1.2.3 From 317e04ca905ac6c4b33cb879e9a107c412125f14 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 28 Nov 2018 10:26:27 -0500 Subject: tracing: Rearrange functions in trace_sched_wakeup.c Rearrange the functions in trace_sched_wakeup.c so that there are fewer #ifdef CONFIG_FUNCTION_TRACER and #ifdef CONFIG_FUNCTION_GRAPH_TRACER, instead of having the #ifdefs spread all over. No functional change is made. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_sched_wakeup.c | 272 ++++++++++++++++++-------------------- 1 file changed, 130 insertions(+), 142 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 7d04b9890755..2ce78100b4d3 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -35,26 +35,19 @@ static arch_spinlock_t wakeup_lock = static void wakeup_reset(struct trace_array *tr); static void __wakeup_reset(struct trace_array *tr); +static int start_func_tracer(struct trace_array *tr, int graph); +static void stop_func_tracer(struct trace_array *tr, int graph); static int save_flags; #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int wakeup_display_graph(struct trace_array *tr, int set); # define is_graph(tr) ((tr)->trace_flags & TRACE_ITER_DISPLAY_GRAPH) #else -static inline int wakeup_display_graph(struct trace_array *tr, int set) -{ - return 0; -} # define is_graph(tr) false #endif - #ifdef CONFIG_FUNCTION_TRACER -static int wakeup_graph_entry(struct ftrace_graph_ent *trace); -static void wakeup_graph_return(struct ftrace_graph_ret *trace); - static bool function_enabled; /* @@ -104,122 +97,8 @@ out_enable: return 0; } -/* - * wakeup uses its own tracer function to keep the overhead down: - */ -static void -wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) -{ - struct trace_array *tr = wakeup_trace; - struct trace_array_cpu *data; - unsigned long flags; - int pc; - - if (!func_prolog_preempt_disable(tr, &data, &pc)) - return; - - local_irq_save(flags); - trace_function(tr, ip, parent_ip, flags, pc); - local_irq_restore(flags); - - atomic_dec(&data->disabled); - preempt_enable_notrace(); -} - -static int register_wakeup_function(struct trace_array *tr, int graph, int set) -{ - int ret; - - /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ - if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION))) - return 0; - - if (graph) - ret = register_ftrace_graph(&wakeup_graph_return, - &wakeup_graph_entry); - else - ret = register_ftrace_function(tr->ops); - - if (!ret) - function_enabled = true; - - return ret; -} - -static void unregister_wakeup_function(struct trace_array *tr, int graph) -{ - if (!function_enabled) - return; - - if (graph) - unregister_ftrace_graph(); - else - unregister_ftrace_function(tr->ops); - - function_enabled = false; -} - -static int wakeup_function_set(struct trace_array *tr, u32 mask, int set) -{ - if (!(mask & TRACE_ITER_FUNCTION)) - return 0; - - if (set) - register_wakeup_function(tr, is_graph(tr), 1); - else - unregister_wakeup_function(tr, is_graph(tr)); - return 1; -} -#else -static int register_wakeup_function(struct trace_array *tr, int graph, int set) -{ - return 0; -} -static void unregister_wakeup_function(struct trace_array *tr, int graph) { } -static int wakeup_function_set(struct trace_array *tr, u32 mask, int set) -{ - return 0; -} -#endif /* CONFIG_FUNCTION_TRACER */ - -static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) -{ - struct tracer *tracer = tr->current_trace; - - if (wakeup_function_set(tr, mask, set)) - return 0; - #ifdef CONFIG_FUNCTION_GRAPH_TRACER - if (mask & TRACE_ITER_DISPLAY_GRAPH) - return wakeup_display_graph(tr, set); -#endif - - return trace_keep_overwrite(tracer, mask, set); -} -static int start_func_tracer(struct trace_array *tr, int graph) -{ - int ret; - - ret = register_wakeup_function(tr, graph, 0); - - if (!ret && tracing_is_enabled()) - tracer_enabled = 1; - else - tracer_enabled = 0; - - return ret; -} - -static void stop_func_tracer(struct trace_array *tr, int graph) -{ - tracer_enabled = 0; - - unregister_wakeup_function(tr, graph); -} - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER static int wakeup_display_graph(struct trace_array *tr, int set) { if (!(is_graph(tr) ^ set)) @@ -318,20 +197,94 @@ static void wakeup_print_header(struct seq_file *s) else trace_default_header(s); } +#else /* CONFIG_FUNCTION_GRAPH_TRACER */ +static int wakeup_graph_entry(struct ftrace_graph_ent *trace) +{ + return -1; +} +static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } +#endif /* else CONFIG_FUNCTION_GRAPH_TRACER */ +/* + * wakeup uses its own tracer function to keep the overhead down: + */ static void -__trace_function(struct trace_array *tr, - unsigned long ip, unsigned long parent_ip, - unsigned long flags, int pc) +wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) { - if (is_graph(tr)) - trace_graph_function(tr, ip, parent_ip, flags, pc); + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + int pc; + + if (!func_prolog_preempt_disable(tr, &data, &pc)) + return; + + local_irq_save(flags); + trace_function(tr, ip, parent_ip, flags, pc); + local_irq_restore(flags); + + atomic_dec(&data->disabled); + preempt_enable_notrace(); +} + +static int register_wakeup_function(struct trace_array *tr, int graph, int set) +{ + int ret; + + /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ + if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION))) + return 0; + + if (graph) + ret = register_ftrace_graph(&wakeup_graph_return, + &wakeup_graph_entry); else - trace_function(tr, ip, parent_ip, flags, pc); + ret = register_ftrace_function(tr->ops); + + if (!ret) + function_enabled = true; + + return ret; } -#else -#define __trace_function trace_function +static void unregister_wakeup_function(struct trace_array *tr, int graph) +{ + if (!function_enabled) + return; + + if (graph) + unregister_ftrace_graph(); + else + unregister_ftrace_function(tr->ops); + + function_enabled = false; +} + +static int wakeup_function_set(struct trace_array *tr, u32 mask, int set) +{ + if (!(mask & TRACE_ITER_FUNCTION)) + return 0; + + if (set) + register_wakeup_function(tr, is_graph(tr), 1); + else + unregister_wakeup_function(tr, is_graph(tr)); + return 1; +} +#else /* CONFIG_FUNCTION_TRACER */ +static int register_wakeup_function(struct trace_array *tr, int graph, int set) +{ + return 0; +} +static void unregister_wakeup_function(struct trace_array *tr, int graph) { } +static int wakeup_function_set(struct trace_array *tr, u32 mask, int set) +{ + return 0; +} +#endif /* else CONFIG_FUNCTION_TRACER */ + +#ifndef CONFIG_FUNCTION_GRAPH_TRACER static enum print_line_t wakeup_print_line(struct trace_iterator *iter) { return TRACE_TYPE_UNHANDLED; @@ -340,23 +293,58 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter) static void wakeup_trace_open(struct trace_iterator *iter) { } static void wakeup_trace_close(struct trace_iterator *iter) { } -#ifdef CONFIG_FUNCTION_TRACER -static int wakeup_graph_entry(struct ftrace_graph_ent *trace) -{ - return -1; -} -static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } static void wakeup_print_header(struct seq_file *s) { trace_default_header(s); } -#else -static void wakeup_print_header(struct seq_file *s) +#endif /* !CONFIG_FUNCTION_GRAPH_TRACER */ + +static void +__trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned long flags, int pc) +{ + if (is_graph(tr)) + trace_graph_function(tr, ip, parent_ip, flags, pc); + else + trace_function(tr, ip, parent_ip, flags, pc); +} + +static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) { - trace_latency_header(s); + struct tracer *tracer = tr->current_trace; + + if (wakeup_function_set(tr, mask, set)) + return 0; + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (mask & TRACE_ITER_DISPLAY_GRAPH) + return wakeup_display_graph(tr, set); +#endif + + return trace_keep_overwrite(tracer, mask, set); +} + +static int start_func_tracer(struct trace_array *tr, int graph) +{ + int ret; + + ret = register_wakeup_function(tr, graph, 0); + + if (!ret && tracing_is_enabled()) + tracer_enabled = 1; + else + tracer_enabled = 0; + + return ret; +} + +static void stop_func_tracer(struct trace_array *tr, int graph) +{ + tracer_enabled = 0; + + unregister_wakeup_function(tr, graph); } -#endif /* CONFIG_FUNCTION_TRACER */ -#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ /* * Should this new latency be reported/recorded? -- cgit v1.2.3 From 688f7089d8851b1a81106f0c0b9b29181b2f2dc8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 15 Nov 2018 14:06:47 -0500 Subject: fgraph: Add new fgraph_ops structure to enable function graph hooks Currently the registering of function graph is to pass in a entry and return function. We need to have a way to associate those functions together where the entry can determine to run the return hook. Having a structure that contains both functions will facilitate the process of converting the code to be able to do such. This is similar to the way function hooks are enabled (it passes in ftrace_ops). Instead of passing in the functions to use, a single structure is passed in to the registering function. The unregister function is now passed in the fgraph_ops handle. When we allow more than one callback to the function graph hooks, this will let the system know which one to remove. Reviewed-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/fgraph.c | 9 ++++----- kernel/trace/ftrace.c | 10 +++++++--- kernel/trace/trace_functions_graph.c | 21 ++++++++++++++++----- kernel/trace/trace_irqsoff.c | 18 +++++++----------- kernel/trace/trace_sched_wakeup.c | 16 +++++++--------- kernel/trace/trace_selftest.c | 8 ++++++-- 6 files changed, 47 insertions(+), 35 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 374f3e42e29e..cc35606e9a3e 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -490,8 +490,7 @@ static int start_graph_tracing(void) return ret; } -int register_ftrace_graph(trace_func_graph_ret_t retfunc, - trace_func_graph_ent_t entryfunc) +int register_ftrace_graph(struct fgraph_ops *gops) { int ret = 0; @@ -512,7 +511,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, goto out; } - ftrace_graph_return = retfunc; + ftrace_graph_return = gops->retfunc; /* * Update the indirect function to the entryfunc, and the @@ -520,7 +519,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, * call the update fgraph entry function to determine if * the entryfunc should be called directly or not. */ - __ftrace_graph_entry = entryfunc; + __ftrace_graph_entry = gops->entryfunc; ftrace_graph_entry = ftrace_graph_entry_test; update_function_graph_func(); @@ -530,7 +529,7 @@ out: return ret; } -void unregister_ftrace_graph(void) +void unregister_ftrace_graph(struct fgraph_ops *gops) { mutex_lock(&ftrace_lock); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c53533b833cf..d06fe588e650 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -849,15 +849,19 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) local_irq_restore(flags); } +static struct fgraph_ops fprofiler_ops = { + .entryfunc = &profile_graph_entry, + .retfunc = &profile_graph_return, +}; + static int register_ftrace_profiler(void) { - return register_ftrace_graph(&profile_graph_return, - &profile_graph_entry); + return register_ftrace_graph(&fprofiler_ops); } static void unregister_ftrace_profiler(void) { - unregister_ftrace_graph(); + unregister_ftrace_graph(&fprofiler_ops); } #else static struct ftrace_ops ftrace_profile_ops __read_mostly = { diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 855c13c61e77..140b4b51ab34 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -345,17 +345,25 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace) trace_graph_return(trace); } +static struct fgraph_ops funcgraph_thresh_ops = { + .entryfunc = &trace_graph_entry, + .retfunc = &trace_graph_thresh_return, +}; + +static struct fgraph_ops funcgraph_ops = { + .entryfunc = &trace_graph_entry, + .retfunc = &trace_graph_return, +}; + static int graph_trace_init(struct trace_array *tr) { int ret; set_graph_array(tr); if (tracing_thresh) - ret = register_ftrace_graph(&trace_graph_thresh_return, - &trace_graph_entry); + ret = register_ftrace_graph(&funcgraph_thresh_ops); else - ret = register_ftrace_graph(&trace_graph_return, - &trace_graph_entry); + ret = register_ftrace_graph(&funcgraph_ops); if (ret) return ret; tracing_start_cmdline_record(); @@ -366,7 +374,10 @@ static int graph_trace_init(struct trace_array *tr) static void graph_trace_reset(struct trace_array *tr) { tracing_stop_cmdline_record(); - unregister_ftrace_graph(); + if (tracing_thresh) + unregister_ftrace_graph(&funcgraph_thresh_ops); + else + unregister_ftrace_graph(&funcgraph_ops); } static int graph_trace_update_thresh(struct trace_array *tr) diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 98ea6d28df15..d3294721f119 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -218,6 +218,11 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace) atomic_dec(&data->disabled); } +static struct fgraph_ops fgraph_ops = { + .entryfunc = &irqsoff_graph_entry, + .retfunc = &irqsoff_graph_return, +}; + static void irqsoff_trace_open(struct trace_iterator *iter) { if (is_graph(iter->tr)) @@ -272,13 +277,6 @@ __trace_function(struct trace_array *tr, #else #define __trace_function trace_function -#ifdef CONFIG_FUNCTION_TRACER -static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) -{ - return -1; -} -#endif - static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) { return TRACE_TYPE_UNHANDLED; @@ -288,7 +286,6 @@ static void irqsoff_trace_open(struct trace_iterator *iter) { } static void irqsoff_trace_close(struct trace_iterator *iter) { } #ifdef CONFIG_FUNCTION_TRACER -static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } static void irqsoff_print_header(struct seq_file *s) { trace_default_header(s); @@ -468,8 +465,7 @@ static int register_irqsoff_function(struct trace_array *tr, int graph, int set) return 0; if (graph) - ret = register_ftrace_graph(&irqsoff_graph_return, - &irqsoff_graph_entry); + ret = register_ftrace_graph(&fgraph_ops); else ret = register_ftrace_function(tr->ops); @@ -485,7 +481,7 @@ static void unregister_irqsoff_function(struct trace_array *tr, int graph) return; if (graph) - unregister_ftrace_graph(); + unregister_ftrace_graph(&fgraph_ops); else unregister_ftrace_function(tr->ops); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 2ce78100b4d3..4ea7e6845efb 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -162,6 +162,11 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace) return; } +static struct fgraph_ops fgraph_wakeup_ops = { + .entryfunc = &wakeup_graph_entry, + .retfunc = &wakeup_graph_return, +}; + static void wakeup_trace_open(struct trace_iterator *iter) { if (is_graph(iter->tr)) @@ -197,12 +202,6 @@ static void wakeup_print_header(struct seq_file *s) else trace_default_header(s); } -#else /* CONFIG_FUNCTION_GRAPH_TRACER */ -static int wakeup_graph_entry(struct ftrace_graph_ent *trace) -{ - return -1; -} -static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } #endif /* else CONFIG_FUNCTION_GRAPH_TRACER */ /* @@ -237,8 +236,7 @@ static int register_wakeup_function(struct trace_array *tr, int graph, int set) return 0; if (graph) - ret = register_ftrace_graph(&wakeup_graph_return, - &wakeup_graph_entry); + ret = register_ftrace_graph(&fgraph_wakeup_ops); else ret = register_ftrace_function(tr->ops); @@ -254,7 +252,7 @@ static void unregister_wakeup_function(struct trace_array *tr, int graph) return; if (graph) - unregister_ftrace_graph(); + unregister_ftrace_graph(&fgraph_wakeup_ops); else unregister_ftrace_function(tr->ops); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 11e9daa4a568..9d402e7fc949 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -741,6 +741,11 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace) return trace_graph_entry(trace); } +static struct fgraph_ops fgraph_ops __initdata = { + .entryfunc = &trace_graph_entry_watchdog, + .retfunc = &trace_graph_return, +}; + /* * Pretty much the same than for the function tracer from which the selftest * has been borrowed. @@ -765,8 +770,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, */ tracing_reset_online_cpus(&tr->trace_buffer); set_graph_array(tr); - ret = register_ftrace_graph(&trace_graph_return, - &trace_graph_entry_watchdog); + ret = register_ftrace_graph(&fgraph_ops); if (ret) { warn_failed_init_tracer(trace, ret); goto out; -- cgit v1.2.3 From 76b42b63ed0d004961097d3a3cd979129d4afd26 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 18 Nov 2018 18:36:19 -0500 Subject: function_graph: Move ftrace_graph_ret_addr() to fgraph.c Move the function function_graph_ret_addr() to fgraph.c, as the management of the curr_ret_stack is going to change, and all the accesses to ret_stack needs to be done in fgraph.c. Reviewed-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/fgraph.c | 55 ++++++++++++++++++++++++++++++++++++ kernel/trace/trace_functions_graph.c | 55 ------------------------------------ 2 files changed, 55 insertions(+), 55 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index cc35606e9a3e..90fcefcaff2a 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -232,6 +232,61 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) return ret; } +/** + * ftrace_graph_ret_addr - convert a potentially modified stack return address + * to its original value + * + * This function can be called by stack unwinding code to convert a found stack + * return address ('ret') to its original value, in case the function graph + * tracer has modified it to be 'return_to_handler'. If the address hasn't + * been modified, the unchanged value of 'ret' is returned. + * + * 'idx' is a state variable which should be initialized by the caller to zero + * before the first call. + * + * 'retp' is a pointer to the return address on the stack. It's ignored if + * the arch doesn't have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR defined. + */ +#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR +unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, + unsigned long ret, unsigned long *retp) +{ + int index = task->curr_ret_stack; + int i; + + if (ret != (unsigned long)return_to_handler) + return ret; + + if (index < 0) + return ret; + + for (i = 0; i <= index; i++) + if (task->ret_stack[i].retp == retp) + return task->ret_stack[i].ret; + + return ret; +} +#else /* !HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ +unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, + unsigned long ret, unsigned long *retp) +{ + int task_idx; + + if (ret != (unsigned long)return_to_handler) + return ret; + + task_idx = task->curr_ret_stack; + + if (!task->ret_stack || task_idx < *idx) + return ret; + + task_idx -= *idx; + (*idx)++; + + return task->ret_stack[task_idx].ret; +} +#endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ + static struct ftrace_ops graph_ops = { .func = ftrace_stub, .flags = FTRACE_OPS_FL_RECURSION_SAFE | diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 140b4b51ab34..c2af1560e856 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -94,61 +94,6 @@ static void print_graph_duration(struct trace_array *tr, unsigned long long duration, struct trace_seq *s, u32 flags); -/** - * ftrace_graph_ret_addr - convert a potentially modified stack return address - * to its original value - * - * This function can be called by stack unwinding code to convert a found stack - * return address ('ret') to its original value, in case the function graph - * tracer has modified it to be 'return_to_handler'. If the address hasn't - * been modified, the unchanged value of 'ret' is returned. - * - * 'idx' is a state variable which should be initialized by the caller to zero - * before the first call. - * - * 'retp' is a pointer to the return address on the stack. It's ignored if - * the arch doesn't have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR defined. - */ -#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR -unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, - unsigned long ret, unsigned long *retp) -{ - int index = task->curr_ret_stack; - int i; - - if (ret != (unsigned long)return_to_handler) - return ret; - - if (index < 0) - return ret; - - for (i = 0; i <= index; i++) - if (task->ret_stack[i].retp == retp) - return task->ret_stack[i].ret; - - return ret; -} -#else /* !HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ -unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, - unsigned long ret, unsigned long *retp) -{ - int task_idx; - - if (ret != (unsigned long)return_to_handler) - return ret; - - task_idx = task->curr_ret_stack; - - if (!task->ret_stack || task_idx < *idx) - return ret; - - task_idx -= *idx; - (*idx)++; - - return task->ret_stack[task_idx].ret; -} -#endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ - int __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned long flags, -- cgit v1.2.3 From b0e21a61d3196762b61f43ae994ffd255f646774 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 19 Nov 2018 20:54:08 -0500 Subject: function_graph: Have profiler use new helper ftrace_graph_get_ret_stack() The ret_stack processing is going to change, and that is going to break anything that is accessing the ret_stack directly. One user is the function graph profiler. By using the ftrace_graph_get_ret_stack() helper function, the profiler can access the ret_stack entry without relying on the implementation details of the stack itself. Reviewed-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/fgraph.c | 11 +++++++++++ kernel/trace/ftrace.c | 21 +++++++++++---------- 2 files changed, 22 insertions(+), 10 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 90fcefcaff2a..a3704ec8b599 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -232,6 +232,17 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) return ret; } +struct ftrace_ret_stack * +ftrace_graph_get_ret_stack(struct task_struct *task, int idx) +{ + idx = current->curr_ret_stack - idx; + + if (idx >= 0 && idx <= task->curr_ret_stack) + return ¤t->ret_stack[idx]; + + return NULL; +} + /** * ftrace_graph_ret_addr - convert a potentially modified stack return address * to its original value diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d06fe588e650..8ef9fc226037 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -792,7 +792,7 @@ void ftrace_graph_graph_time_control(bool enable) static int profile_graph_entry(struct ftrace_graph_ent *trace) { - int index = current->curr_ret_stack; + struct ftrace_ret_stack *ret_stack; function_profile_call(trace->func, 0, NULL, NULL); @@ -800,14 +800,16 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace) if (!current->ret_stack) return 0; - if (index >= 0 && index < FTRACE_RETFUNC_DEPTH) - current->ret_stack[index].subtime = 0; + ret_stack = ftrace_graph_get_ret_stack(current, 0); + if (ret_stack) + ret_stack->subtime = 0; return 1; } static void profile_graph_return(struct ftrace_graph_ret *trace) { + struct ftrace_ret_stack *ret_stack; struct ftrace_profile_stat *stat; unsigned long long calltime; struct ftrace_profile *rec; @@ -825,16 +827,15 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) calltime = trace->rettime - trace->calltime; if (!fgraph_graph_time) { - int index; - - index = current->curr_ret_stack; /* Append this call time to the parent time to subtract */ - if (index) - current->ret_stack[index - 1].subtime += calltime; + ret_stack = ftrace_graph_get_ret_stack(current, 1); + if (ret_stack) + ret_stack->subtime += calltime; - if (current->ret_stack[index].subtime < calltime) - calltime -= current->ret_stack[index].subtime; + ret_stack = ftrace_graph_get_ret_stack(current, 0); + if (ret_stack && ret_stack->subtime < calltime) + calltime -= ret_stack->subtime; else calltime = 0; } -- cgit v1.2.3 From ca16b0fbb05242f18da9d810c07d3882ffed831c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 20 Jun 2018 14:08:00 +0300 Subject: tracing: Have trace_stack nr_entries compare not be so subtle Dan Carpenter reviewed the trace_stack.c code and figured he found an off by one bug. "From reviewing the code, it seems possible for stack_trace_max.nr_entries to be set to .max_entries and in that case we would be reading one element beyond the end of the stack_dump_trace[] array. If it's not set to .max_entries then the bug doesn't affect runtime." Although it looks to be the case, it is not. Because we have: static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; struct stack_trace stack_trace_max = { .max_entries = STACK_TRACE_ENTRIES - 1, .entries = &stack_dump_trace[0], }; And: stack_trace_max.nr_entries = x; for (; x < i; x++) stack_dump_trace[x] = ULONG_MAX; Even if nr_entries equals max_entries, indexing with it into the stack_dump_trace[] array will not overflow the array. But if it is the case, the second part of the conditional that tests stack_dump_trace[nr_entries] to ULONG_MAX will always be true. By applying Dan's patch, it removes the subtle aspect of it and makes the if conditional slightly more efficient. Link: http://lkml.kernel.org/r/20180620110758.crunhd5bfep7zuiz@kili.mountain Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_stack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 2b0d1ee3241c..e2a153fc1afc 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -286,7 +286,7 @@ __next(struct seq_file *m, loff_t *pos) { long n = *pos - 1; - if (n > stack_trace_max.nr_entries || stack_dump_trace[n] == ULONG_MAX) + if (n >= stack_trace_max.nr_entries || stack_dump_trace[n] == ULONG_MAX) return NULL; m->private = (void *)n; -- cgit v1.2.3 From 2c2b0a78b373908926e4683ea5571332f63c0eb5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 29 Nov 2018 20:32:26 -0500 Subject: ring-buffer: Add percentage of ring buffer full to wake up reader Instead of just waiting for a page to be full before waking up a pending reader, allow the reader to pass in a "percentage" of pages that have content before waking up a reader. This should help keep the process of reading the events not cause wake ups that constantly cause reading of the buffer. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 71 ++++++++++++++++++++++++++++++++++++++++++---- kernel/trace/trace.c | 8 +++--- 2 files changed, 70 insertions(+), 9 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 65bd4616220d..9edb628603ab 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -487,6 +487,9 @@ struct ring_buffer_per_cpu { local_t dropped_events; local_t committing; local_t commits; + local_t pages_touched; + local_t pages_read; + size_t shortest_full; unsigned long read; unsigned long read_bytes; u64 write_stamp; @@ -529,6 +532,41 @@ struct ring_buffer_iter { u64 read_stamp; }; +/** + * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer + * @buffer: The ring_buffer to get the number of pages from + * @cpu: The cpu of the ring_buffer to get the number of pages from + * + * Returns the number of pages used by a per_cpu buffer of the ring buffer. + */ +size_t ring_buffer_nr_pages(struct ring_buffer *buffer, int cpu) +{ + return buffer->buffers[cpu]->nr_pages; +} + +/** + * ring_buffer_nr_pages_dirty - get the number of used pages in the ring buffer + * @buffer: The ring_buffer to get the number of pages from + * @cpu: The cpu of the ring_buffer to get the number of pages from + * + * Returns the number of pages that have content in the ring buffer. + */ +size_t ring_buffer_nr_dirty_pages(struct ring_buffer *buffer, int cpu) +{ + size_t read; + size_t cnt; + + read = local_read(&buffer->buffers[cpu]->pages_read); + cnt = local_read(&buffer->buffers[cpu]->pages_touched); + /* The reader can read an empty page, but not more than that */ + if (cnt < read) { + WARN_ON_ONCE(read > cnt + 1); + return 0; + } + + return cnt - read; +} + /* * rb_wake_up_waiters - wake up tasks waiting for ring buffer input * @@ -556,7 +594,7 @@ static void rb_wake_up_waiters(struct irq_work *work) * as data is added to any of the @buffer's cpu buffers. Otherwise * it will wait for data to be added to a specific cpu buffer. */ -int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) +int ring_buffer_wait(struct ring_buffer *buffer, int cpu, int full) { struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer); DEFINE_WAIT(wait); @@ -571,7 +609,7 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) if (cpu == RING_BUFFER_ALL_CPUS) { work = &buffer->irq_work; /* Full only makes sense on per cpu reads */ - full = false; + full = 0; } else { if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -ENODEV; @@ -623,15 +661,22 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) !ring_buffer_empty_cpu(buffer, cpu)) { unsigned long flags; bool pagebusy; + size_t nr_pages; + size_t dirty; if (!full) break; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; + nr_pages = cpu_buffer->nr_pages; + dirty = ring_buffer_nr_dirty_pages(buffer, cpu); + if (!cpu_buffer->shortest_full || + cpu_buffer->shortest_full < full) + cpu_buffer->shortest_full = full; raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - - if (!pagebusy) + if (!pagebusy && + (!nr_pages || (dirty * 100) > full * nr_pages)) break; } @@ -1054,6 +1099,7 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write); old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries); + local_inc(&cpu_buffer->pages_touched); /* * Just make sure we have seen our old_write and synchronize * with any interrupts that come in. @@ -2603,6 +2649,16 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) { + size_t nr_pages; + size_t dirty; + size_t full; + + full = cpu_buffer->shortest_full; + nr_pages = cpu_buffer->nr_pages; + dirty = ring_buffer_nr_dirty_pages(buffer, cpu_buffer->cpu); + if (full && nr_pages && (dirty * 100) <= full * nr_pages) + return; + cpu_buffer->irq_work.wakeup_full = true; cpu_buffer->irq_work.full_waiters_pending = false; /* irq_work_queue() supplies it's own memory barriers */ @@ -3732,13 +3788,15 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) goto spin; /* - * Yeah! We succeeded in replacing the page. + * Yay! We succeeded in replacing the page. * * Now make the new head point back to the reader page. */ rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list; rb_inc_page(cpu_buffer, &cpu_buffer->head_page); + local_inc(&cpu_buffer->pages_read); + /* Finally update the reader page to the new head */ cpu_buffer->reader_page = reader; cpu_buffer->reader_page->read = 0; @@ -4334,6 +4392,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->entries, 0); local_set(&cpu_buffer->committing, 0); local_set(&cpu_buffer->commits, 0); + local_set(&cpu_buffer->pages_touched, 0); + local_set(&cpu_buffer->pages_read, 0); + cpu_buffer->shortest_full = 0; cpu_buffer->read = 0; cpu_buffer->read_bytes = 0; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ff1c4b20cd0a..48d5eb22ff33 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1431,7 +1431,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) } #endif /* CONFIG_TRACER_MAX_TRACE */ -static int wait_on_pipe(struct trace_iterator *iter, bool full) +static int wait_on_pipe(struct trace_iterator *iter, int full) { /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) @@ -5693,7 +5693,7 @@ static int tracing_wait_pipe(struct file *filp) mutex_unlock(&iter->mutex); - ret = wait_on_pipe(iter, false); + ret = wait_on_pipe(iter, 0); mutex_lock(&iter->mutex); @@ -6751,7 +6751,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if ((filp->f_flags & O_NONBLOCK)) return -EAGAIN; - ret = wait_on_pipe(iter, false); + ret = wait_on_pipe(iter, 0); if (ret) return ret; @@ -6948,7 +6948,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) goto out; - ret = wait_on_pipe(iter, true); + ret = wait_on_pipe(iter, 1); if (ret) goto out; -- cgit v1.2.3 From 03329f9939781041424edd29487b9603a704fcd9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 29 Nov 2018 21:38:42 -0500 Subject: tracing: Add tracefs file buffer_percentage Add a "buffer_percentage" file, that allows users to specify how much of the buffer (percentage of pages) need to be filled before waking up a task blocked on a per cpu trace_pipe_raw file. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 39 +++++++++++++++++++-------------- kernel/trace/trace.c | 54 +++++++++++++++++++++++++++++++++++++++++++++- kernel/trace/trace.h | 1 + 3 files changed, 77 insertions(+), 17 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9edb628603ab..5434c16f2192 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -489,6 +489,7 @@ struct ring_buffer_per_cpu { local_t commits; local_t pages_touched; local_t pages_read; + long last_pages_touch; size_t shortest_full; unsigned long read; unsigned long read_bytes; @@ -2632,7 +2633,9 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, static __always_inline void rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) { - bool pagebusy; + size_t nr_pages; + size_t dirty; + size_t full; if (buffer->irq_work.waiters_pending) { buffer->irq_work.waiters_pending = false; @@ -2646,24 +2649,27 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) irq_work_queue(&cpu_buffer->irq_work.work); } - pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; + if (cpu_buffer->last_pages_touch == local_read(&cpu_buffer->pages_touched)) + return; - if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) { - size_t nr_pages; - size_t dirty; - size_t full; + if (cpu_buffer->reader_page == cpu_buffer->commit_page) + return; - full = cpu_buffer->shortest_full; - nr_pages = cpu_buffer->nr_pages; - dirty = ring_buffer_nr_dirty_pages(buffer, cpu_buffer->cpu); - if (full && nr_pages && (dirty * 100) <= full * nr_pages) - return; + if (!cpu_buffer->irq_work.full_waiters_pending) + return; - cpu_buffer->irq_work.wakeup_full = true; - cpu_buffer->irq_work.full_waiters_pending = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&cpu_buffer->irq_work.work); - } + cpu_buffer->last_pages_touch = local_read(&cpu_buffer->pages_touched); + + full = cpu_buffer->shortest_full; + nr_pages = cpu_buffer->nr_pages; + dirty = ring_buffer_nr_dirty_pages(buffer, cpu_buffer->cpu); + if (full && nr_pages && (dirty * 100) <= full * nr_pages) + return; + + cpu_buffer->irq_work.wakeup_full = true; + cpu_buffer->irq_work.full_waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); } /* @@ -4394,6 +4400,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->commits, 0); local_set(&cpu_buffer->pages_touched, 0); local_set(&cpu_buffer->pages_read, 0); + cpu_buffer->last_pages_touch = 0; cpu_buffer->shortest_full = 0; cpu_buffer->read = 0; cpu_buffer->read_bytes = 0; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 48d5eb22ff33..d382fd1aa4a6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6948,7 +6948,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) goto out; - ret = wait_on_pipe(iter, 1); + ret = wait_on_pipe(iter, iter->tr->buffer_percent); if (ret) goto out; @@ -7662,6 +7662,53 @@ static const struct file_operations rb_simple_fops = { .llseek = default_llseek, }; +static ssize_t +buffer_percent_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[64]; + int r; + + r = tr->buffer_percent; + r = sprintf(buf, "%d\n", r); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +buffer_percent_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + if (val > 100) + return -EINVAL; + + if (!val) + val = 1; + + tr->buffer_percent = val; + + (*ppos)++; + + return cnt; +} + +static const struct file_operations buffer_percent_fops = { + .open = tracing_open_generic_tr, + .read = buffer_percent_read, + .write = buffer_percent_write, + .release = tracing_release_generic_tr, + .llseek = default_llseek, +}; + struct dentry *trace_instance_dir; static void @@ -7970,6 +8017,11 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("timestamp_mode", 0444, d_tracer, tr, &trace_time_stamp_mode_fops); + tr->buffer_percent = 1; + + trace_create_file("buffer_percent", 0444, d_tracer, + tr, &buffer_percent_fops); + create_trace_options_dir(tr); #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ab16eca76e59..08900828d282 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -247,6 +247,7 @@ struct trace_array { int clock_id; int nr_topts; bool clear_trace; + int buffer_percent; struct tracer *current_trace; unsigned int trace_flags; unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; -- cgit v1.2.3 From a7b1d74e872a4299e1aef66a648316c9c23e5ab4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 29 Nov 2018 22:36:47 -0500 Subject: tracing: Change default buffer_percent to 50 After running several tests, it appears that having the reader wait till half the buffer is full before starting to read (and causing its own events to fill up the ring buffer constantly), works well. It keeps trace-cmd (the main user of this interface) from dominating the traces it records. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d382fd1aa4a6..194c01838e3f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8017,7 +8017,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("timestamp_mode", 0444, d_tracer, tr, &trace_time_stamp_mode_fops); - tr->buffer_percent = 1; + tr->buffer_percent = 50; trace_create_file("buffer_percent", 0444, d_tracer, tr, &buffer_percent_fops); -- cgit v1.2.3 From 547cd9eacd1c699c8d1fa77c95c6cdb33b2eba6a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Nov 2018 18:00:15 +0900 Subject: tracing/uprobes: Add busy check when cleanup all uprobes Add a busy check loop in cleanup_all_probes() before trying to remove all events in uprobe_events, the same way that kprobe_events does. Without this change, writing null to uprobe_events will try to remove events but if one of them is enabled, it will stop there leaving some events cleared and others not clceared. With this change, writing null to uprobe_events makes sure all events are not enabled before removing events. So, it clears all events, or returns an error (-EBUSY) with keeping all events. Link: http://lkml.kernel.org/r/154140841557.17322.12653952888762532401.stgit@devbox Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 31ea48eceda1..b708e4ff7ea7 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -587,12 +587,19 @@ static int cleanup_all_probes(void) int ret = 0; mutex_lock(&uprobe_lock); + /* Ensure no probe is in use. */ + list_for_each_entry(tu, &uprobe_list, list) + if (trace_probe_is_enabled(&tu->tp)) { + ret = -EBUSY; + goto end; + } while (!list_empty(&uprobe_list)) { tu = list_entry(uprobe_list.next, struct trace_uprobe, list); ret = unregister_trace_uprobe(tu); if (ret) break; } +end: mutex_unlock(&uprobe_lock); return ret; } -- cgit v1.2.3 From fc800a10be26017f8f338bc8e500d48e3e6429d9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Nov 2018 18:00:43 +0900 Subject: tracing: Lock event_mutex before synth_event_mutex synthetic event is using synth_event_mutex for protecting synth_event_list, and event_trigger_write() path acquires locks as below order. event_trigger_write(event_mutex) ->trigger_process_regex(trigger_cmd_mutex) ->event_hist_trigger_func(synth_event_mutex) On the other hand, synthetic event creation and deletion paths call trace_add_event_call() and trace_remove_event_call() which acquires event_mutex. In that case, if we keep the synth_event_mutex locked while registering/unregistering synthetic events, its dependency will be inversed. To avoid this issue, current synthetic event is using a 2 phase process to create/delete events. For example, it searches existing events under synth_event_mutex to check for event-name conflicts, and unlocks synth_event_mutex, then registers a new event under event_mutex locked. Finally, it locks synth_event_mutex and tries to add the new event to the list. But it can introduce complexity and a chance for name conflicts. To solve this simpler, this introduces trace_add_event_call_nolock() and trace_remove_event_call_nolock() which don't acquire event_mutex inside. synthetic event can lock event_mutex before synth_event_mutex to solve the lock dependency issue simpler. Link: http://lkml.kernel.org/r/154140844377.17322.13781091165954002713.stgit@devbox Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events.c | 34 ++++++++++++++++++++++++++++------ kernel/trace/trace_events_hist.c | 24 ++++++++++-------------- 2 files changed, 38 insertions(+), 20 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f94be0c2827b..a3b157f689ee 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2305,11 +2305,11 @@ __trace_early_add_new_event(struct trace_event_call *call, struct ftrace_module_file_ops; static void __add_event_to_tracers(struct trace_event_call *call); -/* Add an additional event_call dynamically */ -int trace_add_event_call(struct trace_event_call *call) +int trace_add_event_call_nolock(struct trace_event_call *call) { int ret; - mutex_lock(&event_mutex); + lockdep_assert_held(&event_mutex); + mutex_lock(&trace_types_lock); ret = __register_event(call, NULL); @@ -2317,6 +2317,16 @@ int trace_add_event_call(struct trace_event_call *call) __add_event_to_tracers(call); mutex_unlock(&trace_types_lock); + return ret; +} + +/* Add an additional event_call dynamically */ +int trace_add_event_call(struct trace_event_call *call) +{ + int ret; + + mutex_lock(&event_mutex); + ret = trace_add_event_call_nolock(call); mutex_unlock(&event_mutex); return ret; } @@ -2366,17 +2376,29 @@ static int probe_remove_event_call(struct trace_event_call *call) return 0; } -/* Remove an event_call */ -int trace_remove_event_call(struct trace_event_call *call) +/* no event_mutex version */ +int trace_remove_event_call_nolock(struct trace_event_call *call) { int ret; - mutex_lock(&event_mutex); + lockdep_assert_held(&event_mutex); + mutex_lock(&trace_types_lock); down_write(&trace_event_sem); ret = probe_remove_event_call(call); up_write(&trace_event_sem); mutex_unlock(&trace_types_lock); + + return ret; +} + +/* Remove an event_call */ +int trace_remove_event_call(struct trace_event_call *call) +{ + int ret; + + mutex_lock(&event_mutex); + ret = trace_remove_event_call_nolock(call); mutex_unlock(&event_mutex); return ret; diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index eb908ef2ecec..1670c65389fe 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -912,7 +912,7 @@ static int register_synth_event(struct synth_event *event) call->data = event; call->tp = event->tp; - ret = trace_add_event_call(call); + ret = trace_add_event_call_nolock(call); if (ret) { pr_warn("Failed to register synthetic event: %s\n", trace_event_name(call)); @@ -936,7 +936,7 @@ static int unregister_synth_event(struct synth_event *event) struct trace_event_call *call = &event->call; int ret; - ret = trace_remove_event_call(call); + ret = trace_remove_event_call_nolock(call); return ret; } @@ -1013,12 +1013,10 @@ static void add_or_delete_synth_event(struct synth_event *event, int delete) if (delete) free_synth_event(event); else { - mutex_lock(&synth_event_mutex); if (!find_synth_event(event->name)) list_add(&event->list, &synth_event_list); else free_synth_event(event); - mutex_unlock(&synth_event_mutex); } } @@ -1030,6 +1028,7 @@ static int create_synth_event(int argc, char **argv) int i, consumed = 0, n_fields = 0, ret = 0; char *name; + mutex_lock(&event_mutex); mutex_lock(&synth_event_mutex); /* @@ -1102,8 +1101,6 @@ static int create_synth_event(int argc, char **argv) goto err; } out: - mutex_unlock(&synth_event_mutex); - if (event) { if (delete_event) { ret = unregister_synth_event(event); @@ -1113,10 +1110,13 @@ static int create_synth_event(int argc, char **argv) add_or_delete_synth_event(event, ret); } } + mutex_unlock(&synth_event_mutex); + mutex_unlock(&event_mutex); return ret; err: mutex_unlock(&synth_event_mutex); + mutex_unlock(&event_mutex); for (i = 0; i < n_fields; i++) free_synth_field(fields[i]); @@ -1127,12 +1127,10 @@ static int create_synth_event(int argc, char **argv) static int release_all_synth_events(void) { - struct list_head release_events; struct synth_event *event, *e; int ret = 0; - INIT_LIST_HEAD(&release_events); - + mutex_lock(&event_mutex); mutex_lock(&synth_event_mutex); list_for_each_entry(event, &synth_event_list, list) { @@ -1142,16 +1140,14 @@ static int release_all_synth_events(void) } } - list_splice_init(&event->list, &release_events); - - mutex_unlock(&synth_event_mutex); - - list_for_each_entry_safe(event, e, &release_events, list) { + list_for_each_entry_safe(event, e, &synth_event_list, list) { list_del(&event->list); ret = unregister_synth_event(event); add_or_delete_synth_event(event, !ret); } + mutex_unlock(&synth_event_mutex); + mutex_unlock(&event_mutex); return ret; } -- cgit v1.2.3 From faacb361f271be4baf2d807e2eeaba87e059225f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Nov 2018 18:01:12 +0900 Subject: tracing: Simplify creation and deletion of synthetic events Since the event_mutex and synth_event_mutex ordering issue is gone, we can skip existing event check when adding or deleting events, and some redundant code in error path. This changes release_all_synth_events() to abort the process when it hits any error and returns the error code. It succeeds only if it has no error. Link: http://lkml.kernel.org/r/154140847194.17322.17960275728005067803.stgit@devbox Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 53 ++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 35 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1670c65389fe..0feb7f460123 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1008,18 +1008,6 @@ struct hist_var_data { struct hist_trigger_data *hist_data; }; -static void add_or_delete_synth_event(struct synth_event *event, int delete) -{ - if (delete) - free_synth_event(event); - else { - if (!find_synth_event(event->name)) - list_add(&event->list, &synth_event_list); - else - free_synth_event(event); - } -} - static int create_synth_event(int argc, char **argv) { struct synth_field *field, *fields[SYNTH_FIELDS_MAX]; @@ -1052,15 +1040,16 @@ static int create_synth_event(int argc, char **argv) if (event) { if (delete_event) { if (event->ref) { - event = NULL; ret = -EBUSY; goto out; } - list_del(&event->list); - goto out; - } - event = NULL; - ret = -EEXIST; + ret = unregister_synth_event(event); + if (!ret) { + list_del(&event->list); + free_synth_event(event); + } + } else + ret = -EEXIST; goto out; } else if (delete_event) { ret = -ENOENT; @@ -1100,29 +1089,21 @@ static int create_synth_event(int argc, char **argv) event = NULL; goto err; } + ret = register_synth_event(event); + if (!ret) + list_add(&event->list, &synth_event_list); + else + free_synth_event(event); out: - if (event) { - if (delete_event) { - ret = unregister_synth_event(event); - add_or_delete_synth_event(event, !ret); - } else { - ret = register_synth_event(event); - add_or_delete_synth_event(event, ret); - } - } mutex_unlock(&synth_event_mutex); mutex_unlock(&event_mutex); return ret; err: - mutex_unlock(&synth_event_mutex); - mutex_unlock(&event_mutex); - for (i = 0; i < n_fields; i++) free_synth_field(fields[i]); - free_synth_event(event); - return ret; + goto out; } static int release_all_synth_events(void) @@ -1141,10 +1122,12 @@ static int release_all_synth_events(void) } list_for_each_entry_safe(event, e, &synth_event_list, list) { - list_del(&event->list); - ret = unregister_synth_event(event); - add_or_delete_synth_event(event, !ret); + if (!ret) { + list_del(&event->list); + free_synth_event(event); + } else + break; } mutex_unlock(&synth_event_mutex); mutex_unlock(&event_mutex); -- cgit v1.2.3 From d00bbea9456f35fb34310d454e561f05d68d07fe Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Nov 2018 18:01:40 +0900 Subject: tracing: Integrate similar probe argument parsers Integrate similar argument parsers for kprobes and uprobes events into traceprobe_parse_probe_arg(). Link: http://lkml.kernel.org/r/154140850016.17322.9836787731210512176.stgit@devbox Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 48 ++------------------------------------------- kernel/trace/trace_probe.c | 47 +++++++++++++++++++++++++++++++++++++++++--- kernel/trace/trace_probe.h | 7 ++----- kernel/trace/trace_uprobe.c | 44 ++--------------------------------------- 4 files changed, 50 insertions(+), 96 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index fec67188c4d2..d313bcc259dc 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -548,7 +548,6 @@ static int create_trace_kprobe(int argc, char **argv) bool is_return = false, is_delete = false; char *symbol = NULL, *event = NULL, *group = NULL; int maxactive = 0; - char *arg; long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; @@ -676,53 +675,10 @@ static int create_trace_kprobe(int argc, char **argv) } /* parse arguments */ - ret = 0; for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { - struct probe_arg *parg = &tk->tp.args[i]; - - /* Increment count for freeing args in error case */ - tk->tp.nr_args++; - - /* Parse argument name */ - arg = strchr(argv[i], '='); - if (arg) { - *arg++ = '\0'; - parg->name = kstrdup(argv[i], GFP_KERNEL); - } else { - arg = argv[i]; - /* If argument name is omitted, set "argN" */ - snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); - parg->name = kstrdup(buf, GFP_KERNEL); - } - - if (!parg->name) { - pr_info("Failed to allocate argument[%d] name.\n", i); - ret = -ENOMEM; - goto error; - } - - if (!is_good_name(parg->name)) { - pr_info("Invalid argument[%d] name: %s\n", - i, parg->name); - ret = -EINVAL; - goto error; - } - - if (traceprobe_conflict_field_name(parg->name, - tk->tp.args, i)) { - pr_info("Argument[%d] name '%s' conflicts with " - "another field.\n", i, argv[i]); - ret = -EINVAL; - goto error; - } - - /* Parse fetch argument */ - ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, - flags); - if (ret) { - pr_info("Parse error at argument[%d]. (%d)\n", i, ret); + ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], flags); + if (ret) goto error; - } } ret = register_trace_kprobe(tk); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index bd30e9398d2a..449150c6a87f 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -348,7 +348,7 @@ static int __parse_bitfield_probe_arg(const char *bf, } /* String length checking wrapper */ -int traceprobe_parse_probe_arg(char *arg, ssize_t *size, +static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, struct probe_arg *parg, unsigned int flags) { struct fetch_insn *code, *scode, *tmp = NULL; @@ -491,8 +491,8 @@ fail: } /* Return 1 if name is reserved or already used by another argument */ -int traceprobe_conflict_field_name(const char *name, - struct probe_arg *args, int narg) +static int traceprobe_conflict_field_name(const char *name, + struct probe_arg *args, int narg) { int i; @@ -507,6 +507,47 @@ int traceprobe_conflict_field_name(const char *name, return 0; } +int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg, + unsigned int flags) +{ + struct probe_arg *parg = &tp->args[i]; + char *body; + int ret; + + /* Increment count for freeing args in error case */ + tp->nr_args++; + + body = strchr(arg, '='); + if (body) { + parg->name = kmemdup_nul(arg, body - arg, GFP_KERNEL); + body++; + } else { + /* If argument name is omitted, set "argN" */ + parg->name = kasprintf(GFP_KERNEL, "arg%d", i + 1); + body = arg; + } + if (!parg->name) + return -ENOMEM; + + if (!is_good_name(parg->name)) { + pr_info("Invalid argument[%d] name: %s\n", + i, parg->name); + return -EINVAL; + } + + if (traceprobe_conflict_field_name(parg->name, tp->args, i)) { + pr_info("Argument[%d]: '%s' conflicts with another field.\n", + i, parg->name); + return -EINVAL; + } + + /* Parse fetch argument */ + ret = traceprobe_parse_probe_arg_body(body, &tp->size, parg, flags); + if (ret) + pr_info("Parse error at argument[%d]. (%d)\n", i, ret); + return ret; +} + void traceprobe_free_probe_arg(struct probe_arg *arg) { struct fetch_insn *code = arg->code; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 974afc1a3e73..feeec261b356 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -272,11 +272,8 @@ find_event_file_link(struct trace_probe *tp, struct trace_event_file *file) #define TPARG_FL_FENTRY BIT(2) #define TPARG_FL_MASK GENMASK(2, 0) -extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, - struct probe_arg *parg, unsigned int flags); - -extern int traceprobe_conflict_field_name(const char *name, - struct probe_arg *args, int narg); +extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, + char *arg, unsigned int flags); extern int traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index b708e4ff7ea7..6eaaa2150685 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -517,51 +517,11 @@ static int create_trace_uprobe(int argc, char **argv) } /* parse arguments */ - ret = 0; for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { - struct probe_arg *parg = &tu->tp.args[i]; - - /* Increment count for freeing args in error case */ - tu->tp.nr_args++; - - /* Parse argument name */ - arg = strchr(argv[i], '='); - if (arg) { - *arg++ = '\0'; - parg->name = kstrdup(argv[i], GFP_KERNEL); - } else { - arg = argv[i]; - /* If argument name is omitted, set "argN" */ - snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); - parg->name = kstrdup(buf, GFP_KERNEL); - } - - if (!parg->name) { - pr_info("Failed to allocate argument[%d] name.\n", i); - ret = -ENOMEM; - goto error; - } - - if (!is_good_name(parg->name)) { - pr_info("Invalid argument[%d] name: %s\n", i, parg->name); - ret = -EINVAL; - goto error; - } - - if (traceprobe_conflict_field_name(parg->name, tu->tp.args, i)) { - pr_info("Argument[%d] name '%s' conflicts with " - "another field.\n", i, argv[i]); - ret = -EINVAL; - goto error; - } - - /* Parse fetch argument */ - ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg, + ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], is_return ? TPARG_FL_RETURN : 0); - if (ret) { - pr_info("Parse error at argument[%d]. (%d)\n", i, ret); + if (ret) goto error; - } } ret = register_trace_uprobe(tu); -- cgit v1.2.3 From 5448d44c38557fc15d1c53b608a9c9f0e1ca8f86 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Nov 2018 18:02:08 +0900 Subject: tracing: Add unified dynamic event framework Add unified dynamic event framework for ftrace kprobes, uprobes and synthetic events. Those dynamic events can be co-exist on same file because those syntax doesn't overlap. This introduces a framework part which provides a unified tracefs interface and operations. Link: http://lkml.kernel.org/r/154140852824.17322.12250362185969352095.stgit@devbox Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 3 + kernel/trace/Makefile | 1 + kernel/trace/trace.c | 4 + kernel/trace/trace_dynevent.c | 210 ++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_dynevent.h | 119 ++++++++++++++++++++++++ 5 files changed, 337 insertions(+) create mode 100644 kernel/trace/trace_dynevent.c create mode 100644 kernel/trace/trace_dynevent.h (limited to 'kernel/trace') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5e3de28c7677..bf2e8a5a91f1 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -518,6 +518,9 @@ config BPF_EVENTS help This allows the user to attach BPF programs to kprobe events. +config DYNAMIC_EVENTS + def_bool n + config PROBE_EVENTS def_bool n diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c7ade7965464..c2b2148bb1d2 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -79,6 +79,7 @@ endif ifeq ($(CONFIG_TRACING),y) obj-$(CONFIG_KGDB_KDB) += trace_kdb.o endif +obj-$(CONFIG_DYNAMIC_EVENTS) += trace_dynevent.o obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 194c01838e3f..7e0332f90ed4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4604,6 +4604,10 @@ static const char readme_msg[] = "\t\t\t traces\n" #endif #endif /* CONFIG_STACK_TRACER */ +#ifdef CONFIG_DYNAMIC_EVENTS + " dynamic_events\t\t- Add/remove/show the generic dynamic events\n" + "\t\t\t Write into this file to define/undefine new trace events.\n" +#endif #ifdef CONFIG_KPROBE_EVENTS " kprobe_events\t\t- Add/remove/show the kernel dynamic events\n" "\t\t\t Write into this file to define/undefine new trace events.\n" diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c new file mode 100644 index 000000000000..f17a887abb66 --- /dev/null +++ b/kernel/trace/trace_dynevent.c @@ -0,0 +1,210 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Generic dynamic event control interface + * + * Copyright (C) 2018 Masami Hiramatsu + */ + +#include +#include +#include +#include +#include +#include + +#include "trace.h" +#include "trace_dynevent.h" + +static DEFINE_MUTEX(dyn_event_ops_mutex); +static LIST_HEAD(dyn_event_ops_list); + +int dyn_event_register(struct dyn_event_operations *ops) +{ + if (!ops || !ops->create || !ops->show || !ops->is_busy || + !ops->free || !ops->match) + return -EINVAL; + + INIT_LIST_HEAD(&ops->list); + mutex_lock(&dyn_event_ops_mutex); + list_add_tail(&ops->list, &dyn_event_ops_list); + mutex_unlock(&dyn_event_ops_mutex); + return 0; +} + +int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type) +{ + struct dyn_event *pos, *n; + char *system = NULL, *event, *p; + int ret = -ENOENT; + + if (argv[0][1] != ':') + return -EINVAL; + + event = &argv[0][2]; + p = strchr(event, '/'); + if (p) { + system = event; + event = p + 1; + *p = '\0'; + } + if (event[0] == '\0') + return -EINVAL; + + mutex_lock(&event_mutex); + for_each_dyn_event_safe(pos, n) { + if (type && type != pos->ops) + continue; + if (pos->ops->match(system, event, pos)) { + ret = pos->ops->free(pos); + break; + } + } + mutex_unlock(&event_mutex); + + return ret; +} + +static int create_dyn_event(int argc, char **argv) +{ + struct dyn_event_operations *ops; + int ret; + + if (argv[0][0] == '-') + return dyn_event_release(argc, argv, NULL); + + mutex_lock(&dyn_event_ops_mutex); + list_for_each_entry(ops, &dyn_event_ops_list, list) { + ret = ops->create(argc, (const char **)argv); + if (!ret || ret != -ECANCELED) + break; + } + mutex_unlock(&dyn_event_ops_mutex); + if (ret == -ECANCELED) + ret = -EINVAL; + + return ret; +} + +/* Protected by event_mutex */ +LIST_HEAD(dyn_event_list); + +void *dyn_event_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&event_mutex); + return seq_list_start(&dyn_event_list, *pos); +} + +void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &dyn_event_list, pos); +} + +void dyn_event_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&event_mutex); +} + +static int dyn_event_seq_show(struct seq_file *m, void *v) +{ + struct dyn_event *ev = v; + + if (ev && ev->ops) + return ev->ops->show(m, ev); + + return 0; +} + +static const struct seq_operations dyn_event_seq_op = { + .start = dyn_event_seq_start, + .next = dyn_event_seq_next, + .stop = dyn_event_seq_stop, + .show = dyn_event_seq_show +}; + +/* + * dyn_events_release_all - Release all specific events + * @type: the dyn_event_operations * which filters releasing events + * + * This releases all events which ->ops matches @type. If @type is NULL, + * all events are released. + * Return -EBUSY if any of them are in use, and return other errors when + * it failed to free the given event. Except for -EBUSY, event releasing + * process will be aborted at that point and there may be some other + * releasable events on the list. + */ +int dyn_events_release_all(struct dyn_event_operations *type) +{ + struct dyn_event *ev, *tmp; + int ret = 0; + + mutex_lock(&event_mutex); + for_each_dyn_event(ev) { + if (type && ev->ops != type) + continue; + if (ev->ops->is_busy(ev)) { + ret = -EBUSY; + goto out; + } + } + for_each_dyn_event_safe(ev, tmp) { + if (type && ev->ops != type) + continue; + ret = ev->ops->free(ev); + if (ret) + break; + } +out: + mutex_unlock(&event_mutex); + + return ret; +} + +static int dyn_event_open(struct inode *inode, struct file *file) +{ + int ret; + + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + ret = dyn_events_release_all(NULL); + if (ret < 0) + return ret; + } + + return seq_open(file, &dyn_event_seq_op); +} + +static ssize_t dyn_event_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + return trace_parse_run_command(file, buffer, count, ppos, + create_dyn_event); +} + +static const struct file_operations dynamic_events_ops = { + .owner = THIS_MODULE, + .open = dyn_event_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = dyn_event_write, +}; + +/* Make a tracefs interface for controlling dynamic events */ +static __init int init_dynamic_event(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + if (IS_ERR(d_tracer)) + return 0; + + entry = tracefs_create_file("dynamic_events", 0644, d_tracer, + NULL, &dynamic_events_ops); + + /* Event list interface */ + if (!entry) + pr_warn("Could not create tracefs 'dynamic_events' entry\n"); + + return 0; +} +fs_initcall(init_dynamic_event); diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h new file mode 100644 index 000000000000..8c334064e4d6 --- /dev/null +++ b/kernel/trace/trace_dynevent.h @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common header file for generic dynamic events. + */ + +#ifndef _TRACE_DYNEVENT_H +#define _TRACE_DYNEVENT_H + +#include +#include +#include +#include + +#include "trace.h" + +struct dyn_event; + +/** + * struct dyn_event_operations - Methods for each type of dynamic events + * + * These methods must be set for each type, since there is no default method. + * Before using this for dyn_event_init(), it must be registered by + * dyn_event_register(). + * + * @create: Parse and create event method. This is invoked when user passes + * a event definition to dynamic_events interface. This must not destruct + * the arguments and return -ECANCELED if given arguments doesn't match its + * command prefix. + * @show: Showing method. This is invoked when user reads the event definitions + * via dynamic_events interface. + * @is_busy: Check whether given event is busy so that it can not be deleted. + * Return true if it is busy, otherwides false. + * @free: Delete the given event. Return 0 if success, otherwides error. + * @match: Check whether given event and system name match this event. + * Return true if it matches, otherwides false. + * + * Except for @create, these methods are called under holding event_mutex. + */ +struct dyn_event_operations { + struct list_head list; + int (*create)(int argc, const char *argv[]); + int (*show)(struct seq_file *m, struct dyn_event *ev); + bool (*is_busy)(struct dyn_event *ev); + int (*free)(struct dyn_event *ev); + bool (*match)(const char *system, const char *event, + struct dyn_event *ev); +}; + +/* Register new dyn_event type -- must be called at first */ +int dyn_event_register(struct dyn_event_operations *ops); + +/** + * struct dyn_event - Dynamic event list header + * + * The dyn_event structure encapsulates a list and a pointer to the operators + * for making a global list of dynamic events. + * User must includes this in each event structure, so that those events can + * be added/removed via dynamic_events interface. + */ +struct dyn_event { + struct list_head list; + struct dyn_event_operations *ops; +}; + +extern struct list_head dyn_event_list; + +static inline +int dyn_event_init(struct dyn_event *ev, struct dyn_event_operations *ops) +{ + if (!ev || !ops) + return -EINVAL; + + INIT_LIST_HEAD(&ev->list); + ev->ops = ops; + return 0; +} + +static inline int dyn_event_add(struct dyn_event *ev) +{ + lockdep_assert_held(&event_mutex); + + if (!ev || !ev->ops) + return -EINVAL; + + list_add_tail(&ev->list, &dyn_event_list); + return 0; +} + +static inline void dyn_event_remove(struct dyn_event *ev) +{ + lockdep_assert_held(&event_mutex); + list_del_init(&ev->list); +} + +void *dyn_event_seq_start(struct seq_file *m, loff_t *pos); +void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos); +void dyn_event_seq_stop(struct seq_file *m, void *v); +int dyn_events_release_all(struct dyn_event_operations *type); +int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type); + +/* + * for_each_dyn_event - iterate over the dyn_event list + * @pos: the struct dyn_event * to use as a loop cursor + * + * This is just a basement of for_each macro. Wrap this for + * each actual event structure with ops filtering. + */ +#define for_each_dyn_event(pos) \ + list_for_each_entry(pos, &dyn_event_list, list) + +/* + * for_each_dyn_event - iterate over the dyn_event list safely + * @pos: the struct dyn_event * to use as a loop cursor + * @n: the struct dyn_event * to use as temporary storage + */ +#define for_each_dyn_event_safe(pos, n) \ + list_for_each_entry_safe(pos, n, &dyn_event_list, list) + +#endif -- cgit v1.2.3 From 6212dd29683eec51d6d05374a66ac953e81250e9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Nov 2018 18:02:36 +0900 Subject: tracing/kprobes: Use dyn_event framework for kprobe events Use dyn_event framework for kprobe events. This shows kprobe events on "tracing/dynamic_events" file. User can also define new events via tracing/dynamic_events. Link: http://lkml.kernel.org/r/154140855646.17322.6619219995865980392.stgit@devbox Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 1 + kernel/trace/trace_kprobe.c | 319 ++++++++++++++++++++++++-------------------- kernel/trace/trace_probe.c | 27 ++++ kernel/trace/trace_probe.h | 2 + 4 files changed, 204 insertions(+), 145 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index bf2e8a5a91f1..c0f6b0105609 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -461,6 +461,7 @@ config KPROBE_EVENTS bool "Enable kprobes-based dynamic events" select TRACING select PROBE_EVENTS + select DYNAMIC_EVENTS default y help This allows the user to add tracing events (similar to tracepoints) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d313bcc259dc..bdf8c2ad5152 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -12,6 +12,7 @@ #include #include +#include "trace_dynevent.h" #include "trace_kprobe_selftest.h" #include "trace_probe.h" #include "trace_probe_tmpl.h" @@ -19,17 +20,51 @@ #define KPROBE_EVENT_SYSTEM "kprobes" #define KRETPROBE_MAXACTIVE_MAX 4096 +static int trace_kprobe_create(int argc, const char **argv); +static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev); +static int trace_kprobe_release(struct dyn_event *ev); +static bool trace_kprobe_is_busy(struct dyn_event *ev); +static bool trace_kprobe_match(const char *system, const char *event, + struct dyn_event *ev); + +static struct dyn_event_operations trace_kprobe_ops = { + .create = trace_kprobe_create, + .show = trace_kprobe_show, + .is_busy = trace_kprobe_is_busy, + .free = trace_kprobe_release, + .match = trace_kprobe_match, +}; + /** * Kprobe event core functions */ struct trace_kprobe { - struct list_head list; + struct dyn_event devent; struct kretprobe rp; /* Use rp.kp for kprobe use */ unsigned long __percpu *nhit; const char *symbol; /* symbol name */ struct trace_probe tp; }; +static bool is_trace_kprobe(struct dyn_event *ev) +{ + return ev->ops == &trace_kprobe_ops; +} + +static struct trace_kprobe *to_trace_kprobe(struct dyn_event *ev) +{ + return container_of(ev, struct trace_kprobe, devent); +} + +/** + * for_each_trace_kprobe - iterate over the trace_kprobe list + * @pos: the struct trace_kprobe * for each entry + * @dpos: the struct dyn_event * to use as a loop cursor + */ +#define for_each_trace_kprobe(pos, dpos) \ + for_each_dyn_event(dpos) \ + if (is_trace_kprobe(dpos) && (pos = to_trace_kprobe(dpos))) + #define SIZEOF_TRACE_KPROBE(n) \ (offsetof(struct trace_kprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) @@ -81,6 +116,22 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) return ret; } +static bool trace_kprobe_is_busy(struct dyn_event *ev) +{ + struct trace_kprobe *tk = to_trace_kprobe(ev); + + return trace_probe_is_enabled(&tk->tp); +} + +static bool trace_kprobe_match(const char *system, const char *event, + struct dyn_event *ev) +{ + struct trace_kprobe *tk = to_trace_kprobe(ev); + + return strcmp(trace_event_name(&tk->tp.call), event) == 0 && + (!system || strcmp(tk->tp.call.class->system, system) == 0); +} + static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) { unsigned long nhit = 0; @@ -128,9 +179,6 @@ bool trace_kprobe_error_injectable(struct trace_event_call *call) static int register_kprobe_event(struct trace_kprobe *tk); static int unregister_kprobe_event(struct trace_kprobe *tk); -static DEFINE_MUTEX(probe_lock); -static LIST_HEAD(probe_list); - static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); static int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs); @@ -192,7 +240,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, if (!tk->tp.class.system) goto error; - INIT_LIST_HEAD(&tk->list); + dyn_event_init(&tk->devent, &trace_kprobe_ops); INIT_LIST_HEAD(&tk->tp.files); return tk; error: @@ -207,6 +255,9 @@ static void free_trace_kprobe(struct trace_kprobe *tk) { int i; + if (!tk) + return; + for (i = 0; i < tk->tp.nr_args; i++) traceprobe_free_probe_arg(&tk->tp.args[i]); @@ -220,9 +271,10 @@ static void free_trace_kprobe(struct trace_kprobe *tk) static struct trace_kprobe *find_trace_kprobe(const char *event, const char *group) { + struct dyn_event *pos; struct trace_kprobe *tk; - list_for_each_entry(tk, &probe_list, list) + for_each_trace_kprobe(tk, pos) if (strcmp(trace_event_name(&tk->tp.call), event) == 0 && strcmp(tk->tp.call.class->system, group) == 0) return tk; @@ -321,7 +373,7 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) * created with perf_event_open. We don't need to wait for these * trace_kprobes */ - if (list_empty(&tk->list)) + if (list_empty(&tk->devent.list)) wait = 0; out: if (wait) { @@ -419,7 +471,7 @@ static void __unregister_trace_kprobe(struct trace_kprobe *tk) } } -/* Unregister a trace_probe and probe_event: call with locking probe_lock */ +/* Unregister a trace_probe and probe_event */ static int unregister_trace_kprobe(struct trace_kprobe *tk) { /* Enabled event can not be unregistered */ @@ -431,7 +483,7 @@ static int unregister_trace_kprobe(struct trace_kprobe *tk) return -EBUSY; __unregister_trace_kprobe(tk); - list_del(&tk->list); + dyn_event_remove(&tk->devent); return 0; } @@ -442,7 +494,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk) struct trace_kprobe *old_tk; int ret; - mutex_lock(&probe_lock); + mutex_lock(&event_mutex); /* Delete old (same name) event if exist */ old_tk = find_trace_kprobe(trace_event_name(&tk->tp.call), @@ -471,10 +523,10 @@ static int register_trace_kprobe(struct trace_kprobe *tk) if (ret < 0) unregister_kprobe_event(tk); else - list_add_tail(&tk->list, &probe_list); + dyn_event_add(&tk->devent); end: - mutex_unlock(&probe_lock); + mutex_unlock(&event_mutex); return ret; } @@ -483,6 +535,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, unsigned long val, void *data) { struct module *mod = data; + struct dyn_event *pos; struct trace_kprobe *tk; int ret; @@ -490,8 +543,8 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, return NOTIFY_DONE; /* Update probes on coming module */ - mutex_lock(&probe_lock); - list_for_each_entry(tk, &probe_list, list) { + mutex_lock(&event_mutex); + for_each_trace_kprobe(tk, pos) { if (trace_kprobe_within_module(tk, mod)) { /* Don't need to check busy - this should have gone. */ __unregister_trace_kprobe(tk); @@ -502,7 +555,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, mod->name, ret); } } - mutex_unlock(&probe_lock); + mutex_unlock(&event_mutex); return NOTIFY_DONE; } @@ -520,7 +573,7 @@ static inline void sanitize_event_name(char *name) *name = '_'; } -static int create_trace_kprobe(int argc, char **argv) +static int trace_kprobe_create(int argc, const char *argv[]) { /* * Argument syntax: @@ -544,9 +597,10 @@ static int create_trace_kprobe(int argc, char **argv) * FETCHARG:TYPE : use TYPE instead of unsigned long. */ struct trace_kprobe *tk; - int i, ret = 0; - bool is_return = false, is_delete = false; - char *symbol = NULL, *event = NULL, *group = NULL; + int i, len, ret = 0; + bool is_return = false; + char *symbol = NULL, *tmp = NULL; + const char *event = NULL, *group = KPROBE_EVENT_SYSTEM; int maxactive = 0; long offset = 0; void *addr = NULL; @@ -554,26 +608,26 @@ static int create_trace_kprobe(int argc, char **argv) unsigned int flags = TPARG_FL_KERNEL; /* argc must be >= 1 */ - if (argv[0][0] == 'p') - is_return = false; - else if (argv[0][0] == 'r') { + if (argv[0][0] == 'r') { is_return = true; flags |= TPARG_FL_RETURN; - } else if (argv[0][0] == '-') - is_delete = true; - else { - pr_info("Probe definition must be started with 'p', 'r' or" - " '-'.\n"); - return -EINVAL; - } + } else if (argv[0][0] != 'p' || argc < 2) + return -ECANCELED; event = strchr(&argv[0][1], ':'); - if (event) { - event[0] = '\0'; + if (event) event++; - } + if (is_return && isdigit(argv[0][1])) { - ret = kstrtouint(&argv[0][1], 0, &maxactive); + if (event) + len = event - &argv[0][1] - 1; + else + len = strlen(&argv[0][1]); + if (len > MAX_EVENT_NAME_LEN - 1) + return -E2BIG; + memcpy(buf, &argv[0][1], len); + buf[len] = '\0'; + ret = kstrtouint(buf, 0, &maxactive); if (ret) { pr_info("Failed to parse maxactive.\n"); return ret; @@ -588,74 +642,37 @@ static int create_trace_kprobe(int argc, char **argv) } } - if (event) { - char *slash; - - slash = strchr(event, '/'); - if (slash) { - group = event; - event = slash + 1; - slash[0] = '\0'; - if (strlen(group) == 0) { - pr_info("Group name is not specified\n"); - return -EINVAL; - } - } - if (strlen(event) == 0) { - pr_info("Event name is not specified\n"); - return -EINVAL; - } - } - if (!group) - group = KPROBE_EVENT_SYSTEM; - - if (is_delete) { - if (!event) { - pr_info("Delete command needs an event name.\n"); - return -EINVAL; - } - mutex_lock(&probe_lock); - tk = find_trace_kprobe(event, group); - if (!tk) { - mutex_unlock(&probe_lock); - pr_info("Event %s/%s doesn't exist.\n", group, event); - return -ENOENT; - } - /* delete an event */ - ret = unregister_trace_kprobe(tk); - if (ret == 0) - free_trace_kprobe(tk); - mutex_unlock(&probe_lock); - return ret; - } - - if (argc < 2) { - pr_info("Probe point is not specified.\n"); - return -EINVAL; - } - /* try to parse an address. if that fails, try to read the * input as a symbol. */ if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) { + /* Check whether uprobe event specified */ + if (strchr(argv[1], '/') && strchr(argv[1], ':')) + return -ECANCELED; /* a symbol specified */ - symbol = argv[1]; + symbol = kstrdup(argv[1], GFP_KERNEL); + if (!symbol) + return -ENOMEM; /* TODO: support .init module functions */ ret = traceprobe_split_symbol_offset(symbol, &offset); if (ret || offset < 0 || offset > UINT_MAX) { pr_info("Failed to parse either an address or a symbol.\n"); - return ret; + goto out; } if (kprobe_on_func_entry(NULL, symbol, offset)) flags |= TPARG_FL_FENTRY; if (offset && is_return && !(flags & TPARG_FL_FENTRY)) { pr_info("Given offset is not valid for return probe.\n"); - return -EINVAL; + ret = -EINVAL; + goto out; } } argc -= 2; argv += 2; - /* setup a probe */ - if (!event) { + if (event) { + ret = traceprobe_parse_event_name(&event, &group, buf); + if (ret) + goto out; + } else { /* Make a new event name */ if (symbol) snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld", @@ -666,17 +683,27 @@ static int create_trace_kprobe(int argc, char **argv) sanitize_event_name(buf); event = buf; } + + /* setup a probe */ tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, argc, is_return); if (IS_ERR(tk)) { pr_info("Failed to allocate trace_probe.(%d)\n", (int)PTR_ERR(tk)); - return PTR_ERR(tk); + ret = PTR_ERR(tk); + goto out; } /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { - ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], flags); + tmp = kstrdup(argv[i], GFP_KERNEL); + if (!tmp) { + ret = -ENOMEM; + goto error; + } + + ret = traceprobe_parse_probe_arg(&tk->tp, i, tmp, flags); + kfree(tmp); if (ret) goto error; } @@ -684,60 +711,39 @@ static int create_trace_kprobe(int argc, char **argv) ret = register_trace_kprobe(tk); if (ret) goto error; - return 0; +out: + kfree(symbol); + return ret; error: free_trace_kprobe(tk); - return ret; + goto out; } -static int release_all_trace_kprobes(void) +static int create_or_delete_trace_kprobe(int argc, char **argv) { - struct trace_kprobe *tk; - int ret = 0; - - mutex_lock(&probe_lock); - /* Ensure no probe is in use. */ - list_for_each_entry(tk, &probe_list, list) - if (trace_probe_is_enabled(&tk->tp)) { - ret = -EBUSY; - goto end; - } - /* TODO: Use batch unregistration */ - while (!list_empty(&probe_list)) { - tk = list_entry(probe_list.next, struct trace_kprobe, list); - ret = unregister_trace_kprobe(tk); - if (ret) - goto end; - free_trace_kprobe(tk); - } - -end: - mutex_unlock(&probe_lock); + int ret; - return ret; -} + if (argv[0][0] == '-') + return dyn_event_release(argc, argv, &trace_kprobe_ops); -/* Probes listing interfaces */ -static void *probes_seq_start(struct seq_file *m, loff_t *pos) -{ - mutex_lock(&probe_lock); - return seq_list_start(&probe_list, *pos); + ret = trace_kprobe_create(argc, (const char **)argv); + return ret == -ECANCELED ? -EINVAL : ret; } -static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos) +static int trace_kprobe_release(struct dyn_event *ev) { - return seq_list_next(v, &probe_list, pos); -} + struct trace_kprobe *tk = to_trace_kprobe(ev); + int ret = unregister_trace_kprobe(tk); -static void probes_seq_stop(struct seq_file *m, void *v) -{ - mutex_unlock(&probe_lock); + if (!ret) + free_trace_kprobe(tk); + return ret; } -static int probes_seq_show(struct seq_file *m, void *v) +static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev) { - struct trace_kprobe *tk = v; + struct trace_kprobe *tk = to_trace_kprobe(ev); int i; seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p'); @@ -759,10 +765,20 @@ static int probes_seq_show(struct seq_file *m, void *v) return 0; } +static int probes_seq_show(struct seq_file *m, void *v) +{ + struct dyn_event *ev = v; + + if (!is_trace_kprobe(ev)) + return 0; + + return trace_kprobe_show(m, ev); +} + static const struct seq_operations probes_seq_op = { - .start = probes_seq_start, - .next = probes_seq_next, - .stop = probes_seq_stop, + .start = dyn_event_seq_start, + .next = dyn_event_seq_next, + .stop = dyn_event_seq_stop, .show = probes_seq_show }; @@ -771,7 +787,7 @@ static int probes_open(struct inode *inode, struct file *file) int ret; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { - ret = release_all_trace_kprobes(); + ret = dyn_events_release_all(&trace_kprobe_ops); if (ret < 0) return ret; } @@ -783,7 +799,7 @@ static ssize_t probes_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { return trace_parse_run_command(file, buffer, count, ppos, - create_trace_kprobe); + create_or_delete_trace_kprobe); } static const struct file_operations kprobe_events_ops = { @@ -798,8 +814,13 @@ static const struct file_operations kprobe_events_ops = { /* Probes profiling interfaces */ static int probes_profile_seq_show(struct seq_file *m, void *v) { - struct trace_kprobe *tk = v; + struct dyn_event *ev = v; + struct trace_kprobe *tk; + if (!is_trace_kprobe(ev)) + return 0; + + tk = to_trace_kprobe(ev); seq_printf(m, " %-44s %15lu %15lu\n", trace_event_name(&tk->tp.call), trace_kprobe_nhit(tk), @@ -809,9 +830,9 @@ static int probes_profile_seq_show(struct seq_file *m, void *v) } static const struct seq_operations profile_seq_op = { - .start = probes_seq_start, - .next = probes_seq_next, - .stop = probes_seq_stop, + .start = dyn_event_seq_start, + .next = dyn_event_seq_next, + .stop = dyn_event_seq_stop, .show = probes_profile_seq_show }; @@ -1332,7 +1353,7 @@ static int register_kprobe_event(struct trace_kprobe *tk) kfree(call->print_fmt); return -ENODEV; } - ret = trace_add_event_call(call); + ret = trace_add_event_call_nolock(call); if (ret) { pr_info("Failed to register kprobe event: %s\n", trace_event_name(call)); @@ -1347,7 +1368,7 @@ static int unregister_kprobe_event(struct trace_kprobe *tk) int ret; /* tp->event is unregistered in trace_remove_event_call() */ - ret = trace_remove_event_call(&tk->tp.call); + ret = trace_remove_event_call_nolock(&tk->tp.call); if (!ret) kfree(tk->tp.call.print_fmt); return ret; @@ -1364,7 +1385,7 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, char *event; /* - * local trace_kprobes are not added to probe_list, so they are never + * local trace_kprobes are not added to dyn_event, so they are never * searched in find_trace_kprobe(). Therefore, there is no concern of * duplicated name here. */ @@ -1422,6 +1443,11 @@ static __init int init_kprobe_trace(void) { struct dentry *d_tracer; struct dentry *entry; + int ret; + + ret = dyn_event_register(&trace_kprobe_ops); + if (ret) + return ret; if (register_module_notifier(&trace_kprobe_module_nb)) return -EINVAL; @@ -1479,9 +1505,8 @@ static __init int kprobe_trace_self_tests_init(void) pr_info("Testing kprobe tracing: "); - ret = trace_run_command("p:testprobe kprobe_trace_selftest_target " - "$stack $stack0 +0($stack)", - create_trace_kprobe); + ret = trace_run_command("p:testprobe kprobe_trace_selftest_target $stack $stack0 +0($stack)", + create_or_delete_trace_kprobe); if (WARN_ON_ONCE(ret)) { pr_warn("error on probing function entry.\n"); warn++; @@ -1501,8 +1526,8 @@ static __init int kprobe_trace_self_tests_init(void) } } - ret = trace_run_command("r:testprobe2 kprobe_trace_selftest_target " - "$retval", create_trace_kprobe); + ret = trace_run_command("r:testprobe2 kprobe_trace_selftest_target $retval", + create_or_delete_trace_kprobe); if (WARN_ON_ONCE(ret)) { pr_warn("error on probing function return.\n"); warn++; @@ -1572,20 +1597,24 @@ static __init int kprobe_trace_self_tests_init(void) disable_trace_kprobe(tk, file); } - ret = trace_run_command("-:testprobe", create_trace_kprobe); + ret = trace_run_command("-:testprobe", create_or_delete_trace_kprobe); if (WARN_ON_ONCE(ret)) { pr_warn("error on deleting a probe.\n"); warn++; } - ret = trace_run_command("-:testprobe2", create_trace_kprobe); + ret = trace_run_command("-:testprobe2", create_or_delete_trace_kprobe); if (WARN_ON_ONCE(ret)) { pr_warn("error on deleting a probe.\n"); warn++; } end: - release_all_trace_kprobes(); + ret = dyn_events_release_all(&trace_kprobe_ops); + if (WARN_ON_ONCE(ret)) { + pr_warn("error on cleaning up probes.\n"); + warn++; + } /* * Wait for the optimizer work to finish. Otherwise it might fiddle * with probes in already freed __init text. diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 449150c6a87f..ff86417c0149 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -154,6 +154,33 @@ int traceprobe_split_symbol_offset(char *symbol, long *offset) return 0; } +/* @buf must has MAX_EVENT_NAME_LEN size */ +int traceprobe_parse_event_name(const char **pevent, const char **pgroup, + char *buf) +{ + const char *slash, *event = *pevent; + + slash = strchr(event, '/'); + if (slash) { + if (slash == event) { + pr_info("Group name is not specified\n"); + return -EINVAL; + } + if (slash - event + 1 > MAX_EVENT_NAME_LEN) { + pr_info("Group name is too long\n"); + return -E2BIG; + } + strlcpy(buf, event, slash - event + 1); + *pgroup = buf; + *pevent = slash + 1; + } + if (strlen(event) == 0) { + pr_info("Event name is not specified\n"); + return -EINVAL; + } + return 0; +} + #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) static int parse_probe_vars(char *arg, const struct fetch_type *t, diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index feeec261b356..8a63f8bc01bc 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -279,6 +279,8 @@ extern int traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); extern int traceprobe_split_symbol_offset(char *symbol, long *offset); +extern int traceprobe_parse_event_name(const char **pevent, + const char **pgroup, char *buf); extern int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return); -- cgit v1.2.3 From 0597c49c69d53f00df99421d832453f4e92a5006 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Nov 2018 18:03:04 +0900 Subject: tracing/uprobes: Use dyn_event framework for uprobe events Use dyn_event framework for uprobe events. This shows uprobe events on "dynamic_events" file. User can also define new uprobe events via dynamic_events. Link: http://lkml.kernel.org/r/154140858481.17322.9091293846515154065.stgit@devbox Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 1 + kernel/trace/trace_uprobe.c | 278 +++++++++++++++++++++++--------------------- 2 files changed, 149 insertions(+), 130 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index c0f6b0105609..2cab3c5dfe2c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -501,6 +501,7 @@ config UPROBE_EVENTS depends on PERF_EVENTS select UPROBES select PROBE_EVENTS + select DYNAMIC_EVENTS select TRACING default y help diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 6eaaa2150685..4a7b21c891f3 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -7,6 +7,7 @@ */ #define pr_fmt(fmt) "trace_kprobe: " fmt +#include #include #include #include @@ -14,6 +15,7 @@ #include #include +#include "trace_dynevent.h" #include "trace_probe.h" #include "trace_probe_tmpl.h" @@ -37,11 +39,26 @@ struct trace_uprobe_filter { struct list_head perf_events; }; +static int trace_uprobe_create(int argc, const char **argv); +static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev); +static int trace_uprobe_release(struct dyn_event *ev); +static bool trace_uprobe_is_busy(struct dyn_event *ev); +static bool trace_uprobe_match(const char *system, const char *event, + struct dyn_event *ev); + +static struct dyn_event_operations trace_uprobe_ops = { + .create = trace_uprobe_create, + .show = trace_uprobe_show, + .is_busy = trace_uprobe_is_busy, + .free = trace_uprobe_release, + .match = trace_uprobe_match, +}; + /* * uprobe event core functions */ struct trace_uprobe { - struct list_head list; + struct dyn_event devent; struct trace_uprobe_filter filter; struct uprobe_consumer consumer; struct path path; @@ -53,6 +70,25 @@ struct trace_uprobe { struct trace_probe tp; }; +static bool is_trace_uprobe(struct dyn_event *ev) +{ + return ev->ops == &trace_uprobe_ops; +} + +static struct trace_uprobe *to_trace_uprobe(struct dyn_event *ev) +{ + return container_of(ev, struct trace_uprobe, devent); +} + +/** + * for_each_trace_uprobe - iterate over the trace_uprobe list + * @pos: the struct trace_uprobe * for each entry + * @dpos: the struct dyn_event * to use as a loop cursor + */ +#define for_each_trace_uprobe(pos, dpos) \ + for_each_dyn_event(dpos) \ + if (is_trace_uprobe(dpos) && (pos = to_trace_uprobe(dpos))) + #define SIZEOF_TRACE_UPROBE(n) \ (offsetof(struct trace_uprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) @@ -60,9 +96,6 @@ struct trace_uprobe { static int register_uprobe_event(struct trace_uprobe *tu); static int unregister_uprobe_event(struct trace_uprobe *tu); -static DEFINE_MUTEX(uprobe_lock); -static LIST_HEAD(uprobe_list); - struct uprobe_dispatch_data { struct trace_uprobe *tu; unsigned long bp_addr; @@ -209,6 +242,22 @@ static inline bool is_ret_probe(struct trace_uprobe *tu) return tu->consumer.ret_handler != NULL; } +static bool trace_uprobe_is_busy(struct dyn_event *ev) +{ + struct trace_uprobe *tu = to_trace_uprobe(ev); + + return trace_probe_is_enabled(&tu->tp); +} + +static bool trace_uprobe_match(const char *system, const char *event, + struct dyn_event *ev) +{ + struct trace_uprobe *tu = to_trace_uprobe(ev); + + return strcmp(trace_event_name(&tu->tp.call), event) == 0 && + (!system || strcmp(tu->tp.call.class->system, system) == 0); +} + /* * Allocate new trace_uprobe and initialize it (including uprobes). */ @@ -236,7 +285,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) if (!tu->tp.class.system) goto error; - INIT_LIST_HEAD(&tu->list); + dyn_event_init(&tu->devent, &trace_uprobe_ops); INIT_LIST_HEAD(&tu->tp.files); tu->consumer.handler = uprobe_dispatcher; if (is_ret) @@ -255,6 +304,9 @@ static void free_trace_uprobe(struct trace_uprobe *tu) { int i; + if (!tu) + return; + for (i = 0; i < tu->tp.nr_args; i++) traceprobe_free_probe_arg(&tu->tp.args[i]); @@ -267,9 +319,10 @@ static void free_trace_uprobe(struct trace_uprobe *tu) static struct trace_uprobe *find_probe_event(const char *event, const char *group) { + struct dyn_event *pos; struct trace_uprobe *tu; - list_for_each_entry(tu, &uprobe_list, list) + for_each_trace_uprobe(tu, pos) if (strcmp(trace_event_name(&tu->tp.call), event) == 0 && strcmp(tu->tp.call.class->system, group) == 0) return tu; @@ -277,7 +330,7 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou return NULL; } -/* Unregister a trace_uprobe and probe_event: call with locking uprobe_lock */ +/* Unregister a trace_uprobe and probe_event */ static int unregister_trace_uprobe(struct trace_uprobe *tu) { int ret; @@ -286,7 +339,7 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu) if (ret) return ret; - list_del(&tu->list); + dyn_event_remove(&tu->devent); free_trace_uprobe(tu); return 0; } @@ -302,13 +355,14 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu) */ static struct trace_uprobe *find_old_trace_uprobe(struct trace_uprobe *new) { + struct dyn_event *pos; struct trace_uprobe *tmp, *old = NULL; struct inode *new_inode = d_real_inode(new->path.dentry); old = find_probe_event(trace_event_name(&new->tp.call), new->tp.call.class->system); - list_for_each_entry(tmp, &uprobe_list, list) { + for_each_trace_uprobe(tmp, pos) { if ((old ? old != tmp : true) && new_inode == d_real_inode(tmp->path.dentry) && new->offset == tmp->offset && @@ -326,7 +380,7 @@ static int register_trace_uprobe(struct trace_uprobe *tu) struct trace_uprobe *old_tu; int ret; - mutex_lock(&uprobe_lock); + mutex_lock(&event_mutex); /* register as an event */ old_tu = find_old_trace_uprobe(tu); @@ -348,10 +402,10 @@ static int register_trace_uprobe(struct trace_uprobe *tu) goto end; } - list_add_tail(&tu->list, &uprobe_list); + dyn_event_add(&tu->devent); end: - mutex_unlock(&uprobe_lock); + mutex_unlock(&event_mutex); return ret; } @@ -362,91 +416,49 @@ end: * * - Remove uprobe: -:[GRP/]EVENT */ -static int create_trace_uprobe(int argc, char **argv) +static int trace_uprobe_create(int argc, const char **argv) { struct trace_uprobe *tu; - char *arg, *event, *group, *filename, *rctr, *rctr_end; + const char *event = NULL, *group = UPROBE_EVENT_SYSTEM; + char *arg, *filename, *rctr, *rctr_end, *tmp; char buf[MAX_EVENT_NAME_LEN]; struct path path; unsigned long offset, ref_ctr_offset; - bool is_delete, is_return; + bool is_return = false; int i, ret; ret = 0; - is_delete = false; - is_return = false; - event = NULL; - group = NULL; ref_ctr_offset = 0; /* argc must be >= 1 */ - if (argv[0][0] == '-') - is_delete = true; - else if (argv[0][0] == 'r') + if (argv[0][0] == 'r') is_return = true; - else if (argv[0][0] != 'p') { - pr_info("Probe definition must be started with 'p', 'r' or '-'.\n"); - return -EINVAL; - } + else if (argv[0][0] != 'p' || argc < 2) + return -ECANCELED; - if (argv[0][1] == ':') { + if (argv[0][1] == ':') event = &argv[0][2]; - arg = strchr(event, '/'); - if (arg) { - group = event; - event = arg + 1; - event[-1] = '\0'; + if (!strchr(argv[1], '/')) + return -ECANCELED; - if (strlen(group) == 0) { - pr_info("Group name is not specified\n"); - return -EINVAL; - } - } - if (strlen(event) == 0) { - pr_info("Event name is not specified\n"); - return -EINVAL; - } - } - if (!group) - group = UPROBE_EVENT_SYSTEM; - - if (is_delete) { - int ret; - - if (!event) { - pr_info("Delete command needs an event name.\n"); - return -EINVAL; - } - mutex_lock(&uprobe_lock); - tu = find_probe_event(event, group); - - if (!tu) { - mutex_unlock(&uprobe_lock); - pr_info("Event %s/%s doesn't exist.\n", group, event); - return -ENOENT; - } - /* delete an event */ - ret = unregister_trace_uprobe(tu); - mutex_unlock(&uprobe_lock); - return ret; - } + filename = kstrdup(argv[1], GFP_KERNEL); + if (!filename) + return -ENOMEM; - if (argc < 2) { - pr_info("Probe point is not specified.\n"); - return -EINVAL; - } /* Find the last occurrence, in case the path contains ':' too. */ - arg = strrchr(argv[1], ':'); - if (!arg) - return -EINVAL; + arg = strrchr(filename, ':'); + if (!arg || !isdigit(arg[1])) { + kfree(filename); + return -ECANCELED; + } *arg++ = '\0'; - filename = argv[1]; ret = kern_path(filename, LOOKUP_FOLLOW, &path); - if (ret) + if (ret) { + kfree(filename); return ret; - + } if (!d_is_reg(path.dentry)) { ret = -EINVAL; goto fail_address_parse; @@ -480,7 +492,11 @@ static int create_trace_uprobe(int argc, char **argv) argv += 2; /* setup a probe */ - if (!event) { + if (event) { + ret = traceprobe_parse_event_name(&event, &group, buf); + if (ret) + goto fail_address_parse; + } else { char *tail; char *ptr; @@ -508,18 +524,19 @@ static int create_trace_uprobe(int argc, char **argv) tu->offset = offset; tu->ref_ctr_offset = ref_ctr_offset; tu->path = path; - tu->filename = kstrdup(filename, GFP_KERNEL); - - if (!tu->filename) { - pr_info("Failed to allocate filename.\n"); - ret = -ENOMEM; - goto error; - } + tu->filename = filename; /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { - ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], + tmp = kstrdup(argv[i], GFP_KERNEL); + if (!tmp) { + ret = -ENOMEM; + goto error; + } + + ret = traceprobe_parse_probe_arg(&tu->tp, i, tmp, is_return ? TPARG_FL_RETURN : 0); + kfree(tmp); if (ret) goto error; } @@ -535,55 +552,35 @@ error: fail_address_parse: path_put(&path); + kfree(filename); pr_info("Failed to parse address or file.\n"); return ret; } -static int cleanup_all_probes(void) +static int create_or_delete_trace_uprobe(int argc, char **argv) { - struct trace_uprobe *tu; - int ret = 0; + int ret; - mutex_lock(&uprobe_lock); - /* Ensure no probe is in use. */ - list_for_each_entry(tu, &uprobe_list, list) - if (trace_probe_is_enabled(&tu->tp)) { - ret = -EBUSY; - goto end; - } - while (!list_empty(&uprobe_list)) { - tu = list_entry(uprobe_list.next, struct trace_uprobe, list); - ret = unregister_trace_uprobe(tu); - if (ret) - break; - } -end: - mutex_unlock(&uprobe_lock); - return ret; -} + if (argv[0][0] == '-') + return dyn_event_release(argc, argv, &trace_uprobe_ops); -/* Probes listing interfaces */ -static void *probes_seq_start(struct seq_file *m, loff_t *pos) -{ - mutex_lock(&uprobe_lock); - return seq_list_start(&uprobe_list, *pos); + ret = trace_uprobe_create(argc, (const char **)argv); + return ret == -ECANCELED ? -EINVAL : ret; } -static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos) +static int trace_uprobe_release(struct dyn_event *ev) { - return seq_list_next(v, &uprobe_list, pos); -} + struct trace_uprobe *tu = to_trace_uprobe(ev); -static void probes_seq_stop(struct seq_file *m, void *v) -{ - mutex_unlock(&uprobe_lock); + return unregister_trace_uprobe(tu); } -static int probes_seq_show(struct seq_file *m, void *v) +/* Probes listing interfaces */ +static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev) { - struct trace_uprobe *tu = v; + struct trace_uprobe *tu = to_trace_uprobe(ev); char c = is_ret_probe(tu) ? 'r' : 'p'; int i; @@ -601,11 +598,21 @@ static int probes_seq_show(struct seq_file *m, void *v) return 0; } +static int probes_seq_show(struct seq_file *m, void *v) +{ + struct dyn_event *ev = v; + + if (!is_trace_uprobe(ev)) + return 0; + + return trace_uprobe_show(m, ev); +} + static const struct seq_operations probes_seq_op = { - .start = probes_seq_start, - .next = probes_seq_next, - .stop = probes_seq_stop, - .show = probes_seq_show + .start = dyn_event_seq_start, + .next = dyn_event_seq_next, + .stop = dyn_event_seq_stop, + .show = probes_seq_show }; static int probes_open(struct inode *inode, struct file *file) @@ -613,7 +620,7 @@ static int probes_open(struct inode *inode, struct file *file) int ret; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { - ret = cleanup_all_probes(); + ret = dyn_events_release_all(&trace_uprobe_ops); if (ret) return ret; } @@ -624,7 +631,8 @@ static int probes_open(struct inode *inode, struct file *file) static ssize_t probes_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - return trace_parse_run_command(file, buffer, count, ppos, create_trace_uprobe); + return trace_parse_run_command(file, buffer, count, ppos, + create_or_delete_trace_uprobe); } static const struct file_operations uprobe_events_ops = { @@ -639,17 +647,22 @@ static const struct file_operations uprobe_events_ops = { /* Probes profiling interfaces */ static int probes_profile_seq_show(struct seq_file *m, void *v) { - struct trace_uprobe *tu = v; + struct dyn_event *ev = v; + struct trace_uprobe *tu; + + if (!is_trace_uprobe(ev)) + return 0; + tu = to_trace_uprobe(ev); seq_printf(m, " %s %-44s %15lu\n", tu->filename, trace_event_name(&tu->tp.call), tu->nhit); return 0; } static const struct seq_operations profile_seq_op = { - .start = probes_seq_start, - .next = probes_seq_next, - .stop = probes_seq_stop, + .start = dyn_event_seq_start, + .next = dyn_event_seq_next, + .stop = dyn_event_seq_stop, .show = probes_profile_seq_show }; @@ -1307,7 +1320,7 @@ static int register_uprobe_event(struct trace_uprobe *tu) return -ENODEV; } - ret = trace_add_event_call(call); + ret = trace_add_event_call_nolock(call); if (ret) { pr_info("Failed to register uprobe event: %s\n", @@ -1324,7 +1337,7 @@ static int unregister_uprobe_event(struct trace_uprobe *tu) int ret; /* tu->event is unregistered in trace_remove_event_call() */ - ret = trace_remove_event_call(&tu->tp.call); + ret = trace_remove_event_call_nolock(&tu->tp.call); if (ret) return ret; kfree(tu->tp.call.print_fmt); @@ -1351,7 +1364,7 @@ create_local_trace_uprobe(char *name, unsigned long offs, } /* - * local trace_kprobes are not added to probe_list, so they are never + * local trace_kprobes are not added to dyn_event, so they are never * searched in find_trace_kprobe(). Therefore, there is no concern of * duplicated name "DUMMY_EVENT" here. */ @@ -1399,6 +1412,11 @@ void destroy_local_trace_uprobe(struct trace_event_call *event_call) static __init int init_uprobe_trace(void) { struct dentry *d_tracer; + int ret; + + ret = dyn_event_register(&trace_uprobe_ops); + if (ret) + return ret; d_tracer = tracing_init_dentry(); if (IS_ERR(d_tracer)) -- cgit v1.2.3 From 7bbab38d07f3185fddf6fce126e2239010efdfce Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Nov 2018 18:03:33 +0900 Subject: tracing: Use dyn_event framework for synthetic events Use dyn_event framework for synthetic events. This shows synthetic events on "tracing/dynamic_events" file in addition to tracing/synthetic_events interface. User can also define new events via tracing/dynamic_events with "s:" prefix. So, the new syntax is below; s:[synthetic/]EVENT_NAME TYPE ARG; [TYPE ARG;]... To remove events via tracing/dynamic_events, you can use "-:" prefix as same as other events. Link: http://lkml.kernel.org/r/154140861301.17322.15454611233735614508.stgit@devbox Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 1 + kernel/trace/trace.c | 8 ++ kernel/trace/trace_events_hist.c | 265 ++++++++++++++++++++++++--------------- 3 files changed, 176 insertions(+), 98 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 2cab3c5dfe2c..fa8b1fe824f3 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -635,6 +635,7 @@ config HIST_TRIGGERS depends on ARCH_HAVE_NMI_SAFE_CMPXCHG select TRACING_MAP select TRACING + select DYNAMIC_EVENTS default n help Hist triggers allow one or more arbitrary trace event fields diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7e0332f90ed4..911470ad9e94 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4620,6 +4620,9 @@ static const char readme_msg[] = "\t accepts: event-definitions (one definition per line)\n" "\t Format: p[:[/]] []\n" "\t r[maxactive][:[/]] []\n" +#ifdef CONFIG_HIST_TRIGGERS + "\t s:[synthetic/] []\n" +#endif "\t -:[/]\n" #ifdef CONFIG_KPROBE_EVENTS "\t place: [:][+]|\n" @@ -4638,6 +4641,11 @@ static const char readme_msg[] = "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n" "\t b@/,\n" "\t \\[\\]\n" +#ifdef CONFIG_HIST_TRIGGERS + "\t field: ;\n" + "\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n" + "\t [unsigned] char/int/long\n" +#endif #endif " events/\t\t- Directory containing all trace event subsystems:\n" " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 0feb7f460123..414aabd67d1f 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -15,6 +15,7 @@ #include "tracing_map.h" #include "trace.h" +#include "trace_dynevent.h" #define SYNTH_SYSTEM "synthetic" #define SYNTH_FIELDS_MAX 16 @@ -292,6 +293,21 @@ struct hist_trigger_data { unsigned int n_max_var_str; }; +static int synth_event_create(int argc, const char **argv); +static int synth_event_show(struct seq_file *m, struct dyn_event *ev); +static int synth_event_release(struct dyn_event *ev); +static bool synth_event_is_busy(struct dyn_event *ev); +static bool synth_event_match(const char *system, const char *event, + struct dyn_event *ev); + +static struct dyn_event_operations synth_event_ops = { + .create = synth_event_create, + .show = synth_event_show, + .is_busy = synth_event_is_busy, + .free = synth_event_release, + .match = synth_event_match, +}; + struct synth_field { char *type; char *name; @@ -301,7 +317,7 @@ struct synth_field { }; struct synth_event { - struct list_head list; + struct dyn_event devent; int ref; char *name; struct synth_field **fields; @@ -312,6 +328,32 @@ struct synth_event { struct tracepoint *tp; }; +static bool is_synth_event(struct dyn_event *ev) +{ + return ev->ops == &synth_event_ops; +} + +static struct synth_event *to_synth_event(struct dyn_event *ev) +{ + return container_of(ev, struct synth_event, devent); +} + +static bool synth_event_is_busy(struct dyn_event *ev) +{ + struct synth_event *event = to_synth_event(ev); + + return event->ref != 0; +} + +static bool synth_event_match(const char *system, const char *event, + struct dyn_event *ev) +{ + struct synth_event *sev = to_synth_event(ev); + + return strcmp(sev->name, event) == 0 && + (!system || strcmp(system, SYNTH_SYSTEM) == 0); +} + struct action_data; typedef void (*action_fn_t) (struct hist_trigger_data *hist_data, @@ -402,7 +444,6 @@ static bool have_hist_err(void) return false; } -static LIST_HEAD(synth_event_list); static DEFINE_MUTEX(synth_event_mutex); struct synth_trace_event { @@ -738,14 +779,12 @@ static void free_synth_field(struct synth_field *field) kfree(field); } -static struct synth_field *parse_synth_field(int argc, char **argv, +static struct synth_field *parse_synth_field(int argc, const char **argv, int *consumed) { struct synth_field *field; - const char *prefix = NULL; - char *field_type = argv[0], *field_name; + const char *prefix = NULL, *field_type = argv[0], *field_name, *array; int len, ret = 0; - char *array; if (field_type[0] == ';') field_type++; @@ -762,20 +801,31 @@ static struct synth_field *parse_synth_field(int argc, char **argv, *consumed = 2; } - len = strlen(field_name); - if (field_name[len - 1] == ';') - field_name[len - 1] = '\0'; - field = kzalloc(sizeof(*field), GFP_KERNEL); if (!field) return ERR_PTR(-ENOMEM); - len = strlen(field_type) + 1; + len = strlen(field_name); array = strchr(field_name, '['); + if (array) + len -= strlen(array); + else if (field_name[len - 1] == ';') + len--; + + field->name = kmemdup_nul(field_name, len, GFP_KERNEL); + if (!field->name) { + ret = -ENOMEM; + goto free; + } + + if (field_type[0] == ';') + field_type++; + len = strlen(field_type) + 1; if (array) len += strlen(array); if (prefix) len += strlen(prefix); + field->type = kzalloc(len, GFP_KERNEL); if (!field->type) { ret = -ENOMEM; @@ -786,7 +836,8 @@ static struct synth_field *parse_synth_field(int argc, char **argv, strcat(field->type, field_type); if (array) { strcat(field->type, array); - *array = '\0'; + if (field->type[len - 1] == ';') + field->type[len - 1] = '\0'; } field->size = synth_field_size(field->type); @@ -800,11 +851,6 @@ static struct synth_field *parse_synth_field(int argc, char **argv, field->is_signed = synth_field_signed(field->type); - field->name = kstrdup(field_name, GFP_KERNEL); - if (!field->name) { - ret = -ENOMEM; - goto free; - } out: return field; free: @@ -868,9 +914,13 @@ static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals, static struct synth_event *find_synth_event(const char *name) { + struct dyn_event *pos; struct synth_event *event; - list_for_each_entry(event, &synth_event_list, list) { + for_each_dyn_event(pos) { + if (!is_synth_event(pos)) + continue; + event = to_synth_event(pos); if (strcmp(event->name, name) == 0) return event; } @@ -921,7 +971,7 @@ static int register_synth_event(struct synth_event *event) ret = set_synth_event_print_fmt(call); if (ret < 0) { - trace_remove_event_call(call); + trace_remove_event_call_nolock(call); goto err; } out: @@ -959,7 +1009,7 @@ static void free_synth_event(struct synth_event *event) kfree(event); } -static struct synth_event *alloc_synth_event(char *event_name, int n_fields, +static struct synth_event *alloc_synth_event(const char *name, int n_fields, struct synth_field **fields) { struct synth_event *event; @@ -971,7 +1021,7 @@ static struct synth_event *alloc_synth_event(char *event_name, int n_fields, goto out; } - event->name = kstrdup(event_name, GFP_KERNEL); + event->name = kstrdup(name, GFP_KERNEL); if (!event->name) { kfree(event); event = ERR_PTR(-ENOMEM); @@ -985,6 +1035,8 @@ static struct synth_event *alloc_synth_event(char *event_name, int n_fields, goto out; } + dyn_event_init(&event->devent, &synth_event_ops); + for (i = 0; i < n_fields; i++) event->fields[i] = fields[i]; @@ -1008,16 +1060,11 @@ struct hist_var_data { struct hist_trigger_data *hist_data; }; -static int create_synth_event(int argc, char **argv) +static int __create_synth_event(int argc, const char *name, const char **argv) { struct synth_field *field, *fields[SYNTH_FIELDS_MAX]; struct synth_event *event = NULL; - bool delete_event = false; int i, consumed = 0, n_fields = 0, ret = 0; - char *name; - - mutex_lock(&event_mutex); - mutex_lock(&synth_event_mutex); /* * Argument syntax: @@ -1025,43 +1072,20 @@ static int create_synth_event(int argc, char **argv) * - Remove synthetic event: ! field[;field] ... * where 'field' = type field_name */ - if (argc < 1) { - ret = -EINVAL; - goto out; - } - name = argv[0]; - if (name[0] == '!') { - delete_event = true; - name++; - } + if (name[0] == '\0' || argc < 1) + return -EINVAL; + + mutex_lock(&event_mutex); + mutex_lock(&synth_event_mutex); event = find_synth_event(name); if (event) { - if (delete_event) { - if (event->ref) { - ret = -EBUSY; - goto out; - } - ret = unregister_synth_event(event); - if (!ret) { - list_del(&event->list); - free_synth_event(event); - } - } else - ret = -EEXIST; - goto out; - } else if (delete_event) { - ret = -ENOENT; + ret = -EEXIST; goto out; } - if (argc < 2) { - ret = -EINVAL; - goto out; - } - - for (i = 1; i < argc - 1; i++) { + for (i = 0; i < argc - 1; i++) { if (strcmp(argv[i], ";") == 0) continue; if (n_fields == SYNTH_FIELDS_MAX) { @@ -1091,7 +1115,7 @@ static int create_synth_event(int argc, char **argv) } ret = register_synth_event(event); if (!ret) - list_add(&event->list, &synth_event_list); + dyn_event_add(&event->devent); else free_synth_event(event); out: @@ -1106,57 +1130,77 @@ static int create_synth_event(int argc, char **argv) goto out; } -static int release_all_synth_events(void) +static int create_or_delete_synth_event(int argc, char **argv) { - struct synth_event *event, *e; - int ret = 0; - - mutex_lock(&event_mutex); - mutex_lock(&synth_event_mutex); - - list_for_each_entry(event, &synth_event_list, list) { - if (event->ref) { - mutex_unlock(&synth_event_mutex); - return -EBUSY; - } - } + const char *name = argv[0]; + struct synth_event *event = NULL; + int ret; - list_for_each_entry_safe(event, e, &synth_event_list, list) { - ret = unregister_synth_event(event); - if (!ret) { - list_del(&event->list); - free_synth_event(event); + /* trace_run_command() ensures argc != 0 */ + if (name[0] == '!') { + mutex_lock(&event_mutex); + mutex_lock(&synth_event_mutex); + event = find_synth_event(name + 1); + if (event) { + if (event->ref) + ret = -EBUSY; + else { + ret = unregister_synth_event(event); + if (!ret) { + dyn_event_remove(&event->devent); + free_synth_event(event); + } + } } else - break; + ret = -ENOENT; + mutex_unlock(&synth_event_mutex); + mutex_unlock(&event_mutex); + return ret; } - mutex_unlock(&synth_event_mutex); - mutex_unlock(&event_mutex); - return ret; + ret = __create_synth_event(argc - 1, name, (const char **)argv + 1); + return ret == -ECANCELED ? -EINVAL : ret; } - -static void *synth_events_seq_start(struct seq_file *m, loff_t *pos) +static int synth_event_create(int argc, const char **argv) { - mutex_lock(&synth_event_mutex); + const char *name = argv[0]; + int len; - return seq_list_start(&synth_event_list, *pos); -} + if (name[0] != 's' || name[1] != ':') + return -ECANCELED; + name += 2; -static void *synth_events_seq_next(struct seq_file *m, void *v, loff_t *pos) -{ - return seq_list_next(v, &synth_event_list, pos); + /* This interface accepts group name prefix */ + if (strchr(name, '/')) { + len = sizeof(SYNTH_SYSTEM "/") - 1; + if (strncmp(name, SYNTH_SYSTEM "/", len)) + return -EINVAL; + name += len; + } + return __create_synth_event(argc - 1, name, argv + 1); } -static void synth_events_seq_stop(struct seq_file *m, void *v) +static int synth_event_release(struct dyn_event *ev) { - mutex_unlock(&synth_event_mutex); + struct synth_event *event = to_synth_event(ev); + int ret; + + if (event->ref) + return -EBUSY; + + ret = unregister_synth_event(event); + if (ret) + return ret; + + dyn_event_remove(ev); + free_synth_event(event); + return 0; } -static int synth_events_seq_show(struct seq_file *m, void *v) +static int __synth_event_show(struct seq_file *m, struct synth_event *event) { struct synth_field *field; - struct synth_event *event = v; unsigned int i; seq_printf(m, "%s\t", event->name); @@ -1174,11 +1218,30 @@ static int synth_events_seq_show(struct seq_file *m, void *v) return 0; } +static int synth_event_show(struct seq_file *m, struct dyn_event *ev) +{ + struct synth_event *event = to_synth_event(ev); + + seq_printf(m, "s:%s/", event->class.system); + + return __synth_event_show(m, event); +} + +static int synth_events_seq_show(struct seq_file *m, void *v) +{ + struct dyn_event *ev = v; + + if (!is_synth_event(ev)) + return 0; + + return __synth_event_show(m, to_synth_event(ev)); +} + static const struct seq_operations synth_events_seq_op = { - .start = synth_events_seq_start, - .next = synth_events_seq_next, - .stop = synth_events_seq_stop, - .show = synth_events_seq_show + .start = dyn_event_seq_start, + .next = dyn_event_seq_next, + .stop = dyn_event_seq_stop, + .show = synth_events_seq_show, }; static int synth_events_open(struct inode *inode, struct file *file) @@ -1186,7 +1249,7 @@ static int synth_events_open(struct inode *inode, struct file *file) int ret; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { - ret = release_all_synth_events(); + ret = dyn_events_release_all(&synth_event_ops); if (ret < 0) return ret; } @@ -1199,7 +1262,7 @@ static ssize_t synth_events_write(struct file *file, size_t count, loff_t *ppos) { return trace_parse_run_command(file, buffer, count, ppos, - create_synth_event); + create_or_delete_synth_event); } static const struct file_operations synth_events_fops = { @@ -5791,6 +5854,12 @@ static __init int trace_events_hist_init(void) struct dentry *d_tracer; int err = 0; + err = dyn_event_register(&synth_event_ops); + if (err) { + pr_warn("Could not register synth_event_ops\n"); + return err; + } + d_tracer = tracing_init_dentry(); if (IS_ERR(d_tracer)) { err = PTR_ERR(d_tracer); -- cgit v1.2.3 From 0e2b81f7b52a1c1a8c46986f9ca01eb7b3c421f8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Nov 2018 18:04:01 +0900 Subject: tracing: Remove unneeded synth_event_mutex Rmove unneeded synth_event_mutex. This mutex protects the reference count in synth_event, however, those operational points are already protected by event_mutex. 1. In __create_synth_event() and create_or_delete_synth_event(), those synth_event_mutex clearly obtained right after event_mutex. 2. event_hist_trigger_func() is trigger_hist_cmd.func() which is called by trigger_process_regex(), which is a part of event_trigger_regex_write() and this function takes event_mutex. 3. hist_unreg_all() is trigger_hist_cmd.unreg_all() which is called by event_trigger_regex_open() and it takes event_mutex. 4. onmatch_destroy() and onmatch_create() have long call tree, but both are finally invoked from event_trigger_regex_write() and event_trace_del_tracer(), former takes event_mutex, and latter ensures called under event_mutex locked. Finally, I ensured there is no resource conflict. For safety, I added lockdep_assert_held(&event_mutex) for each function. Link: http://lkml.kernel.org/r/154140864134.17322.4796059721306031894.stgit@devbox Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 414aabd67d1f..21e4954375a1 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -444,8 +444,6 @@ static bool have_hist_err(void) return false; } -static DEFINE_MUTEX(synth_event_mutex); - struct synth_trace_event { struct trace_entry ent; u64 fields[]; @@ -1077,7 +1075,6 @@ static int __create_synth_event(int argc, const char *name, const char **argv) return -EINVAL; mutex_lock(&event_mutex); - mutex_lock(&synth_event_mutex); event = find_synth_event(name); if (event) { @@ -1119,7 +1116,6 @@ static int __create_synth_event(int argc, const char *name, const char **argv) else free_synth_event(event); out: - mutex_unlock(&synth_event_mutex); mutex_unlock(&event_mutex); return ret; @@ -1139,7 +1135,6 @@ static int create_or_delete_synth_event(int argc, char **argv) /* trace_run_command() ensures argc != 0 */ if (name[0] == '!') { mutex_lock(&event_mutex); - mutex_lock(&synth_event_mutex); event = find_synth_event(name + 1); if (event) { if (event->ref) @@ -1153,7 +1148,6 @@ static int create_or_delete_synth_event(int argc, char **argv) } } else ret = -ENOENT; - mutex_unlock(&synth_event_mutex); mutex_unlock(&event_mutex); return ret; } @@ -3535,7 +3529,7 @@ static void onmatch_destroy(struct action_data *data) { unsigned int i; - mutex_lock(&synth_event_mutex); + lockdep_assert_held(&event_mutex); kfree(data->onmatch.match_event); kfree(data->onmatch.match_event_system); @@ -3548,8 +3542,6 @@ static void onmatch_destroy(struct action_data *data) data->onmatch.synth_event->ref--; kfree(data); - - mutex_unlock(&synth_event_mutex); } static void destroy_field_var(struct field_var *field_var) @@ -3700,15 +3692,14 @@ static int onmatch_create(struct hist_trigger_data *hist_data, struct synth_event *event; int ret = 0; - mutex_lock(&synth_event_mutex); + lockdep_assert_held(&event_mutex); + event = find_synth_event(data->onmatch.synth_event_name); if (!event) { hist_err("onmatch: Couldn't find synthetic event: ", data->onmatch.synth_event_name); - mutex_unlock(&synth_event_mutex); return -EINVAL; } event->ref++; - mutex_unlock(&synth_event_mutex); var_ref_idx = hist_data->n_var_refs; @@ -3782,9 +3773,7 @@ static int onmatch_create(struct hist_trigger_data *hist_data, out: return ret; err: - mutex_lock(&synth_event_mutex); event->ref--; - mutex_unlock(&synth_event_mutex); goto out; } @@ -5492,6 +5481,8 @@ static void hist_unreg_all(struct trace_event_file *file) struct synth_event *se; const char *se_name; + lockdep_assert_held(&event_mutex); + if (hist_file_check_refs(file)) return; @@ -5501,12 +5492,10 @@ static void hist_unreg_all(struct trace_event_file *file) list_del_rcu(&test->list); trace_event_trigger_enable_disable(file, 0); - mutex_lock(&synth_event_mutex); se_name = trace_event_name(file->event_call); se = find_synth_event(se_name); if (se) se->ref--; - mutex_unlock(&synth_event_mutex); update_cond_flag(file); if (hist_data->enable_timestamps) @@ -5532,6 +5521,8 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, char *trigger, *p; int ret = 0; + lockdep_assert_held(&event_mutex); + if (glob && strlen(glob)) { last_cmd_set(param); hist_err_clear(); @@ -5622,14 +5613,10 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, } cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); - - mutex_lock(&synth_event_mutex); se_name = trace_event_name(file->event_call); se = find_synth_event(se_name); if (se) se->ref--; - mutex_unlock(&synth_event_mutex); - ret = 0; goto out_free; } @@ -5665,13 +5652,10 @@ enable: if (ret) goto out_unreg; - mutex_lock(&synth_event_mutex); se_name = trace_event_name(file->event_call); se = find_synth_event(se_name); if (se) se->ref++; - mutex_unlock(&synth_event_mutex); - /* Just return zero, not the number of registered triggers */ ret = 0; out: -- cgit v1.2.3 From 7e1413edd6194a9807aa5f3ac0378b9b4b9da879 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 4 Dec 2018 13:35:45 -0500 Subject: tracing: Consolidate trace_add/remove_event_call back to the nolock functions The trace_add/remove_event_call_nolock() functions were added to allow the tace_add/remove_event_call() code be called when the event_mutex lock was already taken. Now that all callers are done within the event_mutex, there's no reason to have two different interfaces. Remove the current wrapper trace_add/remove_event_call()s and rename the _nolock versions back to the original names. Link: http://lkml.kernel.org/r/154140866955.17322.2081425494660638846.stgit@devbox Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events.c | 30 ++++-------------------------- kernel/trace/trace_events_hist.c | 6 +++--- kernel/trace/trace_kprobe.c | 4 ++-- kernel/trace/trace_uprobe.c | 4 ++-- 4 files changed, 11 insertions(+), 33 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a3b157f689ee..bd0162c0467c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2305,7 +2305,8 @@ __trace_early_add_new_event(struct trace_event_call *call, struct ftrace_module_file_ops; static void __add_event_to_tracers(struct trace_event_call *call); -int trace_add_event_call_nolock(struct trace_event_call *call) +/* Add an additional event_call dynamically */ +int trace_add_event_call(struct trace_event_call *call) { int ret; lockdep_assert_held(&event_mutex); @@ -2320,17 +2321,6 @@ int trace_add_event_call_nolock(struct trace_event_call *call) return ret; } -/* Add an additional event_call dynamically */ -int trace_add_event_call(struct trace_event_call *call) -{ - int ret; - - mutex_lock(&event_mutex); - ret = trace_add_event_call_nolock(call); - mutex_unlock(&event_mutex); - return ret; -} - /* * Must be called under locking of trace_types_lock, event_mutex and * trace_event_sem. @@ -2376,8 +2366,8 @@ static int probe_remove_event_call(struct trace_event_call *call) return 0; } -/* no event_mutex version */ -int trace_remove_event_call_nolock(struct trace_event_call *call) +/* Remove an event_call */ +int trace_remove_event_call(struct trace_event_call *call) { int ret; @@ -2392,18 +2382,6 @@ int trace_remove_event_call_nolock(struct trace_event_call *call) return ret; } -/* Remove an event_call */ -int trace_remove_event_call(struct trace_event_call *call) -{ - int ret; - - mutex_lock(&event_mutex); - ret = trace_remove_event_call_nolock(call); - mutex_unlock(&event_mutex); - - return ret; -} - #define for_each_event(event, start, end) \ for (event = start; \ (unsigned long)event < (unsigned long)end; \ diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 21e4954375a1..82e72c48a5a9 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -960,7 +960,7 @@ static int register_synth_event(struct synth_event *event) call->data = event; call->tp = event->tp; - ret = trace_add_event_call_nolock(call); + ret = trace_add_event_call(call); if (ret) { pr_warn("Failed to register synthetic event: %s\n", trace_event_name(call)); @@ -969,7 +969,7 @@ static int register_synth_event(struct synth_event *event) ret = set_synth_event_print_fmt(call); if (ret < 0) { - trace_remove_event_call_nolock(call); + trace_remove_event_call(call); goto err; } out: @@ -984,7 +984,7 @@ static int unregister_synth_event(struct synth_event *event) struct trace_event_call *call = &event->call; int ret; - ret = trace_remove_event_call_nolock(call); + ret = trace_remove_event_call(call); return ret; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index bdf8c2ad5152..0e0f7b8024fb 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1353,7 +1353,7 @@ static int register_kprobe_event(struct trace_kprobe *tk) kfree(call->print_fmt); return -ENODEV; } - ret = trace_add_event_call_nolock(call); + ret = trace_add_event_call(call); if (ret) { pr_info("Failed to register kprobe event: %s\n", trace_event_name(call)); @@ -1368,7 +1368,7 @@ static int unregister_kprobe_event(struct trace_kprobe *tk) int ret; /* tp->event is unregistered in trace_remove_event_call() */ - ret = trace_remove_event_call_nolock(&tk->tp.call); + ret = trace_remove_event_call(&tk->tp.call); if (!ret) kfree(tk->tp.call.print_fmt); return ret; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 4a7b21c891f3..e335576b9411 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1320,7 +1320,7 @@ static int register_uprobe_event(struct trace_uprobe *tu) return -ENODEV; } - ret = trace_add_event_call_nolock(call); + ret = trace_add_event_call(call); if (ret) { pr_info("Failed to register uprobe event: %s\n", @@ -1337,7 +1337,7 @@ static int unregister_uprobe_event(struct trace_uprobe *tu) int ret; /* tu->event is unregistered in trace_remove_event_call() */ - ret = trace_remove_event_call_nolock(&tu->tp.call); + ret = trace_remove_event_call(&tu->tp.call); if (ret) return ret; kfree(tu->tp.call.print_fmt); -- cgit v1.2.3 From 1ce25e9f6fff7633955e8ce776e5e9d7a780d34b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Nov 2018 18:04:57 +0900 Subject: tracing: Add generic event-name based remove event method Add a generic method to remove event from dynamic event list. This is same as other system under ftrace. You just need to pass the event name with '!', e.g. # echo p:new_grp/new_event _do_fork > dynamic_events This creates an event, and # echo '!p:new_grp/new_event _do_fork' > dynamic_events Or, # echo '!p:new_grp/new_event' > dynamic_events will remove new_grp/new_event event. Note that this doesn't check the event prefix (e.g. "p:") strictly, because the "group/event" name must be unique. Link: http://lkml.kernel.org/r/154140869774.17322.8887303560398645347.stgit@devbox Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_dynevent.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index f17a887abb66..dd1f43588d70 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -37,10 +37,17 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type) char *system = NULL, *event, *p; int ret = -ENOENT; - if (argv[0][1] != ':') - return -EINVAL; + if (argv[0][0] == '-') { + if (argv[0][1] != ':') + return -EINVAL; + event = &argv[0][2]; + } else { + event = strchr(argv[0], ':'); + if (!event) + return -EINVAL; + event++; + } - event = &argv[0][2]; p = strchr(event, '/'); if (p) { system = event; @@ -69,7 +76,7 @@ static int create_dyn_event(int argc, char **argv) struct dyn_event_operations *ops; int ret; - if (argv[0][0] == '-') + if (argv[0][0] == '-' || argv[0][0] == '!') return dyn_event_release(argc, argv, NULL); mutex_lock(&dyn_event_ops_mutex); -- cgit v1.2.3 From a0572f687fb3c46e15554f4789797a077cc393b4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 5 Dec 2018 12:48:53 -0500 Subject: ftrace: Allow ftrace_replace_code() to be schedulable The function ftrace_replace_code() is the ftrace engine that does the work to modify all the nops into the calls to the function callback in all the functions being traced. The generic version which is normally called from stop machine, but an architecture can implement a non stop machine version and still use the generic ftrace_replace_code(). When an architecture does this, ftrace_replace_code() may be called from a schedulable context, where it can allow the code to be preemptible, and schedule out. In order to allow an architecture to make ftrace_replace_code() schedulable, a new command flag is added called: FTRACE_MAY_SLEEP Which can be or'd to the command that is passed to ftrace_modify_all_code() that calls ftrace_replace_code() and will have it call cond_resched() in the loop that modifies the nops into the calls to the ftrace trampolines. Link: http://lkml.kernel.org/r/20181204192903.8193-1-anders.roxell@linaro.org Link: http://lkml.kernel.org/r/20181205183303.828422192@goodmis.org Reported-by: Anders Roxell Tested-by: Anders Roxell Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8ef9fc226037..ab3e8b995e12 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -77,6 +77,11 @@ #define ASSIGN_OPS_HASH(opsname, val) #endif +enum { + FTRACE_MODIFY_ENABLE_FL = (1 << 0), + FTRACE_MODIFY_MAY_SLEEP_FL = (1 << 1), +}; + struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, @@ -2389,10 +2394,12 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) return -1; /* unknow ftrace bug */ } -void __weak ftrace_replace_code(int enable) +void __weak ftrace_replace_code(int mod_flags) { struct dyn_ftrace *rec; struct ftrace_page *pg; + int enable = mod_flags & FTRACE_MODIFY_ENABLE_FL; + int schedulable = mod_flags & FTRACE_MODIFY_MAY_SLEEP_FL; int failed; if (unlikely(ftrace_disabled)) @@ -2409,6 +2416,8 @@ void __weak ftrace_replace_code(int enable) /* Stop processing */ return; } + if (schedulable) + cond_resched(); } while_for_each_ftrace_rec(); } @@ -2522,8 +2531,12 @@ int __weak ftrace_arch_code_modify_post_process(void) void ftrace_modify_all_code(int command) { int update = command & FTRACE_UPDATE_TRACE_FUNC; + int mod_flags = 0; int err = 0; + if (command & FTRACE_MAY_SLEEP) + mod_flags = FTRACE_MODIFY_MAY_SLEEP_FL; + /* * If the ftrace_caller calls a ftrace_ops func directly, * we need to make sure that it only traces functions it @@ -2541,9 +2554,9 @@ void ftrace_modify_all_code(int command) } if (command & FTRACE_UPDATE_CALLS) - ftrace_replace_code(1); + ftrace_replace_code(mod_flags | FTRACE_MODIFY_ENABLE_FL); else if (command & FTRACE_DISABLE_CALLS) - ftrace_replace_code(0); + ftrace_replace_code(mod_flags); if (update && ftrace_trace_function != ftrace_ops_list_func) { function_trace_op = set_function_trace_op; -- cgit v1.2.3 From 45fe439bc3699851802062c05acf4cbac4d672bc Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 7 Dec 2018 12:25:52 -0500 Subject: fgraph: Add comment to describe ftrace_graph_get_ret_stack The ret_stack should not be accessed directly via the curr_ret_stack variable on the task_struct. This is because the ret_stack is going to be converted into a series of longs and not an array of ret_stack structures. The way that a ret_stack should be retrieved is via the ftrace_graph_get_ret_stack structure, but it needs to be documented on how to use it. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/fgraph.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index a3704ec8b599..d4f04f0ca646 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -232,6 +232,17 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) return ret; } +/** + * ftrace_graph_get_ret_stack - return the entry of the shadow stack + * @task: The task to read the shadow stack from + * @idx: Index down the shadow stack + * + * Return the ret_struct on the shadow stack of the @task at the + * call graph at @idx starting with zero. If @idx is zero, it + * will return the last saved ret_stack entry. If it is greater than + * zero, it will return the corresponding ret_stack for the depth + * of saved return addresses. + */ struct ftrace_ret_stack * ftrace_graph_get_ret_stack(struct task_struct *task, int idx) { -- cgit v1.2.3 From a38d1107f937ca95dcf820161ef44ea683d6a0b1 Mon Sep 17 00:00:00 2001 From: Matt Mullins Date: Wed, 12 Dec 2018 16:42:37 -0800 Subject: bpf: support raw tracepoints in modules Distributions build drivers as modules, including network and filesystem drivers which export numerous tracepoints. This enables bpf(BPF_RAW_TRACEPOINT_OPEN) to attach to those tracepoints. Signed-off-by: Matt Mullins Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 97 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 9864a35c8bb5..9ddb6fddb4e0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -17,6 +17,43 @@ #include "trace_probe.h" #include "trace.h" +#ifdef CONFIG_MODULES +struct bpf_trace_module { + struct module *module; + struct list_head list; +}; + +static LIST_HEAD(bpf_trace_modules); +static DEFINE_MUTEX(bpf_module_mutex); + +static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name) +{ + struct bpf_raw_event_map *btp, *ret = NULL; + struct bpf_trace_module *btm; + unsigned int i; + + mutex_lock(&bpf_module_mutex); + list_for_each_entry(btm, &bpf_trace_modules, list) { + for (i = 0; i < btm->module->num_bpf_raw_events; ++i) { + btp = &btm->module->bpf_raw_events[i]; + if (!strcmp(btp->tp->name, name)) { + if (try_module_get(btm->module)) + ret = btp; + goto out; + } + } + } +out: + mutex_unlock(&bpf_module_mutex); + return ret; +} +#else +static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name) +{ + return NULL; +} +#endif /* CONFIG_MODULES */ + u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); @@ -1076,7 +1113,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) extern struct bpf_raw_event_map __start__bpf_raw_tp[]; extern struct bpf_raw_event_map __stop__bpf_raw_tp[]; -struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) +struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name) { struct bpf_raw_event_map *btp = __start__bpf_raw_tp; @@ -1084,7 +1121,16 @@ struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) if (!strcmp(btp->tp->name, name)) return btp; } - return NULL; + + return bpf_get_raw_tracepoint_module(name); +} + +void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) +{ + struct module *mod = __module_address((unsigned long)btp); + + if (mod) + module_put(mod); } static __always_inline @@ -1222,3 +1268,52 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, return err; } + +#ifdef CONFIG_MODULES +int bpf_event_notify(struct notifier_block *nb, unsigned long op, void *module) +{ + struct bpf_trace_module *btm, *tmp; + struct module *mod = module; + + if (mod->num_bpf_raw_events == 0 || + (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING)) + return 0; + + mutex_lock(&bpf_module_mutex); + + switch (op) { + case MODULE_STATE_COMING: + btm = kzalloc(sizeof(*btm), GFP_KERNEL); + if (btm) { + btm->module = module; + list_add(&btm->list, &bpf_trace_modules); + } + break; + case MODULE_STATE_GOING: + list_for_each_entry_safe(btm, tmp, &bpf_trace_modules, list) { + if (btm->module == module) { + list_del(&btm->list); + kfree(btm); + break; + } + } + break; + } + + mutex_unlock(&bpf_module_mutex); + + return 0; +} + +static struct notifier_block bpf_module_nb = { + .notifier_call = bpf_event_notify, +}; + +int __init bpf_event_init(void) +{ + register_module_notifier(&bpf_module_nb); + return 0; +} + +fs_initcall(bpf_event_init); +#endif /* CONFIG_MODULES */ -- cgit v1.2.3 From e8d086ddb5339d72c60e6c7b8d28810f26960f9a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 18 Dec 2018 15:50:02 -0500 Subject: tracing: Fix ftrace_graph_get_ret_stack() to use task and not current The function ftrace_graph_get_ret_stack() takes a task struct descriptor but uses current as the task to perform the operations on. In pretty much all cases the task decriptor is the same as current, so this wasn't an issue. But there is a case in the ARM architecture that passes in a task that is not current, and expects a result from that task, and this code breaks it. Fixes: 51584396cff5 ("arm64: Use ftrace_graph_get_ret_stack() instead of curr_ret_stack") Reported-by: James Morse Tested-by: James Morse Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/fgraph.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index d4f04f0ca646..8dfd5021b933 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -246,10 +246,10 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) struct ftrace_ret_stack * ftrace_graph_get_ret_stack(struct task_struct *task, int idx) { - idx = current->curr_ret_stack - idx; + idx = task->curr_ret_stack - idx; if (idx >= 0 && idx <= task->curr_ret_stack) - return ¤t->ret_stack[idx]; + return &task->ret_stack[idx]; return NULL; } -- cgit v1.2.3 From 6801f0d5ca00bed159c7f87870cc6f5411837544 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 18 Dec 2018 14:33:20 -0600 Subject: tracing: Remove unnecessary hist trigger struct field hist_field.var_idx is completely unused, so remove it. Link: http://lkml.kernel.org/r/d4e066c0f509f5f13ad3babc8c33ca6e7ddc439a.1545161087.git.tom.zanussi@linux.intel.com Acked-by: Namhyung Kim Reviewed-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 82e72c48a5a9..d29bf8a8e1dd 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -61,7 +61,6 @@ struct hist_field { char *system; char *event_name; char *name; - unsigned int var_idx; unsigned int var_ref_idx; bool read_once; }; -- cgit v1.2.3 From 2f31ed9308cc9e95c149078b276a47360ab507bb Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 18 Dec 2018 14:33:21 -0600 Subject: tracing: Change strlen to sizeof for hist trigger static strings There's no need to use strlen() for static strings when the length is already known, so update trace_events_hist.c with sizeof() for those cases. Link: http://lkml.kernel.org/r/e3e754f2bd18e56eaa8baf79bee619316ebf4cfc.1545161087.git.tom.zanussi@linux.intel.com Acked-by: Namhyung Kim Reviewed-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index d29bf8a8e1dd..25d06b3ae1f6 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -507,7 +507,7 @@ static int synth_field_string_size(char *type) start = strstr(type, "char["); if (start == NULL) return -EINVAL; - start += strlen("char["); + start += sizeof("char[") - 1; end = strchr(type, ']'); if (!end || end < start) @@ -1843,8 +1843,8 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs) if (attrs->n_actions >= HIST_ACTIONS_MAX) return ret; - if ((strncmp(str, "onmatch(", strlen("onmatch(")) == 0) || - (strncmp(str, "onmax(", strlen("onmax(")) == 0)) { + if ((strncmp(str, "onmatch(", sizeof("onmatch(") - 1) == 0) || + (strncmp(str, "onmax(", sizeof("onmax(") - 1) == 0)) { attrs->action_str[attrs->n_actions] = kstrdup(str, GFP_KERNEL); if (!attrs->action_str[attrs->n_actions]) { ret = -ENOMEM; @@ -1861,34 +1861,34 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) { int ret = 0; - if ((strncmp(str, "key=", strlen("key=")) == 0) || - (strncmp(str, "keys=", strlen("keys=")) == 0)) { + if ((strncmp(str, "key=", sizeof("key=") - 1) == 0) || + (strncmp(str, "keys=", sizeof("keys=") - 1) == 0)) { attrs->keys_str = kstrdup(str, GFP_KERNEL); if (!attrs->keys_str) { ret = -ENOMEM; goto out; } - } else if ((strncmp(str, "val=", strlen("val=")) == 0) || - (strncmp(str, "vals=", strlen("vals=")) == 0) || - (strncmp(str, "values=", strlen("values=")) == 0)) { + } else if ((strncmp(str, "val=", sizeof("val=") - 1) == 0) || + (strncmp(str, "vals=", sizeof("vals=") - 1) == 0) || + (strncmp(str, "values=", sizeof("values=") - 1) == 0)) { attrs->vals_str = kstrdup(str, GFP_KERNEL); if (!attrs->vals_str) { ret = -ENOMEM; goto out; } - } else if (strncmp(str, "sort=", strlen("sort=")) == 0) { + } else if (strncmp(str, "sort=", sizeof("sort=") - 1) == 0) { attrs->sort_key_str = kstrdup(str, GFP_KERNEL); if (!attrs->sort_key_str) { ret = -ENOMEM; goto out; } - } else if (strncmp(str, "name=", strlen("name=")) == 0) { + } else if (strncmp(str, "name=", sizeof("name=") - 1) == 0) { attrs->name = kstrdup(str, GFP_KERNEL); if (!attrs->name) { ret = -ENOMEM; goto out; } - } else if (strncmp(str, "clock=", strlen("clock=")) == 0) { + } else if (strncmp(str, "clock=", sizeof("clock=") - 1) == 0) { strsep(&str, "="); if (!str) { ret = -EINVAL; @@ -1901,7 +1901,7 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) ret = -ENOMEM; goto out; } - } else if (strncmp(str, "size=", strlen("size=")) == 0) { + } else if (strncmp(str, "size=", sizeof("size=") - 1) == 0) { int map_bits = parse_map_size(str); if (map_bits < 0) { @@ -3497,7 +3497,7 @@ static struct action_data *onmax_parse(char *str) if (!onmax_fn_name || !str) goto free; - if (strncmp(onmax_fn_name, "save", strlen("save")) == 0) { + if (strncmp(onmax_fn_name, "save", sizeof("save") - 1) == 0) { char *params = strsep(&str, ")"); if (!params) { @@ -4302,8 +4302,8 @@ static int parse_actions(struct hist_trigger_data *hist_data) for (i = 0; i < hist_data->attrs->n_actions; i++) { str = hist_data->attrs->action_str[i]; - if (strncmp(str, "onmatch(", strlen("onmatch(")) == 0) { - char *action_str = str + strlen("onmatch("); + if (strncmp(str, "onmatch(", sizeof("onmatch(") - 1) == 0) { + char *action_str = str + sizeof("onmatch(") - 1; data = onmatch_parse(tr, action_str); if (IS_ERR(data)) { @@ -4311,8 +4311,8 @@ static int parse_actions(struct hist_trigger_data *hist_data) break; } data->fn = action_trace; - } else if (strncmp(str, "onmax(", strlen("onmax(")) == 0) { - char *action_str = str + strlen("onmax("); + } else if (strncmp(str, "onmax(", sizeof("onmax(") - 1) == 0) { + char *action_str = str + sizeof("onmax(") - 1; data = onmax_parse(action_str); if (IS_ERR(data)) { @@ -5548,9 +5548,9 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, p++; continue; } - if (p >= param + strlen(param) - strlen("if") - 1) + if (p >= param + strlen(param) - (sizeof("if") - 1) - 1) return -EINVAL; - if (*(p + strlen("if")) != ' ' && *(p + strlen("if")) != '\t') { + if (*(p + sizeof("if") - 1) != ' ' && *(p + sizeof("if") - 1) != '\t') { p++; continue; } -- cgit v1.2.3 From e4f6d245031e04bdd12db390298acec0474a1a46 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 19 Dec 2018 13:09:16 -0600 Subject: tracing: Use var_refs[] for hist trigger reference checking Since all the variable reference hist_fields are collected into hist_data->var_refs[] array, there's no need to go through all the fields looking for them, or in separate arrays like synth_var_refs[], which will be going away soon anyway. This also allows us to get rid of some unnecessary code and functions currently used for the same purpose. Link: http://lkml.kernel.org/r/1545246556.4239.7.camel@gmail.com Acked-by: Namhyung Kim Reviewed-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 68 +++++++--------------------------------- 1 file changed, 11 insertions(+), 57 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 25d06b3ae1f6..f3caf6e484f4 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1297,75 +1297,29 @@ check_field_for_var_ref(struct hist_field *hist_field, struct hist_trigger_data *var_data, unsigned int var_idx) { - struct hist_field *found = NULL; - - if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR_REF) { - if (hist_field->var.idx == var_idx && - hist_field->var.hist_data == var_data) { - found = hist_field; - } - } - - return found; -} - -static struct hist_field * -check_field_for_var_refs(struct hist_trigger_data *hist_data, - struct hist_field *hist_field, - struct hist_trigger_data *var_data, - unsigned int var_idx, - unsigned int level) -{ - struct hist_field *found = NULL; - unsigned int i; - - if (level > 3) - return found; - - if (!hist_field) - return found; - - found = check_field_for_var_ref(hist_field, var_data, var_idx); - if (found) - return found; - - for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) { - struct hist_field *operand; + WARN_ON(!(hist_field && hist_field->flags & HIST_FIELD_FL_VAR_REF)); - operand = hist_field->operands[i]; - found = check_field_for_var_refs(hist_data, operand, var_data, - var_idx, level + 1); - if (found) - return found; - } + if (hist_field && hist_field->var.idx == var_idx && + hist_field->var.hist_data == var_data) + return hist_field; - return found; + return NULL; } static struct hist_field *find_var_ref(struct hist_trigger_data *hist_data, struct hist_trigger_data *var_data, unsigned int var_idx) { - struct hist_field *hist_field, *found = NULL; + struct hist_field *hist_field; unsigned int i; - for_each_hist_field(i, hist_data) { - hist_field = hist_data->fields[i]; - found = check_field_for_var_refs(hist_data, hist_field, - var_data, var_idx, 0); - if (found) - return found; - } - - for (i = 0; i < hist_data->n_synth_var_refs; i++) { - hist_field = hist_data->synth_var_refs[i]; - found = check_field_for_var_refs(hist_data, hist_field, - var_data, var_idx, 0); - if (found) - return found; + for (i = 0; i < hist_data->n_var_refs; i++) { + hist_field = hist_data->var_refs[i]; + if (check_field_for_var_ref(hist_field, var_data, var_idx)) + return hist_field; } - return found; + return NULL; } static struct hist_field *find_any_var_ref(struct hist_trigger_data *hist_data, -- cgit v1.2.3 From de40f033d4e84e843d6a12266e3869015ea9097c Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 18 Dec 2018 14:33:23 -0600 Subject: tracing: Remove open-coding of hist trigger var_ref management Have create_var_ref() manage the hist trigger's var_ref list, rather than having similar code doing it in multiple places. This cleans up the code and makes sure var_refs are always accounted properly. Also, document the var_ref-related functions to make what their purpose clearer. Link: http://lkml.kernel.org/r/05ddae93ff514e66fc03897d6665231892939913.1545161087.git.tom.zanussi@linux.intel.com Acked-by: Namhyung Kim Reviewed-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 93 ++++++++++++++++++++++++++++++++-------- 1 file changed, 75 insertions(+), 18 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f3caf6e484f4..6309f4dbfb9c 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1292,6 +1292,17 @@ static u64 hist_field_cpu(struct hist_field *hist_field, return cpu; } +/** + * check_field_for_var_ref - Check if a VAR_REF field references a variable + * @hist_field: The VAR_REF field to check + * @var_data: The hist trigger that owns the variable + * @var_idx: The trigger variable identifier + * + * Check the given VAR_REF field to see whether or not it references + * the given variable associated with the given trigger. + * + * Return: The VAR_REF field if it does reference the variable, NULL if not + */ static struct hist_field * check_field_for_var_ref(struct hist_field *hist_field, struct hist_trigger_data *var_data, @@ -1306,6 +1317,18 @@ check_field_for_var_ref(struct hist_field *hist_field, return NULL; } +/** + * find_var_ref - Check if a trigger has a reference to a trigger variable + * @hist_data: The hist trigger that might have a reference to the variable + * @var_data: The hist trigger that owns the variable + * @var_idx: The trigger variable identifier + * + * Check the list of var_refs[] on the first hist trigger to see + * whether any of them are references to the variable on the second + * trigger. + * + * Return: The VAR_REF field referencing the variable if so, NULL if not + */ static struct hist_field *find_var_ref(struct hist_trigger_data *hist_data, struct hist_trigger_data *var_data, unsigned int var_idx) @@ -1322,6 +1345,20 @@ static struct hist_field *find_var_ref(struct hist_trigger_data *hist_data, return NULL; } +/** + * find_any_var_ref - Check if there is a reference to a given trigger variable + * @hist_data: The hist trigger + * @var_idx: The trigger variable identifier + * + * Check to see whether the given variable is currently referenced by + * any other trigger. + * + * The trigger the variable is defined on is explicitly excluded - the + * assumption being that a self-reference doesn't prevent a trigger + * from being removed. + * + * Return: The VAR_REF field referencing the variable if so, NULL if not + */ static struct hist_field *find_any_var_ref(struct hist_trigger_data *hist_data, unsigned int var_idx) { @@ -1340,6 +1377,19 @@ static struct hist_field *find_any_var_ref(struct hist_trigger_data *hist_data, return found; } +/** + * check_var_refs - Check if there is a reference to any of trigger's variables + * @hist_data: The hist trigger + * + * A trigger can define one or more variables. If any one of them is + * currently referenced by any other trigger, this function will + * determine that. + + * Typically used to determine whether or not a trigger can be removed + * - if there are any references to a trigger's variables, it cannot. + * + * Return: True if there is a reference to any of trigger's variables + */ static bool check_var_refs(struct hist_trigger_data *hist_data) { struct hist_field *field; @@ -2343,7 +2393,23 @@ static int init_var_ref(struct hist_field *ref_field, goto out; } -static struct hist_field *create_var_ref(struct hist_field *var_field, +/** + * create_var_ref - Create a variable reference and attach it to trigger + * @hist_data: The trigger that will be referencing the variable + * @var_field: The VAR field to create a reference to + * @system: The optional system string + * @event_name: The optional event_name string + * + * Given a variable hist_field, create a VAR_REF hist_field that + * represents a reference to it. + * + * This function also adds the reference to the trigger that + * now references the variable. + * + * Return: The VAR_REF field if successful, NULL if not + */ +static struct hist_field *create_var_ref(struct hist_trigger_data *hist_data, + struct hist_field *var_field, char *system, char *event_name) { unsigned long flags = HIST_FIELD_FL_VAR_REF; @@ -2355,6 +2421,9 @@ static struct hist_field *create_var_ref(struct hist_field *var_field, destroy_hist_field(ref_field, 0); return NULL; } + + hist_data->var_refs[hist_data->n_var_refs] = ref_field; + ref_field->var_ref_idx = hist_data->n_var_refs++; } return ref_field; @@ -2428,7 +2497,8 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data, var_field = find_event_var(hist_data, system, event_name, var_name); if (var_field) - ref_field = create_var_ref(var_field, system, event_name); + ref_field = create_var_ref(hist_data, var_field, + system, event_name); if (!ref_field) hist_err_event("Couldn't find variable: $", @@ -2546,8 +2616,6 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, if (!s) { hist_field = parse_var_ref(hist_data, ref_system, ref_event, ref_var); if (hist_field) { - hist_data->var_refs[hist_data->n_var_refs] = hist_field; - hist_field->var_ref_idx = hist_data->n_var_refs++; if (var_name) { hist_field = create_alias(hist_data, hist_field, var_name); if (!hist_field) { @@ -3321,7 +3389,6 @@ static int onmax_create(struct hist_trigger_data *hist_data, unsigned int var_ref_idx = hist_data->n_var_refs; struct field_var *field_var; char *onmax_var_str, *param; - unsigned long flags; unsigned int i; int ret = 0; @@ -3338,18 +3405,10 @@ static int onmax_create(struct hist_trigger_data *hist_data, return -EINVAL; } - flags = HIST_FIELD_FL_VAR_REF; - ref_field = create_hist_field(hist_data, NULL, flags, NULL); + ref_field = create_var_ref(hist_data, var_field, NULL, NULL); if (!ref_field) return -ENOMEM; - if (init_var_ref(ref_field, var_field, NULL, NULL)) { - destroy_hist_field(ref_field, 0); - ret = -ENOMEM; - goto out; - } - hist_data->var_refs[hist_data->n_var_refs] = ref_field; - ref_field->var_ref_idx = hist_data->n_var_refs++; data->onmax.var = ref_field; data->fn = onmax_save; @@ -3538,9 +3597,6 @@ static void save_synth_var_ref(struct hist_trigger_data *hist_data, struct hist_field *var_ref) { hist_data->synth_var_refs[hist_data->n_synth_var_refs++] = var_ref; - - hist_data->var_refs[hist_data->n_var_refs] = var_ref; - var_ref->var_ref_idx = hist_data->n_var_refs++; } static int check_synth_field(struct synth_event *event, @@ -3694,7 +3750,8 @@ static int onmatch_create(struct hist_trigger_data *hist_data, } if (check_synth_field(event, hist_field, field_pos) == 0) { - var_ref = create_var_ref(hist_field, system, event_name); + var_ref = create_var_ref(hist_data, hist_field, + system, event_name); if (!var_ref) { kfree(p); ret = -ENOMEM; -- cgit v1.2.3 From 656fe2ba85e81d00e4447bf77b8da2be3c47acb2 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 18 Dec 2018 14:33:24 -0600 Subject: tracing: Use hist trigger's var_ref array to destroy var_refs Since every var ref for a trigger has an entry in the var_ref[] array, use that to destroy the var_refs, instead of piecemeal via the field expressions. This allows us to avoid having to keep and treat differently separate lists for the action-related references, which future patches will remove. Link: http://lkml.kernel.org/r/fad1a164f0e257c158e70d6eadbf6c586e04b2a2.1545161087.git.tom.zanussi@linux.intel.com Acked-by: Namhyung Kim Reviewed-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 6309f4dbfb9c..923c572ee68d 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2190,6 +2190,15 @@ static int contains_operator(char *str) return field_op; } +static void __destroy_hist_field(struct hist_field *hist_field) +{ + kfree(hist_field->var.name); + kfree(hist_field->name); + kfree(hist_field->type); + + kfree(hist_field); +} + static void destroy_hist_field(struct hist_field *hist_field, unsigned int level) { @@ -2201,14 +2210,13 @@ static void destroy_hist_field(struct hist_field *hist_field, if (!hist_field) return; + if (hist_field->flags & HIST_FIELD_FL_VAR_REF) + return; /* var refs will be destroyed separately */ + for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) destroy_hist_field(hist_field->operands[i], level + 1); - kfree(hist_field->var.name); - kfree(hist_field->name); - kfree(hist_field->type); - - kfree(hist_field); + __destroy_hist_field(hist_field); } static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, @@ -2335,6 +2343,12 @@ static void destroy_hist_fields(struct hist_trigger_data *hist_data) hist_data->fields[i] = NULL; } } + + for (i = 0; i < hist_data->n_var_refs; i++) { + WARN_ON(!(hist_data->var_refs[i]->flags & HIST_FIELD_FL_VAR_REF)); + __destroy_hist_field(hist_data->var_refs[i]); + hist_data->var_refs[i] = NULL; + } } static int init_var_ref(struct hist_field *ref_field, -- cgit v1.2.3 From 912201345f7c39e6b0ac283207be2b6641fa47b9 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 18 Dec 2018 14:33:25 -0600 Subject: tracing: Remove hist trigger synth_var_refs All var_refs are now handled uniformly and there's no reason to treat the synth_refs in a special way now, so remove them and associated functions. Link: http://lkml.kernel.org/r/b4d3470526b8f0426dcec125399dad9ad9b8589d.1545161087.git.tom.zanussi@linux.intel.com Acked-by: Namhyung Kim Reviewed-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 923c572ee68d..1b1aee944588 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -279,8 +279,6 @@ struct hist_trigger_data { struct action_data *actions[HIST_ACTIONS_MAX]; unsigned int n_actions; - struct hist_field *synth_var_refs[SYNTH_FIELDS_MAX]; - unsigned int n_synth_var_refs; struct field_var *field_vars[SYNTH_FIELDS_MAX]; unsigned int n_field_vars; unsigned int n_field_var_str; @@ -3599,20 +3597,6 @@ static void save_field_var(struct hist_trigger_data *hist_data, } -static void destroy_synth_var_refs(struct hist_trigger_data *hist_data) -{ - unsigned int i; - - for (i = 0; i < hist_data->n_synth_var_refs; i++) - destroy_hist_field(hist_data->synth_var_refs[i], 0); -} - -static void save_synth_var_ref(struct hist_trigger_data *hist_data, - struct hist_field *var_ref) -{ - hist_data->synth_var_refs[hist_data->n_synth_var_refs++] = var_ref; -} - static int check_synth_field(struct synth_event *event, struct hist_field *hist_field, unsigned int field_pos) @@ -3772,7 +3756,6 @@ static int onmatch_create(struct hist_trigger_data *hist_data, goto err; } - save_synth_var_ref(hist_data, var_ref); field_pos++; kfree(p); continue; @@ -4516,7 +4499,6 @@ static void destroy_hist_data(struct hist_trigger_data *hist_data) destroy_actions(hist_data); destroy_field_vars(hist_data); destroy_field_var_hists(hist_data); - destroy_synth_var_refs(hist_data); kfree(hist_data); } -- cgit v1.2.3 From 05ddb25cb31491b0901d057c40ad50d01b3db783 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 18 Dec 2018 14:33:26 -0600 Subject: tracing: Add hist trigger comments for variable-related fields Add a few comments to help clarify how variable and variable reference fields are used in the code. Link: http://lkml.kernel.org/r/ea857ce948531d7bec712bbb0f38360aa1d378ec.1545161087.git.tom.zanussi@linux.intel.com Acked-by: Namhyung Kim Reviewed-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1b1aee944588..c5448c103770 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -40,6 +40,16 @@ enum field_op_id { FIELD_OP_UNARY_MINUS, }; +/* + * A hist_var (histogram variable) contains variable information for + * hist_fields having the HIST_FIELD_FL_VAR or HIST_FIELD_FL_VAR_REF + * flag set. A hist_var has a variable name e.g. ts0, and is + * associated with a given histogram trigger, as specified by + * hist_data. The hist_var idx is the unique index assigned to the + * variable by the hist trigger's tracing_map. The idx is what is + * used to set a variable's value and, by a variable reference, to + * retrieve it. + */ struct hist_var { char *name; struct hist_trigger_data *hist_data; @@ -56,11 +66,29 @@ struct hist_field { const char *type; struct hist_field *operands[HIST_FIELD_OPERANDS_MAX]; struct hist_trigger_data *hist_data; + + /* + * Variable fields contain variable-specific info in var. + */ struct hist_var var; enum field_op_id operator; char *system; char *event_name; + + /* + * The name field is used for EXPR and VAR_REF fields. VAR + * fields contain the variable name in var.name. + */ char *name; + + /* + * When a histogram trigger is hit, if it has any references + * to variables, the values of those variables are collected + * into a var_ref_vals array by resolve_var_refs(). The + * current value of each variable is read from the tracing_map + * using the hist field's hist_var.idx and entered into the + * var_ref_idx entry i.e. var_ref_vals[var_ref_idx]. + */ unsigned int var_ref_idx; bool read_once; }; @@ -365,6 +393,14 @@ struct action_data { union { struct { + /* + * When a histogram trigger is hit, the values of any + * references to variables, including variables being passed + * as parameters to synthetic events, are collected into a + * var_ref_vals array. This var_ref_idx is the index of the + * first param in the array to be passed to the synthetic + * event invocation. + */ unsigned int var_ref_idx; char *match_event; char *match_event_system; -- cgit v1.2.3 From 59dd974bc079079c23b5429cba841696fa7fae41 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 29 Oct 2018 23:35:40 +0100 Subject: tracing: Merge seq_print_sym_short() and seq_print_sym_offset() These two functions are nearly identical, so we can avoid some code duplication by moving the conditional into a common implementation. Link: http://lkml.kernel.org/r/20181029223542.26175-2-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_output.c | 34 +++++++--------------------------- 1 file changed, 7 insertions(+), 27 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 6e6cc64faa38..85ecd061c7be 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -339,34 +339,17 @@ static inline const char *kretprobed(const char *name) #endif /* CONFIG_KRETPROBES */ static void -seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) +seq_print_sym(struct trace_seq *s, const char *fmt, unsigned long address, + bool offset) { char str[KSYM_SYMBOL_LEN]; #ifdef CONFIG_KALLSYMS const char *name; - kallsyms_lookup(address, NULL, NULL, NULL, str); - - name = kretprobed(str); - - if (name && strlen(name)) { - trace_seq_printf(s, fmt, name); - return; - } -#endif - snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address); - trace_seq_printf(s, fmt, str); -} - -static void -seq_print_sym_offset(struct trace_seq *s, const char *fmt, - unsigned long address) -{ - char str[KSYM_SYMBOL_LEN]; -#ifdef CONFIG_KALLSYMS - const char *name; - - sprint_symbol(str, address); + if (offset) + sprint_symbol(str, address); + else + kallsyms_lookup(address, NULL, NULL, NULL, str); name = kretprobed(str); if (name && strlen(name)) { @@ -424,10 +407,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) goto out; } - if (sym_flags & TRACE_ITER_SYM_OFFSET) - seq_print_sym_offset(s, "%s", ip); - else - seq_print_sym_short(s, "%s", ip); + seq_print_sym(s, "%s", ip, sym_flags & TRACE_ITER_SYM_OFFSET); if (sym_flags & TRACE_ITER_SYM_ADDR) trace_seq_printf(s, " <" IP_FMT ">", ip); -- cgit v1.2.3 From cc9f59fb3bc455984404dbab8be16f156a47d023 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 29 Oct 2018 23:35:41 +0100 Subject: tracing: Avoid -Wformat-nonliteral warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Building with -Wformat-nonliteral, gcc complains kernel/trace/trace_output.c: In function ‘seq_print_sym’: kernel/trace/trace_output.c:356:3: warning: format not a string literal, argument types not checked [-Wformat-nonliteral] trace_seq_printf(s, fmt, name); But seq_print_sym only has a single caller which passes "%s" as fmt, so we might as well just use that directly. That also paves the way for further cleanups that will actually make that format string go away entirely. Link: http://lkml.kernel.org/r/20181029223542.26175-3-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_output.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 85ecd061c7be..f06fb899b746 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -339,8 +339,7 @@ static inline const char *kretprobed(const char *name) #endif /* CONFIG_KRETPROBES */ static void -seq_print_sym(struct trace_seq *s, const char *fmt, unsigned long address, - bool offset) +seq_print_sym(struct trace_seq *s, unsigned long address, bool offset) { char str[KSYM_SYMBOL_LEN]; #ifdef CONFIG_KALLSYMS @@ -353,12 +352,12 @@ seq_print_sym(struct trace_seq *s, const char *fmt, unsigned long address, name = kretprobed(str); if (name && strlen(name)) { - trace_seq_printf(s, fmt, name); + trace_seq_printf(s, "%s", name); return; } #endif snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address); - trace_seq_printf(s, fmt, str); + trace_seq_printf(s, "%s", str); } #ifndef CONFIG_64BIT @@ -407,7 +406,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) goto out; } - seq_print_sym(s, "%s", ip, sym_flags & TRACE_ITER_SYM_OFFSET); + seq_print_sym(s, ip, sym_flags & TRACE_ITER_SYM_OFFSET); if (sym_flags & TRACE_ITER_SYM_ADDR) trace_seq_printf(s, " <" IP_FMT ">", ip); -- cgit v1.2.3 From bea6957d5cd7cbb327083d3bcf080133ee63ef65 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 29 Oct 2018 23:35:42 +0100 Subject: tracing: Simplify printf'ing in seq_print_sym trace_seq_printf(..., "%s", ...) can be done with trace_seq_puts() instead, avoiding printf overhead. In the second instance, the string we're copying was just created from an snprintf() to a stack buffer, so we might as well do that printf directly. This naturally leads to moving the declaration of the str buffer inside the CONFIG_KALLSYMS guard, which in turn will make gcc inline the function for !CONFIG_KALLSYMS (it only has a single caller, but the huge stack frame seems to make gcc not inline it for CONFIG_KALLSYMS). Link: http://lkml.kernel.org/r/20181029223542.26175-4-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_output.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index f06fb899b746..54373d93e251 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -341,8 +341,8 @@ static inline const char *kretprobed(const char *name) static void seq_print_sym(struct trace_seq *s, unsigned long address, bool offset) { - char str[KSYM_SYMBOL_LEN]; #ifdef CONFIG_KALLSYMS + char str[KSYM_SYMBOL_LEN]; const char *name; if (offset) @@ -352,12 +352,11 @@ seq_print_sym(struct trace_seq *s, unsigned long address, bool offset) name = kretprobed(str); if (name && strlen(name)) { - trace_seq_printf(s, "%s", name); + trace_seq_puts(s, name); return; } #endif - snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address); - trace_seq_printf(s, "%s", str); + trace_seq_printf(s, "0x%08lx", address); } #ifndef CONFIG_64BIT -- cgit v1.2.3 From 1cce377df1800154301525a8ee04100dd83c9633 Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Wed, 16 May 2018 21:30:12 +0200 Subject: =?UTF-8?q?tracing:=20Make=20function=20=E2=80=98ftrace=5Fexports?= =?UTF-8?q?=E2=80=99=20static?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In commit 478409dd683d ("tracing: Add hook to function tracing for other subsystems to use"), a new function ‘ftrace_exports’ was added. Since this function can be made static, make it so. Silence the following warning triggered using W=1: kernel/trace/trace.c:2451:6: warning: no previous prototype for ‘ftrace_exports’ [-Wmissing-prototypes] Link: http://lkml.kernel.org/r/20180516193012.25390-1-malat@debian.org Signed-off-by: Mathieu Malaterre Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 911470ad9e94..5afcfecb4bc2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2452,7 +2452,7 @@ static inline void ftrace_exports_disable(void) static_branch_disable(&ftrace_exports_enabled); } -void ftrace_exports(struct ring_buffer_event *event) +static void ftrace_exports(struct ring_buffer_event *event) { struct trace_export *export; -- cgit v1.2.3 From 754481e6954cbef53f8bc4412ad48dde611e21d3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 19 Dec 2018 22:38:21 -0500 Subject: tracing: Use str_has_prefix() helper for histogram code The tracing histogram code contains a lot of instances of the construct: strncmp(str, "const", sizeof("const") - 1) This can be prone to bugs due to typos or bad cut and paste. Use the str_has_prefix() helper macro instead that removes the need for having two copies of the constant string. Cc: Tom Zanussi Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index c5448c103770..9d590138f870 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1881,8 +1881,8 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs) if (attrs->n_actions >= HIST_ACTIONS_MAX) return ret; - if ((strncmp(str, "onmatch(", sizeof("onmatch(") - 1) == 0) || - (strncmp(str, "onmax(", sizeof("onmax(") - 1) == 0)) { + if ((str_has_prefix(str, "onmatch(")) || + (str_has_prefix(str, "onmax("))) { attrs->action_str[attrs->n_actions] = kstrdup(str, GFP_KERNEL); if (!attrs->action_str[attrs->n_actions]) { ret = -ENOMEM; @@ -1899,34 +1899,34 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) { int ret = 0; - if ((strncmp(str, "key=", sizeof("key=") - 1) == 0) || - (strncmp(str, "keys=", sizeof("keys=") - 1) == 0)) { + if ((str_has_prefix(str, "key=")) || + (str_has_prefix(str, "keys="))) { attrs->keys_str = kstrdup(str, GFP_KERNEL); if (!attrs->keys_str) { ret = -ENOMEM; goto out; } - } else if ((strncmp(str, "val=", sizeof("val=") - 1) == 0) || - (strncmp(str, "vals=", sizeof("vals=") - 1) == 0) || - (strncmp(str, "values=", sizeof("values=") - 1) == 0)) { + } else if ((str_has_prefix(str, "val=")) || + (str_has_prefix(str, "vals=")) || + (str_has_prefix(str, "values="))) { attrs->vals_str = kstrdup(str, GFP_KERNEL); if (!attrs->vals_str) { ret = -ENOMEM; goto out; } - } else if (strncmp(str, "sort=", sizeof("sort=") - 1) == 0) { + } else if (str_has_prefix(str, "sort=")) { attrs->sort_key_str = kstrdup(str, GFP_KERNEL); if (!attrs->sort_key_str) { ret = -ENOMEM; goto out; } - } else if (strncmp(str, "name=", sizeof("name=") - 1) == 0) { + } else if (str_has_prefix(str, "name=")) { attrs->name = kstrdup(str, GFP_KERNEL); if (!attrs->name) { ret = -ENOMEM; goto out; } - } else if (strncmp(str, "clock=", sizeof("clock=") - 1) == 0) { + } else if (str_has_prefix(str, "clock=")) { strsep(&str, "="); if (!str) { ret = -EINVAL; @@ -1939,7 +1939,7 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) ret = -ENOMEM; goto out; } - } else if (strncmp(str, "size=", sizeof("size=") - 1) == 0) { + } else if (str_has_prefix(str, "size=")) { int map_bits = parse_map_size(str); if (map_bits < 0) { @@ -3558,7 +3558,7 @@ static struct action_data *onmax_parse(char *str) if (!onmax_fn_name || !str) goto free; - if (strncmp(onmax_fn_name, "save", sizeof("save") - 1) == 0) { + if (str_has_prefix(onmax_fn_name, "save")) { char *params = strsep(&str, ")"); if (!params) { @@ -4346,7 +4346,7 @@ static int parse_actions(struct hist_trigger_data *hist_data) for (i = 0; i < hist_data->attrs->n_actions; i++) { str = hist_data->attrs->action_str[i]; - if (strncmp(str, "onmatch(", sizeof("onmatch(") - 1) == 0) { + if (str_has_prefix(str, "onmatch(")) { char *action_str = str + sizeof("onmatch(") - 1; data = onmatch_parse(tr, action_str); @@ -4355,7 +4355,7 @@ static int parse_actions(struct hist_trigger_data *hist_data) break; } data->fn = action_trace; - } else if (strncmp(str, "onmax(", sizeof("onmax(") - 1) == 0) { + } else if (str_has_prefix(str, "onmax(")) { char *action_str = str + sizeof("onmax(") - 1; data = onmax_parse(action_str); -- cgit v1.2.3 From b6b2735514bcd70ad1556a33892a636b20ece671 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 20 Dec 2018 13:20:07 -0500 Subject: tracing: Use str_has_prefix() instead of using fixed sizes There are several instances of strncmp(str, "const", 123), where 123 is the strlen of the const string to check if "const" is the prefix of str. But this can be error prone. Use str_has_prefix() instead. Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- kernel/trace/trace_events.c | 2 +- kernel/trace/trace_events_hist.c | 2 +- kernel/trace/trace_probe.c | 4 ++-- kernel/trace/trace_stack.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5afcfecb4bc2..eac2824a18ab 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4411,7 +4411,7 @@ static int trace_set_options(struct trace_array *tr, char *option) cmp = strstrip(option); - if (strncmp(cmp, "no", 2) == 0) { + if (str_has_prefix(cmp, "no")) { neg = 1; cmp += 2; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index bd0162c0467c..5b3b0c3c8a47 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1251,7 +1251,7 @@ static int f_show(struct seq_file *m, void *v) */ array_descriptor = strchr(field->type, '['); - if (!strncmp(field->type, "__data_loc", 10)) + if (str_has_prefix(field->type, "__data_loc")) array_descriptor = NULL; if (!array_descriptor) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 9d590138f870..0d878dcd1e4b 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -518,7 +518,7 @@ static int synth_event_define_fields(struct trace_event_call *call) static bool synth_field_signed(char *type) { - if (strncmp(type, "u", 1) == 0) + if (str_has_prefix(type, "u")) return false; return true; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index ff86417c0149..541375737403 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -194,7 +194,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, code->op = FETCH_OP_RETVAL; else ret = -EINVAL; - } else if (strncmp(arg, "stack", 5) == 0) { + } else if (str_has_prefix(arg, "stack")) { if (arg[5] == '\0') { code->op = FETCH_OP_STACKP; } else if (isdigit(arg[5])) { @@ -213,7 +213,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API } else if (((flags & TPARG_FL_MASK) == (TPARG_FL_KERNEL | TPARG_FL_FENTRY)) && - strncmp(arg, "arg", 3) == 0) { + str_has_prefix(arg, "arg")) { if (!isdigit(arg[3])) return -EINVAL; ret = kstrtoul(arg + 3, 10, ¶m); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index e2a153fc1afc..3641f28c343f 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -448,7 +448,7 @@ static char stack_trace_filter_buf[COMMAND_LINE_SIZE+1] __initdata; static __init int enable_stacktrace(char *str) { - if (strncmp(str, "_filter=", 8) == 0) + if (str_has_prefix(str, "_filter=")) strncpy(stack_trace_filter_buf, str+8, COMMAND_LINE_SIZE); stack_tracer_enabled = 1; -- cgit v1.2.3 From 036876fa56204ae0fa59045bd6bbb2691a060633 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 21 Dec 2018 18:40:46 -0500 Subject: tracing: Have the historgram use the result of str_has_prefix() for len of prefix As str_has_prefix() returns the length on match, we can use that for the updating of the string pointer instead of recalculating the prefix size. Cc: Tom Zanussi Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 0d878dcd1e4b..449d90cfa151 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -4342,12 +4342,13 @@ static int parse_actions(struct hist_trigger_data *hist_data) unsigned int i; int ret = 0; char *str; + int len; for (i = 0; i < hist_data->attrs->n_actions; i++) { str = hist_data->attrs->action_str[i]; - if (str_has_prefix(str, "onmatch(")) { - char *action_str = str + sizeof("onmatch(") - 1; + if ((len = str_has_prefix(str, "onmatch("))) { + char *action_str = str + len; data = onmatch_parse(tr, action_str); if (IS_ERR(data)) { @@ -4355,8 +4356,8 @@ static int parse_actions(struct hist_trigger_data *hist_data) break; } data->fn = action_trace; - } else if (str_has_prefix(str, "onmax(")) { - char *action_str = str + sizeof("onmax(") - 1; + } else if ((len = str_has_prefix(str, "onmax("))) { + char *action_str = str + len; data = onmax_parse(action_str); if (IS_ERR(data)) { -- cgit v1.2.3 From 3d739c1f6156c70eb0548aa288dcfbac9e0bd162 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 21 Dec 2018 23:10:26 -0500 Subject: tracing: Use the return of str_has_prefix() to remove open coded numbers There are several locations that compare constants to the beginning of string variables to determine what commands should be done, then the constant length is used to index into the string. This is error prone as the hard coded numbers have to match the size of the constants. Instead, use the len returned from str_has_prefix() and remove the open coded string length sizes. Cc: Joe Perches Acked-by: Masami Hiramatsu (for trace_probe part) Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 8 +++++--- kernel/trace/trace_probe.c | 17 +++++++++-------- kernel/trace/trace_stack.c | 6 ++++-- 3 files changed, 18 insertions(+), 13 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index eac2824a18ab..18b86c3974e1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4408,13 +4408,15 @@ static int trace_set_options(struct trace_array *tr, char *option) int neg = 0; int ret; size_t orig_len = strlen(option); + int len; cmp = strstrip(option); - if (str_has_prefix(cmp, "no")) { + len = str_has_prefix(cmp, "no"); + if (len) neg = 1; - cmp += 2; - } + + cmp += len; mutex_lock(&trace_types_lock); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 541375737403..9962cb5da8ac 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -186,19 +186,20 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, static int parse_probe_vars(char *arg, const struct fetch_type *t, struct fetch_insn *code, unsigned int flags) { - int ret = 0; unsigned long param; + int ret = 0; + int len; if (strcmp(arg, "retval") == 0) { if (flags & TPARG_FL_RETURN) code->op = FETCH_OP_RETVAL; else ret = -EINVAL; - } else if (str_has_prefix(arg, "stack")) { - if (arg[5] == '\0') { + } else if ((len = str_has_prefix(arg, "stack"))) { + if (arg[len] == '\0') { code->op = FETCH_OP_STACKP; - } else if (isdigit(arg[5])) { - ret = kstrtoul(arg + 5, 10, ¶m); + } else if (isdigit(arg[len])) { + ret = kstrtoul(arg + len, 10, ¶m); if (ret || ((flags & TPARG_FL_KERNEL) && param > PARAM_MAX_STACK)) ret = -EINVAL; @@ -213,10 +214,10 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API } else if (((flags & TPARG_FL_MASK) == (TPARG_FL_KERNEL | TPARG_FL_FENTRY)) && - str_has_prefix(arg, "arg")) { - if (!isdigit(arg[3])) + (len = str_has_prefix(arg, "arg"))) { + if (!isdigit(arg[len])) return -EINVAL; - ret = kstrtoul(arg + 3, 10, ¶m); + ret = kstrtoul(arg + len, 10, ¶m); if (ret || !param || param > PARAM_MAX_STACK) return -EINVAL; code->op = FETCH_OP_ARG; diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 3641f28c343f..eec648a0d673 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -448,8 +448,10 @@ static char stack_trace_filter_buf[COMMAND_LINE_SIZE+1] __initdata; static __init int enable_stacktrace(char *str) { - if (str_has_prefix(str, "_filter=")) - strncpy(stack_trace_filter_buf, str+8, COMMAND_LINE_SIZE); + int len; + + if ((len = str_has_prefix(str, "_filter="))) + strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE); stack_tracer_enabled = 1; last_stack_tracer_enabled = 1; -- cgit v1.2.3 From 96d4f267e40f9509e8a66e2b39e8b95655617693 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 3 Jan 2019 18:57:57 -0800 Subject: Remove 'type' argument from access_ok() function Nobody has actually used the type (VERIFY_READ vs VERIFY_WRITE) argument of the user address range verification function since we got rid of the old racy i386-only code to walk page tables by hand. It existed because the original 80386 would not honor the write protect bit when in kernel mode, so you had to do COW by hand before doing any user access. But we haven't supported that in a long time, and these days the 'type' argument is a purely historical artifact. A discussion about extending 'user_access_begin()' to do the range checking resulted this patch, because there is no way we're going to move the old VERIFY_xyz interface to that model. And it's best done at the end of the merge window when I've done most of my merges, so let's just get this done once and for all. This patch was mostly done with a sed-script, with manual fix-ups for the cases that weren't of the trivial 'access_ok(VERIFY_xyz' form. There were a couple of notable cases: - csky still had the old "verify_area()" name as an alias. - the iter_iov code had magical hardcoded knowledge of the actual values of VERIFY_{READ,WRITE} (not that they mattered, since nothing really used it) - microblaze used the type argument for a debug printout but other than those oddities this should be a total no-op patch. I tried to fix up all architectures, did fairly extensive grepping for access_ok() uses, and the changes are trivial, but I may have missed something. Any missed conversion should be trivially fixable, though. Signed-off-by: Linus Torvalds --- kernel/trace/bpf_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 9ddb6fddb4e0..8b068adb9da1 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -170,7 +170,7 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, return -EPERM; if (unlikely(uaccess_kernel())) return -EPERM; - if (!access_ok(VERIFY_WRITE, unsafe_ptr, size)) + if (!access_ok(unsafe_ptr, size)) return -EPERM; return probe_kernel_write(unsafe_ptr, src, size); -- cgit v1.2.3 From 8b05a3a7503c2a982c9c462eae96cfbd59506783 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Fri, 11 Jan 2019 07:01:13 +0100 Subject: tracing/kprobes: Fix NULL pointer dereference in trace_kprobe_create() It is possible to trigger a NULL pointer dereference by writing an incorrectly formatted string to krpobe_events (trying to create a kretprobe omitting the symbol). Example: echo "r:event_1 " >> /sys/kernel/debug/tracing/kprobe_events That triggers this: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 #PF error: [normal kernel read fault] PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI CPU: 6 PID: 1757 Comm: bash Not tainted 5.0.0-rc1+ #125 Hardware name: Dell Inc. XPS 13 9370/0F6P3V, BIOS 1.5.1 08/09/2018 RIP: 0010:kstrtoull+0x2/0x20 Code: 28 00 00 00 75 17 48 83 c4 18 5b 41 5c 5d c3 b8 ea ff ff ff eb e1 b8 de ff ff ff eb da e8 d6 36 bb ff 66 0f 1f 44 00 00 31 c0 <80> 3f 2b 55 48 89 e5 0f 94 c0 48 01 c7 e8 5c ff ff ff 5d c3 66 2e RSP: 0018:ffffb5d482e57cb8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000001 RCX: ffffffff82b12720 RDX: ffffb5d482e57cf8 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffffb5d482e57d70 R08: ffffa0c05e5a7080 R09: ffffa0c05e003980 R10: 0000000000000000 R11: 0000000040000000 R12: ffffa0c04fe87b08 R13: 0000000000000001 R14: 000000000000000b R15: ffffa0c058d749e1 FS: 00007f137c7f7740(0000) GS:ffffa0c05e580000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000497d46004 CR4: 00000000003606e0 Call Trace: ? trace_kprobe_create+0xb6/0x840 ? _cond_resched+0x19/0x40 ? _cond_resched+0x19/0x40 ? __kmalloc+0x62/0x210 ? argv_split+0x8f/0x140 ? trace_kprobe_create+0x840/0x840 ? trace_kprobe_create+0x840/0x840 create_or_delete_trace_kprobe+0x11/0x30 trace_run_command+0x50/0x90 trace_parse_run_command+0xc1/0x160 probes_write+0x10/0x20 __vfs_write+0x3a/0x1b0 ? apparmor_file_permission+0x1a/0x20 ? security_file_permission+0x31/0xf0 ? _cond_resched+0x19/0x40 vfs_write+0xb1/0x1a0 ksys_write+0x55/0xc0 __x64_sys_write+0x1a/0x20 do_syscall_64+0x5a/0x120 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fix by doing the proper argument checks in trace_kprobe_create(). Cc: Ingo Molnar Link: https://lore.kernel.org/lkml/20190111095108.b79a2ee026185cbd62365977@kernel.org Link: http://lkml.kernel.org/r/20190111060113.GA22841@xps-13 Fixes: 6212dd29683e ("tracing/kprobes: Use dyn_event framework for kprobe events") Acked-by: Masami Hiramatsu Signed-off-by: Andrea Righi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5c19b8c41c7e..d5fb09ebba8b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -607,11 +607,17 @@ static int trace_kprobe_create(int argc, const char *argv[]) char buf[MAX_EVENT_NAME_LEN]; unsigned int flags = TPARG_FL_KERNEL; - /* argc must be >= 1 */ - if (argv[0][0] == 'r') { + switch (argv[0][0]) { + case 'r': is_return = true; flags |= TPARG_FL_RETURN; - } else if (argv[0][0] != 'p' || argc < 2) + break; + case 'p': + break; + default: + return -ECANCELED; + } + if (argc < 2) return -ECANCELED; event = strchr(&argv[0][1], ':'); -- cgit v1.2.3 From ea6eb5e7d15e1838de335609994b4546e2abcaaf Mon Sep 17 00:00:00 2001 From: Andreas Ziegler Date: Thu, 17 Jan 2019 14:30:23 +0100 Subject: tracing: uprobes: Fix typo in pr_fmt string The subsystem-specific message prefix for uprobes was also "trace_kprobe: " instead of "trace_uprobe: " as described in the original commit message. Link: http://lkml.kernel.org/r/20190117133023.19292-1-andreas.ziegler@fau.de Cc: Ingo Molnar Cc: stable@vger.kernel.org Acked-by: Masami Hiramatsu Fixes: 7257634135c24 ("tracing/probe: Show subsystem name in messages") Signed-off-by: Andreas Ziegler Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index e335576b9411..19a1a8e19062 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -5,7 +5,7 @@ * Copyright (C) IBM Corporation, 2010-2012 * Author: Srikar Dronamraju */ -#define pr_fmt(fmt) "trace_kprobe: " fmt +#define pr_fmt(fmt) "trace_uprobe: " fmt #include #include -- cgit v1.2.3 From 0722069a5374b904ec1a67f91249f90e1cfae259 Mon Sep 17 00:00:00 2001 From: Andreas Ziegler Date: Wed, 16 Jan 2019 15:16:29 +0100 Subject: tracing/uprobes: Fix output for multiple string arguments When printing multiple uprobe arguments as strings the output for the earlier arguments would also include all later string arguments. This is best explained in an example: Consider adding a uprobe to a function receiving two strings as parameters which is at offset 0xa0 in strlib.so and we want to print both parameters when the uprobe is hit (on x86_64): $ echo 'p:func /lib/strlib.so:0xa0 +0(%di):string +0(%si):string' > \ /sys/kernel/debug/tracing/uprobe_events When the function is called as func("foo", "bar") and we hit the probe, the trace file shows a line like the following: [...] func: (0x7f7e683706a0) arg1="foobar" arg2="bar" Note the extra "bar" printed as part of arg1. This behaviour stacks up for additional string arguments. The strings are stored in a dynamically growing part of the uprobe buffer by fetch_store_string() after copying them from userspace via strncpy_from_user(). The return value of strncpy_from_user() is then directly used as the required size for the string. However, this does not take the terminating null byte into account as the documentation for strncpy_from_user() cleary states that it "[...] returns the length of the string (not including the trailing NUL)" even though the null byte will be copied to the destination. Therefore, subsequent calls to fetch_store_string() will overwrite the terminating null byte of the most recently fetched string with the first character of the current string, leading to the "accumulation" of strings in earlier arguments in the output. Fix this by incrementing the return value of strncpy_from_user() by one if we did not hit the maximum buffer size. Link: http://lkml.kernel.org/r/20190116141629.5752-1-andreas.ziegler@fau.de Cc: Ingo Molnar Cc: stable@vger.kernel.org Fixes: 5baaa59ef09e ("tracing/probes: Implement 'memory' fetch method for uprobes") Acked-by: Masami Hiramatsu Signed-off-by: Andreas Ziegler Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 19a1a8e19062..9bde07c06362 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -160,6 +160,13 @@ fetch_store_string(unsigned long addr, void *dest, void *base) if (ret >= 0) { if (ret == maxlen) dst[ret - 1] = '\0'; + else + /* + * Include the terminating null byte. In this case it + * was copied by strncpy_from_user but not accounted + * for in ret. + */ + ret++; *(u32 *)dest = make_data_loc(ret, (void *)dst - base); } -- cgit v1.2.3 From 5620196951192f7cd2da0a04e7c0113f40bfc14e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 11 Jan 2019 13:20:20 -0300 Subject: perf: Make perf_event_output() propagate the output() return For the original mode of operation it isn't needed, since we report back errors via PERF_RECORD_LOST records in the ring buffer, but for use in bpf_perf_event_output() it is convenient to return the errors, basically -ENOSPC. Currently bpf_perf_event_output() returns an error indication, the last thing it does, which is to push it to the ring buffer is that can fail and if so, this failure won't be reported back to its users, fix it. Reported-by: Jamal Hadi Salim Tested-by: Jamal Hadi Salim Acked-by: Peter Zijlstra (Intel) Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/r/20190118150938.GN5823@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/bpf_trace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 8b068adb9da1..088c2032ceaf 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -431,8 +431,7 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, if (unlikely(event->oncpu != cpu)) return -EOPNOTSUPP; - perf_event_output(event, sd, regs); - return 0; + return perf_event_output(event, sd, regs); } BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, -- cgit v1.2.3 From e16ec34039c701594d55d08a5aa49ee3e1abc821 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 30 Jan 2019 18:12:44 -0800 Subject: bpf: fix potential deadlock in bpf_prog_register Lockdep found a potential deadlock between cpu_hotplug_lock, bpf_event_mutex, and cpuctx_mutex: [ 13.007000] WARNING: possible circular locking dependency detected [ 13.007587] 5.0.0-rc3-00018-g2fa53f892422-dirty #477 Not tainted [ 13.008124] ------------------------------------------------------ [ 13.008624] test_progs/246 is trying to acquire lock: [ 13.009030] 0000000094160d1d (tracepoints_mutex){+.+.}, at: tracepoint_probe_register_prio+0x2d/0x300 [ 13.009770] [ 13.009770] but task is already holding lock: [ 13.010239] 00000000d663ef86 (bpf_event_mutex){+.+.}, at: bpf_probe_register+0x1d/0x60 [ 13.010877] [ 13.010877] which lock already depends on the new lock. [ 13.010877] [ 13.011532] [ 13.011532] the existing dependency chain (in reverse order) is: [ 13.012129] [ 13.012129] -> #4 (bpf_event_mutex){+.+.}: [ 13.012582] perf_event_query_prog_array+0x9b/0x130 [ 13.013016] _perf_ioctl+0x3aa/0x830 [ 13.013354] perf_ioctl+0x2e/0x50 [ 13.013668] do_vfs_ioctl+0x8f/0x6a0 [ 13.014003] ksys_ioctl+0x70/0x80 [ 13.014320] __x64_sys_ioctl+0x16/0x20 [ 13.014668] do_syscall_64+0x4a/0x180 [ 13.015007] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 13.015469] [ 13.015469] -> #3 (&cpuctx_mutex){+.+.}: [ 13.015910] perf_event_init_cpu+0x5a/0x90 [ 13.016291] perf_event_init+0x1b2/0x1de [ 13.016654] start_kernel+0x2b8/0x42a [ 13.016995] secondary_startup_64+0xa4/0xb0 [ 13.017382] [ 13.017382] -> #2 (pmus_lock){+.+.}: [ 13.017794] perf_event_init_cpu+0x21/0x90 [ 13.018172] cpuhp_invoke_callback+0xb3/0x960 [ 13.018573] _cpu_up+0xa7/0x140 [ 13.018871] do_cpu_up+0xa4/0xc0 [ 13.019178] smp_init+0xcd/0xd2 [ 13.019483] kernel_init_freeable+0x123/0x24f [ 13.019878] kernel_init+0xa/0x110 [ 13.020201] ret_from_fork+0x24/0x30 [ 13.020541] [ 13.020541] -> #1 (cpu_hotplug_lock.rw_sem){++++}: [ 13.021051] static_key_slow_inc+0xe/0x20 [ 13.021424] tracepoint_probe_register_prio+0x28c/0x300 [ 13.021891] perf_trace_event_init+0x11f/0x250 [ 13.022297] perf_trace_init+0x6b/0xa0 [ 13.022644] perf_tp_event_init+0x25/0x40 [ 13.023011] perf_try_init_event+0x6b/0x90 [ 13.023386] perf_event_alloc+0x9a8/0xc40 [ 13.023754] __do_sys_perf_event_open+0x1dd/0xd30 [ 13.024173] do_syscall_64+0x4a/0x180 [ 13.024519] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 13.024968] [ 13.024968] -> #0 (tracepoints_mutex){+.+.}: [ 13.025434] __mutex_lock+0x86/0x970 [ 13.025764] tracepoint_probe_register_prio+0x2d/0x300 [ 13.026215] bpf_probe_register+0x40/0x60 [ 13.026584] bpf_raw_tracepoint_open.isra.34+0xa4/0x130 [ 13.027042] __do_sys_bpf+0x94f/0x1a90 [ 13.027389] do_syscall_64+0x4a/0x180 [ 13.027727] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 13.028171] [ 13.028171] other info that might help us debug this: [ 13.028171] [ 13.028807] Chain exists of: [ 13.028807] tracepoints_mutex --> &cpuctx_mutex --> bpf_event_mutex [ 13.028807] [ 13.029666] Possible unsafe locking scenario: [ 13.029666] [ 13.030140] CPU0 CPU1 [ 13.030510] ---- ---- [ 13.030875] lock(bpf_event_mutex); [ 13.031166] lock(&cpuctx_mutex); [ 13.031645] lock(bpf_event_mutex); [ 13.032135] lock(tracepoints_mutex); [ 13.032441] [ 13.032441] *** DEADLOCK *** [ 13.032441] [ 13.032911] 1 lock held by test_progs/246: [ 13.033239] #0: 00000000d663ef86 (bpf_event_mutex){+.+.}, at: bpf_probe_register+0x1d/0x60 [ 13.033909] [ 13.033909] stack backtrace: [ 13.034258] CPU: 1 PID: 246 Comm: test_progs Not tainted 5.0.0-rc3-00018-g2fa53f892422-dirty #477 [ 13.034964] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014 [ 13.035657] Call Trace: [ 13.035859] dump_stack+0x5f/0x8b [ 13.036130] print_circular_bug.isra.37+0x1ce/0x1db [ 13.036526] __lock_acquire+0x1158/0x1350 [ 13.036852] ? lock_acquire+0x98/0x190 [ 13.037154] lock_acquire+0x98/0x190 [ 13.037447] ? tracepoint_probe_register_prio+0x2d/0x300 [ 13.037876] __mutex_lock+0x86/0x970 [ 13.038167] ? tracepoint_probe_register_prio+0x2d/0x300 [ 13.038600] ? tracepoint_probe_register_prio+0x2d/0x300 [ 13.039028] ? __mutex_lock+0x86/0x970 [ 13.039337] ? __mutex_lock+0x24a/0x970 [ 13.039649] ? bpf_probe_register+0x1d/0x60 [ 13.039992] ? __bpf_trace_sched_wake_idle_without_ipi+0x10/0x10 [ 13.040478] ? tracepoint_probe_register_prio+0x2d/0x300 [ 13.040906] tracepoint_probe_register_prio+0x2d/0x300 [ 13.041325] bpf_probe_register+0x40/0x60 [ 13.041649] bpf_raw_tracepoint_open.isra.34+0xa4/0x130 [ 13.042068] ? __might_fault+0x3e/0x90 [ 13.042374] __do_sys_bpf+0x94f/0x1a90 [ 13.042678] do_syscall_64+0x4a/0x180 [ 13.042975] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 13.043382] RIP: 0033:0x7f23b10a07f9 [ 13.045155] RSP: 002b:00007ffdef42fdd8 EFLAGS: 00000202 ORIG_RAX: 0000000000000141 [ 13.045759] RAX: ffffffffffffffda RBX: 00007ffdef42ff70 RCX: 00007f23b10a07f9 [ 13.046326] RDX: 0000000000000070 RSI: 00007ffdef42fe10 RDI: 0000000000000011 [ 13.046893] RBP: 00007ffdef42fdf0 R08: 0000000000000038 R09: 00007ffdef42fe10 [ 13.047462] R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000000 [ 13.048029] R13: 0000000000000016 R14: 00007f23b1db4690 R15: 0000000000000000 Since tracepoints_mutex will be taken in tracepoint_probe_register/unregister() there is no need to take bpf_event_mutex too. bpf_event_mutex is protecting modifications to prog array used in kprobe/perf bpf progs. bpf_raw_tracepoints don't need to take this mutex. Fixes: c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT") Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/trace/bpf_trace.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 8b068adb9da1..f1a86a0d881d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1204,22 +1204,12 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog * int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) { - int err; - - mutex_lock(&bpf_event_mutex); - err = __bpf_probe_register(btp, prog); - mutex_unlock(&bpf_event_mutex); - return err; + return __bpf_probe_register(btp, prog); } int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog) { - int err; - - mutex_lock(&bpf_event_mutex); - err = tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog); - mutex_unlock(&bpf_event_mutex); - return err; + return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog); } int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, -- cgit v1.2.3 From 01e7187b41191376cee8bea8de9f907b001e87b4 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 23 Jan 2019 15:19:18 +0100 Subject: pipe: stop using ->can_merge Al Viro pointed out that since there is only one pipe buffer type to which new data can be appended, it isn't necessary to have a ->can_merge field in struct pipe_buf_operations, we can just check for a magic type. Suggested-by: Al Viro Signed-off-by: Jann Horn Signed-off-by: Al Viro --- kernel/trace/trace.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c521b7347482..f380139e972c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5823,7 +5823,6 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, } static const struct pipe_buf_operations tracing_pipe_buf_ops = { - .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -6843,7 +6842,6 @@ static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, /* Pipe buffer operations for a buffer. */ static const struct pipe_buf_operations buffer_pipe_buf_ops = { - .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = buffer_pipe_buf_release, .steal = generic_pipe_buf_steal, -- cgit v1.2.3 From 9acd8de69d107537a68d010c9149fa9d9aba91f4 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 1 Jan 2019 23:46:10 +0800 Subject: function_graph: Support displaying relative timestamp When function_graph is used for latency tracers, relative timestamp is more straightforward than absolute timestamp as function trace does. This change adds relative timestamp support to function_graph and applies to latency tracers (wakeup and irqsoff). Instead of: # tracer: irqsoff # # irqsoff latency trace v1.1.5 on 5.0.0-rc1-test # -------------------------------------------------------------------- # latency: 521 us, #1125/1125, CPU#2 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:8) # ----------------- # | task: swapper/2-0 (uid:0 nice:0 policy:0 rt_prio:0) # ----------------- # => started at: __schedule # => ended at: _raw_spin_unlock_irq # # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / # TIME CPU TASK/PID |||| DURATION FUNCTION CALLS # | | | | |||| | | | | | | 124.974306 | 2) systemd-693 | d..1 0.000 us | __schedule(); 124.974307 | 2) systemd-693 | d..1 | rcu_note_context_switch() { 124.974308 | 2) systemd-693 | d..1 0.487 us | rcu_preempt_deferred_qs(); 124.974309 | 2) systemd-693 | d..1 0.451 us | rcu_qs(); 124.974310 | 2) systemd-693 | d..1 2.301 us | } [..] 124.974826 | 2) -0 | d..2 | finish_task_switch() { 124.974826 | 2) -0 | d..2 | _raw_spin_unlock_irq() { 124.974827 | 2) -0 | d..2 0.000 us | _raw_spin_unlock_irq(); 124.974828 | 2) -0 | d..2 0.000 us | tracer_hardirqs_on(); -0 2d..2 552us : => __schedule => schedule_idle => do_idle => cpu_startup_entry => start_secondary => secondary_startup_64 Show: # tracer: irqsoff # # irqsoff latency trace v1.1.5 on 5.0.0-rc1-test+ # -------------------------------------------------------------------- # latency: 511 us, #1053/1053, CPU#7 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:8) # ----------------- # | task: swapper/7-0 (uid:0 nice:0 policy:0 rt_prio:0) # ----------------- # => started at: __schedule # => ended at: _raw_spin_unlock_irq # # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / # REL TIME CPU TASK/PID |||| DURATION FUNCTION CALLS # | | | | |||| | | | | | | 0 us | 7) sshd-1704 | d..1 0.000 us | __schedule(); 1 us | 7) sshd-1704 | d..1 | rcu_note_context_switch() { 1 us | 7) sshd-1704 | d..1 0.611 us | rcu_preempt_deferred_qs(); 2 us | 7) sshd-1704 | d..1 0.484 us | rcu_qs(); 3 us | 7) sshd-1704 | d..1 2.599 us | } [..] 509 us | 7) -0 | d..2 | finish_task_switch() { 510 us | 7) -0 | d..2 | _raw_spin_unlock_irq() { 510 us | 7) -0 | d..2 0.000 us | _raw_spin_unlock_irq(); 512 us | 7) -0 | d..2 0.000 us | tracer_hardirqs_on(); -0 7d..2 543us : => __schedule => schedule_idle => do_idle => cpu_startup_entry => start_secondary => secondary_startup_64 Link: http://lkml.kernel.org/r/20190101154614.8887-2-changbin.du@gmail.com Signed-off-by: Changbin Du Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 9 +++++---- kernel/trace/trace_functions_graph.c | 25 +++++++++++++++++++++++++ kernel/trace/trace_irqsoff.c | 2 +- kernel/trace/trace_sched_wakeup.c | 2 +- 4 files changed, 32 insertions(+), 6 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 08900828d282..a34fa5e76abb 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -855,10 +855,11 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) #define TRACE_GRAPH_PRINT_PROC 0x8 #define TRACE_GRAPH_PRINT_DURATION 0x10 #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 -#define TRACE_GRAPH_PRINT_IRQS 0x40 -#define TRACE_GRAPH_PRINT_TAIL 0x80 -#define TRACE_GRAPH_SLEEP_TIME 0x100 -#define TRACE_GRAPH_GRAPH_TIME 0x200 +#define TRACE_GRAPH_PRINT_REL_TIME 0x40 +#define TRACE_GRAPH_PRINT_IRQS 0x80 +#define TRACE_GRAPH_PRINT_TAIL 0x100 +#define TRACE_GRAPH_SLEEP_TIME 0x200 +#define TRACE_GRAPH_GRAPH_TIME 0x400 #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index c2af1560e856..16ebbdd7b22e 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -500,6 +500,17 @@ static void print_graph_abs_time(u64 t, struct trace_seq *s) (unsigned long)t, usecs_rem); } +static void +print_graph_rel_time(struct trace_iterator *iter, struct trace_seq *s) +{ + unsigned long long usecs; + + usecs = iter->ts - iter->trace_buffer->time_start; + do_div(usecs, NSEC_PER_USEC); + + trace_seq_printf(s, "%9llu us | ", usecs); +} + static void print_graph_irq(struct trace_iterator *iter, unsigned long addr, enum trace_type type, int cpu, pid_t pid, u32 flags) @@ -517,6 +528,10 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, if (flags & TRACE_GRAPH_PRINT_ABS_TIME) print_graph_abs_time(iter->ts, s); + /* Relative time */ + if (flags & TRACE_GRAPH_PRINT_REL_TIME) + print_graph_rel_time(iter, s); + /* Cpu */ if (flags & TRACE_GRAPH_PRINT_CPU) print_graph_cpu(s, cpu); @@ -725,6 +740,10 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, if (flags & TRACE_GRAPH_PRINT_ABS_TIME) print_graph_abs_time(iter->ts, s); + /* Relative time */ + if (flags & TRACE_GRAPH_PRINT_REL_TIME) + print_graph_rel_time(iter, s); + /* Cpu */ if (flags & TRACE_GRAPH_PRINT_CPU) print_graph_cpu(s, cpu); @@ -1101,6 +1120,8 @@ static void print_lat_header(struct seq_file *s, u32 flags) if (flags & TRACE_GRAPH_PRINT_ABS_TIME) size += 16; + if (flags & TRACE_GRAPH_PRINT_REL_TIME) + size += 16; if (flags & TRACE_GRAPH_PRINT_CPU) size += 4; if (flags & TRACE_GRAPH_PRINT_PROC) @@ -1125,6 +1146,8 @@ static void __print_graph_headers_flags(struct trace_array *tr, seq_putc(s, '#'); if (flags & TRACE_GRAPH_PRINT_ABS_TIME) seq_puts(s, " TIME "); + if (flags & TRACE_GRAPH_PRINT_REL_TIME) + seq_puts(s, " REL TIME "); if (flags & TRACE_GRAPH_PRINT_CPU) seq_puts(s, " CPU"); if (flags & TRACE_GRAPH_PRINT_PROC) @@ -1139,6 +1162,8 @@ static void __print_graph_headers_flags(struct trace_array *tr, seq_putc(s, '#'); if (flags & TRACE_GRAPH_PRINT_ABS_TIME) seq_puts(s, " | "); + if (flags & TRACE_GRAPH_PRINT_REL_TIME) + seq_puts(s, " | "); if (flags & TRACE_GRAPH_PRINT_CPU) seq_puts(s, " | "); if (flags & TRACE_GRAPH_PRINT_PROC) diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index d3294721f119..e6a62e7c6b69 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -238,7 +238,7 @@ static void irqsoff_trace_close(struct trace_iterator *iter) #define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \ TRACE_GRAPH_PRINT_PROC | \ - TRACE_GRAPH_PRINT_ABS_TIME | \ + TRACE_GRAPH_PRINT_REL_TIME | \ TRACE_GRAPH_PRINT_DURATION) static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 4ea7e6845efb..b6c5fa10347e 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -180,7 +180,7 @@ static void wakeup_trace_close(struct trace_iterator *iter) } #define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC | \ - TRACE_GRAPH_PRINT_ABS_TIME | \ + TRACE_GRAPH_PRINT_REL_TIME | \ TRACE_GRAPH_PRINT_DURATION) static enum print_line_t wakeup_print_line(struct trace_iterator *iter) -- cgit v1.2.3 From 91457c018f15591682d160ae7f0f31f81c4a9f33 Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Mon, 14 Jan 2019 21:30:37 +0100 Subject: tracing: Annotate implicit fall through in parse_probe_arg() There is a plan to build the kernel with -Wimplicit-fallthrough and this place in the code produced a warning (W=1). This commit remove the following warning: kernel/trace/trace_probe.c:302:6: warning: this statement may fall through [-Wimplicit-fallthrough=] Link: http://lkml.kernel.org/r/20190114203039.16535-1-malat@debian.org Signed-off-by: Mathieu Malaterre Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 9962cb5da8ac..89da34b326e3 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -300,6 +300,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, case '+': /* deref memory */ arg++; /* Skip '+', because kstrtol() rejects it. */ + /* fall through */ case '-': tmp = strchr(arg, '('); if (!tmp) -- cgit v1.2.3 From 9399ca21d203f01a39696a6554344b5d6613ba7b Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Mon, 14 Jan 2019 21:30:38 +0100 Subject: tracing: Annotate implicit fall through in predicate_parse() There is a plan to build the kernel with -Wimplicit-fallthrough and this place in the code produced a warning (W=1). This commit remove the following warning: kernel/trace/trace_events_filter.c:494:8: warning: this statement may fall through [-Wimplicit-fallthrough=] Link: http://lkml.kernel.org/r/20190114203039.16535-2-malat@debian.org Signed-off-by: Mathieu Malaterre Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 27821480105e..eb694756c4bb 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -495,6 +495,7 @@ predicate_parse(const char *str, int nr_parens, int nr_preds, ptr++; break; } + /* fall through */ default: parse_error(pe, FILT_ERR_TOO_MANY_PREDS, next - str); -- cgit v1.2.3 From 6c6dbce196c201810348b6cef6fc5ec77d4d0973 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 14 Jan 2019 16:37:53 -0500 Subject: tracing: Add comment to predicate_parse() about "&&" or "||" As the predicat_parse() code is rather complex, commenting subtleties is important. The switch case statement should be commented to describe that it is only looking for two '&' or '|' together, which is why the fall through to an error is after the check. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index eb694756c4bb..f052ecb085e9 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -491,6 +491,7 @@ predicate_parse(const char *str, int nr_parens, int nr_preds, break; case '&': case '|': + /* accepting only "&&" or "||" */ if (next[1] == next[0]) { ptr++; break; -- cgit v1.2.3 From 97f0a3bcdf34f66c2017bad51445eafef1224870 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 1 Jan 2019 23:46:11 +0800 Subject: tracing: Show more info for funcgraph wakeup tracers Add these info fields to funcgraph wakeup tracers: o Show CPU info since the waker could be on a different CPU. o Show function duration and overhead. o Show IRQ markers. Link: http://lkml.kernel.org/r/20190101154614.8887-3-changbin.du@gmail.com Signed-off-by: Changbin Du Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_sched_wakeup.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index b6c5fa10347e..da5b6e012840 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -180,8 +180,11 @@ static void wakeup_trace_close(struct trace_iterator *iter) } #define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC | \ + TRACE_GRAPH_PRINT_CPU | \ TRACE_GRAPH_PRINT_REL_TIME | \ - TRACE_GRAPH_PRINT_DURATION) + TRACE_GRAPH_PRINT_DURATION | \ + TRACE_GRAPH_PRINT_OVERHEAD | \ + TRACE_GRAPH_PRINT_IRQS) static enum print_line_t wakeup_print_line(struct trace_iterator *iter) { -- cgit v1.2.3 From afbab501c66bece057bc30656f71f856cd1b3baa Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 1 Jan 2019 23:46:12 +0800 Subject: tracing: Put a margin between flags and duration for wakeup tracers Don't mix context flags with function duration info. Instead of this: # tracer: wakeup_rt # # wakeup_rt latency trace v1.1.5 on 5.0.0-rc1-test+ # -------------------------------------------------------------------- # latency: 177 us, #545/545, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:8) # ----------------- # | task: migration/0-11 (uid:0 nice:0 policy:1 rt_prio:99) # ----------------- # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / # REL TIME CPU TASK/PID |||| DURATION FUNCTION CALLS # | | | | |||| | | | | | | 0 us | 0) -0 | dNh5 | /* 0:120:R + [000] 11: 0:R migration/0 */ 2 us | 0) -0 | dNh5 0.000 us | (null)(); 4 us | 0) -0 | dNh4 | _raw_spin_unlock() { 4 us | 0) -0 | dNh4 0.304 us | preempt_count_sub(); 5 us | 0) -0 | dNh3 1.063 us | } 5 us | 0) -0 | dNh3 0.266 us | ttwu_stat(); 6 us | 0) -0 | dNh3 | _raw_spin_unlock_irqrestore() { 6 us | 0) -0 | dNh3 0.273 us | preempt_count_sub(); 6 us | 0) -0 | dNh2 0.818 us | } Show this: # tracer: wakeup # # wakeup latency trace v1.1.5 on 4.20.0+ # -------------------------------------------------------------------- # latency: 593 us, #674/674, CPU#0 | (M:desktop VP:0, KP:0, SP:0 HP:0 #P:4) # ----------------- # | task: kworker/0:1H-339 (uid:0 nice:-20 policy:0 rt_prio:0) # ----------------- # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / # REL TIME CPU TASK/PID |||| DURATION FUNCTION CALLS # | | | | |||| | | | | | | 0 us | 0) -0 | dNs. | | /* 0:120:R + [000] 339:100:R kworker/0:1H */ 3 us | 0) -0 | dNs. | 0.000 us | (null)(); 67 us | 0) -0 | dNs. | 0.721 us | ttwu_stat(); 69 us | 0) -0 | dNs. | 0.607 us | _raw_spin_unlock_irqrestore(); 71 us | 0) -0 | .Ns. | 0.598 us | _raw_spin_lock_irq(); 72 us | 0) -0 | .Ns. | 0.584 us | _raw_spin_lock_irq(); 73 us | 0) -0 | dNs. | + 11.118 us | __next_timer_interrupt(); 75 us | 0) -0 | dNs. | | call_timer_fn() { 76 us | 0) -0 | dNs. | | delayed_work_timer_fn() { 76 us | 0) -0 | dNs. | | __queue_work() { ... Link: http://lkml.kernel.org/r/20190101154614.8887-4-changbin.du@gmail.com Signed-off-by: Changbin Du Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_functions_graph.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 16ebbdd7b22e..69ebf3c2f1b5 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -380,6 +380,7 @@ static void print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { trace_seq_putc(s, ' '); trace_print_lat_fmt(s, entry); + trace_seq_puts(s, " | "); } /* If the pid changed since the last trace, output this event */ @@ -1153,7 +1154,7 @@ static void __print_graph_headers_flags(struct trace_array *tr, if (flags & TRACE_GRAPH_PRINT_PROC) seq_puts(s, " TASK/PID "); if (lat) - seq_puts(s, "||||"); + seq_puts(s, "|||| "); if (flags & TRACE_GRAPH_PRINT_DURATION) seq_puts(s, " DURATION "); seq_puts(s, " FUNCTION CALLS\n"); @@ -1169,7 +1170,7 @@ static void __print_graph_headers_flags(struct trace_array *tr, if (flags & TRACE_GRAPH_PRINT_PROC) seq_puts(s, " | | "); if (lat) - seq_puts(s, "||||"); + seq_puts(s, "|||| "); if (flags & TRACE_GRAPH_PRINT_DURATION) seq_puts(s, " | | "); seq_puts(s, " | | | |\n"); -- cgit v1.2.3 From f52d569f3d929d9530d75976baa191d2d50fc51b Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Thu, 17 Jan 2019 00:02:49 +0800 Subject: tracing: Show stacktrace for wakeup tracers This align the behavior of wakeup tracers with irqsoff latency tracer that we record stacktrace at the beginning and end of waking up. The stacktrace shows us what is happening in the kernel. Link: http://lkml.kernel.org/r/20190116160249.7554-1-changbin.du@gmail.com Signed-off-by: Changbin Du Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_sched_wakeup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index da5b6e012840..f4fe7d1781e9 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -475,6 +475,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); + __trace_stack(wakeup_trace, flags, 0, pc); T0 = data->preempt_timestamp; T1 = ftrace_now(cpu); @@ -586,6 +587,7 @@ probe_wakeup(void *ignore, struct task_struct *p) data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu); data->preempt_timestamp = ftrace_now(cpu); tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); + __trace_stack(wakeup_trace, flags, 0, pc); /* * We must be careful in using CALLER_ADDR2. But since wake_up -- cgit v1.2.3 From d325c402964e7c63db94e9138c530832269a1297 Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Fri, 28 Dec 2018 14:38:47 +0100 Subject: ring-buffer: Remove unused function ring_buffer_page_len() Commit 6b7e633fe9c2 ("tracing: Remove extra zeroing out of the ring buffer page") removed the only caller of ring_buffer_page_len(). The function is now unused and may be removed. Link: http://lkml.kernel.org/r/20181228133847.106177-1-mbenes@suse.cz Signed-off-by: Miroslav Benes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 06e864a334bb..9a91479bbbfe 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -353,20 +353,6 @@ static void rb_init_page(struct buffer_data_page *bpage) local_set(&bpage->commit, 0); } -/** - * ring_buffer_page_len - the size of data on the page. - * @page: The page to read - * - * Returns the amount of data on the page, including buffer page header. - */ -size_t ring_buffer_page_len(void *page) -{ - struct buffer_data_page *bpage = page; - - return (local_read(&bpage->commit) & ~RB_MISSED_FLAGS) - + BUF_PAGE_HDR_SIZE; -} - /* * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing * this issue out. -- cgit v1.2.3 From 85acbb21b9310043692cf18b1fd14067b5a25ccd Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Sun, 10 Feb 2019 00:19:19 +0800 Subject: tracing: Change the function format to display function names by perf Here is an example for this change. $ sudo perf record -e 'ftrace:function' --filter='ip==schedule' $ sudo perf report The output of perf before this patch: \# Samples: 100 of event 'ftrace:function' \# Event count (approx.): 100 \# \# Overhead Trace output \# ........ ...................................... \# 51.00% ffffffff81f6aaa0 <-- ffffffff81158e8d 29.00% ffffffff81f6aaa0 <-- ffffffff8116ccb2 8.00% ffffffff81f6aaa0 <-- ffffffff81f6f2ed 4.00% ffffffff81f6aaa0 <-- ffffffff811628db 4.00% ffffffff81f6aaa0 <-- ffffffff81f6ec5b 2.00% ffffffff81f6aaa0 <-- ffffffff81f6f21a 1.00% ffffffff81f6aaa0 <-- ffffffff811b04af 1.00% ffffffff81f6aaa0 <-- ffffffff8143ce17 After this patch: \# Samples: 36 of event 'ftrace:function' \# Event count (approx.): 36 \# \# Overhead Trace output \# ........ ............................................ \# 38.89% schedule <-- schedule_hrtimeout_range_clock 27.78% schedule <-- worker_thread 13.89% schedule <-- schedule_timeout 11.11% schedule <-- smpboot_thread_fn 5.56% schedule <-- rcu_gp_kthread 2.78% schedule <-- exit_to_usermode_loop Link: http://lkml.kernel.org/r/20190209161919.32350-1-changbin.du@gmail.com Signed-off-by: Changbin Du Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_entries.h | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 06bb2fd9a56c..fc8e97328e54 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -65,7 +65,8 @@ FTRACE_ENTRY_REG(function, ftrace_entry, __field( unsigned long, parent_ip ) ), - F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip), + F_printk(" %ps <-- %ps", + (void *)__entry->ip, (void *)__entry->parent_ip), FILTER_TRACE_FN, @@ -83,7 +84,7 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry, __field_desc( int, graph_ent, depth ) ), - F_printk("--> %lx (%d)", __entry->func, __entry->depth), + F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth), FILTER_OTHER ); @@ -102,8 +103,8 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, __field_desc( int, ret, depth ) ), - F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d", - __entry->func, __entry->depth, + F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d", + (void *)__entry->func, __entry->depth, __entry->calltime, __entry->rettime, __entry->depth), @@ -167,12 +168,6 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, #define FTRACE_STACK_ENTRIES 8 -#ifndef CONFIG_64BIT -# define IP_FMT "%08lx" -#else -# define IP_FMT "%016lx" -#endif - FTRACE_ENTRY(kernel_stack, stack_entry, TRACE_STACK, @@ -182,12 +177,13 @@ FTRACE_ENTRY(kernel_stack, stack_entry, __dynamic_array(unsigned long, caller ) ), - F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" - "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" - "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n", - __entry->caller[0], __entry->caller[1], __entry->caller[2], - __entry->caller[3], __entry->caller[4], __entry->caller[5], - __entry->caller[6], __entry->caller[7]), + F_printk("\t=> %ps\n\t=> %ps\n\t=> %ps\n" + "\t=> %ps\n\t=> %ps\n\t=> %ps\n" + "\t=> %ps\n\t=> %ps\n", + (void *)__entry->caller[0], (void *)__entry->caller[1], + (void *)__entry->caller[2], (void *)__entry->caller[3], + (void *)__entry->caller[4], (void *)__entry->caller[5], + (void *)__entry->caller[6], (void *)__entry->caller[7]), FILTER_OTHER ); @@ -201,12 +197,13 @@ FTRACE_ENTRY(user_stack, userstack_entry, __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) ), - F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" - "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" - "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n", - __entry->caller[0], __entry->caller[1], __entry->caller[2], - __entry->caller[3], __entry->caller[4], __entry->caller[5], - __entry->caller[6], __entry->caller[7]), + F_printk("\t=> %ps\n\t=> %ps\n\t=> %ps\n" + "\t=> %ps\n\t=> %ps\n\t=> %ps\n" + "\t=> %ps\n\t=> %ps\n", + (void *)__entry->caller[0], (void *)__entry->caller[1], + (void *)__entry->caller[2], (void *)__entry->caller[3], + (void *)__entry->caller[4], (void *)__entry->caller[5], + (void *)__entry->caller[6], (void *)__entry->caller[7]), FILTER_OTHER ); -- cgit v1.2.3 From f6675872db57305fa957021efc788f9983ed3b67 Mon Sep 17 00:00:00 2001 From: Andreas Ziegler Date: Wed, 6 Feb 2019 20:00:13 +0100 Subject: tracing: probeevent: Correctly update remaining space in dynamic area Commit 9178412ddf5a ("tracing: probeevent: Return consumed bytes of dynamic area") improved the string fetching mechanism by returning the number of required bytes after copying the argument to the dynamic area. However, this return value is now only used to increment the pointer inside the dynamic area but misses updating the 'maxlen' variable which indicates the remaining space in the dynamic area. This means that fetch_store_string() always reads the *total* size of the dynamic area from the data_loc pointer instead of the *remaining* size (and passes it along to strncpy_from_{user,unsafe}) even if we're already about to copy data into the middle of the dynamic area. Link: http://lkml.kernel.org/r/20190206190013.16405-1-andreas.ziegler@fau.de Cc: Ingo Molnar Cc: stable@vger.kernel.org Fixes: 9178412ddf5a ("tracing: probeevent: Return consumed bytes of dynamic area") Acked-by: Masami Hiramatsu Signed-off-by: Andreas Ziegler Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe_tmpl.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index 5c56afc17cf8..4737bb8c07a3 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -180,10 +180,12 @@ store_trace_args(void *data, struct trace_probe *tp, struct pt_regs *regs, if (unlikely(arg->dynamic)) *dl = make_data_loc(maxlen, dyndata - base); ret = process_fetch_insn(arg->code, regs, dl, base); - if (unlikely(ret < 0 && arg->dynamic)) + if (unlikely(ret < 0 && arg->dynamic)) { *dl = make_data_loc(0, dyndata - base); - else + } else { dyndata += ret; + maxlen -= ret; + } } } -- cgit v1.2.3 From eeeb080bae906a57b6513d37efe3c38f2cb87a1c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 13 Feb 2019 01:13:40 +0900 Subject: kprobes: Prohibit probing on hardirq tracers Since kprobes breakpoint handling involves hardirq tracer, probing these functions cause breakpoint recursion problem. Prohibit probing on those functions. Signed-off-by: Masami Hiramatsu Acked-by: Steven Rostedt (VMware) Cc: Alexander Shishkin Cc: Andrea Righi Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/154998802073.31052.17255044712514564153.stgit@devbox Signed-off-by: Ingo Molnar --- kernel/trace/trace_irqsoff.c | 9 +++++++-- kernel/trace/trace_preemptirq.c | 5 +++++ 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index d3294721f119..d42a473b8240 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "trace.h" @@ -365,7 +366,7 @@ out: __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); } -static inline void +static nokprobe_inline void start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) { int cpu; @@ -401,7 +402,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) atomic_dec(&data->disabled); } -static inline void +static nokprobe_inline void stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) { int cpu; @@ -443,6 +444,7 @@ void start_critical_timings(void) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc); } EXPORT_SYMBOL_GPL(start_critical_timings); +NOKPROBE_SYMBOL(start_critical_timings); void stop_critical_timings(void) { @@ -452,6 +454,7 @@ void stop_critical_timings(void) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc); } EXPORT_SYMBOL_GPL(stop_critical_timings); +NOKPROBE_SYMBOL(stop_critical_timings); #ifdef CONFIG_FUNCTION_TRACER static bool function_enabled; @@ -611,6 +614,7 @@ void tracer_hardirqs_on(unsigned long a0, unsigned long a1) if (!preempt_trace(pc) && irq_trace()) stop_critical_timing(a0, a1, pc); } +NOKPROBE_SYMBOL(tracer_hardirqs_on); void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { @@ -619,6 +623,7 @@ void tracer_hardirqs_off(unsigned long a0, unsigned long a1) if (!preempt_trace(pc) && irq_trace()) start_critical_timing(a0, a1, pc); } +NOKPROBE_SYMBOL(tracer_hardirqs_off); static int irqsoff_tracer_init(struct trace_array *tr) { diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index 71f553cceb3c..4d8e99fdbbbe 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "trace.h" #define CREATE_TRACE_POINTS @@ -30,6 +31,7 @@ void trace_hardirqs_on(void) lockdep_hardirqs_on(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on); +NOKPROBE_SYMBOL(trace_hardirqs_on); void trace_hardirqs_off(void) { @@ -43,6 +45,7 @@ void trace_hardirqs_off(void) lockdep_hardirqs_off(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_off); +NOKPROBE_SYMBOL(trace_hardirqs_off); __visible void trace_hardirqs_on_caller(unsigned long caller_addr) { @@ -56,6 +59,7 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr) lockdep_hardirqs_on(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on_caller); +NOKPROBE_SYMBOL(trace_hardirqs_on_caller); __visible void trace_hardirqs_off_caller(unsigned long caller_addr) { @@ -69,6 +73,7 @@ __visible void trace_hardirqs_off_caller(unsigned long caller_addr) lockdep_hardirqs_off(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_off_caller); +NOKPROBE_SYMBOL(trace_hardirqs_off_caller); #endif /* CONFIG_TRACE_IRQFLAGS */ #ifdef CONFIG_TRACE_PREEMPT_TOGGLE -- cgit v1.2.3 From 2c4f1fcbef0bc324830bc2fb1a264c08ec93dec5 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Fri, 25 Jan 2019 23:10:50 +0800 Subject: kprobe: Do not use uaccess functions to access kernel memory that can fault The userspace can ask kprobe to intercept strings at any memory address, including invalid kernel address. In this case, fetch_store_strlen() would crash since it uses general usercopy function, and user access functions are no longer allowed to access kernel memory. For example, we can crash the kernel by doing something as below: $ sudo kprobe 'p:do_sys_open +0(+0(%si)):string' [ 103.620391] BUG: GPF in non-whitelisted uaccess (non-canonical address?) [ 103.622104] general protection fault: 0000 [#1] SMP PTI [ 103.623424] CPU: 10 PID: 1046 Comm: cat Not tainted 5.0.0-rc3-00130-gd73aba1-dirty #96 [ 103.625321] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.0-2-g628b2e6-dirty-20190104_103505-linux 04/01/2014 [ 103.628284] RIP: 0010:process_fetch_insn+0x1ab/0x4b0 [ 103.629518] Code: 10 83 80 28 2e 00 00 01 31 d2 31 ff 48 8b 74 24 28 eb 0c 81 fa ff 0f 00 00 7f 1c 85 c0 75 18 66 66 90 0f ae e8 48 63 ca 89 f8 <8a> 0c 31 66 66 90 83 c2 01 84 c9 75 dc 89 54 24 34 89 44 24 28 48 [ 103.634032] RSP: 0018:ffff88845eb37ce0 EFLAGS: 00010246 [ 103.635312] RAX: 0000000000000000 RBX: ffff888456c4e5a8 RCX: 0000000000000000 [ 103.637057] RDX: 0000000000000000 RSI: 2e646c2f6374652f RDI: 0000000000000000 [ 103.638795] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 [ 103.640556] R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000000 [ 103.642297] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 [ 103.644040] FS: 0000000000000000(0000) GS:ffff88846f000000(0000) knlGS:0000000000000000 [ 103.646019] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 103.647436] CR2: 00007ffc79758038 CR3: 0000000463360006 CR4: 0000000000020ee0 [ 103.649147] Call Trace: [ 103.649781] ? sched_clock_cpu+0xc/0xa0 [ 103.650747] ? do_sys_open+0x5/0x220 [ 103.651635] kprobe_trace_func+0x303/0x380 [ 103.652645] ? do_sys_open+0x5/0x220 [ 103.653528] kprobe_dispatcher+0x45/0x50 [ 103.654682] ? do_sys_open+0x1/0x220 [ 103.655875] kprobe_ftrace_handler+0x90/0xf0 [ 103.657282] ftrace_ops_assist_func+0x54/0xf0 [ 103.658564] ? __call_rcu+0x1dc/0x280 [ 103.659482] 0xffffffffc00000bf [ 103.660384] ? __ia32_sys_open+0x20/0x20 [ 103.661682] ? do_sys_open+0x1/0x220 [ 103.662863] do_sys_open+0x5/0x220 [ 103.663988] do_syscall_64+0x60/0x210 [ 103.665201] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 103.666862] RIP: 0033:0x7fc22fadccdd [ 103.668034] Code: 48 89 54 24 e0 41 83 e2 40 75 32 89 f0 25 00 00 41 00 3d 00 00 41 00 74 24 89 f2 b8 01 01 00 00 48 89 fe bf 9c ff ff ff 0f 05 <48> 3d 00 f0 ff ff 77 33 f3 c3 66 0f 1f 84 00 00 00 00 00 48 8d 44 [ 103.674029] RSP: 002b:00007ffc7972c3a8 EFLAGS: 00000287 ORIG_RAX: 0000000000000101 [ 103.676512] RAX: ffffffffffffffda RBX: 0000562f86147a21 RCX: 00007fc22fadccdd [ 103.678853] RDX: 0000000000080000 RSI: 00007fc22fae1428 RDI: 00000000ffffff9c [ 103.681151] RBP: ffffffffffffffff R08: 0000000000000000 R09: 0000000000000000 [ 103.683489] R10: 0000000000000000 R11: 0000000000000287 R12: 00007fc22fce90a8 [ 103.685774] R13: 0000000000000001 R14: 0000000000000000 R15: 0000000000000000 [ 103.688056] Modules linked in: [ 103.689131] ---[ end trace 43792035c28984a1 ]--- This can be fixed by using probe_mem_read() instead, as it can handle faulting kernel memory addresses, which kprobes can legitimately do. Link: http://lkml.kernel.org/r/20190125151051.7381-1-changbin.du@gmail.com Cc: stable@vger.kernel.org Fixes: 9da3f2b7405 ("x86/fault: BUG() when uaccess helpers fault on kernel addresses") Signed-off-by: Changbin Du Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d5fb09ebba8b..9eaf07f99212 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -861,22 +861,14 @@ static const struct file_operations kprobe_profile_ops = { static nokprobe_inline int fetch_store_strlen(unsigned long addr) { - mm_segment_t old_fs; int ret, len = 0; u8 c; - old_fs = get_fs(); - set_fs(KERNEL_DS); - pagefault_disable(); - do { - ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); + ret = probe_mem_read(&c, (u8 *)addr + len, 1); len++; } while (c && ret == 0 && len < MAX_STRING_SIZE); - pagefault_enable(); - set_fs(old_fs); - return (ret < 0) ? ret : len; } -- cgit v1.2.3 From 9e7382153f80ba45a0bbcd540fb77d4b15f6e966 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 14 Feb 2019 15:29:50 +0000 Subject: tracing: Fix number of entries in trace header The following commit 441dae8f2f29 ("tracing: Add support for display of tgid in trace output") removed the call to print_event_info() from print_func_help_header_irq() which results in the ftrace header not reporting the number of entries written in the buffer. As this wasn't the original intent of the patch, re-introduce the call to print_event_info() to restore the orginal behaviour. Link: http://lkml.kernel.org/r/20190214152950.4179-1-quentin.perret@arm.com Acked-by: Joel Fernandes Cc: stable@vger.kernel.org Fixes: 441dae8f2f29 ("tracing: Add support for display of tgid in trace output") Signed-off-by: Quentin Perret Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c521b7347482..c4238b441624 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3384,6 +3384,8 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file const char tgid_space[] = " "; const char space[] = " "; + print_event_info(buf, m); + seq_printf(m, "# %s _-----=> irqs-off\n", tgid ? tgid_space : space); seq_printf(m, "# %s / _----=> need-resched\n", -- cgit v1.2.3 From f79b3f338564e7674dbe6375bcf685c2ba483efe Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 11 Feb 2019 15:00:48 -0500 Subject: ftrace: Allow enabling of filters via index of available_filter_functions Enabling of large number of functions by echoing in a large subset of the functions in available_filter_functions can take a very long time. The process requires testing all functions registered by the function tracer (which is in the 10s of thousands), and doing a kallsyms lookup to convert the ip address into a name, then comparing that name with the string passed in. When a function causes the function tracer to crash the system, a binary bisect of the available_filter_functions can be done to find the culprit. But this requires passing in half of the functions in available_filter_functions over and over again, which makes it basically a O(n^2) operation. With 40,000 functions, that ends up bing 1,600,000,000 opertions! And enabling this can take over 20 minutes. As a quick speed up, if a number is passed into one of the filter files, instead of doing a search, it just enables the function at the corresponding line of the available_filter_functions file. That is: # echo 50 > set_ftrace_filter # cat set_ftrace_filter x86_pmu_commit_txn # head -50 available_filter_functions | tail -1 x86_pmu_commit_txn This allows setting of half the available_filter_functions to take place in less than a second! # time seq 20000 > set_ftrace_filter real 0m0.042s user 0m0.005s sys 0m0.015s # wc -l set_ftrace_filter 20000 set_ftrace_filter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 30 ++++++++++++++++++++++++++++++ kernel/trace/trace.h | 1 + kernel/trace/trace_events_filter.c | 5 +++++ 3 files changed, 36 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index aac7847c0214..fa79323331b2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3701,6 +3701,31 @@ enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int clear_filter) return ret; } +static int +add_rec_by_index(struct ftrace_hash *hash, struct ftrace_glob *func_g, + int clear_filter) +{ + long index = simple_strtoul(func_g->search, NULL, 0); + struct ftrace_page *pg; + struct dyn_ftrace *rec; + + /* The index starts at 1 */ + if (--index < 0) + return 0; + + do_for_each_ftrace_rec(pg, rec) { + if (pg->index <= index) { + index -= pg->index; + /* this is a double loop, break goes to the next page */ + break; + } + rec = &pg->records[index]; + enter_record(hash, rec, clear_filter); + return 1; + } while_for_each_ftrace_rec(); + return 0; +} + static int ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g, struct ftrace_glob *mod_g, int exclude_mod) @@ -3769,6 +3794,11 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod) if (unlikely(ftrace_disabled)) goto out_unlock; + if (func_g.type == MATCH_INDEX) { + found = add_rec_by_index(hash, &func_g, clear_filter); + goto out_unlock; + } + do_for_each_ftrace_rec(pg, rec) { if (rec->flags & FTRACE_FL_DISABLED) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a34fa5e76abb..ae7df090b93e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1459,6 +1459,7 @@ enum regex_type { MATCH_MIDDLE_ONLY, MATCH_END_ONLY, MATCH_GLOB, + MATCH_INDEX, }; struct regex { diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f052ecb085e9..ade606c33231 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -825,6 +825,9 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not) *search = buff; + if (isdigit(buff[0])) + return MATCH_INDEX; + for (i = 0; i < len; i++) { if (buff[i] == '*') { if (!i) { @@ -862,6 +865,8 @@ static void filter_build_regex(struct filter_pred *pred) } switch (type) { + /* MATCH_INDEX should not happen, but if it does, match full */ + case MATCH_INDEX: case MATCH_FULL: r->match = regex_match_full; break; -- cgit v1.2.3 From e7f0c424d0806b05d6f47be9f202b037eb701707 Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Wed, 13 Feb 2019 20:29:06 +0800 Subject: tracing: Do not free iter->trace in fail path of tracing_open_pipe() Commit d716ff71dd12 ("tracing: Remove taking of trace_types_lock in pipe files") use the current tracer instead of the copy in tracing_open_pipe(), but it forget to remove the freeing sentence in the error path. There's an error path that can call kfree(iter->trace) after the iter->trace was assigned to tr->current_trace, which would be bad to free. Link: http://lkml.kernel.org/r/1550060946-45984-1-git-send-email-yi.zhang@huawei.com Cc: stable@vger.kernel.org Fixes: d716ff71dd12 ("tracing: Remove taking of trace_types_lock in pipe files") Signed-off-by: zhangyi (F) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c521b7347482..b583ff7656bb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5624,7 +5624,6 @@ out: return ret; fail: - kfree(iter->trace); kfree(iter); __trace_array_put(tr); mutex_unlock(&trace_types_lock); -- cgit v1.2.3 From 7d18a10c316783357fb1b2b649cfcf97c70a7bee Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 13 Feb 2019 17:42:41 -0600 Subject: tracing: Refactor hist trigger action code The hist trigger action code currently implements two essentially hard-coded pairs of 'actions' - onmax(), which tracks a variable and saves some event fields when a max is hit, and onmatch(), which is hard-coded to generate a synthetic event. These hardcoded pairs (track max/save fields and detect match/generate synthetic event) should really be decoupled into separate components that can then be arbitrarily combined. The first component of each pair (track max/detect match) is called a 'handler' in the new code, while the second component (save fields/generate synthetic event) is called an 'action' in this scheme. This change refactors the action code to reflect this split by adding two handlers, HANDLER_ONMATCH and HANDLER_ONMAX, along with two actions, ACTION_SAVE and ACTION_TRACE. The new code combines them to produce the existing ONMATCH/TRACE and ONMAX/SAVE functionality, but doesn't implement the other combinations now possible. Future patches will expand these to further useful cases, such as ONMAX/TRACE, as well as add additional handlers and actions such as ONCHANGE and SNAPSHOT. Also, add abbreviated documentation for handlers and actions to README. Link: http://lkml.kernel.org/r/98bfdd48c1b4ff29fc5766442f99f5bc3c34b76b.1550100284.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 407 +++++++++++++++++++++++---------------- 1 file changed, 238 insertions(+), 169 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 449d90cfa151..dfaaad582797 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -313,9 +313,9 @@ struct hist_trigger_data { struct field_var_hist *field_var_hists[SYNTH_FIELDS_MAX]; unsigned int n_field_var_hists; - struct field_var *max_vars[SYNTH_FIELDS_MAX]; - unsigned int n_max_vars; - unsigned int n_max_var_str; + struct field_var *save_vars[SYNTH_FIELDS_MAX]; + unsigned int n_save_vars; + unsigned int n_save_var_str; }; static int synth_event_create(int argc, const char **argv); @@ -383,11 +383,25 @@ struct action_data; typedef void (*action_fn_t) (struct hist_trigger_data *hist_data, struct tracing_map_elt *elt, void *rec, - struct ring_buffer_event *rbe, + struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals); +enum handler_id { + HANDLER_ONMATCH = 1, + HANDLER_ONMAX, +}; + +enum action_id { + ACTION_SAVE = 1, + ACTION_TRACE, +}; + struct action_data { + enum handler_id handler; + enum action_id action; + char *action_name; action_fn_t fn; + unsigned int n_params; char *params[SYNTH_FIELDS_MAX]; @@ -404,13 +418,11 @@ struct action_data { unsigned int var_ref_idx; char *match_event; char *match_event_system; - char *synth_event_name; struct synth_event *synth_event; } onmatch; struct { char *var_str; - char *fn_name; unsigned int max_var_ref_idx; struct hist_field *max_var; struct hist_field *var; @@ -1078,7 +1090,7 @@ static struct synth_event *alloc_synth_event(const char *name, int n_fields, static void action_trace(struct hist_trigger_data *hist_data, struct tracing_map_elt *elt, void *rec, - struct ring_buffer_event *rbe, + struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals) { struct synth_event *event = data->onmatch.synth_event; @@ -1644,7 +1656,7 @@ find_match_var(struct hist_trigger_data *hist_data, char *var_name) for (i = 0; i < hist_data->n_actions; i++) { struct action_data *data = hist_data->actions[i]; - if (data->fn == action_trace) { + if (data->handler == HANDLER_ONMATCH) { char *system = data->onmatch.match_event_system; char *event_name = data->onmatch.match_event; @@ -2076,7 +2088,7 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt) } } - n_str = hist_data->n_field_var_str + hist_data->n_max_var_str; + n_str = hist_data->n_field_var_str + hist_data->n_save_var_str; size = STR_VAR_LEN_MAX; @@ -3050,7 +3062,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, int ret; if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) { - hist_err_event("onmatch: Too many field variables defined: ", + hist_err_event("trace action: Too many field variables defined: ", subsys_name, event_name, field_name); return ERR_PTR(-EINVAL); } @@ -3058,7 +3070,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, file = event_file(tr, subsys_name, event_name); if (IS_ERR(file)) { - hist_err_event("onmatch: Event file not found: ", + hist_err_event("trace action: Event file not found: ", subsys_name, event_name, field_name); ret = PTR_ERR(file); return ERR_PTR(ret); @@ -3072,7 +3084,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, */ hist_data = find_compatible_hist(target_hist_data, file); if (!hist_data) { - hist_err_event("onmatch: Matching event histogram not found: ", + hist_err_event("trace action: Matching event histogram not found: ", subsys_name, event_name, field_name); return ERR_PTR(-EINVAL); } @@ -3134,7 +3146,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, kfree(cmd); kfree(var_hist->cmd); kfree(var_hist); - hist_err_event("onmatch: Couldn't create histogram for field: ", + hist_err_event("trace action: Couldn't create histogram for field: ", subsys_name, event_name, field_name); return ERR_PTR(ret); } @@ -3147,7 +3159,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, if (IS_ERR_OR_NULL(event_var)) { kfree(var_hist->cmd); kfree(var_hist); - hist_err_event("onmatch: Couldn't find synthetic variable: ", + hist_err_event("trace action: Couldn't find synthetic variable: ", subsys_name, event_name, field_name); return ERR_PTR(-EINVAL); } @@ -3230,8 +3242,8 @@ static void update_max_vars(struct hist_trigger_data *hist_data, struct ring_buffer_event *rbe, void *rec) { - __update_field_vars(elt, rbe, rec, hist_data->max_vars, - hist_data->n_max_vars, hist_data->n_field_var_str); + __update_field_vars(elt, rbe, rec, hist_data->save_vars, + hist_data->n_save_vars, hist_data->n_field_var_str); } static struct hist_field *create_var(struct hist_trigger_data *hist_data, @@ -3375,9 +3387,9 @@ static void onmax_print(struct seq_file *m, seq_printf(m, "\n\tmax: %10llu", tracing_map_read_var(elt, max_idx)); - for (i = 0; i < hist_data->n_max_vars; i++) { - struct hist_field *save_val = hist_data->max_vars[i]->val; - struct hist_field *save_var = hist_data->max_vars[i]->var; + for (i = 0; i < hist_data->n_save_vars; i++) { + struct hist_field *save_val = hist_data->save_vars[i]->val; + struct hist_field *save_var = hist_data->save_vars[i]->var; u64 val; save_var_idx = save_var->var.idx; @@ -3394,7 +3406,7 @@ static void onmax_print(struct seq_file *m, static void onmax_save(struct hist_trigger_data *hist_data, struct tracing_map_elt *elt, void *rec, - struct ring_buffer_event *rbe, + struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals) { unsigned int max_idx = data->onmax.max_var->var.idx; @@ -3421,7 +3433,7 @@ static void onmax_destroy(struct action_data *data) destroy_hist_field(data->onmax.var, 0); kfree(data->onmax.var_str); - kfree(data->onmax.fn_name); + kfree(data->action_name); for (i = 0; i < data->n_params; i++) kfree(data->params[i]); @@ -3429,15 +3441,16 @@ static void onmax_destroy(struct action_data *data) kfree(data); } +static int action_create(struct hist_trigger_data *hist_data, + struct action_data *data); + static int onmax_create(struct hist_trigger_data *hist_data, struct action_data *data) { + struct hist_field *var_field, *ref_field, *max_var = NULL; struct trace_event_file *file = hist_data->event_file; - struct hist_field *var_field, *ref_field, *max_var; unsigned int var_ref_idx = hist_data->n_var_refs; - struct field_var *field_var; - char *onmax_var_str, *param; - unsigned int i; + char *onmax_var_str; int ret = 0; onmax_var_str = data->onmax.var_str; @@ -3459,8 +3472,8 @@ static int onmax_create(struct hist_trigger_data *hist_data, data->onmax.var = ref_field; - data->fn = onmax_save; data->onmax.max_var_ref_idx = var_ref_idx; + max_var = create_var(hist_data, file, "max", sizeof(u64), "u64"); if (IS_ERR(max_var)) { hist_err("onmax: Couldn't create onmax variable: ", "max"); @@ -3469,27 +3482,7 @@ static int onmax_create(struct hist_trigger_data *hist_data, } data->onmax.max_var = max_var; - for (i = 0; i < data->n_params; i++) { - param = kstrdup(data->params[i], GFP_KERNEL); - if (!param) { - ret = -ENOMEM; - goto out; - } - - field_var = create_target_field_var(hist_data, NULL, NULL, param); - if (IS_ERR(field_var)) { - hist_err("onmax: Couldn't create field variable: ", param); - ret = PTR_ERR(field_var); - kfree(param); - goto out; - } - - hist_data->max_vars[hist_data->n_max_vars++] = field_var; - if (field_var->val->flags & HIST_FIELD_FL_STRING) - hist_data->n_max_var_str++; - - kfree(param); - } + ret = action_create(hist_data, data); out: return ret; } @@ -3500,11 +3493,14 @@ static int parse_action_params(char *params, struct action_data *data) int ret = 0; while (params) { - if (data->n_params >= SYNTH_FIELDS_MAX) + if (data->n_params >= SYNTH_FIELDS_MAX) { + hist_err("Too many action params", ""); goto out; + } param = strsep(¶ms, ","); if (!param) { + hist_err("No action param found", ""); ret = -EINVAL; goto out; } @@ -3528,10 +3524,71 @@ static int parse_action_params(char *params, struct action_data *data) return ret; } -static struct action_data *onmax_parse(char *str) +static int action_parse(char *str, struct action_data *data, + enum handler_id handler) +{ + char *action_name; + int ret = 0; + + strsep(&str, "."); + if (!str) { + hist_err("action parsing: No action found", ""); + ret = -EINVAL; + goto out; + } + + action_name = strsep(&str, "("); + if (!action_name || !str) { + hist_err("action parsing: No action found", ""); + ret = -EINVAL; + goto out; + } + + if (str_has_prefix(action_name, "save")) { + char *params = strsep(&str, ")"); + + if (!params) { + hist_err("action parsing: No params found for %s", "save"); + ret = -EINVAL; + goto out; + } + + ret = parse_action_params(params, data); + if (ret) + goto out; + + if (handler == HANDLER_ONMAX) + data->fn = onmax_save; + + data->action = ACTION_SAVE; + } else { + char *params = strsep(&str, ")"); + + if (params) { + ret = parse_action_params(params, data); + if (ret) + goto out; + } + + data->fn = action_trace; + data->action = ACTION_TRACE; + } + + data->action_name = kstrdup(action_name, GFP_KERNEL); + if (!data->action_name) { + ret = -ENOMEM; + goto out; + } + + data->handler = handler; + out: + return ret; +} + +static struct action_data *onmax_parse(char *str, enum handler_id handler) { - char *onmax_fn_name, *onmax_var_str; struct action_data *data; + char *onmax_var_str; int ret = -EINVAL; data = kzalloc(sizeof(*data), GFP_KERNEL); @@ -3550,33 +3607,9 @@ static struct action_data *onmax_parse(char *str) goto free; } - strsep(&str, "."); - if (!str) - goto free; - - onmax_fn_name = strsep(&str, "("); - if (!onmax_fn_name || !str) - goto free; - - if (str_has_prefix(onmax_fn_name, "save")) { - char *params = strsep(&str, ")"); - - if (!params) { - ret = -EINVAL; - goto free; - } - - ret = parse_action_params(params, data); - if (ret) - goto free; - } else - goto free; - - data->onmax.fn_name = kstrdup(onmax_fn_name, GFP_KERNEL); - if (!data->onmax.fn_name) { - ret = -ENOMEM; + ret = action_parse(str, data, handler); + if (ret) goto free; - } out: return data; free: @@ -3593,7 +3626,7 @@ static void onmatch_destroy(struct action_data *data) kfree(data->onmatch.match_event); kfree(data->onmatch.match_event_system); - kfree(data->onmatch.synth_event_name); + kfree(data->action_name); for (i = 0; i < data->n_params; i++) kfree(data->params[i]); @@ -3651,8 +3684,9 @@ static int check_synth_field(struct synth_event *event, } static struct hist_field * -onmatch_find_var(struct hist_trigger_data *hist_data, struct action_data *data, - char *system, char *event, char *var) +trace_action_find_var(struct hist_trigger_data *hist_data, + struct action_data *data, + char *system, char *event, char *var) { struct hist_field *hist_field; @@ -3660,7 +3694,7 @@ onmatch_find_var(struct hist_trigger_data *hist_data, struct action_data *data, hist_field = find_target_event_var(hist_data, system, event, var); if (!hist_field) { - if (!system) { + if (!system && data->handler == HANDLER_ONMATCH) { system = data->onmatch.match_event_system; event = data->onmatch.match_event; } @@ -3669,15 +3703,15 @@ onmatch_find_var(struct hist_trigger_data *hist_data, struct action_data *data, } if (!hist_field) - hist_err_event("onmatch: Couldn't find onmatch param: $", system, event, var); + hist_err_event("trace action: Couldn't find param: $", system, event, var); return hist_field; } static struct hist_field * -onmatch_create_field_var(struct hist_trigger_data *hist_data, - struct action_data *data, char *system, - char *event, char *var) +trace_action_create_field_var(struct hist_trigger_data *hist_data, + struct action_data *data, char *system, + char *event, char *var) { struct hist_field *hist_field = NULL; struct field_var *field_var; @@ -3700,7 +3734,7 @@ onmatch_create_field_var(struct hist_trigger_data *hist_data, * looking for fields on the onmatch(system.event.xxx) * event. */ - if (!system) { + if (!system && data->handler == HANDLER_ONMATCH) { system = data->onmatch.match_event_system; event = data->onmatch.match_event; } @@ -3724,9 +3758,8 @@ onmatch_create_field_var(struct hist_trigger_data *hist_data, goto out; } -static int onmatch_create(struct hist_trigger_data *hist_data, - struct trace_event_file *file, - struct action_data *data) +static int trace_action_create(struct hist_trigger_data *hist_data, + struct action_data *data) { char *event_name, *param, *system = NULL; struct hist_field *hist_field, *var_ref; @@ -3737,11 +3770,12 @@ static int onmatch_create(struct hist_trigger_data *hist_data, lockdep_assert_held(&event_mutex); - event = find_synth_event(data->onmatch.synth_event_name); + event = find_synth_event(data->action_name); if (!event) { - hist_err("onmatch: Couldn't find synthetic event: ", data->onmatch.synth_event_name); + hist_err("trace action: Couldn't find synthetic event: ", data->action_name); return -EINVAL; } + event->ref++; var_ref_idx = hist_data->n_var_refs; @@ -3769,13 +3803,15 @@ static int onmatch_create(struct hist_trigger_data *hist_data, } if (param[0] == '$') - hist_field = onmatch_find_var(hist_data, data, system, - event_name, param); + hist_field = trace_action_find_var(hist_data, data, + system, event_name, + param); else - hist_field = onmatch_create_field_var(hist_data, data, - system, - event_name, - param); + hist_field = trace_action_create_field_var(hist_data, + data, + system, + event_name, + param); if (!hist_field) { kfree(p); @@ -3797,7 +3833,7 @@ static int onmatch_create(struct hist_trigger_data *hist_data, continue; } - hist_err_event("onmatch: Param type doesn't match synthetic event field type: ", + hist_err_event("trace action: Param type doesn't match synthetic event field type: ", system, event_name, param); kfree(p); ret = -EINVAL; @@ -3805,12 +3841,11 @@ static int onmatch_create(struct hist_trigger_data *hist_data, } if (field_pos != event->n_fields) { - hist_err("onmatch: Param count doesn't match synthetic event field count: ", event->name); + hist_err("trace action: Param count doesn't match synthetic event field count: ", event->name); ret = -EINVAL; goto err; } - data->fn = action_trace; data->onmatch.synth_event = event; data->onmatch.var_ref_idx = var_ref_idx; out: @@ -3821,10 +3856,58 @@ static int onmatch_create(struct hist_trigger_data *hist_data, goto out; } +static int action_create(struct hist_trigger_data *hist_data, + struct action_data *data) +{ + struct field_var *field_var; + unsigned int i; + char *param; + int ret = 0; + + if (data->action == ACTION_TRACE) + return trace_action_create(hist_data, data); + + if (data->action == ACTION_SAVE) { + if (hist_data->n_save_vars) { + ret = -EEXIST; + hist_err("save action: Can't have more than one save() action per hist", ""); + goto out; + } + + for (i = 0; i < data->n_params; i++) { + param = kstrdup(data->params[i], GFP_KERNEL); + if (!param) { + ret = -ENOMEM; + goto out; + } + + field_var = create_target_field_var(hist_data, NULL, NULL, param); + if (IS_ERR(field_var)) { + hist_err("save action: Couldn't create field variable: ", param); + ret = PTR_ERR(field_var); + kfree(param); + goto out; + } + + hist_data->save_vars[hist_data->n_save_vars++] = field_var; + if (field_var->val->flags & HIST_FIELD_FL_STRING) + hist_data->n_save_var_str++; + kfree(param); + } + } + out: + return ret; +} + +static int onmatch_create(struct hist_trigger_data *hist_data, + struct action_data *data) +{ + return action_create(hist_data, data); +} + static struct action_data *onmatch_parse(struct trace_array *tr, char *str) { char *match_event, *match_event_system; - char *synth_event_name, *params; struct action_data *data; int ret = -EINVAL; @@ -3862,31 +3945,7 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str) goto free; } - strsep(&str, "."); - if (!str) { - hist_err("onmatch: Missing . after onmatch(): ", str); - goto free; - } - - synth_event_name = strsep(&str, "("); - if (!synth_event_name || !str) { - hist_err("onmatch: Missing opening paramlist paren: ", synth_event_name); - goto free; - } - - data->onmatch.synth_event_name = kstrdup(synth_event_name, GFP_KERNEL); - if (!data->onmatch.synth_event_name) { - ret = -ENOMEM; - goto free; - } - - params = strsep(&str, ")"); - if (!params || !str || (str && strlen(str))) { - hist_err("onmatch: Missing closing paramlist paren: ", params); - goto free; - } - - ret = parse_action_params(params, data); + ret = action_parse(str, data, HANDLER_ONMATCH); if (ret) goto free; out: @@ -4326,9 +4385,9 @@ static void destroy_actions(struct hist_trigger_data *hist_data) for (i = 0; i < hist_data->n_actions; i++) { struct action_data *data = hist_data->actions[i]; - if (data->fn == action_trace) + if (data->handler == HANDLER_ONMATCH) onmatch_destroy(data); - else if (data->fn == onmax_save) + else if (data->handler == HANDLER_ONMAX) onmax_destroy(data); else kfree(data); @@ -4355,16 +4414,14 @@ static int parse_actions(struct hist_trigger_data *hist_data) ret = PTR_ERR(data); break; } - data->fn = action_trace; } else if ((len = str_has_prefix(str, "onmax("))) { char *action_str = str + len; - data = onmax_parse(action_str); + data = onmax_parse(action_str, HANDLER_ONMAX); if (IS_ERR(data)) { ret = PTR_ERR(data); break; } - data->fn = onmax_save; } else { ret = -EINVAL; break; @@ -4376,8 +4433,7 @@ static int parse_actions(struct hist_trigger_data *hist_data) return ret; } -static int create_actions(struct hist_trigger_data *hist_data, - struct trace_event_file *file) +static int create_actions(struct hist_trigger_data *hist_data) { struct action_data *data; unsigned int i; @@ -4386,14 +4442,17 @@ static int create_actions(struct hist_trigger_data *hist_data, for (i = 0; i < hist_data->attrs->n_actions; i++) { data = hist_data->actions[i]; - if (data->fn == action_trace) { - ret = onmatch_create(hist_data, file, data); + if (data->handler == HANDLER_ONMATCH) { + ret = onmatch_create(hist_data, data); if (ret) - return ret; - } else if (data->fn == onmax_save) { + break; + } else if (data->handler == HANDLER_ONMAX) { ret = onmax_create(hist_data, data); if (ret) - return ret; + break; + } else { + ret = -EINVAL; + break; } } @@ -4409,26 +4468,42 @@ static void print_actions(struct seq_file *m, for (i = 0; i < hist_data->n_actions; i++) { struct action_data *data = hist_data->actions[i]; - if (data->fn == onmax_save) + if (data->handler == HANDLER_ONMAX) onmax_print(m, hist_data, elt, data); } } +static void print_action_spec(struct seq_file *m, + struct hist_trigger_data *hist_data, + struct action_data *data) +{ + unsigned int i; + + if (data->action == ACTION_SAVE) { + for (i = 0; i < hist_data->n_save_vars; i++) { + seq_printf(m, "%s", hist_data->save_vars[i]->var->var.name); + if (i < hist_data->n_save_vars - 1) + seq_puts(m, ","); + } + } else if (data->action == ACTION_TRACE) { + for (i = 0; i < data->n_params; i++) { + if (i) + seq_puts(m, ","); + seq_printf(m, "%s", data->params[i]); + } + } +} + static void print_onmax_spec(struct seq_file *m, struct hist_trigger_data *hist_data, struct action_data *data) { - unsigned int i; - seq_puts(m, ":onmax("); seq_printf(m, "%s", data->onmax.var_str); - seq_printf(m, ").%s(", data->onmax.fn_name); + seq_printf(m, ").%s(", data->action_name); + + print_action_spec(m, hist_data, data); - for (i = 0; i < hist_data->n_max_vars; i++) { - seq_printf(m, "%s", hist_data->max_vars[i]->var->var.name); - if (i < hist_data->n_max_vars - 1) - seq_puts(m, ","); - } seq_puts(m, ")"); } @@ -4436,18 +4511,12 @@ static void print_onmatch_spec(struct seq_file *m, struct hist_trigger_data *hist_data, struct action_data *data) { - unsigned int i; - seq_printf(m, ":onmatch(%s.%s).", data->onmatch.match_event_system, data->onmatch.match_event); - seq_printf(m, "%s(", data->onmatch.synth_event->name); + seq_printf(m, "%s(", data->action_name); - for (i = 0; i < data->n_params; i++) { - if (i) - seq_puts(m, ","); - seq_printf(m, "%s", data->params[i]); - } + print_action_spec(m, hist_data, data); seq_puts(m, ")"); } @@ -4464,7 +4533,9 @@ static bool actions_match(struct hist_trigger_data *hist_data, struct action_data *data = hist_data->actions[i]; struct action_data *data_test = hist_data_test->actions[i]; - if (data->fn != data_test->fn) + if (data->handler != data_test->handler) + return false; + if (data->action != data_test->action) return false; if (data->n_params != data_test->n_params) @@ -4475,23 +4546,20 @@ static bool actions_match(struct hist_trigger_data *hist_data, return false; } - if (data->fn == action_trace) { - if (strcmp(data->onmatch.synth_event_name, - data_test->onmatch.synth_event_name) != 0) - return false; + if (strcmp(data->action_name, data_test->action_name) != 0) + return false; + + if (data->handler == HANDLER_ONMATCH) { if (strcmp(data->onmatch.match_event_system, data_test->onmatch.match_event_system) != 0) return false; if (strcmp(data->onmatch.match_event, data_test->onmatch.match_event) != 0) return false; - } else if (data->fn == onmax_save) { + } else if (data->handler == HANDLER_ONMAX) { if (strcmp(data->onmax.var_str, data_test->onmax.var_str) != 0) return false; - if (strcmp(data->onmax.fn_name, - data_test->onmax.fn_name) != 0) - return false; } } @@ -4507,9 +4575,9 @@ static void print_actions_spec(struct seq_file *m, for (i = 0; i < hist_data->n_actions; i++) { struct action_data *data = hist_data->actions[i]; - if (data->fn == action_trace) + if (data->handler == HANDLER_ONMATCH) print_onmatch_spec(m, hist_data, data); - else if (data->fn == onmax_save) + else if (data->handler == HANDLER_ONMAX) print_onmax_spec(m, hist_data, data); } } @@ -4703,14 +4771,15 @@ static inline void add_to_key(char *compound_key, void *key, static void hist_trigger_actions(struct hist_trigger_data *hist_data, struct tracing_map_elt *elt, void *rec, - struct ring_buffer_event *rbe, u64 *var_ref_vals) + struct ring_buffer_event *rbe, void *key, + u64 *var_ref_vals) { struct action_data *data; unsigned int i; for (i = 0; i < hist_data->n_actions; i++) { data = hist_data->actions[i]; - data->fn(hist_data, elt, rec, rbe, data, var_ref_vals); + data->fn(hist_data, elt, rec, rbe, key, data, var_ref_vals); } } @@ -4771,7 +4840,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec, hist_trigger_elt_update(hist_data, elt, rec, rbe, var_ref_vals); if (resolve_var_refs(hist_data, key, var_ref_vals, true)) - hist_trigger_actions(hist_data, elt, rec, rbe, var_ref_vals); + hist_trigger_actions(hist_data, elt, rec, rbe, key, var_ref_vals); } static void hist_trigger_stacktrace_print(struct seq_file *m, @@ -5683,7 +5752,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, if (has_hist_vars(hist_data)) save_hist_vars(hist_data); - ret = create_actions(hist_data, file); + ret = create_actions(hist_data); if (ret) goto out_unreg; -- cgit v1.2.3 From c3e49506a0f426a850675e39419879214060ca8b Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 13 Feb 2019 17:42:43 -0600 Subject: tracing: Split up onmatch action data Currently, the onmatch action data binds the onmatch action to data related to synthetic event generation. Since we want to allow the onmatch handler to potentially invoke a different action, and because we expect other handlers to generate synthetic events, we need to separate the data related to these two functions. Also rename the onmatch data to something more descriptive, and create and use common action data destroy function. Link: http://lkml.kernel.org/r/b9abbf9aae69fe3920cdc8ddbcaad544dd258d78.1550100284.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 12 ++++- kernel/trace/trace_events_hist.c | 103 ++++++++++++++++++++------------------- 2 files changed, 63 insertions(+), 52 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b583ff7656bb..b477926ac3bc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4700,6 +4700,7 @@ static const char readme_msg[] = "\t [:size=#entries]\n" "\t [:pause][:continue][:clear]\n" "\t [:name=histname1]\n" + "\t [:.]\n" "\t [if ]\n\n" "\t When a matching event is hit, an entry is added to a hash\n" "\t table using the key(s) and value(s) named, and the value of a\n" @@ -4741,7 +4742,16 @@ static const char readme_msg[] = "\t The enable_hist and disable_hist triggers can be used to\n" "\t have one event conditionally start and stop another event's\n" "\t already-attached hist trigger. The syntax is analagous to\n" - "\t the enable_event and disable_event triggers.\n" + "\t the enable_event and disable_event triggers.\n\n" + "\t Hist trigger handlers and actions are executed whenever a\n" + "\t a histogram entry is added or updated. They take the form:\n\n" + "\t .\n\n" + "\t The available handlers are:\n\n" + "\t onmatch(matching.event) - invoke on addition or update\n" + "\t onmax(var) - invoke if var exceeds current max\n\n" + "\t The available actions are:\n\n" + "\t (param list) - generate synthetic event\n" + "\t save(field,...) - save current event fields\n" #endif ; diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index dfaaad582797..0b843ecef547 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -405,21 +405,22 @@ struct action_data { unsigned int n_params; char *params[SYNTH_FIELDS_MAX]; + /* + * When a histogram trigger is hit, the values of any + * references to variables, including variables being passed + * as parameters to synthetic events, are collected into a + * var_ref_vals array. This var_ref_idx is the index of the + * first param in the array to be passed to the synthetic + * event invocation. + */ + unsigned int var_ref_idx; + struct synth_event *synth_event; + union { struct { - /* - * When a histogram trigger is hit, the values of any - * references to variables, including variables being passed - * as parameters to synthetic events, are collected into a - * var_ref_vals array. This var_ref_idx is the index of the - * first param in the array to be passed to the synthetic - * event invocation. - */ - unsigned int var_ref_idx; - char *match_event; - char *match_event_system; - struct synth_event *synth_event; - } onmatch; + char *event; + char *event_system; + } match_data; struct { char *var_str; @@ -1093,9 +1094,9 @@ static void action_trace(struct hist_trigger_data *hist_data, struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals) { - struct synth_event *event = data->onmatch.synth_event; + struct synth_event *event = data->synth_event; - trace_synth(event, var_ref_vals, data->onmatch.var_ref_idx); + trace_synth(event, var_ref_vals, data->var_ref_idx); } struct hist_var_data { @@ -1657,8 +1658,8 @@ find_match_var(struct hist_trigger_data *hist_data, char *var_name) struct action_data *data = hist_data->actions[i]; if (data->handler == HANDLER_ONMATCH) { - char *system = data->onmatch.match_event_system; - char *event_name = data->onmatch.match_event; + char *system = data->match_data.event_system; + char *event_name = data->match_data.event; file = find_var_file(tr, system, event_name, var_name); if (!file) @@ -3425,22 +3426,33 @@ static void onmax_save(struct hist_trigger_data *hist_data, update_max_vars(hist_data, elt, rbe, rec); } -static void onmax_destroy(struct action_data *data) +static void action_data_destroy(struct action_data *data) { unsigned int i; - destroy_hist_field(data->onmax.max_var, 0); - destroy_hist_field(data->onmax.var, 0); + lockdep_assert_held(&event_mutex); - kfree(data->onmax.var_str); kfree(data->action_name); for (i = 0; i < data->n_params; i++) kfree(data->params[i]); + if (data->synth_event) + data->synth_event->ref--; + kfree(data); } +static void onmax_destroy(struct action_data *data) +{ + destroy_hist_field(data->onmax.max_var, 0); + destroy_hist_field(data->onmax.var, 0); + + kfree(data->onmax.var_str); + + action_data_destroy(data); +} + static int action_create(struct hist_trigger_data *hist_data, struct action_data *data); @@ -3620,21 +3632,10 @@ static struct action_data *onmax_parse(char *str, enum handler_id handler) static void onmatch_destroy(struct action_data *data) { - unsigned int i; - - lockdep_assert_held(&event_mutex); + kfree(data->match_data.event); + kfree(data->match_data.event_system); - kfree(data->onmatch.match_event); - kfree(data->onmatch.match_event_system); - kfree(data->action_name); - - for (i = 0; i < data->n_params; i++) - kfree(data->params[i]); - - if (data->onmatch.synth_event) - data->onmatch.synth_event->ref--; - - kfree(data); + action_data_destroy(data); } static void destroy_field_var(struct field_var *field_var) @@ -3695,8 +3696,8 @@ trace_action_find_var(struct hist_trigger_data *hist_data, hist_field = find_target_event_var(hist_data, system, event, var); if (!hist_field) { if (!system && data->handler == HANDLER_ONMATCH) { - system = data->onmatch.match_event_system; - event = data->onmatch.match_event; + system = data->match_data.event_system; + event = data->match_data.event; } hist_field = find_event_var(hist_data, system, event, var); @@ -3735,8 +3736,8 @@ trace_action_create_field_var(struct hist_trigger_data *hist_data, * event. */ if (!system && data->handler == HANDLER_ONMATCH) { - system = data->onmatch.match_event_system; - event = data->onmatch.match_event; + system = data->match_data.event_system; + event = data->match_data.event; } /* @@ -3846,8 +3847,8 @@ static int trace_action_create(struct hist_trigger_data *hist_data, goto err; } - data->onmatch.synth_event = event; - data->onmatch.var_ref_idx = var_ref_idx; + data->synth_event = event; + data->var_ref_idx = var_ref_idx; out: return ret; err: @@ -3933,14 +3934,14 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str) goto free; } - data->onmatch.match_event = kstrdup(match_event, GFP_KERNEL); - if (!data->onmatch.match_event) { + data->match_data.event = kstrdup(match_event, GFP_KERNEL); + if (!data->match_data.event) { ret = -ENOMEM; goto free; } - data->onmatch.match_event_system = kstrdup(match_event_system, GFP_KERNEL); - if (!data->onmatch.match_event_system) { + data->match_data.event_system = kstrdup(match_event_system, GFP_KERNEL); + if (!data->match_data.event_system) { ret = -ENOMEM; goto free; } @@ -4511,8 +4512,8 @@ static void print_onmatch_spec(struct seq_file *m, struct hist_trigger_data *hist_data, struct action_data *data) { - seq_printf(m, ":onmatch(%s.%s).", data->onmatch.match_event_system, - data->onmatch.match_event); + seq_printf(m, ":onmatch(%s.%s).", data->match_data.event_system, + data->match_data.event); seq_printf(m, "%s(", data->action_name); @@ -4550,11 +4551,11 @@ static bool actions_match(struct hist_trigger_data *hist_data, return false; if (data->handler == HANDLER_ONMATCH) { - if (strcmp(data->onmatch.match_event_system, - data_test->onmatch.match_event_system) != 0) + if (strcmp(data->match_data.event_system, + data_test->match_data.event_system) != 0) return false; - if (strcmp(data->onmatch.match_event, - data_test->onmatch.match_event) != 0) + if (strcmp(data->match_data.event, + data_test->match_data.event) != 0) return false; } else if (data->handler == HANDLER_ONMAX) { if (strcmp(data->onmax.var_str, -- cgit v1.2.3 From 466f4528fbc692ea56deca278fa6aeb79e6e8b21 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 13 Feb 2019 17:42:44 -0600 Subject: tracing: Generalize hist trigger onmax and save action The action refactor code allowed actions and handlers to be separated, but the existing onmax handler and save action code is still not flexible enough to handle arbitrary coupling. This change generalizes them and in the process makes additional handlers and actions easier to implement. The onmax action can be broken up and thought of as two separate components - a variable to be tracked (the parameter given to the onmax($var_to_track) function) and an invisible variable created to save the ongoing result of doing something with that variable, such as saving the max value of that variable so far seen. Separating it out like this and renaming it appropriately allows us to use the same code for similar tracking functions such as onchange($var_to_track), which would just track the last value seen rather than the max seen so far, which is useful in some situations. Additionally, because different handlers and actions may want to save and access data differently e.g. save and retrieve tracking values as local variables vs something more global, save_val() and get_val() interface functions are introduced and max-specific implementations are used instead. The same goes for the code that checks whether a maximum has been hit - a generic check_val() interface and max-checking implementation is used instead, which allows future patches to make use of he same code using their own implemetations of similar functionality. Link: http://lkml.kernel.org/r/980ea73dd8e3f36db3d646f99652f8fed42b77d4.1550100284.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 236 ++++++++++++++++++++++++++------------- 1 file changed, 160 insertions(+), 76 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 0b843ecef547..0515229e5f95 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -386,6 +386,8 @@ typedef void (*action_fn_t) (struct hist_trigger_data *hist_data, struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals); +typedef bool (*check_track_val_fn_t) (u64 track_val, u64 var_val); + enum handler_id { HANDLER_ONMATCH = 1, HANDLER_ONMAX, @@ -423,15 +425,35 @@ struct action_data { } match_data; struct { + /* + * var_str contains the $-unstripped variable + * name referenced by var_ref, and used when + * printing the action. Because var_ref + * creation is deferred to create_actions(), + * we need a per-action way to save it until + * then, thus var_str. + */ char *var_str; - unsigned int max_var_ref_idx; - struct hist_field *max_var; - struct hist_field *var; - } onmax; + + /* + * var_ref refers to the variable being + * tracked e.g onmax($var). + */ + struct hist_field *var_ref; + + /* + * track_var contains the 'invisible' tracking + * variable created to keep the current + * e.g. max value. + */ + struct hist_field *track_var; + + check_track_val_fn_t check_val; + action_fn_t save_data; + } track_data; }; }; - static char last_hist_cmd[MAX_FILTER_STR_VAL]; static char hist_err_str[MAX_FILTER_STR_VAL]; @@ -3238,10 +3260,10 @@ static void update_field_vars(struct hist_trigger_data *hist_data, hist_data->n_field_vars, 0); } -static void update_max_vars(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, - struct ring_buffer_event *rbe, - void *rec) +static void save_track_data_vars(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, void *key, + struct action_data *data, u64 *var_ref_vals) { __update_field_vars(elt, rbe, rec, hist_data->save_vars, hist_data->n_save_vars, hist_data->n_field_var_str); @@ -3379,14 +3401,67 @@ create_target_field_var(struct hist_trigger_data *target_hist_data, return create_field_var(target_hist_data, file, var_name); } -static void onmax_print(struct seq_file *m, - struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, - struct action_data *data) +static bool check_track_val_max(u64 track_val, u64 var_val) +{ + if (var_val <= track_val) + return false; + + return true; +} + +static u64 get_track_val(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct action_data *data) { - unsigned int i, save_var_idx, max_idx = data->onmax.max_var->var.idx; + unsigned int track_var_idx = data->track_data.track_var->var.idx; + u64 track_val; + + track_val = tracing_map_read_var(elt, track_var_idx); - seq_printf(m, "\n\tmax: %10llu", tracing_map_read_var(elt, max_idx)); + return track_val; +} + +static void save_track_val(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct action_data *data, u64 var_val) +{ + unsigned int track_var_idx = data->track_data.track_var->var.idx; + + tracing_map_set_var(elt, track_var_idx, var_val); +} + +static void save_track_data(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, void *key, + struct action_data *data, u64 *var_ref_vals) +{ + if (data->track_data.save_data) + data->track_data.save_data(hist_data, elt, rec, rbe, key, data, var_ref_vals); +} + +static bool check_track_val(struct tracing_map_elt *elt, + struct action_data *data, + u64 var_val) +{ + struct hist_trigger_data *hist_data; + u64 track_val; + + hist_data = data->track_data.track_var->hist_data; + track_val = get_track_val(hist_data, elt, data); + + return data->track_data.check_val(track_val, var_val); +} + +static void track_data_print(struct seq_file *m, + struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct action_data *data) +{ + u64 track_val = get_track_val(hist_data, elt, data); + unsigned int i, save_var_idx; + + if (data->handler == HANDLER_ONMAX) + seq_printf(m, "\n\tmax: %10llu", track_val); for (i = 0; i < hist_data->n_save_vars; i++) { struct hist_field *save_val = hist_data->save_vars[i]->val; @@ -3405,25 +3480,17 @@ static void onmax_print(struct seq_file *m, } } -static void onmax_save(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, - struct ring_buffer_event *rbe, void *key, - struct action_data *data, u64 *var_ref_vals) +static void ontrack_action(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, void *key, + struct action_data *data, u64 *var_ref_vals) { - unsigned int max_idx = data->onmax.max_var->var.idx; - unsigned int max_var_ref_idx = data->onmax.max_var_ref_idx; + u64 var_val = var_ref_vals[data->track_data.var_ref->var_ref_idx]; - u64 var_val, max_val; - - var_val = var_ref_vals[max_var_ref_idx]; - max_val = tracing_map_read_var(elt, max_idx); - - if (var_val <= max_val) - return; - - tracing_map_set_var(elt, max_idx, var_val); - - update_max_vars(hist_data, elt, rbe, rec); + if (check_track_val(elt, data, var_val)) { + save_track_val(hist_data, elt, data, var_val); + save_track_data(hist_data, elt, rec, rbe, key, data, var_ref_vals); + } } static void action_data_destroy(struct action_data *data) @@ -3443,12 +3510,13 @@ static void action_data_destroy(struct action_data *data) kfree(data); } -static void onmax_destroy(struct action_data *data) +static void track_data_destroy(struct hist_trigger_data *hist_data, + struct action_data *data) { - destroy_hist_field(data->onmax.max_var, 0); - destroy_hist_field(data->onmax.var, 0); + destroy_hist_field(data->track_data.track_var, 0); + destroy_hist_field(data->track_data.var_ref, 0); - kfree(data->onmax.var_str); + kfree(data->track_data.var_str); action_data_destroy(data); } @@ -3456,25 +3524,24 @@ static void onmax_destroy(struct action_data *data) static int action_create(struct hist_trigger_data *hist_data, struct action_data *data); -static int onmax_create(struct hist_trigger_data *hist_data, - struct action_data *data) +static int track_data_create(struct hist_trigger_data *hist_data, + struct action_data *data) { - struct hist_field *var_field, *ref_field, *max_var = NULL; + struct hist_field *var_field, *ref_field, *track_var = NULL; struct trace_event_file *file = hist_data->event_file; - unsigned int var_ref_idx = hist_data->n_var_refs; - char *onmax_var_str; + char *track_data_var_str; int ret = 0; - onmax_var_str = data->onmax.var_str; - if (onmax_var_str[0] != '$') { - hist_err("onmax: For onmax(x), x must be a variable: ", onmax_var_str); + track_data_var_str = data->track_data.var_str; + if (track_data_var_str[0] != '$') { + hist_err("For onmax(x), x must be a variable: ", track_data_var_str); return -EINVAL; } - onmax_var_str++; + track_data_var_str++; - var_field = find_target_event_var(hist_data, NULL, NULL, onmax_var_str); + var_field = find_target_event_var(hist_data, NULL, NULL, track_data_var_str); if (!var_field) { - hist_err("onmax: Couldn't find onmax variable: ", onmax_var_str); + hist_err("Couldn't find onmax variable: ", track_data_var_str); return -EINVAL; } @@ -3482,17 +3549,16 @@ static int onmax_create(struct hist_trigger_data *hist_data, if (!ref_field) return -ENOMEM; - data->onmax.var = ref_field; + data->track_data.var_ref = ref_field; - data->onmax.max_var_ref_idx = var_ref_idx; - - max_var = create_var(hist_data, file, "max", sizeof(u64), "u64"); - if (IS_ERR(max_var)) { - hist_err("onmax: Couldn't create onmax variable: ", "max"); - ret = PTR_ERR(max_var); + if (data->handler == HANDLER_ONMAX) + track_var = create_var(hist_data, file, "__max", sizeof(u64), "u64"); + if (IS_ERR(track_var)) { + hist_err("Couldn't create onmax variable: ", "__max"); + ret = PTR_ERR(track_var); goto out; } - data->onmax.max_var = max_var; + data->track_data.track_var = track_var; ret = action_create(hist_data, data); out: @@ -3570,8 +3636,15 @@ static int action_parse(char *str, struct action_data *data, goto out; if (handler == HANDLER_ONMAX) - data->fn = onmax_save; + data->track_data.check_val = check_track_val_max; + else { + hist_err("action parsing: Handler doesn't support action: ", action_name); + ret = -EINVAL; + goto out; + } + data->track_data.save_data = save_track_data_vars; + data->fn = ontrack_action; data->action = ACTION_SAVE; } else { char *params = strsep(&str, ")"); @@ -3582,7 +3655,15 @@ static int action_parse(char *str, struct action_data *data, goto out; } - data->fn = action_trace; + if (handler == HANDLER_ONMAX) + data->track_data.check_val = check_track_val_max; + + if (handler != HANDLER_ONMATCH) { + data->track_data.save_data = action_trace; + data->fn = ontrack_action; + } else + data->fn = action_trace; + data->action = ACTION_TRACE; } @@ -3597,24 +3678,25 @@ static int action_parse(char *str, struct action_data *data, return ret; } -static struct action_data *onmax_parse(char *str, enum handler_id handler) +static struct action_data *track_data_parse(struct hist_trigger_data *hist_data, + char *str, enum handler_id handler) { struct action_data *data; - char *onmax_var_str; int ret = -EINVAL; + char *var_str; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return ERR_PTR(-ENOMEM); - onmax_var_str = strsep(&str, ")"); - if (!onmax_var_str || !str) { + var_str = strsep(&str, ")"); + if (!var_str || !str) { ret = -EINVAL; goto free; } - data->onmax.var_str = kstrdup(onmax_var_str, GFP_KERNEL); - if (!data->onmax.var_str) { + data->track_data.var_str = kstrdup(var_str, GFP_KERNEL); + if (!data->track_data.var_str) { ret = -ENOMEM; goto free; } @@ -3625,7 +3707,7 @@ static struct action_data *onmax_parse(char *str, enum handler_id handler) out: return data; free: - onmax_destroy(data); + track_data_destroy(hist_data, data); data = ERR_PTR(ret); goto out; } @@ -4389,7 +4471,7 @@ static void destroy_actions(struct hist_trigger_data *hist_data) if (data->handler == HANDLER_ONMATCH) onmatch_destroy(data); else if (data->handler == HANDLER_ONMAX) - onmax_destroy(data); + track_data_destroy(hist_data, data); else kfree(data); } @@ -4418,7 +4500,8 @@ static int parse_actions(struct hist_trigger_data *hist_data) } else if ((len = str_has_prefix(str, "onmax("))) { char *action_str = str + len; - data = onmax_parse(action_str, HANDLER_ONMAX); + data = track_data_parse(hist_data, action_str, + HANDLER_ONMAX); if (IS_ERR(data)) { ret = PTR_ERR(data); break; @@ -4448,7 +4531,7 @@ static int create_actions(struct hist_trigger_data *hist_data) if (ret) break; } else if (data->handler == HANDLER_ONMAX) { - ret = onmax_create(hist_data, data); + ret = track_data_create(hist_data, data); if (ret) break; } else { @@ -4470,7 +4553,7 @@ static void print_actions(struct seq_file *m, struct action_data *data = hist_data->actions[i]; if (data->handler == HANDLER_ONMAX) - onmax_print(m, hist_data, elt, data); + track_data_print(m, hist_data, elt, data); } } @@ -4495,12 +4578,13 @@ static void print_action_spec(struct seq_file *m, } } -static void print_onmax_spec(struct seq_file *m, - struct hist_trigger_data *hist_data, - struct action_data *data) +static void print_track_data_spec(struct seq_file *m, + struct hist_trigger_data *hist_data, + struct action_data *data) { - seq_puts(m, ":onmax("); - seq_printf(m, "%s", data->onmax.var_str); + if (data->handler == HANDLER_ONMAX) + seq_puts(m, ":onmax("); + seq_printf(m, "%s", data->track_data.var_str); seq_printf(m, ").%s(", data->action_name); print_action_spec(m, hist_data, data); @@ -4558,8 +4642,8 @@ static bool actions_match(struct hist_trigger_data *hist_data, data_test->match_data.event) != 0) return false; } else if (data->handler == HANDLER_ONMAX) { - if (strcmp(data->onmax.var_str, - data_test->onmax.var_str) != 0) + if (strcmp(data->track_data.var_str, + data_test->track_data.var_str) != 0) return false; } } @@ -4579,7 +4663,7 @@ static void print_actions_spec(struct seq_file *m, if (data->handler == HANDLER_ONMATCH) print_onmatch_spec(m, hist_data, data); else if (data->handler == HANDLER_ONMAX) - print_onmax_spec(m, hist_data, data); + print_track_data_spec(m, hist_data, data); } } -- cgit v1.2.3 From a35873a0993b4d38b40871f10fa4356c088c7140 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 13 Feb 2019 17:42:45 -0600 Subject: tracing: Add conditional snapshot Currently, tracing snapshots are context-free - they capture the ring buffer contents at the time the tracing_snapshot() function was invoked, and nothing else. Additionally, they're always taken unconditionally - the calling code can decide whether or not to take a snapshot, but the data used to make that decision is kept separately from the snapshot itself. This change adds the ability to associate with each trace instance some user data, along with an 'update' function that can use that data to determine whether or not to actually take a snapshot. The update function can then update that data along with any other state (as part of the data presumably), if warranted. Because snapshots are 'global' per-instance, only one user can enable and use a conditional snapshot for any given trace instance. To enable a conditional snapshot (see details in the function and data structure comments), the user calls tracing_snapshot_cond_enable(). Similarly, to disable a conditional snapshot and free it up for other users, tracing_snapshot_cond_disable() should be called. To actually initiate a conditional snapshot, tracing_snapshot_cond() should be called. tracing_snapshot_cond() will invoke the update() callback, allowing the user to decide whether or not to actually take the snapshot and update the user-defined data associated with the snapshot. If the callback returns 'true', tracing_snapshot_cond() will then actually take the snapshot and return. This scheme allows for flexibility in snapshot implementations - for example, by implementing slightly different update() callbacks, snapshots can be taken in situations where the user is only interested in taking a snapshot when a new maximum in hit versus when a value changes in any way at all. Future patches will demonstrate both cases. Link: http://lkml.kernel.org/r/1bea07828d5fd6864a585f83b1eed47ce097eb45.1550100284.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 192 +++++++++++++++++++++++++++++++++++++- kernel/trace/trace.h | 56 ++++++++++- kernel/trace/trace_sched_wakeup.c | 2 +- 3 files changed, 244 insertions(+), 6 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b477926ac3bc..9f4d56f74b46 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -894,7 +894,7 @@ int __trace_bputs(unsigned long ip, const char *str) EXPORT_SYMBOL_GPL(__trace_bputs); #ifdef CONFIG_TRACER_SNAPSHOT -void tracing_snapshot_instance(struct trace_array *tr) +void tracing_snapshot_instance_cond(struct trace_array *tr, void *cond_data) { struct tracer *tracer = tr->current_trace; unsigned long flags; @@ -920,10 +920,15 @@ void tracing_snapshot_instance(struct trace_array *tr) } local_irq_save(flags); - update_max_tr(tr, current, smp_processor_id()); + update_max_tr(tr, current, smp_processor_id(), cond_data); local_irq_restore(flags); } +void tracing_snapshot_instance(struct trace_array *tr) +{ + tracing_snapshot_instance_cond(tr, NULL); +} + /** * tracing_snapshot - take a snapshot of the current buffer. * @@ -946,6 +951,54 @@ void tracing_snapshot(void) } EXPORT_SYMBOL_GPL(tracing_snapshot); +/** + * tracing_snapshot_cond - conditionally take a snapshot of the current buffer. + * @tr: The tracing instance to snapshot + * @cond_data: The data to be tested conditionally, and possibly saved + * + * This is the same as tracing_snapshot() except that the snapshot is + * conditional - the snapshot will only happen if the + * cond_snapshot.update() implementation receiving the cond_data + * returns true, which means that the trace array's cond_snapshot + * update() operation used the cond_data to determine whether the + * snapshot should be taken, and if it was, presumably saved it along + * with the snapshot. + */ +void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) +{ + tracing_snapshot_instance_cond(tr, cond_data); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond); + +/** + * tracing_snapshot_cond_data - get the user data associated with a snapshot + * @tr: The tracing instance + * + * When the user enables a conditional snapshot using + * tracing_snapshot_cond_enable(), the user-defined cond_data is saved + * with the snapshot. This accessor is used to retrieve it. + * + * Should not be called from cond_snapshot.update(), since it takes + * the tr->max_lock lock, which the code calling + * cond_snapshot.update() has already done. + * + * Returns the cond_data associated with the trace array's snapshot. + */ +void *tracing_cond_snapshot_data(struct trace_array *tr) +{ + void *cond_data = NULL; + + arch_spin_lock(&tr->max_lock); + + if (tr->cond_snapshot) + cond_data = tr->cond_snapshot->cond_data; + + arch_spin_unlock(&tr->max_lock); + + return cond_data; +} +EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data); + static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, struct trace_buffer *size_buf, int cpu_id); static void set_buffer_entries(struct trace_buffer *buf, unsigned long val); @@ -1025,12 +1078,103 @@ void tracing_snapshot_alloc(void) tracing_snapshot(); } EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); + +/** + * tracing_snapshot_cond_enable - enable conditional snapshot for an instance + * @tr: The tracing instance + * @cond_data: User data to associate with the snapshot + * @update: Implementation of the cond_snapshot update function + * + * Check whether the conditional snapshot for the given instance has + * already been enabled, or if the current tracer is already using a + * snapshot; if so, return -EBUSY, else create a cond_snapshot and + * save the cond_data and update function inside. + * + * Returns 0 if successful, error otherwise. + */ +int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, + cond_update_fn_t update) +{ + struct cond_snapshot *cond_snapshot; + int ret = 0; + + cond_snapshot = kzalloc(sizeof(*cond_snapshot), GFP_KERNEL); + if (!cond_snapshot) + return -ENOMEM; + + cond_snapshot->cond_data = cond_data; + cond_snapshot->update = update; + + mutex_lock(&trace_types_lock); + + ret = tracing_alloc_snapshot_instance(tr); + if (ret) + goto fail_unlock; + + if (tr->current_trace->use_max_tr) { + ret = -EBUSY; + goto fail_unlock; + } + + if (tr->cond_snapshot) { + ret = -EBUSY; + goto fail_unlock; + } + + arch_spin_lock(&tr->max_lock); + tr->cond_snapshot = cond_snapshot; + arch_spin_unlock(&tr->max_lock); + + mutex_unlock(&trace_types_lock); + + return ret; + + fail_unlock: + mutex_unlock(&trace_types_lock); + kfree(cond_snapshot); + return ret; +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); + +/** + * tracing_snapshot_cond_disable - disable conditional snapshot for an instance + * @tr: The tracing instance + * + * Check whether the conditional snapshot for the given instance is + * enabled; if so, free the cond_snapshot associated with it, + * otherwise return -EINVAL. + * + * Returns 0 if successful, error otherwise. + */ +int tracing_snapshot_cond_disable(struct trace_array *tr) +{ + int ret = 0; + + arch_spin_lock(&tr->max_lock); + + if (!tr->cond_snapshot) + ret = -EINVAL; + else { + kfree(tr->cond_snapshot); + tr->cond_snapshot = NULL; + } + + arch_spin_unlock(&tr->max_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); #else void tracing_snapshot(void) { WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); } EXPORT_SYMBOL_GPL(tracing_snapshot); +void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) +{ + WARN_ONCE(1, "Snapshot feature not enabled, but internal conditional snapshot used"); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond); int tracing_alloc_snapshot(void) { WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used"); @@ -1043,6 +1187,21 @@ void tracing_snapshot_alloc(void) tracing_snapshot(); } EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); +void *tracing_cond_snapshot_data(struct trace_array *tr) +{ + return NULL; +} +EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data); +int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update) +{ + return -ENODEV; +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); +int tracing_snapshot_cond_disable(struct trace_array *tr) +{ + return false; +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); #endif /* CONFIG_TRACER_SNAPSHOT */ void tracer_tracing_off(struct trace_array *tr) @@ -1354,12 +1513,14 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) * @tr: tracer * @tsk: the task with the latency * @cpu: The cpu that initiated the trace. + * @cond_data: User data associated with a conditional snapshot * * Flip the buffers between the @tr and the max_tr and record information * about which task was the cause of this latency. */ void -update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) +update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, + void *cond_data) { if (tr->stop_count) return; @@ -1380,9 +1541,15 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) else ring_buffer_record_off(tr->max_buffer.buffer); +#ifdef CONFIG_TRACER_SNAPSHOT + if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) + goto out_unlock; +#endif swap(tr->trace_buffer.buffer, tr->max_buffer.buffer); __update_max_tr(tr, tsk, cpu); + + out_unlock: arch_spin_unlock(&tr->max_lock); } @@ -5396,6 +5563,16 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) if (t == tr->current_trace) goto out; +#ifdef CONFIG_TRACER_SNAPSHOT + if (t->use_max_tr) { + arch_spin_lock(&tr->max_lock); + if (tr->cond_snapshot) + ret = -EBUSY; + arch_spin_unlock(&tr->max_lock); + if (ret) + goto out; + } +#endif /* Some tracers won't work on kernel command line */ if (system_state < SYSTEM_RUNNING && t->noboot) { pr_warn("Tracer '%s' is not allowed on command line, ignored\n", @@ -6477,6 +6654,13 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, goto out; } + arch_spin_lock(&tr->max_lock); + if (tr->cond_snapshot) + ret = -EBUSY; + arch_spin_unlock(&tr->max_lock); + if (ret) + goto out; + switch (val) { case 0: if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { @@ -6502,7 +6686,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, local_irq_disable(); /* Now, we're going to swap */ if (iter->cpu_file == RING_BUFFER_ALL_CPUS) - update_max_tr(tr, current, smp_processor_id()); + update_max_tr(tr, current, smp_processor_id(), NULL); else update_max_tr_single(tr, current, iter->cpu_file); local_irq_enable(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ae7df090b93e..d80cee49e0eb 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -194,6 +194,51 @@ struct trace_pid_list { unsigned long *pids; }; +typedef bool (*cond_update_fn_t)(struct trace_array *tr, void *cond_data); + +/** + * struct cond_snapshot - conditional snapshot data and callback + * + * The cond_snapshot structure encapsulates a callback function and + * data associated with the snapshot for a given tracing instance. + * + * When a snapshot is taken conditionally, by invoking + * tracing_snapshot_cond(tr, cond_data), the cond_data passed in is + * passed in turn to the cond_snapshot.update() function. That data + * can be compared by the update() implementation with the cond_data + * contained wihin the struct cond_snapshot instance associated with + * the trace_array. Because the tr->max_lock is held throughout the + * update() call, the update() function can directly retrieve the + * cond_snapshot and cond_data associated with the per-instance + * snapshot associated with the trace_array. + * + * The cond_snapshot.update() implementation can save data to be + * associated with the snapshot if it decides to, and returns 'true' + * in that case, or it returns 'false' if the conditional snapshot + * shouldn't be taken. + * + * The cond_snapshot instance is created and associated with the + * user-defined cond_data by tracing_cond_snapshot_enable(). + * Likewise, the cond_snapshot instance is destroyed and is no longer + * associated with the trace instance by + * tracing_cond_snapshot_disable(). + * + * The method below is required. + * + * @update: When a conditional snapshot is invoked, the update() + * callback function is invoked with the tr->max_lock held. The + * update() implementation signals whether or not to actually + * take the snapshot, by returning 'true' if so, 'false' if no + * snapshot should be taken. Because the max_lock is held for + * the duration of update(), the implementation is safe to + * directly retrieven and save any implementation data it needs + * to in association with the snapshot. + */ +struct cond_snapshot { + void *cond_data; + cond_update_fn_t update; +}; + /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. @@ -277,6 +322,9 @@ struct trace_array { #endif int time_stamp_abs_ref; struct list_head hist_vars; +#ifdef CONFIG_TRACER_SNAPSHOT + struct cond_snapshot *cond_snapshot; +#endif }; enum { @@ -727,7 +775,8 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, const char __user *ubuf, size_t cnt); #ifdef CONFIG_TRACER_MAX_TRACE -void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); +void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, + void *cond_data); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); #endif /* CONFIG_TRACER_MAX_TRACE */ @@ -1810,6 +1859,11 @@ static inline bool event_command_needs_rec(struct event_command *cmd_ops) extern int trace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable); extern int tracing_alloc_snapshot(void); +extern void tracing_snapshot_cond(struct trace_array *tr, void *cond_data); +extern int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update); + +extern int tracing_snapshot_cond_disable(struct trace_array *tr); +extern void *tracing_cond_snapshot_data(struct trace_array *tr); extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index f4fe7d1781e9..743b2b520d34 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -486,7 +486,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, if (likely(!is_tracing_stopped())) { wakeup_trace->max_latency = delta; - update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); + update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu, NULL); } out_unlock: -- cgit v1.2.3 From a3785b7eca8fd45c7c39f2ddfcd67368af30c1b4 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 13 Feb 2019 17:42:46 -0600 Subject: tracing: Add hist trigger snapshot() action Add support for hist:handlerXXX($var).snapshot(), which will take a snapshot of the current trace buffer whenever handlerXXX is hit. As a first user, this also adds snapshot() action support for the onmax() handler i.e. hist:onmax($var).snapshot(). Also, the hist trigger key printing is moved into a separate function so the snapshot() action can print a histogram key outside the histogram display - add and use hist_trigger_print_key() for that purpose. Link: http://lkml.kernel.org/r/2f1a952c0dcd8aca8702ce81269581a692396d45.1550100284.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 3 + kernel/trace/trace_events_hist.c | 266 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 259 insertions(+), 10 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9f4d56f74b46..dd60c14a0fb0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4919,6 +4919,9 @@ static const char readme_msg[] = "\t The available actions are:\n\n" "\t (param list) - generate synthetic event\n" "\t save(field,...) - save current event fields\n" +#ifdef CONFIG_TRACER_SNAPSHOT + "\t snapshot() - snapshot the trace buffer\n" +#endif #endif ; diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 0515229e5f95..571937a268a3 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -396,6 +396,7 @@ enum handler_id { enum action_id { ACTION_SAVE = 1, ACTION_TRACE, + ACTION_SNAPSHOT, }; struct action_data { @@ -454,6 +455,83 @@ struct action_data { }; }; +struct track_data { + u64 track_val; + bool updated; + + unsigned int key_len; + void *key; + struct tracing_map_elt elt; + + struct action_data *action_data; + struct hist_trigger_data *hist_data; +}; + +struct hist_elt_data { + char *comm; + u64 *var_ref_vals; + char *field_var_str[SYNTH_FIELDS_MAX]; +}; + +struct snapshot_context { + struct tracing_map_elt *elt; + void *key; +}; + +static void track_data_free(struct track_data *track_data) +{ + struct hist_elt_data *elt_data; + + if (!track_data) + return; + + kfree(track_data->key); + + elt_data = track_data->elt.private_data; + if (elt_data) { + kfree(elt_data->comm); + kfree(elt_data); + } + + kfree(track_data); +} + +static struct track_data *track_data_alloc(unsigned int key_len, + struct action_data *action_data, + struct hist_trigger_data *hist_data) +{ + struct track_data *data = kzalloc(sizeof(*data), GFP_KERNEL); + struct hist_elt_data *elt_data; + + if (!data) + return ERR_PTR(-ENOMEM); + + data->key = kzalloc(key_len, GFP_KERNEL); + if (!data->key) { + track_data_free(data); + return ERR_PTR(-ENOMEM); + } + + data->key_len = key_len; + data->action_data = action_data; + data->hist_data = hist_data; + + elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL); + if (!elt_data) { + track_data_free(data); + return ERR_PTR(-ENOMEM); + } + data->elt.private_data = elt_data; + + elt_data->comm = kzalloc(TASK_COMM_LEN, GFP_KERNEL); + if (!elt_data->comm) { + track_data_free(data); + return ERR_PTR(-ENOMEM); + } + + return data; +} + static char last_hist_cmd[MAX_FILTER_STR_VAL]; static char hist_err_str[MAX_FILTER_STR_VAL]; @@ -1726,12 +1804,6 @@ static struct hist_field *find_event_var(struct hist_trigger_data *hist_data, return hist_field; } -struct hist_elt_data { - char *comm; - u64 *var_ref_vals; - char *field_var_str[SYNTH_FIELDS_MAX]; -}; - static u64 hist_field_var_ref(struct hist_field *hist_field, struct tracing_map_elt *elt, struct ring_buffer_event *rbe, @@ -3452,6 +3524,112 @@ static bool check_track_val(struct tracing_map_elt *elt, return data->track_data.check_val(track_val, var_val); } +#ifdef CONFIG_TRACER_SNAPSHOT +static bool cond_snapshot_update(struct trace_array *tr, void *cond_data) +{ + /* called with tr->max_lock held */ + struct track_data *track_data = tr->cond_snapshot->cond_data; + struct hist_elt_data *elt_data, *track_elt_data; + struct snapshot_context *context = cond_data; + u64 track_val; + + if (!track_data) + return false; + + track_val = get_track_val(track_data->hist_data, context->elt, + track_data->action_data); + + track_data->track_val = track_val; + memcpy(track_data->key, context->key, track_data->key_len); + + elt_data = context->elt->private_data; + track_elt_data = track_data->elt.private_data; + if (elt_data->comm) + memcpy(track_elt_data->comm, elt_data->comm, TASK_COMM_LEN); + + track_data->updated = true; + + return true; +} + +static void save_track_data_snapshot(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, void *key, + struct action_data *data, + u64 *var_ref_vals) +{ + struct trace_event_file *file = hist_data->event_file; + struct snapshot_context context; + + context.elt = elt; + context.key = key; + + tracing_snapshot_cond(file->tr, &context); +} + +static void hist_trigger_print_key(struct seq_file *m, + struct hist_trigger_data *hist_data, + void *key, + struct tracing_map_elt *elt); + +static struct action_data *snapshot_action(struct hist_trigger_data *hist_data) +{ + unsigned int i; + + if (!hist_data->n_actions) + return NULL; + + for (i = 0; i < hist_data->n_actions; i++) { + struct action_data *data = hist_data->actions[i]; + + if (data->action == ACTION_SNAPSHOT) + return data; + } + + return NULL; +} + +static void track_data_snapshot_print(struct seq_file *m, + struct hist_trigger_data *hist_data) +{ + struct trace_event_file *file = hist_data->event_file; + struct track_data *track_data; + struct action_data *action; + + track_data = tracing_cond_snapshot_data(file->tr); + if (!track_data) + return; + + if (!track_data->updated) + return; + + action = snapshot_action(hist_data); + if (!action) + return; + + seq_puts(m, "\nSnapshot taken (see tracing/snapshot). Details:\n"); + seq_printf(m, "\ttriggering value { %s(%s) }: %10llu", + action->handler == HANDLER_ONMAX ? "onmax" : "onchange", + action->track_data.var_str, track_data->track_val); + + seq_puts(m, "\ttriggered by event with key: "); + hist_trigger_print_key(m, hist_data, track_data->key, &track_data->elt); + seq_putc(m, '\n'); +} +#else +static bool cond_snapshot_update(struct trace_array *tr, void *cond_data) +{ + return false; +} +static void save_track_data_snapshot(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, void *key, + struct action_data *data, + u64 *var_ref_vals) {} +static void track_data_snapshot_print(struct seq_file *m, + struct hist_trigger_data *hist_data) {} +#endif /* CONFIG_TRACER_SNAPSHOT */ + static void track_data_print(struct seq_file *m, struct hist_trigger_data *hist_data, struct tracing_map_elt *elt, @@ -3463,6 +3641,9 @@ static void track_data_print(struct seq_file *m, if (data->handler == HANDLER_ONMAX) seq_printf(m, "\n\tmax: %10llu", track_val); + if (data->action == ACTION_SNAPSHOT) + return; + for (i = 0; i < hist_data->n_save_vars; i++) { struct hist_field *save_val = hist_data->save_vars[i]->val; struct hist_field *save_var = hist_data->save_vars[i]->var; @@ -3513,9 +3694,21 @@ static void action_data_destroy(struct action_data *data) static void track_data_destroy(struct hist_trigger_data *hist_data, struct action_data *data) { + struct trace_event_file *file = hist_data->event_file; + destroy_hist_field(data->track_data.track_var, 0); destroy_hist_field(data->track_data.var_ref, 0); + if (data->action == ACTION_SNAPSHOT) { + struct track_data *track_data; + + track_data = tracing_cond_snapshot_data(file->tr); + if (track_data && track_data->hist_data == hist_data) { + tracing_snapshot_cond_disable(file->tr); + track_data_free(track_data); + } + } + kfree(data->track_data.var_str); action_data_destroy(data); @@ -3646,6 +3839,26 @@ static int action_parse(char *str, struct action_data *data, data->track_data.save_data = save_track_data_vars; data->fn = ontrack_action; data->action = ACTION_SAVE; + } else if (str_has_prefix(action_name, "snapshot")) { + char *params = strsep(&str, ")"); + + if (!str) { + hist_err("action parsing: No closing paren found: %s", params); + ret = -EINVAL; + goto out; + } + + if (handler == HANDLER_ONMAX) + data->track_data.check_val = check_track_val_max; + else { + hist_err("action parsing: Handler doesn't support action: ", action_name); + ret = -EINVAL; + goto out; + } + + data->track_data.save_data = save_track_data_snapshot; + data->fn = ontrack_action; + data->action = ACTION_SNAPSHOT; } else { char *params = strsep(&str, ")"); @@ -3942,6 +4155,8 @@ static int trace_action_create(struct hist_trigger_data *hist_data, static int action_create(struct hist_trigger_data *hist_data, struct action_data *data) { + struct trace_event_file *file = hist_data->event_file; + struct track_data *track_data; struct field_var *field_var; unsigned int i; char *param; @@ -3950,6 +4165,21 @@ static int action_create(struct hist_trigger_data *hist_data, if (data->action == ACTION_TRACE) return trace_action_create(hist_data, data); + if (data->action == ACTION_SNAPSHOT) { + track_data = track_data_alloc(hist_data->key_size, data, hist_data); + if (IS_ERR(track_data)) { + ret = PTR_ERR(track_data); + goto out; + } + + ret = tracing_snapshot_cond_enable(file->tr, track_data, + cond_snapshot_update); + if (ret) + track_data_free(track_data); + + goto out; + } + if (data->action == ACTION_SAVE) { if (hist_data->n_save_vars) { ret = -EEXIST; @@ -4552,6 +4782,9 @@ static void print_actions(struct seq_file *m, for (i = 0; i < hist_data->n_actions; i++) { struct action_data *data = hist_data->actions[i]; + if (data->action == ACTION_SNAPSHOT) + continue; + if (data->handler == HANDLER_ONMAX) track_data_print(m, hist_data, elt, data); } @@ -4946,10 +5179,10 @@ static void hist_trigger_stacktrace_print(struct seq_file *m, } } -static void -hist_trigger_entry_print(struct seq_file *m, - struct hist_trigger_data *hist_data, void *key, - struct tracing_map_elt *elt) +static void hist_trigger_print_key(struct seq_file *m, + struct hist_trigger_data *hist_data, + void *key, + struct tracing_map_elt *elt) { struct hist_field *key_field; char str[KSYM_SYMBOL_LEN]; @@ -5025,6 +5258,17 @@ hist_trigger_entry_print(struct seq_file *m, seq_puts(m, " "); seq_puts(m, "}"); +} + +static void hist_trigger_entry_print(struct seq_file *m, + struct hist_trigger_data *hist_data, + void *key, + struct tracing_map_elt *elt) +{ + const char *field_name; + unsigned int i; + + hist_trigger_print_key(m, hist_data, key, elt); seq_printf(m, " hitcount: %10llu", tracing_map_read_sum(elt, HITCOUNT_IDX)); @@ -5091,6 +5335,8 @@ static void hist_trigger_show(struct seq_file *m, if (n_entries < 0) n_entries = 0; + track_data_snapshot_print(m, hist_data); + seq_printf(m, "\nTotals:\n Hits: %llu\n Entries: %u\n Dropped: %llu\n", (u64)atomic64_read(&hist_data->map->hits), n_entries, (u64)atomic64_read(&hist_data->map->drops)); -- cgit v1.2.3 From dff81f559285b5c6df287eb231a9d6b02057ef8b Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 13 Feb 2019 17:42:48 -0600 Subject: tracing: Add hist trigger onchange() handler Add support for a hist:onchange($var) handler, similar to the onmax() handler but triggering whenever there's any change in $var, not just a max. Link: http://lkml.kernel.org/r/dfbc7e4ada242603e9ec3f049b5ad076a07dfd03.1550100284.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 3 ++- kernel/trace/trace_events_hist.c | 58 ++++++++++++++++++++++++++++++++++------ 2 files changed, 52 insertions(+), 9 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dd60c14a0fb0..be6779f963c6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4915,7 +4915,8 @@ static const char readme_msg[] = "\t .\n\n" "\t The available handlers are:\n\n" "\t onmatch(matching.event) - invoke on addition or update\n" - "\t onmax(var) - invoke if var exceeds current max\n\n" + "\t onmax(var) - invoke if var exceeds current max\n" + "\t onchange(var) - invoke action if var changes\n\n" "\t The available actions are:\n\n" "\t (param list) - generate synthetic event\n" "\t save(field,...) - save current event fields\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 571937a268a3..2f3323ca9d24 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -391,6 +391,7 @@ typedef bool (*check_track_val_fn_t) (u64 track_val, u64 var_val); enum handler_id { HANDLER_ONMATCH = 1, HANDLER_ONMAX, + HANDLER_ONCHANGE, }; enum action_id { @@ -1989,7 +1990,8 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs) return ret; if ((str_has_prefix(str, "onmatch(")) || - (str_has_prefix(str, "onmax("))) { + (str_has_prefix(str, "onmax(")) || + (str_has_prefix(str, "onchange("))) { attrs->action_str[attrs->n_actions] = kstrdup(str, GFP_KERNEL); if (!attrs->action_str[attrs->n_actions]) { ret = -ENOMEM; @@ -3481,6 +3483,14 @@ static bool check_track_val_max(u64 track_val, u64 var_val) return true; } +static bool check_track_val_changed(u64 track_val, u64 var_val) +{ + if (var_val == track_val) + return false; + + return true; +} + static u64 get_track_val(struct hist_trigger_data *hist_data, struct tracing_map_elt *elt, struct action_data *data) @@ -3640,6 +3650,8 @@ static void track_data_print(struct seq_file *m, if (data->handler == HANDLER_ONMAX) seq_printf(m, "\n\tmax: %10llu", track_val); + else if (data->handler == HANDLER_ONCHANGE) + seq_printf(m, "\n\tchanged: %10llu", track_val); if (data->action == ACTION_SNAPSHOT) return; @@ -3727,14 +3739,14 @@ static int track_data_create(struct hist_trigger_data *hist_data, track_data_var_str = data->track_data.var_str; if (track_data_var_str[0] != '$') { - hist_err("For onmax(x), x must be a variable: ", track_data_var_str); + hist_err("For onmax(x) or onchange(x), x must be a variable: ", track_data_var_str); return -EINVAL; } track_data_var_str++; var_field = find_target_event_var(hist_data, NULL, NULL, track_data_var_str); if (!var_field) { - hist_err("Couldn't find onmax variable: ", track_data_var_str); + hist_err("Couldn't find onmax or onchange variable: ", track_data_var_str); return -EINVAL; } @@ -3751,6 +3763,14 @@ static int track_data_create(struct hist_trigger_data *hist_data, ret = PTR_ERR(track_var); goto out; } + + if (data->handler == HANDLER_ONCHANGE) + track_var = create_var(hist_data, file, "__change", sizeof(u64), "u64"); + if (IS_ERR(track_var)) { + hist_err("Couldn't create onchange variable: ", "__change"); + ret = PTR_ERR(track_var); + goto out; + } data->track_data.track_var = track_var; ret = action_create(hist_data, data); @@ -3830,6 +3850,8 @@ static int action_parse(char *str, struct action_data *data, if (handler == HANDLER_ONMAX) data->track_data.check_val = check_track_val_max; + else if (handler == HANDLER_ONCHANGE) + data->track_data.check_val = check_track_val_changed; else { hist_err("action parsing: Handler doesn't support action: ", action_name); ret = -EINVAL; @@ -3850,6 +3872,8 @@ static int action_parse(char *str, struct action_data *data, if (handler == HANDLER_ONMAX) data->track_data.check_val = check_track_val_max; + else if (handler == HANDLER_ONCHANGE) + data->track_data.check_val = check_track_val_changed; else { hist_err("action parsing: Handler doesn't support action: ", action_name); ret = -EINVAL; @@ -3870,6 +3894,8 @@ static int action_parse(char *str, struct action_data *data, if (handler == HANDLER_ONMAX) data->track_data.check_val = check_track_val_max; + else if (handler == HANDLER_ONCHANGE) + data->track_data.check_val = check_track_val_changed; if (handler != HANDLER_ONMATCH) { data->track_data.save_data = action_trace; @@ -4700,7 +4726,8 @@ static void destroy_actions(struct hist_trigger_data *hist_data) if (data->handler == HANDLER_ONMATCH) onmatch_destroy(data); - else if (data->handler == HANDLER_ONMAX) + else if (data->handler == HANDLER_ONMAX || + data->handler == HANDLER_ONCHANGE) track_data_destroy(hist_data, data); else kfree(data); @@ -4736,6 +4763,15 @@ static int parse_actions(struct hist_trigger_data *hist_data) ret = PTR_ERR(data); break; } + } else if ((len = str_has_prefix(str, "onchange("))) { + char *action_str = str + len; + + data = track_data_parse(hist_data, action_str, + HANDLER_ONCHANGE); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + break; + } } else { ret = -EINVAL; break; @@ -4760,7 +4796,8 @@ static int create_actions(struct hist_trigger_data *hist_data) ret = onmatch_create(hist_data, data); if (ret) break; - } else if (data->handler == HANDLER_ONMAX) { + } else if (data->handler == HANDLER_ONMAX || + data->handler == HANDLER_ONCHANGE) { ret = track_data_create(hist_data, data); if (ret) break; @@ -4785,7 +4822,8 @@ static void print_actions(struct seq_file *m, if (data->action == ACTION_SNAPSHOT) continue; - if (data->handler == HANDLER_ONMAX) + if (data->handler == HANDLER_ONMAX || + data->handler == HANDLER_ONCHANGE) track_data_print(m, hist_data, elt, data); } } @@ -4817,6 +4855,8 @@ static void print_track_data_spec(struct seq_file *m, { if (data->handler == HANDLER_ONMAX) seq_puts(m, ":onmax("); + else if (data->handler == HANDLER_ONCHANGE) + seq_puts(m, ":onchange("); seq_printf(m, "%s", data->track_data.var_str); seq_printf(m, ").%s(", data->action_name); @@ -4874,7 +4914,8 @@ static bool actions_match(struct hist_trigger_data *hist_data, if (strcmp(data->match_data.event, data_test->match_data.event) != 0) return false; - } else if (data->handler == HANDLER_ONMAX) { + } else if (data->handler == HANDLER_ONMAX || + data->handler == HANDLER_ONCHANGE) { if (strcmp(data->track_data.var_str, data_test->track_data.var_str) != 0) return false; @@ -4895,7 +4936,8 @@ static void print_actions_spec(struct seq_file *m, if (data->handler == HANDLER_ONMATCH) print_onmatch_spec(m, hist_data, data); - else if (data->handler == HANDLER_ONMAX) + else if (data->handler == HANDLER_ONMAX || + data->handler == HANDLER_ONCHANGE) print_track_data_spec(m, hist_data, data); } } -- cgit v1.2.3 From e91eefd731d933194940805bb1f75a4972dc607c Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 13 Feb 2019 17:42:50 -0600 Subject: tracing: Add alternative synthetic event trace action syntax Add a 'trace(synthetic_event_name, params)' alternative to synthetic_event_name(params). Currently, the syntax used for generating synthetic events is to invoke synthetic_event_name(params) i.e. use the synthetic event name as a function call. Users requested a new form that more explicitly shows that the synthetic event is in effect being traced. In this version, a new 'trace()' keyword is used, and the synthetic event name is passed in as the first argument. In addition, for the sake of consistency with other actions, change the documention to emphasize the trace() form over the function-call form, which remains documented as equivalent. Link: http://lkml.kernel.org/r/d082773e50232a001480cf837679a1e01c1a2eb7.1550100284.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- kernel/trace/trace_events_hist.c | 42 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 39 insertions(+), 5 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index be6779f963c6..0460cc0f28fd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4918,7 +4918,7 @@ static const char readme_msg[] = "\t onmax(var) - invoke if var exceeds current max\n" "\t onchange(var) - invoke action if var changes\n\n" "\t The available actions are:\n\n" - "\t (param list) - generate synthetic event\n" + "\t trace(,param list) - generate synthetic event\n" "\t save(field,...) - save current event fields\n" #ifdef CONFIG_TRACER_SNAPSHOT "\t snapshot() - snapshot the trace buffer\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 2f3323ca9d24..66386ba1425f 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -419,6 +419,8 @@ struct action_data { */ unsigned int var_ref_idx; struct synth_event *synth_event; + bool use_trace_keyword; + char *synth_event_name; union { struct { @@ -3700,6 +3702,8 @@ static void action_data_destroy(struct action_data *data) if (data->synth_event) data->synth_event->ref--; + kfree(data->synth_event_name); + kfree(data); } @@ -3781,6 +3785,7 @@ static int track_data_create(struct hist_trigger_data *hist_data, static int parse_action_params(char *params, struct action_data *data) { char *param, *saved_param; + bool first_param = true; int ret = 0; while (params) { @@ -3809,6 +3814,13 @@ static int parse_action_params(char *params, struct action_data *data) goto out; } + if (first_param && data->use_trace_keyword) { + data->synth_event_name = saved_param; + first_param = false; + continue; + } + first_param = false; + data->params[data->n_params++] = saved_param; } out: @@ -3886,6 +3898,9 @@ static int action_parse(char *str, struct action_data *data, } else { char *params = strsep(&str, ")"); + if (str_has_prefix(action_name, "trace")) + data->use_trace_keyword = true; + if (params) { ret = parse_action_params(params, data); if (ret) @@ -4088,13 +4103,19 @@ static int trace_action_create(struct hist_trigger_data *hist_data, unsigned int i, var_ref_idx; unsigned int field_pos = 0; struct synth_event *event; + char *synth_event_name; int ret = 0; lockdep_assert_held(&event_mutex); - event = find_synth_event(data->action_name); + if (data->use_trace_keyword) + synth_event_name = data->synth_event_name; + else + synth_event_name = data->action_name; + + event = find_synth_event(synth_event_name); if (!event) { - hist_err("trace action: Couldn't find synthetic event: ", data->action_name); + hist_err("trace action: Couldn't find synthetic event: ", synth_event_name); return -EINVAL; } @@ -4841,8 +4862,10 @@ static void print_action_spec(struct seq_file *m, seq_puts(m, ","); } } else if (data->action == ACTION_TRACE) { + if (data->use_trace_keyword) + seq_printf(m, "%s", data->synth_event_name); for (i = 0; i < data->n_params; i++) { - if (i) + if (i || data->use_trace_keyword) seq_puts(m, ","); seq_printf(m, "%s", data->params[i]); } @@ -4890,6 +4913,7 @@ static bool actions_match(struct hist_trigger_data *hist_data, for (i = 0; i < hist_data->n_actions; i++) { struct action_data *data = hist_data->actions[i]; struct action_data *data_test = hist_data_test->actions[i]; + char *action_name, *action_name_test; if (data->handler != data_test->handler) return false; @@ -4904,7 +4928,17 @@ static bool actions_match(struct hist_trigger_data *hist_data, return false; } - if (strcmp(data->action_name, data_test->action_name) != 0) + if (data->use_trace_keyword) + action_name = data->synth_event_name; + else + action_name = data->action_name; + + if (data_test->use_trace_keyword) + action_name_test = data_test->synth_event_name; + else + action_name_test = data_test->action_name; + + if (strcmp(action_name, action_name_test) != 0) return false; if (data->handler == HANDLER_ONMATCH) { -- cgit v1.2.3 From 1c347a94ca79ef89156c7ad5d3a44bb2320a7047 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 14 Feb 2019 18:45:21 -0500 Subject: tracing: Comment why cond_snapshot is checked outside of max_lock protection Before setting tr->cond_snapshot, it must be NULL before it can be updated. It can go to NULL when a trace event hist trigger is created or removed, and can only be modified under the max_lock spin lock. But because it can only be set to something other than NULL under both the max_lock spin lock as well as the trace_types_lock, we can perform the check if it is not NULL only under the trace_types_lock and fail out without having to grab the max_lock spin lock. This is very subtle, and deserves a comment. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0460cc0f28fd..2cf3c747a357 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1116,6 +1116,14 @@ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, goto fail_unlock; } + /* + * The cond_snapshot can only change to NULL without the + * trace_types_lock. We don't care if we race with it going + * to NULL, but we want to make sure that it's not set to + * something other than NULL when we get here, which we can + * do safely with only holding the trace_types_lock and not + * having to take the max_lock. + */ if (tr->cond_snapshot) { ret = -EBUSY; goto fail_unlock; -- cgit v1.2.3 From 9e5a36a3371f48fef0ebea6826d1d66f6201c522 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 17 Feb 2019 22:32:22 +0000 Subject: tracing: Fix spelling mistake: "analagous" -> "analogous" There is a spelling mistake in the mini-howto help text. Fix it. Link: http://lkml.kernel.org/r/20190217223222.16479-1-colin.king@canonical.com Signed-off-by: Colin Ian King Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2cf3c747a357..3835f7ed3293 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4916,7 +4916,7 @@ static const char readme_msg[] = "\t unchanged.\n\n" "\t The enable_hist and disable_hist triggers can be used to\n" "\t have one event conditionally start and stop another event's\n" - "\t already-attached hist trigger. The syntax is analagous to\n" + "\t already-attached hist trigger. The syntax is analogous to\n" "\t the enable_event and disable_event triggers.\n\n" "\t Hist trigger handlers and actions are executed whenever a\n" "\t a histogram entry is added or updated. They take the form:\n\n" -- cgit v1.2.3 From 83540fbc8812a580b6ad8f93f4c29e62e417687e Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 20 Feb 2019 17:54:43 +0100 Subject: tracing/perf: Use strndup_user() instead of buggy open-coded version The first version of this method was missing the check for `ret == PATH_MAX`; then such a check was added, but it didn't call kfree() on error, so there was still a small memory leak in the error case. Fix it by using strndup_user() instead of open-coding it. Link: http://lkml.kernel.org/r/20190220165443.152385-1-jannh@google.com Cc: Ingo Molnar Cc: stable@vger.kernel.org Fixes: 0eadcc7a7bc0 ("perf/core: Fix perf_uprobe_init()") Reviewed-by: Masami Hiramatsu Acked-by: Song Liu Signed-off-by: Jann Horn Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_event_perf.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 76217bbef815..4629a6104474 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -299,15 +299,13 @@ int perf_uprobe_init(struct perf_event *p_event, if (!p_event->attr.uprobe_path) return -EINVAL; - path = kzalloc(PATH_MAX, GFP_KERNEL); - if (!path) - return -ENOMEM; - ret = strncpy_from_user( - path, u64_to_user_ptr(p_event->attr.uprobe_path), PATH_MAX); - if (ret == PATH_MAX) - return -E2BIG; - if (ret < 0) - goto out; + + path = strndup_user(u64_to_user_ptr(p_event->attr.uprobe_path), + PATH_MAX); + if (IS_ERR(path)) { + ret = PTR_ERR(path); + return (ret == -EINVAL) ? -E2BIG : ret; + } if (path[0] == '\0') { ret = -EINVAL; goto out; -- cgit v1.2.3 From b303c6df80c9f8f13785aa83a0471fca7e38b24d Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 21 Feb 2019 13:13:38 +0900 Subject: kbuild: compute false-positive -Wmaybe-uninitialized cases in Kconfig Since -Wmaybe-uninitialized was introduced by GCC 4.7, we have patched various false positives: - commit e74fc973b6e5 ("Turn off -Wmaybe-uninitialized when building with -Os") turned off this option for -Os. - commit 815eb71e7149 ("Kbuild: disable 'maybe-uninitialized' warning for CONFIG_PROFILE_ALL_BRANCHES") turned off this option for CONFIG_PROFILE_ALL_BRANCHES - commit a76bcf557ef4 ("Kbuild: enable -Wmaybe-uninitialized warning for "make W=1"") turned off this option for GCC < 4.9 Arnd provided more explanation in https://lkml.org/lkml/2017/3/14/903 I think this looks better by shifting the logic from Makefile to Kconfig. Link: https://github.com/ClangBuiltLinux/linux/issues/350 Signed-off-by: Masahiro Yamada Reviewed-by: Nathan Chancellor Tested-by: Nick Desaulniers --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/trace') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index fa8b1fe824f3..8bd1d6d001d7 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -370,6 +370,7 @@ config PROFILE_ANNOTATED_BRANCHES config PROFILE_ALL_BRANCHES bool "Profile all if conditionals" if !FORTIFY_SOURCE select TRACE_BRANCH_PROFILING + imply CC_DISABLE_WARN_MAYBE_UNINITIALIZED # avoid false positives help This tracer profiles all branch conditions. Every if () taken in the kernel is recorded whether it hit or miss. -- cgit v1.2.3 From 6a072128d262d2b98d31626906a96700d1fc11eb Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 23 Aug 2018 13:25:34 +0300 Subject: tracing: Fix event filters and triggers to handle negative numbers Then tracing syscall exit event it is extremely useful to filter exit codes equal to some negative value, to react only to required errors. But negative numbers does not work: [root@snorch sys_exit_read]# echo "ret == -1" > filter bash: echo: write error: Invalid argument [root@snorch sys_exit_read]# cat filter ret == -1 ^ parse_error: Invalid value (did you forget quotes)? Similar thing happens when setting triggers. These is a regression in v4.17 introduced by the commit mentioned below, testing without these commit shows no problem with negative numbers. Link: http://lkml.kernel.org/r/20180823102534.7642-1-ptikhomirov@virtuozzo.com Cc: stable@vger.kernel.org Fixes: 80765597bc58 ("tracing: Rewrite filter logic to be simpler and faster") Signed-off-by: Pavel Tikhomirov Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 27821480105e..217ef481fbbb 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1301,7 +1301,7 @@ static int parse_pred(const char *str, void *data, /* go past the last quote */ i++; - } else if (isdigit(str[i])) { + } else if (isdigit(str[i]) || str[i] == '-') { /* Make sure the field is not a string */ if (is_string_field(field)) { @@ -1314,6 +1314,9 @@ static int parse_pred(const char *str, void *data, goto err_free; } + if (str[i] == '-') + i++; + /* We allow 0xDEADBEEF */ while (isalnum(str[i])) i++; -- cgit v1.2.3 From 49ef5f45701c3dedc6aa6efee5d03756fea0f99d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 22 Feb 2019 01:16:43 +0900 Subject: tracing/kprobes: Use probe_kernel_read instead of probe_mem_read Use probe_kernel_read() instead of probe_mem_read() because probe_mem_read() is a kind of wrapper for switching memory read function between uprobes and kprobes. Link: http://lkml.kernel.org/r/20190222011643.3e19ade84a3db3e83518648f@kernel.org Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 9eaf07f99212..99592c27465e 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -865,7 +865,7 @@ fetch_store_strlen(unsigned long addr) u8 c; do { - ret = probe_mem_read(&c, (u8 *)addr + len, 1); + ret = probe_kernel_read(&c, (u8 *)addr + len, 1); len++; } while (c && ret == 0 && len < MAX_STRING_SIZE); -- cgit v1.2.3 From ed581aaf99be10883c8364df16bd80a7b8f72efc Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 4 Feb 2019 15:07:23 -0600 Subject: tracing: Use str_has_prefix() in synth_event_create() Since we now have a str_has_prefix() that returns the length, we can use that instead of explicitly calculating it. Link: http://lkml.kernel.org/r/03418373fd1e80030e7394b8e3e081c5de28a710.1549309756.git.tom.zanussi@linux.intel.com Cc: Joe Perches Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 66386ba1425f..5b03b9a869bb 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1316,8 +1316,8 @@ static int synth_event_create(int argc, const char **argv) /* This interface accepts group name prefix */ if (strchr(name, '/')) { - len = sizeof(SYNTH_SYSTEM "/") - 1; - if (strncmp(name, SYNTH_SYSTEM "/", len)) + len = str_has_prefix(name, SYNTH_SYSTEM "/"); + if (len == 0) return -EINVAL; name += len; } -- cgit v1.2.3 From 9f0bbf3115ca9f91f43b7c74e9ac7d79f47fc6c2 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 4 Feb 2019 15:07:24 -0600 Subject: tracing: Use strncpy instead of memcpy for string keys in hist triggers Because there may be random garbage beyond a string's null terminator, it's not correct to copy the the complete character array for use as a hist trigger key. This results in multiple histogram entries for the 'same' string key. So, in the case of a string key, use strncpy instead of memcpy to avoid copying in the extra bytes. Before, using the gdbus entries in the following hist trigger as an example: # echo 'hist:key=comm' > /sys/kernel/debug/tracing/events/sched/sched_waking/trigger # cat /sys/kernel/debug/tracing/events/sched/sched_waking/hist ... { comm: ImgDecoder #4 } hitcount: 203 { comm: gmain } hitcount: 213 { comm: gmain } hitcount: 216 { comm: StreamTrans #73 } hitcount: 221 { comm: mozStorage #3 } hitcount: 230 { comm: gdbus } hitcount: 233 { comm: StyleThread#5 } hitcount: 253 { comm: gdbus } hitcount: 256 { comm: gdbus } hitcount: 260 { comm: StyleThread#4 } hitcount: 271 ... # cat /sys/kernel/debug/tracing/events/sched/sched_waking/hist | egrep gdbus | wc -l 51 After: # cat /sys/kernel/debug/tracing/events/sched/sched_waking/hist | egrep gdbus | wc -l 1 Link: http://lkml.kernel.org/r/50c35ae1267d64eee975b8125e151e600071d4dc.1549309756.git.tom.zanussi@linux.intel.com Cc: Namhyung Kim Cc: stable@vger.kernel.org Fixes: 79e577cbce4c4 ("tracing: Support string type key properly") Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 5b03b9a869bb..c7774fa119a7 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5157,9 +5157,10 @@ static inline void add_to_key(char *compound_key, void *key, /* ensure NULL-termination */ if (size > key_field->size - 1) size = key_field->size - 1; - } - memcpy(compound_key + key_field->offset, key, size); + strncpy(compound_key + key_field->offset, (char *)key, size); + } else + memcpy(compound_key + key_field->offset, key, size); } static void -- cgit v1.2.3 From 27242c62b141240079d1ac8d35adcdc70cae8895 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 5 Mar 2019 10:11:59 -0600 Subject: tracing: Use strncpy instead of memcpy when copying comm for hist triggers Because there may be random garbage beyond a string's null terminator, code that might use the entire comm array e.g. histogram keys, can give unexpected results if that garbage is copied in too, so avoid that possibility by using strncpy instead of memcpy. Link: http://lkml.kernel.org/r/1eb9f096a8086c3c82c7fc087c900005143cec54.1551802084.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index c7774fa119a7..ca46339f3009 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2141,7 +2141,7 @@ static inline void save_comm(char *comm, struct task_struct *task) return; } - memcpy(comm, task->comm, TASK_COMM_LEN); + strncpy(comm, task->comm, TASK_COMM_LEN); } static void hist_elt_data_free(struct hist_elt_data *elt_data) @@ -3557,7 +3557,7 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data) elt_data = context->elt->private_data; track_elt_data = track_data->elt.private_data; if (elt_data->comm) - memcpy(track_elt_data->comm, elt_data->comm, TASK_COMM_LEN); + strncpy(track_elt_data->comm, elt_data->comm, TASK_COMM_LEN); track_data->updated = true; -- cgit v1.2.3 From 85f726a35e504418607b77c5e7da165dc1ea63ce Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 5 Mar 2019 10:12:00 -0600 Subject: tracing: Use strncpy instead of memcpy when copying comm in trace.c Because there may be random garbage beyond a string's null terminator, code that might use the entire comm array e.g. histogram keys, can give unexpected results if that garbage is copied in too, so avoid that possibility by using strncpy instead of memcpy. Link: http://lkml.kernel.org/r/1d6ebac26570c2a29ce9fb575379f17ef5c8b81b.1551802084.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Suggested-by: Steven Rostedt (VMware) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3835f7ed3293..e9cc47e59d25 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1497,7 +1497,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) max_data->critical_start = data->critical_start; max_data->critical_end = data->critical_end; - memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN); + strncpy(max_data->comm, tsk->comm, TASK_COMM_LEN); max_data->pid = tsk->pid; /* * If tsk == current, then use current_uid(), as that does not use @@ -1923,7 +1923,7 @@ static inline char *get_saved_cmdlines(int idx) static inline void set_cmdline(int idx, const char *cmdline) { - memcpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); + strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); } static int allocate_cmdlines_buffer(unsigned int val, -- cgit v1.2.3 From 0841625201b649c0b1bf0f0e25cf4401e68fb8fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valdis=20Kl=C4=93tnieks?= Date: Tue, 12 Mar 2019 04:52:58 -0400 Subject: tracing/probes: Make reserved_field_names static sparse complains: CHECK kernel/trace/trace_probe.c kernel/trace/trace_probe.c:16:12: warning: symbol 'reserved_field_names' was not declared. Should it be static? Yes, it should be static. Link: http://lkml.kernel.org/r/2478.1552380778@turing-police Signed-off-by: Valdis Kletnieks Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 89da34b326e3..cfcf77e6fb19 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -13,7 +13,7 @@ #include "trace_probe.h" -const char *reserved_field_names[] = { +static const char *reserved_field_names[] = { "common_type", "common_flags", "common_preempt_count", -- cgit v1.2.3 From cede666e2eb28dc8a680d1622fded533769f07a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valdis=20Kl=C4=93tnieks?= Date: Tue, 12 Mar 2019 04:58:32 -0400 Subject: trace/probes: Remove kernel doc style from non kernel doc comment CC kernel/trace/trace_kprobe.o kernel/trace/trace_kprobe.c:41: warning: cannot understand function prototype: 'struct trace_kprobe ' The real problem is that a comment looked like kerneldoc when it shouldn't be... Link: http://lkml.kernel.org/r/2812.1552381112@turing-police Signed-off-by: Valdis Kletnieks Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d5fb09ebba8b..ceafa0a2b1d1 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -35,7 +35,7 @@ static struct dyn_event_operations trace_kprobe_ops = { .match = trace_kprobe_match, }; -/** +/* * Kprobe event core functions */ struct trace_kprobe { -- cgit v1.2.3 From 31b265b3baaf55f209229888b7ffea523ddab366 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 8 Mar 2019 11:32:04 -0800 Subject: tracing: kdb: Fix ftdump to not sleep As reported back in 2016-11 [1], the "ftdump" kdb command triggers a BUG for "sleeping function called from invalid context". kdb's "ftdump" command wants to call ring_buffer_read_prepare() in atomic context. A very simple solution for this is to add allocation flags to ring_buffer_read_prepare() so kdb can call it without triggering the allocation error. This patch does that. Note that in the original email thread about this, it was suggested that perhaps the solution for kdb was to either preallocate the buffer ahead of time or create our own iterator. I'm hoping that this alternative of adding allocation flags to ring_buffer_read_prepare() can be considered since it means I don't need to duplicate more of the core trace code into "trace_kdb.c" (for either creating my own iterator or re-preparing a ring allocator whose memory was already allocated). NOTE: another option for kdb is to actually figure out how to make it reuse the existing ftrace_dump() function and totally eliminate the duplication. This sounds very appealing and actually works (the "sr z" command can be seen to properly dump the ftrace buffer). The downside here is that ftrace_dump() fully consumes the trace buffer. Unless that is changed I'd rather not use it because it means "ftdump | grep xyz" won't be very useful to search the ftrace buffer since it will throw away the whole trace on the first grep. A future patch to dump only the last few lines of the buffer will also be hard to implement. [1] https://lkml.kernel.org/r/20161117191605.GA21459@google.com Link: http://lkml.kernel.org/r/20190308193205.213659-1-dianders@chromium.org Reported-by: Brian Norris Signed-off-by: Douglas Anderson Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 5 +++-- kernel/trace/trace.c | 6 ++++-- kernel/trace/trace_kdb.c | 6 ++++-- 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9a91479bbbfe..41b6f96e5366 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4191,6 +4191,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume); * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer * @buffer: The ring buffer to read from * @cpu: The cpu buffer to iterate over + * @flags: gfp flags to use for memory allocation * * This performs the initial preparations necessary to iterate * through the buffer. Memory is allocated, buffer recording @@ -4208,7 +4209,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume); * This overall must be paired with ring_buffer_read_finish. */ struct ring_buffer_iter * -ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) +ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu, gfp_t flags) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_iter *iter; @@ -4216,7 +4217,7 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; - iter = kmalloc(sizeof(*iter), GFP_KERNEL); + iter = kmalloc(sizeof(*iter), flags); if (!iter) return NULL; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e9cc47e59d25..ccd759eaad79 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4077,7 +4077,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu); + ring_buffer_read_prepare(iter->trace_buffer->buffer, + cpu, GFP_KERNEL); } ring_buffer_read_prepare_sync(); for_each_tracing_cpu(cpu) { @@ -4087,7 +4088,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu); + ring_buffer_read_prepare(iter->trace_buffer->buffer, + cpu, GFP_KERNEL); ring_buffer_read_prepare_sync(); ring_buffer_read_start(iter->buffer_iter[cpu]); tracing_iter_reset(iter, cpu); diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index d953c163a079..810d78a8d14c 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -51,14 +51,16 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) if (cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter.buffer_iter[cpu] = - ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu); + ring_buffer_read_prepare(iter.trace_buffer->buffer, + cpu, GFP_ATOMIC); ring_buffer_read_start(iter.buffer_iter[cpu]); tracing_iter_reset(&iter, cpu); } } else { iter.cpu_file = cpu_file; iter.buffer_iter[cpu_file] = - ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file); + ring_buffer_read_prepare(iter.trace_buffer->buffer, + cpu_file, GFP_ATOMIC); ring_buffer_read_start(iter.buffer_iter[cpu_file]); tracing_iter_reset(&iter, cpu_file); } -- cgit v1.2.3 From f6d85f04e29859dd3ea65395c05925da352dae89 Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Mon, 14 Jan 2019 21:31:13 +0100 Subject: blkcg: annotate implicit fall through There is a plan to build the kernel with -Wimplicit-fallthrough and this place in the code produced a warning (W=1). This commit remove the following warning: kernel/trace/blktrace.c:725:9: warning: this statement may fall through [-Wimplicit-fallthrough=] Signed-off-by: Mathieu Malaterre Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/trace') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index fac0ddf8a8e2..e1c6d79fb4cc 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -723,6 +723,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) #endif case BLKTRACESTART: start = 1; + /* fall through */ case BLKTRACESTOP: ret = __blk_trace_startstop(q, start); break; -- cgit v1.2.3 From 287c038c0b994dae7569d96eca154f6a7ff6b4a9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 14 Mar 2019 13:30:09 +0900 Subject: tracing/probe: Check maxactive error cases Check maxactive on kprobe error case, because maxactive is only for kretprobe, not for kprobe. Also, maxactive should not be 0, it should be at least 1. Link: http://lkml.kernel.org/r/155253780952.14922.15784129810238750331.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ceafa0a2b1d1..a14837545295 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -624,7 +624,11 @@ static int trace_kprobe_create(int argc, const char *argv[]) if (event) event++; - if (is_return && isdigit(argv[0][1])) { + if (isdigit(argv[0][1])) { + if (!is_return) { + pr_info("Maxactive is not for kprobe"); + return -EINVAL; + } if (event) len = event - &argv[0][1] - 1; else @@ -634,8 +638,8 @@ static int trace_kprobe_create(int argc, const char *argv[]) memcpy(buf, &argv[0][1], len); buf[len] = '\0'; ret = kstrtouint(buf, 0, &maxactive); - if (ret) { - pr_info("Failed to parse maxactive.\n"); + if (ret || !maxactive) { + pr_info("Invalid maxactive number\n"); return ret; } /* kretprobes instances are iterated over via a list. The -- cgit v1.2.3 From dec65d79fd269d05427c8167090bfc9c3d0b56c4 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 14 Mar 2019 13:30:20 +0900 Subject: tracing/probe: Check event name length correctly Ensure given name of event is not too long when parsing it, and fix to update event name offset correctly when the group name is given. For example, this makes probe event to check the "p:foo/" error case correctly. Link: http://lkml.kernel.org/r/155253782046.14922.14724124823730168629.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index cfcf77e6fb19..0dc13bff2847 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -159,6 +159,7 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, char *buf) { const char *slash, *event = *pevent; + int len; slash = strchr(event, '/'); if (slash) { @@ -173,10 +174,15 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, strlcpy(buf, event, slash - event + 1); *pgroup = buf; *pevent = slash + 1; + event = *pevent; } - if (strlen(event) == 0) { + len = strlen(event); + if (len == 0) { pr_info("Event name is not specified\n"); return -EINVAL; + } else if (len > MAX_EVENT_NAME_LEN) { + pr_info("Event name is too long\n"); + return -E2BIG; } return 0; } -- cgit v1.2.3 From b4443c17a3c9d652dc5d7679ddca867ee3cdaa9c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 14 Mar 2019 13:30:30 +0900 Subject: tracing/probe: Check the size of argument name and body Check the size of argument name and expression is not 0 and smaller than maximum length. Link: http://lkml.kernel.org/r/155253783029.14922.12650939303827581096.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 2 ++ kernel/trace/trace_probe.h | 1 + 2 files changed, 3 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 0dc13bff2847..bf9d3d7e0c9d 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -554,6 +554,8 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg, body = strchr(arg, '='); if (body) { + if (body - arg > MAX_ARG_NAME_LEN || body == arg) + return -EINVAL; parg->name = kmemdup_nul(arg, body - arg, GFP_KERNEL); body++; } else { diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 8a63f8bc01bc..2177c206de15 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -32,6 +32,7 @@ #define MAX_TRACE_ARGS 128 #define MAX_ARGSTR_LEN 63 #define MAX_ARRAY_LEN 64 +#define MAX_ARG_NAME_LEN 32 #define MAX_STRING_SIZE PATH_MAX /* Reserved field names */ -- cgit v1.2.3 From 5b7a96220900e3c3f6fb53908eb4602cda959376 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 14 Mar 2019 13:30:40 +0900 Subject: tracing/probe: Check event/group naming rule at parsing Check event and group naming rule at parsing it instead of allocating probes. Link: http://lkml.kernel.org/r/155253784064.14922.2336893061156236237.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 7 +------ kernel/trace/trace_probe.c | 8 ++++++++ kernel/trace/trace_uprobe.c | 5 +---- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index a14837545295..5265117ad7e8 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -221,7 +221,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, tk->rp.maxactive = maxactive; - if (!event || !is_good_name(event)) { + if (!event || !group) { ret = -EINVAL; goto error; } @@ -231,11 +231,6 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, if (!tk->tp.call.name) goto error; - if (!group || !is_good_name(group)) { - ret = -EINVAL; - goto error; - } - tk->tp.class.system = kstrdup(group, GFP_KERNEL); if (!tk->tp.class.system) goto error; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index bf9d3d7e0c9d..8f8411e7835f 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -172,6 +172,10 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, return -E2BIG; } strlcpy(buf, event, slash - event + 1); + if (!is_good_name(buf)) { + pr_info("Group name must follow the same rules as C identifiers\n"); + return -EINVAL; + } *pgroup = buf; *pevent = slash + 1; event = *pevent; @@ -184,6 +188,10 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, pr_info("Event name is too long\n"); return -E2BIG; } + if (!is_good_name(event)) { + pr_info("Event name must follow the same rules as C identifiers\n"); + return -EINVAL; + } return 0; } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index e335576b9411..52f033489377 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -266,10 +266,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) { struct trace_uprobe *tu; - if (!event || !is_good_name(event)) - return ERR_PTR(-EINVAL); - - if (!group || !is_good_name(group)) + if (!event || !group) return ERR_PTR(-EINVAL); tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL); -- cgit v1.2.3 From a039480e9e93896cadc5a91468964febb3c5d488 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 14 Mar 2019 13:30:50 +0900 Subject: tracing/probe: Verify alloc_trace_*probe() result Since alloc_trace_*probe() returns -EINVAL only if !event && !group, it should not happen in trace_*probe_create(). If we catch that case there is a bug. So use WARN_ON_ONCE() instead of pr_info(). Link: http://lkml.kernel.org/r/155253785078.14922.16902223633734601469.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 4 ++-- kernel/trace/trace_uprobe.c | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5265117ad7e8..63aa0ab56051 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -693,9 +693,9 @@ static int trace_kprobe_create(int argc, const char *argv[]) tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, argc, is_return); if (IS_ERR(tk)) { - pr_info("Failed to allocate trace_probe.(%d)\n", - (int)PTR_ERR(tk)); ret = PTR_ERR(tk); + /* This must return -ENOMEM otherwise there is a bug */ + WARN_ON_ONCE(ret != -ENOMEM); goto out; } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 52f033489377..b54137ec7810 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -514,8 +514,9 @@ static int trace_uprobe_create(int argc, const char **argv) tu = alloc_trace_uprobe(group, event, argc, is_return); if (IS_ERR(tu)) { - pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu)); ret = PTR_ERR(tu); + /* This must return -ENOMEM otherwise there is a bug */ + WARN_ON_ONCE(ret != -ENOMEM); goto fail_address_parse; } tu->offset = offset; -- cgit v1.2.3 From ff9d31d0d46672e201fc9ff59c42f1eef5f00c77 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 20 Mar 2019 12:53:33 -0500 Subject: tracing: Remove unnecessary var_ref destroy in track_data_destroy() Commit 656fe2ba85e8 (tracing: Use hist trigger's var_ref array to destroy var_refs) centralized the destruction of all the var_refs in one place so that other code didn't have to do it. The track_data_destroy() added later ignored that and also destroyed the track_data var_ref, causing a double-free error flagged by KASAN. ================================================================== BUG: KASAN: use-after-free in destroy_hist_field+0x30/0x70 Read of size 8 at addr ffff888086df2210 by task bash/1694 CPU: 6 PID: 1694 Comm: bash Not tainted 5.1.0-rc1-test+ #15 Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016 Call Trace: dump_stack+0x71/0xa0 ? destroy_hist_field+0x30/0x70 print_address_description.cold.3+0x9/0x1fb ? destroy_hist_field+0x30/0x70 ? destroy_hist_field+0x30/0x70 kasan_report.cold.4+0x1a/0x33 ? __kasan_slab_free+0x100/0x150 ? destroy_hist_field+0x30/0x70 destroy_hist_field+0x30/0x70 track_data_destroy+0x55/0xe0 destroy_hist_data+0x1f0/0x350 hist_unreg_all+0x203/0x220 event_trigger_open+0xbb/0x130 do_dentry_open+0x296/0x700 ? stacktrace_count_trigger+0x30/0x30 ? generic_permission+0x56/0x200 ? __x64_sys_fchdir+0xd0/0xd0 ? inode_permission+0x55/0x200 ? security_inode_permission+0x18/0x60 path_openat+0x633/0x22b0 ? path_lookupat.isra.50+0x420/0x420 ? __kasan_kmalloc.constprop.12+0xc1/0xd0 ? kmem_cache_alloc+0xe5/0x260 ? getname_flags+0x6c/0x2a0 ? do_sys_open+0x149/0x2b0 ? do_syscall_64+0x73/0x1b0 ? entry_SYSCALL_64_after_hwframe+0x44/0xa9 ? _raw_write_lock_bh+0xe0/0xe0 ? __kernel_text_address+0xe/0x30 ? unwind_get_return_address+0x2f/0x50 ? __list_add_valid+0x2d/0x70 ? deactivate_slab.isra.62+0x1f4/0x5a0 ? getname_flags+0x6c/0x2a0 ? set_track+0x76/0x120 do_filp_open+0x11a/0x1a0 ? may_open_dev+0x50/0x50 ? _raw_spin_lock+0x7a/0xd0 ? _raw_write_lock_bh+0xe0/0xe0 ? __alloc_fd+0x10f/0x200 do_sys_open+0x1db/0x2b0 ? filp_open+0x50/0x50 do_syscall_64+0x73/0x1b0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7fa7b24a4ca2 Code: 25 00 00 41 00 3d 00 00 41 00 74 4c 48 8d 05 85 7a 0d 00 8b 00 85 c0 75 6d 89 f2 b8 01 01 00 00 48 89 fe bf 9c ff ff ff 0f 05 <48> 3d 00 f0 ff ff 0f 87 a2 00 00 00 48 8b 4c 24 28 64 48 33 0c 25 RSP: 002b:00007fffbafb3af0 EFLAGS: 00000246 ORIG_RAX: 0000000000000101 RAX: ffffffffffffffda RBX: 000055d3648ade30 RCX: 00007fa7b24a4ca2 RDX: 0000000000000241 RSI: 000055d364a55240 RDI: 00000000ffffff9c RBP: 00007fffbafb3bf0 R08: 0000000000000020 R09: 0000000000000002 R10: 00000000000001b6 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000003 R14: 0000000000000001 R15: 000055d364a55240 ================================================================== So remove the track_data_destroy() destroy_hist_field() call for that var_ref. Link: http://lkml.kernel.org/r/1deffec420f6a16d11dd8647318d34a66d1989a9.camel@linux.intel.com Fixes: 466f4528fbc69 ("tracing: Generalize hist trigger onmax and save action") Reported-by: Steven Rostedt (VMware) Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index ca46339f3009..795aa2038377 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3713,7 +3713,6 @@ static void track_data_destroy(struct hist_trigger_data *hist_data, struct trace_event_file *file = hist_data->event_file; destroy_hist_field(data->track_data.track_var, 0); - destroy_hist_field(data->track_data.var_ref, 0); if (data->action == ACTION_SNAPSHOT) { struct track_data *track_data; -- cgit v1.2.3 From 3dee10da2e9ff220e054a8f158cc296c797fbe81 Mon Sep 17 00:00:00 2001 From: Frank Rowand Date: Thu, 21 Mar 2019 23:58:20 -0700 Subject: tracing: initialize variable in create_dyn_event() Fix compile warning in create_dyn_event(): 'ret' may be used uninitialized in this function [-Wuninitialized]. Link: http://lkml.kernel.org/r/1553237900-8555-1-git-send-email-frowand.list@gmail.com Cc: Masami Hiramatsu Cc: Ingo Molnar Cc: Tom Zanussi Cc: Ravi Bangoria Cc: stable@vger.kernel.org Fixes: 5448d44c3855 ("tracing: Add unified dynamic event framework") Signed-off-by: Frank Rowand Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_dynevent.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index dd1f43588d70..fa100ed3b4de 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -74,7 +74,7 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type) static int create_dyn_event(int argc, char **argv) { struct dyn_event_operations *ops; - int ret; + int ret = -ENODEV; if (argv[0][0] == '-' || argv[0][0] == '!') return dyn_event_release(argc, argv, NULL); -- cgit v1.2.3 From 9efb85c5cfac7e1f0caae4471446d936ff2163fe Mon Sep 17 00:00:00 2001 From: Hariprasad Kelam Date: Sun, 24 Mar 2019 00:05:23 +0530 Subject: ftrace: Fix warning using plain integer as NULL & spelling corrections Changed 0 --> NULL to avoid sparse warning Corrected spelling mistakes reported by checkpatch.pl Sparse warning below: sudo make C=2 CF=-D__CHECK_ENDIAN__ M=kernel/trace CHECK kernel/trace/ftrace.c kernel/trace/ftrace.c:3007:24: warning: Using plain integer as NULL pointer kernel/trace/ftrace.c:4758:37: warning: Using plain integer as NULL pointer Link: http://lkml.kernel.org/r/20190323183523.GA2244@hari-Inspiron-1545 Signed-off-by: Hariprasad Kelam Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fa79323331b2..26c8ca9bd06b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1992,7 +1992,7 @@ static void print_bug_type(void) * modifying the code. @failed should be one of either: * EFAULT - if the problem happens on reading the @ip address * EINVAL - if what is read at @ip is not what was expected - * EPERM - if the problem happens on writting to the @ip address + * EPERM - if the problem happens on writing to the @ip address */ void ftrace_bug(int failed, struct dyn_ftrace *rec) { @@ -2391,7 +2391,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); } - return -1; /* unknow ftrace bug */ + return -1; /* unknown ftrace bug */ } void __weak ftrace_replace_code(int mod_flags) @@ -3004,7 +3004,7 @@ ftrace_allocate_pages(unsigned long num_to_init) int cnt; if (!num_to_init) - return 0; + return NULL; start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL); if (!pg) @@ -4755,7 +4755,7 @@ static int ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, int reset, int enable) { - return ftrace_set_hash(ops, 0, 0, ip, remove, reset, enable); + return ftrace_set_hash(ops, NULL, 0, ip, remove, reset, enable); } /** @@ -5463,7 +5463,7 @@ void ftrace_create_filter_files(struct ftrace_ops *ops, /* * The name "destroy_filter_files" is really a misnomer. Although - * in the future, it may actualy delete the files, but this is + * in the future, it may actually delete the files, but this is * really intended to make sure the ops passed in are disabled * and that when this function returns, the caller is free to * free the ops. @@ -5786,7 +5786,7 @@ void ftrace_module_enable(struct module *mod) /* * If the tracing is enabled, go ahead and enable the record. * - * The reason not to enable the record immediatelly is the + * The reason not to enable the record immediately is the * inherent check of ftrace_make_nop/ftrace_make_call for * correct previous instructions. Making first the NOP * conversion puts the module to the correct state, thus -- cgit v1.2.3 From d08e411397cb6fcb3d3fb075c27a41975c99e88f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 7 Nov 2016 16:26:36 -0500 Subject: tracing/syscalls: Pass in hardcoded 6 into syscall_get_arguments() The only users that calls syscall_get_arguments() with a variable and not a hard coded '6' is ftrace_syscall_enter(). syscall_get_arguments() can be optimized by removing a variable input, and always grabbing 6 arguments regardless of what the system call actually uses. Change ftrace_syscall_enter() to pass the 6 args into a local stack array and copy the necessary arguments into the trace event as needed. This is needed to remove two parameters from syscall_get_arguments(). Link: http://lkml.kernel.org/r/20161107213233.627583542@goodmis.org Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_syscalls.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index f93a56d2db27..e9f5bbbad6d9 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -314,6 +314,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) struct ring_buffer_event *event; struct ring_buffer *buffer; unsigned long irq_flags; + unsigned long args[6]; int pc; int syscall_nr; int size; @@ -347,7 +348,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) entry = ring_buffer_event_data(event); entry->nr = syscall_nr; - syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); + syscall_get_arguments(current, regs, 0, 6, args); + memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args); event_trigger_unlock_commit(trace_file, buffer, event, entry, irq_flags, pc); @@ -583,6 +585,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; struct hlist_head *head; + unsigned long args[6]; bool valid_prog_array; int syscall_nr; int rctx; @@ -613,8 +616,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) return; rec->nr = syscall_nr; - syscall_get_arguments(current, regs, 0, sys_data->nb_args, - (unsigned long *)&rec->args); + syscall_get_arguments(current, regs, 0, 6, args); + memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args); if ((valid_prog_array && !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) || -- cgit v1.2.3 From b35f549df1d7520d37ba1e6d4a8d4df6bd52d136 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 7 Nov 2016 16:26:37 -0500 Subject: syscalls: Remove start and number from syscall_get_arguments() args At Linux Plumbers, Andy Lutomirski approached me and pointed out that the function call syscall_get_arguments() implemented in x86 was horribly written and not optimized for the standard case of passing in 0 and 6 for the starting index and the number of system calls to get. When looking at all the users of this function, I discovered that all instances pass in only 0 and 6 for these arguments. Instead of having this function handle different cases that are never used, simply rewrite it to return the first 6 arguments of a system call. This should help out the performance of tracing system calls by ptrace, ftrace and perf. Link: http://lkml.kernel.org/r/20161107213233.754809394@goodmis.org Cc: Oleg Nesterov Cc: Kees Cook Cc: Andy Lutomirski Cc: Dominik Brodowski Cc: Dave Martin Cc: "Dmitry V. Levin" Cc: x86@kernel.org Cc: linux-snps-arc@lists.infradead.org Cc: linux-kernel@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-c6x-dev@linux-c6x.org Cc: uclinux-h8-devel@lists.sourceforge.jp Cc: linux-hexagon@vger.kernel.org Cc: linux-ia64@vger.kernel.org Cc: linux-mips@vger.kernel.org Cc: nios2-dev@lists.rocketboards.org Cc: openrisc@lists.librecores.org Cc: linux-parisc@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-riscv@lists.infradead.org Cc: linux-s390@vger.kernel.org Cc: linux-sh@vger.kernel.org Cc: sparclinux@vger.kernel.org Cc: linux-um@lists.infradead.org Cc: linux-xtensa@linux-xtensa.org Cc: linux-arch@vger.kernel.org Acked-by: Paul Burton # MIPS parts Acked-by: Max Filippov # For xtensa changes Acked-by: Will Deacon # For the arm64 bits Reviewed-by: Thomas Gleixner # for x86 Reviewed-by: Dmitry V. Levin Reported-by: Andy Lutomirski Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_syscalls.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index e9f5bbbad6d9..fa8fbff736d6 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -348,7 +348,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) entry = ring_buffer_event_data(event); entry->nr = syscall_nr; - syscall_get_arguments(current, regs, 0, 6, args); + syscall_get_arguments(current, regs, args); memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args); event_trigger_unlock_commit(trace_file, buffer, event, entry, @@ -616,7 +616,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) return; rec->nr = syscall_nr; - syscall_get_arguments(current, regs, 0, 6, args); + syscall_get_arguments(current, regs, args); memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args); if ((valid_prog_array && -- cgit v1.2.3 From 15fab63e1e57be9fdb5eec1bbc5916e9825e9acb Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 5 Apr 2019 14:02:10 -0700 Subject: fs: prevent page refcount overflow in pipe_buf_get Change pipe_buf_get() to return a bool indicating whether it succeeded in raising the refcount of the page (if the thing in the pipe is a page). This removes another mechanism for overflowing the page refcount. All callers converted to handle a failure. Reported-by: Jann Horn Signed-off-by: Matthew Wilcox Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- kernel/trace/trace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c4238b441624..0f300d488c9f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6835,12 +6835,16 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, buf->private = 0; } -static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, +static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct buffer_ref *ref = (struct buffer_ref *)buf->private; + if (ref->ref > INT_MAX/2) + return false; + ref->ref++; + return true; } /* Pipe buffer operations for a buffer. */ -- cgit v1.2.3 From fabe38ab6b2bd9418350284c63825f13b8a6abba Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sun, 24 Feb 2019 01:50:20 +0900 Subject: kprobes: Mark ftrace mcount handler functions nokprobe Mark ftrace mcount handler functions nokprobe since probing on these functions with kretprobe pushes return address incorrectly on kretprobe shadow stack. Reported-by: Francis Deslauriers Tested-by: Andrea Righi Signed-off-by: Masami Hiramatsu Acked-by: Steven Rostedt Acked-by: Steven Rostedt (VMware) Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/155094062044.6137.6419622920568680640.stgit@devbox Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 26c8ca9bd06b..b920358dd8f7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -33,6 +33,7 @@ #include #include #include +#include #include @@ -6246,7 +6247,7 @@ void ftrace_reset_array_ops(struct trace_array *tr) tr->ops->func = ftrace_stub; } -static inline void +static nokprobe_inline void __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ignored, struct pt_regs *regs) { @@ -6306,11 +6307,13 @@ static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, { __ftrace_ops_list_func(ip, parent_ip, NULL, regs); } +NOKPROBE_SYMBOL(ftrace_ops_list_func); #else static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) { __ftrace_ops_list_func(ip, parent_ip, NULL, NULL); } +NOKPROBE_SYMBOL(ftrace_ops_no_ops); #endif /* @@ -6337,6 +6340,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, preempt_enable_notrace(); trace_clear_recursion(bit); } +NOKPROBE_SYMBOL(ftrace_ops_assist_func); /** * ftrace_ops_get_func - get the function a trampoline should call -- cgit v1.2.3 From b987222654f84f7b4ca95b3a55eca784cb30235b Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 4 Apr 2019 23:59:25 +0200 Subject: tracing: Fix buffer_ref pipe ops This fixes multiple issues in buffer_pipe_buf_ops: - The ->steal() handler must not return zero unless the pipe buffer has the only reference to the page. But generic_pipe_buf_steal() assumes that every reference to the pipe is tracked by the page's refcount, which isn't true for these buffers - buffer_pipe_buf_get(), which duplicates a buffer, doesn't touch the page's refcount. Fix it by using generic_pipe_buf_nosteal(), which refuses every attempted theft. It should be easy to actually support ->steal, but the only current users of pipe_buf_steal() are the virtio console and FUSE, and they also only use it as an optimization. So it's probably not worth the effort. - The ->get() and ->release() handlers can be invoked concurrently on pipe buffers backed by the same struct buffer_ref. Make them safe against concurrency by using refcount_t. - The pointers stored in ->private were only zeroed out when the last reference to the buffer_ref was dropped. As far as I know, this shouldn't be necessary anyway, but if we do it, let's always do it. Link: http://lkml.kernel.org/r/20190404215925.253531-1-jannh@google.com Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Al Viro Cc: stable@vger.kernel.org Fixes: 73a757e63114d ("ring-buffer: Return reader page back into existing ring buffer") Signed-off-by: Jann Horn Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 21153e64bf1c..0cfa13a60086 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7025,19 +7025,23 @@ struct buffer_ref { struct ring_buffer *buffer; void *page; int cpu; - int ref; + refcount_t refcount; }; +static void buffer_ref_release(struct buffer_ref *ref) +{ + if (!refcount_dec_and_test(&ref->refcount)) + return; + ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); + kfree(ref); +} + static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct buffer_ref *ref = (struct buffer_ref *)buf->private; - if (--ref->ref) - return; - - ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); - kfree(ref); + buffer_ref_release(ref); buf->private = 0; } @@ -7046,14 +7050,14 @@ static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, { struct buffer_ref *ref = (struct buffer_ref *)buf->private; - ref->ref++; + refcount_inc(&ref->refcount); } /* Pipe buffer operations for a buffer. */ static const struct pipe_buf_operations buffer_pipe_buf_ops = { .confirm = generic_pipe_buf_confirm, .release = buffer_pipe_buf_release, - .steal = generic_pipe_buf_steal, + .steal = generic_pipe_buf_nosteal, .get = buffer_pipe_buf_get, }; @@ -7066,11 +7070,7 @@ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i) struct buffer_ref *ref = (struct buffer_ref *)spd->partial[i].private; - if (--ref->ref) - return; - - ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); - kfree(ref); + buffer_ref_release(ref); spd->partial[i].private = 0; } @@ -7125,7 +7125,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, break; } - ref->ref = 1; + refcount_set(&ref->refcount, 1); ref->buffer = iter->trace_buffer->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); if (IS_ERR(ref->page)) { -- cgit v1.2.3 From 91862cc7867bba4ee5c8fcf0ca2f1d30427b6129 Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Fri, 19 Apr 2019 21:22:59 -0500 Subject: tracing: Fix a memory leak by early error exit in trace_pid_write() In trace_pid_write(), the buffer for trace parser is allocated through kmalloc() in trace_parser_get_init(). Later on, after the buffer is used, it is then freed through kfree() in trace_parser_put(). However, it is possible that trace_pid_write() is terminated due to unexpected errors, e.g., ENOMEM. In that case, the allocated buffer will not be freed, which is a memory leak bug. To fix this issue, free the allocated buffer when an error is encountered. Link: http://lkml.kernel.org/r/1555726979-15633-1-git-send-email-wang6495@umn.edu Fixes: f4d34a87e9c10 ("tracing: Use pid bitmap instead of a pid array for set_event_pid") Cc: stable@vger.kernel.org Signed-off-by: Wenwen Wang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0cfa13a60086..46f68fad6373 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -496,8 +496,10 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, * not modified. */ pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); - if (!pid_list) + if (!pid_list) { + trace_parser_put(&parser); return -ENOMEM; + } pid_list->pid_max = READ_ONCE(pid_max); @@ -507,6 +509,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); if (!pid_list->pids) { + trace_parser_put(&parser); kfree(pid_list); return -ENOMEM; } -- cgit v1.2.3 From d6097c9e4454adf1f8f2c9547c2fa6060d55d952 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Apr 2019 22:03:18 +0200 Subject: trace: Fix preempt_enable_no_resched() abuse Unless the very next line is schedule(), or implies it, one must not use preempt_enable_no_resched(). It can cause a preemption to go missing and thereby cause arbitrary delays, breaking the PREEMPT=y invariant. Link: http://lkml.kernel.org/r/20190423200318.GY14281@hirez.programming.kicks-ass.net Cc: Waiman Long Cc: Linus Torvalds Cc: Ingo Molnar Cc: Will Deacon Cc: Thomas Gleixner Cc: the arch/x86 maintainers Cc: Davidlohr Bueso Cc: Tim Chen Cc: huang ying Cc: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: stable@vger.kernel.org Fixes: 2c2d7329d8af ("tracing/ftrace: use preempt_enable_no_resched_notrace in ring_buffer_time_stamp()") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 41b6f96e5366..4ee8d8aa3d0f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -762,7 +762,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) preempt_disable_notrace(); time = rb_time_stamp(buffer); - preempt_enable_no_resched_notrace(); + preempt_enable_notrace(); return time; } -- cgit v1.2.3