diff options
Diffstat (limited to 'kernel/trace')
48 files changed, 5814 insertions, 1775 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 49de13cae428..e130da35808f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -1281,4 +1281,18 @@ config HIST_TRIGGERS_DEBUG source "kernel/trace/rv/Kconfig" +config TRACE_REMOTE + bool + +config SIMPLE_RING_BUFFER + bool + +config TRACE_REMOTE_TEST + tristate "Test module for remote tracing" + select TRACE_REMOTE + select SIMPLE_RING_BUFFER + help + This trace remote includes a ring-buffer writer implementation using + "simple_ring_buffer". This is solely intending for testing. + endif # FTRACE diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 04096c21d06b..8d3d96e847d8 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_TRACING) += trace_seq.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o obj-$(CONFIG_TRACING) += trace_pid.o +obj-$(CONFIG_TRACER_SNAPSHOT) += trace_snapshot.o obj-$(CONFIG_TRACING) += pid_list.o obj-$(CONFIG_TRACING_MAP) += tracing_map.o obj-$(CONFIG_PREEMPTIRQ_DELAY_TEST) += preemptirq_delay_test.o @@ -128,4 +129,37 @@ obj-$(CONFIG_FPROBE_EVENTS) += trace_fprobe.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o obj-$(CONFIG_RV) += rv/ +obj-$(CONFIG_TRACE_REMOTE) += trace_remote.o +obj-$(CONFIG_SIMPLE_RING_BUFFER) += simple_ring_buffer.o +obj-$(CONFIG_TRACE_REMOTE_TEST) += remote_test.o + +# simple_ring_buffer is used by the pKVM hypervisor which does not have access +# to all kernel symbols. Fail the build if forbidden symbols are found. + +# Basic compiler and tooling-generated symbols that can safely be left +# undefined. Ensure KASAN is enabled to avoid logic that may disable +# FORTIFY_SOURCE when KASAN is not enabled. undefsyms_base.o does not +# automatically get KASAN flags because it is not linked into vmlinux. 
+targets += undefsyms_base.o +KASAN_SANITIZE_undefsyms_base.o := y + +UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __msan \ + __aeabi_unwind_cpp __s390_indirect_jump __x86_indirect_thunk simple_ring_buffer \ + $(shell $(NM) -u $(obj)/undefsyms_base.o 2>/dev/null | awk '{print $$2}') + +quiet_cmd_check_undefined = NM $< + cmd_check_undefined = \ + undefsyms=$$($(NM) -u $< | grep -v $(addprefix -e , $(UNDEFINED_ALLOWLIST)) || true); \ + if [ -n "$$undefsyms" ]; then \ + echo "Unexpected symbols in $<:" >&2; \ + echo "$$undefsyms" >&2; \ + false; \ + fi; \ + touch $@ + +$(obj)/%.o.checked: $(obj)/%.o $(obj)/undefsyms_base.o FORCE + $(call if_changed,check_undefined) + +always-$(CONFIG_SIMPLE_RING_BUFFER) += simple_ring_buffer.o.checked + libftrace-y := ftrace.o diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0b040a417442..a02bd258677e 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2384,7 +2384,8 @@ static void bpf_kprobe_multi_link_release(struct bpf_link *link) struct bpf_kprobe_multi_link *kmulti_link; kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); - unregister_fprobe(&kmulti_link->fp); + /* Don't wait for RCU GP here. */ + unregister_fprobe_async(&kmulti_link->fp); kprobe_multi_put_modules(kmulti_link->mods, kmulti_link->mods_cnt); } @@ -2752,6 +2753,10 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (!is_kprobe_multi(prog)) return -EINVAL; + /* kprobe_multi is not allowed to be sleepable. */ + if (prog->sleepable) + return -EINVAL; + /* Writing to context is not allowed for kprobes. 
*/ if (prog->aux->kprobe_write_ctx) return -EINVAL; diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index dcadf1d23b8a..f378613ad120 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -4,6 +4,7 @@ */ #define pr_fmt(fmt) "fprobe: " fmt +#include <linux/cleanup.h> #include <linux/err.h> #include <linux/fprobe.h> #include <linux/kallsyms.h> @@ -78,36 +79,33 @@ static const struct rhashtable_params fprobe_rht_params = { }; /* Node insertion and deletion requires the fprobe_mutex */ -static int insert_fprobe_node(struct fprobe_hlist_node *node) +static int __insert_fprobe_node(struct fprobe_hlist_node *node, struct fprobe *fp) { + int ret; + lockdep_assert_held(&fprobe_mutex); - return rhltable_insert(&fprobe_ip_table, &node->hlist, fprobe_rht_params); + ret = rhltable_insert(&fprobe_ip_table, &node->hlist, fprobe_rht_params); + /* Set the fprobe pointer if insertion was successful. */ + if (!ret) + WRITE_ONCE(node->fp, fp); + return ret; } -/* Return true if there are synonims */ -static bool delete_fprobe_node(struct fprobe_hlist_node *node) +static void __delete_fprobe_node(struct fprobe_hlist_node *node) { lockdep_assert_held(&fprobe_mutex); - bool ret; - /* Avoid double deleting */ + /* Avoid double deleting and non-inserted nodes */ if (READ_ONCE(node->fp) != NULL) { WRITE_ONCE(node->fp, NULL); rhltable_remove(&fprobe_ip_table, &node->hlist, fprobe_rht_params); } - - rcu_read_lock(); - ret = !!rhltable_lookup(&fprobe_ip_table, &node->addr, - fprobe_rht_params); - rcu_read_unlock(); - - return ret; } /* Check existence of the fprobe */ -static bool is_fprobe_still_exist(struct fprobe *fp) +static bool fprobe_registered(struct fprobe *fp) { struct hlist_head *head; struct fprobe_hlist *fph; @@ -120,7 +118,7 @@ static bool is_fprobe_still_exist(struct fprobe *fp) } return false; } -NOKPROBE_SYMBOL(is_fprobe_still_exist); +NOKPROBE_SYMBOL(fprobe_registered); static int add_fprobe_hash(struct fprobe *fp) { @@ -132,9 +130,6 @@ static int 
add_fprobe_hash(struct fprobe *fp) if (WARN_ON_ONCE(!fph)) return -EINVAL; - if (is_fprobe_still_exist(fp)) - return -EEXIST; - head = &fprobe_table[hash_ptr(fp, FPROBE_HASH_BITS)]; hlist_add_head_rcu(&fp->hlist_array->hlist, head); return 0; @@ -149,7 +144,7 @@ static int del_fprobe_hash(struct fprobe *fp) if (WARN_ON_ONCE(!fph)) return -EINVAL; - if (!is_fprobe_still_exist(fp)) + if (!fprobe_registered(fp)) return -ENOENT; fph->fp = NULL; @@ -255,7 +250,65 @@ static inline int __fprobe_kprobe_handler(unsigned long ip, unsigned long parent return ret; } +static int fprobe_fgraph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, + struct ftrace_regs *fregs); +static void fprobe_return(struct ftrace_graph_ret *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs); + +static struct fgraph_ops fprobe_graph_ops = { + .entryfunc = fprobe_fgraph_entry, + .retfunc = fprobe_return, +}; +/* Number of fgraph fprobe nodes */ +static int nr_fgraph_fprobes; +/* Is fprobe_graph_ops registered? */ +static bool fprobe_graph_registered; + +/* Add @addrs to the ftrace filter and register fgraph if needed. */ +static int fprobe_graph_add_ips(unsigned long *addrs, int num) +{ + int ret; + + lockdep_assert_held(&fprobe_mutex); + + ret = ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 0, 0); + if (ret) + return ret; + + if (!fprobe_graph_registered) { + ret = register_ftrace_graph(&fprobe_graph_ops); + if (WARN_ON_ONCE(ret)) { + ftrace_free_filter(&fprobe_graph_ops.ops); + return ret; + } + fprobe_graph_registered = true; + } + return 0; +} + +static void __fprobe_graph_unregister(void) +{ + if (fprobe_graph_registered) { + unregister_ftrace_graph(&fprobe_graph_ops); + ftrace_free_filter(&fprobe_graph_ops.ops); + fprobe_graph_registered = false; + } +} + +/* Remove @addrs from the ftrace filter and unregister fgraph if possible. 
*/ +static void fprobe_graph_remove_ips(unsigned long *addrs, int num) +{ + lockdep_assert_held(&fprobe_mutex); + + if (!nr_fgraph_fprobes) + __fprobe_graph_unregister(); + else if (num) + ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0); +} + #if defined(CONFIG_DYNAMIC_FTRACE_WITH_ARGS) || defined(CONFIG_DYNAMIC_FTRACE_WITH_REGS) + /* ftrace_ops callback, this processes fprobes which have only entry_handler. */ static void fprobe_ftrace_entry(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct ftrace_regs *fregs) @@ -298,7 +351,10 @@ static struct ftrace_ops fprobe_ftrace_ops = { .func = fprobe_ftrace_entry, .flags = FTRACE_OPS_FL_SAVE_ARGS, }; -static int fprobe_ftrace_active; +/* Number of ftrace fprobe nodes */ +static int nr_ftrace_fprobes; +/* Is fprobe_ftrace_ops registered? */ +static bool fprobe_ftrace_registered; static int fprobe_ftrace_add_ips(unsigned long *addrs, int num) { @@ -310,25 +366,33 @@ static int fprobe_ftrace_add_ips(unsigned long *addrs, int num) if (ret) return ret; - if (!fprobe_ftrace_active) { + if (!fprobe_ftrace_registered) { ret = register_ftrace_function(&fprobe_ftrace_ops); if (ret) { ftrace_free_filter(&fprobe_ftrace_ops); return ret; } + fprobe_ftrace_registered = true; } - fprobe_ftrace_active++; return 0; } +static void __fprobe_ftrace_unregister(void) +{ + if (fprobe_ftrace_registered) { + unregister_ftrace_function(&fprobe_ftrace_ops); + ftrace_free_filter(&fprobe_ftrace_ops); + fprobe_ftrace_registered = false; + } +} + static void fprobe_ftrace_remove_ips(unsigned long *addrs, int num) { lockdep_assert_held(&fprobe_mutex); - fprobe_ftrace_active--; - if (!fprobe_ftrace_active) - unregister_ftrace_function(&fprobe_ftrace_ops); - if (num) + if (!nr_ftrace_fprobes) + __fprobe_ftrace_unregister(); + else if (num) ftrace_set_filter_ips(&fprobe_ftrace_ops, addrs, num, 1, 0); } @@ -337,12 +401,78 @@ static bool fprobe_is_ftrace(struct fprobe *fp) return !fp->exit_handler; } +/* Node insertion 
and deletion requires the fprobe_mutex */ +static int insert_fprobe_node(struct fprobe_hlist_node *node, struct fprobe *fp) +{ + int ret; + + lockdep_assert_held(&fprobe_mutex); + + ret = __insert_fprobe_node(node, fp); + if (!ret) { + if (fprobe_is_ftrace(fp)) + nr_ftrace_fprobes++; + else + nr_fgraph_fprobes++; + } + + return ret; +} + +static void delete_fprobe_node(struct fprobe_hlist_node *node) +{ + struct fprobe *fp; + + lockdep_assert_held(&fprobe_mutex); + + fp = READ_ONCE(node->fp); + if (fp) { + if (fprobe_is_ftrace(fp)) + nr_ftrace_fprobes--; + else + nr_fgraph_fprobes--; + } + __delete_fprobe_node(node); +} + +static bool fprobe_exists_on_hash(unsigned long ip, bool ftrace) +{ + struct rhlist_head *head, *pos; + struct fprobe_hlist_node *node; + struct fprobe *fp; + + guard(rcu)(); + head = rhltable_lookup(&fprobe_ip_table, &ip, + fprobe_rht_params); + if (!head) + return false; + /* We have to check the same type on the list. */ + rhl_for_each_entry_rcu(node, pos, head, hlist) { + if (node->addr != ip) + break; + fp = READ_ONCE(node->fp); + if (likely(fp)) { + if ((!ftrace && fp->exit_handler) || + (ftrace && !fp->exit_handler)) + return true; + } + } + + return false; +} + #ifdef CONFIG_MODULES -static void fprobe_set_ips(unsigned long *ips, unsigned int cnt, int remove, - int reset) +static void fprobe_remove_ips(unsigned long *ips, unsigned int cnt) { - ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, remove, reset); - ftrace_set_filter_ips(&fprobe_ftrace_ops, ips, cnt, remove, reset); + if (!nr_fgraph_fprobes) + __fprobe_graph_unregister(); + else if (cnt) + ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, 1, 0); + + if (!nr_ftrace_fprobes) + __fprobe_ftrace_unregister(); + else if (cnt) + ftrace_set_filter_ips(&fprobe_ftrace_ops, ips, cnt, 1, 0); } #endif #else @@ -360,11 +490,62 @@ static bool fprobe_is_ftrace(struct fprobe *fp) return false; } +/* Node insertion and deletion requires the fprobe_mutex */ +static int 
insert_fprobe_node(struct fprobe_hlist_node *node, struct fprobe *fp) +{ + int ret; + + lockdep_assert_held(&fprobe_mutex); + + ret = __insert_fprobe_node(node, fp); + if (!ret) + nr_fgraph_fprobes++; + + return ret; +} + +static void delete_fprobe_node(struct fprobe_hlist_node *node) +{ + struct fprobe *fp; + + lockdep_assert_held(&fprobe_mutex); + + fp = READ_ONCE(node->fp); + if (fp) + nr_fgraph_fprobes--; + __delete_fprobe_node(node); +} + +static bool fprobe_exists_on_hash(unsigned long ip, bool ftrace __maybe_unused) +{ + struct rhlist_head *head, *pos; + struct fprobe_hlist_node *node; + struct fprobe *fp; + + guard(rcu)(); + head = rhltable_lookup(&fprobe_ip_table, &ip, + fprobe_rht_params); + if (!head) + return false; + /* We only need to check fp is there. */ + rhl_for_each_entry_rcu(node, pos, head, hlist) { + if (node->addr != ip) + break; + fp = READ_ONCE(node->fp); + if (likely(fp)) + return true; + } + + return false; +} + #ifdef CONFIG_MODULES -static void fprobe_set_ips(unsigned long *ips, unsigned int cnt, int remove, - int reset) +static void fprobe_remove_ips(unsigned long *ips, unsigned int cnt) { - ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, remove, reset); + if (!nr_fgraph_fprobes) + __fprobe_graph_unregister(); + else if (cnt) + ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, 1, 0); } #endif #endif /* !CONFIG_DYNAMIC_FTRACE_WITH_ARGS && !CONFIG_DYNAMIC_FTRACE_WITH_REGS */ @@ -450,8 +631,6 @@ static int fprobe_fgraph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops used += FPROBE_HEADER_SIZE_IN_LONG + size_words; } } - if (used < reserved_words) - memset(fgraph_data + used, 0, reserved_words - used); /* If any exit_handler is set, data must be used. 
*/ return used != 0; @@ -482,7 +661,7 @@ static void fprobe_return(struct ftrace_graph_ret *trace, if (!fp) break; curr += FPROBE_HEADER_SIZE_IN_LONG; - if (is_fprobe_still_exist(fp) && !fprobe_disabled(fp)) { + if (fprobe_registered(fp) && !fprobe_disabled(fp)) { if (WARN_ON_ONCE(curr + size > size_words)) break; fp->exit_handler(fp, trace->func, ret_ip, fregs, @@ -494,51 +673,9 @@ static void fprobe_return(struct ftrace_graph_ret *trace, } NOKPROBE_SYMBOL(fprobe_return); -static struct fgraph_ops fprobe_graph_ops = { - .entryfunc = fprobe_fgraph_entry, - .retfunc = fprobe_return, -}; -static int fprobe_graph_active; - -/* Add @addrs to the ftrace filter and register fgraph if needed. */ -static int fprobe_graph_add_ips(unsigned long *addrs, int num) -{ - int ret; - - lockdep_assert_held(&fprobe_mutex); - - ret = ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 0, 0); - if (ret) - return ret; - - if (!fprobe_graph_active) { - ret = register_ftrace_graph(&fprobe_graph_ops); - if (WARN_ON_ONCE(ret)) { - ftrace_free_filter(&fprobe_graph_ops.ops); - return ret; - } - } - fprobe_graph_active++; - return 0; -} - -/* Remove @addrs from the ftrace filter and unregister fgraph if possible. */ -static void fprobe_graph_remove_ips(unsigned long *addrs, int num) -{ - lockdep_assert_held(&fprobe_mutex); - - fprobe_graph_active--; - /* Q: should we unregister it ? 
*/ - if (!fprobe_graph_active) - unregister_ftrace_graph(&fprobe_graph_ops); - - if (num) - ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0); -} - #ifdef CONFIG_MODULES -#define FPROBE_IPS_BATCH_INIT 8 +#define FPROBE_IPS_BATCH_INIT 128 /* instruction pointer address list */ struct fprobe_addr_list { int index; @@ -546,43 +683,29 @@ struct fprobe_addr_list { unsigned long *addrs; }; -static int fprobe_addr_list_add(struct fprobe_addr_list *alist, unsigned long addr) +static int fprobe_remove_node_in_module(struct module *mod, struct fprobe_hlist_node *node, + struct fprobe_addr_list *alist) { - unsigned long *addrs; - - /* Previously we failed to expand the list. */ - if (alist->index == alist->size) - return -ENOSPC; + lockdep_assert_in_rcu_read_lock(); - alist->addrs[alist->index++] = addr; - if (alist->index < alist->size) + if (!within_module(node->addr, mod)) return 0; - /* Expand the address list */ - addrs = kcalloc(alist->size * 2, sizeof(*addrs), GFP_KERNEL); - if (!addrs) - return -ENOMEM; - - memcpy(addrs, alist->addrs, alist->size * sizeof(*addrs)); - alist->size *= 2; - kfree(alist->addrs); - alist->addrs = addrs; - - return 0; -} - -static void fprobe_remove_node_in_module(struct module *mod, struct fprobe_hlist_node *node, - struct fprobe_addr_list *alist) -{ - if (!within_module(node->addr, mod)) - return; - if (delete_fprobe_node(node)) - return; + delete_fprobe_node(node); + /* If no address list is available, we can't track this address. */ + if (!alist->addrs) + return 0; /* - * If failed to update alist, just continue to update hlist. - * Therefore, at list user handler will not hit anymore. + * Don't care the type here, because all fprobes on the same + * address must be removed eventually. 
*/ - fprobe_addr_list_add(alist, node->addr); + if (!rhltable_lookup(&fprobe_ip_table, &node->addr, fprobe_rht_params)) { + alist->addrs[alist->index++] = node->addr; + if (alist->index == alist->size) + return -ENOSPC; + } + + return 0; } /* Handle module unloading to manage fprobe_ip_table. */ @@ -593,29 +716,48 @@ static int fprobe_module_callback(struct notifier_block *nb, struct fprobe_hlist_node *node; struct rhashtable_iter iter; struct module *mod = data; + bool retry; if (val != MODULE_STATE_GOING) return NOTIFY_DONE; alist.addrs = kcalloc(alist.size, sizeof(*alist.addrs), GFP_KERNEL); - /* If failed to alloc memory, we can not remove ips from hash. */ - if (!alist.addrs) - return NOTIFY_DONE; + /* + * If failed to alloc memory, ftrace_ops will not be able to remove ips from + * hash, but we can still remove nodes from fprobe_ip_table, so we can avoid + * the potential wrong callback. So just print a warning here and try to + * continue without address list. + */ + WARN_ONCE(!alist.addrs, + "Failed to allocate memory for fprobe_addr_list, ftrace_ops will not be updated"); mutex_lock(&fprobe_mutex); +again: + retry = false; + alist.index = 0; rhltable_walk_enter(&fprobe_ip_table, &iter); do { rhashtable_walk_start(&iter); while ((node = rhashtable_walk_next(&iter)) && !IS_ERR(node)) - fprobe_remove_node_in_module(mod, node, &alist); + if (fprobe_remove_node_in_module(mod, node, &alist) < 0) { + retry = true; + break; + } rhashtable_walk_stop(&iter); - } while (node == ERR_PTR(-EAGAIN)); + } while (node == ERR_PTR(-EAGAIN) && !retry); rhashtable_walk_exit(&iter); + /* Remove any ips from hash table(s) */ + fprobe_remove_ips(alist.addrs, alist.index); + /* + * If we break rhashtable walk loop except for -EAGAIN, we need + * to restart looping from start for safety. Anyway, this is + * not a hotpath. 
+ */ + if (retry) + goto again; - if (alist.index > 0) - fprobe_set_ips(alist.addrs, alist.index, 1, 0); mutex_unlock(&fprobe_mutex); kfree(alist.addrs); @@ -759,7 +901,6 @@ static int fprobe_init(struct fprobe *fp, unsigned long *addrs, int num) fp->hlist_array = hlist_array; hlist_array->fp = fp; for (i = 0; i < num; i++) { - hlist_array->array[i].fp = fp; addr = ftrace_location(addrs[i]); if (!addr) { fprobe_fail_cleanup(fp); @@ -823,6 +964,8 @@ int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter } EXPORT_SYMBOL_GPL(register_fprobe); +static int unregister_fprobe_nolock(struct fprobe *fp); + /** * register_fprobe_ips() - Register fprobe to ftrace by address. * @fp: A fprobe data structure to be registered. @@ -841,35 +984,33 @@ int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num) struct fprobe_hlist *hlist_array; int ret, i; + guard(mutex)(&fprobe_mutex); + if (fprobe_registered(fp)) + return -EEXIST; + ret = fprobe_init(fp, addrs, num); if (ret) return ret; - mutex_lock(&fprobe_mutex); - - hlist_array = fp->hlist_array; if (fprobe_is_ftrace(fp)) ret = fprobe_ftrace_add_ips(addrs, num); else ret = fprobe_graph_add_ips(addrs, num); - - if (!ret) { - add_fprobe_hash(fp); - for (i = 0; i < hlist_array->size; i++) { - ret = insert_fprobe_node(&hlist_array->array[i]); - if (ret) - break; - } - /* fallback on insert error */ - if (ret) { - for (i--; i >= 0; i--) - delete_fprobe_node(&hlist_array->array[i]); - } + if (ret) { + fprobe_fail_cleanup(fp); + return ret; } - mutex_unlock(&fprobe_mutex); - if (ret) - fprobe_fail_cleanup(fp); + hlist_array = fp->hlist_array; + ret = add_fprobe_hash(fp); + for (i = 0; i < hlist_array->size && !ret; i++) + ret = insert_fprobe_node(&hlist_array->array[i], fp); + + if (ret) { + unregister_fprobe_nolock(fp); + /* In error case, wait for clean up safely. 
*/ + synchronize_rcu(); + } return ret; } @@ -913,37 +1054,28 @@ bool fprobe_is_registered(struct fprobe *fp) return true; } -/** - * unregister_fprobe() - Unregister fprobe. - * @fp: A fprobe data structure to be unregistered. - * - * Unregister fprobe (and remove ftrace hooks from the function entries). - * - * Return 0 if @fp is unregistered successfully, -errno if not. - */ -int unregister_fprobe(struct fprobe *fp) +static int unregister_fprobe_nolock(struct fprobe *fp) { - struct fprobe_hlist *hlist_array; + struct fprobe_hlist *hlist_array = fp->hlist_array; unsigned long *addrs = NULL; - int ret = 0, i, count; - - mutex_lock(&fprobe_mutex); - if (!fp || !is_fprobe_still_exist(fp)) { - ret = -EINVAL; - goto out; - } + int i, count; - hlist_array = fp->hlist_array; addrs = kcalloc(hlist_array->size, sizeof(unsigned long), GFP_KERNEL); - if (!addrs) { - ret = -ENOMEM; /* TODO: Fallback to one-by-one loop */ - goto out; - } + /* + * This will remove fprobe_hash_node from the hash table even if + * memory allocation fails. However, ftrace_ops will not be updated. + * Anyway, when the last fprobe is unregistered, ftrace_ops is also + * unregistered. + */ + if (!addrs) + pr_warn("Failed to allocate working array. ftrace_ops may not sync.\n"); /* Remove non-synonim ips from table and hash */ count = 0; for (i = 0; i < hlist_array->size; i++) { - if (!delete_fprobe_node(&hlist_array->array[i])) + delete_fprobe_node(&hlist_array->array[i]); + if (addrs && !fprobe_exists_on_hash(hlist_array->array[i].addr, + fprobe_is_ftrace(fp))) addrs[count++] = hlist_array->array[i].addr; } del_fprobe_hash(fp); @@ -955,11 +1087,44 @@ int unregister_fprobe(struct fprobe *fp) kfree_rcu(hlist_array, rcu); fp->hlist_array = NULL; + kfree(addrs); -out: - mutex_unlock(&fprobe_mutex); + return 0; +} - kfree(addrs); +/** + * unregister_fprobe_async() - Unregister fprobe without RCU GP wait + * @fp: A fprobe data structure to be unregistered. 
+ * + * Unregister fprobe (and remove ftrace hooks from the function entries). + * This function will NOT wait until the fprobe is no longer used. + * + * Return 0 if @fp is unregistered successfully, -errno if not. + */ +int unregister_fprobe_async(struct fprobe *fp) +{ + guard(mutex)(&fprobe_mutex); + if (!fp || !fprobe_registered(fp)) + return -EINVAL; + + return unregister_fprobe_nolock(fp); +} + +/** + * unregister_fprobe() - Unregister fprobe with RCU GP wait + * @fp: A fprobe data structure to be unregistered. + * + * Unregister fprobe (and remove ftrace hooks from the function entries). + * This function will block until the fprobe is no longer used. + * + * Return 0 if @fp is unregistered successfully, -errno if not. + */ +int unregister_fprobe(struct fprobe *fp) +{ + int ret = unregister_fprobe_async(fp); + + if (!ret) + synchronize_rcu(); return ret; } EXPORT_SYMBOL_GPL(unregister_fprobe); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 413310912609..b2611de3f594 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6841,7 +6841,8 @@ bool ftrace_filter_param __initdata; static int __init set_ftrace_notrace(char *str) { ftrace_filter_param = true; - strscpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); + trace_append_boot_param(ftrace_notrace_buf, str, ',', + FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_notrace=", set_ftrace_notrace); @@ -6849,7 +6850,8 @@ __setup("ftrace_notrace=", set_ftrace_notrace); static int __init set_ftrace_filter(char *str) { ftrace_filter_param = true; - strscpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); + trace_append_boot_param(ftrace_filter_buf, str, ',', + FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_filter=", set_ftrace_filter); @@ -6861,14 +6863,16 @@ static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer); static int __init set_graph_function(char *str) { - strscpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); + trace_append_boot_param(ftrace_graph_buf, str, ',', + 
FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_graph_filter=", set_graph_function); static int __init set_graph_notrace_function(char *str) { - strscpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE); + trace_append_boot_param(ftrace_graph_notrace_buf, str, ',', + FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_graph_notrace=", set_graph_notrace_function); @@ -9267,6 +9271,15 @@ static int kallsyms_callback(void *data, const char *name, unsigned long addr) * @addrs array, which needs to be big enough to store at least @cnt * addresses. * + * For a single symbol (cnt == 1), uses kallsyms_lookup_name() which + * performs an O(log N) binary search via the sorted kallsyms index. + * This avoids the full O(N) linear scan over all kernel symbols that + * the multi-symbol path requires. + * + * For multiple symbols, uses a single-pass linear scan via + * kallsyms_on_each_symbol() with binary search into the sorted input + * array. + * * Returns: 0 if all provided symbols are found, -ESRCH otherwise. */ int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs) @@ -9274,6 +9287,19 @@ int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *a struct kallsyms_data args; int found_all; + /* Fast path: single symbol uses O(log N) binary search */ + if (cnt == 1) { + addrs[0] = kallsyms_lookup_name(sorted_syms[0]); + if (addrs[0] && ftrace_location(addrs[0])) + return 0; + /* + * Binary lookup can fail for duplicate symbol names + * where the first match is not ftrace-instrumented. + * Retry with linear scan. 
+ */ + } + + /* Batch path: single-pass O(N) linear scan */ memset(addrs, 0, sizeof(*addrs) * cnt); args.addrs = addrs; args.syms = sorted_syms; diff --git a/kernel/trace/remote_test.c b/kernel/trace/remote_test.c new file mode 100644 index 000000000000..a3e2c9b606eb --- /dev/null +++ b/kernel/trace/remote_test.c @@ -0,0 +1,261 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 - Google LLC + * Author: Vincent Donnefort <vdonnefort@google.com> + */ + +#include <linux/module.h> +#include <linux/simple_ring_buffer.h> +#include <linux/trace_remote.h> +#include <linux/tracefs.h> +#include <linux/types.h> + +#define REMOTE_EVENT_INCLUDE_FILE kernel/trace/remote_test_events.h +#include <trace/define_remote_events.h> + +static DEFINE_PER_CPU(struct simple_rb_per_cpu *, simple_rbs); +static struct trace_buffer_desc *remote_test_buffer_desc; + +/* + * The trace_remote lock already serializes accesses from the trace_remote_callbacks. + * However write_event can still race with load/unload. 
+ */ +static DEFINE_MUTEX(simple_rbs_lock); + +static int remote_test_load_simple_rb(int cpu, struct ring_buffer_desc *rb_desc) +{ + struct simple_rb_per_cpu *cpu_buffer; + struct simple_buffer_page *bpages; + int ret = -ENOMEM; + + cpu_buffer = kmalloc_obj(*cpu_buffer); + if (!cpu_buffer) + return ret; + + bpages = kmalloc_objs(*bpages, rb_desc->nr_page_va); + if (!bpages) + goto err_free_cpu_buffer; + + ret = simple_ring_buffer_init(cpu_buffer, bpages, rb_desc); + if (ret) + goto err_free_bpages; + + scoped_guard(mutex, &simple_rbs_lock) { + WARN_ON(*per_cpu_ptr(&simple_rbs, cpu)); + *per_cpu_ptr(&simple_rbs, cpu) = cpu_buffer; + } + + return 0; + +err_free_bpages: + kfree(bpages); + +err_free_cpu_buffer: + kfree(cpu_buffer); + + return ret; +} + +static void remote_test_unload_simple_rb(int cpu) +{ + struct simple_rb_per_cpu *cpu_buffer = *per_cpu_ptr(&simple_rbs, cpu); + struct simple_buffer_page *bpages; + + if (!cpu_buffer) + return; + + guard(mutex)(&simple_rbs_lock); + + bpages = cpu_buffer->bpages; + simple_ring_buffer_unload(cpu_buffer); + kfree(bpages); + kfree(cpu_buffer); + *per_cpu_ptr(&simple_rbs, cpu) = NULL; +} + +static struct trace_buffer_desc *remote_test_load(unsigned long size, void *unused) +{ + struct ring_buffer_desc *rb_desc; + struct trace_buffer_desc *desc; + size_t desc_size; + int cpu, ret; + + if (WARN_ON(remote_test_buffer_desc)) + return ERR_PTR(-EINVAL); + + desc_size = trace_buffer_desc_size(size, num_possible_cpus()); + if (desc_size == SIZE_MAX) { + ret = -E2BIG; + goto err; + } + + desc = kmalloc(desc_size, GFP_KERNEL); + if (!desc) { + ret = -ENOMEM; + goto err; + } + + ret = trace_remote_alloc_buffer(desc, desc_size, size, cpu_possible_mask); + if (ret) + goto err_free_desc; + + for_each_ring_buffer_desc(rb_desc, cpu, desc) { + ret = remote_test_load_simple_rb(rb_desc->cpu, rb_desc); + if (ret) + goto err_unload; + } + + remote_test_buffer_desc = desc; + + return remote_test_buffer_desc; + +err_unload: + 
for_each_ring_buffer_desc(rb_desc, cpu, desc) + remote_test_unload_simple_rb(rb_desc->cpu); + trace_remote_free_buffer(desc); + +err_free_desc: + kfree(desc); + +err: + return ERR_PTR(ret); +} + +static void remote_test_unload(struct trace_buffer_desc *desc, void *unused) +{ + struct ring_buffer_desc *rb_desc; + int cpu; + + if (WARN_ON(desc != remote_test_buffer_desc)) + return; + + for_each_ring_buffer_desc(rb_desc, cpu, desc) + remote_test_unload_simple_rb(rb_desc->cpu); + + remote_test_buffer_desc = NULL; + trace_remote_free_buffer(desc); + kfree(desc); +} + +static int remote_test_enable_tracing(bool enable, void *unused) +{ + struct ring_buffer_desc *rb_desc; + int cpu; + + if (!remote_test_buffer_desc) + return -ENODEV; + + for_each_ring_buffer_desc(rb_desc, cpu, remote_test_buffer_desc) + WARN_ON(simple_ring_buffer_enable_tracing(*per_cpu_ptr(&simple_rbs, rb_desc->cpu), + enable)); + return 0; +} + +static int remote_test_swap_reader_page(unsigned int cpu, void *unused) +{ + struct simple_rb_per_cpu *cpu_buffer; + + if (cpu >= NR_CPUS) + return -EINVAL; + + cpu_buffer = *per_cpu_ptr(&simple_rbs, cpu); + if (!cpu_buffer) + return -EINVAL; + + return simple_ring_buffer_swap_reader_page(cpu_buffer); +} + +static int remote_test_reset(unsigned int cpu, void *unused) +{ + struct simple_rb_per_cpu *cpu_buffer; + + if (cpu >= NR_CPUS) + return -EINVAL; + + cpu_buffer = *per_cpu_ptr(&simple_rbs, cpu); + if (!cpu_buffer) + return -EINVAL; + + return simple_ring_buffer_reset(cpu_buffer); +} + +static int remote_test_enable_event(unsigned short id, bool enable, void *unused) +{ + if (id != REMOTE_TEST_EVENT_ID) + return -EINVAL; + + /* + * Let's just use the struct remote_event enabled field that is turned on and off by + * trace_remote. This is a bit racy but good enough for a simple test module. 
+ */ + return 0; +} + +static ssize_t +write_event_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *pos) +{ + struct remote_event_format_selftest *evt_test; + struct simple_rb_per_cpu *cpu_buffer; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + guard(mutex)(&simple_rbs_lock); + + if (!remote_event_selftest.enabled) + return -ENODEV; + + guard(preempt)(); + + cpu_buffer = *this_cpu_ptr(&simple_rbs); + if (!cpu_buffer) + return -ENODEV; + + evt_test = simple_ring_buffer_reserve(cpu_buffer, + sizeof(struct remote_event_format_selftest), + trace_clock_global()); + if (!evt_test) + return -ENODEV; + + evt_test->hdr.id = REMOTE_TEST_EVENT_ID; + evt_test->id = val; + + simple_ring_buffer_commit(cpu_buffer); + + return cnt; +} + +static const struct file_operations write_event_fops = { + .write = write_event_write, +}; + +static int remote_test_init_tracefs(struct dentry *d, void *unused) +{ + return tracefs_create_file("write_event", 0200, d, NULL, &write_event_fops) ? 
+ 0 : -ENOMEM; +} + +static struct trace_remote_callbacks trace_remote_callbacks = { + .init = remote_test_init_tracefs, + .load_trace_buffer = remote_test_load, + .unload_trace_buffer = remote_test_unload, + .enable_tracing = remote_test_enable_tracing, + .swap_reader_page = remote_test_swap_reader_page, + .reset = remote_test_reset, + .enable_event = remote_test_enable_event, +}; + +static int __init remote_test_init(void) +{ + return trace_remote_register("test", &trace_remote_callbacks, NULL, + &remote_event_selftest, 1); +} + +module_init(remote_test_init); + +MODULE_DESCRIPTION("Test module for the trace remote interface"); +MODULE_AUTHOR("Vincent Donnefort"); +MODULE_LICENSE("GPL"); diff --git a/kernel/trace/remote_test_events.h b/kernel/trace/remote_test_events.h new file mode 100644 index 000000000000..26b93b3406fc --- /dev/null +++ b/kernel/trace/remote_test_events.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#define REMOTE_TEST_EVENT_ID 1 + +REMOTE_EVENT(selftest, REMOTE_TEST_EVENT_ID, + RE_STRUCT( + re_field(u64, id) + ), + RE_PRINTK("id=%llu", __entry->id) +); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 170170bd83bd..7b07d2004cc6 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4,8 +4,10 @@ * * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> */ +#include <linux/ring_buffer_types.h> #include <linux/sched/isolation.h> #include <linux/trace_recursion.h> +#include <linux/panic_notifier.h> #include <linux/trace_events.h> #include <linux/ring_buffer.h> #include <linux/trace_clock.h> @@ -30,6 +32,7 @@ #include <linux/oom.h> #include <linux/mm.h> +#include <asm/ring_buffer.h> #include <asm/local64.h> #include <asm/local.h> #include <asm/setup.h> @@ -157,23 +160,6 @@ int ring_buffer_print_entry_header(struct trace_seq *s) /* Used for individual buffers (after the counter) */ #define RB_BUFFER_OFF (1 << 20) -#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) - 
-#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) -#define RB_ALIGNMENT 4U -#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) -#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ - -#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS -# define RB_FORCE_8BYTE_ALIGNMENT 0 -# define RB_ARCH_ALIGNMENT RB_ALIGNMENT -#else -# define RB_FORCE_8BYTE_ALIGNMENT 1 -# define RB_ARCH_ALIGNMENT 8U -#endif - -#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT) - /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX @@ -316,10 +302,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define for_each_online_buffer_cpu(buffer, cpu) \ for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask) -#define TS_SHIFT 27 -#define TS_MASK ((1ULL << TS_SHIFT) - 1) -#define TS_DELTA_TEST (~TS_MASK) - static u64 rb_event_time_stamp(struct ring_buffer_event *event) { u64 ts; @@ -338,12 +320,6 @@ static u64 rb_event_time_stamp(struct ring_buffer_event *event) #define RB_MISSED_MASK (3 << 30) -struct buffer_data_page { - u64 time_stamp; /* page time stamp */ - local_t commit; /* write committed index */ - unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */ -}; - struct buffer_data_read_page { unsigned order; /* order of the page */ struct buffer_data_page *data; /* actual data, stored in this page */ @@ -437,14 +413,6 @@ static struct buffer_data_page *alloc_cpu_data(int cpu, int order) return dpage; } -/* - * We need to fit the time_stamp delta into 27 bits. 
- */ -static inline bool test_time_stamp(u64 delta) -{ - return !!(delta & TS_DELTA_TEST); -} - struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; @@ -555,10 +523,12 @@ struct ring_buffer_per_cpu { unsigned int mapped; unsigned int user_mapped; /* user space mapping */ struct mutex mapping_lock; - unsigned long *subbuf_ids; /* ID to subbuf VA */ + struct buffer_page **subbuf_ids; /* ID to subbuf VA */ struct trace_buffer_meta *meta_page; struct ring_buffer_cpu_meta *ring_meta; + struct ring_buffer_remote *remote; + /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; struct list_head new_pages; /* new pages to add */ @@ -581,6 +551,8 @@ struct trace_buffer { struct ring_buffer_per_cpu **buffers; + struct ring_buffer_remote *remote; + struct hlist_node node; u64 (*clock)(void); @@ -589,6 +561,7 @@ struct trace_buffer { unsigned long range_addr_start; unsigned long range_addr_end; + struct notifier_block flush_nb; struct ring_buffer_meta *meta; @@ -627,16 +600,17 @@ int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq (unsigned int)sizeof(field.commit), (unsigned int)is_signed_type(long)); - trace_seq_printf(s, "\tfield: int overwrite;\t" + trace_seq_printf(s, "\tfield: char overwrite;\t" "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), commit), 1, - (unsigned int)is_signed_type(long)); + (unsigned int)is_signed_type(char)); trace_seq_printf(s, "\tfield: char data;\t" "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), data), - (unsigned int)buffer->subbuf_size, + (unsigned int)(buffer ? 
buffer->subbuf_size : + PAGE_SIZE - BUF_PAGE_HDR_SIZE), (unsigned int)is_signed_type(char)); return !trace_seq_has_overflowed(s); @@ -1913,7 +1887,7 @@ static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu) static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) { struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; - struct buffer_page *head_page, *orig_head; + struct buffer_page *head_page, *orig_head, *orig_reader; unsigned long entry_bytes = 0; unsigned long entries = 0; int ret; @@ -1924,16 +1898,17 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) return; orig_head = head_page = cpu_buffer->head_page; + orig_reader = cpu_buffer->reader_page; /* Do the reader page first */ - ret = rb_validate_buffer(cpu_buffer->reader_page->page, cpu_buffer->cpu); + ret = rb_validate_buffer(orig_reader->page, cpu_buffer->cpu); if (ret < 0) { pr_info("Ring buffer reader page is invalid\n"); goto invalid; } entries += ret; - entry_bytes += local_read(&cpu_buffer->reader_page->page->commit); - local_set(&cpu_buffer->reader_page->entries, ret); + entry_bytes += local_read(&orig_reader->page->commit); + local_set(&orig_reader->entries, ret); ts = head_page->page->time_stamp; @@ -2036,8 +2011,8 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) /* Iterate until finding the commit page */ for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) { - /* Reader page has already been done */ - if (head_page == cpu_buffer->reader_page) + /* The original reader page has already been checked/counted. 
*/ + if (head_page == orig_reader) continue; ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu); @@ -2238,6 +2213,40 @@ static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer, } } +static struct ring_buffer_desc *ring_buffer_desc(struct trace_buffer_desc *trace_desc, int cpu) +{ + struct ring_buffer_desc *desc, *end; + size_t len; + int i; + + if (!trace_desc) + return NULL; + + if (cpu >= trace_desc->nr_cpus) + return NULL; + + end = (struct ring_buffer_desc *)((void *)trace_desc + trace_desc->struct_len); + desc = __first_ring_buffer_desc(trace_desc); + len = struct_size(desc, page_va, desc->nr_page_va); + desc = (struct ring_buffer_desc *)((void *)desc + (len * cpu)); + + if (desc < end && desc->cpu == cpu) + return desc; + + /* Missing CPUs, need to linear search */ + for_each_ring_buffer_desc(desc, i, trace_desc) { + if (desc->cpu == cpu) + return desc; + } + + return NULL; +} + +static void *ring_buffer_desc_page(struct ring_buffer_desc *desc, unsigned int page_id) +{ + return page_id >= desc->nr_page_va ? 
NULL : (void *)desc->page_va[page_id]; +} + static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, long nr_pages, struct list_head *pages) { @@ -2245,6 +2254,7 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_cpu_meta *meta = NULL; struct buffer_page *bpage, *tmp; bool user_thread = current->mm != NULL; + struct ring_buffer_desc *desc = NULL; long i; /* @@ -2273,6 +2283,12 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, if (buffer->range_addr_start) meta = rb_range_meta(buffer, nr_pages, cpu_buffer->cpu); + if (buffer->remote) { + desc = ring_buffer_desc(buffer->remote->desc, cpu_buffer->cpu); + if (!desc || WARN_ON(desc->nr_page_va != (nr_pages + 1))) + return -EINVAL; + } + for (i = 0; i < nr_pages; i++) { bpage = alloc_cpu_page(cpu_buffer->cpu); @@ -2297,6 +2313,16 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, rb_meta_buffer_update(cpu_buffer, bpage); bpage->range = 1; bpage->id = i + 1; + } else if (desc) { + void *p = ring_buffer_desc_page(desc, i + 1); + + if (WARN_ON(!p)) + goto free_pages; + + bpage->page = p; + bpage->range = 1; /* bpage->page can't be freed */ + bpage->id = i + 1; + cpu_buffer->subbuf_ids[i + 1] = bpage; } else { int order = cpu_buffer->buffer->subbuf_order; bpage->page = alloc_cpu_data(cpu_buffer->cpu, order); @@ -2394,6 +2420,30 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) if (cpu_buffer->ring_meta->head_buffer) rb_meta_buffer_update(cpu_buffer, bpage); bpage->range = 1; + } else if (buffer->remote) { + struct ring_buffer_desc *desc = ring_buffer_desc(buffer->remote->desc, cpu); + + if (!desc) + goto fail_free_reader; + + cpu_buffer->remote = buffer->remote; + cpu_buffer->meta_page = (struct trace_buffer_meta *)(void *)desc->meta_va; + cpu_buffer->nr_pages = nr_pages; + cpu_buffer->subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, + sizeof(*cpu_buffer->subbuf_ids), GFP_KERNEL); + if 
(!cpu_buffer->subbuf_ids) + goto fail_free_reader; + + /* Remote buffers are read-only and immutable */ + atomic_inc(&cpu_buffer->record_disabled); + atomic_inc(&cpu_buffer->resize_disabled); + + bpage->page = ring_buffer_desc_page(desc, cpu_buffer->meta_page->reader.id); + if (!bpage->page) + goto fail_free_reader; + + bpage->range = 1; + cpu_buffer->subbuf_ids[0] = bpage; } else { int order = cpu_buffer->buffer->subbuf_order; bpage->page = alloc_cpu_data(cpu, order); @@ -2453,6 +2503,9 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) irq_work_sync(&cpu_buffer->irq_work.work); + if (cpu_buffer->remote) + kfree(cpu_buffer->subbuf_ids); + free_buffer_page(cpu_buffer->reader_page); if (head) { @@ -2471,11 +2524,22 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) kfree(cpu_buffer); } +/* Stop recording on a persistent buffer and flush cache if needed. */ +static int rb_flush_buffer_cb(struct notifier_block *nb, unsigned long event, void *data) +{ + struct trace_buffer *buffer = container_of(nb, struct trace_buffer, flush_nb); + + ring_buffer_record_off(buffer); + arch_ring_buffer_flush_range(buffer->range_addr_start, buffer->range_addr_end); + return NOTIFY_DONE; +} + static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, int order, unsigned long start, unsigned long end, unsigned long scratch_size, - struct lock_class_key *key) + struct lock_class_key *key, + struct ring_buffer_remote *remote) { struct trace_buffer *buffer __free(kfree) = NULL; long nr_pages; @@ -2515,6 +2579,8 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, if (!buffer->buffers) goto fail_free_cpumask; + cpu = raw_smp_processor_id(); + /* If start/end are specified, then that overrides size */ if (start && end) { unsigned long buffers_start; @@ -2570,6 +2636,15 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, buffer->range_addr_end = end; rb_range_meta_init(buffer, 
nr_pages, scratch_size); + } else if (remote) { + struct ring_buffer_desc *desc = ring_buffer_desc(remote->desc, cpu); + + buffer->remote = remote; + /* The writer is remote. This ring-buffer is read-only */ + atomic_inc(&buffer->record_disabled); + nr_pages = desc->nr_page_va - 1; + if (nr_pages < 2) + goto fail_free_buffers; } else { /* need at least two pages */ @@ -2578,7 +2653,6 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, nr_pages = 2; } - cpu = raw_smp_processor_id(); cpumask_set_cpu(cpu, buffer->cpumask); buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu); if (!buffer->buffers[cpu]) @@ -2590,6 +2664,12 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, mutex_init(&buffer->mutex); + /* Persistent ring buffer needs to flush cache before reboot. */ + if (start && end) { + buffer->flush_nb.notifier_call = rb_flush_buffer_cb; + atomic_notifier_chain_register(&panic_notifier_list, &buffer->flush_nb); + } + return_ptr(buffer); fail_free_buffers: @@ -2620,7 +2700,7 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key) { /* Default buffer page size - one system page */ - return alloc_buffer(size, flags, 0, 0, 0, 0, key); + return alloc_buffer(size, flags, 0, 0, 0, 0, key, NULL); } EXPORT_SYMBOL_GPL(__ring_buffer_alloc); @@ -2647,7 +2727,18 @@ struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flag struct lock_class_key *key) { return alloc_buffer(size, flags, order, start, start + range_size, - scratch_size, key); + scratch_size, key, NULL); +} + +/** + * __ring_buffer_alloc_remote - allocate a new ring_buffer from a remote + * @remote: Contains a description of the ring-buffer pages and remote callbacks. + * @key: ring buffer reader_lock_key. 
+ */ +struct trace_buffer *__ring_buffer_alloc_remote(struct ring_buffer_remote *remote, + struct lock_class_key *key) +{ + return alloc_buffer(0, 0, 0, 0, 0, 0, key, remote); } void *ring_buffer_meta_scratch(struct trace_buffer *buffer, unsigned int *size) @@ -2677,6 +2768,9 @@ ring_buffer_free(struct trace_buffer *buffer) { int cpu; + if (buffer->range_addr_start && buffer->range_addr_end) + atomic_notifier_chain_unregister(&panic_notifier_list, &buffer->flush_nb); + cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); irq_work_sync(&buffer->irq_work.work); @@ -4435,18 +4529,20 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, ret = rb_read_data_buffer(bpage, tail, cpu_buffer->cpu, &ts, &delta); if (ret < 0) { if (delta < ts) { - buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n", - cpu_buffer->cpu, ts, delta); + buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld clock:%pS\n", + cpu_buffer->cpu, ts, delta, + cpu_buffer->buffer->clock); goto out; } } if ((full && ts > info->ts) || (!full && ts + info->delta != info->ts)) { - buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n", + buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\ntrace clock:%pS", cpu_buffer->cpu, ts + info->delta, info->ts, info->delta, info->before, info->after, - full ? " (full)" : "", show_interrupt_level()); + full ? 
" (full)" : "", show_interrupt_level(), + cpu_buffer->buffer->clock); } out: atomic_dec(this_cpu_ptr(&checking)); @@ -5274,14 +5370,66 @@ unsigned long ring_buffer_overruns(struct trace_buffer *buffer) } EXPORT_SYMBOL_GPL(ring_buffer_overruns); +static bool rb_read_remote_meta_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + local_set(&cpu_buffer->entries, READ_ONCE(cpu_buffer->meta_page->entries)); + local_set(&cpu_buffer->overrun, READ_ONCE(cpu_buffer->meta_page->overrun)); + local_set(&cpu_buffer->pages_touched, READ_ONCE(cpu_buffer->meta_page->pages_touched)); + local_set(&cpu_buffer->pages_lost, READ_ONCE(cpu_buffer->meta_page->pages_lost)); + + return rb_num_of_entries(cpu_buffer); +} + +static void rb_update_remote_head(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *next, *orig; + int retry = 3; + + orig = next = cpu_buffer->head_page; + rb_inc_page(&next); + + /* Run after the writer */ + while (cpu_buffer->head_page->page->time_stamp > next->page->time_stamp) { + rb_inc_page(&next); + + rb_list_head_clear(cpu_buffer->head_page->list.prev); + rb_inc_page(&cpu_buffer->head_page); + rb_set_list_to_head(cpu_buffer->head_page->list.prev); + + if (cpu_buffer->head_page == orig) { + if (WARN_ON_ONCE(!(--retry))) + return; + } + } + + orig = cpu_buffer->commit_page = cpu_buffer->head_page; + retry = 3; + + while (cpu_buffer->commit_page->page->time_stamp < next->page->time_stamp) { + rb_inc_page(&next); + rb_inc_page(&cpu_buffer->commit_page); + + if (cpu_buffer->commit_page == orig) { + if (WARN_ON_ONCE(!(--retry))) + return; + } + } +} + static void rb_iter_reset(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + if (cpu_buffer->remote) { + rb_read_remote_meta_page(cpu_buffer); + rb_update_remote_head(cpu_buffer); + } + /* Iterator usage is expected to have record disabled */ iter->head_page = cpu_buffer->reader_page; iter->head = cpu_buffer->reader_page->read; iter->next_event = iter->head; + 
iter->missed_events = 0; iter->cache_reader_page = iter->head_page; iter->cache_read = cpu_buffer->read; @@ -5428,7 +5576,65 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter, } static struct buffer_page * -rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) +__rb_get_reader_page_from_remote(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *new_reader, *prev_reader, *prev_head, *new_head, *last; + + if (!rb_read_remote_meta_page(cpu_buffer)) + return NULL; + + /* More to read on the reader page */ + if (cpu_buffer->reader_page->read < rb_page_size(cpu_buffer->reader_page)) { + if (!cpu_buffer->reader_page->read) + cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; + return cpu_buffer->reader_page; + } + + prev_reader = cpu_buffer->subbuf_ids[cpu_buffer->meta_page->reader.id]; + + WARN_ON_ONCE(cpu_buffer->remote->swap_reader_page(cpu_buffer->cpu, + cpu_buffer->remote->priv)); + /* nr_pages doesn't include the reader page */ + if (WARN_ON_ONCE(cpu_buffer->meta_page->reader.id > cpu_buffer->nr_pages)) + return NULL; + + new_reader = cpu_buffer->subbuf_ids[cpu_buffer->meta_page->reader.id]; + + WARN_ON_ONCE(prev_reader == new_reader); + + prev_head = new_reader; /* New reader was also the previous head */ + new_head = prev_head; + rb_inc_page(&new_head); + last = prev_head; + rb_dec_page(&last); + + /* Clear the old HEAD flag */ + rb_list_head_clear(cpu_buffer->head_page->list.prev); + + prev_reader->list.next = prev_head->list.next; + prev_reader->list.prev = prev_head->list.prev; + + /* Swap prev_reader with new_reader */ + last->list.next = &prev_reader->list; + new_head->list.prev = &prev_reader->list; + + new_reader->list.prev = &new_reader->list; + new_reader->list.next = &new_head->list; + + /* Reactivate the HEAD flag */ + rb_set_list_to_head(&last->list); + + cpu_buffer->head_page = new_head; + cpu_buffer->reader_page = new_reader; + cpu_buffer->pages = &new_head->list; + cpu_buffer->read_stamp = 
new_reader->page->time_stamp; + cpu_buffer->lost_events = cpu_buffer->meta_page->reader.lost_events; + + return rb_page_size(cpu_buffer->reader_page) ? cpu_buffer->reader_page : NULL; +} + +static struct buffer_page * +__rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = NULL; unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size); @@ -5598,6 +5804,13 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) return reader; } +static struct buffer_page * +rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + return cpu_buffer->remote ? __rb_get_reader_page_from_remote(cpu_buffer) : + __rb_get_reader_page(cpu_buffer); +} + static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) { struct ring_buffer_event *event; @@ -5896,10 +6109,7 @@ ring_buffer_peek(struct trace_buffer *buffer, int cpu, u64 *ts, */ bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter) { - bool ret = iter->missed_events != 0; - - iter->missed_events = 0; - return ret; + return iter->missed_events != 0; } EXPORT_SYMBOL_GPL(ring_buffer_iter_dropped); @@ -6061,7 +6271,7 @@ void ring_buffer_iter_advance(struct ring_buffer_iter *iter) unsigned long flags; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - + iter->missed_events = 0; rb_advance_iter(iter); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); @@ -6154,6 +6364,8 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) meta->entries = local_read(&cpu_buffer->entries); meta->overrun = local_read(&cpu_buffer->overrun); meta->read = cpu_buffer->read; + meta->pages_lost = local_read(&cpu_buffer->pages_lost); + meta->pages_touched = local_read(&cpu_buffer->pages_touched); /* Some archs do not have data cache coherency between kernel and user-space */ flush_kernel_vmap_range(cpu_buffer->meta_page, PAGE_SIZE); @@ -6164,6 +6376,23 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *page; + if 
(cpu_buffer->remote) { + if (!cpu_buffer->remote->reset) + return; + + cpu_buffer->remote->reset(cpu_buffer->cpu, cpu_buffer->remote->priv); + rb_read_remote_meta_page(cpu_buffer); + + /* Read related values, not covered by the meta-page */ + local_set(&cpu_buffer->pages_read, 0); + cpu_buffer->read = 0; + cpu_buffer->read_bytes = 0; + cpu_buffer->last_overrun = 0; + cpu_buffer->reader_page->read = 0; + + return; + } + rb_head_page_deactivate(cpu_buffer); cpu_buffer->head_page @@ -6394,6 +6623,46 @@ bool ring_buffer_empty_cpu(struct trace_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); +int ring_buffer_poll_remote(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (cpu != RING_BUFFER_ALL_CPUS) { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -EINVAL; + + cpu_buffer = buffer->buffers[cpu]; + + guard(raw_spinlock)(&cpu_buffer->reader_lock); + if (rb_read_remote_meta_page(cpu_buffer)) + rb_wakeups(buffer, cpu_buffer); + + return 0; + } + + guard(cpus_read_lock)(); + + /* + * Make sure all the ring buffers are up to date before we start reading + * them. 
+ */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + guard(raw_spinlock)(&cpu_buffer->reader_lock); + rb_read_remote_meta_page(cpu_buffer); + } + + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + if (rb_num_of_entries(cpu_buffer)) + rb_wakeups(buffer, cpu_buffer); + } + + return 0; +} + #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP /** * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers @@ -6632,6 +6901,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, unsigned int commit; unsigned int read; u64 save_timestamp; + bool force_memcpy; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -1; @@ -6669,6 +6939,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer, /* Check if any events were dropped */ missed_events = cpu_buffer->lost_events; + force_memcpy = cpu_buffer->mapped || cpu_buffer->remote; + /* * If this page has been partially read or * if len is not big enough to read the rest of the page or @@ -6678,7 +6950,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, */ if (read || (len < (commit - read)) || cpu_buffer->reader_page == cpu_buffer->commit_page || - cpu_buffer->mapped) { + force_memcpy) { struct buffer_data_page *rpage = cpu_buffer->reader_page->page; unsigned int rpos = read; unsigned int pos = 0; @@ -7034,7 +7306,7 @@ static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer) } static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, - unsigned long *subbuf_ids) + struct buffer_page **subbuf_ids) { struct trace_buffer_meta *meta = cpu_buffer->meta_page; unsigned int nr_subbufs = cpu_buffer->nr_pages + 1; @@ -7043,7 +7315,7 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, int id = 0; id = rb_page_id(cpu_buffer, cpu_buffer->reader_page, id); - subbuf_ids[id++] = (unsigned long)cpu_buffer->reader_page->page; + subbuf_ids[id++] = cpu_buffer->reader_page; cnt++; first_subbuf = subbuf = 
rb_set_head_page(cpu_buffer); @@ -7053,7 +7325,7 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, if (WARN_ON(id >= nr_subbufs)) break; - subbuf_ids[id] = (unsigned long)subbuf->page; + subbuf_ids[id] = subbuf; rb_inc_page(&subbuf); id++; @@ -7062,7 +7334,7 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, WARN_ON(cnt != nr_subbufs); - /* install subbuf ID to kern VA translation */ + /* install subbuf ID to bpage translation */ cpu_buffer->subbuf_ids = subbuf_ids; meta->meta_struct_len = sizeof(*meta); @@ -7218,13 +7490,15 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, } while (p < nr_pages) { + struct buffer_page *subbuf; struct page *page; int off = 0; if (WARN_ON_ONCE(s >= nr_subbufs)) return -EINVAL; - page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]); + subbuf = cpu_buffer->subbuf_ids[s]; + page = virt_to_page((void *)subbuf->page); for (; off < (1 << (subbuf_order)); off++, page++) { if (p >= nr_pages) @@ -7251,10 +7525,11 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, struct vm_area_struct *vma) { struct ring_buffer_per_cpu *cpu_buffer; - unsigned long flags, *subbuf_ids; + struct buffer_page **subbuf_ids; + unsigned long flags; int err; - if (!cpumask_test_cpu(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask) || buffer->remote) return -EINVAL; cpu_buffer = buffer->buffers[cpu]; @@ -7275,7 +7550,7 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, if (err) return err; - /* subbuf_ids include the reader while nr_pages does not */ + /* subbuf_ids includes the reader while nr_pages does not */ subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL); if (!subbuf_ids) { rb_free_meta_page(cpu_buffer); @@ -7468,6 +7743,12 @@ out: return 0; } +static void rb_cpu_sync(void *data) +{ + /* Not really needed, but documents what is happening */ + smp_rmb(); +} + /* * We only allocate new buffers, never free them if the CPU 
goes down. * If we were to free the buffer, then the user would lose any trace that was in @@ -7506,7 +7787,18 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node) cpu); return -ENOMEM; } - smp_wmb(); + + /* + * Ensure trace_buffer readers observe the newly allocated + * ring_buffer_per_cpu before they check the cpumask. Instead of using a + * read barrier for all readers, send an IPI. + */ + if (unlikely(system_state == SYSTEM_RUNNING)) { + on_each_cpu(rb_cpu_sync, NULL, 1); + /* Not really needed, but documents what is happening */ + smp_wmb(); + } + cpumask_set_cpu(cpu, buffer->cpumask); return 0; } diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index 5b4be87ba59d..3884b14df375 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -23,6 +23,19 @@ config LTL_MON_EVENTS_ID config RV_LTL_MONITOR bool +config RV_HA_MONITOR + bool + +config HA_MON_EVENTS_IMPLICIT + select DA_MON_EVENTS_IMPLICIT + select RV_HA_MONITOR + bool + +config HA_MON_EVENTS_ID + select DA_MON_EVENTS_ID + select RV_HA_MONITOR + bool + menuconfig RV bool "Runtime Verification" select TRACING @@ -65,6 +78,11 @@ source "kernel/trace/rv/monitors/pagefault/Kconfig" source "kernel/trace/rv/monitors/sleep/Kconfig" # Add new rtapp monitors here +source "kernel/trace/rv/monitors/stall/Kconfig" +source "kernel/trace/rv/monitors/deadline/Kconfig" +source "kernel/trace/rv/monitors/nomiss/Kconfig" +# Add new deadline monitors here + # Add new monitors here config RV_REACTORS diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index 750e4ad6fa0f..94498da35b37 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -17,6 +17,9 @@ obj-$(CONFIG_RV_MON_STS) += monitors/sts/sts.o obj-$(CONFIG_RV_MON_NRP) += monitors/nrp/nrp.o obj-$(CONFIG_RV_MON_SSSW) += monitors/sssw/sssw.o obj-$(CONFIG_RV_MON_OPID) += monitors/opid/opid.o +obj-$(CONFIG_RV_MON_STALL) += monitors/stall/stall.o +obj-$(CONFIG_RV_MON_DEADLINE) += 
monitors/deadline/deadline.o +obj-$(CONFIG_RV_MON_NOMISS) += monitors/nomiss/nomiss.o # Add new monitors here obj-$(CONFIG_RV_REACTORS) += rv_reactors.o obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o diff --git a/kernel/trace/rv/monitors/deadline/Kconfig b/kernel/trace/rv/monitors/deadline/Kconfig new file mode 100644 index 000000000000..38804a6ad91d --- /dev/null +++ b/kernel/trace/rv/monitors/deadline/Kconfig @@ -0,0 +1,10 @@ +config RV_MON_DEADLINE + depends on RV + bool "deadline monitor" + help + Collection of monitors to check the deadline scheduler and server + behave according to specifications. Enable this to enable all + scheduler specification supported by the current kernel. + + For further information, see: + Documentation/trace/rv/monitor_deadline.rst diff --git a/kernel/trace/rv/monitors/deadline/deadline.c b/kernel/trace/rv/monitors/deadline/deadline.c new file mode 100644 index 000000000000..d566d4542ebf --- /dev/null +++ b/kernel/trace/rv/monitors/deadline/deadline.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <linux/kallsyms.h> + +#define MODULE_NAME "deadline" + +#include "deadline.h" + +struct rv_monitor rv_deadline = { + .name = "deadline", + .description = "container for several deadline scheduler specifications.", + .enable = NULL, + .disable = NULL, + .reset = NULL, + .enabled = 0, +}; + +/* Used by other monitors */ +struct sched_class *rv_ext_sched_class; + +static int __init register_deadline(void) +{ + if (IS_ENABLED(CONFIG_SCHED_CLASS_EXT)) { + rv_ext_sched_class = (void *)kallsyms_lookup_name("ext_sched_class"); + if (!rv_ext_sched_class) + pr_warn("rv: Missing ext_sched_class, monitors may not work.\n"); + } + return rv_register_monitor(&rv_deadline, NULL); +} + +static void __exit unregister_deadline(void) +{ + rv_unregister_monitor(&rv_deadline); +} + +module_init(register_deadline); 
+module_exit(unregister_deadline); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("deadline: container for several deadline scheduler specifications."); diff --git a/kernel/trace/rv/monitors/deadline/deadline.h b/kernel/trace/rv/monitors/deadline/deadline.h new file mode 100644 index 000000000000..0bbfd2543329 --- /dev/null +++ b/kernel/trace/rv/monitors/deadline/deadline.h @@ -0,0 +1,202 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/kernel.h> +#include <linux/uaccess.h> +#include <linux/sched/deadline.h> +#include <asm/syscall.h> +#include <uapi/linux/sched/types.h> +#include <trace/events/sched.h> + +/* + * Dummy values if not available + */ +#ifndef __NR_sched_setscheduler +#define __NR_sched_setscheduler -__COUNTER__ +#endif +#ifndef __NR_sched_setattr +#define __NR_sched_setattr -__COUNTER__ +#endif + +extern struct rv_monitor rv_deadline; +/* Initialised when registering the deadline container */ +extern struct sched_class *rv_ext_sched_class; + +/* + * If both have dummy values, the syscalls are not supported and we don't even + * need to register the handler. + */ +static inline bool should_skip_syscall_handle(void) +{ + return __NR_sched_setattr < 0 && __NR_sched_setscheduler < 0; +} + +/* + * is_supported_type - return true if @type is supported by the deadline monitors + */ +static inline bool is_supported_type(u8 type) +{ + return type == DL_TASK || type == DL_SERVER_FAIR || type == DL_SERVER_EXT; +} + +/* + * is_server_type - return true if @type is a supported server + */ +static inline bool is_server_type(u8 type) +{ + return is_supported_type(type) && type != DL_TASK; +} + +/* + * Use negative numbers for the server. + * Currently only one fair server per CPU, may change in the future. 
+ */ +#define fair_server_id(cpu) (-cpu) +#define ext_server_id(cpu) (-cpu - num_possible_cpus()) +#define NO_SERVER_ID (-2 * num_possible_cpus()) +/* + * Get a unique id used for dl entities + * + * The cpu is not required for tasks as the pid is used there, if this function + * is called on a dl_se that for sure corresponds to a task, DL_TASK can be + * used in place of cpu. + * We need the cpu for servers as it is provided in the tracepoint and we + * cannot easily retrieve it from the dl_se (requires the struct rq definition). + */ +static inline int get_entity_id(struct sched_dl_entity *dl_se, int cpu, u8 type) +{ + if (dl_server(dl_se) && type != DL_TASK) { + if (type == DL_SERVER_FAIR) + return fair_server_id(cpu); + if (type == DL_SERVER_EXT) + return ext_server_id(cpu); + return NO_SERVER_ID; + } + return dl_task_of(dl_se)->pid; +} + +static inline bool task_is_scx_enabled(struct task_struct *tsk) +{ + return IS_ENABLED(CONFIG_SCHED_CLASS_EXT) && + tsk->sched_class == rv_ext_sched_class; +} + +/* Expand id and target as arguments for da functions */ +#define EXPAND_ID(dl_se, cpu, type) get_entity_id(dl_se, cpu, type), dl_se +#define EXPAND_ID_TASK(tsk) get_entity_id(&tsk->dl, task_cpu(tsk), DL_TASK), &tsk->dl + +static inline u8 get_server_type(struct task_struct *tsk) +{ + if (tsk->policy == SCHED_NORMAL || tsk->policy == SCHED_EXT || + tsk->policy == SCHED_BATCH || tsk->policy == SCHED_IDLE) + return task_is_scx_enabled(tsk) ? 
DL_SERVER_EXT : DL_SERVER_FAIR; + return DL_OTHER; +} + +static inline int extract_params(struct pt_regs *regs, long id, pid_t *pid_out) +{ + size_t size = offsetofend(struct sched_attr, sched_flags); + struct sched_attr __user *uattr, attr; + int new_policy = -1, ret; + unsigned long args[6]; + + switch (id) { + case __NR_sched_setscheduler: + syscall_get_arguments(current, regs, args); + *pid_out = args[0]; + new_policy = args[1]; + break; + case __NR_sched_setattr: + syscall_get_arguments(current, regs, args); + *pid_out = args[0]; + uattr = (struct sched_attr __user *)args[1]; + /* + * Just copy up to sched_flags, we are not interested after that + */ + ret = copy_struct_from_user(&attr, size, uattr, size); + if (ret) + return ret; + if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) + return -EINVAL; + new_policy = attr.sched_policy; + break; + default: + return -EINVAL; + } + + return new_policy & ~SCHED_RESET_ON_FORK; +} + +/* Helper functions requiring DA/HA utilities */ +#ifdef RV_MON_TYPE + +/* + * get_server - get the server of type @type associated to a task + * + * If the task is a boosted task, the server is available in the task_struct, + * otherwise grab the dl entity saved for the CPU where the task is enqueued. + * This function assumes the task is enqueued somewhere. + */ +static inline struct sched_dl_entity *get_server(struct task_struct *tsk, u8 type) +{ + if (tsk->dl_server && get_server_type(tsk) == type) + return tsk->dl_server; + if (type == DL_SERVER_FAIR) + return da_get_target_by_id(fair_server_id(task_cpu(tsk))); + if (type == DL_SERVER_EXT) + return da_get_target_by_id(ext_server_id(task_cpu(tsk))); + return NULL; +} + +/* + * Initialise monitors for all tasks and pre-allocate the storage for servers. + * This is necessary since we don't have access to the servers here and + * allocation can cause deadlocks from their tracepoints. We can only fill + * pre-initialised storage from there.
+ */ +static inline int init_storage(bool skip_tasks) +{ + struct task_struct *g, *p; + int cpu; + + for_each_possible_cpu(cpu) { + if (!da_create_empty_storage(fair_server_id(cpu))) + goto fail; + if (IS_ENABLED(CONFIG_SCHED_CLASS_EXT) && + !da_create_empty_storage(ext_server_id(cpu))) + goto fail; + } + + if (skip_tasks) + return 0; + + read_lock(&tasklist_lock); + for_each_process_thread(g, p) { + if (p->policy == SCHED_DEADLINE) { + if (!da_create_storage(EXPAND_ID_TASK(p), NULL)) { + read_unlock(&tasklist_lock); + goto fail; + } + } + } + read_unlock(&tasklist_lock); + return 0; + +fail: + da_monitor_destroy(); + return -ENOMEM; +} + +static void __maybe_unused handle_newtask(void *data, struct task_struct *task, u64 flags) +{ + /* Might be superfluous as tasks are not started with this policy.. */ + if (task->policy == SCHED_DEADLINE) + da_create_storage(EXPAND_ID_TASK(task), NULL); +} + +static void __maybe_unused handle_exit(void *data, struct task_struct *p, bool group_dead) +{ + if (p->policy == SCHED_DEADLINE) + da_destroy_storage(get_entity_id(&p->dl, DL_TASK, DL_TASK)); +} + +#endif diff --git a/kernel/trace/rv/monitors/nomiss/Kconfig b/kernel/trace/rv/monitors/nomiss/Kconfig new file mode 100644 index 000000000000..e1886c3a0dd9 --- /dev/null +++ b/kernel/trace/rv/monitors/nomiss/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_NOMISS + depends on RV + depends on HAVE_SYSCALL_TRACEPOINTS + depends on RV_MON_DEADLINE + default y + select HA_MON_EVENTS_ID + bool "nomiss monitor" + help + Monitor to ensure dl entities run to completion before their deadline. + This monitor is part of the deadline monitors collection.
+ + For further information, see: + Documentation/trace/rv/monitor_deadline.rst diff --git a/kernel/trace/rv/monitors/nomiss/nomiss.c b/kernel/trace/rv/monitors/nomiss/nomiss.c new file mode 100644 index 000000000000..31f90f3638d8 --- /dev/null +++ b/kernel/trace/rv/monitors/nomiss/nomiss.c @@ -0,0 +1,293 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> + +#define MODULE_NAME "nomiss" + +#include <uapi/linux/sched/types.h> +#include <trace/events/syscalls.h> +#include <trace/events/sched.h> +#include <trace/events/task.h> +#include <rv_trace.h> + +#define RV_MON_TYPE RV_MON_PER_OBJ +#define HA_TIMER_TYPE HA_TIMER_WHEEL +/* The start condition is on sched_switch, it's dangerous to allocate there */ +#define DA_SKIP_AUTO_ALLOC +typedef struct sched_dl_entity *monitor_target; +#include "nomiss.h" +#include <rv/ha_monitor.h> +#include <monitors/deadline/deadline.h> + +/* + * User configurable deadline threshold. If the total utilisation of deadline + * tasks is larger than 1, they are only guaranteed bounded tardiness. See + * Documentation/scheduler/sched-deadline.rst for more details. + * The minimum tardiness without sched_feat(HRTICK_DL) is 1 tick to accommodate + * for throttle enforced on the next tick. 
+ */ +static u64 deadline_thresh = TICK_NSEC; +module_param(deadline_thresh, ullong, 0644); +#define DEADLINE_NS(ha_mon) (ha_get_target(ha_mon)->dl_deadline + deadline_thresh) + +static u64 ha_get_env(struct ha_monitor *ha_mon, enum envs_nomiss env, u64 time_ns) +{ + if (env == clk_nomiss) + return ha_get_clk_ns(ha_mon, env, time_ns); + else if (env == is_constr_dl_nomiss) + return !dl_is_implicit(ha_get_target(ha_mon)); + else if (env == is_defer_nomiss) + return ha_get_target(ha_mon)->dl_defer; + return ENV_INVALID_VALUE; +} + +static void ha_reset_env(struct ha_monitor *ha_mon, enum envs_nomiss env, u64 time_ns) +{ + if (env == clk_nomiss) + ha_reset_clk_ns(ha_mon, env, time_ns); +} + +static inline bool ha_verify_invariants(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + if (curr_state == ready_nomiss) + return ha_check_invariant_ns(ha_mon, clk_nomiss, time_ns); + else if (curr_state == running_nomiss) + return ha_check_invariant_ns(ha_mon, clk_nomiss, time_ns); + return true; +} + +static inline void ha_convert_inv_guard(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + if (curr_state == next_state) + return; + if (curr_state == ready_nomiss) + ha_inv_to_guard(ha_mon, clk_nomiss, DEADLINE_NS(ha_mon), time_ns); + else if (curr_state == running_nomiss) + ha_inv_to_guard(ha_mon, clk_nomiss, DEADLINE_NS(ha_mon), time_ns); +} + +static inline bool ha_verify_guards(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + bool res = true; + + if (curr_state == ready_nomiss && event == dl_replenish_nomiss) + ha_reset_env(ha_mon, clk_nomiss, time_ns); + else if (curr_state == ready_nomiss && event == dl_throttle_nomiss) + res = ha_get_env(ha_mon, is_defer_nomiss, time_ns) == 1ull; + else if (curr_state == idle_nomiss && event == dl_replenish_nomiss) + ha_reset_env(ha_mon, 
clk_nomiss, time_ns); + else if (curr_state == running_nomiss && event == dl_replenish_nomiss) + ha_reset_env(ha_mon, clk_nomiss, time_ns); + else if (curr_state == sleeping_nomiss && event == dl_replenish_nomiss) + ha_reset_env(ha_mon, clk_nomiss, time_ns); + else if (curr_state == sleeping_nomiss && event == dl_throttle_nomiss) + res = ha_get_env(ha_mon, is_constr_dl_nomiss, time_ns) == 1ull || + ha_get_env(ha_mon, is_defer_nomiss, time_ns) == 1ull; + else if (curr_state == throttled_nomiss && event == dl_replenish_nomiss) + ha_reset_env(ha_mon, clk_nomiss, time_ns); + return res; +} + +static inline void ha_setup_invariants(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + if (next_state == curr_state && event != dl_replenish_nomiss) + return; + if (next_state == ready_nomiss) + ha_start_timer_ns(ha_mon, clk_nomiss, DEADLINE_NS(ha_mon), time_ns); + else if (next_state == running_nomiss) + ha_start_timer_ns(ha_mon, clk_nomiss, DEADLINE_NS(ha_mon), time_ns); + else if (curr_state == ready_nomiss) + ha_cancel_timer(ha_mon); + else if (curr_state == running_nomiss) + ha_cancel_timer(ha_mon); +} + +static bool ha_verify_constraint(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + if (!ha_verify_invariants(ha_mon, curr_state, event, next_state, time_ns)) + return false; + + ha_convert_inv_guard(ha_mon, curr_state, event, next_state, time_ns); + + if (!ha_verify_guards(ha_mon, curr_state, event, next_state, time_ns)) + return false; + + ha_setup_invariants(ha_mon, curr_state, event, next_state, time_ns); + + return true; +} + +static void handle_dl_replenish(void *data, struct sched_dl_entity *dl_se, + int cpu, u8 type) +{ + if (is_supported_type(type)) + da_handle_event(EXPAND_ID(dl_se, cpu, type), dl_replenish_nomiss); +} + +static void handle_dl_throttle(void *data, struct sched_dl_entity *dl_se, + int cpu, u8 type) +{ + if 
(is_supported_type(type)) + da_handle_event(EXPAND_ID(dl_se, cpu, type), dl_throttle_nomiss); +} + +static void handle_dl_server_stop(void *data, struct sched_dl_entity *dl_se, + int cpu, u8 type) +{ + /* + * This isn't the standard use of da_handle_start_run_event since this + * event can also occur from states other than the initial one. + * It is fine to use here because it always brings to a known state and + * the fact we "pretend" the transition starts from the initial state + * has no side effect. + */ + if (is_supported_type(type)) + da_handle_start_run_event(EXPAND_ID(dl_se, cpu, type), dl_server_stop_nomiss); +} + +static inline void handle_server_switch(struct task_struct *next, int cpu, u8 type) +{ + struct sched_dl_entity *dl_se = get_server(next, type); + + if (dl_se && is_idle_task(next)) + da_handle_event(EXPAND_ID(dl_se, cpu, type), dl_server_idle_nomiss); +} + +static void handle_sched_switch(void *data, bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) +{ + int cpu = task_cpu(next); + + if (prev_state != TASK_RUNNING && !preempt && prev->policy == SCHED_DEADLINE) + da_handle_event(EXPAND_ID_TASK(prev), sched_switch_suspend_nomiss); + if (next->policy == SCHED_DEADLINE) + da_handle_start_run_event(EXPAND_ID_TASK(next), sched_switch_in_nomiss); + + /* + * The server is available in next only if the next task is boosted, + * otherwise we need to retrieve it. + * Here the server continues in the state running/armed until actually + * stopped, this works since we continue expecting a throttle.
+ */ + if (next->dl_server) + da_handle_start_event(EXPAND_ID(next->dl_server, cpu, + get_server_type(next)), + sched_switch_in_nomiss); + else { + handle_server_switch(next, cpu, DL_SERVER_FAIR); + if (IS_ENABLED(CONFIG_SCHED_CLASS_EXT)) + handle_server_switch(next, cpu, DL_SERVER_EXT); + } +} + +static void handle_sys_enter(void *data, struct pt_regs *regs, long id) +{ + struct task_struct *p; + int new_policy = -1; + pid_t pid = 0; + + new_policy = extract_params(regs, id, &pid); + if (new_policy < 0) + return; + guard(rcu)(); + p = pid ? find_task_by_vpid(pid) : current; + if (unlikely(!p) || new_policy == p->policy) + return; + + if (p->policy == SCHED_DEADLINE) + da_reset(EXPAND_ID_TASK(p)); + else if (new_policy == SCHED_DEADLINE) + da_create_or_get(EXPAND_ID_TASK(p)); +} + +static void handle_sched_wakeup(void *data, struct task_struct *tsk) +{ + if (tsk->policy == SCHED_DEADLINE) + da_handle_event(EXPAND_ID_TASK(tsk), sched_wakeup_nomiss); +} + +static int enable_nomiss(void) +{ + int retval; + + retval = da_monitor_init(); + if (retval) + return retval; + + retval = init_storage(false); + if (retval) + return retval; + rv_attach_trace_probe("nomiss", sched_dl_replenish_tp, handle_dl_replenish); + rv_attach_trace_probe("nomiss", sched_dl_throttle_tp, handle_dl_throttle); + rv_attach_trace_probe("nomiss", sched_dl_server_stop_tp, handle_dl_server_stop); + rv_attach_trace_probe("nomiss", sched_switch, handle_sched_switch); + rv_attach_trace_probe("nomiss", sched_wakeup, handle_sched_wakeup); + if (!should_skip_syscall_handle()) + rv_attach_trace_probe("nomiss", sys_enter, handle_sys_enter); + rv_attach_trace_probe("nomiss", task_newtask, handle_newtask); + rv_attach_trace_probe("nomiss", sched_process_exit, handle_exit); + + return 0; +} + +static void disable_nomiss(void) +{ + rv_this.enabled = 0; + + /* Those are RCU writers, detach earlier hoping to close a bit faster */ + rv_detach_trace_probe("nomiss", task_newtask, handle_newtask); + 
rv_detach_trace_probe("nomiss", sched_process_exit, handle_exit); + if (!should_skip_syscall_handle()) + rv_detach_trace_probe("nomiss", sys_enter, handle_sys_enter); + + rv_detach_trace_probe("nomiss", sched_dl_replenish_tp, handle_dl_replenish); + rv_detach_trace_probe("nomiss", sched_dl_throttle_tp, handle_dl_throttle); + rv_detach_trace_probe("nomiss", sched_dl_server_stop_tp, handle_dl_server_stop); + rv_detach_trace_probe("nomiss", sched_switch, handle_sched_switch); + rv_detach_trace_probe("nomiss", sched_wakeup, handle_sched_wakeup); + + da_monitor_destroy(); +} + +static struct rv_monitor rv_this = { + .name = "nomiss", + .description = "dl entities run to completion before their deadline.", + .enable = enable_nomiss, + .disable = disable_nomiss, + .reset = da_monitor_reset_all, + .enabled = 0, +}; + +static int __init register_nomiss(void) +{ + return rv_register_monitor(&rv_this, &rv_deadline); +} + +static void __exit unregister_nomiss(void) +{ + rv_unregister_monitor(&rv_this); +} + +module_init(register_nomiss); +module_exit(unregister_nomiss); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("nomiss: dl entities run to completion before their deadline."); diff --git a/kernel/trace/rv/monitors/nomiss/nomiss.h b/kernel/trace/rv/monitors/nomiss/nomiss.h new file mode 100644 index 000000000000..3d1b436194d7 --- /dev/null +++ b/kernel/trace/rv/monitors/nomiss/nomiss.h @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of nomiss automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +#define MONITOR_NAME nomiss + +enum states_nomiss { + ready_nomiss, + idle_nomiss, + running_nomiss, + sleeping_nomiss, + throttled_nomiss, + state_max_nomiss, +}; + +#define INVALID_STATE state_max_nomiss + +enum events_nomiss { + dl_replenish_nomiss, + dl_server_idle_nomiss, + 
dl_server_stop_nomiss, + dl_throttle_nomiss, + sched_switch_in_nomiss, + sched_switch_suspend_nomiss, + sched_wakeup_nomiss, + event_max_nomiss, +}; + +enum envs_nomiss { + clk_nomiss, + is_constr_dl_nomiss, + is_defer_nomiss, + env_max_nomiss, + env_max_stored_nomiss = is_constr_dl_nomiss, +}; + +_Static_assert(env_max_stored_nomiss <= MAX_HA_ENV_LEN, "Not enough slots"); +#define HA_CLK_NS + +struct automaton_nomiss { + char *state_names[state_max_nomiss]; + char *event_names[event_max_nomiss]; + char *env_names[env_max_nomiss]; + unsigned char function[state_max_nomiss][event_max_nomiss]; + unsigned char initial_state; + bool final_states[state_max_nomiss]; +}; + +static const struct automaton_nomiss automaton_nomiss = { + .state_names = { + "ready", + "idle", + "running", + "sleeping", + "throttled", + }, + .event_names = { + "dl_replenish", + "dl_server_idle", + "dl_server_stop", + "dl_throttle", + "sched_switch_in", + "sched_switch_suspend", + "sched_wakeup", + }, + .env_names = { + "clk", + "is_constr_dl", + "is_defer", + }, + .function = { + { + ready_nomiss, + idle_nomiss, + sleeping_nomiss, + throttled_nomiss, + running_nomiss, + INVALID_STATE, + ready_nomiss, + }, + { + ready_nomiss, + idle_nomiss, + sleeping_nomiss, + throttled_nomiss, + running_nomiss, + INVALID_STATE, + INVALID_STATE, + }, + { + running_nomiss, + idle_nomiss, + sleeping_nomiss, + throttled_nomiss, + running_nomiss, + sleeping_nomiss, + running_nomiss, + }, + { + ready_nomiss, + sleeping_nomiss, + sleeping_nomiss, + throttled_nomiss, + running_nomiss, + INVALID_STATE, + ready_nomiss, + }, + { + ready_nomiss, + throttled_nomiss, + INVALID_STATE, + throttled_nomiss, + INVALID_STATE, + throttled_nomiss, + throttled_nomiss, + }, + }, + .initial_state = ready_nomiss, + .final_states = { 1, 0, 0, 0, 0 }, +}; diff --git a/kernel/trace/rv/monitors/nomiss/nomiss_trace.h b/kernel/trace/rv/monitors/nomiss/nomiss_trace.h new file mode 100644 index 000000000000..42e7efaca4e7 --- /dev/null +++ 
b/kernel/trace/rv/monitors/nomiss/nomiss_trace.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_NOMISS +DEFINE_EVENT(event_da_monitor_id, event_nomiss, + TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), + TP_ARGS(id, state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor_id, error_nomiss, + TP_PROTO(int id, char *state, char *event), + TP_ARGS(id, state, event)); + +DEFINE_EVENT(error_env_da_monitor_id, error_env_nomiss, + TP_PROTO(int id, char *state, char *event, char *env), + TP_ARGS(id, state, event, env)); +#endif /* CONFIG_RV_MON_NOMISS */ diff --git a/kernel/trace/rv/monitors/opid/Kconfig b/kernel/trace/rv/monitors/opid/Kconfig index 561d32da572b..6d02e239b684 100644 --- a/kernel/trace/rv/monitors/opid/Kconfig +++ b/kernel/trace/rv/monitors/opid/Kconfig @@ -2,18 +2,13 @@ # config RV_MON_OPID depends on RV - depends on TRACE_IRQFLAGS - depends on TRACE_PREEMPT_TOGGLE depends on RV_MON_SCHED - default y if PREEMPT_RT - select DA_MON_EVENTS_IMPLICIT + default y + select HA_MON_EVENTS_IMPLICIT bool "opid monitor" help Monitor to ensure operations like wakeup and need resched occur with - interrupts and preemption disabled or during IRQs, where preemption - may not be disabled explicitly. - - This monitor is unstable on !PREEMPT_RT, say N unless you are testing it. + interrupts and preemption disabled. 
For further information, see: Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/opid/opid.c b/kernel/trace/rv/monitors/opid/opid.c index 25a40e90fa40..4594c7c46601 100644 --- a/kernel/trace/rv/monitors/opid/opid.c +++ b/kernel/trace/rv/monitors/opid/opid.c @@ -10,94 +10,63 @@ #define MODULE_NAME "opid" #include <trace/events/sched.h> -#include <trace/events/irq.h> -#include <trace/events/preemptirq.h> #include <rv_trace.h> #include <monitors/sched/sched.h> #define RV_MON_TYPE RV_MON_PER_CPU #include "opid.h" -#include <rv/da_monitor.h> +#include <rv/ha_monitor.h> -#ifdef CONFIG_X86_LOCAL_APIC -#include <asm/trace/irq_vectors.h> - -static void handle_vector_irq_entry(void *data, int vector) +static u64 ha_get_env(struct ha_monitor *ha_mon, enum envs_opid env, u64 time_ns) { - da_handle_event(irq_entry_opid); -} - -static void attach_vector_irq(void) -{ - rv_attach_trace_probe("opid", local_timer_entry, handle_vector_irq_entry); - if (IS_ENABLED(CONFIG_IRQ_WORK)) - rv_attach_trace_probe("opid", irq_work_entry, handle_vector_irq_entry); - if (IS_ENABLED(CONFIG_SMP)) { - rv_attach_trace_probe("opid", reschedule_entry, handle_vector_irq_entry); - rv_attach_trace_probe("opid", call_function_entry, handle_vector_irq_entry); - rv_attach_trace_probe("opid", call_function_single_entry, handle_vector_irq_entry); + if (env == irq_off_opid) + return irqs_disabled(); + else if (env == preempt_off_opid) { + /* + * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables + * preemption (adding one to the preempt_count). Since we are + * interested in the preempt_count at the time the tracepoint was + * hit, we consider 1 as still enabled. 
+ */ + if (IS_ENABLED(CONFIG_PREEMPTION)) + return (preempt_count() & PREEMPT_MASK) > 1; + return true; } + return ENV_INVALID_VALUE; } -static void detach_vector_irq(void) +static inline bool ha_verify_guards(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) { - rv_detach_trace_probe("opid", local_timer_entry, handle_vector_irq_entry); - if (IS_ENABLED(CONFIG_IRQ_WORK)) - rv_detach_trace_probe("opid", irq_work_entry, handle_vector_irq_entry); - if (IS_ENABLED(CONFIG_SMP)) { - rv_detach_trace_probe("opid", reschedule_entry, handle_vector_irq_entry); - rv_detach_trace_probe("opid", call_function_entry, handle_vector_irq_entry); - rv_detach_trace_probe("opid", call_function_single_entry, handle_vector_irq_entry); - } + bool res = true; + + if (curr_state == any_opid && event == sched_need_resched_opid) + res = ha_get_env(ha_mon, irq_off_opid, time_ns) == 1ull; + else if (curr_state == any_opid && event == sched_waking_opid) + res = ha_get_env(ha_mon, irq_off_opid, time_ns) == 1ull && + ha_get_env(ha_mon, preempt_off_opid, time_ns) == 1ull; + return res; } -#else -/* We assume irq_entry tracepoints are sufficient on other architectures */ -static void attach_vector_irq(void) { } -static void detach_vector_irq(void) { } -#endif - -static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip) +static bool ha_verify_constraint(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) { - da_handle_event(irq_disable_opid); -} + if (!ha_verify_guards(ha_mon, curr_state, event, next_state, time_ns)) + return false; -static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip) -{ - da_handle_event(irq_enable_opid); -} - -static void handle_irq_entry(void *data, int irq, struct irqaction *action) -{ - da_handle_event(irq_entry_opid); -} - -static void handle_preempt_disable(void *data, unsigned long ip, unsigned long 
parent_ip) -{ - da_handle_event(preempt_disable_opid); -} - -static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip) -{ - da_handle_event(preempt_enable_opid); + return true; } static void handle_sched_need_resched(void *data, struct task_struct *tsk, int cpu, int tif) { - /* The monitor's intitial state is not in_irq */ - if (this_cpu_read(hardirq_context)) - da_handle_event(sched_need_resched_opid); - else - da_handle_start_event(sched_need_resched_opid); + da_handle_start_run_event(sched_need_resched_opid); } static void handle_sched_waking(void *data, struct task_struct *p) { - /* The monitor's intitial state is not in_irq */ - if (this_cpu_read(hardirq_context)) - da_handle_event(sched_waking_opid); - else - da_handle_start_event(sched_waking_opid); + da_handle_start_run_event(sched_waking_opid); } static int enable_opid(void) @@ -108,14 +77,8 @@ static int enable_opid(void) if (retval) return retval; - rv_attach_trace_probe("opid", irq_disable, handle_irq_disable); - rv_attach_trace_probe("opid", irq_enable, handle_irq_enable); - rv_attach_trace_probe("opid", irq_handler_entry, handle_irq_entry); - rv_attach_trace_probe("opid", preempt_disable, handle_preempt_disable); - rv_attach_trace_probe("opid", preempt_enable, handle_preempt_enable); rv_attach_trace_probe("opid", sched_set_need_resched_tp, handle_sched_need_resched); rv_attach_trace_probe("opid", sched_waking, handle_sched_waking); - attach_vector_irq(); return 0; } @@ -124,14 +87,8 @@ static void disable_opid(void) { rv_this.enabled = 0; - rv_detach_trace_probe("opid", irq_disable, handle_irq_disable); - rv_detach_trace_probe("opid", irq_enable, handle_irq_enable); - rv_detach_trace_probe("opid", irq_handler_entry, handle_irq_entry); - rv_detach_trace_probe("opid", preempt_disable, handle_preempt_disable); - rv_detach_trace_probe("opid", preempt_enable, handle_preempt_enable); rv_detach_trace_probe("opid", sched_set_need_resched_tp, handle_sched_need_resched); 
rv_detach_trace_probe("opid", sched_waking, handle_sched_waking); - detach_vector_irq(); da_monitor_destroy(); } diff --git a/kernel/trace/rv/monitors/opid/opid.h b/kernel/trace/rv/monitors/opid/opid.h index 092992514970..fb0aa4c28aa6 100644 --- a/kernel/trace/rv/monitors/opid/opid.h +++ b/kernel/trace/rv/monitors/opid/opid.h @@ -8,30 +8,31 @@ #define MONITOR_NAME opid enum states_opid { - disabled_opid, - enabled_opid, - in_irq_opid, - irq_disabled_opid, - preempt_disabled_opid, + any_opid, state_max_opid, }; #define INVALID_STATE state_max_opid enum events_opid { - irq_disable_opid, - irq_enable_opid, - irq_entry_opid, - preempt_disable_opid, - preempt_enable_opid, sched_need_resched_opid, sched_waking_opid, event_max_opid, }; +enum envs_opid { + irq_off_opid, + preempt_off_opid, + env_max_opid, + env_max_stored_opid = irq_off_opid, +}; + +_Static_assert(env_max_stored_opid <= MAX_HA_ENV_LEN, "Not enough slots"); + struct automaton_opid { char *state_names[state_max_opid]; char *event_names[event_max_opid]; + char *env_names[env_max_opid]; unsigned char function[state_max_opid][event_max_opid]; unsigned char initial_state; bool final_states[state_max_opid]; @@ -39,68 +40,19 @@ struct automaton_opid { static const struct automaton_opid automaton_opid = { .state_names = { - "disabled", - "enabled", - "in_irq", - "irq_disabled", - "preempt_disabled", + "any", }, .event_names = { - "irq_disable", - "irq_enable", - "irq_entry", - "preempt_disable", - "preempt_enable", "sched_need_resched", "sched_waking", }, + .env_names = { + "irq_off", + "preempt_off", + }, .function = { - { - INVALID_STATE, - preempt_disabled_opid, - disabled_opid, - INVALID_STATE, - irq_disabled_opid, - disabled_opid, - disabled_opid, - }, - { - irq_disabled_opid, - INVALID_STATE, - INVALID_STATE, - preempt_disabled_opid, - enabled_opid, - INVALID_STATE, - INVALID_STATE, - }, - { - INVALID_STATE, - enabled_opid, - in_irq_opid, - INVALID_STATE, - INVALID_STATE, - in_irq_opid, - in_irq_opid, - }, - 
{ - INVALID_STATE, - enabled_opid, - in_irq_opid, - disabled_opid, - INVALID_STATE, - irq_disabled_opid, - INVALID_STATE, - }, - { - disabled_opid, - INVALID_STATE, - INVALID_STATE, - INVALID_STATE, - enabled_opid, - INVALID_STATE, - INVALID_STATE, - }, + { any_opid, any_opid }, }, - .initial_state = disabled_opid, - .final_states = { 0, 1, 0, 0, 0 }, + .initial_state = any_opid, + .final_states = { 1 }, }; diff --git a/kernel/trace/rv/monitors/opid/opid_trace.h b/kernel/trace/rv/monitors/opid/opid_trace.h index 3df6ff955c30..b04005b64208 100644 --- a/kernel/trace/rv/monitors/opid/opid_trace.h +++ b/kernel/trace/rv/monitors/opid/opid_trace.h @@ -12,4 +12,8 @@ DEFINE_EVENT(event_da_monitor, event_opid, DEFINE_EVENT(error_da_monitor, error_opid, TP_PROTO(char *state, char *event), TP_ARGS(state, event)); + +DEFINE_EVENT(error_env_da_monitor, error_env_opid, + TP_PROTO(char *state, char *event, char *env), + TP_ARGS(state, event, env)); #endif /* CONFIG_RV_MON_OPID */ diff --git a/kernel/trace/rv/monitors/sleep/sleep.c b/kernel/trace/rv/monitors/sleep/sleep.c index c1347da69e9d..8dfe5ec13e19 100644 --- a/kernel/trace/rv/monitors/sleep/sleep.c +++ b/kernel/trace/rv/monitors/sleep/sleep.c @@ -49,6 +49,7 @@ static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bo ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false); ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false); ltl_atom_set(mon, LTL_FUTEX_WAIT, false); + ltl_atom_set(mon, LTL_EPOLL_WAIT, false); ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false); ltl_atom_set(mon, LTL_BLOCK_ON_RT_MUTEX, false); } @@ -63,6 +64,7 @@ static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bo ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false); ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false); ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false); + ltl_atom_set(mon, LTL_EPOLL_WAIT, false); if (strstarts(task->comm, "migration/")) ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, true); @@ -162,6 +164,11 @@ static 
void handle_sys_enter(void *data, struct pt_regs *regs, long id) break; } break; +#ifdef __NR_epoll_wait + case __NR_epoll_wait: + ltl_atom_update(current, LTL_EPOLL_WAIT, true); + break; +#endif } } @@ -174,6 +181,7 @@ static void handle_sys_exit(void *data, struct pt_regs *regs, long ret) ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false); ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false); ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false); + ltl_atom_set(mon, LTL_EPOLL_WAIT, false); ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, false); } diff --git a/kernel/trace/rv/monitors/sleep/sleep.h b/kernel/trace/rv/monitors/sleep/sleep.h index 2ab46fd218d2..95dc2727c059 100644 --- a/kernel/trace/rv/monitors/sleep/sleep.h +++ b/kernel/trace/rv/monitors/sleep/sleep.h @@ -15,6 +15,7 @@ enum ltl_atom { LTL_ABORT_SLEEP, LTL_BLOCK_ON_RT_MUTEX, LTL_CLOCK_NANOSLEEP, + LTL_EPOLL_WAIT, LTL_FUTEX_LOCK_PI, LTL_FUTEX_WAIT, LTL_KERNEL_THREAD, @@ -40,6 +41,7 @@ static const char *ltl_atom_str(enum ltl_atom atom) "ab_sl", "bl_on_rt_mu", "cl_na", + "ep_wa", "fu_lo_pi", "fu_wa", "ker_th", @@ -75,39 +77,41 @@ static_assert(RV_NUM_BA_STATES <= RV_MAX_BA_STATES); static void ltl_start(struct task_struct *task, struct ltl_monitor *mon) { - bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms); - bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms); - bool val40 = task_is_rcu || task_is_migration; - bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms); - bool val41 = futex_lock_pi || val40; - bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms); - bool val5 = block_on_rt_mutex || val41; - bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms); - bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms); - bool val32 = abort_sleep || kthread_should_stop; bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms); - bool val33 = woken_by_nmi || val32; bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms); - bool 
val34 = woken_by_hardirq || val33; bool woken_by_equal_or_higher_prio = test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, mon->atoms); - bool val14 = woken_by_equal_or_higher_prio || val34; bool wake = test_bit(LTL_WAKE, mon->atoms); - bool val13 = !wake; - bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms); + bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms); + bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms); + bool sleep = test_bit(LTL_SLEEP, mon->atoms); + bool rt = test_bit(LTL_RT, mon->atoms); + bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms); bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon->atoms); bool nanosleep_clock_monotonic = test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms); - bool val24 = nanosleep_clock_monotonic || nanosleep_clock_tai; - bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms); - bool val25 = nanosleep_timer_abstime && val24; - bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms); - bool val18 = clock_nanosleep && val25; + bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms); + bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms); bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms); - bool val9 = futex_wait || val18; + bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms); + bool epoll_wait = test_bit(LTL_EPOLL_WAIT, mon->atoms); + bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms); + bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms); + bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms); + bool val42 = task_is_rcu || task_is_migration; + bool val43 = futex_lock_pi || val42; + bool val5 = block_on_rt_mutex || val43; + bool val34 = abort_sleep || kthread_should_stop; + bool val35 = woken_by_nmi || val34; + bool val36 = woken_by_hardirq || val35; + bool val14 = woken_by_equal_or_higher_prio || val36; + bool val13 = !wake; + bool val26 = 
nanosleep_clock_monotonic || nanosleep_clock_tai; + bool val27 = nanosleep_timer_abstime && val26; + bool val18 = clock_nanosleep && val27; + bool val20 = val18 || epoll_wait; + bool val9 = futex_wait || val20; bool val11 = val9 || kernel_thread; - bool sleep = test_bit(LTL_SLEEP, mon->atoms); bool val2 = !sleep; - bool rt = test_bit(LTL_RT, mon->atoms); bool val1 = !rt; bool val3 = val1 || val2; @@ -124,39 +128,41 @@ static void ltl_start(struct task_struct *task, struct ltl_monitor *mon) static void ltl_possible_next_states(struct ltl_monitor *mon, unsigned int state, unsigned long *next) { - bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms); - bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms); - bool val40 = task_is_rcu || task_is_migration; - bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms); - bool val41 = futex_lock_pi || val40; - bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms); - bool val5 = block_on_rt_mutex || val41; - bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms); - bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms); - bool val32 = abort_sleep || kthread_should_stop; bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms); - bool val33 = woken_by_nmi || val32; bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms); - bool val34 = woken_by_hardirq || val33; bool woken_by_equal_or_higher_prio = test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, mon->atoms); - bool val14 = woken_by_equal_or_higher_prio || val34; bool wake = test_bit(LTL_WAKE, mon->atoms); - bool val13 = !wake; - bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms); + bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms); + bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms); + bool sleep = test_bit(LTL_SLEEP, mon->atoms); + bool rt = test_bit(LTL_RT, mon->atoms); + bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms); bool 
nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon->atoms); bool nanosleep_clock_monotonic = test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms); - bool val24 = nanosleep_clock_monotonic || nanosleep_clock_tai; - bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms); - bool val25 = nanosleep_timer_abstime && val24; - bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms); - bool val18 = clock_nanosleep && val25; + bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms); + bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms); bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms); - bool val9 = futex_wait || val18; + bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms); + bool epoll_wait = test_bit(LTL_EPOLL_WAIT, mon->atoms); + bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms); + bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms); + bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms); + bool val42 = task_is_rcu || task_is_migration; + bool val43 = futex_lock_pi || val42; + bool val5 = block_on_rt_mutex || val43; + bool val34 = abort_sleep || kthread_should_stop; + bool val35 = woken_by_nmi || val34; + bool val36 = woken_by_hardirq || val35; + bool val14 = woken_by_equal_or_higher_prio || val36; + bool val13 = !wake; + bool val26 = nanosleep_clock_monotonic || nanosleep_clock_tai; + bool val27 = nanosleep_timer_abstime && val26; + bool val18 = clock_nanosleep && val27; + bool val20 = val18 || epoll_wait; + bool val9 = futex_wait || val20; bool val11 = val9 || kernel_thread; - bool sleep = test_bit(LTL_SLEEP, mon->atoms); bool val2 = !sleep; - bool rt = test_bit(LTL_RT, mon->atoms); bool val1 = !rt; bool val3 = val1 || val2; diff --git a/kernel/trace/rv/monitors/stall/Kconfig b/kernel/trace/rv/monitors/stall/Kconfig new file mode 100644 index 000000000000..6f846b642544 --- /dev/null +++ b/kernel/trace/rv/monitors/stall/Kconfig @@ -0,0 +1,13 @@ +# 
SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_STALL + depends on RV + select HA_MON_EVENTS_ID + bool "stall monitor" + help + Enable the stall sample monitor that illustrates the usage of hybrid + automata monitors. It can be used to identify tasks stalled for + longer than a threshold. + + For further information, see: + Documentation/trace/rv/monitor_stall.rst diff --git a/kernel/trace/rv/monitors/stall/stall.c b/kernel/trace/rv/monitors/stall/stall.c new file mode 100644 index 000000000000..9ccfda6b0e73 --- /dev/null +++ b/kernel/trace/rv/monitors/stall/stall.c @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> + +#define MODULE_NAME "stall" + +#include <trace/events/sched.h> +#include <rv_trace.h> + +#define RV_MON_TYPE RV_MON_PER_TASK +#define HA_TIMER_TYPE HA_TIMER_WHEEL +#include "stall.h" +#include <rv/ha_monitor.h> + +static u64 threshold_jiffies = 1000; +module_param(threshold_jiffies, ullong, 0644); + +static u64 ha_get_env(struct ha_monitor *ha_mon, enum envs_stall env, u64 time_ns) +{ + if (env == clk_stall) + return ha_get_clk_jiffy(ha_mon, env); + return ENV_INVALID_VALUE; +} + +static void ha_reset_env(struct ha_monitor *ha_mon, enum envs_stall env, u64 time_ns) +{ + if (env == clk_stall) + ha_reset_clk_jiffy(ha_mon, env); +} + +static inline bool ha_verify_invariants(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + if (curr_state == enqueued_stall) + return ha_check_invariant_jiffy(ha_mon, clk_stall, time_ns); + return true; +} + +static inline bool ha_verify_guards(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + bool res = true; + + if (curr_state == dequeued_stall && event == sched_wakeup_stall) + 
ha_reset_env(ha_mon, clk_stall, time_ns); + else if (curr_state == running_stall && event == sched_switch_preempt_stall) + ha_reset_env(ha_mon, clk_stall, time_ns); + return res; +} + +static inline void ha_setup_invariants(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + if (next_state == curr_state) + return; + if (next_state == enqueued_stall) + ha_start_timer_jiffy(ha_mon, clk_stall, threshold_jiffies, time_ns); + else if (curr_state == enqueued_stall) + ha_cancel_timer(ha_mon); +} + +static bool ha_verify_constraint(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + if (!ha_verify_invariants(ha_mon, curr_state, event, next_state, time_ns)) + return false; + + if (!ha_verify_guards(ha_mon, curr_state, event, next_state, time_ns)) + return false; + + ha_setup_invariants(ha_mon, curr_state, event, next_state, time_ns); + + return true; +} + +static void handle_sched_switch(void *data, bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) +{ + if (!preempt && prev_state != TASK_RUNNING) + da_handle_start_event(prev, sched_switch_wait_stall); + else + da_handle_event(prev, sched_switch_preempt_stall); + da_handle_event(next, sched_switch_in_stall); +} + +static void handle_sched_wakeup(void *data, struct task_struct *p) +{ + da_handle_event(p, sched_wakeup_stall); +} + +static int enable_stall(void) +{ + int retval; + + retval = da_monitor_init(); + if (retval) + return retval; + + rv_attach_trace_probe("stall", sched_switch, handle_sched_switch); + rv_attach_trace_probe("stall", sched_wakeup, handle_sched_wakeup); + + return 0; +} + +static void disable_stall(void) +{ + rv_this.enabled = 0; + + rv_detach_trace_probe("stall", sched_switch, handle_sched_switch); + rv_detach_trace_probe("stall", sched_wakeup, handle_sched_wakeup); + + da_monitor_destroy(); +} + +static struct rv_monitor rv_this 
= { + .name = "stall", + .description = "identify tasks stalled for longer than a threshold.", + .enable = enable_stall, + .disable = disable_stall, + .reset = da_monitor_reset_all, + .enabled = 0, +}; + +static int __init register_stall(void) +{ + return rv_register_monitor(&rv_this, NULL); +} + +static void __exit unregister_stall(void) +{ + rv_unregister_monitor(&rv_this); +} + +module_init(register_stall); +module_exit(unregister_stall); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("stall: identify tasks stalled for longer than a threshold."); diff --git a/kernel/trace/rv/monitors/stall/stall.h b/kernel/trace/rv/monitors/stall/stall.h new file mode 100644 index 000000000000..638520cb1082 --- /dev/null +++ b/kernel/trace/rv/monitors/stall/stall.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of stall automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +#define MONITOR_NAME stall + +enum states_stall { + dequeued_stall, + enqueued_stall, + running_stall, + state_max_stall, +}; + +#define INVALID_STATE state_max_stall + +enum events_stall { + sched_switch_in_stall, + sched_switch_preempt_stall, + sched_switch_wait_stall, + sched_wakeup_stall, + event_max_stall, +}; + +enum envs_stall { + clk_stall, + env_max_stall, + env_max_stored_stall = env_max_stall, +}; + +_Static_assert(env_max_stored_stall <= MAX_HA_ENV_LEN, "Not enough slots"); + +struct automaton_stall { + char *state_names[state_max_stall]; + char *event_names[event_max_stall]; + char *env_names[env_max_stall]; + unsigned char function[state_max_stall][event_max_stall]; + unsigned char initial_state; + bool final_states[state_max_stall]; +}; + +static const struct automaton_stall automaton_stall = { + .state_names = { + "dequeued", + "enqueued", + "running", + }, + .event_names = { + 
"sched_switch_in", + "sched_switch_preempt", + "sched_switch_wait", + "sched_wakeup", + }, + .env_names = { + "clk", + }, + .function = { + { + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + enqueued_stall, + }, + { + running_stall, + INVALID_STATE, + INVALID_STATE, + enqueued_stall, + }, + { + running_stall, + enqueued_stall, + dequeued_stall, + running_stall, + }, + }, + .initial_state = dequeued_stall, + .final_states = { 1, 0, 0 }, +}; diff --git a/kernel/trace/rv/monitors/stall/stall_trace.h b/kernel/trace/rv/monitors/stall/stall_trace.h new file mode 100644 index 000000000000..6a7cc1b1d040 --- /dev/null +++ b/kernel/trace/rv/monitors/stall/stall_trace.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_STALL +DEFINE_EVENT(event_da_monitor_id, event_stall, + TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), + TP_ARGS(id, state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor_id, error_stall, + TP_PROTO(int id, char *state, char *event), + TP_ARGS(id, state, event)); + +DEFINE_EVENT(error_env_da_monitor_id, error_env_stall, + TP_PROTO(int id, char *state, char *event, char *env), + TP_ARGS(id, state, event, env)); +#endif /* CONFIG_RV_MON_STALL */ diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h index 4a6faddac614..9622c269789c 100644 --- a/kernel/trace/rv/rv_trace.h +++ b/kernel/trace/rv/rv_trace.h @@ -62,9 +62,39 @@ DECLARE_EVENT_CLASS(error_da_monitor, #include <monitors/scpd/scpd_trace.h> #include <monitors/snep/snep_trace.h> #include <monitors/sts/sts_trace.h> -#include <monitors/opid/opid_trace.h> // Add new monitors based on CONFIG_DA_MON_EVENTS_IMPLICIT here +#ifdef CONFIG_HA_MON_EVENTS_IMPLICIT +/* For simplicity this class is marked as DA although relevant only for HA */ +DECLARE_EVENT_CLASS(error_env_da_monitor, + + TP_PROTO(char *state, char *event, char *env), + + TP_ARGS(state, event, env), + + 
TP_STRUCT__entry( + __string( state, state ) + __string( event, event ) + __string( env, env ) + ), + + TP_fast_assign( + __assign_str(state); + __assign_str(event); + __assign_str(env); + ), + + TP_printk("event %s not expected in the state %s with env %s", + __get_str(event), + __get_str(state), + __get_str(env)) +); + +#include <monitors/opid/opid_trace.h> +// Add new monitors based on CONFIG_HA_MON_EVENTS_IMPLICIT here + +#endif + #endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */ #ifdef CONFIG_DA_MON_EVENTS_ID @@ -128,6 +158,41 @@ DECLARE_EVENT_CLASS(error_da_monitor_id, #include <monitors/sssw/sssw_trace.h> // Add new monitors based on CONFIG_DA_MON_EVENTS_ID here +#ifdef CONFIG_HA_MON_EVENTS_ID +/* For simplicity this class is marked as DA although relevant only for HA */ +DECLARE_EVENT_CLASS(error_env_da_monitor_id, + + TP_PROTO(int id, char *state, char *event, char *env), + + TP_ARGS(id, state, event, env), + + TP_STRUCT__entry( + __field( int, id ) + __string( state, state ) + __string( event, event ) + __string( env, env ) + ), + + TP_fast_assign( + __assign_str(state); + __assign_str(event); + __assign_str(env); + __entry->id = id; + ), + + TP_printk("%d: event %s not expected in the state %s with env %s", + __entry->id, + __get_str(event), + __get_str(state), + __get_str(env)) +); + +#include <monitors/stall/stall_trace.h> +#include <monitors/nomiss/nomiss_trace.h> +// Add new monitors based on CONFIG_HA_MON_EVENTS_ID here + +#endif + #endif /* CONFIG_DA_MON_EVENTS_ID */ #ifdef CONFIG_LTL_MON_EVENTS_ID DECLARE_EVENT_CLASS(event_ltl_monitor_id, diff --git a/kernel/trace/simple_ring_buffer.c b/kernel/trace/simple_ring_buffer.c new file mode 100644 index 000000000000..f4642f5adda3 --- /dev/null +++ b/kernel/trace/simple_ring_buffer.c @@ -0,0 +1,517 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 - Google LLC + * Author: Vincent Donnefort <vdonnefort@google.com> + */ + +#include <linux/atomic.h> +#include <linux/simple_ring_buffer.h> + 
+#include <asm/barrier.h> +#include <asm/local.h> + +enum simple_rb_link_type { + SIMPLE_RB_LINK_NORMAL = 0, + SIMPLE_RB_LINK_HEAD = 1, + SIMPLE_RB_LINK_HEAD_MOVING +}; + +#define SIMPLE_RB_LINK_MASK ~(SIMPLE_RB_LINK_HEAD | SIMPLE_RB_LINK_HEAD_MOVING) + +static void simple_bpage_set_head_link(struct simple_buffer_page *bpage) +{ + unsigned long link = (unsigned long)bpage->link.next; + + link &= SIMPLE_RB_LINK_MASK; + link |= SIMPLE_RB_LINK_HEAD; + + /* + * Paired with simple_rb_find_head() to order access between the head + * link and overrun. It ensures we always report an up-to-date value + * after swapping the reader page. + */ + smp_store_release(&bpage->link.next, (struct list_head *)link); +} + +static bool simple_bpage_unset_head_link(struct simple_buffer_page *bpage, + struct simple_buffer_page *dst, + enum simple_rb_link_type new_type) +{ + unsigned long *link = (unsigned long *)(&bpage->link.next); + unsigned long old = (*link & SIMPLE_RB_LINK_MASK) | SIMPLE_RB_LINK_HEAD; + unsigned long new = (unsigned long)(&dst->link) | new_type; + + return try_cmpxchg(link, &old, new); +} + +static void simple_bpage_set_normal_link(struct simple_buffer_page *bpage) +{ + unsigned long link = (unsigned long)bpage->link.next; + + WRITE_ONCE(bpage->link.next, (struct list_head *)(link & SIMPLE_RB_LINK_MASK)); +} + +static struct simple_buffer_page *simple_bpage_from_link(struct list_head *link) +{ + unsigned long ptr = (unsigned long)link & SIMPLE_RB_LINK_MASK; + + return container_of((struct list_head *)ptr, struct simple_buffer_page, link); +} + +static struct simple_buffer_page *simple_bpage_next_page(struct simple_buffer_page *bpage) +{ + return simple_bpage_from_link(bpage->link.next); +} + +static void simple_bpage_reset(struct simple_buffer_page *bpage) +{ + bpage->write = 0; + bpage->entries = 0; + + local_set(&bpage->page->commit, 0); +} + +static void simple_bpage_init(struct simple_buffer_page *bpage, void *page) +{ + INIT_LIST_HEAD(&bpage->link); + 
bpage->page = (struct buffer_data_page *)page; + + simple_bpage_reset(bpage); +} + +#define simple_rb_meta_inc(__meta, __inc) \ + WRITE_ONCE((__meta), (__meta + __inc)) + +static bool simple_rb_loaded(struct simple_rb_per_cpu *cpu_buffer) +{ + return !!cpu_buffer->bpages; +} + +static int simple_rb_find_head(struct simple_rb_per_cpu *cpu_buffer) +{ + int retry = cpu_buffer->nr_pages * 2; + struct simple_buffer_page *head; + + head = cpu_buffer->head_page; + + while (retry--) { + unsigned long link; + +spin: + /* See smp_store_release in simple_bpage_set_head_link() */ + link = (unsigned long)smp_load_acquire(&head->link.prev->next); + + switch (link & ~SIMPLE_RB_LINK_MASK) { + /* Found the head */ + case SIMPLE_RB_LINK_HEAD: + cpu_buffer->head_page = head; + return 0; + /* The writer caught the head, we can spin, that won't be long */ + case SIMPLE_RB_LINK_HEAD_MOVING: + goto spin; + } + + head = simple_bpage_next_page(head); + } + + return -EBUSY; +} + +/** + * simple_ring_buffer_swap_reader_page - Swap ring-buffer head with the reader + * @cpu_buffer: A simple_rb_per_cpu + * + * This function enables consuming reading. It ensures the current head page will not be overwritten + * and can be safely read. + * + * Returns 0 on success, -ENODEV if @cpu_buffer was unloaded or -EBUSY if we failed to catch the + * head page. 
+ */ +int simple_ring_buffer_swap_reader_page(struct simple_rb_per_cpu *cpu_buffer) +{ + struct simple_buffer_page *last, *head, *reader; + unsigned long overrun; + int retry = 8; + int ret; + + if (!simple_rb_loaded(cpu_buffer)) + return -ENODEV; + + reader = cpu_buffer->reader_page; + + do { + /* Run after the writer to find the head */ + ret = simple_rb_find_head(cpu_buffer); + if (ret) + return ret; + + head = cpu_buffer->head_page; + + /* Connect the reader page around the header page */ + reader->link.next = head->link.next; + reader->link.prev = head->link.prev; + + /* The last page before the head */ + last = simple_bpage_from_link(head->link.prev); + + /* The reader page points to the new header page */ + simple_bpage_set_head_link(reader); + + overrun = cpu_buffer->meta->overrun; + } while (!simple_bpage_unset_head_link(last, reader, SIMPLE_RB_LINK_NORMAL) && retry--); + + if (!retry) + return -EINVAL; + + cpu_buffer->head_page = simple_bpage_from_link(reader->link.next); + cpu_buffer->head_page->link.prev = &reader->link; + cpu_buffer->reader_page = head; + cpu_buffer->meta->reader.lost_events = overrun - cpu_buffer->last_overrun; + cpu_buffer->meta->reader.id = cpu_buffer->reader_page->id; + cpu_buffer->last_overrun = overrun; + + return 0; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_swap_reader_page); + +static struct simple_buffer_page *simple_rb_move_tail(struct simple_rb_per_cpu *cpu_buffer) +{ + struct simple_buffer_page *tail, *new_tail; + + tail = cpu_buffer->tail_page; + new_tail = simple_bpage_next_page(tail); + + if (simple_bpage_unset_head_link(tail, new_tail, SIMPLE_RB_LINK_HEAD_MOVING)) { + /* + * Oh no! we've caught the head. There is none anymore and + * swap_reader will spin until we set the new one. Overrun must + * be written first, to make sure we report the correct number + * of lost events. 
+ */ + simple_rb_meta_inc(cpu_buffer->meta->overrun, new_tail->entries); + simple_rb_meta_inc(cpu_buffer->meta->pages_lost, 1); + + simple_bpage_set_head_link(new_tail); + simple_bpage_set_normal_link(tail); + } + + simple_bpage_reset(new_tail); + cpu_buffer->tail_page = new_tail; + + simple_rb_meta_inc(cpu_buffer->meta->pages_touched, 1); + + return new_tail; +} + +static unsigned long rb_event_size(unsigned long length) +{ + struct ring_buffer_event *event; + + return length + RB_EVNT_HDR_SIZE + sizeof(event->array[0]); +} + +static struct ring_buffer_event * +rb_event_add_ts_extend(struct ring_buffer_event *event, u64 delta) +{ + event->type_len = RINGBUF_TYPE_TIME_EXTEND; + event->time_delta = delta & TS_MASK; + event->array[0] = delta >> TS_SHIFT; + + return (struct ring_buffer_event *)((unsigned long)event + 8); +} + +static struct ring_buffer_event * +simple_rb_reserve_next(struct simple_rb_per_cpu *cpu_buffer, unsigned long length, u64 timestamp) +{ + unsigned long ts_ext_size = 0, event_size = rb_event_size(length); + struct simple_buffer_page *tail = cpu_buffer->tail_page; + struct ring_buffer_event *event; + u32 write, prev_write; + u64 time_delta; + + time_delta = timestamp - cpu_buffer->write_stamp; + + if (test_time_stamp(time_delta)) + ts_ext_size = 8; + + prev_write = tail->write; + write = prev_write + event_size + ts_ext_size; + + if (unlikely(write > (PAGE_SIZE - BUF_PAGE_HDR_SIZE))) + tail = simple_rb_move_tail(cpu_buffer); + + if (!tail->entries) { + tail->page->time_stamp = timestamp; + time_delta = 0; + ts_ext_size = 0; + write = event_size; + prev_write = 0; + } + + tail->write = write; + tail->entries++; + + cpu_buffer->write_stamp = timestamp; + + event = (struct ring_buffer_event *)(tail->page->data + prev_write); + if (ts_ext_size) { + event = rb_event_add_ts_extend(event, time_delta); + time_delta = 0; + } + + event->type_len = 0; + event->time_delta = time_delta; + event->array[0] = event_size - RB_EVNT_HDR_SIZE; + + return event; +} + 
+/** + * simple_ring_buffer_reserve - Reserve an entry in @cpu_buffer + * @cpu_buffer: A simple_rb_per_cpu + * @length: Size of the entry in bytes + * @timestamp: Timestamp of the entry + * + * Returns the address of the entry where to write data or NULL + */ +void *simple_ring_buffer_reserve(struct simple_rb_per_cpu *cpu_buffer, unsigned long length, + u64 timestamp) +{ + struct ring_buffer_event *rb_event; + + if (cmpxchg(&cpu_buffer->status, SIMPLE_RB_READY, SIMPLE_RB_WRITING) != SIMPLE_RB_READY) + return NULL; + + rb_event = simple_rb_reserve_next(cpu_buffer, length, timestamp); + + return &rb_event->array[1]; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_reserve); + +/** + * simple_ring_buffer_commit - Commit the entry reserved with simple_ring_buffer_reserve() + * @cpu_buffer: The simple_rb_per_cpu where the entry has been reserved + */ +void simple_ring_buffer_commit(struct simple_rb_per_cpu *cpu_buffer) +{ + local_set(&cpu_buffer->tail_page->page->commit, + cpu_buffer->tail_page->write); + simple_rb_meta_inc(cpu_buffer->meta->entries, 1); + + /* + * Paired with simple_rb_enable_tracing() to ensure data is + * written to the ring-buffer before teardown. + */ + smp_store_release(&cpu_buffer->status, SIMPLE_RB_READY); +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_commit); + +static u32 simple_rb_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable) +{ + u32 prev_status; + + if (enable) + return cmpxchg(&cpu_buffer->status, SIMPLE_RB_UNAVAILABLE, SIMPLE_RB_READY); + + /* Wait for the buffer to be released */ + do { + prev_status = cmpxchg_acquire(&cpu_buffer->status, + SIMPLE_RB_READY, + SIMPLE_RB_UNAVAILABLE); + } while (prev_status == SIMPLE_RB_WRITING); + + return prev_status; +} + +/** + * simple_ring_buffer_reset - Reset @cpu_buffer + * @cpu_buffer: A simple_rb_per_cpu + * + * This will not clear the content of the data, only reset counters and pointers + * + * Returns 0 on success or -ENODEV if @cpu_buffer was unloaded. 
+ */ +int simple_ring_buffer_reset(struct simple_rb_per_cpu *cpu_buffer) +{ + struct simple_buffer_page *bpage; + u32 prev_status; + int ret; + + if (!simple_rb_loaded(cpu_buffer)) + return -ENODEV; + + prev_status = simple_rb_enable_tracing(cpu_buffer, false); + + ret = simple_rb_find_head(cpu_buffer); + if (ret) + return ret; + + bpage = cpu_buffer->tail_page = cpu_buffer->head_page; + do { + simple_bpage_reset(bpage); + bpage = simple_bpage_next_page(bpage); + } while (bpage != cpu_buffer->head_page); + + simple_bpage_reset(cpu_buffer->reader_page); + + cpu_buffer->last_overrun = 0; + cpu_buffer->write_stamp = 0; + + cpu_buffer->meta->reader.read = 0; + cpu_buffer->meta->reader.lost_events = 0; + cpu_buffer->meta->entries = 0; + cpu_buffer->meta->overrun = 0; + cpu_buffer->meta->read = 0; + cpu_buffer->meta->pages_lost = 0; + cpu_buffer->meta->pages_touched = 0; + + if (prev_status == SIMPLE_RB_READY) + simple_rb_enable_tracing(cpu_buffer, true); + + return 0; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_reset); + +int simple_ring_buffer_init_mm(struct simple_rb_per_cpu *cpu_buffer, + struct simple_buffer_page *bpages, + const struct ring_buffer_desc *desc, + void *(*load_page)(unsigned long va), + void (*unload_page)(void *va)) +{ + struct simple_buffer_page *bpage = bpages; + int ret = 0; + void *page; + int i; + + /* At least 1 reader page and two pages in the ring-buffer */ + if (desc->nr_page_va < 3) + return -EINVAL; + + memset(cpu_buffer, 0, sizeof(*cpu_buffer)); + + cpu_buffer->meta = load_page(desc->meta_va); + if (!cpu_buffer->meta) + return -EINVAL; + + memset(cpu_buffer->meta, 0, sizeof(*cpu_buffer->meta)); + cpu_buffer->meta->meta_page_size = PAGE_SIZE; + + /* The reader page is not part of the ring initially */ + page = load_page(desc->page_va[0]); + if (!page) { + unload_page(cpu_buffer->meta); + return -EINVAL; + } + + simple_bpage_init(bpage, page); + bpage->id = 0; + + cpu_buffer->nr_pages = 1; + + cpu_buffer->reader_page = bpage; + 
cpu_buffer->tail_page = bpage + 1; + cpu_buffer->head_page = bpage + 1; + + for (i = 1; i < desc->nr_page_va; i++) { + page = load_page(desc->page_va[i]); + if (!page) { + ret = -EINVAL; + break; + } + + simple_bpage_init(++bpage, page); + + bpage->link.next = &(bpage + 1)->link; + bpage->link.prev = &(bpage - 1)->link; + bpage->id = i; + + cpu_buffer->nr_pages = i + 1; + } + + if (ret) { + for (i--; i >= 0; i--) + unload_page(bpages[i].page); + unload_page(cpu_buffer->meta); + + return ret; + } + + cpu_buffer->meta->nr_subbufs = cpu_buffer->nr_pages; + /* Close the ring */ + bpage->link.next = &cpu_buffer->tail_page->link; + cpu_buffer->tail_page->link.prev = &bpage->link; + + /* The last init'ed page points to the head page */ + simple_bpage_set_head_link(bpage); + + cpu_buffer->bpages = bpages; + + return 0; +} + +static void *__load_page(unsigned long page) +{ + return (void *)page; +} + +static void __unload_page(void *page) { } + +/** + * simple_ring_buffer_init - Init @cpu_buffer based on @desc + * @cpu_buffer: A simple_rb_per_cpu buffer to init, allocated by the caller. 
+ * @bpages: Array of simple_buffer_pages, with as many elements as @desc->nr_page_va + * @desc: A ring_buffer_desc + * + * Returns 0 on success or -EINVAL if the content of @desc is invalid + */ +int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_buffer_page *bpages, + const struct ring_buffer_desc *desc) +{ + return simple_ring_buffer_init_mm(cpu_buffer, bpages, desc, __load_page, __unload_page); +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_init); + +void simple_ring_buffer_unload_mm(struct simple_rb_per_cpu *cpu_buffer, + void (*unload_page)(void *)) +{ + int p; + + if (!simple_rb_loaded(cpu_buffer)) + return; + + simple_rb_enable_tracing(cpu_buffer, false); + + unload_page(cpu_buffer->meta); + for (p = 0; p < cpu_buffer->nr_pages; p++) + unload_page(cpu_buffer->bpages[p].page); + + cpu_buffer->bpages = NULL; +} + +/** + * simple_ring_buffer_unload - Prepare @cpu_buffer for deletion + * @cpu_buffer: A simple_rb_per_cpu that will be deleted. + */ +void simple_ring_buffer_unload(struct simple_rb_per_cpu *cpu_buffer) +{ + return simple_ring_buffer_unload_mm(cpu_buffer, __unload_page); +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_unload); + +/** + * simple_ring_buffer_enable_tracing - Enable or disable writing to @cpu_buffer + * @cpu_buffer: A simple_rb_per_cpu + * @enable: True to enable tracing, False to disable it + * + * Returns 0 on success or -ENODEV if @cpu_buffer was unloaded + */ +int simple_ring_buffer_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable) +{ + if (!simple_rb_loaded(cpu_buffer)) + return -ENODEV; + + simple_rb_enable_tracing(cpu_buffer, enable); + + return 0; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_enable_tracing); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a626211ceb9a..6eb4d3097a4d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -47,7 +47,6 @@ #include <linux/trace.h> #include <linux/sched/clock.h> #include <linux/sched/rt.h> -#include <linux/fsnotify.h> #include 
<linux/irq_work.h> #include <linux/workqueue.h> #include <linux/sort.h> @@ -219,14 +218,36 @@ static void ftrace_trace_userstack(struct trace_array *tr, static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; static char *default_bootup_tracer; -static bool allocate_snapshot; -static bool snapshot_at_boot; - static char boot_instance_info[COMMAND_LINE_SIZE] __initdata; static int boot_instance_index; -static char boot_snapshot_info[COMMAND_LINE_SIZE] __initdata; -static int boot_snapshot_index; +/* + * Repeated boot parameters, including Bootconfig array expansions, need + * to stay in the delimiter form that the existing parser consumes. + */ +void __init trace_append_boot_param(char *buf, const char *str, char sep, + int size) +{ + int len, needed, str_len; + + if (!*str) + return; + + len = strlen(buf); + str_len = strlen(str); + needed = len + str_len + 1; + + /* For continuation, account for the separator. */ + if (len) + needed++; + if (needed > size) + return; + + if (len) + buf[len++] = sep; + + strscpy(buf + len, str, size - len); +} static int __init set_cmdline_ftrace(char *str) { @@ -276,38 +297,6 @@ static int __init stop_trace_on_warning(char *str) } __setup("traceoff_on_warning", stop_trace_on_warning); -static int __init boot_alloc_snapshot(char *str) -{ - char *slot = boot_snapshot_info + boot_snapshot_index; - int left = sizeof(boot_snapshot_info) - boot_snapshot_index; - int ret; - - if (str[0] == '=') { - str++; - if (strlen(str) >= left) - return -1; - - ret = snprintf(slot, left, "%s\t", str); - boot_snapshot_index += ret; - } else { - allocate_snapshot = true; - /* We also need the main ring buffer expanded */ - trace_set_ring_buffer_expanded(NULL); - } - return 1; -} -__setup("alloc_snapshot", boot_alloc_snapshot); - - -static int __init boot_snapshot(char *str) -{ - snapshot_at_boot = true; - boot_alloc_snapshot(str); - return 1; -} -__setup("ftrace_boot_snapshot", boot_snapshot); - - static int __init boot_instance(char *str) { char 
*slot = boot_instance_info + boot_instance_index; @@ -329,7 +318,8 @@ static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; static int __init set_trace_boot_options(char *str) { - strscpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); + trace_append_boot_param(trace_boot_options_buf, str, ',', + MAX_TRACER_SIZE); return 1; } __setup("trace_options=", set_trace_boot_options); @@ -578,8 +568,59 @@ void trace_set_ring_buffer_expanded(struct trace_array *tr) tr->ring_buffer_expanded = true; } +static void trace_array_autoremove(struct work_struct *work) +{ + struct trace_array *tr = container_of(work, struct trace_array, autoremove_work); + + trace_array_destroy(tr); +} + +static struct workqueue_struct *autoremove_wq; + +static void trace_array_kick_autoremove(struct trace_array *tr) +{ + if (autoremove_wq) + queue_work(autoremove_wq, &tr->autoremove_work); +} + +static void trace_array_cancel_autoremove(struct trace_array *tr) +{ + /* + * Since this can be called inside trace_array_autoremove(), + * it has to avoid deadlock of the workqueue. + */ + if (work_pending(&tr->autoremove_work)) + cancel_work_sync(&tr->autoremove_work); +} + +static void trace_array_init_autoremove(struct trace_array *tr) +{ + INIT_WORK(&tr->autoremove_work, trace_array_autoremove); +} + +static void trace_array_start_autoremove(void) +{ + if (autoremove_wq) + return; + + autoremove_wq = alloc_workqueue("tr_autoremove_wq", + WQ_UNBOUND | WQ_HIGHPRI, 0); + if (!autoremove_wq) + pr_warn("Unable to allocate tr_autoremove_wq. autoremove disabled.\n"); +} + LIST_HEAD(ftrace_trace_arrays); +static int __trace_array_get(struct trace_array *this_tr) +{ + /* When free_on_close is set, this is not available anymore. 
*/ + if (autoremove_wq && this_tr->free_on_close) + return -ENODEV; + + this_tr->ref++; + return 0; +} + int trace_array_get(struct trace_array *this_tr) { struct trace_array *tr; @@ -587,8 +628,7 @@ int trace_array_get(struct trace_array *this_tr) guard(mutex)(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (tr == this_tr) { - tr->ref++; - return 0; + return __trace_array_get(tr); } } @@ -599,6 +639,12 @@ static void __trace_array_put(struct trace_array *this_tr) { WARN_ON(!this_tr->ref); this_tr->ref--; + /* + * When free_on_close is set, prepare removing the array + * when the last reference is released. + */ + if (this_tr->ref == 1 && this_tr->free_on_close) + trace_array_kick_autoremove(this_tr); } /** @@ -807,47 +853,6 @@ void tracing_on(void) EXPORT_SYMBOL_GPL(tracing_on); #ifdef CONFIG_TRACER_SNAPSHOT -static void tracing_snapshot_instance_cond(struct trace_array *tr, - void *cond_data) -{ - unsigned long flags; - - if (in_nmi()) { - trace_array_puts(tr, "*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n"); - trace_array_puts(tr, "*** snapshot is being ignored ***\n"); - return; - } - - if (!tr->allocated_snapshot) { - trace_array_puts(tr, "*** SNAPSHOT NOT ALLOCATED ***\n"); - trace_array_puts(tr, "*** stopping trace here! 
***\n"); - tracer_tracing_off(tr); - return; - } - - if (tr->mapped) { - trace_array_puts(tr, "*** BUFFER MEMORY MAPPED ***\n"); - trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n"); - return; - } - - /* Note, snapshot can not be used when the tracer uses it */ - if (tracer_uses_snapshot(tr->current_trace)) { - trace_array_puts(tr, "*** LATENCY TRACER ACTIVE ***\n"); - trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n"); - return; - } - - local_irq_save(flags); - update_max_tr(tr, current, smp_processor_id(), cond_data); - local_irq_restore(flags); -} - -void tracing_snapshot_instance(struct trace_array *tr) -{ - tracing_snapshot_instance_cond(tr, NULL); -} - /** * tracing_snapshot - take a snapshot of the current buffer. * @@ -871,138 +876,6 @@ void tracing_snapshot(void) EXPORT_SYMBOL_GPL(tracing_snapshot); /** - * tracing_snapshot_cond - conditionally take a snapshot of the current buffer. - * @tr: The tracing instance to snapshot - * @cond_data: The data to be tested conditionally, and possibly saved - * - * This is the same as tracing_snapshot() except that the snapshot is - * conditional - the snapshot will only happen if the - * cond_snapshot.update() implementation receiving the cond_data - * returns true, which means that the trace array's cond_snapshot - * update() operation used the cond_data to determine whether the - * snapshot should be taken, and if it was, presumably saved it along - * with the snapshot. - */ -void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) -{ - tracing_snapshot_instance_cond(tr, cond_data); -} -EXPORT_SYMBOL_GPL(tracing_snapshot_cond); - -/** - * tracing_cond_snapshot_data - get the user data associated with a snapshot - * @tr: The tracing instance - * - * When the user enables a conditional snapshot using - * tracing_snapshot_cond_enable(), the user-defined cond_data is saved - * with the snapshot. This accessor is used to retrieve it. 
- * - * Should not be called from cond_snapshot.update(), since it takes - * the tr->max_lock lock, which the code calling - * cond_snapshot.update() has already done. - * - * Returns the cond_data associated with the trace array's snapshot. - */ -void *tracing_cond_snapshot_data(struct trace_array *tr) -{ - void *cond_data = NULL; - - local_irq_disable(); - arch_spin_lock(&tr->max_lock); - - if (tr->cond_snapshot) - cond_data = tr->cond_snapshot->cond_data; - - arch_spin_unlock(&tr->max_lock); - local_irq_enable(); - - return cond_data; -} -EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data); - -static int resize_buffer_duplicate_size(struct array_buffer *trace_buf, - struct array_buffer *size_buf, int cpu_id); -static void set_buffer_entries(struct array_buffer *buf, unsigned long val); - -int tracing_alloc_snapshot_instance(struct trace_array *tr) -{ - int order; - int ret; - - if (!tr->allocated_snapshot) { - - /* Make the snapshot buffer have the same order as main buffer */ - order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); - ret = ring_buffer_subbuf_order_set(tr->snapshot_buffer.buffer, order); - if (ret < 0) - return ret; - - /* allocate spare buffer */ - ret = resize_buffer_duplicate_size(&tr->snapshot_buffer, - &tr->array_buffer, RING_BUFFER_ALL_CPUS); - if (ret < 0) - return ret; - - tr->allocated_snapshot = true; - } - - return 0; -} - -static void free_snapshot(struct trace_array *tr) -{ - /* - * We don't free the ring buffer. instead, resize it because - * The max_tr ring buffer has some state (e.g. ring->clock) and - * we want preserve it. 
- */ - ring_buffer_subbuf_order_set(tr->snapshot_buffer.buffer, 0); - ring_buffer_resize(tr->snapshot_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); - set_buffer_entries(&tr->snapshot_buffer, 1); - tracing_reset_online_cpus(&tr->snapshot_buffer); - tr->allocated_snapshot = false; -} - -static int tracing_arm_snapshot_locked(struct trace_array *tr) -{ - int ret; - - lockdep_assert_held(&trace_types_lock); - - spin_lock(&tr->snapshot_trigger_lock); - if (tr->snapshot == UINT_MAX || tr->mapped) { - spin_unlock(&tr->snapshot_trigger_lock); - return -EBUSY; - } - - tr->snapshot++; - spin_unlock(&tr->snapshot_trigger_lock); - - ret = tracing_alloc_snapshot_instance(tr); - if (ret) { - spin_lock(&tr->snapshot_trigger_lock); - tr->snapshot--; - spin_unlock(&tr->snapshot_trigger_lock); - } - - return ret; -} - -int tracing_arm_snapshot(struct trace_array *tr) -{ - guard(mutex)(&trace_types_lock); - return tracing_arm_snapshot_locked(tr); -} - -void tracing_disarm_snapshot(struct trace_array *tr) -{ - spin_lock(&tr->snapshot_trigger_lock); - if (!WARN_ON(!tr->snapshot)) - tr->snapshot--; - spin_unlock(&tr->snapshot_trigger_lock); -} - -/** * tracing_alloc_snapshot - allocate snapshot buffer. * * This only allocates the snapshot buffer if it isn't already @@ -1022,159 +895,18 @@ int tracing_alloc_snapshot(void) return ret; } -EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); - -/** - * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer. - * - * This is similar to tracing_snapshot(), but it will allocate the - * snapshot buffer if it isn't already allocated. Use this only - * where it is safe to sleep, as the allocation may sleep. - * - * This causes a swap between the snapshot buffer and the current live - * tracing buffer. You can use this to take snapshots of the live - * trace when some condition is triggered, but continue to trace. 
- */ -void tracing_snapshot_alloc(void) -{ - int ret; - - ret = tracing_alloc_snapshot(); - if (ret < 0) - return; - - tracing_snapshot(); -} -EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); - -/** - * tracing_snapshot_cond_enable - enable conditional snapshot for an instance - * @tr: The tracing instance - * @cond_data: User data to associate with the snapshot - * @update: Implementation of the cond_snapshot update function - * - * Check whether the conditional snapshot for the given instance has - * already been enabled, or if the current tracer is already using a - * snapshot; if so, return -EBUSY, else create a cond_snapshot and - * save the cond_data and update function inside. - * - * Returns 0 if successful, error otherwise. - */ -int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, - cond_update_fn_t update) -{ - struct cond_snapshot *cond_snapshot __free(kfree) = - kzalloc_obj(*cond_snapshot); - int ret; - - if (!cond_snapshot) - return -ENOMEM; - - cond_snapshot->cond_data = cond_data; - cond_snapshot->update = update; - - guard(mutex)(&trace_types_lock); - - if (tracer_uses_snapshot(tr->current_trace)) - return -EBUSY; - - /* - * The cond_snapshot can only change to NULL without the - * trace_types_lock. We don't care if we race with it going - * to NULL, but we want to make sure that it's not set to - * something other than NULL when we get here, which we can - * do safely with only holding the trace_types_lock and not - * having to take the max_lock. 
- */ - if (tr->cond_snapshot) - return -EBUSY; - - ret = tracing_arm_snapshot_locked(tr); - if (ret) - return ret; - - local_irq_disable(); - arch_spin_lock(&tr->max_lock); - tr->cond_snapshot = no_free_ptr(cond_snapshot); - arch_spin_unlock(&tr->max_lock); - local_irq_enable(); - - return 0; -} -EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); - -/** - * tracing_snapshot_cond_disable - disable conditional snapshot for an instance - * @tr: The tracing instance - * - * Check whether the conditional snapshot for the given instance is - * enabled; if so, free the cond_snapshot associated with it, - * otherwise return -EINVAL. - * - * Returns 0 if successful, error otherwise. - */ -int tracing_snapshot_cond_disable(struct trace_array *tr) -{ - int ret = 0; - - local_irq_disable(); - arch_spin_lock(&tr->max_lock); - - if (!tr->cond_snapshot) - ret = -EINVAL; - else { - kfree(tr->cond_snapshot); - tr->cond_snapshot = NULL; - } - - arch_spin_unlock(&tr->max_lock); - local_irq_enable(); - - tracing_disarm_snapshot(tr); - - return ret; -} -EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); #else void tracing_snapshot(void) { WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); } EXPORT_SYMBOL_GPL(tracing_snapshot); -void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) -{ - WARN_ONCE(1, "Snapshot feature not enabled, but internal conditional snapshot used"); -} -EXPORT_SYMBOL_GPL(tracing_snapshot_cond); -int tracing_alloc_snapshot(void) -{ - WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used"); - return -ENODEV; -} -EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); void tracing_snapshot_alloc(void) { /* Give warning */ tracing_snapshot(); } EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); -void *tracing_cond_snapshot_data(struct trace_array *tr) -{ - return NULL; -} -EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data); -int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update) -{ - return -ENODEV; 
-} -EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); -int tracing_snapshot_cond_disable(struct trace_array *tr) -{ - return false; -} -EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); -#define free_snapshot(tr) do { } while (0) -#define tracing_arm_snapshot_locked(tr) ({ -EBUSY; }) #endif /* CONFIG_TRACER_SNAPSHOT */ void tracer_tracing_off(struct trace_array *tr) @@ -1487,206 +1219,6 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) unsigned long __read_mostly tracing_thresh; -#ifdef CONFIG_TRACER_MAX_TRACE -#ifdef LATENCY_FS_NOTIFY -static struct workqueue_struct *fsnotify_wq; - -static void latency_fsnotify_workfn(struct work_struct *work) -{ - struct trace_array *tr = container_of(work, struct trace_array, - fsnotify_work); - fsnotify_inode(tr->d_max_latency->d_inode, FS_MODIFY); -} - -static void latency_fsnotify_workfn_irq(struct irq_work *iwork) -{ - struct trace_array *tr = container_of(iwork, struct trace_array, - fsnotify_irqwork); - queue_work(fsnotify_wq, &tr->fsnotify_work); -} - -__init static int latency_fsnotify_init(void) -{ - fsnotify_wq = alloc_workqueue("tr_max_lat_wq", - WQ_UNBOUND | WQ_HIGHPRI, 0); - if (!fsnotify_wq) { - pr_err("Unable to allocate tr_max_lat_wq\n"); - return -ENOMEM; - } - return 0; -} - -late_initcall_sync(latency_fsnotify_init); - -void latency_fsnotify(struct trace_array *tr) -{ - if (!fsnotify_wq) - return; - /* - * We cannot call queue_work(&tr->fsnotify_work) from here because it's - * possible that we are called from __schedule() or do_idle(), which - * could cause a deadlock. 
- */ - irq_work_queue(&tr->fsnotify_irqwork); -} -#endif /* !LATENCY_FS_NOTIFY */ - -static const struct file_operations tracing_max_lat_fops; - -static void trace_create_maxlat_file(struct trace_array *tr, - struct dentry *d_tracer) -{ -#ifdef LATENCY_FS_NOTIFY - INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn); - init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq); -#endif - tr->d_max_latency = trace_create_file("tracing_max_latency", - TRACE_MODE_WRITE, - d_tracer, tr, - &tracing_max_lat_fops); -} - -/* - * Copy the new maximum trace into the separate maximum-trace - * structure. (this way the maximum trace is permanently saved, - * for later retrieval via /sys/kernel/tracing/tracing_max_latency) - */ -static void -__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) -{ - struct array_buffer *trace_buf = &tr->array_buffer; - struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu); - struct array_buffer *max_buf = &tr->snapshot_buffer; - struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu); - - max_buf->cpu = cpu; - max_buf->time_start = data->preempt_timestamp; - - max_data->saved_latency = tr->max_latency; - max_data->critical_start = data->critical_start; - max_data->critical_end = data->critical_end; - - strscpy(max_data->comm, tsk->comm); - max_data->pid = tsk->pid; - /* - * If tsk == current, then use current_uid(), as that does not use - * RCU. The irq tracer can be called out of RCU scope. 
- */ - if (tsk == current) - max_data->uid = current_uid(); - else - max_data->uid = task_uid(tsk); - - max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; - max_data->policy = tsk->policy; - max_data->rt_priority = tsk->rt_priority; - - /* record this tasks comm */ - tracing_record_cmdline(tsk); - latency_fsnotify(tr); -} -#else -static inline void trace_create_maxlat_file(struct trace_array *tr, - struct dentry *d_tracer) { } -static inline void __update_max_tr(struct trace_array *tr, - struct task_struct *tsk, int cpu) { } -#endif /* CONFIG_TRACER_MAX_TRACE */ - -#ifdef CONFIG_TRACER_SNAPSHOT -/** - * update_max_tr - snapshot all trace buffers from global_trace to max_tr - * @tr: tracer - * @tsk: the task with the latency - * @cpu: The cpu that initiated the trace. - * @cond_data: User data associated with a conditional snapshot - * - * Flip the buffers between the @tr and the max_tr and record information - * about which task was the cause of this latency. - */ -void -update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, - void *cond_data) -{ - if (tr->stop_count) - return; - - WARN_ON_ONCE(!irqs_disabled()); - - if (!tr->allocated_snapshot) { - /* Only the nop tracer should hit this when disabling */ - WARN_ON_ONCE(tr->current_trace != &nop_trace); - return; - } - - arch_spin_lock(&tr->max_lock); - - /* Inherit the recordable setting from array_buffer */ - if (ring_buffer_record_is_set_on(tr->array_buffer.buffer)) - ring_buffer_record_on(tr->snapshot_buffer.buffer); - else - ring_buffer_record_off(tr->snapshot_buffer.buffer); - - if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) { - arch_spin_unlock(&tr->max_lock); - return; - } - - swap(tr->array_buffer.buffer, tr->snapshot_buffer.buffer); - - __update_max_tr(tr, tsk, cpu); - - arch_spin_unlock(&tr->max_lock); - - /* Any waiters on the old snapshot buffer need to wake up */ - ring_buffer_wake_waiters(tr->array_buffer.buffer, RING_BUFFER_ALL_CPUS); -} - -/** - * 
update_max_tr_single - only copy one trace over, and reset the rest - * @tr: tracer - * @tsk: task with the latency - * @cpu: the cpu of the buffer to copy. - * - * Flip the trace of a single CPU buffer between the @tr and the max_tr. - */ -void -update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) -{ - int ret; - - if (tr->stop_count) - return; - - WARN_ON_ONCE(!irqs_disabled()); - if (!tr->allocated_snapshot) { - /* Only the nop tracer should hit this when disabling */ - WARN_ON_ONCE(tr->current_trace != &nop_trace); - return; - } - - arch_spin_lock(&tr->max_lock); - - ret = ring_buffer_swap_cpu(tr->snapshot_buffer.buffer, tr->array_buffer.buffer, cpu); - - if (ret == -EBUSY) { - /* - * We failed to swap the buffer due to a commit taking - * place on this CPU. We fail to record, but we reset - * the max trace buffer (no one writes directly to it) - * and flag that it failed. - * Another reason is resize is in progress. - */ - trace_array_printk_buf(tr->snapshot_buffer.buffer, _THIS_IP_, - "Failed to swap buffers due to commit or resize in progress\n"); - } - - WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); - - __update_max_tr(tr, tsk, cpu); - arch_spin_unlock(&tr->max_lock); -} -#endif /* CONFIG_TRACER_SNAPSHOT */ - struct pipe_wait { struct trace_iterator *iter; int wait_index; @@ -1995,7 +1527,7 @@ int __init register_tracer(struct tracer *type) return 0; } -static void tracing_reset_cpu(struct array_buffer *buf, int cpu) +void tracing_reset_cpu(struct array_buffer *buf, int cpu) { struct trace_buffer *buffer = buf->buffer; @@ -3760,50 +3292,6 @@ static void test_ftrace_alive(struct seq_file *m) "# MAY BE MISSING FUNCTION EVENTS\n"); } -#ifdef CONFIG_TRACER_SNAPSHOT -static void show_snapshot_main_help(struct seq_file *m) -{ - seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n" - "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" - "# Takes a snapshot of the main buffer.\n" - "# 
echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n" - "# (Doesn't have to be '2' works with any number that\n" - "# is not a '0' or '1')\n"); -} - -static void show_snapshot_percpu_help(struct seq_file *m) -{ - seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); -#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP - seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" - "# Takes a snapshot of the main buffer for this cpu.\n"); -#else - seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n" - "# Must use main snapshot file to allocate.\n"); -#endif - seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n" - "# (Doesn't have to be '2' works with any number that\n" - "# is not a '0' or '1')\n"); -} - -static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) -{ - if (iter->tr->allocated_snapshot) - seq_puts(m, "#\n# * Snapshot is allocated *\n#\n"); - else - seq_puts(m, "#\n# * Snapshot is freed *\n#\n"); - - seq_puts(m, "# Snapshot commands:\n"); - if (iter->cpu_file == RING_BUFFER_ALL_CPUS) - show_snapshot_main_help(m); - else - show_snapshot_percpu_help(m); -} -#else -/* Should never be called */ -static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { } -#endif - static int s_show(struct seq_file *m, void *v) { struct trace_iterator *iter = v; @@ -3852,17 +3340,6 @@ static int s_show(struct seq_file *m, void *v) return 0; } -/* - * Should be used after trace_array_get(), trace_types_lock - * ensures that i_cdev was already initialized. 
- */ -static inline int tracing_get_cpu(struct inode *inode) -{ - if (inode->i_cdev) /* See trace_create_cpu_file() */ - return (long)inode->i_cdev - 1; - return RING_BUFFER_ALL_CPUS; -} - static const struct seq_operations tracer_seq_ops = { .start = s_start, .next = s_next, @@ -3889,7 +3366,7 @@ static void free_trace_iter_content(struct trace_iterator *iter) free_cpumask_var(iter->started); } -static struct trace_iterator * +struct trace_iterator * __tracing_open(struct inode *inode, struct file *file, bool snapshot) { struct trace_array *tr = inode->i_private; @@ -4022,6 +3499,11 @@ int tracing_open_generic_tr(struct inode *inode, struct file *filp) if (ret) return ret; + if ((filp->f_mode & FMODE_WRITE) && trace_array_is_readonly(tr)) { + trace_array_put(tr); + return -EACCES; + } + filp->private_data = inode->i_private; return 0; @@ -4050,8 +3532,6 @@ int tracing_open_file_tr(struct inode *inode, struct file *filp) event_file_get(file); } - filp->private_data = inode->i_private; - return 0; } @@ -4071,7 +3551,7 @@ int tracing_single_release_file_tr(struct inode *inode, struct file *filp) return single_release(inode, filp); } -static int tracing_release(struct inode *inode, struct file *file) +int tracing_release(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; struct seq_file *m = file->private_data; @@ -5222,7 +4702,7 @@ int tracer_init(struct tracer *t, struct trace_array *tr) return t->init(tr); } -static void set_buffer_entries(struct array_buffer *buf, unsigned long val) +void trace_set_buffer_entries(struct array_buffer *buf, unsigned long val) { int cpu; @@ -5233,40 +4713,12 @@ static void set_buffer_entries(struct array_buffer *buf, unsigned long val) static void update_buffer_entries(struct array_buffer *buf, int cpu) { if (cpu == RING_BUFFER_ALL_CPUS) { - set_buffer_entries(buf, ring_buffer_size(buf->buffer, 0)); + trace_set_buffer_entries(buf, ring_buffer_size(buf->buffer, 0)); } else { per_cpu_ptr(buf->data, 
cpu)->entries = ring_buffer_size(buf->buffer, cpu); } } -#ifdef CONFIG_TRACER_SNAPSHOT -/* resize @tr's buffer to the size of @size_tr's entries */ -static int resize_buffer_duplicate_size(struct array_buffer *trace_buf, - struct array_buffer *size_buf, int cpu_id) -{ - int cpu, ret = 0; - - if (cpu_id == RING_BUFFER_ALL_CPUS) { - for_each_tracing_cpu(cpu) { - ret = ring_buffer_resize(trace_buf->buffer, - per_cpu_ptr(size_buf->data, cpu)->entries, cpu); - if (ret < 0) - break; - per_cpu_ptr(trace_buf->data, cpu)->entries = - per_cpu_ptr(size_buf->data, cpu)->entries; - } - } else { - ret = ring_buffer_resize(trace_buf->buffer, - per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id); - if (ret == 0) - per_cpu_ptr(trace_buf->data, cpu_id)->entries = - per_cpu_ptr(size_buf->data, cpu_id)->entries; - } - - return ret; -} -#endif /* CONFIG_TRACER_SNAPSHOT */ - static int __tracing_resize_ring_buffer(struct trace_array *tr, unsigned long size, int cpu) { @@ -5462,6 +4914,10 @@ static void update_last_data(struct trace_array *tr) /* Only if the buffer has previous boot data clear and update it. */ tr->flags &= ~TRACE_ARRAY_FL_LAST_BOOT; + /* If this is a backup instance, mark it for autoremove. 
*/ + if (tr->flags & TRACE_ARRAY_FL_VMALLOC) + tr->free_on_close = true; + /* Reset the module list and reload them */ if (tr->scratch) { struct trace_scratch *tscratch = tr->scratch; @@ -5685,9 +5141,8 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, return ret; } -static ssize_t -tracing_nsecs_read(unsigned long *ptr, char __user *ubuf, - size_t cnt, loff_t *ppos) +ssize_t tracing_nsecs_read(unsigned long *ptr, char __user *ubuf, + size_t cnt, loff_t *ppos) { char buf[64]; int r; @@ -5699,9 +5154,8 @@ tracing_nsecs_read(unsigned long *ptr, char __user *ubuf, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } -static ssize_t -tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf, - size_t cnt, loff_t *ppos) +ssize_t tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf, + size_t cnt, loff_t *ppos) { unsigned long val; int ret; @@ -5743,28 +5197,6 @@ tracing_thresh_write(struct file *filp, const char __user *ubuf, return cnt; } -#ifdef CONFIG_TRACER_MAX_TRACE - -static ssize_t -tracing_max_lat_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - struct trace_array *tr = filp->private_data; - - return tracing_nsecs_read(&tr->max_latency, ubuf, cnt, ppos); -} - -static ssize_t -tracing_max_lat_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - struct trace_array *tr = filp->private_data; - - return tracing_nsecs_write(&tr->max_latency, ubuf, cnt, ppos); -} - -#endif - static int open_pipe_on_cpu(struct trace_array *tr, int cpu) { if (cpu == RING_BUFFER_ALL_CPUS) { @@ -7097,6 +6529,11 @@ static int tracing_clock_open(struct inode *inode, struct file *file) if (ret) return ret; + if ((file->f_mode & FMODE_WRITE) && trace_array_is_readonly(tr)) { + trace_array_put(tr); + return -EACCES; + } + ret = single_open(file, tracing_clock_show, inode->i_private); if (ret < 0) trace_array_put(tr); @@ -7142,194 +6579,6 @@ u64 tracing_event_time_stamp(struct trace_buffer 
*buffer, struct ring_buffer_eve return ring_buffer_event_time_stamp(buffer, rbe); } -struct ftrace_buffer_info { - struct trace_iterator iter; - void *spare; - unsigned int spare_cpu; - unsigned int spare_size; - unsigned int read; -}; - -#ifdef CONFIG_TRACER_SNAPSHOT -static int tracing_snapshot_open(struct inode *inode, struct file *file) -{ - struct trace_array *tr = inode->i_private; - struct trace_iterator *iter; - struct seq_file *m; - int ret; - - ret = tracing_check_open_get_tr(tr); - if (ret) - return ret; - - if (file->f_mode & FMODE_READ) { - iter = __tracing_open(inode, file, true); - if (IS_ERR(iter)) - ret = PTR_ERR(iter); - } else { - /* Writes still need the seq_file to hold the private data */ - ret = -ENOMEM; - m = kzalloc_obj(*m); - if (!m) - goto out; - iter = kzalloc_obj(*iter); - if (!iter) { - kfree(m); - goto out; - } - ret = 0; - - iter->tr = tr; - iter->array_buffer = &tr->snapshot_buffer; - iter->cpu_file = tracing_get_cpu(inode); - m->private = iter; - file->private_data = m; - } -out: - if (ret < 0) - trace_array_put(tr); - - return ret; -} - -static void tracing_swap_cpu_buffer(void *tr) -{ - update_max_tr_single((struct trace_array *)tr, current, smp_processor_id()); -} - -static ssize_t -tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, - loff_t *ppos) -{ - struct seq_file *m = filp->private_data; - struct trace_iterator *iter = m->private; - struct trace_array *tr = iter->tr; - unsigned long val; - int ret; - - ret = tracing_update_buffers(tr); - if (ret < 0) - return ret; - - ret = kstrtoul_from_user(ubuf, cnt, 10, &val); - if (ret) - return ret; - - guard(mutex)(&trace_types_lock); - - if (tracer_uses_snapshot(tr->current_trace)) - return -EBUSY; - - local_irq_disable(); - arch_spin_lock(&tr->max_lock); - if (tr->cond_snapshot) - ret = -EBUSY; - arch_spin_unlock(&tr->max_lock); - local_irq_enable(); - if (ret) - return ret; - - switch (val) { - case 0: - if (iter->cpu_file != RING_BUFFER_ALL_CPUS) - 
return -EINVAL; - if (tr->allocated_snapshot) - free_snapshot(tr); - break; - case 1: -/* Only allow per-cpu swap if the ring buffer supports it */ -#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP - if (iter->cpu_file != RING_BUFFER_ALL_CPUS) - return -EINVAL; -#endif - if (tr->allocated_snapshot) - ret = resize_buffer_duplicate_size(&tr->snapshot_buffer, - &tr->array_buffer, iter->cpu_file); - - ret = tracing_arm_snapshot_locked(tr); - if (ret) - return ret; - - /* Now, we're going to swap */ - if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { - local_irq_disable(); - update_max_tr(tr, current, smp_processor_id(), NULL); - local_irq_enable(); - } else { - smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer, - (void *)tr, 1); - } - tracing_disarm_snapshot(tr); - break; - default: - if (tr->allocated_snapshot) { - if (iter->cpu_file == RING_BUFFER_ALL_CPUS) - tracing_reset_online_cpus(&tr->snapshot_buffer); - else - tracing_reset_cpu(&tr->snapshot_buffer, iter->cpu_file); - } - break; - } - - if (ret >= 0) { - *ppos += cnt; - ret = cnt; - } - - return ret; -} - -static int tracing_snapshot_release(struct inode *inode, struct file *file) -{ - struct seq_file *m = file->private_data; - int ret; - - ret = tracing_release(inode, file); - - if (file->f_mode & FMODE_READ) - return ret; - - /* If write only, the seq_file is just a stub */ - if (m) - kfree(m->private); - kfree(m); - - return 0; -} - -static int tracing_buffers_open(struct inode *inode, struct file *filp); -static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf, - size_t count, loff_t *ppos); -static int tracing_buffers_release(struct inode *inode, struct file *file); -static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, unsigned int flags); - -static int snapshot_raw_open(struct inode *inode, struct file *filp) -{ - struct ftrace_buffer_info *info; - int ret; - - /* The following checks for tracefs lockdown */ - ret = 
tracing_buffers_open(inode, filp); - if (ret < 0) - return ret; - - info = filp->private_data; - - if (tracer_uses_snapshot(info->iter.trace)) { - tracing_buffers_release(inode, filp); - return -EBUSY; - } - - info->iter.snapshot = true; - info->iter.array_buffer = &info->iter.tr->snapshot_buffer; - - return ret; -} - -#endif /* CONFIG_TRACER_SNAPSHOT */ - - static const struct file_operations tracing_thresh_fops = { .open = tracing_open_generic, .read = tracing_thresh_read, @@ -7337,16 +6586,6 @@ static const struct file_operations tracing_thresh_fops = { .llseek = generic_file_llseek, }; -#ifdef CONFIG_TRACER_MAX_TRACE -static const struct file_operations tracing_max_lat_fops = { - .open = tracing_open_generic_tr, - .read = tracing_max_lat_read, - .write = tracing_max_lat_write, - .llseek = generic_file_llseek, - .release = tracing_release_generic_tr, -}; -#endif - static const struct file_operations set_tracer_fops = { .open = tracing_open_generic_tr, .read = tracing_set_trace_read, @@ -7433,24 +6672,6 @@ static const struct file_operations last_boot_fops = { .release = tracing_seq_release, }; -#ifdef CONFIG_TRACER_SNAPSHOT -static const struct file_operations snapshot_fops = { - .open = tracing_snapshot_open, - .read = seq_read, - .write = tracing_snapshot_write, - .llseek = tracing_lseek, - .release = tracing_snapshot_release, -}; - -static const struct file_operations snapshot_raw_fops = { - .open = snapshot_raw_open, - .read = tracing_buffers_read, - .release = tracing_buffers_release, - .splice_read = tracing_buffers_splice_read, -}; - -#endif /* CONFIG_TRACER_SNAPSHOT */ - /* * trace_min_max_write - Write a u64 value to a trace_min_max_param struct * @filp: The active open file structure @@ -7810,7 +7031,7 @@ static const struct file_operations tracing_err_log_fops = { .release = tracing_err_log_release, }; -static int tracing_buffers_open(struct inode *inode, struct file *filp) +int tracing_buffers_open(struct inode *inode, struct file *filp) { struct 
trace_array *tr = inode->i_private; struct ftrace_buffer_info *info; @@ -7858,9 +7079,8 @@ tracing_buffers_poll(struct file *filp, poll_table *poll_table) return trace_poll(iter, filp, poll_table); } -static ssize_t -tracing_buffers_read(struct file *filp, char __user *ubuf, - size_t count, loff_t *ppos) +ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) { struct ftrace_buffer_info *info = filp->private_data; struct trace_iterator *iter = &info->iter; @@ -7961,7 +7181,7 @@ static int tracing_buffers_flush(struct file *file, fl_owner_t id) return 0; } -static int tracing_buffers_release(struct inode *inode, struct file *file) +int tracing_buffers_release(struct inode *inode, struct file *file) { struct ftrace_buffer_info *info = file->private_data; struct trace_iterator *iter = &info->iter; @@ -8035,10 +7255,9 @@ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i) spd->partial[i].private = 0; } -static ssize_t -tracing_buffers_splice_read(struct file *file, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) +ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { struct ftrace_buffer_info *info = file->private_data; struct trace_iterator *iter = &info->iter; @@ -8192,44 +7411,6 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned return 0; } -#ifdef CONFIG_TRACER_SNAPSHOT -static int get_snapshot_map(struct trace_array *tr) -{ - int err = 0; - - /* - * Called with mmap_lock held. lockdep would be unhappy if we would now - * take trace_types_lock. Instead use the specific - * snapshot_trigger_lock. 
- */ - spin_lock(&tr->snapshot_trigger_lock); - - if (tr->snapshot || tr->mapped == UINT_MAX) - err = -EBUSY; - else - tr->mapped++; - - spin_unlock(&tr->snapshot_trigger_lock); - - /* Wait for update_max_tr() to observe iter->tr->mapped */ - if (tr->mapped == 1) - synchronize_rcu(); - - return err; - -} -static void put_snapshot_map(struct trace_array *tr) -{ - spin_lock(&tr->snapshot_trigger_lock); - if (!WARN_ON(!tr->mapped)) - tr->mapped--; - spin_unlock(&tr->snapshot_trigger_lock); -} -#else -static inline int get_snapshot_map(struct trace_array *tr) { return 0; } -static inline void put_snapshot_map(struct trace_array *tr) { } -#endif - /* * This is called when a VMA is duplicated (e.g., on fork()) to increment * the user_mapped counter without remapping pages. @@ -8410,170 +7591,6 @@ static const struct file_operations tracing_dyn_info_fops = { }; #endif /* CONFIG_DYNAMIC_FTRACE */ -#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) -static void -ftrace_snapshot(unsigned long ip, unsigned long parent_ip, - struct trace_array *tr, struct ftrace_probe_ops *ops, - void *data) -{ - tracing_snapshot_instance(tr); -} - -static void -ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, - struct trace_array *tr, struct ftrace_probe_ops *ops, - void *data) -{ - struct ftrace_func_mapper *mapper = data; - long *count = NULL; - - if (mapper) - count = (long *)ftrace_func_mapper_find_ip(mapper, ip); - - if (count) { - - if (*count <= 0) - return; - - (*count)--; - } - - tracing_snapshot_instance(tr); -} - -static int -ftrace_snapshot_print(struct seq_file *m, unsigned long ip, - struct ftrace_probe_ops *ops, void *data) -{ - struct ftrace_func_mapper *mapper = data; - long *count = NULL; - - seq_printf(m, "%ps:", (void *)ip); - - seq_puts(m, "snapshot"); - - if (mapper) - count = (long *)ftrace_func_mapper_find_ip(mapper, ip); - - if (count) - seq_printf(m, ":count=%ld\n", *count); - else - seq_puts(m, ":unlimited\n"); - - return 0; -} - 
-static int -ftrace_snapshot_init(struct ftrace_probe_ops *ops, struct trace_array *tr, - unsigned long ip, void *init_data, void **data) -{ - struct ftrace_func_mapper *mapper = *data; - - if (!mapper) { - mapper = allocate_ftrace_func_mapper(); - if (!mapper) - return -ENOMEM; - *data = mapper; - } - - return ftrace_func_mapper_add_ip(mapper, ip, init_data); -} - -static void -ftrace_snapshot_free(struct ftrace_probe_ops *ops, struct trace_array *tr, - unsigned long ip, void *data) -{ - struct ftrace_func_mapper *mapper = data; - - if (!ip) { - if (!mapper) - return; - free_ftrace_func_mapper(mapper, NULL); - return; - } - - ftrace_func_mapper_remove_ip(mapper, ip); -} - -static struct ftrace_probe_ops snapshot_probe_ops = { - .func = ftrace_snapshot, - .print = ftrace_snapshot_print, -}; - -static struct ftrace_probe_ops snapshot_count_probe_ops = { - .func = ftrace_count_snapshot, - .print = ftrace_snapshot_print, - .init = ftrace_snapshot_init, - .free = ftrace_snapshot_free, -}; - -static int -ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, - char *glob, char *cmd, char *param, int enable) -{ - struct ftrace_probe_ops *ops; - void *count = (void *)-1; - char *number; - int ret; - - if (!tr) - return -ENODEV; - - /* hash funcs only work with set_ftrace_filter */ - if (!enable) - return -EINVAL; - - ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops; - - if (glob[0] == '!') { - ret = unregister_ftrace_function_probe_func(glob+1, tr, ops); - if (!ret) - tracing_disarm_snapshot(tr); - - return ret; - } - - if (!param) - goto out_reg; - - number = strsep(&param, ":"); - - if (!strlen(number)) - goto out_reg; - - /* - * We use the callback data field (which is a pointer) - * as our counter.
- */ - ret = kstrtoul(number, 0, (unsigned long *)&count); - if (ret) - return ret; - - out_reg: - ret = tracing_arm_snapshot(tr); - if (ret < 0) - return ret; - - ret = register_ftrace_function_probe(glob, tr, ops, count); - if (ret < 0) - tracing_disarm_snapshot(tr); - - return ret < 0 ? ret : 0; -} - -static struct ftrace_func_command ftrace_snapshot_cmd = { - .name = "snapshot", - .func = ftrace_trace_snapshot_callback, -}; - -static __init int register_snapshot_cmd(void) -{ - return register_ftrace_command(&ftrace_snapshot_cmd); -} -#else -static inline __init int register_snapshot_cmd(void) { return 0; } -#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ - static struct dentry *tracing_get_dentry(struct trace_array *tr) { /* Top directory uses NULL as the parent */ @@ -8606,7 +7623,7 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) return tr->percpu_dir; } -static struct dentry * +struct dentry * trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent, void *data, long cpu, const struct file_operations *fops) { @@ -9366,8 +8383,7 @@ static void setup_trace_scratch(struct trace_array *tr, memset(tscratch, 0, size); } -static int -allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, unsigned long size) +int allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size) { enum ring_buffer_flags rb_flags; struct trace_scratch *tscratch; @@ -9406,8 +8422,8 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, unsigned } /* Allocate the first page for all buffers */ - set_buffer_entries(&tr->array_buffer, - ring_buffer_size(tr->array_buffer.buffer, 0)); + trace_set_buffer_entries(&tr->array_buffer, + ring_buffer_size(tr->array_buffer.buffer, 0)); return 0; } @@ -9430,23 +8446,11 @@ static int allocate_trace_buffers(struct trace_array *tr, unsigned long size) if (ret) return ret; -#ifdef CONFIG_TRACER_SNAPSHOT - /* Fix mapped 
buffer trace arrays do not have snapshot buffers */ - if (tr->range_addr_start) - return 0; - - ret = allocate_trace_buffer(tr, &tr->snapshot_buffer, - allocate_snapshot ? size : 1); - if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) { + ret = trace_allocate_snapshot(tr, size); + if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) free_trace_buffer(&tr->array_buffer); - return -ENOMEM; - } - tr->allocated_snapshot = allocate_snapshot; - allocate_snapshot = false; -#endif - - return 0; + return ret; } static void free_trace_buffers(struct trace_array *tr) @@ -9527,8 +8531,8 @@ struct trace_array *trace_array_find_get(const char *instance) guard(mutex)(&trace_types_lock); tr = trace_array_find(instance); - if (tr) - tr->ref++; + if (tr && __trace_array_get(tr) < 0) + tr = NULL; return tr; } @@ -9625,6 +8629,8 @@ trace_array_create_systems(const char *name, const char *systems, if (ftrace_allocate_ftrace_ops(tr) < 0) goto out_free_tr; + trace_array_init_autoremove(tr); + ftrace_init_trace_array(tr); init_trace_flags_index(tr); @@ -9735,7 +8741,9 @@ struct trace_array *trace_array_get_by_name(const char *name, const char *system list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (tr->name && strcmp(tr->name, name) == 0) { - tr->ref++; + /* if this fails, @tr is going to be removed. 
*/ + if (__trace_array_get(tr) < 0) + tr = NULL; return tr; } } @@ -9774,6 +8782,7 @@ static int __remove_instance(struct trace_array *tr) set_tracer_flag(tr, 1ULL << i, 0); } + trace_array_cancel_autoremove(tr); tracing_set_nop(tr); clear_ftrace_function_probes(tr); event_trace_del_tracer(tr); @@ -9866,17 +8875,22 @@ static __init void create_trace_instances(struct dentry *d_tracer) static void init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) { + umode_t writable_mode = TRACE_MODE_WRITE; int cpu; + if (trace_array_is_readonly(tr)) + writable_mode = TRACE_MODE_READ; + trace_create_file("available_tracers", TRACE_MODE_READ, d_tracer, - tr, &show_traces_fops); + tr, &show_traces_fops); - trace_create_file("current_tracer", TRACE_MODE_WRITE, d_tracer, - tr, &set_tracer_fops); + trace_create_file("current_tracer", writable_mode, d_tracer, + tr, &set_tracer_fops); - trace_create_file("tracing_cpumask", TRACE_MODE_WRITE, d_tracer, + trace_create_file("tracing_cpumask", writable_mode, d_tracer, tr, &tracing_cpumask_fops); + /* Options are used for changing print-format even for readonly instance. 
*/ trace_create_file("trace_options", TRACE_MODE_WRITE, d_tracer, tr, &tracing_iter_fops); @@ -9886,12 +8900,36 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("trace_pipe", TRACE_MODE_READ, d_tracer, tr, &tracing_pipe_fops); - trace_create_file("buffer_size_kb", TRACE_MODE_WRITE, d_tracer, + trace_create_file("buffer_size_kb", writable_mode, d_tracer, tr, &tracing_entries_fops); trace_create_file("buffer_total_size_kb", TRACE_MODE_READ, d_tracer, tr, &tracing_total_entries_fops); + trace_create_file("trace_clock", writable_mode, d_tracer, tr, + &trace_clock_fops); + + trace_create_file("timestamp_mode", TRACE_MODE_READ, d_tracer, tr, + &trace_time_stamp_mode_fops); + + tr->buffer_percent = 50; + + trace_create_file("buffer_subbuf_size_kb", writable_mode, d_tracer, + tr, &buffer_subbuf_size_fops); + + create_trace_options_dir(tr); + + if (tr->range_addr_start) + trace_create_file("last_boot_info", TRACE_MODE_READ, d_tracer, + tr, &last_boot_fops); + + for_each_tracing_cpu(cpu) + tracing_init_tracefs_percpu(tr, cpu); + + /* Read-only instance has above files only. 
*/ + if (trace_array_is_readonly(tr)) + return; + trace_create_file("free_buffer", 0200, d_tracer, tr, &tracing_free_buffer_fops); @@ -9903,49 +8941,29 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("trace_marker_raw", 0220, d_tracer, tr, &tracing_mark_raw_fops); - trace_create_file("trace_clock", TRACE_MODE_WRITE, d_tracer, tr, - &trace_clock_fops); - - trace_create_file("tracing_on", TRACE_MODE_WRITE, d_tracer, - tr, &rb_simple_fops); - - trace_create_file("timestamp_mode", TRACE_MODE_READ, d_tracer, tr, - &trace_time_stamp_mode_fops); - - tr->buffer_percent = 50; - trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer, - tr, &buffer_percent_fops); - - trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer, - tr, &buffer_subbuf_size_fops); + tr, &buffer_percent_fops); trace_create_file("syscall_user_buf_size", TRACE_MODE_WRITE, d_tracer, - tr, &tracing_syscall_buf_fops); + tr, &tracing_syscall_buf_fops); - create_trace_options_dir(tr); + trace_create_file("tracing_on", TRACE_MODE_WRITE, d_tracer, + tr, &rb_simple_fops); trace_create_maxlat_file(tr, d_tracer); if (ftrace_create_function_files(tr, d_tracer)) MEM_FAIL(1, "Could not allocate function filter files"); - if (tr->range_addr_start) { - trace_create_file("last_boot_info", TRACE_MODE_READ, d_tracer, - tr, &last_boot_fops); #ifdef CONFIG_TRACER_SNAPSHOT - } else { + if (!tr->range_addr_start) trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer, tr, &snapshot_fops); #endif - } trace_create_file("error_log", TRACE_MODE_WRITE, d_tracer, tr, &tracing_err_log_fops); - for_each_tracing_cpu(cpu) - tracing_init_tracefs_percpu(tr, cpu); - ftrace_init_tracefs(tr, d_tracer); } @@ -10554,47 +9572,6 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, return done; } -#ifdef CONFIG_TRACER_SNAPSHOT -__init static bool tr_needs_alloc_snapshot(const char *name) -{ - char *test; - int len = strlen(name); - bool ret; - - if 
(!boot_snapshot_index) - return false; - - if (strncmp(name, boot_snapshot_info, len) == 0 && - boot_snapshot_info[len] == '\t') - return true; - - test = kmalloc(strlen(name) + 3, GFP_KERNEL); - if (!test) - return false; - - sprintf(test, "\t%s\t", name); - ret = strstr(boot_snapshot_info, test) == NULL; - kfree(test); - return ret; -} - -__init static void do_allocate_snapshot(const char *name) -{ - if (!tr_needs_alloc_snapshot(name)) - return; - - /* - * When allocate_snapshot is set, the next call to - * allocate_trace_buffers() (called by trace_array_get_by_name()) - * will allocate the snapshot buffer. That will also clear - * this flag. - */ - allocate_snapshot = true; -} -#else -static inline void do_allocate_snapshot(const char *name) { } -#endif - __init static int backup_instance_area(const char *backup, unsigned long *addr, phys_addr_t *size) { @@ -10744,8 +9721,7 @@ __init static void enable_instances(void) } } else { /* Only non mapped buffers have snapshot buffers */ - if (IS_ENABLED(CONFIG_TRACER_SNAPSHOT)) - do_allocate_snapshot(name); + do_allocate_snapshot(name); } tr = trace_array_create_systems(name, NULL, addr, size); @@ -10771,17 +9747,41 @@ __init static void enable_instances(void) /* * Backup buffers can be freed but need vfree(). */ - if (backup) - tr->flags |= TRACE_ARRAY_FL_VMALLOC; + if (backup) { + tr->flags |= TRACE_ARRAY_FL_VMALLOC | TRACE_ARRAY_FL_RDONLY; + trace_array_start_autoremove(); + } if (start || backup) { tr->flags |= TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT; tr->range_name = no_free_ptr(rname); } + /* + * Save the events to start and enabled them after all boot instances + * have been created. 
+ */ + tr->boot_events = curr_str; + } + + /* Enable the events after all boot instances have been created */ + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + + if (!tr->boot_events || !(*tr->boot_events)) { + tr->boot_events = NULL; + continue; + } + + curr_str = tr->boot_events; + + /* Clear the instance if this is a persistent buffer */ + if (tr->flags & TRACE_ARRAY_FL_LAST_BOOT) + update_last_data(tr); + while ((tok = strsep(&curr_str, ","))) { early_enable_events(tr, tok, true); } + tr->boot_events = NULL; } } @@ -10937,24 +9937,6 @@ struct trace_array *trace_get_global_array(void) } #endif -void __init ftrace_boot_snapshot(void) -{ -#ifdef CONFIG_TRACER_SNAPSHOT - struct trace_array *tr; - - if (!snapshot_at_boot) - return; - - list_for_each_entry(tr, &ftrace_trace_arrays, list) { - if (!tr->allocated_snapshot) - continue; - - tracing_snapshot_instance(tr); - trace_array_puts(tr, "** Boot snapshot taken **\n"); - } -#endif -} - void __init early_trace_init(void) { if (tracepoint_printk) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b8f3804586a0..80fe152af1dd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -264,6 +264,7 @@ static inline bool still_need_pid_events(int type, struct trace_pid_list *pid_li typedef bool (*cond_update_fn_t)(struct trace_array *tr, void *cond_data); +#ifdef CONFIG_TRACER_SNAPSHOT /** * struct cond_snapshot - conditional snapshot data and callback * @@ -306,6 +307,7 @@ struct cond_snapshot { void *cond_data; cond_update_fn_t update; }; +#endif /* CONFIG_TRACER_SNAPSHOT */ /* * struct trace_func_repeats - used to keep track of the consecutive @@ -405,7 +407,10 @@ struct trace_array { unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; unsigned int flags; raw_spinlock_t start_lock; - const char *system_names; + union { + const char *system_names; + char *boot_events; + }; struct list_head err_log; struct dentry *dir; struct dentry *options; @@ -453,6 +458,12 @@ struct trace_array { * we do 
not waste memory on systems that are not using tracing. */ bool ring_buffer_expanded; + /* + * If the ring buffer is a read only backup instance, it will be + * removed after dumping all data via pipe, because no readable data. + */ + bool free_on_close; + struct work_struct autoremove_work; }; enum { @@ -462,6 +473,7 @@ enum { TRACE_ARRAY_FL_MOD_INIT = BIT(3), TRACE_ARRAY_FL_MEMMAP = BIT(4), TRACE_ARRAY_FL_VMALLOC = BIT(5), + TRACE_ARRAY_FL_RDONLY = BIT(6), }; #ifdef CONFIG_MODULES @@ -491,6 +503,12 @@ extern unsigned long trace_adjust_address(struct trace_array *tr, unsigned long extern struct trace_array *printk_trace; +static inline bool trace_array_is_readonly(struct trace_array *tr) +{ + /* backup instance is read only. */ + return tr->flags & TRACE_ARRAY_FL_RDONLY; +} + /* * The global tracer (top) should be the first trace array added, * but we check the flag anyway. @@ -675,6 +693,7 @@ void tracing_reset_all_online_cpus(void); void tracing_reset_all_online_cpus_unlocked(void); int tracing_open_generic(struct inode *inode, struct file *filp); int tracing_open_generic_tr(struct inode *inode, struct file *filp); +int tracing_release(struct inode *inode, struct file *file); int tracing_release_generic_tr(struct inode *inode, struct file *file); int tracing_open_file_tr(struct inode *inode, struct file *filp); int tracing_release_file_tr(struct inode *inode, struct file *filp); @@ -684,12 +703,54 @@ void tracer_tracing_on(struct trace_array *tr); void tracer_tracing_off(struct trace_array *tr); void tracer_tracing_disable(struct trace_array *tr); void tracer_tracing_enable(struct trace_array *tr); +int allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size); struct dentry *trace_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops); +struct dentry *trace_create_cpu_file(const char *name, + umode_t mode, + struct dentry *parent, + void *data, + long cpu, + const struct 
file_operations *fops); + +struct trace_iterator *__tracing_open(struct inode *inode, struct file *file, + bool snapshot); +int tracing_buffers_open(struct inode *inode, struct file *filp); +ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos); +int tracing_buffers_release(struct inode *inode, struct file *file); +ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, unsigned int flags); + +ssize_t tracing_nsecs_read(unsigned long *ptr, char __user *ubuf, + size_t cnt, loff_t *ppos); +ssize_t tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf, + size_t cnt, loff_t *ppos); + +void trace_set_buffer_entries(struct array_buffer *buf, unsigned long val); +/* + * Should be used after trace_array_get(), trace_types_lock + * ensures that i_cdev was already initialized. + */ +static inline int tracing_get_cpu(struct inode *inode) +{ + if (inode->i_cdev) /* See trace_create_cpu_file() */ + return (long)inode->i_cdev - 1; + return RING_BUFFER_ALL_CPUS; +} +void tracing_reset_cpu(struct array_buffer *buf, int cpu); + +struct ftrace_buffer_info { + struct trace_iterator iter; + void *spare; + unsigned int spare_cpu; + unsigned int spare_size; + unsigned int read; +}; /** * tracer_tracing_is_on_cpu - show real state of ring buffer enabled on for a cpu @@ -806,13 +867,13 @@ void update_max_tr_single(struct trace_array *tr, #if defined(CONFIG_TRACER_MAX_TRACE) && defined(CONFIG_FSNOTIFY) # define LATENCY_FS_NOTIFY #endif +#endif /* CONFIG_TRACER_SNAPSHOT */ #ifdef LATENCY_FS_NOTIFY void latency_fsnotify(struct trace_array *tr); #else static inline void latency_fsnotify(struct trace_array *tr) { } #endif -#endif /* CONFIG_TRACER_SNAPSHOT */ #ifdef CONFIG_STACKTRACE void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, int skip); @@ -828,11 +889,15 @@ static inline bool tracer_uses_snapshot(struct tracer *tracer) { return tracer->use_max_tr; } +void 
trace_create_maxlat_file(struct trace_array *tr, + struct dentry *d_tracer); #else static inline bool tracer_uses_snapshot(struct tracer *tracer) { return false; } +static inline void trace_create_maxlat_file(struct trace_array *tr, + struct dentry *d_tracer) { } #endif void trace_last_func_repeats(struct trace_array *tr, @@ -862,6 +927,8 @@ extern int DYN_FTRACE_TEST_NAME(void); #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 extern int DYN_FTRACE_TEST_NAME2(void); +void __init trace_append_boot_param(char *buf, const char *str, + char sep, int size); extern void trace_set_ring_buffer_expanded(struct trace_array *tr); extern bool tracing_selftest_disabled; @@ -1802,11 +1869,6 @@ extern struct trace_event_file *find_event_file(struct trace_array *tr, const char *system, const char *event); -static inline void *event_file_data(struct file *filp) -{ - return READ_ONCE(file_inode(filp)->i_private); -} - extern struct mutex event_mutex; extern struct list_head ftrace_events; @@ -1827,12 +1889,22 @@ static inline struct trace_event_file *event_file_file(struct file *filp) struct trace_event_file *file; lockdep_assert_held(&event_mutex); - file = READ_ONCE(file_inode(filp)->i_private); + file = file_inode(filp)->i_private; if (!file || file->flags & EVENT_FILE_FL_FREED) return NULL; return file; } +static inline void *event_file_data(struct file *filp) +{ + struct trace_event_file *file; + + lockdep_assert_held(&event_mutex); + file = file_inode(filp)->i_private; + WARN_ON(!file || file->flags & EVENT_FILE_FL_FREED); + return file; +} + extern const struct file_operations event_trigger_fops; extern const struct file_operations event_hist_fops; extern const struct file_operations event_hist_debug_fops; @@ -2135,12 +2207,6 @@ static inline bool event_command_needs_rec(struct event_command *cmd_ops) extern int trace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable); -extern int tracing_alloc_snapshot(void); -extern void 
tracing_snapshot_cond(struct trace_array *tr, void *cond_data); -extern int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update); - -extern int tracing_snapshot_cond_disable(struct trace_array *tr); -extern void *tracing_cond_snapshot_data(struct trace_array *tr); extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; @@ -2228,19 +2294,71 @@ static inline void trace_event_update_all(struct trace_eval_map **map, int len) #endif #ifdef CONFIG_TRACER_SNAPSHOT +extern const struct file_operations snapshot_fops; +extern const struct file_operations snapshot_raw_fops; + +/* Used when creating instances */ +int trace_allocate_snapshot(struct trace_array *tr, int size); + +int tracing_alloc_snapshot(void); +void tracing_snapshot_cond(struct trace_array *tr, void *cond_data); +int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update); +int tracing_snapshot_cond_disable(struct trace_array *tr); +void *tracing_cond_snapshot_data(struct trace_array *tr); void tracing_snapshot_instance(struct trace_array *tr); int tracing_alloc_snapshot_instance(struct trace_array *tr); +int tracing_arm_snapshot_locked(struct trace_array *tr); int tracing_arm_snapshot(struct trace_array *tr); void tracing_disarm_snapshot(struct trace_array *tr); -#else +void free_snapshot(struct trace_array *tr); +void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter); +int get_snapshot_map(struct trace_array *tr); +void put_snapshot_map(struct trace_array *tr); +int resize_buffer_duplicate_size(struct array_buffer *trace_buf, + struct array_buffer *size_buf, int cpu_id); +__init void do_allocate_snapshot(const char *name); +# ifdef CONFIG_DYNAMIC_FTRACE +__init int register_snapshot_cmd(void); +# else +static inline int register_snapshot_cmd(void) { return 0; } +# endif +#else /* !CONFIG_TRACER_SNAPSHOT */ +static inline int trace_allocate_snapshot(struct trace_array 
*tr, int size) { return 0; } static inline void tracing_snapshot_instance(struct trace_array *tr) { } static inline int tracing_alloc_snapshot_instance(struct trace_array *tr) { return 0; } +static inline int tracing_arm_snapshot_locked(struct trace_array *tr) { return -EBUSY; } static inline int tracing_arm_snapshot(struct trace_array *tr) { return 0; } static inline void tracing_disarm_snapshot(struct trace_array *tr) { } -#endif +static inline void free_snapshot(struct trace_array *tr) {} +static inline void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) +{ + WARN_ONCE(1, "Snapshot feature not enabled, but internal conditional snapshot used"); +} +static inline void *tracing_cond_snapshot_data(struct trace_array *tr) +{ + return NULL; +} +static inline int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update) +{ + return -ENODEV; +} +static inline int tracing_snapshot_cond_disable(struct trace_array *tr) +{ + return false; +} +static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) +{ + /* Should never be called */ + WARN_ONCE(1, "Snapshot print function called without snapshot configured"); +} +static inline int get_snapshot_map(struct trace_array *tr) { return 0; } +static inline void put_snapshot_map(struct trace_array *tr) { } +static inline void do_allocate_snapshot(const char *name) { } +static inline int register_snapshot_cmd(void) { return 0; } +#endif /* CONFIG_TRACER_SNAPSHOT */ #ifdef CONFIG_PREEMPT_TRACER void tracer_preempt_on(unsigned long a0, unsigned long a1); diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index dbe29b4c6a7a..2ca2541c8a58 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -61,7 +61,8 @@ trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node) v = memparse(p, NULL); if (v < PAGE_SIZE) pr_err("Buffer size is too small: %s\n", p); - if (tracing_resize_ring_buffer(tr, v, 
RING_BUFFER_ALL_CPUS) < 0) + if (trace_array_is_readonly(tr) || + tracing_resize_ring_buffer(tr, v, RING_BUFFER_ALL_CPUS) < 0) pr_err("Failed to resize trace buffer to %s\n", p); } @@ -597,7 +598,7 @@ trace_boot_enable_tracer(struct trace_array *tr, struct xbc_node *node) p = xbc_node_find_value(node, "tracer", NULL); if (p && *p != '\0') { - if (tracing_set_tracer(tr, p) < 0) + if (trace_array_is_readonly(tr) || tracing_set_tracer(tr, p) < 0) pr_err("Failed to set given tracer: %s\n", p); } diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 6809b370e991..d1564db95a8f 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -373,10 +373,10 @@ __init static int init_annotated_branch_stats(void) int ret; ret = register_stat_tracer(&annotated_branch_stats); - if (!ret) { + if (ret) { printk(KERN_WARNING "Warning: could not register " "annotated branches stats\n"); - return 1; + return ret; } return 0; } @@ -438,10 +438,10 @@ __init static int all_annotated_branch_stats(void) int ret; ret = register_stat_tracer(&all_branch_stats); - if (!ret) { + if (ret) { printk(KERN_WARNING "Warning: could not register " "all branches stats\n"); - return 1; + return ret; } return 0; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 249d1cba72c0..c46e623e7e0d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1401,6 +1401,9 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, { int ret; + if (trace_array_is_readonly(tr)) + return -EACCES; + mutex_lock(&event_mutex); ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set, mod); mutex_unlock(&event_mutex); @@ -1718,7 +1721,7 @@ static int t_show_filters(struct seq_file *m, void *v) len = get_call_len(call); - seq_printf(m, "%s:%s%*.s%s\n", call->class->system, + seq_printf(m, "%s:%s%*s%s\n", call->class->system, trace_event_name(call), len, "", filter->filter_string); return 0; @@ -1750,7 +1753,7 @@ 
static int t_show_triggers(struct seq_file *m, void *v) len = get_call_len(call); list_for_each_entry_rcu(data, &file->triggers, list) { - seq_printf(m, "%s:%s%*.s", call->class->system, + seq_printf(m, "%s:%s%*s", call->class->system, trace_event_name(call), len, ""); data->cmd_ops->print(m, data); @@ -2184,12 +2187,12 @@ static int trace_format_open(struct inode *inode, struct file *file) static ssize_t event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - int id = (long)event_file_data(filp); + /* id is directly in i_private and available for inode's lifetime. */ + int id = (long)file_inode(filp)->i_private; char buf[32]; int len; - if (unlikely(!id)) - return -ENODEV; + WARN_ON(!id); len = sprintf(buf, "%d\n", id); @@ -2247,12 +2250,8 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, mutex_lock(&event_mutex); file = event_file_file(filp); - if (file) { - if (file->flags & EVENT_FILE_FL_FREED) - err = -ENODEV; - else - err = apply_event_filter(file, buf); - } + if (file) + err = apply_event_filter(file, buf); mutex_unlock(&event_mutex); kfree(buf); @@ -2973,8 +2972,8 @@ event_subsystem_dir(struct trace_array *tr, const char *name, } else __get_system(system); - /* ftrace only has directories no files */ - if (strcmp(name, "ftrace") == 0) + /* ftrace only has directories no files, readonly instance too. 
*/ + if (strcmp(name, "ftrace") == 0 || trace_array_is_readonly(tr)) nr_entries = 0; else nr_entries = ARRAY_SIZE(system_entries); @@ -3139,28 +3138,30 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file) int ret; static struct eventfs_entry event_entries[] = { { - .name = "enable", + .name = "format", .callback = event_callback, - .release = event_release, }, +#ifdef CONFIG_PERF_EVENTS { - .name = "filter", + .name = "id", .callback = event_callback, }, +#endif +#define NR_RO_EVENT_ENTRIES (1 + IS_ENABLED(CONFIG_PERF_EVENTS)) +/* Readonly files must be above this line and counted by NR_RO_EVENT_ENTRIES. */ { - .name = "trigger", + .name = "enable", .callback = event_callback, + .release = event_release, }, { - .name = "format", + .name = "filter", .callback = event_callback, }, -#ifdef CONFIG_PERF_EVENTS { - .name = "id", + .name = "trigger", .callback = event_callback, }, -#endif #ifdef CONFIG_HIST_TRIGGERS { .name = "hist", @@ -3193,7 +3194,10 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file) if (!e_events) return -ENOMEM; - nr_entries = ARRAY_SIZE(event_entries); + if (trace_array_is_readonly(tr)) + nr_entries = NR_RO_EVENT_ENTRIES; + else + nr_entries = ARRAY_SIZE(event_entries); name = trace_event_name(call); ei = eventfs_create_dir(name, e_events, event_entries, nr_entries, file); @@ -3679,20 +3683,27 @@ static struct boot_triggers { } bootup_triggers[MAX_BOOT_TRIGGERS]; static char bootup_trigger_buf[COMMAND_LINE_SIZE]; +static int boot_trigger_buf_len; static int nr_boot_triggers; static __init int setup_trace_triggers(char *str) { char *trigger; char *buf; + int len = boot_trigger_buf_len; int i; - strscpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE); + if (len >= COMMAND_LINE_SIZE) + return 1; + + strscpy(bootup_trigger_buf + len, str, COMMAND_LINE_SIZE - len); trace_set_ring_buffer_expanded(NULL); disable_tracing_selftest("running event triggers"); - buf = bootup_trigger_buf; - for (i = 0; i < 
MAX_BOOT_TRIGGERS; i++) { + buf = bootup_trigger_buf + len; + boot_trigger_buf_len += strlen(buf) + 1; + + for (i = nr_boot_triggers; i < MAX_BOOT_TRIGGERS; i++) { trigger = strsep(&buf, ","); if (!trigger) break; @@ -4536,31 +4547,44 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) int nr_entries; static struct eventfs_entry events_entries[] = { { - .name = "enable", + .name = "header_page", .callback = events_callback, }, { - .name = "header_page", + .name = "header_event", .callback = events_callback, }, +#define NR_RO_TOP_ENTRIES 2 +/* Readonly files must be above this line and counted by NR_RO_TOP_ENTRIES. */ { - .name = "header_event", + .name = "enable", .callback = events_callback, }, }; - entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent, - tr, &ftrace_set_event_fops); - if (!entry) - return -ENOMEM; + if (!trace_array_is_readonly(tr)) { + entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent, + tr, &ftrace_set_event_fops); + if (!entry) + return -ENOMEM; + + /* There are not as crucial, just warn if they are not created */ + trace_create_file("show_event_filters", TRACE_MODE_READ, parent, tr, + &ftrace_show_event_filters_fops); - trace_create_file("show_event_filters", TRACE_MODE_READ, parent, tr, - &ftrace_show_event_filters_fops); + trace_create_file("show_event_triggers", TRACE_MODE_READ, parent, tr, + &ftrace_show_event_triggers_fops); - trace_create_file("show_event_triggers", TRACE_MODE_READ, parent, tr, - &ftrace_show_event_triggers_fops); + trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent, + tr, &ftrace_set_event_pid_fops); - nr_entries = ARRAY_SIZE(events_entries); + trace_create_file("set_event_notrace_pid", + TRACE_MODE_WRITE, parent, tr, + &ftrace_set_event_notrace_pid_fops); + nr_entries = ARRAY_SIZE(events_entries); + } else { + nr_entries = NR_RO_TOP_ENTRIES; + } e_events = eventfs_create_events_dir("events", parent, events_entries, nr_entries, tr); @@ -4569,15 +4593,6 @@ 
create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) return -ENOMEM; } - /* There are not as crucial, just warn if they are not created */ - - trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent, - tr, &ftrace_set_event_pid_fops); - - trace_create_file("set_event_notrace_pid", - TRACE_MODE_WRITE, parent, tr, - &ftrace_set_event_notrace_pid_fops); - tr->event_dir = e_events; return 0; diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 73ea180cad55..eb2c2bc8bc3d 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1361,13 +1361,16 @@ static const char *hist_field_name(struct hist_field *field, field->flags & HIST_FIELD_FL_VAR_REF) { if (field->system) { static char full_name[MAX_FILTER_STR_VAL]; + static char *fmt; + int len; - strcat(full_name, field->system); - strcat(full_name, "."); - strcat(full_name, field->event_name); - strcat(full_name, "."); - strcat(full_name, field->name); - field_name = full_name; + fmt = field->flags & HIST_FIELD_FL_VAR_REF ? 
"%s.%s.$%s" : "%s.%s.%s"; + + len = snprintf(full_name, sizeof(full_name), fmt, + field->system, field->event_name, + field->name); + if (len < sizeof(full_name)) + field_name = full_name; } else field_name = field->name; } else if (field->flags & HIST_FIELD_FL_TIMESTAMP) @@ -1740,9 +1743,10 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) static void expr_field_str(struct hist_field *field, char *expr) { - if (field->flags & HIST_FIELD_FL_VAR_REF) - strcat(expr, "$"); - else if (field->flags & HIST_FIELD_FL_CONST) { + if (field->flags & HIST_FIELD_FL_VAR_REF) { + if (!field->system) + strcat(expr, "$"); + } else if (field->flags & HIST_FIELD_FL_CONST) { char str[HIST_CONST_DIGITS_MAX]; snprintf(str, HIST_CONST_DIGITS_MAX, "%llu", field->constant); @@ -5836,8 +5840,6 @@ static int event_hist_open(struct inode *inode, struct file *file) hist_file->file = file; hist_file->last_act = get_hist_hit_count(event_file); - /* Clear private_data to avoid warning in single_open() */ - file->private_data = NULL; ret = single_open(file, hist_show, hist_file); if (ret) { kfree(hist_file); @@ -6126,8 +6128,6 @@ static int event_hist_debug_open(struct inode *inode, struct file *file) if (ret) return ret; - /* Clear private_data to avoid warning in single_open() */ - file->private_data = NULL; ret = single_open(file, hist_debug_show, file); if (ret) tracing_release_file_tr(inode, file); @@ -6158,7 +6158,8 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) else if (field_name) { if (hist_field->flags & HIST_FIELD_FL_VAR_REF || hist_field->flags & HIST_FIELD_FL_ALIAS) - seq_putc(m, '$'); + if (!hist_field->system) + seq_putc(m, '$'); seq_printf(m, "%s", field_name); } else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP) seq_puts(m, "common_timestamp"); diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 8bb95b2a6fcf..39ac4eba0702 100644 --- a/kernel/trace/trace_events_synth.c +++ 
b/kernel/trace/trace_events_synth.c @@ -395,7 +395,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, n_u64++; } else { struct trace_print_flags __flags[] = { - __def_gfpflag_names, {-1, NULL} }; + __def_gfpflag_names }; char *space = (i == se->n_fields - 1 ? "" : " "); print_synth_event_num_val(s, print_fmt, @@ -408,7 +408,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, trace_seq_puts(s, " ("); trace_print_flags_seq(s, "|", entry->fields[n_u64].as_u64, - __flags); + __flags, ARRAY_SIZE(__flags)); trace_seq_putc(s, ')'); } n_u64++; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d5230b759a2d..655db2e82513 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -22,6 +22,39 @@ static struct task_struct *trigger_kthread; static struct llist_head trigger_data_free_list; static DEFINE_MUTEX(trigger_data_kthread_mutex); +static int trigger_kthread_fn(void *ignore); + +static void trigger_create_kthread_locked(void) +{ + lockdep_assert_held(&trigger_data_kthread_mutex); + + if (!trigger_kthread) { + struct task_struct *kthread; + + kthread = kthread_create(trigger_kthread_fn, NULL, + "trigger_data_free"); + if (!IS_ERR(kthread)) + WRITE_ONCE(trigger_kthread, kthread); + } +} + +static void trigger_data_free_queued_locked(void) +{ + struct event_trigger_data *data, *tmp; + struct llist_node *llnodes; + + lockdep_assert_held(&trigger_data_kthread_mutex); + + llnodes = llist_del_all(&trigger_data_free_list); + if (!llnodes) + return; + + tracepoint_synchronize_unregister(); + + llist_for_each_entry_safe(data, tmp, llnodes, llist) + kfree(data); +} + /* Bulk garbage collection of event_trigger_data elements */ static int trigger_kthread_fn(void *ignore) { @@ -56,30 +89,50 @@ void trigger_data_free(struct event_trigger_data *data) if (data->cmd_ops->set_filter) data->cmd_ops->set_filter(NULL, data, NULL); + /* + * Boot-time trigger 
registration can fail before kthread creation + * works. Keep the deferred-free semantics during boot and let late + * init start the kthread to drain the list. + */ + if (system_state == SYSTEM_BOOTING && !trigger_kthread) { + llist_add(&data->llist, &trigger_data_free_list); + return; + } + if (unlikely(!trigger_kthread)) { guard(mutex)(&trigger_data_kthread_mutex); + + trigger_create_kthread_locked(); /* Check again after taking mutex */ if (!trigger_kthread) { - struct task_struct *kthread; - - kthread = kthread_create(trigger_kthread_fn, NULL, - "trigger_data_free"); - if (!IS_ERR(kthread)) - WRITE_ONCE(trigger_kthread, kthread); + llist_add(&data->llist, &trigger_data_free_list); + /* Drain the queued frees synchronously if creation failed. */ + trigger_data_free_queued_locked(); + return; } } - if (!trigger_kthread) { - /* Do it the slow way */ - tracepoint_synchronize_unregister(); - kfree(data); - return; - } - llist_add(&data->llist, &trigger_data_free_list); wake_up_process(trigger_kthread); } +static int __init trigger_data_free_init(void) +{ + guard(mutex)(&trigger_data_kthread_mutex); + + if (llist_empty(&trigger_data_free_list)) + return 0; + + trigger_create_kthread_locked(); + if (trigger_kthread) + wake_up_process(trigger_kthread); + else + trigger_data_free_queued_locked(); + + return 0; +} +late_initcall(trigger_data_free_init); + static inline void data_ops_trigger(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index a5dbb72528e0..a8420e6abb56 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -31,7 +31,8 @@ static char kprobe_boot_events_buf[COMMAND_LINE_SIZE] __initdata; static int __init set_kprobe_boot_events(char *str) { - strscpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE); + trace_append_boot_param(kprobe_boot_events_buf, str, ';', + COMMAND_LINE_SIZE); 
disable_tracing_selftest("running kprobe events"); return 1; @@ -765,6 +766,14 @@ static unsigned int number_of_same_symbols(const char *mod, const char *func_nam if (!mod) kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count); + /* + * If the symbol is found in vmlinux, use vmlinux resolution only. + * This prevents module symbols from shadowing vmlinux symbols + * and causing -EADDRNOTAVAIL for unqualified kprobe targets. + */ + if (!mod && ctx.count > 0) + return ctx.count; + module_kallsyms_on_each_symbol(mod, count_mod_symbols, &ctx); return ctx.count; diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index dee610e465b9..75678053b21c 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -58,6 +58,7 @@ enum osnoise_options_index { OSN_PANIC_ON_STOP, OSN_PREEMPT_DISABLE, OSN_IRQ_DISABLE, + OSN_TIMERLAT_ALIGN, OSN_MAX }; @@ -66,7 +67,8 @@ static const char * const osnoise_options_str[OSN_MAX] = { "OSNOISE_WORKLOAD", "PANIC_ON_STOP", "OSNOISE_PREEMPT_DISABLE", - "OSNOISE_IRQ_DISABLE" }; + "OSNOISE_IRQ_DISABLE", + "TIMERLAT_ALIGN" }; #define OSN_DEFAULT_OPTIONS 0x2 static unsigned long osnoise_options = OSN_DEFAULT_OPTIONS; @@ -251,6 +253,11 @@ struct timerlat_variables { static DEFINE_PER_CPU(struct timerlat_variables, per_cpu_timerlat_var); /* + * timerlat wake-up offset for next thread with TIMERLAT_ALIGN set. + */ +static atomic64_t align_next; + +/* * this_cpu_tmr_var - Return the per-cpu timerlat_variables on its relative CPU */ static inline struct timerlat_variables *this_cpu_tmr_var(void) @@ -268,6 +275,7 @@ static inline void tlat_var_reset(void) /* Synchronize with the timerlat interfaces */ mutex_lock(&interface_lock); + /* * So far, all the values are initialized as 0, so * zeroing the structure is perfect. 
@@ -278,6 +286,12 @@ static inline void tlat_var_reset(void) hrtimer_cancel(&tlat_var->timer); memset(tlat_var, 0, sizeof(*tlat_var)); } + /* + * Reset also align_next, to be filled by a new offset by the first timerlat + * thread that wakes up, if TIMERLAT_ALIGN is set. + */ + atomic64_set(&align_next, 0); + mutex_unlock(&interface_lock); } #else /* CONFIG_TIMERLAT_TRACER */ @@ -326,6 +340,7 @@ static struct osnoise_data { u64 stop_tracing_total; /* stop trace in the final operation (report/thread) */ #ifdef CONFIG_TIMERLAT_TRACER u64 timerlat_period; /* timerlat period */ + u64 timerlat_align_us; /* timerlat alignment */ u64 print_stack; /* print IRQ stack if total > */ int timerlat_tracer; /* timerlat tracer */ #endif @@ -338,6 +353,7 @@ static struct osnoise_data { #ifdef CONFIG_TIMERLAT_TRACER .print_stack = 0, .timerlat_period = DEFAULT_TIMERLAT_PERIOD, + .timerlat_align_us = 0, .timerlat_tracer = 0, #endif }; @@ -1830,6 +1846,26 @@ static int wait_next_period(struct timerlat_variables *tlat) tlat->abs_period = (u64) ktime_to_ns(next_abs_period); /* + * Align thread in the first cycle on each CPU to the set alignment + * if TIMERLAT_ALIGN is set. + * + * This is done by using an atomic64_t to store the next absolute period. + * The first thread that wakes up will set the atomic64_t to its + * absolute period, and the other threads will increment it by + * the alignment value. + */ + if (test_bit(OSN_TIMERLAT_ALIGN, &osnoise_options) && !tlat->count + && atomic64_cmpxchg_relaxed(&align_next, 0, tlat->abs_period)) { + /* + * A thread has already set align_next, use it and increment it + * to be used by the next thread that wakes up after this one. + */ + tlat->abs_period = atomic64_add_return_relaxed( + osnoise_data.timerlat_align_us * 1000, &align_next); + next_abs_period = ns_to_ktime(tlat->abs_period); + } + + /* * If the new abs_period is in the past, skip the activation. 
*/ while (ktime_compare(now, next_abs_period) > 0) { @@ -2073,8 +2109,8 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy) if (!osnoise_has_registered_instances()) return; - guard(mutex)(&interface_lock); guard(cpus_read_lock)(); + guard(mutex)(&interface_lock); if (!cpu_online(cpu)) return; @@ -2237,11 +2273,11 @@ static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf, if (running) stop_per_cpu_kthreads(); - mutex_lock(&interface_lock); /* * avoid CPU hotplug operations that might read options. */ cpus_read_lock(); + mutex_lock(&interface_lock); retval = cnt; @@ -2257,8 +2293,8 @@ static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf, clear_bit(option, &osnoise_options); } - cpus_read_unlock(); mutex_unlock(&interface_lock); + cpus_read_unlock(); if (running) start_per_cpu_kthreads(); @@ -2345,16 +2381,16 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, if (running) stop_per_cpu_kthreads(); - mutex_lock(&interface_lock); /* * osnoise_cpumask is read by CPU hotplug operations. */ cpus_read_lock(); + mutex_lock(&interface_lock); cpumask_copy(&osnoise_cpumask, osnoise_cpumask_new); - cpus_read_unlock(); mutex_unlock(&interface_lock); + cpus_read_unlock(); if (running) start_per_cpu_kthreads(); @@ -2650,6 +2686,17 @@ static struct trace_min_max_param timerlat_period = { .min = &timerlat_min_period, }; +/* + * osnoise/timerlat_align_us: align the first wakeup of all timerlat + * threads to a common boundary (in us). 0 means disabled. 
+ */ +static struct trace_min_max_param timerlat_align_us = { + .lock = &interface_lock, + .val = &osnoise_data.timerlat_align_us, + .max = NULL, + .min = NULL, +}; + static const struct file_operations timerlat_fd_fops = { .open = timerlat_fd_open, .read = timerlat_fd_read, @@ -2746,6 +2793,11 @@ static int init_timerlat_tracefs(struct dentry *top_dir) if (!tmp) return -ENOMEM; + tmp = tracefs_create_file("timerlat_align_us", TRACE_MODE_WRITE, top_dir, + &timerlat_align_us, &trace_min_max_fops); + if (!tmp) + return -ENOMEM; + retval = osnoise_create_cpu_timerlat_fd(top_dir); if (retval) return retval; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 1996d7aba038..a5ad76175d10 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -69,14 +69,15 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) const char * trace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long flags, - const struct trace_print_flags *flag_array) + const struct trace_print_flags *flag_array, + size_t flag_array_size) { unsigned long mask; const char *str; const char *ret = trace_seq_buffer_ptr(p); int i, first = 1; - for (i = 0; flag_array[i].name && flags; i++) { + for (i = 0; i < flag_array_size && flags; i++) { mask = flag_array[i].mask; if ((flags & mask) != mask) @@ -106,12 +107,13 @@ EXPORT_SYMBOL(trace_print_flags_seq); const char * trace_print_symbols_seq(struct trace_seq *p, unsigned long val, - const struct trace_print_flags *symbol_array) + const struct trace_print_flags *symbol_array, + size_t symbol_array_size) { int i; const char *ret = trace_seq_buffer_ptr(p); - for (i = 0; symbol_array[i].name; i++) { + for (i = 0; i < symbol_array_size; i++) { if (val != symbol_array[i].mask) continue; @@ -133,14 +135,15 @@ EXPORT_SYMBOL(trace_print_symbols_seq); const char * trace_print_flags_seq_u64(struct trace_seq *p, const char *delim, unsigned long long flags, - const struct 
trace_print_flags_u64 *flag_array) + const struct trace_print_flags_u64 *flag_array, + size_t flag_array_size) { unsigned long long mask; const char *str; const char *ret = trace_seq_buffer_ptr(p); int i, first = 1; - for (i = 0; flag_array[i].name && flags; i++) { + for (i = 0; i < flag_array_size && flags; i++) { mask = flag_array[i].mask; if ((flags & mask) != mask) @@ -170,12 +173,13 @@ EXPORT_SYMBOL(trace_print_flags_seq_u64); const char * trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, - const struct trace_print_flags_u64 *symbol_array) + const struct trace_print_flags_u64 *symbol_array, + size_t symbol_array_size) { int i; const char *ret = trace_seq_buffer_ptr(p); - for (i = 0; symbol_array[i].name; i++) { + for (i = 0; i < symbol_array_size; i++) { if (val != symbol_array[i].mask) continue; @@ -719,12 +723,13 @@ void print_function_args(struct trace_seq *s, unsigned long *args, { const struct btf_param *param; const struct btf_type *t; + const struct btf_enum *enums; const char *param_name; char name[KSYM_NAME_LEN]; unsigned long arg; struct btf *btf; s32 tid, nr = 0; - int a, p, x; + int a, p, x, i; u16 encode; trace_seq_printf(s, "("); @@ -778,6 +783,15 @@ void print_function_args(struct trace_seq *s, unsigned long *args, break; case BTF_KIND_ENUM: trace_seq_printf(s, "%ld", arg); + enums = btf_enum(t); + for (i = 0; i < btf_vlen(t); i++) { + if (arg == enums[i].val) { + trace_seq_printf(s, " [%s]", + btf_name_by_offset(btf, + enums[i].name_off)); + break; + } + } break; default: /* This does not handle complex arguments */ diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 5ea5e0d76f00..3ea17af60169 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -197,6 +197,7 @@ struct notifier_block module_trace_bprintk_format_nb = { .notifier_call = module_trace_bprintk_format_notify, }; +__printf(2, 3) int __trace_bprintk(unsigned long ip, const char *fmt, ...) 
{ int ret; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index e0a5dc86c07e..44c22d4e7881 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -962,8 +962,6 @@ static int parse_probe_vars(char *orig_arg, const struct fetch_type *t, code->op = FETCH_OP_COMM; return 0; } - /* backward compatibility */ - ctx->offset = 0; goto inval; } @@ -1068,7 +1066,7 @@ static int __parse_imm_string(char *str, char **pbuf, int offs) { size_t len = strlen(str); - if (str[len - 1] != '"') { + if (!len || str[len - 1] != '"') { trace_probe_log_err(offs + len, IMMSTR_NO_CLOSE); return -EINVAL; } @@ -1523,6 +1521,12 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, parg->offset = *size; *size += parg->type->size * (parg->count ?: 1); + if (*size > MAX_PROBE_EVENT_SIZE) { + ret = -E2BIG; + trace_probe_log_err(ctx->offset, EVENT_TOO_BIG); + goto fail; + } + if (parg->count) { len = strlen(parg->type->fmttype) + 6; parg->fmt = kmalloc(len, GFP_KERNEL); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 9fc56c937130..262d8707a3df 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -38,6 +38,7 @@ #define MAX_BTF_ARGS_LEN 128 #define MAX_DENTRY_ARGS_LEN 256 #define MAX_STRING_SIZE PATH_MAX +#define MAX_PROBE_EVENT_SIZE 3072 /* Reserved field names */ #define FIELD_STRING_IP "__probe_ip" @@ -561,7 +562,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(BAD_TYPE4STR, "This type does not fit for string."),\ C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"),\ C(TOO_MANY_ARGS, "Too many arguments are specified"), \ - C(TOO_MANY_EARGS, "Too many entry arguments specified"), + C(TOO_MANY_EARGS, "Too many entry arguments specified"), \ + C(EVENT_TOO_BIG, "Event too big (too many fields?)"), #undef C #define C(a, b) TP_ERR_##a diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c new file mode 100644 index 
000000000000..d6c3f94d67cd --- /dev/null +++ b/kernel/trace/trace_remote.c @@ -0,0 +1,1384 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 - Google LLC + * Author: Vincent Donnefort <vdonnefort@google.com> + */ + +#include <linux/kstrtox.h> +#include <linux/lockdep.h> +#include <linux/mutex.h> +#include <linux/tracefs.h> +#include <linux/trace_remote.h> +#include <linux/trace_seq.h> +#include <linux/types.h> + +#include "trace.h" + +#define TRACEFS_DIR "remotes" +#define TRACEFS_MODE_WRITE 0640 +#define TRACEFS_MODE_READ 0440 + +enum tri_type { + TRI_CONSUMING, + TRI_NONCONSUMING, +}; + +struct trace_remote_iterator { + struct trace_remote *remote; + struct trace_seq seq; + struct delayed_work poll_work; + unsigned long lost_events; + u64 ts; + struct ring_buffer_iter *rb_iter; + struct ring_buffer_iter **rb_iters; + struct remote_event_hdr *evt; + int cpu; + int evt_cpu; + loff_t pos; + enum tri_type type; +}; + +struct trace_remote { + struct trace_remote_callbacks *cbs; + void *priv; + struct trace_buffer *trace_buffer; + struct trace_buffer_desc *trace_buffer_desc; + struct dentry *dentry; + struct eventfs_inode *eventfs; + struct remote_event *events; + unsigned long nr_events; + unsigned long trace_buffer_size; + struct ring_buffer_remote rb_remote; + struct mutex lock; + struct rw_semaphore reader_lock; + struct rw_semaphore *pcpu_reader_locks; + unsigned int nr_readers; + unsigned int poll_ms; + bool tracing_on; +}; + +static bool trace_remote_loaded(struct trace_remote *remote) +{ + return !!remote->trace_buffer; +} + +static int trace_remote_load(struct trace_remote *remote) +{ + struct ring_buffer_remote *rb_remote = &remote->rb_remote; + struct trace_buffer_desc *desc; + + lockdep_assert_held(&remote->lock); + + if (trace_remote_loaded(remote)) + return 0; + + desc = remote->cbs->load_trace_buffer(remote->trace_buffer_size, remote->priv); + if (IS_ERR(desc)) + return PTR_ERR(desc); + + rb_remote->desc = desc; + 
rb_remote->swap_reader_page = remote->cbs->swap_reader_page; + rb_remote->priv = remote->priv; + rb_remote->reset = remote->cbs->reset; + remote->trace_buffer = ring_buffer_alloc_remote(rb_remote); + if (!remote->trace_buffer) { + remote->cbs->unload_trace_buffer(desc, remote->priv); + return -ENOMEM; + } + + remote->trace_buffer_desc = desc; + + return 0; +} + +static void trace_remote_try_unload(struct trace_remote *remote) +{ + lockdep_assert_held(&remote->lock); + + if (!trace_remote_loaded(remote)) + return; + + /* The buffer is being read or writable */ + if (remote->nr_readers || remote->tracing_on) + return; + + /* The buffer has readable data */ + if (!ring_buffer_empty(remote->trace_buffer)) + return; + + ring_buffer_free(remote->trace_buffer); + remote->trace_buffer = NULL; + remote->cbs->unload_trace_buffer(remote->trace_buffer_desc, remote->priv); +} + +static int trace_remote_enable_tracing(struct trace_remote *remote) +{ + int ret; + + lockdep_assert_held(&remote->lock); + + if (remote->tracing_on) + return 0; + + ret = trace_remote_load(remote); + if (ret) + return ret; + + ret = remote->cbs->enable_tracing(true, remote->priv); + if (ret) { + trace_remote_try_unload(remote); + return ret; + } + + remote->tracing_on = true; + + return 0; +} + +static int trace_remote_disable_tracing(struct trace_remote *remote) +{ + int ret; + + lockdep_assert_held(&remote->lock); + + if (!remote->tracing_on) + return 0; + + ret = remote->cbs->enable_tracing(false, remote->priv); + if (ret) + return ret; + + ring_buffer_poll_remote(remote->trace_buffer, RING_BUFFER_ALL_CPUS); + remote->tracing_on = false; + trace_remote_try_unload(remote); + + return 0; +} + +static void trace_remote_reset(struct trace_remote *remote, int cpu) +{ + lockdep_assert_held(&remote->lock); + + if (!trace_remote_loaded(remote)) + return; + + if (cpu == RING_BUFFER_ALL_CPUS) + ring_buffer_reset(remote->trace_buffer); + else + ring_buffer_reset_cpu(remote->trace_buffer, cpu); + + 
trace_remote_try_unload(remote); +} + +static ssize_t +tracing_on_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct seq_file *seq = filp->private_data; + struct trace_remote *remote = seq->private; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + guard(mutex)(&remote->lock); + + ret = val ? trace_remote_enable_tracing(remote) : trace_remote_disable_tracing(remote); + if (ret) + return ret; + + return cnt; +} +static int tracing_on_show(struct seq_file *s, void *unused) +{ + struct trace_remote *remote = s->private; + + seq_printf(s, "%d\n", remote->tracing_on); + + return 0; +} +DEFINE_SHOW_STORE_ATTRIBUTE(tracing_on); + +static ssize_t buffer_size_kb_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct seq_file *seq = filp->private_data; + struct trace_remote *remote = seq->private; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + /* KiB to Bytes */ + if (!val || check_shl_overflow(val, 10, &val)) + return -EINVAL; + + guard(mutex)(&remote->lock); + + if (trace_remote_loaded(remote)) + return -EBUSY; + + remote->trace_buffer_size = val; + + return cnt; +} + +static int buffer_size_kb_show(struct seq_file *s, void *unused) +{ + struct trace_remote *remote = s->private; + + seq_printf(s, "%lu (%s)\n", remote->trace_buffer_size >> 10, + trace_remote_loaded(remote) ? 
"loaded" : "unloaded"); + + return 0; +} +DEFINE_SHOW_STORE_ATTRIBUTE(buffer_size_kb); + +static int trace_remote_get(struct trace_remote *remote, int cpu) +{ + int ret; + + if (remote->nr_readers == UINT_MAX) + return -EBUSY; + + ret = trace_remote_load(remote); + if (ret) + return ret; + + if (cpu != RING_BUFFER_ALL_CPUS && !remote->pcpu_reader_locks) { + int lock_cpu; + + remote->pcpu_reader_locks = kcalloc(nr_cpu_ids, sizeof(*remote->pcpu_reader_locks), + GFP_KERNEL); + if (!remote->pcpu_reader_locks) { + trace_remote_try_unload(remote); + return -ENOMEM; + } + + for_each_possible_cpu(lock_cpu) + init_rwsem(&remote->pcpu_reader_locks[lock_cpu]); + } + + remote->nr_readers++; + + return 0; +} + +static void trace_remote_put(struct trace_remote *remote) +{ + if (WARN_ON(!remote->nr_readers)) + return; + + remote->nr_readers--; + if (remote->nr_readers) + return; + + kfree(remote->pcpu_reader_locks); + remote->pcpu_reader_locks = NULL; + + trace_remote_try_unload(remote); +} + +static bool trace_remote_has_cpu(struct trace_remote *remote, int cpu) +{ + if (cpu == RING_BUFFER_ALL_CPUS) + return true; + + return ring_buffer_poll_remote(remote->trace_buffer, cpu) == 0; +} + +static void __poll_remote(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct trace_remote_iterator *iter; + + iter = container_of(dwork, struct trace_remote_iterator, poll_work); + ring_buffer_poll_remote(iter->remote->trace_buffer, iter->cpu); + schedule_delayed_work((struct delayed_work *)work, + msecs_to_jiffies(iter->remote->poll_ms)); +} + +static void __free_ring_buffer_iter(struct trace_remote_iterator *iter, int cpu) +{ + if (cpu != RING_BUFFER_ALL_CPUS) { + ring_buffer_read_finish(iter->rb_iter); + return; + } + + for_each_possible_cpu(cpu) { + if (iter->rb_iters[cpu]) + ring_buffer_read_finish(iter->rb_iters[cpu]); + } + + kfree(iter->rb_iters); +} + +static int __alloc_ring_buffer_iter(struct trace_remote_iterator *iter, int cpu) +{ + if (cpu != 
RING_BUFFER_ALL_CPUS) { + iter->rb_iter = ring_buffer_read_start(iter->remote->trace_buffer, cpu, GFP_KERNEL); + + return iter->rb_iter ? 0 : -ENOMEM; + } + + iter->rb_iters = kcalloc(nr_cpu_ids, sizeof(*iter->rb_iters), GFP_KERNEL); + if (!iter->rb_iters) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + iter->rb_iters[cpu] = ring_buffer_read_start(iter->remote->trace_buffer, cpu, + GFP_KERNEL); + if (!iter->rb_iters[cpu]) { + /* This CPU isn't part of trace_buffer. Skip it */ + if (!trace_remote_has_cpu(iter->remote, cpu)) + continue; + + __free_ring_buffer_iter(iter, RING_BUFFER_ALL_CPUS); + return -ENOMEM; + } + } + + return 0; +} + +static struct trace_remote_iterator +*trace_remote_iter(struct trace_remote *remote, int cpu, enum tri_type type) +{ + struct trace_remote_iterator *iter = NULL; + int ret; + + lockdep_assert_held(&remote->lock); + + if (type == TRI_NONCONSUMING && !trace_remote_loaded(remote)) + return NULL; + + ret = trace_remote_get(remote, cpu); + if (ret) + return ERR_PTR(ret); + + if (!trace_remote_has_cpu(remote, cpu)) { + ret = -ENODEV; + goto err; + } + + iter = kzalloc_obj(*iter); + if (iter) { + iter->remote = remote; + iter->cpu = cpu; + iter->type = type; + trace_seq_init(&iter->seq); + + switch (type) { + case TRI_CONSUMING: + ring_buffer_poll_remote(remote->trace_buffer, cpu); + INIT_DELAYED_WORK(&iter->poll_work, __poll_remote); + schedule_delayed_work(&iter->poll_work, msecs_to_jiffies(remote->poll_ms)); + break; + case TRI_NONCONSUMING: + ret = __alloc_ring_buffer_iter(iter, cpu); + break; + } + + if (ret) + goto err; + + return iter; + } + ret = -ENOMEM; + +err: + kfree(iter); + trace_remote_put(remote); + + return ERR_PTR(ret); +} + +static void trace_remote_iter_free(struct trace_remote_iterator *iter) +{ + struct trace_remote *remote; + + if (!iter) + return; + + remote = iter->remote; + + lockdep_assert_held(&remote->lock); + + switch (iter->type) { + case TRI_CONSUMING: + cancel_delayed_work_sync(&iter->poll_work); + 
break; + case TRI_NONCONSUMING: + __free_ring_buffer_iter(iter, iter->cpu); + break; + } + + kfree(iter); + trace_remote_put(remote); +} + +static void trace_remote_iter_read_start(struct trace_remote_iterator *iter) +{ + struct trace_remote *remote = iter->remote; + int cpu = iter->cpu; + + /* Acquire global reader lock */ + if (cpu == RING_BUFFER_ALL_CPUS && iter->type == TRI_CONSUMING) + down_write(&remote->reader_lock); + else + down_read(&remote->reader_lock); + + if (cpu == RING_BUFFER_ALL_CPUS) + return; + + /* + * No need for the remote lock here, iter holds a reference on + * remote->nr_readers + */ + + /* Get the per-CPU one */ + if (WARN_ON_ONCE(!remote->pcpu_reader_locks)) + return; + + if (iter->type == TRI_CONSUMING) + down_write(&remote->pcpu_reader_locks[cpu]); + else + down_read(&remote->pcpu_reader_locks[cpu]); +} + +static void trace_remote_iter_read_finished(struct trace_remote_iterator *iter) +{ + struct trace_remote *remote = iter->remote; + int cpu = iter->cpu; + + /* Release per-CPU reader lock */ + if (cpu != RING_BUFFER_ALL_CPUS) { + /* + * No need for the remote lock here, iter holds a reference on + * remote->nr_readers + */ + if (iter->type == TRI_CONSUMING) + up_write(&remote->pcpu_reader_locks[cpu]); + else + up_read(&remote->pcpu_reader_locks[cpu]); + } + + /* Release global reader lock */ + if (cpu == RING_BUFFER_ALL_CPUS && iter->type == TRI_CONSUMING) + up_write(&remote->reader_lock); + else + up_read(&remote->reader_lock); +} + +static struct ring_buffer_iter *__get_rb_iter(struct trace_remote_iterator *iter, int cpu) +{ + return iter->cpu != RING_BUFFER_ALL_CPUS ? 
iter->rb_iter : iter->rb_iters[cpu]; +} + +static struct ring_buffer_event * +__peek_event(struct trace_remote_iterator *iter, int cpu, u64 *ts, unsigned long *lost_events) +{ + struct ring_buffer_event *rb_evt; + struct ring_buffer_iter *rb_iter; + + switch (iter->type) { + case TRI_CONSUMING: + return ring_buffer_peek(iter->remote->trace_buffer, cpu, ts, lost_events); + case TRI_NONCONSUMING: + rb_iter = __get_rb_iter(iter, cpu); + if (!rb_iter) + return NULL; + + rb_evt = ring_buffer_iter_peek(rb_iter, ts); + if (!rb_evt) + return NULL; + + *lost_events = ring_buffer_iter_dropped(rb_iter); + + return rb_evt; + } + + return NULL; +} + +static bool trace_remote_iter_read_event(struct trace_remote_iterator *iter) +{ + struct trace_buffer *trace_buffer = iter->remote->trace_buffer; + struct ring_buffer_event *rb_evt; + int cpu = iter->cpu; + + if (cpu != RING_BUFFER_ALL_CPUS) { + if (ring_buffer_empty_cpu(trace_buffer, cpu)) + return false; + + rb_evt = __peek_event(iter, cpu, &iter->ts, &iter->lost_events); + if (!rb_evt) + return false; + + iter->evt_cpu = cpu; + iter->evt = ring_buffer_event_data(rb_evt); + return true; + } + + iter->ts = U64_MAX; + for_each_possible_cpu(cpu) { + unsigned long lost_events; + u64 ts; + + if (ring_buffer_empty_cpu(trace_buffer, cpu)) + continue; + + rb_evt = __peek_event(iter, cpu, &ts, &lost_events); + if (!rb_evt) + continue; + + if (ts >= iter->ts) + continue; + + iter->ts = ts; + iter->evt_cpu = cpu; + iter->evt = ring_buffer_event_data(rb_evt); + iter->lost_events = lost_events; + } + + return iter->ts != U64_MAX; +} + +static void trace_remote_iter_move(struct trace_remote_iterator *iter) +{ + struct trace_buffer *trace_buffer = iter->remote->trace_buffer; + + switch (iter->type) { + case TRI_CONSUMING: + ring_buffer_consume(trace_buffer, iter->evt_cpu, NULL, NULL); + break; + case TRI_NONCONSUMING: + ring_buffer_iter_advance(__get_rb_iter(iter, iter->evt_cpu)); + break; + } +} + +static struct remote_event 
*trace_remote_find_event(struct trace_remote *remote, unsigned short id);
+
+/*
+ * Format the event currently held by @iter into iter->seq: an optional
+ * lost-events line, a "[cpu] secs.usecs: " prefix, then the event itself via
+ * its print callback (or an "UNKNOWN id=" line when no remote_event matches).
+ *
+ * Return: -EOVERFLOW if the trace_seq overflowed, 0 otherwise.
+ */
+static int trace_remote_iter_print_event(struct trace_remote_iterator *iter)
+{
+	struct remote_event *evt;
+	unsigned long usecs_rem;
+	u64 ts = iter->ts;
+
+	if (iter->lost_events)
+		trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
+				 iter->evt_cpu, iter->lost_events);
+
+	/* Split the timestamp into whole seconds and a microsecond remainder */
+	do_div(ts, 1000);
+	usecs_rem = do_div(ts, USEC_PER_SEC);
+
+	trace_seq_printf(&iter->seq, "[%03d]\t%5llu.%06lu: ", iter->evt_cpu,
+			 ts, usecs_rem);
+
+	evt = trace_remote_find_event(iter->remote, iter->evt->id);
+	if (!evt)
+		trace_seq_printf(&iter->seq, "UNKNOWN id=%d\n", iter->evt->id);
+	else
+		evt->print(iter->evt, &iter->seq);
+
+	return trace_seq_has_overflowed(&iter->seq) ? -EOVERFLOW : 0;
+}
+
+/*
+ * Open handler for trace_pipe: create a consuming iterator for the CPU encoded
+ * in @inode and stash it in filp->private_data.
+ */
+static int trace_pipe_open(struct inode *inode, struct file *filp)
+{
+	struct trace_remote *remote = inode->i_private;
+	struct trace_remote_iterator *iter;
+	int cpu = tracing_get_cpu(inode);
+
+	guard(mutex)(&remote->lock);
+
+	iter = trace_remote_iter(remote, cpu, TRI_CONSUMING);
+	if (IS_ERR(iter))
+		return PTR_ERR(iter);
+
+	filp->private_data = iter;
+
+	/* The error case already returned above */
+	return 0;
+}
+
+/* Release handler for trace_pipe: free the iterator under the remote lock. */
+static int trace_pipe_release(struct inode *inode, struct file *filp)
+{
+	struct trace_remote_iterator *iter = filp->private_data;
+	struct trace_remote *remote = iter->remote;
+
+	guard(mutex)(&remote->lock);
+
+	trace_remote_iter_free(iter);
+
+	return 0;
+}
+
+/*
+ * Read handler for trace_pipe. Copies whatever is pending in iter->seq to
+ * userspace; when trace_seq_to_user() reports -EBUSY (presumably: nothing left
+ * to copy), waits on the ring-buffer, prints and consumes as many events as
+ * fit into the seq, and tries the copy again.
+ */
+static ssize_t trace_pipe_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	struct trace_remote_iterator *iter = filp->private_data;
+	struct trace_buffer *trace_buffer = iter->remote->trace_buffer;
+	int ret;
+
+copy_to_user:
+	ret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+	if (ret != -EBUSY)
+		return ret;
+
+	trace_seq_init(&iter->seq);
+
+	ret = ring_buffer_wait(trace_buffer, iter->cpu, 0, NULL, NULL);
+	if (ret < 0)
+		return ret;
+
+	trace_remote_iter_read_start(iter);
+
+	while (trace_remote_iter_read_event(iter)) {
+		int prev_len = iter->seq.seq.len;
+
+		/* Event does not fit: drop the partial output, keep the event */
+		if (trace_remote_iter_print_event(iter)) {
+			iter->seq.seq.len = prev_len;
+			break;
+		}
+
+		trace_remote_iter_move(iter);
+	}
+
+	trace_remote_iter_read_finished(iter);
+
+	goto copy_to_user;
+}
+
+static const struct file_operations trace_pipe_fops = {
+	.open		= trace_pipe_open,
+	.read		= trace_pipe_read,
+	.release	= trace_pipe_release,
+};
+
+/*
+ * seq_file ->next: bump *pos, load the next event into the iterator and step
+ * past it. Returns the iterator, or NULL when there is no iterator or no more
+ * events.
+ */
+static void *trace_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct trace_remote_iterator *iter = m->private;
+
+	++*pos;
+
+	if (!iter || !trace_remote_iter_read_event(iter))
+		return NULL;
+
+	trace_remote_iter_move(iter);
+	iter->pos++;
+
+	return iter;
+}
+
+/*
+ * seq_file ->start: take the reader locks and advance the iterator until its
+ * position reaches *pos.
+ */
+static void *trace_start(struct seq_file *m, loff_t *pos)
+{
+	struct trace_remote_iterator *iter = m->private;
+	/*
+	 * trace_next() increments whatever position it is handed, so never
+	 * give it an uninitialized one, even when the result is discarded.
+	 */
+	loff_t i = 0;
+
+	if (!iter)
+		return NULL;
+
+	trace_remote_iter_read_start(iter);
+
+	if (!*pos) {
+		iter->pos = -1;
+		return trace_next(m, NULL, &i);
+	}
+
+	i = iter->pos;
+	while (i < *pos) {
+		iter = trace_next(m, NULL, &i);
+		if (!iter)
+			return NULL;
+	}
+
+	return iter;
+}
+
+static int trace_show(struct seq_file *m, void *v)
+{
+	struct trace_remote_iterator *iter = v;
+
+ trace_seq_init(&iter->seq); + + if (trace_remote_iter_print_event(iter)) { + seq_printf(m, "[EVENT %d PRINT TOO BIG]\n", iter->evt->id); + return 0; + } + + return trace_print_seq(m, &iter->seq); +} + +static void trace_stop(struct seq_file *m, void *v) +{ + struct trace_remote_iterator *iter = m->private; + + if (iter) + trace_remote_iter_read_finished(iter); +} + +static const struct seq_operations trace_sops = { + .start = trace_start, + .next = trace_next, + .show = trace_show, + .stop = trace_stop, +}; + +static int trace_open(struct inode *inode, struct file *filp) +{ + struct trace_remote *remote = inode->i_private; + struct trace_remote_iterator *iter = NULL; + int cpu = tracing_get_cpu(inode); + int ret; + + if (!(filp->f_mode & FMODE_READ)) + return 0; + + guard(mutex)(&remote->lock); + + iter = trace_remote_iter(remote, cpu, TRI_NONCONSUMING); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + ret = seq_open(filp, &trace_sops); + if (ret) { + trace_remote_iter_free(iter); + return ret; + } + + ((struct seq_file *)filp->private_data)->private = (void *)iter; + + return 0; +} + +static int trace_release(struct inode *inode, struct file *filp) +{ + struct trace_remote_iterator *iter; + + if (!(filp->f_mode & FMODE_READ)) + return 0; + + iter = ((struct seq_file *)filp->private_data)->private; + seq_release(inode, filp); + + if (!iter) + return 0; + + guard(mutex)(&iter->remote->lock); + + trace_remote_iter_free(iter); + + return 0; +} + +static ssize_t trace_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct inode *inode = file_inode(filp); + struct trace_remote *remote = inode->i_private; + int cpu = tracing_get_cpu(inode); + + guard(mutex)(&remote->lock); + + trace_remote_reset(remote, cpu); + + return cnt; +} + +static const struct file_operations trace_fops = { + .open = trace_open, + .write = trace_write, + .read = seq_read, + .read_iter = seq_read_iter, + .release = trace_release, +}; + +static int 
trace_remote_init_tracefs(const char *name, struct trace_remote *remote)
+{
+	struct dentry *remote_d, *percpu_d, *d;
+	static struct dentry *root;
+	static DEFINE_MUTEX(lock);
+	bool root_inited = false;
+	int cpu;
+
+	/*
+	 * Build the tracefs hierarchy for @remote under TRACEFS_DIR/<name>/:
+	 * tracing_on, buffer_size_kb, trace_pipe, trace and per_cpu/cpuN/{trace_pipe,trace}.
+	 * On success remote->dentry points to the <name> directory.
+	 *
+	 * The function-local mutex serializes creation of the shared root
+	 * directory between callers.
+	 */
+	guard(mutex)(&lock);
+
+	if (!root) {
+		root = tracefs_create_dir(TRACEFS_DIR, NULL);
+		if (!root) {
+			pr_err("Failed to create tracefs dir "TRACEFS_DIR"\n");
+			return -ENOMEM;
+		}
+		root_inited = true;
+	}
+
+	remote_d = tracefs_create_dir(name, root);
+	if (!remote_d) {
+		pr_err("Failed to create tracefs dir "TRACEFS_DIR"/%s/\n", name);
+		goto err;
+	}
+
+	d = trace_create_file("tracing_on", TRACEFS_MODE_WRITE, remote_d, remote, &tracing_on_fops);
+	if (!d)
+		goto err;
+
+	d = trace_create_file("buffer_size_kb", TRACEFS_MODE_WRITE, remote_d, remote,
+			      &buffer_size_kb_fops);
+	if (!d)
+		goto err;
+
+	d = trace_create_file("trace_pipe", TRACEFS_MODE_READ, remote_d, remote, &trace_pipe_fops);
+	if (!d)
+		goto err;
+
+	d = trace_create_file("trace", TRACEFS_MODE_WRITE, remote_d, remote, &trace_fops);
+	if (!d)
+		goto err;
+
+	percpu_d = tracefs_create_dir("per_cpu", remote_d);
+	if (!percpu_d) {
+		pr_err("Failed to create tracefs dir "TRACEFS_DIR"/%s/per_cpu/\n", name);
+		goto err;
+	}
+
+	for_each_possible_cpu(cpu) {
+		struct dentry *cpu_d;
+		char cpu_name[16];
+
+		snprintf(cpu_name, sizeof(cpu_name), "cpu%d", cpu);
+		cpu_d = tracefs_create_dir(cpu_name, percpu_d);
+		if (!cpu_d) {
+			pr_err("Failed to create tracefs dir "TRACEFS_DIR"/%s/per_cpu/cpu%d/\n",
+			       name, cpu);
+			goto err;
+		}
+
+		d = trace_create_cpu_file("trace_pipe", TRACEFS_MODE_READ, cpu_d, remote, cpu,
+					  &trace_pipe_fops);
+		if (!d)
+			goto err;
+
+		d = trace_create_cpu_file("trace", TRACEFS_MODE_WRITE, cpu_d, remote, cpu,
+					  &trace_fops);
+		if (!d)
+			goto err;
+	}
+
+	remote->dentry = remote_d;
+
+	return 0;
+
+err:
+	/*
+	 * If this call created the root, tear the whole tree down and forget
+	 * it so a later call recreates it; otherwise only drop this remote's
+	 * directory.
+	 */
+	if (root_inited) {
+		tracefs_remove(root);
+		root = NULL;
+	} else {
+		tracefs_remove(remote_d);
+	}
+
+	return -ENOMEM;
+}
+
+static int trace_remote_register_events(const char
*remote_name, struct trace_remote *remote, + struct remote_event *events, size_t nr_events); + +/** + * trace_remote_register() - Register a Tracefs remote + * @name: Name of the remote, used for the Tracefs remotes/ directory. + * @cbs: Set of callbacks used to control the remote. + * @priv: Private data, passed to each callback from @cbs. + * @events: Array of events. &remote_event.name and &remote_event.id must be + * filled by the caller. + * @nr_events: Number of events in the @events array. + * + * A trace remote is an entity, outside of the kernel (most likely firmware or + * hypervisor) capable of writing events into a Tracefs compatible ring-buffer. + * The kernel would then act as a reader. + * + * The registered remote will be found under the Tracefs directory + * remotes/<name>. + * + * Return: 0 on success, negative error code on failure. + */ +int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv, + struct remote_event *events, size_t nr_events) +{ + struct trace_remote *remote; + int ret; + + remote = kzalloc_obj(*remote); + if (!remote) + return -ENOMEM; + + remote->cbs = cbs; + remote->priv = priv; + remote->trace_buffer_size = 7 << 10; + remote->poll_ms = 100; + mutex_init(&remote->lock); + init_rwsem(&remote->reader_lock); + + if (trace_remote_init_tracefs(name, remote)) { + kfree(remote); + return -ENOMEM; + } + + ret = trace_remote_register_events(name, remote, events, nr_events); + if (ret) { + pr_err("Failed to register events for trace remote '%s' (%d)\n", + name, ret); + return ret; + } + + ret = cbs->init ? 
cbs->init(remote->dentry, priv) : 0; + if (ret) + pr_err("Init failed for trace remote '%s' (%d)\n", name, ret); + + return ret; +} +EXPORT_SYMBOL_GPL(trace_remote_register); + +/** + * trace_remote_free_buffer() - Free trace buffer allocated with trace_remote_alloc_buffer() + * @desc: Descriptor of the per-CPU ring-buffers, originally filled by + * trace_remote_alloc_buffer() + * + * Most likely called from &trace_remote_callbacks.unload_trace_buffer. + */ +void trace_remote_free_buffer(struct trace_buffer_desc *desc) +{ + struct ring_buffer_desc *rb_desc; + int cpu; + + for_each_ring_buffer_desc(rb_desc, cpu, desc) { + unsigned int id; + + free_page(rb_desc->meta_va); + + for (id = 0; id < rb_desc->nr_page_va; id++) + free_page(rb_desc->page_va[id]); + } +} +EXPORT_SYMBOL_GPL(trace_remote_free_buffer); + +/** + * trace_remote_alloc_buffer() - Dynamically allocate a trace buffer + * @desc: Uninitialized trace_buffer_desc + * @desc_size: Size of the trace_buffer_desc. Must be at least equal to + * trace_buffer_desc_size() + * @buffer_size: Size in bytes of each per-CPU ring-buffer + * @cpumask: CPUs to allocate a ring-buffer for + * + * Helper to dynamically allocate a set of pages (enough to cover @buffer_size) + * for each CPU from @cpumask and fill @desc. Most likely called from + * &trace_remote_callbacks.load_trace_buffer. + * + * Return: 0 on success, negative error code on failure. 
+ */ +int trace_remote_alloc_buffer(struct trace_buffer_desc *desc, size_t desc_size, size_t buffer_size, + const struct cpumask *cpumask) +{ + unsigned int nr_pages = max(DIV_ROUND_UP(buffer_size, PAGE_SIZE), 2UL) + 1; + void *desc_end = desc + desc_size; + struct ring_buffer_desc *rb_desc; + int cpu, ret = -ENOMEM; + + if (desc_size < struct_size(desc, __data, 0)) + return -EINVAL; + + desc->nr_cpus = 0; + desc->struct_len = struct_size(desc, __data, 0); + + rb_desc = (struct ring_buffer_desc *)&desc->__data[0]; + + for_each_cpu(cpu, cpumask) { + unsigned int id; + + if ((void *)rb_desc + struct_size(rb_desc, page_va, nr_pages) > desc_end) { + ret = -EINVAL; + goto err; + } + + rb_desc->cpu = cpu; + rb_desc->nr_page_va = 0; + rb_desc->meta_va = (unsigned long)__get_free_page(GFP_KERNEL); + if (!rb_desc->meta_va) + goto err; + + for (id = 0; id < nr_pages; id++) { + rb_desc->page_va[id] = (unsigned long)__get_free_page(GFP_KERNEL); + if (!rb_desc->page_va[id]) + goto err; + + rb_desc->nr_page_va++; + } + desc->nr_cpus++; + desc->struct_len += offsetof(struct ring_buffer_desc, page_va); + desc->struct_len += struct_size(rb_desc, page_va, rb_desc->nr_page_va); + rb_desc = __next_ring_buffer_desc(rb_desc); + } + + return 0; + +err: + trace_remote_free_buffer(desc); + return ret; +} +EXPORT_SYMBOL_GPL(trace_remote_alloc_buffer); + +static int +trace_remote_enable_event(struct trace_remote *remote, struct remote_event *evt, bool enable) +{ + int ret; + + lockdep_assert_held(&remote->lock); + + if (evt->enabled == enable) + return 0; + + ret = remote->cbs->enable_event(evt->id, enable, remote->priv); + if (ret) + return ret; + + evt->enabled = enable; + + return 0; +} + +static int remote_event_enable_show(struct seq_file *s, void *unused) +{ + struct remote_event *evt = s->private; + + seq_printf(s, "%d\n", evt->enabled); + + return 0; +} + +static ssize_t remote_event_enable_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct 
seq_file *seq = filp->private_data; + struct remote_event *evt = seq->private; + struct trace_remote *remote = evt->remote; + u8 enable; + int ret; + + ret = kstrtou8_from_user(ubuf, count, 10, &enable); + if (ret) + return ret; + + guard(mutex)(&remote->lock); + + ret = trace_remote_enable_event(remote, evt, enable); + if (ret) + return ret; + + return count; +} +DEFINE_SHOW_STORE_ATTRIBUTE(remote_event_enable); + +static int remote_event_id_show(struct seq_file *s, void *unused) +{ + struct remote_event *evt = s->private; + + seq_printf(s, "%d\n", evt->id); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(remote_event_id); + +static int remote_event_format_show(struct seq_file *s, void *unused) +{ + size_t offset = sizeof(struct remote_event_hdr); + struct remote_event *evt = s->private; + struct trace_event_fields *field; + + seq_printf(s, "name: %s\n", evt->name); + seq_printf(s, "ID: %d\n", evt->id); + seq_puts(s, + "format:\n\tfield:unsigned short common_type;\toffset:0;\tsize:2;\tsigned:0;\n\n"); + + field = &evt->fields[0]; + while (field->name) { + seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%u;\tsigned:%d;\n", + field->type, field->name, offset, field->size, + field->is_signed); + offset += field->size; + field++; + } + + if (field != &evt->fields[0]) + seq_puts(s, "\n"); + + seq_printf(s, "print fmt: %s\n", evt->print_fmt); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(remote_event_format); + +static int remote_event_callback(const char *name, umode_t *mode, void **data, + const struct file_operations **fops) +{ + if (!strcmp(name, "enable")) { + *mode = TRACEFS_MODE_WRITE; + *fops = &remote_event_enable_fops; + return 1; + } + + if (!strcmp(name, "id")) { + *mode = TRACEFS_MODE_READ; + *fops = &remote_event_id_fops; + return 1; + } + + if (!strcmp(name, "format")) { + *mode = TRACEFS_MODE_READ; + *fops = &remote_event_format_fops; + return 1; + } + + return 0; +} + +static ssize_t remote_events_dir_enable_write(struct file *filp, const char __user *ubuf, + size_t 
count, loff_t *ppos) +{ + struct trace_remote *remote = file_inode(filp)->i_private; + int i, ret; + u8 enable; + + ret = kstrtou8_from_user(ubuf, count, 10, &enable); + if (ret) + return ret; + + guard(mutex)(&remote->lock); + + for (i = 0; i < remote->nr_events; i++) { + struct remote_event *evt = &remote->events[i]; + + trace_remote_enable_event(remote, evt, enable); + } + + return count; +} + +static ssize_t remote_events_dir_enable_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct trace_remote *remote = file_inode(filp)->i_private; + const char enabled_char[] = {'0', '1', 'X'}; + char enabled_str[] = " \n"; + int i, enabled = -1; + + guard(mutex)(&remote->lock); + + for (i = 0; i < remote->nr_events; i++) { + struct remote_event *evt = &remote->events[i]; + + if (enabled == -1) { + enabled = evt->enabled; + } else if (enabled != evt->enabled) { + enabled = 2; + break; + } + } + + enabled_str[0] = enabled_char[enabled == -1 ? 0 : enabled]; + + return simple_read_from_buffer(ubuf, cnt, ppos, enabled_str, 2); +} + +static const struct file_operations remote_events_dir_enable_fops = { + .write = remote_events_dir_enable_write, + .read = remote_events_dir_enable_read, +}; + +static ssize_t +remote_events_dir_header_page_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_seq *s; + int ret; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + ring_buffer_print_page_header(NULL, s); + ret = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s)); + kfree(s); + + return ret; +} + +static const struct file_operations remote_events_dir_header_page_fops = { + .read = remote_events_dir_header_page_read, +}; + +static ssize_t +remote_events_dir_header_event_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_seq *s; + int ret; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + 
trace_seq_init(s); + + ring_buffer_print_entry_header(s); + ret = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s)); + kfree(s); + + return ret; +} + +static const struct file_operations remote_events_dir_header_event_fops = { + .read = remote_events_dir_header_event_read, +}; + +static int remote_events_dir_callback(const char *name, umode_t *mode, void **data, + const struct file_operations **fops) +{ + if (!strcmp(name, "enable")) { + *mode = TRACEFS_MODE_WRITE; + *fops = &remote_events_dir_enable_fops; + return 1; + } + + if (!strcmp(name, "header_page")) { + *mode = TRACEFS_MODE_READ; + *fops = &remote_events_dir_header_page_fops; + return 1; + } + + if (!strcmp(name, "header_event")) { + *mode = TRACEFS_MODE_READ; + *fops = &remote_events_dir_header_event_fops; + return 1; + } + + return 0; +} + +static int trace_remote_init_eventfs(const char *remote_name, struct trace_remote *remote, + struct remote_event *evt) +{ + struct eventfs_inode *eventfs = remote->eventfs; + static struct eventfs_entry dir_entries[] = { + { + .name = "enable", + .callback = remote_events_dir_callback, + }, { + .name = "header_page", + .callback = remote_events_dir_callback, + }, { + .name = "header_event", + .callback = remote_events_dir_callback, + } + }; + static struct eventfs_entry entries[] = { + { + .name = "enable", + .callback = remote_event_callback, + }, { + .name = "id", + .callback = remote_event_callback, + }, { + .name = "format", + .callback = remote_event_callback, + } + }; + bool eventfs_create = false; + + if (!eventfs) { + eventfs = eventfs_create_events_dir("events", remote->dentry, dir_entries, + ARRAY_SIZE(dir_entries), remote); + if (IS_ERR(eventfs)) + return PTR_ERR(eventfs); + + /* + * Create similar hierarchy as local events even if a single system is supported at + * the moment + */ + eventfs = eventfs_create_dir(remote_name, eventfs, NULL, 0, NULL); + if (IS_ERR(eventfs)) + return PTR_ERR(eventfs); + + remote->eventfs = eventfs; + 
eventfs_create = true; + } + + eventfs = eventfs_create_dir(evt->name, eventfs, entries, ARRAY_SIZE(entries), evt); + if (IS_ERR(eventfs)) { + if (eventfs_create) { + eventfs_remove_events_dir(remote->eventfs); + remote->eventfs = NULL; + } + return PTR_ERR(eventfs); + } + + return 0; +} + +static int trace_remote_attach_events(struct trace_remote *remote, struct remote_event *events, + size_t nr_events) +{ + int i; + + for (i = 0; i < nr_events; i++) { + struct remote_event *evt = &events[i]; + + if (evt->remote) + return -EEXIST; + + evt->remote = remote; + + /* We need events to be sorted for efficient lookup */ + if (i && evt->id <= events[i - 1].id) + return -EINVAL; + } + + remote->events = events; + remote->nr_events = nr_events; + + return 0; +} + +static int trace_remote_register_events(const char *remote_name, struct trace_remote *remote, + struct remote_event *events, size_t nr_events) +{ + int i, ret; + + ret = trace_remote_attach_events(remote, events, nr_events); + if (ret) + return ret; + + for (i = 0; i < nr_events; i++) { + struct remote_event *evt = &events[i]; + + ret = trace_remote_init_eventfs(remote_name, remote, evt); + if (ret) + pr_warn("Failed to init eventfs for event '%s' (%d)", + evt->name, ret); + } + + return 0; +} + +static int __cmp_events(const void *key, const void *data) +{ + const struct remote_event *evt = data; + int id = (int)((long)key); + + return id - (int)evt->id; +} + +static struct remote_event *trace_remote_find_event(struct trace_remote *remote, unsigned short id) +{ + return bsearch((const void *)(unsigned long)id, remote->events, remote->nr_events, + sizeof(*remote->events), __cmp_events); +} diff --git a/kernel/trace/trace_snapshot.c b/kernel/trace/trace_snapshot.c new file mode 100644 index 000000000000..07b43c9863a2 --- /dev/null +++ b/kernel/trace/trace_snapshot.c @@ -0,0 +1,1066 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/fsnotify.h> + +#include <asm/setup.h> /* COMMAND_LINE_SIZE */ + +#include 
"trace.h" + +/* Used if snapshot allocated at boot */ +static bool allocate_snapshot; +static bool snapshot_at_boot; + +static char boot_snapshot_info[COMMAND_LINE_SIZE] __initdata; +static int boot_snapshot_index; + +static int __init boot_alloc_snapshot(char *str) +{ + char *slot = boot_snapshot_info + boot_snapshot_index; + int left = sizeof(boot_snapshot_info) - boot_snapshot_index; + int ret; + + if (str[0] == '=') { + str++; + if (strlen(str) >= left) + return -1; + + ret = snprintf(slot, left, "%s\t", str); + boot_snapshot_index += ret; + } else { + allocate_snapshot = true; + /* We also need the main ring buffer expanded */ + trace_set_ring_buffer_expanded(NULL); + } + return 1; +} +__setup("alloc_snapshot", boot_alloc_snapshot); + + +static int __init boot_snapshot(char *str) +{ + snapshot_at_boot = true; + boot_alloc_snapshot(str); + return 1; +} +__setup("ftrace_boot_snapshot", boot_snapshot); +static void tracing_snapshot_instance_cond(struct trace_array *tr, + void *cond_data) +{ + unsigned long flags; + + if (in_nmi()) { + trace_array_puts(tr, "*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n"); + trace_array_puts(tr, "*** snapshot is being ignored ***\n"); + return; + } + + if (!tr->allocated_snapshot) { + trace_array_puts(tr, "*** SNAPSHOT NOT ALLOCATED ***\n"); + trace_array_puts(tr, "*** stopping trace here! 
***\n"); + tracer_tracing_off(tr); + return; + } + + if (tr->mapped) { + trace_array_puts(tr, "*** BUFFER MEMORY MAPPED ***\n"); + trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n"); + return; + } + + /* Note, snapshot can not be used when the tracer uses it */ + if (tracer_uses_snapshot(tr->current_trace)) { + trace_array_puts(tr, "*** LATENCY TRACER ACTIVE ***\n"); + trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n"); + return; + } + + local_irq_save(flags); + update_max_tr(tr, current, smp_processor_id(), cond_data); + local_irq_restore(flags); +} + +void tracing_snapshot_instance(struct trace_array *tr) +{ + tracing_snapshot_instance_cond(tr, NULL); +} + +/** + * tracing_snapshot_cond - conditionally take a snapshot of the current buffer. + * @tr: The tracing instance to snapshot + * @cond_data: The data to be tested conditionally, and possibly saved + * + * This is the same as tracing_snapshot() except that the snapshot is + * conditional - the snapshot will only happen if the + * cond_snapshot.update() implementation receiving the cond_data + * returns true, which means that the trace array's cond_snapshot + * update() operation used the cond_data to determine whether the + * snapshot should be taken, and if it was, presumably saved it along + * with the snapshot. + */ +void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) +{ + tracing_snapshot_instance_cond(tr, cond_data); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond); + +/** + * tracing_cond_snapshot_data - get the user data associated with a snapshot + * @tr: The tracing instance + * + * When the user enables a conditional snapshot using + * tracing_snapshot_cond_enable(), the user-defined cond_data is saved + * with the snapshot. This accessor is used to retrieve it. + * + * Should not be called from cond_snapshot.update(), since it takes + * the tr->max_lock lock, which the code calling + * cond_snapshot.update() has already done. 
+ * + * Returns the cond_data associated with the trace array's snapshot. + */ +void *tracing_cond_snapshot_data(struct trace_array *tr) +{ + void *cond_data = NULL; + + local_irq_disable(); + arch_spin_lock(&tr->max_lock); + + if (tr->cond_snapshot) + cond_data = tr->cond_snapshot->cond_data; + + arch_spin_unlock(&tr->max_lock); + local_irq_enable(); + + return cond_data; +} +EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data); + +/* resize @tr's buffer to the size of @size_tr's entries */ +int resize_buffer_duplicate_size(struct array_buffer *trace_buf, + struct array_buffer *size_buf, int cpu_id) +{ + int cpu, ret = 0; + + if (cpu_id == RING_BUFFER_ALL_CPUS) { + for_each_tracing_cpu(cpu) { + ret = ring_buffer_resize(trace_buf->buffer, + per_cpu_ptr(size_buf->data, cpu)->entries, cpu); + if (ret < 0) + break; + per_cpu_ptr(trace_buf->data, cpu)->entries = + per_cpu_ptr(size_buf->data, cpu)->entries; + } + } else { + ret = ring_buffer_resize(trace_buf->buffer, + per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id); + if (ret == 0) + per_cpu_ptr(trace_buf->data, cpu_id)->entries = + per_cpu_ptr(size_buf->data, cpu_id)->entries; + } + + return ret; +} + +int tracing_alloc_snapshot_instance(struct trace_array *tr) +{ + int order; + int ret; + + if (!tr->allocated_snapshot) { + + /* Make the snapshot buffer have the same order as main buffer */ + order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); + ret = ring_buffer_subbuf_order_set(tr->snapshot_buffer.buffer, order); + if (ret < 0) + return ret; + + /* allocate spare buffer */ + ret = resize_buffer_duplicate_size(&tr->snapshot_buffer, + &tr->array_buffer, RING_BUFFER_ALL_CPUS); + if (ret < 0) + return ret; + + tr->allocated_snapshot = true; + } + + return 0; +} + +void free_snapshot(struct trace_array *tr) +{ + /* + * We don't free the ring buffer. instead, resize it because + * The max_tr ring buffer has some state (e.g. ring->clock) and + * we want preserve it. 
+ */ + ring_buffer_subbuf_order_set(tr->snapshot_buffer.buffer, 0); + ring_buffer_resize(tr->snapshot_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); + trace_set_buffer_entries(&tr->snapshot_buffer, 1); + tracing_reset_online_cpus(&tr->snapshot_buffer); + tr->allocated_snapshot = false; +} + +int tracing_arm_snapshot_locked(struct trace_array *tr) +{ + int ret; + + lockdep_assert_held(&trace_types_lock); + + spin_lock(&tr->snapshot_trigger_lock); + if (tr->snapshot == UINT_MAX || tr->mapped) { + spin_unlock(&tr->snapshot_trigger_lock); + return -EBUSY; + } + + tr->snapshot++; + spin_unlock(&tr->snapshot_trigger_lock); + + ret = tracing_alloc_snapshot_instance(tr); + if (ret) { + spin_lock(&tr->snapshot_trigger_lock); + tr->snapshot--; + spin_unlock(&tr->snapshot_trigger_lock); + } + + return ret; +} + +int tracing_arm_snapshot(struct trace_array *tr) +{ + guard(mutex)(&trace_types_lock); + return tracing_arm_snapshot_locked(tr); +} + +void tracing_disarm_snapshot(struct trace_array *tr) +{ + spin_lock(&tr->snapshot_trigger_lock); + if (!WARN_ON(!tr->snapshot)) + tr->snapshot--; + spin_unlock(&tr->snapshot_trigger_lock); +} + +/** + * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer. + * + * This is similar to tracing_snapshot(), but it will allocate the + * snapshot buffer if it isn't already allocated. Use this only + * where it is safe to sleep, as the allocation may sleep. + * + * This causes a swap between the snapshot buffer and the current live + * tracing buffer. You can use this to take snapshots of the live + * trace when some condition is triggered, but continue to trace. 
+ */ +void tracing_snapshot_alloc(void) +{ + int ret; + + ret = tracing_alloc_snapshot(); + if (ret < 0) + return; + + tracing_snapshot(); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); + +/** + * tracing_snapshot_cond_enable - enable conditional snapshot for an instance + * @tr: The tracing instance + * @cond_data: User data to associate with the snapshot + * @update: Implementation of the cond_snapshot update function + * + * Check whether the conditional snapshot for the given instance has + * already been enabled, or if the current tracer is already using a + * snapshot; if so, return -EBUSY, else create a cond_snapshot and + * save the cond_data and update function inside. + * + * Returns 0 if successful, error otherwise. + */ +int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, + cond_update_fn_t update) +{ + struct cond_snapshot *cond_snapshot __free(kfree) = + kzalloc_obj(*cond_snapshot); + int ret; + + if (!cond_snapshot) + return -ENOMEM; + + cond_snapshot->cond_data = cond_data; + cond_snapshot->update = update; + + guard(mutex)(&trace_types_lock); + + if (tracer_uses_snapshot(tr->current_trace)) + return -EBUSY; + + /* + * The cond_snapshot can only change to NULL without the + * trace_types_lock. We don't care if we race with it going + * to NULL, but we want to make sure that it's not set to + * something other than NULL when we get here, which we can + * do safely with only holding the trace_types_lock and not + * having to take the max_lock. 
+ */ + if (tr->cond_snapshot) + return -EBUSY; + + ret = tracing_arm_snapshot_locked(tr); + if (ret) + return ret; + + local_irq_disable(); + arch_spin_lock(&tr->max_lock); + tr->cond_snapshot = no_free_ptr(cond_snapshot); + arch_spin_unlock(&tr->max_lock); + local_irq_enable(); + + return 0; +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); + +/** + * tracing_snapshot_cond_disable - disable conditional snapshot for an instance + * @tr: The tracing instance + * + * Check whether the conditional snapshot for the given instance is + * enabled; if so, free the cond_snapshot associated with it, + * otherwise return -EINVAL. + * + * Returns 0 if successful, error otherwise. + */ +int tracing_snapshot_cond_disable(struct trace_array *tr) +{ + int ret = 0; + + local_irq_disable(); + arch_spin_lock(&tr->max_lock); + + if (!tr->cond_snapshot) + ret = -EINVAL; + else { + kfree(tr->cond_snapshot); + tr->cond_snapshot = NULL; + } + + arch_spin_unlock(&tr->max_lock); + local_irq_enable(); + + tracing_disarm_snapshot(tr); + + return ret; +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); + +#ifdef CONFIG_TRACER_MAX_TRACE +#ifdef LATENCY_FS_NOTIFY +static struct workqueue_struct *fsnotify_wq; + +static void latency_fsnotify_workfn(struct work_struct *work) +{ + struct trace_array *tr = container_of(work, struct trace_array, + fsnotify_work); + fsnotify_inode(tr->d_max_latency->d_inode, FS_MODIFY); +} + +static void latency_fsnotify_workfn_irq(struct irq_work *iwork) +{ + struct trace_array *tr = container_of(iwork, struct trace_array, + fsnotify_irqwork); + queue_work(fsnotify_wq, &tr->fsnotify_work); +} + +__init static int latency_fsnotify_init(void) +{ + fsnotify_wq = alloc_workqueue("tr_max_lat_wq", + WQ_UNBOUND | WQ_HIGHPRI, 0); + if (!fsnotify_wq) { + pr_err("Unable to allocate tr_max_lat_wq\n"); + return -ENOMEM; + } + return 0; +} + +late_initcall_sync(latency_fsnotify_init); + +void latency_fsnotify(struct trace_array *tr) +{ + if (!fsnotify_wq) + return; + /* + * 
We cannot call queue_work(&tr->fsnotify_work) from here because it's + * possible that we are called from __schedule() or do_idle(), which + * could cause a deadlock. + */ + irq_work_queue(&tr->fsnotify_irqwork); +} +#endif /* LATENCY_FS_NOTIFY */ + +static const struct file_operations tracing_max_lat_fops; + +void trace_create_maxlat_file(struct trace_array *tr, + struct dentry *d_tracer) +{ +#ifdef LATENCY_FS_NOTIFY + INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn); + init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq); +#endif + tr->d_max_latency = trace_create_file("tracing_max_latency", + TRACE_MODE_WRITE, + d_tracer, tr, + &tracing_max_lat_fops); +} + +/* + * Copy the new maximum trace into the separate maximum-trace + * structure. (this way the maximum trace is permanently saved, + * for later retrieval via /sys/kernel/tracing/tracing_max_latency) + */ +static void +__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + struct array_buffer *trace_buf = &tr->array_buffer; + struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu); + struct array_buffer *max_buf = &tr->snapshot_buffer; + struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu); + + max_buf->cpu = cpu; + max_buf->time_start = data->preempt_timestamp; + + max_data->saved_latency = tr->max_latency; + max_data->critical_start = data->critical_start; + max_data->critical_end = data->critical_end; + + strscpy(max_data->comm, tsk->comm); + max_data->pid = tsk->pid; + /* + * If tsk == current, then use current_uid(), as that does not use + * RCU. The irq tracer can be called out of RCU scope. 
+ */ + if (tsk == current) + max_data->uid = current_uid(); + else + max_data->uid = task_uid(tsk); + + max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; + max_data->policy = tsk->policy; + max_data->rt_priority = tsk->rt_priority; + + /* record this tasks comm */ + tracing_record_cmdline(tsk); + latency_fsnotify(tr); +} +#else +static inline void __update_max_tr(struct trace_array *tr, + struct task_struct *tsk, int cpu) { } +#endif /* CONFIG_TRACER_MAX_TRACE */ + +/** + * update_max_tr - snapshot all trace buffers from global_trace to max_tr + * @tr: tracer + * @tsk: the task with the latency + * @cpu: The cpu that initiated the trace. + * @cond_data: User data associated with a conditional snapshot + * + * Flip the buffers between the @tr and the max_tr and record information + * about which task was the cause of this latency. + */ +void +update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, + void *cond_data) +{ + if (tr->stop_count) + return; + + WARN_ON_ONCE(!irqs_disabled()); + + if (!tr->allocated_snapshot) { + /* Only the nop tracer should hit this when disabling */ + WARN_ON_ONCE(tr->current_trace != &nop_trace); + return; + } + + arch_spin_lock(&tr->max_lock); + + /* Inherit the recordable setting from array_buffer */ + if (ring_buffer_record_is_set_on(tr->array_buffer.buffer)) + ring_buffer_record_on(tr->snapshot_buffer.buffer); + else + ring_buffer_record_off(tr->snapshot_buffer.buffer); + + if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) { + arch_spin_unlock(&tr->max_lock); + return; + } + + swap(tr->array_buffer.buffer, tr->snapshot_buffer.buffer); + + __update_max_tr(tr, tsk, cpu); + + arch_spin_unlock(&tr->max_lock); + + /* Any waiters on the old snapshot buffer need to wake up */ + ring_buffer_wake_waiters(tr->array_buffer.buffer, RING_BUFFER_ALL_CPUS); +} + +/** + * update_max_tr_single - only copy one trace over, and reset the rest + * @tr: tracer + * @tsk: task with the latency + * @cpu: the cpu of the 
buffer to copy. + * + * Flip the trace of a single CPU buffer between the @tr and the max_tr. + */ +void +update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + int ret; + + if (tr->stop_count) + return; + + WARN_ON_ONCE(!irqs_disabled()); + if (!tr->allocated_snapshot) { + /* Only the nop tracer should hit this when disabling */ + WARN_ON_ONCE(tr->current_trace != &nop_trace); + return; + } + + arch_spin_lock(&tr->max_lock); + + ret = ring_buffer_swap_cpu(tr->snapshot_buffer.buffer, tr->array_buffer.buffer, cpu); + + if (ret == -EBUSY) { + /* + * We failed to swap the buffer due to a commit taking + * place on this CPU. We fail to record, but we reset + * the max trace buffer (no one writes directly to it) + * and flag that it failed. + * Another reason is resize is in progress. + */ + trace_array_printk_buf(tr->snapshot_buffer.buffer, _THIS_IP_, + "Failed to swap buffers due to commit or resize in progress\n"); + } + + WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); + + __update_max_tr(tr, tsk, cpu); + arch_spin_unlock(&tr->max_lock); +} + +static void show_snapshot_main_help(struct seq_file *m) +{ + seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n" + "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" + "# Takes a snapshot of the main buffer.\n" + "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n" + "# (Doesn't have to be '2' works with any number that\n" + "# is not a '0' or '1')\n"); +} + +static void show_snapshot_percpu_help(struct seq_file *m) +{ + seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); +#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP + seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" + "# Takes a snapshot of the main buffer for this cpu.\n"); +#else + seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n" + "# Must use main snapshot file to allocate.\n"); +#endif + 
seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n" + "# (Doesn't have to be '2' works with any number that\n" + "# is not a '0' or '1')\n"); +} + +void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) +{ + if (iter->tr->allocated_snapshot) + seq_puts(m, "#\n# * Snapshot is allocated *\n#\n"); + else + seq_puts(m, "#\n# * Snapshot is freed *\n#\n"); + + seq_puts(m, "# Snapshot commands:\n"); + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) + show_snapshot_main_help(m); + else + show_snapshot_percpu_help(m); +} + +static int tracing_snapshot_open(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + struct trace_iterator *iter; + struct seq_file *m; + int ret; + + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; + + if (file->f_mode & FMODE_READ) { + iter = __tracing_open(inode, file, true); + if (IS_ERR(iter)) + ret = PTR_ERR(iter); + } else { + /* Writes still need the seq_file to hold the private data */ + ret = -ENOMEM; + m = kzalloc_obj(*m); + if (!m) + goto out; + iter = kzalloc_obj(*iter); + if (!iter) { + kfree(m); + goto out; + } + ret = 0; + + iter->tr = tr; + iter->array_buffer = &tr->snapshot_buffer; + iter->cpu_file = tracing_get_cpu(inode); + m->private = iter; + file->private_data = m; + } +out: + if (ret < 0) + trace_array_put(tr); + + return ret; +} + +static void tracing_swap_cpu_buffer(void *tr) +{ + update_max_tr_single((struct trace_array *)tr, current, smp_processor_id()); +} + +static ssize_t +tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct seq_file *m = filp->private_data; + struct trace_iterator *iter = m->private; + struct trace_array *tr = iter->tr; + unsigned long val; + int ret; + + ret = tracing_update_buffers(tr); + if (ret < 0) + return ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + guard(mutex)(&trace_types_lock); + + if 
(tracer_uses_snapshot(tr->current_trace)) + return -EBUSY; + + local_irq_disable(); + arch_spin_lock(&tr->max_lock); + if (tr->cond_snapshot) + ret = -EBUSY; + arch_spin_unlock(&tr->max_lock); + local_irq_enable(); + if (ret) + return ret; + + switch (val) { + case 0: + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) + return -EINVAL; + if (tr->allocated_snapshot) + free_snapshot(tr); + break; + case 1: +/* Only allow per-cpu swap if the ring buffer supports it */ +#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) + return -EINVAL; +#endif + if (tr->allocated_snapshot) + ret = resize_buffer_duplicate_size(&tr->snapshot_buffer, + &tr->array_buffer, iter->cpu_file); + + ret = tracing_arm_snapshot_locked(tr); + if (ret) + return ret; + + /* Now, we're going to swap */ + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { + local_irq_disable(); + update_max_tr(tr, current, smp_processor_id(), NULL); + local_irq_enable(); + } else { + smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer, + (void *)tr, 1); + } + tracing_disarm_snapshot(tr); + break; + default: + if (tr->allocated_snapshot) { + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) + tracing_reset_online_cpus(&tr->snapshot_buffer); + else + tracing_reset_cpu(&tr->snapshot_buffer, iter->cpu_file); + } + break; + } + + if (ret >= 0) { + *ppos += cnt; + ret = cnt; + } + + return ret; +} + +static int tracing_snapshot_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + int ret; + + ret = tracing_release(inode, file); + + if (file->f_mode & FMODE_READ) + return ret; + + /* If write only, the seq_file is just a stub */ + if (m) + kfree(m->private); + kfree(m); + + return 0; +} + +static int snapshot_raw_open(struct inode *inode, struct file *filp) +{ + struct ftrace_buffer_info *info; + int ret; + + /* The following checks for tracefs lockdown */ + ret = tracing_buffers_open(inode, filp); + if (ret < 0) + return ret; + + info = 
filp->private_data; + + if (tracer_uses_snapshot(info->iter.trace)) { + tracing_buffers_release(inode, filp); + return -EBUSY; + } + + info->iter.snapshot = true; + info->iter.array_buffer = &info->iter.tr->snapshot_buffer; + + return ret; +} + +const struct file_operations snapshot_fops = { + .open = tracing_snapshot_open, + .read = seq_read, + .write = tracing_snapshot_write, + .llseek = tracing_lseek, + .release = tracing_snapshot_release, +}; + +const struct file_operations snapshot_raw_fops = { + .open = snapshot_raw_open, + .read = tracing_buffers_read, + .release = tracing_buffers_release, + .splice_read = tracing_buffers_splice_read, +}; + +#ifdef CONFIG_TRACER_MAX_TRACE +static ssize_t +tracing_max_lat_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + + return tracing_nsecs_read(&tr->max_latency, ubuf, cnt, ppos); +} + +static ssize_t +tracing_max_lat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + + return tracing_nsecs_write(&tr->max_latency, ubuf, cnt, ppos); +} + +static const struct file_operations tracing_max_lat_fops = { + .open = tracing_open_generic_tr, + .read = tracing_max_lat_read, + .write = tracing_max_lat_write, + .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, +}; +#endif /* CONFIG_TRACER_MAX_TRACE */ + +int get_snapshot_map(struct trace_array *tr) +{ + int err = 0; + + /* + * Called with mmap_lock held. lockdep would be unhappy if we would now + * take trace_types_lock. Instead use the specific + * snapshot_trigger_lock. 
+ */ + spin_lock(&tr->snapshot_trigger_lock); + + if (tr->snapshot || tr->mapped == UINT_MAX) + err = -EBUSY; + else + tr->mapped++; + + spin_unlock(&tr->snapshot_trigger_lock); + + /* Wait for update_max_tr() to observe iter->tr->mapped */ + if (tr->mapped == 1) + synchronize_rcu(); + + return err; + +} + +void put_snapshot_map(struct trace_array *tr) +{ + spin_lock(&tr->snapshot_trigger_lock); + if (!WARN_ON(!tr->mapped)) + tr->mapped--; + spin_unlock(&tr->snapshot_trigger_lock); +} + +#ifdef CONFIG_DYNAMIC_FTRACE +static void +ftrace_snapshot(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) +{ + tracing_snapshot_instance(tr); +} + +static void +ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) +{ + struct ftrace_func_mapper *mapper = data; + long *count = NULL; + + if (mapper) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + + if (count) { + + if (*count <= 0) + return; + + (*count)--; + } + + tracing_snapshot_instance(tr); +} + +static int +ftrace_snapshot_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + struct ftrace_func_mapper *mapper = data; + long *count = NULL; + + seq_printf(m, "%ps:", (void *)ip); + + seq_puts(m, "snapshot"); + + if (mapper) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + + if (count) + seq_printf(m, ":count=%ld\n", *count); + else + seq_puts(m, ":unlimited\n"); + + return 0; +} + +static int +ftrace_snapshot_init(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *init_data, void **data) +{ + struct ftrace_func_mapper *mapper = *data; + + if (!mapper) { + mapper = allocate_ftrace_func_mapper(); + if (!mapper) + return -ENOMEM; + *data = mapper; + } + + return ftrace_func_mapper_add_ip(mapper, ip, init_data); +} + +static void +ftrace_snapshot_free(struct ftrace_probe_ops *ops, struct trace_array 
*tr, + unsigned long ip, void *data) +{ + struct ftrace_func_mapper *mapper = data; + + if (!ip) { + if (!mapper) + return; + free_ftrace_func_mapper(mapper, NULL); + return; + } + + ftrace_func_mapper_remove_ip(mapper, ip); +} + +static struct ftrace_probe_ops snapshot_probe_ops = { + .func = ftrace_snapshot, + .print = ftrace_snapshot_print, +}; + +static struct ftrace_probe_ops snapshot_count_probe_ops = { + .func = ftrace_count_snapshot, + .print = ftrace_snapshot_print, + .init = ftrace_snapshot_init, + .free = ftrace_snapshot_free, +}; + +static int +ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + void *count = (void *)-1; + char *number; + int ret; + + if (!tr) + return -ENODEV; + + /* hash funcs only work with set_ftrace_filter */ + if (!enable) + return -EINVAL; + + ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops; + + if (glob[0] == '!') { + ret = unregister_ftrace_function_probe_func(glob+1, tr, ops); + if (!ret) + tracing_disarm_snapshot(tr); + + return ret; + } + + if (!param) + goto out_reg; + + number = strsep(&param, ":"); + + if (!strlen(number)) + goto out_reg; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, (unsigned long *)&count); + if (ret) + return ret; + + out_reg: + ret = tracing_arm_snapshot(tr); + if (ret < 0) + return ret; + + ret = register_ftrace_function_probe(glob, tr, ops, count); + if (ret < 0) + tracing_disarm_snapshot(tr); + + return ret < 0 ? 
ret : 0; +} + +static struct ftrace_func_command ftrace_snapshot_cmd = { + .name = "snapshot", + .func = ftrace_trace_snapshot_callback, +}; + +__init int register_snapshot_cmd(void) +{ + return register_ftrace_command(&ftrace_snapshot_cmd); +} +#endif /* CONFIG_DYNAMIC_FTRACE */ + +int trace_allocate_snapshot(struct trace_array *tr, int size) +{ + int ret; + + /* Fix mapped buffer trace arrays do not have snapshot buffers */ + if (tr->range_addr_start) + return 0; + + /* allocate_snapshot can only be true during system boot */ + ret = allocate_trace_buffer(tr, &tr->snapshot_buffer, + allocate_snapshot ? size : 1); + if (ret < 0) + return -ENOMEM; + + tr->allocated_snapshot = allocate_snapshot; + + allocate_snapshot = false; + return 0; +} + +__init static bool tr_needs_alloc_snapshot(const char *name) +{ + char *test; + int len = strlen(name); + bool ret; + + if (!boot_snapshot_index) + return false; + + if (strncmp(name, boot_snapshot_info, len) == 0 && + boot_snapshot_info[len] == '\t') + return true; + + test = kmalloc(strlen(name) + 3, GFP_KERNEL); + if (!test) + return false; + + sprintf(test, "\t%s\t", name); + ret = strstr(boot_snapshot_info, test) == NULL; + kfree(test); + return ret; +} + +__init void do_allocate_snapshot(const char *name) +{ + if (!tr_needs_alloc_snapshot(name)) + return; + + /* + * When allocate_snapshot is set, the next call to + * allocate_trace_buffers() (called by trace_array_get_by_name()) + * will allocate the snapshot buffer. That will also clear + * this flag. 
+ */ + allocate_snapshot = true; +} + +void __init ftrace_boot_snapshot(void) +{ + struct trace_array *tr; + + if (!snapshot_at_boot) + return; + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (!tr->allocated_snapshot) + continue; + + tracing_snapshot_instance(tr); + trace_array_puts(tr, "** Boot snapshot taken **\n"); + } +} diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 37317b81fcda..8ad72e17d8eb 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -174,7 +174,6 @@ sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadat { O_NOFOLLOW, "O_NOFOLLOW" }, { O_NOATIME, "O_NOATIME" }, { O_CLOEXEC, "O_CLOEXEC" }, - { -1, NULL } }; trace_seq_printf(s, "%s(", entry->name); @@ -205,7 +204,7 @@ sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadat trace_seq_puts(s, "O_RDONLY|"); } - trace_print_flags_seq(s, "|", bits, __flags); + trace_print_flags_seq(s, "|", bits, __flags, ARRAY_SIZE(__flags)); /* * trace_print_flags_seq() adds a '\0' to the * buffer, but this needs to append more to the seq. 
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index bf1a507695b6..0dd7927df22a 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -386,13 +386,11 @@ static void tracing_map_elt_init_fields(struct tracing_map_elt *elt) } } -static void tracing_map_elt_free(struct tracing_map_elt *elt) +static void __tracing_map_elt_free(struct tracing_map_elt *elt) { if (!elt) return; - if (elt->map->ops && elt->map->ops->elt_free) - elt->map->ops->elt_free(elt); kfree(elt->fields); kfree(elt->vars); kfree(elt->var_set); @@ -400,6 +398,17 @@ static void tracing_map_elt_free(struct tracing_map_elt *elt) kfree(elt); } +static void tracing_map_elt_free(struct tracing_map_elt *elt) +{ + if (!elt) + return; + + /* Only objects initialized with alloc_elt() should be passed to free_elt().*/ + if (elt->map->ops && elt->map->ops->elt_free) + elt->map->ops->elt_free(elt); + __tracing_map_elt_free(elt); +} + static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map) { struct tracing_map_elt *elt; @@ -444,7 +453,7 @@ static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map) } return elt; free: - tracing_map_elt_free(elt); + __tracing_map_elt_free(elt); return ERR_PTR(err); } diff --git a/kernel/trace/undefsyms_base.c b/kernel/trace/undefsyms_base.c new file mode 100644 index 000000000000..e65baf58e6ff --- /dev/null +++ b/kernel/trace/undefsyms_base.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * simple_ring_buffer is used by the pKVM hypervisor which does not have access + * to all kernel symbols. Whatever is undefined when compiling this file is + * compiler and tooling-generated symbols that can safely be ignored for + * simple_ring_buffer. 
+ */ + +#include <linux/atomic.h> +#include <linux/string.h> +#include <asm/page.h> + +void undefsyms_base(void *p, int n); + +static char page[PAGE_SIZE] __aligned(PAGE_SIZE); + +void undefsyms_base(void *p, int n) +{ + char buffer[256] = { 0 }; + + u32 u = 0; + memset((char * volatile)page, 8, PAGE_SIZE); + memset((char * volatile)buffer, 8, sizeof(buffer)); + memcpy((void * volatile)p, buffer, sizeof(buffer)); + cmpxchg((u32 * volatile)&u, 0, 8); + WARN_ON(n == 0xdeadbeef); +} |
