summaryrefslogtreecommitdiff
path: root/kernel/trace
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/trace')
-rw-r--r--kernel/trace/Kconfig14
-rw-r--r--kernel/trace/Makefile34
-rw-r--r--kernel/trace/bpf_trace.c7
-rw-r--r--kernel/trace/fprobe.c491
-rw-r--r--kernel/trace/ftrace.c34
-rw-r--r--kernel/trace/remote_test.c261
-rw-r--r--kernel/trace/remote_test_events.h10
-rw-r--r--kernel/trace/ring_buffer.c430
-rw-r--r--kernel/trace/rv/Kconfig18
-rw-r--r--kernel/trace/rv/Makefile3
-rw-r--r--kernel/trace/rv/monitors/deadline/Kconfig10
-rw-r--r--kernel/trace/rv/monitors/deadline/deadline.c44
-rw-r--r--kernel/trace/rv/monitors/deadline/deadline.h202
-rw-r--r--kernel/trace/rv/monitors/nomiss/Kconfig15
-rw-r--r--kernel/trace/rv/monitors/nomiss/nomiss.c293
-rw-r--r--kernel/trace/rv/monitors/nomiss/nomiss.h123
-rw-r--r--kernel/trace/rv/monitors/nomiss/nomiss_trace.h19
-rw-r--r--kernel/trace/rv/monitors/opid/Kconfig11
-rw-r--r--kernel/trace/rv/monitors/opid/opid.c111
-rw-r--r--kernel/trace/rv/monitors/opid/opid.h86
-rw-r--r--kernel/trace/rv/monitors/opid/opid_trace.h4
-rw-r--r--kernel/trace/rv/monitors/sleep/sleep.c8
-rw-r--r--kernel/trace/rv/monitors/sleep/sleep.h98
-rw-r--r--kernel/trace/rv/monitors/stall/Kconfig13
-rw-r--r--kernel/trace/rv/monitors/stall/stall.c150
-rw-r--r--kernel/trace/rv/monitors/stall/stall.h81
-rw-r--r--kernel/trace/rv/monitors/stall/stall_trace.h19
-rw-r--r--kernel/trace/rv/rv_trace.h67
-rw-r--r--kernel/trace/simple_ring_buffer.c517
-rw-r--r--kernel/trace/trace.c1414
-rw-r--r--kernel/trace/trace.h150
-rw-r--r--kernel/trace/trace_boot.c5
-rw-r--r--kernel/trace/trace_branch.c8
-rw-r--r--kernel/trace/trace_events.c107
-rw-r--r--kernel/trace/trace_events_hist.c29
-rw-r--r--kernel/trace/trace_events_synth.c4
-rw-r--r--kernel/trace/trace_events_trigger.c79
-rw-r--r--kernel/trace/trace_kprobe.c11
-rw-r--r--kernel/trace/trace_osnoise.c64
-rw-r--r--kernel/trace/trace_output.c32
-rw-r--r--kernel/trace/trace_printk.c1
-rw-r--r--kernel/trace/trace_probe.c10
-rw-r--r--kernel/trace/trace_probe.h4
-rw-r--r--kernel/trace/trace_remote.c1384
-rw-r--r--kernel/trace/trace_snapshot.c1066
-rw-r--r--kernel/trace/trace_syscalls.c3
-rw-r--r--kernel/trace/tracing_map.c17
-rw-r--r--kernel/trace/undefsyms_base.c28
48 files changed, 5814 insertions, 1775 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 49de13cae428..e130da35808f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -1281,4 +1281,18 @@ config HIST_TRIGGERS_DEBUG
source "kernel/trace/rv/Kconfig"
+config TRACE_REMOTE
+ bool
+
+config SIMPLE_RING_BUFFER
+ bool
+
+config TRACE_REMOTE_TEST
+ tristate "Test module for remote tracing"
+ select TRACE_REMOTE
+ select SIMPLE_RING_BUFFER
+ help
+ This trace remote includes a ring-buffer writer implementation using
+ "simple_ring_buffer". This is solely intended for testing.
+
endif # FTRACE
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 04096c21d06b..8d3d96e847d8 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -69,6 +69,7 @@ obj-$(CONFIG_TRACING) += trace_seq.o
obj-$(CONFIG_TRACING) += trace_stat.o
obj-$(CONFIG_TRACING) += trace_printk.o
obj-$(CONFIG_TRACING) += trace_pid.o
+obj-$(CONFIG_TRACER_SNAPSHOT) += trace_snapshot.o
obj-$(CONFIG_TRACING) += pid_list.o
obj-$(CONFIG_TRACING_MAP) += tracing_map.o
obj-$(CONFIG_PREEMPTIRQ_DELAY_TEST) += preemptirq_delay_test.o
@@ -128,4 +129,37 @@ obj-$(CONFIG_FPROBE_EVENTS) += trace_fprobe.o
obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
obj-$(CONFIG_RV) += rv/
+obj-$(CONFIG_TRACE_REMOTE) += trace_remote.o
+obj-$(CONFIG_SIMPLE_RING_BUFFER) += simple_ring_buffer.o
+obj-$(CONFIG_TRACE_REMOTE_TEST) += remote_test.o
+
+# simple_ring_buffer is used by the pKVM hypervisor which does not have access
+# to all kernel symbols. Fail the build if forbidden symbols are found.
+
+# Basic compiler and tooling-generated symbols that can safely be left
+# undefined. Ensure KASAN is enabled to avoid logic that may disable
+# FORTIFY_SOURCE when KASAN is not enabled. undefsyms_base.o does not
+# automatically get KASAN flags because it is not linked into vmlinux.
+targets += undefsyms_base.o
+KASAN_SANITIZE_undefsyms_base.o := y
+
+UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __msan \
+ __aeabi_unwind_cpp __s390_indirect_jump __x86_indirect_thunk simple_ring_buffer \
+ $(shell $(NM) -u $(obj)/undefsyms_base.o 2>/dev/null | awk '{print $$2}')
+
+quiet_cmd_check_undefined = NM $<
+ cmd_check_undefined = \
+ undefsyms=$$($(NM) -u $< | grep -v $(addprefix -e , $(UNDEFINED_ALLOWLIST)) || true); \
+ if [ -n "$$undefsyms" ]; then \
+ echo "Unexpected symbols in $<:" >&2; \
+ echo "$$undefsyms" >&2; \
+ false; \
+ fi; \
+ touch $@
+
+$(obj)/%.o.checked: $(obj)/%.o $(obj)/undefsyms_base.o FORCE
+ $(call if_changed,check_undefined)
+
+always-$(CONFIG_SIMPLE_RING_BUFFER) += simple_ring_buffer.o.checked
+
libftrace-y := ftrace.o
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 0b040a417442..a02bd258677e 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2384,7 +2384,8 @@ static void bpf_kprobe_multi_link_release(struct bpf_link *link)
struct bpf_kprobe_multi_link *kmulti_link;
kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
- unregister_fprobe(&kmulti_link->fp);
+ /* Don't wait for RCU GP here. */
+ unregister_fprobe_async(&kmulti_link->fp);
kprobe_multi_put_modules(kmulti_link->mods, kmulti_link->mods_cnt);
}
@@ -2752,6 +2753,10 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (!is_kprobe_multi(prog))
return -EINVAL;
+ /* kprobe_multi is not allowed to be sleepable. */
+ if (prog->sleepable)
+ return -EINVAL;
+
/* Writing to context is not allowed for kprobes. */
if (prog->aux->kprobe_write_ctx)
return -EINVAL;
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index dcadf1d23b8a..f378613ad120 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -4,6 +4,7 @@
*/
#define pr_fmt(fmt) "fprobe: " fmt
+#include <linux/cleanup.h>
#include <linux/err.h>
#include <linux/fprobe.h>
#include <linux/kallsyms.h>
@@ -78,36 +79,33 @@ static const struct rhashtable_params fprobe_rht_params = {
};
/* Node insertion and deletion requires the fprobe_mutex */
-static int insert_fprobe_node(struct fprobe_hlist_node *node)
+static int __insert_fprobe_node(struct fprobe_hlist_node *node, struct fprobe *fp)
{
+ int ret;
+
lockdep_assert_held(&fprobe_mutex);
- return rhltable_insert(&fprobe_ip_table, &node->hlist, fprobe_rht_params);
+ ret = rhltable_insert(&fprobe_ip_table, &node->hlist, fprobe_rht_params);
+ /* Set the fprobe pointer if insertion was successful. */
+ if (!ret)
+ WRITE_ONCE(node->fp, fp);
+ return ret;
}
-/* Return true if there are synonims */
-static bool delete_fprobe_node(struct fprobe_hlist_node *node)
+static void __delete_fprobe_node(struct fprobe_hlist_node *node)
{
lockdep_assert_held(&fprobe_mutex);
- bool ret;
- /* Avoid double deleting */
+ /* Avoid double deleting and non-inserted nodes */
if (READ_ONCE(node->fp) != NULL) {
WRITE_ONCE(node->fp, NULL);
rhltable_remove(&fprobe_ip_table, &node->hlist,
fprobe_rht_params);
}
-
- rcu_read_lock();
- ret = !!rhltable_lookup(&fprobe_ip_table, &node->addr,
- fprobe_rht_params);
- rcu_read_unlock();
-
- return ret;
}
/* Check existence of the fprobe */
-static bool is_fprobe_still_exist(struct fprobe *fp)
+static bool fprobe_registered(struct fprobe *fp)
{
struct hlist_head *head;
struct fprobe_hlist *fph;
@@ -120,7 +118,7 @@ static bool is_fprobe_still_exist(struct fprobe *fp)
}
return false;
}
-NOKPROBE_SYMBOL(is_fprobe_still_exist);
+NOKPROBE_SYMBOL(fprobe_registered);
static int add_fprobe_hash(struct fprobe *fp)
{
@@ -132,9 +130,6 @@ static int add_fprobe_hash(struct fprobe *fp)
if (WARN_ON_ONCE(!fph))
return -EINVAL;
- if (is_fprobe_still_exist(fp))
- return -EEXIST;
-
head = &fprobe_table[hash_ptr(fp, FPROBE_HASH_BITS)];
hlist_add_head_rcu(&fp->hlist_array->hlist, head);
return 0;
@@ -149,7 +144,7 @@ static int del_fprobe_hash(struct fprobe *fp)
if (WARN_ON_ONCE(!fph))
return -EINVAL;
- if (!is_fprobe_still_exist(fp))
+ if (!fprobe_registered(fp))
return -ENOENT;
fph->fp = NULL;
@@ -255,7 +250,65 @@ static inline int __fprobe_kprobe_handler(unsigned long ip, unsigned long parent
return ret;
}
+static int fprobe_fgraph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
+ struct ftrace_regs *fregs);
+static void fprobe_return(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs);
+
+static struct fgraph_ops fprobe_graph_ops = {
+ .entryfunc = fprobe_fgraph_entry,
+ .retfunc = fprobe_return,
+};
+/* Number of fgraph fprobe nodes */
+static int nr_fgraph_fprobes;
+/* Is fprobe_graph_ops registered? */
+static bool fprobe_graph_registered;
+
+/*
+ * Add the @num addresses in @addrs to the fgraph ops filter and register
+ * the fgraph ops if that has not been done yet.
+ *
+ * Returns 0 on success, or the error from ftrace_set_filter_ips() or
+ * register_ftrace_graph(). If registration fails the filter is freed.
+ * Caller must hold fprobe_mutex.
+ */
+static int fprobe_graph_add_ips(unsigned long *addrs, int num)
+{
+	int err;
+
+	lockdep_assert_held(&fprobe_mutex);
+
+	err = ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 0, 0);
+	if (err)
+		return err;
+
+	/* Already registered: extending the filter was all that was needed. */
+	if (fprobe_graph_registered)
+		return 0;
+
+	err = register_ftrace_graph(&fprobe_graph_ops);
+	if (WARN_ON_ONCE(err)) {
+		ftrace_free_filter(&fprobe_graph_ops.ops);
+		return err;
+	}
+
+	fprobe_graph_registered = true;
+	return 0;
+}
+
+/* Unregister the fgraph ops and free its filter; does nothing if not registered. */
+static void __fprobe_graph_unregister(void)
+{
+	if (!fprobe_graph_registered)
+		return;
+
+	unregister_ftrace_graph(&fprobe_graph_ops);
+	ftrace_free_filter(&fprobe_graph_ops.ops);
+	fprobe_graph_registered = false;
+}
+
+/*
+ * Remove @addrs from the fgraph ops filter. If no fgraph fprobe node is
+ * left (nr_fgraph_fprobes is zero), unregister the fgraph ops instead.
+ * Caller must hold fprobe_mutex.
+ */
+static void fprobe_graph_remove_ips(unsigned long *addrs, int num)
+{
+	lockdep_assert_held(&fprobe_mutex);
+
+	if (nr_fgraph_fprobes == 0) {
+		__fprobe_graph_unregister();
+		return;
+	}
+
+	if (num)
+		ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0);
+}
+
#if defined(CONFIG_DYNAMIC_FTRACE_WITH_ARGS) || defined(CONFIG_DYNAMIC_FTRACE_WITH_REGS)
+
/* ftrace_ops callback, this processes fprobes which have only entry_handler. */
static void fprobe_ftrace_entry(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ops, struct ftrace_regs *fregs)
@@ -298,7 +351,10 @@ static struct ftrace_ops fprobe_ftrace_ops = {
.func = fprobe_ftrace_entry,
.flags = FTRACE_OPS_FL_SAVE_ARGS,
};
-static int fprobe_ftrace_active;
+/* Number of ftrace fprobe nodes */
+static int nr_ftrace_fprobes;
+/* Is fprobe_ftrace_ops registered? */
+static bool fprobe_ftrace_registered;
static int fprobe_ftrace_add_ips(unsigned long *addrs, int num)
{
@@ -310,25 +366,33 @@ static int fprobe_ftrace_add_ips(unsigned long *addrs, int num)
if (ret)
return ret;
- if (!fprobe_ftrace_active) {
+ if (!fprobe_ftrace_registered) {
ret = register_ftrace_function(&fprobe_ftrace_ops);
if (ret) {
ftrace_free_filter(&fprobe_ftrace_ops);
return ret;
}
+ fprobe_ftrace_registered = true;
}
- fprobe_ftrace_active++;
return 0;
}
+/* Unregister the ftrace ops and free its filter; does nothing if not registered. */
+static void __fprobe_ftrace_unregister(void)
+{
+	if (!fprobe_ftrace_registered)
+		return;
+
+	unregister_ftrace_function(&fprobe_ftrace_ops);
+	ftrace_free_filter(&fprobe_ftrace_ops);
+	fprobe_ftrace_registered = false;
+}
+
static void fprobe_ftrace_remove_ips(unsigned long *addrs, int num)
{
lockdep_assert_held(&fprobe_mutex);
- fprobe_ftrace_active--;
- if (!fprobe_ftrace_active)
- unregister_ftrace_function(&fprobe_ftrace_ops);
- if (num)
+ if (!nr_ftrace_fprobes)
+ __fprobe_ftrace_unregister();
+ else if (num)
ftrace_set_filter_ips(&fprobe_ftrace_ops, addrs, num, 1, 0);
}
@@ -337,12 +401,78 @@ static bool fprobe_is_ftrace(struct fprobe *fp)
return !fp->exit_handler;
}
+/*
+ * Insert @node for @fp into the ip table and, on success, count it in the
+ * per-type node counter. Node insertion and deletion requires the
+ * fprobe_mutex.
+ *
+ * Returns 0 on success or the error from __insert_fprobe_node().
+ */
+static int insert_fprobe_node(struct fprobe_hlist_node *node, struct fprobe *fp)
+{
+	int err;
+
+	lockdep_assert_held(&fprobe_mutex);
+
+	err = __insert_fprobe_node(node, fp);
+	if (err)
+		return err;
+
+	/* Account the new node against the ops type that serves @fp. */
+	if (fprobe_is_ftrace(fp))
+		nr_ftrace_fprobes++;
+	else
+		nr_fgraph_fprobes++;
+
+	return 0;
+}
+
+/*
+ * Remove @node from the ip table. If the node still has an owning fprobe
+ * (node->fp is set), the matching per-type node counter is decremented
+ * first. Caller must hold fprobe_mutex.
+ */
+static void delete_fprobe_node(struct fprobe_hlist_node *node)
+{
+	struct fprobe *owner;
+
+	lockdep_assert_held(&fprobe_mutex);
+
+	owner = READ_ONCE(node->fp);
+	if (owner && fprobe_is_ftrace(owner))
+		nr_ftrace_fprobes--;
+	else if (owner)
+		nr_fgraph_fprobes--;
+
+	__delete_fprobe_node(node);
+}
+
+/*
+ * Return true if the ip table holds a node for @ip whose fprobe is of the
+ * requested type: ftrace-based when @ftrace is true (no exit_handler),
+ * fgraph-based when @ftrace is false (exit_handler set).
+ */
+static bool fprobe_exists_on_hash(unsigned long ip, bool ftrace)
+{
+	struct rhlist_head *list, *iter;
+	struct fprobe_hlist_node *cur;
+	struct fprobe *fp;
+
+	guard(rcu)();
+	list = rhltable_lookup(&fprobe_ip_table, &ip, fprobe_rht_params);
+	if (!list)
+		return false;
+
+	/* Only a node whose fprobe has the same type counts as a match. */
+	rhl_for_each_entry_rcu(cur, iter, list, hlist) {
+		if (cur->addr != ip)
+			break;
+		fp = READ_ONCE(cur->fp);
+		if (unlikely(!fp))
+			continue;
+		if (fprobe_is_ftrace(fp) == ftrace)
+			return true;
+	}
+
+	return false;
+}
+
#ifdef CONFIG_MODULES
-static void fprobe_set_ips(unsigned long *ips, unsigned int cnt, int remove,
- int reset)
+static void fprobe_remove_ips(unsigned long *ips, unsigned int cnt)
{
- ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, remove, reset);
- ftrace_set_filter_ips(&fprobe_ftrace_ops, ips, cnt, remove, reset);
+ if (!nr_fgraph_fprobes)
+ __fprobe_graph_unregister();
+ else if (cnt)
+ ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, 1, 0);
+
+ if (!nr_ftrace_fprobes)
+ __fprobe_ftrace_unregister();
+ else if (cnt)
+ ftrace_set_filter_ips(&fprobe_ftrace_ops, ips, cnt, 1, 0);
}
#endif
#else
@@ -360,11 +490,62 @@ static bool fprobe_is_ftrace(struct fprobe *fp)
return false;
}
+/*
+ * Insert @node for @fp into the ip table and, on success, count it in
+ * nr_fgraph_fprobes. Node insertion and deletion requires the fprobe_mutex.
+ *
+ * Returns 0 on success or the error from __insert_fprobe_node().
+ */
+static int insert_fprobe_node(struct fprobe_hlist_node *node, struct fprobe *fp)
+{
+	int err;
+
+	lockdep_assert_held(&fprobe_mutex);
+
+	err = __insert_fprobe_node(node, fp);
+	if (err)
+		return err;
+
+	/* fprobe_is_ftrace() is always false here, so only fgraph is counted. */
+	nr_fgraph_fprobes++;
+	return 0;
+}
+
+/*
+ * Remove @node from the ip table. If the node still has an owning fprobe
+ * (node->fp is set), nr_fgraph_fprobes is decremented first.
+ * Caller must hold fprobe_mutex.
+ */
+static void delete_fprobe_node(struct fprobe_hlist_node *node)
+{
+	lockdep_assert_held(&fprobe_mutex);
+
+	if (READ_ONCE(node->fp) != NULL)
+		nr_fgraph_fprobes--;
+
+	__delete_fprobe_node(node);
+}
+
+/*
+ * Return true if the ip table holds at least one node for @ip that still
+ * has an owning fprobe. @ftrace is ignored in this configuration.
+ */
+static bool fprobe_exists_on_hash(unsigned long ip, bool ftrace __maybe_unused)
+{
+	struct rhlist_head *list, *iter;
+	struct fprobe_hlist_node *cur;
+
+	guard(rcu)();
+	list = rhltable_lookup(&fprobe_ip_table, &ip, fprobe_rht_params);
+	if (!list)
+		return false;
+
+	/* Any node on this address with a live fprobe is enough. */
+	rhl_for_each_entry_rcu(cur, iter, list, hlist) {
+		if (cur->addr != ip)
+			break;
+		if (likely(READ_ONCE(cur->fp)))
+			return true;
+	}
+
+	return false;
+}
+
#ifdef CONFIG_MODULES
-static void fprobe_set_ips(unsigned long *ips, unsigned int cnt, int remove,
- int reset)
+static void fprobe_remove_ips(unsigned long *ips, unsigned int cnt)
{
- ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, remove, reset);
+ if (!nr_fgraph_fprobes)
+ __fprobe_graph_unregister();
+ else if (cnt)
+ ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, 1, 0);
}
#endif
#endif /* !CONFIG_DYNAMIC_FTRACE_WITH_ARGS && !CONFIG_DYNAMIC_FTRACE_WITH_REGS */
@@ -450,8 +631,6 @@ static int fprobe_fgraph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops
used += FPROBE_HEADER_SIZE_IN_LONG + size_words;
}
}
- if (used < reserved_words)
- memset(fgraph_data + used, 0, reserved_words - used);
/* If any exit_handler is set, data must be used. */
return used != 0;
@@ -482,7 +661,7 @@ static void fprobe_return(struct ftrace_graph_ret *trace,
if (!fp)
break;
curr += FPROBE_HEADER_SIZE_IN_LONG;
- if (is_fprobe_still_exist(fp) && !fprobe_disabled(fp)) {
+ if (fprobe_registered(fp) && !fprobe_disabled(fp)) {
if (WARN_ON_ONCE(curr + size > size_words))
break;
fp->exit_handler(fp, trace->func, ret_ip, fregs,
@@ -494,51 +673,9 @@ static void fprobe_return(struct ftrace_graph_ret *trace,
}
NOKPROBE_SYMBOL(fprobe_return);
-static struct fgraph_ops fprobe_graph_ops = {
- .entryfunc = fprobe_fgraph_entry,
- .retfunc = fprobe_return,
-};
-static int fprobe_graph_active;
-
-/* Add @addrs to the ftrace filter and register fgraph if needed. */
-static int fprobe_graph_add_ips(unsigned long *addrs, int num)
-{
- int ret;
-
- lockdep_assert_held(&fprobe_mutex);
-
- ret = ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 0, 0);
- if (ret)
- return ret;
-
- if (!fprobe_graph_active) {
- ret = register_ftrace_graph(&fprobe_graph_ops);
- if (WARN_ON_ONCE(ret)) {
- ftrace_free_filter(&fprobe_graph_ops.ops);
- return ret;
- }
- }
- fprobe_graph_active++;
- return 0;
-}
-
-/* Remove @addrs from the ftrace filter and unregister fgraph if possible. */
-static void fprobe_graph_remove_ips(unsigned long *addrs, int num)
-{
- lockdep_assert_held(&fprobe_mutex);
-
- fprobe_graph_active--;
- /* Q: should we unregister it ? */
- if (!fprobe_graph_active)
- unregister_ftrace_graph(&fprobe_graph_ops);
-
- if (num)
- ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0);
-}
-
#ifdef CONFIG_MODULES
-#define FPROBE_IPS_BATCH_INIT 8
+#define FPROBE_IPS_BATCH_INIT 128
/* instruction pointer address list */
struct fprobe_addr_list {
int index;
@@ -546,43 +683,29 @@ struct fprobe_addr_list {
unsigned long *addrs;
};
-static int fprobe_addr_list_add(struct fprobe_addr_list *alist, unsigned long addr)
+static int fprobe_remove_node_in_module(struct module *mod, struct fprobe_hlist_node *node,
+ struct fprobe_addr_list *alist)
{
- unsigned long *addrs;
-
- /* Previously we failed to expand the list. */
- if (alist->index == alist->size)
- return -ENOSPC;
+ lockdep_assert_in_rcu_read_lock();
- alist->addrs[alist->index++] = addr;
- if (alist->index < alist->size)
+ if (!within_module(node->addr, mod))
return 0;
- /* Expand the address list */
- addrs = kcalloc(alist->size * 2, sizeof(*addrs), GFP_KERNEL);
- if (!addrs)
- return -ENOMEM;
-
- memcpy(addrs, alist->addrs, alist->size * sizeof(*addrs));
- alist->size *= 2;
- kfree(alist->addrs);
- alist->addrs = addrs;
-
- return 0;
-}
-
-static void fprobe_remove_node_in_module(struct module *mod, struct fprobe_hlist_node *node,
- struct fprobe_addr_list *alist)
-{
- if (!within_module(node->addr, mod))
- return;
- if (delete_fprobe_node(node))
- return;
+ delete_fprobe_node(node);
+ /* If no address list is available, we can't track this address. */
+ if (!alist->addrs)
+ return 0;
/*
- * If failed to update alist, just continue to update hlist.
- * Therefore, at list user handler will not hit anymore.
+ * The fprobe type does not matter here, because all fprobes on the
+ * same address must be removed eventually.
*/
- fprobe_addr_list_add(alist, node->addr);
+ if (!rhltable_lookup(&fprobe_ip_table, &node->addr, fprobe_rht_params)) {
+ alist->addrs[alist->index++] = node->addr;
+ if (alist->index == alist->size)
+ return -ENOSPC;
+ }
+
+ return 0;
}
/* Handle module unloading to manage fprobe_ip_table. */
@@ -593,29 +716,48 @@ static int fprobe_module_callback(struct notifier_block *nb,
struct fprobe_hlist_node *node;
struct rhashtable_iter iter;
struct module *mod = data;
+ bool retry;
if (val != MODULE_STATE_GOING)
return NOTIFY_DONE;
alist.addrs = kcalloc(alist.size, sizeof(*alist.addrs), GFP_KERNEL);
- /* If failed to alloc memory, we can not remove ips from hash. */
- if (!alist.addrs)
- return NOTIFY_DONE;
+ /*
+ * If failed to alloc memory, ftrace_ops will not be able to remove ips from
+ * hash, but we can still remove nodes from fprobe_ip_table, so we can avoid
+ * the potential wrong callback. So just print a warning here and try to
+ * continue without address list.
+ */
+ WARN_ONCE(!alist.addrs,
+ "Failed to allocate memory for fprobe_addr_list, ftrace_ops will not be updated");
mutex_lock(&fprobe_mutex);
+again:
+ retry = false;
+ alist.index = 0;
rhltable_walk_enter(&fprobe_ip_table, &iter);
do {
rhashtable_walk_start(&iter);
while ((node = rhashtable_walk_next(&iter)) && !IS_ERR(node))
- fprobe_remove_node_in_module(mod, node, &alist);
+ if (fprobe_remove_node_in_module(mod, node, &alist) < 0) {
+ retry = true;
+ break;
+ }
rhashtable_walk_stop(&iter);
- } while (node == ERR_PTR(-EAGAIN));
+ } while (node == ERR_PTR(-EAGAIN) && !retry);
rhashtable_walk_exit(&iter);
+ /* Remove any ips from hash table(s) */
+ fprobe_remove_ips(alist.addrs, alist.index);
+ /*
+ * If the rhashtable walk was aborted for a reason other than -EAGAIN
+ * (i.e. the address list filled up), restart the walk from the
+ * beginning for safety. This is not a hot path anyway.
+ */
+ if (retry)
+ goto again;
- if (alist.index > 0)
- fprobe_set_ips(alist.addrs, alist.index, 1, 0);
mutex_unlock(&fprobe_mutex);
kfree(alist.addrs);
@@ -759,7 +901,6 @@ static int fprobe_init(struct fprobe *fp, unsigned long *addrs, int num)
fp->hlist_array = hlist_array;
hlist_array->fp = fp;
for (i = 0; i < num; i++) {
- hlist_array->array[i].fp = fp;
addr = ftrace_location(addrs[i]);
if (!addr) {
fprobe_fail_cleanup(fp);
@@ -823,6 +964,8 @@ int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter
}
EXPORT_SYMBOL_GPL(register_fprobe);
+static int unregister_fprobe_nolock(struct fprobe *fp);
+
/**
* register_fprobe_ips() - Register fprobe to ftrace by address.
* @fp: A fprobe data structure to be registered.
@@ -841,35 +984,33 @@ int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
struct fprobe_hlist *hlist_array;
int ret, i;
+ guard(mutex)(&fprobe_mutex);
+ if (fprobe_registered(fp))
+ return -EEXIST;
+
ret = fprobe_init(fp, addrs, num);
if (ret)
return ret;
- mutex_lock(&fprobe_mutex);
-
- hlist_array = fp->hlist_array;
if (fprobe_is_ftrace(fp))
ret = fprobe_ftrace_add_ips(addrs, num);
else
ret = fprobe_graph_add_ips(addrs, num);
-
- if (!ret) {
- add_fprobe_hash(fp);
- for (i = 0; i < hlist_array->size; i++) {
- ret = insert_fprobe_node(&hlist_array->array[i]);
- if (ret)
- break;
- }
- /* fallback on insert error */
- if (ret) {
- for (i--; i >= 0; i--)
- delete_fprobe_node(&hlist_array->array[i]);
- }
+ if (ret) {
+ fprobe_fail_cleanup(fp);
+ return ret;
}
- mutex_unlock(&fprobe_mutex);
- if (ret)
- fprobe_fail_cleanup(fp);
+ hlist_array = fp->hlist_array;
+ ret = add_fprobe_hash(fp);
+ for (i = 0; i < hlist_array->size && !ret; i++)
+ ret = insert_fprobe_node(&hlist_array->array[i], fp);
+
+ if (ret) {
+ unregister_fprobe_nolock(fp);
+ /* In error case, wait for clean up safely. */
+ synchronize_rcu();
+ }
return ret;
}
@@ -913,37 +1054,28 @@ bool fprobe_is_registered(struct fprobe *fp)
return true;
}
-/**
- * unregister_fprobe() - Unregister fprobe.
- * @fp: A fprobe data structure to be unregistered.
- *
- * Unregister fprobe (and remove ftrace hooks from the function entries).
- *
- * Return 0 if @fp is unregistered successfully, -errno if not.
- */
-int unregister_fprobe(struct fprobe *fp)
+static int unregister_fprobe_nolock(struct fprobe *fp)
{
- struct fprobe_hlist *hlist_array;
+ struct fprobe_hlist *hlist_array = fp->hlist_array;
unsigned long *addrs = NULL;
- int ret = 0, i, count;
-
- mutex_lock(&fprobe_mutex);
- if (!fp || !is_fprobe_still_exist(fp)) {
- ret = -EINVAL;
- goto out;
- }
+ int i, count;
- hlist_array = fp->hlist_array;
addrs = kcalloc(hlist_array->size, sizeof(unsigned long), GFP_KERNEL);
- if (!addrs) {
- ret = -ENOMEM; /* TODO: Fallback to one-by-one loop */
- goto out;
- }
+ /*
+ * This will remove fprobe_hlist_node from the hash table even if
+ * memory allocation fails. However, ftrace_ops will not be updated.
+ * Anyway, when the last fprobe is unregistered, ftrace_ops is also
+ * unregistered.
+ */
+ if (!addrs)
+ pr_warn("Failed to allocate working array. ftrace_ops may not sync.\n");
/* Remove non-synonim ips from table and hash */
count = 0;
for (i = 0; i < hlist_array->size; i++) {
- if (!delete_fprobe_node(&hlist_array->array[i]))
+ delete_fprobe_node(&hlist_array->array[i]);
+ if (addrs && !fprobe_exists_on_hash(hlist_array->array[i].addr,
+ fprobe_is_ftrace(fp)))
addrs[count++] = hlist_array->array[i].addr;
}
del_fprobe_hash(fp);
@@ -955,11 +1087,44 @@ int unregister_fprobe(struct fprobe *fp)
kfree_rcu(hlist_array, rcu);
fp->hlist_array = NULL;
+ kfree(addrs);
-out:
- mutex_unlock(&fprobe_mutex);
+ return 0;
+}
- kfree(addrs);
+/**
+ * unregister_fprobe_async() - Unregister fprobe without waiting for an RCU GP
+ * @fp: A fprobe data structure to be unregistered.
+ *
+ * Unregister fprobe (and remove ftrace hooks from the function entries)
+ * while holding fprobe_mutex. Unlike unregister_fprobe(), this function
+ * will NOT wait until the fprobe is no longer used.
+ *
+ * Return 0 if @fp is unregistered successfully, -EINVAL if @fp is NULL or
+ * is not registered.
+ */
+int unregister_fprobe_async(struct fprobe *fp)
+{
+	guard(mutex)(&fprobe_mutex);
+
+	if (fp && fprobe_registered(fp))
+		return unregister_fprobe_nolock(fp);
+
+	return -EINVAL;
+}
+
+/**
+ * unregister_fprobe() - Unregister fprobe with RCU GP wait
+ * @fp: A fprobe data structure to be unregistered.
+ *
+ * Unregister fprobe (and remove ftrace hooks from the function entries).
+ * This function will block until the fprobe is no longer used.
+ *
+ * Return 0 if @fp is unregistered successfully, -errno if not.
+ */
+int unregister_fprobe(struct fprobe *fp)
+{
+ int ret = unregister_fprobe_async(fp);
+
+ if (!ret)
+ synchronize_rcu();
return ret;
}
EXPORT_SYMBOL_GPL(unregister_fprobe);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 413310912609..b2611de3f594 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -6841,7 +6841,8 @@ bool ftrace_filter_param __initdata;
static int __init set_ftrace_notrace(char *str)
{
ftrace_filter_param = true;
- strscpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
+ trace_append_boot_param(ftrace_notrace_buf, str, ',',
+ FTRACE_FILTER_SIZE);
return 1;
}
__setup("ftrace_notrace=", set_ftrace_notrace);
@@ -6849,7 +6850,8 @@ __setup("ftrace_notrace=", set_ftrace_notrace);
static int __init set_ftrace_filter(char *str)
{
ftrace_filter_param = true;
- strscpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
+ trace_append_boot_param(ftrace_filter_buf, str, ',',
+ FTRACE_FILTER_SIZE);
return 1;
}
__setup("ftrace_filter=", set_ftrace_filter);
@@ -6861,14 +6863,16 @@ static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer);
static int __init set_graph_function(char *str)
{
- strscpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
+ trace_append_boot_param(ftrace_graph_buf, str, ',',
+ FTRACE_FILTER_SIZE);
return 1;
}
__setup("ftrace_graph_filter=", set_graph_function);
static int __init set_graph_notrace_function(char *str)
{
- strscpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE);
+ trace_append_boot_param(ftrace_graph_notrace_buf, str, ',',
+ FTRACE_FILTER_SIZE);
return 1;
}
__setup("ftrace_graph_notrace=", set_graph_notrace_function);
@@ -9267,6 +9271,15 @@ static int kallsyms_callback(void *data, const char *name, unsigned long addr)
* @addrs array, which needs to be big enough to store at least @cnt
* addresses.
*
+ * For a single symbol (cnt == 1), uses kallsyms_lookup_name() which
+ * performs an O(log N) binary search via the sorted kallsyms index.
+ * This avoids the full O(N) linear scan over all kernel symbols that
+ * the multi-symbol path requires.
+ *
+ * For multiple symbols, uses a single-pass linear scan via
+ * kallsyms_on_each_symbol() with binary search into the sorted input
+ * array.
+ *
* Returns: 0 if all provided symbols are found, -ESRCH otherwise.
*/
int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs)
@@ -9274,6 +9287,19 @@ int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *a
struct kallsyms_data args;
int found_all;
+ /* Fast path: single symbol uses O(log N) binary search */
+ if (cnt == 1) {
+ addrs[0] = kallsyms_lookup_name(sorted_syms[0]);
+ if (addrs[0] && ftrace_location(addrs[0]))
+ return 0;
+ /*
+ * Binary lookup can fail for duplicate symbol names
+ * where the first match is not ftrace-instrumented.
+ * Retry with linear scan.
+ */
+ }
+
+ /* Batch path: single-pass O(N) linear scan */
memset(addrs, 0, sizeof(*addrs) * cnt);
args.addrs = addrs;
args.syms = sorted_syms;
diff --git a/kernel/trace/remote_test.c b/kernel/trace/remote_test.c
new file mode 100644
index 000000000000..a3e2c9b606eb
--- /dev/null
+++ b/kernel/trace/remote_test.c
@@ -0,0 +1,261 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2025 - Google LLC
+ * Author: Vincent Donnefort <vdonnefort@google.com>
+ */
+
+#include <linux/module.h>
+#include <linux/simple_ring_buffer.h>
+#include <linux/trace_remote.h>
+#include <linux/tracefs.h>
+#include <linux/types.h>
+
+#define REMOTE_EVENT_INCLUDE_FILE kernel/trace/remote_test_events.h
+#include <trace/define_remote_events.h>
+
+static DEFINE_PER_CPU(struct simple_rb_per_cpu *, simple_rbs);
+static struct trace_buffer_desc *remote_test_buffer_desc;
+
+/*
+ * The trace_remote lock already serializes accesses from the trace_remote_callbacks.
+ * However write_event can still race with load/unload.
+ */
+static DEFINE_MUTEX(simple_rbs_lock);
+
+/*
+ * Allocate and initialise the simple ring buffer for @cpu from @rb_desc,
+ * then publish it in the per-CPU simple_rbs slot under simple_rbs_lock.
+ *
+ * Returns 0 on success, -ENOMEM if an allocation fails, or the error
+ * returned by simple_ring_buffer_init(). Both allocations are freed on
+ * failure.
+ */
+static int remote_test_load_simple_rb(int cpu, struct ring_buffer_desc *rb_desc)
+{
+	struct simple_buffer_page *pages;
+	struct simple_rb_per_cpu *rb;
+	int err;
+
+	rb = kmalloc_obj(*rb);
+	if (!rb)
+		return -ENOMEM;
+
+	pages = kmalloc_objs(*pages, rb_desc->nr_page_va);
+	if (!pages) {
+		err = -ENOMEM;
+		goto free_rb;
+	}
+
+	err = simple_ring_buffer_init(rb, pages, rb_desc);
+	if (err)
+		goto free_pages;
+
+	scoped_guard(mutex, &simple_rbs_lock) {
+		struct simple_rb_per_cpu **slot = per_cpu_ptr(&simple_rbs, cpu);
+
+		/* Warn if a buffer is already published for this CPU. */
+		WARN_ON(*slot);
+		*slot = rb;
+	}
+
+	return 0;
+
+free_pages:
+	kfree(pages);
+free_rb:
+	kfree(rb);
+	return err;
+}
+
+/*
+ * Tear down the simple ring buffer published for @cpu, if any: unload it,
+ * free its page array and the buffer itself, then clear the per-CPU slot.
+ * The teardown runs under simple_rbs_lock.
+ */
+static void remote_test_unload_simple_rb(int cpu)
+{
+	struct simple_rb_per_cpu **slot = per_cpu_ptr(&simple_rbs, cpu);
+	struct simple_rb_per_cpu *rb = *slot;
+	struct simple_buffer_page *pages;
+
+	if (!rb)
+		return;
+
+	guard(mutex)(&simple_rbs_lock);
+
+	/* Save the page array pointer before unloading; it is freed afterwards. */
+	pages = rb->bpages;
+	simple_ring_buffer_unload(rb);
+	kfree(pages);
+	kfree(rb);
+	*slot = NULL;
+}
+
+/*
+ * trace_remote load callback: allocate a trace_buffer_desc sized for
+ * num_possible_cpus() and a buffer of @size, then load one simple ring
+ * buffer per ring_buffer_desc it contains.
+ *
+ * Returns the descriptor (also recorded in remote_test_buffer_desc) or an
+ * ERR_PTR(): -EINVAL if a descriptor is already loaded, -E2BIG if the
+ * descriptor size is reported as SIZE_MAX, -ENOMEM if the descriptor cannot
+ * be allocated, or the error from trace_remote_alloc_buffer() or
+ * remote_test_load_simple_rb().
+ */
+static struct trace_buffer_desc *remote_test_load(unsigned long size, void *unused)
+{
+	struct trace_buffer_desc *desc;
+	struct ring_buffer_desc *rb_desc;
+	size_t desc_size;
+	int cpu, err;
+
+	if (WARN_ON(remote_test_buffer_desc))
+		return ERR_PTR(-EINVAL);
+
+	desc_size = trace_buffer_desc_size(size, num_possible_cpus());
+	if (desc_size == SIZE_MAX)
+		return ERR_PTR(-E2BIG);
+
+	desc = kmalloc(desc_size, GFP_KERNEL);
+	if (!desc)
+		return ERR_PTR(-ENOMEM);
+
+	err = trace_remote_alloc_buffer(desc, desc_size, size, cpu_possible_mask);
+	if (err)
+		goto free_desc;
+
+	for_each_ring_buffer_desc(rb_desc, cpu, desc) {
+		err = remote_test_load_simple_rb(rb_desc->cpu, rb_desc);
+		if (err)
+			goto unload_rbs;
+	}
+
+	remote_test_buffer_desc = desc;
+
+	return desc;
+
+unload_rbs:
+	/* Unloading returns early for CPUs whose slot is still NULL. */
+	for_each_ring_buffer_desc(rb_desc, cpu, desc)
+		remote_test_unload_simple_rb(rb_desc->cpu);
+	trace_remote_free_buffer(desc);
+
+free_desc:
+	kfree(desc);
+
+	return ERR_PTR(err);
+}
+
+/*
+ * trace_remote unload callback: unload every per-CPU simple ring buffer
+ * described by @desc, forget the descriptor and free it. Warns and does
+ * nothing if @desc is not the currently loaded descriptor.
+ */
+static void remote_test_unload(struct trace_buffer_desc *desc, void *unused)
+{
+	struct ring_buffer_desc *rb_desc;
+	int cpu;
+
+	if (WARN_ON(remote_test_buffer_desc != desc))
+		return;
+
+	for_each_ring_buffer_desc(rb_desc, cpu, desc)
+		remote_test_unload_simple_rb(rb_desc->cpu);
+
+	remote_test_buffer_desc = NULL;
+
+	trace_remote_free_buffer(desc);
+	kfree(desc);
+}
+
+/*
+ * trace_remote callback: switch tracing on or off (@enable) for the simple
+ * ring buffer of every ring_buffer_desc in the loaded descriptor.
+ *
+ * Returns -ENODEV if no descriptor is loaded, 0 otherwise. A failure to
+ * toggle an individual buffer only triggers a WARN.
+ */
+static int remote_test_enable_tracing(bool enable, void *unused)
+{
+	struct ring_buffer_desc *rb_desc;
+	int cpu;
+
+	if (!remote_test_buffer_desc)
+		return -ENODEV;
+
+	for_each_ring_buffer_desc(rb_desc, cpu, remote_test_buffer_desc) {
+		struct simple_rb_per_cpu *rb = *per_cpu_ptr(&simple_rbs, rb_desc->cpu);
+
+		WARN_ON(simple_ring_buffer_enable_tracing(rb, enable));
+	}
+
+	return 0;
+}
+
+/*
+ * trace_remote callback: swap the reader page of @cpu's simple ring buffer.
+ *
+ * Returns -EINVAL if @cpu is not a possible CPU or has no buffer loaded,
+ * otherwise the result of simple_ring_buffer_swap_reader_page().
+ */
+static int remote_test_swap_reader_page(unsigned int cpu, void *unused)
+{
+	struct simple_rb_per_cpu *cpu_buffer;
+
+	/*
+	 * per_cpu_ptr() is only meaningful for possible CPUs, so a bare
+	 * NR_CPUS bound is not enough. Check nr_cpu_ids first so that
+	 * cpu_possible() is never asked about a bit outside the CPU mask.
+	 */
+	if (cpu >= nr_cpu_ids || !cpu_possible(cpu))
+		return -EINVAL;
+
+	cpu_buffer = *per_cpu_ptr(&simple_rbs, cpu);
+	if (!cpu_buffer)
+		return -EINVAL;
+
+	return simple_ring_buffer_swap_reader_page(cpu_buffer);
+}
+
+/*
+ * trace_remote callback: reset @cpu's simple ring buffer.
+ *
+ * Returns -EINVAL if @cpu is not a possible CPU or has no buffer loaded,
+ * otherwise the result of simple_ring_buffer_reset().
+ */
+static int remote_test_reset(unsigned int cpu, void *unused)
+{
+	struct simple_rb_per_cpu *cpu_buffer;
+
+	/*
+	 * per_cpu_ptr() is only meaningful for possible CPUs, so a bare
+	 * NR_CPUS bound is not enough. Check nr_cpu_ids first so that
+	 * cpu_possible() is never asked about a bit outside the CPU mask.
+	 */
+	if (cpu >= nr_cpu_ids || !cpu_possible(cpu))
+		return -EINVAL;
+
+	cpu_buffer = *per_cpu_ptr(&simple_rbs, cpu);
+	if (!cpu_buffer)
+		return -EINVAL;
+
+	return simple_ring_buffer_reset(cpu_buffer);
+}
+
+/*
+ * trace_remote callback: nothing to do beyond validating @id.
+ *
+ * The module just uses the struct remote_event enabled field that is turned
+ * on and off by trace_remote. This is a bit racy but good enough for a
+ * simple test module.
+ *
+ * Returns 0 for REMOTE_TEST_EVENT_ID, -EINVAL for any other id.
+ */
+static int remote_test_enable_event(unsigned short id, bool enable, void *unused)
+{
+	return id == REMOTE_TEST_EVENT_ID ? 0 : -EINVAL;
+}
+
+/*
+ * Write handler of the "write_event" tracefs file: parse a base-10 unsigned
+ * value from userspace and log it as a selftest event in the current CPU's
+ * simple ring buffer.
+ *
+ * Returns @cnt on success, the kstrtoul_from_user() error on bad input, or
+ * -ENODEV if the selftest event is disabled, no buffer is loaded for this
+ * CPU, or simple_ring_buffer_reserve() returns NULL.
+ */
+static ssize_t
+write_event_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *pos)
+{
+	struct remote_event_format_selftest *entry;
+	struct simple_rb_per_cpu *rb;
+	unsigned long id;
+	int err;
+
+	err = kstrtoul_from_user(ubuf, cnt, 10, &id);
+	if (err)
+		return err;
+
+	/* Load/unload publish and free the buffer under this same lock. */
+	guard(mutex)(&simple_rbs_lock);
+
+	if (!remote_event_selftest.enabled)
+		return -ENODEV;
+
+	/* Preemption stays off from the per-CPU lookup until the commit. */
+	guard(preempt)();
+
+	rb = *this_cpu_ptr(&simple_rbs);
+	if (!rb)
+		return -ENODEV;
+
+	entry = simple_ring_buffer_reserve(rb, sizeof(*entry), trace_clock_global());
+	if (!entry)
+		return -ENODEV;
+
+	entry->hdr.id = REMOTE_TEST_EVENT_ID;
+	entry->id = id;
+
+	simple_ring_buffer_commit(rb);
+
+	return cnt;
+}
+
+static const struct file_operations write_event_fops = {
+ .write = write_event_write, /* only a write handler is provided for this file */
+};
+
+/*
+ * trace_remote init callback: create the "write_event" file (mode 0200)
+ * under @d. Returns 0 on success, -ENOMEM if the file was not created.
+ */
+static int remote_test_init_tracefs(struct dentry *d, void *unused)
+{
+	if (!tracefs_create_file("write_event", 0200, d, NULL, &write_event_fops))
+		return -ENOMEM;
+
+	return 0;
+}
+
+static struct trace_remote_callbacks trace_remote_callbacks = {
+ .init = remote_test_init_tracefs, /* creates the "write_event" tracefs file */
+ .load_trace_buffer = remote_test_load, /* allocates the descriptor and the per-CPU simple ring buffers */
+ .unload_trace_buffer = remote_test_unload, /* frees what remote_test_load() set up */
+ .enable_tracing = remote_test_enable_tracing, /* toggles tracing on each loaded buffer */
+ .swap_reader_page = remote_test_swap_reader_page, /* reader page swap for one CPU */
+ .reset = remote_test_reset, /* buffer reset for one CPU */
+ .enable_event = remote_test_enable_event, /* only validates the event id */
+};
+
+static int __init remote_test_init(void)
+{
+ return trace_remote_register("test", &trace_remote_callbacks, NULL, /* registers this module's callbacks under the name "test" */
+ &remote_event_selftest, 1); /* NOTE(review): the trailing 1 is presumably the event count - confirm against trace_remote_register() */
+}
+
+module_init(remote_test_init);
+
+MODULE_DESCRIPTION("Test module for the trace remote interface");
+MODULE_AUTHOR("Vincent Donnefort");
+MODULE_LICENSE("GPL");
diff --git a/kernel/trace/remote_test_events.h b/kernel/trace/remote_test_events.h
new file mode 100644
index 000000000000..26b93b3406fc
--- /dev/null
+++ b/kernel/trace/remote_test_events.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/* ID of the selftest remote event. */
+#define REMOTE_TEST_EVENT_ID 1
+
+/*
+ * "selftest" remote event: one u64 field named id, printed as "id=%llu".
+ */
+REMOTE_EVENT(selftest, REMOTE_TEST_EVENT_ID,
+ RE_STRUCT(
+ re_field(u64, id)
+ ),
+ RE_PRINTK("id=%llu", __entry->id)
+);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 170170bd83bd..7b07d2004cc6 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4,8 +4,10 @@
*
* Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
*/
+#include <linux/ring_buffer_types.h>
#include <linux/sched/isolation.h>
#include <linux/trace_recursion.h>
+#include <linux/panic_notifier.h>
#include <linux/trace_events.h>
#include <linux/ring_buffer.h>
#include <linux/trace_clock.h>
@@ -30,6 +32,7 @@
#include <linux/oom.h>
#include <linux/mm.h>
+#include <asm/ring_buffer.h>
#include <asm/local64.h>
#include <asm/local.h>
#include <asm/setup.h>
@@ -157,23 +160,6 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
/* Used for individual buffers (after the counter) */
#define RB_BUFFER_OFF (1 << 20)
-#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
-
-#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
-#define RB_ALIGNMENT 4U
-#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
-#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
-
-#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS
-# define RB_FORCE_8BYTE_ALIGNMENT 0
-# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
-#else
-# define RB_FORCE_8BYTE_ALIGNMENT 1
-# define RB_ARCH_ALIGNMENT 8U
-#endif
-
-#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT)
-
/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
@@ -316,10 +302,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
#define for_each_online_buffer_cpu(buffer, cpu) \
for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask)
-#define TS_SHIFT 27
-#define TS_MASK ((1ULL << TS_SHIFT) - 1)
-#define TS_DELTA_TEST (~TS_MASK)
-
static u64 rb_event_time_stamp(struct ring_buffer_event *event)
{
u64 ts;
@@ -338,12 +320,6 @@ static u64 rb_event_time_stamp(struct ring_buffer_event *event)
#define RB_MISSED_MASK (3 << 30)
-struct buffer_data_page {
- u64 time_stamp; /* page time stamp */
- local_t commit; /* write committed index */
- unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */
-};
-
struct buffer_data_read_page {
unsigned order; /* order of the page */
struct buffer_data_page *data; /* actual data, stored in this page */
@@ -437,14 +413,6 @@ static struct buffer_data_page *alloc_cpu_data(int cpu, int order)
return dpage;
}
-/*
- * We need to fit the time_stamp delta into 27 bits.
- */
-static inline bool test_time_stamp(u64 delta)
-{
- return !!(delta & TS_DELTA_TEST);
-}
-
struct rb_irq_work {
struct irq_work work;
wait_queue_head_t waiters;
@@ -555,10 +523,12 @@ struct ring_buffer_per_cpu {
unsigned int mapped;
unsigned int user_mapped; /* user space mapping */
struct mutex mapping_lock;
- unsigned long *subbuf_ids; /* ID to subbuf VA */
+ struct buffer_page **subbuf_ids; /* ID to subbuf VA */
struct trace_buffer_meta *meta_page;
struct ring_buffer_cpu_meta *ring_meta;
+ struct ring_buffer_remote *remote;
+
/* ring buffer pages to update, > 0 to add, < 0 to remove */
long nr_pages_to_update;
struct list_head new_pages; /* new pages to add */
@@ -581,6 +551,8 @@ struct trace_buffer {
struct ring_buffer_per_cpu **buffers;
+ struct ring_buffer_remote *remote;
+
struct hlist_node node;
u64 (*clock)(void);
@@ -589,6 +561,7 @@ struct trace_buffer {
unsigned long range_addr_start;
unsigned long range_addr_end;
+ struct notifier_block flush_nb;
struct ring_buffer_meta *meta;
@@ -627,16 +600,17 @@ int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq
(unsigned int)sizeof(field.commit),
(unsigned int)is_signed_type(long));
- trace_seq_printf(s, "\tfield: int overwrite;\t"
+ trace_seq_printf(s, "\tfield: char overwrite;\t"
"offset:%u;\tsize:%u;\tsigned:%u;\n",
(unsigned int)offsetof(typeof(field), commit),
1,
- (unsigned int)is_signed_type(long));
+ (unsigned int)is_signed_type(char));
trace_seq_printf(s, "\tfield: char data;\t"
"offset:%u;\tsize:%u;\tsigned:%u;\n",
(unsigned int)offsetof(typeof(field), data),
- (unsigned int)buffer->subbuf_size,
+ (unsigned int)(buffer ? buffer->subbuf_size :
+ PAGE_SIZE - BUF_PAGE_HDR_SIZE),
(unsigned int)is_signed_type(char));
return !trace_seq_has_overflowed(s);
@@ -1913,7 +1887,7 @@ static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu)
static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
{
struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
- struct buffer_page *head_page, *orig_head;
+ struct buffer_page *head_page, *orig_head, *orig_reader;
unsigned long entry_bytes = 0;
unsigned long entries = 0;
int ret;
@@ -1924,16 +1898,17 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
return;
orig_head = head_page = cpu_buffer->head_page;
+ orig_reader = cpu_buffer->reader_page;
/* Do the reader page first */
- ret = rb_validate_buffer(cpu_buffer->reader_page->page, cpu_buffer->cpu);
+ ret = rb_validate_buffer(orig_reader->page, cpu_buffer->cpu);
if (ret < 0) {
pr_info("Ring buffer reader page is invalid\n");
goto invalid;
}
entries += ret;
- entry_bytes += local_read(&cpu_buffer->reader_page->page->commit);
- local_set(&cpu_buffer->reader_page->entries, ret);
+ entry_bytes += local_read(&orig_reader->page->commit);
+ local_set(&orig_reader->entries, ret);
ts = head_page->page->time_stamp;
@@ -2036,8 +2011,8 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
/* Iterate until finding the commit page */
for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) {
- /* Reader page has already been done */
- if (head_page == cpu_buffer->reader_page)
+ /* The original reader page has already been checked/counted. */
+ if (head_page == orig_reader)
continue;
ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
@@ -2238,6 +2213,40 @@ static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer,
}
}
+/*
+ * ring_buffer_desc - find the ring_buffer_desc of @cpu inside @trace_desc
+ *
+ * A first guess is made by stepping @cpu times the size of the first
+ * descriptor from the first descriptor. If that guess is past the end of
+ * @trace_desc or does not carry @cpu, every descriptor is walked instead.
+ *
+ * Returns NULL if @trace_desc is NULL, @cpu is not below nr_cpus, or no
+ * descriptor matches @cpu.
+ */
+static struct ring_buffer_desc *ring_buffer_desc(struct trace_buffer_desc *trace_desc, int cpu)
+{
+	struct ring_buffer_desc *desc, *limit;
+	size_t stride;
+	int i;
+
+	if (!trace_desc || cpu >= trace_desc->nr_cpus)
+		return NULL;
+
+	limit = (struct ring_buffer_desc *)((void *)trace_desc + trace_desc->struct_len);
+
+	/* Guess: same-sized descriptors, one per CPU, starting at CPU 0 */
+	desc = __first_ring_buffer_desc(trace_desc);
+	stride = struct_size(desc, page_va, desc->nr_page_va);
+	desc = (struct ring_buffer_desc *)((void *)desc + (stride * cpu));
+
+	if (desc < limit && desc->cpu == cpu)
+		return desc;
+
+	/* Missing CPUs, need to linear search */
+	for_each_ring_buffer_desc(desc, i, trace_desc) {
+		if (desc->cpu == cpu)
+			return desc;
+	}
+
+	return NULL;
+}
+
+/*
+ * Return the address stored in desc->page_va[] for @page_id, or NULL when
+ * @page_id is not below desc->nr_page_va.
+ */
+static void *ring_buffer_desc_page(struct ring_buffer_desc *desc, unsigned int page_id)
+{
+	if (page_id >= desc->nr_page_va)
+		return NULL;
+
+	return (void *)desc->page_va[page_id];
+}
+
static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
long nr_pages, struct list_head *pages)
{
@@ -2245,6 +2254,7 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_cpu_meta *meta = NULL;
struct buffer_page *bpage, *tmp;
bool user_thread = current->mm != NULL;
+ struct ring_buffer_desc *desc = NULL;
long i;
/*
@@ -2273,6 +2283,12 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
if (buffer->range_addr_start)
meta = rb_range_meta(buffer, nr_pages, cpu_buffer->cpu);
+ if (buffer->remote) {
+ desc = ring_buffer_desc(buffer->remote->desc, cpu_buffer->cpu);
+ if (!desc || WARN_ON(desc->nr_page_va != (nr_pages + 1)))
+ return -EINVAL;
+ }
+
for (i = 0; i < nr_pages; i++) {
bpage = alloc_cpu_page(cpu_buffer->cpu);
@@ -2297,6 +2313,16 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
rb_meta_buffer_update(cpu_buffer, bpage);
bpage->range = 1;
bpage->id = i + 1;
+ } else if (desc) {
+ void *p = ring_buffer_desc_page(desc, i + 1);
+
+ if (WARN_ON(!p))
+ goto free_pages;
+
+ bpage->page = p;
+ bpage->range = 1; /* bpage->page can't be freed */
+ bpage->id = i + 1;
+ cpu_buffer->subbuf_ids[i + 1] = bpage;
} else {
int order = cpu_buffer->buffer->subbuf_order;
bpage->page = alloc_cpu_data(cpu_buffer->cpu, order);
@@ -2394,6 +2420,30 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
if (cpu_buffer->ring_meta->head_buffer)
rb_meta_buffer_update(cpu_buffer, bpage);
bpage->range = 1;
+ } else if (buffer->remote) {
+ struct ring_buffer_desc *desc = ring_buffer_desc(buffer->remote->desc, cpu);
+
+ if (!desc)
+ goto fail_free_reader;
+
+ cpu_buffer->remote = buffer->remote;
+ cpu_buffer->meta_page = (struct trace_buffer_meta *)(void *)desc->meta_va;
+ cpu_buffer->nr_pages = nr_pages;
+ cpu_buffer->subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1,
+ sizeof(*cpu_buffer->subbuf_ids), GFP_KERNEL);
+ if (!cpu_buffer->subbuf_ids)
+ goto fail_free_reader;
+
+ /* Remote buffers are read-only and immutable */
+ atomic_inc(&cpu_buffer->record_disabled);
+ atomic_inc(&cpu_buffer->resize_disabled);
+
+ bpage->page = ring_buffer_desc_page(desc, cpu_buffer->meta_page->reader.id);
+ if (!bpage->page)
+ goto fail_free_reader;
+
+ bpage->range = 1;
+ cpu_buffer->subbuf_ids[0] = bpage;
} else {
int order = cpu_buffer->buffer->subbuf_order;
bpage->page = alloc_cpu_data(cpu, order);
@@ -2453,6 +2503,9 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
irq_work_sync(&cpu_buffer->irq_work.work);
+ if (cpu_buffer->remote)
+ kfree(cpu_buffer->subbuf_ids);
+
free_buffer_page(cpu_buffer->reader_page);
if (head) {
@@ -2471,11 +2524,22 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
kfree(cpu_buffer);
}
+/*
+ * Notifier callback for a persistent buffer: turn recording off on the
+ * trace_buffer embedding @nb, then let the architecture flush the buffer's
+ * range_addr_start..range_addr_end range if it needs to.
+ * @event and @data are unused. Always returns NOTIFY_DONE.
+ */
+static int rb_flush_buffer_cb(struct notifier_block *nb, unsigned long event, void *data)
+{
+	struct trace_buffer *rb;
+
+	rb = container_of(nb, struct trace_buffer, flush_nb);
+
+	ring_buffer_record_off(rb);
+	arch_ring_buffer_flush_range(rb->range_addr_start, rb->range_addr_end);
+
+	return NOTIFY_DONE;
+}
+
static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
int order, unsigned long start,
unsigned long end,
unsigned long scratch_size,
- struct lock_class_key *key)
+ struct lock_class_key *key,
+ struct ring_buffer_remote *remote)
{
struct trace_buffer *buffer __free(kfree) = NULL;
long nr_pages;
@@ -2515,6 +2579,8 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
if (!buffer->buffers)
goto fail_free_cpumask;
+ cpu = raw_smp_processor_id();
+
/* If start/end are specified, then that overrides size */
if (start && end) {
unsigned long buffers_start;
@@ -2570,6 +2636,15 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
buffer->range_addr_end = end;
rb_range_meta_init(buffer, nr_pages, scratch_size);
+ } else if (remote) {
+ struct ring_buffer_desc *desc = ring_buffer_desc(remote->desc, cpu);
+
+ buffer->remote = remote;
+ /* The writer is remote. This ring-buffer is read-only */
+ atomic_inc(&buffer->record_disabled);
+ nr_pages = desc->nr_page_va - 1;
+ if (nr_pages < 2)
+ goto fail_free_buffers;
} else {
/* need at least two pages */
@@ -2578,7 +2653,6 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
nr_pages = 2;
}
- cpu = raw_smp_processor_id();
cpumask_set_cpu(cpu, buffer->cpumask);
buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
if (!buffer->buffers[cpu])
@@ -2590,6 +2664,12 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
mutex_init(&buffer->mutex);
+ /* Persistent ring buffer needs to flush cache before reboot. */
+ if (start && end) {
+ buffer->flush_nb.notifier_call = rb_flush_buffer_cb;
+ atomic_notifier_chain_register(&panic_notifier_list, &buffer->flush_nb);
+ }
+
return_ptr(buffer);
fail_free_buffers:
@@ -2620,7 +2700,7 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
struct lock_class_key *key)
{
/* Default buffer page size - one system page */
- return alloc_buffer(size, flags, 0, 0, 0, 0, key);
+ return alloc_buffer(size, flags, 0, 0, 0, 0, key, NULL);
}
EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
@@ -2647,7 +2727,18 @@ struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flag
struct lock_class_key *key)
{
return alloc_buffer(size, flags, order, start, start + range_size,
- scratch_size, key);
+ scratch_size, key, NULL);
+}
+
+/**
+ * __ring_buffer_alloc_remote - allocate a new ring_buffer from a remote
+ * @remote: Contains a description of the ring-buffer pages and remote callbacks.
+ * @key: ring buffer reader_lock_key.
+ *
+ * Size, flags, order, range and scratch size are all passed as 0 to
+ * alloc_buffer(); only @key and @remote are forwarded.
+ *
+ * Return: whatever alloc_buffer() returns.
+ */
+struct trace_buffer *__ring_buffer_alloc_remote(struct ring_buffer_remote *remote,
+ struct lock_class_key *key)
+{
+ return alloc_buffer(0, 0, 0, 0, 0, 0, key, remote);
}
void *ring_buffer_meta_scratch(struct trace_buffer *buffer, unsigned int *size)
@@ -2677,6 +2768,9 @@ ring_buffer_free(struct trace_buffer *buffer)
{
int cpu;
+ if (buffer->range_addr_start && buffer->range_addr_end)
+ atomic_notifier_chain_unregister(&panic_notifier_list, &buffer->flush_nb);
+
cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
irq_work_sync(&buffer->irq_work.work);
@@ -4435,18 +4529,20 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
ret = rb_read_data_buffer(bpage, tail, cpu_buffer->cpu, &ts, &delta);
if (ret < 0) {
if (delta < ts) {
- buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n",
- cpu_buffer->cpu, ts, delta);
+ buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld clock:%pS\n",
+ cpu_buffer->cpu, ts, delta,
+ cpu_buffer->buffer->clock);
goto out;
}
}
if ((full && ts > info->ts) ||
(!full && ts + info->delta != info->ts)) {
- buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n",
+ buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\ntrace clock:%pS",
cpu_buffer->cpu,
ts + info->delta, info->ts, info->delta,
info->before, info->after,
- full ? " (full)" : "", show_interrupt_level());
+ full ? " (full)" : "", show_interrupt_level(),
+ cpu_buffer->buffer->clock);
}
out:
atomic_dec(this_cpu_ptr(&checking));
@@ -5274,14 +5370,66 @@ unsigned long ring_buffer_overruns(struct trace_buffer *buffer)
}
EXPORT_SYMBOL_GPL(ring_buffer_overruns);
+/*
+ * Copy the entries, overrun, pages_touched and pages_lost counters from the
+ * meta-page into @cpu_buffer.
+ *
+ * Returns true if rb_num_of_entries() is non-zero after the update.
+ */
+static bool rb_read_remote_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	struct trace_buffer_meta *meta = cpu_buffer->meta_page;
+
+	local_set(&cpu_buffer->entries, READ_ONCE(meta->entries));
+	local_set(&cpu_buffer->overrun, READ_ONCE(meta->overrun));
+	local_set(&cpu_buffer->pages_touched, READ_ONCE(meta->pages_touched));
+	local_set(&cpu_buffer->pages_lost, READ_ONCE(meta->pages_lost));
+
+	return rb_num_of_entries(cpu_buffer) != 0;
+}
+
+/*
+ * Re-sync the local head_page and commit_page of @cpu_buffer using only the
+ * time stamps found in the sub-buffer pages.
+ *
+ * First loop: move head_page forward (and the HEAD flag with it) for as long
+ * as its time stamp is greater than the one of the page following it.
+ * Second loop: start commit_page at the new head and move it forward for as
+ * long as its time stamp is lower than the one of the page following it.
+ *
+ * Each loop warns once and returns when it has come back to its starting
+ * page for the third time.
+ */
+static void rb_update_remote_head(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct buffer_page *next, *orig;
+ int retry = 3;
+
+ /* next is kept one page ahead of the page being examined */
+ orig = next = cpu_buffer->head_page;
+ rb_inc_page(&next);
+
+ /* Run after the writer */
+ while (cpu_buffer->head_page->page->time_stamp > next->page->time_stamp) {
+ rb_inc_page(&next);
+
+ /* Move the HEAD flag along with head_page */
+ rb_list_head_clear(cpu_buffer->head_page->list.prev);
+ rb_inc_page(&cpu_buffer->head_page);
+ rb_set_list_to_head(cpu_buffer->head_page->list.prev);
+
+ if (cpu_buffer->head_page == orig) {
+ if (WARN_ON_ONCE(!(--retry)))
+ return;
+ }
+ }
+
+ /* The commit page search restarts from the head found above */
+ orig = cpu_buffer->commit_page = cpu_buffer->head_page;
+ retry = 3;
+
+ while (cpu_buffer->commit_page->page->time_stamp < next->page->time_stamp) {
+ rb_inc_page(&next);
+ rb_inc_page(&cpu_buffer->commit_page);
+
+ if (cpu_buffer->commit_page == orig) {
+ if (WARN_ON_ONCE(!(--retry)))
+ return;
+ }
+ }
+}
+
static void rb_iter_reset(struct ring_buffer_iter *iter)
{
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+ if (cpu_buffer->remote) {
+ rb_read_remote_meta_page(cpu_buffer);
+ rb_update_remote_head(cpu_buffer);
+ }
+
/* Iterator usage is expected to have record disabled */
iter->head_page = cpu_buffer->reader_page;
iter->head = cpu_buffer->reader_page->read;
iter->next_event = iter->head;
+ iter->missed_events = 0;
iter->cache_reader_page = iter->head_page;
iter->cache_read = cpu_buffer->read;
@@ -5428,7 +5576,65 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
}
static struct buffer_page *
-rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
+__rb_get_reader_page_from_remote(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ /*
+ * Reader-page lookup for a remote buffer:
+ * - refresh the counters from the meta-page, return NULL if there is
+ * no entry;
+ * - keep returning the current reader page while it has unread data;
+ * - otherwise call the remote's swap_reader_page() and mirror the swap
+ * in the local page list.
+ *
+ * Returns the reader page, or NULL if there is no entry, the reader id
+ * read back from the meta-page is above nr_pages, or the new reader
+ * page is empty.
+ *
+ * NOTE(review): meta_page->reader.id indexes subbuf_ids[] before the
+ * swap without the range check that is applied after it, and a failing
+ * swap_reader_page() only warns -- confirm both are intended.
+ */
+ struct buffer_page *new_reader, *prev_reader, *prev_head, *new_head, *last;
+
+ if (!rb_read_remote_meta_page(cpu_buffer))
+ return NULL;
+
+ /* More to read on the reader page */
+ if (cpu_buffer->reader_page->read < rb_page_size(cpu_buffer->reader_page)) {
+ if (!cpu_buffer->reader_page->read)
+ cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp;
+ return cpu_buffer->reader_page;
+ }
+
+ /* Remember the current reader before the remote replaces it */
+ prev_reader = cpu_buffer->subbuf_ids[cpu_buffer->meta_page->reader.id];
+
+ WARN_ON_ONCE(cpu_buffer->remote->swap_reader_page(cpu_buffer->cpu,
+ cpu_buffer->remote->priv));
+ /* nr_pages doesn't include the reader page */
+ if (WARN_ON_ONCE(cpu_buffer->meta_page->reader.id > cpu_buffer->nr_pages))
+ return NULL;
+
+ new_reader = cpu_buffer->subbuf_ids[cpu_buffer->meta_page->reader.id];
+
+ WARN_ON_ONCE(prev_reader == new_reader);
+
+ prev_head = new_reader; /* New reader was also the previous head */
+ new_head = prev_head;
+ rb_inc_page(&new_head);
+ last = prev_head;
+ rb_dec_page(&last);
+
+ /* Clear the old HEAD flag */
+ rb_list_head_clear(cpu_buffer->head_page->list.prev);
+
+ /* prev_reader takes over the list links of the previous head */
+ prev_reader->list.next = prev_head->list.next;
+ prev_reader->list.prev = prev_head->list.prev;
+
+ /* Swap prev_reader with new_reader */
+ last->list.next = &prev_reader->list;
+ new_head->list.prev = &prev_reader->list;
+
+ /* The new reader is out of the list: prev is itself, next is the new head */
+ new_reader->list.prev = &new_reader->list;
+ new_reader->list.next = &new_head->list;
+
+ /* Reactivate the HEAD flag */
+ rb_set_list_to_head(&last->list);
+
+ cpu_buffer->head_page = new_head;
+ cpu_buffer->reader_page = new_reader;
+ cpu_buffer->pages = &new_head->list;
+ cpu_buffer->read_stamp = new_reader->page->time_stamp;
+ cpu_buffer->lost_events = cpu_buffer->meta_page->reader.lost_events;
+
+ return rb_page_size(cpu_buffer->reader_page) ? cpu_buffer->reader_page : NULL;
+}
+
+static struct buffer_page *
+__rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *reader = NULL;
unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size);
@@ -5598,6 +5804,13 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
return reader;
}
+/* Pick the remote or the local reader-page lookup for @cpu_buffer. */
+static struct buffer_page *
+rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	if (cpu_buffer->remote)
+		return __rb_get_reader_page_from_remote(cpu_buffer);
+
+	return __rb_get_reader_page(cpu_buffer);
+}
+
static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
{
struct ring_buffer_event *event;
@@ -5896,10 +6109,7 @@ ring_buffer_peek(struct trace_buffer *buffer, int cpu, u64 *ts,
*/
bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter)
{
- bool ret = iter->missed_events != 0;
-
- iter->missed_events = 0;
- return ret;
+ return iter->missed_events != 0;
}
EXPORT_SYMBOL_GPL(ring_buffer_iter_dropped);
@@ -6061,7 +6271,7 @@ void ring_buffer_iter_advance(struct ring_buffer_iter *iter)
unsigned long flags;
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
-
+ iter->missed_events = 0;
rb_advance_iter(iter);
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
@@ -6154,6 +6364,8 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
meta->entries = local_read(&cpu_buffer->entries);
meta->overrun = local_read(&cpu_buffer->overrun);
meta->read = cpu_buffer->read;
+ meta->pages_lost = local_read(&cpu_buffer->pages_lost);
+ meta->pages_touched = local_read(&cpu_buffer->pages_touched);
/* Some archs do not have data cache coherency between kernel and user-space */
flush_kernel_vmap_range(cpu_buffer->meta_page, PAGE_SIZE);
@@ -6164,6 +6376,23 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *page;
+ if (cpu_buffer->remote) {
+ if (!cpu_buffer->remote->reset)
+ return;
+
+ cpu_buffer->remote->reset(cpu_buffer->cpu, cpu_buffer->remote->priv);
+ rb_read_remote_meta_page(cpu_buffer);
+
+ /* Read related values, not covered by the meta-page */
+ local_set(&cpu_buffer->pages_read, 0);
+ cpu_buffer->read = 0;
+ cpu_buffer->read_bytes = 0;
+ cpu_buffer->last_overrun = 0;
+ cpu_buffer->reader_page->read = 0;
+
+ return;
+ }
+
rb_head_page_deactivate(cpu_buffer);
cpu_buffer->head_page
@@ -6394,6 +6623,46 @@ bool ring_buffer_empty_cpu(struct trace_buffer *buffer, int cpu)
}
EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
+/**
+ * ring_buffer_poll_remote - refresh counters from the meta-page and wake up waiters
+ * @buffer: the ring buffer to poll
+ * @cpu: the CPU buffer to poll, or RING_BUFFER_ALL_CPUS
+ *
+ * For a single CPU, the counters are refreshed under the reader_lock and
+ * rb_wakeups() is called if the buffer has entries. For RING_BUFFER_ALL_CPUS,
+ * the counters of every CPU buffer are refreshed first, then rb_wakeups() is
+ * called for each CPU buffer that has entries.
+ *
+ * NOTE(review): nothing here checks that @buffer was allocated from a remote;
+ * callers presumably guarantee it -- confirm.
+ *
+ * Return: 0 on success, -EINVAL if @cpu is not in the buffer's cpumask.
+ */
+int ring_buffer_poll_remote(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (cpu != RING_BUFFER_ALL_CPUS) {
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return -EINVAL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ /* Scope-based lock: released when leaving this block */
+ guard(raw_spinlock)(&cpu_buffer->reader_lock);
+ if (rb_read_remote_meta_page(cpu_buffer))
+ rb_wakeups(buffer, cpu_buffer);
+
+ return 0;
+ }
+
+ guard(cpus_read_lock)();
+
+ /*
+ * Make sure all the ring buffers are up to date before we start reading
+ * them.
+ */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ /* Lock scope is one loop iteration */
+ guard(raw_spinlock)(&cpu_buffer->reader_lock);
+ rb_read_remote_meta_page(cpu_buffer);
+ }
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ if (rb_num_of_entries(cpu_buffer))
+ rb_wakeups(buffer, cpu_buffer);
+ }
+
+ return 0;
+}
+
#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
/**
* ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
@@ -6632,6 +6901,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
unsigned int commit;
unsigned int read;
u64 save_timestamp;
+ bool force_memcpy;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return -1;
@@ -6669,6 +6939,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
/* Check if any events were dropped */
missed_events = cpu_buffer->lost_events;
+ force_memcpy = cpu_buffer->mapped || cpu_buffer->remote;
+
/*
* If this page has been partially read or
* if len is not big enough to read the rest of the page or
@@ -6678,7 +6950,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
*/
if (read || (len < (commit - read)) ||
cpu_buffer->reader_page == cpu_buffer->commit_page ||
- cpu_buffer->mapped) {
+ force_memcpy) {
struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
unsigned int rpos = read;
unsigned int pos = 0;
@@ -7034,7 +7306,7 @@ static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
}
static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
- unsigned long *subbuf_ids)
+ struct buffer_page **subbuf_ids)
{
struct trace_buffer_meta *meta = cpu_buffer->meta_page;
unsigned int nr_subbufs = cpu_buffer->nr_pages + 1;
@@ -7043,7 +7315,7 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
int id = 0;
id = rb_page_id(cpu_buffer, cpu_buffer->reader_page, id);
- subbuf_ids[id++] = (unsigned long)cpu_buffer->reader_page->page;
+ subbuf_ids[id++] = cpu_buffer->reader_page;
cnt++;
first_subbuf = subbuf = rb_set_head_page(cpu_buffer);
@@ -7053,7 +7325,7 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
if (WARN_ON(id >= nr_subbufs))
break;
- subbuf_ids[id] = (unsigned long)subbuf->page;
+ subbuf_ids[id] = subbuf;
rb_inc_page(&subbuf);
id++;
@@ -7062,7 +7334,7 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
WARN_ON(cnt != nr_subbufs);
- /* install subbuf ID to kern VA translation */
+ /* install subbuf ID to bpage translation */
cpu_buffer->subbuf_ids = subbuf_ids;
meta->meta_struct_len = sizeof(*meta);
@@ -7218,13 +7490,15 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
}
while (p < nr_pages) {
+ struct buffer_page *subbuf;
struct page *page;
int off = 0;
if (WARN_ON_ONCE(s >= nr_subbufs))
return -EINVAL;
- page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]);
+ subbuf = cpu_buffer->subbuf_ids[s];
+ page = virt_to_page((void *)subbuf->page);
for (; off < (1 << (subbuf_order)); off++, page++) {
if (p >= nr_pages)
@@ -7251,10 +7525,11 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
struct vm_area_struct *vma)
{
struct ring_buffer_per_cpu *cpu_buffer;
- unsigned long flags, *subbuf_ids;
+ struct buffer_page **subbuf_ids;
+ unsigned long flags;
int err;
- if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ if (!cpumask_test_cpu(cpu, buffer->cpumask) || buffer->remote)
return -EINVAL;
cpu_buffer = buffer->buffers[cpu];
@@ -7275,7 +7550,7 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
if (err)
return err;
- /* subbuf_ids include the reader while nr_pages does not */
+ /* subbuf_ids includes the reader while nr_pages does not */
subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL);
if (!subbuf_ids) {
rb_free_meta_page(cpu_buffer);
@@ -7468,6 +7743,12 @@ out:
return 0;
}
+/* Callback run on every CPU via on_each_cpu(); @data is unused. */
+static void rb_cpu_sync(void *data)
+{
+ /* Not really needed, but documents what is happening */
+ smp_rmb();
+}
+
/*
* We only allocate new buffers, never free them if the CPU goes down.
* If we were to free the buffer, then the user would lose any trace that was in
@@ -7506,7 +7787,18 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node)
cpu);
return -ENOMEM;
}
- smp_wmb();
+
+ /*
+ * Ensure trace_buffer readers observe the newly allocated
+ * ring_buffer_per_cpu before they check the cpumask. Instead of using a
+ * read barrier for all readers, send an IPI.
+ */
+ if (unlikely(system_state == SYSTEM_RUNNING)) {
+ on_each_cpu(rb_cpu_sync, NULL, 1);
+ /* Not really needed, but documents what is happening */
+ smp_wmb();
+ }
+
cpumask_set_cpu(cpu, buffer->cpumask);
return 0;
}
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
index 5b4be87ba59d..3884b14df375 100644
--- a/kernel/trace/rv/Kconfig
+++ b/kernel/trace/rv/Kconfig
@@ -23,6 +23,19 @@ config LTL_MON_EVENTS_ID
config RV_LTL_MONITOR
bool
+config RV_HA_MONITOR
+ bool
+
+config HA_MON_EVENTS_IMPLICIT
+ select DA_MON_EVENTS_IMPLICIT
+ select RV_HA_MONITOR
+ bool
+
+config HA_MON_EVENTS_ID
+ select DA_MON_EVENTS_ID
+ select RV_HA_MONITOR
+ bool
+
menuconfig RV
bool "Runtime Verification"
select TRACING
@@ -65,6 +78,11 @@ source "kernel/trace/rv/monitors/pagefault/Kconfig"
source "kernel/trace/rv/monitors/sleep/Kconfig"
# Add new rtapp monitors here
+source "kernel/trace/rv/monitors/stall/Kconfig"
+source "kernel/trace/rv/monitors/deadline/Kconfig"
+source "kernel/trace/rv/monitors/nomiss/Kconfig"
+# Add new deadline monitors here
+
# Add new monitors here
config RV_REACTORS
diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile
index 750e4ad6fa0f..94498da35b37 100644
--- a/kernel/trace/rv/Makefile
+++ b/kernel/trace/rv/Makefile
@@ -17,6 +17,9 @@ obj-$(CONFIG_RV_MON_STS) += monitors/sts/sts.o
obj-$(CONFIG_RV_MON_NRP) += monitors/nrp/nrp.o
obj-$(CONFIG_RV_MON_SSSW) += monitors/sssw/sssw.o
obj-$(CONFIG_RV_MON_OPID) += monitors/opid/opid.o
+obj-$(CONFIG_RV_MON_STALL) += monitors/stall/stall.o
+obj-$(CONFIG_RV_MON_DEADLINE) += monitors/deadline/deadline.o
+obj-$(CONFIG_RV_MON_NOMISS) += monitors/nomiss/nomiss.o
# Add new monitors here
obj-$(CONFIG_RV_REACTORS) += rv_reactors.o
obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o
diff --git a/kernel/trace/rv/monitors/deadline/Kconfig b/kernel/trace/rv/monitors/deadline/Kconfig
new file mode 100644
index 000000000000..38804a6ad91d
--- /dev/null
+++ b/kernel/trace/rv/monitors/deadline/Kconfig
@@ -0,0 +1,10 @@
+config RV_MON_DEADLINE
+ depends on RV
+ bool "deadline monitor"
+ help
+ Collection of monitors to check that the deadline scheduler and server
+ behave according to specifications. Enable this to enable all
+ scheduler specifications supported by the current kernel.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_deadline.rst
diff --git a/kernel/trace/rv/monitors/deadline/deadline.c b/kernel/trace/rv/monitors/deadline/deadline.c
new file mode 100644
index 000000000000..d566d4542ebf
--- /dev/null
+++ b/kernel/trace/rv/monitors/deadline/deadline.c
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <linux/kallsyms.h>
+
+#define MODULE_NAME "deadline"
+
+#include "deadline.h"
+
+/*
+ * The "deadline" container monitor: it has no enable, disable or reset
+ * callback of its own and starts with .enabled = 0.
+ */
+struct rv_monitor rv_deadline = {
+ .name = "deadline",
+ .description = "container for several deadline scheduler specifications.",
+ .enable = NULL,
+ .disable = NULL,
+ .reset = NULL,
+ .enabled = 0,
+};
+
+/*
+ * Used by other monitors.
+ * Set by register_deadline() from a kallsyms lookup of "ext_sched_class" when
+ * CONFIG_SCHED_CLASS_EXT is enabled; stays NULL otherwise or if the lookup
+ * fails.
+ */
+struct sched_class *rv_ext_sched_class;
+
+/*
+ * Module init: resolve ext_sched_class when CONFIG_SCHED_CLASS_EXT is enabled
+ * (a failed lookup only warns), then register the deadline container monitor.
+ *
+ * Returns the result of rv_register_monitor().
+ */
+static int __init register_deadline(void)
+{
+	if (IS_ENABLED(CONFIG_SCHED_CLASS_EXT)) {
+		unsigned long addr = kallsyms_lookup_name("ext_sched_class");
+
+		rv_ext_sched_class = (void *)addr;
+		if (!addr)
+			pr_warn("rv: Missing ext_sched_class, monitors may not work.\n");
+	}
+
+	return rv_register_monitor(&rv_deadline, NULL);
+}
+
+/* Module exit: unregister the deadline container monitor. */
+static void __exit unregister_deadline(void)
+{
+ rv_unregister_monitor(&rv_deadline);
+}
+
+module_init(register_deadline);
+module_exit(unregister_deadline);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("deadline: container for several deadline scheduler specifications.");
diff --git a/kernel/trace/rv/monitors/deadline/deadline.h b/kernel/trace/rv/monitors/deadline/deadline.h
new file mode 100644
index 000000000000..0bbfd2543329
--- /dev/null
+++ b/kernel/trace/rv/monitors/deadline/deadline.h
@@ -0,0 +1,202 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/kernel.h>
+#include <linux/uaccess.h>
+#include <linux/sched/deadline.h>
+#include <asm/syscall.h>
+#include <uapi/linux/sched/types.h>
+#include <trace/events/sched.h>
+
+/*
+ * Dummy values if not available: strictly negative even if __COUNTER__ is 0
+ */
+#ifndef __NR_sched_setscheduler
+#define __NR_sched_setscheduler (-__COUNTER__ - 1)
+#endif
+#ifndef __NR_sched_setattr
+#define __NR_sched_setattr (-__COUNTER__ - 1)
+#endif
+
+extern struct rv_monitor rv_deadline;
+/* Initialised when registering the deadline container */
+extern struct sched_class *rv_ext_sched_class;
+
+/*
+ * If both have dummy values, the syscalls are not supported and we don't even
+ * need to register the handler.
+ */
+static inline bool should_skip_syscall_handle(void)
+{
+ return __NR_sched_setattr < 0 && __NR_sched_setscheduler < 0;
+}
+
+/*
+ * is_supported_type - return true if @type is supported by the deadline monitors
+ */
+static inline bool is_supported_type(u8 type)
+{
+ return type == DL_TASK || type == DL_SERVER_FAIR || type == DL_SERVER_EXT;
+}
+
+/*
+ * is_server_type - return true if @type is a supported server
+ */
+static inline bool is_server_type(u8 type)
+{
+ return is_supported_type(type) && type != DL_TASK;
+}
+
+/*
+ * Use non-positive numbers for the servers; cpu may be any expression.
+ * Currently only one fair server per CPU, may change in the future.
+ */
+#define fair_server_id(cpu) (-(cpu))
+#define ext_server_id(cpu) (-(cpu) - num_possible_cpus())
+#define NO_SERVER_ID (-2 * num_possible_cpus())
+/*
+ * Get a unique id used for dl entities
+ *
+ * The cpu is not required for tasks as the pid is used there, if this function
+ * is called on a dl_se that for sure corresponds to a task, DL_TASK can be
+ * used in place of cpu.
+ * We need the cpu for servers as it is provided in the tracepoint and we
+ * cannot easily retrieve it from the dl_se (requires the struct rq definition).
+ */
+static inline int get_entity_id(struct sched_dl_entity *dl_se, int cpu, u8 type)
+{
+ if (dl_server(dl_se) && type != DL_TASK) {
+ if (type == DL_SERVER_FAIR)
+ return fair_server_id(cpu);
+ if (type == DL_SERVER_EXT)
+ return ext_server_id(cpu);
+ return NO_SERVER_ID;
+ }
+ return dl_task_of(dl_se)->pid;
+}
+
+static inline bool task_is_scx_enabled(struct task_struct *tsk)
+{
+ return IS_ENABLED(CONFIG_SCHED_CLASS_EXT) &&
+ tsk->sched_class == rv_ext_sched_class;
+}
+
+/* Expand id and target as arguments for da functions */
+#define EXPAND_ID(dl_se, cpu, type) get_entity_id(dl_se, cpu, type), dl_se
+#define EXPAND_ID_TASK(tsk) get_entity_id(&tsk->dl, task_cpu(tsk), DL_TASK), &tsk->dl
+
+static inline u8 get_server_type(struct task_struct *tsk)
+{
+ if (tsk->policy == SCHED_NORMAL || tsk->policy == SCHED_EXT ||
+ tsk->policy == SCHED_BATCH || tsk->policy == SCHED_IDLE)
+ return task_is_scx_enabled(tsk) ? DL_SERVER_EXT : DL_SERVER_FAIR;
+ return DL_OTHER;
+}
+
+static inline int extract_params(struct pt_regs *regs, long id, pid_t *pid_out)
+{
+ size_t size = offsetofend(struct sched_attr, sched_flags);
+ struct sched_attr __user *uattr, attr;
+ int new_policy = -1, ret;
+ unsigned long args[6];
+
+ switch (id) {
+ case __NR_sched_setscheduler:
+ syscall_get_arguments(current, regs, args);
+ *pid_out = args[0];
+ new_policy = args[1];
+ break;
+ case __NR_sched_setattr:
+ syscall_get_arguments(current, regs, args);
+ *pid_out = args[0];
+ uattr = (struct sched_attr __user *)args[1];
+ /*
+ * Just copy up to sched_flags, we are not interested after that
+ */
+ ret = copy_struct_from_user(&attr, size, uattr, size);
+ if (ret)
+ return ret;
+ if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
+ return -EINVAL;
+ new_policy = attr.sched_policy;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return new_policy & ~SCHED_RESET_ON_FORK;
+}
+
+/* Helper functions requiring DA/HA utilities */
+#ifdef RV_MON_TYPE
+
+/*
+ * get_server - get the server of the given type associated to a task
+ *
+ * If the task is a boosted task, the server is available in the task_struct,
+ * otherwise grab the dl entity saved for the CPU where the task is enqueued.
+ * This function assumes the task is enqueued somewhere.
+ */
+static inline struct sched_dl_entity *get_server(struct task_struct *tsk, u8 type)
+{
+ if (tsk->dl_server && get_server_type(tsk) == type)
+ return tsk->dl_server;
+ if (type == DL_SERVER_FAIR)
+ return da_get_target_by_id(fair_server_id(task_cpu(tsk)));
+ if (type == DL_SERVER_EXT)
+ return da_get_target_by_id(ext_server_id(task_cpu(tsk)));
+ return NULL;
+}
+
+/*
+ * Initialise monitors for all tasks and pre-allocate the storage for servers.
+ * This is necessary since we don't have access to the servers here and
+ * allocation can cause deadlocks from their tracepoints. We can only fill
+ * pre-initialised storage from there.
+ */
+static inline int init_storage(bool skip_tasks)
+{
+ struct task_struct *g, *p;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ if (!da_create_empty_storage(fair_server_id(cpu)))
+ goto fail;
+ if (IS_ENABLED(CONFIG_SCHED_CLASS_EXT) &&
+ !da_create_empty_storage(ext_server_id(cpu)))
+ goto fail;
+ }
+
+ if (skip_tasks)
+ return 0;
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, p) {
+ if (p->policy == SCHED_DEADLINE) {
+ if (!da_create_storage(EXPAND_ID_TASK(p), NULL)) {
+ read_unlock(&tasklist_lock);
+ goto fail;
+ }
+ }
+ }
+ read_unlock(&tasklist_lock);
+ return 0;
+
+fail:
+ da_monitor_destroy();
+ return -ENOMEM;
+}
+
+static void __maybe_unused handle_newtask(void *data, struct task_struct *task, u64 flags)
+{
+ /* Might be superfluous as tasks are not started with this policy.. */
+ if (task->policy == SCHED_DEADLINE)
+ da_create_storage(EXPAND_ID_TASK(task), NULL);
+}
+
+static void __maybe_unused handle_exit(void *data, struct task_struct *p, bool group_dead)
+{
+ if (p->policy == SCHED_DEADLINE)
+ da_destroy_storage(get_entity_id(&p->dl, DL_TASK, DL_TASK));
+}
+
+#endif
diff --git a/kernel/trace/rv/monitors/nomiss/Kconfig b/kernel/trace/rv/monitors/nomiss/Kconfig
new file mode 100644
index 000000000000..e1886c3a0dd9
--- /dev/null
+++ b/kernel/trace/rv/monitors/nomiss/Kconfig
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_NOMISS
+ depends on RV
+ depends on HAVE_SYSCALL_TRACEPOINTS
+ depends on RV_MON_DEADLINE
+ default y
+ select HA_MON_EVENTS_ID
+ bool "nomiss monitor"
+ help
+ Monitor to ensure dl entities run to completion before their deadline.
+ This monitor is part of the deadline monitors collection.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_deadline.rst
diff --git a/kernel/trace/rv/monitors/nomiss/nomiss.c b/kernel/trace/rv/monitors/nomiss/nomiss.c
new file mode 100644
index 000000000000..31f90f3638d8
--- /dev/null
+++ b/kernel/trace/rv/monitors/nomiss/nomiss.c
@@ -0,0 +1,293 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+
+#define MODULE_NAME "nomiss"
+
+#include <uapi/linux/sched/types.h>
+#include <trace/events/syscalls.h>
+#include <trace/events/sched.h>
+#include <trace/events/task.h>
+#include <rv_trace.h>
+
+#define RV_MON_TYPE RV_MON_PER_OBJ
+#define HA_TIMER_TYPE HA_TIMER_WHEEL
+/* The start condition is on sched_switch, it's dangerous to allocate there */
+#define DA_SKIP_AUTO_ALLOC
+typedef struct sched_dl_entity *monitor_target;
+#include "nomiss.h"
+#include <rv/ha_monitor.h>
+#include <monitors/deadline/deadline.h>
+
+/*
+ * User configurable deadline threshold. If the total utilisation of deadline
+ * tasks is larger than 1, they are only guaranteed bounded tardiness. See
+ * Documentation/scheduler/sched-deadline.rst for more details.
+ * The minimum tardiness without sched_feat(HRTICK_DL) is 1 tick to accommodate
+ * for throttle enforced on the next tick.
+ */
+static u64 deadline_thresh = TICK_NSEC;
+module_param(deadline_thresh, ullong, 0644);
+#define DEADLINE_NS(ha_mon) (ha_get_target(ha_mon)->dl_deadline + deadline_thresh)
+
+static u64 ha_get_env(struct ha_monitor *ha_mon, enum envs_nomiss env, u64 time_ns)
+{
+ if (env == clk_nomiss)
+ return ha_get_clk_ns(ha_mon, env, time_ns);
+ else if (env == is_constr_dl_nomiss)
+ return !dl_is_implicit(ha_get_target(ha_mon));
+ else if (env == is_defer_nomiss)
+ return ha_get_target(ha_mon)->dl_defer;
+ return ENV_INVALID_VALUE;
+}
+
+static void ha_reset_env(struct ha_monitor *ha_mon, enum envs_nomiss env, u64 time_ns)
+{
+ if (env == clk_nomiss)
+ ha_reset_clk_ns(ha_mon, env, time_ns);
+}
+
+static inline bool ha_verify_invariants(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ if (curr_state == ready_nomiss)
+ return ha_check_invariant_ns(ha_mon, clk_nomiss, time_ns);
+ else if (curr_state == running_nomiss)
+ return ha_check_invariant_ns(ha_mon, clk_nomiss, time_ns);
+ return true;
+}
+
+static inline void ha_convert_inv_guard(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ if (curr_state == next_state)
+ return;
+ if (curr_state == ready_nomiss)
+ ha_inv_to_guard(ha_mon, clk_nomiss, DEADLINE_NS(ha_mon), time_ns);
+ else if (curr_state == running_nomiss)
+ ha_inv_to_guard(ha_mon, clk_nomiss, DEADLINE_NS(ha_mon), time_ns);
+}
+
+static inline bool ha_verify_guards(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ bool res = true;
+
+ if (curr_state == ready_nomiss && event == dl_replenish_nomiss)
+ ha_reset_env(ha_mon, clk_nomiss, time_ns);
+ else if (curr_state == ready_nomiss && event == dl_throttle_nomiss)
+ res = ha_get_env(ha_mon, is_defer_nomiss, time_ns) == 1ull;
+ else if (curr_state == idle_nomiss && event == dl_replenish_nomiss)
+ ha_reset_env(ha_mon, clk_nomiss, time_ns);
+ else if (curr_state == running_nomiss && event == dl_replenish_nomiss)
+ ha_reset_env(ha_mon, clk_nomiss, time_ns);
+ else if (curr_state == sleeping_nomiss && event == dl_replenish_nomiss)
+ ha_reset_env(ha_mon, clk_nomiss, time_ns);
+ else if (curr_state == sleeping_nomiss && event == dl_throttle_nomiss)
+ res = ha_get_env(ha_mon, is_constr_dl_nomiss, time_ns) == 1ull ||
+ ha_get_env(ha_mon, is_defer_nomiss, time_ns) == 1ull;
+ else if (curr_state == throttled_nomiss && event == dl_replenish_nomiss)
+ ha_reset_env(ha_mon, clk_nomiss, time_ns);
+ return res;
+}
+
+static inline void ha_setup_invariants(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ if (next_state == curr_state && event != dl_replenish_nomiss)
+ return;
+ if (next_state == ready_nomiss)
+ ha_start_timer_ns(ha_mon, clk_nomiss, DEADLINE_NS(ha_mon), time_ns);
+ else if (next_state == running_nomiss)
+ ha_start_timer_ns(ha_mon, clk_nomiss, DEADLINE_NS(ha_mon), time_ns);
+ else if (curr_state == ready_nomiss)
+ ha_cancel_timer(ha_mon);
+ else if (curr_state == running_nomiss)
+ ha_cancel_timer(ha_mon);
+}
+
+static bool ha_verify_constraint(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ if (!ha_verify_invariants(ha_mon, curr_state, event, next_state, time_ns))
+ return false;
+
+ ha_convert_inv_guard(ha_mon, curr_state, event, next_state, time_ns);
+
+ if (!ha_verify_guards(ha_mon, curr_state, event, next_state, time_ns))
+ return false;
+
+ ha_setup_invariants(ha_mon, curr_state, event, next_state, time_ns);
+
+ return true;
+}
+
+static void handle_dl_replenish(void *data, struct sched_dl_entity *dl_se,
+ int cpu, u8 type)
+{
+ if (is_supported_type(type))
+ da_handle_event(EXPAND_ID(dl_se, cpu, type), dl_replenish_nomiss);
+}
+
+static void handle_dl_throttle(void *data, struct sched_dl_entity *dl_se,
+ int cpu, u8 type)
+{
+ if (is_supported_type(type))
+ da_handle_event(EXPAND_ID(dl_se, cpu, type), dl_throttle_nomiss);
+}
+
+static void handle_dl_server_stop(void *data, struct sched_dl_entity *dl_se,
+ int cpu, u8 type)
+{
+ /*
+ * This isn't the standard use of da_handle_start_run_event since this
+ * event can occur not only from the initial state.
+ * It is fine to use here because it always leads to a known state and
+ * the fact we "pretend" the transition starts from the initial state
+ * has no side effect.
+ */
+ if (is_supported_type(type))
+ da_handle_start_run_event(EXPAND_ID(dl_se, cpu, type), dl_server_stop_nomiss);
+}
+
+static inline void handle_server_switch(struct task_struct *next, int cpu, u8 type)
+{
+ struct sched_dl_entity *dl_se = get_server(next, type);
+
+ if (dl_se && is_idle_task(next))
+ da_handle_event(EXPAND_ID(dl_se, cpu, type), dl_server_idle_nomiss);
+}
+
+static void handle_sched_switch(void *data, bool preempt,
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned int prev_state)
+{
+ int cpu = task_cpu(next);
+
+ if (prev_state != TASK_RUNNING && !preempt && prev->policy == SCHED_DEADLINE)
+ da_handle_event(EXPAND_ID_TASK(prev), sched_switch_suspend_nomiss);
+ if (next->policy == SCHED_DEADLINE)
+ da_handle_start_run_event(EXPAND_ID_TASK(next), sched_switch_in_nomiss);
+
+ /*
+ * The server is available in next only if the next task is boosted,
+ * otherwise we need to retrieve it.
+ * Here the server continues in the state running/armed until actually
+ * stopped, this works since we continue expecting a throttle.
+ */
+ if (next->dl_server)
+ da_handle_start_event(EXPAND_ID(next->dl_server, cpu,
+ get_server_type(next)),
+ sched_switch_in_nomiss);
+ else {
+ handle_server_switch(next, cpu, DL_SERVER_FAIR);
+ if (IS_ENABLED(CONFIG_SCHED_CLASS_EXT))
+ handle_server_switch(next, cpu, DL_SERVER_EXT);
+ }
+}
+
+static void handle_sys_enter(void *data, struct pt_regs *regs, long id)
+{
+ struct task_struct *p;
+ int new_policy = -1;
+ pid_t pid = 0;
+
+ new_policy = extract_params(regs, id, &pid);
+ if (new_policy < 0)
+ return;
+ guard(rcu)();
+ p = pid ? find_task_by_vpid(pid) : current;
+ if (unlikely(!p) || new_policy == p->policy)
+ return;
+
+ if (p->policy == SCHED_DEADLINE)
+ da_reset(EXPAND_ID_TASK(p));
+ else if (new_policy == SCHED_DEADLINE)
+ da_create_or_get(EXPAND_ID_TASK(p));
+}
+
+static void handle_sched_wakeup(void *data, struct task_struct *tsk)
+{
+ if (tsk->policy == SCHED_DEADLINE)
+ da_handle_event(EXPAND_ID_TASK(tsk), sched_wakeup_nomiss);
+}
+
+static int enable_nomiss(void)
+{
+ int retval;
+
+ retval = da_monitor_init();
+ if (retval)
+ return retval;
+
+ retval = init_storage(false);
+ if (retval)
+ return retval;
+ rv_attach_trace_probe("nomiss", sched_dl_replenish_tp, handle_dl_replenish);
+ rv_attach_trace_probe("nomiss", sched_dl_throttle_tp, handle_dl_throttle);
+ rv_attach_trace_probe("nomiss", sched_dl_server_stop_tp, handle_dl_server_stop);
+ rv_attach_trace_probe("nomiss", sched_switch, handle_sched_switch);
+ rv_attach_trace_probe("nomiss", sched_wakeup, handle_sched_wakeup);
+ if (!should_skip_syscall_handle())
+ rv_attach_trace_probe("nomiss", sys_enter, handle_sys_enter);
+ rv_attach_trace_probe("nomiss", task_newtask, handle_newtask);
+ rv_attach_trace_probe("nomiss", sched_process_exit, handle_exit);
+
+ return 0;
+}
+
+static void disable_nomiss(void)
+{
+ rv_this.enabled = 0;
+
+ /* Those are RCU writers, detach earlier hoping to close a bit faster */
+ rv_detach_trace_probe("nomiss", task_newtask, handle_newtask);
+ rv_detach_trace_probe("nomiss", sched_process_exit, handle_exit);
+ if (!should_skip_syscall_handle())
+ rv_detach_trace_probe("nomiss", sys_enter, handle_sys_enter);
+
+ rv_detach_trace_probe("nomiss", sched_dl_replenish_tp, handle_dl_replenish);
+ rv_detach_trace_probe("nomiss", sched_dl_throttle_tp, handle_dl_throttle);
+ rv_detach_trace_probe("nomiss", sched_dl_server_stop_tp, handle_dl_server_stop);
+ rv_detach_trace_probe("nomiss", sched_switch, handle_sched_switch);
+ rv_detach_trace_probe("nomiss", sched_wakeup, handle_sched_wakeup);
+
+ da_monitor_destroy();
+}
+
+static struct rv_monitor rv_this = {
+ .name = "nomiss",
+ .description = "dl entities run to completion before their deadline.",
+ .enable = enable_nomiss,
+ .disable = disable_nomiss,
+ .reset = da_monitor_reset_all,
+ .enabled = 0,
+};
+
+static int __init register_nomiss(void)
+{
+ return rv_register_monitor(&rv_this, &rv_deadline);
+}
+
+static void __exit unregister_nomiss(void)
+{
+ rv_unregister_monitor(&rv_this);
+}
+
+module_init(register_nomiss);
+module_exit(unregister_nomiss);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("nomiss: dl entities run to completion before their deadline.");
diff --git a/kernel/trace/rv/monitors/nomiss/nomiss.h b/kernel/trace/rv/monitors/nomiss/nomiss.h
new file mode 100644
index 000000000000..3d1b436194d7
--- /dev/null
+++ b/kernel/trace/rv/monitors/nomiss/nomiss.h
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of nomiss automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+#define MONITOR_NAME nomiss
+
+enum states_nomiss {
+ ready_nomiss,
+ idle_nomiss,
+ running_nomiss,
+ sleeping_nomiss,
+ throttled_nomiss,
+ state_max_nomiss,
+};
+
+#define INVALID_STATE state_max_nomiss
+
+enum events_nomiss {
+ dl_replenish_nomiss,
+ dl_server_idle_nomiss,
+ dl_server_stop_nomiss,
+ dl_throttle_nomiss,
+ sched_switch_in_nomiss,
+ sched_switch_suspend_nomiss,
+ sched_wakeup_nomiss,
+ event_max_nomiss,
+};
+
+enum envs_nomiss {
+ clk_nomiss,
+ is_constr_dl_nomiss,
+ is_defer_nomiss,
+ env_max_nomiss,
+ env_max_stored_nomiss = is_constr_dl_nomiss,
+};
+
+_Static_assert(env_max_stored_nomiss <= MAX_HA_ENV_LEN, "Not enough slots");
+#define HA_CLK_NS
+
+struct automaton_nomiss {
+ char *state_names[state_max_nomiss];
+ char *event_names[event_max_nomiss];
+ char *env_names[env_max_nomiss];
+ unsigned char function[state_max_nomiss][event_max_nomiss];
+ unsigned char initial_state;
+ bool final_states[state_max_nomiss];
+};
+
+static const struct automaton_nomiss automaton_nomiss = {
+ .state_names = {
+ "ready",
+ "idle",
+ "running",
+ "sleeping",
+ "throttled",
+ },
+ .event_names = {
+ "dl_replenish",
+ "dl_server_idle",
+ "dl_server_stop",
+ "dl_throttle",
+ "sched_switch_in",
+ "sched_switch_suspend",
+ "sched_wakeup",
+ },
+ .env_names = {
+ "clk",
+ "is_constr_dl",
+ "is_defer",
+ },
+ .function = {
+ {
+ ready_nomiss,
+ idle_nomiss,
+ sleeping_nomiss,
+ throttled_nomiss,
+ running_nomiss,
+ INVALID_STATE,
+ ready_nomiss,
+ },
+ {
+ ready_nomiss,
+ idle_nomiss,
+ sleeping_nomiss,
+ throttled_nomiss,
+ running_nomiss,
+ INVALID_STATE,
+ INVALID_STATE,
+ },
+ {
+ running_nomiss,
+ idle_nomiss,
+ sleeping_nomiss,
+ throttled_nomiss,
+ running_nomiss,
+ sleeping_nomiss,
+ running_nomiss,
+ },
+ {
+ ready_nomiss,
+ sleeping_nomiss,
+ sleeping_nomiss,
+ throttled_nomiss,
+ running_nomiss,
+ INVALID_STATE,
+ ready_nomiss,
+ },
+ {
+ ready_nomiss,
+ throttled_nomiss,
+ INVALID_STATE,
+ throttled_nomiss,
+ INVALID_STATE,
+ throttled_nomiss,
+ throttled_nomiss,
+ },
+ },
+ .initial_state = ready_nomiss,
+ .final_states = { 1, 0, 0, 0, 0 },
+};
diff --git a/kernel/trace/rv/monitors/nomiss/nomiss_trace.h b/kernel/trace/rv/monitors/nomiss/nomiss_trace.h
new file mode 100644
index 000000000000..42e7efaca4e7
--- /dev/null
+++ b/kernel/trace/rv/monitors/nomiss/nomiss_trace.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_NOMISS
+DEFINE_EVENT(event_da_monitor_id, event_nomiss,
+ TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(id, state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor_id, error_nomiss,
+ TP_PROTO(int id, char *state, char *event),
+ TP_ARGS(id, state, event));
+
+DEFINE_EVENT(error_env_da_monitor_id, error_env_nomiss,
+ TP_PROTO(int id, char *state, char *event, char *env),
+ TP_ARGS(id, state, event, env));
+#endif /* CONFIG_RV_MON_NOMISS */
diff --git a/kernel/trace/rv/monitors/opid/Kconfig b/kernel/trace/rv/monitors/opid/Kconfig
index 561d32da572b..6d02e239b684 100644
--- a/kernel/trace/rv/monitors/opid/Kconfig
+++ b/kernel/trace/rv/monitors/opid/Kconfig
@@ -2,18 +2,13 @@
#
config RV_MON_OPID
depends on RV
- depends on TRACE_IRQFLAGS
- depends on TRACE_PREEMPT_TOGGLE
depends on RV_MON_SCHED
- default y if PREEMPT_RT
- select DA_MON_EVENTS_IMPLICIT
+ default y
+ select HA_MON_EVENTS_IMPLICIT
bool "opid monitor"
help
Monitor to ensure operations like wakeup and need resched occur with
- interrupts and preemption disabled or during IRQs, where preemption
- may not be disabled explicitly.
-
- This monitor is unstable on !PREEMPT_RT, say N unless you are testing it.
+ interrupts and preemption disabled.
For further information, see:
Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/opid/opid.c b/kernel/trace/rv/monitors/opid/opid.c
index 25a40e90fa40..4594c7c46601 100644
--- a/kernel/trace/rv/monitors/opid/opid.c
+++ b/kernel/trace/rv/monitors/opid/opid.c
@@ -10,94 +10,63 @@
#define MODULE_NAME "opid"
#include <trace/events/sched.h>
-#include <trace/events/irq.h>
-#include <trace/events/preemptirq.h>
#include <rv_trace.h>
#include <monitors/sched/sched.h>
#define RV_MON_TYPE RV_MON_PER_CPU
#include "opid.h"
-#include <rv/da_monitor.h>
+#include <rv/ha_monitor.h>
-#ifdef CONFIG_X86_LOCAL_APIC
-#include <asm/trace/irq_vectors.h>
-
-static void handle_vector_irq_entry(void *data, int vector)
+static u64 ha_get_env(struct ha_monitor *ha_mon, enum envs_opid env, u64 time_ns)
{
- da_handle_event(irq_entry_opid);
-}
-
-static void attach_vector_irq(void)
-{
- rv_attach_trace_probe("opid", local_timer_entry, handle_vector_irq_entry);
- if (IS_ENABLED(CONFIG_IRQ_WORK))
- rv_attach_trace_probe("opid", irq_work_entry, handle_vector_irq_entry);
- if (IS_ENABLED(CONFIG_SMP)) {
- rv_attach_trace_probe("opid", reschedule_entry, handle_vector_irq_entry);
- rv_attach_trace_probe("opid", call_function_entry, handle_vector_irq_entry);
- rv_attach_trace_probe("opid", call_function_single_entry, handle_vector_irq_entry);
+ if (env == irq_off_opid)
+ return irqs_disabled();
+ else if (env == preempt_off_opid) {
+ /*
+ * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables
+ * preemption (adding one to the preempt_count). Since we are
+ * interested in the preempt_count at the time the tracepoint was
+ * hit, we consider 1 as still enabled.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPTION))
+ return (preempt_count() & PREEMPT_MASK) > 1;
+ return true;
}
+ return ENV_INVALID_VALUE;
}
-static void detach_vector_irq(void)
+static inline bool ha_verify_guards(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
{
- rv_detach_trace_probe("opid", local_timer_entry, handle_vector_irq_entry);
- if (IS_ENABLED(CONFIG_IRQ_WORK))
- rv_detach_trace_probe("opid", irq_work_entry, handle_vector_irq_entry);
- if (IS_ENABLED(CONFIG_SMP)) {
- rv_detach_trace_probe("opid", reschedule_entry, handle_vector_irq_entry);
- rv_detach_trace_probe("opid", call_function_entry, handle_vector_irq_entry);
- rv_detach_trace_probe("opid", call_function_single_entry, handle_vector_irq_entry);
- }
+ bool res = true;
+
+ if (curr_state == any_opid && event == sched_need_resched_opid)
+ res = ha_get_env(ha_mon, irq_off_opid, time_ns) == 1ull;
+ else if (curr_state == any_opid && event == sched_waking_opid)
+ res = ha_get_env(ha_mon, irq_off_opid, time_ns) == 1ull &&
+ ha_get_env(ha_mon, preempt_off_opid, time_ns) == 1ull;
+ return res;
}
-#else
-/* We assume irq_entry tracepoints are sufficient on other architectures */
-static void attach_vector_irq(void) { }
-static void detach_vector_irq(void) { }
-#endif
-
-static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip)
+static bool ha_verify_constraint(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
{
- da_handle_event(irq_disable_opid);
-}
+ if (!ha_verify_guards(ha_mon, curr_state, event, next_state, time_ns))
+ return false;
-static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip)
-{
- da_handle_event(irq_enable_opid);
-}
-
-static void handle_irq_entry(void *data, int irq, struct irqaction *action)
-{
- da_handle_event(irq_entry_opid);
-}
-
-static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip)
-{
- da_handle_event(preempt_disable_opid);
-}
-
-static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip)
-{
- da_handle_event(preempt_enable_opid);
+ return true;
}
static void handle_sched_need_resched(void *data, struct task_struct *tsk, int cpu, int tif)
{
- /* The monitor's intitial state is not in_irq */
- if (this_cpu_read(hardirq_context))
- da_handle_event(sched_need_resched_opid);
- else
- da_handle_start_event(sched_need_resched_opid);
+ da_handle_start_run_event(sched_need_resched_opid);
}
static void handle_sched_waking(void *data, struct task_struct *p)
{
- /* The monitor's intitial state is not in_irq */
- if (this_cpu_read(hardirq_context))
- da_handle_event(sched_waking_opid);
- else
- da_handle_start_event(sched_waking_opid);
+ da_handle_start_run_event(sched_waking_opid);
}
static int enable_opid(void)
@@ -108,14 +77,8 @@ static int enable_opid(void)
if (retval)
return retval;
- rv_attach_trace_probe("opid", irq_disable, handle_irq_disable);
- rv_attach_trace_probe("opid", irq_enable, handle_irq_enable);
- rv_attach_trace_probe("opid", irq_handler_entry, handle_irq_entry);
- rv_attach_trace_probe("opid", preempt_disable, handle_preempt_disable);
- rv_attach_trace_probe("opid", preempt_enable, handle_preempt_enable);
rv_attach_trace_probe("opid", sched_set_need_resched_tp, handle_sched_need_resched);
rv_attach_trace_probe("opid", sched_waking, handle_sched_waking);
- attach_vector_irq();
return 0;
}
@@ -124,14 +87,8 @@ static void disable_opid(void)
{
rv_this.enabled = 0;
- rv_detach_trace_probe("opid", irq_disable, handle_irq_disable);
- rv_detach_trace_probe("opid", irq_enable, handle_irq_enable);
- rv_detach_trace_probe("opid", irq_handler_entry, handle_irq_entry);
- rv_detach_trace_probe("opid", preempt_disable, handle_preempt_disable);
- rv_detach_trace_probe("opid", preempt_enable, handle_preempt_enable);
rv_detach_trace_probe("opid", sched_set_need_resched_tp, handle_sched_need_resched);
rv_detach_trace_probe("opid", sched_waking, handle_sched_waking);
- detach_vector_irq();
da_monitor_destroy();
}
diff --git a/kernel/trace/rv/monitors/opid/opid.h b/kernel/trace/rv/monitors/opid/opid.h
index 092992514970..fb0aa4c28aa6 100644
--- a/kernel/trace/rv/monitors/opid/opid.h
+++ b/kernel/trace/rv/monitors/opid/opid.h
@@ -8,30 +8,31 @@
#define MONITOR_NAME opid
enum states_opid {
- disabled_opid,
- enabled_opid,
- in_irq_opid,
- irq_disabled_opid,
- preempt_disabled_opid,
+ any_opid,
state_max_opid,
};
#define INVALID_STATE state_max_opid
enum events_opid {
- irq_disable_opid,
- irq_enable_opid,
- irq_entry_opid,
- preempt_disable_opid,
- preempt_enable_opid,
sched_need_resched_opid,
sched_waking_opid,
event_max_opid,
};
+enum envs_opid {
+ irq_off_opid,
+ preempt_off_opid,
+ env_max_opid,
+ env_max_stored_opid = irq_off_opid,
+};
+
+_Static_assert(env_max_stored_opid <= MAX_HA_ENV_LEN, "Not enough slots");
+
struct automaton_opid {
char *state_names[state_max_opid];
char *event_names[event_max_opid];
+ char *env_names[env_max_opid];
unsigned char function[state_max_opid][event_max_opid];
unsigned char initial_state;
bool final_states[state_max_opid];
@@ -39,68 +40,19 @@ struct automaton_opid {
static const struct automaton_opid automaton_opid = {
.state_names = {
- "disabled",
- "enabled",
- "in_irq",
- "irq_disabled",
- "preempt_disabled",
+ "any",
},
.event_names = {
- "irq_disable",
- "irq_enable",
- "irq_entry",
- "preempt_disable",
- "preempt_enable",
"sched_need_resched",
"sched_waking",
},
+ .env_names = {
+ "irq_off",
+ "preempt_off",
+ },
.function = {
- {
- INVALID_STATE,
- preempt_disabled_opid,
- disabled_opid,
- INVALID_STATE,
- irq_disabled_opid,
- disabled_opid,
- disabled_opid,
- },
- {
- irq_disabled_opid,
- INVALID_STATE,
- INVALID_STATE,
- preempt_disabled_opid,
- enabled_opid,
- INVALID_STATE,
- INVALID_STATE,
- },
- {
- INVALID_STATE,
- enabled_opid,
- in_irq_opid,
- INVALID_STATE,
- INVALID_STATE,
- in_irq_opid,
- in_irq_opid,
- },
- {
- INVALID_STATE,
- enabled_opid,
- in_irq_opid,
- disabled_opid,
- INVALID_STATE,
- irq_disabled_opid,
- INVALID_STATE,
- },
- {
- disabled_opid,
- INVALID_STATE,
- INVALID_STATE,
- INVALID_STATE,
- enabled_opid,
- INVALID_STATE,
- INVALID_STATE,
- },
+ { any_opid, any_opid },
},
- .initial_state = disabled_opid,
- .final_states = { 0, 1, 0, 0, 0 },
+ .initial_state = any_opid,
+ .final_states = { 1 },
};
diff --git a/kernel/trace/rv/monitors/opid/opid_trace.h b/kernel/trace/rv/monitors/opid/opid_trace.h
index 3df6ff955c30..b04005b64208 100644
--- a/kernel/trace/rv/monitors/opid/opid_trace.h
+++ b/kernel/trace/rv/monitors/opid/opid_trace.h
@@ -12,4 +12,8 @@ DEFINE_EVENT(event_da_monitor, event_opid,
DEFINE_EVENT(error_da_monitor, error_opid,
TP_PROTO(char *state, char *event),
TP_ARGS(state, event));
+
+DEFINE_EVENT(error_env_da_monitor, error_env_opid,
+ TP_PROTO(char *state, char *event, char *env),
+ TP_ARGS(state, event, env));
#endif /* CONFIG_RV_MON_OPID */
diff --git a/kernel/trace/rv/monitors/sleep/sleep.c b/kernel/trace/rv/monitors/sleep/sleep.c
index c1347da69e9d..8dfe5ec13e19 100644
--- a/kernel/trace/rv/monitors/sleep/sleep.c
+++ b/kernel/trace/rv/monitors/sleep/sleep.c
@@ -49,6 +49,7 @@ static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bo
ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false);
ltl_atom_set(mon, LTL_FUTEX_WAIT, false);
+ ltl_atom_set(mon, LTL_EPOLL_WAIT, false);
ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false);
ltl_atom_set(mon, LTL_BLOCK_ON_RT_MUTEX, false);
}
@@ -63,6 +64,7 @@ static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bo
ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false);
ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false);
+ ltl_atom_set(mon, LTL_EPOLL_WAIT, false);
if (strstarts(task->comm, "migration/"))
ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, true);
@@ -162,6 +164,11 @@ static void handle_sys_enter(void *data, struct pt_regs *regs, long id)
break;
}
break;
+#ifdef __NR_epoll_wait
+ case __NR_epoll_wait:
+ ltl_atom_update(current, LTL_EPOLL_WAIT, true);
+ break;
+#endif
}
}
@@ -174,6 +181,7 @@ static void handle_sys_exit(void *data, struct pt_regs *regs, long ret)
ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false);
ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false);
ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
+ ltl_atom_set(mon, LTL_EPOLL_WAIT, false);
ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, false);
}
diff --git a/kernel/trace/rv/monitors/sleep/sleep.h b/kernel/trace/rv/monitors/sleep/sleep.h
index 2ab46fd218d2..95dc2727c059 100644
--- a/kernel/trace/rv/monitors/sleep/sleep.h
+++ b/kernel/trace/rv/monitors/sleep/sleep.h
@@ -15,6 +15,7 @@ enum ltl_atom {
LTL_ABORT_SLEEP,
LTL_BLOCK_ON_RT_MUTEX,
LTL_CLOCK_NANOSLEEP,
+ LTL_EPOLL_WAIT,
LTL_FUTEX_LOCK_PI,
LTL_FUTEX_WAIT,
LTL_KERNEL_THREAD,
@@ -40,6 +41,7 @@ static const char *ltl_atom_str(enum ltl_atom atom)
"ab_sl",
"bl_on_rt_mu",
"cl_na",
+ "ep_wa",
"fu_lo_pi",
"fu_wa",
"ker_th",
@@ -75,39 +77,41 @@ static_assert(RV_NUM_BA_STATES <= RV_MAX_BA_STATES);
static void ltl_start(struct task_struct *task, struct ltl_monitor *mon)
{
- bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
- bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
- bool val40 = task_is_rcu || task_is_migration;
- bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
- bool val41 = futex_lock_pi || val40;
- bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
- bool val5 = block_on_rt_mutex || val41;
- bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms);
- bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
- bool val32 = abort_sleep || kthread_should_stop;
bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms);
- bool val33 = woken_by_nmi || val32;
bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms);
- bool val34 = woken_by_hardirq || val33;
bool woken_by_equal_or_higher_prio = test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO,
mon->atoms);
- bool val14 = woken_by_equal_or_higher_prio || val34;
bool wake = test_bit(LTL_WAKE, mon->atoms);
- bool val13 = !wake;
- bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
+ bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
+ bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
+ bool sleep = test_bit(LTL_SLEEP, mon->atoms);
+ bool rt = test_bit(LTL_RT, mon->atoms);
+ bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms);
bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon->atoms);
bool nanosleep_clock_monotonic = test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms);
- bool val24 = nanosleep_clock_monotonic || nanosleep_clock_tai;
- bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms);
- bool val25 = nanosleep_timer_abstime && val24;
- bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
- bool val18 = clock_nanosleep && val25;
+ bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms);
+ bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms);
- bool val9 = futex_wait || val18;
+ bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
+ bool epoll_wait = test_bit(LTL_EPOLL_WAIT, mon->atoms);
+ bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
+ bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
+ bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
+ bool val42 = task_is_rcu || task_is_migration;
+ bool val43 = futex_lock_pi || val42;
+ bool val5 = block_on_rt_mutex || val43;
+ bool val34 = abort_sleep || kthread_should_stop;
+ bool val35 = woken_by_nmi || val34;
+ bool val36 = woken_by_hardirq || val35;
+ bool val14 = woken_by_equal_or_higher_prio || val36;
+ bool val13 = !wake;
+ bool val26 = nanosleep_clock_monotonic || nanosleep_clock_tai;
+ bool val27 = nanosleep_timer_abstime && val26;
+ bool val18 = clock_nanosleep && val27;
+ bool val20 = val18 || epoll_wait;
+ bool val9 = futex_wait || val20;
bool val11 = val9 || kernel_thread;
- bool sleep = test_bit(LTL_SLEEP, mon->atoms);
bool val2 = !sleep;
- bool rt = test_bit(LTL_RT, mon->atoms);
bool val1 = !rt;
bool val3 = val1 || val2;
@@ -124,39 +128,41 @@ static void ltl_start(struct task_struct *task, struct ltl_monitor *mon)
static void
ltl_possible_next_states(struct ltl_monitor *mon, unsigned int state, unsigned long *next)
{
- bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
- bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
- bool val40 = task_is_rcu || task_is_migration;
- bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
- bool val41 = futex_lock_pi || val40;
- bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
- bool val5 = block_on_rt_mutex || val41;
- bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms);
- bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
- bool val32 = abort_sleep || kthread_should_stop;
bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms);
- bool val33 = woken_by_nmi || val32;
bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms);
- bool val34 = woken_by_hardirq || val33;
bool woken_by_equal_or_higher_prio = test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO,
mon->atoms);
- bool val14 = woken_by_equal_or_higher_prio || val34;
bool wake = test_bit(LTL_WAKE, mon->atoms);
- bool val13 = !wake;
- bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
+ bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
+ bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
+ bool sleep = test_bit(LTL_SLEEP, mon->atoms);
+ bool rt = test_bit(LTL_RT, mon->atoms);
+ bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms);
bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon->atoms);
bool nanosleep_clock_monotonic = test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms);
- bool val24 = nanosleep_clock_monotonic || nanosleep_clock_tai;
- bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms);
- bool val25 = nanosleep_timer_abstime && val24;
- bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
- bool val18 = clock_nanosleep && val25;
+ bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms);
+ bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms);
- bool val9 = futex_wait || val18;
+ bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
+ bool epoll_wait = test_bit(LTL_EPOLL_WAIT, mon->atoms);
+ bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
+ bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
+ bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
+ bool val42 = task_is_rcu || task_is_migration;
+ bool val43 = futex_lock_pi || val42;
+ bool val5 = block_on_rt_mutex || val43;
+ bool val34 = abort_sleep || kthread_should_stop;
+ bool val35 = woken_by_nmi || val34;
+ bool val36 = woken_by_hardirq || val35;
+ bool val14 = woken_by_equal_or_higher_prio || val36;
+ bool val13 = !wake;
+ bool val26 = nanosleep_clock_monotonic || nanosleep_clock_tai;
+ bool val27 = nanosleep_timer_abstime && val26;
+ bool val18 = clock_nanosleep && val27;
+ bool val20 = val18 || epoll_wait;
+ bool val9 = futex_wait || val20;
bool val11 = val9 || kernel_thread;
- bool sleep = test_bit(LTL_SLEEP, mon->atoms);
bool val2 = !sleep;
- bool rt = test_bit(LTL_RT, mon->atoms);
bool val1 = !rt;
bool val3 = val1 || val2;
diff --git a/kernel/trace/rv/monitors/stall/Kconfig b/kernel/trace/rv/monitors/stall/Kconfig
new file mode 100644
index 000000000000..6f846b642544
--- /dev/null
+++ b/kernel/trace/rv/monitors/stall/Kconfig
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_STALL
+ depends on RV
+ select HA_MON_EVENTS_ID
+ bool "stall monitor"
+ help
+ Enable the stall sample monitor that illustrates the usage of hybrid
+ automata monitors. It can be used to identify tasks stalled for
+ longer than a threshold.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_stall.rst
diff --git a/kernel/trace/rv/monitors/stall/stall.c b/kernel/trace/rv/monitors/stall/stall.c
new file mode 100644
index 000000000000..9ccfda6b0e73
--- /dev/null
+++ b/kernel/trace/rv/monitors/stall/stall.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+
+#define MODULE_NAME "stall"
+
+#include <trace/events/sched.h>
+#include <rv_trace.h>
+
+#define RV_MON_TYPE RV_MON_PER_TASK
+#define HA_TIMER_TYPE HA_TIMER_WHEEL
+#include "stall.h"
+#include <rv/ha_monitor.h>
+
+/*
+ * Value handed to ha_start_timer_jiffy() when a task moves to the enqueued
+ * state (see ha_setup_invariants()). Exposed as a module parameter with
+ * mode 0644.
+ */
+static u64 threshold_jiffies = 1000;
+module_param(threshold_jiffies, ullong, 0644);
+
+/*
+ * ha_get_env - read the value of an environment variable of the monitor
+ *
+ * Only clk_stall has a value, obtained through ha_get_clk_jiffy(); every
+ * other env yields ENV_INVALID_VALUE. @time_ns is not used here.
+ */
+static u64 ha_get_env(struct ha_monitor *ha_mon, enum envs_stall env, u64 time_ns)
+{
+	if (env != clk_stall)
+		return ENV_INVALID_VALUE;
+
+	return ha_get_clk_jiffy(ha_mon, env);
+}
+
+/*
+ * ha_reset_env - reset an environment variable of the monitor
+ *
+ * clk_stall is reset through ha_reset_clk_jiffy(); any other env is ignored.
+ * @time_ns is not used here.
+ */
+static void ha_reset_env(struct ha_monitor *ha_mon, enum envs_stall env, u64 time_ns)
+{
+	if (env != clk_stall)
+		return;
+
+	ha_reset_clk_jiffy(ha_mon, env);
+}
+
+/*
+ * ha_verify_invariants - check the invariant of the state being left
+ *
+ * Only the enqueued state carries an invariant, checked on clk_stall with
+ * ha_check_invariant_jiffy(). All other states always pass.
+ */
+static inline bool ha_verify_invariants(struct ha_monitor *ha_mon,
+					enum states curr_state, enum events event,
+					enum states next_state, u64 time_ns)
+{
+	if (curr_state != enqueued_stall)
+		return true;
+
+	return ha_check_invariant_jiffy(ha_mon, clk_stall, time_ns);
+}
+
+/*
+ * ha_verify_guards - apply the per-transition actions of the automaton
+ *
+ * clk_stall is reset on a wakeup from the dequeued state and on a preemption
+ * from the running state. No transition has a guard that can fail, so the
+ * result is always true.
+ */
+static inline bool ha_verify_guards(struct ha_monitor *ha_mon,
+				    enum states curr_state, enum events event,
+				    enum states next_state, u64 time_ns)
+{
+	bool woken_while_dequeued = curr_state == dequeued_stall &&
+				    event == sched_wakeup_stall;
+	bool preempted_while_running = curr_state == running_stall &&
+				       event == sched_switch_preempt_stall;
+
+	if (woken_while_dequeued || preempted_while_running)
+		ha_reset_env(ha_mon, clk_stall, time_ns);
+
+	return true;
+}
+
+/*
+ * ha_setup_invariants - arm or cancel the timer on a state change
+ *
+ * Self-loops leave the timer untouched. Entering the enqueued state starts
+ * the jiffy timer on clk_stall with threshold_jiffies; leaving the enqueued
+ * state cancels it.
+ */
+static inline void ha_setup_invariants(struct ha_monitor *ha_mon,
+				       enum states curr_state, enum events event,
+				       enum states next_state, u64 time_ns)
+{
+	if (next_state != curr_state) {
+		if (next_state == enqueued_stall)
+			ha_start_timer_jiffy(ha_mon, clk_stall, threshold_jiffies, time_ns);
+		else if (curr_state == enqueued_stall)
+			ha_cancel_timer(ha_mon);
+	}
+}
+
+/*
+ * ha_verify_constraint - validate a transition and set up the next state
+ *
+ * Invariants are checked first, then guards; the guards are not evaluated
+ * when the invariants fail. Only when both pass are the invariants of the
+ * next state set up.
+ *
+ * Returns true if the transition is allowed, false otherwise.
+ */
+static bool ha_verify_constraint(struct ha_monitor *ha_mon,
+				 enum states curr_state, enum events event,
+				 enum states next_state, u64 time_ns)
+{
+	bool allowed;
+
+	allowed = ha_verify_invariants(ha_mon, curr_state, event, next_state, time_ns) &&
+		  ha_verify_guards(ha_mon, curr_state, event, next_state, time_ns);
+
+	if (allowed)
+		ha_setup_invariants(ha_mon, curr_state, event, next_state, time_ns);
+
+	return allowed;
+}
+
+/*
+ * sched_switch probe.
+ *
+ * For @prev: a non-preempting switch with @prev_state other than
+ * TASK_RUNNING is reported as sched_switch_wait through
+ * da_handle_start_event(); anything else is reported as
+ * sched_switch_preempt through da_handle_event().
+ * @next always receives sched_switch_in.
+ */
+static void handle_sched_switch(void *data, bool preempt,
+				struct task_struct *prev,
+				struct task_struct *next,
+				unsigned int prev_state)
+{
+	bool prev_is_waiting = !preempt && prev_state != TASK_RUNNING;
+
+	if (prev_is_waiting)
+		da_handle_start_event(prev, sched_switch_wait_stall);
+	else
+		da_handle_event(prev, sched_switch_preempt_stall);
+
+	da_handle_event(next, sched_switch_in_stall);
+}
+
+/* sched_wakeup probe: feed the sched_wakeup event to the monitor of task @p. */
+static void handle_sched_wakeup(void *data, struct task_struct *p)
+{
+ da_handle_event(p, sched_wakeup_stall);
+}
+
+/*
+ * Enable callback of the monitor: initialise the monitor storage, then
+ * attach the sched_switch and sched_wakeup probes.
+ *
+ * Returns 0 on success or the error reported by da_monitor_init().
+ */
+static int enable_stall(void)
+{
+	int err = da_monitor_init();
+
+	if (err)
+		return err;
+
+	rv_attach_trace_probe("stall", sched_switch, handle_sched_switch);
+	rv_attach_trace_probe("stall", sched_wakeup, handle_sched_wakeup);
+
+	return 0;
+}
+
+/*
+ * Disable callback of the monitor: clear rv_this.enabled first, then detach
+ * both probes, and finally call da_monitor_destroy().
+ */
+static void disable_stall(void)
+{
+ rv_this.enabled = 0;
+
+ rv_detach_trace_probe("stall", sched_switch, handle_sched_switch);
+ rv_detach_trace_probe("stall", sched_wakeup, handle_sched_wakeup);
+
+ da_monitor_destroy();
+}
+
+/* Monitor descriptor registered by register_stall(); starts out disabled. */
+static struct rv_monitor rv_this = {
+ .name = "stall",
+ .description = "identify tasks stalled for longer than a threshold.",
+ .enable = enable_stall,
+ .disable = disable_stall,
+ .reset = da_monitor_reset_all,
+ .enabled = 0,
+};
+
+/* Module init: register the monitor, passing NULL as the second argument. */
+static int __init register_stall(void)
+{
+ return rv_register_monitor(&rv_this, NULL);
+}
+
+/* Module exit: unregister the monitor. */
+static void __exit unregister_stall(void)
+{
+ rv_unregister_monitor(&rv_this);
+}
+
+module_init(register_stall);
+module_exit(unregister_stall);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("stall: identify tasks stalled for longer than a threshold.");
diff --git a/kernel/trace/rv/monitors/stall/stall.h b/kernel/trace/rv/monitors/stall/stall.h
new file mode 100644
index 000000000000..638520cb1082
--- /dev/null
+++ b/kernel/trace/rv/monitors/stall/stall.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of stall automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+#define MONITOR_NAME stall
+
+/*
+ * States of the stall automaton. state_max_stall is the number of states
+ * and is also used as INVALID_STATE in the transition table.
+ */
+enum states_stall {
+ dequeued_stall,
+ enqueued_stall,
+ running_stall,
+ state_max_stall,
+};
+
+#define INVALID_STATE state_max_stall
+
+/*
+ * Events of the stall automaton, in the column order of the transition
+ * table. event_max_stall is the number of events.
+ */
+enum events_stall {
+ sched_switch_in_stall,
+ sched_switch_preempt_stall,
+ sched_switch_wait_stall,
+ sched_wakeup_stall,
+ event_max_stall,
+};
+
+/*
+ * Environment variables of the stall automaton: a single clock.
+ * env_max_stored_stall equals env_max_stall and is checked against
+ * MAX_HA_ENV_LEN at build time.
+ */
+enum envs_stall {
+ clk_stall,
+ env_max_stall,
+ env_max_stored_stall = env_max_stall,
+};
+
+_Static_assert(env_max_stored_stall <= MAX_HA_ENV_LEN, "Not enough slots");
+
+/*
+ * Table representation of the automaton: printable names for states, events
+ * and envs, the transition function indexed by [state][event], the initial
+ * state and a per-state final flag.
+ */
+struct automaton_stall {
+ char *state_names[state_max_stall];
+ char *event_names[event_max_stall];
+ char *env_names[env_max_stall];
+ unsigned char function[state_max_stall][event_max_stall];
+ unsigned char initial_state;
+ bool final_states[state_max_stall];
+};
+
+/*
+ * The stall automaton. Each row of .function is a state (in enum
+ * states_stall order) and each column an event (in enum events_stall
+ * order); INVALID_STATE marks an event that is not expected in that state.
+ */
+static const struct automaton_stall automaton_stall = {
+ .state_names = {
+ "dequeued",
+ "enqueued",
+ "running",
+ },
+ .event_names = {
+ "sched_switch_in",
+ "sched_switch_preempt",
+ "sched_switch_wait",
+ "sched_wakeup",
+ },
+ .env_names = {
+ "clk",
+ },
+ .function = {
+ /* dequeued */
+ {
+ INVALID_STATE, /* sched_switch_in */
+ INVALID_STATE, /* sched_switch_preempt */
+ INVALID_STATE, /* sched_switch_wait */
+ enqueued_stall, /* sched_wakeup */
+ },
+ /* enqueued */
+ {
+ running_stall, /* sched_switch_in */
+ INVALID_STATE, /* sched_switch_preempt */
+ INVALID_STATE, /* sched_switch_wait */
+ enqueued_stall, /* sched_wakeup */
+ },
+ /* running */
+ {
+ running_stall, /* sched_switch_in */
+ enqueued_stall, /* sched_switch_preempt */
+ dequeued_stall, /* sched_switch_wait */
+ running_stall, /* sched_wakeup */
+ },
+ },
+ .initial_state = dequeued_stall,
+ /* Only dequeued is marked as a final state. */
+ .final_states = { 1, 0, 0 },
+};
diff --git a/kernel/trace/rv/monitors/stall/stall_trace.h b/kernel/trace/rv/monitors/stall/stall_trace.h
new file mode 100644
index 000000000000..6a7cc1b1d040
--- /dev/null
+++ b/kernel/trace/rv/monitors/stall/stall_trace.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_STALL
+DEFINE_EVENT(event_da_monitor_id, event_stall,
+ TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(id, state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor_id, error_stall,
+ TP_PROTO(int id, char *state, char *event),
+ TP_ARGS(id, state, event));
+
+DEFINE_EVENT(error_env_da_monitor_id, error_env_stall,
+ TP_PROTO(int id, char *state, char *event, char *env),
+ TP_ARGS(id, state, event, env));
+#endif /* CONFIG_RV_MON_STALL */
diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h
index 4a6faddac614..9622c269789c 100644
--- a/kernel/trace/rv/rv_trace.h
+++ b/kernel/trace/rv/rv_trace.h
@@ -62,9 +62,39 @@ DECLARE_EVENT_CLASS(error_da_monitor,
#include <monitors/scpd/scpd_trace.h>
#include <monitors/snep/snep_trace.h>
#include <monitors/sts/sts_trace.h>
-#include <monitors/opid/opid_trace.h>
// Add new monitors based on CONFIG_DA_MON_EVENTS_IMPLICIT here
+#ifdef CONFIG_HA_MON_EVENTS_IMPLICIT
+/* For simplicity this class is marked as DA although relevant only for HA */
+DECLARE_EVENT_CLASS(error_env_da_monitor,
+
+ TP_PROTO(char *state, char *event, char *env),
+
+ TP_ARGS(state, event, env),
+
+ TP_STRUCT__entry(
+ __string( state, state )
+ __string( event, event )
+ __string( env, env )
+ ),
+
+ TP_fast_assign(
+ __assign_str(state);
+ __assign_str(event);
+ __assign_str(env);
+ ),
+
+ TP_printk("event %s not expected in the state %s with env %s",
+ __get_str(event),
+ __get_str(state),
+ __get_str(env))
+);
+
+#include <monitors/opid/opid_trace.h>
+// Add new monitors based on CONFIG_HA_MON_EVENTS_IMPLICIT here
+
+#endif
+
#endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */
#ifdef CONFIG_DA_MON_EVENTS_ID
@@ -128,6 +158,41 @@ DECLARE_EVENT_CLASS(error_da_monitor_id,
#include <monitors/sssw/sssw_trace.h>
// Add new monitors based on CONFIG_DA_MON_EVENTS_ID here
+#ifdef CONFIG_HA_MON_EVENTS_ID
+/* For simplicity this class is marked as DA although relevant only for HA */
+DECLARE_EVENT_CLASS(error_env_da_monitor_id,
+
+ TP_PROTO(int id, char *state, char *event, char *env),
+
+ TP_ARGS(id, state, event, env),
+
+ TP_STRUCT__entry(
+ __field( int, id )
+ __string( state, state )
+ __string( event, event )
+ __string( env, env )
+ ),
+
+ TP_fast_assign(
+ __assign_str(state);
+ __assign_str(event);
+ __assign_str(env);
+ __entry->id = id;
+ ),
+
+ TP_printk("%d: event %s not expected in the state %s with env %s",
+ __entry->id,
+ __get_str(event),
+ __get_str(state),
+ __get_str(env))
+);
+
+#include <monitors/stall/stall_trace.h>
+#include <monitors/nomiss/nomiss_trace.h>
+// Add new monitors based on CONFIG_HA_MON_EVENTS_ID here
+
+#endif
+
#endif /* CONFIG_DA_MON_EVENTS_ID */
#ifdef CONFIG_LTL_MON_EVENTS_ID
DECLARE_EVENT_CLASS(event_ltl_monitor_id,
diff --git a/kernel/trace/simple_ring_buffer.c b/kernel/trace/simple_ring_buffer.c
new file mode 100644
index 000000000000..f4642f5adda3
--- /dev/null
+++ b/kernel/trace/simple_ring_buffer.c
@@ -0,0 +1,517 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2025 - Google LLC
+ * Author: Vincent Donnefort <vdonnefort@google.com>
+ */
+
+#include <linux/atomic.h>
+#include <linux/simple_ring_buffer.h>
+
+#include <asm/barrier.h>
+#include <asm/local.h>
+
+/*
+ * Flags OR'ed into the value stored in a page's link.next pointer.
+ * SIMPLE_RB_LINK_HEAD_MOVING has no explicit initializer and therefore
+ * takes the value 2.
+ */
+enum simple_rb_link_type {
+ SIMPLE_RB_LINK_NORMAL = 0,
+ SIMPLE_RB_LINK_HEAD = 1,
+ SIMPLE_RB_LINK_HEAD_MOVING
+};
+
+/* AND with this to strip both flag bits and keep the pointer part of a link. */
+#define SIMPLE_RB_LINK_MASK ~(SIMPLE_RB_LINK_HEAD | SIMPLE_RB_LINK_HEAD_MOVING)
+
+/* Flag @bpage's next link with SIMPLE_RB_LINK_HEAD, dropping any previous flag. */
+static void simple_bpage_set_head_link(struct simple_buffer_page *bpage)
+{
+	unsigned long target = (unsigned long)bpage->link.next & SIMPLE_RB_LINK_MASK;
+
+	/*
+	 * Paired with simple_rb_find_head() to order access between the head
+	 * link and overrun. It ensures we always report an up-to-date value
+	 * after swapping the reader page.
+	 */
+	smp_store_release(&bpage->link.next,
+			  (struct list_head *)(target | SIMPLE_RB_LINK_HEAD));
+}
+
+/*
+ * Atomically replace @bpage's next link, but only if it currently carries
+ * the HEAD flag: the expected value is the current pointer bits tagged with
+ * SIMPLE_RB_LINK_HEAD, the new value is the address of @dst's link tagged
+ * with @new_type.
+ *
+ * Returns true if the try_cmpxchg() succeeded.
+ */
+static bool simple_bpage_unset_head_link(struct simple_buffer_page *bpage,
+ struct simple_buffer_page *dst,
+ enum simple_rb_link_type new_type)
+{
+ unsigned long *link = (unsigned long *)(&bpage->link.next);
+ unsigned long old = (*link & SIMPLE_RB_LINK_MASK) | SIMPLE_RB_LINK_HEAD;
+ unsigned long new = (unsigned long)(&dst->link) | new_type;
+
+ return try_cmpxchg(link, &old, new);
+}
+
+/* Strip the flag bits from @bpage's next link, leaving a plain pointer. */
+static void simple_bpage_set_normal_link(struct simple_buffer_page *bpage)
+{
+	unsigned long cleared = (unsigned long)bpage->link.next & SIMPLE_RB_LINK_MASK;
+
+	WRITE_ONCE(bpage->link.next, (struct list_head *)cleared);
+}
+
+/* Convert a (possibly flagged) link value into the page that embeds it. */
+static struct simple_buffer_page *simple_bpage_from_link(struct list_head *link)
+{
+	struct list_head *unflagged;
+
+	unflagged = (struct list_head *)((unsigned long)link & SIMPLE_RB_LINK_MASK);
+
+	return container_of(unflagged, struct simple_buffer_page, link);
+}
+
+/* Return the page @bpage's next link points to, ignoring any flag bits. */
+static struct simple_buffer_page *simple_bpage_next_page(struct simple_buffer_page *bpage)
+{
+ return simple_bpage_from_link(bpage->link.next);
+}
+
+/* Zero @bpage's write offset, entry count and the commit field of its data page. */
+static void simple_bpage_reset(struct simple_buffer_page *bpage)
+{
+ bpage->write = 0;
+ bpage->entries = 0;
+
+ local_set(&bpage->page->commit, 0);
+}
+
+/*
+ * Initialise @bpage: set up its link with INIT_LIST_HEAD(), attach @page as
+ * its data page and reset its counters.
+ */
+static void simple_bpage_init(struct simple_buffer_page *bpage, void *page)
+{
+ INIT_LIST_HEAD(&bpage->link);
+ bpage->page = (struct buffer_data_page *)page;
+
+ simple_bpage_reset(bpage);
+}
+
+/*
+ * Add @__inc to the meta field @__meta with a single WRITE_ONCE() store.
+ * @__meta is evaluated twice (once read, once written), so it must be a
+ * side-effect free lvalue. Both arguments are parenthesised so that
+ * compound expressions expand with the intended precedence.
+ */
+#define simple_rb_meta_inc(__meta, __inc) \
+	WRITE_ONCE((__meta), (__meta) + (__inc))
+
+/* A buffer counts as loaded while its bpages array pointer is set. */
+static bool simple_rb_loaded(struct simple_rb_per_cpu *cpu_buffer)
+{
+	return cpu_buffer->bpages != NULL;
+}
+
+/*
+ * Locate the current head page, starting from the cached
+ * cpu_buffer->head_page.
+ *
+ * A page is the head when the next link of its predecessor carries
+ * SIMPLE_RB_LINK_HEAD. Up to 2 * nr_pages candidates are examined; while a
+ * link is flagged SIMPLE_RB_LINK_HEAD_MOVING the same candidate is re-read
+ * without consuming a retry.
+ *
+ * Returns 0 and updates cpu_buffer->head_page on success, -EBUSY if no head
+ * was found within the retry budget.
+ */
+static int simple_rb_find_head(struct simple_rb_per_cpu *cpu_buffer)
+{
+ int retry = cpu_buffer->nr_pages * 2;
+ struct simple_buffer_page *head;
+
+ head = cpu_buffer->head_page;
+
+ while (retry--) {
+ unsigned long link;
+
+spin:
+ /* See smp_store_release in simple_bpage_set_head_link() */
+ link = (unsigned long)smp_load_acquire(&head->link.prev->next);
+
+ /* Keep only the flag bits of the link */
+ switch (link & ~SIMPLE_RB_LINK_MASK) {
+ /* Found the head */
+ case SIMPLE_RB_LINK_HEAD:
+ cpu_buffer->head_page = head;
+ return 0;
+ /* The writer caught the head, we can spin, that won't be long */
+ case SIMPLE_RB_LINK_HEAD_MOVING:
+ goto spin;
+ }
+
+ /* Plain link: this candidate is not the head, try the next page */
+ head = simple_bpage_next_page(head);
+ }
+
+ return -EBUSY;
+}
+
+/**
+ * simple_ring_buffer_swap_reader_page - Swap ring-buffer head with the reader
+ * @cpu_buffer: A simple_rb_per_cpu
+ *
+ * This function enables consuming reading. It ensures the current head page will not be overwritten
+ * and can be safely read.
+ *
+ * Returns 0 on success, -ENODEV if @cpu_buffer was unloaded, -EBUSY if we failed to catch the
+ * head page or -EINVAL if the head link could not be replaced after all the retries.
+ */
+int simple_ring_buffer_swap_reader_page(struct simple_rb_per_cpu *cpu_buffer)
+{
+	struct simple_buffer_page *last, *head, *reader;
+	unsigned long overrun;
+	bool swapped;
+	int retry = 8;
+	int ret;
+
+	if (!simple_rb_loaded(cpu_buffer))
+		return -ENODEV;
+
+	reader = cpu_buffer->reader_page;
+
+	do {
+		/* Run after the writer to find the head */
+		ret = simple_rb_find_head(cpu_buffer);
+		if (ret)
+			return ret;
+
+		head = cpu_buffer->head_page;
+
+		/* Connect the reader page around the header page */
+		reader->link.next = head->link.next;
+		reader->link.prev = head->link.prev;
+
+		/* The last page before the head */
+		last = simple_bpage_from_link(head->link.prev);
+
+		/* The reader page points to the new header page */
+		simple_bpage_set_head_link(reader);
+
+		overrun = cpu_buffer->meta->overrun;
+
+		swapped = simple_bpage_unset_head_link(last, reader, SIMPLE_RB_LINK_NORMAL);
+	} while (!swapped && retry--);
+
+	/*
+	 * Decide on the outcome of the last cmpxchg, not on the retry counter:
+	 * the counter reaches 0 when the final attempt succeeds and goes
+	 * negative when every attempt failed.
+	 */
+	if (!swapped)
+		return -EINVAL;
+
+	cpu_buffer->head_page = simple_bpage_from_link(reader->link.next);
+	cpu_buffer->head_page->link.prev = &reader->link;
+	cpu_buffer->reader_page = head;
+	cpu_buffer->meta->reader.lost_events = overrun - cpu_buffer->last_overrun;
+	cpu_buffer->meta->reader.id = cpu_buffer->reader_page->id;
+	cpu_buffer->last_overrun = overrun;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(simple_ring_buffer_swap_reader_page);
+
+/*
+ * Advance the tail to the next page of the ring.
+ *
+ * If the current tail's link carried the HEAD flag, the page being entered
+ * is the head: its entries are added to meta->overrun, meta->pages_lost is
+ * incremented, the HEAD flag is moved onto the new tail's link and the old
+ * tail's link is made plain again.
+ *
+ * In all cases the new tail is reset, recorded in cpu_buffer->tail_page and
+ * meta->pages_touched is incremented.
+ *
+ * Returns the new tail page.
+ */
+static struct simple_buffer_page *simple_rb_move_tail(struct simple_rb_per_cpu *cpu_buffer)
+{
+ struct simple_buffer_page *tail, *new_tail;
+
+ tail = cpu_buffer->tail_page;
+ new_tail = simple_bpage_next_page(tail);
+
+ if (simple_bpage_unset_head_link(tail, new_tail, SIMPLE_RB_LINK_HEAD_MOVING)) {
+ /*
+ * Oh no! we've caught the head. There is none anymore and
+ * swap_reader will spin until we set the new one. Overrun must
+ * be written first, to make sure we report the correct number
+ * of lost events.
+ */
+ simple_rb_meta_inc(cpu_buffer->meta->overrun, new_tail->entries);
+ simple_rb_meta_inc(cpu_buffer->meta->pages_lost, 1);
+
+ simple_bpage_set_head_link(new_tail);
+ simple_bpage_set_normal_link(tail);
+ }
+
+ simple_bpage_reset(new_tail);
+ cpu_buffer->tail_page = new_tail;
+
+ simple_rb_meta_inc(cpu_buffer->meta->pages_touched, 1);
+
+ return new_tail;
+}
+
+/*
+ * Size taken in the buffer by an event carrying @length bytes of data: the
+ * data itself, the event header and one array[] slot.
+ */
+static unsigned long rb_event_size(unsigned long length)
+{
+	const unsigned long slot_size = sizeof(((struct ring_buffer_event *)0)->array[0]);
+
+	return RB_EVNT_HDR_SIZE + slot_size + length;
+}
+
+/*
+ * Write a time-extend record at @event: the bits of @delta selected by
+ * TS_MASK go in time_delta, the bits above TS_SHIFT in array[0].
+ *
+ * Returns the address 8 bytes past @event, where the actual event starts.
+ */
+static struct ring_buffer_event *
+rb_event_add_ts_extend(struct ring_buffer_event *event, u64 delta)
+{
+	char *after_extend = (char *)event + 8;
+
+	event->type_len = RINGBUF_TYPE_TIME_EXTEND;
+	event->time_delta = delta & TS_MASK;
+	event->array[0] = delta >> TS_SHIFT;
+
+	return (struct ring_buffer_event *)after_extend;
+}
+
+/*
+ * Reserve room for an event of @length data bytes on the tail page and fill
+ * in its header.
+ *
+ * When test_time_stamp() is true for the delta since the last write, an
+ * 8-byte time-extend record is placed in front of the event. If the event
+ * does not fit in the space left on the tail page, the tail is moved to the
+ * next page first. The first event of a page stores @timestamp in the page
+ * header and uses a zero delta without time-extend.
+ *
+ * Returns a pointer to the event header.
+ */
+static struct ring_buffer_event *
+simple_rb_reserve_next(struct simple_rb_per_cpu *cpu_buffer, unsigned long length, u64 timestamp)
+{
+ unsigned long ts_ext_size = 0, event_size = rb_event_size(length);
+ struct simple_buffer_page *tail = cpu_buffer->tail_page;
+ struct ring_buffer_event *event;
+ u32 write, prev_write;
+ u64 time_delta;
+
+ time_delta = timestamp - cpu_buffer->write_stamp;
+
+ if (test_time_stamp(time_delta))
+ ts_ext_size = 8;
+
+ prev_write = tail->write;
+ write = prev_write + event_size + ts_ext_size;
+
+ /* Not enough room left on this page: continue on the next one */
+ if (unlikely(write > (PAGE_SIZE - BUF_PAGE_HDR_SIZE)))
+ tail = simple_rb_move_tail(cpu_buffer);
+
+ /* First event on the page: the page header carries the timestamp */
+ if (!tail->entries) {
+ tail->page->time_stamp = timestamp;
+ time_delta = 0;
+ ts_ext_size = 0;
+ write = event_size;
+ prev_write = 0;
+ }
+
+ tail->write = write;
+ tail->entries++;
+
+ cpu_buffer->write_stamp = timestamp;
+
+ event = (struct ring_buffer_event *)(tail->page->data + prev_write);
+ if (ts_ext_size) {
+ event = rb_event_add_ts_extend(event, time_delta);
+ /* The delta now lives in the time-extend record */
+ time_delta = 0;
+ }
+
+ event->type_len = 0;
+ event->time_delta = time_delta;
+ event->array[0] = event_size - RB_EVNT_HDR_SIZE;
+
+ return event;
+}
+
+/**
+ * simple_ring_buffer_reserve - Reserve an entry in @cpu_buffer
+ * @cpu_buffer: A simple_rb_per_cpu
+ * @length: Size of the entry in bytes
+ * @timestamp: Timestamp of the entry
+ *
+ * On success the buffer status is left at SIMPLE_RB_WRITING until
+ * simple_ring_buffer_commit() sets it back to SIMPLE_RB_READY.
+ *
+ * Returns the address of the entry where to write data or NULL if the buffer
+ * status was not SIMPLE_RB_READY.
+ */
+void *simple_ring_buffer_reserve(struct simple_rb_per_cpu *cpu_buffer, unsigned long length,
+ u64 timestamp)
+{
+ struct ring_buffer_event *rb_event;
+
+ if (cmpxchg(&cpu_buffer->status, SIMPLE_RB_READY, SIMPLE_RB_WRITING) != SIMPLE_RB_READY)
+ return NULL;
+
+ rb_event = simple_rb_reserve_next(cpu_buffer, length, timestamp);
+
+ /* array[0] holds the length, the data starts at array[1] */
+ return &rb_event->array[1];
+}
+EXPORT_SYMBOL_GPL(simple_ring_buffer_reserve);
+
+/**
+ * simple_ring_buffer_commit - Commit the entry reserved with simple_ring_buffer_reserve()
+ * @cpu_buffer: The simple_rb_per_cpu where the entry has been reserved
+ *
+ * Publishes the tail page write offset in the page commit field, increments
+ * meta->entries and sets the buffer status back to SIMPLE_RB_READY.
+ */
+void simple_ring_buffer_commit(struct simple_rb_per_cpu *cpu_buffer)
+{
+ local_set(&cpu_buffer->tail_page->page->commit,
+ cpu_buffer->tail_page->write);
+ simple_rb_meta_inc(cpu_buffer->meta->entries, 1);
+
+ /*
+ * Paired with simple_rb_enable_tracing() to ensure data is
+ * written to the ring-buffer before teardown.
+ */
+ smp_store_release(&cpu_buffer->status, SIMPLE_RB_READY);
+}
+EXPORT_SYMBOL_GPL(simple_ring_buffer_commit);
+
+/*
+ * Switch the buffer status between SIMPLE_RB_UNAVAILABLE and SIMPLE_RB_READY.
+ *
+ * Enabling is a single UNAVAILABLE -> READY cmpxchg. Disabling retries a
+ * READY -> UNAVAILABLE cmpxchg for as long as the status is observed as
+ * SIMPLE_RB_WRITING.
+ *
+ * Returns the status value observed by the (last) cmpxchg.
+ */
+static u32 simple_rb_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable)
+{
+ u32 prev_status;
+
+ if (enable)
+ return cmpxchg(&cpu_buffer->status, SIMPLE_RB_UNAVAILABLE, SIMPLE_RB_READY);
+
+ /* Wait for the buffer to be released */
+ do {
+ prev_status = cmpxchg_acquire(&cpu_buffer->status,
+ SIMPLE_RB_READY,
+ SIMPLE_RB_UNAVAILABLE);
+ } while (prev_status == SIMPLE_RB_WRITING);
+
+ return prev_status;
+}
+
+/**
+ * simple_ring_buffer_reset - Reset @cpu_buffer
+ * @cpu_buffer: A simple_rb_per_cpu
+ *
+ * This will not clear the content of the data, only reset counters and pointers
+ *
+ * Returns 0 on success, -ENODEV if @cpu_buffer was unloaded or the error
+ * returned by simple_rb_find_head() if the head page could not be found.
+ */
+int simple_ring_buffer_reset(struct simple_rb_per_cpu *cpu_buffer)
+{
+ struct simple_buffer_page *bpage;
+ u32 prev_status;
+ int ret;
+
+ if (!simple_rb_loaded(cpu_buffer))
+ return -ENODEV;
+
+ /* Stop writers for the duration of the reset, remember the previous state */
+ prev_status = simple_rb_enable_tracing(cpu_buffer, false);
+
+ /*
+ * NOTE(review): on this error path tracing is left disabled even if it
+ * was enabled on entry - confirm this is intended.
+ */
+ ret = simple_rb_find_head(cpu_buffer);
+ if (ret)
+ return ret;
+
+ /* Restart writing from the head and reset every page of the ring */
+ bpage = cpu_buffer->tail_page = cpu_buffer->head_page;
+ do {
+ simple_bpage_reset(bpage);
+ bpage = simple_bpage_next_page(bpage);
+ } while (bpage != cpu_buffer->head_page);
+
+ simple_bpage_reset(cpu_buffer->reader_page);
+
+ cpu_buffer->last_overrun = 0;
+ cpu_buffer->write_stamp = 0;
+
+ cpu_buffer->meta->reader.read = 0;
+ cpu_buffer->meta->reader.lost_events = 0;
+ cpu_buffer->meta->entries = 0;
+ cpu_buffer->meta->overrun = 0;
+ cpu_buffer->meta->read = 0;
+ cpu_buffer->meta->pages_lost = 0;
+ cpu_buffer->meta->pages_touched = 0;
+
+ /* Only re-enable tracing if it was enabled when we were called */
+ if (prev_status == SIMPLE_RB_READY)
+ simple_rb_enable_tracing(cpu_buffer, true);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(simple_ring_buffer_reset);
+
+/*
+ * simple_ring_buffer_init_mm - Init @cpu_buffer based on @desc, using
+ * caller-provided callbacks to access the pages
+ * @cpu_buffer: A simple_rb_per_cpu buffer to init; it is zeroed here.
+ * @bpages: Array of simple_buffer_pages, one element is used per entry of
+ * @desc->page_va
+ * @desc: A ring_buffer_desc
+ * @load_page: Called on @desc->meta_va and on each @desc->page_va[] entry;
+ * its return value is used as the page address, NULL is a failure
+ * @unload_page: Called on the pages already loaded when a later load fails
+ *
+ * The first page of @desc becomes the reader page, the remaining ones are
+ * linked into the ring.
+ *
+ * Returns 0 on success, -EINVAL if @desc holds fewer than 3 pages or if
+ * @load_page failed.
+ */
+int simple_ring_buffer_init_mm(struct simple_rb_per_cpu *cpu_buffer,
+ struct simple_buffer_page *bpages,
+ const struct ring_buffer_desc *desc,
+ void *(*load_page)(unsigned long va),
+ void (*unload_page)(void *va))
+{
+ struct simple_buffer_page *bpage = bpages;
+ int ret = 0;
+ void *page;
+ int i;
+
+ /* At least 1 reader page and two pages in the ring-buffer */
+ if (desc->nr_page_va < 3)
+ return -EINVAL;
+
+ memset(cpu_buffer, 0, sizeof(*cpu_buffer));
+
+ cpu_buffer->meta = load_page(desc->meta_va);
+ if (!cpu_buffer->meta)
+ return -EINVAL;
+
+ memset(cpu_buffer->meta, 0, sizeof(*cpu_buffer->meta));
+ cpu_buffer->meta->meta_page_size = PAGE_SIZE;
+
+ /* The reader page is not part of the ring initially */
+ page = load_page(desc->page_va[0]);
+ if (!page) {
+ unload_page(cpu_buffer->meta);
+ return -EINVAL;
+ }
+
+ simple_bpage_init(bpage, page);
+ bpage->id = 0;
+
+ cpu_buffer->nr_pages = 1;
+
+ cpu_buffer->reader_page = bpage;
+ cpu_buffer->tail_page = bpage + 1;
+ cpu_buffer->head_page = bpage + 1;
+
+ for (i = 1; i < desc->nr_page_va; i++) {
+ page = load_page(desc->page_va[i]);
+ if (!page) {
+ ret = -EINVAL;
+ break;
+ }
+
+ simple_bpage_init(++bpage, page);
+
+ /* Link to the array neighbours; the ring ends are fixed up below */
+ bpage->link.next = &(bpage + 1)->link;
+ bpage->link.prev = &(bpage - 1)->link;
+ bpage->id = i;
+
+ cpu_buffer->nr_pages = i + 1;
+ }
+
+ if (ret) {
+ /* Unload the pages loaded so far, i is the index that failed */
+ for (i--; i >= 0; i--)
+ unload_page(bpages[i].page);
+ unload_page(cpu_buffer->meta);
+
+ return ret;
+ }
+
+ cpu_buffer->meta->nr_subbufs = cpu_buffer->nr_pages;
+ /* Close the ring */
+ bpage->link.next = &cpu_buffer->tail_page->link;
+ cpu_buffer->tail_page->link.prev = &bpage->link;
+
+ /* The last init'ed page points to the head page */
+ simple_bpage_set_head_link(bpage);
+
+ /* Setting bpages is what marks the buffer as loaded */
+ cpu_buffer->bpages = bpages;
+
+ return 0;
+}
+
+/* Default load callback: the value passed in is used directly as the page address. */
+static void *__load_page(unsigned long page)
+{
+ return (void *)page;
+}
+
+/* Default unload callback: nothing to undo for __load_page(). */
+static void __unload_page(void *page) { }
+
+/**
+ * simple_ring_buffer_init - Init @cpu_buffer based on @desc
+ * @cpu_buffer: A simple_rb_per_cpu buffer to init, allocated by the caller.
+ * @bpages: Array of simple_buffer_pages, with as many elements as @desc->nr_page_va
+ * @desc: A ring_buffer_desc
+ *
+ * The addresses found in @desc are used as they are: this is
+ * simple_ring_buffer_init_mm() with the default load and unload callbacks.
+ *
+ * Returns 0 on success or -EINVAL if the content of @desc is invalid
+ */
+int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_buffer_page *bpages,
+ const struct ring_buffer_desc *desc)
+{
+ return simple_ring_buffer_init_mm(cpu_buffer, bpages, desc, __load_page, __unload_page);
+}
+EXPORT_SYMBOL_GPL(simple_ring_buffer_init);
+
+/*
+ * simple_ring_buffer_unload_mm - Prepare @cpu_buffer for deletion, using a
+ * caller-provided callback to release the pages
+ * @cpu_buffer: A simple_rb_per_cpu that will be deleted.
+ * @unload_page: Called on the meta page and on the data page of every
+ * simple_buffer_page
+ *
+ * Does nothing if @cpu_buffer is not loaded. Otherwise tracing is disabled
+ * before the pages are released, and @cpu_buffer is marked as unloaded.
+ */
+void simple_ring_buffer_unload_mm(struct simple_rb_per_cpu *cpu_buffer,
+ void (*unload_page)(void *))
+{
+ int p;
+
+ if (!simple_rb_loaded(cpu_buffer))
+ return;
+
+ simple_rb_enable_tracing(cpu_buffer, false);
+
+ unload_page(cpu_buffer->meta);
+ for (p = 0; p < cpu_buffer->nr_pages; p++)
+ unload_page(cpu_buffer->bpages[p].page);
+
+ /* Clearing bpages is what marks the buffer as unloaded */
+ cpu_buffer->bpages = NULL;
+}
+
+/**
+ * simple_ring_buffer_unload - Prepare @cpu_buffer for deletion
+ * @cpu_buffer: A simple_rb_per_cpu that will be deleted.
+ *
+ * This is simple_ring_buffer_unload_mm() with the default unload callback.
+ */
+void simple_ring_buffer_unload(struct simple_rb_per_cpu *cpu_buffer)
+{
+	simple_ring_buffer_unload_mm(cpu_buffer, __unload_page);
+}
+EXPORT_SYMBOL_GPL(simple_ring_buffer_unload);
+
+/**
+ * simple_ring_buffer_enable_tracing - Enable or disable writing to @cpu_buffer
+ * @cpu_buffer: A simple_rb_per_cpu
+ * @enable: True to enable tracing, False to disable it
+ *
+ * Returns 0 on success or -ENODEV if @cpu_buffer was unloaded
+ */
+int simple_ring_buffer_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable)
+{
+	if (simple_rb_loaded(cpu_buffer)) {
+		simple_rb_enable_tracing(cpu_buffer, enable);
+		return 0;
+	}
+
+	return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(simple_ring_buffer_enable_tracing);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a626211ceb9a..6eb4d3097a4d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -47,7 +47,6 @@
#include <linux/trace.h>
#include <linux/sched/clock.h>
#include <linux/sched/rt.h>
-#include <linux/fsnotify.h>
#include <linux/irq_work.h>
#include <linux/workqueue.h>
#include <linux/sort.h>
@@ -219,14 +218,36 @@ static void ftrace_trace_userstack(struct trace_array *tr,
static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
static char *default_bootup_tracer;
-static bool allocate_snapshot;
-static bool snapshot_at_boot;
-
static char boot_instance_info[COMMAND_LINE_SIZE] __initdata;
static int boot_instance_index;
-static char boot_snapshot_info[COMMAND_LINE_SIZE] __initdata;
-static int boot_snapshot_index;
+/*
+ * Append @str to the NUL-terminated string in @buf, which is @size bytes
+ * long, inserting @sep in front of it when @buf already holds text.
+ *
+ * Repeated boot parameters, including Bootconfig array expansions, need
+ * to stay in the delimiter form that the existing parser consumes.
+ *
+ * An empty @str, or one that does not fit in the remaining space, leaves
+ * @buf untouched.
+ */
+void __init trace_append_boot_param(char *buf, const char *str, char sep,
+				    int size)
+{
+	int used, incoming, total;
+
+	if (*str == '\0')
+		return;
+
+	used = strlen(buf);
+	incoming = strlen(str);
+
+	/* Existing text, new text and NUL, plus the separator on continuation. */
+	total = used + incoming + 1 + (used ? 1 : 0);
+	if (total > size)
+		return;
+
+	if (used)
+		buf[used++] = sep;
+
+	strscpy(buf + used, str, size - used);
+}
static int __init set_cmdline_ftrace(char *str)
{
@@ -276,38 +297,6 @@ static int __init stop_trace_on_warning(char *str)
}
__setup("traceoff_on_warning", stop_trace_on_warning);
-static int __init boot_alloc_snapshot(char *str)
-{
- char *slot = boot_snapshot_info + boot_snapshot_index;
- int left = sizeof(boot_snapshot_info) - boot_snapshot_index;
- int ret;
-
- if (str[0] == '=') {
- str++;
- if (strlen(str) >= left)
- return -1;
-
- ret = snprintf(slot, left, "%s\t", str);
- boot_snapshot_index += ret;
- } else {
- allocate_snapshot = true;
- /* We also need the main ring buffer expanded */
- trace_set_ring_buffer_expanded(NULL);
- }
- return 1;
-}
-__setup("alloc_snapshot", boot_alloc_snapshot);
-
-
-static int __init boot_snapshot(char *str)
-{
- snapshot_at_boot = true;
- boot_alloc_snapshot(str);
- return 1;
-}
-__setup("ftrace_boot_snapshot", boot_snapshot);
-
-
static int __init boot_instance(char *str)
{
char *slot = boot_instance_info + boot_instance_index;
@@ -329,7 +318,8 @@ static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
static int __init set_trace_boot_options(char *str)
{
- strscpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
+ trace_append_boot_param(trace_boot_options_buf, str, ',',
+ MAX_TRACER_SIZE);
return 1;
}
__setup("trace_options=", set_trace_boot_options);
@@ -578,8 +568,59 @@ void trace_set_ring_buffer_expanded(struct trace_array *tr)
tr->ring_buffer_expanded = true;
}
+/* Work handler: look up the trace_array embedding @work and destroy it. */
+static void trace_array_autoremove(struct work_struct *work)
+{
+	trace_array_destroy(container_of(work, struct trace_array,
+					 autoremove_work));
+}
+
+static struct workqueue_struct *autoremove_wq;
+
+/* Queue @tr's autoremove work; a no-op while autoremove_wq is unset. */
+static void trace_array_kick_autoremove(struct trace_array *tr)
+{
+	if (!autoremove_wq)
+		return;
+
+	queue_work(autoremove_wq, &tr->autoremove_work);
+}
+
+/* Cancel @tr's autoremove work if it is still pending. */
+static void trace_array_cancel_autoremove(struct trace_array *tr)
+{
+	/*
+	 * This may be reached from inside trace_array_autoremove() itself.
+	 * Only cancel work that is still pending, so that the workqueue
+	 * cannot deadlock.
+	 */
+	if (!work_pending(&tr->autoremove_work))
+		return;
+
+	cancel_work_sync(&tr->autoremove_work);
+}
+
+/* Bind @tr's autoremove work item to trace_array_autoremove(). */
+static void trace_array_init_autoremove(struct trace_array *tr)
+{
+	INIT_WORK(&tr->autoremove_work, trace_array_autoremove);
+}
+
+/*
+ * Allocate the autoremove workqueue on first use. If the allocation
+ * fails, autoremove_wq stays NULL and a warning is printed.
+ */
+static void trace_array_start_autoremove(void)
+{
+	if (!autoremove_wq) {
+		autoremove_wq = alloc_workqueue("tr_autoremove_wq",
+						WQ_UNBOUND | WQ_HIGHPRI, 0);
+		if (!autoremove_wq)
+			pr_warn("Unable to allocate tr_autoremove_wq. autoremove disabled.\n");
+	}
+}
+
LIST_HEAD(ftrace_trace_arrays);
+/*
+ * Take a reference on @this_tr.
+ *
+ * Returns 0 after incrementing the reference count, or -ENODEV (leaving
+ * the count untouched) when the autoremove workqueue exists and the
+ * array is marked free_on_close.
+ */
+static int __trace_array_get(struct trace_array *this_tr)
+{
+	bool going_away = autoremove_wq && this_tr->free_on_close;
+
+	/* An array marked free_on_close is no longer available. */
+	if (going_away)
+		return -ENODEV;
+
+	this_tr->ref++;
+	return 0;
+}
+
int trace_array_get(struct trace_array *this_tr)
{
struct trace_array *tr;
@@ -587,8 +628,7 @@ int trace_array_get(struct trace_array *this_tr)
guard(mutex)(&trace_types_lock);
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
if (tr == this_tr) {
- tr->ref++;
- return 0;
+ return __trace_array_get(tr);
}
}
@@ -599,6 +639,12 @@ static void __trace_array_put(struct trace_array *this_tr)
{
WARN_ON(!this_tr->ref);
this_tr->ref--;
+ /*
+ * When free_on_close is set, prepare removing the array
+ * when the last reference is released.
+ */
+ if (this_tr->ref == 1 && this_tr->free_on_close)
+ trace_array_kick_autoremove(this_tr);
}
/**
@@ -807,47 +853,6 @@ void tracing_on(void)
EXPORT_SYMBOL_GPL(tracing_on);
#ifdef CONFIG_TRACER_SNAPSHOT
-static void tracing_snapshot_instance_cond(struct trace_array *tr,
- void *cond_data)
-{
- unsigned long flags;
-
- if (in_nmi()) {
- trace_array_puts(tr, "*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n");
- trace_array_puts(tr, "*** snapshot is being ignored ***\n");
- return;
- }
-
- if (!tr->allocated_snapshot) {
- trace_array_puts(tr, "*** SNAPSHOT NOT ALLOCATED ***\n");
- trace_array_puts(tr, "*** stopping trace here! ***\n");
- tracer_tracing_off(tr);
- return;
- }
-
- if (tr->mapped) {
- trace_array_puts(tr, "*** BUFFER MEMORY MAPPED ***\n");
- trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n");
- return;
- }
-
- /* Note, snapshot can not be used when the tracer uses it */
- if (tracer_uses_snapshot(tr->current_trace)) {
- trace_array_puts(tr, "*** LATENCY TRACER ACTIVE ***\n");
- trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n");
- return;
- }
-
- local_irq_save(flags);
- update_max_tr(tr, current, smp_processor_id(), cond_data);
- local_irq_restore(flags);
-}
-
-void tracing_snapshot_instance(struct trace_array *tr)
-{
- tracing_snapshot_instance_cond(tr, NULL);
-}
-
/**
* tracing_snapshot - take a snapshot of the current buffer.
*
@@ -871,138 +876,6 @@ void tracing_snapshot(void)
EXPORT_SYMBOL_GPL(tracing_snapshot);
/**
- * tracing_snapshot_cond - conditionally take a snapshot of the current buffer.
- * @tr: The tracing instance to snapshot
- * @cond_data: The data to be tested conditionally, and possibly saved
- *
- * This is the same as tracing_snapshot() except that the snapshot is
- * conditional - the snapshot will only happen if the
- * cond_snapshot.update() implementation receiving the cond_data
- * returns true, which means that the trace array's cond_snapshot
- * update() operation used the cond_data to determine whether the
- * snapshot should be taken, and if it was, presumably saved it along
- * with the snapshot.
- */
-void tracing_snapshot_cond(struct trace_array *tr, void *cond_data)
-{
- tracing_snapshot_instance_cond(tr, cond_data);
-}
-EXPORT_SYMBOL_GPL(tracing_snapshot_cond);
-
-/**
- * tracing_cond_snapshot_data - get the user data associated with a snapshot
- * @tr: The tracing instance
- *
- * When the user enables a conditional snapshot using
- * tracing_snapshot_cond_enable(), the user-defined cond_data is saved
- * with the snapshot. This accessor is used to retrieve it.
- *
- * Should not be called from cond_snapshot.update(), since it takes
- * the tr->max_lock lock, which the code calling
- * cond_snapshot.update() has already done.
- *
- * Returns the cond_data associated with the trace array's snapshot.
- */
-void *tracing_cond_snapshot_data(struct trace_array *tr)
-{
- void *cond_data = NULL;
-
- local_irq_disable();
- arch_spin_lock(&tr->max_lock);
-
- if (tr->cond_snapshot)
- cond_data = tr->cond_snapshot->cond_data;
-
- arch_spin_unlock(&tr->max_lock);
- local_irq_enable();
-
- return cond_data;
-}
-EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data);
-
-static int resize_buffer_duplicate_size(struct array_buffer *trace_buf,
- struct array_buffer *size_buf, int cpu_id);
-static void set_buffer_entries(struct array_buffer *buf, unsigned long val);
-
-int tracing_alloc_snapshot_instance(struct trace_array *tr)
-{
- int order;
- int ret;
-
- if (!tr->allocated_snapshot) {
-
- /* Make the snapshot buffer have the same order as main buffer */
- order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
- ret = ring_buffer_subbuf_order_set(tr->snapshot_buffer.buffer, order);
- if (ret < 0)
- return ret;
-
- /* allocate spare buffer */
- ret = resize_buffer_duplicate_size(&tr->snapshot_buffer,
- &tr->array_buffer, RING_BUFFER_ALL_CPUS);
- if (ret < 0)
- return ret;
-
- tr->allocated_snapshot = true;
- }
-
- return 0;
-}
-
-static void free_snapshot(struct trace_array *tr)
-{
- /*
- * We don't free the ring buffer. instead, resize it because
- * The max_tr ring buffer has some state (e.g. ring->clock) and
- * we want preserve it.
- */
- ring_buffer_subbuf_order_set(tr->snapshot_buffer.buffer, 0);
- ring_buffer_resize(tr->snapshot_buffer.buffer, 1, RING_BUFFER_ALL_CPUS);
- set_buffer_entries(&tr->snapshot_buffer, 1);
- tracing_reset_online_cpus(&tr->snapshot_buffer);
- tr->allocated_snapshot = false;
-}
-
-static int tracing_arm_snapshot_locked(struct trace_array *tr)
-{
- int ret;
-
- lockdep_assert_held(&trace_types_lock);
-
- spin_lock(&tr->snapshot_trigger_lock);
- if (tr->snapshot == UINT_MAX || tr->mapped) {
- spin_unlock(&tr->snapshot_trigger_lock);
- return -EBUSY;
- }
-
- tr->snapshot++;
- spin_unlock(&tr->snapshot_trigger_lock);
-
- ret = tracing_alloc_snapshot_instance(tr);
- if (ret) {
- spin_lock(&tr->snapshot_trigger_lock);
- tr->snapshot--;
- spin_unlock(&tr->snapshot_trigger_lock);
- }
-
- return ret;
-}
-
-int tracing_arm_snapshot(struct trace_array *tr)
-{
- guard(mutex)(&trace_types_lock);
- return tracing_arm_snapshot_locked(tr);
-}
-
-void tracing_disarm_snapshot(struct trace_array *tr)
-{
- spin_lock(&tr->snapshot_trigger_lock);
- if (!WARN_ON(!tr->snapshot))
- tr->snapshot--;
- spin_unlock(&tr->snapshot_trigger_lock);
-}
-
-/**
* tracing_alloc_snapshot - allocate snapshot buffer.
*
* This only allocates the snapshot buffer if it isn't already
@@ -1022,159 +895,18 @@ int tracing_alloc_snapshot(void)
return ret;
}
-EXPORT_SYMBOL_GPL(tracing_alloc_snapshot);
-
-/**
- * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer.
- *
- * This is similar to tracing_snapshot(), but it will allocate the
- * snapshot buffer if it isn't already allocated. Use this only
- * where it is safe to sleep, as the allocation may sleep.
- *
- * This causes a swap between the snapshot buffer and the current live
- * tracing buffer. You can use this to take snapshots of the live
- * trace when some condition is triggered, but continue to trace.
- */
-void tracing_snapshot_alloc(void)
-{
- int ret;
-
- ret = tracing_alloc_snapshot();
- if (ret < 0)
- return;
-
- tracing_snapshot();
-}
-EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
-
-/**
- * tracing_snapshot_cond_enable - enable conditional snapshot for an instance
- * @tr: The tracing instance
- * @cond_data: User data to associate with the snapshot
- * @update: Implementation of the cond_snapshot update function
- *
- * Check whether the conditional snapshot for the given instance has
- * already been enabled, or if the current tracer is already using a
- * snapshot; if so, return -EBUSY, else create a cond_snapshot and
- * save the cond_data and update function inside.
- *
- * Returns 0 if successful, error otherwise.
- */
-int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data,
- cond_update_fn_t update)
-{
- struct cond_snapshot *cond_snapshot __free(kfree) =
- kzalloc_obj(*cond_snapshot);
- int ret;
-
- if (!cond_snapshot)
- return -ENOMEM;
-
- cond_snapshot->cond_data = cond_data;
- cond_snapshot->update = update;
-
- guard(mutex)(&trace_types_lock);
-
- if (tracer_uses_snapshot(tr->current_trace))
- return -EBUSY;
-
- /*
- * The cond_snapshot can only change to NULL without the
- * trace_types_lock. We don't care if we race with it going
- * to NULL, but we want to make sure that it's not set to
- * something other than NULL when we get here, which we can
- * do safely with only holding the trace_types_lock and not
- * having to take the max_lock.
- */
- if (tr->cond_snapshot)
- return -EBUSY;
-
- ret = tracing_arm_snapshot_locked(tr);
- if (ret)
- return ret;
-
- local_irq_disable();
- arch_spin_lock(&tr->max_lock);
- tr->cond_snapshot = no_free_ptr(cond_snapshot);
- arch_spin_unlock(&tr->max_lock);
- local_irq_enable();
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable);
-
-/**
- * tracing_snapshot_cond_disable - disable conditional snapshot for an instance
- * @tr: The tracing instance
- *
- * Check whether the conditional snapshot for the given instance is
- * enabled; if so, free the cond_snapshot associated with it,
- * otherwise return -EINVAL.
- *
- * Returns 0 if successful, error otherwise.
- */
-int tracing_snapshot_cond_disable(struct trace_array *tr)
-{
- int ret = 0;
-
- local_irq_disable();
- arch_spin_lock(&tr->max_lock);
-
- if (!tr->cond_snapshot)
- ret = -EINVAL;
- else {
- kfree(tr->cond_snapshot);
- tr->cond_snapshot = NULL;
- }
-
- arch_spin_unlock(&tr->max_lock);
- local_irq_enable();
-
- tracing_disarm_snapshot(tr);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable);
#else
void tracing_snapshot(void)
{
WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used");
}
EXPORT_SYMBOL_GPL(tracing_snapshot);
-void tracing_snapshot_cond(struct trace_array *tr, void *cond_data)
-{
- WARN_ONCE(1, "Snapshot feature not enabled, but internal conditional snapshot used");
-}
-EXPORT_SYMBOL_GPL(tracing_snapshot_cond);
-int tracing_alloc_snapshot(void)
-{
- WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used");
- return -ENODEV;
-}
-EXPORT_SYMBOL_GPL(tracing_alloc_snapshot);
void tracing_snapshot_alloc(void)
{
/* Give warning */
tracing_snapshot();
}
EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
-void *tracing_cond_snapshot_data(struct trace_array *tr)
-{
- return NULL;
-}
-EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data);
-int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update)
-{
- return -ENODEV;
-}
-EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable);
-int tracing_snapshot_cond_disable(struct trace_array *tr)
-{
- return false;
-}
-EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable);
-#define free_snapshot(tr) do { } while (0)
-#define tracing_arm_snapshot_locked(tr) ({ -EBUSY; })
#endif /* CONFIG_TRACER_SNAPSHOT */
void tracer_tracing_off(struct trace_array *tr)
@@ -1487,206 +1219,6 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
unsigned long __read_mostly tracing_thresh;
-#ifdef CONFIG_TRACER_MAX_TRACE
-#ifdef LATENCY_FS_NOTIFY
-static struct workqueue_struct *fsnotify_wq;
-
-static void latency_fsnotify_workfn(struct work_struct *work)
-{
- struct trace_array *tr = container_of(work, struct trace_array,
- fsnotify_work);
- fsnotify_inode(tr->d_max_latency->d_inode, FS_MODIFY);
-}
-
-static void latency_fsnotify_workfn_irq(struct irq_work *iwork)
-{
- struct trace_array *tr = container_of(iwork, struct trace_array,
- fsnotify_irqwork);
- queue_work(fsnotify_wq, &tr->fsnotify_work);
-}
-
-__init static int latency_fsnotify_init(void)
-{
- fsnotify_wq = alloc_workqueue("tr_max_lat_wq",
- WQ_UNBOUND | WQ_HIGHPRI, 0);
- if (!fsnotify_wq) {
- pr_err("Unable to allocate tr_max_lat_wq\n");
- return -ENOMEM;
- }
- return 0;
-}
-
-late_initcall_sync(latency_fsnotify_init);
-
-void latency_fsnotify(struct trace_array *tr)
-{
- if (!fsnotify_wq)
- return;
- /*
- * We cannot call queue_work(&tr->fsnotify_work) from here because it's
- * possible that we are called from __schedule() or do_idle(), which
- * could cause a deadlock.
- */
- irq_work_queue(&tr->fsnotify_irqwork);
-}
-#endif /* !LATENCY_FS_NOTIFY */
-
-static const struct file_operations tracing_max_lat_fops;
-
-static void trace_create_maxlat_file(struct trace_array *tr,
- struct dentry *d_tracer)
-{
-#ifdef LATENCY_FS_NOTIFY
- INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn);
- init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq);
-#endif
- tr->d_max_latency = trace_create_file("tracing_max_latency",
- TRACE_MODE_WRITE,
- d_tracer, tr,
- &tracing_max_lat_fops);
-}
-
-/*
- * Copy the new maximum trace into the separate maximum-trace
- * structure. (this way the maximum trace is permanently saved,
- * for later retrieval via /sys/kernel/tracing/tracing_max_latency)
- */
-static void
-__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
-{
- struct array_buffer *trace_buf = &tr->array_buffer;
- struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu);
- struct array_buffer *max_buf = &tr->snapshot_buffer;
- struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu);
-
- max_buf->cpu = cpu;
- max_buf->time_start = data->preempt_timestamp;
-
- max_data->saved_latency = tr->max_latency;
- max_data->critical_start = data->critical_start;
- max_data->critical_end = data->critical_end;
-
- strscpy(max_data->comm, tsk->comm);
- max_data->pid = tsk->pid;
- /*
- * If tsk == current, then use current_uid(), as that does not use
- * RCU. The irq tracer can be called out of RCU scope.
- */
- if (tsk == current)
- max_data->uid = current_uid();
- else
- max_data->uid = task_uid(tsk);
-
- max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
- max_data->policy = tsk->policy;
- max_data->rt_priority = tsk->rt_priority;
-
- /* record this tasks comm */
- tracing_record_cmdline(tsk);
- latency_fsnotify(tr);
-}
-#else
-static inline void trace_create_maxlat_file(struct trace_array *tr,
- struct dentry *d_tracer) { }
-static inline void __update_max_tr(struct trace_array *tr,
- struct task_struct *tsk, int cpu) { }
-#endif /* CONFIG_TRACER_MAX_TRACE */
-
-#ifdef CONFIG_TRACER_SNAPSHOT
-/**
- * update_max_tr - snapshot all trace buffers from global_trace to max_tr
- * @tr: tracer
- * @tsk: the task with the latency
- * @cpu: The cpu that initiated the trace.
- * @cond_data: User data associated with a conditional snapshot
- *
- * Flip the buffers between the @tr and the max_tr and record information
- * about which task was the cause of this latency.
- */
-void
-update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
- void *cond_data)
-{
- if (tr->stop_count)
- return;
-
- WARN_ON_ONCE(!irqs_disabled());
-
- if (!tr->allocated_snapshot) {
- /* Only the nop tracer should hit this when disabling */
- WARN_ON_ONCE(tr->current_trace != &nop_trace);
- return;
- }
-
- arch_spin_lock(&tr->max_lock);
-
- /* Inherit the recordable setting from array_buffer */
- if (ring_buffer_record_is_set_on(tr->array_buffer.buffer))
- ring_buffer_record_on(tr->snapshot_buffer.buffer);
- else
- ring_buffer_record_off(tr->snapshot_buffer.buffer);
-
- if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) {
- arch_spin_unlock(&tr->max_lock);
- return;
- }
-
- swap(tr->array_buffer.buffer, tr->snapshot_buffer.buffer);
-
- __update_max_tr(tr, tsk, cpu);
-
- arch_spin_unlock(&tr->max_lock);
-
- /* Any waiters on the old snapshot buffer need to wake up */
- ring_buffer_wake_waiters(tr->array_buffer.buffer, RING_BUFFER_ALL_CPUS);
-}
-
-/**
- * update_max_tr_single - only copy one trace over, and reset the rest
- * @tr: tracer
- * @tsk: task with the latency
- * @cpu: the cpu of the buffer to copy.
- *
- * Flip the trace of a single CPU buffer between the @tr and the max_tr.
- */
-void
-update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
-{
- int ret;
-
- if (tr->stop_count)
- return;
-
- WARN_ON_ONCE(!irqs_disabled());
- if (!tr->allocated_snapshot) {
- /* Only the nop tracer should hit this when disabling */
- WARN_ON_ONCE(tr->current_trace != &nop_trace);
- return;
- }
-
- arch_spin_lock(&tr->max_lock);
-
- ret = ring_buffer_swap_cpu(tr->snapshot_buffer.buffer, tr->array_buffer.buffer, cpu);
-
- if (ret == -EBUSY) {
- /*
- * We failed to swap the buffer due to a commit taking
- * place on this CPU. We fail to record, but we reset
- * the max trace buffer (no one writes directly to it)
- * and flag that it failed.
- * Another reason is resize is in progress.
- */
- trace_array_printk_buf(tr->snapshot_buffer.buffer, _THIS_IP_,
- "Failed to swap buffers due to commit or resize in progress\n");
- }
-
- WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
-
- __update_max_tr(tr, tsk, cpu);
- arch_spin_unlock(&tr->max_lock);
-}
-#endif /* CONFIG_TRACER_SNAPSHOT */
-
struct pipe_wait {
struct trace_iterator *iter;
int wait_index;
@@ -1995,7 +1527,7 @@ int __init register_tracer(struct tracer *type)
return 0;
}
-static void tracing_reset_cpu(struct array_buffer *buf, int cpu)
+void tracing_reset_cpu(struct array_buffer *buf, int cpu)
{
struct trace_buffer *buffer = buf->buffer;
@@ -3760,50 +3292,6 @@ static void test_ftrace_alive(struct seq_file *m)
"# MAY BE MISSING FUNCTION EVENTS\n");
}
-#ifdef CONFIG_TRACER_SNAPSHOT
-static void show_snapshot_main_help(struct seq_file *m)
-{
- seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"
- "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
- "# Takes a snapshot of the main buffer.\n"
- "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"
- "# (Doesn't have to be '2' works with any number that\n"
- "# is not a '0' or '1')\n");
-}
-
-static void show_snapshot_percpu_help(struct seq_file *m)
-{
- seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
-#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
- seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
- "# Takes a snapshot of the main buffer for this cpu.\n");
-#else
- seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n"
- "# Must use main snapshot file to allocate.\n");
-#endif
- seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"
- "# (Doesn't have to be '2' works with any number that\n"
- "# is not a '0' or '1')\n");
-}
-
-static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
-{
- if (iter->tr->allocated_snapshot)
- seq_puts(m, "#\n# * Snapshot is allocated *\n#\n");
- else
- seq_puts(m, "#\n# * Snapshot is freed *\n#\n");
-
- seq_puts(m, "# Snapshot commands:\n");
- if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
- show_snapshot_main_help(m);
- else
- show_snapshot_percpu_help(m);
-}
-#else
-/* Should never be called */
-static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { }
-#endif
-
static int s_show(struct seq_file *m, void *v)
{
struct trace_iterator *iter = v;
@@ -3852,17 +3340,6 @@ static int s_show(struct seq_file *m, void *v)
return 0;
}
-/*
- * Should be used after trace_array_get(), trace_types_lock
- * ensures that i_cdev was already initialized.
- */
-static inline int tracing_get_cpu(struct inode *inode)
-{
- if (inode->i_cdev) /* See trace_create_cpu_file() */
- return (long)inode->i_cdev - 1;
- return RING_BUFFER_ALL_CPUS;
-}
-
static const struct seq_operations tracer_seq_ops = {
.start = s_start,
.next = s_next,
@@ -3889,7 +3366,7 @@ static void free_trace_iter_content(struct trace_iterator *iter)
free_cpumask_var(iter->started);
}
-static struct trace_iterator *
+struct trace_iterator *
__tracing_open(struct inode *inode, struct file *file, bool snapshot)
{
struct trace_array *tr = inode->i_private;
@@ -4022,6 +3499,11 @@ int tracing_open_generic_tr(struct inode *inode, struct file *filp)
if (ret)
return ret;
+ if ((filp->f_mode & FMODE_WRITE) && trace_array_is_readonly(tr)) {
+ trace_array_put(tr);
+ return -EACCES;
+ }
+
filp->private_data = inode->i_private;
return 0;
@@ -4050,8 +3532,6 @@ int tracing_open_file_tr(struct inode *inode, struct file *filp)
event_file_get(file);
}
- filp->private_data = inode->i_private;
-
return 0;
}
@@ -4071,7 +3551,7 @@ int tracing_single_release_file_tr(struct inode *inode, struct file *filp)
return single_release(inode, filp);
}
-static int tracing_release(struct inode *inode, struct file *file)
+int tracing_release(struct inode *inode, struct file *file)
{
struct trace_array *tr = inode->i_private;
struct seq_file *m = file->private_data;
@@ -5222,7 +4702,7 @@ int tracer_init(struct tracer *t, struct trace_array *tr)
return t->init(tr);
}
-static void set_buffer_entries(struct array_buffer *buf, unsigned long val)
+void trace_set_buffer_entries(struct array_buffer *buf, unsigned long val)
{
int cpu;
@@ -5233,40 +4713,12 @@ static void set_buffer_entries(struct array_buffer *buf, unsigned long val)
static void update_buffer_entries(struct array_buffer *buf, int cpu)
{
if (cpu == RING_BUFFER_ALL_CPUS) {
- set_buffer_entries(buf, ring_buffer_size(buf->buffer, 0));
+ trace_set_buffer_entries(buf, ring_buffer_size(buf->buffer, 0));
} else {
per_cpu_ptr(buf->data, cpu)->entries = ring_buffer_size(buf->buffer, cpu);
}
}
-#ifdef CONFIG_TRACER_SNAPSHOT
-/* resize @tr's buffer to the size of @size_tr's entries */
-static int resize_buffer_duplicate_size(struct array_buffer *trace_buf,
- struct array_buffer *size_buf, int cpu_id)
-{
- int cpu, ret = 0;
-
- if (cpu_id == RING_BUFFER_ALL_CPUS) {
- for_each_tracing_cpu(cpu) {
- ret = ring_buffer_resize(trace_buf->buffer,
- per_cpu_ptr(size_buf->data, cpu)->entries, cpu);
- if (ret < 0)
- break;
- per_cpu_ptr(trace_buf->data, cpu)->entries =
- per_cpu_ptr(size_buf->data, cpu)->entries;
- }
- } else {
- ret = ring_buffer_resize(trace_buf->buffer,
- per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id);
- if (ret == 0)
- per_cpu_ptr(trace_buf->data, cpu_id)->entries =
- per_cpu_ptr(size_buf->data, cpu_id)->entries;
- }
-
- return ret;
-}
-#endif /* CONFIG_TRACER_SNAPSHOT */
-
static int __tracing_resize_ring_buffer(struct trace_array *tr,
unsigned long size, int cpu)
{
@@ -5462,6 +4914,10 @@ static void update_last_data(struct trace_array *tr)
/* Only if the buffer has previous boot data clear and update it. */
tr->flags &= ~TRACE_ARRAY_FL_LAST_BOOT;
+ /* If this is a backup instance, mark it for autoremove. */
+ if (tr->flags & TRACE_ARRAY_FL_VMALLOC)
+ tr->free_on_close = true;
+
/* Reset the module list and reload them */
if (tr->scratch) {
struct trace_scratch *tscratch = tr->scratch;
@@ -5685,9 +5141,8 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
return ret;
}
-static ssize_t
-tracing_nsecs_read(unsigned long *ptr, char __user *ubuf,
- size_t cnt, loff_t *ppos)
+ssize_t tracing_nsecs_read(unsigned long *ptr, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
char buf[64];
int r;
@@ -5699,9 +5154,8 @@ tracing_nsecs_read(unsigned long *ptr, char __user *ubuf,
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
-static ssize_t
-tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
+ssize_t tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
unsigned long val;
int ret;
@@ -5743,28 +5197,6 @@ tracing_thresh_write(struct file *filp, const char __user *ubuf,
return cnt;
}
-#ifdef CONFIG_TRACER_MAX_TRACE
-
-static ssize_t
-tracing_max_lat_read(struct file *filp, char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- struct trace_array *tr = filp->private_data;
-
- return tracing_nsecs_read(&tr->max_latency, ubuf, cnt, ppos);
-}
-
-static ssize_t
-tracing_max_lat_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- struct trace_array *tr = filp->private_data;
-
- return tracing_nsecs_write(&tr->max_latency, ubuf, cnt, ppos);
-}
-
-#endif
-
static int open_pipe_on_cpu(struct trace_array *tr, int cpu)
{
if (cpu == RING_BUFFER_ALL_CPUS) {
@@ -7097,6 +6529,11 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
if (ret)
return ret;
+ if ((file->f_mode & FMODE_WRITE) && trace_array_is_readonly(tr)) {
+ trace_array_put(tr);
+ return -EACCES;
+ }
+
ret = single_open(file, tracing_clock_show, inode->i_private);
if (ret < 0)
trace_array_put(tr);
@@ -7142,194 +6579,6 @@ u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_eve
return ring_buffer_event_time_stamp(buffer, rbe);
}
-struct ftrace_buffer_info {
- struct trace_iterator iter;
- void *spare;
- unsigned int spare_cpu;
- unsigned int spare_size;
- unsigned int read;
-};
-
-#ifdef CONFIG_TRACER_SNAPSHOT
-static int tracing_snapshot_open(struct inode *inode, struct file *file)
-{
- struct trace_array *tr = inode->i_private;
- struct trace_iterator *iter;
- struct seq_file *m;
- int ret;
-
- ret = tracing_check_open_get_tr(tr);
- if (ret)
- return ret;
-
- if (file->f_mode & FMODE_READ) {
- iter = __tracing_open(inode, file, true);
- if (IS_ERR(iter))
- ret = PTR_ERR(iter);
- } else {
- /* Writes still need the seq_file to hold the private data */
- ret = -ENOMEM;
- m = kzalloc_obj(*m);
- if (!m)
- goto out;
- iter = kzalloc_obj(*iter);
- if (!iter) {
- kfree(m);
- goto out;
- }
- ret = 0;
-
- iter->tr = tr;
- iter->array_buffer = &tr->snapshot_buffer;
- iter->cpu_file = tracing_get_cpu(inode);
- m->private = iter;
- file->private_data = m;
- }
-out:
- if (ret < 0)
- trace_array_put(tr);
-
- return ret;
-}
-
-static void tracing_swap_cpu_buffer(void *tr)
-{
- update_max_tr_single((struct trace_array *)tr, current, smp_processor_id());
-}
-
-static ssize_t
-tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
- loff_t *ppos)
-{
- struct seq_file *m = filp->private_data;
- struct trace_iterator *iter = m->private;
- struct trace_array *tr = iter->tr;
- unsigned long val;
- int ret;
-
- ret = tracing_update_buffers(tr);
- if (ret < 0)
- return ret;
-
- ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
- if (ret)
- return ret;
-
- guard(mutex)(&trace_types_lock);
-
- if (tracer_uses_snapshot(tr->current_trace))
- return -EBUSY;
-
- local_irq_disable();
- arch_spin_lock(&tr->max_lock);
- if (tr->cond_snapshot)
- ret = -EBUSY;
- arch_spin_unlock(&tr->max_lock);
- local_irq_enable();
- if (ret)
- return ret;
-
- switch (val) {
- case 0:
- if (iter->cpu_file != RING_BUFFER_ALL_CPUS)
- return -EINVAL;
- if (tr->allocated_snapshot)
- free_snapshot(tr);
- break;
- case 1:
-/* Only allow per-cpu swap if the ring buffer supports it */
-#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP
- if (iter->cpu_file != RING_BUFFER_ALL_CPUS)
- return -EINVAL;
-#endif
- if (tr->allocated_snapshot)
- ret = resize_buffer_duplicate_size(&tr->snapshot_buffer,
- &tr->array_buffer, iter->cpu_file);
-
- ret = tracing_arm_snapshot_locked(tr);
- if (ret)
- return ret;
-
- /* Now, we're going to swap */
- if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
- local_irq_disable();
- update_max_tr(tr, current, smp_processor_id(), NULL);
- local_irq_enable();
- } else {
- smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer,
- (void *)tr, 1);
- }
- tracing_disarm_snapshot(tr);
- break;
- default:
- if (tr->allocated_snapshot) {
- if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
- tracing_reset_online_cpus(&tr->snapshot_buffer);
- else
- tracing_reset_cpu(&tr->snapshot_buffer, iter->cpu_file);
- }
- break;
- }
-
- if (ret >= 0) {
- *ppos += cnt;
- ret = cnt;
- }
-
- return ret;
-}
-
-static int tracing_snapshot_release(struct inode *inode, struct file *file)
-{
- struct seq_file *m = file->private_data;
- int ret;
-
- ret = tracing_release(inode, file);
-
- if (file->f_mode & FMODE_READ)
- return ret;
-
- /* If write only, the seq_file is just a stub */
- if (m)
- kfree(m->private);
- kfree(m);
-
- return 0;
-}
-
-static int tracing_buffers_open(struct inode *inode, struct file *filp);
-static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf,
- size_t count, loff_t *ppos);
-static int tracing_buffers_release(struct inode *inode, struct file *file);
-static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos,
- struct pipe_inode_info *pipe, size_t len, unsigned int flags);
-
-static int snapshot_raw_open(struct inode *inode, struct file *filp)
-{
- struct ftrace_buffer_info *info;
- int ret;
-
- /* The following checks for tracefs lockdown */
- ret = tracing_buffers_open(inode, filp);
- if (ret < 0)
- return ret;
-
- info = filp->private_data;
-
- if (tracer_uses_snapshot(info->iter.trace)) {
- tracing_buffers_release(inode, filp);
- return -EBUSY;
- }
-
- info->iter.snapshot = true;
- info->iter.array_buffer = &info->iter.tr->snapshot_buffer;
-
- return ret;
-}
-
-#endif /* CONFIG_TRACER_SNAPSHOT */
-
-
static const struct file_operations tracing_thresh_fops = {
.open = tracing_open_generic,
.read = tracing_thresh_read,
@@ -7337,16 +6586,6 @@ static const struct file_operations tracing_thresh_fops = {
.llseek = generic_file_llseek,
};
-#ifdef CONFIG_TRACER_MAX_TRACE
-static const struct file_operations tracing_max_lat_fops = {
- .open = tracing_open_generic_tr,
- .read = tracing_max_lat_read,
- .write = tracing_max_lat_write,
- .llseek = generic_file_llseek,
- .release = tracing_release_generic_tr,
-};
-#endif
-
static const struct file_operations set_tracer_fops = {
.open = tracing_open_generic_tr,
.read = tracing_set_trace_read,
@@ -7433,24 +6672,6 @@ static const struct file_operations last_boot_fops = {
.release = tracing_seq_release,
};
-#ifdef CONFIG_TRACER_SNAPSHOT
-static const struct file_operations snapshot_fops = {
- .open = tracing_snapshot_open,
- .read = seq_read,
- .write = tracing_snapshot_write,
- .llseek = tracing_lseek,
- .release = tracing_snapshot_release,
-};
-
-static const struct file_operations snapshot_raw_fops = {
- .open = snapshot_raw_open,
- .read = tracing_buffers_read,
- .release = tracing_buffers_release,
- .splice_read = tracing_buffers_splice_read,
-};
-
-#endif /* CONFIG_TRACER_SNAPSHOT */
-
/*
* trace_min_max_write - Write a u64 value to a trace_min_max_param struct
* @filp: The active open file structure
@@ -7810,7 +7031,7 @@ static const struct file_operations tracing_err_log_fops = {
.release = tracing_err_log_release,
};
-static int tracing_buffers_open(struct inode *inode, struct file *filp)
+int tracing_buffers_open(struct inode *inode, struct file *filp)
{
struct trace_array *tr = inode->i_private;
struct ftrace_buffer_info *info;
@@ -7858,9 +7079,8 @@ tracing_buffers_poll(struct file *filp, poll_table *poll_table)
return trace_poll(iter, filp, poll_table);
}
-static ssize_t
-tracing_buffers_read(struct file *filp, char __user *ubuf,
- size_t count, loff_t *ppos)
+ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf,
+ size_t count, loff_t *ppos)
{
struct ftrace_buffer_info *info = filp->private_data;
struct trace_iterator *iter = &info->iter;
@@ -7961,7 +7181,7 @@ static int tracing_buffers_flush(struct file *file, fl_owner_t id)
return 0;
}
-static int tracing_buffers_release(struct inode *inode, struct file *file)
+int tracing_buffers_release(struct inode *inode, struct file *file)
{
struct ftrace_buffer_info *info = file->private_data;
struct trace_iterator *iter = &info->iter;
@@ -8035,10 +7255,9 @@ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i)
spd->partial[i].private = 0;
}
-static ssize_t
-tracing_buffers_splice_read(struct file *file, loff_t *ppos,
- struct pipe_inode_info *pipe, size_t len,
- unsigned int flags)
+ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
{
struct ftrace_buffer_info *info = file->private_data;
struct trace_iterator *iter = &info->iter;
@@ -8192,44 +7411,6 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned
return 0;
}
-#ifdef CONFIG_TRACER_SNAPSHOT
-static int get_snapshot_map(struct trace_array *tr)
-{
- int err = 0;
-
- /*
- * Called with mmap_lock held. lockdep would be unhappy if we would now
- * take trace_types_lock. Instead use the specific
- * snapshot_trigger_lock.
- */
- spin_lock(&tr->snapshot_trigger_lock);
-
- if (tr->snapshot || tr->mapped == UINT_MAX)
- err = -EBUSY;
- else
- tr->mapped++;
-
- spin_unlock(&tr->snapshot_trigger_lock);
-
- /* Wait for update_max_tr() to observe iter->tr->mapped */
- if (tr->mapped == 1)
- synchronize_rcu();
-
- return err;
-
-}
-static void put_snapshot_map(struct trace_array *tr)
-{
- spin_lock(&tr->snapshot_trigger_lock);
- if (!WARN_ON(!tr->mapped))
- tr->mapped--;
- spin_unlock(&tr->snapshot_trigger_lock);
-}
-#else
-static inline int get_snapshot_map(struct trace_array *tr) { return 0; }
-static inline void put_snapshot_map(struct trace_array *tr) { }
-#endif
-
/*
* This is called when a VMA is duplicated (e.g., on fork()) to increment
* the user_mapped counter without remapping pages.
@@ -8410,170 +7591,6 @@ static const struct file_operations tracing_dyn_info_fops = {
};
#endif /* CONFIG_DYNAMIC_FTRACE */
-#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE)
-static void
-ftrace_snapshot(unsigned long ip, unsigned long parent_ip,
- struct trace_array *tr, struct ftrace_probe_ops *ops,
- void *data)
-{
- tracing_snapshot_instance(tr);
-}
-
-static void
-ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip,
- struct trace_array *tr, struct ftrace_probe_ops *ops,
- void *data)
-{
- struct ftrace_func_mapper *mapper = data;
- long *count = NULL;
-
- if (mapper)
- count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
-
- if (count) {
-
- if (*count <= 0)
- return;
-
- (*count)--;
- }
-
- tracing_snapshot_instance(tr);
-}
-
-static int
-ftrace_snapshot_print(struct seq_file *m, unsigned long ip,
- struct ftrace_probe_ops *ops, void *data)
-{
- struct ftrace_func_mapper *mapper = data;
- long *count = NULL;
-
- seq_printf(m, "%ps:", (void *)ip);
-
- seq_puts(m, "snapshot");
-
- if (mapper)
- count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
-
- if (count)
- seq_printf(m, ":count=%ld\n", *count);
- else
- seq_puts(m, ":unlimited\n");
-
- return 0;
-}
-
-static int
-ftrace_snapshot_init(struct ftrace_probe_ops *ops, struct trace_array *tr,
- unsigned long ip, void *init_data, void **data)
-{
- struct ftrace_func_mapper *mapper = *data;
-
- if (!mapper) {
- mapper = allocate_ftrace_func_mapper();
- if (!mapper)
- return -ENOMEM;
- *data = mapper;
- }
-
- return ftrace_func_mapper_add_ip(mapper, ip, init_data);
-}
-
-static void
-ftrace_snapshot_free(struct ftrace_probe_ops *ops, struct trace_array *tr,
- unsigned long ip, void *data)
-{
- struct ftrace_func_mapper *mapper = data;
-
- if (!ip) {
- if (!mapper)
- return;
- free_ftrace_func_mapper(mapper, NULL);
- return;
- }
-
- ftrace_func_mapper_remove_ip(mapper, ip);
-}
-
-static struct ftrace_probe_ops snapshot_probe_ops = {
- .func = ftrace_snapshot,
- .print = ftrace_snapshot_print,
-};
-
-static struct ftrace_probe_ops snapshot_count_probe_ops = {
- .func = ftrace_count_snapshot,
- .print = ftrace_snapshot_print,
- .init = ftrace_snapshot_init,
- .free = ftrace_snapshot_free,
-};
-
-static int
-ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash,
- char *glob, char *cmd, char *param, int enable)
-{
- struct ftrace_probe_ops *ops;
- void *count = (void *)-1;
- char *number;
- int ret;
-
- if (!tr)
- return -ENODEV;
-
- /* hash funcs only work with set_ftrace_filter */
- if (!enable)
- return -EINVAL;
-
- ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops;
-
- if (glob[0] == '!') {
- ret = unregister_ftrace_function_probe_func(glob+1, tr, ops);
- if (!ret)
- tracing_disarm_snapshot(tr);
-
- return ret;
- }
-
- if (!param)
- goto out_reg;
-
- number = strsep(&param, ":");
-
- if (!strlen(number))
- goto out_reg;
-
- /*
- * We use the callback data field (which is a pointer)
- * as our counter.
- */
- ret = kstrtoul(number, 0, (unsigned long *)&count);
- if (ret)
- return ret;
-
- out_reg:
- ret = tracing_arm_snapshot(tr);
- if (ret < 0)
- return ret;
-
- ret = register_ftrace_function_probe(glob, tr, ops, count);
- if (ret < 0)
- tracing_disarm_snapshot(tr);
-
- return ret < 0 ? ret : 0;
-}
-
-static struct ftrace_func_command ftrace_snapshot_cmd = {
- .name = "snapshot",
- .func = ftrace_trace_snapshot_callback,
-};
-
-static __init int register_snapshot_cmd(void)
-{
- return register_ftrace_command(&ftrace_snapshot_cmd);
-}
-#else
-static inline __init int register_snapshot_cmd(void) { return 0; }
-#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */
-
static struct dentry *tracing_get_dentry(struct trace_array *tr)
{
/* Top directory uses NULL as the parent */
@@ -8606,7 +7623,7 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
return tr->percpu_dir;
}
-static struct dentry *
+struct dentry *
trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent,
void *data, long cpu, const struct file_operations *fops)
{
@@ -9366,8 +8383,7 @@ static void setup_trace_scratch(struct trace_array *tr,
memset(tscratch, 0, size);
}
-static int
-allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, unsigned long size)
+int allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size)
{
enum ring_buffer_flags rb_flags;
struct trace_scratch *tscratch;
@@ -9406,8 +8422,8 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, unsigned
}
/* Allocate the first page for all buffers */
- set_buffer_entries(&tr->array_buffer,
- ring_buffer_size(tr->array_buffer.buffer, 0));
+ trace_set_buffer_entries(&tr->array_buffer,
+ ring_buffer_size(tr->array_buffer.buffer, 0));
return 0;
}
@@ -9430,23 +8446,11 @@ static int allocate_trace_buffers(struct trace_array *tr, unsigned long size)
if (ret)
return ret;
-#ifdef CONFIG_TRACER_SNAPSHOT
- /* Fix mapped buffer trace arrays do not have snapshot buffers */
- if (tr->range_addr_start)
- return 0;
-
- ret = allocate_trace_buffer(tr, &tr->snapshot_buffer,
- allocate_snapshot ? size : 1);
- if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) {
+ ret = trace_allocate_snapshot(tr, size);
+ if (MEM_FAIL(ret, "Failed to allocate trace buffer\n"))
free_trace_buffer(&tr->array_buffer);
- return -ENOMEM;
- }
- tr->allocated_snapshot = allocate_snapshot;
- allocate_snapshot = false;
-#endif
-
- return 0;
+ return ret;
}
static void free_trace_buffers(struct trace_array *tr)
@@ -9527,8 +8531,8 @@ struct trace_array *trace_array_find_get(const char *instance)
guard(mutex)(&trace_types_lock);
tr = trace_array_find(instance);
- if (tr)
- tr->ref++;
+ if (tr && __trace_array_get(tr) < 0)
+ tr = NULL;
return tr;
}
@@ -9625,6 +8629,8 @@ trace_array_create_systems(const char *name, const char *systems,
if (ftrace_allocate_ftrace_ops(tr) < 0)
goto out_free_tr;
+ trace_array_init_autoremove(tr);
+
ftrace_init_trace_array(tr);
init_trace_flags_index(tr);
@@ -9735,7 +8741,9 @@ struct trace_array *trace_array_get_by_name(const char *name, const char *system
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
if (tr->name && strcmp(tr->name, name) == 0) {
- tr->ref++;
+ /* if this fails, @tr is going to be removed. */
+ if (__trace_array_get(tr) < 0)
+ tr = NULL;
return tr;
}
}
@@ -9774,6 +8782,7 @@ static int __remove_instance(struct trace_array *tr)
set_tracer_flag(tr, 1ULL << i, 0);
}
+ trace_array_cancel_autoremove(tr);
tracing_set_nop(tr);
clear_ftrace_function_probes(tr);
event_trace_del_tracer(tr);
@@ -9866,17 +8875,22 @@ static __init void create_trace_instances(struct dentry *d_tracer)
static void
init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
{
+ umode_t writable_mode = TRACE_MODE_WRITE;
int cpu;
+ if (trace_array_is_readonly(tr))
+ writable_mode = TRACE_MODE_READ;
+
trace_create_file("available_tracers", TRACE_MODE_READ, d_tracer,
- tr, &show_traces_fops);
+ tr, &show_traces_fops);
- trace_create_file("current_tracer", TRACE_MODE_WRITE, d_tracer,
- tr, &set_tracer_fops);
+ trace_create_file("current_tracer", writable_mode, d_tracer,
+ tr, &set_tracer_fops);
- trace_create_file("tracing_cpumask", TRACE_MODE_WRITE, d_tracer,
+ trace_create_file("tracing_cpumask", writable_mode, d_tracer,
tr, &tracing_cpumask_fops);
+ /* Options are used for changing the print format, even for a readonly instance. */
trace_create_file("trace_options", TRACE_MODE_WRITE, d_tracer,
tr, &tracing_iter_fops);
@@ -9886,12 +8900,36 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("trace_pipe", TRACE_MODE_READ, d_tracer,
tr, &tracing_pipe_fops);
- trace_create_file("buffer_size_kb", TRACE_MODE_WRITE, d_tracer,
+ trace_create_file("buffer_size_kb", writable_mode, d_tracer,
tr, &tracing_entries_fops);
trace_create_file("buffer_total_size_kb", TRACE_MODE_READ, d_tracer,
tr, &tracing_total_entries_fops);
+ trace_create_file("trace_clock", writable_mode, d_tracer, tr,
+ &trace_clock_fops);
+
+ trace_create_file("timestamp_mode", TRACE_MODE_READ, d_tracer, tr,
+ &trace_time_stamp_mode_fops);
+
+ tr->buffer_percent = 50;
+
+ trace_create_file("buffer_subbuf_size_kb", writable_mode, d_tracer,
+ tr, &buffer_subbuf_size_fops);
+
+ create_trace_options_dir(tr);
+
+ if (tr->range_addr_start)
+ trace_create_file("last_boot_info", TRACE_MODE_READ, d_tracer,
+ tr, &last_boot_fops);
+
+ for_each_tracing_cpu(cpu)
+ tracing_init_tracefs_percpu(tr, cpu);
+
+ /* A read-only instance has only the files created above. */
+ if (trace_array_is_readonly(tr))
+ return;
+
trace_create_file("free_buffer", 0200, d_tracer,
tr, &tracing_free_buffer_fops);
@@ -9903,49 +8941,29 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("trace_marker_raw", 0220, d_tracer,
tr, &tracing_mark_raw_fops);
- trace_create_file("trace_clock", TRACE_MODE_WRITE, d_tracer, tr,
- &trace_clock_fops);
-
- trace_create_file("tracing_on", TRACE_MODE_WRITE, d_tracer,
- tr, &rb_simple_fops);
-
- trace_create_file("timestamp_mode", TRACE_MODE_READ, d_tracer, tr,
- &trace_time_stamp_mode_fops);
-
- tr->buffer_percent = 50;
-
trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer,
- tr, &buffer_percent_fops);
-
- trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer,
- tr, &buffer_subbuf_size_fops);
+ tr, &buffer_percent_fops);
trace_create_file("syscall_user_buf_size", TRACE_MODE_WRITE, d_tracer,
- tr, &tracing_syscall_buf_fops);
+ tr, &tracing_syscall_buf_fops);
- create_trace_options_dir(tr);
+ trace_create_file("tracing_on", TRACE_MODE_WRITE, d_tracer,
+ tr, &rb_simple_fops);
trace_create_maxlat_file(tr, d_tracer);
if (ftrace_create_function_files(tr, d_tracer))
MEM_FAIL(1, "Could not allocate function filter files");
- if (tr->range_addr_start) {
- trace_create_file("last_boot_info", TRACE_MODE_READ, d_tracer,
- tr, &last_boot_fops);
#ifdef CONFIG_TRACER_SNAPSHOT
- } else {
+ if (!tr->range_addr_start)
trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer,
tr, &snapshot_fops);
#endif
- }
trace_create_file("error_log", TRACE_MODE_WRITE, d_tracer,
tr, &tracing_err_log_fops);
- for_each_tracing_cpu(cpu)
- tracing_init_tracefs_percpu(tr, cpu);
-
ftrace_init_tracefs(tr, d_tracer);
}
@@ -10554,47 +9572,6 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
return done;
}
-#ifdef CONFIG_TRACER_SNAPSHOT
-__init static bool tr_needs_alloc_snapshot(const char *name)
-{
- char *test;
- int len = strlen(name);
- bool ret;
-
- if (!boot_snapshot_index)
- return false;
-
- if (strncmp(name, boot_snapshot_info, len) == 0 &&
- boot_snapshot_info[len] == '\t')
- return true;
-
- test = kmalloc(strlen(name) + 3, GFP_KERNEL);
- if (!test)
- return false;
-
- sprintf(test, "\t%s\t", name);
- ret = strstr(boot_snapshot_info, test) == NULL;
- kfree(test);
- return ret;
-}
-
-__init static void do_allocate_snapshot(const char *name)
-{
- if (!tr_needs_alloc_snapshot(name))
- return;
-
- /*
- * When allocate_snapshot is set, the next call to
- * allocate_trace_buffers() (called by trace_array_get_by_name())
- * will allocate the snapshot buffer. That will also clear
- * this flag.
- */
- allocate_snapshot = true;
-}
-#else
-static inline void do_allocate_snapshot(const char *name) { }
-#endif
-
__init static int backup_instance_area(const char *backup,
unsigned long *addr, phys_addr_t *size)
{
@@ -10744,8 +9721,7 @@ __init static void enable_instances(void)
}
} else {
/* Only non mapped buffers have snapshot buffers */
- if (IS_ENABLED(CONFIG_TRACER_SNAPSHOT))
- do_allocate_snapshot(name);
+ do_allocate_snapshot(name);
}
tr = trace_array_create_systems(name, NULL, addr, size);
@@ -10771,17 +9747,41 @@ __init static void enable_instances(void)
/*
* Backup buffers can be freed but need vfree().
*/
- if (backup)
- tr->flags |= TRACE_ARRAY_FL_VMALLOC;
+ if (backup) {
+ tr->flags |= TRACE_ARRAY_FL_VMALLOC | TRACE_ARRAY_FL_RDONLY;
+ trace_array_start_autoremove();
+ }
if (start || backup) {
tr->flags |= TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT;
tr->range_name = no_free_ptr(rname);
}
+ /*
+ * Save the events to start and enable them after all boot instances
+ * have been created.
+ */
+ tr->boot_events = curr_str;
+ }
+
+ /* Enable the events after all boot instances have been created */
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+
+ if (!tr->boot_events || !(*tr->boot_events)) {
+ tr->boot_events = NULL;
+ continue;
+ }
+
+ curr_str = tr->boot_events;
+
+ /* Clear the instance if this is a persistent buffer */
+ if (tr->flags & TRACE_ARRAY_FL_LAST_BOOT)
+ update_last_data(tr);
+
while ((tok = strsep(&curr_str, ","))) {
early_enable_events(tr, tok, true);
}
+ tr->boot_events = NULL;
}
}
@@ -10937,24 +9937,6 @@ struct trace_array *trace_get_global_array(void)
}
#endif
-void __init ftrace_boot_snapshot(void)
-{
-#ifdef CONFIG_TRACER_SNAPSHOT
- struct trace_array *tr;
-
- if (!snapshot_at_boot)
- return;
-
- list_for_each_entry(tr, &ftrace_trace_arrays, list) {
- if (!tr->allocated_snapshot)
- continue;
-
- tracing_snapshot_instance(tr);
- trace_array_puts(tr, "** Boot snapshot taken **\n");
- }
-#endif
-}
-
void __init early_trace_init(void)
{
if (tracepoint_printk) {
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b8f3804586a0..80fe152af1dd 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -264,6 +264,7 @@ static inline bool still_need_pid_events(int type, struct trace_pid_list *pid_li
typedef bool (*cond_update_fn_t)(struct trace_array *tr, void *cond_data);
+#ifdef CONFIG_TRACER_SNAPSHOT
/**
* struct cond_snapshot - conditional snapshot data and callback
*
@@ -306,6 +307,7 @@ struct cond_snapshot {
void *cond_data;
cond_update_fn_t update;
};
+#endif /* CONFIG_TRACER_SNAPSHOT */
/*
* struct trace_func_repeats - used to keep track of the consecutive
@@ -405,7 +407,10 @@ struct trace_array {
unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE];
unsigned int flags;
raw_spinlock_t start_lock;
- const char *system_names;
+ union {
+ const char *system_names;
+ char *boot_events;
+ };
struct list_head err_log;
struct dentry *dir;
struct dentry *options;
@@ -453,6 +458,12 @@ struct trace_array {
* we do not waste memory on systems that are not using tracing.
*/
bool ring_buffer_expanded;
+ /*
+ * If the ring buffer is a read-only backup instance, it will be removed
+ * after all its data is dumped via pipe, as no readable data remains.
+ */
+ bool free_on_close;
+ struct work_struct autoremove_work;
};
enum {
@@ -462,6 +473,7 @@ enum {
TRACE_ARRAY_FL_MOD_INIT = BIT(3),
TRACE_ARRAY_FL_MEMMAP = BIT(4),
TRACE_ARRAY_FL_VMALLOC = BIT(5),
+ TRACE_ARRAY_FL_RDONLY = BIT(6),
};
#ifdef CONFIG_MODULES
@@ -491,6 +503,12 @@ extern unsigned long trace_adjust_address(struct trace_array *tr, unsigned long
extern struct trace_array *printk_trace;
+static inline bool trace_array_is_readonly(struct trace_array *tr)
+{
+ /* backup instance is read only. */
+ return tr->flags & TRACE_ARRAY_FL_RDONLY;
+}
+
/*
* The global tracer (top) should be the first trace array added,
* but we check the flag anyway.
@@ -675,6 +693,7 @@ void tracing_reset_all_online_cpus(void);
void tracing_reset_all_online_cpus_unlocked(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
int tracing_open_generic_tr(struct inode *inode, struct file *filp);
+int tracing_release(struct inode *inode, struct file *file);
int tracing_release_generic_tr(struct inode *inode, struct file *file);
int tracing_open_file_tr(struct inode *inode, struct file *filp);
int tracing_release_file_tr(struct inode *inode, struct file *filp);
@@ -684,12 +703,54 @@ void tracer_tracing_on(struct trace_array *tr);
void tracer_tracing_off(struct trace_array *tr);
void tracer_tracing_disable(struct trace_array *tr);
void tracer_tracing_enable(struct trace_array *tr);
+int allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size);
struct dentry *trace_create_file(const char *name,
umode_t mode,
struct dentry *parent,
void *data,
const struct file_operations *fops);
+struct dentry *trace_create_cpu_file(const char *name,
+ umode_t mode,
+ struct dentry *parent,
+ void *data,
+ long cpu,
+ const struct file_operations *fops);
+
+struct trace_iterator *__tracing_open(struct inode *inode, struct file *file,
+ bool snapshot);
+int tracing_buffers_open(struct inode *inode, struct file *filp);
+ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf,
+ size_t count, loff_t *ppos);
+int tracing_buffers_release(struct inode *inode, struct file *file);
+ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len, unsigned int flags);
+
+ssize_t tracing_nsecs_read(unsigned long *ptr, char __user *ubuf,
+ size_t cnt, loff_t *ppos);
+ssize_t tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf,
+ size_t cnt, loff_t *ppos);
+
+void trace_set_buffer_entries(struct array_buffer *buf, unsigned long val);
+/*
+ * Should be used after trace_array_get(), trace_types_lock
+ * ensures that i_cdev was already initialized.
+ */
+static inline int tracing_get_cpu(struct inode *inode)
+{
+ if (inode->i_cdev) /* See trace_create_cpu_file() */
+ return (long)inode->i_cdev - 1;
+ return RING_BUFFER_ALL_CPUS;
+}
+void tracing_reset_cpu(struct array_buffer *buf, int cpu);
+
+struct ftrace_buffer_info {
+ struct trace_iterator iter;
+ void *spare;
+ unsigned int spare_cpu;
+ unsigned int spare_size;
+ unsigned int read;
+};
/**
* tracer_tracing_is_on_cpu - show real state of ring buffer enabled on for a cpu
@@ -806,13 +867,13 @@ void update_max_tr_single(struct trace_array *tr,
#if defined(CONFIG_TRACER_MAX_TRACE) && defined(CONFIG_FSNOTIFY)
# define LATENCY_FS_NOTIFY
#endif
+#endif /* CONFIG_TRACER_SNAPSHOT */
#ifdef LATENCY_FS_NOTIFY
void latency_fsnotify(struct trace_array *tr);
#else
static inline void latency_fsnotify(struct trace_array *tr) { }
#endif
-#endif /* CONFIG_TRACER_SNAPSHOT */
#ifdef CONFIG_STACKTRACE
void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, int skip);
@@ -828,11 +889,15 @@ static inline bool tracer_uses_snapshot(struct tracer *tracer)
{
return tracer->use_max_tr;
}
+void trace_create_maxlat_file(struct trace_array *tr,
+ struct dentry *d_tracer);
#else
static inline bool tracer_uses_snapshot(struct tracer *tracer)
{
return false;
}
+static inline void trace_create_maxlat_file(struct trace_array *tr,
+ struct dentry *d_tracer) { }
#endif
void trace_last_func_repeats(struct trace_array *tr,
@@ -862,6 +927,8 @@ extern int DYN_FTRACE_TEST_NAME(void);
#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
extern int DYN_FTRACE_TEST_NAME2(void);
+void __init trace_append_boot_param(char *buf, const char *str,
+ char sep, int size);
extern void trace_set_ring_buffer_expanded(struct trace_array *tr);
extern bool tracing_selftest_disabled;
@@ -1802,11 +1869,6 @@ extern struct trace_event_file *find_event_file(struct trace_array *tr,
const char *system,
const char *event);
-static inline void *event_file_data(struct file *filp)
-{
- return READ_ONCE(file_inode(filp)->i_private);
-}
-
extern struct mutex event_mutex;
extern struct list_head ftrace_events;
@@ -1827,12 +1889,22 @@ static inline struct trace_event_file *event_file_file(struct file *filp)
struct trace_event_file *file;
lockdep_assert_held(&event_mutex);
- file = READ_ONCE(file_inode(filp)->i_private);
+ file = file_inode(filp)->i_private;
if (!file || file->flags & EVENT_FILE_FL_FREED)
return NULL;
return file;
}
+static inline void *event_file_data(struct file *filp)
+{
+ struct trace_event_file *file;
+
+ lockdep_assert_held(&event_mutex);
+ file = file_inode(filp)->i_private;
+ WARN_ON(!file || file->flags & EVENT_FILE_FL_FREED);
+ return file;
+}
+
extern const struct file_operations event_trigger_fops;
extern const struct file_operations event_hist_fops;
extern const struct file_operations event_hist_debug_fops;
@@ -2135,12 +2207,6 @@ static inline bool event_command_needs_rec(struct event_command *cmd_ops)
extern int trace_event_enable_disable(struct trace_event_file *file,
int enable, int soft_disable);
-extern int tracing_alloc_snapshot(void);
-extern void tracing_snapshot_cond(struct trace_array *tr, void *cond_data);
-extern int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update);
-
-extern int tracing_snapshot_cond_disable(struct trace_array *tr);
-extern void *tracing_cond_snapshot_data(struct trace_array *tr);
extern const char *__start___trace_bprintk_fmt[];
extern const char *__stop___trace_bprintk_fmt[];
@@ -2228,19 +2294,71 @@ static inline void trace_event_update_all(struct trace_eval_map **map, int len)
#endif
#ifdef CONFIG_TRACER_SNAPSHOT
+extern const struct file_operations snapshot_fops;
+extern const struct file_operations snapshot_raw_fops;
+
+/* Used when creating instances */
+int trace_allocate_snapshot(struct trace_array *tr, int size);
+
+int tracing_alloc_snapshot(void);
+void tracing_snapshot_cond(struct trace_array *tr, void *cond_data);
+int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update);
+int tracing_snapshot_cond_disable(struct trace_array *tr);
+void *tracing_cond_snapshot_data(struct trace_array *tr);
void tracing_snapshot_instance(struct trace_array *tr);
int tracing_alloc_snapshot_instance(struct trace_array *tr);
+int tracing_arm_snapshot_locked(struct trace_array *tr);
int tracing_arm_snapshot(struct trace_array *tr);
void tracing_disarm_snapshot(struct trace_array *tr);
-#else
+void free_snapshot(struct trace_array *tr);
+void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter);
+int get_snapshot_map(struct trace_array *tr);
+void put_snapshot_map(struct trace_array *tr);
+int resize_buffer_duplicate_size(struct array_buffer *trace_buf,
+ struct array_buffer *size_buf, int cpu_id);
+__init void do_allocate_snapshot(const char *name);
+# ifdef CONFIG_DYNAMIC_FTRACE
+__init int register_snapshot_cmd(void);
+# else
+static inline int register_snapshot_cmd(void) { return 0; }
+# endif
+#else /* !CONFIG_TRACER_SNAPSHOT */
+static inline int trace_allocate_snapshot(struct trace_array *tr, int size) { return 0; }
static inline void tracing_snapshot_instance(struct trace_array *tr) { }
static inline int tracing_alloc_snapshot_instance(struct trace_array *tr)
{
return 0;
}
+static inline int tracing_arm_snapshot_locked(struct trace_array *tr) { return -EBUSY; }
static inline int tracing_arm_snapshot(struct trace_array *tr) { return 0; }
static inline void tracing_disarm_snapshot(struct trace_array *tr) { }
-#endif
+static inline void free_snapshot(struct trace_array *tr) {}
+static inline void tracing_snapshot_cond(struct trace_array *tr, void *cond_data)
+{
+ WARN_ONCE(1, "Snapshot feature not enabled, but internal conditional snapshot used");
+}
+static inline void *tracing_cond_snapshot_data(struct trace_array *tr)
+{
+ return NULL;
+}
+static inline int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update)
+{
+ return -ENODEV;
+}
+static inline int tracing_snapshot_cond_disable(struct trace_array *tr)
+{
+ return false;
+}
+static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
+{
+ /* Should never be called */
+ WARN_ONCE(1, "Snapshot print function called without snapshot configured");
+}
+static inline int get_snapshot_map(struct trace_array *tr) { return 0; }
+static inline void put_snapshot_map(struct trace_array *tr) { }
+static inline void do_allocate_snapshot(const char *name) { }
+static inline int register_snapshot_cmd(void) { return 0; }
+#endif /* CONFIG_TRACER_SNAPSHOT */
#ifdef CONFIG_PREEMPT_TRACER
void tracer_preempt_on(unsigned long a0, unsigned long a1);
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index dbe29b4c6a7a..2ca2541c8a58 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -61,7 +61,8 @@ trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node)
v = memparse(p, NULL);
if (v < PAGE_SIZE)
pr_err("Buffer size is too small: %s\n", p);
- if (tracing_resize_ring_buffer(tr, v, RING_BUFFER_ALL_CPUS) < 0)
+ if (trace_array_is_readonly(tr) ||
+ tracing_resize_ring_buffer(tr, v, RING_BUFFER_ALL_CPUS) < 0)
pr_err("Failed to resize trace buffer to %s\n", p);
}
@@ -597,7 +598,7 @@ trace_boot_enable_tracer(struct trace_array *tr, struct xbc_node *node)
p = xbc_node_find_value(node, "tracer", NULL);
if (p && *p != '\0') {
- if (tracing_set_tracer(tr, p) < 0)
+ if (trace_array_is_readonly(tr) || tracing_set_tracer(tr, p) < 0)
pr_err("Failed to set given tracer: %s\n", p);
}
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 6809b370e991..d1564db95a8f 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -373,10 +373,10 @@ __init static int init_annotated_branch_stats(void)
int ret;
ret = register_stat_tracer(&annotated_branch_stats);
- if (!ret) {
+ if (ret) {
printk(KERN_WARNING "Warning: could not register "
"annotated branches stats\n");
- return 1;
+ return ret;
}
return 0;
}
@@ -438,10 +438,10 @@ __init static int all_annotated_branch_stats(void)
int ret;
ret = register_stat_tracer(&all_branch_stats);
- if (!ret) {
+ if (ret) {
printk(KERN_WARNING "Warning: could not register "
"all branches stats\n");
- return 1;
+ return ret;
}
return 0;
}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 249d1cba72c0..c46e623e7e0d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1401,6 +1401,9 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
{
int ret;
+ if (trace_array_is_readonly(tr))
+ return -EACCES;
+
mutex_lock(&event_mutex);
ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set, mod);
mutex_unlock(&event_mutex);
@@ -1718,7 +1721,7 @@ static int t_show_filters(struct seq_file *m, void *v)
len = get_call_len(call);
- seq_printf(m, "%s:%s%*.s%s\n", call->class->system,
+ seq_printf(m, "%s:%s%*s%s\n", call->class->system,
trace_event_name(call), len, "", filter->filter_string);
return 0;
@@ -1750,7 +1753,7 @@ static int t_show_triggers(struct seq_file *m, void *v)
len = get_call_len(call);
list_for_each_entry_rcu(data, &file->triggers, list) {
- seq_printf(m, "%s:%s%*.s", call->class->system,
+ seq_printf(m, "%s:%s%*s", call->class->system,
trace_event_name(call), len, "");
data->cmd_ops->print(m, data);
@@ -2184,12 +2187,12 @@ static int trace_format_open(struct inode *inode, struct file *file)
static ssize_t
event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
{
- int id = (long)event_file_data(filp);
+ /* id is directly in i_private and available for inode's lifetime. */
+ int id = (long)file_inode(filp)->i_private;
char buf[32];
int len;
- if (unlikely(!id))
- return -ENODEV;
+ WARN_ON(!id);
len = sprintf(buf, "%d\n", id);
@@ -2247,12 +2250,8 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
mutex_lock(&event_mutex);
file = event_file_file(filp);
- if (file) {
- if (file->flags & EVENT_FILE_FL_FREED)
- err = -ENODEV;
- else
- err = apply_event_filter(file, buf);
- }
+ if (file)
+ err = apply_event_filter(file, buf);
mutex_unlock(&event_mutex);
kfree(buf);
@@ -2973,8 +2972,8 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
} else
__get_system(system);
- /* ftrace only has directories no files */
- if (strcmp(name, "ftrace") == 0)
+ /* ftrace only has directories, no files; the same goes for a readonly instance. */
+ if (strcmp(name, "ftrace") == 0 || trace_array_is_readonly(tr))
nr_entries = 0;
else
nr_entries = ARRAY_SIZE(system_entries);
@@ -3139,28 +3138,30 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
int ret;
static struct eventfs_entry event_entries[] = {
{
- .name = "enable",
+ .name = "format",
.callback = event_callback,
- .release = event_release,
},
+#ifdef CONFIG_PERF_EVENTS
{
- .name = "filter",
+ .name = "id",
.callback = event_callback,
},
+#endif
+#define NR_RO_EVENT_ENTRIES (1 + IS_ENABLED(CONFIG_PERF_EVENTS))
+/* Readonly files must be above this line and counted by NR_RO_EVENT_ENTRIES. */
{
- .name = "trigger",
+ .name = "enable",
.callback = event_callback,
+ .release = event_release,
},
{
- .name = "format",
+ .name = "filter",
.callback = event_callback,
},
-#ifdef CONFIG_PERF_EVENTS
{
- .name = "id",
+ .name = "trigger",
.callback = event_callback,
},
-#endif
#ifdef CONFIG_HIST_TRIGGERS
{
.name = "hist",
@@ -3193,7 +3194,10 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
if (!e_events)
return -ENOMEM;
- nr_entries = ARRAY_SIZE(event_entries);
+ if (trace_array_is_readonly(tr))
+ nr_entries = NR_RO_EVENT_ENTRIES;
+ else
+ nr_entries = ARRAY_SIZE(event_entries);
name = trace_event_name(call);
ei = eventfs_create_dir(name, e_events, event_entries, nr_entries, file);
@@ -3679,20 +3683,27 @@ static struct boot_triggers {
} bootup_triggers[MAX_BOOT_TRIGGERS];
static char bootup_trigger_buf[COMMAND_LINE_SIZE];
+static int boot_trigger_buf_len;
static int nr_boot_triggers;
static __init int setup_trace_triggers(char *str)
{
char *trigger;
char *buf;
+ int len = boot_trigger_buf_len;
int i;
- strscpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE);
+ if (len >= COMMAND_LINE_SIZE)
+ return 1;
+
+ strscpy(bootup_trigger_buf + len, str, COMMAND_LINE_SIZE - len);
trace_set_ring_buffer_expanded(NULL);
disable_tracing_selftest("running event triggers");
- buf = bootup_trigger_buf;
- for (i = 0; i < MAX_BOOT_TRIGGERS; i++) {
+ buf = bootup_trigger_buf + len;
+ boot_trigger_buf_len += strlen(buf) + 1;
+
+ for (i = nr_boot_triggers; i < MAX_BOOT_TRIGGERS; i++) {
trigger = strsep(&buf, ",");
if (!trigger)
break;
@@ -4536,31 +4547,44 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
int nr_entries;
static struct eventfs_entry events_entries[] = {
{
- .name = "enable",
+ .name = "header_page",
.callback = events_callback,
},
{
- .name = "header_page",
+ .name = "header_event",
.callback = events_callback,
},
+#define NR_RO_TOP_ENTRIES 2
+/* Readonly files must be above this line and counted by NR_RO_TOP_ENTRIES. */
{
- .name = "header_event",
+ .name = "enable",
.callback = events_callback,
},
};
- entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent,
- tr, &ftrace_set_event_fops);
- if (!entry)
- return -ENOMEM;
+ if (!trace_array_is_readonly(tr)) {
+ entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent,
+ tr, &ftrace_set_event_fops);
+ if (!entry)
+ return -ENOMEM;
+
+ /* These are not as crucial, just warn if they are not created */
+ trace_create_file("show_event_filters", TRACE_MODE_READ, parent, tr,
+ &ftrace_show_event_filters_fops);
- trace_create_file("show_event_filters", TRACE_MODE_READ, parent, tr,
- &ftrace_show_event_filters_fops);
+ trace_create_file("show_event_triggers", TRACE_MODE_READ, parent, tr,
+ &ftrace_show_event_triggers_fops);
- trace_create_file("show_event_triggers", TRACE_MODE_READ, parent, tr,
- &ftrace_show_event_triggers_fops);
+ trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent,
+ tr, &ftrace_set_event_pid_fops);
- nr_entries = ARRAY_SIZE(events_entries);
+ trace_create_file("set_event_notrace_pid",
+ TRACE_MODE_WRITE, parent, tr,
+ &ftrace_set_event_notrace_pid_fops);
+ nr_entries = ARRAY_SIZE(events_entries);
+ } else {
+ nr_entries = NR_RO_TOP_ENTRIES;
+ }
e_events = eventfs_create_events_dir("events", parent, events_entries,
nr_entries, tr);
@@ -4569,15 +4593,6 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
return -ENOMEM;
}
- /* There are not as crucial, just warn if they are not created */
-
- trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent,
- tr, &ftrace_set_event_pid_fops);
-
- trace_create_file("set_event_notrace_pid",
- TRACE_MODE_WRITE, parent, tr,
- &ftrace_set_event_notrace_pid_fops);
-
tr->event_dir = e_events;
return 0;
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 73ea180cad55..eb2c2bc8bc3d 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1361,13 +1361,16 @@ static const char *hist_field_name(struct hist_field *field,
field->flags & HIST_FIELD_FL_VAR_REF) {
if (field->system) {
static char full_name[MAX_FILTER_STR_VAL];
+ static char *fmt;
+ int len;
- strcat(full_name, field->system);
- strcat(full_name, ".");
- strcat(full_name, field->event_name);
- strcat(full_name, ".");
- strcat(full_name, field->name);
- field_name = full_name;
+ fmt = field->flags & HIST_FIELD_FL_VAR_REF ? "%s.%s.$%s" : "%s.%s.%s";
+
+ len = snprintf(full_name, sizeof(full_name), fmt,
+ field->system, field->event_name,
+ field->name);
+ if (len < sizeof(full_name))
+ field_name = full_name;
} else
field_name = field->name;
} else if (field->flags & HIST_FIELD_FL_TIMESTAMP)
@@ -1740,9 +1743,10 @@ static const char *get_hist_field_flags(struct hist_field *hist_field)
static void expr_field_str(struct hist_field *field, char *expr)
{
- if (field->flags & HIST_FIELD_FL_VAR_REF)
- strcat(expr, "$");
- else if (field->flags & HIST_FIELD_FL_CONST) {
+ if (field->flags & HIST_FIELD_FL_VAR_REF) {
+ if (!field->system)
+ strcat(expr, "$");
+ } else if (field->flags & HIST_FIELD_FL_CONST) {
char str[HIST_CONST_DIGITS_MAX];
snprintf(str, HIST_CONST_DIGITS_MAX, "%llu", field->constant);
@@ -5836,8 +5840,6 @@ static int event_hist_open(struct inode *inode, struct file *file)
hist_file->file = file;
hist_file->last_act = get_hist_hit_count(event_file);
- /* Clear private_data to avoid warning in single_open() */
- file->private_data = NULL;
ret = single_open(file, hist_show, hist_file);
if (ret) {
kfree(hist_file);
@@ -6126,8 +6128,6 @@ static int event_hist_debug_open(struct inode *inode, struct file *file)
if (ret)
return ret;
- /* Clear private_data to avoid warning in single_open() */
- file->private_data = NULL;
ret = single_open(file, hist_debug_show, file);
if (ret)
tracing_release_file_tr(inode, file);
@@ -6158,7 +6158,8 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
else if (field_name) {
if (hist_field->flags & HIST_FIELD_FL_VAR_REF ||
hist_field->flags & HIST_FIELD_FL_ALIAS)
- seq_putc(m, '$');
+ if (!hist_field->system)
+ seq_putc(m, '$');
seq_printf(m, "%s", field_name);
} else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP)
seq_puts(m, "common_timestamp");
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 8bb95b2a6fcf..39ac4eba0702 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -395,7 +395,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
n_u64++;
} else {
struct trace_print_flags __flags[] = {
- __def_gfpflag_names, {-1, NULL} };
+ __def_gfpflag_names };
char *space = (i == se->n_fields - 1 ? "" : " ");
print_synth_event_num_val(s, print_fmt,
@@ -408,7 +408,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
trace_seq_puts(s, " (");
trace_print_flags_seq(s, "|",
entry->fields[n_u64].as_u64,
- __flags);
+ __flags, ARRAY_SIZE(__flags));
trace_seq_putc(s, ')');
}
n_u64++;
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index d5230b759a2d..655db2e82513 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -22,6 +22,39 @@ static struct task_struct *trigger_kthread;
static struct llist_head trigger_data_free_list;
static DEFINE_MUTEX(trigger_data_kthread_mutex);
+static int trigger_kthread_fn(void *ignore);
+
+/*
+ * Create the "trigger_data_free" kthread unless it already exists.
+ *
+ * The caller must hold trigger_data_kthread_mutex. trigger_kthread is only
+ * published when kthread_create() succeeds, so callers have to check it again
+ * afterwards. The thread is not woken up here.
+ */
+static void trigger_create_kthread_locked(void)
+{
+	struct task_struct *task;
+
+	lockdep_assert_held(&trigger_data_kthread_mutex);
+
+	if (trigger_kthread)
+		return;
+
+	task = kthread_create(trigger_kthread_fn, NULL, "trigger_data_free");
+	if (IS_ERR(task))
+		return;
+
+	WRITE_ONCE(trigger_kthread, task);
+}
+
+/*
+ * Synchronously free every event_trigger_data queued on
+ * trigger_data_free_list.
+ *
+ * The caller must hold trigger_data_kthread_mutex. The whole list is detached
+ * first, then tracepoint_synchronize_unregister() is called once for the
+ * batch before any entry is freed. Nothing is done when the list is empty.
+ */
+static void trigger_data_free_queued_locked(void)
+{
+ struct event_trigger_data *data, *tmp;
+ struct llist_node *llnodes;
+
+ lockdep_assert_held(&trigger_data_kthread_mutex);
+
+ /* Take ownership of everything queued so far */
+ llnodes = llist_del_all(&trigger_data_free_list);
+ if (!llnodes)
+ return;
+
+ tracepoint_synchronize_unregister();
+
+ /* _safe variant: each entry is freed while walking the list */
+ llist_for_each_entry_safe(data, tmp, llnodes, llist)
+ kfree(data);
+}
+
/* Bulk garbage collection of event_trigger_data elements */
static int trigger_kthread_fn(void *ignore)
{
@@ -56,30 +89,50 @@ void trigger_data_free(struct event_trigger_data *data)
if (data->cmd_ops->set_filter)
data->cmd_ops->set_filter(NULL, data, NULL);
+ /*
+ * Boot-time trigger registration can fail before kthread creation
+ * works. Keep the deferred-free semantics during boot and let late
+ * init start the kthread to drain the list.
+ */
+ if (system_state == SYSTEM_BOOTING && !trigger_kthread) {
+ llist_add(&data->llist, &trigger_data_free_list);
+ return;
+ }
+
if (unlikely(!trigger_kthread)) {
guard(mutex)(&trigger_data_kthread_mutex);
+
+ trigger_create_kthread_locked();
/* Check again after taking mutex */
if (!trigger_kthread) {
- struct task_struct *kthread;
-
- kthread = kthread_create(trigger_kthread_fn, NULL,
- "trigger_data_free");
- if (!IS_ERR(kthread))
- WRITE_ONCE(trigger_kthread, kthread);
+ llist_add(&data->llist, &trigger_data_free_list);
+ /* Drain the queued frees synchronously if creation failed. */
+ trigger_data_free_queued_locked();
+ return;
}
}
- if (!trigger_kthread) {
- /* Do it the slow way */
- tracepoint_synchronize_unregister();
- kfree(data);
- return;
- }
-
llist_add(&data->llist, &trigger_data_free_list);
wake_up_process(trigger_kthread);
}
+/*
+ * Late initcall draining the trigger data queued on trigger_data_free_list
+ * during boot.
+ *
+ * If the list is not empty, try to create the freeing kthread and wake it up;
+ * if the kthread still cannot be created, free the queued entries
+ * synchronously. Always returns 0.
+ */
+static int __init trigger_data_free_init(void)
+{
+ guard(mutex)(&trigger_data_kthread_mutex);
+
+ if (llist_empty(&trigger_data_free_list))
+ return 0;
+
+ trigger_create_kthread_locked();
+ /* trigger_kthread stays NULL when the creation failed */
+ if (trigger_kthread)
+ wake_up_process(trigger_kthread);
+ else
+ trigger_data_free_queued_locked();
+
+ return 0;
+}
+late_initcall(trigger_data_free_init);
+
static inline void data_ops_trigger(struct event_trigger_data *data,
struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index a5dbb72528e0..a8420e6abb56 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -31,7 +31,8 @@ static char kprobe_boot_events_buf[COMMAND_LINE_SIZE] __initdata;
static int __init set_kprobe_boot_events(char *str)
{
- strscpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE);
+ trace_append_boot_param(kprobe_boot_events_buf, str, ';',
+ COMMAND_LINE_SIZE);
disable_tracing_selftest("running kprobe events");
return 1;
@@ -765,6 +766,14 @@ static unsigned int number_of_same_symbols(const char *mod, const char *func_nam
if (!mod)
kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count);
+ /*
+ * If the symbol is found in vmlinux, use vmlinux resolution only.
+ * This prevents module symbols from shadowing vmlinux symbols
+ * and causing -EADDRNOTAVAIL for unqualified kprobe targets.
+ */
+ if (!mod && ctx.count > 0)
+ return ctx.count;
+
module_kallsyms_on_each_symbol(mod, count_mod_symbols, &ctx);
return ctx.count;
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index dee610e465b9..75678053b21c 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -58,6 +58,7 @@ enum osnoise_options_index {
OSN_PANIC_ON_STOP,
OSN_PREEMPT_DISABLE,
OSN_IRQ_DISABLE,
+ OSN_TIMERLAT_ALIGN,
OSN_MAX
};
@@ -66,7 +67,8 @@ static const char * const osnoise_options_str[OSN_MAX] = {
"OSNOISE_WORKLOAD",
"PANIC_ON_STOP",
"OSNOISE_PREEMPT_DISABLE",
- "OSNOISE_IRQ_DISABLE" };
+ "OSNOISE_IRQ_DISABLE",
+ "TIMERLAT_ALIGN" };
#define OSN_DEFAULT_OPTIONS 0x2
static unsigned long osnoise_options = OSN_DEFAULT_OPTIONS;
@@ -251,6 +253,11 @@ struct timerlat_variables {
static DEFINE_PER_CPU(struct timerlat_variables, per_cpu_timerlat_var);
/*
+ * timerlat wake-up offset for next thread with TIMERLAT_ALIGN set.
+ */
+static atomic64_t align_next;
+
+/*
* this_cpu_tmr_var - Return the per-cpu timerlat_variables on its relative CPU
*/
static inline struct timerlat_variables *this_cpu_tmr_var(void)
@@ -268,6 +275,7 @@ static inline void tlat_var_reset(void)
/* Synchronize with the timerlat interfaces */
mutex_lock(&interface_lock);
+
/*
* So far, all the values are initialized as 0, so
* zeroing the structure is perfect.
@@ -278,6 +286,12 @@ static inline void tlat_var_reset(void)
hrtimer_cancel(&tlat_var->timer);
memset(tlat_var, 0, sizeof(*tlat_var));
}
+ /*
+ * Reset also align_next, to be filled by a new offset by the first timerlat
+ * thread that wakes up, if TIMERLAT_ALIGN is set.
+ */
+ atomic64_set(&align_next, 0);
+
mutex_unlock(&interface_lock);
}
#else /* CONFIG_TIMERLAT_TRACER */
@@ -326,6 +340,7 @@ static struct osnoise_data {
u64 stop_tracing_total; /* stop trace in the final operation (report/thread) */
#ifdef CONFIG_TIMERLAT_TRACER
u64 timerlat_period; /* timerlat period */
+ u64 timerlat_align_us; /* timerlat alignment */
u64 print_stack; /* print IRQ stack if total > */
int timerlat_tracer; /* timerlat tracer */
#endif
@@ -338,6 +353,7 @@ static struct osnoise_data {
#ifdef CONFIG_TIMERLAT_TRACER
.print_stack = 0,
.timerlat_period = DEFAULT_TIMERLAT_PERIOD,
+ .timerlat_align_us = 0,
.timerlat_tracer = 0,
#endif
};
@@ -1830,6 +1846,26 @@ static int wait_next_period(struct timerlat_variables *tlat)
tlat->abs_period = (u64) ktime_to_ns(next_abs_period);
/*
+ * Align thread in the first cycle on each CPU to the set alignment
+ * if TIMERLAT_ALIGN is set.
+ *
+ * This is done by using an atomic64_t to store the next absolute period.
+ * The first thread that wakes up will set the atomic64_t to its
+ * absolute period, and the other threads will increment it by
+ * the alignment value.
+ */
+ if (test_bit(OSN_TIMERLAT_ALIGN, &osnoise_options) && !tlat->count
+ && atomic64_cmpxchg_relaxed(&align_next, 0, tlat->abs_period)) {
+ /*
+ * A thread has already set align_next, use it and increment it
+ * to be used by the next thread that wakes up after this one.
+ */
+ tlat->abs_period = atomic64_add_return_relaxed(
+ osnoise_data.timerlat_align_us * 1000, &align_next);
+ next_abs_period = ns_to_ktime(tlat->abs_period);
+ }
+
+ /*
* If the new abs_period is in the past, skip the activation.
*/
while (ktime_compare(now, next_abs_period) > 0) {
@@ -2073,8 +2109,8 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy)
if (!osnoise_has_registered_instances())
return;
- guard(mutex)(&interface_lock);
guard(cpus_read_lock)();
+ guard(mutex)(&interface_lock);
if (!cpu_online(cpu))
return;
@@ -2237,11 +2273,11 @@ static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf,
if (running)
stop_per_cpu_kthreads();
- mutex_lock(&interface_lock);
/*
* avoid CPU hotplug operations that might read options.
*/
cpus_read_lock();
+ mutex_lock(&interface_lock);
retval = cnt;
@@ -2257,8 +2293,8 @@ static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf,
clear_bit(option, &osnoise_options);
}
- cpus_read_unlock();
mutex_unlock(&interface_lock);
+ cpus_read_unlock();
if (running)
start_per_cpu_kthreads();
@@ -2345,16 +2381,16 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count,
if (running)
stop_per_cpu_kthreads();
- mutex_lock(&interface_lock);
/*
* osnoise_cpumask is read by CPU hotplug operations.
*/
cpus_read_lock();
+ mutex_lock(&interface_lock);
cpumask_copy(&osnoise_cpumask, osnoise_cpumask_new);
- cpus_read_unlock();
mutex_unlock(&interface_lock);
+ cpus_read_unlock();
if (running)
start_per_cpu_kthreads();
@@ -2650,6 +2686,17 @@ static struct trace_min_max_param timerlat_period = {
.min = &timerlat_min_period,
};
+/*
+ * osnoise/timerlat_align_us: align the first wakeup of all timerlat
+ * threads to a common boundary (in us). 0 means disabled.
+ */
+static struct trace_min_max_param timerlat_align_us = {
+ .lock = &interface_lock,
+ .val = &osnoise_data.timerlat_align_us,
+ .max = NULL,
+ .min = NULL,
+};
+
static const struct file_operations timerlat_fd_fops = {
.open = timerlat_fd_open,
.read = timerlat_fd_read,
@@ -2746,6 +2793,11 @@ static int init_timerlat_tracefs(struct dentry *top_dir)
if (!tmp)
return -ENOMEM;
+ tmp = tracefs_create_file("timerlat_align_us", TRACE_MODE_WRITE, top_dir,
+ &timerlat_align_us, &trace_min_max_fops);
+ if (!tmp)
+ return -ENOMEM;
+
retval = osnoise_create_cpu_timerlat_fd(top_dir);
if (retval)
return retval;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 1996d7aba038..a5ad76175d10 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -69,14 +69,15 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
const char *
trace_print_flags_seq(struct trace_seq *p, const char *delim,
unsigned long flags,
- const struct trace_print_flags *flag_array)
+ const struct trace_print_flags *flag_array,
+ size_t flag_array_size)
{
unsigned long mask;
const char *str;
const char *ret = trace_seq_buffer_ptr(p);
int i, first = 1;
- for (i = 0; flag_array[i].name && flags; i++) {
+ for (i = 0; i < flag_array_size && flags; i++) {
mask = flag_array[i].mask;
if ((flags & mask) != mask)
@@ -106,12 +107,13 @@ EXPORT_SYMBOL(trace_print_flags_seq);
const char *
trace_print_symbols_seq(struct trace_seq *p, unsigned long val,
- const struct trace_print_flags *symbol_array)
+ const struct trace_print_flags *symbol_array,
+ size_t symbol_array_size)
{
int i;
const char *ret = trace_seq_buffer_ptr(p);
- for (i = 0; symbol_array[i].name; i++) {
+ for (i = 0; i < symbol_array_size; i++) {
if (val != symbol_array[i].mask)
continue;
@@ -133,14 +135,15 @@ EXPORT_SYMBOL(trace_print_symbols_seq);
const char *
trace_print_flags_seq_u64(struct trace_seq *p, const char *delim,
unsigned long long flags,
- const struct trace_print_flags_u64 *flag_array)
+ const struct trace_print_flags_u64 *flag_array,
+ size_t flag_array_size)
{
unsigned long long mask;
const char *str;
const char *ret = trace_seq_buffer_ptr(p);
int i, first = 1;
- for (i = 0; flag_array[i].name && flags; i++) {
+ for (i = 0; i < flag_array_size && flags; i++) {
mask = flag_array[i].mask;
if ((flags & mask) != mask)
@@ -170,12 +173,13 @@ EXPORT_SYMBOL(trace_print_flags_seq_u64);
const char *
trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
- const struct trace_print_flags_u64 *symbol_array)
+ const struct trace_print_flags_u64 *symbol_array,
+ size_t symbol_array_size)
{
int i;
const char *ret = trace_seq_buffer_ptr(p);
- for (i = 0; symbol_array[i].name; i++) {
+ for (i = 0; i < symbol_array_size; i++) {
if (val != symbol_array[i].mask)
continue;
@@ -719,12 +723,13 @@ void print_function_args(struct trace_seq *s, unsigned long *args,
{
const struct btf_param *param;
const struct btf_type *t;
+ const struct btf_enum *enums;
const char *param_name;
char name[KSYM_NAME_LEN];
unsigned long arg;
struct btf *btf;
s32 tid, nr = 0;
- int a, p, x;
+ int a, p, x, i;
u16 encode;
trace_seq_printf(s, "(");
@@ -778,6 +783,15 @@ void print_function_args(struct trace_seq *s, unsigned long *args,
break;
case BTF_KIND_ENUM:
trace_seq_printf(s, "%ld", arg);
+ enums = btf_enum(t);
+ for (i = 0; i < btf_vlen(t); i++) {
+ if (arg == enums[i].val) {
+ trace_seq_printf(s, " [%s]",
+ btf_name_by_offset(btf,
+ enums[i].name_off));
+ break;
+ }
+ }
break;
default:
/* This does not handle complex arguments */
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 5ea5e0d76f00..3ea17af60169 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -197,6 +197,7 @@ struct notifier_block module_trace_bprintk_format_nb = {
.notifier_call = module_trace_bprintk_format_notify,
};
+__printf(2, 3)
int __trace_bprintk(unsigned long ip, const char *fmt, ...)
{
int ret;
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index e0a5dc86c07e..44c22d4e7881 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -962,8 +962,6 @@ static int parse_probe_vars(char *orig_arg, const struct fetch_type *t,
code->op = FETCH_OP_COMM;
return 0;
}
- /* backward compatibility */
- ctx->offset = 0;
goto inval;
}
@@ -1068,7 +1066,7 @@ static int __parse_imm_string(char *str, char **pbuf, int offs)
{
size_t len = strlen(str);
- if (str[len - 1] != '"') {
+ if (!len || str[len - 1] != '"') {
trace_probe_log_err(offs + len, IMMSTR_NO_CLOSE);
return -EINVAL;
}
@@ -1523,6 +1521,12 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
parg->offset = *size;
*size += parg->type->size * (parg->count ?: 1);
+ if (*size > MAX_PROBE_EVENT_SIZE) {
+ ret = -E2BIG;
+ trace_probe_log_err(ctx->offset, EVENT_TOO_BIG);
+ goto fail;
+ }
+
if (parg->count) {
len = strlen(parg->type->fmttype) + 6;
parg->fmt = kmalloc(len, GFP_KERNEL);
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 9fc56c937130..262d8707a3df 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -38,6 +38,7 @@
#define MAX_BTF_ARGS_LEN 128
#define MAX_DENTRY_ARGS_LEN 256
#define MAX_STRING_SIZE PATH_MAX
+#define MAX_PROBE_EVENT_SIZE 3072
/* Reserved field names */
#define FIELD_STRING_IP "__probe_ip"
@@ -561,7 +562,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(BAD_TYPE4STR, "This type does not fit for string."),\
C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"),\
C(TOO_MANY_ARGS, "Too many arguments are specified"), \
- C(TOO_MANY_EARGS, "Too many entry arguments specified"),
+ C(TOO_MANY_EARGS, "Too many entry arguments specified"), \
+ C(EVENT_TOO_BIG, "Event too big (too many fields?)"),
#undef C
#define C(a, b) TP_ERR_##a
diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c
new file mode 100644
index 000000000000..d6c3f94d67cd
--- /dev/null
+++ b/kernel/trace/trace_remote.c
@@ -0,0 +1,1384 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2025 - Google LLC
+ * Author: Vincent Donnefort <vdonnefort@google.com>
+ */
+
+#include <linux/kstrtox.h>
+#include <linux/lockdep.h>
+#include <linux/mutex.h>
+#include <linux/tracefs.h>
+#include <linux/trace_remote.h>
+#include <linux/trace_seq.h>
+#include <linux/types.h>
+
+#include "trace.h"
+
+#define TRACEFS_DIR "remotes"
+#define TRACEFS_MODE_WRITE 0640
+#define TRACEFS_MODE_READ 0440
+
+/*
+ * Flavour of a trace_remote_iterator.
+ *
+ * TRI_CONSUMING iterators peek events with ring_buffer_peek() and
+ * periodically poll the remote from a delayed work. TRI_NONCONSUMING
+ * iterators read through ring_buffer_iter objects.
+ */
+enum tri_type {
+ TRI_CONSUMING,
+ TRI_NONCONSUMING,
+};
+
+/*
+ * State of one reader of a remote trace buffer.
+ *
+ * @remote:      remote being read; the iterator holds a reader reference on it
+ * @seq:         trace_seq initialised when the iterator is created
+ * @poll_work:   delayed work running __poll_remote() (TRI_CONSUMING only)
+ * @lost_events: lost events reported along with the currently selected event
+ * @ts:          timestamp of the currently selected event
+ * @rb_iter:     ring buffer iterator when reading a single CPU
+ *               (TRI_NONCONSUMING only)
+ * @rb_iters:    per-CPU ring buffer iterators when @cpu is
+ *               RING_BUFFER_ALL_CPUS (TRI_NONCONSUMING only)
+ * @evt:         data of the currently selected event
+ * @cpu:         CPU being read, or RING_BUFFER_ALL_CPUS
+ * @evt_cpu:     CPU the currently selected event was read from
+ * @type:        consuming or non-consuming, see enum tri_type
+ */
+struct trace_remote_iterator {
+ struct trace_remote *remote;
+ struct trace_seq seq;
+ struct delayed_work poll_work;
+ unsigned long lost_events;
+ u64 ts;
+ struct ring_buffer_iter *rb_iter;
+ struct ring_buffer_iter **rb_iters;
+ struct remote_event_hdr *evt;
+ int cpu;
+ int evt_cpu;
+ loff_t pos;
+ enum tri_type type;
+};
+
+/*
+ * A remote tracing source and its kernel-side ring buffer.
+ *
+ * @cbs:               callbacks into the remote (load/unload the trace buffer,
+ *                     enable tracing, swap the reader page, reset)
+ * @priv:              opaque cookie handed back to every callback
+ * @trace_buffer:      ring buffer wrapping the remote buffer; non-NULL means
+ *                     the remote is "loaded"
+ * @trace_buffer_desc: descriptor returned by cbs->load_trace_buffer()
+ * @trace_buffer_size: requested buffer size, in bytes
+ * @rb_remote:         description passed to ring_buffer_alloc_remote()
+ * @lock:              protects the load state, @nr_readers and @tracing_on
+ * @reader_lock:       global reader rwsem, see trace_remote_iter_read_start()
+ * @pcpu_reader_locks: per-CPU reader rwsems, allocated on first single-CPU
+ *                     reader and freed with the last reader
+ * @nr_readers:        number of live iterators
+ * @poll_ms:           period, in milliseconds, of the consuming readers' poll
+ * @tracing_on:        true while tracing is enabled on the remote
+ */
+struct trace_remote {
+ struct trace_remote_callbacks *cbs;
+ void *priv;
+ struct trace_buffer *trace_buffer;
+ struct trace_buffer_desc *trace_buffer_desc;
+ struct dentry *dentry;
+ struct eventfs_inode *eventfs;
+ struct remote_event *events;
+ unsigned long nr_events;
+ unsigned long trace_buffer_size;
+ struct ring_buffer_remote rb_remote;
+ struct mutex lock;
+ struct rw_semaphore reader_lock;
+ struct rw_semaphore *pcpu_reader_locks;
+ unsigned int nr_readers;
+ unsigned int poll_ms;
+ bool tracing_on;
+};
+
+/* A remote counts as loaded for as long as it owns a ring buffer. */
+static bool trace_remote_loaded(struct trace_remote *remote)
+{
+	return remote->trace_buffer != NULL;
+}
+
+/*
+ * Ask the remote for its trace buffer and wrap it into a ring buffer.
+ *
+ * Must be called with remote->lock held. Returns 0 when the remote is already
+ * loaded or has been loaded, the error encoded by cbs->load_trace_buffer(),
+ * or -ENOMEM when ring_buffer_alloc_remote() fails.
+ */
+static int trace_remote_load(struct trace_remote *remote)
+{
+ struct ring_buffer_remote *rb_remote = &remote->rb_remote;
+ struct trace_buffer_desc *desc;
+
+ lockdep_assert_held(&remote->lock);
+
+ if (trace_remote_loaded(remote))
+ return 0;
+
+ desc = remote->cbs->load_trace_buffer(remote->trace_buffer_size, remote->priv);
+ if (IS_ERR(desc))
+ return PTR_ERR(desc);
+
+ /* Forward the remote callbacks the ring buffer needs */
+ rb_remote->desc = desc;
+ rb_remote->swap_reader_page = remote->cbs->swap_reader_page;
+ rb_remote->priv = remote->priv;
+ rb_remote->reset = remote->cbs->reset;
+ remote->trace_buffer = ring_buffer_alloc_remote(rb_remote);
+ if (!remote->trace_buffer) {
+ /* Give the descriptor back, the remote stays unloaded */
+ remote->cbs->unload_trace_buffer(desc, remote->priv);
+ return -ENOMEM;
+ }
+
+ remote->trace_buffer_desc = desc;
+
+ return 0;
+}
+
+/*
+ * Free the ring buffer and hand the trace buffer back to the remote, unless
+ * it is still in use: readers are attached, tracing is on, or unread data
+ * remains. Does nothing if the remote is not loaded.
+ *
+ * Must be called with remote->lock held.
+ */
+static void trace_remote_try_unload(struct trace_remote *remote)
+{
+ lockdep_assert_held(&remote->lock);
+
+ if (!trace_remote_loaded(remote))
+ return;
+
+ /* The buffer is being read or writable */
+ if (remote->nr_readers || remote->tracing_on)
+ return;
+
+ /* The buffer has readable data */
+ if (!ring_buffer_empty(remote->trace_buffer))
+ return;
+
+ ring_buffer_free(remote->trace_buffer);
+ /* trace_remote_loaded() keys off this pointer */
+ remote->trace_buffer = NULL;
+ remote->cbs->unload_trace_buffer(remote->trace_buffer_desc, remote->priv);
+}
+
+/*
+ * Load the remote trace buffer if needed and ask the remote to start tracing.
+ *
+ * Must be called with remote->lock held. Returns 0 on success or if tracing
+ * is already on, otherwise the error from trace_remote_load() or from
+ * cbs->enable_tracing().
+ */
+static int trace_remote_enable_tracing(struct trace_remote *remote)
+{
+ int ret;
+
+ lockdep_assert_held(&remote->lock);
+
+ if (remote->tracing_on)
+ return 0;
+
+ ret = trace_remote_load(remote);
+ if (ret)
+ return ret;
+
+ ret = remote->cbs->enable_tracing(true, remote->priv);
+ if (ret) {
+ /* Drop the buffer again if nothing else is using it */
+ trace_remote_try_unload(remote);
+ return ret;
+ }
+
+ remote->tracing_on = true;
+
+ return 0;
+}
+
+/*
+ * Ask the remote to stop tracing and try to unload its trace buffer.
+ *
+ * Must be called with remote->lock held. Returns 0 on success or if tracing
+ * is already off, otherwise the error from cbs->enable_tracing(), in which
+ * case tracing_on is left set.
+ */
+static int trace_remote_disable_tracing(struct trace_remote *remote)
+{
+ int ret;
+
+ lockdep_assert_held(&remote->lock);
+
+ if (!remote->tracing_on)
+ return 0;
+
+ ret = remote->cbs->enable_tracing(false, remote->priv);
+ if (ret)
+ return ret;
+
+ /*
+ * NOTE(review): presumably a last poll to pick up what the remote wrote
+ * before stopping, so try_unload sees an accurate "empty" state - confirm
+ * against ring_buffer_poll_remote().
+ */
+ ring_buffer_poll_remote(remote->trace_buffer, RING_BUFFER_ALL_CPUS);
+ remote->tracing_on = false;
+ trace_remote_try_unload(remote);
+
+ return 0;
+}
+
+/*
+ * Reset the ring buffer of @cpu, or of every CPU when @cpu is
+ * RING_BUFFER_ALL_CPUS, then try to unload the now possibly empty buffer.
+ * Nothing is done if the remote is not loaded.
+ *
+ * Must be called with remote->lock held.
+ */
+static void trace_remote_reset(struct trace_remote *remote, int cpu)
+{
+	struct trace_buffer *buffer;
+
+	lockdep_assert_held(&remote->lock);
+
+	if (!trace_remote_loaded(remote))
+		return;
+
+	buffer = remote->trace_buffer;
+	if (cpu != RING_BUFFER_ALL_CPUS)
+		ring_buffer_reset_cpu(buffer, cpu);
+	else
+		ring_buffer_reset(buffer);
+
+	trace_remote_try_unload(remote);
+}
+
+/*
+ * Write handler of the tracing_on attribute: parse a decimal number from
+ * user space; a non-zero value enables tracing on the remote, zero disables
+ * it. Returns @cnt on success or a negative error code.
+ */
+static ssize_t
+tracing_on_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	struct seq_file *seq = filp->private_data;
+	struct trace_remote *remote = seq->private;
+	unsigned long enable;
+	int err;
+
+	err = kstrtoul_from_user(ubuf, cnt, 10, &enable);
+	if (err)
+		return err;
+
+	guard(mutex)(&remote->lock);
+
+	if (enable)
+		err = trace_remote_enable_tracing(remote);
+	else
+		err = trace_remote_disable_tracing(remote);
+
+	if (err)
+		return err;
+
+	return cnt;
+}
+
+/* Show handler of the tracing_on attribute: prints tracing_on as 0 or 1. */
+static int tracing_on_show(struct seq_file *s, void *unused)
+{
+ struct trace_remote *remote = s->private;
+
+ seq_printf(s, "%d\n", remote->tracing_on);
+
+ return 0;
+}
+DEFINE_SHOW_STORE_ATTRIBUTE(tracing_on);
+
+/*
+ * Write handler of the buffer_size_kb attribute: set the size, given in KiB,
+ * requested at the next load of the remote trace buffer.
+ *
+ * Returns @cnt on success, the kstrtoul_from_user() error, -EINVAL for a zero
+ * size or one that overflows once converted to bytes, or -EBUSY while the
+ * remote is loaded.
+ */
+static ssize_t buffer_size_kb_write(struct file *filp, const char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct seq_file *seq = filp->private_data;
+ struct trace_remote *remote = seq->private;
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ /* KiB to Bytes */
+ if (!val || check_shl_overflow(val, 10, &val))
+ return -EINVAL;
+
+ guard(mutex)(&remote->lock);
+
+ /* The size is only read when loading, refuse to change it afterwards */
+ if (trace_remote_loaded(remote))
+ return -EBUSY;
+
+ remote->trace_buffer_size = val;
+
+ return cnt;
+}
+
+/*
+ * Show handler of the buffer_size_kb attribute: prints the configured size
+ * in KiB followed by the load state of the remote.
+ */
+static int buffer_size_kb_show(struct seq_file *s, void *unused)
+{
+	struct trace_remote *remote = s->private;
+	unsigned long size_kb = remote->trace_buffer_size >> 10;
+	const char *state;
+
+	if (trace_remote_loaded(remote))
+		state = "loaded";
+	else
+		state = "unloaded";
+
+	seq_printf(s, "%lu (%s)\n", size_kb, state);
+
+	return 0;
+}
+DEFINE_SHOW_STORE_ATTRIBUTE(buffer_size_kb);
+
+/*
+ * Take a reader reference on @remote, loading its trace buffer if needed.
+ *
+ * The per-CPU reader locks are allocated lazily, the first time a reader asks
+ * for a single CPU; trace_remote_put() frees them with the last reader.
+ *
+ * Must be called with remote->lock held: trace_remote_load() requires it and
+ * nr_readers / pcpu_reader_locks are updated here without any other
+ * protection.
+ *
+ * Returns 0 on success, -EBUSY when the reader count is saturated, -ENOMEM
+ * when the per-CPU locks cannot be allocated, or the trace_remote_load()
+ * error.
+ */
+static int trace_remote_get(struct trace_remote *remote, int cpu)
+{
+	int ret;
+
+	lockdep_assert_held(&remote->lock);
+
+	if (remote->nr_readers == UINT_MAX)
+		return -EBUSY;
+
+	ret = trace_remote_load(remote);
+	if (ret)
+		return ret;
+
+	if (cpu != RING_BUFFER_ALL_CPUS && !remote->pcpu_reader_locks) {
+		int lock_cpu;
+
+		remote->pcpu_reader_locks = kcalloc(nr_cpu_ids, sizeof(*remote->pcpu_reader_locks),
+						    GFP_KERNEL);
+		if (!remote->pcpu_reader_locks) {
+			/* Do not keep a buffer loaded only for this failed reader */
+			trace_remote_try_unload(remote);
+			return -ENOMEM;
+		}
+
+		for_each_possible_cpu(lock_cpu)
+			init_rwsem(&remote->pcpu_reader_locks[lock_cpu]);
+	}
+
+	remote->nr_readers++;
+
+	return 0;
+}
+
+/*
+ * Drop a reader reference on @remote. When the last reader goes away, the
+ * per-CPU reader locks are freed and the trace buffer is unloaded if nothing
+ * else keeps it alive. Warns and bails out on an unbalanced put.
+ */
+static void trace_remote_put(struct trace_remote *remote)
+{
+	if (WARN_ON(!remote->nr_readers))
+		return;
+
+	if (--remote->nr_readers)
+		return;
+
+	/* Last reader gone */
+	kfree(remote->pcpu_reader_locks);
+	remote->pcpu_reader_locks = NULL;
+
+	trace_remote_try_unload(remote);
+}
+
+/*
+ * Tell whether @cpu can be read from the remote trace buffer.
+ * RING_BUFFER_ALL_CPUS is always accepted; a single CPU is accepted when
+ * polling it with ring_buffer_poll_remote() returns 0.
+ */
+static bool trace_remote_has_cpu(struct trace_remote *remote, int cpu)
+{
+	if (cpu != RING_BUFFER_ALL_CPUS)
+		return !ring_buffer_poll_remote(remote->trace_buffer, cpu);
+
+	return true;
+}
+
+/*
+ * Delayed work of a consuming iterator: poll the remote for the iterator's
+ * CPU(s), then re-arm itself to run again after remote->poll_ms
+ * milliseconds. It keeps running until the work is cancelled.
+ */
+static void __poll_remote(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct trace_remote_iterator *iter;
+
+	iter = container_of(dwork, struct trace_remote_iterator, poll_work);
+	ring_buffer_poll_remote(iter->remote->trace_buffer, iter->cpu);
+	/* Re-arm through dwork rather than casting the work_struct pointer */
+	schedule_delayed_work(dwork, msecs_to_jiffies(iter->remote->poll_ms));
+}
+
+/*
+ * Release the ring buffer iterator(s) of a non-consuming @iter: the single
+ * rb_iter when @cpu designates one CPU, otherwise every allocated entry of
+ * rb_iters and the array itself.
+ */
+static void __free_ring_buffer_iter(struct trace_remote_iterator *iter, int cpu)
+{
+ if (cpu != RING_BUFFER_ALL_CPUS) {
+ ring_buffer_read_finish(iter->rb_iter);
+ return;
+ }
+
+ for_each_possible_cpu(cpu) {
+ /* Entries can be NULL, see __alloc_ring_buffer_iter() */
+ if (iter->rb_iters[cpu])
+ ring_buffer_read_finish(iter->rb_iters[cpu]);
+ }
+
+ kfree(iter->rb_iters);
+}
+
+/*
+ * Start the ring buffer iterator(s) of a non-consuming @iter: a single
+ * rb_iter when @cpu designates one CPU, otherwise one entry of rb_iters per
+ * possible CPU.
+ *
+ * Returns 0 on success or -ENOMEM. On failure, everything allocated here has
+ * been released.
+ */
+static int __alloc_ring_buffer_iter(struct trace_remote_iterator *iter, int cpu)
+{
+ if (cpu != RING_BUFFER_ALL_CPUS) {
+ iter->rb_iter = ring_buffer_read_start(iter->remote->trace_buffer, cpu, GFP_KERNEL);
+
+ return iter->rb_iter ? 0 : -ENOMEM;
+ }
+
+ iter->rb_iters = kcalloc(nr_cpu_ids, sizeof(*iter->rb_iters), GFP_KERNEL);
+ if (!iter->rb_iters)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ iter->rb_iters[cpu] = ring_buffer_read_start(iter->remote->trace_buffer, cpu,
+ GFP_KERNEL);
+ if (!iter->rb_iters[cpu]) {
+ /* This CPU isn't part of trace_buffer. Skip it */
+ if (!trace_remote_has_cpu(iter->remote, cpu))
+ continue;
+
+ /* A CPU of the buffer without iterator: treat as out of memory */
+ __free_ring_buffer_iter(iter, RING_BUFFER_ALL_CPUS);
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Create an iterator of the given @type over @cpu (or every CPU with
+ * RING_BUFFER_ALL_CPUS) of @remote.
+ *
+ * A consuming iterator polls the remote once and starts the periodic poll
+ * work. A non-consuming iterator gets its ring buffer iterator(s) and is
+ * only available while the remote is loaded.
+ *
+ * Must be called with remote->lock held. Returns the iterator, NULL for a
+ * non-consuming iterator on an unloaded remote, or an ERR_PTR() (-ENODEV if
+ * @cpu is not part of the buffer, -ENOMEM, or the trace_remote_get() error).
+ */
+static struct trace_remote_iterator
+*trace_remote_iter(struct trace_remote *remote, int cpu, enum tri_type type)
+{
+	struct trace_remote_iterator *iter = NULL;
+	int ret;
+
+	lockdep_assert_held(&remote->lock);
+
+	if (type == TRI_NONCONSUMING && !trace_remote_loaded(remote))
+		return NULL;
+
+	ret = trace_remote_get(remote, cpu);
+	if (ret)
+		return ERR_PTR(ret);
+
+	if (!trace_remote_has_cpu(remote, cpu)) {
+		ret = -ENODEV;
+		goto err;
+	}
+
+	iter = kzalloc_obj(*iter);
+	if (!iter) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	iter->remote = remote;
+	iter->cpu = cpu;
+	iter->type = type;
+	trace_seq_init(&iter->seq);
+
+	switch (type) {
+	case TRI_CONSUMING:
+		ring_buffer_poll_remote(remote->trace_buffer, cpu);
+		INIT_DELAYED_WORK(&iter->poll_work, __poll_remote);
+		schedule_delayed_work(&iter->poll_work, msecs_to_jiffies(remote->poll_ms));
+		break;
+	case TRI_NONCONSUMING:
+		ret = __alloc_ring_buffer_iter(iter, cpu);
+		break;
+	}
+
+	if (!ret)
+		return iter;
+
+err:
+	/* iter is still NULL when failing before the allocation */
+	kfree(iter);
+	trace_remote_put(remote);
+
+	return ERR_PTR(ret);
+}
+
+/*
+ * Destroy an iterator created by trace_remote_iter() and drop its reader
+ * reference on the remote. A NULL @iter is accepted and ignored.
+ *
+ * Must be called with remote->lock held.
+ */
+static void trace_remote_iter_free(struct trace_remote_iterator *iter)
+{
+ struct trace_remote *remote;
+
+ if (!iter)
+ return;
+
+ remote = iter->remote;
+
+ lockdep_assert_held(&remote->lock);
+
+ switch (iter->type) {
+ case TRI_CONSUMING:
+ /* Stop the self re-arming poll before freeing the iterator */
+ cancel_delayed_work_sync(&iter->poll_work);
+ break;
+ case TRI_NONCONSUMING:
+ __free_ring_buffer_iter(iter, iter->cpu);
+ break;
+ }
+
+ kfree(iter);
+ trace_remote_put(remote);
+}
+
+/*
+ * Take the reader locks needed by @iter before reading events.
+ *
+ * The global reader_lock is taken for write by a consuming iterator over all
+ * CPUs and for read by everyone else. A single-CPU iterator additionally
+ * takes the lock of its CPU: for write when consuming, for read otherwise.
+ *
+ * Paired with trace_remote_iter_read_finished().
+ */
+static void trace_remote_iter_read_start(struct trace_remote_iterator *iter)
+{
+ struct trace_remote *remote = iter->remote;
+ int cpu = iter->cpu;
+
+ /* Acquire global reader lock */
+ if (cpu == RING_BUFFER_ALL_CPUS && iter->type == TRI_CONSUMING)
+ down_write(&remote->reader_lock);
+ else
+ down_read(&remote->reader_lock);
+
+ if (cpu == RING_BUFFER_ALL_CPUS)
+ return;
+
+ /*
+ * No need for the remote lock here, iter holds a reference on
+ * remote->nr_readers
+ */
+
+ /* Get the per-CPU one */
+ if (WARN_ON_ONCE(!remote->pcpu_reader_locks))
+ return;
+
+ if (iter->type == TRI_CONSUMING)
+ down_write(&remote->pcpu_reader_locks[cpu]);
+ else
+ down_read(&remote->pcpu_reader_locks[cpu]);
+}
+
+/*
+ * Release the reader locks taken by trace_remote_iter_read_start(), in the
+ * reverse order: the per-CPU lock first (single-CPU iterators only), then
+ * the global reader_lock.
+ */
+static void trace_remote_iter_read_finished(struct trace_remote_iterator *iter)
+{
+ struct trace_remote *remote = iter->remote;
+ int cpu = iter->cpu;
+
+ /* Release per-CPU reader lock */
+ if (cpu != RING_BUFFER_ALL_CPUS) {
+ /*
+ * No need for the remote lock here, iter holds a reference on
+ * remote->nr_readers
+ */
+ /*
+ * NOTE(review): unlike trace_remote_iter_read_start(), there is no
+ * NULL check on pcpu_reader_locks here. This relies on the array
+ * always being allocated for single-CPU iterators - confirm.
+ */
+ if (iter->type == TRI_CONSUMING)
+ up_write(&remote->pcpu_reader_locks[cpu]);
+ else
+ up_read(&remote->pcpu_reader_locks[cpu]);
+ }
+
+ /* Release global reader lock */
+ if (cpu == RING_BUFFER_ALL_CPUS && iter->type == TRI_CONSUMING)
+ up_write(&remote->reader_lock);
+ else
+ up_read(&remote->reader_lock);
+}
+
+/*
+ * Pick the ring-buffer iterator for @cpu: an all-CPUs iterator indexes its
+ * rb_iters array by @cpu, a per-CPU iterator uses its single rb_iter.
+ */
+static struct ring_buffer_iter *__get_rb_iter(struct trace_remote_iterator *iter, int cpu)
+{
+	if (iter->cpu == RING_BUFFER_ALL_CPUS)
+		return iter->rb_iters[cpu];
+
+	return iter->rb_iter;
+}
+
+/*
+ * Look at the next event of @cpu without moving past it.
+ *
+ * A consuming iterator peeks directly at the remote's trace buffer. A
+ * non-consuming one goes through its ring-buffer iterator and reports the
+ * iterator's dropped count in @lost_events. @ts receives the event timestamp.
+ *
+ * Returns the event, or NULL if there is none (or no usable iterator).
+ */
+static struct ring_buffer_event *
+__peek_event(struct trace_remote_iterator *iter, int cpu, u64 *ts, unsigned long *lost_events)
+{
+	struct ring_buffer_iter *rb_iter;
+	struct ring_buffer_event *event;
+
+	if (iter->type == TRI_CONSUMING)
+		return ring_buffer_peek(iter->remote->trace_buffer, cpu, ts, lost_events);
+
+	if (iter->type != TRI_NONCONSUMING)
+		return NULL;
+
+	rb_iter = __get_rb_iter(iter, cpu);
+	if (!rb_iter)
+		return NULL;
+
+	event = ring_buffer_iter_peek(rb_iter, ts);
+	if (event)
+		*lost_events = ring_buffer_iter_dropped(rb_iter);
+
+	return event;
+}
+
+/*
+ * Load the next event to report into @iter (evt, evt_cpu, ts, lost_events).
+ *
+ * A per-CPU iterator simply takes the next event of its CPU. An all-CPUs
+ * iterator peeks at every possible CPU and keeps the event with the smallest
+ * timestamp.
+ *
+ * Returns true if an event was found, false otherwise.
+ */
+static bool trace_remote_iter_read_event(struct trace_remote_iterator *iter)
+{
+	struct trace_buffer *buffer = iter->remote->trace_buffer;
+	struct ring_buffer_event *event;
+	int cpu = iter->cpu;
+
+	if (cpu == RING_BUFFER_ALL_CPUS) {
+		/* U64_MAX doubles as the "nothing found yet" marker */
+		iter->ts = U64_MAX;
+
+		for_each_possible_cpu(cpu) {
+			unsigned long lost;
+			u64 ts;
+
+			if (ring_buffer_empty_cpu(buffer, cpu))
+				continue;
+
+			event = __peek_event(iter, cpu, &ts, &lost);
+			if (!event || ts >= iter->ts)
+				continue;
+
+			iter->lost_events = lost;
+			iter->evt = ring_buffer_event_data(event);
+			iter->evt_cpu = cpu;
+			iter->ts = ts;
+		}
+
+		return iter->ts != U64_MAX;
+	}
+
+	if (ring_buffer_empty_cpu(buffer, cpu))
+		return false;
+
+	event = __peek_event(iter, cpu, &iter->ts, &iter->lost_events);
+	if (!event)
+		return false;
+
+	iter->evt = ring_buffer_event_data(event);
+	iter->evt_cpu = cpu;
+
+	return true;
+}
+
+/*
+ * Step past the event last loaded by trace_remote_iter_read_event(): consume
+ * it from the ring-buffer for a consuming iterator, advance the ring-buffer
+ * iterator of the event's CPU for a non-consuming one.
+ */
+static void trace_remote_iter_move(struct trace_remote_iterator *iter)
+{
+	struct trace_buffer *buffer = iter->remote->trace_buffer;
+
+	if (iter->type == TRI_CONSUMING)
+		ring_buffer_consume(buffer, iter->evt_cpu, NULL, NULL);
+	else if (iter->type == TRI_NONCONSUMING)
+		ring_buffer_iter_advance(__get_rb_iter(iter, iter->evt_cpu));
+}
+
+static struct remote_event *trace_remote_find_event(struct trace_remote *remote, unsigned short id);
+
+/*
+ * Format the event currently loaded in @iter into iter->seq.
+ *
+ * Output is an optional lost-events line, then "[cpu]\tsecs.usecs: " followed
+ * by the event's own print callback (or an "UNKNOWN id" line when the event id
+ * is not registered for this remote).
+ *
+ * Returns 0 on success, -EOVERFLOW if iter->seq overflowed.
+ */
+static int trace_remote_iter_print_event(struct trace_remote_iterator *iter)
+{
+	struct remote_event *evt;
+	unsigned long usecs_rem;
+	u64 secs = iter->ts;
+
+	if (iter->lost_events)
+		trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
+				 iter->evt_cpu, iter->lost_events);
+
+	/* Divide by 1000, then split into seconds and remaining microseconds */
+	do_div(secs, 1000);
+	usecs_rem = do_div(secs, USEC_PER_SEC);
+
+	trace_seq_printf(&iter->seq, "[%03d]\t%5llu.%06lu: ", iter->evt_cpu,
+			 secs, usecs_rem);
+
+	evt = trace_remote_find_event(iter->remote, iter->evt->id);
+	if (evt)
+		evt->print(iter->evt, &iter->seq);
+	else
+		trace_seq_printf(&iter->seq, "UNKNOWN id=%d\n", iter->evt->id);
+
+	if (trace_seq_has_overflowed(&iter->seq))
+		return -EOVERFLOW;
+
+	return 0;
+}
+
+/*
+ * open() for the "trace_pipe" files: create a consuming iterator for the
+ * remote (and CPU) behind @inode and stash it in @filp->private_data.
+ *
+ * Returns 0 on success or the error carried by trace_remote_iter().
+ */
+static int trace_pipe_open(struct inode *inode, struct file *filp)
+{
+	struct trace_remote *remote = inode->i_private;
+	struct trace_remote_iterator *iter;
+	int cpu = tracing_get_cpu(inode);
+
+	guard(mutex)(&remote->lock);
+
+	iter = trace_remote_iter(remote, cpu, TRI_CONSUMING);
+	if (IS_ERR(iter))
+		return PTR_ERR(iter);
+
+	filp->private_data = iter;
+
+	/* iter is known to be valid here, no need to test it a second time */
+	return 0;
+}
+
+/*
+ * release() for the "trace_pipe" files: free the iterator created at open
+ * time, under the remote lock as trace_remote_iter_free() requires.
+ */
+static int trace_pipe_release(struct inode *inode, struct file *filp)
+{
+	struct trace_remote_iterator *iter = filp->private_data;
+
+	guard(mutex)(&iter->remote->lock);
+
+	trace_remote_iter_free(iter);
+
+	return 0;
+}
+
+/*
+ * read() for the "trace_pipe" files.
+ *
+ * Whatever is left in iter->seq is handed to userspace first. Once
+ * trace_seq_to_user() reports -EBUSY, the sequence is reset, the function
+ * waits on the ring-buffer and then prints and consumes as many events as fit
+ * in the sequence before trying the copy again.
+ *
+ * An event whose output overflows the sequence is rolled back (the sequence
+ * length is restored) and left in the ring-buffer for the next round.
+ */
+static ssize_t trace_pipe_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	struct trace_remote_iterator *iter = filp->private_data;
+	struct trace_buffer *buffer = iter->remote->trace_buffer;
+	int ret;
+
+	for (;;) {
+		ret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+		if (ret != -EBUSY)
+			return ret;
+
+		trace_seq_init(&iter->seq);
+
+		ret = ring_buffer_wait(buffer, iter->cpu, 0, NULL, NULL);
+		if (ret < 0)
+			return ret;
+
+		trace_remote_iter_read_start(iter);
+
+		while (trace_remote_iter_read_event(iter)) {
+			int saved_len = iter->seq.seq.len;
+
+			if (trace_remote_iter_print_event(iter)) {
+				/* Drop the partial output, keep the event */
+				iter->seq.seq.len = saved_len;
+				break;
+			}
+
+			trace_remote_iter_move(iter);
+		}
+
+		trace_remote_iter_read_finished(iter);
+	}
+}
+
+/* File operations of the consuming "trace_pipe" files */
+static const struct file_operations trace_pipe_fops = {
+	.open		= trace_pipe_open,
+	.release	= trace_pipe_release,
+	.read		= trace_pipe_read,
+};
+
+/*
+ * seq_file ->next(): bump *@pos, load the next event into the iterator and
+ * step past it in the ring-buffer.
+ *
+ * Returns the iterator (with the event loaded) or NULL when there is no
+ * iterator or no event left.
+ */
+static void *trace_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct trace_remote_iterator *iter = m->private;
+
+	(*pos)++;
+
+	if (!iter)
+		return NULL;
+
+	if (!trace_remote_iter_read_event(iter))
+		return NULL;
+
+	trace_remote_iter_move(iter);
+	iter->pos++;
+
+	return iter;
+}
+
+/*
+ * seq_file ->start(): take the reader locks, then bring the iterator to the
+ * record at *@pos.
+ *
+ * For *@pos == 0 the iterator position is reset and the first event is
+ * loaded. Otherwise events are read until the iterator position reaches
+ * *@pos.
+ *
+ * Returns the iterator, or NULL when there is no iterator or the requested
+ * position is past the last event. The locks taken here are released by
+ * trace_stop().
+ */
+static void *trace_start(struct seq_file *m, loff_t *pos)
+{
+	struct trace_remote_iterator *iter = m->private;
+	/*
+	 * Scratch position handed to trace_next(), which increments it. It
+	 * must be initialised: incrementing an indeterminate value is
+	 * undefined behaviour even if the result is never used.
+	 */
+	loff_t i = 0;
+
+	if (!iter)
+		return NULL;
+
+	trace_remote_iter_read_start(iter);
+
+	if (!*pos) {
+		iter->pos = -1;
+		return trace_next(m, NULL, &i);
+	}
+
+	i = iter->pos;
+	while (i < *pos) {
+		iter = trace_next(m, NULL, &i);
+		if (!iter)
+			return NULL;
+	}
+
+	return iter;
+}
+
+/*
+ * seq_file ->show(): print the event loaded in the iterator @v. If the event
+ * does not fit in the iterator's trace_seq, a placeholder line with the event
+ * id is emitted instead.
+ */
+static int trace_show(struct seq_file *m, void *v)
+{
+	struct trace_remote_iterator *iter = v;
+
+	trace_seq_init(&iter->seq);
+
+	if (!trace_remote_iter_print_event(iter))
+		return trace_print_seq(m, &iter->seq);
+
+	seq_printf(m, "[EVENT %d PRINT TOO BIG]\n", iter->evt->id);
+
+	return 0;
+}
+
+/*
+ * seq_file ->stop(): drop the reader locks taken by trace_start(). Nothing
+ * to do when the file has no iterator, as trace_start() took nothing then.
+ */
+static void trace_stop(struct seq_file *m, void *v)
+{
+	struct trace_remote_iterator *iter = m->private;
+
+	if (!iter)
+		return;
+
+	trace_remote_iter_read_finished(iter);
+}
+
+/* seq_file operations backing reads of the non-consuming "trace" files */
+static const struct seq_operations trace_sops = {
+	.start	= trace_start,
+	.stop	= trace_stop,
+	.next	= trace_next,
+	.show	= trace_show,
+};
+
+/*
+ * open() for the "trace" files.
+ *
+ * Opening without read access needs no iterator and succeeds immediately.
+ * Otherwise a non-consuming iterator is created and attached to the
+ * seq_file as its private data.
+ */
+static int trace_open(struct inode *inode, struct file *filp)
+{
+	struct trace_remote *remote = inode->i_private;
+	struct trace_remote_iterator *iter;
+	int cpu = tracing_get_cpu(inode);
+	struct seq_file *m;
+	int ret;
+
+	if (!(filp->f_mode & FMODE_READ))
+		return 0;
+
+	guard(mutex)(&remote->lock);
+
+	iter = trace_remote_iter(remote, cpu, TRI_NONCONSUMING);
+	if (IS_ERR(iter))
+		return PTR_ERR(iter);
+
+	ret = seq_open(filp, &trace_sops);
+	if (ret) {
+		trace_remote_iter_free(iter);
+		return ret;
+	}
+
+	/* seq_open() stored the seq_file in filp->private_data */
+	m = filp->private_data;
+	m->private = iter;
+
+	return 0;
+}
+
+/*
+ * release() for the "trace" files: undo trace_open(). Files opened without
+ * read access have neither seq_file nor iterator to release.
+ */
+static int trace_release(struct inode *inode, struct file *filp)
+{
+	struct trace_remote_iterator *iter;
+	struct seq_file *m;
+
+	if (!(filp->f_mode & FMODE_READ))
+		return 0;
+
+	/* Fetch the iterator before the seq_file is released */
+	m = filp->private_data;
+	iter = m->private;
+	seq_release(inode, filp);
+
+	if (!iter)
+		return 0;
+
+	guard(mutex)(&iter->remote->lock);
+
+	trace_remote_iter_free(iter);
+
+	return 0;
+}
+
+/*
+ * write() for the "trace" files: any write resets the remote's buffer for the
+ * CPU behind the file. The written data itself is ignored and reported as
+ * fully consumed.
+ */
+static ssize_t trace_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	struct inode *ino = file_inode(filp);
+	struct trace_remote *remote = ino->i_private;
+
+	guard(mutex)(&remote->lock);
+
+	trace_remote_reset(remote, tracing_get_cpu(ino));
+
+	return cnt;
+}
+
+/* File operations of the non-consuming "trace" files (write resets) */
+static const struct file_operations trace_fops = {
+	.open		= trace_open,
+	.release	= trace_release,
+	.read		= seq_read,
+	.read_iter	= seq_read_iter,
+	.write		= trace_write,
+};
+
+/*
+ * Create the tracefs hierarchy of a remote:
+ *
+ *   <TRACEFS_DIR>/<name>/{tracing_on,buffer_size_kb,trace_pipe,trace}
+ *   <TRACEFS_DIR>/<name>/per_cpu/cpu<N>/{trace_pipe,trace}
+ *
+ * The top directory is created on first use and shared by all remotes; a
+ * local mutex serialises its creation.
+ *
+ * On failure everything created by this call is removed again: the whole top
+ * directory if this call created it, the remote's directory otherwise.
+ *
+ * Returns 0 on success (remote->dentry is then set), -ENOMEM on failure.
+ */
+static int trace_remote_init_tracefs(const char *name, struct trace_remote *remote)
+{
+	struct dentry *remote_d, *percpu_d, *d;
+	static struct dentry *root;
+	static DEFINE_MUTEX(lock);
+	bool root_inited = false;
+	int cpu;
+
+	guard(mutex)(&lock);
+
+	if (!root) {
+		root = tracefs_create_dir(TRACEFS_DIR, NULL);
+		if (!root) {
+			pr_err("Failed to create tracefs dir "TRACEFS_DIR"\n");
+			return -ENOMEM;
+		}
+		root_inited = true;
+	}
+
+	remote_d = tracefs_create_dir(name, root);
+	if (!remote_d) {
+		pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/\n", name);
+		goto err;
+	}
+
+	d = trace_create_file("tracing_on", TRACEFS_MODE_WRITE, remote_d, remote, &tracing_on_fops);
+	if (!d)
+		goto err;
+
+	d = trace_create_file("buffer_size_kb", TRACEFS_MODE_WRITE, remote_d, remote,
+			      &buffer_size_kb_fops);
+	if (!d)
+		goto err;
+
+	d = trace_create_file("trace_pipe", TRACEFS_MODE_READ, remote_d, remote, &trace_pipe_fops);
+	if (!d)
+		goto err;
+
+	d = trace_create_file("trace", TRACEFS_MODE_WRITE, remote_d, remote, &trace_fops);
+	if (!d)
+		goto err;
+
+	percpu_d = tracefs_create_dir("per_cpu", remote_d);
+	if (!percpu_d) {
+		pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/per_cpu/\n", name);
+		goto err;
+	}
+
+	for_each_possible_cpu(cpu) {
+		struct dentry *cpu_d;
+		char cpu_name[16];
+
+		snprintf(cpu_name, sizeof(cpu_name), "cpu%d", cpu);
+		cpu_d = tracefs_create_dir(cpu_name, percpu_d);
+		if (!cpu_d) {
+			/* The directory is "per_cpu": report the path that really failed */
+			pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/per_cpu/cpu%d\n",
+			       name, cpu);
+			goto err;
+		}
+
+		d = trace_create_cpu_file("trace_pipe", TRACEFS_MODE_READ, cpu_d, remote, cpu,
+					  &trace_pipe_fops);
+		if (!d)
+			goto err;
+
+		d = trace_create_cpu_file("trace", TRACEFS_MODE_WRITE, cpu_d, remote, cpu,
+					  &trace_fops);
+		if (!d)
+			goto err;
+	}
+
+	remote->dentry = remote_d;
+
+	return 0;
+
+err:
+	if (root_inited) {
+		/* The top directory was created for this remote only: drop it all */
+		tracefs_remove(root);
+		root = NULL;
+	} else {
+		tracefs_remove(remote_d);
+	}
+
+	return -ENOMEM;
+}
+
+static int trace_remote_register_events(const char *remote_name, struct trace_remote *remote,
+ struct remote_event *events, size_t nr_events);
+
+/**
+ * trace_remote_register() - Register a Tracefs remote
+ * @name: Name of the remote, used for the Tracefs remotes/ directory.
+ * @cbs: Set of callbacks used to control the remote.
+ * @priv: Private data, passed to each callback from @cbs.
+ * @events: Array of events. &remote_event.name and &remote_event.id must be
+ * filled by the caller.
+ * @nr_events: Number of events in the @events array.
+ *
+ * A trace remote is an entity, outside of the kernel (most likely firmware or
+ * hypervisor) capable of writing events into a Tracefs compatible ring-buffer.
+ * The kernel would then act as a reader.
+ *
+ * The registered remote will be found under the Tracefs directory
+ * remotes/<name>.
+ *
+ * Registration creates the Tracefs files, attaches @events to the remote and
+ * finally calls &trace_remote_callbacks.init, if provided, with the remote's
+ * directory and @priv.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv,
+ struct remote_event *events, size_t nr_events)
+{
+ struct trace_remote *remote;
+ int ret;
+
+ remote = kzalloc_obj(*remote);
+ if (!remote)
+ return -ENOMEM;
+
+ remote->cbs = cbs;
+ remote->priv = priv;
+ /* Default buffer size, presumably in bytes (7 KiB) -- TODO confirm the unit */
+ remote->trace_buffer_size = 7 << 10;
+ /* Polling period of consuming readers, in milliseconds */
+ remote->poll_ms = 100;
+ mutex_init(&remote->lock);
+ init_rwsem(&remote->reader_lock);
+
+ if (trace_remote_init_tracefs(name, remote)) {
+ kfree(remote);
+ return -ENOMEM;
+ }
+
+ /*
+ * NOTE(review): from this point on, the error paths return without
+ * freeing @remote or removing its Tracefs entries. Confirm whether
+ * leaving a half-registered remote behind is intended.
+ */
+ ret = trace_remote_register_events(name, remote, events, nr_events);
+ if (ret) {
+ pr_err("Failed to register events for trace remote '%s' (%d)\n",
+ name, ret);
+ return ret;
+ }
+
+ /* The init callback is optional */
+ ret = cbs->init ? cbs->init(remote->dentry, priv) : 0;
+ if (ret)
+ pr_err("Init failed for trace remote '%s' (%d)\n", name, ret);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(trace_remote_register);
+
+/**
+ * trace_remote_free_buffer() - Free trace buffer allocated with trace_remote_alloc_buffer()
+ * @desc: Descriptor of the per-CPU ring-buffers, originally filled by
+ *        trace_remote_alloc_buffer()
+ *
+ * For each ring-buffer described in @desc, the meta page and every data page
+ * listed in the descriptor are freed. @desc itself is not freed.
+ *
+ * Most likely called from &trace_remote_callbacks.unload_trace_buffer.
+ */
+void trace_remote_free_buffer(struct trace_buffer_desc *desc)
+{
+	struct ring_buffer_desc *rb_desc;
+	int cpu;
+
+	for_each_ring_buffer_desc(rb_desc, cpu, desc) {
+		unsigned int page = 0;
+
+		free_page(rb_desc->meta_va);
+
+		while (page < rb_desc->nr_page_va)
+			free_page(rb_desc->page_va[page++]);
+	}
+}
+EXPORT_SYMBOL_GPL(trace_remote_free_buffer);
+
+/**
+ * trace_remote_alloc_buffer() - Dynamically allocate a trace buffer
+ * @desc: Uninitialized trace_buffer_desc
+ * @desc_size: Size in bytes of the memory behind @desc. Must be at least
+ *             equal to trace_buffer_desc_size()
+ * @buffer_size: Size in bytes of each per-CPU ring-buffer
+ * @cpumask: CPUs to allocate a ring-buffer for
+ *
+ * Helper to dynamically allocate a set of pages (enough to cover @buffer_size)
+ * for each CPU from @cpumask and fill @desc. Most likely called from
+ * &trace_remote_callbacks.load_trace_buffer.
+ *
+ * Each CPU gets one meta page plus max(DIV_ROUND_UP(@buffer_size, PAGE_SIZE),
+ * 2) + 1 data pages. On failure, every page allocated by this call is freed.
+ *
+ * Return: 0 on success, -EINVAL if @desc_size is too small to describe all
+ * the ring-buffers, -ENOMEM if a page allocation failed.
+ */
+int trace_remote_alloc_buffer(struct trace_buffer_desc *desc, size_t desc_size, size_t buffer_size,
+			      const struct cpumask *cpumask)
+{
+	unsigned int nr_pages = max(DIV_ROUND_UP(buffer_size, PAGE_SIZE), 2UL) + 1;
+	/*
+	 * @desc_size is a byte count: do the arithmetic on a byte pointer.
+	 * "desc + desc_size" would scale by sizeof(*desc) and put the end far
+	 * beyond the real one, defeating the bounds check below.
+	 */
+	void *desc_end = (void *)desc + desc_size;
+	struct ring_buffer_desc *rb_desc;
+	int cpu, ret = -ENOMEM;
+	unsigned int id;
+
+	if (desc_size < struct_size(desc, __data, 0))
+		return -EINVAL;
+
+	desc->nr_cpus = 0;
+	desc->struct_len = struct_size(desc, __data, 0);
+
+	rb_desc = (struct ring_buffer_desc *)&desc->__data[0];
+
+	for_each_cpu(cpu, cpumask) {
+		/* Make sure this descriptor fits before writing to it */
+		if ((void *)rb_desc + struct_size(rb_desc, page_va, nr_pages) > desc_end) {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		rb_desc->cpu = cpu;
+		rb_desc->nr_page_va = 0;
+		rb_desc->meta_va = (unsigned long)__get_free_page(GFP_KERNEL);
+		if (!rb_desc->meta_va)
+			goto err;
+
+		for (id = 0; id < nr_pages; id++) {
+			rb_desc->page_va[id] = (unsigned long)__get_free_page(GFP_KERNEL);
+			if (!rb_desc->page_va[id])
+				goto err_partial;
+
+			rb_desc->nr_page_va++;
+		}
+		desc->nr_cpus++;
+		/*
+		 * NOTE(review): struct_size() already includes the header of
+		 * ring_buffer_desc, so adding offsetof() as well looks like it
+		 * counts that header twice per CPU -- confirm against the
+		 * users of struct_len before changing it.
+		 */
+		desc->struct_len += offsetof(struct ring_buffer_desc, page_va);
+		desc->struct_len += struct_size(rb_desc, page_va, rb_desc->nr_page_va);
+		rb_desc = __next_ring_buffer_desc(rb_desc);
+	}
+
+	return 0;
+
+err_partial:
+	/*
+	 * The descriptor being filled is not accounted in desc->nr_cpus yet,
+	 * so trace_remote_free_buffer() presumably cannot see it: release its
+	 * pages by hand to avoid leaking them.
+	 */
+	for (id = 0; id < rb_desc->nr_page_va; id++)
+		free_page(rb_desc->page_va[id]);
+	free_page(rb_desc->meta_va);
+err:
+	trace_remote_free_buffer(desc);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(trace_remote_alloc_buffer);
+
+/*
+ * Switch @evt on or off through the remote's enable_event callback.
+ *
+ * Nothing is done if the event is already in the requested state. The cached
+ * state is only updated when the callback succeeds. Caller must hold the
+ * remote lock.
+ *
+ * Returns 0 on success or the error returned by the callback.
+ */
+static int
+trace_remote_enable_event(struct trace_remote *remote, struct remote_event *evt, bool enable)
+{
+	int err;
+
+	lockdep_assert_held(&remote->lock);
+
+	if (evt->enabled != enable) {
+		err = remote->cbs->enable_event(evt->id, enable, remote->priv);
+		if (err)
+			return err;
+
+		evt->enabled = enable;
+	}
+
+	return 0;
+}
+
+/* Show handler of an event's "enable" file: prints the cached enabled state */
+static int remote_event_enable_show(struct seq_file *s, void *unused)
+{
+ struct remote_event *evt = s->private;
+
+ seq_printf(s, "%d\n", evt->enabled);
+
+ return 0;
+}
+
+/*
+ * Write handler of an event's "enable" file: parse a base-10 u8 from
+ * userspace and enable (non-zero) or disable (zero) the event accordingly.
+ *
+ * Returns @count on success, a negative error code otherwise.
+ */
+static ssize_t remote_event_enable_write(struct file *filp, const char __user *ubuf,
+					 size_t count, loff_t *ppos)
+{
+	struct seq_file *m = filp->private_data;
+	struct remote_event *evt = m->private;
+	u8 val;
+	int err;
+
+	err = kstrtou8_from_user(ubuf, count, 10, &val);
+	if (err)
+		return err;
+
+	guard(mutex)(&evt->remote->lock);
+
+	err = trace_remote_enable_event(evt->remote, evt, val);
+	if (err)
+		return err;
+
+	return count;
+}
+DEFINE_SHOW_STORE_ATTRIBUTE(remote_event_enable);
+
+/* Show handler of an event's "id" file: prints the event id */
+static int remote_event_id_show(struct seq_file *s, void *unused)
+{
+ struct remote_event *evt = s->private;
+
+ seq_printf(s, "%d\n", evt->id);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(remote_event_id);
+
+/*
+ * Show handler of an event's "format" file.
+ *
+ * Prints the event name and id, the common_type field, then one line per
+ * entry of evt->fields (the array is terminated by an entry with a NULL
+ * name) and finally the print format. Field offsets start right after the
+ * remote_event_hdr and grow by each field's size.
+ */
+static int remote_event_format_show(struct seq_file *s, void *unused)
+{
+	struct remote_event *evt = s->private;
+	size_t off = sizeof(struct remote_event_hdr);
+	struct trace_event_fields *f;
+
+	seq_printf(s, "name: %s\n", evt->name);
+	seq_printf(s, "ID: %d\n", evt->id);
+	seq_puts(s,
+		 "format:\n\tfield:unsigned short common_type;\toffset:0;\tsize:2;\tsigned:0;\n\n");
+
+	for (f = &evt->fields[0]; f->name; f++) {
+		seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%u;\tsigned:%d;\n",
+			   f->type, f->name, off, f->size,
+			   f->is_signed);
+		off += f->size;
+	}
+
+	/* Blank separator only if at least one field was printed */
+	if (f != &evt->fields[0])
+		seq_puts(s, "\n");
+
+	seq_printf(s, "print fmt: %s\n", evt->print_fmt);
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(remote_event_format);
+
+/*
+ * eventfs callback for the files of an event directory: select mode and file
+ * operations for "enable", "id" and "format".
+ *
+ * Returns 1 when @name is one of those files, 0 otherwise.
+ */
+static int remote_event_callback(const char *name, umode_t *mode, void **data,
+				 const struct file_operations **fops)
+{
+	if (!strcmp(name, "enable")) {
+		*mode = TRACEFS_MODE_WRITE;
+		*fops = &remote_event_enable_fops;
+	} else if (!strcmp(name, "id")) {
+		*mode = TRACEFS_MODE_READ;
+		*fops = &remote_event_id_fops;
+	} else if (!strcmp(name, "format")) {
+		*mode = TRACEFS_MODE_READ;
+		*fops = &remote_event_format_fops;
+	} else {
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Write handler of the events directory "enable" file: parse a base-10 u8
+ * and apply it to every event of the remote.
+ *
+ * Failures to switch individual events are ignored; @count is returned as
+ * soon as the input parsed correctly.
+ */
+static ssize_t remote_events_dir_enable_write(struct file *filp, const char __user *ubuf,
+					      size_t count, loff_t *ppos)
+{
+	struct trace_remote *remote = file_inode(filp)->i_private;
+	u8 val;
+	int idx, err;
+
+	err = kstrtou8_from_user(ubuf, count, 10, &val);
+	if (err)
+		return err;
+
+	guard(mutex)(&remote->lock);
+
+	for (idx = 0; idx < remote->nr_events; idx++)
+		trace_remote_enable_event(remote, &remote->events[idx], val);
+
+	return count;
+}
+
+/*
+ * Read handler of the events directory "enable" file.
+ *
+ * Reports '0' when all events are disabled (or there are none), '1' when all
+ * are enabled and 'X' when the events are in mixed states.
+ */
+static ssize_t remote_events_dir_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
+					     loff_t *ppos)
+{
+	struct trace_remote *remote = file_inode(filp)->i_private;
+	const char enabled_char[] = {'0', '1', 'X'};
+	char enabled_str[] = " \n";
+	int idx, state = -1;
+
+	guard(mutex)(&remote->lock);
+
+	for (idx = 0; idx < remote->nr_events; idx++) {
+		struct remote_event *evt = &remote->events[idx];
+
+		/* The first event sets the reference state */
+		if (state == -1) {
+			state = evt->enabled;
+			continue;
+		}
+
+		if (state != evt->enabled) {
+			state = 2;
+			break;
+		}
+	}
+
+	/* No event at all reads as disabled */
+	if (state == -1)
+		state = 0;
+
+	enabled_str[0] = enabled_char[state];
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, enabled_str, 2);
+}
+
+/* File operations of the events directory "enable" file */
+static const struct file_operations remote_events_dir_enable_fops = {
+	.read	= remote_events_dir_enable_read,
+	.write	= remote_events_dir_enable_write,
+};
+
+/*
+ * Read handler of the events directory "header_page" file: the output of
+ * ring_buffer_print_page_header() is generated into a temporary trace_seq
+ * and copied to userspace.
+ */
+static ssize_t
+remote_events_dir_header_page_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	struct trace_seq *seq = kmalloc(sizeof(*seq), GFP_KERNEL);
+	int ret;
+
+	if (!seq)
+		return -ENOMEM;
+
+	trace_seq_init(seq);
+	ring_buffer_print_page_header(NULL, seq);
+
+	ret = simple_read_from_buffer(ubuf, cnt, ppos, seq->buffer, trace_seq_used(seq));
+	kfree(seq);
+
+	return ret;
+}
+
+/* File operations of the read-only events directory "header_page" file */
+static const struct file_operations remote_events_dir_header_page_fops = {
+ .read = remote_events_dir_header_page_read,
+};
+
+/*
+ * Read handler of the events directory "header_event" file: the output of
+ * ring_buffer_print_entry_header() is generated into a temporary trace_seq
+ * and copied to userspace.
+ */
+static ssize_t
+remote_events_dir_header_event_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	struct trace_seq *seq = kmalloc(sizeof(*seq), GFP_KERNEL);
+	int ret;
+
+	if (!seq)
+		return -ENOMEM;
+
+	trace_seq_init(seq);
+	ring_buffer_print_entry_header(seq);
+
+	ret = simple_read_from_buffer(ubuf, cnt, ppos, seq->buffer, trace_seq_used(seq));
+	kfree(seq);
+
+	return ret;
+}
+
+/* File operations of the read-only events directory "header_event" file */
+static const struct file_operations remote_events_dir_header_event_fops = {
+ .read = remote_events_dir_header_event_read,
+};
+
+/*
+ * eventfs callback for the files of the top events directory: select mode
+ * and file operations for "enable", "header_page" and "header_event".
+ *
+ * Returns 1 when @name is one of those files, 0 otherwise.
+ */
+static int remote_events_dir_callback(const char *name, umode_t *mode, void **data,
+				      const struct file_operations **fops)
+{
+	if (!strcmp(name, "enable")) {
+		*mode = TRACEFS_MODE_WRITE;
+		*fops = &remote_events_dir_enable_fops;
+	} else if (!strcmp(name, "header_page")) {
+		*mode = TRACEFS_MODE_READ;
+		*fops = &remote_events_dir_header_page_fops;
+	} else if (!strcmp(name, "header_event")) {
+		*mode = TRACEFS_MODE_READ;
+		*fops = &remote_events_dir_header_event_fops;
+	} else {
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Create the eventfs directory of @evt, under events/<remote_name>/ in the
+ * remote's Tracefs directory.
+ *
+ * The first call for a remote also creates the "events" directory and the
+ * <remote_name> sub-directory; the latter is remembered in remote->eventfs
+ * for the following calls.
+ *
+ * If anything fails in a call that created the "events" directory, that
+ * directory is removed again so that a later call starts from scratch.
+ *
+ * Returns 0 on success, a negative error code otherwise.
+ */
+static int trace_remote_init_eventfs(const char *remote_name, struct trace_remote *remote,
+				     struct remote_event *evt)
+{
+	static struct eventfs_entry dir_entries[] = {
+		{
+			.name		= "enable",
+			.callback	= remote_events_dir_callback,
+		}, {
+			.name		= "header_page",
+			.callback	= remote_events_dir_callback,
+		}, {
+			.name		= "header_event",
+			.callback	= remote_events_dir_callback,
+		}
+	};
+	static struct eventfs_entry entries[] = {
+		{
+			.name		= "enable",
+			.callback	= remote_event_callback,
+		}, {
+			.name		= "id",
+			.callback	= remote_event_callback,
+		}, {
+			.name		= "format",
+			.callback	= remote_event_callback,
+		}
+	};
+	/* Non-NULL only when this call created the "events" directory */
+	struct eventfs_inode *events_dir = NULL;
+	struct eventfs_inode *evt_dir;
+
+	if (!remote->eventfs) {
+		struct eventfs_inode *system_dir;
+
+		events_dir = eventfs_create_events_dir("events", remote->dentry, dir_entries,
+						       ARRAY_SIZE(dir_entries), remote);
+		if (IS_ERR(events_dir))
+			return PTR_ERR(events_dir);
+
+		/*
+		 * Create similar hierarchy as local events even if a single system is supported at
+		 * the moment
+		 */
+		system_dir = eventfs_create_dir(remote_name, events_dir, NULL, 0, NULL);
+		if (IS_ERR(system_dir)) {
+			/* Do not leave an orphan "events" directory behind */
+			eventfs_remove_events_dir(events_dir);
+			return PTR_ERR(system_dir);
+		}
+
+		remote->eventfs = system_dir;
+	}
+
+	evt_dir = eventfs_create_dir(evt->name, remote->eventfs, entries, ARRAY_SIZE(entries), evt);
+	if (IS_ERR(evt_dir)) {
+		if (events_dir) {
+			/*
+			 * eventfs_remove_events_dir() pairs with
+			 * eventfs_create_events_dir(): hand it the inode that
+			 * call returned, not the per-remote sub-directory.
+			 */
+			eventfs_remove_events_dir(events_dir);
+			remote->eventfs = NULL;
+		}
+		return PTR_ERR(evt_dir);
+	}
+
+	return 0;
+}
+
+/*
+ * Attach the @events array to @remote.
+ *
+ * The array is validated before anything is modified, so a failure leaves
+ * every event untouched and the array can be fixed and registered again:
+ *   - no event may already belong to a remote (-EEXIST);
+ *   - ids must be strictly increasing, as lookup relies on a binary search
+ *     (-EINVAL).
+ *
+ * Returns 0 on success, a negative error code otherwise.
+ */
+static int trace_remote_attach_events(struct trace_remote *remote, struct remote_event *events,
+				       size_t nr_events)
+{
+	size_t i;
+
+	for (i = 0; i < nr_events; i++) {
+		struct remote_event *evt = &events[i];
+
+		if (evt->remote)
+			return -EEXIST;
+
+		/* We need events to be sorted for efficient lookup */
+		if (i && evt->id <= events[i - 1].id)
+			return -EINVAL;
+	}
+
+	/* Validation passed: it is now safe to claim the events */
+	for (i = 0; i < nr_events; i++)
+		events[i].remote = remote;
+
+	remote->events = events;
+	remote->nr_events = nr_events;
+
+	return 0;
+}
+
+/*
+ * Attach @events to @remote and create the eventfs directory of each event.
+ *
+ * Failing to attach the events is fatal. Failing to create the eventfs
+ * entries of an event is only reported with a warning: the other events are
+ * still processed and 0 is returned.
+ */
+static int trace_remote_register_events(const char *remote_name, struct trace_remote *remote,
+					 struct remote_event *events, size_t nr_events)
+{
+	size_t i;
+	int ret;
+
+	ret = trace_remote_attach_events(remote, events, nr_events);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < nr_events; i++) {
+		struct remote_event *evt = &events[i];
+
+		ret = trace_remote_init_eventfs(remote_name, remote, evt);
+		if (ret)
+			/* Terminate the line so the next log message is not appended to it */
+			pr_warn("Failed to init eventfs for event '%s' (%d)\n",
+				evt->name, ret);
+	}
+
+	return 0;
+}
+
+/*
+ * bsearch() comparison callback: @key carries the searched event id encoded
+ * in the pointer value, @data points to the remote_event to compare against.
+ */
+static int __cmp_events(const void *key, const void *data)
+{
+	const struct remote_event *candidate = data;
+	int wanted = (int)((long)key);
+
+	return wanted - (int)candidate->id;
+}
+
+/*
+ * Find the event with the given @id among the events of @remote with a
+ * binary search. Returns NULL if no event has this id.
+ */
+static struct remote_event *trace_remote_find_event(struct trace_remote *remote, unsigned short id)
+{
+	/* The id travels encoded in the key pointer, see __cmp_events() */
+	const void *key = (const void *)(unsigned long)id;
+
+	return bsearch(key, remote->events, remote->nr_events,
+		       sizeof(*remote->events), __cmp_events);
+}
diff --git a/kernel/trace/trace_snapshot.c b/kernel/trace/trace_snapshot.c
new file mode 100644
index 000000000000..07b43c9863a2
--- /dev/null
+++ b/kernel/trace/trace_snapshot.c
@@ -0,0 +1,1066 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fsnotify.h>
+
+#include <asm/setup.h> /* COMMAND_LINE_SIZE */
+
+#include "trace.h"
+
+/* Used if snapshot allocated at boot */
+/* Set when "alloc_snapshot" is given on the command line without "=<value>" */
+static bool allocate_snapshot;
+/* Set when "ftrace_boot_snapshot" is given on the command line */
+static bool snapshot_at_boot;
+
+/*
+ * Values given as "alloc_snapshot=<value>" (or "ftrace_boot_snapshot=<value>"),
+ * stored back to back, each followed by a tab.
+ */
+static char boot_snapshot_info[COMMAND_LINE_SIZE] __initdata;
+/* Offset in boot_snapshot_info where the next value is written */
+static int boot_snapshot_index;
+
+/*
+ * Handler of the "alloc_snapshot" boot parameter.
+ *
+ * Without a value, snapshot allocation is requested and the main ring buffer
+ * is marked as expanded. With "=<value>", the value followed by a tab is
+ * appended to boot_snapshot_info; values that do not fit are rejected.
+ *
+ * Returns 1 when the parameter was handled, -1 when the value does not fit.
+ */
+static int __init boot_alloc_snapshot(char *str)
+{
+	char *slot = boot_snapshot_info + boot_snapshot_index;
+	int left = sizeof(boot_snapshot_info) - boot_snapshot_index;
+
+	if (str[0] != '=') {
+		allocate_snapshot = true;
+		/* We also need the main ring buffer expanded */
+		trace_set_ring_buffer_expanded(NULL);
+		return 1;
+	}
+
+	str++;
+	if (strlen(str) >= left)
+		return -1;
+
+	boot_snapshot_index += snprintf(slot, left, "%s\t", str);
+
+	return 1;
+}
+__setup("alloc_snapshot", boot_alloc_snapshot);
+
+
+/*
+ * Handler of the "ftrace_boot_snapshot" boot parameter: flag that a snapshot
+ * is wanted at boot and process @str exactly like "alloc_snapshot" does. The
+ * result of boot_alloc_snapshot() is ignored: 1 is always returned.
+ */
+static int __init boot_snapshot(char *str)
+{
+ snapshot_at_boot = true;
+ boot_alloc_snapshot(str);
+ return 1;
+}
+__setup("ftrace_boot_snapshot", boot_snapshot);
+/*
+ * Take a snapshot of @tr, passing @cond_data on to update_max_tr().
+ *
+ * The snapshot is refused, with an explanatory message written to the trace
+ * of @tr, when:
+ *  - called from NMI context;
+ *  - the snapshot buffer is not allocated (tracing is also turned off);
+ *  - the buffer is memory mapped;
+ *  - the current tracer uses the snapshot itself.
+ *
+ * Otherwise update_max_tr() is called with interrupts disabled.
+ */
+static void tracing_snapshot_instance_cond(struct trace_array *tr,
+ void *cond_data)
+{
+ unsigned long flags;
+
+ if (in_nmi()) {
+ trace_array_puts(tr, "*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n");
+ trace_array_puts(tr, "*** snapshot is being ignored ***\n");
+ return;
+ }
+
+ if (!tr->allocated_snapshot) {
+ trace_array_puts(tr, "*** SNAPSHOT NOT ALLOCATED ***\n");
+ trace_array_puts(tr, "*** stopping trace here! ***\n");
+ tracer_tracing_off(tr);
+ return;
+ }
+
+ if (tr->mapped) {
+ trace_array_puts(tr, "*** BUFFER MEMORY MAPPED ***\n");
+ trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n");
+ return;
+ }
+
+ /* Note, snapshot can not be used when the tracer uses it */
+ if (tracer_uses_snapshot(tr->current_trace)) {
+ trace_array_puts(tr, "*** LATENCY TRACER ACTIVE ***\n");
+ trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n");
+ return;
+ }
+
+ local_irq_save(flags);
+ update_max_tr(tr, current, smp_processor_id(), cond_data);
+ local_irq_restore(flags);
+}
+
+/* Take an unconditional snapshot of @tr (no cond_data is passed) */
+void tracing_snapshot_instance(struct trace_array *tr)
+{
+ tracing_snapshot_instance_cond(tr, NULL);
+}
+
+/**
+ * tracing_snapshot_cond - conditionally take a snapshot of the current buffer.
+ * @tr: The tracing instance to snapshot
+ * @cond_data: The data to be tested conditionally, and possibly saved
+ *
+ * This is the same as tracing_snapshot() except that the snapshot is
+ * conditional - the snapshot will only happen if the
+ * cond_snapshot.update() implementation receiving the cond_data
+ * returns true, which means that the trace array's cond_snapshot
+ * update() operation used the cond_data to determine whether the
+ * snapshot should be taken, and if it was, presumably saved it along
+ * with the snapshot.
+ *
+ * This is a thin exported wrapper around tracing_snapshot_instance_cond().
+ */
+void tracing_snapshot_cond(struct trace_array *tr, void *cond_data)
+{
+ tracing_snapshot_instance_cond(tr, cond_data);
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot_cond);
+
+/**
+ * tracing_cond_snapshot_data - get the user data associated with a snapshot
+ * @tr: The tracing instance
+ *
+ * When the user enables a conditional snapshot using
+ * tracing_snapshot_cond_enable(), the user-defined cond_data is saved
+ * with the snapshot. This accessor is used to retrieve it.
+ *
+ * Should not be called from cond_snapshot.update(), since it takes
+ * the tr->max_lock lock, which the code calling
+ * cond_snapshot.update() has already done.
+ *
+ * Returns the cond_data associated with the trace array's snapshot,
+ * or NULL if no conditional snapshot is enabled.
+ */
+void *tracing_cond_snapshot_data(struct trace_array *tr)
+{
+	void *data;
+
+	local_irq_disable();
+	arch_spin_lock(&tr->max_lock);
+
+	data = tr->cond_snapshot ? tr->cond_snapshot->cond_data : NULL;
+
+	arch_spin_unlock(&tr->max_lock);
+	local_irq_enable();
+
+	return data;
+}
+EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data);
+
+/*
+ * Resize @trace_buf so that its per-CPU entries match those of @size_buf.
+ *
+ * @cpu_id selects a single CPU, or every tracing CPU when it is
+ * RING_BUFFER_ALL_CPUS. The recorded entries count of @trace_buf is only
+ * updated for the CPUs that were successfully resized; the walk stops at the
+ * first failure.
+ *
+ * Returns the result of the last ring_buffer_resize() call.
+ */
+int resize_buffer_duplicate_size(struct array_buffer *trace_buf,
+				 struct array_buffer *size_buf, int cpu_id)
+{
+	int cpu, ret = 0;
+
+	if (cpu_id != RING_BUFFER_ALL_CPUS) {
+		ret = ring_buffer_resize(trace_buf->buffer,
+				per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id);
+		if (ret)
+			return ret;
+
+		per_cpu_ptr(trace_buf->data, cpu_id)->entries =
+			per_cpu_ptr(size_buf->data, cpu_id)->entries;
+
+		return 0;
+	}
+
+	for_each_tracing_cpu(cpu) {
+		ret = ring_buffer_resize(trace_buf->buffer,
+				per_cpu_ptr(size_buf->data, cpu)->entries, cpu);
+		if (ret < 0)
+			return ret;
+
+		per_cpu_ptr(trace_buf->data, cpu)->entries =
+			per_cpu_ptr(size_buf->data, cpu)->entries;
+	}
+
+	return ret;
+}
+
+/*
+ * Make sure the snapshot buffer of @tr is allocated.
+ *
+ * Nothing is done if it already is. Otherwise the snapshot buffer gets the
+ * same sub-buffer order and per-CPU size as the main buffer.
+ *
+ * Returns 0 on success, a negative error code otherwise.
+ */
+int tracing_alloc_snapshot_instance(struct trace_array *tr)
+{
+	int order, ret;
+
+	if (tr->allocated_snapshot)
+		return 0;
+
+	/* Make the snapshot buffer have the same order as main buffer */
+	order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
+	ret = ring_buffer_subbuf_order_set(tr->snapshot_buffer.buffer, order);
+	if (ret < 0)
+		return ret;
+
+	/* allocate spare buffer */
+	ret = resize_buffer_duplicate_size(&tr->snapshot_buffer,
+					   &tr->array_buffer, RING_BUFFER_ALL_CPUS);
+	if (ret < 0)
+		return ret;
+
+	tr->allocated_snapshot = true;
+
+	return 0;
+}
+
+/*
+ * Release the memory of @tr's snapshot buffer and mark the snapshot as not
+ * allocated.
+ */
+void free_snapshot(struct trace_array *tr)
+{
+ /*
+ * We don't free the ring buffer. Instead, we shrink it, because the
+ * snapshot ring buffer has some state (e.g. ring->clock) that we
+ * want to preserve.
+ */
+ ring_buffer_subbuf_order_set(tr->snapshot_buffer.buffer, 0);
+ ring_buffer_resize(tr->snapshot_buffer.buffer, 1, RING_BUFFER_ALL_CPUS);
+ trace_set_buffer_entries(&tr->snapshot_buffer, 1);
+ tracing_reset_online_cpus(&tr->snapshot_buffer);
+ tr->allocated_snapshot = false;
+}
+
+/*
+ * Take a reference on the snapshot of @tr and make sure the snapshot buffer
+ * is allocated. Caller must hold trace_types_lock.
+ *
+ * Returns 0 on success, -EBUSY if the reference count is saturated or the
+ * buffer is memory mapped, or the error from the allocation (the reference
+ * is dropped again in that case).
+ */
+int tracing_arm_snapshot_locked(struct trace_array *tr)
+{
+	bool busy;
+	int ret;
+
+	lockdep_assert_held(&trace_types_lock);
+
+	spin_lock(&tr->snapshot_trigger_lock);
+	busy = tr->snapshot == UINT_MAX || tr->mapped;
+	if (!busy)
+		tr->snapshot++;
+	spin_unlock(&tr->snapshot_trigger_lock);
+
+	if (busy)
+		return -EBUSY;
+
+	ret = tracing_alloc_snapshot_instance(tr);
+	if (!ret)
+		return 0;
+
+	/* Allocation failed: give the reference back */
+	spin_lock(&tr->snapshot_trigger_lock);
+	tr->snapshot--;
+	spin_unlock(&tr->snapshot_trigger_lock);
+
+	return ret;
+}
+
+/* Same as tracing_arm_snapshot_locked(), taking trace_types_lock itself */
+int tracing_arm_snapshot(struct trace_array *tr)
+{
+ guard(mutex)(&trace_types_lock);
+ return tracing_arm_snapshot_locked(tr);
+}
+
+/*
+ * Drop a snapshot reference taken with tracing_arm_snapshot{,_locked}().
+ * Warns, and leaves the count alone, if there is no reference to drop.
+ */
+void tracing_disarm_snapshot(struct trace_array *tr)
+{
+ spin_lock(&tr->snapshot_trigger_lock);
+ if (!WARN_ON(!tr->snapshot))
+ tr->snapshot--;
+ spin_unlock(&tr->snapshot_trigger_lock);
+}
+
+/**
+ * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer.
+ *
+ * This is similar to tracing_snapshot(), but it will allocate the
+ * snapshot buffer if it isn't already allocated. Use this only
+ * where it is safe to sleep, as the allocation may sleep.
+ *
+ * This causes a swap between the snapshot buffer and the current live
+ * tracing buffer. You can use this to take snapshots of the live
+ * trace when some condition is triggered, but continue to trace.
+ *
+ * No snapshot is taken if the allocation fails.
+ */
+void tracing_snapshot_alloc(void)
+{
+	if (tracing_alloc_snapshot() < 0)
+		return;
+
+	tracing_snapshot();
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
+
+/**
+ * tracing_snapshot_cond_enable - enable conditional snapshot for an instance
+ * @tr:		The tracing instance
+ * @cond_data:	User data to associate with the snapshot
+ * @update:	Implementation of the cond_snapshot update function
+ *
+ * Check whether the conditional snapshot for the given instance has
+ * already been enabled, or if the current tracer is already using a
+ * snapshot; if so, return -EBUSY, else create a cond_snapshot and
+ * save the cond_data and update function inside.
+ *
+ * The cond_snapshot is allocated up front and automatically freed on
+ * every error path; ownership moves to @tr only once it is installed.
+ *
+ * Returns 0 if successful, error otherwise.
+ */
+int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data,
+				 cond_update_fn_t update)
+{
+	struct cond_snapshot *cond_snapshot __free(kfree) =
+		kzalloc_obj(*cond_snapshot);
+	int ret;
+
+	if (!cond_snapshot)
+		return -ENOMEM;
+
+	cond_snapshot->update = update;
+	cond_snapshot->cond_data = cond_data;
+
+	guard(mutex)(&trace_types_lock);
+
+	/*
+	 * The cond_snapshot can only change to NULL without the
+	 * trace_types_lock. We don't care if we race with it going
+	 * to NULL, but we want to make sure that it's not set to
+	 * something other than NULL when we get here, which we can
+	 * do safely with only holding the trace_types_lock and not
+	 * having to take the max_lock.
+	 */
+	if (tracer_uses_snapshot(tr->current_trace) || tr->cond_snapshot)
+		return -EBUSY;
+
+	ret = tracing_arm_snapshot_locked(tr);
+	if (ret)
+		return ret;
+
+	local_irq_disable();
+	arch_spin_lock(&tr->max_lock);
+	tr->cond_snapshot = no_free_ptr(cond_snapshot);
+	arch_spin_unlock(&tr->max_lock);
+	local_irq_enable();
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable);
+
+/**
+ * tracing_snapshot_cond_disable - disable conditional snapshot for an instance
+ * @tr:		The tracing instance
+ *
+ * Check whether the conditional snapshot for the given instance is
+ * enabled; if so, free the cond_snapshot associated with it and drop
+ * the snapshot reference taken by tracing_snapshot_cond_enable(),
+ * otherwise return -EINVAL.
+ *
+ * Returns 0 if successful, error otherwise.
+ */
+int tracing_snapshot_cond_disable(struct trace_array *tr)
+{
+	int ret = 0;
+
+	local_irq_disable();
+	arch_spin_lock(&tr->max_lock);
+
+	if (!tr->cond_snapshot)
+		ret = -EINVAL;
+	else {
+		kfree(tr->cond_snapshot);
+		tr->cond_snapshot = NULL;
+	}
+
+	arch_spin_unlock(&tr->max_lock);
+	local_irq_enable();
+
+	/*
+	 * Only give back the reference if there was a conditional snapshot
+	 * to disable: tracing_snapshot_cond_enable() is what armed it, and
+	 * disarming without it would steal a reference from another user.
+	 */
+	if (!ret)
+		tracing_disarm_snapshot(tr);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+#ifdef LATENCY_FS_NOTIFY
+/* Workqueue that delivers the tracing_max_latency fsnotify events */
+static struct workqueue_struct *fsnotify_wq;
+
+/* Work handler: raise FS_MODIFY on the inode of the max-latency file */
+static void latency_fsnotify_workfn(struct work_struct *work)
+{
+	struct trace_array *tr;
+
+	tr = container_of(work, struct trace_array, fsnotify_work);
+	fsnotify_inode(tr->d_max_latency->d_inode, FS_MODIFY);
+}
+
+/* irq_work handler: pass the notification on to fsnotify_wq */
+static void latency_fsnotify_workfn_irq(struct irq_work *iwork)
+{
+	struct trace_array *tr;
+
+	tr = container_of(iwork, struct trace_array, fsnotify_irqwork);
+	queue_work(fsnotify_wq, &tr->fsnotify_work);
+}
+
+/* Allocate the workqueue used for max-latency notifications */
+__init static int latency_fsnotify_init(void)
+{
+	fsnotify_wq = alloc_workqueue("tr_max_lat_wq",
+				      WQ_UNBOUND | WQ_HIGHPRI, 0);
+	if (fsnotify_wq)
+		return 0;
+
+	pr_err("Unable to allocate tr_max_lat_wq\n");
+	return -ENOMEM;
+}
+
+late_initcall_sync(latency_fsnotify_init);
+
+/* Request an fsnotify event for @tr's max-latency file, if the workqueue exists */
+void latency_fsnotify(struct trace_array *tr)
+{
+	/*
+	 * queue_work(&tr->fsnotify_work) cannot be called directly here:
+	 * the caller may be __schedule() or do_idle(), which could cause
+	 * a deadlock. Go through irq_work instead.
+	 */
+	if (fsnotify_wq)
+		irq_work_queue(&tr->fsnotify_irqwork);
+}
+#endif /* LATENCY_FS_NOTIFY */
+
+static const struct file_operations tracing_max_lat_fops;
+
+/* Create "tracing_max_latency" under @d_tracer and keep its dentry in @tr */
+void trace_create_maxlat_file(struct trace_array *tr,
+			      struct dentry *d_tracer)
+{
+	struct dentry *dentry;
+
+#ifdef LATENCY_FS_NOTIFY
+	INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn);
+	init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq);
+#endif
+	dentry = trace_create_file("tracing_max_latency", TRACE_MODE_WRITE,
+				   d_tracer, tr, &tracing_max_lat_fops);
+	tr->d_max_latency = dentry;
+}
+
+/*
+ * Store the details of the new maximum trace in the snapshot buffer's
+ * per-CPU data, so the maximum trace is permanently saved for later
+ * retrieval via /sys/kernel/tracing/tracing_max_latency.
+ */
+static void
+__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+	struct array_buffer *snap_buf = &tr->snapshot_buffer;
+	struct trace_array_cpu *live = per_cpu_ptr(tr->array_buffer.data, cpu);
+	struct trace_array_cpu *snap = per_cpu_ptr(snap_buf->data, cpu);
+
+	snap_buf->cpu = cpu;
+	snap_buf->time_start = live->preempt_timestamp;
+
+	snap->saved_latency = tr->max_latency;
+	snap->critical_start = live->critical_start;
+	snap->critical_end = live->critical_end;
+
+	/* Identity and scheduling attributes of the task being recorded */
+	strscpy(snap->comm, tsk->comm);
+	snap->pid = tsk->pid;
+	snap->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
+	snap->policy = tsk->policy;
+	snap->rt_priority = tsk->rt_priority;
+
+	/*
+	 * current_uid() does not use RCU, so prefer it when tsk == current:
+	 * the irq tracer can be called out of RCU scope.
+	 */
+	snap->uid = (tsk == current) ? current_uid() : task_uid(tsk);
+
+	/* record this task's comm */
+	tracing_record_cmdline(tsk);
+	latency_fsnotify(tr);
+}
+#else
+/* No-op variant used when CONFIG_TRACER_MAX_TRACE is not set */
+static inline void
+__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+}
+#endif /* CONFIG_TRACER_MAX_TRACE */
+
+/**
+ * update_max_tr - swap the live buffer of @tr with its snapshot buffer
+ * @tr: tracer
+ * @tsk: the task with the latency
+ * @cpu: The cpu that initiated the trace.
+ * @cond_data: User data associated with a conditional snapshot
+ *
+ * Flip @tr->array_buffer.buffer and @tr->snapshot_buffer.buffer and record
+ * information about which task was the cause of this latency.
+ *
+ * Nothing is done when @tr->stop_count is non-zero or when no snapshot
+ * buffer is allocated. When a conditional snapshot is installed, its
+ * update() callback is called with @cond_data while @tr->max_lock is held;
+ * if it returns false the buffers are left untouched.
+ *
+ * Warns once if called with interrupts enabled.
+ */
+void
+update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
+ void *cond_data)
+{
+ if (tr->stop_count)
+ return;
+
+ WARN_ON_ONCE(!irqs_disabled());
+
+ if (!tr->allocated_snapshot) {
+ /* Only the nop tracer should hit this when disabling */
+ WARN_ON_ONCE(tr->current_trace != &nop_trace);
+ return;
+ }
+
+ arch_spin_lock(&tr->max_lock);
+
+ /*
+ * Inherit the recordable setting from array_buffer, so that the
+ * buffer that becomes live after the swap has the same on/off state.
+ */
+ if (ring_buffer_record_is_set_on(tr->array_buffer.buffer))
+ ring_buffer_record_on(tr->snapshot_buffer.buffer);
+ else
+ ring_buffer_record_off(tr->snapshot_buffer.buffer);
+
+ if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) {
+ arch_spin_unlock(&tr->max_lock);
+ return;
+ }
+
+ swap(tr->array_buffer.buffer, tr->snapshot_buffer.buffer);
+
+ __update_max_tr(tr, tsk, cpu);
+
+ arch_spin_unlock(&tr->max_lock);
+
+ /*
+ * Any waiters on the old snapshot buffer need to wake up; after the
+ * swap that buffer is the one reachable through array_buffer.
+ */
+ ring_buffer_wake_waiters(tr->array_buffer.buffer, RING_BUFFER_ALL_CPUS);
+}
+
+/**
+ * update_max_tr_single - only copy one trace over, and reset the rest
+ * @tr: tracer
+ * @tsk: task with the latency
+ * @cpu: the cpu of the buffer to copy.
+ *
+ * Flip the trace of a single CPU buffer between @tr->array_buffer and
+ * @tr->snapshot_buffer using ring_buffer_swap_cpu().
+ *
+ * Nothing is done when @tr->stop_count is non-zero or when no snapshot
+ * buffer is allocated. A swap failure with -EBUSY is reported by writing
+ * a message into the snapshot buffer; any failure other than -EAGAIN or
+ * -EBUSY triggers a one-time warning. __update_max_tr() is called
+ * regardless of the result of the swap.
+ *
+ * Warns once if called with interrupts enabled.
+ */
+void
+update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+ int ret;
+
+ if (tr->stop_count)
+ return;
+
+ WARN_ON_ONCE(!irqs_disabled());
+ if (!tr->allocated_snapshot) {
+ /* Only the nop tracer should hit this when disabling */
+ WARN_ON_ONCE(tr->current_trace != &nop_trace);
+ return;
+ }
+
+ arch_spin_lock(&tr->max_lock);
+
+ ret = ring_buffer_swap_cpu(tr->snapshot_buffer.buffer, tr->array_buffer.buffer, cpu);
+
+ if (ret == -EBUSY) {
+ /*
+ * We failed to swap the buffer due to a commit taking
+ * place on this CPU. We fail to record, but we reset
+ * the max trace buffer (no one writes directly to it)
+ * and flag that it failed.
+ * Another reason is resize is in progress.
+ */
+ trace_array_printk_buf(tr->snapshot_buffer.buffer, _THIS_IP_,
+ "Failed to swap buffers due to commit or resize in progress\n");
+ }
+
+ WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
+
+ __update_max_tr(tr, tsk, cpu);
+ arch_spin_unlock(&tr->max_lock);
+}
+
+/* Print the usage text of the top-level (all CPUs) snapshot file */
+static void show_snapshot_main_help(struct seq_file *m)
+{
+	static const char help[] =
+		"# echo 0 > snapshot : Clears and frees snapshot buffer\n"
+		"# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
+		"# Takes a snapshot of the main buffer.\n"
+		"# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"
+		"# (Doesn't have to be '2' works with any number that\n"
+		"# is not a '0' or '1')\n";
+
+	seq_puts(m, help);
+}
+
+/* Print the usage text of a per_cpu snapshot file */
+static void show_snapshot_percpu_help(struct seq_file *m)
+{
+	/* What "echo 1" does depends on whether per-CPU swapping is built in */
+#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
+	static const char swap_help[] =
+		"# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
+		"# Takes a snapshot of the main buffer for this cpu.\n";
+#else
+	static const char swap_help[] =
+		"# echo 1 > snapshot : Not supported with this kernel.\n"
+		"# Must use main snapshot file to allocate.\n";
+#endif
+
+	seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
+	seq_puts(m, swap_help);
+	seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"
+		    "# (Doesn't have to be '2' works with any number that\n"
+		    "# is not a '0' or '1')\n");
+}
+
+/* Print the snapshot state of @iter's instance followed by the matching usage text */
+void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
+{
+	bool all_cpus = iter->cpu_file == RING_BUFFER_ALL_CPUS;
+
+	seq_puts(m, iter->tr->allocated_snapshot ?
+		 "#\n# * Snapshot is allocated *\n#\n" :
+		 "#\n# * Snapshot is freed *\n#\n");
+
+	seq_puts(m, "# Snapshot commands:\n");
+
+	if (!all_cpus) {
+		show_snapshot_percpu_help(m);
+		return;
+	}
+	show_snapshot_main_help(m);
+}
+
+/*
+ * Open handler of the snapshot file. Readers get a full trace iterator;
+ * write-only opens get a stub seq_file that only carries a minimal
+ * iterator as private data.
+ */
+static int tracing_snapshot_open(struct inode *inode, struct file *file)
+{
+	struct trace_array *tr = inode->i_private;
+	struct trace_iterator *iter;
+	struct seq_file *m;
+	int ret;
+
+	ret = tracing_check_open_get_tr(tr);
+	if (ret)
+		return ret;
+
+	if (file->f_mode & FMODE_READ) {
+		iter = __tracing_open(inode, file, true);
+		if (IS_ERR(iter))
+			ret = PTR_ERR(iter);
+		goto out;
+	}
+
+	/* Writes still need the seq_file to hold the private data */
+	m = kzalloc_obj(*m);
+	if (!m) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	iter = kzalloc_obj(*iter);
+	if (!iter) {
+		kfree(m);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	iter->tr = tr;
+	iter->array_buffer = &tr->snapshot_buffer;
+	iter->cpu_file = tracing_get_cpu(inode);
+	m->private = iter;
+	file->private_data = m;
+
+out:
+	/* Drop the reference taken by tracing_check_open_get_tr() on failure */
+	if (ret < 0)
+		trace_array_put(tr);
+
+	return ret;
+}
+
+/* Cross-CPU callback: swap the buffer of the CPU this runs on for instance @tr */
+static void tracing_swap_cpu_buffer(void *tr)
+{
+	struct trace_array *instance = tr;
+
+	update_max_tr_single(instance, current, smp_processor_id());
+}
+
+/*
+ * Write handler of the snapshot file. The decimal value written selects
+ * the action:
+ *   0     - free the snapshot buffer (top-level file only)
+ *   1     - allocate the snapshot buffer if needed and take a snapshot
+ *   other - clear the snapshot buffer without allocating or freeing
+ *
+ * Returns @cnt on success, or a negative error code. -EBUSY is returned
+ * while the current tracer uses the snapshot buffer or a conditional
+ * snapshot is installed.
+ */
+static ssize_t
+tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
+		       loff_t *ppos)
+{
+	struct seq_file *m = filp->private_data;
+	struct trace_iterator *iter = m->private;
+	struct trace_array *tr = iter->tr;
+	unsigned long val;
+	int ret;
+
+	ret = tracing_update_buffers(tr);
+	if (ret < 0)
+		return ret;
+
+	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+	if (ret)
+		return ret;
+
+	guard(mutex)(&trace_types_lock);
+
+	if (tracer_uses_snapshot(tr->current_trace))
+		return -EBUSY;
+
+	local_irq_disable();
+	arch_spin_lock(&tr->max_lock);
+	if (tr->cond_snapshot)
+		ret = -EBUSY;
+	arch_spin_unlock(&tr->max_lock);
+	local_irq_enable();
+	if (ret)
+		return ret;
+
+	switch (val) {
+	case 0:
+		if (iter->cpu_file != RING_BUFFER_ALL_CPUS)
+			return -EINVAL;
+		if (tr->allocated_snapshot)
+			free_snapshot(tr);
+		break;
+	case 1:
+/* Only allow per-cpu swap if the ring buffer supports it */
+#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP
+		if (iter->cpu_file != RING_BUFFER_ALL_CPUS)
+			return -EINVAL;
+#endif
+		if (tr->allocated_snapshot) {
+			ret = resize_buffer_duplicate_size(&tr->snapshot_buffer,
+					&tr->array_buffer, iter->cpu_file);
+			/*
+			 * Do not go on and swap if the resize failed; the
+			 * result used to be overwritten by the arm call below.
+			 */
+			if (ret < 0)
+				return ret;
+		}
+
+		ret = tracing_arm_snapshot_locked(tr);
+		if (ret)
+			return ret;
+
+		/* Now, we're going to swap */
+		if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
+			local_irq_disable();
+			update_max_tr(tr, current, smp_processor_id(), NULL);
+			local_irq_enable();
+		} else {
+			smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer,
+						 (void *)tr, 1);
+		}
+		tracing_disarm_snapshot(tr);
+		break;
+	default:
+		if (tr->allocated_snapshot) {
+			if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
+				tracing_reset_online_cpus(&tr->snapshot_buffer);
+			else
+				tracing_reset_cpu(&tr->snapshot_buffer, iter->cpu_file);
+		}
+		break;
+	}
+
+	if (ret >= 0) {
+		*ppos += cnt;
+		ret = cnt;
+	}
+
+	return ret;
+}
+
+/* Release handler of the snapshot file; also frees the write-only stub seq_file */
+static int tracing_snapshot_release(struct inode *inode, struct file *file)
+{
+	/* Fetch the private data before tracing_release() runs */
+	struct seq_file *stub = file->private_data;
+	int ret = tracing_release(inode, file);
+
+	if (!(file->f_mode & FMODE_READ)) {
+		/* If write only, the seq_file is just a stub */
+		if (stub)
+			kfree(stub->private);
+		kfree(stub);
+		ret = 0;
+	}
+
+	return ret;
+}
+
+/* Open the raw snapshot buffer; refused with -EBUSY while the tracer uses the snapshot */
+static int snapshot_raw_open(struct inode *inode, struct file *filp)
+{
+	struct ftrace_buffer_info *binfo;
+	int ret;
+
+	/* The following checks for tracefs lockdown */
+	ret = tracing_buffers_open(inode, filp);
+	if (ret < 0)
+		return ret;
+
+	binfo = filp->private_data;
+
+	if (!tracer_uses_snapshot(binfo->iter.trace)) {
+		binfo->iter.snapshot = true;
+		binfo->iter.array_buffer = &binfo->iter.tr->snapshot_buffer;
+		return ret;
+	}
+
+	tracing_buffers_release(inode, filp);
+	return -EBUSY;
+}
+
+/* File operations of the "snapshot" file */
+const struct file_operations snapshot_fops = {
+	.open		= tracing_snapshot_open,
+	.release	= tracing_snapshot_release,
+	.read		= seq_read,
+	.write		= tracing_snapshot_write,
+	.llseek		= tracing_lseek,
+};
+
+/* File operations for raw (binary) reads of the snapshot buffer */
+const struct file_operations snapshot_raw_fops = {
+	.open		= snapshot_raw_open,
+	.release	= tracing_buffers_release,
+	.read		= tracing_buffers_read,
+	.splice_read	= tracing_buffers_splice_read,
+};
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+/* Read handler of tracing_max_latency: report the instance's max_latency */
+static ssize_t
+tracing_max_lat_read(struct file *filp, char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	struct trace_array *instance = filp->private_data;
+	unsigned long *latency = &instance->max_latency;
+
+	return tracing_nsecs_read(latency, ubuf, cnt, ppos);
+}
+
+/* Write handler of tracing_max_latency: update the instance's max_latency */
+static ssize_t
+tracing_max_lat_write(struct file *filp, const char __user *ubuf,
+		      size_t cnt, loff_t *ppos)
+{
+	struct trace_array *instance = filp->private_data;
+	unsigned long *latency = &instance->max_latency;
+
+	return tracing_nsecs_write(latency, ubuf, cnt, ppos);
+}
+
+/* File operations of the "tracing_max_latency" file */
+static const struct file_operations tracing_max_lat_fops = {
+	.open		= tracing_open_generic_tr,
+	.release	= tracing_release_generic_tr,
+	.read		= tracing_max_lat_read,
+	.write		= tracing_max_lat_write,
+	.llseek		= generic_file_llseek,
+};
+#endif /* CONFIG_TRACER_MAX_TRACE */
+
+/*
+ * Account a new mapping of @tr's buffer. Fails with -EBUSY while a
+ * snapshot user is registered or when the mapping counter is saturated.
+ */
+int get_snapshot_map(struct trace_array *tr)
+{
+	int err;
+
+	/*
+	 * Called with mmap_lock held. lockdep would be unhappy if we would now
+	 * take trace_types_lock. Instead use the specific
+	 * snapshot_trigger_lock.
+	 */
+	spin_lock(&tr->snapshot_trigger_lock);
+	if (!tr->snapshot && tr->mapped != UINT_MAX) {
+		tr->mapped++;
+		err = 0;
+	} else {
+		err = -EBUSY;
+	}
+	spin_unlock(&tr->snapshot_trigger_lock);
+
+	/* Wait for update_max_tr() to observe iter->tr->mapped */
+	if (tr->mapped == 1)
+		synchronize_rcu();
+
+	return err;
+}
+
+/* Drop one mapping reference of @tr; warns if none is held */
+void put_snapshot_map(struct trace_array *tr)
+{
+	spin_lock(&tr->snapshot_trigger_lock);
+	if (WARN_ON(!tr->mapped))
+		goto unlock;
+	tr->mapped--;
+unlock:
+	spin_unlock(&tr->snapshot_trigger_lock);
+}
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+/* Function probe handler: take a snapshot of @tr on every hit */
+static void ftrace_snapshot(unsigned long ip, unsigned long parent_ip,
+			    struct trace_array *tr,
+			    struct ftrace_probe_ops *ops, void *data)
+{
+	tracing_snapshot_instance(tr);
+}
+
+/*
+ * Function probe handler with a hit budget: once the counter mapped to
+ * @ip has dropped to zero (or below) no further snapshot is taken. An
+ * @ip without a counter is not limited.
+ */
+static void
+ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip,
+		      struct trace_array *tr, struct ftrace_probe_ops *ops,
+		      void *data)
+{
+	struct ftrace_func_mapper *mapper = data;
+	long *remaining;
+
+	remaining = mapper ? (long *)ftrace_func_mapper_find_ip(mapper, ip) : NULL;
+	if (remaining) {
+		if (*remaining <= 0)
+			return;
+		--*remaining;
+	}
+
+	tracing_snapshot_instance(tr);
+}
+
+/* Print one snapshot probe as "<function>:snapshot:count=N" or "...:unlimited" */
+static int
+ftrace_snapshot_print(struct seq_file *m, unsigned long ip,
+		      struct ftrace_probe_ops *ops, void *data)
+{
+	struct ftrace_func_mapper *mapper = data;
+	long *remaining;
+
+	seq_printf(m, "%ps:", (void *)ip);
+	seq_puts(m, "snapshot");
+
+	remaining = mapper ? (long *)ftrace_func_mapper_find_ip(mapper, ip) : NULL;
+	if (!remaining) {
+		seq_puts(m, ":unlimited\n");
+		return 0;
+	}
+
+	seq_printf(m, ":count=%ld\n", *remaining);
+	return 0;
+}
+
+/* Probe init: create the ip -> counter mapper on first use, then register @ip */
+static int
+ftrace_snapshot_init(struct ftrace_probe_ops *ops, struct trace_array *tr,
+		     unsigned long ip, void *init_data, void **data)
+{
+	struct ftrace_func_mapper *mapper = *data;
+
+	if (mapper)
+		return ftrace_func_mapper_add_ip(mapper, ip, init_data);
+
+	mapper = allocate_ftrace_func_mapper();
+	if (!mapper)
+		return -ENOMEM;
+
+	*data = mapper;
+	return ftrace_func_mapper_add_ip(mapper, ip, init_data);
+}
+
+/* Probe teardown: a zero @ip releases the whole mapper, otherwise only @ip's entry */
+static void
+ftrace_snapshot_free(struct ftrace_probe_ops *ops, struct trace_array *tr,
+		     unsigned long ip, void *data)
+{
+	struct ftrace_func_mapper *mapper = data;
+
+	if (ip) {
+		ftrace_func_mapper_remove_ip(mapper, ip);
+		return;
+	}
+
+	if (mapper)
+		free_ftrace_func_mapper(mapper, NULL);
+}
+
+/* Probe ops for "snapshot" without a count */
+static struct ftrace_probe_ops snapshot_probe_ops = {
+	.print	= ftrace_snapshot_print,
+	.func	= ftrace_snapshot,
+};
+
+/* Probe ops for "snapshot:<count>", backed by a per-ip counter mapper */
+static struct ftrace_probe_ops snapshot_count_probe_ops = {
+	.init	= ftrace_snapshot_init,
+	.free	= ftrace_snapshot_free,
+	.print	= ftrace_snapshot_print,
+	.func	= ftrace_count_snapshot,
+};
+
+/*
+ * Handler of the "snapshot" ftrace function command. A glob starting
+ * with '!' removes the probe, anything else registers one. An optional
+ * numeric parameter limits how many snapshots are taken.
+ */
+static int
+ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash,
+			       char *glob, char *cmd, char *param, int enable)
+{
+	struct ftrace_probe_ops *ops;
+	void *count = (void *)-1;
+	char *number;
+	int ret;
+
+	if (!tr)
+		return -ENODEV;
+
+	/* hash funcs only work with set_ftrace_filter */
+	if (!enable)
+		return -EINVAL;
+
+	ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops;
+
+	if (glob[0] == '!') {
+		ret = unregister_ftrace_function_probe_func(glob+1, tr, ops);
+		if (!ret)
+			tracing_disarm_snapshot(tr);
+
+		return ret;
+	}
+
+	if (param) {
+		number = strsep(&param, ":");
+
+		if (strlen(number)) {
+			/*
+			 * We use the callback data field (which is a pointer)
+			 * as our counter.
+			 */
+			ret = kstrtoul(number, 0, (unsigned long *)&count);
+			if (ret)
+				return ret;
+		}
+	}
+
+	ret = tracing_arm_snapshot(tr);
+	if (ret < 0)
+		return ret;
+
+	ret = register_ftrace_function_probe(glob, tr, ops, count);
+	if (ret >= 0)
+		return 0;
+
+	/* Registration failed: give back the reference taken above */
+	tracing_disarm_snapshot(tr);
+	return ret;
+}
+
+/* Descriptor of the "snapshot" ftrace function command */
+static struct ftrace_func_command ftrace_snapshot_cmd = {
+	.func	= ftrace_trace_snapshot_callback,
+	.name	= "snapshot",
+};
+
+/* Register the "snapshot" function command with ftrace */
+__init int register_snapshot_cmd(void)
+{
+	int ret = register_ftrace_command(&ftrace_snapshot_cmd);
+
+	return ret;
+}
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+/*
+ * Allocate the snapshot buffer of @tr. The buffer gets @size only when
+ * allocate_snapshot is set, otherwise a minimal size of 1 is requested.
+ * Consumes the allocate_snapshot flag. Returns 0 or -ENOMEM.
+ */
+int trace_allocate_snapshot(struct trace_array *tr, int size)
+{
+	int snapshot_size;
+
+	/* Trace arrays with a range_addr_start (fixed mapped buffers) have no snapshot buffer */
+	if (tr->range_addr_start)
+		return 0;
+
+	/* allocate_snapshot can only be true during system boot */
+	snapshot_size = allocate_snapshot ? size : 1;
+	if (allocate_trace_buffer(tr, &tr->snapshot_buffer, snapshot_size) < 0)
+		return -ENOMEM;
+
+	tr->allocated_snapshot = allocate_snapshot;
+	allocate_snapshot = false;
+
+	return 0;
+}
+
+/*
+ * Tell whether the instance @name is listed in boot_snapshot_info.
+ *
+ * The list is searched for @name as a complete, tab-terminated entry:
+ * either at the very start of the string, or anywhere else as
+ * "\t<name>\t" (presumably the entries are separated by tabs - confirm
+ * against the code filling boot_snapshot_info).
+ *
+ * Returns true if @name is listed, false if it is not, if nothing was
+ * recorded (boot_snapshot_index is 0), or if the temporary search
+ * string cannot be allocated.
+ */
+__init static bool tr_needs_alloc_snapshot(const char *name)
+{
+	char *test;
+	int len = strlen(name);
+	bool ret;
+
+	if (!boot_snapshot_index)
+		return false;
+
+	/* @name is the first entry of the list */
+	if (strncmp(name, boot_snapshot_info, len) == 0 &&
+	    boot_snapshot_info[len] == '\t')
+		return true;
+
+	/* Room for the two tabs around @name and the terminating NUL */
+	test = kmalloc(len + 3, GFP_KERNEL);
+	if (!test)
+		return false;
+
+	sprintf(test, "\t%s\t", name);
+	/* A match means @name is listed; the old "== NULL" test was inverted */
+	ret = strstr(boot_snapshot_info, test) != NULL;
+	kfree(test);
+	return ret;
+}
+
+/* Request a boot-time snapshot buffer for instance @name if it needs one */
+__init void do_allocate_snapshot(const char *name)
+{
+	/*
+	 * When allocate_snapshot is set, the next call to
+	 * allocate_trace_buffers() (called by trace_array_get_by_name())
+	 * will allocate the snapshot buffer. That will also clear
+	 * this flag.
+	 */
+	if (tr_needs_alloc_snapshot(name))
+		allocate_snapshot = true;
+}
+
+/* When snapshot_at_boot is set, snapshot every instance that has a snapshot buffer */
+void __init ftrace_boot_snapshot(void)
+{
+	struct trace_array *tr;
+
+	if (!snapshot_at_boot)
+		return;
+
+	list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+		if (tr->allocated_snapshot) {
+			tracing_snapshot_instance(tr);
+			trace_array_puts(tr, "** Boot snapshot taken **\n");
+		}
+	}
+}
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 37317b81fcda..8ad72e17d8eb 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -174,7 +174,6 @@ sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadat
{ O_NOFOLLOW, "O_NOFOLLOW" },
{ O_NOATIME, "O_NOATIME" },
{ O_CLOEXEC, "O_CLOEXEC" },
- { -1, NULL }
};
trace_seq_printf(s, "%s(", entry->name);
@@ -205,7 +204,7 @@ sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadat
trace_seq_puts(s, "O_RDONLY|");
}
- trace_print_flags_seq(s, "|", bits, __flags);
+ trace_print_flags_seq(s, "|", bits, __flags, ARRAY_SIZE(__flags));
/*
* trace_print_flags_seq() adds a '\0' to the
* buffer, but this needs to append more to the seq.
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index bf1a507695b6..0dd7927df22a 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -386,13 +386,11 @@ static void tracing_map_elt_init_fields(struct tracing_map_elt *elt)
}
}
-static void tracing_map_elt_free(struct tracing_map_elt *elt)
+static void __tracing_map_elt_free(struct tracing_map_elt *elt)
{
if (!elt)
return;
- if (elt->map->ops && elt->map->ops->elt_free)
- elt->map->ops->elt_free(elt);
kfree(elt->fields);
kfree(elt->vars);
kfree(elt->var_set);
@@ -400,6 +398,17 @@ static void tracing_map_elt_free(struct tracing_map_elt *elt)
kfree(elt);
}
+static void tracing_map_elt_free(struct tracing_map_elt *elt)
+{
+ if (!elt)
+ return;
+
+ /* Only objects initialized with alloc_elt() should be passed to free_elt(). */
+ if (elt->map->ops && elt->map->ops->elt_free)
+ elt->map->ops->elt_free(elt);
+ __tracing_map_elt_free(elt);
+}
+
static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map)
{
struct tracing_map_elt *elt;
@@ -444,7 +453,7 @@ static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map)
}
return elt;
free:
- tracing_map_elt_free(elt);
+ __tracing_map_elt_free(elt);
return ERR_PTR(err);
}
diff --git a/kernel/trace/undefsyms_base.c b/kernel/trace/undefsyms_base.c
new file mode 100644
index 000000000000..e65baf58e6ff
--- /dev/null
+++ b/kernel/trace/undefsyms_base.c
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * simple_ring_buffer is used by the pKVM hypervisor which does not have access
+ * to all kernel symbols. Whatever is undefined when compiling this file is
+ * compiler and tooling-generated symbols that can safely be ignored for
+ * simple_ring_buffer.
+ */
+
+#include <linux/atomic.h>
+#include <linux/string.h>
+#include <asm/page.h>
+
+/* Prototype for the function defined below. */
+void undefsyms_base(void *p, int n);
+
+/* Page-sized, page-aligned scratch buffer used as a memset() target below. */
+static char page[PAGE_SIZE] __aligned(PAGE_SIZE);
+
+/*
+ * Exercise memset() on a page-sized and on a small buffer, memcpy(),
+ * cmpxchg() and WARN_ON(). Per the comment at the top of this file, what
+ * ends up undefined when this file is compiled is the set of compiler and
+ * tooling-generated symbols that simple_ring_buffer may ignore.
+ *
+ * NOTE(review): the function is presumably never meant to be called and
+ * the volatile-qualified casts are presumably there to keep the compiler
+ * from optimizing the calls away - confirm against the build rules that
+ * use this file.
+ */
+void undefsyms_base(void *p, int n)
+{
+ char buffer[256] = { 0 };
+
+ u32 u = 0;
+ memset((char * volatile)page, 8, PAGE_SIZE);
+ memset((char * volatile)buffer, 8, sizeof(buffer));
+ memcpy((void * volatile)p, buffer, sizeof(buffer));
+ cmpxchg((u32 * volatile)&u, 0, 8);
+ WARN_ON(n == 0xdeadbeef);
+}