From ec9dd352d591f0c90402ec67a317c1ed4fb2e638 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 18 Sep 2017 16:38:36 -0700 Subject: bpf: one perf event close won't free bpf program attached by another perf event This patch fixes a bug exhibited by the following scenario: 1. fd1 = perf_event_open with attr.config = ID1 2. attach bpf program prog1 to fd1 3. fd2 = perf_event_open with attr.config = ID1 4. user program closes fd2 and prog1 is detached from the tracepoint. 5. user program with fd1 does not work properly as tracepoint no output any more. The issue happens at step 4. Multiple perf_event_open can be called successfully, but only one bpf prog pointer in the tp_event. In the current logic, any fd release for the same tp_event will free the tp_event->prog. The fix is to free tp_event->prog only when the closing fd corresponds to the one which registered the program. Signed-off-by: Yonghong Song Signed-off-by: David S. Miller --- kernel/events/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 3e691b75b2db..6bc21e202ae4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8171,6 +8171,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) } } event->tp_event->prog = prog; + event->tp_event->bpf_prog_owner = event; return 0; } @@ -8185,7 +8186,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event) return; prog = event->tp_event->prog; - if (prog) { + if (prog && event->tp_event->bpf_prog_owner == event) { event->tp_event->prog = NULL; bpf_prog_put(prog); } -- cgit v1.2.3 From 441430eb54a00586f95f1aefc48e0801bbd6a923 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 6 Sep 2017 19:08:11 +0300 Subject: perf/aux: Only update ->aux_wakeup in non-overwrite mode The following commit: d9a50b0256 ("perf/aux: Ensure aux_wakeup represents most recent wakeup index") changed the AUX wakeup position calculation to rounddown(), which causes a division-by-zero in AUX overwrite mode (aka "snapshot mode"). The zero denominator results from the fact that perf record doesn't set aux_watermark to anything, in which case the kernel will set it to half the AUX buffer size, but only for non-overwrite mode. In the overwrite mode aux_watermark stays zero. The good news is that, AUX overwrite mode, wakeups don't happen and related bookkeeping is not relevant, so we can simply forego the whole wakeup updates. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/20170906160811.16510-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index af71a84e12ee..f684d8e5fa2b 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -412,6 +412,19 @@ err: return NULL; } +static bool __always_inline rb_need_aux_wakeup(struct ring_buffer *rb) +{ + if (rb->aux_overwrite) + return false; + + if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { + rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); + return true; + } + + return false; +} + /* * Commit the data written by hardware into the ring buffer by adjusting * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the @@ -451,10 +464,8 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) } rb->user_page->aux_head = rb->aux_head; - if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { + if (rb_need_aux_wakeup(rb)) wakeup = true; - rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); - } if (wakeup) { if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) @@ -484,9 +495,8 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) rb->aux_head += size; rb->user_page->aux_head = rb->aux_head; - if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { + if (rb_need_aux_wakeup(rb)) { perf_output_wakeup(handle); - rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); handle->wakeup = rb->aux_wakeup + rb->aux_watermark; } -- cgit v1.2.3 From 5bce9db1894c998c5b85a34036d679ea6517668f Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 29 Aug 2017 17:01:03 +0300 Subject: perf/core: Explain perf_sched_mutex To clarify why atomic_inc_return(&perf_sched_events) is not sufficient and a mutex is needed to order static branch enabling vs the atomic counter increment, this adds a comment with a short explanation. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170829140103.6563-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6bc21e202ae4..5ee62714f9a6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9394,6 +9394,11 @@ static void account_event(struct perf_event *event) inc = true; if (inc) { + /* + * We need the mutex here because static_branch_enable() + * must complete *before* the perf_sched_count increment + * becomes visible. + */ if (atomic_inc_not_zero(&perf_sched_count)) goto enabled; -- cgit v1.2.3 From 97562633bcbac4a07d605ae628d7655fa71caaf5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 5 Oct 2017 09:19:19 -0700 Subject: bpf: perf event change needed for subsequent bpf helpers This patch does not impact existing functionalities. It contains the changes in perf event area needed for subsequent bpf_perf_event_read_value and bpf_perf_prog_read_value helpers. Signed-off-by: Yonghong Song Acked-by: Peter Zijlstra (Intel) Signed-off-by: David S. Miller --- kernel/events/core.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6bc21e202ae4..902149f05381 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3684,10 +3684,12 @@ static inline u64 perf_event_count(struct perf_event *event) * will not be local and we cannot read them atomically * - must not have a pmu::count method */ -int perf_event_read_local(struct perf_event *event, u64 *value) +int perf_event_read_local(struct perf_event *event, u64 *value, + u64 *enabled, u64 *running) { unsigned long flags; int ret = 0; + u64 now; /* * Disabling interrupts avoids all counter scheduling (context @@ -3718,13 +3720,21 @@ int perf_event_read_local(struct perf_event *event, u64 *value) goto out; } + now = event->shadow_ctx_time + perf_clock(); + if (enabled) + *enabled = now - event->tstamp_enabled; /* * If the event is currently on this CPU, its either a per-task event, * or local to this CPU. Furthermore it means its ACTIVE (otherwise * oncpu == -1). */ - if (event->oncpu == smp_processor_id()) + if (event->oncpu == smp_processor_id()) { event->pmu->read(event); + if (running) + *running = now - event->tstamp_running; + } else if (running) { + *running = event->total_time_running; + } *value = local64_read(&event->count); out: @@ -8072,6 +8082,7 @@ static void bpf_overflow_handler(struct perf_event *event, struct bpf_perf_event_data_kern ctx = { .data = data, .regs = regs, + .event = event, }; int ret = 0; -- cgit v1.2.3 From df0062b27ebf473b372914a3e3574d93790e2b72 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 3 Oct 2017 15:20:50 +0100 Subject: perf/core: Avoid freeing static PMU contexts when PMU is unregistered Since commit: 1fd7e4169954 ("perf/core: Remove perf_cpu_context::unique_pmu") ... when a PMU is unregistered then its associated ->pmu_cpu_context is unconditionally freed. Whilst this is fine for dynamically allocated context types (i.e. those registered using perf_invalid_context), this causes a problem for sharing of static contexts such as perf_{sw,hw}_context, which are used by multiple built-in PMUs and effectively have a global lifetime. Whilst testing the ARM SPE driver, which must use perf_sw_context to support per-task AUX tracing, unregistering the driver as a result of a module unload resulted in: Unable to handle kernel NULL pointer dereference at virtual address 00000038 Internal error: Oops: 96000004 [#1] PREEMPT SMP Modules linked in: [last unloaded: arm_spe_pmu] PC is at ctx_resched+0x38/0xe8 LR is at perf_event_exec+0x20c/0x278 [...] ctx_resched+0x38/0xe8 perf_event_exec+0x20c/0x278 setup_new_exec+0x88/0x118 load_elf_binary+0x26c/0x109c search_binary_handler+0x90/0x298 do_execveat_common.isra.14+0x540/0x618 SyS_execve+0x38/0x48 since the software context has been freed and the ctx.pmu->pmu_disable_count field has been set to NULL. This patch fixes the problem by avoiding the freeing of static PMU contexts altogether. Whilst the sharing of dynamic contexts is questionable, this actually requires the caller to share their context pointer explicitly and so the burden is on them to manage the object lifetime. Reported-by: Kim Phillips Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 1fd7e4169954 ("perf/core: Remove perf_cpu_context::unique_pmu") Link: http://lkml.kernel.org/r/1507040450-7730-1-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6bc21e202ae4..243bfc68d0fe 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8955,6 +8955,14 @@ static struct perf_cpu_context __percpu *find_pmu_context(int ctxn) static void free_pmu_context(struct pmu *pmu) { + /* + * Static contexts such as perf_sw_context have a global lifetime + * and may be shared between different PMUs. Avoid freeing them + * when a single PMU is going away. + */ + if (pmu->task_ctx_nr > perf_invalid_context) + return; + mutex_lock(&pmus_lock); free_percpu(pmu->pmu_cpu_context); mutex_unlock(&pmus_lock); -- cgit v1.2.3 From e6a5203399d19871021c1fa0eb2a08fc63b67e91 Mon Sep 17 00:00:00 2001 From: "leilei.lin" Date: Fri, 29 Sep 2017 13:54:44 +0800 Subject: perf/core: Fix cgroup time when scheduling descendants Update cgroup time when an event is scheduled in by descendants. Reviewed-and-tested-by: Jiri Olsa Signed-off-by: leilei.lin Signed-off-by: Peter Zijlstra (Intel) Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@kernel.org Cc: alexander.shishkin@linux.intel.com Cc: brendan.d.gregg@gmail.com Cc: yang_oliver@hotmail.com Link: http://lkml.kernel.org/r/CALPjY3mkHiekRkRECzMi9G-bjUQOvOjVBAqxmWkTzc-g+0LwMg@mail.gmail.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 243bfc68d0fe..9d93db81fa36 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -662,7 +662,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) /* * Do not update time when cgroup is not active */ - if (cgrp == event->cgrp) + if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) __update_cgrp_time(event->cgrp); } -- cgit v1.2.3 From 8fd0fbbe8888f295eb34172a7e47bf7d3a0a4687 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Oct 2017 09:45:29 +0200 Subject: perf/ftrace: Revert ("perf/ftrace: Fix double traces of perf on ftrace:function") Revert commit: 75e8387685f6 ("perf/ftrace: Fix double traces of perf on ftrace:function") The reason I instantly stumbled on that patch is that it only addresses the ftrace situation and doesn't mention the other _5_ places that use this interface. It doesn't explain why those don't have the problem and if not, why their solution doesn't work for ftrace. It doesn't, but this is just putting more duct tape on. Link: http://lkml.kernel.org/r/20171011080224.200565770@infradead.org Cc: Zhou Chengming Cc: Jiri Olsa Cc: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt (VMware) --- kernel/events/core.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6bc21e202ae4..b8db80c5513b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7954,15 +7954,16 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, } } perf_tp_event(call->event.type, count, raw_data, size, regs, head, - rctx, task, NULL); + rctx, task); } EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, - struct task_struct *task, struct perf_event *event) + struct task_struct *task) { struct perf_sample_data data; + struct perf_event *event; struct perf_raw_record raw = { .frag = { @@ -7976,15 +7977,9 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, perf_trace_buf_update(record, event_type); - /* Use the given event instead of the hlist */ - if (event) { + hlist_for_each_entry_rcu(event, head, hlist_entry) { if (perf_tp_event_match(event, &data, regs)) perf_swevent_event(event, count, &data, regs); - } else { - hlist_for_each_entry_rcu(event, head, hlist_entry) { - if (perf_tp_event_match(event, &data, regs)) - perf_swevent_event(event, count, &data, regs); - } } /* -- cgit v1.2.3 From bc1d202023eb66f088f736ba423bee1cf135c720 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 16 Aug 2016 16:53:15 +0100 Subject: perf/core: Export AUX buffer helpers to modules Perf PMU drivers using AUX buffers cannot be built as modules unless the AUX helpers are exported. This patch exports perf_aux_output_{begin,end,skip} and perf_get_aux to modules. Cc: Peter Zijlstra Signed-off-by: Will Deacon --- kernel/events/ring_buffer.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/events') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index f684d8e5fa2b..6d9bffe4d6cc 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -411,6 +411,7 @@ err: return NULL; } +EXPORT_SYMBOL_GPL(perf_aux_output_begin); static bool __always_inline rb_need_aux_wakeup(struct ring_buffer *rb) { @@ -480,6 +481,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) rb_free_aux(rb); ring_buffer_put(rb); } +EXPORT_SYMBOL_GPL(perf_aux_output_end); /* * Skip over a given number of bytes in the AUX buffer, due to, for example, @@ -505,6 +507,7 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) return 0; } +EXPORT_SYMBOL_GPL(perf_aux_output_skip); void *perf_get_aux(struct perf_output_handle *handle) { @@ -514,6 +517,7 @@ void *perf_get_aux(struct perf_output_handle *handle) return handle->rb->aux_priv; } +EXPORT_SYMBOL_GPL(perf_get_aux); #define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY) -- cgit v1.2.3 From 506458efaf153c1ea480591c5602a5a3ba5a3b76 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 24 Oct 2017 11:22:48 +0100 Subject: locking/barriers: Convert users of lockless_dereference() to READ_ONCE() READ_ONCE() now has an implicit smp_read_barrier_depends() call, so it can be used instead of lockless_dereference() without any change in semantics. Signed-off-by: Will Deacon Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1508840570-22169-4-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9d93db81fa36..824a583079a1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4231,7 +4231,7 @@ static void perf_remove_from_owner(struct perf_event *event) * indeed free this event, otherwise we need to serialize on * owner->perf_event_mutex. */ - owner = lockless_dereference(event->owner); + owner = READ_ONCE(event->owner); if (owner) { /* * Since delayed_put_task_struct() also drops the last @@ -4328,7 +4328,7 @@ again: * Cannot change, child events are not migrated, see the * comment with perf_event_ctx_lock_nested(). */ - ctx = lockless_dereference(child->ctx); + ctx = READ_ONCE(child->ctx); /* * Since child_mutex nests inside ctx::mutex, we must jump * through hoops. We start by grabbing a reference on the ctx. -- cgit v1.2.3 From 0b4c6841fee03e096b735074a0c4aab3a8e92986 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 23 Oct 2017 23:53:07 -0700 Subject: bpf: use the same condition in perf event set/free bpf handler This is a cleanup such that doing the same check in perf_event_free_bpf_prog as we already do in perf_event_set_bpf_prog step. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- kernel/events/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 31ee304a5844..9f78a6825bbe 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8191,10 +8191,10 @@ static void perf_event_free_bpf_prog(struct perf_event *event) { struct bpf_prog *prog; - perf_event_free_bpf_handler(event); - - if (!event->tp_event) + if (event->attr.type != PERF_TYPE_TRACEPOINT) { + perf_event_free_bpf_handler(event); return; + } prog = event->tp_event->prog; if (prog && event->tp_event->bpf_prog_owner == event) { -- cgit v1.2.3 From e87c6bc3852b981e71c757be20771546ce9f76f3 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 23 Oct 2017 23:53:08 -0700 Subject: bpf: permit multiple bpf attachments for a single perf event This patch enables multiple bpf attachments for a kprobe/uprobe/tracepoint single trace event. Each trace_event keeps a list of attached perf events. When an event happens, all attached bpf programs will be executed based on the order of attachment. A global bpf_event_mutex lock is introduced to protect prog_array attaching and detaching. An alternative will be introduce a mutex lock in every trace_event_call structure, but it takes a lot of extra memory. So a global bpf_event_mutex lock is a good compromise. The bpf prog detachment involves allocation of memory. If the allocation fails, a dummy do-nothing program will replace to-be-detached program in-place. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- kernel/events/core.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9f78a6825bbe..9660ee65fbef 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7954,11 +7954,9 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, struct pt_regs *regs, struct hlist_head *head, struct task_struct *task) { - struct bpf_prog *prog = call->prog; - - if (prog) { + if (bpf_prog_array_valid(call)) { *(struct pt_regs **)raw_data = regs; - if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) { + if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) { perf_swevent_put_recursion_context(rctx); return; } @@ -8147,13 +8145,11 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) { bool is_kprobe, is_tracepoint, is_syscall_tp; struct bpf_prog *prog; + int ret; if (event->attr.type != PERF_TYPE_TRACEPOINT) return perf_event_set_bpf_handler(event, prog_fd); - if (event->tp_event->prog) - return -EEXIST; - is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; is_syscall_tp = is_syscall_trace_event(event->tp_event); @@ -8181,26 +8177,20 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return -EACCES; } } - event->tp_event->prog = prog; - event->tp_event->bpf_prog_owner = event; - return 0; + ret = perf_event_attach_bpf_prog(event, prog); + if (ret) + bpf_prog_put(prog); + return ret; } static void perf_event_free_bpf_prog(struct perf_event *event) { - struct bpf_prog *prog; - if (event->attr.type != PERF_TYPE_TRACEPOINT) { perf_event_free_bpf_handler(event); return; } - - prog = event->tp_event->prog; - if (prog && event->tp_event->bpf_prog_owner == event) { - event->tp_event->prog = NULL; - bpf_prog_put(prog); - } + perf_event_detach_bpf_prog(event); } #else -- cgit v1.2.3 From 6aa7de059173a986114ac43b8f50b297a86f09a8 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Oct 2017 14:07:29 -0700 Subject: locking/atomics: COCCINELLE/treewide: Convert trivial ACCESS_ONCE() patterns to READ_ONCE()/WRITE_ONCE() Please do not apply this to mainline directly, instead please re-run the coccinelle script shown below and apply its output. For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't harmful, and changing them results in churn. However, for some features, the read/write distinction is critical to correct operation. To distinguish these cases, separate read/write accessors must be used. This patch migrates (most) remaining ACCESS_ONCE() instances to {READ,WRITE}_ONCE(), using the following coccinelle script: ---- // Convert trivial ACCESS_ONCE() uses to equivalent READ_ONCE() and // WRITE_ONCE() // $ make coccicheck COCCI=/home/mark/once.cocci SPFLAGS="--include-headers" MODE=patch virtual patch @ depends on patch @ expression E1, E2; @@ - ACCESS_ONCE(E1) = E2 + WRITE_ONCE(E1, E2) @ depends on patch @ expression E; @@ - ACCESS_ONCE(E) + READ_ONCE(E) ---- Signed-off-by: Mark Rutland Signed-off-by: Paul E. McKenney Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: davem@davemloft.net Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: shuah@kernel.org Cc: snitzer@redhat.com Cc: thor.thayer@linux.intel.com Cc: tj@kernel.org Cc: viro@zeniv.linux.org.uk Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1508792849-3115-19-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 6 +++--- kernel/events/ring_buffer.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 824a583079a1..8fd2f2d1358a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1200,7 +1200,7 @@ perf_event_ctx_lock_nested(struct perf_event *event, int nesting) again: rcu_read_lock(); - ctx = ACCESS_ONCE(event->ctx); + ctx = READ_ONCE(event->ctx); if (!atomic_inc_not_zero(&ctx->refcount)) { rcu_read_unlock(); goto again; @@ -5302,8 +5302,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (!rb) goto aux_unlock; - aux_offset = ACCESS_ONCE(rb->user_page->aux_offset); - aux_size = ACCESS_ONCE(rb->user_page->aux_size); + aux_offset = READ_ONCE(rb->user_page->aux_offset); + aux_size = READ_ONCE(rb->user_page->aux_size); if (aux_offset < perf_data_size(rb) + PAGE_SIZE) goto aux_unlock; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index f684d8e5fa2b..f3e37971c842 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -381,7 +381,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, * (B) <-> (C) ordering is still observed by the pmu driver. */ if (!rb->aux_overwrite) { - aux_tail = ACCESS_ONCE(rb->user_page->aux_tail); + aux_tail = READ_ONCE(rb->user_page->aux_tail); handle->wakeup = rb->aux_wakeup + rb->aux_watermark; if (aux_head - aux_tail < perf_aux_size(rb)) handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); -- cgit v1.2.3 From 7d9285e82db5defca4d9674ba089429eeca0c697 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 5 Oct 2017 09:19:19 -0700 Subject: perf/bpf: Extend the perf_event_read_local() interface, a.k.a. "bpf: perf event change needed for subsequent bpf helpers" eBPF programs would like access to the (perf) event enabled and running times along with the event value, such that they can deal with event multiplexing (among other things). This patch extends the interface; a future eBPF patch will utilize the new functionality. [ Note, there's a same-content commit with a poor changelog and a meaningless title in the networking tree as well - but we need this change for subsequent perf work, so apply it here as well, with a proper changelog. Hopefully Git will be able to sort out this somewhat messy workflow, if there are no other, conflicting changes to these files. ] Signed-off-by: Yonghong Song [ Rewrote the changelog. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: David S. Miller Link: http://lkml.kernel.org/r/20171005161923.332790-2-yhs@fb.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 04989fb769f0..74671e101b92 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3684,10 +3684,12 @@ static inline u64 perf_event_count(struct perf_event *event) * will not be local and we cannot read them atomically * - must not have a pmu::count method */ -int perf_event_read_local(struct perf_event *event, u64 *value) +int perf_event_read_local(struct perf_event *event, u64 *value, + u64 *enabled, u64 *running) { unsigned long flags; int ret = 0; + u64 now; /* * Disabling interrupts avoids all counter scheduling (context @@ -3718,13 +3720,21 @@ int perf_event_read_local(struct perf_event *event, u64 *value) goto out; } + now = event->shadow_ctx_time + perf_clock(); + if (enabled) + *enabled = now - event->tstamp_enabled; /* * If the event is currently on this CPU, its either a per-task event, * or local to this CPU. Furthermore it means its ACTIVE (otherwise * oncpu == -1). */ - if (event->oncpu == smp_processor_id()) + if (event->oncpu == smp_processor_id()) { event->pmu->read(event); + if (running) + *running = now - event->tstamp_running; + } else if (running) { + *running = event->total_time_running; + } *value = local64_read(&event->count); out: @@ -8072,6 +8082,7 @@ static void bpf_overflow_handler(struct perf_event *event, struct bpf_perf_event_data_kern ctx = { .data = data, .regs = regs, + .event = event, }; int ret = 0; -- cgit v1.2.3 From ca0dd44cf37bfe316751e1701439c3ff44beb05c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Sep 2017 13:23:44 +0200 Subject: perf/core: Fix perf_event_read_value() locking perf_event_read_value() is an external accessor, just like perf_event_{en,dis}able() and should thus use perf_event_ctx_lock(). Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: f63a8daa5812 ("perf: Fix event->ctx locking") Signed-off-by: Ingo Molnar --- kernel/events/core.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 74671e101b92..d636f1d0e2ab 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4398,7 +4398,7 @@ static int perf_release(struct inode *inode, struct file *file) return 0; } -u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) +static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event *child; u64 total = 0; @@ -4426,6 +4426,18 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) return total; } + +u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) +{ + struct perf_event_context *ctx; + u64 count; + + ctx = perf_event_ctx_lock(event); + count = __perf_event_read_value(event, enabled, running); + perf_event_ctx_unlock(event, ctx); + + return count; +} EXPORT_SYMBOL_GPL(perf_event_read_value); static int __perf_read_group_add(struct perf_event *leader, @@ -4528,7 +4540,7 @@ static int perf_read_one(struct perf_event *event, u64 values[4]; int n = 0; - values[n++] = perf_event_read_value(event, &enabled, &running); + values[n++] = __perf_event_read_value(event, &enabled, &running); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) values[n++] = enabled; if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) -- cgit v1.2.3 From 0ee098c97a6ebe163180e74b740c0a37bcdaa173 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Sep 2017 13:24:28 +0200 Subject: perf/core: Update ctx time before detaching events We should make sure the ctx time is updated before we detach events; which will want to update event times. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/events/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index d636f1d0e2ab..5fae98626b15 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11129,6 +11129,7 @@ static void __perf_event_exit_context(void *__info) struct perf_event *event; raw_spin_lock(&ctx->lock); + ctx_sched_out(ctx, cpuctx, EVENT_TIME); list_for_each_entry(event, &ctx->event_list, event_entry) __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); raw_spin_unlock(&ctx->lock); -- cgit v1.2.3 From a9cd8194e1e6bd09619954721dfaf0f94fe2003e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Sep 2017 13:38:24 +0200 Subject: perf/core: Fix __perf_read_group_add() locking Event timestamps are serialized using ctx->lock, make sure to hold it over reading all values. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 5fae98626b15..0f34f2a47eb6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4453,6 +4453,8 @@ static int __perf_read_group_add(struct perf_event *leader, if (ret) return ret; + raw_spin_lock_irqsave(&ctx->lock, flags); + /* * Since we co-schedule groups, {enabled,running} times of siblings * will be identical to those of the leader, so we only publish one @@ -4475,8 +4477,6 @@ static int __perf_read_group_add(struct perf_event *leader, if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); - raw_spin_lock_irqsave(&ctx->lock, flags); - list_for_each_entry(sub, &leader->sibling_list, group_entry) { values[n++] += perf_event_count(sub); if (read_format & PERF_FORMAT_ID) -- cgit v1.2.3 From 3c5c8711dcb32115156cc7672fa095afa4339eaa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Sep 2017 13:44:51 +0200 Subject: perf/core: Make sure to update ctx time before using it We should make sure to update ctx time before we use it to update event times. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/events/core.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 0f34f2a47eb6..b249e0f197a7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1893,6 +1893,11 @@ __perf_remove_from_context(struct perf_event *event, { unsigned long flags = (unsigned long)info; + if (ctx->is_active & EVENT_TIME) { + update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx); + } + event_sched_out(event, cpuctx, ctx); if (flags & DETACH_GROUP) perf_group_detach(event); @@ -1955,8 +1960,11 @@ static void __perf_event_disable(struct perf_event *event, if (event->state < PERF_EVENT_STATE_INACTIVE) return; - update_context_time(ctx); - update_cgrp_time_from_event(event); + if (ctx->is_active & EVENT_TIME) { + update_context_time(ctx); + update_cgrp_time_from_event(event); + } + update_group_times(event); if (event == event->group_leader) group_sched_out(event, cpuctx, ctx); -- cgit v1.2.3 From 8ca2bd41c7d1c135e9ac6f25970c2d491865088a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Sep 2017 14:12:35 +0200 Subject: perf/core: Rename 'enum perf_event_active_state' Its a weird name, active is one of the states, it should not be part of the name, also, its too long. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index b249e0f197a7..6322e245176c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10801,7 +10801,7 @@ inherit_event(struct perf_event *parent_event, struct perf_event *group_leader, struct perf_event_context *child_ctx) { - enum perf_event_active_state parent_state = parent_event->state; + enum perf_event_state parent_state = parent_event->state; struct perf_event *child_event; unsigned long flags; -- cgit v1.2.3 From 7f0ec32526d2446fdd79b0c6c6d08e64ef9fb1f3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Sep 2017 14:17:58 +0200 Subject: perf/core: Remove wrong barrier The barrier and comment make no sense: - if what the barrier says is true, it should be wmb() but that should then be part of the arch driver, not the generic code. - if it is an SMP barrier, there must be a matching barrier, and there isn't one. So kill it. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/events/core.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6322e245176c..205a4321f678 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2097,11 +2097,6 @@ event_sched_in(struct perf_event *event, event->hw.interrupts = 0; } - /* - * The new state must be visible before we turn it on in the hardware: - */ - smp_wmb(); - perf_pmu_disable(event->pmu); perf_set_shadow_time(event, ctx, tstamp); -- cgit v1.2.3 From 0c1cbc18df9e38182a0604b15535699c84d7342a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Sep 2017 16:26:44 +0200 Subject: perf/core: Fix perf_event_read() perf_event_read() has a number of issues regarding the timekeeping bits. - The IPI didn't update group times when it found INACTIVE - The direct call would not re-check ->state after taking ctx->lock which can result in ->count and timestamps getting out of sync. And we can make use of the ordering introduced for perf_event_stop() to make it more accurate for ACTIVE. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/events/core.c | 55 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 16 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 205a4321f678..6f74f9c35490 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2081,8 +2081,9 @@ event_sched_in(struct perf_event *event, WRITE_ONCE(event->oncpu, smp_processor_id()); /* - * Order event::oncpu write to happen before the ACTIVE state - * is visible. + * Order event::oncpu write to happen before the ACTIVE state is + * visible. This allows perf_event_{stop,read}() to observe the correct + * ->oncpu if it sees ACTIVE. */ smp_wmb(); WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE); @@ -3638,12 +3639,16 @@ static void __perf_event_read(void *info) return; raw_spin_lock(&ctx->lock); - if (ctx->is_active) { + if (ctx->is_active & EVENT_TIME) { update_context_time(ctx); update_cgrp_time_from_event(event); } - update_event_times(event); + if (!data->group) + update_event_times(event); + else + update_group_times(event); + if (event->state != PERF_EVENT_STATE_ACTIVE) goto unlock; @@ -3658,7 +3663,6 @@ static void __perf_event_read(void *info) pmu->read(event); list_for_each_entry(sub, &event->sibling_list, group_entry) { - update_event_times(sub); if (sub->state == PERF_EVENT_STATE_ACTIVE) { /* * Use sibling's PMU rather than @event's since @@ -3748,23 +3752,35 @@ out: static int perf_event_read(struct perf_event *event, bool group) { + enum perf_event_state state = READ_ONCE(event->state); int event_cpu, ret = 0; /* * If event is enabled and currently active on a CPU, update the * value in the event structure: */ - if (event->state == PERF_EVENT_STATE_ACTIVE) { - struct perf_read_data data = { - .event = event, - .group = group, - .ret = 0, - }; +again: + if (state == PERF_EVENT_STATE_ACTIVE) { + struct perf_read_data data; + + /* + * Orders the ->state and ->oncpu loads such that if we see + * ACTIVE we must also see the right ->oncpu. + * + * Matches the smp_wmb() from event_sched_in(). + */ + smp_rmb(); event_cpu = READ_ONCE(event->oncpu); if ((unsigned)event_cpu >= nr_cpu_ids) return 0; + data = (struct perf_read_data){ + .event = event, + .group = group, + .ret = 0, + }; + preempt_disable(); event_cpu = __perf_event_read_cpu(event, event_cpu); @@ -3781,20 +3797,27 @@ static int perf_event_read(struct perf_event *event, bool group) (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1); preempt_enable(); ret = data.ret; - } else if (event->state == PERF_EVENT_STATE_INACTIVE) { + + } else if (state == PERF_EVENT_STATE_INACTIVE) { struct perf_event_context *ctx = event->ctx; unsigned long flags; raw_spin_lock_irqsave(&ctx->lock, flags); + state = event->state; + if (state != PERF_EVENT_STATE_INACTIVE) { + raw_spin_unlock_irqrestore(&ctx->lock, flags); + goto again; + } + /* - * may read while context is not active - * (e.g., thread is blocked), in that case - * we cannot update context time + * May read while context is not active (e.g., thread is + * blocked), in that case we cannot update context time */ - if (ctx->is_active) { + if (ctx->is_active & EVENT_TIME) { update_context_time(ctx); update_cgrp_time_from_event(event); } + if (group) update_group_times(event); else -- cgit v1.2.3 From 0d3d73aac2ff05c78387aa9dcc2c8aa3804405e7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Sep 2017 14:16:28 +0200 Subject: perf/core: Rewrite event timekeeping The current even timekeeping, which computes enabled and running times, uses 3 distinct timestamps to reflect the various event states: OFF (stopped), INACTIVE (enabled) and ACTIVE (running). Furthermore, the update rules are such that even INACTIVE events need their timestamps updated. This is undesirable because we'd like to not touch INACTIVE events if at all possible, this makes event scheduling (much) more expensive than needed. Rewrite the timekeeping to directly use event->state, this greatly simplifies the code and results in only having to update things when we change state, or an up-to-date value is requested (read). Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/events/core.c | 385 +++++++++++++++++---------------------------------- 1 file changed, 129 insertions(+), 256 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6f74f9c35490..2551e8ce7224 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -582,6 +582,88 @@ static inline u64 perf_event_clock(struct perf_event *event) return event->clock(); } +/* + * State based event timekeeping... + * + * The basic idea is to use event->state to determine which (if any) time + * fields to increment with the current delta. This means we only need to + * update timestamps when we change state or when they are explicitly requested + * (read). + * + * Event groups make things a little more complicated, but not terribly so. The + * rules for a group are that if the group leader is OFF the entire group is + * OFF, irrespecive of what the group member states are. This results in + * __perf_effective_state(). + * + * A futher ramification is that when a group leader flips between OFF and + * !OFF, we need to update all group member times. + * + * + * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we + * need to make sure the relevant context time is updated before we try and + * update our timestamps. + */ + +static __always_inline enum perf_event_state +__perf_effective_state(struct perf_event *event) +{ + struct perf_event *leader = event->group_leader; + + if (leader->state <= PERF_EVENT_STATE_OFF) + return leader->state; + + return event->state; +} + +static __always_inline void +__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running) +{ + enum perf_event_state state = __perf_effective_state(event); + u64 delta = now - event->tstamp; + + *enabled = event->total_time_enabled; + if (state >= PERF_EVENT_STATE_INACTIVE) + *enabled += delta; + + *running = event->total_time_running; + if (state >= PERF_EVENT_STATE_ACTIVE) + *running += delta; +} + +static void perf_event_update_time(struct perf_event *event) +{ + u64 now = perf_event_time(event); + + __perf_update_times(event, now, &event->total_time_enabled, + &event->total_time_running); + event->tstamp = now; +} + +static void perf_event_update_sibling_time(struct perf_event *leader) +{ + struct perf_event *sibling; + + list_for_each_entry(sibling, &leader->sibling_list, group_entry) + perf_event_update_time(sibling); +} + +static void +perf_event_set_state(struct perf_event *event, enum perf_event_state state) +{ + if (event->state == state) + return; + + perf_event_update_time(event); + /* + * If a group leader gets enabled/disabled all its siblings + * are affected too. + */ + if ((event->state < 0) ^ (state < 0)) + perf_event_update_sibling_time(event); + + WRITE_ONCE(event->state, state); +} + #ifdef CONFIG_CGROUP_PERF static inline bool @@ -841,40 +923,6 @@ perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) event->shadow_ctx_time = now - t->timestamp; } -static inline void -perf_cgroup_defer_enabled(struct perf_event *event) -{ - /* - * when the current task's perf cgroup does not match - * the event's, we need to remember to call the - * perf_mark_enable() function the first time a task with - * a matching perf cgroup is scheduled in. - */ - if (is_cgroup_event(event) && !perf_cgroup_match(event)) - event->cgrp_defer_enabled = 1; -} - -static inline void -perf_cgroup_mark_enabled(struct perf_event *event, - struct perf_event_context *ctx) -{ - struct perf_event *sub; - u64 tstamp = perf_event_time(event); - - if (!event->cgrp_defer_enabled) - return; - - event->cgrp_defer_enabled = 0; - - event->tstamp_enabled = tstamp - event->total_time_enabled; - list_for_each_entry(sub, &event->sibling_list, group_entry) { - if (sub->state >= PERF_EVENT_STATE_INACTIVE) { - sub->tstamp_enabled = tstamp - sub->total_time_enabled; - sub->cgrp_defer_enabled = 0; - } - } -} - /* * Update cpuctx->cgrp so that it is set when first cgroup event is added and * cleared when last cgroup event is removed. @@ -972,17 +1020,6 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event) return 0; } -static inline void -perf_cgroup_defer_enabled(struct perf_event *event) -{ -} - -static inline void -perf_cgroup_mark_enabled(struct perf_event *event, - struct perf_event_context *ctx) -{ -} - static inline void list_update_cgroup_event(struct perf_event *event, struct perf_event_context *ctx, bool add) @@ -1396,60 +1433,6 @@ static u64 perf_event_time(struct perf_event *event) return ctx ? ctx->time : 0; } -/* - * Update the total_time_enabled and total_time_running fields for a event. - */ -static void update_event_times(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - u64 run_end; - - lockdep_assert_held(&ctx->lock); - - if (event->state < PERF_EVENT_STATE_INACTIVE || - event->group_leader->state < PERF_EVENT_STATE_INACTIVE) - return; - - /* - * in cgroup mode, time_enabled represents - * the time the event was enabled AND active - * tasks were in the monitored cgroup. This is - * independent of the activity of the context as - * there may be a mix of cgroup and non-cgroup events. - * - * That is why we treat cgroup events differently - * here. - */ - if (is_cgroup_event(event)) - run_end = perf_cgroup_event_time(event); - else if (ctx->is_active) - run_end = ctx->time; - else - run_end = event->tstamp_stopped; - - event->total_time_enabled = run_end - event->tstamp_enabled; - - if (event->state == PERF_EVENT_STATE_INACTIVE) - run_end = event->tstamp_stopped; - else - run_end = perf_event_time(event); - - event->total_time_running = run_end - event->tstamp_running; - -} - -/* - * Update total_time_enabled and total_time_running for all events in a group. - */ -static void update_group_times(struct perf_event *leader) -{ - struct perf_event *event; - - update_event_times(leader); - list_for_each_entry(event, &leader->sibling_list, group_entry) - update_event_times(event); -} - static enum event_type_t get_event_type(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; @@ -1492,6 +1475,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); event->attach_state |= PERF_ATTACH_CONTEXT; + event->tstamp = perf_event_time(event); + /* * If we're a stand alone event or group leader, we go to the context * list, group events are kept attached to the group so that @@ -1699,8 +1684,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (event->group_leader == event) list_del_init(&event->group_entry); - update_group_times(event); - /* * If event was in error state, then keep it * that way, otherwise bogus counts will be @@ -1709,7 +1692,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) * of the event */ if (event->state > PERF_EVENT_STATE_OFF) - event->state = PERF_EVENT_STATE_OFF; + perf_event_set_state(event, PERF_EVENT_STATE_OFF); ctx->generation++; } @@ -1808,38 +1791,24 @@ event_sched_out(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - u64 tstamp = perf_event_time(event); - u64 delta; + enum perf_event_state state = PERF_EVENT_STATE_INACTIVE; WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); - /* - * An event which could not be activated because of - * filter mismatch still needs to have its timings - * maintained, otherwise bogus information is return - * via read() for time_enabled, time_running: - */ - if (event->state == PERF_EVENT_STATE_INACTIVE && - !event_filter_match(event)) { - delta = tstamp - event->tstamp_stopped; - event->tstamp_running += delta; - event->tstamp_stopped = tstamp; - } - if (event->state != PERF_EVENT_STATE_ACTIVE) return; perf_pmu_disable(event->pmu); - event->tstamp_stopped = tstamp; event->pmu->del(event, 0); event->oncpu = -1; - event->state = PERF_EVENT_STATE_INACTIVE; + if (event->pending_disable) { event->pending_disable = 0; - event->state = PERF_EVENT_STATE_OFF; + state = PERF_EVENT_STATE_OFF; } + perf_event_set_state(event, state); if (!is_software_event(event)) cpuctx->active_oncpu--; @@ -1859,7 +1828,9 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event; - int state = group_event->state; + + if (group_event->state != PERF_EVENT_STATE_ACTIVE) + return; perf_pmu_disable(ctx->pmu); @@ -1873,7 +1844,7 @@ group_sched_out(struct perf_event *group_event, perf_pmu_enable(ctx->pmu); - if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) + if (group_event->attr.exclusive) cpuctx->exclusive = 0; } @@ -1965,12 +1936,12 @@ static void __perf_event_disable(struct perf_event *event, update_cgrp_time_from_event(event); } - update_group_times(event); if (event == event->group_leader) group_sched_out(event, cpuctx, ctx); else event_sched_out(event, cpuctx, ctx); - event->state = PERF_EVENT_STATE_OFF; + + perf_event_set_state(event, PERF_EVENT_STATE_OFF); } /* @@ -2027,8 +1998,7 @@ void perf_event_disable_inatomic(struct perf_event *event) } static void perf_set_shadow_time(struct perf_event *event, - struct perf_event_context *ctx, - u64 tstamp) + struct perf_event_context *ctx) { /* * use the correct time source for the time snapshot @@ -2056,9 +2026,9 @@ static void perf_set_shadow_time(struct perf_event *event, * is cleaner and simpler to understand. */ if (is_cgroup_event(event)) - perf_cgroup_set_shadow_time(event, tstamp); + perf_cgroup_set_shadow_time(event, event->tstamp); else - event->shadow_ctx_time = tstamp - ctx->timestamp; + event->shadow_ctx_time = event->tstamp - ctx->timestamp; } #define MAX_INTERRUPTS (~0ULL) @@ -2071,7 +2041,6 @@ event_sched_in(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - u64 tstamp = perf_event_time(event); int ret = 0; lockdep_assert_held(&ctx->lock); @@ -2086,7 +2055,7 @@ event_sched_in(struct perf_event *event, * ->oncpu if it sees ACTIVE. */ smp_wmb(); - WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE); + perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE); /* * Unthrottle events, since we scheduled we might have missed several @@ -2100,19 +2069,17 @@ event_sched_in(struct perf_event *event, perf_pmu_disable(event->pmu); - perf_set_shadow_time(event, ctx, tstamp); + perf_set_shadow_time(event, ctx); perf_log_itrace_start(event); if (event->pmu->add(event, PERF_EF_START)) { - event->state = PERF_EVENT_STATE_INACTIVE; + perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); event->oncpu = -1; ret = -EAGAIN; goto out; } - event->tstamp_running += tstamp - event->tstamp_stopped; - if (!is_software_event(event)) cpuctx->active_oncpu++; if (!ctx->nr_active++) @@ -2136,8 +2103,6 @@ group_sched_in(struct perf_event *group_event, { struct perf_event *event, *partial_group = NULL; struct pmu *pmu = ctx->pmu; - u64 now = ctx->time; - bool simulate = false; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; @@ -2167,27 +2132,13 @@ group_error: /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: - * The events up to the failed event are scheduled out normally, - * tstamp_stopped will be updated. - * - * The failed events and the remaining siblings need to have - * their timings updated as if they had gone thru event_sched_in() - * and event_sched_out(). This is required to get consistent timings - * across the group. This also takes care of the case where the group - * could never be scheduled by ensuring tstamp_stopped is set to mark - * the time the event was actually stopped, such that time delta - * calculation in update_event_times() is correct. + * The events up to the failed event are scheduled out normally. */ list_for_each_entry(event, &group_event->sibling_list, group_entry) { if (event == partial_group) - simulate = true; + break; - if (simulate) { - event->tstamp_running += now - event->tstamp_stopped; - event->tstamp_stopped = now; - } else { - event_sched_out(event, cpuctx, ctx); - } + event_sched_out(event, cpuctx, ctx); } event_sched_out(group_event, cpuctx, ctx); @@ -2229,46 +2180,11 @@ static int group_can_go_on(struct perf_event *event, return can_add_hw; } -/* - * Complement to update_event_times(). This computes the tstamp_* values to - * continue 'enabled' state from @now, and effectively discards the time - * between the prior tstamp_stopped and now (as we were in the OFF state, or - * just switched (context) time base). - * - * This further assumes '@event->state == INACTIVE' (we just came from OFF) and - * cannot have been scheduled in yet. And going into INACTIVE state means - * '@event->tstamp_stopped = @now'. - * - * Thus given the rules of update_event_times(): - * - * total_time_enabled = tstamp_stopped - tstamp_enabled - * total_time_running = tstamp_stopped - tstamp_running - * - * We can insert 'tstamp_stopped == now' and reverse them to compute new - * tstamp_* values. - */ -static void __perf_event_enable_time(struct perf_event *event, u64 now) -{ - WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE); - - event->tstamp_stopped = now; - event->tstamp_enabled = now - event->total_time_enabled; - event->tstamp_running = now - event->total_time_running; -} - static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { - u64 tstamp = perf_event_time(event); - list_add_event(event, ctx); perf_group_attach(event); - /* - * We can be called with event->state == STATE_OFF when we create with - * .disabled = 1. In that case the IOC_ENABLE will call this function. - */ - if (event->state == PERF_EVENT_STATE_INACTIVE) - __perf_event_enable_time(event, tstamp); } static void ctx_sched_out(struct perf_event_context *ctx, @@ -2499,28 +2415,6 @@ again: raw_spin_unlock_irq(&ctx->lock); } -/* - * Put a event into inactive state and update time fields. - * Enabling the leader of a group effectively enables all - * the group members that aren't explicitly disabled, so we - * have to update their ->tstamp_enabled also. - * Note: this works for group members as well as group leaders - * since the non-leader members' sibling_lists will be empty. - */ -static void __perf_event_mark_enabled(struct perf_event *event) -{ - struct perf_event *sub; - u64 tstamp = perf_event_time(event); - - event->state = PERF_EVENT_STATE_INACTIVE; - __perf_event_enable_time(event, tstamp); - list_for_each_entry(sub, &event->sibling_list, group_entry) { - /* XXX should not be > INACTIVE if event isn't */ - if (sub->state >= PERF_EVENT_STATE_INACTIVE) - __perf_event_enable_time(sub, tstamp); - } -} - /* * Cross CPU call to enable a performance event */ @@ -2539,14 +2433,12 @@ static void __perf_event_enable(struct perf_event *event, if (ctx->is_active) ctx_sched_out(ctx, cpuctx, EVENT_TIME); - __perf_event_mark_enabled(event); + perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); if (!ctx->is_active) return; if (!event_filter_match(event)) { - if (is_cgroup_event(event)) - perf_cgroup_defer_enabled(event); ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); return; } @@ -2866,18 +2758,10 @@ static void __perf_event_sync_stat(struct perf_event *event, * we know the event must be on the current CPU, therefore we * don't need to use it. */ - switch (event->state) { - case PERF_EVENT_STATE_ACTIVE: + if (event->state == PERF_EVENT_STATE_ACTIVE) event->pmu->read(event); - /* fall-through */ - case PERF_EVENT_STATE_INACTIVE: - update_event_times(event); - break; - - default: - break; - } + perf_event_update_time(event); /* * In order to keep per-task stats reliable we need to flip the event @@ -3114,10 +2998,6 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, if (!event_filter_match(event)) continue; - /* may need to reset tstamp_enabled */ - if (is_cgroup_event(event)) - perf_cgroup_mark_enabled(event, ctx); - if (group_can_go_on(event, cpuctx, 1)) group_sched_in(event, cpuctx, ctx); @@ -3125,10 +3005,8 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, * If this pinned group hasn't been scheduled, * put it in error state. */ - if (event->state == PERF_EVENT_STATE_INACTIVE) { - update_group_times(event); - event->state = PERF_EVENT_STATE_ERROR; - } + if (event->state == PERF_EVENT_STATE_INACTIVE) + perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } } @@ -3150,10 +3028,6 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, if (!event_filter_match(event)) continue; - /* may need to reset tstamp_enabled */ - if (is_cgroup_event(event)) - perf_cgroup_mark_enabled(event, ctx); - if (group_can_go_on(event, cpuctx, can_add_hw)) { if (group_sched_in(event, cpuctx, ctx)) can_add_hw = 0; @@ -3545,7 +3419,7 @@ static int event_enable_on_exec(struct perf_event *event, if (event->state >= PERF_EVENT_STATE_INACTIVE) return 0; - __perf_event_mark_enabled(event); + perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); return 1; } @@ -3644,10 +3518,9 @@ static void __perf_event_read(void *info) update_cgrp_time_from_event(event); } - if (!data->group) - update_event_times(event); - else - update_group_times(event); + perf_event_update_time(event); + if (data->group) + perf_event_update_sibling_time(event); if (event->state != PERF_EVENT_STATE_ACTIVE) goto unlock; @@ -3696,7 +3569,6 @@ int perf_event_read_local(struct perf_event *event, u64 *value, { unsigned long flags; int ret = 0; - u64 now; /* * Disabling interrupts avoids all counter scheduling (context @@ -3727,23 +3599,26 @@ int perf_event_read_local(struct perf_event *event, u64 *value, goto out; } - now = event->shadow_ctx_time + perf_clock(); - if (enabled) - *enabled = now - event->tstamp_enabled; + /* * If the event is currently on this CPU, its either a per-task event, * or local to this CPU. Furthermore it means its ACTIVE (otherwise * oncpu == -1). */ - if (event->oncpu == smp_processor_id()) { + if (event->oncpu == smp_processor_id()) event->pmu->read(event); - if (running) - *running = now - event->tstamp_running; - } else if (running) { - *running = event->total_time_running; - } *value = local64_read(&event->count); + if (enabled || running) { + u64 now = event->shadow_ctx_time + perf_clock(); + u64 __enabled, __running; + + __perf_update_times(event, now, &__enabled, &__running); + if (enabled) + *enabled = __enabled; + if (running) + *running = __running; + } out: local_irq_restore(flags); @@ -3818,10 +3693,9 @@ again: update_cgrp_time_from_event(event); } + perf_event_update_time(event); if (group) - update_group_times(event); - else - update_event_times(event); + perf_event_update_sibling_time(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } @@ -4945,8 +4819,7 @@ static void calc_timer_values(struct perf_event *event, *now = perf_clock(); ctx_time = event->shadow_ctx_time + *now; - *enabled = ctx_time - event->tstamp_enabled; - *running = ctx_time - event->tstamp_running; + __perf_update_times(event, ctx_time, enabled, running); } static void perf_event_init_userpage(struct perf_event *event) @@ -10581,7 +10454,7 @@ perf_event_exit_event(struct perf_event *child_event, if (parent_event) perf_group_detach(child_event); list_del_event(child_event, child_ctx); - child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */ + perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */ raw_spin_unlock_irq(&child_ctx->lock); /* -- cgit v1.2.3 From be96b316deff35e119760982c43af74e606fa143 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 28 Oct 2017 09:49:37 -0700 Subject: perf/cgroup: Fix perf cgroup hierarchy support The following commit: 864c2357ca89 ("perf/core: Do not set cpuctx->cgrp for unscheduled cgroups") made list_update_cgroup_event() skip setting cpuctx->cgrp if no cgroup event targets %current's cgroup. This breaks perf_event's hierarchical support because events which target one of the ancestors get ignored. Fix it by using cgroup_is_descendant() test instead of equality. Signed-off-by: Tejun Heo Acked-by: Thomas Gleixner Cc: Arnaldo Carvalho de Melo Cc: David Carrillo-Cisneros Cc: Linus Torvalds Cc: Peter Zijlstra Cc: kernel-team@fb.com Cc: stable@vger.kernel.org # v4.9+ Fixes: 864c2357ca89 ("perf/core: Do not set cpuctx->cgrp for unscheduled cgroups") Link: http://lkml.kernel.org/r/20171028164237.GA972780@devbig577.frc2.facebook.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9d93db81fa36..10cdb9c26b5d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -901,9 +901,11 @@ list_update_cgroup_event(struct perf_event *event, cpuctx_entry = &cpuctx->cgrp_cpuctx_entry; /* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/ if (add) { + struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); + list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list)); - if (perf_cgroup_from_task(current, ctx) == event->cgrp) - cpuctx->cgrp = event->cgrp; + if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) + cpuctx->cgrp = cgrp; } else { list_del(cpuctx_entry); cpuctx->cgrp = NULL; -- cgit v1.2.3 From b24413180f5600bcb3bb70fbed5cf186b60864bd Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 1 Nov 2017 15:07:57 +0100 Subject: License cleanup: add SPDX GPL-2.0 license identifier to files with no license Many source files in the tree are missing licensing information, which makes it harder for compliance tools to determine the correct license. By default all files without license information are under the default license of the kernel, which is GPL version 2. Update the files which contain no license information with the 'GPL-2.0' SPDX license identifier. The SPDX identifier is a legally binding shorthand, which can be used instead of the full boiler plate text. This patch is based on work done by Thomas Gleixner and Kate Stewart and Philippe Ombredanne. How this work was done: Patches were generated and checked against linux-4.14-rc6 for a subset of the use cases: - file had no licensing information it it. - file was a */uapi/* one with no licensing information in it, - file was a */uapi/* one with existing licensing information, Further patches will be generated in subsequent months to fix up cases where non-standard license headers were used, and references to license had to be inferred by heuristics based on keywords. The analysis to determine which SPDX License Identifier to be applied to a file was done in a spreadsheet of side by side results from of the output of two independent scanners (ScanCode & Windriver) producing SPDX tag:value files created by Philippe Ombredanne. Philippe prepared the base worksheet, and did an initial spot review of a few 1000 files. The 4.13 kernel was the starting point of the analysis with 60,537 files assessed. Kate Stewart did a file by file comparison of the scanner results in the spreadsheet to determine which SPDX license identifier(s) to be applied to the file. She confirmed any determination that was not immediately clear with lawyers working with the Linux Foundation. Criteria used to select files for SPDX license identifier tagging was: - Files considered eligible had to be source code files. - Make and config files were included as candidates if they contained >5 lines of source - File already had some variant of a license header in it (even if <5 lines). All documentation files were explicitly excluded. The following heuristics were used to determine which SPDX license identifiers to apply. - when both scanners couldn't find any license traces, file was considered to have no license information in it, and the top level COPYING file license applied. For non */uapi/* files that summary was: SPDX license identifier # files ---------------------------------------------------|------- GPL-2.0 11139 and resulted in the first patch in this series. If that file was a */uapi/* path one, it was "GPL-2.0 WITH Linux-syscall-note" otherwise it was "GPL-2.0". Results of that was: SPDX license identifier # files ---------------------------------------------------|------- GPL-2.0 WITH Linux-syscall-note 930 and resulted in the second patch in this series. - if a file had some form of licensing information in it, and was one of the */uapi/* ones, it was denoted with the Linux-syscall-note if any GPL family license was found in the file or had no licensing in it (per prior point). Results summary: SPDX license identifier # files ---------------------------------------------------|------ GPL-2.0 WITH Linux-syscall-note 270 GPL-2.0+ WITH Linux-syscall-note 169 ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) 21 ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) 17 LGPL-2.1+ WITH Linux-syscall-note 15 GPL-1.0+ WITH Linux-syscall-note 14 ((GPL-2.0+ WITH Linux-syscall-note) OR BSD-3-Clause) 5 LGPL-2.0+ WITH Linux-syscall-note 4 LGPL-2.1 WITH Linux-syscall-note 3 ((GPL-2.0 WITH Linux-syscall-note) OR MIT) 3 ((GPL-2.0 WITH Linux-syscall-note) AND MIT) 1 and that resulted in the third patch in this series. - when the two scanners agreed on the detected license(s), that became the concluded license(s). - when there was disagreement between the two scanners (one detected a license but the other didn't, or they both detected different licenses) a manual inspection of the file occurred. - In most cases a manual inspection of the information in the file resulted in a clear resolution of the license that should apply (and which scanner probably needed to revisit its heuristics). - When it was not immediately clear, the license identifier was confirmed with lawyers working with the Linux Foundation. - If there was any question as to the appropriate license identifier, the file was flagged for further research and to be revisited later in time. In total, over 70 hours of logged manual review was done on the spreadsheet to determine the SPDX license identifiers to apply to the source files by Kate, Philippe, Thomas and, in some cases, confirmation by lawyers working with the Linux Foundation. Kate also obtained a third independent scan of the 4.13 code base from FOSSology, and compared selected files where the other two scanners disagreed against that SPDX file, to see if there was new insights. The Windriver scanner is based on an older version of FOSSology in part, so they are related. Thomas did random spot checks in about 500 files from the spreadsheets for the uapi headers and agreed with SPDX license identifier in the files he inspected. For the non-uapi files Thomas did random spot checks in about 15000 files. In initial set of patches against 4.14-rc6, 3 files were found to have copy/paste license identifier errors, and have been fixed to reflect the correct identifier. Additionally Philippe spent 10 hours this week doing a detailed manual inspection and review of the 12,461 patched files from the initial patch version early this week with: - a full scancode scan run, collecting the matched texts, detected license ids and scores - reviewing anything where there was a license detected (about 500+ files) to ensure that the applied SPDX license was correct - reviewing anything where there was no detection but the patch license was not GPL-2.0 WITH Linux-syscall-note to ensure that the applied SPDX license was correct This produced a worksheet with 20 files needing minor correction. This worksheet was then exported into 3 different .csv files for the different types of files to be modified. These .csv files were then reviewed by Greg. Thomas wrote a script to parse the csv files and add the proper SPDX tag to the file, in the format that the file expected. This script was further refined by Greg based on the output to detect more types of files automatically and to distinguish between header and source .c files (which need different comment types.) Finally Greg ran the script using the .csv files to generate the patches. Reviewed-by: Kate Stewart Reviewed-by: Philippe Ombredanne Reviewed-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/events/Makefile | 1 + kernel/events/internal.h | 1 + 2 files changed, 2 insertions(+) (limited to 'kernel/events') diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 2925188f50ea..3c022e33c109 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE) endif diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 843e97047335..09b1537ae06c 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _KERNEL_EVENTS_INTERNAL_H #define _KERNEL_EVENTS_INTERNAL_H -- cgit v1.2.3 From 164446455a5d3f1402b5a0ea42acce33fd576ed7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Nov 2017 16:01:24 +0100 Subject: perf/core: Use lockdep to assert IRQs are disabled/enabled Use lockdep to check that IRQs are enabled or disabled as expected. This way the sanity check only shows overhead when concurrency correctness debug code is enabled. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: David S . Miller Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Tejun Heo Link: http://lkml.kernel.org/r/1509980490-4285-9-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index b315aebbcc3f..c298847d4b85 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -209,7 +209,7 @@ static int event_function(void *info) struct perf_event_context *task_ctx = cpuctx->task_ctx; int ret = 0; - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); perf_ctx_lock(cpuctx, task_ctx); /* @@ -306,7 +306,7 @@ static void event_function_local(struct perf_event *event, event_f func, void *d struct task_struct *task = READ_ONCE(ctx->task); struct perf_event_context *task_ctx = NULL; - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); if (task) { if (task == TASK_TOMBSTONE) @@ -1006,7 +1006,7 @@ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) struct perf_cpu_context *cpuctx; int rotations = 0; - WARN_ON(!irqs_disabled()); + lockdep_assert_irqs_disabled(); cpuctx = container_of(hr, struct perf_cpu_context, hrtimer); rotations = perf_rotate_context(cpuctx); @@ -1093,7 +1093,7 @@ static void perf_event_ctx_activate(struct perf_event_context *ctx) { struct list_head *head = this_cpu_ptr(&active_ctx_list); - WARN_ON(!irqs_disabled()); + lockdep_assert_irqs_disabled(); WARN_ON(!list_empty(&ctx->active_ctx_list)); @@ -1102,7 +1102,7 @@ static void perf_event_ctx_activate(struct perf_event_context *ctx) static void perf_event_ctx_deactivate(struct perf_event_context *ctx) { - WARN_ON(!irqs_disabled()); + lockdep_assert_irqs_disabled(); WARN_ON(list_empty(&ctx->active_ctx_list)); @@ -3523,7 +3523,7 @@ void perf_event_task_tick(void) struct perf_event_context *ctx, *tmp; int throttled; - WARN_ON(!irqs_disabled()); + lockdep_assert_irqs_disabled(); __this_cpu_inc(perf_throttled_seq); throttled = __this_cpu_xchg(perf_throttled_count, 0); -- cgit v1.2.3 From dd0bb688eaa241b5655d396d45366cba9225aed9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 7 Nov 2017 15:28:42 -0500 Subject: bpf: add a bpf_override_function helper Error injection is sloppy and very ad-hoc. BPF could fill this niche perfectly with it's kprobe functionality. We could make sure errors are only triggered in specific call chains that we care about with very specific situations. Accomplish this with the bpf_override_funciton helper. This will modify the probe'd callers return value to the specified value and set the PC to an override function that simply returns, bypassing the originally probed function. This gives us a nice clean way to implement systematic error injection for all of our code paths. Acked-by: Alexei Starovoitov Signed-off-by: Josef Bacik Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/events/core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 42d24bd64ea4..ac240d31b5bf 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8171,6 +8171,13 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return -EINVAL; } + /* Kprobe override only works for kprobes, not uprobes. */ + if (prog->kprobe_override && + !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) { + bpf_prog_put(prog); + return -EINVAL; + } + if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); -- cgit v1.2.3 From f3edacbd697f94a743fff1a3d26910ab99948ba7 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 11 Nov 2017 18:24:55 +0900 Subject: bpf: Revert bpf_overrid_function() helper changes. NACK'd by x86 maintainer. Signed-off-by: David S. Miller --- kernel/events/core.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index ac240d31b5bf..42d24bd64ea4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8171,13 +8171,6 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return -EINVAL; } - /* Kprobe override only works for kprobes, not uprobes. */ - if (prog->kprobe_override && - !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) { - bpf_prog_put(prog); - return -EINVAL; - } - if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); -- cgit v1.2.3 From 4a31b424ac0656d1bb17520ee861144fe7a19664 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Wed, 15 Nov 2017 08:47:02 +0300 Subject: perf/core: Fix memory leak triggered by perf --namespace perf with --namespace key leaks various memory objects including namespaces 4.14.0+ pid_namespace 1 12 2568 12 8 user_namespace 1 39 824 39 8 net_namespace 1 5 6272 5 8 This happen because perf_fill_ns_link_info() struct patch ns_path: during initialization ns_path incremented counters on related mnt and dentry, but without lost path_put nobody decremented them back. Leaked dentry is name of related namespace, and its leak does not allow to free unused namespace. Signed-off-by: Vasily Averin Acked-by: Peter Zijlstra Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Hari Bathini Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Thomas Gleixner Fixes: commit e422267322cd ("perf: Add PERF_RECORD_NAMESPACES to include namespaces related info") Link: http://lkml.kernel.org/r/c510711b-3904-e5e1-d296-61273d21118d@virtuozzo.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 10cdb9c26b5d..ab5ac84f82e2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6756,6 +6756,7 @@ static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info, ns_inode = ns_path.dentry->d_inode; ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev); ns_link_info->ino = ns_inode->i_ino; + path_put(&ns_path); } } -- cgit v1.2.3