diff options
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/audit.c | 4 | ||||
| -rw-r--r-- | kernel/auditsc.c | 2 | ||||
| -rw-r--r-- | kernel/cgroup/cpuset-internal.h | 1 | ||||
| -rw-r--r-- | kernel/cgroup/cpuset.c | 56 | ||||
| -rw-r--r-- | kernel/cgroup/dmem.c | 1 | ||||
| -rw-r--r-- | kernel/cgroup/rstat.c | 37 | ||||
| -rw-r--r-- | kernel/dma/debug.c | 9 | ||||
| -rw-r--r-- | kernel/dma/direct.c | 4 | ||||
| -rw-r--r-- | kernel/dma/mapping.c | 4 | ||||
| -rw-r--r-- | kernel/exit.c | 2 | ||||
| -rw-r--r-- | kernel/irq/chip.c | 9 | ||||
| -rw-r--r-- | kernel/irq_work.c | 7 | ||||
| -rw-r--r-- | kernel/liveupdate/kexec_handover.c | 2 | ||||
| -rw-r--r-- | kernel/ptrace.c | 22 | ||||
| -rw-r--r-- | kernel/rcu/srcutree.c | 12 | ||||
| -rw-r--r-- | kernel/sched/deadline.c | 13 | ||||
| -rw-r--r-- | kernel/sched/ext.c | 316 | ||||
| -rw-r--r-- | kernel/trace/Makefile | 7 | ||||
| -rw-r--r-- | kernel/trace/bpf_trace.c | 3 | ||||
| -rw-r--r-- | kernel/trace/fprobe.c | 23 | ||||
| -rw-r--r-- | kernel/trace/remote_test.c | 4 | ||||
| -rw-r--r-- | kernel/trace/ring_buffer.c | 30 | ||||
| -rw-r--r-- | kernel/trace/simple_ring_buffer.c | 4 | ||||
| -rw-r--r-- | kernel/trace/trace_events_hist.c | 6 | ||||
| -rw-r--r-- | kernel/trace/tracing_map.c | 17 | ||||
| -rw-r--r-- | kernel/workqueue.c | 16 |
26 files changed, 437 insertions, 174 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index e1d489bc2dff..34dc7cb246ff 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1468,6 +1468,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, err = audit_list_rules_send(skb, seq); break; case AUDIT_TRIM: + if (audit_enabled == AUDIT_LOCKED) + return -EPERM; audit_trim_trees(); audit_log_common_recv_msg(audit_context(), &ab, AUDIT_CONFIG_CHANGE); @@ -1480,6 +1482,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, size_t msglen = data_len; char *old, *new; + if (audit_enabled == AUDIT_LOCKED) + return -EPERM; err = -EINVAL; if (msglen < 2 * sizeof(u32)) break; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ab54fccba215..abdf8da3be93 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2786,7 +2786,7 @@ void __audit_log_capset(const struct cred *new, const struct cred *old) context->capset.pid = task_tgid_nr(current); context->capset.cap.effective = new->cap_effective; - context->capset.cap.inheritable = new->cap_effective; + context->capset.cap.inheritable = new->cap_inheritable; context->capset.cap.permitted = new->cap_permitted; context->capset.cap.ambient = new->cap_ambient; context->type = AUDIT_CAPSET; diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index bb4e692bea30..f7aaf01f7cd5 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -167,6 +167,7 @@ struct cpuset { */ int nr_deadline_tasks; int nr_migrate_dl_tasks; + /* DL bandwidth that needs destination reservation for this attach. */ u64 sum_migrate_dl_bw; /* * CPU used for temporary DL bandwidth allocation during attach; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index e3a081a07c6d..5c33ab20cc20 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1718,7 +1718,8 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, */ if (is_partition_valid(parent)) adding = cpumask_and(tmp->addmask, - xcpus, parent->effective_xcpus); + cs->effective_xcpus, + parent->effective_xcpus); if (old_prs > 0) new_prs = -old_prs; @@ -2993,7 +2994,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) struct cpuset *cs, *oldcs; struct task_struct *task; bool setsched_check; - int ret; + int cpu, ret; /* used later by cpuset_attach() */ cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); @@ -3038,31 +3039,31 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) } if (dl_task(task)) { + /* + * Count all migrating DL tasks for cpuset task accounting. + * Only tasks that need a root-domain bandwidth move + * contribute to sum_migrate_dl_bw. + */ cs->nr_migrate_dl_tasks++; - cs->sum_migrate_dl_bw += task->dl.dl_bw; + if (dl_task_needs_bw_move(task, cs->effective_cpus)) + cs->sum_migrate_dl_bw += task->dl.dl_bw; } } - if (!cs->nr_migrate_dl_tasks) + if (!cs->sum_migrate_dl_bw) goto out_success; - if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) { - int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); - - if (unlikely(cpu >= nr_cpu_ids)) { - reset_migrate_dl_data(cs); - ret = -EINVAL; - goto out_unlock; - } + cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); + if (unlikely(cpu >= nr_cpu_ids)) { + ret = -EINVAL; + goto out_unlock; + } - ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw); - if (ret) { - reset_migrate_dl_data(cs); - goto out_unlock; - } + ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw); + if (ret) + goto out_unlock; - cs->dl_bw_cpu = cpu; - } + cs->dl_bw_cpu = cpu; out_success: /* @@ -3070,7 +3071,10 @@ out_success: * changes which zero cpus/mems_allowed. */ cs->attach_in_progress++; + out_unlock: + if (ret) + reset_migrate_dl_data(cs); mutex_unlock(&cpuset_mutex); return ret; } @@ -4176,11 +4180,11 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, * yes. If current has access to memory reserves as an oom victim, yes. - * Otherwise, no. + * If the current task is PF_EXITING, yes. Otherwise, no. * * GFP_USER allocations are marked with the __GFP_HARDWALL bit, * and do not allow allocations outside the current tasks cpuset - * unless the task has been OOM killed. + * unless the task has been OOM killed or is exiting. * GFP_KERNEL allocations are not so marked, so can escape to the * nearest enclosing hardwalled ancestor cpuset. * @@ -4194,7 +4198,9 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * The first call here from mm/page_alloc:get_page_from_freelist() * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, * so no allocation on a node outside the cpuset is allowed (unless - * in interrupt, of course). + * in interrupt, of course). The PF_EXITING check must therefore + * come before the __GFP_HARDWALL check, otherwise a dying task + * would be blocked on the fast path. * * The second pass through get_page_from_freelist() doesn't even call * here for GFP_ATOMIC calls. For those calls, the __alloc_pages() @@ -4204,6 +4210,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * in_interrupt - any node ok (current task context irrelevant) * GFP_ATOMIC - any node ok * tsk_is_oom_victim - any node ok + * PF_EXITING - any node ok (let dying task exit quickly) * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. */ @@ -4223,11 +4230,10 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask) */ if (unlikely(tsk_is_oom_victim(current))) return true; - if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ - return false; - if (current->flags & PF_EXITING) /* Let dying task have memory */ return true; + if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ + return false; /* Not hardwall and node outside mems_allowed: scan up cpusets */ spin_lock_irqsave(&callback_lock, flags); diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c index 1ab1fb47f271..4753a67d0f0f 100644 --- a/kernel/cgroup/dmem.c +++ b/kernel/cgroup/dmem.c @@ -602,6 +602,7 @@ get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region) pool = NULL; continue; } + pool = ERR_PTR(-ENOMEM); } } diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 150e5871e66f..de816a43db9f 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only #include "cgroup-internal.h" +#include <linux/cpumask.h> #include <linux/sched/cputime.h> #include <linux/bpf.h> @@ -53,7 +54,7 @@ static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu) } /** - * css_rstat_updated - keep track of updated rstat_cpu + * __css_rstat_updated - keep track of updated rstat_cpu * @css: target cgroup subsystem state * @cpu: cpu on which rstat_cpu was updated * @@ -63,31 +64,27 @@ static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu) * * NOTE: if the user needs the guarantee that the updater either add itself in * the lockless list or the concurrent flusher flushes its updated stats, a - * memory barrier is needed before the call to css_rstat_updated() i.e. a + * memory barrier is needed before the call to __css_rstat_updated() i.e. a * barrier after updating the per-cpu stats and before calling - * css_rstat_updated(). + * __css_rstat_updated(). */ -__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) +void __css_rstat_updated(struct cgroup_subsys_state *css, int cpu) { struct llist_head *lhead; struct css_rstat_cpu *rstatc; struct llist_node *self; - /* - * Since bpf programs can call this function, prevent access to - * uninitialized rstat pointers. - */ + /* Prevent access to uninitialized rstat pointers. */ if (!css_uses_rstat(css)) return; lockdep_assert_preemption_disabled(); /* - * For archs withnot nmi safe cmpxchg or percpu ops support, ignore - * the requests from nmi context. + * The lockless insertion below relies on NMI-safe cmpxchg; + * bail out in NMI on archs that don't provide it. */ - if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) || - !IS_ENABLED(CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS)) && in_nmi()) + if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) && in_nmi()) return; rstatc = css_rstat_cpu(css, cpu); @@ -125,6 +122,18 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) llist_add(&rstatc->lnode, lhead); } +/* + * BPF-facing wrapper for __css_rstat_updated(). Validate the caller-provided + * CPU before passing it to the internal rstat updater. + */ +__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) +{ + if (unlikely(cpu < 0 || cpu >= nr_cpu_ids || !cpu_possible(cpu))) + return; + + __css_rstat_updated(css, cpu); +} + static void __css_process_update_tree(struct cgroup_subsys_state *css, int cpu) { /* put @css and all ancestors on the corresponding updated lists */ @@ -170,7 +179,7 @@ static void css_process_update_tree(struct cgroup_subsys *ss, int cpu) * flusher flush the stats updated by the updater who have * observed that they are already on the list. The * corresponding barrier pair for this one should be before - * css_rstat_updated() by the user. + * __css_rstat_updated() by the user. * * For now, there aren't any such user, so not adding the * barrier here but if such a use-case arise, please add @@ -614,7 +623,7 @@ static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, unsigned long flags) { u64_stats_update_end_irqrestore(&rstatbc->bsync, flags); - css_rstat_updated(&cgrp->self, smp_processor_id()); + __css_rstat_updated(&cgrp->self, smp_processor_id()); put_cpu_ptr(rstatbc); } diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 1a725edbbbf6..3248f8b4d096 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -1251,7 +1251,14 @@ void debug_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, entry->direction = direction; entry->map_err_type = MAP_ERR_NOT_CHECKED; - if (!(attrs & DMA_ATTR_MMIO)) { + if (attrs & DMA_ATTR_MMIO) { + unsigned long pfn = PHYS_PFN(phys); + + if (pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) + err_printk(dev, entry, + "dma_map_resource called for RAM address %pa\n", + &phys); + } else { check_for_stack(dev, phys); if (!PhysHighMem(phys)) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index ec887f443741..583c5922bca2 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -39,7 +39,7 @@ static inline struct page *dma_direct_to_page(struct device *dev, u64 dma_direct_get_required_mask(struct device *dev) { - phys_addr_t phys = (phys_addr_t)(max_pfn - 1) << PAGE_SHIFT; + phys_addr_t phys = ((phys_addr_t)max_pfn << PAGE_SHIFT) - 1; u64 max_dma = phys_to_dma_direct(dev, phys); return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; @@ -553,7 +553,7 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, int dma_direct_supported(struct device *dev, u64 mask) { - u64 min_mask = (max_pfn - 1) << PAGE_SHIFT; + u64 min_mask = ((u64)max_pfn << PAGE_SHIFT) - 1; /* * Because 32-bit DMA masks are so common we expect every architecture diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 23ed8eb9233e..e6b07f160d20 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -365,10 +365,6 @@ EXPORT_SYMBOL(dma_unmap_sg_attrs); dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - if (IS_ENABLED(CONFIG_DMA_API_DEBUG) && - WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr)))) - return DMA_MAPPING_ERROR; - return dma_map_phys(dev, phys_addr, size, dir, attrs | DMA_ATTR_MMIO); } EXPORT_SYMBOL(dma_map_resource); diff --git a/kernel/exit.c b/kernel/exit.c index 25e9cb6de7e7..f50d73c272d6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -571,6 +571,7 @@ static void exit_mm(void) */ smp_mb__after_spinlock(); local_irq_disable(); + current->user_dumpable = (get_dumpable(mm) == SUID_DUMP_USER); current->mm = NULL; membarrier_update_current_mm(NULL); enter_lazy_tlb(mm, current); @@ -1073,6 +1074,7 @@ void __noreturn make_task_dead(int signr) futex_exit_recursive(tsk); tsk->exit_state = EXIT_DEAD; refcount_inc(&tsk->rcu_users); + preempt_disable(); do_task_dead(); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6c9b1dc4e7d4..b635e3c5d5b6 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -14,6 +14,7 @@ #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/irqdomain.h> +#include <linux/preempt.h> #include <linux/random.h> #include <trace/events/irq.h> @@ -893,7 +894,10 @@ void handle_percpu_irq(struct irq_desc *desc) * * action->percpu_dev_id is a pointer to percpu variables which * contain the real device id for the cpu on which this handler is - * called + * called. + * + * May be used for NMI interrupt lines, and so may be called in IRQ or NMI + * context. */ void handle_percpu_devid_irq(struct irq_desc *desc) { @@ -930,7 +934,8 @@ void handle_percpu_devid_irq(struct irq_desc *desc) enabled ? " and unmasked" : "", irq, cpu); } - add_interrupt_randomness(irq); + if (!in_nmi()) + add_interrupt_randomness(irq); if (chip->irq_eoi) chip->irq_eoi(&desc->irq_data); diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 120fd7365fbe..f7e2dc2c30c6 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -292,6 +292,12 @@ void irq_work_sync(struct irq_work *work) !arch_irq_work_has_interrupt()) { rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work), TASK_UNINTERRUPTIBLE); + /* + * Ensure irq_work_single() does not access @work + * after removing IRQ_WORK_BUSY. It is always + * accessed within a RCU-read section. + */ + synchronize_rcu(); return; } @@ -302,6 +308,7 @@ EXPORT_SYMBOL_GPL(irq_work_sync); static void run_irq_workd(unsigned int cpu) { + guard(rcu)(); irq_work_run_list(this_cpu_ptr(&lazy_list)); } diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 18509d8082ea..2592f7ca16e2 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -1707,7 +1707,7 @@ int kho_fill_kimage(struct kimage *image) int err = 0; struct kexec_buf scratch; - if (!kho_enable) + if (!kho_enable || image->type == KEXEC_TYPE_CRASH) return 0; image->kho.fdt = virt_to_phys(kho_out.fdt); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 68c17daef8d4..130043bfc209 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -272,11 +272,24 @@ static bool ptrace_has_cap(struct user_namespace *ns, unsigned int mode) return ns_capable(ns, CAP_SYS_PTRACE); } +static bool task_still_dumpable(struct task_struct *task, unsigned int mode) +{ + struct mm_struct *mm = task->mm; + if (mm) { + if (get_dumpable(mm) == SUID_DUMP_USER) + return true; + return ptrace_has_cap(mm->user_ns, mode); + } + + if (task->user_dumpable) + return true; + return ptrace_has_cap(&init_user_ns, mode); +} + /* Returns 0 on success, -errno on denial. */ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) { const struct cred *cred = current_cred(), *tcred; - struct mm_struct *mm; kuid_t caller_uid; kgid_t caller_gid; @@ -337,11 +350,8 @@ ok: * Pairs with a write barrier in commit_creds(). */ smp_rmb(); - mm = task->mm; - if (mm && - ((get_dumpable(mm) != SUID_DUMP_USER) && - !ptrace_has_cap(mm->user_ns, mode))) - return -EPERM; + if (!task_still_dumpable(task, mode)) + return -EPERM; return security_ptrace_access_check(task, mode); } diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 0d01cd8c4b4a..7c2f7cc131f7 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -897,11 +897,9 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp { int cpu; - for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { - if (!(mask & (1UL << (cpu - snp->grplo)))) - continue; - srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay); - } + for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) + if ((mask & (1UL << (cpu - snp->grplo))) && rcu_cpu_beenfullyonline(cpu)) + srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay); } /* @@ -1322,7 +1320,9 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, */ idx = __srcu_read_lock_nmisafe(ssp); ss_state = smp_load_acquire(&ssp->srcu_sup->srcu_size_state); - if (ss_state < SRCU_SIZE_WAIT_CALL) + // If !rcu_cpu_beenfullyonline(), interrupts are still disabled, + // so no migration is possible in either direction from this CPU. + if (ss_state < SRCU_SIZE_WAIT_CALL || !rcu_cpu_beenfullyonline(raw_smp_processor_id())) sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id()); else sdp = raw_cpu_ptr(ssp->sda); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index edca7849b165..7db4c87df83b 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -3107,20 +3107,18 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) static void set_cpus_allowed_dl(struct task_struct *p, struct affinity_context *ctx) { - struct root_domain *src_rd; struct rq *rq; WARN_ON_ONCE(!dl_task(p)); rq = task_rq(p); - src_rd = rq->rd; /* * Migrating a SCHED_DEADLINE task between exclusive * cpusets (different root_domains) entails a bandwidth * update. We already made space for us in the destination * domain (see cpuset_can_attach()). */ - if (!cpumask_intersects(src_rd->span, ctx->new_mask)) { + if (dl_task_needs_bw_move(p, ctx->new_mask)) { struct dl_bw *src_dl_b; src_dl_b = dl_bw_of(cpu_of(rq)); @@ -3137,6 +3135,15 @@ static void set_cpus_allowed_dl(struct task_struct *p, set_cpus_allowed_common(p, ctx); } +bool dl_task_needs_bw_move(struct task_struct *p, + const struct cpumask *new_mask) +{ + if (!dl_task(p)) + return false; + + return !cpumask_intersects(task_rq(p)->rd->span, new_mask); +} + /* Assumes rq->lock is held */ static void rq_online_dl(struct rq *rq) { diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 38d90baf78cf..65631e577ee9 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -297,7 +297,6 @@ static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) #else /* CONFIG_EXT_SUB_SCHED */ static struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; } static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? NULL : root; } -static struct scx_sched *scx_find_sub_sched(u64 cgroup_id) { return NULL; } static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) {} #endif /* CONFIG_EXT_SUB_SCHED */ @@ -712,6 +711,51 @@ struct bpf_iter_scx_dsq { } __attribute__((aligned(8))); +static u32 scx_get_task_state(const struct task_struct *p) +{ + return p->scx.flags & SCX_TASK_STATE_MASK; +} + +static void scx_set_task_state(struct task_struct *p, u32 state) +{ + u32 prev_state = scx_get_task_state(p); + bool warn = false; + + switch (state) { + case SCX_TASK_NONE: + warn = prev_state == SCX_TASK_DEAD; + break; + case SCX_TASK_INIT_BEGIN: + warn = prev_state != SCX_TASK_NONE; + break; + case SCX_TASK_INIT: + warn = prev_state != SCX_TASK_INIT_BEGIN; + p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; + break; + case SCX_TASK_READY: + warn = !(prev_state == SCX_TASK_INIT || + prev_state == SCX_TASK_ENABLED); + break; + case SCX_TASK_ENABLED: + warn = prev_state != SCX_TASK_READY; + break; + case SCX_TASK_DEAD: + warn = !(prev_state == SCX_TASK_NONE || + prev_state == SCX_TASK_INIT_BEGIN); + break; + default: + WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]", + prev_state, state, p->comm, p->pid); + return; + } + + WARN_ONCE(warn, "sched_ext: Invalid task state transition 0x%x -> 0x%x for %s[%d]", + prev_state, state, p->comm, p->pid); + + p->scx.flags &= ~SCX_TASK_STATE_MASK; + p->scx.flags |= state; +} + /* * SCX task iterator. */ @@ -937,11 +981,11 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) /* * cgroup_task_dead() removes the dead tasks from cset->tasks * after sched_ext_dead() and cgroup iteration may see tasks - * which already finished sched_ext_dead(). %SCX_TASK_OFF_TASKS - * is set by sched_ext_dead() under @p's rq lock. Test it to + * which already finished sched_ext_dead(). %SCX_TASK_DEAD is + * set by sched_ext_dead() under @p's rq lock. Test it to * avoid visiting tasks which are already dead from SCX POV. */ - if (p->scx.flags & SCX_TASK_OFF_TASKS) { + if (scx_get_task_state(p) == SCX_TASK_DEAD) { __scx_task_iter_rq_unlock(iter); continue; } @@ -2034,6 +2078,7 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags) /* dequeue is always temporary, don't reset runnable_at */ clr_task_runnable(p, false); +retry: /* acquire ensures that we see the preceding updates on QUEUED */ opss = atomic_long_read_acquire(&p->scx.ops_state); @@ -2047,8 +2092,20 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags) */ BUG(); case SCX_OPSS_QUEUED: - /* A queued task must always be in BPF scheduler's custody */ - WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_IN_CUSTODY)); + /* + * A queued task must always be in BPF scheduler's custody. If + * SCX_TASK_IN_CUSTODY is clear, finish_dispatch() on another + * CPU has already passed call_task_dequeue() (which clears the + * flag), but has not yet written SCX_OPSS_NONE. That final + * store does not require this rq's lock, so retrying with + * cpu_relax() is bounded: we will observe NONE (or DISPATCHING, + * handled by the fallthrough) on a subsequent iteration. + */ + if (unlikely(!(READ_ONCE(p->scx.flags) & SCX_TASK_IN_CUSTODY))) { + cpu_relax(); + goto retry; + } + if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, SCX_OPSS_NONE)) break; @@ -3500,41 +3557,6 @@ static struct cgroup *tg_cgrp(struct task_group *tg) #endif /* CONFIG_EXT_GROUP_SCHED */ -static u32 scx_get_task_state(const struct task_struct *p) -{ - return p->scx.flags & SCX_TASK_STATE_MASK; -} - -static void scx_set_task_state(struct task_struct *p, u32 state) -{ - u32 prev_state = scx_get_task_state(p); - bool warn = false; - - switch (state) { - case SCX_TASK_NONE: - break; - case SCX_TASK_INIT: - warn = prev_state != SCX_TASK_NONE; - break; - case SCX_TASK_READY: - warn = prev_state == SCX_TASK_NONE; - break; - case SCX_TASK_ENABLED: - warn = prev_state != SCX_TASK_READY; - break; - default: - WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]", - prev_state, state, p->comm, p->pid); - return; - } - - WARN_ONCE(warn, "sched_ext: Invalid task state transition 0x%x -> 0x%x for %s[%d]", - prev_state, state, p->comm, p->pid); - - p->scx.flags &= ~SCX_TASK_STATE_MASK; - p->scx.flags |= state; -} - static int __scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork) { int ret; @@ -3586,22 +3608,6 @@ static int __scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fo return 0; } -static int scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork) -{ - int ret; - - ret = __scx_init_task(sch, p, fork); - if (!ret) { - /* - * While @p's rq is not locked. @p is not visible to the rest of - * SCX yet and it's safe to update the flags and state. - */ - p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; - scx_set_task_state(p, SCX_TASK_INIT); - } - return ret; -} - static void __scx_enable_task(struct scx_sched *sch, struct task_struct *p) { struct rq *rq = task_rq(p); @@ -3716,7 +3722,8 @@ static void scx_disable_and_exit_task(struct scx_sched *sch, * If set, @p exited between __scx_init_task() and scx_enable_task() in * scx_sub_enable() and is initialized for both the associated sched and * its parent. Exit for the child too - scx_enable_task() never ran for - * it, so undo only init_task. + * it, so undo only init_task. The flag is only set on the sub-enable + * path, so it's always clear when @p arrives here in %SCX_TASK_NONE. */ if (p->scx.flags & SCX_TASK_SUB_INIT) { if (!WARN_ON_ONCE(!scx_enabling_sub_sched)) @@ -3764,10 +3771,14 @@ int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs) #else struct scx_sched *sch = scx_root; #endif - ret = scx_init_task(sch, p, true); - if (!ret) - scx_set_task_sched(p, sch); - return ret; + scx_set_task_state(p, SCX_TASK_INIT_BEGIN); + ret = __scx_init_task(sch, p, true); + if (unlikely(ret)) { + scx_set_task_state(p, SCX_TASK_NONE); + return ret; + } + scx_set_task_state(p, SCX_TASK_INIT); + scx_set_task_sched(p, sch); } return 0; @@ -3862,18 +3873,23 @@ void sched_ext_dead(struct task_struct *p) * @p is off scx_tasks and wholly ours. scx_root_enable()'s READY -> * ENABLED transitions can't race us. Disable ops for @p. * - * %SCX_TASK_OFF_TASKS synchronizes against cgroup task iteration - see + * %SCX_TASK_DEAD synchronizes against cgroup task iteration - see * scx_task_iter_next_locked(). NONE tasks need no marking: cgroup * iteration is only used from sub-sched paths, which require root * enabled. Root enable transitions every live task to at least READY. + * + * %INIT_BEGIN means ops.init_task() is running for @p. Don't call + * into ops; transition to %DEAD so the post-init recheck unwinds + * via scx_sub_init_cancel_task(). */ if (scx_get_task_state(p) != SCX_TASK_NONE) { struct rq_flags rf; struct rq *rq; rq = task_rq_lock(p, &rf); - scx_disable_and_exit_task(scx_task_sched(p), p); - p->scx.flags |= SCX_TASK_OFF_TASKS; + if (scx_get_task_state(p) != SCX_TASK_INIT_BEGIN) + scx_disable_and_exit_task(scx_task_sched(p), p); + scx_set_task_state(p, SCX_TASK_DEAD); task_rq_unlock(rq, p, &rf); } } @@ -3919,6 +3935,16 @@ static void switched_from_scx(struct rq *rq, struct task_struct *p) if (task_dead_and_done(p)) return; + /* + * %NONE means SCX is no longer tracking @p at the task level (e.g. + * scx_fail_parent() handed @p back to the parent at NONE pending the + * parent's own teardown). There is nothing to disable; calling + * scx_disable_task() would WARN on the non-%ENABLED state and trigger a + * NONE -> READY validation failure. + */ + if (scx_get_task_state(p) == SCX_TASK_NONE) + return; + scx_disable_task(scx_task_sched(p), p); } @@ -4808,6 +4834,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work) kfree(sch->cgrp_path); if (sch_cgroup(sch)) cgroup_put(sch_cgroup(sch)); + if (sch->sub_kset) + kobject_put(&sch->sub_kset->kobj); #endif /* CONFIG_EXT_SUB_SCHED */ for_each_possible_cpu(cpu) { @@ -4931,10 +4959,30 @@ static const struct kset_uevent_ops scx_uevent_ops = { */ bool task_should_scx(int policy) { - if (!scx_enabled() || unlikely(scx_enable_state() == SCX_DISABLING)) + /* if disabled, nothing should be on it */ + if (!scx_enabled()) return false; + + /* scx is taking over all SCHED_OTHER and SCHED_EXT tasks */ if (READ_ONCE(scx_switching_all)) return true; + + /* + * scx is tearing down - keep new SCHED_EXT tasks out. + * + * Must come after scx_switching_all test, which serves as a proxy + * for __scx_switched_all. While __scx_switched_all is set, we must + * return true via the branch above: a fork routed to fair would + * stall because next_active_class() skips fair. + * + * This can develop into a deadlock - scx holds scx_enable_mutex across + * kthread_create() in scx_alloc_and_add_sched(); if the new kthread is + * the stalled task, the disable path can never grab the mutex to clear + * scx_switching_all. + */ + if (unlikely(scx_enable_state() == SCX_DISABLING)) + return false; + return policy == SCHED_EXT; } @@ -5585,10 +5633,12 @@ static void refresh_watchdog(void) static s32 scx_link_sched(struct scx_sched *sch) { + const char *err_msg = ""; + s32 ret = 0; + scoped_guard(raw_spinlock_irq, &scx_sched_lock) { #ifdef CONFIG_EXT_SUB_SCHED struct scx_sched *parent = scx_parent(sch); - s32 ret; if (parent) { /* @@ -5598,15 +5648,16 @@ static s32 scx_link_sched(struct scx_sched *sch) * parent can shoot us down. */ if (atomic_read(&parent->exit_kind) != SCX_EXIT_NONE) { - scx_error(sch, "parent disabled"); - return -ENOENT; + err_msg = "parent disabled"; + ret = -ENOENT; + break; } ret = rhashtable_lookup_insert_fast(&scx_sched_hash, &sch->hash_node, scx_sched_hash_params); if (ret) { - scx_error(sch, "failed to insert into scx_sched_hash (%d)", ret); - return ret; + err_msg = "failed to insert into scx_sched_hash"; + break; } list_add_tail(&sch->sibling, &parent->children); @@ -5616,6 +5667,15 @@ static s32 scx_link_sched(struct scx_sched *sch) list_add_tail_rcu(&sch->all, &scx_sched_all); } + /* + * scx_error() takes scx_sched_lock via scx_claim_exit(), so it must run after + * the guard above is released. + */ + if (ret) { + scx_error(sch, "%s (%d)", err_msg, ret); + return ret; + } + refresh_watchdog(); return 0; } @@ -5685,7 +5745,7 @@ static void scx_fail_parent(struct scx_sched *sch, scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { scx_disable_and_exit_task(sch, p); - rcu_assign_pointer(p->scx.sched, parent); + scx_set_task_sched(p, parent); } } scx_task_iter_stop(&sti); @@ -5763,6 +5823,21 @@ static void scx_sub_disable(struct scx_sched *sch) } rq = task_rq_lock(p, &rf); + + if (scx_get_task_state(p) == SCX_TASK_DEAD) { + /* + * sched_ext_dead() raced us between __scx_init_task() + * and this rq lock and ran exit_task() on @sch (the + * sched @p was on at that point), not on $parent. + * $parent's just-completed init is owed an exit_task() + * and we issue it here. + */ + scx_sub_init_cancel_task(parent, p); + task_rq_unlock(rq, p, &rf); + put_task_struct(p); + continue; + } + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { /* * $p is initialized for $parent and still attached to @@ -5771,13 +5846,14 @@ static void scx_sub_disable(struct scx_sched *sch) * $p having already been initialized, and then enable. */ scx_disable_and_exit_task(sch, p); + scx_set_task_state(p, SCX_TASK_INIT_BEGIN); scx_set_task_state(p, SCX_TASK_INIT); - rcu_assign_pointer(p->scx.sched, parent); + scx_set_task_sched(p, parent); scx_set_task_state(p, SCX_TASK_READY); scx_enable_task(parent, p); } - task_rq_unlock(rq, p, &rf); + task_rq_unlock(rq, p, &rf); put_task_struct(p); } scx_task_iter_stop(&sti); @@ -5820,7 +5896,7 @@ static void scx_sub_disable(struct scx_sched *sch) if (sch->ops.exit) SCX_CALL_OP(sch, exit, NULL, sch->exit_info); if (sch->sub_kset) - kset_unregister(sch->sub_kset); + kobject_del(&sch->sub_kset->kobj); kobject_del(&sch->kobj); } #else /* CONFIG_EXT_SUB_SCHED */ @@ -5954,7 +6030,7 @@ static void scx_root_disable(struct scx_sched *sch) */ #ifdef CONFIG_EXT_SUB_SCHED if (sch->sub_kset) - kset_unregister(sch->sub_kset); + kobject_del(&sch->sub_kset->kobj); #endif kobject_del(&sch->kobj); @@ -6578,7 +6654,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, sch->slice_dfl = SCX_SLICE_DFL; atomic_set(&sch->exit_kind, SCX_EXIT_NONE); - init_irq_work(&sch->disable_irq_work, scx_disable_irq_workfn); + sch->disable_irq_work = IRQ_WORK_INIT_HARD(scx_disable_irq_workfn); kthread_init_work(&sch->disable_work, scx_disable_workfn); timer_setup(&sch->bypass_lb_timer, scx_bypass_lb_timerfn, 0); @@ -6594,6 +6670,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, rcu_assign_pointer(ops->priv, sch); sch->kobj.kset = scx_kset; + INIT_LIST_HEAD(&sch->all); #ifdef CONFIG_EXT_SUB_SCHED char *buf = kzalloc(PATH_MAX, GFP_KERNEL); @@ -6621,6 +6698,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); if (ret < 0) { + RCU_INIT_POINTER(ops->priv, NULL); kobject_put(&sch->kobj); return ERR_PTR(ret); } @@ -6628,6 +6706,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, if (ops->sub_attach) { sch->sub_kset = kset_create_and_add("sub", NULL, &sch->kobj); if (!sch->sub_kset) { + RCU_INIT_POINTER(ops->priv, NULL); kobject_put(&sch->kobj); return ERR_PTR(-ENOMEM); } @@ -6635,14 +6714,18 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, #else /* CONFIG_EXT_SUB_SCHED */ ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); if (ret < 0) { + RCU_INIT_POINTER(ops->priv, NULL); kobject_put(&sch->kobj); return ERR_PTR(ret); } #endif /* CONFIG_EXT_SUB_SCHED */ return sch; +#ifdef CONFIG_EXT_SUB_SCHED err_free_lb_resched: + RCU_INIT_POINTER(ops->priv, NULL); free_cpumask_var(sch->bypass_lb_resched_cpumask); +#endif err_free_lb_cpumask: free_cpumask_var(sch->bypass_lb_donee_cpumask); err_stop_helper: @@ -6752,6 +6835,19 @@ static void scx_root_enable_workfn(struct kthread_work *work) goto err_unlock; } + /* + * @ops->priv binds @ops to its scx_sched instance. It is set here by + * scx_alloc_and_add_sched() and cleared at the tail of bpf_scx_unreg(), + * which runs after scx_root_disable() has dropped scx_enable_mutex. If + * it's still non-NULL here, a previous attachment on @ops has not + * finished tearing down; proceeding would let the in-flight unreg's + * RCU_INIT_POINTER(NULL) clobber the @ops->priv we are about to assign. + */ + if (rcu_access_pointer(ops->priv)) { + ret = -EBUSY; + goto err_unlock; + } + ret = alloc_kick_syncs(); if (ret) goto err_unlock; @@ -6874,6 +6970,9 @@ static void scx_root_enable_workfn(struct kthread_work *work) scx_task_iter_start(&sti, NULL); while ((p = scx_task_iter_next_locked(&sti))) { + struct rq_flags rf; + struct rq *rq; + /* * @p may already be dead, have lost all its usages counts and * be waiting for RCU grace period before being freed. @p can't @@ -6882,20 +6981,47 @@ static void scx_root_enable_workfn(struct kthread_work *work) if (!tryget_task_struct(p)) continue; + /* + * Set %INIT_BEGIN under the iter's rq lock so that a concurrent + * sched_ext_dead() does not call ops.exit_task() on @p while + * ops.init_task() is running. If sched_ext_dead() runs before + * this store, it has already removed @p from scx_tasks and the + * iter won't visit @p; if it runs after, it observes + * %INIT_BEGIN and transitions to %DEAD without calling ops, + * leaving the post-init recheck below to unwind. + */ + scx_set_task_state(p, SCX_TASK_INIT_BEGIN); scx_task_iter_unlock(&sti); - ret = scx_init_task(sch, p, false); - if (ret) { - put_task_struct(p); + ret = __scx_init_task(sch, p, false); + + rq = task_rq_lock(p, &rf); + + if (unlikely(ret)) { + if (scx_get_task_state(p) != SCX_TASK_DEAD) + scx_set_task_state(p, SCX_TASK_NONE); + task_rq_unlock(rq, p, &rf); scx_task_iter_stop(&sti); scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", ret, p->comm, p->pid); + put_task_struct(p); goto err_disable_unlock_all; } - scx_set_task_sched(p, sch); - scx_set_task_state(p, SCX_TASK_READY); + if (scx_get_task_state(p) == SCX_TASK_DEAD) { + /* + * sched_ext_dead() observed %INIT_BEGIN and set %DEAD. + * ops.exit_task() is owed to the sched __scx_init_task() + * ran against; call it now. + */ + scx_sub_init_cancel_task(sch, p); + } else { + scx_set_task_state(p, SCX_TASK_INIT); + scx_set_task_sched(p, sch); + scx_set_task_state(p, SCX_TASK_READY); + } + task_rq_unlock(rq, p, &rf); put_task_struct(p); } scx_task_iter_stop(&sti); @@ -7039,6 +7165,12 @@ static void scx_sub_enable_workfn(struct kthread_work *work) goto out_unlock; } + /* See scx_root_enable_workfn() for the @ops->priv check. */ + if (rcu_access_pointer(ops->priv)) { + ret = -EBUSY; + goto out_unlock; + } + cgrp = cgroup_get_from_id(ops->sub_cgroup_id); if (IS_ERR(cgrp)) { ret = PTR_ERR(cgrp); @@ -7165,6 +7297,21 @@ static void scx_sub_enable_workfn(struct kthread_work *work) goto abort; rq = task_rq_lock(p, &rf); + + if (scx_get_task_state(p) == SCX_TASK_DEAD) { + /* + * sched_ext_dead() raced us between __scx_init_task() + * and this rq lock and ran exit_task() on $parent (the + * sched @p was on at that point), not on @sch. @sch's + * just-completed init is owed an exit_task() and we + * issue it here. + */ + scx_sub_init_cancel_task(sch, p); + task_rq_unlock(rq, p, &rf); + put_task_struct(p); + continue; + } + p->scx.flags |= SCX_TASK_SUB_INIT; task_rq_unlock(rq, p, &rf); @@ -7199,7 +7346,7 @@ static void scx_sub_enable_workfn(struct kthread_work *work) * $p is now only initialized for @sch and READY, which * is what we want. Assign it to @sch and enable. */ - rcu_assign_pointer(p->scx.sched, sch); + scx_set_task_sched(p, sch); scx_enable_task(sch, p); p->scx.flags &= ~SCX_TASK_SUB_INIT; @@ -7301,8 +7448,7 @@ static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) static DEFINE_MUTEX(helper_mutex); struct scx_enable_cmd cmd; - if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), - cpu_possible_mask)) { + if (housekeeping_enabled(HK_TYPE_DOMAIN_BOOT)) { pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n"); return -EINVAL; } diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 1decdce8cbef..8d3d96e847d8 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -143,8 +143,8 @@ obj-$(CONFIG_TRACE_REMOTE_TEST) += remote_test.o targets += undefsyms_base.o KASAN_SANITIZE_undefsyms_base.o := y -UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __x86_indirect_thunk \ - __msan simple_ring_buffer \ +UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __msan \ + __aeabi_unwind_cpp __s390_indirect_jump __x86_indirect_thunk simple_ring_buffer \ $(shell $(NM) -u $(obj)/undefsyms_base.o 2>/dev/null | awk '{print $$2}') quiet_cmd_check_undefined = NM $< @@ -154,7 +154,8 @@ quiet_cmd_check_undefined = NM $< echo "Unexpected symbols in $<:" >&2; \ echo "$$undefsyms" >&2; \ false; \ - fi + fi; \ + touch $@ $(obj)/%.o.checked: $(obj)/%.o $(obj)/undefsyms_base.o FORCE $(call if_changed,check_undefined) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index af7079aa0f36..a02bd258677e 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2384,7 +2384,8 @@ static void bpf_kprobe_multi_link_release(struct bpf_link *link) struct bpf_kprobe_multi_link *kmulti_link; kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); - unregister_fprobe(&kmulti_link->fp); + /* Don't wait for RCU GP here. */ + unregister_fprobe_async(&kmulti_link->fp); kprobe_multi_put_modules(kmulti_link->mods, kmulti_link->mods_cnt); } diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index cc49ebd2a773..f378613ad120 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -1093,14 +1093,15 @@ static int unregister_fprobe_nolock(struct fprobe *fp) } /** - * unregister_fprobe() - Unregister fprobe. + * unregister_fprobe_async() - Unregister fprobe without RCU GP wait * @fp: A fprobe data structure to be unregistered. * * Unregister fprobe (and remove ftrace hooks from the function entries). + * This function will NOT wait until the fprobe is no longer used. * * Return 0 if @fp is unregistered successfully, -errno if not. */ -int unregister_fprobe(struct fprobe *fp) +int unregister_fprobe_async(struct fprobe *fp) { guard(mutex)(&fprobe_mutex); if (!fp || !fprobe_registered(fp)) @@ -1108,6 +1109,24 @@ int unregister_fprobe(struct fprobe *fp) return unregister_fprobe_nolock(fp); } + +/** + * unregister_fprobe() - Unregister fprobe with RCU GP wait + * @fp: A fprobe data structure to be unregistered. + * + * Unregister fprobe (and remove ftrace hooks from the function entries). + * This function will block until the fprobe is no longer used. + * + * Return 0 if @fp is unregistered successfully, -errno if not. + */ +int unregister_fprobe(struct fprobe *fp) +{ + int ret = unregister_fprobe_async(fp); + + if (!ret) + synchronize_rcu(); + return ret; +} EXPORT_SYMBOL_GPL(unregister_fprobe); static int __init fprobe_initcall(void) diff --git a/kernel/trace/remote_test.c b/kernel/trace/remote_test.c index 6c1b7701ddae..a3e2c9b606eb 100644 --- a/kernel/trace/remote_test.c +++ b/kernel/trace/remote_test.c @@ -110,9 +110,9 @@ static struct trace_buffer_desc *remote_test_load(unsigned long size, void *unus return remote_test_buffer_desc; err_unload: - for_each_ring_buffer_desc(rb_desc, cpu, remote_test_buffer_desc) + for_each_ring_buffer_desc(rb_desc, cpu, desc) remote_test_unload_simple_rb(rb_desc->cpu); - trace_remote_free_buffer(remote_test_buffer_desc); + trace_remote_free_buffer(desc); err_free_desc: kfree(desc); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 5326924615a4..7b07d2004cc6 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -7,6 +7,7 @@ #include <linux/ring_buffer_types.h> #include <linux/sched/isolation.h> #include <linux/trace_recursion.h> +#include <linux/panic_notifier.h> #include <linux/trace_events.h> #include <linux/ring_buffer.h> #include <linux/trace_clock.h> @@ -31,6 +32,7 @@ #include <linux/oom.h> #include <linux/mm.h> +#include <asm/ring_buffer.h> #include <asm/local64.h> #include <asm/local.h> #include <asm/setup.h> @@ -559,6 +561,7 @@ struct trace_buffer { unsigned long range_addr_start; unsigned long range_addr_end; + struct notifier_block flush_nb; struct ring_buffer_meta *meta; @@ -2521,6 +2524,16 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) kfree(cpu_buffer); } +/* Stop recording on a persistent buffer and flush cache if needed. */ +static int rb_flush_buffer_cb(struct notifier_block *nb, unsigned long event, void *data) +{ + struct trace_buffer *buffer = container_of(nb, struct trace_buffer, flush_nb); + + ring_buffer_record_off(buffer); + arch_ring_buffer_flush_range(buffer->range_addr_start, buffer->range_addr_end); + return NOTIFY_DONE; +} + static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, int order, unsigned long start, unsigned long end, @@ -2651,6 +2664,12 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, mutex_init(&buffer->mutex); + /* Persistent ring buffer needs to flush cache before reboot. */ + if (start && end) { + buffer->flush_nb.notifier_call = rb_flush_buffer_cb; + atomic_notifier_chain_register(&panic_notifier_list, &buffer->flush_nb); + } + return_ptr(buffer); fail_free_buffers: @@ -2749,6 +2768,9 @@ ring_buffer_free(struct trace_buffer *buffer) { int cpu; + if (buffer->range_addr_start && buffer->range_addr_end) + atomic_notifier_chain_unregister(&panic_notifier_list, &buffer->flush_nb); + cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); irq_work_sync(&buffer->irq_work.work); @@ -5407,6 +5429,7 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) iter->head_page = cpu_buffer->reader_page; iter->head = cpu_buffer->reader_page->read; iter->next_event = iter->head; + iter->missed_events = 0; iter->cache_reader_page = iter->head_page; iter->cache_read = cpu_buffer->read; @@ -6086,10 +6109,7 @@ ring_buffer_peek(struct trace_buffer *buffer, int cpu, u64 *ts, */ bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter) { - bool ret = iter->missed_events != 0; - - iter->missed_events = 0; - return ret; + return iter->missed_events != 0; } EXPORT_SYMBOL_GPL(ring_buffer_iter_dropped); @@ -6251,7 +6271,7 @@ void ring_buffer_iter_advance(struct ring_buffer_iter *iter) unsigned long flags; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - + iter->missed_events = 0; rb_advance_iter(iter); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); diff --git a/kernel/trace/simple_ring_buffer.c b/kernel/trace/simple_ring_buffer.c index 02af2297ae5a..f4642f5adda3 100644 --- a/kernel/trace/simple_ring_buffer.c +++ b/kernel/trace/simple_ring_buffer.c @@ -395,7 +395,6 @@ int simple_ring_buffer_init_mm(struct simple_rb_per_cpu *cpu_buffer, memset(cpu_buffer->meta, 0, sizeof(*cpu_buffer->meta)); cpu_buffer->meta->meta_page_size = PAGE_SIZE; - cpu_buffer->meta->nr_subbufs = cpu_buffer->nr_pages; /* The reader page is not part of the ring initially */ page = load_page(desc->page_va[0]); @@ -431,12 +430,13 @@ int simple_ring_buffer_init_mm(struct simple_rb_per_cpu *cpu_buffer, if (ret) { for (i--; i >= 0; i--) - unload_page((void *)desc->page_va[i]); + unload_page(bpages[i].page); unload_page(cpu_buffer->meta); return ret; } + cpu_buffer->meta->nr_subbufs = cpu_buffer->nr_pages; /* Close the ring */ bpage->link.next = &cpu_buffer->tail_page->link; cpu_buffer->tail_page->link.prev = &bpage->link; diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 0dbbf6cca9bc..eb2c2bc8bc3d 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1369,10 +1369,8 @@ static const char *hist_field_name(struct hist_field *field, len = snprintf(full_name, sizeof(full_name), fmt, field->system, field->event_name, field->name); - if (len >= sizeof(full_name)) - return NULL; - - field_name = full_name; + if (len < sizeof(full_name)) + field_name = full_name; } else field_name = field->name; } else if (field->flags & HIST_FIELD_FL_TIMESTAMP) diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index bf1a507695b6..0dd7927df22a 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -386,13 +386,11 @@ static void tracing_map_elt_init_fields(struct tracing_map_elt *elt) } } -static void tracing_map_elt_free(struct tracing_map_elt *elt) +static void __tracing_map_elt_free(struct tracing_map_elt *elt) { if (!elt) return; - if (elt->map->ops && elt->map->ops->elt_free) - elt->map->ops->elt_free(elt); kfree(elt->fields); kfree(elt->vars); kfree(elt->var_set); @@ -400,6 +398,17 @@ static void tracing_map_elt_free(struct tracing_map_elt *elt) kfree(elt); } +static void tracing_map_elt_free(struct tracing_map_elt *elt) +{ + if (!elt) + return; + + /* Only objects initialized with alloc_elt() should be passed to free_elt().*/ + if (elt->map->ops && elt->map->ops->elt_free) + elt->map->ops->elt_free(elt); + __tracing_map_elt_free(elt); +} + static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map) { struct tracing_map_elt *elt; @@ -444,7 +453,7 @@ static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map) } return elt; free: - tracing_map_elt_free(elt); + __tracing_map_elt_free(elt); return ERR_PTR(err); } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3d2e3b2ec528..33b721a9af02 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2296,6 +2296,18 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) && WARN_ONCE(!is_chained_work(wq), "workqueue: cannot queue %ps on wq %s\n", work->func, wq->name))) { + struct work_offq_data offqd; + + /* + * State on entry: PENDING is set, work is off-queue (no + * insert_work() has run). + * + * Returning without clearing PENDING would leave the work + * in a weird state (PENDING=1, PWQ=0, entry empty) + */ + work_offqd_unpack(&offqd, *work_data_bits(work)); + set_work_pool_and_clear_pending(work, offqd.pool_id, + work_offqd_pack_flags(&offqd)); return; } rcu_read_lock(); @@ -5642,7 +5654,9 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) ret = apply_workqueue_attrs_locked(wq, unbound_std_wq_attrs[highpri]); } - return ret; + if (ret) + goto enomem; + return 0; enomem: if (wq->cpu_pwq) { |
