summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/audit.c4
-rw-r--r--kernel/auditsc.c2
-rw-r--r--kernel/cgroup/cpuset-internal.h1
-rw-r--r--kernel/cgroup/cpuset.c56
-rw-r--r--kernel/cgroup/dmem.c1
-rw-r--r--kernel/cgroup/rstat.c37
-rw-r--r--kernel/dma/debug.c9
-rw-r--r--kernel/dma/direct.c4
-rw-r--r--kernel/dma/mapping.c4
-rw-r--r--kernel/exit.c2
-rw-r--r--kernel/irq/chip.c9
-rw-r--r--kernel/irq_work.c7
-rw-r--r--kernel/liveupdate/kexec_handover.c2
-rw-r--r--kernel/ptrace.c22
-rw-r--r--kernel/rcu/srcutree.c12
-rw-r--r--kernel/sched/deadline.c13
-rw-r--r--kernel/sched/ext.c316
-rw-r--r--kernel/trace/Makefile7
-rw-r--r--kernel/trace/bpf_trace.c3
-rw-r--r--kernel/trace/fprobe.c23
-rw-r--r--kernel/trace/remote_test.c4
-rw-r--r--kernel/trace/ring_buffer.c30
-rw-r--r--kernel/trace/simple_ring_buffer.c4
-rw-r--r--kernel/trace/trace_events_hist.c6
-rw-r--r--kernel/trace/tracing_map.c17
-rw-r--r--kernel/workqueue.c16
26 files changed, 437 insertions, 174 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index e1d489bc2dff..34dc7cb246ff 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1468,6 +1468,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
err = audit_list_rules_send(skb, seq);
break;
case AUDIT_TRIM:
+ if (audit_enabled == AUDIT_LOCKED)
+ return -EPERM;
audit_trim_trees();
audit_log_common_recv_msg(audit_context(), &ab,
AUDIT_CONFIG_CHANGE);
@@ -1480,6 +1482,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
size_t msglen = data_len;
char *old, *new;
+ if (audit_enabled == AUDIT_LOCKED)
+ return -EPERM;
err = -EINVAL;
if (msglen < 2 * sizeof(u32))
break;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ab54fccba215..abdf8da3be93 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2786,7 +2786,7 @@ void __audit_log_capset(const struct cred *new, const struct cred *old)
context->capset.pid = task_tgid_nr(current);
context->capset.cap.effective = new->cap_effective;
- context->capset.cap.inheritable = new->cap_effective;
+ context->capset.cap.inheritable = new->cap_inheritable;
context->capset.cap.permitted = new->cap_permitted;
context->capset.cap.ambient = new->cap_ambient;
context->type = AUDIT_CAPSET;
diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index bb4e692bea30..f7aaf01f7cd5 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -167,6 +167,7 @@ struct cpuset {
*/
int nr_deadline_tasks;
int nr_migrate_dl_tasks;
+ /* DL bandwidth that needs destination reservation for this attach. */
u64 sum_migrate_dl_bw;
/*
* CPU used for temporary DL bandwidth allocation during attach;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index e3a081a07c6d..5c33ab20cc20 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1718,7 +1718,8 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
*/
if (is_partition_valid(parent))
adding = cpumask_and(tmp->addmask,
- xcpus, parent->effective_xcpus);
+ cs->effective_xcpus,
+ parent->effective_xcpus);
if (old_prs > 0)
new_prs = -old_prs;
@@ -2993,7 +2994,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
struct cpuset *cs, *oldcs;
struct task_struct *task;
bool setsched_check;
- int ret;
+ int cpu, ret;
/* used later by cpuset_attach() */
cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
@@ -3038,31 +3039,31 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
}
if (dl_task(task)) {
+ /*
+ * Count all migrating DL tasks for cpuset task accounting.
+ * Only tasks that need a root-domain bandwidth move
+ * contribute to sum_migrate_dl_bw.
+ */
cs->nr_migrate_dl_tasks++;
- cs->sum_migrate_dl_bw += task->dl.dl_bw;
+ if (dl_task_needs_bw_move(task, cs->effective_cpus))
+ cs->sum_migrate_dl_bw += task->dl.dl_bw;
}
}
- if (!cs->nr_migrate_dl_tasks)
+ if (!cs->sum_migrate_dl_bw)
goto out_success;
- if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) {
- int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
-
- if (unlikely(cpu >= nr_cpu_ids)) {
- reset_migrate_dl_data(cs);
- ret = -EINVAL;
- goto out_unlock;
- }
+ cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
+ if (unlikely(cpu >= nr_cpu_ids)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
- ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
- if (ret) {
- reset_migrate_dl_data(cs);
- goto out_unlock;
- }
+ ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
+ if (ret)
+ goto out_unlock;
- cs->dl_bw_cpu = cpu;
- }
+ cs->dl_bw_cpu = cpu;
out_success:
/*
@@ -3070,7 +3071,10 @@ out_success:
* changes which zero cpus/mems_allowed.
*/
cs->attach_in_progress++;
+
out_unlock:
+ if (ret)
+ reset_migrate_dl_data(cs);
mutex_unlock(&cpuset_mutex);
return ret;
}
@@ -4176,11 +4180,11 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
* node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
* yes. If current has access to memory reserves as an oom victim, yes.
- * Otherwise, no.
+ * If the current task is PF_EXITING, yes. Otherwise, no.
*
* GFP_USER allocations are marked with the __GFP_HARDWALL bit,
* and do not allow allocations outside the current tasks cpuset
- * unless the task has been OOM killed.
+ * unless the task has been OOM killed or is exiting.
* GFP_KERNEL allocations are not so marked, so can escape to the
* nearest enclosing hardwalled ancestor cpuset.
*
@@ -4194,7 +4198,9 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* The first call here from mm/page_alloc:get_page_from_freelist()
* has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
* so no allocation on a node outside the cpuset is allowed (unless
- * in interrupt, of course).
+ * in interrupt, of course). The PF_EXITING check must therefore
+ * come before the __GFP_HARDWALL check, otherwise a dying task
+ * would be blocked on the fast path.
*
* The second pass through get_page_from_freelist() doesn't even call
* here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
@@ -4204,6 +4210,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* in_interrupt - any node ok (current task context irrelevant)
* GFP_ATOMIC - any node ok
* tsk_is_oom_victim - any node ok
+ * PF_EXITING - any node ok (let dying task exit quickly)
* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
* GFP_USER - only nodes in current tasks mems allowed ok.
*/
@@ -4223,11 +4230,10 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)
*/
if (unlikely(tsk_is_oom_victim(current)))
return true;
- if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
- return false;
-
if (current->flags & PF_EXITING) /* Let dying task have memory */
return true;
+ if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
+ return false;
/* Not hardwall and node outside mems_allowed: scan up cpusets */
spin_lock_irqsave(&callback_lock, flags);
diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c
index 1ab1fb47f271..4753a67d0f0f 100644
--- a/kernel/cgroup/dmem.c
+++ b/kernel/cgroup/dmem.c
@@ -602,6 +602,7 @@ get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region)
pool = NULL;
continue;
}
+ pool = ERR_PTR(-ENOMEM);
}
}
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 150e5871e66f..de816a43db9f 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"
+#include <linux/cpumask.h>
#include <linux/sched/cputime.h>
#include <linux/bpf.h>
@@ -53,7 +54,7 @@ static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu)
}
/**
- * css_rstat_updated - keep track of updated rstat_cpu
+ * __css_rstat_updated - keep track of updated rstat_cpu
* @css: target cgroup subsystem state
* @cpu: cpu on which rstat_cpu was updated
*
@@ -63,31 +64,27 @@ static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu)
*
* NOTE: if the user needs the guarantee that the updater either add itself in
* the lockless list or the concurrent flusher flushes its updated stats, a
- * memory barrier is needed before the call to css_rstat_updated() i.e. a
+ * memory barrier is needed before the call to __css_rstat_updated() i.e. a
* barrier after updating the per-cpu stats and before calling
- * css_rstat_updated().
+ * __css_rstat_updated().
*/
-__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
+void __css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
{
struct llist_head *lhead;
struct css_rstat_cpu *rstatc;
struct llist_node *self;
- /*
- * Since bpf programs can call this function, prevent access to
- * uninitialized rstat pointers.
- */
+ /* Prevent access to uninitialized rstat pointers. */
if (!css_uses_rstat(css))
return;
lockdep_assert_preemption_disabled();
/*
- * For archs withnot nmi safe cmpxchg or percpu ops support, ignore
- * the requests from nmi context.
+ * The lockless insertion below relies on NMI-safe cmpxchg;
+ * bail out in NMI on archs that don't provide it.
*/
- if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) ||
- !IS_ENABLED(CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS)) && in_nmi())
+ if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) && in_nmi())
return;
rstatc = css_rstat_cpu(css, cpu);
@@ -125,6 +122,18 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
llist_add(&rstatc->lnode, lhead);
}
+/*
+ * BPF-facing wrapper for __css_rstat_updated(). Validate the caller-provided
+ * CPU before passing it to the internal rstat updater.
+ */
+__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
+{
+ if (unlikely(cpu < 0 || cpu >= nr_cpu_ids || !cpu_possible(cpu)))
+ return;
+
+ __css_rstat_updated(css, cpu);
+}
+
static void __css_process_update_tree(struct cgroup_subsys_state *css, int cpu)
{
/* put @css and all ancestors on the corresponding updated lists */
@@ -170,7 +179,7 @@ static void css_process_update_tree(struct cgroup_subsys *ss, int cpu)
* flusher flush the stats updated by the updater who have
* observed that they are already on the list. The
* corresponding barrier pair for this one should be before
- * css_rstat_updated() by the user.
+ * __css_rstat_updated() by the user.
*
* For now, there aren't any such user, so not adding the
* barrier here but if such a use-case arise, please add
@@ -614,7 +623,7 @@ static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
unsigned long flags)
{
u64_stats_update_end_irqrestore(&rstatbc->bsync, flags);
- css_rstat_updated(&cgrp->self, smp_processor_id());
+ __css_rstat_updated(&cgrp->self, smp_processor_id());
put_cpu_ptr(rstatbc);
}
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 1a725edbbbf6..3248f8b4d096 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -1251,7 +1251,14 @@ void debug_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
entry->direction = direction;
entry->map_err_type = MAP_ERR_NOT_CHECKED;
- if (!(attrs & DMA_ATTR_MMIO)) {
+ if (attrs & DMA_ATTR_MMIO) {
+ unsigned long pfn = PHYS_PFN(phys);
+
+ if (pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
+ err_printk(dev, entry,
+ "dma_map_resource called for RAM address %pa\n",
+ &phys);
+ } else {
check_for_stack(dev, phys);
if (!PhysHighMem(phys))
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index ec887f443741..583c5922bca2 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -39,7 +39,7 @@ static inline struct page *dma_direct_to_page(struct device *dev,
u64 dma_direct_get_required_mask(struct device *dev)
{
- phys_addr_t phys = (phys_addr_t)(max_pfn - 1) << PAGE_SHIFT;
+ phys_addr_t phys = ((phys_addr_t)max_pfn << PAGE_SHIFT) - 1;
u64 max_dma = phys_to_dma_direct(dev, phys);
return (1ULL << (fls64(max_dma) - 1)) * 2 - 1;
@@ -553,7 +553,7 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma,
int dma_direct_supported(struct device *dev, u64 mask)
{
- u64 min_mask = (max_pfn - 1) << PAGE_SHIFT;
+ u64 min_mask = ((u64)max_pfn << PAGE_SHIFT) - 1;
/*
* Because 32-bit DMA masks are so common we expect every architecture
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 23ed8eb9233e..e6b07f160d20 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -365,10 +365,6 @@ EXPORT_SYMBOL(dma_unmap_sg_attrs);
dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr,
size_t size, enum dma_data_direction dir, unsigned long attrs)
{
- if (IS_ENABLED(CONFIG_DMA_API_DEBUG) &&
- WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr))))
- return DMA_MAPPING_ERROR;
-
return dma_map_phys(dev, phys_addr, size, dir, attrs | DMA_ATTR_MMIO);
}
EXPORT_SYMBOL(dma_map_resource);
diff --git a/kernel/exit.c b/kernel/exit.c
index 25e9cb6de7e7..f50d73c272d6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -571,6 +571,7 @@ static void exit_mm(void)
*/
smp_mb__after_spinlock();
local_irq_disable();
+ current->user_dumpable = (get_dumpable(mm) == SUID_DUMP_USER);
current->mm = NULL;
membarrier_update_current_mm(NULL);
enter_lazy_tlb(mm, current);
@@ -1073,6 +1074,7 @@ void __noreturn make_task_dead(int signr)
futex_exit_recursive(tsk);
tsk->exit_state = EXIT_DEAD;
refcount_inc(&tsk->rcu_users);
+ preempt_disable();
do_task_dead();
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6c9b1dc4e7d4..b635e3c5d5b6 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -14,6 +14,7 @@
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/irqdomain.h>
+#include <linux/preempt.h>
#include <linux/random.h>
#include <trace/events/irq.h>
@@ -893,7 +894,10 @@ void handle_percpu_irq(struct irq_desc *desc)
*
* action->percpu_dev_id is a pointer to percpu variables which
* contain the real device id for the cpu on which this handler is
- * called
+ * called.
+ *
+ * May be used for NMI interrupt lines, and so may be called in IRQ or NMI
+ * context.
*/
void handle_percpu_devid_irq(struct irq_desc *desc)
{
@@ -930,7 +934,8 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
enabled ? " and unmasked" : "", irq, cpu);
}
- add_interrupt_randomness(irq);
+ if (!in_nmi())
+ add_interrupt_randomness(irq);
if (chip->irq_eoi)
chip->irq_eoi(&desc->irq_data);
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 120fd7365fbe..f7e2dc2c30c6 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -292,6 +292,12 @@ void irq_work_sync(struct irq_work *work)
!arch_irq_work_has_interrupt()) {
rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work),
TASK_UNINTERRUPTIBLE);
+ /*
+ * Ensure irq_work_single() does not access @work
+ * after removing IRQ_WORK_BUSY. It is always
+ * accessed within a RCU-read section.
+ */
+ synchronize_rcu();
return;
}
@@ -302,6 +308,7 @@ EXPORT_SYMBOL_GPL(irq_work_sync);
static void run_irq_workd(unsigned int cpu)
{
+ guard(rcu)();
irq_work_run_list(this_cpu_ptr(&lazy_list));
}
diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
index 18509d8082ea..2592f7ca16e2 100644
--- a/kernel/liveupdate/kexec_handover.c
+++ b/kernel/liveupdate/kexec_handover.c
@@ -1707,7 +1707,7 @@ int kho_fill_kimage(struct kimage *image)
int err = 0;
struct kexec_buf scratch;
- if (!kho_enable)
+ if (!kho_enable || image->type == KEXEC_TYPE_CRASH)
return 0;
image->kho.fdt = virt_to_phys(kho_out.fdt);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 68c17daef8d4..130043bfc209 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -272,11 +272,24 @@ static bool ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
return ns_capable(ns, CAP_SYS_PTRACE);
}
+static bool task_still_dumpable(struct task_struct *task, unsigned int mode)
+{
+ struct mm_struct *mm = task->mm;
+ if (mm) {
+ if (get_dumpable(mm) == SUID_DUMP_USER)
+ return true;
+ return ptrace_has_cap(mm->user_ns, mode);
+ }
+
+ if (task->user_dumpable)
+ return true;
+ return ptrace_has_cap(&init_user_ns, mode);
+}
+
/* Returns 0 on success, -errno on denial. */
static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
{
const struct cred *cred = current_cred(), *tcred;
- struct mm_struct *mm;
kuid_t caller_uid;
kgid_t caller_gid;
@@ -337,11 +350,8 @@ ok:
* Pairs with a write barrier in commit_creds().
*/
smp_rmb();
- mm = task->mm;
- if (mm &&
- ((get_dumpable(mm) != SUID_DUMP_USER) &&
- !ptrace_has_cap(mm->user_ns, mode)))
- return -EPERM;
+ if (!task_still_dumpable(task, mode))
+ return -EPERM;
return security_ptrace_access_check(task, mode);
}
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 0d01cd8c4b4a..7c2f7cc131f7 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -897,11 +897,9 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp
{
int cpu;
- for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
- if (!(mask & (1UL << (cpu - snp->grplo))))
- continue;
- srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay);
- }
+ for (cpu = snp->grplo; cpu <= snp->grphi; cpu++)
+ if ((mask & (1UL << (cpu - snp->grplo))) && rcu_cpu_beenfullyonline(cpu))
+ srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay);
}
/*
@@ -1322,7 +1320,9 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
*/
idx = __srcu_read_lock_nmisafe(ssp);
ss_state = smp_load_acquire(&ssp->srcu_sup->srcu_size_state);
- if (ss_state < SRCU_SIZE_WAIT_CALL)
+ // If !rcu_cpu_beenfullyonline(), interrupts are still disabled,
+ // so no migration is possible in either direction from this CPU.
+ if (ss_state < SRCU_SIZE_WAIT_CALL || !rcu_cpu_beenfullyonline(raw_smp_processor_id()))
sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id());
else
sdp = raw_cpu_ptr(ssp->sda);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index edca7849b165..7db4c87df83b 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -3107,20 +3107,18 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
static void set_cpus_allowed_dl(struct task_struct *p,
struct affinity_context *ctx)
{
- struct root_domain *src_rd;
struct rq *rq;
WARN_ON_ONCE(!dl_task(p));
rq = task_rq(p);
- src_rd = rq->rd;
/*
* Migrating a SCHED_DEADLINE task between exclusive
* cpusets (different root_domains) entails a bandwidth
* update. We already made space for us in the destination
* domain (see cpuset_can_attach()).
*/
- if (!cpumask_intersects(src_rd->span, ctx->new_mask)) {
+ if (dl_task_needs_bw_move(p, ctx->new_mask)) {
struct dl_bw *src_dl_b;
src_dl_b = dl_bw_of(cpu_of(rq));
@@ -3137,6 +3135,15 @@ static void set_cpus_allowed_dl(struct task_struct *p,
set_cpus_allowed_common(p, ctx);
}
+bool dl_task_needs_bw_move(struct task_struct *p,
+ const struct cpumask *new_mask)
+{
+ if (!dl_task(p))
+ return false;
+
+ return !cpumask_intersects(task_rq(p)->rd->span, new_mask);
+}
+
/* Assumes rq->lock is held */
static void rq_online_dl(struct rq *rq)
{
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 38d90baf78cf..65631e577ee9 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -297,7 +297,6 @@ static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch)
#else /* CONFIG_EXT_SUB_SCHED */
static struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; }
static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? NULL : root; }
-static struct scx_sched *scx_find_sub_sched(u64 cgroup_id) { return NULL; }
static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) {}
#endif /* CONFIG_EXT_SUB_SCHED */
@@ -712,6 +711,51 @@ struct bpf_iter_scx_dsq {
} __attribute__((aligned(8)));
+static u32 scx_get_task_state(const struct task_struct *p)
+{
+ return p->scx.flags & SCX_TASK_STATE_MASK;
+}
+
+static void scx_set_task_state(struct task_struct *p, u32 state)
+{
+ u32 prev_state = scx_get_task_state(p);
+ bool warn = false;
+
+ switch (state) {
+ case SCX_TASK_NONE:
+ warn = prev_state == SCX_TASK_DEAD;
+ break;
+ case SCX_TASK_INIT_BEGIN:
+ warn = prev_state != SCX_TASK_NONE;
+ break;
+ case SCX_TASK_INIT:
+ warn = prev_state != SCX_TASK_INIT_BEGIN;
+ p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
+ break;
+ case SCX_TASK_READY:
+ warn = !(prev_state == SCX_TASK_INIT ||
+ prev_state == SCX_TASK_ENABLED);
+ break;
+ case SCX_TASK_ENABLED:
+ warn = prev_state != SCX_TASK_READY;
+ break;
+ case SCX_TASK_DEAD:
+ warn = !(prev_state == SCX_TASK_NONE ||
+ prev_state == SCX_TASK_INIT_BEGIN);
+ break;
+ default:
+ WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]",
+ prev_state, state, p->comm, p->pid);
+ return;
+ }
+
+ WARN_ONCE(warn, "sched_ext: Invalid task state transition 0x%x -> 0x%x for %s[%d]",
+ prev_state, state, p->comm, p->pid);
+
+ p->scx.flags &= ~SCX_TASK_STATE_MASK;
+ p->scx.flags |= state;
+}
+
/*
* SCX task iterator.
*/
@@ -937,11 +981,11 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
/*
* cgroup_task_dead() removes the dead tasks from cset->tasks
* after sched_ext_dead() and cgroup iteration may see tasks
- * which already finished sched_ext_dead(). %SCX_TASK_OFF_TASKS
- * is set by sched_ext_dead() under @p's rq lock. Test it to
+ * which already finished sched_ext_dead(). %SCX_TASK_DEAD is
+ * set by sched_ext_dead() under @p's rq lock. Test it to
* avoid visiting tasks which are already dead from SCX POV.
*/
- if (p->scx.flags & SCX_TASK_OFF_TASKS) {
+ if (scx_get_task_state(p) == SCX_TASK_DEAD) {
__scx_task_iter_rq_unlock(iter);
continue;
}
@@ -2034,6 +2078,7 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
/* dequeue is always temporary, don't reset runnable_at */
clr_task_runnable(p, false);
+retry:
/* acquire ensures that we see the preceding updates on QUEUED */
opss = atomic_long_read_acquire(&p->scx.ops_state);
@@ -2047,8 +2092,20 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
*/
BUG();
case SCX_OPSS_QUEUED:
- /* A queued task must always be in BPF scheduler's custody */
- WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_IN_CUSTODY));
+ /*
+ * A queued task must always be in BPF scheduler's custody. If
+ * SCX_TASK_IN_CUSTODY is clear, finish_dispatch() on another
+ * CPU has already passed call_task_dequeue() (which clears the
+ * flag), but has not yet written SCX_OPSS_NONE. That final
+ * store does not require this rq's lock, so retrying with
+ * cpu_relax() is bounded: we will observe NONE (or DISPATCHING,
+ * handled by the fallthrough) on a subsequent iteration.
+ */
+ if (unlikely(!(READ_ONCE(p->scx.flags) & SCX_TASK_IN_CUSTODY))) {
+ cpu_relax();
+ goto retry;
+ }
+
if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
SCX_OPSS_NONE))
break;
@@ -3500,41 +3557,6 @@ static struct cgroup *tg_cgrp(struct task_group *tg)
#endif /* CONFIG_EXT_GROUP_SCHED */
-static u32 scx_get_task_state(const struct task_struct *p)
-{
- return p->scx.flags & SCX_TASK_STATE_MASK;
-}
-
-static void scx_set_task_state(struct task_struct *p, u32 state)
-{
- u32 prev_state = scx_get_task_state(p);
- bool warn = false;
-
- switch (state) {
- case SCX_TASK_NONE:
- break;
- case SCX_TASK_INIT:
- warn = prev_state != SCX_TASK_NONE;
- break;
- case SCX_TASK_READY:
- warn = prev_state == SCX_TASK_NONE;
- break;
- case SCX_TASK_ENABLED:
- warn = prev_state != SCX_TASK_READY;
- break;
- default:
- WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]",
- prev_state, state, p->comm, p->pid);
- return;
- }
-
- WARN_ONCE(warn, "sched_ext: Invalid task state transition 0x%x -> 0x%x for %s[%d]",
- prev_state, state, p->comm, p->pid);
-
- p->scx.flags &= ~SCX_TASK_STATE_MASK;
- p->scx.flags |= state;
-}
-
static int __scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork)
{
int ret;
@@ -3586,22 +3608,6 @@ static int __scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fo
return 0;
}
-static int scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork)
-{
- int ret;
-
- ret = __scx_init_task(sch, p, fork);
- if (!ret) {
- /*
- * While @p's rq is not locked. @p is not visible to the rest of
- * SCX yet and it's safe to update the flags and state.
- */
- p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
- scx_set_task_state(p, SCX_TASK_INIT);
- }
- return ret;
-}
-
static void __scx_enable_task(struct scx_sched *sch, struct task_struct *p)
{
struct rq *rq = task_rq(p);
@@ -3716,7 +3722,8 @@ static void scx_disable_and_exit_task(struct scx_sched *sch,
* If set, @p exited between __scx_init_task() and scx_enable_task() in
* scx_sub_enable() and is initialized for both the associated sched and
* its parent. Exit for the child too - scx_enable_task() never ran for
- * it, so undo only init_task.
+ * it, so undo only init_task. The flag is only set on the sub-enable
+ * path, so it's always clear when @p arrives here in %SCX_TASK_NONE.
*/
if (p->scx.flags & SCX_TASK_SUB_INIT) {
if (!WARN_ON_ONCE(!scx_enabling_sub_sched))
@@ -3764,10 +3771,14 @@ int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs)
#else
struct scx_sched *sch = scx_root;
#endif
- ret = scx_init_task(sch, p, true);
- if (!ret)
- scx_set_task_sched(p, sch);
- return ret;
+ scx_set_task_state(p, SCX_TASK_INIT_BEGIN);
+ ret = __scx_init_task(sch, p, true);
+ if (unlikely(ret)) {
+ scx_set_task_state(p, SCX_TASK_NONE);
+ return ret;
+ }
+ scx_set_task_state(p, SCX_TASK_INIT);
+ scx_set_task_sched(p, sch);
}
return 0;
@@ -3862,18 +3873,23 @@ void sched_ext_dead(struct task_struct *p)
* @p is off scx_tasks and wholly ours. scx_root_enable()'s READY ->
* ENABLED transitions can't race us. Disable ops for @p.
*
- * %SCX_TASK_OFF_TASKS synchronizes against cgroup task iteration - see
+ * %SCX_TASK_DEAD synchronizes against cgroup task iteration - see
* scx_task_iter_next_locked(). NONE tasks need no marking: cgroup
* iteration is only used from sub-sched paths, which require root
* enabled. Root enable transitions every live task to at least READY.
+ *
+ * %INIT_BEGIN means ops.init_task() is running for @p. Don't call
+ * into ops; transition to %DEAD so the post-init recheck unwinds
+ * via scx_sub_init_cancel_task().
*/
if (scx_get_task_state(p) != SCX_TASK_NONE) {
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(p, &rf);
- scx_disable_and_exit_task(scx_task_sched(p), p);
- p->scx.flags |= SCX_TASK_OFF_TASKS;
+ if (scx_get_task_state(p) != SCX_TASK_INIT_BEGIN)
+ scx_disable_and_exit_task(scx_task_sched(p), p);
+ scx_set_task_state(p, SCX_TASK_DEAD);
task_rq_unlock(rq, p, &rf);
}
}
@@ -3919,6 +3935,16 @@ static void switched_from_scx(struct rq *rq, struct task_struct *p)
if (task_dead_and_done(p))
return;
+ /*
+ * %NONE means SCX is no longer tracking @p at the task level (e.g.
+ * scx_fail_parent() handed @p back to the parent at NONE pending the
+ * parent's own teardown). There is nothing to disable; calling
+ * scx_disable_task() would WARN on the non-%ENABLED state and trigger a
+ * NONE -> READY validation failure.
+ */
+ if (scx_get_task_state(p) == SCX_TASK_NONE)
+ return;
+
scx_disable_task(scx_task_sched(p), p);
}
@@ -4808,6 +4834,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
kfree(sch->cgrp_path);
if (sch_cgroup(sch))
cgroup_put(sch_cgroup(sch));
+ if (sch->sub_kset)
+ kobject_put(&sch->sub_kset->kobj);
#endif /* CONFIG_EXT_SUB_SCHED */
for_each_possible_cpu(cpu) {
@@ -4931,10 +4959,30 @@ static const struct kset_uevent_ops scx_uevent_ops = {
*/
bool task_should_scx(int policy)
{
- if (!scx_enabled() || unlikely(scx_enable_state() == SCX_DISABLING))
+ /* if disabled, nothing should be on it */
+ if (!scx_enabled())
return false;
+
+ /* scx is taking over all SCHED_OTHER and SCHED_EXT tasks */
if (READ_ONCE(scx_switching_all))
return true;
+
+ /*
+ * scx is tearing down - keep new SCHED_EXT tasks out.
+ *
+ * Must come after scx_switching_all test, which serves as a proxy
+ * for __scx_switched_all. While __scx_switched_all is set, we must
+ * return true via the branch above: a fork routed to fair would
+ * stall because next_active_class() skips fair.
+ *
+ * This can develop into a deadlock - scx holds scx_enable_mutex across
+ * kthread_create() in scx_alloc_and_add_sched(); if the new kthread is
+ * the stalled task, the disable path can never grab the mutex to clear
+ * scx_switching_all.
+ */
+ if (unlikely(scx_enable_state() == SCX_DISABLING))
+ return false;
+
return policy == SCHED_EXT;
}
@@ -5585,10 +5633,12 @@ static void refresh_watchdog(void)
static s32 scx_link_sched(struct scx_sched *sch)
{
+ const char *err_msg = "";
+ s32 ret = 0;
+
scoped_guard(raw_spinlock_irq, &scx_sched_lock) {
#ifdef CONFIG_EXT_SUB_SCHED
struct scx_sched *parent = scx_parent(sch);
- s32 ret;
if (parent) {
/*
@@ -5598,15 +5648,16 @@ static s32 scx_link_sched(struct scx_sched *sch)
* parent can shoot us down.
*/
if (atomic_read(&parent->exit_kind) != SCX_EXIT_NONE) {
- scx_error(sch, "parent disabled");
- return -ENOENT;
+ err_msg = "parent disabled";
+ ret = -ENOENT;
+ break;
}
ret = rhashtable_lookup_insert_fast(&scx_sched_hash,
&sch->hash_node, scx_sched_hash_params);
if (ret) {
- scx_error(sch, "failed to insert into scx_sched_hash (%d)", ret);
- return ret;
+ err_msg = "failed to insert into scx_sched_hash";
+ break;
}
list_add_tail(&sch->sibling, &parent->children);
@@ -5616,6 +5667,15 @@ static s32 scx_link_sched(struct scx_sched *sch)
list_add_tail_rcu(&sch->all, &scx_sched_all);
}
+ /*
+ * scx_error() takes scx_sched_lock via scx_claim_exit(), so it must run after
+ * the guard above is released.
+ */
+ if (ret) {
+ scx_error(sch, "%s (%d)", err_msg, ret);
+ return ret;
+ }
+
refresh_watchdog();
return 0;
}
@@ -5685,7 +5745,7 @@ static void scx_fail_parent(struct scx_sched *sch,
scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
scx_disable_and_exit_task(sch, p);
- rcu_assign_pointer(p->scx.sched, parent);
+ scx_set_task_sched(p, parent);
}
}
scx_task_iter_stop(&sti);
@@ -5763,6 +5823,21 @@ static void scx_sub_disable(struct scx_sched *sch)
}
rq = task_rq_lock(p, &rf);
+
+ if (scx_get_task_state(p) == SCX_TASK_DEAD) {
+ /*
+ * sched_ext_dead() raced us between __scx_init_task()
+ * and this rq lock and ran exit_task() on @sch (the
+ * sched @p was on at that point), not on $parent.
+ * $parent's just-completed init is owed an exit_task()
+ * and we issue it here.
+ */
+ scx_sub_init_cancel_task(parent, p);
+ task_rq_unlock(rq, p, &rf);
+ put_task_struct(p);
+ continue;
+ }
+
scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
/*
* $p is initialized for $parent and still attached to
@@ -5771,13 +5846,14 @@ static void scx_sub_disable(struct scx_sched *sch)
* $p having already been initialized, and then enable.
*/
scx_disable_and_exit_task(sch, p);
+ scx_set_task_state(p, SCX_TASK_INIT_BEGIN);
scx_set_task_state(p, SCX_TASK_INIT);
- rcu_assign_pointer(p->scx.sched, parent);
+ scx_set_task_sched(p, parent);
scx_set_task_state(p, SCX_TASK_READY);
scx_enable_task(parent, p);
}
- task_rq_unlock(rq, p, &rf);
+ task_rq_unlock(rq, p, &rf);
put_task_struct(p);
}
scx_task_iter_stop(&sti);
@@ -5820,7 +5896,7 @@ static void scx_sub_disable(struct scx_sched *sch)
if (sch->ops.exit)
SCX_CALL_OP(sch, exit, NULL, sch->exit_info);
if (sch->sub_kset)
- kset_unregister(sch->sub_kset);
+ kobject_del(&sch->sub_kset->kobj);
kobject_del(&sch->kobj);
}
#else /* CONFIG_EXT_SUB_SCHED */
@@ -5954,7 +6030,7 @@ static void scx_root_disable(struct scx_sched *sch)
*/
#ifdef CONFIG_EXT_SUB_SCHED
if (sch->sub_kset)
- kset_unregister(sch->sub_kset);
+ kobject_del(&sch->sub_kset->kobj);
#endif
kobject_del(&sch->kobj);
@@ -6578,7 +6654,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
sch->slice_dfl = SCX_SLICE_DFL;
atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
- init_irq_work(&sch->disable_irq_work, scx_disable_irq_workfn);
+ sch->disable_irq_work = IRQ_WORK_INIT_HARD(scx_disable_irq_workfn);
kthread_init_work(&sch->disable_work, scx_disable_workfn);
timer_setup(&sch->bypass_lb_timer, scx_bypass_lb_timerfn, 0);
@@ -6594,6 +6670,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
rcu_assign_pointer(ops->priv, sch);
sch->kobj.kset = scx_kset;
+ INIT_LIST_HEAD(&sch->all);
#ifdef CONFIG_EXT_SUB_SCHED
char *buf = kzalloc(PATH_MAX, GFP_KERNEL);
@@ -6621,6 +6698,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
if (ret < 0) {
+ RCU_INIT_POINTER(ops->priv, NULL);
kobject_put(&sch->kobj);
return ERR_PTR(ret);
}
@@ -6628,6 +6706,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
if (ops->sub_attach) {
sch->sub_kset = kset_create_and_add("sub", NULL, &sch->kobj);
if (!sch->sub_kset) {
+ RCU_INIT_POINTER(ops->priv, NULL);
kobject_put(&sch->kobj);
return ERR_PTR(-ENOMEM);
}
@@ -6635,14 +6714,18 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
#else /* CONFIG_EXT_SUB_SCHED */
ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
if (ret < 0) {
+ RCU_INIT_POINTER(ops->priv, NULL);
kobject_put(&sch->kobj);
return ERR_PTR(ret);
}
#endif /* CONFIG_EXT_SUB_SCHED */
return sch;
+#ifdef CONFIG_EXT_SUB_SCHED
err_free_lb_resched:
+ RCU_INIT_POINTER(ops->priv, NULL);
free_cpumask_var(sch->bypass_lb_resched_cpumask);
+#endif
err_free_lb_cpumask:
free_cpumask_var(sch->bypass_lb_donee_cpumask);
err_stop_helper:
@@ -6752,6 +6835,19 @@ static void scx_root_enable_workfn(struct kthread_work *work)
goto err_unlock;
}
+ /*
+ * @ops->priv binds @ops to its scx_sched instance. It is set here by
+ * scx_alloc_and_add_sched() and cleared at the tail of bpf_scx_unreg(),
+ * which runs after scx_root_disable() has dropped scx_enable_mutex. If
+ * it's still non-NULL here, a previous attachment on @ops has not
+ * finished tearing down; proceeding would let the in-flight unreg's
+ * RCU_INIT_POINTER(NULL) clobber the @ops->priv we are about to assign.
+ */
+ if (rcu_access_pointer(ops->priv)) {
+ ret = -EBUSY;
+ goto err_unlock;
+ }
+
ret = alloc_kick_syncs();
if (ret)
goto err_unlock;
@@ -6874,6 +6970,9 @@ static void scx_root_enable_workfn(struct kthread_work *work)
scx_task_iter_start(&sti, NULL);
while ((p = scx_task_iter_next_locked(&sti))) {
+ struct rq_flags rf;
+ struct rq *rq;
+
/*
* @p may already be dead, have lost all its usages counts and
* be waiting for RCU grace period before being freed. @p can't
@@ -6882,20 +6981,47 @@ static void scx_root_enable_workfn(struct kthread_work *work)
if (!tryget_task_struct(p))
continue;
+ /*
+ * Set %INIT_BEGIN under the iter's rq lock so that a concurrent
+ * sched_ext_dead() does not call ops.exit_task() on @p while
+ * ops.init_task() is running. If sched_ext_dead() runs before
+ * this store, it has already removed @p from scx_tasks and the
+ * iter won't visit @p; if it runs after, it observes
+ * %INIT_BEGIN and transitions to %DEAD without calling ops,
+ * leaving the post-init recheck below to unwind.
+ */
+ scx_set_task_state(p, SCX_TASK_INIT_BEGIN);
scx_task_iter_unlock(&sti);
- ret = scx_init_task(sch, p, false);
- if (ret) {
- put_task_struct(p);
+ ret = __scx_init_task(sch, p, false);
+
+ rq = task_rq_lock(p, &rf);
+
+ if (unlikely(ret)) {
+ if (scx_get_task_state(p) != SCX_TASK_DEAD)
+ scx_set_task_state(p, SCX_TASK_NONE);
+ task_rq_unlock(rq, p, &rf);
scx_task_iter_stop(&sti);
scx_error(sch, "ops.init_task() failed (%d) for %s[%d]",
ret, p->comm, p->pid);
+ put_task_struct(p);
goto err_disable_unlock_all;
}
- scx_set_task_sched(p, sch);
- scx_set_task_state(p, SCX_TASK_READY);
+ if (scx_get_task_state(p) == SCX_TASK_DEAD) {
+ /*
+ * sched_ext_dead() observed %INIT_BEGIN and set %DEAD.
+ * ops.exit_task() is owed to the sched __scx_init_task()
+ * ran against; call it now.
+ */
+ scx_sub_init_cancel_task(sch, p);
+ } else {
+ scx_set_task_state(p, SCX_TASK_INIT);
+ scx_set_task_sched(p, sch);
+ scx_set_task_state(p, SCX_TASK_READY);
+ }
+ task_rq_unlock(rq, p, &rf);
put_task_struct(p);
}
scx_task_iter_stop(&sti);
@@ -7039,6 +7165,12 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
goto out_unlock;
}
+ /* See scx_root_enable_workfn() for the @ops->priv check. */
+ if (rcu_access_pointer(ops->priv)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
cgrp = cgroup_get_from_id(ops->sub_cgroup_id);
if (IS_ERR(cgrp)) {
ret = PTR_ERR(cgrp);
@@ -7165,6 +7297,21 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
goto abort;
rq = task_rq_lock(p, &rf);
+
+ if (scx_get_task_state(p) == SCX_TASK_DEAD) {
+ /*
+ * sched_ext_dead() raced us between __scx_init_task()
+ * and this rq lock and ran exit_task() on $parent (the
+ * sched @p was on at that point), not on @sch. @sch's
+ * just-completed init is owed an exit_task() and we
+ * issue it here.
+ */
+ scx_sub_init_cancel_task(sch, p);
+ task_rq_unlock(rq, p, &rf);
+ put_task_struct(p);
+ continue;
+ }
+
p->scx.flags |= SCX_TASK_SUB_INIT;
task_rq_unlock(rq, p, &rf);
@@ -7199,7 +7346,7 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
* $p is now only initialized for @sch and READY, which
* is what we want. Assign it to @sch and enable.
*/
- rcu_assign_pointer(p->scx.sched, sch);
+ scx_set_task_sched(p, sch);
scx_enable_task(sch, p);
p->scx.flags &= ~SCX_TASK_SUB_INIT;
@@ -7301,8 +7448,7 @@ static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
static DEFINE_MUTEX(helper_mutex);
struct scx_enable_cmd cmd;
- if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
- cpu_possible_mask)) {
+ if (housekeeping_enabled(HK_TYPE_DOMAIN_BOOT)) {
pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
return -EINVAL;
}
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 1decdce8cbef..8d3d96e847d8 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -143,8 +143,8 @@ obj-$(CONFIG_TRACE_REMOTE_TEST) += remote_test.o
targets += undefsyms_base.o
KASAN_SANITIZE_undefsyms_base.o := y
-UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __x86_indirect_thunk \
- __msan simple_ring_buffer \
+UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __msan \
+ __aeabi_unwind_cpp __s390_indirect_jump __x86_indirect_thunk simple_ring_buffer \
$(shell $(NM) -u $(obj)/undefsyms_base.o 2>/dev/null | awk '{print $$2}')
quiet_cmd_check_undefined = NM $<
@@ -154,7 +154,8 @@ quiet_cmd_check_undefined = NM $<
echo "Unexpected symbols in $<:" >&2; \
echo "$$undefsyms" >&2; \
false; \
- fi
+ fi; \
+ touch $@
$(obj)/%.o.checked: $(obj)/%.o $(obj)/undefsyms_base.o FORCE
$(call if_changed,check_undefined)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index af7079aa0f36..a02bd258677e 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2384,7 +2384,8 @@ static void bpf_kprobe_multi_link_release(struct bpf_link *link)
struct bpf_kprobe_multi_link *kmulti_link;
kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
- unregister_fprobe(&kmulti_link->fp);
+ /* Don't wait for RCU GP here. */
+ unregister_fprobe_async(&kmulti_link->fp);
kprobe_multi_put_modules(kmulti_link->mods, kmulti_link->mods_cnt);
}
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index cc49ebd2a773..f378613ad120 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -1093,14 +1093,15 @@ static int unregister_fprobe_nolock(struct fprobe *fp)
}
/**
- * unregister_fprobe() - Unregister fprobe.
+ * unregister_fprobe_async() - Unregister fprobe without RCU GP wait
* @fp: A fprobe data structure to be unregistered.
*
* Unregister fprobe (and remove ftrace hooks from the function entries).
+ * This function will NOT wait until the fprobe is no longer used.
*
* Return 0 if @fp is unregistered successfully, -errno if not.
*/
-int unregister_fprobe(struct fprobe *fp)
+int unregister_fprobe_async(struct fprobe *fp)
{
guard(mutex)(&fprobe_mutex);
if (!fp || !fprobe_registered(fp))
@@ -1108,6 +1109,24 @@ int unregister_fprobe(struct fprobe *fp)
return unregister_fprobe_nolock(fp);
}
+
+/**
+ * unregister_fprobe() - Unregister fprobe with RCU GP wait
+ * @fp: A fprobe data structure to be unregistered.
+ *
+ * Unregister fprobe (and remove ftrace hooks from the function entries).
+ * This function will block until the fprobe is no longer used.
+ *
+ * Return 0 if @fp is unregistered successfully, -errno if not.
+ */
+int unregister_fprobe(struct fprobe *fp)
+{
+ int ret = unregister_fprobe_async(fp);
+
+ if (!ret)
+ synchronize_rcu();
+ return ret;
+}
EXPORT_SYMBOL_GPL(unregister_fprobe);
static int __init fprobe_initcall(void)
diff --git a/kernel/trace/remote_test.c b/kernel/trace/remote_test.c
index 6c1b7701ddae..a3e2c9b606eb 100644
--- a/kernel/trace/remote_test.c
+++ b/kernel/trace/remote_test.c
@@ -110,9 +110,9 @@ static struct trace_buffer_desc *remote_test_load(unsigned long size, void *unus
return remote_test_buffer_desc;
err_unload:
- for_each_ring_buffer_desc(rb_desc, cpu, remote_test_buffer_desc)
+ for_each_ring_buffer_desc(rb_desc, cpu, desc)
remote_test_unload_simple_rb(rb_desc->cpu);
- trace_remote_free_buffer(remote_test_buffer_desc);
+ trace_remote_free_buffer(desc);
err_free_desc:
kfree(desc);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 5326924615a4..7b07d2004cc6 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -7,6 +7,7 @@
#include <linux/ring_buffer_types.h>
#include <linux/sched/isolation.h>
#include <linux/trace_recursion.h>
+#include <linux/panic_notifier.h>
#include <linux/trace_events.h>
#include <linux/ring_buffer.h>
#include <linux/trace_clock.h>
@@ -31,6 +32,7 @@
#include <linux/oom.h>
#include <linux/mm.h>
+#include <asm/ring_buffer.h>
#include <asm/local64.h>
#include <asm/local.h>
#include <asm/setup.h>
@@ -559,6 +561,7 @@ struct trace_buffer {
unsigned long range_addr_start;
unsigned long range_addr_end;
+ struct notifier_block flush_nb;
struct ring_buffer_meta *meta;
@@ -2521,6 +2524,16 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
kfree(cpu_buffer);
}
+/* Stop recording on a persistent buffer and flush cache if needed. */
+static int rb_flush_buffer_cb(struct notifier_block *nb, unsigned long event, void *data)
+{
+ struct trace_buffer *buffer = container_of(nb, struct trace_buffer, flush_nb);
+
+ ring_buffer_record_off(buffer);
+ arch_ring_buffer_flush_range(buffer->range_addr_start, buffer->range_addr_end);
+ return NOTIFY_DONE;
+}
+
static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
int order, unsigned long start,
unsigned long end,
@@ -2651,6 +2664,12 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
mutex_init(&buffer->mutex);
+ /* Persistent ring buffer must flush its cache on panic, before the reboot. */
+ if (start && end) {
+ buffer->flush_nb.notifier_call = rb_flush_buffer_cb;
+ atomic_notifier_chain_register(&panic_notifier_list, &buffer->flush_nb);
+ }
+
return_ptr(buffer);
fail_free_buffers:
@@ -2749,6 +2768,9 @@ ring_buffer_free(struct trace_buffer *buffer)
{
int cpu;
+ if (buffer->range_addr_start && buffer->range_addr_end)
+ atomic_notifier_chain_unregister(&panic_notifier_list, &buffer->flush_nb);
+
cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
irq_work_sync(&buffer->irq_work.work);
@@ -5407,6 +5429,7 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
iter->head_page = cpu_buffer->reader_page;
iter->head = cpu_buffer->reader_page->read;
iter->next_event = iter->head;
+ iter->missed_events = 0;
iter->cache_reader_page = iter->head_page;
iter->cache_read = cpu_buffer->read;
@@ -6086,10 +6109,7 @@ ring_buffer_peek(struct trace_buffer *buffer, int cpu, u64 *ts,
*/
bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter)
{
- bool ret = iter->missed_events != 0;
-
- iter->missed_events = 0;
- return ret;
+ return iter->missed_events != 0;
}
EXPORT_SYMBOL_GPL(ring_buffer_iter_dropped);
@@ -6251,7 +6271,7 @@ void ring_buffer_iter_advance(struct ring_buffer_iter *iter)
unsigned long flags;
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
-
+ iter->missed_events = 0;
rb_advance_iter(iter);
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
diff --git a/kernel/trace/simple_ring_buffer.c b/kernel/trace/simple_ring_buffer.c
index 02af2297ae5a..f4642f5adda3 100644
--- a/kernel/trace/simple_ring_buffer.c
+++ b/kernel/trace/simple_ring_buffer.c
@@ -395,7 +395,6 @@ int simple_ring_buffer_init_mm(struct simple_rb_per_cpu *cpu_buffer,
memset(cpu_buffer->meta, 0, sizeof(*cpu_buffer->meta));
cpu_buffer->meta->meta_page_size = PAGE_SIZE;
- cpu_buffer->meta->nr_subbufs = cpu_buffer->nr_pages;
/* The reader page is not part of the ring initially */
page = load_page(desc->page_va[0]);
@@ -431,12 +430,13 @@ int simple_ring_buffer_init_mm(struct simple_rb_per_cpu *cpu_buffer,
if (ret) {
for (i--; i >= 0; i--)
- unload_page((void *)desc->page_va[i]);
+ unload_page(bpages[i].page);
unload_page(cpu_buffer->meta);
return ret;
}
+ cpu_buffer->meta->nr_subbufs = cpu_buffer->nr_pages;
/* Close the ring */
bpage->link.next = &cpu_buffer->tail_page->link;
cpu_buffer->tail_page->link.prev = &bpage->link;
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 0dbbf6cca9bc..eb2c2bc8bc3d 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1369,10 +1369,8 @@ static const char *hist_field_name(struct hist_field *field,
len = snprintf(full_name, sizeof(full_name), fmt,
field->system, field->event_name,
field->name);
- if (len >= sizeof(full_name))
- return NULL;
-
- field_name = full_name;
+ if (len < sizeof(full_name))
+ field_name = full_name;
} else
field_name = field->name;
} else if (field->flags & HIST_FIELD_FL_TIMESTAMP)
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index bf1a507695b6..0dd7927df22a 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -386,13 +386,11 @@ static void tracing_map_elt_init_fields(struct tracing_map_elt *elt)
}
}
-static void tracing_map_elt_free(struct tracing_map_elt *elt)
+static void __tracing_map_elt_free(struct tracing_map_elt *elt)
{
if (!elt)
return;
- if (elt->map->ops && elt->map->ops->elt_free)
- elt->map->ops->elt_free(elt);
kfree(elt->fields);
kfree(elt->vars);
kfree(elt->var_set);
@@ -400,6 +398,17 @@ static void tracing_map_elt_free(struct tracing_map_elt *elt)
kfree(elt);
}
+static void tracing_map_elt_free(struct tracing_map_elt *elt)
+{
+ if (!elt)
+ return;
+
+ /* Only elements set up by ops->elt_alloc() should be passed to ops->elt_free(). */
+ if (elt->map->ops && elt->map->ops->elt_free)
+ elt->map->ops->elt_free(elt);
+ __tracing_map_elt_free(elt);
+}
+
static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map)
{
struct tracing_map_elt *elt;
@@ -444,7 +453,7 @@ static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map)
}
return elt;
free:
- tracing_map_elt_free(elt);
+ __tracing_map_elt_free(elt);
return ERR_PTR(err);
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 3d2e3b2ec528..33b721a9af02 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2296,6 +2296,18 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
WARN_ONCE(!is_chained_work(wq), "workqueue: cannot queue %ps on wq %s\n",
work->func, wq->name))) {
+ struct work_offq_data offqd;
+
+ /*
+ * State on entry: PENDING is set, work is off-queue (no
+ * insert_work() has run).
+ *
+ * Returning without clearing PENDING would leave the work
+ * in an inconsistent state (PENDING=1, PWQ=0, entry empty).
+ */
+ work_offqd_unpack(&offqd, *work_data_bits(work));
+ set_work_pool_and_clear_pending(work, offqd.pool_id,
+ work_offqd_pack_flags(&offqd));
return;
}
rcu_read_lock();
@@ -5642,7 +5654,9 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
ret = apply_workqueue_attrs_locked(wq, unbound_std_wq_attrs[highpri]);
}
- return ret;
+ if (ret)
+ goto enomem;
+ return 0;
enomem:
if (wq->cpu_pwq) {