From 87019cb6c26178cef8fb9f9265b6ab7c4bda5262 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2026 05:33:41 -1000 Subject: sched_ext: Mark scx_sched_hash insecure_elasticity scx_sched_hash is inserted into under scx_sched_lock (raw_spinlock_irq) in scx_link_sched(). rhashtable's sync grow path calls get_random_u32() and does a GFP_ATOMIC allocation; both acquire regular spinlocks, which is unsafe under raw_spinlock_t. Set insecure_elasticity to skip the sync grow. v2: - Dropped dsq_hash changes. Insertion is not under raw_spin_lock. - Switched from no_sync_grow flag to insecure_elasticity. Fixes: 25037af712eb ("sched_ext: Add rhashtable lookup for sub-schedulers") Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 012ca8bd70fb..7edd46f3ac43 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -32,6 +32,7 @@ static const struct rhashtable_params scx_sched_hash_params = { .key_len = sizeof_field(struct scx_sched, ops.sub_cgroup_id), .key_offset = offsetof(struct scx_sched, ops.sub_cgroup_id), .head_offset = offsetof(struct scx_sched, hash_node), + .insecure_elasticity = true, /* inserted under scx_sched_lock */ }; static struct rhashtable scx_sched_hash; -- cgit v1.2.3 From 2d2b026c3ea792a0c91d4acf4430d8b65bedf271 Mon Sep 17 00:00:00 2001 From: Cheng-Yang Chou Date: Mon, 20 Apr 2026 17:28:47 +0800 Subject: sched_ext: Deny SCX kfuncs to non-SCX struct_ops programs scx_kfunc_context_filter() currently allows non-SCX struct_ops programs (e.g. tcp_congestion_ops) to call SCX unlocked kfuncs. This is wrong for two reasons: - It is semantically incorrect: a TCP congestion control program has no business calling SCX kfuncs such as scx_bpf_kick_cpu(). 
- With CONFIG_EXT_SUB_SCHED=y, kfuncs like scx_bpf_kick_cpu() call scx_prog_sched(aux), which invokes bpf_prog_get_assoc_struct_ops(aux) and casts the result to struct sched_ext_ops * before reading ops->priv. For a non-SCX struct_ops program the returned pointer is the kdata of that struct_ops type, which is far smaller than sched_ext_ops, making the read an out-of-bounds access (confirmed with KASAN). Extend the filter to cover scx_kfunc_set_any and scx_kfunc_set_idle as well, and deny all SCX kfuncs for any struct_ops program that is not the SCX struct_ops. This addresses both issues: the semantic contract is enforced at the verifier level, and the runtime out-of-bounds access becomes unreachable. Fixes: d1d3c1c6ae36 ("sched_ext: Add verifier-time kfunc context filter") Suggested-by: Tejun Heo Signed-off-by: Cheng-Yang Chou Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 32 ++++++++++++++++++-------------- kernel/sched/ext_idle.c | 1 + kernel/sched/ext_idle.h | 1 + 3 files changed, 20 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7edd46f3ac43..d66fea57ee69 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -9480,6 +9480,7 @@ BTF_KFUNCS_END(scx_kfunc_ids_any) static const struct btf_kfunc_id_set scx_kfunc_set_any = { .owner = THIS_MODULE, .set = &scx_kfunc_ids_any, + .filter = scx_kfunc_context_filter, }; /* @@ -9527,13 +9528,12 @@ static const u32 scx_kf_allow_flags[] = { }; /* - * Verifier-time filter for context-sensitive SCX kfuncs. Registered via the - * .filter field on each per-group btf_kfunc_id_set. The BPF core invokes this - * for every kfunc call in the registered hook (BPF_PROG_TYPE_STRUCT_OPS or + * Verifier-time filter for SCX kfuncs. Registered via the .filter field on + * each per-group btf_kfunc_id_set. 
The BPF core invokes this for every kfunc + * call in the registered hook (BPF_PROG_TYPE_STRUCT_OPS or * BPF_PROG_TYPE_SYSCALL), regardless of which set originally introduced the - * kfunc - so the filter must short-circuit on kfuncs it doesn't govern (e.g. - * scx_kfunc_ids_any) by falling through to "allow" when none of the - * context-sensitive sets contain the kfunc. + * kfunc - so the filter must short-circuit on kfuncs it doesn't govern by + * falling through to "allow" when none of the SCX sets contain the kfunc. */ int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id) { @@ -9542,18 +9542,21 @@ int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id) bool in_enqueue = btf_id_set8_contains(&scx_kfunc_ids_enqueue_dispatch, kfunc_id); bool in_dispatch = btf_id_set8_contains(&scx_kfunc_ids_dispatch, kfunc_id); bool in_cpu_release = btf_id_set8_contains(&scx_kfunc_ids_cpu_release, kfunc_id); + bool in_idle = btf_id_set8_contains(&scx_kfunc_ids_idle, kfunc_id); + bool in_any = btf_id_set8_contains(&scx_kfunc_ids_any, kfunc_id); u32 moff, flags; - /* Not a context-sensitive kfunc (e.g. from scx_kfunc_ids_any) - allow. */ - if (!(in_unlocked || in_select_cpu || in_enqueue || in_dispatch || in_cpu_release)) + /* Not an SCX kfunc - allow. */ + if (!(in_unlocked || in_select_cpu || in_enqueue || in_dispatch || + in_cpu_release || in_idle || in_any)) return 0; /* SYSCALL progs (e.g. BPF test_run()) may call unlocked and select_cpu kfuncs. */ if (prog->type == BPF_PROG_TYPE_SYSCALL) - return (in_unlocked || in_select_cpu) ? 0 : -EACCES; + return (in_unlocked || in_select_cpu || in_idle || in_any) ? 0 : -EACCES; if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) - return -EACCES; + return (in_any || in_idle) ? 
0 : -EACCES; /* * add_subprog_and_kfunc() collects all kfunc calls, including dead code @@ -9566,14 +9569,15 @@ int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id) return 0; /* - * Non-SCX struct_ops: only unlocked kfuncs are safe. The other - * context-sensitive kfuncs assume the rq lock is held by the SCX - * dispatch path, which doesn't apply to other struct_ops users. + * Non-SCX struct_ops: SCX kfuncs are not permitted. */ if (prog->aux->st_ops != &bpf_sched_ext_ops) - return in_unlocked ? 0 : -EACCES; + return -EACCES; /* SCX struct_ops: check the per-op allow list. */ + if (in_any || in_idle) + return 0; + moff = prog->aux->attach_st_ops_member_off; flags = scx_kf_allow_flags[SCX_MOFF_IDX(moff)]; diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 443d12a3df67..c43d62d90e40 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -1467,6 +1467,7 @@ BTF_KFUNCS_END(scx_kfunc_ids_idle) static const struct btf_kfunc_id_set scx_kfunc_set_idle = { .owner = THIS_MODULE, .set = &scx_kfunc_ids_idle, + .filter = scx_kfunc_context_filter, }; /* diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h index dc35f850481e..8d169d3bbdf9 100644 --- a/kernel/sched/ext_idle.h +++ b/kernel/sched/ext_idle.h @@ -12,6 +12,7 @@ struct sched_ext_ops; +extern struct btf_id_set8 scx_kfunc_ids_idle; extern struct btf_id_set8 scx_kfunc_ids_select_cpu; void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops); -- cgit v1.2.3 From 4e3d7c89e15ac5dbf45b7d7a49bb374650c03339 Mon Sep 17 00:00:00 2001 From: zhidao su Date: Thu, 23 Apr 2026 10:58:32 +0800 Subject: sched_ext: Fix local_dsq_post_enq() to use task's scheduler in sub-sched local_dsq_post_enq() calls call_task_dequeue() with scx_root instead of the scheduler instance actually managing the task. When CONFIG_EXT_SUB_SCHED is enabled, tasks may be managed by a sub-scheduler whose ops.dequeue() callback differs from root's. 
Using scx_root causes the wrong scheduler's ops.dequeue() to be consulted: sub-sched tasks dispatched to a local DSQ via scx_bpf_dsq_move_to_local() will have SCX_TASK_IN_CUSTODY cleared but the sub-scheduler's ops.dequeue() is never invoked, violating the custody exit semantics. Fix by adding a 'struct scx_sched *sch' parameter to local_dsq_post_enq() and move_local_task_to_local_dsq(), and propagating the correct scheduler from their callers dispatch_enqueue(), move_task_between_dsqs(), and consume_dispatch_q(). This is consistent with dispatch_enqueue()'s non-local path which already passes 'sch' directly to call_task_dequeue() for global/bypass DSQs. Fixes: ebf1ccff79c4 ("sched_ext: Fix ops.dequeue() semantics") Signed-off-by: zhidao su Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index d66fea57ee69..1f670028bf19 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1389,13 +1389,13 @@ static void call_task_dequeue(struct scx_sched *sch, struct rq *rq, p->scx.flags &= ~SCX_TASK_IN_CUSTODY; } -static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p, - u64 enq_flags) +static void local_dsq_post_enq(struct scx_sched *sch, struct scx_dispatch_q *dsq, + struct task_struct *p, u64 enq_flags) { struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); bool preempt = false; - call_task_dequeue(scx_root, rq, p, 0); + call_task_dequeue(sch, rq, p, 0); /* * If @rq is in balance, the CPU is already vacant and looking for the @@ -1519,7 +1519,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq, * concurrently in a non-atomic way. 
*/ if (is_local) { - local_dsq_post_enq(dsq, p, enq_flags); + local_dsq_post_enq(sch, dsq, p, enq_flags); } else { /* * Task on global/bypass DSQ: leave custody, task on @@ -2130,7 +2130,8 @@ static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_fl schedule_reenq_local(rq, 0); } -static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags, +static void move_local_task_to_local_dsq(struct scx_sched *sch, + struct task_struct *p, u64 enq_flags, struct scx_dispatch_q *src_dsq, struct rq *dst_rq) { @@ -2150,7 +2151,7 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags, dsq_inc_nr(dst_dsq, p, enq_flags); p->scx.dsq = dst_dsq; - local_dsq_post_enq(dst_dsq, p, enq_flags); + local_dsq_post_enq(sch, dst_dsq, p, enq_flags); } /** @@ -2371,7 +2372,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch, /* @p is going from a non-local DSQ to a local DSQ */ if (src_rq == dst_rq) { task_unlink_from_dsq(p, src_dsq); - move_local_task_to_local_dsq(p, enq_flags, + move_local_task_to_local_dsq(sch, p, enq_flags, src_dsq, dst_rq); raw_spin_unlock(&src_dsq->lock); } else { @@ -2424,7 +2425,7 @@ retry: if (rq == task_rq) { task_unlink_from_dsq(p, dsq); - move_local_task_to_local_dsq(p, enq_flags, dsq, rq); + move_local_task_to_local_dsq(sch, p, enq_flags, dsq, rq); raw_spin_unlock(&dsq->lock); return true; } -- cgit v1.2.3 From 510a27055446b8f0d29487ca8b8d2033dc2b6ca6 Mon Sep 17 00:00:00 2001 From: Richard Cheng Date: Fri, 24 Apr 2026 18:02:21 +0800 Subject: sched_ext: sync disable_irq_work in bpf_scx_unreg() When unregistered my self-written scx scheduler, the following panic occurs. [ 229.923133] Kernel text patching generated an invalid instruction at 0xffff80009bc2c1f8! 
[ 229.923146] Internal error: Oops - BRK: 00000000f2000100 [#1] SMP [ 230.077871] CPU: 48 UID: 0 PID: 1760 Comm: kworker/u583:7 Not tainted 7.0.0+ #3 PREEMPT(full) [ 230.086677] Hardware name: NVIDIA GB200 NVL/P3809-BMC, BIOS 02.05.12 20251107 [ 230.093972] Workqueue: events_unbound bpf_map_free_deferred [ 230.099675] Sched_ext: invariant_0.1.0_aarch64_unknown_linux_gnu_debug (disabling), task: runnable_at=-174ms [ 230.116843] pc : 0xffff80009bc2c1f8 [ 230.120406] lr : dequeue_task_scx+0x270/0x2d0 [ 230.217749] Call trace: [ 230.228515] 0xffff80009bc2c1f8 (P) [ 230.232077] dequeue_task+0x84/0x188 [ 230.235728] sched_change_begin+0x1dc/0x250 [ 230.240000] __set_cpus_allowed_ptr_locked+0x17c/0x240 [ 230.245250] __set_cpus_allowed_ptr+0x74/0xf0 [ 230.249701] ___migrate_enable+0x4c/0xa0 [ 230.253707] bpf_map_free_deferred+0x1a4/0x1b0 [ 230.258246] process_one_work+0x184/0x540 [ 230.262342] worker_thread+0x19c/0x348 [ 230.266170] kthread+0x13c/0x150 [ 230.269465] ret_from_fork+0x10/0x20 [ 230.281393] Code: d4202000 d4202000 d4202000 d4202000 (d4202000) [ 230.287621] ---[ end trace 0000000000000000 ]--- [ 231.160046] Kernel panic - not syncing: Oops - BRK: Fatal exception in interrupt The root cause is that the JIT page backing ops->quiescent() is freed before all callers of that function have stopped. The expected ordering during teardown is: bitmap_zero(sch->has_op) + synchronize_rcu() -> guarantees no CPU will ever call sch->ops.* again -> only THEN free the BPF struct_ops JIT page bpf_scx_unreg() is supposed to enforce the order, but after commit f4a6c506d118 ("sched_ext: Always bounce scx_disable() through irq_work"), disable_work is no longer queued directly, causing kthread_flush_work() to be a noop. Thus, the caller drops the struct_ops map too early and poisoned with AARCH64_BREAK_FAULT before disable_workfn ever execute. 
So the subsequent dequeue_task() still sees SCX_HAS_OP(sch, quiescent) as true and calls ops.quiescent, which hits the poisoned page and triggers the BRK panic. Add a helper scx_flush_disable_work() so that future use cases that want to flush disable_work can use it. Also amend the call for scx_root_enable_workfn() and scx_sub_enable_workfn(), which have a similar pattern in the error path. Fixes: f4a6c506d118 ("sched_ext: Always bounce scx_disable() through irq_work") Signed-off-by: Richard Cheng Reviewed-by: Andrea Righi Reviewed-by: Cheng-Yang Chou Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 1f670028bf19..a018034dd81c 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5923,6 +5923,20 @@ static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind) irq_work_queue(&sch->disable_irq_work); } +/** + * scx_flush_disable_work - flush the disable work and wait for it to finish + * @sch: the scheduler + * + * sch->disable_work might still not be queued, causing kthread_flush_work() + * to be a noop. Syncing the irq_work first is required to guarantee the + * kthread work has been queued before waiting for it. + */ +static void scx_flush_disable_work(struct scx_sched *sch) +{ + irq_work_sync(&sch->disable_irq_work); + kthread_flush_work(&sch->disable_work); +} + static void dump_newline(struct seq_buf *s) { trace_sched_ext_dump(""); } @@ -6823,7 +6837,7 @@ err_disable: * completion. sch's base reference will be put by bpf_scx_unreg(). 
*/ scx_error(sch, "scx_root_enable() failed (%d)", ret); - kthread_flush_work(&sch->disable_work); + scx_flush_disable_work(sch); cmd->ret = 0; } @@ -7090,7 +7104,7 @@ err_unlock_and_disable: percpu_up_write(&scx_fork_rwsem); err_disable: mutex_unlock(&scx_enable_mutex); - kthread_flush_work(&sch->disable_work); + scx_flush_disable_work(sch); cmd->ret = 0; } @@ -7351,7 +7365,7 @@ static void bpf_scx_unreg(void *kdata, struct bpf_link *link) struct scx_sched *sch = rcu_dereference_protected(ops->priv, true); scx_disable(sch, SCX_EXIT_UNREG); - kthread_flush_work(&sch->disable_work); + scx_flush_disable_work(sch); RCU_INIT_POINTER(ops->priv, NULL); kobject_put(&sch->kobj); } -- cgit v1.2.3 From bd2d76455b65aab77652823919db128a8e585825 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 10:14:32 -1000 Subject: sched_ext: Defer scx_hardlockup() out of NMI scx_hardlockup() runs from NMI and eventually calls scx_claim_exit(), which takes scx_sched_lock. scx_sched_lock isn't NMI-safe and grabbing it from NMI context can lead to deadlocks. The hardlockup handler is best-effort recovery and the disable path it triggers runs off of irq_work anyway. Move the handle_lockup() call into an irq_work so it runs in IRQ context. Fixes: ebeca1f930ea ("sched_ext: Introduce cgroup sub-sched support") Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index a018034dd81c..34de1c9b7a7c 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4940,6 +4940,25 @@ void scx_softlockup(u32 dur_s) smp_processor_id(), dur_s); } +/* + * scx_hardlockup() runs from NMI and eventually calls scx_claim_exit(), + * which takes scx_sched_lock. scx_sched_lock isn't NMI-safe and grabbing + * it from NMI context can lead to deadlocks. Defer via irq_work; the + * disable path runs off irq_work anyway. 
+ */ +static atomic_t scx_hardlockup_cpu = ATOMIC_INIT(-1); + +static void scx_hardlockup_irq_workfn(struct irq_work *work) +{ + int cpu = atomic_xchg(&scx_hardlockup_cpu, -1); + + if (cpu >= 0 && handle_lockup("hard lockup - CPU %d", cpu)) + printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n", + cpu); +} + +static DEFINE_IRQ_WORK(scx_hardlockup_irq_work, scx_hardlockup_irq_workfn); + /** * scx_hardlockup - sched_ext hardlockup handler * @@ -4948,17 +4967,19 @@ void scx_softlockup(u32 dur_s) * Try kicking out the current scheduler in an attempt to recover the system to * a good state before taking more drastic actions. * - * Returns %true if sched_ext is enabled and abort was initiated, which may - * resolve the reported hardlockup. %false if sched_ext is not enabled or - * someone else already initiated abort. + * Queues an irq_work; the handle_lockup() call happens in IRQ context (see + * scx_hardlockup_irq_workfn). + * + * Returns %true if sched_ext is enabled and the work was queued, %false + * otherwise. */ bool scx_hardlockup(int cpu) { - if (!handle_lockup("hard lockup - CPU %d", cpu)) + if (!rcu_access_pointer(scx_root)) return false; - printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n", - cpu); + atomic_cmpxchg(&scx_hardlockup_cpu, -1, cpu); + irq_work_queue(&scx_hardlockup_irq_work); return true; } -- cgit v1.2.3 From 411d3ef1a70589755e3beed2f5bf1f8aa0c27d1a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 14:31:35 -1000 Subject: sched_ext: Unregister sub_kset on scheduler disable When ops.sub_attach is set, scx_alloc_and_add_sched() creates sub_kset as a child of &sch->kobj, which pins the parent with its own reference. The disable paths never call kset_unregister(), so the final kobject_put() in bpf_scx_unreg() leaves a stale reference and scx_kobj_release() never runs, leaking the whole struct scx_sched on every load/unload cycle. 
Unregister sub_kset in scx_root_disable() and scx_sub_disable() before kobject_del(&sch->kobj). Fixes: ebeca1f930ea ("sched_ext: Introduce cgroup sub-sched support") Reported-by: Chris Mason Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 34de1c9b7a7c..7f991ecb1398 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5721,6 +5721,8 @@ static void scx_sub_disable(struct scx_sched *sch) if (sch->ops.exit) SCX_CALL_OP(sch, exit, NULL, sch->exit_info); + if (sch->sub_kset) + kset_unregister(sch->sub_kset); kobject_del(&sch->kobj); } #else /* CONFIG_EXT_SUB_SCHED */ @@ -5852,6 +5854,10 @@ static void scx_root_disable(struct scx_sched *sch) * could observe an object of the same name still in the hierarchy when * the next scheduler is loaded. */ +#ifdef CONFIG_EXT_SUB_SCHED + if (sch->sub_kset) + kset_unregister(sch->sub_kset); +#endif kobject_del(&sch->kobj); free_kick_syncs(); -- cgit v1.2.3 From 4fda9f0e7c950da4fe03cedeb2ac818edf5d03e9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 14:31:35 -1000 Subject: sched_ext: Guard scx_dsq_move() against NULL kit->dsq after failed iter_new bpf_iter_scx_dsq_new() clears kit->dsq on failure and bpf_iter_scx_dsq_{next,destroy}() guard against that. scx_dsq_move() doesn't - it dereferences kit->dsq immediately, so a BPF program that calls scx_bpf_dsq_move[_vtime]() after a failed iter_new oopses the kernel. Return false if kit->dsq is NULL. 
Fixes: 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()") Cc: stable@vger.kernel.org # v6.12+ Reported-by: Chris Mason Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7f991ecb1398..68c67113204f 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -8076,12 +8076,22 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, struct task_struct *p, u64 dsq_id, u64 enq_flags) { struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq; - struct scx_sched *sch = src_dsq->sched; + struct scx_sched *sch; struct rq *this_rq, *src_rq, *locked_rq; bool dispatched = false; bool in_balance; unsigned long flags; + /* + * The verifier considers an iterator slot initialized on any + * KF_ITER_NEW return, so a BPF program may legally reach here after + * bpf_iter_scx_dsq_new() failed and left @kit->dsq NULL. + */ + if (unlikely(!src_dsq)) + return false; + + sch = src_dsq->sched; + if (!scx_vet_enq_flags(sch, dsq_id, &enq_flags)) return false; -- cgit v1.2.3 From da2d81b4118a74e65d2335e221a38d665902a98c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 14:31:35 -1000 Subject: sched_ext: Skip tasks with stale task_rq in bypass_lb_cpu() bypass_lb_cpu() transfers tasks between per-CPU bypass DSQs without migrating them - task_cpu() only updates when the donee later consumes the task via move_remote_task_to_local_dsq(). If the LB timer fires again before consumption and the new DSQ becomes a donor, @p is still on the previous CPU and task_rq(@p) != donor_rq. @p can't be moved without its own rq locked. Skip such tasks. 
Fixes: 95d1df610cdc ("sched_ext: Implement load balancer for bypass mode") Cc: stable@vger.kernel.org # v6.19+ Reported-by: Chris Mason Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 68c67113204f..f8500ce37b22 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5023,6 +5023,15 @@ resume: if (cpumask_empty(donee_mask)) break; + /* + * If an earlier pass placed @p on @donor_dsq from a different + * CPU and the donee hasn't consumed it yet, @p is still on the + * previous CPU and task_rq(@p) != @donor_rq. @p can't be moved + * without its rq locked. Skip. + */ + if (task_rq(p) != donor_rq) + continue; + donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr); if (donee >= nr_cpu_ids) continue; -- cgit v1.2.3 From 21a5a97ba47842ef0c52d6c89e501dce27806550 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 14:31:35 -1000 Subject: sched_ext: Don't disable tasks in scx_sub_enable_workfn() abort path scx_sub_enable_workfn()'s prep loop calls __scx_init_task(sch, p, false) without transitioning task state, then sets SCX_TASK_SUB_INIT. If prep fails partway, the abort path runs __scx_disable_and_exit_task(sch, p) on the marked tasks. Task state is still the parent's ENABLED, so that dispatches to the SCX_TASK_ENABLED arm and calls scx_disable_task(sch, p) - i.e. child->ops.disable() - for tasks on which child->ops.enable() never ran. A BPF sub-scheduler allocating per-task state in enable/freeing in disable would operate on uninitialized state. The dying-task branch in scx_disable_and_exit_task() has the same problem, and scx_enabling_sub_sched was cleared before the abort cleanup loop - a task exiting during cleanup tripped the WARN and skipped both ops.exit_task and the SCX_TASK_SUB_INIT clear, leaking per-task resources and leaving the task stuck. 
Introduce scx_sub_init_cancel_task() that calls ops.exit_task with cancelled=true - matching what the top-level init path does when init_task itself returns -errno. Use it in the abort loop and in the dying-task branch. scx_enabling_sub_sched now stays set until the abort loop finishes clearing SUB_INIT, so concurrent exits hitting the dying-task branch can still find @sch. That branch also clears SCX_TASK_SUB_INIT unconditionally when seen, leaving the task unmarked even if the WARN fires. Fixes: 337ec00b1d9c ("sched_ext: Implement cgroup sub-sched enabling and disabling") Reported-by: Chris Mason Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index f8500ce37b22..dd0539ab9ba8 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3633,6 +3633,22 @@ static void __scx_disable_and_exit_task(struct scx_sched *sch, SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args); } +/* + * Undo a completed __scx_init_task(sch, p, false) when scx_enable_task() never + * ran. The task state has not been transitioned, so this mirrors the + * SCX_TASK_INIT branch in __scx_disable_and_exit_task(). + */ +static void scx_sub_init_cancel_task(struct scx_sched *sch, struct task_struct *p) +{ + struct scx_exit_task_args args = { .cancelled = true }; + + lockdep_assert_held(&p->pi_lock); + lockdep_assert_rq_held(task_rq(p)); + + if (SCX_HAS_OP(sch, exit_task)) + SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args); +} + static void scx_disable_and_exit_task(struct scx_sched *sch, struct task_struct *p) { @@ -3641,11 +3657,12 @@ static void scx_disable_and_exit_task(struct scx_sched *sch, /* * If set, @p exited between __scx_init_task() and scx_enable_task() in * scx_sub_enable() and is initialized for both the associated sched and - * its parent. Disable and exit for the child too. 
+ * its parent. Exit for the child too - scx_enable_task() never ran for + * it, so undo only init_task. */ - if ((p->scx.flags & SCX_TASK_SUB_INIT) && - !WARN_ON_ONCE(!scx_enabling_sub_sched)) { - __scx_disable_and_exit_task(scx_enabling_sub_sched, p); + if (p->scx.flags & SCX_TASK_SUB_INIT) { + if (!WARN_ON_ONCE(!scx_enabling_sub_sched)) + scx_sub_init_cancel_task(scx_enabling_sub_sched, p); p->scx.flags &= ~SCX_TASK_SUB_INIT; } @@ -7124,16 +7141,23 @@ out_unlock: abort: put_task_struct(p); scx_task_iter_stop(&sti); - scx_enabling_sub_sched = NULL; + /* + * Undo __scx_init_task() for tasks we marked. scx_enable_task() never + * ran for @sch on them, so calling scx_disable_task() here would invoke + * ops.disable() without a matching ops.enable(). scx_enabling_sub_sched + * must stay set until SUB_INIT is cleared from every marked task - + * scx_disable_and_exit_task() reads it when a task exits concurrently. + */ scx_task_iter_start(&sti, sch->cgrp); while ((p = scx_task_iter_next_locked(&sti))) { if (p->scx.flags & SCX_TASK_SUB_INIT) { - __scx_disable_and_exit_task(sch, p); + scx_sub_init_cancel_task(sch, p); p->scx.flags &= ~SCX_TASK_SUB_INIT; } } scx_task_iter_stop(&sti); + scx_enabling_sub_sched = NULL; err_unlock_and_disable: /* we'll soon enter disable path, keep bypass on */ scx_cgroup_unlock(); -- cgit v1.2.3 From 80afd4c84bc8f5e80145ce35279f5ce53f6043db Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 14:31:35 -1000 Subject: sched_ext: Read scx_root under scx_cgroup_ops_rwsem in cgroup setters scx_group_set_{weight,idle,bandwidth}() cache scx_root before acquiring scx_cgroup_ops_rwsem, so the pointer can be stale by the time the op runs. If the loaded scheduler is disabled and freed (via RCU work) and another is enabled between the naked load and the rwsem acquire, the reader sees scx_cgroup_enabled=true (the new scheduler's) but dereferences the freed one - UAF on SCX_HAS_OP(sch, ...) / SCX_CALL_OP(sch, ...). 
scx_cgroup_enabled is toggled only under scx_cgroup_ops_rwsem write (scx_cgroup_{init,exit}), so reading scx_root inside the rwsem read section correlates @sch with the enabled snapshot. Fixes: a5bd6ba30b33 ("sched_ext: Use cgroup_lock/unlock() to synchronize against cgroup operations") Cc: stable@vger.kernel.org # v6.18+ Reported-by: Chris Mason Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index dd0539ab9ba8..f6d22636a4de 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4343,9 +4343,10 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) void scx_group_set_weight(struct task_group *tg, unsigned long weight) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch; percpu_down_read(&scx_cgroup_ops_rwsem); + sch = scx_root; if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) && tg->scx.weight != weight) @@ -4358,9 +4359,10 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) void scx_group_set_idle(struct task_group *tg, bool idle) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch; percpu_down_read(&scx_cgroup_ops_rwsem); + sch = scx_root; if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle)) SCX_CALL_OP(sch, cgroup_set_idle, NULL, tg_cgrp(tg), idle); @@ -4374,9 +4376,10 @@ void scx_group_set_idle(struct task_group *tg, bool idle) void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch; percpu_down_read(&scx_cgroup_ops_rwsem); + sch = scx_root; if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) && (tg->scx.bw_period_us != period_us || -- cgit v1.2.3 From cc2a387d330d1fc51a9b7f211a7e5d39c9f0ab94 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 14:31:35 -1000 Subject: sched_ext: Resolve caller's scheduler 
in scx_bpf_destroy_dsq() / scx_bpf_dsq_nr_queued() scx_bpf_create_dsq() resolves the calling scheduler via scx_prog_sched(aux) and inserts the new DSQ into that scheduler's dsq_hash. Its inverse scx_bpf_destroy_dsq() and the query helper scx_bpf_dsq_nr_queued() were hard-coded to rcu_dereference(scx_root), so a sub-scheduler could only destroy or query DSQs in the root scheduler's hash - never its own. If the root had a DSQ with the same id, the sub-sched silently destroyed it and the root aborted on the next dispatch ("invalid DSQ ID 0x0.."). Take a const struct bpf_prog_aux *aux via KF_IMPLICIT_ARGS and resolve the scheduler with scx_prog_sched(aux), matching scx_bpf_create_dsq(). Fixes: ebeca1f930ea ("sched_ext: Introduce cgroup sub-sched support") Reported-by: Chris Mason Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index f6d22636a4de..cc5df32db8ff 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -8722,11 +8722,12 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags, const struct bpf_prog_aux /** * scx_bpf_dsq_nr_queued - Return the number of queued tasks * @dsq_id: id of the DSQ + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Return the number of tasks in the DSQ matching @dsq_id. If not found, * -%ENOENT is returned. 
*/ -__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) +__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id, const struct bpf_prog_aux *aux) { struct scx_sched *sch; struct scx_dispatch_q *dsq; @@ -8734,7 +8735,7 @@ __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) preempt_disable(); - sch = rcu_dereference_sched(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) { ret = -ENODEV; goto out; @@ -8766,21 +8767,21 @@ out: /** * scx_bpf_destroy_dsq - Destroy a custom DSQ * @dsq_id: DSQ to destroy + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is * empty and no further tasks are dispatched to it. Ignored if called on a DSQ * which doesn't exist. Can be called from any online scx_ops operations. */ -__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id) +__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id, const struct bpf_prog_aux *aux) { struct scx_sched *sch; - rcu_read_lock(); - sch = rcu_dereference(scx_root); + guard(rcu)(); + sch = scx_prog_sched(aux); if (sch) destroy_dsq(sch, dsq_id); - rcu_read_unlock(); } /** @@ -9534,8 +9535,8 @@ BTF_KFUNCS_START(scx_kfunc_ids_any) BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_IMPLICIT_ARGS | KF_RCU); BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_IMPLICIT_ARGS | KF_RCU); BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS) -BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) -BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) +BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_destroy_dsq, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_IMPLICIT_ARGS | KF_RCU_PROTECTED | KF_RET_NULL) BTF_ID_FLAGS(func, scx_bpf_dsq_reenq, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2, KF_IMPLICIT_ARGS) -- cgit v1.2.3 From 2f2ea77092660b53bfcbc4acc590b57ce9ab5dce Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 
2026 14:31:35 -1000 Subject: sched_ext: Use dsq->first_task instead of list_empty() in dispatch_enqueue() FIFO-tail dispatch_enqueue()'s FIFO-tail path used list_empty(&dsq->list) to decide whether to set dsq->first_task on enqueue. dsq->list can contain parked BPF iterator cursors (SCX_DSQ_LNODE_ITER_CURSOR), so list_empty() is not a reliable "no real task" check. If the last real task is unlinked while a cursor is parked, first_task becomes NULL; the next FIFO-tail enqueue then sees list_empty() == false and skips the first_task update, leaving scx_bpf_dsq_peek() returning NULL for a non-empty DSQ. Test dsq->first_task directly, which already tracks only real tasks and is maintained under dsq->lock. Fixes: 44f5c8ec5b9a ("sched_ext: Add lockless peek operation for DSQs") Cc: stable@vger.kernel.org # v6.19+ Reported-by: Chris Mason Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi Cc: Ryan Newton --- kernel/sched/ext.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index cc5df32db8ff..8a2a90659c65 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1495,11 +1495,13 @@ static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq, if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) rcu_assign_pointer(dsq->first_task, p); } else { - bool was_empty; - - was_empty = list_empty(&dsq->list); + /* + * dsq->list can contain parked BPF iterator cursors, so + * list_empty() here isn't a reliable proxy for "no real + * task in the DSQ". Test dsq->first_task directly. 
+ */ list_add_tail(&p->scx.dsq_list.node, &dsq->list); - if (was_empty && !(dsq->id & SCX_DSQ_FLAG_BUILTIN)) + if (!dsq->first_task && !(dsq->id & SCX_DSQ_FLAG_BUILTIN)) rcu_assign_pointer(dsq->first_task, p); } } -- cgit v1.2.3 From 7fb39e4eb4c3db52e4707a6a1cd45362f7e803f5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 14:31:36 -1000 Subject: sched_ext: Save and restore scx_locked_rq across SCX_CALL_OP SCX_CALL_OP{,_RET}() unconditionally clears scx_locked_rq_state to NULL on exit. Correct at the top level, but ops can recurse via scx_bpf_sub_dispatch(): a parent's ops.dispatch calls the helper, which invokes the child's ops.dispatch under another SCX_CALL_OP. When the inner call returns, the NULL clobbers the outer's state. The parent's BPF then calls kfuncs like scx_bpf_cpuperf_set() which read scx_locked_rq()==NULL and re-acquire the already-held rq. Snapshot scx_locked_rq_state on entry and restore on exit. Rename the rq parameter to locked_rq across all SCX_CALL_OP* macros so the snapshot local can be typed as 'struct rq *' without colliding with the parameter token in the expansion. SCX_CALL_OP_TASK{,_RET}() and SCX_CALL_OP_2TASKS_RET() funnel through the two base macros and inherit the fix. Fixes: 4f8b122848db ("sched_ext: Add basic building blocks for nested sub-scheduler dispatching") Reported-by: Chris Mason Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 49 ++++++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 8a2a90659c65..26968d0a6752 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -470,24 +470,35 @@ static inline void update_locked_rq(struct rq *rq) __this_cpu_write(scx_locked_rq_state, rq); } -#define SCX_CALL_OP(sch, op, rq, args...) \ +/* + * SCX ops can recurse via scx_bpf_sub_dispatch() - the inner call must not + * clobber the outer's scx_locked_rq_state. 
Save it on entry, restore on exit. + */ +#define SCX_CALL_OP(sch, op, locked_rq, args...) \ do { \ - if (rq) \ - update_locked_rq(rq); \ + struct rq *__prev_locked_rq; \ + \ + if (locked_rq) { \ + __prev_locked_rq = scx_locked_rq(); \ + update_locked_rq(locked_rq); \ + } \ (sch)->ops.op(args); \ - if (rq) \ - update_locked_rq(NULL); \ + if (locked_rq) \ + update_locked_rq(__prev_locked_rq); \ } while (0) -#define SCX_CALL_OP_RET(sch, op, rq, args...) \ +#define SCX_CALL_OP_RET(sch, op, locked_rq, args...) \ ({ \ + struct rq *__prev_locked_rq; \ __typeof__((sch)->ops.op(args)) __ret; \ \ - if (rq) \ - update_locked_rq(rq); \ + if (locked_rq) { \ + __prev_locked_rq = scx_locked_rq(); \ + update_locked_rq(locked_rq); \ + } \ __ret = (sch)->ops.op(args); \ - if (rq) \ - update_locked_rq(NULL); \ + if (locked_rq) \ + update_locked_rq(__prev_locked_rq); \ __ret; \ }) @@ -499,39 +510,39 @@ do { \ * those subject tasks. * * Every SCX_CALL_OP_TASK*() call site invokes its op with @p's rq lock held - - * either via the @rq argument here, or (for ops.select_cpu()) via @p's pi_lock - * held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu. So if - * kf_tasks[] is set, @p's scheduler-protected fields are stable. + * either via the @locked_rq argument here, or (for ops.select_cpu()) via @p's + * pi_lock held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu. + * So if kf_tasks[] is set, @p's scheduler-protected fields are stable. * * kf_tasks[] can not stack, so task-based SCX ops must not nest. The * WARN_ON_ONCE() in each macro catches a re-entry of any of the three variants * while a previous one is still in progress. */ -#define SCX_CALL_OP_TASK(sch, op, rq, task, args...) \ +#define SCX_CALL_OP_TASK(sch, op, locked_rq, task, args...) 
\ do { \ WARN_ON_ONCE(current->scx.kf_tasks[0]); \ current->scx.kf_tasks[0] = task; \ - SCX_CALL_OP((sch), op, rq, task, ##args); \ + SCX_CALL_OP((sch), op, locked_rq, task, ##args); \ current->scx.kf_tasks[0] = NULL; \ } while (0) -#define SCX_CALL_OP_TASK_RET(sch, op, rq, task, args...) \ +#define SCX_CALL_OP_TASK_RET(sch, op, locked_rq, task, args...) \ ({ \ __typeof__((sch)->ops.op(task, ##args)) __ret; \ WARN_ON_ONCE(current->scx.kf_tasks[0]); \ current->scx.kf_tasks[0] = task; \ - __ret = SCX_CALL_OP_RET((sch), op, rq, task, ##args); \ + __ret = SCX_CALL_OP_RET((sch), op, locked_rq, task, ##args); \ current->scx.kf_tasks[0] = NULL; \ __ret; \ }) -#define SCX_CALL_OP_2TASKS_RET(sch, op, rq, task0, task1, args...) \ +#define SCX_CALL_OP_2TASKS_RET(sch, op, locked_rq, task0, task1, args...) \ ({ \ __typeof__((sch)->ops.op(task0, task1, ##args)) __ret; \ WARN_ON_ONCE(current->scx.kf_tasks[0]); \ current->scx.kf_tasks[0] = task0; \ current->scx.kf_tasks[1] = task1; \ - __ret = SCX_CALL_OP_RET((sch), op, rq, task0, task1, ##args); \ + __ret = SCX_CALL_OP_RET((sch), op, locked_rq, task0, task1, ##args); \ current->scx.kf_tasks[0] = NULL; \ current->scx.kf_tasks[1] = NULL; \ __ret; \ -- cgit v1.2.3 From 207d76a372fb1bb324eadc8cb5bcaa0a8da7cefd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 14:31:36 -1000 Subject: sched_ext: Pass held rq to SCX_CALL_OP() for dump_cpu/dump_task scx_dump_state() walks CPUs with rq_lock_irqsave() held and invokes ops.dump_cpu / ops.dump_task with NULL locked_rq, leaving scx_locked_rq_state NULL. If the BPF callback calls a kfunc that re-acquires rq based on scx_locked_rq() - e.g. scx_bpf_cpuperf_set(cpu) - it re-acquires the already-held rq. Pass the held rq to SCX_CALL_OP(). Thread it into scx_dump_task() too. The pre-loop ops.dump call runs before rq_lock_irqsave() so keeps rq=NULL. 
Fixes: 07814a9439a3 ("sched_ext: Print debug dump after an error exit") Cc: stable@vger.kernel.org # v6.12+ Reported-by: Chris Mason Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 26968d0a6752..73d629559d6d 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6117,9 +6117,8 @@ static void ops_dump_exit(void) scx_dump_data.cpu = -1; } -static void scx_dump_task(struct scx_sched *sch, - struct seq_buf *s, struct scx_dump_ctx *dctx, - struct task_struct *p, char marker) +static void scx_dump_task(struct scx_sched *sch, struct seq_buf *s, struct scx_dump_ctx *dctx, + struct rq *rq, struct task_struct *p, char marker) { static unsigned long bt[SCX_EXIT_BT_LEN]; struct scx_sched *task_sch = scx_task_sched(p); @@ -6160,7 +6159,7 @@ static void scx_dump_task(struct scx_sched *sch, if (SCX_HAS_OP(sch, dump_task)) { ops_dump_init(s, " "); - SCX_CALL_OP(sch, dump_task, NULL, dctx, p); + SCX_CALL_OP(sch, dump_task, rq, dctx, p); ops_dump_exit(); } @@ -6284,8 +6283,7 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei, used = seq_buf_used(&ns); if (SCX_HAS_OP(sch, dump_cpu)) { ops_dump_init(&ns, " "); - SCX_CALL_OP(sch, dump_cpu, NULL, - &dctx, cpu, idle); + SCX_CALL_OP(sch, dump_cpu, rq, &dctx, cpu, idle); ops_dump_exit(); } @@ -6308,11 +6306,11 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei, if (rq->curr->sched_class == &ext_sched_class && (dump_all_tasks || scx_task_on_sched(sch, rq->curr))) - scx_dump_task(sch, &s, &dctx, rq->curr, '*'); + scx_dump_task(sch, &s, &dctx, rq, rq->curr, '*'); list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) if (dump_all_tasks || scx_task_on_sched(sch, p)) - scx_dump_task(sch, &s, &dctx, p, ' '); + scx_dump_task(sch, &s, &dctx, rq, p, ' '); next: rq_unlock_irqrestore(rq, &rf); } -- cgit 
v1.2.3 From 4155fb489fa175ec74eedde7d02219cf2fe74303 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 14:31:36 -1000 Subject: sched_ext: Pass held rq to SCX_CALL_OP() for core_sched_before scx_prio_less() runs from core-sched's pick_next_task() path with rq locked but invokes ops.core_sched_before() with NULL locked_rq, leaving scx_locked_rq_state NULL. If the BPF callback calls a kfunc that re-acquires rq based on scx_locked_rq() - e.g. scx_bpf_cpuperf_set(cpu) - it re-acquires the already-held rq. Pass task_rq(a). Fixes: 7b0888b7cc19 ("sched_ext: Implement core-sched support") Cc: stable@vger.kernel.org # v6.12+ Reported-by: Chris Mason Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 73d629559d6d..ba977154273c 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3198,7 +3198,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, if (sch_a == sch_b && SCX_HAS_OP(sch_a, core_sched_before) && !scx_bypassing(sch_a, task_cpu(a))) return SCX_CALL_OP_2TASKS_RET(sch_a, core_sched_before, - NULL, + task_rq(a), (struct task_struct *)a, (struct task_struct *)b); else -- cgit v1.2.3 From d292aa00de1aea72961f94c0db43f6b5c72684c9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 14:31:36 -1000 Subject: sched_ext: Make bypass LB cpumasks per-scheduler scx_bypass_lb_{donee,resched}_cpumask were file-scope statics shared by all scheduler instances. With CONFIG_EXT_SUB_SCHED, multiple sched instances each arm their own bypass_lb_timer; concurrent bypass_lb_node() calls RMW the global cpumasks with no lock, corrupting donee/resched decisions. Move the cpumasks into struct scx_sched, allocate them alongside the timer in scx_alloc_and_add_sched(), free them in scx_sched_free_rcu_work(). 
Fixes: 95d1df610cdc ("sched_ext: Implement load balancer for bypass mode") Cc: stable@vger.kernel.org # v6.19+ Reported-by: Chris Mason Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 33 +++++++++++++++++++-------------- kernel/sched/ext_internal.h | 2 ++ 2 files changed, 21 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index ba977154273c..e07f8c46e399 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -53,8 +53,6 @@ DEFINE_STATIC_KEY_FALSE(__scx_enabled); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED); static DEFINE_RAW_SPINLOCK(scx_bypass_lock); -static cpumask_var_t scx_bypass_lb_donee_cpumask; -static cpumask_var_t scx_bypass_lb_resched_cpumask; static bool scx_init_task_enabled; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); @@ -4747,6 +4745,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work) irq_work_sync(&sch->disable_irq_work); kthread_destroy_worker(sch->helper); timer_shutdown_sync(&sch->bypass_lb_timer); + free_cpumask_var(sch->bypass_lb_donee_cpumask); + free_cpumask_var(sch->bypass_lb_resched_cpumask); #ifdef CONFIG_EXT_SUB_SCHED kfree(sch->cgrp_path); @@ -5123,8 +5123,8 @@ resume: static void bypass_lb_node(struct scx_sched *sch, int node) { const struct cpumask *node_mask = cpumask_of_node(node); - struct cpumask *donee_mask = scx_bypass_lb_donee_cpumask; - struct cpumask *resched_mask = scx_bypass_lb_resched_cpumask; + struct cpumask *donee_mask = sch->bypass_lb_donee_cpumask; + struct cpumask *resched_mask = sch->bypass_lb_resched_cpumask; u32 nr_tasks = 0, nr_cpus = 0, nr_balanced = 0; u32 nr_target, nr_donor_target; u32 before_min = U32_MAX, before_max = 0; @@ -6520,6 +6520,15 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, init_irq_work(&sch->disable_irq_work, scx_disable_irq_workfn); 
kthread_init_work(&sch->disable_work, scx_disable_workfn); timer_setup(&sch->bypass_lb_timer, scx_bypass_lb_timerfn, 0); + + if (!alloc_cpumask_var(&sch->bypass_lb_donee_cpumask, GFP_KERNEL)) { + ret = -ENOMEM; + goto err_stop_helper; + } + if (!alloc_cpumask_var(&sch->bypass_lb_resched_cpumask, GFP_KERNEL)) { + ret = -ENOMEM; + goto err_free_lb_cpumask; + } sch->ops = *ops; rcu_assign_pointer(ops->priv, sch); @@ -6529,14 +6538,14 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, char *buf = kzalloc(PATH_MAX, GFP_KERNEL); if (!buf) { ret = -ENOMEM; - goto err_stop_helper; + goto err_free_lb_resched; } cgroup_path(cgrp, buf, PATH_MAX); sch->cgrp_path = kstrdup(buf, GFP_KERNEL); kfree(buf); if (!sch->cgrp_path) { ret = -ENOMEM; - goto err_stop_helper; + goto err_free_lb_resched; } sch->cgrp = cgrp; @@ -6571,10 +6580,12 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, #endif /* CONFIG_EXT_SUB_SCHED */ return sch; -#ifdef CONFIG_EXT_SUB_SCHED +err_free_lb_resched: + free_cpumask_var(sch->bypass_lb_resched_cpumask); +err_free_lb_cpumask: + free_cpumask_var(sch->bypass_lb_donee_cpumask); err_stop_helper: kthread_destroy_worker(sch->helper); -#endif err_free_pcpu: for_each_possible_cpu(cpu) { if (cpu == bypass_fail_cpu) @@ -9761,12 +9772,6 @@ static int __init scx_init(void) return ret; } - if (!alloc_cpumask_var(&scx_bypass_lb_donee_cpumask, GFP_KERNEL) || - !alloc_cpumask_var(&scx_bypass_lb_resched_cpumask, GFP_KERNEL)) { - pr_err("sched_ext: Failed to allocate cpumasks\n"); - return -ENOMEM; - } - return 0; } __initcall(scx_init); diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h index 62ce4eaf6a3f..a075732d4430 100644 --- a/kernel/sched/ext_internal.h +++ b/kernel/sched/ext_internal.h @@ -1075,6 +1075,8 @@ struct scx_sched { struct irq_work disable_irq_work; struct kthread_work disable_work; struct timer_list bypass_lb_timer; + cpumask_var_t bypass_lb_donee_cpumask; + cpumask_var_t 
bypass_lb_resched_cpumask; struct rcu_work rcu_work; /* all ancestors including self */ -- cgit v1.2.3 From c0e8ddc76d54402171787414b1b8eb387812f1f6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 14:31:36 -1000 Subject: sched_ext: Align cgroup #ifdef guards with SUB_SCHED vs GROUP_SCHED Two EXT_GROUP_SCHED/SUB_SCHED guards are misclassified: - scx_root_enable_workfn()'s cgroup_get(cgrp) and the err_put_cgrp unwind in scx_alloc_and_add_sched() are under `#if GROUP || SUB`, but the matching cgroup_put() in scx_sched_free_rcu_work() is inside `#ifdef SUB` only (via sch->cgrp, stored only under SUB). GROUP-only would leak a reference on every root-sched enable. - sch_cgroup() / set_cgroup_sched() live under `#if GROUP || SUB` but touch SUB-only fields (sch->cgrp, cgroup->scx_sched). GROUP-only wouldn't compile. GROUP needs CGROUP_SCHED; SUB needs only CGROUPS. CGROUPS=y/CGROUP_SCHED=n gives the reachable GROUP=n, SUB=y combination; GROUP=y, SUB=n isn't reachable today (SUB is def_bool y under CGROUPS). Neither miscategorization triggers a real bug in any reachable config, but keep the guards honest: - Narrow cgroup_get and err_put_cgrp to `#ifdef SUB` (matches the free-side put). - Move sch_cgroup() and set_cgroup_sched() to a separate `#ifdef SUB` block with no-op stubs for the !SUB case; keep root_cgroup() and scx_cgroup_{ lock,unlock}() under `#if GROUP || SUB` since those only need cgroup core. 
Fixes: ebeca1f930ea ("sched_ext: Introduce cgroup sub-sched support") Reported-by: Chris Mason Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index e07f8c46e399..e2898d60315b 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4413,21 +4413,6 @@ static struct cgroup *root_cgroup(void) return &cgrp_dfl_root.cgrp; } -static struct cgroup *sch_cgroup(struct scx_sched *sch) -{ - return sch->cgrp; -} - -/* for each descendant of @cgrp including self, set ->scx_sched to @sch */ -static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) -{ - struct cgroup *pos; - struct cgroup_subsys_state *css; - - cgroup_for_each_live_descendant_pre(pos, css, cgrp) - rcu_assign_pointer(pos->scx_sched, sch); -} - static void scx_cgroup_lock(void) { #ifdef CONFIG_EXT_GROUP_SCHED @@ -4445,12 +4430,30 @@ static void scx_cgroup_unlock(void) } #else /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ static struct cgroup *root_cgroup(void) { return NULL; } -static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; } -static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {} static void scx_cgroup_lock(void) {} static void scx_cgroup_unlock(void) {} #endif /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ +#ifdef CONFIG_EXT_SUB_SCHED +static struct cgroup *sch_cgroup(struct scx_sched *sch) +{ + return sch->cgrp; +} + +/* for each descendant of @cgrp including self, set ->scx_sched to @sch */ +static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) +{ + struct cgroup *pos; + struct cgroup_subsys_state *css; + + cgroup_for_each_live_descendant_pre(pos, css, cgrp) + rcu_assign_pointer(pos->scx_sched, sch); +} +#else /* CONFIG_EXT_SUB_SCHED */ +static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; } +static 
void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {} +#endif /* CONFIG_EXT_SUB_SCHED */ + /* * Omitted operations: * @@ -6604,7 +6607,7 @@ err_free_ei: err_free_sch: kfree(sch); err_put_cgrp: -#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED) +#ifdef CONFIG_EXT_SUB_SCHED cgroup_put(cgrp); #endif return ERR_PTR(ret); @@ -6695,7 +6698,7 @@ static void scx_root_enable_workfn(struct kthread_work *work) if (ret) goto err_unlock; -#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED) +#ifdef CONFIG_EXT_SUB_SCHED cgroup_get(cgrp); #endif sch = scx_alloc_and_add_sched(ops, cgrp, NULL); -- cgit v1.2.3 From ea7c716a24aebe887e0990649ab697bd698cc325 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 14:31:36 -1000 Subject: sched_ext: Refuse cross-task select_cpu_from_kfunc calls select_cpu_from_kfunc() skipped pi_lock for @p when called from ops.select_cpu() or another rq-locked SCX op, assuming the held lock protects @p. scx_bpf_select_cpu_dfl() / __scx_bpf_select_cpu_and() accept an arbitrary KF_RCU task_struct, so a caller in e.g. ops.select_cpu(p1) or ops.enqueue(p1) can pass some other p2 - the held pi_lock / rq lock is p1's, not p2's - and reading p2->cpus_ptr / nr_cpus_allowed races with set_cpus_allowed_ptr() and migrate_disable_switch() on another CPU. Abort the scheduler on cross-task calls in both branches: for ops.select_cpu() use scx_kf_arg_task_ok() to verify @p is the wake-up task recorded in current->scx.kf_tasks[] by SCX_CALL_OP_TASK_RET(); for other rq-locked SCX ops compare task_rq(p) against scx_locked_rq(). v2: Switch the in_select_cpu cross-task check from direct_dispatch_task comparison to scx_kf_arg_task_ok(). The former spuriously rejects when ops.select_cpu() calls scx_bpf_dsq_insert() first, then calls scx_bpf_select_cpu_*() on the same task. 
(Andrea Righi) Fixes: 0022b328504d ("sched_ext: Decouple kfunc unlocked-context check from kf_mask") Reported-by: Chris Mason Signed-off-by: Tejun Heo Cc: Andrea Righi --- kernel/sched/ext_idle.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index c43d62d90e40..7468560a6d80 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -927,14 +927,24 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, * Accessing p->cpus_ptr / p->nr_cpus_allowed needs either @p's rq * lock or @p's pi_lock. Three cases: * - * - inside ops.select_cpu(): try_to_wake_up() holds @p's pi_lock. + * - inside ops.select_cpu(): try_to_wake_up() holds the wake-up + * task's pi_lock; the wake-up task is recorded in kf_tasks[0] + * by SCX_CALL_OP_TASK_RET(). * - other rq-locked SCX op: scx_locked_rq() points at the held rq. * - truly unlocked (UNLOCKED ops, SYSCALL, non-SCX struct_ops): * nothing held, take pi_lock ourselves. + * + * In the first two cases, BPF schedulers may pass an arbitrary task + * that the held lock doesn't cover. Refuse those. 
*/ if (this_rq()->scx.in_select_cpu) { + if (!scx_kf_arg_task_ok(sch, p)) + return -EINVAL; lockdep_assert_held(&p->pi_lock); - } else if (!scx_locked_rq()) { + } else if (scx_locked_rq()) { + if (task_rq(p) != scx_locked_rq()) + goto cross_task; + } else { raw_spin_lock_irqsave(&p->pi_lock, irq_flags); we_locked = true; } @@ -960,6 +970,11 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags); return cpu; + +cross_task: + scx_error(sch, "select_cpu kfunc called cross-task on %s[%d]", + p->comm, p->pid); + return -EINVAL; } /** -- cgit v1.2.3 From 05b4a9a9bc37f1fa289a8f07b4fbfc3ae681b650 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 14:31:36 -1000 Subject: sched_ext: Reject NULL-sch callers in scx_bpf_task_set_slice/dsq_vtime scx_prog_sched(aux) returns NULL for TRACING / SYSCALL BPF progs that have no struct_ops association when the root scheduler has sub_attach set. scx_bpf_task_set_slice() and scx_bpf_task_set_dsq_vtime() pass that NULL into scx_task_on_sched(sch, p), which under CONFIG_EXT_SUB_SCHED is rcu_access_pointer(p->scx.sched) == sch. For any non-scx task p->scx.sched is NULL, so NULL == NULL returns true and the authority gate is bypassed - a privileged but non-struct_ops-associated prog can poke p->scx.slice / p->scx.dsq_vtime on arbitrary tasks. Reject !sch up front so the gate only admits callers with a resolved scheduler. 
Fixes: 245d09c594ea ("sched_ext: Enforce scheduler ownership when updating slice and dsq_vtime") Reported-by: Chris Mason Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index e2898d60315b..f333fd0cb83f 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -8640,7 +8640,7 @@ __bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice, guard(rcu)(); sch = scx_prog_sched(aux); - if (unlikely(!scx_task_on_sched(sch, p))) + if (unlikely(!sch || !scx_task_on_sched(sch, p))) return false; p->scx.slice = slice; @@ -8663,7 +8663,7 @@ __bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime, guard(rcu)(); sch = scx_prog_sched(aux); - if (unlikely(!scx_task_on_sched(sch, p))) + if (unlikely(!sch || !scx_task_on_sched(sch, p))) return false; p->scx.dsq_vtime = vtime; -- cgit v1.2.3 From deb7b2f93d0129b79425f830a1e5e7e1bb2c4973 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 14:31:36 -1000 Subject: sched_ext: Release cpus_read_lock on scx_link_sched() failure in root enable scx_root_enable_workfn() takes cpus_read_lock() before scx_link_sched(sch), but the `if (ret) goto err_disable` on failure skips the matching cpus_read_unlock() - all other err_disable gotos along this path drop the lock first. scx_link_sched() only returns non-zero on the sub-sched path (parent != NULL), so the leak path is unreachable via the root caller today. Still, the unwind is out of line with the surrounding paths. Drop cpus_read_lock() before goto err_disable. v2: Correct Fixes: tag (Andrea Righi). 
Fixes: 25037af712eb ("sched_ext: Add rhashtable lookup for sub-schedulers") Reported-by: Chris Mason Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index f333fd0cb83f..9eda20e5fdb8 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6736,8 +6736,10 @@ static void scx_root_enable_workfn(struct kthread_work *work) rcu_assign_pointer(scx_root, sch); ret = scx_link_sched(sch); - if (ret) + if (ret) { + cpus_read_unlock(); goto err_disable; + } scx_idle_enable(ops); -- cgit v1.2.3 From 163f8b7f9a84086c67c76aeadc04e6d43e32df6e Mon Sep 17 00:00:00 2001 From: Kuba Piecuch Date: Tue, 28 Apr 2026 12:46:01 +0000 Subject: sched_ext: Call wakeup_preempt() in local_dsq_post_enq() There are several edge cases (see linked thread) where an IMMED task can be left lingering on a local DSQ if an RT task swoops in at the wrong time. All of these edge cases are due to rq->next_class being idle even after dispatching a task to rq's local DSQ. We should bump rq->next_class to &ext_sched_class as soon as we've inserted a task into the local DSQ. To optimize the common case of rq->next_class == &ext_sched_class, only call wakeup_preempt() if rq->next_class is below EXT. If next_class is EXT or above, wakeup_preempt() is a no-op anyway. This lets us also simplify the preempt_curr() logic a bit since wakeup_preempt() will call preempt_curr() for us if next_class is below EXT. 
Link: https://lore.kernel.org/all/DHZPHUFXB4N3.2RY28MUEWBNYK@google.com/ Signed-off-by: Kuba Piecuch Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 44 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 9eda20e5fdb8..cac0b18239fe 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1402,14 +1402,51 @@ static void local_dsq_post_enq(struct scx_sched *sch, struct scx_dispatch_q *dsq struct task_struct *p, u64 enq_flags) { struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); - bool preempt = false; call_task_dequeue(sch, rq, p, 0); + /* + * Note that @rq's lock may be dropped between this enqueue and @p + * actually getting on CPU. This gives higher-class tasks (e.g. RT) + * an opportunity to wake up on @rq and prevent @p from running. + * Here are some concrete examples: + * + * Example 1: + * + * We dispatch two tasks from a single ops.dispatch(): + * - First, a local task to this CPU's local DSQ; + * - Second, a local/remote task to a remote CPU's local DSQ. + * We must drop the local rq lock in order to finish the second + * dispatch. In that time, an RT task can wake up on the local rq. + * + * Example 2: + * + * We dispatch a local/remote task to a remote CPU's local DSQ. + * We must drop the remote rq lock before the dispatched task can run, + * which gives an RT task an opportunity to wake up on the remote rq. + * + * Both examples work the same if we replace dispatching with moving + * the tasks from a user-created DSQ. + * + * We must detect these wakeups so that we can re-enqueue IMMED tasks + * from @rq's local DSQ. scx_wakeup_preempt() serves exactly this + * purpose, but for it to be invoked, we must ensure that we bump + * @rq->next_class to &ext_sched_class if it's currently idle. 
+ * + * wakeup_preempt() does the bumping, and since we only invoke it if + * @rq->next_class is below &ext_sched_class, it will also + * resched_curr(rq). + */ + if (sched_class_above(p->sched_class, rq->next_class)) + wakeup_preempt(rq, p, 0); + /* * If @rq is in balance, the CPU is already vacant and looking for the * next task to run. No need to preempt or trigger resched after moving * @p into its local DSQ. + * Note that the wakeup_preempt() above may have already triggered + * a resched if @rq->next_class was idle. It's harmless, since + * need_resched is cleared immediately after task pick. */ if (rq->scx.flags & SCX_RQ_IN_BALANCE) return; @@ -1417,11 +1454,8 @@ static void local_dsq_post_enq(struct scx_sched *sch, struct scx_dispatch_q *dsq if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && rq->curr->sched_class == &ext_sched_class) { rq->curr->scx.slice = 0; - preempt = true; - } - - if (preempt || sched_class_above(&ext_sched_class, rq->curr->sched_class)) resched_curr(rq); + } } static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq, -- cgit v1.2.3 From d99f7a32f09dccbe396187370ec1a74a31b73d7e Mon Sep 17 00:00:00 2001 From: Cheng-Yang Chou Date: Wed, 29 Apr 2026 01:36:12 +0800 Subject: sched_ext: Fix scx_flush_disable_work() UAF race scx_flush_disable_work() calls irq_work_sync() followed by kthread_flush_work() to ensure that the disable kthread work has fully completed before bpf_scx_unreg() frees the SCX scheduler. 
However, a concurrent scx_vexit() (e.g., triggered by a watchdog stall) creates a race window between scx_claim_exit() and irq_work_queue(): CPU A (scx_vexit (watchdog)) CPU B (bpf_scx_unreg) ---- ---- scx_claim_exit() atomic_try_cmpxchg(NONE->kind) stack_trace_save() vscnprintf() scx_disable() scx_claim_exit() -> FAIL scx_flush_disable_work() irq_work_sync() // no-op: not queued yet kthread_flush_work() // no-op: not queued yet kobject_put(&sch->kobj) -> free %sch irq_work_queue() -> UAF on %sch scx_disable_irq_workfn() kthread_queue_work() -> UAF The root cause is that CPU B's scx_flush_disable_work() returns after syncing an irq_work that has not yet been queued, while CPU A is still executing the code between scx_claim_exit() and irq_work_queue(). Loop until exit_kind reaches SCX_EXIT_DONE or SCX_EXIT_NONE, draining disable_irq_work and disable_work in each pass. This ensures that any work queued after the previous check is caught, while also correctly handling cases where no disable was triggered (e.g., the scx_sub_enable_workfn() abort path). 
Fixes: 510a27055446 ("sched_ext: sync disable_irq_work in bpf_scx_unreg()") Reported-by: https://sashiko.dev/#/patchset/20260424100221.32407-1-icheng%40nvidia.com Suggested-by: Tejun Heo Signed-off-by: Cheng-Yang Chou Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index cac0b18239fe..9483be03a4ca 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6039,8 +6039,13 @@ static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind) */ static void scx_flush_disable_work(struct scx_sched *sch) { - irq_work_sync(&sch->disable_irq_work); - kthread_flush_work(&sch->disable_work); + int kind; + + do { + irq_work_sync(&sch->disable_irq_work); + kthread_flush_work(&sch->disable_work); + kind = atomic_read(&sch->exit_kind); + } while (kind != SCX_EXIT_NONE && kind != SCX_EXIT_DONE); } static void dump_newline(struct seq_buf *s) -- cgit v1.2.3