summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/sched/ext.c398
-rw-r--r--kernel/sched/ext_idle.c20
-rw-r--r--kernel/sched/ext_idle.h1
-rw-r--r--kernel/sched/ext_internal.h2
4 files changed, 297 insertions, 124 deletions
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index e426e27b6794..345aa11b84b2 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -32,6 +32,7 @@ static const struct rhashtable_params scx_sched_hash_params = {
.key_len = sizeof_field(struct scx_sched, ops.sub_cgroup_id),
.key_offset = offsetof(struct scx_sched, ops.sub_cgroup_id),
.head_offset = offsetof(struct scx_sched, hash_node),
+ .insecure_elasticity = true, /* inserted under scx_sched_lock */
};
static struct rhashtable scx_sched_hash;
@@ -52,8 +53,6 @@ DEFINE_STATIC_KEY_FALSE(__scx_enabled);
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED);
static DEFINE_RAW_SPINLOCK(scx_bypass_lock);
-static cpumask_var_t scx_bypass_lb_donee_cpumask;
-static cpumask_var_t scx_bypass_lb_resched_cpumask;
static bool scx_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
@@ -469,24 +468,35 @@ static inline void update_locked_rq(struct rq *rq)
__this_cpu_write(scx_locked_rq_state, rq);
}
-#define SCX_CALL_OP(sch, op, rq, args...) \
+/*
+ * SCX ops can recurse via scx_bpf_sub_dispatch() - the inner call must not
+ * clobber the outer's scx_locked_rq_state. Save it on entry, restore on exit.
+ */
+#define SCX_CALL_OP(sch, op, locked_rq, args...) \
do { \
- if (rq) \
- update_locked_rq(rq); \
+ struct rq *__prev_locked_rq; \
+ \
+ if (locked_rq) { \
+ __prev_locked_rq = scx_locked_rq(); \
+ update_locked_rq(locked_rq); \
+ } \
(sch)->ops.op(args); \
- if (rq) \
- update_locked_rq(NULL); \
+ if (locked_rq) \
+ update_locked_rq(__prev_locked_rq); \
} while (0)
-#define SCX_CALL_OP_RET(sch, op, rq, args...) \
+#define SCX_CALL_OP_RET(sch, op, locked_rq, args...) \
({ \
+ struct rq *__prev_locked_rq; \
__typeof__((sch)->ops.op(args)) __ret; \
\
- if (rq) \
- update_locked_rq(rq); \
+ if (locked_rq) { \
+ __prev_locked_rq = scx_locked_rq(); \
+ update_locked_rq(locked_rq); \
+ } \
__ret = (sch)->ops.op(args); \
- if (rq) \
- update_locked_rq(NULL); \
+ if (locked_rq) \
+ update_locked_rq(__prev_locked_rq); \
__ret; \
})
@@ -498,39 +508,39 @@ do { \
* those subject tasks.
*
* Every SCX_CALL_OP_TASK*() call site invokes its op with @p's rq lock held -
- * either via the @rq argument here, or (for ops.select_cpu()) via @p's pi_lock
- * held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu. So if
- * kf_tasks[] is set, @p's scheduler-protected fields are stable.
+ * either via the @locked_rq argument here, or (for ops.select_cpu()) via @p's
+ * pi_lock held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu.
+ * So if kf_tasks[] is set, @p's scheduler-protected fields are stable.
*
* kf_tasks[] can not stack, so task-based SCX ops must not nest. The
* WARN_ON_ONCE() in each macro catches a re-entry of any of the three variants
* while a previous one is still in progress.
*/
-#define SCX_CALL_OP_TASK(sch, op, rq, task, args...) \
+#define SCX_CALL_OP_TASK(sch, op, locked_rq, task, args...) \
do { \
WARN_ON_ONCE(current->scx.kf_tasks[0]); \
current->scx.kf_tasks[0] = task; \
- SCX_CALL_OP((sch), op, rq, task, ##args); \
+ SCX_CALL_OP((sch), op, locked_rq, task, ##args); \
current->scx.kf_tasks[0] = NULL; \
} while (0)
-#define SCX_CALL_OP_TASK_RET(sch, op, rq, task, args...) \
+#define SCX_CALL_OP_TASK_RET(sch, op, locked_rq, task, args...) \
({ \
__typeof__((sch)->ops.op(task, ##args)) __ret; \
WARN_ON_ONCE(current->scx.kf_tasks[0]); \
current->scx.kf_tasks[0] = task; \
- __ret = SCX_CALL_OP_RET((sch), op, rq, task, ##args); \
+ __ret = SCX_CALL_OP_RET((sch), op, locked_rq, task, ##args); \
current->scx.kf_tasks[0] = NULL; \
__ret; \
})
-#define SCX_CALL_OP_2TASKS_RET(sch, op, rq, task0, task1, args...) \
+#define SCX_CALL_OP_2TASKS_RET(sch, op, locked_rq, task0, task1, args...) \
({ \
__typeof__((sch)->ops.op(task0, task1, ##args)) __ret; \
WARN_ON_ONCE(current->scx.kf_tasks[0]); \
current->scx.kf_tasks[0] = task0; \
current->scx.kf_tasks[1] = task1; \
- __ret = SCX_CALL_OP_RET((sch), op, rq, task0, task1, ##args); \
+ __ret = SCX_CALL_OP_RET((sch), op, locked_rq, task0, task1, ##args); \
current->scx.kf_tasks[0] = NULL; \
current->scx.kf_tasks[1] = NULL; \
__ret; \
@@ -1388,18 +1398,55 @@ static void call_task_dequeue(struct scx_sched *sch, struct rq *rq,
p->scx.flags &= ~SCX_TASK_IN_CUSTODY;
}
-static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p,
- u64 enq_flags)
+static void local_dsq_post_enq(struct scx_sched *sch, struct scx_dispatch_q *dsq,
+ struct task_struct *p, u64 enq_flags)
{
struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
- bool preempt = false;
- call_task_dequeue(scx_root, rq, p, 0);
+ call_task_dequeue(sch, rq, p, 0);
+
+ /*
+ * Note that @rq's lock may be dropped between this enqueue and @p
+ * actually getting on CPU. This gives higher-class tasks (e.g. RT)
+ * an opportunity to wake up on @rq and prevent @p from running.
+ * Here are some concrete examples:
+ *
+ * Example 1:
+ *
+ * We dispatch two tasks from a single ops.dispatch():
+ * - First, a local task to this CPU's local DSQ;
+ * - Second, a local/remote task to a remote CPU's local DSQ.
+ * We must drop the local rq lock in order to finish the second
+ * dispatch. In that time, an RT task can wake up on the local rq.
+ *
+ * Example 2:
+ *
+ * We dispatch a local/remote task to a remote CPU's local DSQ.
+ * We must drop the remote rq lock before the dispatched task can run,
+ * which gives an RT task an opportunity to wake up on the remote rq.
+ *
+ * Both examples work the same if we replace dispatching with moving
+ * the tasks from a user-created DSQ.
+ *
+ * We must detect these wakeups so that we can re-enqueue IMMED tasks
+ * from @rq's local DSQ. scx_wakeup_preempt() serves exactly this
+ * purpose, but for it to be invoked, we must ensure that we bump
+ * @rq->next_class to &ext_sched_class if it's currently idle.
+ *
+ * wakeup_preempt() does the bumping, and since we only invoke it if
+ * @rq->next_class is below &ext_sched_class, it will also
+ * resched_curr(rq).
+ */
+ if (sched_class_above(p->sched_class, rq->next_class))
+ wakeup_preempt(rq, p, 0);
/*
* If @rq is in balance, the CPU is already vacant and looking for the
* next task to run. No need to preempt or trigger resched after moving
* @p into its local DSQ.
+ * Note that the wakeup_preempt() above may have already triggered
+ * a resched if @rq->next_class was idle. It's harmless, since
+ * need_resched is cleared immediately after task pick.
*/
if (rq->scx.flags & SCX_RQ_IN_BALANCE)
return;
@@ -1407,11 +1454,8 @@ static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p
if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr &&
rq->curr->sched_class == &ext_sched_class) {
rq->curr->scx.slice = 0;
- preempt = true;
- }
-
- if (preempt || sched_class_above(&ext_sched_class, rq->curr->sched_class))
resched_curr(rq);
+ }
}
static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq,
@@ -1494,11 +1538,13 @@ static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq,
if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN))
rcu_assign_pointer(dsq->first_task, p);
} else {
- bool was_empty;
-
- was_empty = list_empty(&dsq->list);
+ /*
+ * dsq->list can contain parked BPF iterator cursors, so
+ * list_empty() here isn't a reliable proxy for "no real
+ * task in the DSQ". Test dsq->first_task directly.
+ */
list_add_tail(&p->scx.dsq_list.node, &dsq->list);
- if (was_empty && !(dsq->id & SCX_DSQ_FLAG_BUILTIN))
+ if (!dsq->first_task && !(dsq->id & SCX_DSQ_FLAG_BUILTIN))
rcu_assign_pointer(dsq->first_task, p);
}
}
@@ -1518,7 +1564,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq,
* concurrently in a non-atomic way.
*/
if (is_local) {
- local_dsq_post_enq(dsq, p, enq_flags);
+ local_dsq_post_enq(sch, dsq, p, enq_flags);
} else {
/*
* Task on global/bypass DSQ: leave custody, task on
@@ -2129,7 +2175,8 @@ static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_fl
schedule_reenq_local(rq, 0);
}
-static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
+static void move_local_task_to_local_dsq(struct scx_sched *sch,
+ struct task_struct *p, u64 enq_flags,
struct scx_dispatch_q *src_dsq,
struct rq *dst_rq)
{
@@ -2149,7 +2196,7 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
dsq_inc_nr(dst_dsq, p, enq_flags);
p->scx.dsq = dst_dsq;
- local_dsq_post_enq(dst_dsq, p, enq_flags);
+ local_dsq_post_enq(sch, dst_dsq, p, enq_flags);
}
/**
@@ -2370,7 +2417,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch,
/* @p is going from a non-local DSQ to a local DSQ */
if (src_rq == dst_rq) {
task_unlink_from_dsq(p, src_dsq);
- move_local_task_to_local_dsq(p, enq_flags,
+ move_local_task_to_local_dsq(sch, p, enq_flags,
src_dsq, dst_rq);
raw_spin_unlock(&src_dsq->lock);
} else {
@@ -2423,7 +2470,7 @@ retry:
if (rq == task_rq) {
task_unlink_from_dsq(p, dsq);
- move_local_task_to_local_dsq(p, enq_flags, dsq, rq);
+ move_local_task_to_local_dsq(sch, p, enq_flags, dsq, rq);
raw_spin_unlock(&dsq->lock);
return true;
}
@@ -3183,7 +3230,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
if (sch_a == sch_b && SCX_HAS_OP(sch_a, core_sched_before) &&
!scx_bypassing(sch_a, task_cpu(a)))
return SCX_CALL_OP_2TASKS_RET(sch_a, core_sched_before,
- NULL,
+ task_rq(a),
(struct task_struct *)a,
(struct task_struct *)b);
else
@@ -3631,6 +3678,22 @@ static void __scx_disable_and_exit_task(struct scx_sched *sch,
SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args);
}
+/*
+ * Undo a completed __scx_init_task(sch, p, false) when scx_enable_task() never
+ * ran. The task state has not been transitioned, so this mirrors the
+ * SCX_TASK_INIT branch in __scx_disable_and_exit_task().
+ */
+static void scx_sub_init_cancel_task(struct scx_sched *sch, struct task_struct *p)
+{
+ struct scx_exit_task_args args = { .cancelled = true };
+
+ lockdep_assert_held(&p->pi_lock);
+ lockdep_assert_rq_held(task_rq(p));
+
+ if (SCX_HAS_OP(sch, exit_task))
+ SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args);
+}
+
static void scx_disable_and_exit_task(struct scx_sched *sch,
struct task_struct *p)
{
@@ -3639,11 +3702,12 @@ static void scx_disable_and_exit_task(struct scx_sched *sch,
/*
* If set, @p exited between __scx_init_task() and scx_enable_task() in
* scx_sub_enable() and is initialized for both the associated sched and
- * its parent. Disable and exit for the child too.
+ * its parent. Exit for the child too - scx_enable_task() never ran for
+ * it, so undo only init_task.
*/
- if ((p->scx.flags & SCX_TASK_SUB_INIT) &&
- !WARN_ON_ONCE(!scx_enabling_sub_sched)) {
- __scx_disable_and_exit_task(scx_enabling_sub_sched, p);
+ if (p->scx.flags & SCX_TASK_SUB_INIT) {
+ if (!WARN_ON_ONCE(!scx_enabling_sub_sched))
+ scx_sub_init_cancel_task(scx_enabling_sub_sched, p);
p->scx.flags &= ~SCX_TASK_SUB_INIT;
}
@@ -4324,9 +4388,10 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
void scx_group_set_weight(struct task_group *tg, unsigned long weight)
{
- struct scx_sched *sch = scx_root;
+ struct scx_sched *sch;
percpu_down_read(&scx_cgroup_ops_rwsem);
+ sch = scx_root;
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) &&
tg->scx.weight != weight)
@@ -4339,9 +4404,10 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight)
void scx_group_set_idle(struct task_group *tg, bool idle)
{
- struct scx_sched *sch = scx_root;
+ struct scx_sched *sch;
percpu_down_read(&scx_cgroup_ops_rwsem);
+ sch = scx_root;
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle))
SCX_CALL_OP(sch, cgroup_set_idle, NULL, tg_cgrp(tg), idle);
@@ -4355,9 +4421,10 @@ void scx_group_set_idle(struct task_group *tg, bool idle)
void scx_group_set_bandwidth(struct task_group *tg,
u64 period_us, u64 quota_us, u64 burst_us)
{
- struct scx_sched *sch = scx_root;
+ struct scx_sched *sch;
percpu_down_read(&scx_cgroup_ops_rwsem);
+ sch = scx_root;
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) &&
(tg->scx.bw_period_us != period_us ||
@@ -4380,21 +4447,6 @@ static struct cgroup *root_cgroup(void)
return &cgrp_dfl_root.cgrp;
}
-static struct cgroup *sch_cgroup(struct scx_sched *sch)
-{
- return sch->cgrp;
-}
-
-/* for each descendant of @cgrp including self, set ->scx_sched to @sch */
-static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch)
-{
- struct cgroup *pos;
- struct cgroup_subsys_state *css;
-
- cgroup_for_each_live_descendant_pre(pos, css, cgrp)
- rcu_assign_pointer(pos->scx_sched, sch);
-}
-
static void scx_cgroup_lock(void)
{
#ifdef CONFIG_EXT_GROUP_SCHED
@@ -4412,12 +4464,30 @@ static void scx_cgroup_unlock(void)
}
#else /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */
static struct cgroup *root_cgroup(void) { return NULL; }
-static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; }
-static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {}
static void scx_cgroup_lock(void) {}
static void scx_cgroup_unlock(void) {}
#endif /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */
+#ifdef CONFIG_EXT_SUB_SCHED
+static struct cgroup *sch_cgroup(struct scx_sched *sch)
+{
+ return sch->cgrp;
+}
+
+/* for each descendant of @cgrp including self, set ->scx_sched to @sch */
+static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch)
+{
+ struct cgroup *pos;
+ struct cgroup_subsys_state *css;
+
+ cgroup_for_each_live_descendant_pre(pos, css, cgrp)
+ rcu_assign_pointer(pos->scx_sched, sch);
+}
+#else /* CONFIG_EXT_SUB_SCHED */
+static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; }
+static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {}
+#endif /* CONFIG_EXT_SUB_SCHED */
+
/*
* Omitted operations:
*
@@ -4712,6 +4782,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
irq_work_sync(&sch->disable_irq_work);
kthread_destroy_worker(sch->helper);
timer_shutdown_sync(&sch->bypass_lb_timer);
+ free_cpumask_var(sch->bypass_lb_donee_cpumask);
+ free_cpumask_var(sch->bypass_lb_resched_cpumask);
#ifdef CONFIG_EXT_SUB_SCHED
kfree(sch->cgrp_path);
@@ -4938,6 +5010,25 @@ void scx_softlockup(u32 dur_s)
smp_processor_id(), dur_s);
}
+/*
+ * scx_hardlockup() runs from NMI and eventually calls scx_claim_exit(),
+ * which takes scx_sched_lock. scx_sched_lock isn't NMI-safe and grabbing
+ * it from NMI context can lead to deadlocks. Defer via irq_work; the
+ * disable path runs off irq_work anyway.
+ */
+static atomic_t scx_hardlockup_cpu = ATOMIC_INIT(-1);
+
+static void scx_hardlockup_irq_workfn(struct irq_work *work)
+{
+ int cpu = atomic_xchg(&scx_hardlockup_cpu, -1);
+
+ if (cpu >= 0 && handle_lockup("hard lockup - CPU %d", cpu))
+ printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n",
+ cpu);
+}
+
+static DEFINE_IRQ_WORK(scx_hardlockup_irq_work, scx_hardlockup_irq_workfn);
+
/**
* scx_hardlockup - sched_ext hardlockup handler
*
@@ -4946,17 +5037,19 @@ void scx_softlockup(u32 dur_s)
* Try kicking out the current scheduler in an attempt to recover the system to
* a good state before taking more drastic actions.
*
- * Returns %true if sched_ext is enabled and abort was initiated, which may
- * resolve the reported hardlockup. %false if sched_ext is not enabled or
- * someone else already initiated abort.
+ * Queues an irq_work; the handle_lockup() call happens in IRQ context (see
+ * scx_hardlockup_irq_workfn).
+ *
+ * Returns %true if sched_ext is enabled and the work was queued, %false
+ * otherwise.
*/
bool scx_hardlockup(int cpu)
{
- if (!handle_lockup("hard lockup - CPU %d", cpu))
+ if (!rcu_access_pointer(scx_root))
return false;
- printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n",
- cpu);
+ atomic_cmpxchg(&scx_hardlockup_cpu, -1, cpu);
+ irq_work_queue(&scx_hardlockup_irq_work);
return true;
}
@@ -5000,6 +5093,15 @@ resume:
if (cpumask_empty(donee_mask))
break;
+ /*
+ * If an earlier pass placed @p on @donor_dsq from a different
+ * CPU and the donee hasn't consumed it yet, @p is still on the
+ * previous CPU and task_rq(@p) != @donor_rq. @p can't be moved
+ * without its rq locked. Skip.
+ */
+ if (task_rq(p) != donor_rq)
+ continue;
+
donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr);
if (donee >= nr_cpu_ids)
continue;
@@ -5058,8 +5160,8 @@ resume:
static void bypass_lb_node(struct scx_sched *sch, int node)
{
const struct cpumask *node_mask = cpumask_of_node(node);
- struct cpumask *donee_mask = scx_bypass_lb_donee_cpumask;
- struct cpumask *resched_mask = scx_bypass_lb_resched_cpumask;
+ struct cpumask *donee_mask = sch->bypass_lb_donee_cpumask;
+ struct cpumask *resched_mask = sch->bypass_lb_resched_cpumask;
u32 nr_tasks = 0, nr_cpus = 0, nr_balanced = 0;
u32 nr_target, nr_donor_target;
u32 before_min = U32_MAX, before_max = 0;
@@ -5698,6 +5800,8 @@ static void scx_sub_disable(struct scx_sched *sch)
if (sch->ops.exit)
SCX_CALL_OP(sch, exit, NULL, sch->exit_info);
+ if (sch->sub_kset)
+ kset_unregister(sch->sub_kset);
kobject_del(&sch->kobj);
}
#else /* CONFIG_EXT_SUB_SCHED */
@@ -5829,6 +5933,10 @@ static void scx_root_disable(struct scx_sched *sch)
* could observe an object of the same name still in the hierarchy when
* the next scheduler is loaded.
*/
+#ifdef CONFIG_EXT_SUB_SCHED
+ if (sch->sub_kset)
+ kset_unregister(sch->sub_kset);
+#endif
kobject_del(&sch->kobj);
free_kick_syncs();
@@ -5921,6 +6029,25 @@ static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind)
irq_work_queue(&sch->disable_irq_work);
}
+/**
+ * scx_flush_disable_work - flush the disable work and wait for it to finish
+ * @sch: the scheduler
+ *
+ * sch->disable_work might not be queued yet, causing kthread_flush_work()
+ * to be a no-op. Syncing the irq_work first is required to guarantee the
+ * kthread work has been queued before waiting for it.
+ */
+static void scx_flush_disable_work(struct scx_sched *sch)
+{
+ int kind;
+
+ do {
+ irq_work_sync(&sch->disable_irq_work);
+ kthread_flush_work(&sch->disable_work);
+ kind = atomic_read(&sch->exit_kind);
+ } while (kind != SCX_EXIT_NONE && kind != SCX_EXIT_DONE);
+}
+
static void dump_newline(struct seq_buf *s)
{
trace_sched_ext_dump("");
@@ -6032,9 +6159,8 @@ static void ops_dump_exit(void)
scx_dump_data.cpu = -1;
}
-static void scx_dump_task(struct scx_sched *sch,
- struct seq_buf *s, struct scx_dump_ctx *dctx,
- struct task_struct *p, char marker)
+static void scx_dump_task(struct scx_sched *sch, struct seq_buf *s, struct scx_dump_ctx *dctx,
+ struct rq *rq, struct task_struct *p, char marker)
{
static unsigned long bt[SCX_EXIT_BT_LEN];
struct scx_sched *task_sch = scx_task_sched(p);
@@ -6075,7 +6201,7 @@ static void scx_dump_task(struct scx_sched *sch,
if (SCX_HAS_OP(sch, dump_task)) {
ops_dump_init(s, " ");
- SCX_CALL_OP(sch, dump_task, NULL, dctx, p);
+ SCX_CALL_OP(sch, dump_task, rq, dctx, p);
ops_dump_exit();
}
@@ -6199,8 +6325,7 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei,
used = seq_buf_used(&ns);
if (SCX_HAS_OP(sch, dump_cpu)) {
ops_dump_init(&ns, " ");
- SCX_CALL_OP(sch, dump_cpu, NULL,
- &dctx, cpu, idle);
+ SCX_CALL_OP(sch, dump_cpu, rq, &dctx, cpu, idle);
ops_dump_exit();
}
@@ -6223,11 +6348,11 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei,
if (rq->curr->sched_class == &ext_sched_class &&
(dump_all_tasks || scx_task_on_sched(sch, rq->curr)))
- scx_dump_task(sch, &s, &dctx, rq->curr, '*');
+ scx_dump_task(sch, &s, &dctx, rq, rq->curr, '*');
list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node)
if (dump_all_tasks || scx_task_on_sched(sch, p))
- scx_dump_task(sch, &s, &dctx, p, ' ');
+ scx_dump_task(sch, &s, &dctx, rq, p, ' ');
next:
rq_unlock_irqrestore(rq, &rf);
}
@@ -6437,6 +6562,15 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
init_irq_work(&sch->disable_irq_work, scx_disable_irq_workfn);
kthread_init_work(&sch->disable_work, scx_disable_workfn);
timer_setup(&sch->bypass_lb_timer, scx_bypass_lb_timerfn, 0);
+
+ if (!alloc_cpumask_var(&sch->bypass_lb_donee_cpumask, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto err_stop_helper;
+ }
+ if (!alloc_cpumask_var(&sch->bypass_lb_resched_cpumask, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto err_free_lb_cpumask;
+ }
sch->ops = *ops;
rcu_assign_pointer(ops->priv, sch);
@@ -6446,14 +6580,14 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
char *buf = kzalloc(PATH_MAX, GFP_KERNEL);
if (!buf) {
ret = -ENOMEM;
- goto err_stop_helper;
+ goto err_free_lb_resched;
}
cgroup_path(cgrp, buf, PATH_MAX);
sch->cgrp_path = kstrdup(buf, GFP_KERNEL);
kfree(buf);
if (!sch->cgrp_path) {
ret = -ENOMEM;
- goto err_stop_helper;
+ goto err_free_lb_resched;
}
sch->cgrp = cgrp;
@@ -6488,10 +6622,12 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
#endif /* CONFIG_EXT_SUB_SCHED */
return sch;
-#ifdef CONFIG_EXT_SUB_SCHED
+err_free_lb_resched:
+ free_cpumask_var(sch->bypass_lb_resched_cpumask);
+err_free_lb_cpumask:
+ free_cpumask_var(sch->bypass_lb_donee_cpumask);
err_stop_helper:
kthread_destroy_worker(sch->helper);
-#endif
err_free_pcpu:
for_each_possible_cpu(cpu) {
if (cpu == bypass_fail_cpu)
@@ -6510,7 +6646,7 @@ err_free_ei:
err_free_sch:
kfree(sch);
err_put_cgrp:
-#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED)
+#ifdef CONFIG_EXT_SUB_SCHED
cgroup_put(cgrp);
#endif
return ERR_PTR(ret);
@@ -6601,7 +6737,7 @@ static void scx_root_enable_workfn(struct kthread_work *work)
if (ret)
goto err_unlock;
-#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED)
+#ifdef CONFIG_EXT_SUB_SCHED
cgroup_get(cgrp);
#endif
sch = scx_alloc_and_add_sched(ops, cgrp, NULL);
@@ -6639,8 +6775,10 @@ static void scx_root_enable_workfn(struct kthread_work *work)
rcu_assign_pointer(scx_root, sch);
ret = scx_link_sched(sch);
- if (ret)
+ if (ret) {
+ cpus_read_unlock();
goto err_disable;
+ }
scx_idle_enable(ops);
@@ -6821,7 +6959,7 @@ err_disable:
* completion. sch's base reference will be put by bpf_scx_unreg().
*/
scx_error(sch, "scx_root_enable() failed (%d)", ret);
- kthread_flush_work(&sch->disable_work);
+ scx_flush_disable_work(sch);
cmd->ret = 0;
}
@@ -7072,23 +7210,30 @@ out_unlock:
abort:
put_task_struct(p);
scx_task_iter_stop(&sti);
- scx_enabling_sub_sched = NULL;
+ /*
+ * Undo __scx_init_task() for tasks we marked. scx_enable_task() never
+ * ran for @sch on them, so calling scx_disable_task() here would invoke
+ * ops.disable() without a matching ops.enable(). scx_enabling_sub_sched
+ * must stay set until SUB_INIT is cleared from every marked task -
+ * scx_disable_and_exit_task() reads it when a task exits concurrently.
+ */
scx_task_iter_start(&sti, sch->cgrp);
while ((p = scx_task_iter_next_locked(&sti))) {
if (p->scx.flags & SCX_TASK_SUB_INIT) {
- __scx_disable_and_exit_task(sch, p);
+ scx_sub_init_cancel_task(sch, p);
p->scx.flags &= ~SCX_TASK_SUB_INIT;
}
}
scx_task_iter_stop(&sti);
+ scx_enabling_sub_sched = NULL;
err_unlock_and_disable:
/* we'll soon enter disable path, keep bypass on */
scx_cgroup_unlock();
percpu_up_write(&scx_fork_rwsem);
err_disable:
mutex_unlock(&scx_enable_mutex);
- kthread_flush_work(&sch->disable_work);
+ scx_flush_disable_work(sch);
cmd->ret = 0;
}
@@ -7349,7 +7494,7 @@ static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
struct scx_sched *sch = rcu_dereference_protected(ops->priv, true);
scx_disable(sch, SCX_EXIT_UNREG);
- kthread_flush_work(&sch->disable_work);
+ scx_flush_disable_work(sch);
RCU_INIT_POINTER(ops->priv, NULL);
kobject_put(&sch->kobj);
}
@@ -8033,12 +8178,22 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
struct task_struct *p, u64 dsq_id, u64 enq_flags)
{
struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq;
- struct scx_sched *sch = src_dsq->sched;
+ struct scx_sched *sch;
struct rq *this_rq, *src_rq, *locked_rq;
bool dispatched = false;
bool in_balance;
unsigned long flags;
+ /*
+ * The verifier considers an iterator slot initialized on any
+ * KF_ITER_NEW return, so a BPF program may legally reach here after
+ * bpf_iter_scx_dsq_new() failed and left @kit->dsq NULL.
+ */
+ if (unlikely(!src_dsq))
+ return false;
+
+ sch = src_dsq->sched;
+
if (!scx_vet_enq_flags(sch, dsq_id, &enq_flags))
return false;
@@ -8526,7 +8681,7 @@ __bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice,
guard(rcu)();
sch = scx_prog_sched(aux);
- if (unlikely(!scx_task_on_sched(sch, p)))
+ if (unlikely(!sch || !scx_task_on_sched(sch, p)))
return false;
p->scx.slice = slice;
@@ -8549,7 +8704,7 @@ __bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime,
guard(rcu)();
sch = scx_prog_sched(aux);
- if (unlikely(!scx_task_on_sched(sch, p)))
+ if (unlikely(!sch || !scx_task_on_sched(sch, p)))
return false;
p->scx.dsq_vtime = vtime;
@@ -8633,11 +8788,12 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags, const struct bpf_prog_aux
/**
* scx_bpf_dsq_nr_queued - Return the number of queued tasks
* @dsq_id: id of the DSQ
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Return the number of tasks in the DSQ matching @dsq_id. If not found,
* -%ENOENT is returned.
*/
-__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id)
+__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id, const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
struct scx_dispatch_q *dsq;
@@ -8645,7 +8801,7 @@ __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id)
preempt_disable();
- sch = rcu_dereference_sched(scx_root);
+ sch = scx_prog_sched(aux);
if (unlikely(!sch)) {
ret = -ENODEV;
goto out;
@@ -8677,21 +8833,21 @@ out:
/**
* scx_bpf_destroy_dsq - Destroy a custom DSQ
* @dsq_id: DSQ to destroy
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Destroy the custom DSQ identified by @dsq_id. Only DSQs created with
* scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is
* empty and no further tasks are dispatched to it. Ignored if called on a DSQ
* which doesn't exist. Can be called from any online scx_ops operations.
*/
-__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id)
+__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id, const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
- rcu_read_lock();
- sch = rcu_dereference(scx_root);
+ guard(rcu)();
+ sch = scx_prog_sched(aux);
if (sch)
destroy_dsq(sch, dsq_id);
- rcu_read_unlock();
}
/**
@@ -9445,8 +9601,8 @@ BTF_KFUNCS_START(scx_kfunc_ids_any)
BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_IMPLICIT_ARGS | KF_RCU);
BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_IMPLICIT_ARGS | KF_RCU);
BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS)
-BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued)
-BTF_ID_FLAGS(func, scx_bpf_destroy_dsq)
+BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_destroy_dsq, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_IMPLICIT_ARGS | KF_RCU_PROTECTED | KF_RET_NULL)
BTF_ID_FLAGS(func, scx_bpf_dsq_reenq, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2, KF_IMPLICIT_ARGS)
@@ -9479,6 +9635,7 @@ BTF_KFUNCS_END(scx_kfunc_ids_any)
static const struct btf_kfunc_id_set scx_kfunc_set_any = {
.owner = THIS_MODULE,
.set = &scx_kfunc_ids_any,
+ .filter = scx_kfunc_context_filter,
};
/*
@@ -9526,13 +9683,12 @@ static const u32 scx_kf_allow_flags[] = {
};
/*
- * Verifier-time filter for context-sensitive SCX kfuncs. Registered via the
- * .filter field on each per-group btf_kfunc_id_set. The BPF core invokes this
- * for every kfunc call in the registered hook (BPF_PROG_TYPE_STRUCT_OPS or
+ * Verifier-time filter for SCX kfuncs. Registered via the .filter field on
+ * each per-group btf_kfunc_id_set. The BPF core invokes this for every kfunc
+ * call in the registered hook (BPF_PROG_TYPE_STRUCT_OPS or
* BPF_PROG_TYPE_SYSCALL), regardless of which set originally introduced the
- * kfunc - so the filter must short-circuit on kfuncs it doesn't govern (e.g.
- * scx_kfunc_ids_any) by falling through to "allow" when none of the
- * context-sensitive sets contain the kfunc.
+ * kfunc - so the filter must short-circuit on kfuncs it doesn't govern by
+ * falling through to "allow" when none of the SCX sets contain the kfunc.
*/
int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id)
{
@@ -9541,18 +9697,21 @@ int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id)
bool in_enqueue = btf_id_set8_contains(&scx_kfunc_ids_enqueue_dispatch, kfunc_id);
bool in_dispatch = btf_id_set8_contains(&scx_kfunc_ids_dispatch, kfunc_id);
bool in_cpu_release = btf_id_set8_contains(&scx_kfunc_ids_cpu_release, kfunc_id);
+ bool in_idle = btf_id_set8_contains(&scx_kfunc_ids_idle, kfunc_id);
+ bool in_any = btf_id_set8_contains(&scx_kfunc_ids_any, kfunc_id);
u32 moff, flags;
- /* Not a context-sensitive kfunc (e.g. from scx_kfunc_ids_any) - allow. */
- if (!(in_unlocked || in_select_cpu || in_enqueue || in_dispatch || in_cpu_release))
+ /* Not an SCX kfunc - allow. */
+ if (!(in_unlocked || in_select_cpu || in_enqueue || in_dispatch ||
+ in_cpu_release || in_idle || in_any))
return 0;
/* SYSCALL progs (e.g. BPF test_run()) may call unlocked and select_cpu kfuncs. */
if (prog->type == BPF_PROG_TYPE_SYSCALL)
- return (in_unlocked || in_select_cpu) ? 0 : -EACCES;
+ return (in_unlocked || in_select_cpu || in_idle || in_any) ? 0 : -EACCES;
if (prog->type != BPF_PROG_TYPE_STRUCT_OPS)
- return -EACCES;
+ return (in_any || in_idle) ? 0 : -EACCES;
/*
* add_subprog_and_kfunc() collects all kfunc calls, including dead code
@@ -9565,14 +9724,15 @@ int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id)
return 0;
/*
- * Non-SCX struct_ops: only unlocked kfuncs are safe. The other
- * context-sensitive kfuncs assume the rq lock is held by the SCX
- * dispatch path, which doesn't apply to other struct_ops users.
+ * Non-SCX struct_ops: SCX kfuncs are not permitted.
*/
if (prog->aux->st_ops != &bpf_sched_ext_ops)
- return in_unlocked ? 0 : -EACCES;
+ return -EACCES;
/* SCX struct_ops: check the per-op allow list. */
+ if (in_any || in_idle)
+ return 0;
+
moff = prog->aux->attach_st_ops_member_off;
flags = scx_kf_allow_flags[SCX_MOFF_IDX(moff)];
@@ -9656,12 +9816,6 @@ static int __init scx_init(void)
return ret;
}
- if (!alloc_cpumask_var(&scx_bypass_lb_donee_cpumask, GFP_KERNEL) ||
- !alloc_cpumask_var(&scx_bypass_lb_resched_cpumask, GFP_KERNEL)) {
- pr_err("sched_ext: Failed to allocate cpumasks\n");
- return -ENOMEM;
- }
-
return 0;
}
__initcall(scx_init);
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index 443d12a3df67..7468560a6d80 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -927,14 +927,24 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p,
* Accessing p->cpus_ptr / p->nr_cpus_allowed needs either @p's rq
* lock or @p's pi_lock. Three cases:
*
- * - inside ops.select_cpu(): try_to_wake_up() holds @p's pi_lock.
+ * - inside ops.select_cpu(): try_to_wake_up() holds the wake-up
+ * task's pi_lock; the wake-up task is recorded in kf_tasks[0]
+ * by SCX_CALL_OP_TASK_RET().
* - other rq-locked SCX op: scx_locked_rq() points at the held rq.
* - truly unlocked (UNLOCKED ops, SYSCALL, non-SCX struct_ops):
* nothing held, take pi_lock ourselves.
+ *
+ * In the first two cases, BPF schedulers may pass an arbitrary task
+ * that the held lock doesn't cover. Refuse those.
*/
if (this_rq()->scx.in_select_cpu) {
+ if (!scx_kf_arg_task_ok(sch, p))
+ return -EINVAL;
lockdep_assert_held(&p->pi_lock);
- } else if (!scx_locked_rq()) {
+ } else if (scx_locked_rq()) {
+ if (task_rq(p) != scx_locked_rq())
+ goto cross_task;
+ } else {
raw_spin_lock_irqsave(&p->pi_lock, irq_flags);
we_locked = true;
}
@@ -960,6 +970,11 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p,
raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags);
return cpu;
+
+cross_task:
+ scx_error(sch, "select_cpu kfunc called cross-task on %s[%d]",
+ p->comm, p->pid);
+ return -EINVAL;
}
/**
@@ -1467,6 +1482,7 @@ BTF_KFUNCS_END(scx_kfunc_ids_idle)
static const struct btf_kfunc_id_set scx_kfunc_set_idle = {
.owner = THIS_MODULE,
.set = &scx_kfunc_ids_idle,
+ .filter = scx_kfunc_context_filter,
};
/*
diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h
index dc35f850481e..8d169d3bbdf9 100644
--- a/kernel/sched/ext_idle.h
+++ b/kernel/sched/ext_idle.h
@@ -12,6 +12,7 @@
struct sched_ext_ops;
+extern struct btf_id_set8 scx_kfunc_ids_idle;
extern struct btf_id_set8 scx_kfunc_ids_select_cpu;
void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops);
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 62ce4eaf6a3f..a075732d4430 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1075,6 +1075,8 @@ struct scx_sched {
struct irq_work disable_irq_work;
struct kthread_work disable_work;
struct timer_list bypass_lb_timer;
+ cpumask_var_t bypass_lb_donee_cpumask;
+ cpumask_var_t bypass_lb_resched_cpumask;
struct rcu_work rcu_work;
/* all ancestors including self */