diff options
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/sched/ext.c | 398 | ||||
| -rw-r--r-- | kernel/sched/ext_idle.c | 20 | ||||
| -rw-r--r-- | kernel/sched/ext_idle.h | 1 | ||||
| -rw-r--r-- | kernel/sched/ext_internal.h | 2 |
4 files changed, 297 insertions, 124 deletions
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index e426e27b6794..345aa11b84b2 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -32,6 +32,7 @@ static const struct rhashtable_params scx_sched_hash_params = { .key_len = sizeof_field(struct scx_sched, ops.sub_cgroup_id), .key_offset = offsetof(struct scx_sched, ops.sub_cgroup_id), .head_offset = offsetof(struct scx_sched, hash_node), + .insecure_elasticity = true, /* inserted under scx_sched_lock */ }; static struct rhashtable scx_sched_hash; @@ -52,8 +53,6 @@ DEFINE_STATIC_KEY_FALSE(__scx_enabled); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED); static DEFINE_RAW_SPINLOCK(scx_bypass_lock); -static cpumask_var_t scx_bypass_lb_donee_cpumask; -static cpumask_var_t scx_bypass_lb_resched_cpumask; static bool scx_init_task_enabled; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); @@ -469,24 +468,35 @@ static inline void update_locked_rq(struct rq *rq) __this_cpu_write(scx_locked_rq_state, rq); } -#define SCX_CALL_OP(sch, op, rq, args...) \ +/* + * SCX ops can recurse via scx_bpf_sub_dispatch() - the inner call must not + * clobber the outer's scx_locked_rq_state. Save it on entry, restore on exit. + */ +#define SCX_CALL_OP(sch, op, locked_rq, args...) \ do { \ - if (rq) \ - update_locked_rq(rq); \ + struct rq *__prev_locked_rq; \ + \ + if (locked_rq) { \ + __prev_locked_rq = scx_locked_rq(); \ + update_locked_rq(locked_rq); \ + } \ (sch)->ops.op(args); \ - if (rq) \ - update_locked_rq(NULL); \ + if (locked_rq) \ + update_locked_rq(__prev_locked_rq); \ } while (0) -#define SCX_CALL_OP_RET(sch, op, rq, args...) \ +#define SCX_CALL_OP_RET(sch, op, locked_rq, args...) \ ({ \ + struct rq *__prev_locked_rq; \ __typeof__((sch)->ops.op(args)) __ret; \ \ - if (rq) \ - update_locked_rq(rq); \ + if (locked_rq) { \ + __prev_locked_rq = scx_locked_rq(); \ + update_locked_rq(locked_rq); \ + } \ __ret = (sch)->ops.op(args); \ - if (rq) \ - update_locked_rq(NULL); \ + if (locked_rq) \ + update_locked_rq(__prev_locked_rq); \ __ret; \ }) @@ -498,39 +508,39 @@ do { \ * those subject tasks. * * Every SCX_CALL_OP_TASK*() call site invokes its op with @p's rq lock held - - * either via the @rq argument here, or (for ops.select_cpu()) via @p's pi_lock - * held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu. So if - * kf_tasks[] is set, @p's scheduler-protected fields are stable. + * either via the @locked_rq argument here, or (for ops.select_cpu()) via @p's + * pi_lock held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu. + * So if kf_tasks[] is set, @p's scheduler-protected fields are stable. * * kf_tasks[] can not stack, so task-based SCX ops must not nest. The * WARN_ON_ONCE() in each macro catches a re-entry of any of the three variants * while a previous one is still in progress. */ -#define SCX_CALL_OP_TASK(sch, op, rq, task, args...) \ +#define SCX_CALL_OP_TASK(sch, op, locked_rq, task, args...) \ do { \ WARN_ON_ONCE(current->scx.kf_tasks[0]); \ current->scx.kf_tasks[0] = task; \ - SCX_CALL_OP((sch), op, rq, task, ##args); \ + SCX_CALL_OP((sch), op, locked_rq, task, ##args); \ current->scx.kf_tasks[0] = NULL; \ } while (0) -#define SCX_CALL_OP_TASK_RET(sch, op, rq, task, args...) \ +#define SCX_CALL_OP_TASK_RET(sch, op, locked_rq, task, args...) \ ({ \ __typeof__((sch)->ops.op(task, ##args)) __ret; \ WARN_ON_ONCE(current->scx.kf_tasks[0]); \ current->scx.kf_tasks[0] = task; \ - __ret = SCX_CALL_OP_RET((sch), op, rq, task, ##args); \ + __ret = SCX_CALL_OP_RET((sch), op, locked_rq, task, ##args); \ current->scx.kf_tasks[0] = NULL; \ __ret; \ }) -#define SCX_CALL_OP_2TASKS_RET(sch, op, rq, task0, task1, args...) \ +#define SCX_CALL_OP_2TASKS_RET(sch, op, locked_rq, task0, task1, args...) \ ({ \ __typeof__((sch)->ops.op(task0, task1, ##args)) __ret; \ WARN_ON_ONCE(current->scx.kf_tasks[0]); \ current->scx.kf_tasks[0] = task0; \ current->scx.kf_tasks[1] = task1; \ - __ret = SCX_CALL_OP_RET((sch), op, rq, task0, task1, ##args); \ + __ret = SCX_CALL_OP_RET((sch), op, locked_rq, task0, task1, ##args); \ current->scx.kf_tasks[0] = NULL; \ current->scx.kf_tasks[1] = NULL; \ __ret; \ @@ -1388,18 +1398,55 @@ static void call_task_dequeue(struct scx_sched *sch, struct rq *rq, p->scx.flags &= ~SCX_TASK_IN_CUSTODY; } -static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p, - u64 enq_flags) +static void local_dsq_post_enq(struct scx_sched *sch, struct scx_dispatch_q *dsq, + struct task_struct *p, u64 enq_flags) { struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); - bool preempt = false; - call_task_dequeue(scx_root, rq, p, 0); + call_task_dequeue(sch, rq, p, 0); + + /* + * Note that @rq's lock may be dropped between this enqueue and @p + * actually getting on CPU. This gives higher-class tasks (e.g. RT) + * an opportunity to wake up on @rq and prevent @p from running. + * Here are some concrete examples: + * + * Example 1: + * + * We dispatch two tasks from a single ops.dispatch(): + * - First, a local task to this CPU's local DSQ; + * - Second, a local/remote task to a remote CPU's local DSQ. + * We must drop the local rq lock in order to finish the second + * dispatch. In that time, an RT task can wake up on the local rq. + * + * Example 2: + * + * We dispatch a local/remote task to a remote CPU's local DSQ. + * We must drop the remote rq lock before the dispatched task can run, + * which gives an RT task an opportunity to wake up on the remote rq. + * + * Both examples work the same if we replace dispatching with moving + * the tasks from a user-created DSQ. + * + * We must detect these wakeups so that we can re-enqueue IMMED tasks + * from @rq's local DSQ. scx_wakeup_preempt() serves exactly this + * purpose, but for it to be invoked, we must ensure that we bump + * @rq->next_class to &ext_sched_class if it's currently idle. + * + * wakeup_preempt() does the bumping, and since we only invoke it if + * @rq->next_class is below &ext_sched_class, it will also + * resched_curr(rq). + */ + if (sched_class_above(p->sched_class, rq->next_class)) + wakeup_preempt(rq, p, 0); /* * If @rq is in balance, the CPU is already vacant and looking for the * next task to run. No need to preempt or trigger resched after moving * @p into its local DSQ. + * Note that the wakeup_preempt() above may have already triggered + * a resched if @rq->next_class was idle. It's harmless, since + * need_resched is cleared immediately after task pick. */ if (rq->scx.flags & SCX_RQ_IN_BALANCE) return; @@ -1407,11 +1454,8 @@ static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && rq->curr->sched_class == &ext_sched_class) { rq->curr->scx.slice = 0; - preempt = true; - } - - if (preempt || sched_class_above(&ext_sched_class, rq->curr->sched_class)) resched_curr(rq); + } } static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq, @@ -1494,11 +1538,13 @@ static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq, if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) rcu_assign_pointer(dsq->first_task, p); } else { - bool was_empty; - - was_empty = list_empty(&dsq->list); + /* + * dsq->list can contain parked BPF iterator cursors, so + * list_empty() here isn't a reliable proxy for "no real + * task in the DSQ". Test dsq->first_task directly. + */ list_add_tail(&p->scx.dsq_list.node, &dsq->list); - if (was_empty && !(dsq->id & SCX_DSQ_FLAG_BUILTIN)) + if (!dsq->first_task && !(dsq->id & SCX_DSQ_FLAG_BUILTIN)) rcu_assign_pointer(dsq->first_task, p); } } @@ -1518,7 +1564,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq, * concurrently in a non-atomic way. */ if (is_local) { - local_dsq_post_enq(dsq, p, enq_flags); + local_dsq_post_enq(sch, dsq, p, enq_flags); } else { /* * Task on global/bypass DSQ: leave custody, task on @@ -2129,7 +2175,8 @@ static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_fl schedule_reenq_local(rq, 0); } -static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags, +static void move_local_task_to_local_dsq(struct scx_sched *sch, + struct task_struct *p, u64 enq_flags, struct scx_dispatch_q *src_dsq, struct rq *dst_rq) { @@ -2149,7 +2196,7 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags, dsq_inc_nr(dst_dsq, p, enq_flags); p->scx.dsq = dst_dsq; - local_dsq_post_enq(dst_dsq, p, enq_flags); + local_dsq_post_enq(sch, dst_dsq, p, enq_flags); } /** @@ -2370,7 +2417,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch, /* @p is going from a non-local DSQ to a local DSQ */ if (src_rq == dst_rq) { task_unlink_from_dsq(p, src_dsq); - move_local_task_to_local_dsq(p, enq_flags, + move_local_task_to_local_dsq(sch, p, enq_flags, src_dsq, dst_rq); raw_spin_unlock(&src_dsq->lock); } else { @@ -2423,7 +2470,7 @@ retry: if (rq == task_rq) { task_unlink_from_dsq(p, dsq); - move_local_task_to_local_dsq(p, enq_flags, dsq, rq); + move_local_task_to_local_dsq(sch, p, enq_flags, dsq, rq); raw_spin_unlock(&dsq->lock); return true; } @@ -3183,7 +3230,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, if (sch_a == sch_b && SCX_HAS_OP(sch_a, core_sched_before) && !scx_bypassing(sch_a, task_cpu(a))) return SCX_CALL_OP_2TASKS_RET(sch_a, core_sched_before, - NULL, + task_rq(a), (struct task_struct *)a, (struct task_struct *)b); else @@ -3631,6 +3678,22 @@ static void __scx_disable_and_exit_task(struct scx_sched *sch, SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args); } +/* + * Undo a completed __scx_init_task(sch, p, false) when scx_enable_task() never + * ran. The task state has not been transitioned, so this mirrors the + * SCX_TASK_INIT branch in __scx_disable_and_exit_task(). + */ +static void scx_sub_init_cancel_task(struct scx_sched *sch, struct task_struct *p) +{ + struct scx_exit_task_args args = { .cancelled = true }; + + lockdep_assert_held(&p->pi_lock); + lockdep_assert_rq_held(task_rq(p)); + + if (SCX_HAS_OP(sch, exit_task)) + SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args); +} + static void scx_disable_and_exit_task(struct scx_sched *sch, struct task_struct *p) { @@ -3639,11 +3702,12 @@ static void scx_disable_and_exit_task(struct scx_sched *sch, /* * If set, @p exited between __scx_init_task() and scx_enable_task() in * scx_sub_enable() and is initialized for both the associated sched and - * its parent. Disable and exit for the child too. + * its parent. Exit for the child too - scx_enable_task() never ran for + * it, so undo only init_task. */ - if ((p->scx.flags & SCX_TASK_SUB_INIT) && - !WARN_ON_ONCE(!scx_enabling_sub_sched)) { - __scx_disable_and_exit_task(scx_enabling_sub_sched, p); + if (p->scx.flags & SCX_TASK_SUB_INIT) { + if (!WARN_ON_ONCE(!scx_enabling_sub_sched)) + scx_sub_init_cancel_task(scx_enabling_sub_sched, p); p->scx.flags &= ~SCX_TASK_SUB_INIT; } @@ -4324,9 +4388,10 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) void scx_group_set_weight(struct task_group *tg, unsigned long weight) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch; percpu_down_read(&scx_cgroup_ops_rwsem); + sch = scx_root; if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) && tg->scx.weight != weight) @@ -4339,9 +4404,10 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) void scx_group_set_idle(struct task_group *tg, bool idle) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch; percpu_down_read(&scx_cgroup_ops_rwsem); + sch = scx_root; if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle)) SCX_CALL_OP(sch, cgroup_set_idle, NULL, tg_cgrp(tg), idle); @@ -4355,9 +4421,10 @@ void scx_group_set_idle(struct task_group *tg, bool idle) void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch; percpu_down_read(&scx_cgroup_ops_rwsem); + sch = scx_root; if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) && (tg->scx.bw_period_us != period_us || @@ -4380,21 +4447,6 @@ static struct cgroup *root_cgroup(void) return &cgrp_dfl_root.cgrp; } -static struct cgroup *sch_cgroup(struct scx_sched *sch) -{ - return sch->cgrp; -} - -/* for each descendant of @cgrp including self, set ->scx_sched to @sch */ -static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) -{ - struct cgroup *pos; - struct cgroup_subsys_state *css; - - cgroup_for_each_live_descendant_pre(pos, css, cgrp) - rcu_assign_pointer(pos->scx_sched, sch); -} - static void scx_cgroup_lock(void) { #ifdef CONFIG_EXT_GROUP_SCHED @@ -4412,12 +4464,30 @@ static void scx_cgroup_unlock(void) } #else /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ static struct cgroup *root_cgroup(void) { return NULL; } -static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; } -static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {} static void scx_cgroup_lock(void) {} static void scx_cgroup_unlock(void) {} #endif /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ +#ifdef CONFIG_EXT_SUB_SCHED +static struct cgroup *sch_cgroup(struct scx_sched *sch) +{ + return sch->cgrp; +} + +/* for each descendant of @cgrp including self, set ->scx_sched to @sch */ +static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) +{ + struct cgroup *pos; + struct cgroup_subsys_state *css; + + cgroup_for_each_live_descendant_pre(pos, css, cgrp) + rcu_assign_pointer(pos->scx_sched, sch); +} +#else /* CONFIG_EXT_SUB_SCHED */ +static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; } +static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {} +#endif /* CONFIG_EXT_SUB_SCHED */ + /* * Omitted operations: * @@ -4712,6 +4782,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work) irq_work_sync(&sch->disable_irq_work); kthread_destroy_worker(sch->helper); timer_shutdown_sync(&sch->bypass_lb_timer); + free_cpumask_var(sch->bypass_lb_donee_cpumask); + free_cpumask_var(sch->bypass_lb_resched_cpumask); #ifdef CONFIG_EXT_SUB_SCHED kfree(sch->cgrp_path); @@ -4938,6 +5010,25 @@ void scx_softlockup(u32 dur_s) smp_processor_id(), dur_s); } +/* + * scx_hardlockup() runs from NMI and eventually calls scx_claim_exit(), + * which takes scx_sched_lock. scx_sched_lock isn't NMI-safe and grabbing + * it from NMI context can lead to deadlocks. Defer via irq_work; the + * disable path runs off irq_work anyway. + */ +static atomic_t scx_hardlockup_cpu = ATOMIC_INIT(-1); + +static void scx_hardlockup_irq_workfn(struct irq_work *work) +{ + int cpu = atomic_xchg(&scx_hardlockup_cpu, -1); + + if (cpu >= 0 && handle_lockup("hard lockup - CPU %d", cpu)) + printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n", + cpu); +} + +static DEFINE_IRQ_WORK(scx_hardlockup_irq_work, scx_hardlockup_irq_workfn); + /** * scx_hardlockup - sched_ext hardlockup handler * @@ -4946,17 +5037,19 @@ void scx_softlockup(u32 dur_s) * Try kicking out the current scheduler in an attempt to recover the system to * a good state before taking more drastic actions. * - * Returns %true if sched_ext is enabled and abort was initiated, which may - * resolve the reported hardlockup. %false if sched_ext is not enabled or - * someone else already initiated abort. + * Queues an irq_work; the handle_lockup() call happens in IRQ context (see + * scx_hardlockup_irq_workfn). + * + * Returns %true if sched_ext is enabled and the work was queued, %false + * otherwise. */ bool scx_hardlockup(int cpu) { - if (!handle_lockup("hard lockup - CPU %d", cpu)) + if (!rcu_access_pointer(scx_root)) return false; - printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n", - cpu); + atomic_cmpxchg(&scx_hardlockup_cpu, -1, cpu); + irq_work_queue(&scx_hardlockup_irq_work); return true; } @@ -5000,6 +5093,15 @@ resume: if (cpumask_empty(donee_mask)) break; + /* + * If an earlier pass placed @p on @donor_dsq from a different + * CPU and the donee hasn't consumed it yet, @p is still on the + * previous CPU and task_rq(@p) != @donor_rq. @p can't be moved + * without its rq locked. Skip. + */ + if (task_rq(p) != donor_rq) + continue; + donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr); if (donee >= nr_cpu_ids) continue; @@ -5058,8 +5160,8 @@ resume: static void bypass_lb_node(struct scx_sched *sch, int node) { const struct cpumask *node_mask = cpumask_of_node(node); - struct cpumask *donee_mask = scx_bypass_lb_donee_cpumask; - struct cpumask *resched_mask = scx_bypass_lb_resched_cpumask; + struct cpumask *donee_mask = sch->bypass_lb_donee_cpumask; + struct cpumask *resched_mask = sch->bypass_lb_resched_cpumask; u32 nr_tasks = 0, nr_cpus = 0, nr_balanced = 0; u32 nr_target, nr_donor_target; u32 before_min = U32_MAX, before_max = 0; @@ -5698,6 +5800,8 @@ static void scx_sub_disable(struct scx_sched *sch) if (sch->ops.exit) SCX_CALL_OP(sch, exit, NULL, sch->exit_info); + if (sch->sub_kset) + kset_unregister(sch->sub_kset); kobject_del(&sch->kobj); } #else /* CONFIG_EXT_SUB_SCHED */ @@ -5829,6 +5933,10 @@ static void scx_root_disable(struct scx_sched *sch) * could observe an object of the same name still in the hierarchy when * the next scheduler is loaded. */ +#ifdef CONFIG_EXT_SUB_SCHED + if (sch->sub_kset) + kset_unregister(sch->sub_kset); +#endif kobject_del(&sch->kobj); free_kick_syncs(); @@ -5921,6 +6029,25 @@ static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind) irq_work_queue(&sch->disable_irq_work); } +/** + * scx_flush_disable_work - flush the disable work and wait for it to finish + * @sch: the scheduler + * + * sch->disable_work might still not queued, causing kthread_flush_work() + * as a noop. Syncing the irq_work first is required to guarantee the + * kthread work has been queued before waiting for it. + */ +static void scx_flush_disable_work(struct scx_sched *sch) +{ + int kind; + + do { + irq_work_sync(&sch->disable_irq_work); + kthread_flush_work(&sch->disable_work); + kind = atomic_read(&sch->exit_kind); + } while (kind != SCX_EXIT_NONE && kind != SCX_EXIT_DONE); +} + static void dump_newline(struct seq_buf *s) { trace_sched_ext_dump(""); @@ -6032,9 +6159,8 @@ static void ops_dump_exit(void) scx_dump_data.cpu = -1; } -static void scx_dump_task(struct scx_sched *sch, - struct seq_buf *s, struct scx_dump_ctx *dctx, - struct task_struct *p, char marker) +static void scx_dump_task(struct scx_sched *sch, struct seq_buf *s, struct scx_dump_ctx *dctx, + struct rq *rq, struct task_struct *p, char marker) { static unsigned long bt[SCX_EXIT_BT_LEN]; struct scx_sched *task_sch = scx_task_sched(p); @@ -6075,7 +6201,7 @@ static void scx_dump_task(struct scx_sched *sch, if (SCX_HAS_OP(sch, dump_task)) { ops_dump_init(s, " "); - SCX_CALL_OP(sch, dump_task, NULL, dctx, p); + SCX_CALL_OP(sch, dump_task, rq, dctx, p); ops_dump_exit(); } @@ -6199,8 +6325,7 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei, used = seq_buf_used(&ns); if (SCX_HAS_OP(sch, dump_cpu)) { ops_dump_init(&ns, " "); - SCX_CALL_OP(sch, dump_cpu, NULL, - &dctx, cpu, idle); + SCX_CALL_OP(sch, dump_cpu, rq, &dctx, cpu, idle); ops_dump_exit(); } @@ -6223,11 +6348,11 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei, if (rq->curr->sched_class == &ext_sched_class && (dump_all_tasks || scx_task_on_sched(sch, rq->curr))) - scx_dump_task(sch, &s, &dctx, rq->curr, '*'); + scx_dump_task(sch, &s, &dctx, rq, rq->curr, '*'); list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) if (dump_all_tasks || scx_task_on_sched(sch, p)) - scx_dump_task(sch, &s, &dctx, p, ' '); + scx_dump_task(sch, &s, &dctx, rq, p, ' '); next: rq_unlock_irqrestore(rq, &rf); } @@ -6437,6 +6562,15 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, init_irq_work(&sch->disable_irq_work, scx_disable_irq_workfn); kthread_init_work(&sch->disable_work, scx_disable_workfn); timer_setup(&sch->bypass_lb_timer, scx_bypass_lb_timerfn, 0); + + if (!alloc_cpumask_var(&sch->bypass_lb_donee_cpumask, GFP_KERNEL)) { + ret = -ENOMEM; + goto err_stop_helper; + } + if (!alloc_cpumask_var(&sch->bypass_lb_resched_cpumask, GFP_KERNEL)) { + ret = -ENOMEM; + goto err_free_lb_cpumask; + } sch->ops = *ops; rcu_assign_pointer(ops->priv, sch); @@ -6446,14 +6580,14 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, char *buf = kzalloc(PATH_MAX, GFP_KERNEL); if (!buf) { ret = -ENOMEM; - goto err_stop_helper; + goto err_free_lb_resched; } cgroup_path(cgrp, buf, PATH_MAX); sch->cgrp_path = kstrdup(buf, GFP_KERNEL); kfree(buf); if (!sch->cgrp_path) { ret = -ENOMEM; - goto err_stop_helper; + goto err_free_lb_resched; } sch->cgrp = cgrp; @@ -6488,10 +6622,12 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, #endif /* CONFIG_EXT_SUB_SCHED */ return sch; -#ifdef CONFIG_EXT_SUB_SCHED +err_free_lb_resched: + free_cpumask_var(sch->bypass_lb_resched_cpumask); +err_free_lb_cpumask: + free_cpumask_var(sch->bypass_lb_donee_cpumask); err_stop_helper: kthread_destroy_worker(sch->helper); -#endif err_free_pcpu: for_each_possible_cpu(cpu) { if (cpu == bypass_fail_cpu) @@ -6510,7 +6646,7 @@ err_free_ei: err_free_sch: kfree(sch); err_put_cgrp: -#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED) +#ifdef CONFIG_EXT_SUB_SCHED cgroup_put(cgrp); #endif return ERR_PTR(ret); @@ -6601,7 +6737,7 @@ static void scx_root_enable_workfn(struct kthread_work *work) if (ret) goto err_unlock; -#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED) +#ifdef CONFIG_EXT_SUB_SCHED cgroup_get(cgrp); #endif sch = scx_alloc_and_add_sched(ops, cgrp, NULL); @@ -6639,8 +6775,10 @@ static void scx_root_enable_workfn(struct kthread_work *work) rcu_assign_pointer(scx_root, sch); ret = scx_link_sched(sch); - if (ret) + if (ret) { + cpus_read_unlock(); goto err_disable; + } scx_idle_enable(ops); @@ -6821,7 +6959,7 @@ err_disable: * completion. sch's base reference will be put by bpf_scx_unreg(). */ scx_error(sch, "scx_root_enable() failed (%d)", ret); - kthread_flush_work(&sch->disable_work); + scx_flush_disable_work(sch); cmd->ret = 0; } @@ -7072,23 +7210,30 @@ out_unlock: abort: put_task_struct(p); scx_task_iter_stop(&sti); - scx_enabling_sub_sched = NULL; + /* + * Undo __scx_init_task() for tasks we marked. scx_enable_task() never + * ran for @sch on them, so calling scx_disable_task() here would invoke + * ops.disable() without a matching ops.enable(). scx_enabling_sub_sched + * must stay set until SUB_INIT is cleared from every marked task - + * scx_disable_and_exit_task() reads it when a task exits concurrently. + */ scx_task_iter_start(&sti, sch->cgrp); while ((p = scx_task_iter_next_locked(&sti))) { if (p->scx.flags & SCX_TASK_SUB_INIT) { - __scx_disable_and_exit_task(sch, p); + scx_sub_init_cancel_task(sch, p); p->scx.flags &= ~SCX_TASK_SUB_INIT; } } scx_task_iter_stop(&sti); + scx_enabling_sub_sched = NULL; err_unlock_and_disable: /* we'll soon enter disable path, keep bypass on */ scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); err_disable: mutex_unlock(&scx_enable_mutex); - kthread_flush_work(&sch->disable_work); + scx_flush_disable_work(sch); cmd->ret = 0; } @@ -7349,7 +7494,7 @@ static void bpf_scx_unreg(void *kdata, struct bpf_link *link) struct scx_sched *sch = rcu_dereference_protected(ops->priv, true); scx_disable(sch, SCX_EXIT_UNREG); - kthread_flush_work(&sch->disable_work); + scx_flush_disable_work(sch); RCU_INIT_POINTER(ops->priv, NULL); kobject_put(&sch->kobj); } @@ -8033,12 +8178,22 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, struct task_struct *p, u64 dsq_id, u64 enq_flags) { struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq; - struct scx_sched *sch = src_dsq->sched; + struct scx_sched *sch; struct rq *this_rq, *src_rq, *locked_rq; bool dispatched = false; bool in_balance; unsigned long flags; + /* + * The verifier considers an iterator slot initialized on any + * KF_ITER_NEW return, so a BPF program may legally reach here after + * bpf_iter_scx_dsq_new() failed and left @kit->dsq NULL. + */ + if (unlikely(!src_dsq)) + return false; + + sch = src_dsq->sched; + if (!scx_vet_enq_flags(sch, dsq_id, &enq_flags)) return false; @@ -8526,7 +8681,7 @@ __bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice, guard(rcu)(); sch = scx_prog_sched(aux); - if (unlikely(!scx_task_on_sched(sch, p))) + if (unlikely(!sch || !scx_task_on_sched(sch, p))) return false; p->scx.slice = slice; @@ -8549,7 +8704,7 @@ __bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime, guard(rcu)(); sch = scx_prog_sched(aux); - if (unlikely(!scx_task_on_sched(sch, p))) + if (unlikely(!sch || !scx_task_on_sched(sch, p))) return false; p->scx.dsq_vtime = vtime; @@ -8633,11 +8788,12 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags, const struct bpf_prog_aux /** * scx_bpf_dsq_nr_queued - Return the number of queued tasks * @dsq_id: id of the DSQ + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Return the number of tasks in the DSQ matching @dsq_id. If not found, * -%ENOENT is returned. */ -__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) +__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id, const struct bpf_prog_aux *aux) { struct scx_sched *sch; struct scx_dispatch_q *dsq; @@ -8645,7 +8801,7 @@ __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) preempt_disable(); - sch = rcu_dereference_sched(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) { ret = -ENODEV; goto out; @@ -8677,21 +8833,21 @@ out: /** * scx_bpf_destroy_dsq - Destroy a custom DSQ * @dsq_id: DSQ to destroy + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is * empty and no further tasks are dispatched to it. Ignored if called on a DSQ * which doesn't exist. Can be called from any online scx_ops operations. */ -__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id) +__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id, const struct bpf_prog_aux *aux) { struct scx_sched *sch; - rcu_read_lock(); - sch = rcu_dereference(scx_root); + guard(rcu)(); + sch = scx_prog_sched(aux); if (sch) destroy_dsq(sch, dsq_id); - rcu_read_unlock(); } /** @@ -9445,8 +9601,8 @@ BTF_KFUNCS_START(scx_kfunc_ids_any) BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_IMPLICIT_ARGS | KF_RCU); BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_IMPLICIT_ARGS | KF_RCU); BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS) -BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) -BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) +BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_destroy_dsq, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_IMPLICIT_ARGS | KF_RCU_PROTECTED | KF_RET_NULL) BTF_ID_FLAGS(func, scx_bpf_dsq_reenq, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2, KF_IMPLICIT_ARGS) @@ -9479,6 +9635,7 @@ BTF_KFUNCS_END(scx_kfunc_ids_any) static const struct btf_kfunc_id_set scx_kfunc_set_any = { .owner = THIS_MODULE, .set = &scx_kfunc_ids_any, + .filter = scx_kfunc_context_filter, }; /* @@ -9526,13 +9683,12 @@ static const u32 scx_kf_allow_flags[] = { }; /* - * Verifier-time filter for context-sensitive SCX kfuncs. Registered via the - * .filter field on each per-group btf_kfunc_id_set. The BPF core invokes this - * for every kfunc call in the registered hook (BPF_PROG_TYPE_STRUCT_OPS or + * Verifier-time filter for SCX kfuncs. Registered via the .filter field on + * each per-group btf_kfunc_id_set. The BPF core invokes this for every kfunc + * call in the registered hook (BPF_PROG_TYPE_STRUCT_OPS or * BPF_PROG_TYPE_SYSCALL), regardless of which set originally introduced the - * kfunc - so the filter must short-circuit on kfuncs it doesn't govern (e.g. - * scx_kfunc_ids_any) by falling through to "allow" when none of the - * context-sensitive sets contain the kfunc. + * kfunc - so the filter must short-circuit on kfuncs it doesn't govern by + * falling through to "allow" when none of the SCX sets contain the kfunc. */ int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id) { @@ -9541,18 +9697,21 @@ int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id) bool in_enqueue = btf_id_set8_contains(&scx_kfunc_ids_enqueue_dispatch, kfunc_id); bool in_dispatch = btf_id_set8_contains(&scx_kfunc_ids_dispatch, kfunc_id); bool in_cpu_release = btf_id_set8_contains(&scx_kfunc_ids_cpu_release, kfunc_id); + bool in_idle = btf_id_set8_contains(&scx_kfunc_ids_idle, kfunc_id); + bool in_any = btf_id_set8_contains(&scx_kfunc_ids_any, kfunc_id); u32 moff, flags; - /* Not a context-sensitive kfunc (e.g. from scx_kfunc_ids_any) - allow. */ - if (!(in_unlocked || in_select_cpu || in_enqueue || in_dispatch || in_cpu_release)) + /* Not an SCX kfunc - allow. */ + if (!(in_unlocked || in_select_cpu || in_enqueue || in_dispatch || + in_cpu_release || in_idle || in_any)) return 0; /* SYSCALL progs (e.g. BPF test_run()) may call unlocked and select_cpu kfuncs. */ if (prog->type == BPF_PROG_TYPE_SYSCALL) - return (in_unlocked || in_select_cpu) ? 0 : -EACCES; + return (in_unlocked || in_select_cpu || in_idle || in_any) ? 0 : -EACCES; if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) - return -EACCES; + return (in_any || in_idle) ? 0 : -EACCES; /* * add_subprog_and_kfunc() collects all kfunc calls, including dead code @@ -9565,14 +9724,15 @@ int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id) return 0; /* - * Non-SCX struct_ops: only unlocked kfuncs are safe. The other - * context-sensitive kfuncs assume the rq lock is held by the SCX - * dispatch path, which doesn't apply to other struct_ops users. + * Non-SCX struct_ops: SCX kfuncs are not permitted. */ if (prog->aux->st_ops != &bpf_sched_ext_ops) - return in_unlocked ? 0 : -EACCES; + return -EACCES; /* SCX struct_ops: check the per-op allow list. */ + if (in_any || in_idle) + return 0; + moff = prog->aux->attach_st_ops_member_off; flags = scx_kf_allow_flags[SCX_MOFF_IDX(moff)]; @@ -9656,12 +9816,6 @@ static int __init scx_init(void) return ret; } - if (!alloc_cpumask_var(&scx_bypass_lb_donee_cpumask, GFP_KERNEL) || - !alloc_cpumask_var(&scx_bypass_lb_resched_cpumask, GFP_KERNEL)) { - pr_err("sched_ext: Failed to allocate cpumasks\n"); - return -ENOMEM; - } - return 0; } __initcall(scx_init); diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 443d12a3df67..7468560a6d80 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -927,14 +927,24 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, * Accessing p->cpus_ptr / p->nr_cpus_allowed needs either @p's rq * lock or @p's pi_lock. Three cases: * - * - inside ops.select_cpu(): try_to_wake_up() holds @p's pi_lock. + * - inside ops.select_cpu(): try_to_wake_up() holds the wake-up + * task's pi_lock; the wake-up task is recorded in kf_tasks[0] + * by SCX_CALL_OP_TASK_RET(). * - other rq-locked SCX op: scx_locked_rq() points at the held rq. * - truly unlocked (UNLOCKED ops, SYSCALL, non-SCX struct_ops): * nothing held, take pi_lock ourselves. + * + * In the first two cases, BPF schedulers may pass an arbitrary task + * that the held lock doesn't cover. Refuse those. */ if (this_rq()->scx.in_select_cpu) { + if (!scx_kf_arg_task_ok(sch, p)) + return -EINVAL; lockdep_assert_held(&p->pi_lock); - } else if (!scx_locked_rq()) { + } else if (scx_locked_rq()) { + if (task_rq(p) != scx_locked_rq()) + goto cross_task; + } else { raw_spin_lock_irqsave(&p->pi_lock, irq_flags); we_locked = true; } @@ -960,6 +970,11 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags); return cpu; + +cross_task: + scx_error(sch, "select_cpu kfunc called cross-task on %s[%d]", + p->comm, p->pid); + return -EINVAL; } /** @@ -1467,6 +1482,7 @@ BTF_KFUNCS_END(scx_kfunc_ids_idle) static const struct btf_kfunc_id_set scx_kfunc_set_idle = { .owner = THIS_MODULE, .set = &scx_kfunc_ids_idle, + .filter = scx_kfunc_context_filter, }; /* diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h index dc35f850481e..8d169d3bbdf9 100644 --- a/kernel/sched/ext_idle.h +++ b/kernel/sched/ext_idle.h @@ -12,6 +12,7 @@ struct sched_ext_ops; +extern struct btf_id_set8 scx_kfunc_ids_idle; extern struct btf_id_set8 scx_kfunc_ids_select_cpu; void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops); diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h index 62ce4eaf6a3f..a075732d4430 100644 --- a/kernel/sched/ext_internal.h +++ b/kernel/sched/ext_internal.h @@ -1075,6 +1075,8 @@ struct scx_sched { struct irq_work disable_irq_work; struct kthread_work disable_work; struct timer_list bypass_lb_timer; + cpumask_var_t bypass_lb_donee_cpumask; + cpumask_var_t bypass_lb_resched_cpumask; struct rcu_work rcu_work; /* all ancestors including self */ |
