kernel/sched/ext/cid.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294

/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Topological CPU IDs (cids)
 * --------------------------
 *
 * Raw cpu numbers are clumsy for sharding work and communication across
 * topology units, especially from BPF: the space can be sparse, numerical
 * closeness doesn't imply topological closeness (x86 hyperthreading often puts
 * SMT siblings far apart), and a range of cpu ids doesn't mean anything.
 * Sub-scheds make this acute - cpu allocation, revocation and other state are
 * constantly communicated across sub-scheds, and passing whole cpumasks scales
 * poorly with cpu count. cpumasks are also awkward in BPF: a variable-length
 * kernel type sized for the maximum NR_CPUS (4k), with verbose helper sequences
 * for every op.
 *
 * cids give every cpu a dense, topology-ordered id. CPUs sharing a core, LLC or
 * NUMA node get contiguous cid ranges, so a topology unit becomes a (start,
 * length) slice of cid space. Communication can pass a slice instead of a
 * cpumask, and BPF code can process, for example, a u64 word's worth of cids at
 * a time.
 *
 * The mapping is built once at root scheduler enable time by walking the
 * topology of online cpus only. Going by online cpus is out of necessity:
 * depending on the arch, topology info isn't reliably available for offline
 * cpus. The expected usage model is restarting the scheduler on hotplug events
 * so the mapping is rebuilt against the new online set. A scheduler that wants
 * to handle hotplug without a restart can provide its own cid and shard mapping
 * through the override interface.
 *
 * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
 */
#ifndef _KERNEL_SCHED_EXT_CID_H
#define _KERNEL_SCHED_EXT_CID_H

#include "internal.h"

struct scx_sched;

/*
 * Cid space (total is always num_possible_cpus()) is laid out with
 * topology-annotated cids first, then no-topo cids at the tail. The
 * topology-annotated block covers the cpus that were online when scx_cid_init()
 * ran and remains valid even after those cpus go offline. The tail block covers
 * possible-but-not-online cpus and carries all-(-1) topo info (see
 * scx_cid_topo); callers detect it via the -1 sentinels.
 *
 * See the comment above the table definitions in cid.c for the
 * memory-ordering and visibility contract.
 */
extern s16 *scx_cid_to_cpu_tbl;
extern s16 *scx_cpu_to_cid_tbl;
extern struct scx_cid_topo *scx_cid_topo;
extern struct btf_id_set8 scx_kfunc_ids_init;

void scx_cmask_clear(struct scx_cmask *m);
void scx_cmask_fill(struct scx_cmask *m);
void scx_cmask_and(struct scx_cmask *dst, const struct scx_cmask *src);
void scx_cmask_or(struct scx_cmask *dst, const struct scx_cmask *src);
void scx_cmask_or_racy(struct scx_cmask *dst, const struct scx_cmask *src);
void scx_cmask_copy(struct scx_cmask *dst, const struct scx_cmask *src);
void scx_cmask_copy_racy(struct scx_cmask *dst, const struct scx_cmask *src);
void scx_cmask_andnot(struct scx_cmask *dst, const struct scx_cmask *src);
bool scx_cmask_subset(const struct scx_cmask *sub, const struct scx_cmask *super);
bool scx_cmask_intersects(const struct scx_cmask *a, const struct scx_cmask *b);
bool scx_cmask_empty(const struct scx_cmask *m);
s32 scx_cid_init(struct scx_sched *sch);
int scx_cid_kfunc_init(void);
void scx_cpumask_to_cmask(const struct cpumask *src, struct scx_cmask *dst);

/**
 * cid_valid - Verify a cid value, to be used on ops input args
 * @sch: scx_sched to abort on error
 * @cid: cid which came from a BPF ops
 *
 * Return true if @cid is in [0, num_possible_cpus()). On failure, trigger
 * scx_error() and return false.
 */
static inline bool cid_valid(struct scx_sched *sch, s32 cid)
{
	if (likely(cid >= 0 && cid < num_possible_cpus()))
		return true;
	scx_error(sch, "invalid cid %d", cid);
	return false;
}

/**
 * __scx_cid_to_cpu - Unchecked cid->cpu table lookup
 * @cid: cid to look up. Must be in [0, num_possible_cpus()).
 *
 * Intended for callsites that have already validated @cid and that hold a
 * non-NULL @sch from scx_prog_sched() - a live sched implies the table has
 * been allocated, so no NULL check is needed here.
 */
static inline s32 __scx_cid_to_cpu(s32 cid)
{
	/* READ_ONCE pairs with WRITE_ONCE in scx_cid_arrays_alloc() */
	return READ_ONCE(scx_cid_to_cpu_tbl)[cid];
}

/**
 * __scx_cpu_to_cid - Unchecked cpu->cid table lookup
 * @cpu: cpu to look up. Must be a valid possible cpu id.
 *
 * Same usage constraints as __scx_cid_to_cpu().
 */
static inline s32 __scx_cpu_to_cid(s32 cpu)
{
	return READ_ONCE(scx_cpu_to_cid_tbl)[cpu];
}

/**
 * scx_cid_to_cpu - Translate @cid to its cpu
 * @sch: scx_sched for error reporting
 * @cid: cid to look up
 *
 * Return the cpu for @cid or a negative errno on failure. Invalid cid triggers
 * scx_error() on @sch. The cid arrays are allocated on first scheduler enable
 * and never freed, so the returned cpu is stable for the lifetime of the loaded
 * scheduler.
 */
static inline s32 scx_cid_to_cpu(struct scx_sched *sch, s32 cid)
{
	if (!cid_valid(sch, cid))
		return -EINVAL;
	return __scx_cid_to_cpu(cid);
}

/**
 * scx_cpu_to_cid - Translate @cpu to its cid
 * @sch: scx_sched for error reporting
 * @cpu: cpu to look up
 *
 * Return the cid for @cpu or a negative errno on failure. Invalid cpu triggers
 * scx_error() on @sch. Same lifetime guarantee as scx_cid_to_cpu().
 */
static inline s32 scx_cpu_to_cid(struct scx_sched *sch, s32 cpu)
{
	if (!scx_cpu_valid(sch, cpu, NULL))
		return -EINVAL;
	return __scx_cpu_to_cid(cpu);
}

/**
 * scx_is_cid_type - Test whether the active scheduler hierarchy is cid-form
 */
static inline bool scx_is_cid_type(void)
{
	return static_branch_unlikely(&__scx_is_cid_type);
}

static inline bool __scx_cmask_contains(u32 cid, const struct scx_cmask *m)
{
	return likely(cid >= m->base && cid < m->base + m->nr_cids);
}

/* Word in bits[] covering @cid. @cid must satisfy __scx_cmask_contains(). */
static inline u64 *__scx_cmask_word(u32 cid, const struct scx_cmask *m)
{
	return (u64 *)&m->bits[cid / 64 - m->base / 64];
}

/**
 * __scx_cmask_init - Initialize @m with explicit storage capacity
 * @m: cmask to initialize
 * @base: first cid of the active range
 * @nr_cids: number of cids in the active range
 * @alloc_cids: storage capacity in cids, at least @nr_cids
 *
 * Use when storage is sized larger than the initial active range. All of
 * bits[] is zeroed.
 */
static inline void __scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_cids,
				    u32 alloc_cids)
{
	if (WARN_ON_ONCE(alloc_cids < nr_cids))
		nr_cids = alloc_cids;

	m->base = base;
	m->nr_cids = nr_cids;
	m->alloc_words = SCX_CMASK_NR_WORDS(alloc_cids);
	memset(m->bits, 0, m->alloc_words * sizeof(u64));
}

/**
 * scx_cmask_init - Initialize @m on tight storage
 * @m: cmask to initialize
 * @base: first cid of the active range
 * @nr_cids: number of cids in the active range
 *
 * All of bits[] is zeroed.
 */
static inline void scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_cids)
{
	__scx_cmask_init(m, base, nr_cids, nr_cids);
}

/**
 * scx_cmask_reframe - Reshape @m's active range without resizing storage
 * @m: cmask to reframe
 * @base: new active range base
 * @nr_cids: new active range length, must fit within @m->alloc_words
 *
 * Body bits within the new range become garbage - only the head and tail
 * words are zeroed to keep the padding invariant.
 */
static inline void scx_cmask_reframe(struct scx_cmask *m, u32 base, u32 nr_cids)
{
	if (WARN_ON_ONCE(SCX_CMASK_NR_WORDS(nr_cids) > m->alloc_words))
		return;

	if (nr_cids) {
		u32 last_word = ((base & 63) + nr_cids - 1) / 64;

		m->bits[0] = 0;
		m->bits[last_word] = 0;
	}

	m->base = base;
	m->nr_cids = nr_cids;
}

static inline void __scx_cmask_set(u32 cid, struct scx_cmask *m)
{
	if (!__scx_cmask_contains(cid, m))
		return;
	*__scx_cmask_word(cid, m) |= BIT_U64(cid & 63);
}

/**
 * scx_cmask_test - test whether @cid is set in @m
 * @cid: cid to test
 * @m: cmask to test
 *
 * Return %false if @cid is outside @m's active range. Otherwise return the
 * bit's value. Read via READ_ONCE so callers can race set/clear writers.
 */
static inline bool scx_cmask_test(u32 cid, const struct scx_cmask *m)
{
	if (!__scx_cmask_contains(cid, m))
		return false;
	return READ_ONCE(*__scx_cmask_word(cid, m)) & BIT_U64(cid & 63);
}

/*
 * Words of bits[] the active range spans, 0 if empty. Tighter than the storage
 * SCX_CMASK_NR_WORDS() sizes for the worst-case base alignment.
 */
static inline u32 scx_cmask_nr_used_words(const struct scx_cmask *m)
{
	if (!m->nr_cids)
		return 0;
	return ((m->base & 63) + m->nr_cids - 1) / 64 + 1;
}

/**
 * scx_cmask_for_each_cid - iterate set cids in @m
 * @cid: s32 loop var that receives each set cid in turn
 * @m: cmask to iterate
 *
 * Visits set bits within @m's active range in ascending order. Scans only the
 * words the active range spans, where head and tail padding is kept zero, so
 * no per-cid range check is needed.
 */
#define scx_cmask_for_each_cid(cid, m)						\
	for (u64 __bs = (m)->base & ~63u, __wi = 0,				\
		     __nw = scx_cmask_nr_used_words(m);				\
	     __wi < __nw; __wi++)						\
		for (u64 __w = READ_ONCE((m)->bits[__wi]);			\
		     __w && ((cid) = __bs + __wi * 64 + __ffs64(__w), true);	\
		     __w &= __w - 1)

/*
 * scx_cpu_arg() wraps a cpu arg being handed to an SCX op. For cid-form
 * schedulers it resolves to the matching cid; for cpu-form it passes @cpu
 * through. scx_cpu_ret() is the inverse for a cpu/cid returned from an op
 * (currently only ops.select_cpu); it validates the BPF-supplied cid and
 * triggers scx_error() on @sch if invalid.
 */
static inline s32 scx_cpu_arg(s32 cpu)
{
	if (scx_is_cid_type())
		return __scx_cpu_to_cid(cpu);
	return cpu;
}

static inline s32 scx_cpu_ret(struct scx_sched *sch, s32 cpu_or_cid)
{
	if (cpu_or_cid < 0 || !scx_is_cid_type())
		return cpu_or_cid;
	return scx_cid_to_cpu(sch, cpu_or_cid);
}

#endif /* _KERNEL_SCHED_EXT_CID_H */