author     Tejun Heo <tj@kernel.org>  2026-03-06 07:46:32 -1000
committer  Tejun Heo <tj@kernel.org>  2026-03-06 07:46:32 -1000
commit     32e940f2bd3b16551f23ea44be47f6f5d1746d64 (patch)
tree       23037a80fd683ff1a52ca05e2d7432c0cbc8f98b /kernel
parent     477174ac35c510d0ed3043f5bd4fba25546a21ce (diff)
parent     1dde502587657045b267f179d7a1ecc7b8a1a265 (diff)
Merge branch 'for-7.0-fixes' into for-7.1

To prepare for the hierarchical scheduling patchset, which would otherwise
cause multiple conflicts.

Signed-off-by: Tejun Heo <tj@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/arena.c  2
-rw-r--r--  kernel/bpf/arraymap.c  2
-rw-r--r--  kernel/bpf/bloom_filter.c  2
-rw-r--r--  kernel/bpf/bpf_insn_array.c  2
-rw-r--r--  kernel/bpf/bpf_local_storage.c  75
-rw-r--r--  kernel/bpf/cpumap.c  17
-rw-r--r--  kernel/bpf/devmap.c  47
-rw-r--r--  kernel/bpf/hashtab.c  86
-rw-r--r--  kernel/bpf/local_storage.c  2
-rw-r--r--  kernel/bpf/lpm_trie.c  2
-rw-r--r--  kernel/bpf/memalloc.c  58
-rw-r--r--  kernel/bpf/syscall.c  2
-rw-r--r--  kernel/bpf/tnum.c  56
-rw-r--r--  kernel/bpf/verifier.c  30
-rw-r--r--  kernel/cgroup/cgroup.c  1
-rw-r--r--  kernel/cgroup/cpuset.c  222
-rw-r--r--  kernel/configs/debug.config  1
-rw-r--r--  kernel/dma/direct.h  2
-rw-r--r--  kernel/events/core.c  83
-rw-r--r--  kernel/fork.c  2
-rw-r--r--  kernel/kcsan/kcsan_test.c  2
-rw-r--r--  kernel/liveupdate/luo_file.c  41
-rw-r--r--  kernel/rseq.c  8
-rw-r--r--  kernel/sched/core.c  1
-rw-r--r--  kernel/sched/ext.c  113
-rw-r--r--  kernel/sched/ext_idle.c  5
-rw-r--r--  kernel/sched/ext_internal.h  116
-rw-r--r--  kernel/sched/fair.c  150
-rw-r--r--  kernel/sched/isolation.c  4
-rw-r--r--  kernel/sched/sched.h  11
-rw-r--r--  kernel/time/time.c  19
-rw-r--r--  kernel/time/timer_migration.c  4
-rw-r--r--  kernel/trace/bpf_trace.c  4
33 files changed, 889 insertions, 283 deletions
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 144f30e740e8..f355cf1c1a16 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -303,7 +303,7 @@ static long arena_map_update_elem(struct bpf_map *map, void *key,
return -EOPNOTSUPP;
}
-static int arena_map_check_btf(const struct bpf_map *map, const struct btf *btf,
+static int arena_map_check_btf(struct bpf_map *map, const struct btf *btf,
const struct btf_type *key_type, const struct btf_type *value_type)
{
return 0;
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 26763df6134a..33de68c95ad8 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -548,7 +548,7 @@ static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key,
rcu_read_unlock();
}
-static int array_map_check_btf(const struct bpf_map *map,
+static int array_map_check_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c
index 35e1ddca74d2..b73336c976b7 100644
--- a/kernel/bpf/bloom_filter.c
+++ b/kernel/bpf/bloom_filter.c
@@ -180,7 +180,7 @@ static long bloom_map_update_elem(struct bpf_map *map, void *key,
return -EINVAL;
}
-static int bloom_map_check_btf(const struct bpf_map *map,
+static int bloom_map_check_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
diff --git a/kernel/bpf/bpf_insn_array.c b/kernel/bpf/bpf_insn_array.c
index c0286f25ca3c..a2f84afe6f7c 100644
--- a/kernel/bpf/bpf_insn_array.c
+++ b/kernel/bpf/bpf_insn_array.c
@@ -98,7 +98,7 @@ static long insn_array_delete_elem(struct bpf_map *map, void *key)
return -EINVAL;
}
-static int insn_array_check_btf(const struct bpf_map *map,
+static int insn_array_check_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index b28f07d3a0db..9c96a4477f81 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -107,14 +107,12 @@ static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage *local_storage;
- /* If RCU Tasks Trace grace period implies RCU grace period, do
- * kfree(), else do kfree_rcu().
+ /*
+ * RCU Tasks Trace grace period implies RCU grace period, do
+ * kfree() directly.
*/
local_storage = container_of(rcu, struct bpf_local_storage, rcu);
- if (rcu_trace_implies_rcu_gp())
- kfree(local_storage);
- else
- kfree_rcu(local_storage, rcu);
+ kfree(local_storage);
}
/* Handle use_kmalloc_nolock == false */
@@ -138,10 +136,11 @@ static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
{
- if (rcu_trace_implies_rcu_gp())
- bpf_local_storage_free_rcu(rcu);
- else
- call_rcu(rcu, bpf_local_storage_free_rcu);
+ /*
+ * RCU Tasks Trace grace period implies RCU grace period, do
+ * kfree() directly.
+ */
+ bpf_local_storage_free_rcu(rcu);
}
static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
@@ -164,16 +163,29 @@ static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
bpf_local_storage_free_trace_rcu);
}
-/* rcu tasks trace callback for use_kmalloc_nolock == false */
-static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
+/* rcu callback for use_kmalloc_nolock == false */
+static void __bpf_selem_free_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage_map *smap;
selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
- if (rcu_trace_implies_rcu_gp())
- kfree(selem);
- else
- kfree_rcu(selem, rcu);
+ /* bpf_selem_unlink_nofail may have already cleared smap and freed fields. */
+ smap = rcu_dereference_check(SDATA(selem)->smap, 1);
+
+ if (smap)
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+ kfree(selem);
+}
+
+/* rcu tasks trace callback for use_kmalloc_nolock == false */
+static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
+{
+ /*
+ * RCU Tasks Trace grace period implies RCU grace period, do
+ * kfree() directly.
+ */
+ __bpf_selem_free_rcu(rcu);
}
/* Handle use_kmalloc_nolock == false */
@@ -181,7 +193,7 @@ static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
bool vanilla_rcu)
{
if (vanilla_rcu)
- kfree_rcu(selem, rcu);
+ call_rcu(&selem->rcu, __bpf_selem_free_rcu);
else
call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu);
}
@@ -195,37 +207,29 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
/* The bpf_local_storage_map_free will wait for rcu_barrier */
smap = rcu_dereference_check(SDATA(selem)->smap, 1);
- if (smap) {
- migrate_disable();
+ if (smap)
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
- migrate_enable();
- }
kfree_nolock(selem);
}
static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
{
- if (rcu_trace_implies_rcu_gp())
- bpf_selem_free_rcu(rcu);
- else
- call_rcu(rcu, bpf_selem_free_rcu);
+ /*
+ * RCU Tasks Trace grace period implies RCU grace period, do
+ * kfree() directly.
+ */
+ bpf_selem_free_rcu(rcu);
}
void bpf_selem_free(struct bpf_local_storage_elem *selem,
bool reuse_now)
{
- struct bpf_local_storage_map *smap;
-
- smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
-
if (!selem->use_kmalloc_nolock) {
/*
* No uptr will be unpin even when reuse_now == false since uptr
* is only supported in task local storage, where
* smap->use_kmalloc_nolock == true.
*/
- if (smap)
- bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
__bpf_selem_free(selem, reuse_now);
return;
}
@@ -797,7 +801,7 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
return 0;
}
-int bpf_local_storage_map_check_btf(const struct bpf_map *map,
+int bpf_local_storage_map_check_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
@@ -958,10 +962,9 @@ restart:
*/
synchronize_rcu();
- if (smap->use_kmalloc_nolock) {
- rcu_barrier_tasks_trace();
- rcu_barrier();
- }
+ /* smap remains in use regardless of kmalloc_nolock, so wait unconditionally. */
+ rcu_barrier_tasks_trace();
+ rcu_barrier();
kvfree(smap->buckets);
bpf_map_area_free(smap);
}
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 04171fbc39cb..32b43cb9061b 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -29,6 +29,7 @@
#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
+#include <linux/local_lock.h>
#include <linux/completion.h>
#include <trace/events/xdp.h>
#include <linux/btf_ids.h>
@@ -52,6 +53,7 @@ struct xdp_bulk_queue {
struct list_head flush_node;
struct bpf_cpu_map_entry *obj;
unsigned int count;
+ local_lock_t bq_lock;
};
/* Struct for every remote "destination" CPU in map */
@@ -451,6 +453,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
for_each_possible_cpu(i) {
bq = per_cpu_ptr(rcpu->bulkq, i);
bq->obj = rcpu;
+ local_lock_init(&bq->bq_lock);
}
/* Alloc queue */
@@ -722,6 +725,8 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
struct ptr_ring *q;
int i;
+ lockdep_assert_held(&bq->bq_lock);
+
if (unlikely(!bq->count))
return;
@@ -749,11 +754,15 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
}
/* Runs under RCU-read-side, plus in softirq under NAPI protection.
- * Thus, safe percpu variable access.
+ * Thus, safe percpu variable access. PREEMPT_RT relies on
+ * local_lock_nested_bh() to serialise access to the per-CPU bq.
*/
static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
- struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
+ struct xdp_bulk_queue *bq;
+
+ local_lock_nested_bh(&rcpu->bulkq->bq_lock);
+ bq = this_cpu_ptr(rcpu->bulkq);
if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
bq_flush_to_queue(bq);
@@ -774,6 +783,8 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
list_add(&bq->flush_node, flush_list);
}
+
+ local_unlock_nested_bh(&rcpu->bulkq->bq_lock);
}
int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
@@ -810,7 +821,9 @@ void __cpu_map_flush(struct list_head *flush_list)
struct xdp_bulk_queue *bq, *tmp;
list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
+ local_lock_nested_bh(&bq->obj->bulkq->bq_lock);
bq_flush_to_queue(bq);
+ local_unlock_nested_bh(&bq->obj->bulkq->bq_lock);
/* If already running, costs spin_lock_irqsave + smb_mb */
wake_up_process(bq->obj->kthread);
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 2625601de76e..3d619d01088e 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -45,6 +45,7 @@
* types of devmap; only the lookup and insertion is different.
*/
#include <linux/bpf.h>
+#include <linux/local_lock.h>
#include <net/xdp.h>
#include <linux/filter.h>
#include <trace/events/xdp.h>
@@ -60,6 +61,7 @@ struct xdp_dev_bulk_queue {
struct net_device *dev_rx;
struct bpf_prog *xdp_prog;
unsigned int count;
+ local_lock_t bq_lock;
};
struct bpf_dtab_netdev {
@@ -381,6 +383,8 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
int to_send = cnt;
int i;
+ lockdep_assert_held(&bq->bq_lock);
+
if (unlikely(!cnt))
return;
@@ -425,10 +429,12 @@ void __dev_flush(struct list_head *flush_list)
struct xdp_dev_bulk_queue *bq, *tmp;
list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
+ local_lock_nested_bh(&bq->dev->xdp_bulkq->bq_lock);
bq_xmit_all(bq, XDP_XMIT_FLUSH);
bq->dev_rx = NULL;
bq->xdp_prog = NULL;
__list_del_clearprev(&bq->flush_node);
+ local_unlock_nested_bh(&bq->dev->xdp_bulkq->bq_lock);
}
}
@@ -451,12 +457,16 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
/* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, safe percpu
* variable access, and map elements stick around. See comment above
- * xdp_do_flush() in filter.c.
+ * xdp_do_flush() in filter.c. PREEMPT_RT relies on local_lock_nested_bh()
+ * to serialise access to the per-CPU bq.
*/
static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx, struct bpf_prog *xdp_prog)
{
- struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
+ struct xdp_dev_bulk_queue *bq;
+
+ local_lock_nested_bh(&dev->xdp_bulkq->bq_lock);
+ bq = this_cpu_ptr(dev->xdp_bulkq);
if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
bq_xmit_all(bq, 0);
@@ -477,6 +487,8 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
}
bq->q[bq->count++] = xdpf;
+
+ local_unlock_nested_bh(&dev->xdp_bulkq->bq_lock);
}
static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
@@ -588,18 +600,22 @@ static inline bool is_ifindex_excluded(int *excluded, int num_excluded, int ifin
}
/* Get ifindex of each upper device. 'indexes' must be able to hold at
- * least MAX_NEST_DEV elements.
- * Returns the number of ifindexes added.
+ * least 'max' elements.
+ * Returns the number of ifindexes added, or -EOVERFLOW if there are too
+ * many upper devices.
*/
-static int get_upper_ifindexes(struct net_device *dev, int *indexes)
+static int get_upper_ifindexes(struct net_device *dev, int *indexes, int max)
{
struct net_device *upper;
struct list_head *iter;
int n = 0;
netdev_for_each_upper_dev_rcu(dev, upper, iter) {
+ if (n >= max)
+ return -EOVERFLOW;
indexes[n++] = upper->ifindex;
}
+
return n;
}
@@ -615,7 +631,11 @@ int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
int err;
if (exclude_ingress) {
- num_excluded = get_upper_ifindexes(dev_rx, excluded_devices);
+ num_excluded = get_upper_ifindexes(dev_rx, excluded_devices,
+ ARRAY_SIZE(excluded_devices) - 1);
+ if (num_excluded < 0)
+ return num_excluded;
+
excluded_devices[num_excluded++] = dev_rx->ifindex;
}
@@ -733,7 +753,11 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
int err;
if (exclude_ingress) {
- num_excluded = get_upper_ifindexes(dev, excluded_devices);
+ num_excluded = get_upper_ifindexes(dev, excluded_devices,
+ ARRAY_SIZE(excluded_devices) - 1);
+ if (num_excluded < 0)
+ return num_excluded;
+
excluded_devices[num_excluded++] = dev->ifindex;
}
@@ -1115,8 +1139,13 @@ static int dev_map_notification(struct notifier_block *notifier,
if (!netdev->xdp_bulkq)
return NOTIFY_BAD;
- for_each_possible_cpu(cpu)
- per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev;
+ for_each_possible_cpu(cpu) {
+ struct xdp_dev_bulk_queue *bq;
+
+ bq = per_cpu_ptr(netdev->xdp_bulkq, cpu);
+ bq->dev = netdev;
+ local_lock_init(&bq->bq_lock);
+ }
break;
case NETDEV_UNREGISTER:
/* This rcu_read_lock/unlock pair is needed because
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3b9d297a53be..bc6bc8bb871d 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -125,6 +125,11 @@ struct htab_elem {
char key[] __aligned(8);
};
+struct htab_btf_record {
+ struct btf_record *record;
+ u32 key_size;
+};
+
static inline bool htab_is_prealloc(const struct bpf_htab *htab)
{
return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
@@ -457,6 +462,83 @@ static int htab_map_alloc_check(union bpf_attr *attr)
return 0;
}
+static void htab_mem_dtor(void *obj, void *ctx)
+{
+ struct htab_btf_record *hrec = ctx;
+ struct htab_elem *elem = obj;
+ void *map_value;
+
+ if (IS_ERR_OR_NULL(hrec->record))
+ return;
+
+ map_value = htab_elem_value(elem, hrec->key_size);
+ bpf_obj_free_fields(hrec->record, map_value);
+}
+
+static void htab_pcpu_mem_dtor(void *obj, void *ctx)
+{
+ void __percpu *pptr = *(void __percpu **)obj;
+ struct htab_btf_record *hrec = ctx;
+ int cpu;
+
+ if (IS_ERR_OR_NULL(hrec->record))
+ return;
+
+ for_each_possible_cpu(cpu)
+ bpf_obj_free_fields(hrec->record, per_cpu_ptr(pptr, cpu));
+}
+
+static void htab_dtor_ctx_free(void *ctx)
+{
+ struct htab_btf_record *hrec = ctx;
+
+ btf_record_free(hrec->record);
+ kfree(ctx);
+}
+
+static int htab_set_dtor(struct bpf_htab *htab, void (*dtor)(void *, void *))
+{
+ u32 key_size = htab->map.key_size;
+ struct bpf_mem_alloc *ma;
+ struct htab_btf_record *hrec;
+ int err;
+
+ /* No need for dtors. */
+ if (IS_ERR_OR_NULL(htab->map.record))
+ return 0;
+
+ hrec = kzalloc(sizeof(*hrec), GFP_KERNEL);
+ if (!hrec)
+ return -ENOMEM;
+ hrec->key_size = key_size;
+ hrec->record = btf_record_dup(htab->map.record);
+ if (IS_ERR(hrec->record)) {
+ err = PTR_ERR(hrec->record);
+ kfree(hrec);
+ return err;
+ }
+ ma = htab_is_percpu(htab) ? &htab->pcpu_ma : &htab->ma;
+ bpf_mem_alloc_set_dtor(ma, dtor, htab_dtor_ctx_free, hrec);
+ return 0;
+}
+
+static int htab_map_check_btf(struct bpf_map *map, const struct btf *btf,
+ const struct btf_type *key_type, const struct btf_type *value_type)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+ if (htab_is_prealloc(htab))
+ return 0;
+ /*
+ * We must set the dtor using this callback, as map's BTF record is not
+ * populated in htab_map_alloc(), so it will always appear as NULL.
+ */
+ if (htab_is_percpu(htab))
+ return htab_set_dtor(htab, htab_pcpu_mem_dtor);
+ else
+ return htab_set_dtor(htab, htab_mem_dtor);
+}
+
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
@@ -2281,6 +2363,7 @@ const struct bpf_map_ops htab_map_ops = {
.map_seq_show_elem = htab_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
+ .map_check_btf = htab_map_check_btf,
.map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab),
.map_btf_id = &htab_map_btf_ids[0],
@@ -2303,6 +2386,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_seq_show_elem = htab_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
+ .map_check_btf = htab_map_check_btf,
.map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab_lru),
.map_btf_id = &htab_map_btf_ids[0],
@@ -2482,6 +2566,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
+ .map_check_btf = htab_map_check_btf,
.map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab_percpu),
.map_btf_id = &htab_map_btf_ids[0],
@@ -2502,6 +2587,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
+ .map_check_btf = htab_map_check_btf,
.map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab_lru_percpu),
.map_btf_id = &htab_map_btf_ids[0],
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 1ccbf28b2ad9..8fca0c64f7b1 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -364,7 +364,7 @@ static long cgroup_storage_delete_elem(struct bpf_map *map, void *key)
return -EINVAL;
}
-static int cgroup_storage_check_btf(const struct bpf_map *map,
+static int cgroup_storage_check_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 1adeb4d3b8cf..0f57608b385d 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -751,7 +751,7 @@ free_stack:
return err;
}
-static int trie_check_btf(const struct bpf_map *map,
+static int trie_check_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index bd45dda9dc35..682a9f34214b 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -102,6 +102,8 @@ struct bpf_mem_cache {
int percpu_size;
bool draining;
struct bpf_mem_cache *tgt;
+ void (*dtor)(void *obj, void *ctx);
+ void *dtor_ctx;
/* list of objects to be freed after RCU GP */
struct llist_head free_by_rcu;
@@ -260,12 +262,14 @@ static void free_one(void *obj, bool percpu)
kfree(obj);
}
-static int free_all(struct llist_node *llnode, bool percpu)
+static int free_all(struct bpf_mem_cache *c, struct llist_node *llnode, bool percpu)
{
struct llist_node *pos, *t;
int cnt = 0;
llist_for_each_safe(pos, t, llnode) {
+ if (c->dtor)
+ c->dtor((void *)pos + LLIST_NODE_SZ, c->dtor_ctx);
free_one(pos, percpu);
cnt++;
}
@@ -276,7 +280,7 @@ static void __free_rcu(struct rcu_head *head)
{
struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace);
- free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
+ free_all(c, llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
atomic_set(&c->call_rcu_ttrace_in_progress, 0);
}
@@ -308,7 +312,7 @@ static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) {
if (unlikely(READ_ONCE(c->draining))) {
llnode = llist_del_all(&c->free_by_rcu_ttrace);
- free_all(llnode, !!c->percpu_size);
+ free_all(c, llnode, !!c->percpu_size);
}
return;
}
@@ -417,7 +421,7 @@ static void check_free_by_rcu(struct bpf_mem_cache *c)
dec_active(c, &flags);
if (unlikely(READ_ONCE(c->draining))) {
- free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
+ free_all(c, llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
atomic_set(&c->call_rcu_in_progress, 0);
} else {
call_rcu_hurry(&c->rcu, __free_by_rcu);
@@ -635,13 +639,13 @@ static void drain_mem_cache(struct bpf_mem_cache *c)
* Except for waiting_for_gp_ttrace list, there are no concurrent operations
* on these lists, so it is safe to use __llist_del_all().
*/
- free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu);
- free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
- free_all(__llist_del_all(&c->free_llist), percpu);
- free_all(__llist_del_all(&c->free_llist_extra), percpu);
- free_all(__llist_del_all(&c->free_by_rcu), percpu);
- free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
- free_all(llist_del_all(&c->waiting_for_gp), percpu);
+ free_all(c, llist_del_all(&c->free_by_rcu_ttrace), percpu);
+ free_all(c, llist_del_all(&c->waiting_for_gp_ttrace), percpu);
+ free_all(c, __llist_del_all(&c->free_llist), percpu);
+ free_all(c, __llist_del_all(&c->free_llist_extra), percpu);
+ free_all(c, __llist_del_all(&c->free_by_rcu), percpu);
+ free_all(c, __llist_del_all(&c->free_llist_extra_rcu), percpu);
+ free_all(c, llist_del_all(&c->waiting_for_gp), percpu);
}
static void check_mem_cache(struct bpf_mem_cache *c)
@@ -680,6 +684,9 @@ static void check_leaked_objs(struct bpf_mem_alloc *ma)
static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
{
+ /* We can free dtor ctx only once all callbacks are done using it. */
+ if (ma->dtor_ctx_free)
+ ma->dtor_ctx_free(ma->dtor_ctx);
check_leaked_objs(ma);
free_percpu(ma->cache);
free_percpu(ma->caches);
@@ -1014,3 +1021,32 @@ int bpf_mem_alloc_check_size(bool percpu, size_t size)
return 0;
}
+
+void bpf_mem_alloc_set_dtor(struct bpf_mem_alloc *ma, void (*dtor)(void *obj, void *ctx),
+ void (*dtor_ctx_free)(void *ctx), void *ctx)
+{
+ struct bpf_mem_caches *cc;
+ struct bpf_mem_cache *c;
+ int cpu, i;
+
+ ma->dtor_ctx_free = dtor_ctx_free;
+ ma->dtor_ctx = ctx;
+
+ if (ma->cache) {
+ for_each_possible_cpu(cpu) {
+ c = per_cpu_ptr(ma->cache, cpu);
+ c->dtor = dtor;
+ c->dtor_ctx = ctx;
+ }
+ }
+ if (ma->caches) {
+ for_each_possible_cpu(cpu) {
+ cc = per_cpu_ptr(ma->caches, cpu);
+ for (i = 0; i < NUM_CACHES; i++) {
+ c = &cc->cache[i];
+ c->dtor = dtor;
+ c->dtor_ctx = ctx;
+ }
+ }
+ }
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0378e83b4099..274039e36465 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1234,7 +1234,7 @@ int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size)
}
EXPORT_SYMBOL_GPL(bpf_obj_name_cpy);
-int map_check_no_btf(const struct bpf_map *map,
+int map_check_no_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index 26fbfbb01700..4abc359b3db0 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -269,3 +269,59 @@ struct tnum tnum_bswap64(struct tnum a)
{
return TNUM(swab64(a.value), swab64(a.mask));
}
+
+/* Given tnum t, and a number z such that tmin <= z < tmax, where tmin
+ * is the smallest member of the t (= t.value) and tmax is the largest
+ * member of t (= t.value | t.mask), returns the smallest member of t
+ * larger than z.
+ *
+ * For example,
+ * t = x11100x0
+ * z = 11110001 (241)
+ * result = 11110010 (242)
+ *
+ * Note: if this function is called with z >= tmax, it just returns
+ * early with tmax; if this function is called with z < tmin, the
+ * algorithm already returns tmin.
+ */
+u64 tnum_step(struct tnum t, u64 z)
+{
+ u64 tmax, j, p, q, r, s, v, u, w, res;
+ u8 k;
+
+ tmax = t.value | t.mask;
+
+ /* if z >= largest member of t, return largest member of t */
+ if (z >= tmax)
+ return tmax;
+
+ /* if z < smallest member of t, return smallest member of t */
+ if (z < t.value)
+ return t.value;
+
+ /* keep t's known bits, and match all unknown bits to z */
+ j = t.value | (z & t.mask);
+
+ if (j > z) {
+ p = ~z & t.value & ~t.mask;
+ k = fls64(p); /* k is the most-significant 0-to-1 flip */
+ q = U64_MAX << k;
+ r = q & z; /* positions > k matched to z */
+ s = ~q & t.value; /* positions <= k matched to t.value */
+ v = r | s;
+ res = v;
+ } else {
+ p = z & ~t.value & ~t.mask;
+ k = fls64(p); /* k is the most-significant 1-to-0 flip */
+ q = U64_MAX << k;
+ r = q & t.mask & z; /* unknown positions > k, matched to z */
+ s = q & ~t.mask; /* known positions > k, set to 1 */
+ v = r | s;
+ /* add 1 to unknown positions > k to make value greater than z */
+ u = v + (1ULL << k);
+ /* extract bits in unknown positions > k from u, rest from t.value */
+ w = (u & t.mask) | t.value;
+ res = w;
+ }
+ return res;
+}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bb12ba020649..401d6c4960ec 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2379,6 +2379,9 @@ static void __update_reg32_bounds(struct bpf_reg_state *reg)
static void __update_reg64_bounds(struct bpf_reg_state *reg)
{
+ u64 tnum_next, tmax;
+ bool umin_in_tnum;
+
/* min signed is max(sign bit) | min(other bits) */
reg->smin_value = max_t(s64, reg->smin_value,
reg->var_off.value | (reg->var_off.mask & S64_MIN));
@@ -2388,6 +2391,33 @@ static void __update_reg64_bounds(struct bpf_reg_state *reg)
reg->umin_value = max(reg->umin_value, reg->var_off.value);
reg->umax_value = min(reg->umax_value,
reg->var_off.value | reg->var_off.mask);
+
+ /* Check if u64 and tnum overlap in a single value */
+ tnum_next = tnum_step(reg->var_off, reg->umin_value);
+ umin_in_tnum = (reg->umin_value & ~reg->var_off.mask) == reg->var_off.value;
+ tmax = reg->var_off.value | reg->var_off.mask;
+ if (umin_in_tnum && tnum_next > reg->umax_value) {
+ /* The u64 range and the tnum only overlap in umin.
+ * u64: ---[xxxxxx]-----
+ * tnum: --xx----------x-
+ */
+ ___mark_reg_known(reg, reg->umin_value);
+ } else if (!umin_in_tnum && tnum_next == tmax) {
+ /* The u64 range and the tnum only overlap in the maximum value
+ * represented by the tnum, called tmax.
+ * u64: ---[xxxxxx]-----
+ * tnum: xx-----x--------
+ */
+ ___mark_reg_known(reg, tmax);
+ } else if (!umin_in_tnum && tnum_next <= reg->umax_value &&
+ tnum_step(reg->var_off, tnum_next) > reg->umax_value) {
+ /* The u64 range and the tnum only overlap in between umin
+ * (excluded) and umax.
+ * u64: ---[xxxxxx]-----
+ * tnum: xx----x-------x-
+ */
+ ___mark_reg_known(reg, tnum_next);
+ }
}
static void __update_reg_bounds(struct bpf_reg_state *reg)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index c22cda7766d8..be1d71dda317 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2608,6 +2608,7 @@ static void cgroup_migrate_add_task(struct task_struct *task,
mgctx->tset.nr_tasks++;
+ css_set_skip_task_iters(cset, task);
list_move_tail(&task->cg_list, &cset->mg_tasks);
if (list_empty(&cset->mg_node))
list_add_tail(&cset->mg_node,
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 9faf34377a88..e200de7c60b6 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -62,6 +62,75 @@ static const char * const perr_strings[] = {
};
/*
+ * CPUSET Locking Convention
+ * -------------------------
+ *
+ * Below are the four global/local locks guarding cpuset structures in lock
+ * acquisition order:
+ * - cpuset_top_mutex
+ * - cpu_hotplug_lock (cpus_read_lock/cpus_write_lock)
+ * - cpuset_mutex
+ * - callback_lock (raw spinlock)
+ *
+ * As cpuset will now indirectly flush a number of different workqueues in
+ * housekeeping_update() to update housekeeping cpumasks when the set of
+ * isolated CPUs is going to be changed, it may be vulnerable to deadlock
+ * if we hold cpus_read_lock while calling into housekeeping_update().
+ *
+ * The first cpuset_top_mutex will be held except when calling into
+ * cpuset_handle_hotplug() from the CPU hotplug code where cpus_write_lock
+ * and cpuset_mutex will be held instead. The main purpose of this mutex
+ * is to prevent regular cpuset control file write actions from interfering
+ * with the call to housekeeping_update(), though CPU hotplug operation can
+ * still happen in parallel. This mutex also provides protection for some
+ * internal variables.
+ *
+ * A task must hold all the remaining three locks to modify externally visible
+ * or used fields of cpusets, though some of the internally used cpuset fields
+ * and internal variables can be modified without holding callback_lock. If only
+ * reliable read access of the externally used fields are needed, a task can
+ * hold either cpuset_mutex or callback_lock which are exposed to other
+ * external subsystems.
+ *
+ * If a task holds cpu_hotplug_lock and cpuset_mutex, it blocks others,
+ * ensuring that it is the only task able to also acquire callback_lock and
+ * be able to modify cpusets. It can perform various checks on the cpuset
+ * structure first, knowing nothing will change. It can also allocate memory
+ * without holding callback_lock. While it is performing these checks, various
+ * callback routines can briefly acquire callback_lock to query cpusets. Once
+ * it is ready to make the changes, it takes callback_lock, blocking everyone
+ * else.
+ *
+ * Calls to the kernel memory allocator cannot be made while holding
+ * callback_lock which is a spinlock, as the memory allocator may sleep or
+ * call back into cpuset code and acquire callback_lock.
+ *
+ * Now, the task_struct fields mems_allowed and mempolicy may be changed
+ * by other task, we use alloc_lock in the task_struct fields to protect
+ * them.
+ *
+ * The cpuset_common_seq_show() handlers only hold callback_lock across
+ * small pieces of code, such as when reading out possibly multi-word
+ * cpumasks and nodemasks.
+ */
+
+static DEFINE_MUTEX(cpuset_top_mutex);
+static DEFINE_MUTEX(cpuset_mutex);
+
+/*
+ * File level internal variables below follow one of the following exclusion
+ * rules.
+ *
+ * RWCS: Read/write-able by holding either cpus_write_lock (and optionally
+ * cpuset_mutex) or both cpus_read_lock and cpuset_mutex.
+ *
+ * CSCB: Readable by holding either cpuset_mutex or callback_lock. Writable
+ * by holding both cpuset_mutex and callback_lock.
+ *
+ * T: Read/write-able by holding the cpuset_top_mutex.
+ */
+
+/*
* For local partitions, update to subpartitions_cpus & isolated_cpus is done
* in update_parent_effective_cpumask(). For remote partitions, it is done in
* the remote_partition_*() and remote_cpus_update() helpers.
@@ -70,19 +139,22 @@ static const char * const perr_strings[] = {
* Exclusive CPUs distributed out to local or remote sub-partitions of
* top_cpuset
*/
-static cpumask_var_t subpartitions_cpus;
+static cpumask_var_t subpartitions_cpus; /* RWCS */
+
+/*
+ * Exclusive CPUs in isolated partitions (shown in cpuset.cpus.isolated)
+ */
+static cpumask_var_t isolated_cpus; /* CSCB */
/*
- * Exclusive CPUs in isolated partitions
+ * Set if housekeeping cpumasks are to be updated.
*/
-static cpumask_var_t isolated_cpus;
+static bool update_housekeeping; /* RWCS */
/*
- * isolated_cpus updating flag (protected by cpuset_mutex)
- * Set if isolated_cpus is going to be updated in the current
- * cpuset_mutex crtical section.
+ * Copy of isolated_cpus to be passed to housekeeping_update()
*/
-static bool isolated_cpus_updating;
+static cpumask_var_t isolated_hk_cpus; /* T */
/*
* A flag to force sched domain rebuild at the end of an operation.
@@ -98,7 +170,7 @@ static bool isolated_cpus_updating;
* Note that update_relax_domain_level() in cpuset-v1.c can still call
* rebuild_sched_domains_locked() directly without using this flag.
*/
-static bool force_sd_rebuild;
+static bool force_sd_rebuild; /* RWCS */
/*
* Partition root states:
@@ -218,42 +290,6 @@ struct cpuset top_cpuset = {
.partition_root_state = PRS_ROOT,
};
-/*
- * There are two global locks guarding cpuset structures - cpuset_mutex and
- * callback_lock. The cpuset code uses only cpuset_mutex. Other kernel
- * subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
- * structures. Note that cpuset_mutex needs to be a mutex as it is used in
- * paths that rely on priority inheritance (e.g. scheduler - on RT) for
- * correctness.
- *
- * A task must hold both locks to modify cpusets. If a task holds
- * cpuset_mutex, it blocks others, ensuring that it is the only task able to
- * also acquire callback_lock and be able to modify cpusets. It can perform
- * various checks on the cpuset structure first, knowing nothing will change.
- * It can also allocate memory while just holding cpuset_mutex. While it is
- * performing these checks, various callback routines can briefly acquire
- * callback_lock to query cpusets. Once it is ready to make the changes, it
- * takes callback_lock, blocking everyone else.
- *
- * Calls to the kernel memory allocator can not be made while holding
- * callback_lock, as that would risk double tripping on callback_lock
- * from one of the callbacks into the cpuset code from within
- * __alloc_pages().
- *
- * If a task is only holding callback_lock, then it has read-only
- * access to cpusets.
- *
- * Now, the task_struct fields mems_allowed and mempolicy may be changed
- * by other task, we use alloc_lock in the task_struct fields to protect
- * them.
- *
- * The cpuset_common_seq_show() handlers only hold callback_lock across
- * small pieces of code, such as when reading out possibly multi-word
- * cpumasks and nodemasks.
- */
-
-static DEFINE_MUTEX(cpuset_mutex);
-
/**
* cpuset_lock - Acquire the global cpuset mutex
*
@@ -283,6 +319,7 @@ void lockdep_assert_cpuset_lock_held(void)
*/
void cpuset_full_lock(void)
{
+ mutex_lock(&cpuset_top_mutex);
cpus_read_lock();
mutex_lock(&cpuset_mutex);
}
@@ -291,12 +328,14 @@ void cpuset_full_unlock(void)
{
mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
+ mutex_unlock(&cpuset_top_mutex);
}
#ifdef CONFIG_LOCKDEP
bool lockdep_is_cpuset_held(void)
{
- return lockdep_is_held(&cpuset_mutex);
+ return lockdep_is_held(&cpuset_mutex) ||
+ lockdep_is_held(&cpuset_top_mutex);
}
#endif
@@ -961,7 +1000,7 @@ void rebuild_sched_domains_locked(void)
* offline CPUs, a warning is emitted and we return directly to
* prevent the panic.
*/
- for (i = 0; i < ndoms; ++i) {
+ for (i = 0; doms && i < ndoms; i++) {
if (WARN_ON_ONCE(!cpumask_subset(doms[i], cpu_active_mask)))
return;
}
@@ -1161,12 +1200,18 @@ static void reset_partition_data(struct cpuset *cs)
static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus)
{
WARN_ON_ONCE(old_prs == new_prs);
- if (new_prs == PRS_ISOLATED)
+ lockdep_assert_held(&callback_lock);
+ lockdep_assert_held(&cpuset_mutex);
+ if (new_prs == PRS_ISOLATED) {
+ if (cpumask_subset(xcpus, isolated_cpus))
+ return;
cpumask_or(isolated_cpus, isolated_cpus, xcpus);
- else
+ } else {
+ if (!cpumask_intersects(xcpus, isolated_cpus))
+ return;
cpumask_andnot(isolated_cpus, isolated_cpus, xcpus);
-
- isolated_cpus_updating = true;
+ }
+ update_housekeeping = true;
}
/*
@@ -1219,8 +1264,8 @@ static void partition_xcpus_del(int old_prs, struct cpuset *parent,
isolated_cpus_update(old_prs, parent->partition_root_state,
xcpus);
- cpumask_and(xcpus, xcpus, cpu_active_mask);
cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus);
+ cpumask_and(parent->effective_cpus, parent->effective_cpus, cpu_active_mask);
}
/*
@@ -1284,22 +1329,43 @@ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
}
/*
- * update_isolation_cpumasks - Update external isolation related CPU masks
+ * update_hk_sched_domains - Update HK cpumasks & rebuild sched domains
*
- * The following external CPU masks will be updated if necessary:
- * - workqueue unbound cpumask
+ * Update housekeeping cpumasks and rebuild sched domains if necessary.
+ * This should be called at the end of cpuset or hotplug actions.
*/
-static void update_isolation_cpumasks(void)
+static void update_hk_sched_domains(void)
{
- int ret;
-
- if (!isolated_cpus_updating)
- return;
+ if (update_housekeeping) {
+ /* Updating HK cpumasks implies rebuild sched domains */
+ update_housekeeping = false;
+ force_sd_rebuild = true;
+ cpumask_copy(isolated_hk_cpus, isolated_cpus);
- ret = housekeeping_update(isolated_cpus);
- WARN_ON_ONCE(ret < 0);
+ /*
+ * housekeeping_update() is now called without holding
+ * cpus_read_lock and cpuset_mutex. Only cpuset_top_mutex
+ * is still being held for mutual exclusion.
+ */
+ mutex_unlock(&cpuset_mutex);
+ cpus_read_unlock();
+ WARN_ON_ONCE(housekeeping_update(isolated_hk_cpus));
+ cpus_read_lock();
+ mutex_lock(&cpuset_mutex);
+ }
+ /* force_sd_rebuild will be cleared in rebuild_sched_domains_locked() */
+ if (force_sd_rebuild)
+ rebuild_sched_domains_locked();
+}
- isolated_cpus_updating = false;
+/*
+ * Work function to invoke update_hk_sched_domains()
+ */
+static void hk_sd_workfn(struct work_struct *work)
+{
+ cpuset_full_lock();
+ update_hk_sched_domains();
+ cpuset_full_unlock();
}
/**
@@ -1450,7 +1516,6 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
cs->remote_partition = true;
cpumask_copy(cs->effective_xcpus, tmp->new_cpus);
spin_unlock_irq(&callback_lock);
- update_isolation_cpumasks();
cpuset_force_rebuild();
cs->prs_err = 0;
@@ -1495,7 +1560,6 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
compute_excpus(cs, cs->effective_xcpus);
reset_partition_data(cs);
spin_unlock_irq(&callback_lock);
- update_isolation_cpumasks();
cpuset_force_rebuild();
/*
@@ -1566,7 +1630,6 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus,
if (xcpus)
cpumask_copy(cs->exclusive_cpus, xcpus);
spin_unlock_irq(&callback_lock);
- update_isolation_cpumasks();
if (adding || deleting)
cpuset_force_rebuild();
@@ -1910,7 +1973,6 @@ write_error:
partition_xcpus_add(new_prs, parent, tmp->delmask);
spin_unlock_irq(&callback_lock);
- update_isolation_cpumasks();
if ((old_prs != new_prs) && (cmd == partcmd_update))
update_partition_exclusive_flag(cs, new_prs);
@@ -2155,7 +2217,7 @@ get_css:
WARN_ON(!is_in_v2_mode() &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
- cpuset_update_tasks_cpumask(cp, cp->effective_cpus);
+ cpuset_update_tasks_cpumask(cp, tmp->new_cpus);
/*
* On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE
@@ -2878,7 +2940,6 @@ out:
else if (isolcpus_updated)
isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus);
spin_unlock_irq(&callback_lock);
- update_isolation_cpumasks();
/* Force update if switching back to member & update effective_xcpus */
update_cpumasks_hier(cs, &tmpmask, !new_prs);
@@ -3168,9 +3229,8 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
}
free_cpuset(trialcs);
- if (force_sd_rebuild)
- rebuild_sched_domains_locked();
out_unlock:
+ update_hk_sched_domains();
cpuset_full_unlock();
if (of_cft(of)->private == FILE_MEMLIST)
schedule_flush_migrate_mm();
@@ -3278,6 +3338,7 @@ static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf,
cpuset_full_lock();
if (is_cpuset_online(cs))
retval = update_prstate(cs, val);
+ update_hk_sched_domains();
cpuset_full_unlock();
return retval ?: nbytes;
}
@@ -3452,6 +3513,7 @@ static void cpuset_css_killed(struct cgroup_subsys_state *css)
/* Reset valid partition back to member */
if (is_partition_valid(cs))
update_prstate(cs, PRS_MEMBER);
+ update_hk_sched_domains();
cpuset_full_unlock();
}
@@ -3607,6 +3669,7 @@ int __init cpuset_init(void)
BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL));
BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL));
BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&isolated_hk_cpus, GFP_KERNEL));
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
@@ -3778,6 +3841,7 @@ unlock:
*/
static void cpuset_handle_hotplug(void)
{
+ static DECLARE_WORK(hk_sd_work, hk_sd_workfn);
static cpumask_t new_cpus;
static nodemask_t new_mems;
bool cpus_updated, mems_updated;
@@ -3859,9 +3923,21 @@ static void cpuset_handle_hotplug(void)
rcu_read_unlock();
}
- /* rebuild sched domains if necessary */
- if (force_sd_rebuild)
- rebuild_sched_domains_cpuslocked();
+
+ /*
+ * Queue a work to call housekeeping_update() & rebuild_sched_domains()
+ * There will be a slight delay before the HK_TYPE_DOMAIN housekeeping
+ * cpumask can correctly reflect what is in isolated_cpus.
+ *
+ * We rely on WORK_STRUCT_PENDING_BIT to not requeue a work item that
+ * is still pending. Before the pending bit is cleared, the work data
+ * is copied out and work item dequeued. So it is possible to queue
+ * the work again before the hk_sd_workfn() is invoked to process the
+ * previously queued work. Since hk_sd_workfn() doesn't use the work
+ * item at all, this is not a problem.
+ */
+ if (update_housekeeping || force_sd_rebuild)
+ queue_work(system_unbound_wq, &hk_sd_work);
free_tmpmasks(ptmp);
}
diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config
index 774702591d26..307c97ac5fa9 100644
--- a/kernel/configs/debug.config
+++ b/kernel/configs/debug.config
@@ -29,7 +29,6 @@ CONFIG_SECTION_MISMATCH_WARN_ONLY=y
# CONFIG_UBSAN_ALIGNMENT is not set
# CONFIG_UBSAN_DIV_ZERO is not set
# CONFIG_UBSAN_TRAP is not set
-# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set
CONFIG_DEBUG_FS=y
CONFIG_DEBUG_FS_ALLOW_ALL=y
CONFIG_DEBUG_IRQFLAGS=y
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index f476c63b668c..e89f175e9c2d 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -85,7 +85,7 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
if (is_swiotlb_force_bounce(dev)) {
if (attrs & DMA_ATTR_MMIO)
- goto err_overflow;
+ return DMA_MAPPING_ERROR;
return swiotlb_map(dev, phys, size, dir, attrs);
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ac70d68217b6..1f5699b339ec 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4138,7 +4138,8 @@ static int merge_sched_in(struct perf_event *event, void *data)
if (*perf_event_fasync(event))
event->pending_kill = POLL_ERR;
- perf_event_wakeup(event);
+ event->pending_wakeup = 1;
+ irq_work_queue(&event->pending_irq);
} else {
struct perf_cpu_pmu_context *cpc = this_cpc(event->pmu_ctx->pmu);
@@ -7464,28 +7465,28 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
ret = perf_mmap_aux(vma, event, nr_pages);
if (ret)
return ret;
- }
- /*
- * Since pinned accounting is per vm we cannot allow fork() to copy our
- * vma.
- */
- vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
- vma->vm_ops = &perf_mmap_vmops;
+ /*
+ * Since pinned accounting is per vm we cannot allow fork() to copy our
+ * vma.
+ */
+ vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
+ vma->vm_ops = &perf_mmap_vmops;
- mapped = get_mapped(event, event_mapped);
- if (mapped)
- mapped(event, vma->vm_mm);
+ mapped = get_mapped(event, event_mapped);
+ if (mapped)
+ mapped(event, vma->vm_mm);
- /*
- * Try to map it into the page table. On fail, invoke
- * perf_mmap_close() to undo the above, as the callsite expects
- * full cleanup in this case and therefore does not invoke
- * vmops::close().
- */
- ret = map_range(event->rb, vma);
- if (ret)
- perf_mmap_close(vma);
+ /*
+ * Try to map it into the page table. On fail, invoke
+ * perf_mmap_close() to undo the above, as the callsite expects
+ * full cleanup in this case and therefore does not invoke
+ * vmops::close().
+ */
+ ret = map_range(event->rb, vma);
+ if (ret)
+ perf_mmap_close(vma);
+ }
return ret;
}
@@ -10776,6 +10777,13 @@ int perf_event_overflow(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs)
{
+ /*
+ * Entry point from hardware PMI, interrupts should be disabled here.
+ * This serializes us against perf_event_remove_from_context() in
+ * things like perf_event_release_kernel().
+ */
+ lockdep_assert_irqs_disabled();
+
return __perf_event_overflow(event, 1, data, regs);
}
@@ -10852,6 +10860,19 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
{
struct hw_perf_event *hwc = &event->hw;
+ /*
+ * This is:
+ * - software preempt
+ * - tracepoint preempt
+ * - tp_target_task irq (ctx->lock)
+ * - uprobes preempt/irq
+ * - kprobes preempt/irq
+ * - hw_breakpoint irq
+ *
+ * Any of these are sufficient to hold off RCU and thus ensure @event
+ * exists.
+ */
+ lockdep_assert_preemption_disabled();
local64_add(nr, &event->count);
if (!regs)
@@ -10860,6 +10881,16 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
if (!is_sampling_event(event))
return;
+ /*
+ * Serialize against event_function_call() IPIs like normal overflow
+ * event handling. Specifically, must not allow
+ * perf_event_release_kernel() -> perf_remove_from_context() to make
+ * progress and 'release' the event from under us.
+ */
+ guard(irqsave)();
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return;
+
if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
data->period = nr;
return perf_swevent_overflow(event, 1, data, regs);
@@ -11358,6 +11389,11 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
struct perf_sample_data data;
struct perf_event *event;
+ /*
+ * Per being a tracepoint, this runs with preemption disabled.
+ */
+ lockdep_assert_preemption_disabled();
+
struct perf_raw_record raw = {
.frag = {
.size = entry_size,
@@ -11690,6 +11726,11 @@ void perf_bp_event(struct perf_event *bp, void *data)
struct perf_sample_data sample;
struct pt_regs *regs = data;
+ /*
+ * Exception context, will have interrupts disabled.
+ */
+ lockdep_assert_irqs_disabled();
+
perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
if (!bp->hw.state && !perf_exclude_event(bp, regs))
@@ -12154,7 +12195,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
if (regs && !perf_exclude_event(event, regs)) {
if (!(event->attr.exclude_idle && is_idle_task(current)))
- if (__perf_event_overflow(event, 1, &data, regs))
+ if (perf_event_overflow(event, &data, regs))
ret = HRTIMER_NORESTART;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index e832da9d15a4..65113a304518 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -3085,7 +3085,7 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
return 0;
/* don't need lock here; in the worst case we'll do useless copy */
- if (fs->users == 1)
+ if (!(unshare_flags & CLONE_NEWNS) && fs->users == 1)
return 0;
*new_fsp = copy_fs_struct(fs);
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index 79e655ea4ca1..ae758150ccb9 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -168,7 +168,7 @@ static bool __report_matches(const struct expect_report *r)
if (!report_available())
return false;
- expect = kmalloc_obj(observed.lines);
+ expect = (typeof(expect))kmalloc_obj(observed.lines);
if (WARN_ON(!expect))
return false;
diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c
index 8c79058253e1..5acee4174bf0 100644
--- a/kernel/liveupdate/luo_file.c
+++ b/kernel/liveupdate/luo_file.c
@@ -134,9 +134,12 @@ static LIST_HEAD(luo_file_handler_list);
* state that is not preserved. Set by the handler's .preserve()
* callback, and must be freed in the handler's .unpreserve()
* callback.
- * @retrieved: A flag indicating whether a user/kernel in the new kernel has
+ * @retrieve_status: Status code indicating whether a user/kernel in the new kernel has
* successfully called retrieve() on this file. This prevents
- * multiple retrieval attempts.
+ * multiple retrieval attempts. A value of 0 means a retrieve()
+ * has not been attempted, a positive value means the retrieve()
+ * was successful, and a negative value means the retrieve()
+ * failed, and the value is the error code of the call.
* @mutex: A mutex that protects the fields of this specific instance
* (e.g., @retrieved, @file), ensuring that operations like
* retrieving or finishing a file are atomic.
@@ -161,7 +164,7 @@ struct luo_file {
struct file *file;
u64 serialized_data;
void *private_data;
- bool retrieved;
+ int retrieve_status;
struct mutex mutex;
struct list_head list;
u64 token;
@@ -298,7 +301,6 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd)
luo_file->file = file;
luo_file->fh = fh;
luo_file->token = token;
- luo_file->retrieved = false;
mutex_init(&luo_file->mutex);
args.handler = fh;
@@ -577,7 +579,12 @@ int luo_retrieve_file(struct luo_file_set *file_set, u64 token,
return -ENOENT;
guard(mutex)(&luo_file->mutex);
- if (luo_file->retrieved) {
+ if (luo_file->retrieve_status < 0) {
+ /* Retrieve was attempted and it failed. Return the error code. */
+ return luo_file->retrieve_status;
+ }
+
+ if (luo_file->retrieve_status > 0) {
/*
* Someone is asking for this file again, so get a reference
* for them.
@@ -590,16 +597,19 @@ int luo_retrieve_file(struct luo_file_set *file_set, u64 token,
args.handler = luo_file->fh;
args.serialized_data = luo_file->serialized_data;
err = luo_file->fh->ops->retrieve(&args);
- if (!err) {
- luo_file->file = args.file;
-
- /* Get reference so we can keep this file in LUO until finish */
- get_file(luo_file->file);
- *filep = luo_file->file;
- luo_file->retrieved = true;
+ if (err) {
+ /* Keep the error code for later use. */
+ luo_file->retrieve_status = err;
+ return err;
}
- return err;
+ luo_file->file = args.file;
+ /* Get reference so we can keep this file in LUO until finish */
+ get_file(luo_file->file);
+ *filep = luo_file->file;
+ luo_file->retrieve_status = 1;
+
+ return 0;
}
static int luo_file_can_finish_one(struct luo_file_set *file_set,
@@ -615,7 +625,7 @@ static int luo_file_can_finish_one(struct luo_file_set *file_set,
args.handler = luo_file->fh;
args.file = luo_file->file;
args.serialized_data = luo_file->serialized_data;
- args.retrieved = luo_file->retrieved;
+ args.retrieve_status = luo_file->retrieve_status;
can_finish = luo_file->fh->ops->can_finish(&args);
}
@@ -632,7 +642,7 @@ static void luo_file_finish_one(struct luo_file_set *file_set,
args.handler = luo_file->fh;
args.file = luo_file->file;
args.serialized_data = luo_file->serialized_data;
- args.retrieved = luo_file->retrieved;
+ args.retrieve_status = luo_file->retrieve_status;
luo_file->fh->ops->finish(&args);
luo_flb_file_finish(luo_file->fh);
@@ -788,7 +798,6 @@ int luo_file_deserialize(struct luo_file_set *file_set,
luo_file->file = NULL;
luo_file->serialized_data = file_ser[i].data;
luo_file->token = file_ser[i].token;
- luo_file->retrieved = false;
mutex_init(&luo_file->mutex);
list_add_tail(&luo_file->list, &file_set->files_list);
}
diff --git a/kernel/rseq.c b/kernel/rseq.c
index b0973d19f366..38d3ef540760 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -80,6 +80,7 @@
#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <linux/types.h>
+#include <linux/rseq.h>
#include <asm/ptrace.h>
#define CREATE_TRACE_POINTS
@@ -449,13 +450,14 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
* auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq
* size, the required alignment is the original struct rseq alignment.
*
- * In order to be valid, rseq_len is either the original rseq size, or
- * large enough to contain all supported fields, as communicated to
+ * The rseq_len is required to be greater or equal to the original rseq
+ * size. In order to be valid, rseq_len is either the original rseq size,
+ * or large enough to contain all supported fields, as communicated to
* user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE.
*/
if (rseq_len < ORIG_RSEQ_SIZE ||
(rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) ||
- (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
+ (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, rseq_alloc_align()) ||
rseq_len < offsetof(struct rseq, end))))
return -EINVAL;
if (!access_ok(rseq, rseq_len))
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 759777694c78..b7f77c165a6e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6830,6 +6830,7 @@ static void __sched notrace __schedule(int sched_mode)
/* SCX must consult the BPF scheduler to tell if rq is empty */
if (!rq->nr_running && !scx_enabled()) {
next = prev;
+ rq->next_class = &idle_sched_class;
goto picked;
}
} else if (!preempt && prev_state) {
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index d5e688b9acc0..8d48a1385835 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -976,8 +976,12 @@ static bool scx_dsq_priq_less(struct rb_node *node_a,
static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
{
- /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */
- WRITE_ONCE(dsq->nr, dsq->nr + delta);
+ /*
+ * scx_bpf_dsq_nr_queued() reads ->nr without locking. Use READ_ONCE()
+ * on the read side and WRITE_ONCE() on the write side to properly
+ * annotate the concurrent lockless access and avoid KCSAN warnings.
+ */
+ WRITE_ONCE(dsq->nr, READ_ONCE(dsq->nr) + delta);
}
static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p)
@@ -1133,7 +1137,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq,
}
/* seq records the order tasks are queued, used by BPF DSQ iterator */
- dsq->seq++;
+ WRITE_ONCE(dsq->seq, dsq->seq + 1);
p->scx.dsq_seq = dsq->seq;
dsq_mod_nr(dsq, 1);
@@ -2542,7 +2546,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
/* see kick_cpus_irq_workfn() */
smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
- rq->next_class = &ext_sched_class;
+ rq_modified_begin(rq, &ext_sched_class);
rq_unpin_lock(rq, rf);
balance_one(rq, prev);
@@ -2557,7 +2561,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
* If @force_scx is true, always try to pick a SCHED_EXT task,
* regardless of any higher-priority sched classes activity.
*/
- if (!force_scx && sched_class_above(rq->next_class, &ext_sched_class))
+ if (!force_scx && rq_modified_above(rq, &ext_sched_class))
return RETRY_TASK;
keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
@@ -2817,7 +2821,7 @@ static bool check_rq_for_timeouts(struct rq *rq)
unsigned long last_runnable = p->scx.runnable_at;
if (unlikely(time_after(jiffies,
- last_runnable + scx_watchdog_timeout))) {
+ last_runnable + READ_ONCE(scx_watchdog_timeout)))) {
u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
@@ -2845,7 +2849,7 @@ static void scx_watchdog_workfn(struct work_struct *work)
cond_resched();
}
queue_delayed_work(system_unbound_wq, to_delayed_work(work),
- scx_watchdog_timeout / 2);
+ READ_ONCE(scx_watchdog_timeout) / 2);
}
void scx_tick(struct rq *rq)
@@ -3681,7 +3685,6 @@ static int scx_cgroup_init(struct scx_sched *sch)
ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL,
css->cgroup, &args);
if (ret) {
- css_put(css);
scx_error(sch, "ops.cgroup_init() failed (%d)", ret);
return ret;
}
@@ -3804,7 +3807,9 @@ static void scx_kobj_release(struct kobject *kobj)
static ssize_t scx_attr_ops_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
- return sysfs_emit(buf, "%s\n", scx_root->ops.name);
+ struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
+
+ return sysfs_emit(buf, "%s\n", sch->ops.name);
}
SCX_ATTR(ops);
@@ -3848,7 +3853,9 @@ static const struct kobj_type scx_ktype = {
static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
{
- return add_uevent_var(env, "SCXOPS=%s", scx_root->ops.name);
+ const struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
+
+ return add_uevent_var(env, "SCXOPS=%s", sch->ops.name);
}
static const struct kset_uevent_ops scx_uevent_ops = {
@@ -3997,8 +4004,8 @@ static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq,
* consider offloading iff the total queued duration is over the
* threshold.
*/
- min_delta_us = scx_bypass_lb_intv_us / SCX_BYPASS_LB_MIN_DELTA_DIV;
- if (delta < DIV_ROUND_UP(min_delta_us, scx_slice_bypass_us))
+ min_delta_us = READ_ONCE(scx_bypass_lb_intv_us) / SCX_BYPASS_LB_MIN_DELTA_DIV;
+ if (delta < DIV_ROUND_UP(min_delta_us, READ_ONCE(scx_slice_bypass_us)))
return 0;
raw_spin_rq_lock_irq(rq);
@@ -4226,7 +4233,7 @@ static void scx_bypass(bool bypass)
WARN_ON_ONCE(scx_bypass_depth <= 0);
if (scx_bypass_depth != 1)
goto unlock;
- WRITE_ONCE(scx_slice_dfl, scx_slice_bypass_us * NSEC_PER_USEC);
+ WRITE_ONCE(scx_slice_dfl, READ_ONCE(scx_slice_bypass_us) * NSEC_PER_USEC);
bypass_timestamp = ktime_get_ns();
if (sch)
scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
@@ -4519,10 +4526,19 @@ done:
scx_bypass(false);
}
+/*
+ * Claim the exit on @sch. The caller must ensure that the helper kthread work
+ * is kicked before the current task can be preempted. Once exit_kind is
+ * claimed, scx_error() can no longer trigger, so if the current task gets
+ * preempted and the BPF scheduler fails to schedule it back, the helper work
+ * will never be kicked and the whole system can wedge.
+ */
static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind)
{
int none = SCX_EXIT_NONE;
+ lockdep_assert_preemption_disabled();
+
if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind))
return false;
@@ -4545,6 +4561,7 @@ static void scx_disable(enum scx_exit_kind kind)
rcu_read_lock();
sch = rcu_dereference(scx_root);
if (sch) {
+ guard(preempt)();
scx_claim_exit(sch, kind);
kthread_queue_work(sch->helper, &sch->disable_work);
}
@@ -4867,6 +4884,8 @@ static bool scx_vexit(struct scx_sched *sch,
{
struct scx_exit_info *ei = sch->exit_info;
+ guard(preempt)();
+
if (!scx_claim_exit(sch, kind))
return false;
@@ -5051,20 +5070,30 @@ static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops)
return 0;
}
-static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
+/*
+ * scx_enable() is offloaded to a dedicated system-wide RT kthread to avoid
+ * starvation. During the READY -> ENABLED task switching loop, the calling
+ * thread's sched_class gets switched from fair to ext. As fair has higher
+ * priority than ext, the calling thread can be indefinitely starved under
+ * fair-class saturation, leading to a system hang.
+ */
+struct scx_enable_cmd {
+ struct kthread_work work;
+ struct sched_ext_ops *ops;
+ int ret;
+};
+
+static void scx_enable_workfn(struct kthread_work *work)
{
+ struct scx_enable_cmd *cmd =
+ container_of(work, struct scx_enable_cmd, work);
+ struct sched_ext_ops *ops = cmd->ops;
struct scx_sched *sch;
struct scx_task_iter sti;
struct task_struct *p;
unsigned long timeout;
int i, cpu, ret;
- if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
- cpu_possible_mask)) {
- pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
- return -EINVAL;
- }
-
mutex_lock(&scx_enable_mutex);
if (scx_enable_state() != SCX_DISABLED) {
@@ -5156,7 +5185,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
WRITE_ONCE(scx_watchdog_timeout, timeout);
WRITE_ONCE(scx_watchdog_timestamp, jiffies);
queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
- scx_watchdog_timeout / 2);
+ READ_ONCE(scx_watchdog_timeout) / 2);
/*
* Once __scx_enabled is set, %current can be switched to SCX anytime.
@@ -5281,13 +5310,15 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
atomic_long_inc(&scx_enable_seq);
- return 0;
+ cmd->ret = 0;
+ return;
err_free_ksyncs:
free_kick_syncs();
err_unlock:
mutex_unlock(&scx_enable_mutex);
- return ret;
+ cmd->ret = ret;
+ return;
err_disable_unlock_all:
scx_cgroup_unlock();
@@ -5306,7 +5337,41 @@ err_disable:
*/
scx_error(sch, "scx_enable() failed (%d)", ret);
kthread_flush_work(&sch->disable_work);
- return 0;
+ cmd->ret = 0;
+}
+
+static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
+{
+ static struct kthread_worker *helper;
+ static DEFINE_MUTEX(helper_mutex);
+ struct scx_enable_cmd cmd;
+
+ if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
+ cpu_possible_mask)) {
+ pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
+ return -EINVAL;
+ }
+
+ if (!READ_ONCE(helper)) {
+ mutex_lock(&helper_mutex);
+ if (!helper) {
+ helper = kthread_run_worker(0, "scx_enable_helper");
+ if (IS_ERR_OR_NULL(helper)) {
+ helper = NULL;
+ mutex_unlock(&helper_mutex);
+ return -ENOMEM;
+ }
+ sched_set_fifo(helper->task);
+ }
+ mutex_unlock(&helper_mutex);
+ }
+
+ kthread_init_work(&cmd.work, scx_enable_workfn);
+ cmd.ops = ops;
+
+ kthread_queue_work(READ_ONCE(helper), &cmd.work);
+ kthread_flush_work(&cmd.work);
+ return cmd.ret;
}
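(Illustrative sketch, not part of the patch: the offload-and-wait pattern scx_enable() now uses, with pthreads standing in for the dedicated kthread_worker. All names below are hypothetical.)

	#include <pthread.h>
	#include <stdio.h>

	struct enable_cmd {
		int ret;
	};

	/* The long-running enable sequence runs here, on a thread whose
	 * scheduling class is never switched out from under the caller. */
	static void *enable_workfn(void *arg)
	{
		struct enable_cmd *cmd = arg;

		cmd->ret = 0;	/* placeholder for the real enable work */
		return NULL;
	}

	static int do_enable(void)
	{
		struct enable_cmd cmd;
		pthread_t helper;

		if (pthread_create(&helper, NULL, enable_workfn, &cmd))
			return -1;
		/* Analogous to kthread_flush_work(): wait for completion,
		 * then collect the result from the command structure. */
		pthread_join(helper, NULL);
		return cmd.ret;
	}

	int main(void)
	{
		printf("enable -> %d\n", do_enable());
		return 0;
	}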
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index c5a3b0bac7c3..ba298ac3ce6c 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -663,9 +663,8 @@ void scx_idle_init_masks(void)
BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.smt, GFP_KERNEL));
- /* Allocate per-node idle cpumasks */
- scx_idle_node_masks = kzalloc_objs(*scx_idle_node_masks,
- num_possible_nodes());
+ /* Allocate per-node idle cpumasks (use nr_node_ids for non-contiguous NUMA nodes) */
+ scx_idle_node_masks = kzalloc_objs(*scx_idle_node_masks, nr_node_ids);
BUG_ON(!scx_idle_node_masks);
for_each_node(i) {
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index befa9a5d6e53..417d3c6f02fe 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -74,7 +74,7 @@ enum scx_exit_flags {
* info communication. The following flag indicates whether ops.init()
* finished successfully.
*/
- SCX_EFLAG_INITIALIZED,
+ SCX_EFLAG_INITIALIZED = 1LLU << 0,
};
/*
@@ -1042,26 +1042,108 @@ static const char *scx_enable_state_str[] = {
};
/*
- * sched_ext_entity->ops_state
+ * Task Ownership State Machine (sched_ext_entity->ops_state)
*
- * Used to track the task ownership between the SCX core and the BPF scheduler.
- * State transitions look as follows:
+ * The sched_ext core uses this state machine to track task ownership
+ * between the SCX core and the BPF scheduler. This allows the BPF
+ * scheduler to dispatch tasks without strict ordering requirements, while
+ * the SCX core safely rejects invalid dispatches.
*
- * NONE -> QUEUEING -> QUEUED -> DISPATCHING
- * ^ | |
- * | v v
- * \-------------------------------/
+ * State Transitions
*
- * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
- * sites for explanations on the conditions being waited upon and why they are
- * safe. Transitions out of them into NONE or QUEUED must store_release and the
- * waiters should load_acquire.
+ * .------------> NONE (owned by SCX core)
+ * | | ^
+ * | enqueue | | direct dispatch
+ * | v |
+ * | QUEUEING -------'
+ * | |
+ * | enqueue |
+ * | completes |
+ * | v
+ * | QUEUED (owned by BPF scheduler)
+ * | |
+ * | dispatch |
+ * | |
+ * | v
+ * | DISPATCHING
+ * | |
+ * | dispatch |
+ * | completes |
+ * `---------------'
*
- * Tracking scx_ops_state enables sched_ext core to reliably determine whether
- * any given task can be dispatched by the BPF scheduler at all times and thus
- * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
- * to try to dispatch any task anytime regardless of its state as the SCX core
- * can safely reject invalid dispatches.
+ * State Descriptions
+ *
+ * - %SCX_OPSS_NONE:
+ * Task is owned by the SCX core. It's either on a run queue, running,
+ * or being manipulated by the core scheduler. The BPF scheduler has no
+ * claim on this task.
+ *
+ * - %SCX_OPSS_QUEUEING:
+ * Transitional state while transferring a task from the SCX core to
+ * the BPF scheduler. The task's rq lock is held during this state.
+ * Since QUEUEING is both entered and exited under the rq lock, dequeue
+ * can never observe this state (it would be a BUG). When finishing a
+ * dispatch, if the task is still in %SCX_OPSS_QUEUEING the completion
+ * path busy-waits for it to leave this state (via wait_ops_state())
+ * before retrying.
+ *
+ * - %SCX_OPSS_QUEUED:
+ * Task is owned by the BPF scheduler. It's on a DSQ (dispatch queue)
+ * and the BPF scheduler is responsible for dispatching it. A QSEQ
+ * (queue sequence number) is embedded in this state to detect
+ * dispatch/dequeue races: if a task is dequeued and re-enqueued, the
+ * QSEQ changes and any in-flight dispatch operations targeting the old
+ * QSEQ are safely ignored.
+ *
+ * - %SCX_OPSS_DISPATCHING:
+ * Transitional state while transferring a task from the BPF scheduler
+ * back to the SCX core. This state indicates the BPF scheduler has
+ * selected the task for execution. When dequeue needs to take the task
+ * off a DSQ and it is still in %SCX_OPSS_DISPATCHING, the dequeue path
+ * busy-waits for it to leave this state (via wait_ops_state()) before
+ * proceeding. Exits to %SCX_OPSS_NONE when dispatch completes.
+ *
+ * Memory Ordering
+ *
+ * Transitions out of %SCX_OPSS_QUEUEING and %SCX_OPSS_DISPATCHING into
+ * %SCX_OPSS_NONE or %SCX_OPSS_QUEUED must use atomic_long_set_release()
+ * and waiters must use atomic_long_read_acquire(). This ensures proper
+ * synchronization between concurrent operations.
+ *
+ * Cross-CPU Task Migration
+ *
+ * When moving a task in the %SCX_OPSS_DISPATCHING state, we can't simply
+ * grab the target CPU's rq lock because a concurrent dequeue might be
+ * waiting on %SCX_OPSS_DISPATCHING while holding the source rq lock
+ * (deadlock).
+ *
+ * The sched_ext core uses a "lock dancing" protocol coordinated by
+ * p->scx.holding_cpu. When moving a task to a different rq:
+ *
+ * 1. Verify task can be moved (CPU affinity, migration_disabled, etc.)
+ * 2. Set p->scx.holding_cpu to the current CPU
+ * 3. Set task state to %SCX_OPSS_NONE; dequeue waits while DISPATCHING
+ * is set, so clearing DISPATCHING first prevents the circular wait
+ * (safe to lock the rq we need)
+ * 4. Unlock the current CPU's rq
+ * 5. Lock src_rq (where the task currently lives)
+ * 6. Verify p->scx.holding_cpu == current CPU; if not, dequeue won the
+ *    race (dequeue clears holding_cpu to -1 when it takes the task) and
+ *    the migration is aborted
+ * 7. If src_rq == dst_rq: clear holding_cpu and enqueue directly
+ * into dst_rq's local DSQ (no lock swap needed)
+ * 8. Otherwise: call move_remote_task_to_local_dsq(), which releases
+ * src_rq, locks dst_rq, and performs the deactivate/activate
+ * migration cycle (dst_rq is held on return)
+ * 9. Unlock dst_rq and re-lock the current CPU's rq to restore
+ * the lock state expected by the caller
+ *
+ * If any verification fails, abort the migration.
+ *
+ * This state tracking allows the BPF scheduler to try to dispatch any task
+ * at any time regardless of its state. The SCX core can safely
+ * reject/ignore invalid dispatches, simplifying the BPF scheduler
+ * implementation.
*/
enum scx_ops_state {
SCX_OPSS_NONE, /* owned by the SCX core */
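(Illustrative sketch, not part of the patch: a minimal userspace analogue of the single-winner state transition described in the comment above, with C11 atomics standing in for the kernel's atomic_long_try_cmpxchg(). States and names are made up.)

	#include <stdatomic.h>
	#include <stdio.h>

	enum { OPSS_NONE, OPSS_QUEUED, OPSS_DISPATCHING };

	static _Atomic long ops_state = OPSS_QUEUED;

	/* Only one dispatcher can win the QUEUED -> DISPATCHING transition;
	 * losers see the claim fail and back off, mirroring how the SCX core
	 * safely rejects stale or duplicate dispatches. */
	static int claim_for_dispatch(void)
	{
		long expected = OPSS_QUEUED;

		return atomic_compare_exchange_strong(&ops_state, &expected,
						      OPSS_DISPATCHING);
	}

	int main(void)
	{
		printf("first claim:  %d\n", claim_for_dispatch());
		printf("second claim: %d\n", claim_for_dispatch());
		return 0;
	}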
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index eea99ec01a3f..bf948db905ed 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -589,6 +589,21 @@ static inline bool entity_before(const struct sched_entity *a,
return vruntime_cmp(a->deadline, "<", b->deadline);
}
+/*
+ * Per avg_vruntime() below, cfs_rq::zero_vruntime is only slightly stale;
+ * the staleness should be no more than two lag bounds, which puts it in the
+ * general order of:
+ *
+ * (slice + TICK_NSEC) << NICE_0_LOAD_SHIFT
+ *
+ * which is around 44 bits in size (on 64bit); that is 20 for
+ * NICE_0_LOAD_SHIFT, another 20 for NSEC_PER_MSEC and then a handful for
+ * however many msec the actual slice+tick ends up being.
+ *
+ * (Disregarding the actual divide-by-weight part amounts to a worst-case
+ * weight of 2, which nicely cancels against the fuzz of zero_vruntime not
+ * actually being the zero-lag point.)
+ */
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
return vruntime_op(se->vruntime, "-", cfs_rq->zero_vruntime);
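(A quick back-of-the-envelope check of the ~44-bit bound quoted in the comment above; it assumes NICE_0_LOAD_SHIFT == 20 on 64-bit and a ~10 msec slice + tick, purely for illustration.)

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t slice_plus_tick_ns = 10ULL * 1000 * 1000; /* ~10 msec */
		uint64_t bound = slice_plus_tick_ns << 20;  /* NICE_0_LOAD_SHIFT */
		int bits = 0;

		/* Count how many bits the bound occupies. */
		for (uint64_t v = bound; v; v >>= 1)
			bits++;

		printf("bound = %llu (~%d bits)\n",
		       (unsigned long long)bound, bits);
		return 0;
	}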
@@ -676,41 +691,65 @@ sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
static inline
-void sum_w_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
+void update_zero_vruntime(struct cfs_rq *cfs_rq, s64 delta)
{
/*
- * v' = v + d ==> sum_w_vruntime' = sum_runtime - d*sum_weight
+ * v' = v + d ==> sum_w_vruntime' = sum_w_vruntime - d*sum_weight
*/
cfs_rq->sum_w_vruntime -= cfs_rq->sum_weight * delta;
+ cfs_rq->zero_vruntime += delta;
}
/*
- * Specifically: avg_runtime() + 0 must result in entity_eligible() := true
+ * Specifically: avg_vruntime() + 0 must result in entity_eligible() := true
* For this to be so, the result of this function must have a left bias.
+ *
+ * Called in:
+ * - place_entity() -- before enqueue
+ * - update_entity_lag() -- before dequeue
+ * - entity_tick()
+ *
+ * This means it is one entry 'behind', but that puts it close enough that
+ * the bound on entity_key() is at most two lag bounds.
*/
u64 avg_vruntime(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
- s64 avg = cfs_rq->sum_w_vruntime;
- long load = cfs_rq->sum_weight;
+ long weight = cfs_rq->sum_weight;
+ s64 delta = 0;
- if (curr && curr->on_rq) {
- unsigned long weight = scale_load_down(curr->load.weight);
+ if (curr && !curr->on_rq)
+ curr = NULL;
- avg += entity_key(cfs_rq, curr) * weight;
- load += weight;
- }
+ if (weight) {
+ s64 runtime = cfs_rq->sum_w_vruntime;
+
+ if (curr) {
+ unsigned long w = scale_load_down(curr->load.weight);
+
+ runtime += entity_key(cfs_rq, curr) * w;
+ weight += w;
+ }
- if (load) {
/* sign flips effective floor / ceiling */
- if (avg < 0)
- avg -= (load - 1);
- avg = div_s64(avg, load);
+ if (runtime < 0)
+ runtime -= (weight - 1);
+
+ delta = div_s64(runtime, weight);
+ } else if (curr) {
+ /*
+ * When there is but one element, it is the average.
+ */
+ delta = curr->vruntime - cfs_rq->zero_vruntime;
}
- return cfs_rq->zero_vruntime + avg;
+ update_zero_vruntime(cfs_rq, delta);
+
+ return cfs_rq->zero_vruntime;
}
+static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq);
+
/*
* lag_i = S - s_i = w_i * (V - v_i)
*
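(A tiny numeric sketch of the floor-biased division avg_vruntime() above uses when pulling zero_vruntime along; the numbers are hypothetical.)

	#include <stdio.h>
	#include <stdint.h>

	/* C division truncates towards zero, so negative sums are biased
	 * first ("sign flips effective floor / ceiling"). */
	static int64_t div_floor(int64_t runtime, int64_t weight)
	{
		if (runtime < 0)
			runtime -= (weight - 1);
		return runtime / weight;
	}

	int main(void)
	{
		int64_t sum_w_vruntime = -3500;	/* sum of w_i * (v_i - zero) */
		int64_t sum_weight     = 1000;
		int64_t zero_vruntime  = 100000;

		zero_vruntime += div_floor(sum_w_vruntime, sum_weight);
		printf("new zero_vruntime = %lld\n", (long long)zero_vruntime);
		return 0;
	}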
@@ -724,17 +763,16 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
* EEVDF gives the following limit for a steady state system:
*
* -r_max < lag < max(r_max, q)
- *
- * XXX could add max_slice to the augmented data to track this.
*/
static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ u64 max_slice = cfs_rq_max_slice(cfs_rq) + TICK_NSEC;
s64 vlag, limit;
WARN_ON_ONCE(!se->on_rq);
vlag = avg_vruntime(cfs_rq) - se->vruntime;
- limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
+ limit = calc_delta_fair(max_slice, se);
se->vlag = clamp(vlag, -limit, limit);
}
@@ -777,16 +815,6 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
return vruntime_eligible(cfs_rq, se->vruntime);
}
-static void update_zero_vruntime(struct cfs_rq *cfs_rq)
-{
- u64 vruntime = avg_vruntime(cfs_rq);
- s64 delta = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime);
-
- sum_w_vruntime_update(cfs_rq, delta);
-
- cfs_rq->zero_vruntime = vruntime;
-}
-
static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
{
struct sched_entity *root = __pick_root_entity(cfs_rq);
@@ -802,6 +830,21 @@ static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
return min_slice;
}
+static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *root = __pick_root_entity(cfs_rq);
+ struct sched_entity *curr = cfs_rq->curr;
+ u64 max_slice = 0ULL;
+
+ if (curr && curr->on_rq)
+ max_slice = curr->slice;
+
+ if (root)
+ max_slice = max(max_slice, root->max_slice);
+
+ return max_slice;
+}
+
static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
{
return entity_before(__node_2_se(a), __node_2_se(b));
@@ -826,6 +869,15 @@ static inline void __min_slice_update(struct sched_entity *se, struct rb_node *n
}
}
+static inline void __max_slice_update(struct sched_entity *se, struct rb_node *node)
+{
+ if (node) {
+ struct sched_entity *rse = __node_2_se(node);
+ if (rse->max_slice > se->max_slice)
+ se->max_slice = rse->max_slice;
+ }
+}
+
/*
* se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime)
*/
@@ -833,6 +885,7 @@ static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
{
u64 old_min_vruntime = se->min_vruntime;
u64 old_min_slice = se->min_slice;
+ u64 old_max_slice = se->max_slice;
struct rb_node *node = &se->run_node;
se->min_vruntime = se->vruntime;
@@ -843,8 +896,13 @@ static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
__min_slice_update(se, node->rb_right);
__min_slice_update(se, node->rb_left);
+ se->max_slice = se->slice;
+ __max_slice_update(se, node->rb_right);
+ __max_slice_update(se, node->rb_left);
+
return se->min_vruntime == old_min_vruntime &&
- se->min_slice == old_min_slice;
+ se->min_slice == old_min_slice &&
+ se->max_slice == old_max_slice;
}
RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
@@ -856,7 +914,6 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
sum_w_vruntime_add(cfs_rq, se);
- update_zero_vruntime(cfs_rq);
se->min_vruntime = se->vruntime;
se->min_slice = se->slice;
rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
@@ -868,7 +925,6 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
&min_vruntime_cb);
sum_w_vruntime_sub(cfs_rq, se);
- update_zero_vruntime(cfs_rq);
}
struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
@@ -3790,6 +3846,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
bool curr = cfs_rq->curr == se;
+ bool rel_vprot = false;
+ u64 vprot;
if (se->on_rq) {
/* commit outstanding execution time */
@@ -3797,6 +3855,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
update_entity_lag(cfs_rq, se);
se->deadline -= se->vruntime;
se->rel_deadline = 1;
+ if (curr && protect_slice(se)) {
+ vprot = se->vprot - se->vruntime;
+ rel_vprot = true;
+ }
+
cfs_rq->nr_queued--;
if (!curr)
__dequeue_entity(cfs_rq, se);
@@ -3812,6 +3875,9 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
if (se->rel_deadline)
se->deadline = div_s64(se->deadline * se->load.weight, weight);
+ if (rel_vprot)
+ vprot = div_s64(vprot * se->load.weight, weight);
+
update_load_set(&se->load, weight);
do {
@@ -3823,6 +3889,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
enqueue_load_avg(cfs_rq, se);
if (se->on_rq) {
place_entity(cfs_rq, se, 0);
+ if (rel_vprot)
+ se->vprot = se->vruntime + vprot;
update_load_add(&cfs_rq->load, se->load.weight);
if (!curr)
__enqueue_entity(cfs_rq, se);
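(A quick numeric illustration of the relative-vprot rescaling in the hunks above: the vruntime-relative span is scaled by old_weight/new_weight, the same way the relative deadline is. Numbers are made up.)

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		int64_t  vprot_rel  = 6000000;	/* protected v-span, old weight */
		uint64_t old_weight = 1024;	/* weight before reweight */
		uint64_t new_weight = 2048;	/* weight after reweight */

		/* Heavier weight -> virtual time advances more slowly ->
		 * the same wall-clock protection covers a shorter v-span. */
		int64_t rescaled = vprot_rel * (int64_t)old_weight /
				   (int64_t)new_weight;

		printf("%lld -> %lld\n",
		       (long long)vprot_rel, (long long)rescaled);
		return 0;
	}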
@@ -5420,7 +5488,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
}
static void
-set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, bool first)
{
clear_buddies(cfs_rq, se);
@@ -5435,7 +5503,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
__dequeue_entity(cfs_rq, se);
update_load_avg(cfs_rq, se, UPDATE_TG);
- set_protect_slice(cfs_rq, se);
+ if (first)
+ set_protect_slice(cfs_rq, se);
}
update_stats_curr_start(cfs_rq, se);
@@ -5524,6 +5593,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
update_load_avg(cfs_rq, curr, UPDATE_TG);
update_cfs_group(curr);
+ /*
+ * Pulls along cfs_rq::zero_vruntime.
+ */
+ avg_vruntime(cfs_rq);
+
#ifdef CONFIG_SCHED_HRTICK
/*
* queued ticks are scheduled to match the slice, so don't bother
@@ -8948,13 +9022,13 @@ again:
pse = parent_entity(pse);
}
if (se_depth >= pse_depth) {
- set_next_entity(cfs_rq_of(se), se);
+ set_next_entity(cfs_rq_of(se), se, true);
se = parent_entity(se);
}
}
put_prev_entity(cfs_rq, pse);
- set_next_entity(cfs_rq, se);
+ set_next_entity(cfs_rq, se, true);
__set_next_task_fair(rq, p, true);
}
@@ -12908,7 +12982,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
t0 = sched_clock_cpu(this_cpu);
__sched_balance_update_blocked_averages(this_rq);
- this_rq->next_class = &fair_sched_class;
+ rq_modified_begin(this_rq, &fair_sched_class);
raw_spin_rq_unlock(this_rq);
for_each_domain(this_cpu, sd) {
@@ -12975,7 +13049,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
pulled_task = 1;
/* If a higher prio class was modified, restart the pick */
- if (sched_class_above(this_rq->next_class, &fair_sched_class))
+ if (rq_modified_above(this_rq, &fair_sched_class))
pulled_task = -1;
out:
@@ -13568,7 +13642,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- set_next_entity(cfs_rq, se);
+ set_next_entity(cfs_rq, se, first);
/* ensure bandwidth has been allocated on our new cfs_rq */
account_cfs_rq_runtime(cfs_rq, 0);
}
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 3b725d39c06e..ef152d401fe2 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -123,8 +123,6 @@ int housekeeping_update(struct cpumask *isol_mask)
struct cpumask *trial, *old = NULL;
int err;
- lockdep_assert_cpus_held();
-
trial = kmalloc(cpumask_size(), GFP_KERNEL);
if (!trial)
return -ENOMEM;
@@ -136,7 +134,7 @@ int housekeeping_update(struct cpumask *isol_mask)
}
if (!housekeeping.flags)
- static_branch_enable_cpuslocked(&housekeeping_overridden);
+ static_branch_enable(&housekeeping_overridden);
if (housekeeping.flags & HK_FLAG_DOMAIN)
old = housekeeping_cpumask_dereference(HK_TYPE_DOMAIN);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b82fb70a9d54..43bbf0693cca 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2748,6 +2748,17 @@ static inline const struct sched_class *next_active_class(const struct sched_cla
#define sched_class_above(_a, _b) ((_a) < (_b))
+static inline void rq_modified_begin(struct rq *rq, const struct sched_class *class)
+{
+ if (sched_class_above(rq->next_class, class))
+ rq->next_class = class;
+}
+
+static inline bool rq_modified_above(struct rq *rq, const struct sched_class *class)
+{
+ return sched_class_above(rq->next_class, class);
+}
+
static inline bool sched_stop_runnable(struct rq *rq)
{
return rq->stop && task_on_rq_queued(rq->stop);
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 0ba8e3c50d62..36fd2313ae7e 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -365,20 +365,16 @@ SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp)
}
#endif
+#if HZ > MSEC_PER_SEC || (MSEC_PER_SEC % HZ)
/**
* jiffies_to_msecs - Convert jiffies to milliseconds
* @j: jiffies value
*
- * Avoid unnecessary multiplications/divisions in the
- * two most common HZ cases.
- *
* Return: milliseconds value
*/
unsigned int jiffies_to_msecs(const unsigned long j)
{
-#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
- return (MSEC_PER_SEC / HZ) * j;
-#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
+#if HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
#else
# if BITS_PER_LONG == 32
@@ -390,7 +386,9 @@ unsigned int jiffies_to_msecs(const unsigned long j)
#endif
}
EXPORT_SYMBOL(jiffies_to_msecs);
+#endif
+#if (USEC_PER_SEC % HZ)
/**
* jiffies_to_usecs - Convert jiffies to microseconds
* @j: jiffies value
@@ -405,17 +403,14 @@ unsigned int jiffies_to_usecs(const unsigned long j)
*/
BUILD_BUG_ON(HZ > USEC_PER_SEC);
-#if !(USEC_PER_SEC % HZ)
- return (USEC_PER_SEC / HZ) * j;
-#else
-# if BITS_PER_LONG == 32
+#if BITS_PER_LONG == 32
return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
-# else
+#else
return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN;
-# endif
#endif
}
EXPORT_SYMBOL(jiffies_to_usecs);
+#endif
/**
* mktime64 - Converts date to seconds.
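(For reference, a hedged sketch of the jiffies -> msec arithmetic involved here: the exact-divisor multiply that the new #if compiles out of this file, and the round-up divide kept for HZ > MSEC_PER_SEC. The HZ values below are examples only and assume they divide evenly.)

	#include <stdio.h>

	#define MSEC_PER_SEC 1000u

	/* HZ <= 1000 and divides it evenly: plain multiply. */
	static unsigned int j_to_ms_low_hz(unsigned long j, unsigned int hz)
	{
		return (MSEC_PER_SEC / hz) * j;
	}

	/* HZ > 1000 and a multiple of it: divide, rounding up. */
	static unsigned int j_to_ms_high_hz(unsigned long j, unsigned int hz)
	{
		return (j + (hz / MSEC_PER_SEC) - 1) / (hz / MSEC_PER_SEC);
	}

	int main(void)
	{
		printf("%u\n", j_to_ms_low_hz(5, 100));		/* 50 ms */
		printf("%u\n", j_to_ms_high_hz(5, 2000));	/* 3 ms  */
		return 0;
	}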
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index c1ed0d5e8de6..155eeaea4113 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -1559,8 +1559,6 @@ int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask)
cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL;
int cpu;
- lockdep_assert_cpus_held();
-
if (!works)
return -ENOMEM;
if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
@@ -1570,6 +1568,7 @@ int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask)
* First set previously isolated CPUs as available (unisolate).
* This cpumask contains only CPUs that switched to available now.
*/
+ guard(cpus_read_lock)();
cpumask_andnot(cpumask, cpu_online_mask, exclude_cpumask);
cpumask_andnot(cpumask, cpumask, tmigr_available_cpumask);
@@ -1626,7 +1625,6 @@ static int __init tmigr_init_isolation(void)
cpumask_andnot(cpumask, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN));
/* Protect against RCU torture hotplug testing */
- guard(cpus_read_lock)();
return tmigr_isolated_exclude_cpumask(cpumask);
}
late_initcall(tmigr_init_isolation);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 9bc0dfd235af..0b040a417442 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2454,8 +2454,10 @@ static void bpf_kprobe_multi_show_fdinfo(const struct bpf_link *link,
struct seq_file *seq)
{
struct bpf_kprobe_multi_link *kmulti_link;
+ bool has_cookies;
kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
+ has_cookies = !!kmulti_link->cookies;
seq_printf(seq,
"kprobe_cnt:\t%u\n"
@@ -2467,7 +2469,7 @@ static void bpf_kprobe_multi_show_fdinfo(const struct bpf_link *link,
for (int i = 0; i < kmulti_link->cnt; i++) {
seq_printf(seq,
"%llu\t %pS\n",
- kmulti_link->cookies[i],
+ has_cookies ? kmulti_link->cookies[i] : 0,
(void *)kmulti_link->addrs[i]);
}
}