From 57df858a46f0a4cc104716e0ec88864e5c386ca4 Mon Sep 17 00:00:00 2001 From: Jassi Brar Date: Mon, 9 Feb 2026 17:36:19 -0600 Subject: mailbox: add API to query available TX queue slots Clients sometimes need to know whether the mailbox TX queue has room before posting a new message. Rather than exposing internal queue state through a struct field, provide a proper accessor function that returns the number of available slots for a given channel. This lets clients choose to back off when the queue is full instead of hitting the -ENOBUFS error path and the misleading "Try increasing MBOX_TX_QUEUE_LEN" warning. Tested-by: Tanmay Shah Signed-off-by: Jassi Brar --- include/linux/mailbox_client.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mailbox_client.h b/include/linux/mailbox_client.h index c6eea9afb943..e5997120f45c 100644 --- a/include/linux/mailbox_client.h +++ b/include/linux/mailbox_client.h @@ -45,6 +45,7 @@ int mbox_send_message(struct mbox_chan *chan, void *mssg); int mbox_flush(struct mbox_chan *chan, unsigned long timeout); void mbox_client_txdone(struct mbox_chan *chan, int r); /* atomic */ bool mbox_client_peek_data(struct mbox_chan *chan); /* atomic */ +unsigned int mbox_chan_tx_slots_available(struct mbox_chan *chan); /* atomic */ void mbox_free_channel(struct mbox_chan *chan); /* may sleep */ #endif /* __MAILBOX_CLIENT_H */ -- cgit v1.2.3 From 89e5d7d616009e5fada5da081b1d79cdd59150ab Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 27 Mar 2026 16:10:21 +0100 Subject: mailbox: remove superfluous internal header Quite some controller drivers use the defines from the internal header already. This prevents controller drivers outside the mailbox directory. Move the defines to the public controller header to allow this again as the defines are not strictly internal anyhow. Signed-off-by: Wolfram Sang Reviewed-by: Sudeep Holla Reviewed-by: Daniel Baluta Signed-off-by: Jassi Brar --- include/linux/mailbox_controller.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mailbox_controller.h b/include/linux/mailbox_controller.h index 80a427c7ca29..16fef421c30c 100644 --- a/include/linux/mailbox_controller.h +++ b/include/linux/mailbox_controller.h @@ -3,6 +3,7 @@ #ifndef __MAILBOX_CONTROLLER_H #define __MAILBOX_CONTROLLER_H +#include #include #include #include @@ -11,6 +12,10 @@ struct mbox_chan; +#define TXDONE_BY_IRQ BIT(0) /* controller has remote RTR irq */ +#define TXDONE_BY_POLL BIT(1) /* controller can read status of last TX */ +#define TXDONE_BY_ACK BIT(2) /* S/W ACK received by Client ticks the TX */ + /** * struct mbox_chan_ops - methods to control mailbox channels * @send_data: The API asks the MBOX controller driver, in atomic -- cgit v1.2.3 From c58e9456e30c7098cbcd9f04571992be8a2e4e63 Mon Sep 17 00:00:00 2001 From: Jassi Brar Date: Fri, 27 Mar 2026 17:00:40 -0500 Subject: mailbox: Fix NULL message support in mbox_send_message() The active_req field serves double duty as both the "is a TX in flight" flag (NULL means idle) and the storage for the in-flight message pointer. When a client sends NULL via mbox_send_message(), active_req is set to NULL, which the framework misinterprets as "no active request". This breaks the TX state machine by: - tx_tick() short-circuits on (!mssg), skipping the tx_done callback and the tx_complete completion - txdone_hrtimer() skips the channel entirely since active_req is NULL, so poll-based TX-done detection never fires. Fix this by introducing a MBOX_NO_MSG sentinel value that means "no active request," freeing NULL to be valid message data. The sentinel is defined in the subsystem-internal mailbox.h so that controller drivers within drivers/mailbox/ can reference it, but it is not exposed to clients outside the subsystem. Fifteen in-tree callers send NULL (doorbell-style IPCs on Qualcomm, Tegra, TI, Xilinx, i.MX, SCMI, and PCC platforms). All were audited for regression: - Most already work around the bug via knows_txdone=true with a manual mbox_client_txdone() call, making the framework's tracking irrelevant. These are unaffected. - Poll-based callers (Xilinx zynqmp/r5) are strictly better off: the poll timer now correctly detects NULL-active channels instead of silently skipping them. - irq-qcom-mpm.c was a pre-existing bug -- the only Qualcomm caller that omitted the knows_txdone + mbox_client_txdone() pattern. Fixed in a companion commit ("irqchip/qcom-mpm: Fix missing mailbox TX done acknowledgment"). - No caller sets both a tx_done callback and sends NULL, nor combines tx_block=true with NULL sends, so the newly reachable callback/completion paths are never exercised. Also update tegra-hsp's flush callback, which directly inspects active_req to wait for the channel to drain: the old "!= NULL" check becomes "!= MBOX_NO_MSG", otherwise flush spins until timeout since the sentinel is non-NULL. The only tradeoff is that 'MBOX_NO_MSG' can not be used as a message by clients. Reported-by: Joonwon Kang Reviewed-by: Douglas Anderson Signed-off-by: Jassi Brar --- include/linux/mailbox_controller.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mailbox_controller.h b/include/linux/mailbox_controller.h index 16fef421c30c..e3896b08f22e 100644 --- a/include/linux/mailbox_controller.h +++ b/include/linux/mailbox_controller.h @@ -12,6 +12,9 @@ struct mbox_chan; +/* Sentinel value distinguishing "no active request" from "NULL message data" */ +#define MBOX_NO_MSG ((void *)-1) + #define TXDONE_BY_IRQ BIT(0) /* controller has remote RTR irq */ #define TXDONE_BY_POLL BIT(1) /* controller can read status of last TX */ #define TXDONE_BY_ACK BIT(2) /* S/W ACK received by Client ticks the TX */ -- cgit v1.2.3 From 0bd75b7abafb3ed199df830c539c57ef9b62c2a2 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 10 Apr 2026 14:49:12 +0200 Subject: mailbox: prefix new constants with MBOX_ Commit 89e5d7d61600 ("mailbox: remove superfluous internal header") moved some constants to a public header but forgot to add a mailbox specific prefix. Add this now to prevent future collisions on a too generic naming. Link: https://sashiko.dev/#/patchset/20260327151112.5202-2-wsa%2Brenesas%40sang-engineering.com Signed-off-by: Wolfram Sang Reviewed-by: Sudeep Holla Signed-off-by: Jassi Brar --- include/linux/mailbox_controller.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mailbox_controller.h b/include/linux/mailbox_controller.h index e3896b08f22e..a49ee687d4cf 100644 --- a/include/linux/mailbox_controller.h +++ b/include/linux/mailbox_controller.h @@ -15,9 +15,9 @@ struct mbox_chan; /* Sentinel value distinguishing "no active request" from "NULL message data" */ #define MBOX_NO_MSG ((void *)-1) -#define TXDONE_BY_IRQ BIT(0) /* controller has remote RTR irq */ -#define TXDONE_BY_POLL BIT(1) /* controller can read status of last TX */ -#define TXDONE_BY_ACK BIT(2) /* S/W ACK received by Client ticks the TX */ +#define MBOX_TXDONE_BY_IRQ BIT(0) /* controller has remote RTR irq */ +#define MBOX_TXDONE_BY_POLL BIT(1) /* controller can read status of last TX */ +#define MBOX_TXDONE_BY_ACK BIT(2) /* S/W ACK received by Client ticks the TX */ /** * struct mbox_chan_ops - methods to control mailbox channels -- cgit v1.2.3 From 7746e3bd4cc19b5092e00d32d676e329bfcb6900 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 10 Apr 2026 16:49:47 +0200 Subject: fanotify: fix false positive on permission events fsnotify_get_mark_safe() may return false for a mark on an unrelated group, which results in bypassing the permission check. Fix by skipping over detached marks that are not in the current group. CC: stable@vger.kernel.org Fixes: abc77577a669 ("fsnotify: Provide framework for dropping SRCU lock in ->handle_event") Signed-off-by: Miklos Szeredi Link: https://patch.msgid.link/20260410144950.156160-1-mszeredi@redhat.com Signed-off-by: Jan Kara --- include/linux/fsnotify_backend.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 95985400d3d8..e5cde39d6e85 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -915,6 +915,7 @@ extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group, unsigned int obj_type); extern void fsnotify_get_mark(struct fsnotify_mark *mark); extern void fsnotify_put_mark(struct fsnotify_mark *mark); +struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark); extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info); extern bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info); -- cgit v1.2.3 From a068c4d42c035c63b26ff91c394e6dc2cb7dc5d0 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 13 Apr 2026 12:42:39 +0200 Subject: mailbox: update kdoc for struct mbox_controller Add field for missing lock around the hrtimer. Add 'Required' where the core checks for valid entries. Signed-off-by: Wolfram Sang Reviewed-by: Geert Uytterhoeven Signed-off-by: Jassi Brar --- include/linux/mailbox_controller.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mailbox_controller.h b/include/linux/mailbox_controller.h index a49ee687d4cf..dc93287a2a01 100644 --- a/include/linux/mailbox_controller.h +++ b/include/linux/mailbox_controller.h @@ -62,10 +62,10 @@ struct mbox_chan_ops { /** * struct mbox_controller - Controller of a class of communication channels - * @dev: Device backing this controller - * @ops: Operators that work on each communication chan - * @chans: Array of channels - * @num_chans: Number of channels in the 'chans' array. + * @dev: Device backing this controller. Required. + * @ops: Operators that work on each communication chan. Required. + * @chans: Array of channels. Required. + * @num_chans: Number of channels in the 'chans' array. Required. * @txdone_irq: Indicates if the controller can report to API when * the last transmitted data was read by the remote. * Eg, if it has some TX ACK irq. @@ -78,6 +78,7 @@ struct mbox_chan_ops { * @of_xlate: Controller driver specific mapping of channel via DT * @poll_hrt: API private. hrtimer used to poll for TXDONE on all * channels. + * @poll_hrt_lock: API private. Lock protecting access to poll_hrt. * @node: API private. To hook into list of controllers. */ struct mbox_controller { -- cgit v1.2.3 From 73bd1227787bfe73eea3d04c63a89cb55db9c23e Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 18 Apr 2026 09:41:21 +0800 Subject: rhashtable: Restore insecure_elasticity toggle Some users of rhashtable cannot handle insertion failures, and are happy to accept the consequences of a hash table that having very long chains. Restore the insecure_elasticity toggle for these users. In addition to disabling the chain length checks, this also removes the emergency resize that would otherwise occur when the hash table occupancy hits 100% (an async resize is still scheduled at 75%). Signed-off-by: Herbert Xu Signed-off-by: Tejun Heo --- include/linux/rhashtable-types.h | 2 ++ include/linux/rhashtable.h | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h index 015c8298bebc..72082428d6c6 100644 --- a/include/linux/rhashtable-types.h +++ b/include/linux/rhashtable-types.h @@ -49,6 +49,7 @@ typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg, * @head_offset: Offset of rhash_head in struct to be hashed * @max_size: Maximum size while expanding * @min_size: Minimum size while shrinking + * @insecure_elasticity: Set to true to disable chain length checks * @automatic_shrinking: Enable automatic shrinking of tables * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash) * @obj_hashfn: Function to hash object @@ -61,6 +62,7 @@ struct rhashtable_params { u16 head_offset; unsigned int max_size; u16 min_size; + bool insecure_elasticity; bool automatic_shrinking; rht_hashfn_t hashfn; rht_obj_hashfn_t obj_hashfn; diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 0480509a6339..7def3f0f556b 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -821,14 +821,15 @@ slow_path: goto out; } - if (elasticity <= 0) + if (elasticity <= 0 && !params.insecure_elasticity) goto slow_path; data = ERR_PTR(-E2BIG); if (unlikely(rht_grow_above_max(ht, tbl))) goto out_unlock; - if (unlikely(rht_grow_above_100(ht, tbl))) + if (unlikely(rht_grow_above_100(ht, tbl)) && + !params.insecure_elasticity) goto slow_path; /* Inserting at head of list makes unlocking free. */ -- cgit v1.2.3 From 4fe985292709eeb6a4653c71660f893e26c2f2dd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 20 Apr 2026 20:03:26 -1000 Subject: rhashtable: Bounce deferred worker kick through irq_work Inserts past 75% load call schedule_work(&ht->run_work) to kick an async resize. If a caller holds a raw spinlock (e.g. an insecure_elasticity user), schedule_work() under that lock records caller_lock -> pool->lock -> pi_lock -> rq->__lock A cycle forms if any of these locks is acquired in the reverse direction elsewhere. sched_ext, the only current insecure_elasticity user, hits this: it holds scx_sched_lock across rhashtable inserts of sub-schedulers, while scx_bypass() takes rq->__lock -> scx_sched_lock. Exercising the resize path produces: Chain exists of: &pool->lock --> &rq->__lock --> scx_sched_lock Bounce the kick from the insert paths through irq_work so schedule_work() runs from hard IRQ context with the caller's lock no longer held. rht_deferred_worker()'s self-rearm on error stays on schedule_work(&ht->run_work) - the worker runs in process context with no caller lock held, and keeping the self-requeue on @run_work lets cancel_work_sync() in rhashtable_free_and_destroy() drain it. v3: Keep rht_deferred_worker()'s self-rearm on schedule_work(&run_work). Routing it through irq_work in v2 broke cancel_work_sync()'s self-requeue handling - an irq_work queued after irq_work_sync() returned but while cancel_work_sync() was still waiting could fire post-teardown. v2: Bounce unconditionally instead of gating on insecure_elasticity, as suggested by Herbert. Signed-off-by: Tejun Heo Acked-by: Herbert Xu --- include/linux/rhashtable-types.h | 3 +++ include/linux/rhashtable.h | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h index 72082428d6c6..fc2f596a6df1 100644 --- a/include/linux/rhashtable-types.h +++ b/include/linux/rhashtable-types.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -77,6 +78,7 @@ struct rhashtable_params { * @p: Configuration parameters * @rhlist: True if this is an rhltable * @run_work: Deferred worker to expand/shrink asynchronously + * @run_irq_work: Bounces the @run_work kick through hard IRQ context. * @mutex: Mutex to protect current/future table swapping * @lock: Spin lock to protect walker list * @nelems: Number of elements in table @@ -88,6 +90,7 @@ struct rhashtable { struct rhashtable_params p; bool rhlist; struct work_struct run_work; + struct irq_work run_irq_work; struct mutex mutex; spinlock_t lock; atomic_t nelems; diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 7def3f0f556b..ef5230cece36 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -847,7 +848,7 @@ slow_path: rht_assign_unlock(tbl, bkt, obj, flags); if (rht_grow_above_75(ht, tbl)) - schedule_work(&ht->run_work); + irq_work_queue(&ht->run_irq_work); data = NULL; out: -- cgit v1.2.3 From f902877b635551513729bdf9a8d1422c4aab7741 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 15 Apr 2026 17:56:02 +0200 Subject: rculist: add list_splice_rcu() for private lists This patch adds a helper function, list_splice_rcu(), to safely splice a private (non-RCU-protected) list into an RCU-protected list. The function ensures that only the pointer visible to RCU readers (prev->next) is updated using rcu_assign_pointer(), while the rest of the list manipulations are performed with regular assignments, as the source list is private and not visible to concurrent RCU readers. This is useful for moving elements from a private list into a global RCU-protected list, ensuring safe publication for RCU readers. Subsystems with some sort of batching mechanism from userspace can benefit from this new function. The function __list_splice_rcu() has been added for clarity and to follow the same pattern as in the existing list_splice*() interfaces, where there is a check to ensure that the list to splice is not empty. Note that __list_splice_rcu() has no documentation for this reason. Reviewed-by: Paul E. McKenney Signed-off-by: Pablo Neira Ayuso --- include/linux/rculist.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 2abba7552605..e3bc44225692 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -261,6 +261,35 @@ static inline void list_replace_rcu(struct list_head *old, old->prev = LIST_POISON2; } +static inline void __list_splice_rcu(struct list_head *list, + struct list_head *prev, + struct list_head *next) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + + last->next = next; + first->prev = prev; + next->prev = last; + rcu_assign_pointer(list_next_rcu(prev), first); +} + +/** + * list_splice_rcu - splice a non-RCU list into an RCU-protected list, + * designed for stacks. + * @list: the non RCU-protected list to splice + * @head: the place in the existing RCU-protected list to splice + * + * The list pointed to by @head can be RCU-read traversed concurrently with + * this function. + */ +static inline void list_splice_rcu(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) + __list_splice_rcu(list, head, head->next); +} + /** * __list_splice_init_rcu - join an RCU-protected list into an existing list. * @list: the RCU-protected list to splice -- cgit v1.2.3 From 54377fcab51f6f1f8807827d3751be42279e1a6a Mon Sep 17 00:00:00 2001 From: KaFai Wan Date: Tue, 21 Apr 2026 23:58:02 +0800 Subject: bpf: Reject TCP_NODELAY in bpf-tcp-cc A BPF TCP congestion control program can call bpf_setsockopt() from its callbacks. In current kernels, if it calls bpf_setsockopt(TCP_NODELAY) from cwnd_event_tx_start(), the call can re-enter the TCP transmit path before the outer tcp_transmit_skb() has completed and advanced the send head. This can re-trigger CA_EVENT_TX_START and lead to unbounded recursion: tcp_transmit_skb() -> tcp_event_data_sent() -> tcp_ca_event(sk, CA_EVENT_TX_START) -> cwnd_event_tx_start() -> bpf_setsockopt(TCP_NODELAY) -> tcp_push_pending_frames() -> tcp_write_xmit() -> tcp_transmit_skb() This leads to unbounded recursion and can overflow the kernel stack. Reject TCP_NODELAY with -EOPNOTSUPP for bpf-tcp-cc by introducing a dedicated setsockopt proto for BPF_PROG_TYPE_STRUCT_OPS TCP congestion control programs. To keep it simple, all tcp-cc ops is rejected for TCP_NODELAY. Fixes: 7e41df5dbba2 ("bpf: Add a few optnames to bpf_setsockopt") Suggested-by: Martin KaFai Lau Signed-off-by: KaFai Wan Signed-off-by: Martin KaFai Lau Reviewed-by: Jiayuan Chen Link: https://patch.msgid.link/20260421155804.135786-3-kafai.wan@linux.dev --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b4b703c90ca9..01e203964892 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3725,6 +3725,7 @@ extern const struct bpf_func_proto bpf_for_each_map_elem_proto; extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; extern const struct bpf_func_proto bpf_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_sk_getsockopt_proto; +extern const struct bpf_func_proto bpf_sk_setsockopt_nodelay_proto; extern const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto; extern const struct bpf_func_proto bpf_find_vma_proto; -- cgit v1.2.3 From bd7b7ce96db4487bb77692a85ee4489fd2c395df Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Wed, 22 Apr 2026 12:06:36 -0700 Subject: nvme-auth: Hash DH shared secret to create session key The NVMe Base Specification 8.3.5.5.9 states that the session key Ks shall be computed from the ephemeral DH key by applying the hash function selected by the HashID parameter. The current implementation stores the raw DH shared secret as the session key without hashing it. This causes redundant hash operations: 1. Augmented challenge computation (section 8.3.5.5.4) requires Ca = HMAC(H(g^xy mod p), C). The code compensates by hashing the unhashed session key in nvme_auth_augmented_challenge() to produce the correct result. 2. PSK generation (section 8.3.5.5.9) requires PSK = HMAC(Ks, C1 || C2) where Ks should already be H(g^xy mod p). As the DH shared secret is always larger than the HMAC block size, HMAC internally hashes it before use, accidentally producing the correct result. When using secure channel concatenation with bidirectional authentication, this results in hashing the DH value three times: twice for augmented challenge calculations and once during PSK generation. Fix this by: - Modifying nvme_auth_gen_shared_secret() to hash the DH shared secret once after computation: Ks = H(g^xy mod p) - Removing the hash operation from nvme_auth_augmented_challenge() as the session key is now already hashed - Updating session key buffer size from DH key size to hash output size - Adding specification references in comments This avoid storing the raw DH shared secret and reduces the number of hash operations from three to one when using secure channel concatenation. Reviewed-by: Hannes Reinecke Reviewed-by: Eric Biggers Signed-off-by: Chris Leech Signed-off-by: Keith Busch --- include/linux/nvme-auth.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h index 184a1f9510fa..89902ae8b929 100644 --- a/include/linux/nvme-auth.h +++ b/include/linux/nvme-auth.h @@ -49,9 +49,9 @@ int nvme_auth_augmented_challenge(u8 hmac_id, const u8 *skey, size_t skey_len, int nvme_auth_gen_privkey(struct crypto_kpp *dh_tfm, u8 dh_gid); int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm, u8 *host_key, size_t host_key_len); -int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm, - const u8 *ctrl_key, size_t ctrl_key_len, - u8 *sess_key, size_t sess_key_len); +int nvme_auth_gen_session_key(struct crypto_kpp *dh_tfm, + const u8 *public_key, size_t public_key_len, + u8 *sess_key, size_t sess_key_len, u8 hash_id); int nvme_auth_generate_psk(u8 hmac_id, const u8 *skey, size_t skey_len, const u8 *c1, const u8 *c2, size_t hash_len, u8 **ret_psk, size_t *ret_len); -- cgit v1.2.3 From 0b13173d27fa15679463b62a10cfa8b3d6c3a71c Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 15 Apr 2026 13:41:01 +0800 Subject: dma-buf: fix stale @lock references in struct dma_buf documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kernel-doc comments for vmapping_counter and vmap_ptr in struct dma_buf reference "@lock" as the protecting lock, but struct dma_buf no longer has a "lock" member. The mutex was removed in favor of using the dma_resv lock exclusively. The implementation correctly uses dma_resv_assert_held(dmabuf->resv) in dma_buf_vmap() and dma_buf_vunmap(), so update the documentation to reference @resv instead. Signed-off-by: gaoxiang17 Reviewed-by: Christian König Signed-off-by: Christian König Link: https://lore.kernel.org/r/20260415054101.535520-1-gxxa03070307@gmail.com --- include/linux/dma-buf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 133b9e637b55..ef6d93fd7a2c 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -322,13 +322,13 @@ struct dma_buf { * @vmapping_counter: * * Used internally to refcnt the vmaps returned by dma_buf_vmap(). - * Protected by @lock. + * Protected by @resv. */ unsigned vmapping_counter; /** * @vmap_ptr: - * The current vmap ptr if @vmapping_counter > 0. Protected by @lock. + * The current vmap ptr if @vmapping_counter > 0. Protected by @resv. */ struct iosys_map vmap_ptr; -- cgit v1.2.3 From 619eab23e1ce7c97e54bfc5a417306d94b3f6f13 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 21 Apr 2026 11:21:50 +0100 Subject: mm/vma: do not try to unmap a VMA if mmap_prepare() invoked from mmap() The mmap_prepare hook functionality includes the ability to invoke mmap_prepare() from the mmap() hook of existing 'stacked' drivers, that is ones which are capable of calling the mmap hooks of other drivers/file systems (e.g. overlayfs, shm). As part of the mmap_prepare action functionality, we deal with errors by unmapping the VMA should one arise. This works in the usual mmap_prepare case, as we invoke this action at the last moment, when the VMA is established in the maple tree. However, the mmap() hook passes a not-fully-established VMA pointer to the caller (which is the motivation behind the mmap_prepare() work), which is detached. So attempting to unmap a VMA in this state will be problematic, with the most obvious symptom being a warning in vma_mark_detached(), because the VMA is already detached. It's also unncessary - the mmap() handler will clean up the VMA on error. So to fix this issue, this patch propagates whether or not an mmap action is being completed via the compatibility layer or directly. If the former, then we do not attempt VMA cleanup, if the latter, then we do. This patch also updates the userland VMA tests to reflect the change. Link: https://lore.kernel.org/20260421102150.189982-1-ljs@kernel.org Fixes: ac0a3fc9c07d ("mm: add ability to take further action in vm_area_desc") Signed-off-by: Lorenzo Stoakes Reported-by: syzbot+db390288d141a1dccf96@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/69e69734.050a0220.24bfd3.0027.GAE@google.com/ Cc: David Hildenbrand Cc: Jann Horn Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Pedro Falcato Cc: Suren Baghdasaryan Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0b776907152e..af23453e9dbd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4391,7 +4391,7 @@ static inline void mmap_action_map_kernel_pages_full(struct vm_area_desc *desc, int mmap_action_prepare(struct vm_area_desc *desc); int mmap_action_complete(struct vm_area_struct *vma, - struct mmap_action *action); + struct mmap_action *action, bool is_compat); /* Look up the first VMA which exactly match the interval vm_start ... vm_end */ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, -- cgit v1.2.3 From 77a50e9652ac3c669c6690088bce97d960f5fd17 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 22 Apr 2026 14:43:10 -0400 Subject: MAINTAINERS: update Liam's email address Switching to private email address. Update all contact information Add an entry to mailmap at the same time. Link: https://lore.kernel.org/20260422184310.2682901-1-liam@infradead.org Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 0c464eade1d6..4a5631906aff 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -4,7 +4,7 @@ /* * Maple Tree - An RCU-safe adaptive tree for storing ranges * Copyright (c) 2018-2022 Oracle - * Authors: Liam R. Howlett + * Authors: Liam R. Howlett * Matthew Wilcox */ -- cgit v1.2.3 From 5e25407b68f460142539536e31fa20338db6146f Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 10 Apr 2026 19:41:03 +0200 Subject: mtd: spinand: Add support for packed read data ODTR commands Some devices stuff address bits in the double byte opcode (in place of the repeated byte) in order to be able to increase the size of the devices, without adding extra address bytes. Create a flag to identify those devices. When the flag is set, use the "packed" variant for the read data operation. Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 58abd306ebe3..782984ba3a20 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -290,6 +290,12 @@ SPI_MEM_OP_NO_DUMMY, \ SPI_MEM_OP_NO_DATA) +#define SPINAND_PAGE_READ_PACKED_8D_8D_0_OP(addr) \ + SPI_MEM_OP(SPI_MEM_DTR_OP_PACKED_CMD(0x13, addr >> 16, 8), \ + SPI_MEM_DTR_OP_ADDR(2, addr & 0xffff, 8), \ + SPI_MEM_OP_NO_DUMMY, \ + SPI_MEM_OP_NO_DATA) + #define SPINAND_PAGE_READ_FROM_CACHE_8D_8D_8D_OP(addr, ndummy, buf, len, freq) \ SPI_MEM_OP(SPI_MEM_DTR_OP_RPT_CMD(0x9d, 8), \ SPI_MEM_DTR_OP_ADDR(2, addr, 8), \ @@ -483,6 +489,7 @@ struct spinand_ecc_info { #define SPINAND_HAS_PROG_PLANE_SELECT_BIT BIT(2) #define SPINAND_HAS_READ_PLANE_SELECT_BIT BIT(3) #define SPINAND_NO_RAW_ACCESS BIT(4) +#define SPINAND_ODTR_PACKED_PAGE_READ BIT(5) /** * struct spinand_ondie_ecc_conf - private SPI-NAND on-die ECC engine structure -- cgit v1.2.3 From 0898a817621a2f0cddca8122d9b974003fe5036d Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Mon, 27 Apr 2026 22:01:39 +0100 Subject: cdrom, scsi: sr: propagate read-only status to block layer via set_disk_ro() The cdrom core never calls set_disk_ro() for a registered device, so BLKROGET on a CD-ROM device always returns 0 (writable), even when the drive has no write capabilities and writes will inevitably fail. This causes problems for userspace that relies on BLKROGET to determine whether a block device is read-only. For example, systemd's loop device setup uses BLKROGET to decide whether to create a loop device with LO_FLAGS_READ_ONLY. Without the read-only flag, writes pass through the loop device to the CD-ROM and fail with I/O errors. systemd-fsck similarly checks BLKROGET to decide whether to run fsck in no-repair mode (-n). The write-capability bits in cdi->mask come from two different sources: CDC_DVD_RAM and CDC_CD_RW are populated by the driver from the MODE SENSE capabilities page (page 0x2A) before register_cdrom() is called, while CDC_MRW_W and CDC_RAM require the MMC GET CONFIGURATION command and were only probed by cdrom_open_write() at device open time. This meant that any attempt to compute the writable state from the full mask at probe time was incorrect, because the GET CONFIGURATION bits were still unset (and cdi->mask is initialized such that capabilities are assumed present). Fix this by factoring the GET CONFIGURATION probing out of cdrom_open_write() into a new exported helper, cdrom_probe_write_features(), and having sr call it from sr_probe() right after get_capabilities() has populated the MODE SENSE bits. register_cdrom() then calls set_disk_ro() based on the full write-capability mask (CDC_DVD_RAM | CDC_MRW_W | CDC_RAM | CDC_CD_RW) so the block layer reflects the drive's actual write support. The feature queries used (CDF_MRW and CDF_RWRT via GET CONFIGURATION with RT=00) report drive-level capabilities that are persistent across media, so a single probe before register_cdrom() is sufficient and the redundant probe at open time is dropped. With set_disk_ro() now accurate, the long-vestigial cd->writeable flag in sr can go: get_capabilities() used to set cd->writeable based on the same four mask bits, but because CDC_MRW_W and CDC_RAM default to "capability present" in cdi->mask and aren't touched by MODE SENSE, the condition that gated cd->writeable was always true, making it unconditionally 1. Replace the corresponding gate in sr_init_command() with get_disk_ro(cd->disk), which turns a previously no-op check into a real one and also catches kernel-internal bio writers that bypass blkdev_write_iter()'s bdev_read_only() check. The sd driver (SCSI disks) does not have this problem because it checks the MODE SENSE Write Protect bit and calls set_disk_ro() accordingly. The sr driver cannot use the same approach because the MMC specification does not define the WP bit in the MODE SENSE device-specific parameter byte for CD-ROM devices. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Daan De Meyer Reviewed-by: Phillip Potter Reviewed-by: Martin K. Petersen Signed-off-by: Phillip Potter Link: https://patch.msgid.link/20260427210139.1400-2-phil@philpotter.co.uk Signed-off-by: Jens Axboe --- include/linux/cdrom.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index b907e6c2307d..260d7968cf72 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -108,6 +108,7 @@ int cdrom_ioctl(struct cdrom_device_info *cdi, struct block_device *bdev, extern unsigned int cdrom_check_events(struct cdrom_device_info *cdi, unsigned int clearing); +extern void cdrom_probe_write_features(struct cdrom_device_info *cdi); extern int register_cdrom(struct gendisk *disk, struct cdrom_device_info *cdi); extern void unregister_cdrom(struct cdrom_device_info *cdi); -- cgit v1.2.3 From b3b6babf47517fde6b6de2493dea28e8831b9347 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 23 Apr 2026 05:34:54 +0000 Subject: ipmr: Free mr_table after RCU grace period. With CONFIG_IP_MROUTE_MULTIPLE_TABLES=n, ipmr_fib_lookup() does not check if net->ipv4.mrt is NULL. Since default_device_exit_batch() is called after ->exit_rtnl(), a device could receive IGMP packets and access net->ipv4.mrt during/after ipmr_rules_exit_rtnl(). If ipmr_rules_exit_rtnl() had already cleared it and freed the memory, the access would trigger null-ptr-deref or use-after-free. Let's fix it by using RCU helper and free mrt after RCU grace period. In addition, check_net(net) is added to mroute_clean_tables() and ipmr_cache_unresolved() to synchronise via mfc_unres_lock. This prevents ipmr_cache_unresolved() from putting skb into c->_c.mfc_un.unres.unresolved after mroute_clean_tables() purges it. For the same reason, timer_shutdown_sync() is moved after mroute_clean_tables(). Since rhltable_destroy() holds mutex internally, rcu_work is used, and it is placed as the first member because rcu_head must be placed within <4K offset. mr_table is alraedy 3864 bytes without rcu_work. Note that IP6MR is not yet converted to ->exit_rtnl(), so this change is not needed for now but will be. Fixes: b22b01867406 ("ipmr: Convert ipmr_net_exit_batch() to ->exit_rtnl().") Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260423053456.4097409-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/mroute_base.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index cf3374580f74..5d75cc5b057e 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -226,6 +226,7 @@ struct mr_table_ops { /** * struct mr_table - a multicast routing table + * @work: used for table destruction * @list: entry within a list of multicast routing tables * @net: net where this table belongs * @ops: protocol specific operations @@ -243,6 +244,7 @@ struct mr_table_ops { * @mroute_reg_vif_num: PIM-device vif index */ struct mr_table { + struct rcu_work work; struct list_head list; possible_net_t net; struct mr_table_ops ops; @@ -274,6 +276,7 @@ void vif_device_init(struct vif_device *v, unsigned short flags, unsigned short get_iflink_mask); +void mr_table_free(struct mr_table *mrt); struct mr_table * mr_table_alloc(struct net *net, u32 id, struct mr_table_ops *ops, -- cgit v1.2.3 From 0de4cb473aed57ee4ba7e0551ad27bddc19fc519 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 28 Apr 2026 08:10:43 -0700 Subject: workqueue: fix devm_alloc_workqueue() va_list misuse devm_alloc_workqueue() built a va_list and passed it as a single positional argument to the variadic alloc_workqueue() macro: va_start(args, max_active); wq = alloc_workqueue(fmt, flags, max_active, args); va_end(args); C does not allow forwarding a va_list through a ... parameter. alloc_workqueue() expands to alloc_workqueue_noprof(), which runs its own va_start() over its ... params, so the inner vsnprintf(wq->name, sizeof(wq->name), fmt, args) in __alloc_workqueue() received the outer va_list object as the first variadic slot rather than the caller's actual format arguments. Add a new static helper alloc_workqueue_va() that wraps __alloc_workqueue() and runs wq_init_lockdep() on success, and fold both alloc_workqueue_noprof() and devm_alloc_workqueue_noprof() onto it as suggested by Tejun. The wq_init_lockdep() step is required on the devm path too, otherwise __flush_workqueue()'s on-stack COMPLETION_INITIALIZER_ONSTACK_MAP would NULL-deref wq->lockdep_map. No caller changes are required. devm_alloc_ordered_workqueue() is a macro forwarding to devm_alloc_workqueue() and inherits the fix. Two in-tree callers actively trigger the broken path on every probe: drivers/power/supply/mt6370-charger.c:889 drivers/power/supply/max77705_charger.c:649 both of which use devm_alloc_ordered_workqueue(dev, "%s", 0, dev_name(dev)). A standalone reproducer module is available at[1]. Link: https://github.com/leitao/debug/blob/main/workqueue/valist/wq_va_test.c [1] Fixes: 1dfc9d60a69e ("workqueue: devres: Add device-managed allocate workqueue") Signed-off-by: Breno Leitao Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index ab6cb70ca1a5..6177624539b3 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -534,8 +534,10 @@ alloc_workqueue_noprof(const char *fmt, unsigned int flags, int max_active, ...) * Pointer to the allocated workqueue on success, %NULL on failure. */ __printf(2, 5) struct workqueue_struct * -devm_alloc_workqueue(struct device *dev, const char *fmt, unsigned int flags, - int max_active, ...); +devm_alloc_workqueue_noprof(struct device *dev, const char *fmt, + unsigned int flags, int max_active, ...); +#define devm_alloc_workqueue(...) \ + alloc_hooks(devm_alloc_workqueue_noprof(__VA_ARGS__)) #ifdef CONFIG_LOCKDEP /** -- cgit v1.2.3 From 5ec07d5204b4544271f32f6261ee097fe53cb081 Mon Sep 17 00:00:00 2001 From: Sheng Che Peng Date: Wed, 22 Apr 2026 10:18:19 +0800 Subject: tracepoint: Fix typo in tracepoint.h comment Change "my" to "may" in the description of subsystem configurations. Link: https://patch.msgid.link/20260422021819.1788091-1-synte4028@gmail.com Signed-off-by: Sheng Che Peng Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 578e520b6ee6..763eea4d80d8 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -202,7 +202,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define TP_CONDITION(args...) args /* - * Individual subsystem my have a separate configuration to + * Individual subsystem may have a separate configuration to * enable their tracepoints. By default, this file will create * the tracepoints if CONFIG_TRACEPOINTS is defined. If a subsystem * wants to be able to disable its tracepoints from being created -- cgit v1.2.3 From 6813985ca456d1f5677ad9554f55805cbf27e16f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 28 Apr 2026 17:35:18 +0200 Subject: netfilter: x_tables: add .check_hooks to matches and targets Add a new .check_hooks interface for checking if the match/target is used from the validate hook according to its configuration. Move existing conditional hook check based on the match/target configuration from .checkentry to .check_hooks for the following matches/targets: - addrtype - devgroup - physdev - policy - set - TCPMSS - SET This is a preparation patch to fix nft_compat, not functional changes are intended. Based on patch from Florian Westphal. Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 77c778d84d4c..a81b46af5118 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -146,6 +146,9 @@ struct xt_match { /* Called when user tries to insert an entry of this type. */ int (*checkentry)(const struct xt_mtchk_param *); + /* Called to validate hooks based on the match configuration. */ + int (*check_hooks)(const struct xt_mtchk_param *); + /* Called when entry of this type deleted. */ void (*destroy)(const struct xt_mtdtor_param *); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT @@ -187,6 +190,9 @@ struct xt_target { /* Should return 0 on success or an error code otherwise (-Exxxx). */ int (*checkentry)(const struct xt_tgchk_param *); + /* Called to validate hooks based on the target configuration. */ + int (*check_hooks)(const struct xt_tgchk_param *); + /* Called when entry of this type deleted. */ void (*destroy)(const struct xt_tgdtor_param *); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT @@ -279,8 +285,10 @@ bool xt_find_jump_offset(const unsigned int *offsets, int xt_check_proc_name(const char *name, unsigned int size); +int xt_check_hooks_match(struct xt_mtchk_param *par); int xt_check_match(struct xt_mtchk_param *, unsigned int size, u16 proto, bool inv_proto); +int xt_check_hooks_target(struct xt_tgchk_param *par); int xt_check_target(struct xt_tgchk_param *, unsigned int size, u16 proto, bool inv_proto); -- cgit v1.2.3 From 620055cb1036a6125fd912e7a14b47a6572b809b Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Mon, 27 Apr 2026 22:22:21 -0700 Subject: dpll: export __dpll_pin_change_ntf() for use under dpll_lock Export __dpll_pin_change_ntf() so that drivers can send pin change notifications from within pin callbacks, which are already called under dpll_lock. Using dpll_pin_change_ntf() in that context would deadlock. Add lockdep_assert_held() to catch misuse without the lock held. Acked-by: Vadim Fedorenko Signed-off-by: Ivan Vecera Signed-off-by: Petr Oros Tested-by: Alexander Nowlin Reviewed-by: Arkadiusz Kubalewski Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260427-jk-iwl-net-petr-oros-fixes-v1-9-cdcb48303fd8@intel.com Signed-off-by: Paolo Abeni --- include/linux/dpll.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/dpll.h b/include/linux/dpll.h index b7277a8b484d..f8037f1ab20b 100644 --- a/include/linux/dpll.h +++ b/include/linux/dpll.h @@ -286,6 +286,7 @@ int dpll_pin_ref_sync_pair_add(struct dpll_pin *pin, int dpll_device_change_ntf(struct dpll_device *dpll); +int __dpll_pin_change_ntf(struct dpll_pin *pin); int dpll_pin_change_ntf(struct dpll_pin *pin); int register_dpll_notifier(struct notifier_block *nb); -- cgit v1.2.3 From e9766e6f7d330dce7530918d8c6e3ec96d6c6e24 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Apr 2026 10:14:41 +0200 Subject: rseq: Protect rseq_reset() against interrupts rseq_reset() uses memset() to clear the tasks rseq data. That's racy against membarrier() and preemption. Guard it with irqsave to cure this. Fixes: faba9d250eae ("rseq: Introduce struct rseq_data") Reported-by: Dmitry Vyukov Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Link: https://patch.msgid.link/20260428224427.353887714%40kernel.org Cc: stable@vger.kernel.org --- include/linux/rseq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rseq.h b/include/linux/rseq.h index b9d62fc2140d..f446909551df 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -119,6 +119,8 @@ static inline void rseq_virt_userspace_exit(void) static inline void rseq_reset(struct task_struct *t) { + /* Protect against preemption and membarrier IPI */ + guard(irqsave)(); memset(&t->rseq, 0, sizeof(t->rseq)); t->rseq.ids.cpu_id = RSEQ_CPU_ID_UNINITIALIZED; } -- cgit v1.2.3 From e768103cfbac30a49860aca08a7710d39dbdd470 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 29 Apr 2026 15:43:36 +0200 Subject: smb: smbdirect: introduce and use include/linux/smbdirect.h This makes it easier to rebuild cifs.ko and ksmbd.ko against a running kernel. Suggested-by: Christoph Hellwig Link: https://lore.kernel.org/linux-cifs/aehrPuY60VMcYGU8@infradead.org/ Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: Namjae Jeon Cc: Christoph Hellwig Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- include/linux/smbdirect.h | 186 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 186 insertions(+) create mode 100644 include/linux/smbdirect.h (limited to 'include/linux') diff --git a/include/linux/smbdirect.h b/include/linux/smbdirect.h new file mode 100644 index 000000000000..97f5ba730fa7 --- /dev/null +++ b/include/linux/smbdirect.h @@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2025, Stefan Metzmacher + */ + +#ifndef __LINUX_SMBDIRECT_H__ +#define __LINUX_SMBDIRECT_H__ + +#include + +/* SMB-DIRECT buffer descriptor V1 structure [MS-SMBD] 2.2.3.1 */ +struct smbdirect_buffer_descriptor_v1 { + __le64 offset; + __le32 token; + __le32 length; +} __packed; + +/* + * Connection parameters mostly from [MS-SMBD] 3.1.1.1 + * + * These are setup and negotiated at the beginning of a + * connection and remain constant unless explicitly changed. + * + * Some values are important for the upper layer. + */ +struct smbdirect_socket_parameters { + __u64 flags; +#define SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB ((__u64)0x1) +#define SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW ((__u64)0x2) + __u32 resolve_addr_timeout_msec; + __u32 resolve_route_timeout_msec; + __u32 rdma_connect_timeout_msec; + __u32 negotiate_timeout_msec; + __u16 initiator_depth; /* limited to U8_MAX */ + __u16 responder_resources; /* limited to U8_MAX */ + __u16 recv_credit_max; + __u16 send_credit_target; + __u32 max_send_size; + __u32 max_fragmented_send_size; + __u32 max_recv_size; + __u32 max_fragmented_recv_size; + __u32 max_read_write_size; + __u32 max_frmr_depth; + __u32 keepalive_interval_msec; + __u32 keepalive_timeout_msec; +} __packed; + +#define SMBDIRECT_FLAG_PORT_RANGE_MASK ( \ + SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB | \ + SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW) + +struct smbdirect_socket; +struct smbdirect_send_batch; +struct smbdirect_mr_io; + +#include + +u8 smbdirect_netdev_rdma_capable_node_type(struct net_device *netdev); + +bool smbdirect_frwr_is_supported(const struct ib_device_attr *attrs); + +int smbdirect_socket_create_kern(struct net *net, struct smbdirect_socket **_sc); + +int smbdirect_socket_create_accepting(struct rdma_cm_id *id, struct smbdirect_socket **_sc); + +int smbdirect_socket_set_initial_parameters(struct smbdirect_socket *sc, + const struct smbdirect_socket_parameters *sp); + +const struct smbdirect_socket_parameters * +smbdirect_socket_get_current_parameters(struct smbdirect_socket *sc); + +int smbdirect_socket_set_kernel_settings(struct smbdirect_socket *sc, + enum ib_poll_context poll_ctx, + gfp_t gfp_mask); + +#define SMBDIRECT_LOG_ERR 0x0 +#define SMBDIRECT_LOG_INFO 0x1 + +#define SMBDIRECT_LOG_OUTGOING 0x1 +#define SMBDIRECT_LOG_INCOMING 0x2 +#define SMBDIRECT_LOG_READ 0x4 +#define SMBDIRECT_LOG_WRITE 0x8 +#define SMBDIRECT_LOG_RDMA_SEND 0x10 +#define SMBDIRECT_LOG_RDMA_RECV 0x20 +#define SMBDIRECT_LOG_KEEP_ALIVE 0x40 +#define SMBDIRECT_LOG_RDMA_EVENT 0x80 +#define SMBDIRECT_LOG_RDMA_MR 0x100 +#define SMBDIRECT_LOG_RDMA_RW 0x200 +#define SMBDIRECT_LOG_NEGOTIATE 0x400 +void smbdirect_socket_set_logging(struct smbdirect_socket *sc, + void *private_ptr, + bool (*needed)(struct smbdirect_socket *sc, + void *private_ptr, + unsigned int lvl, + unsigned int cls), + void (*vaprintf)(struct smbdirect_socket *sc, + const char *func, + unsigned int line, + void *private_ptr, + unsigned int lvl, + unsigned int cls, + struct va_format *vaf)); + +bool smbdirect_connection_is_connected(struct smbdirect_socket *sc); + +int smbdirect_connection_wait_for_connected(struct smbdirect_socket *sc); + +int smbdirect_socket_bind(struct smbdirect_socket *sc, struct sockaddr *addr); + +void smbdirect_socket_shutdown(struct smbdirect_socket *sc); + +void smbdirect_socket_release(struct smbdirect_socket *sc); + +int smbdirect_connection_send_batch_flush(struct smbdirect_socket *sc, + struct smbdirect_send_batch *batch, + bool is_last); + +/* + * This is only temporary and only needed + * as long as the client still requires + * to use smbdirect_connection_send_single_iter() + */ +struct smbdirect_send_batch_storage { + union { + struct list_head __msg_list; + __aligned_u64 __space[5]; + }; +}; + +struct smbdirect_send_batch * +smbdirect_init_send_batch_storage(struct smbdirect_send_batch_storage *storage, + bool need_invalidate_rkey, + unsigned int remote_key); + +int smbdirect_connection_send_single_iter(struct smbdirect_socket *sc, + struct smbdirect_send_batch *batch, + struct iov_iter *iter, + unsigned int flags, + u32 remaining_data_length); + +int smbdirect_connection_send_wait_zero_pending(struct smbdirect_socket *sc); + +int smbdirect_connection_send_iter(struct smbdirect_socket *sc, + struct iov_iter *iter, + unsigned int flags, + bool need_invalidate, + unsigned int remote_key); + +int smbdirect_connection_recvmsg(struct smbdirect_socket *sc, + struct msghdr *msg, + unsigned int flags); + +int smbdirect_connect(struct smbdirect_socket *sc, + const struct sockaddr *dst); + +int smbdirect_connect_sync(struct smbdirect_socket *sc, + const struct sockaddr *dst); + +int smbdirect_socket_listen(struct smbdirect_socket *sc, int backlog); + +struct smbdirect_socket *smbdirect_socket_accept(struct smbdirect_socket *lsc, + long timeo, + struct proto_accept_arg *arg); + +int smbdirect_connection_rdma_xmit(struct smbdirect_socket *sc, + void *buf, size_t buf_len, + struct smbdirect_buffer_descriptor_v1 *desc, + size_t desc_len, + bool is_read); + +struct smbdirect_mr_io * +smbdirect_connection_register_mr_io(struct smbdirect_socket *sc, + struct iov_iter *iter, + bool writing, + bool need_invalidate); + +void smbdirect_mr_io_fill_buffer_descriptor(struct smbdirect_mr_io *mr, + struct smbdirect_buffer_descriptor_v1 *v1); + +void smbdirect_connection_deregister_mr_io(struct smbdirect_mr_io *mr); + +void smbdirect_connection_legacy_debug_proc_show(struct smbdirect_socket *sc, + unsigned int rdma_readwrite_threshold, + struct seq_file *m); + +#endif /* __LINUX_SMBDIRECT_H__ */ -- cgit v1.2.3 From d4c14903bf5e28e740516c4fbb7db01e0dedf3af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Mon, 10 Nov 2025 19:58:10 +0100 Subject: dma-buf/dma_fence_array: remove unused functionality v4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Amdgpu was the only user of the signal on any feature and we dropped that use case recently, so we can remove that functionality. v2: update num_pending only after the fence is signaled v3: separate out simplifying dma_fence_array implementation v4: fix XE patch split fallout Signed-off-by: Christian König Reviewed-by: Tvrtko Ursulin Link: https://lore.kernel.org/r/20260422103012.1647-1-christian.koenig@amd.com --- include/linux/dma-fence-array.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-fence-array.h b/include/linux/dma-fence-array.h index 370b3d2bba37..1b1d87579c38 100644 --- a/include/linux/dma-fence-array.h +++ b/include/linux/dma-fence-array.h @@ -81,13 +81,11 @@ to_dma_fence_array(struct dma_fence *fence) struct dma_fence_array *dma_fence_array_alloc(int num_fences); void dma_fence_array_init(struct dma_fence_array *array, int num_fences, struct dma_fence **fences, - u64 context, unsigned seqno, - bool signal_on_any); + u64 context, unsigned seqno); struct dma_fence_array *dma_fence_array_create(int num_fences, struct dma_fence **fences, - u64 context, unsigned seqno, - bool signal_on_any); + u64 context, unsigned seqno); bool dma_fence_match_context(struct dma_fence *fence, u64 context); -- cgit v1.2.3 From 93618edf753838a727dbff63c7c291dee22d656b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 1 May 2026 08:31:22 -1000 Subject: cgroup: Defer css percpu_ref kill on rmdir until cgroup is depopulated A chain of commits going back to v7.0 reworked rmdir to satisfy the controller invariant that a subsystem's ->css_offline() must not run while tasks are still doing kernel-side work in the cgroup. [1] d245698d727a ("cgroup: Defer task cgroup unlink until after the task is done switching out") [2] a72f73c4dd9b ("cgroup: Don't expose dead tasks in cgroup") [3] 1b164b876c36 ("cgroup: Wait for dying tasks to leave on rmdir") [4] 4c56a8ac6869 ("cgroup: Fix cgroup_drain_dying() testing the wrong condition") [5] 13e786b64bd3 ("cgroup: Increment nr_dying_subsys_* from rmdir context") [1] moved task cset unlink from do_exit() to finish_task_switch() so a task's cset link drops only after the task has fully stopped scheduling. That made tasks past exit_signals() linger on cset->tasks until their final context switch, which led to a series of problems as what userspace expected to see after rmdir diverged from what the kernel needs to wait for. [2]-[5] tried to bridge that divergence: [2] filtered the exiting tasks from cgroup.procs; [3] had rmdir(2) sleep in TASK_UNINTERRUPTIBLE for them; [4] fixed the wait's condition; [5] made nr_dying_subsys_* visible synchronously. The cgroup_drain_dying() wait in [3] turned out to be a dead end. When the rmdir caller is also the reaper of a zombie that pins a pidns teardown (e.g. host PID 1 systemd reaping orphan pids that were re-parented to it during the same teardown), rmdir blocks in TASK_UNINTERRUPTIBLE waiting for those pids to free, the pids can't free because PID 1 is the reaper and it's stuck in rmdir, and the system A-A deadlocks. No internal lock ordering breaks this; the wait itself is the bug. The css killing side that drove the original reorder, however, can be made cleanly asynchronous: ->css_offline() is already async, run from css_killed_work_fn() driven by percpu_ref_kill_and_confirm(). The fix is to make that chain start only after all tasks have left the cgroup. rmdir's user-visible side then returns as soon as cgroup.procs and friends are empty, while ->css_offline() still runs only after the cgroup is fully drained. Verified by the original reproducer (pidns teardown + zombie reaper, runs under vng) which hangs vanilla and succeeds here, and by per-commit deterministic repros for [2], [3], [4], [5] with a boot parameter that widens the post-exit_signals() window so each state is reliably reachable. Some stress tests on top of that. cgroup_apply_control_disable() has the same shape of pre-existing race: when a controller is disabled via subtree_control, kill_css() ran synchronously while tasks past exit_signals() could still be linked to the cgroup's csets, and ->css_offline() could fire before they drained. This patch preserves the existing synchronous behavior at that call site (kill_css_sync() + kill_css_finish() back-to-back) and a follow-up patch will defer kill_css_finish() there using a per-css trigger. This seems like the right approach and I don't see problems with it. The changes are somewhat invasive but not excessively so, so backporting to -stable should be okay. If something does turn out to be wrong, the fallback is to revert the entire chain ([1]-[5]) and rework in the development branch instead. v2: Pin cgrp across the deferred destroy work with explicit cgroup_get()/cgroup_put() around queue_work() and the work_fn. v1 wasn't actually broken (ordered cgroup_offline_wq + queue_work order in cgroup_task_dead() saved it) but the explicit ref removes the dependency on those non-obvious invariants. Also note the pre-existing cgroup_apply_control_disable() race in the description; a follow-up will defer kill_css_finish() there. Fixes: 1b164b876c36 ("cgroup: Wait for dying tasks to leave on rmdir") Cc: stable@vger.kernel.org # v7.0+ Reported-and-tested-by: Martin Pitt Link: https://lore.kernel.org/all/afHNg2VX2jy9bW7y@piware.de/ Link: https://lore.kernel.org/all/35e0670adb4abeab13da2c321582af9f@kernel.org/ Signed-off-by: Tejun Heo Acked-by: Sebastian Andrzej Siewior --- include/linux/cgroup-defs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index f42563739d2e..50a784da7a81 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -611,8 +611,8 @@ struct cgroup { /* used to wait for offlining of csses */ wait_queue_head_t offline_waitq; - /* used by cgroup_rmdir() to wait for dying tasks to leave */ - wait_queue_head_t dying_populated_waitq; + /* defers killing csses after removal until cgroup is depopulated */ + struct work_struct finish_destroy_work; /* used to schedule release agent */ struct work_struct release_agent_work; -- cgit v1.2.3 From 60f21a2649308bbd84919ba6656d5ccd660953cf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 27 Apr 2026 14:16:34 -1000 Subject: cgroup, sched_ext: Include exiting tasks in cgroup iter a72f73c4dd9b ("cgroup: Don't expose dead tasks in cgroup") made css_task_iter_advance() skip exiting tasks so cgroup.procs stays consistent with waitpid() visibility. Unfortunately, this broke scx_task_iter. scx_task_iter walks either scx_tasks (global) or a cgroup subtree via css_task_iter() and the two modes are expected to cover the same set of tasks. After the above change the cgroup-scoped mode silently skips tasks past exit_signals() that are still on scx_tasks. scx_sub_enable_workfn()'s abort path is one of the symptoms: an exiting SCX_TASK_SUB_INIT task can race past the cgroup iter leaking __scx_init_task() state. Other iterations share the same gap. Add CSS_TASK_ITER_WITH_DEAD to opt out of the skip and use it from scx_task_iter(). Fixes: b0e4c2f8a0f0 ("sched_ext: Implement cgroup subtree iteration for scx_task_iter") Reported-by: Cheng-Yang Chou Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index e52160e85af4..f6d037a30fd8 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -53,6 +53,7 @@ struct kernel_clone_args; enum css_task_iter_flags { CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */ CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */ + CSS_TASK_ITER_WITH_DEAD = (1U << 2), /* include exiting tasks */ CSS_TASK_ITER_SKIPPED = (1U << 16), /* internal flags */ }; -- cgit v1.2.3 From ff9eda4ea906b1f02fc260ddc42d2d9bd736a49c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 27 Apr 2026 14:16:35 -1000 Subject: sched_ext: Skip past-sched_ext_dead() tasks in scx_task_iter_next_locked() scx_task_iter's cgroup-scoped mode can return tasks whose sched_ext_dead() has already completed: cgroup_task_dead() removes from cset->tasks after sched_ext_dead() in finish_task_switch() and is irq-work deferred on PREEMPT_RT. The global mode is fine - sched_ext_dead() removes from scx_tasks via list_del_init() first. Callers (sub-sched enable prep/abort/apply, scx_sub_disable(), scx_fail_parent()) assume returned tasks are still on @sch and trip WARN_ON_ONCE() or operate on torn-down state otherwise. Set %SCX_TASK_OFF_TASKS in sched_ext_dead() under @p's rq lock and have scx_task_iter_next_locked() skip flagged tasks under the same lock. Setter and reader serialize on the per-task rq lock - no race. Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 1a3af2ea2a79..adb9a4de068a 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -101,6 +101,7 @@ enum scx_ent_flags { SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ SCX_TASK_SUB_INIT = 1 << 4, /* task being initialized for a sub sched */ SCX_TASK_IMMED = 1 << 5, /* task is on local DSQ with %SCX_ENQ_IMMED */ + SCX_TASK_OFF_TASKS = 1 << 6, /* removed from scx_tasks by sched_ext_dead() */ /* * Bits 8 and 9 are used to carry task state: -- cgit v1.2.3 From 8f78b749f3da0f43990490b4c1193b5ede3eec0a Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 30 Apr 2026 10:44:20 +0300 Subject: sched/isolation: Make HK_TYPE_KTHREAD an alias of HK_TYPE_DOMAIN Since commit 041ee6f3727a ("kthread: Rely on HK_TYPE_DOMAIN for preferred affinity management"), kthreads default to use the HK_TYPE_DOMAIN cpumask. IOW, it is no longer affected by the setting of the nohz_full boot kernel parameter. That means HK_TYPE_KTHREAD should now be an alias of HK_TYPE_DOMAIN instead of HK_TYPE_KERNEL_NOISE to correctly reflect the current kthread behavior. Make the change as HK_TYPE_KTHREAD is still being used in some networking code. Fixes: 041ee6f3727a ("kthread: Rely on HK_TYPE_DOMAIN for preferred affinity management") Signed-off-by: Waiman Long Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- include/linux/sched/isolation.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index dc3975ff1b2e..cf0fd03dd7a2 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -20,6 +20,11 @@ enum hk_type { HK_TYPE_KERNEL_NOISE, HK_TYPE_MAX, + /* + * HK_TYPE_KTHREAD is now an alias of HK_TYPE_DOMAIN + */ + HK_TYPE_KTHREAD = HK_TYPE_DOMAIN, + /* * The following housekeeping types are only set by the nohz_full * boot commandline option. So they can share the same value. @@ -29,7 +34,6 @@ enum hk_type { HK_TYPE_RCU = HK_TYPE_KERNEL_NOISE, HK_TYPE_MISC = HK_TYPE_KERNEL_NOISE, HK_TYPE_WQ = HK_TYPE_KERNEL_NOISE, - HK_TYPE_KTHREAD = HK_TYPE_KERNEL_NOISE }; #ifdef CONFIG_CPU_ISOLATION -- cgit v1.2.3 From dad0d91cc2c3e6b6fb285ccfe7ddf71525797198 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 28 Apr 2026 18:14:18 +0200 Subject: mm/slab: Add kvfree_atomic() helper kvmalloc() now supports non-sleeping GFP flags, including the vmalloc fallback path. This means it may return vmalloc memory even for GFP_ATOMIC and GFP_NOWAIT allocations. Freeing such memory with kvfree() may then end up calling vfree(), which is not safe for non-sleeping contexts. Introduce kvfree_atomic() helper for such cases. It mirrors kvfree(), but uses vfree_atomic() for vmalloced memory. Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Vlastimil Babka (SUSE) Acked-by: Harry Yoo (Oracle) Signed-off-by: Herbert Xu --- include/linux/slab.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 15a60b501b95..2b5ab488e96b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -1234,6 +1234,9 @@ void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long alig extern void kvfree(const void *addr); DEFINE_FREE(kvfree, void *, if (!IS_ERR_OR_NULL(_T)) kvfree(_T)) +extern void kvfree_atomic(const void *addr); +DEFINE_FREE(kvfree_atomic, void *, if (!IS_ERR_OR_NULL(_T)) kvfree_atomic(_T)) + extern void kvfree_sensitive(const void *addr, size_t len); unsigned int kmem_cache_size(struct kmem_cache *s); -- cgit v1.2.3 From b9eac6a9d93c952c4b7775a24d5c7a1bbf4c3c00 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 25 Apr 2026 00:47:54 +0200 Subject: rseq: Revert to historical performance killing behaviour The recent RSEQ optimization work broke the TCMalloc abuse of the RSEQ ABI as it not longer unconditionally updates the CPU, node, mm_cid fields, which are documented as read only for user space. Due to the observed behavior of the kernel it was possible for TCMalloc to overwrite the cpu_id_start field for their own purposes and rely on the kernel to update it unconditionally after each context switch and before signal delivery. The RSEQ ABI only guarantees that these fields are updated when the data changes, i.e. the task is migrated or the MMCID of the task changes due to switching from or to per CPU ownership mode. The optimization work eliminated the unconditional updates and reduced them to the documented ABI guarantees, which results in a massive performance win for syscall, scheduling heavy work loads, which in turn breaks the TCMalloc expectations. There have been several options discussed to restore the TCMalloc functionality while preserving the optimization benefits. They all end up in a series of hard to maintain workarounds, which in the worst case introduce overhead for everyone, e.g. in the scheduler. The requirements of TCMalloc and the optimization work are diametral and the required work arounds are a maintainence burden. They end up as fragile constructs, which are blocking further optimization work and are pretty much guaranteed to cause more subtle issues down the road. The optimization work heavily depends on the generic entry code, which is not used by all architectures yet. So the rework preserved the original mechanism moslty unmodified to keep the support for architectures, which handle rseq in their own exit to user space loop. That code is currently optimized out by the compiler on architectures which use the generic entry code. This allows to revert back to the original behaviour by replacing the compile time constant conditions with a runtime condition where required, which disables the optimization and the dependend time slice extension feature until the run-time condition can be enabled in the RSEQ registration code on a per task basis again. The following changes are required to restore the original behavior, which makes TCMalloc work again: 1) Replace the compile time constant conditionals with runtime conditionals where appropriate to prevent the compiler from optimizing the legacy mode out 2) Enforce unconditional update of IDs on context switch for the non-optimized v1 mode 3) Enforce update of IDs in the pre signal delivery path for the non-optimized v1 mode 4) Enforce update of IDs in the membarrier(RSEQ) IPI for the non-optimized v1 mode 5) Make time slice and future extensions depend on optimized v2 mode This brings back the full performance problems, but preserves the v2 optimization code and for generic entry code using architectures also the TIF_RSEQ optimization which avoids a full evaluation of the exit to user mode loop in many cases. Fixes: 566d8015f7ee ("rseq: Avoid CPU/MM CID updates when no event pending") Reported-by: Mathias Stearn Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Closes: https://lore.kernel.org/CAHnCjA25b+nO2n5CeifknSKHssJpPrjnf+dtr7UgzRw4Zgu=oA@mail.gmail.com Link: https://patch.msgid.link/20260428224427.517051752%40kernel.org Cc: stable@vger.kernel.org --- include/linux/rseq.h | 35 ++++++++++++++++++++++++----------- include/linux/rseq_entry.h | 39 +++++++++++++++++++++++++++++---------- include/linux/rseq_types.h | 9 ++++++++- 3 files changed, 61 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rseq.h b/include/linux/rseq.h index f446909551df..7ef79b25e714 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -9,6 +9,11 @@ void __rseq_handle_slowpath(struct pt_regs *regs); +static __always_inline bool rseq_v2(struct task_struct *t) +{ + return IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY) && likely(t->rseq.event.has_rseq > 1); +} + /* Invoked from resume_user_mode_work() */ static inline void rseq_handle_slowpath(struct pt_regs *regs) { @@ -16,8 +21,7 @@ static inline void rseq_handle_slowpath(struct pt_regs *regs) if (current->rseq.event.slowpath) __rseq_handle_slowpath(regs); } else { - /* '&' is intentional to spare one conditional branch */ - if (current->rseq.event.sched_switch & current->rseq.event.has_rseq) + if (current->rseq.event.sched_switch && current->rseq.event.has_rseq) __rseq_handle_slowpath(regs); } } @@ -30,9 +34,9 @@ void __rseq_signal_deliver(int sig, struct pt_regs *regs); */ static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { - if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { - /* '&' is intentional to spare one conditional branch */ - if (current->rseq.event.has_rseq & current->rseq.event.user_irq) + if (rseq_v2(current)) { + /* has_rseq is implied in rseq_v2() */ + if (current->rseq.event.user_irq) __rseq_signal_deliver(ksig->sig, regs); } else { if (current->rseq.event.has_rseq) @@ -50,15 +54,22 @@ static __always_inline void rseq_sched_switch_event(struct task_struct *t) { struct rseq_event *ev = &t->rseq.event; - if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { + /* + * Only apply the user_irq optimization for RSEQ ABI V2 registrations. + * Legacy users like TCMalloc rely on the original ABI V1 behaviour + * which updates IDs on every context swtich. + */ + if (rseq_v2(t)) { /* - * Avoid a boat load of conditionals by using simple logic - * to determine whether NOTIFY_RESUME needs to be raised. + * Avoid a boat load of conditionals by using simple logic to + * determine whether TIF_NOTIFY_RESUME or TIF_RSEQ needs to be + * raised. * - * It's required when the CPU or MM CID has changed or - * the entry was from user space. + * It's required when the CPU or MM CID has changed or the entry + * was via interrupt from user space. ev->has_rseq does not have + * to be evaluated here because rseq_v2() implies has_rseq. */ - bool raise = (ev->user_irq | ev->ids_changed) & ev->has_rseq; + bool raise = ev->user_irq | ev->ids_changed; if (raise) { ev->sched_switch = true; @@ -66,6 +77,7 @@ static __always_inline void rseq_sched_switch_event(struct task_struct *t) } } else { if (ev->has_rseq) { + t->rseq.event.ids_changed = true; t->rseq.event.sched_switch = true; rseq_raise_notify_resume(t); } @@ -161,6 +173,7 @@ static inline unsigned int rseq_alloc_align(void) } #else /* CONFIG_RSEQ */ +static inline bool rseq_v2(struct task_struct *t) { return false; } static inline void rseq_handle_slowpath(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_sched_switch_event(struct task_struct *t) { } diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index f11ebd34f8b9..934db41ec782 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -111,6 +111,20 @@ static __always_inline void rseq_slice_clear_grant(struct task_struct *t) t->rseq.slice.state.granted = false; } +/* + * Open coded, so it can be invoked within a user access region. + * + * This clears the user space state of the time slice extensions field only when + * the task has registered the optimized RSEQ_ABI V2. Some legacy registrations, + * e.g. TCMalloc, have conflicting non-ABI fields in struct RSEQ, which would be + * overwritten by an unconditional write. + */ +#define rseq_slice_clear_user(rseq, efault) \ +do { \ + if (rseq_slice_extension_enabled()) \ + unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); \ +} while (0) + static __always_inline bool __rseq_grant_slice_extension(bool work_pending) { struct task_struct *curr = current; @@ -230,6 +244,7 @@ static __always_inline bool rseq_slice_extension_enabled(void) { return false; } static __always_inline bool rseq_arm_slice_extension_timer(void) { return false; } static __always_inline void rseq_slice_clear_grant(struct task_struct *t) { } static __always_inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; } +#define rseq_slice_clear_user(rseq, efault) do { } while (0) #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */ bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); @@ -517,11 +532,9 @@ bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, if (csaddr) unsafe_get_user(*csaddr, &rseq->rseq_cs, efault); - /* Open coded, so it's in the same user access region */ - if (rseq_slice_extension_enabled()) { - /* Unconditionally clear it, no point in conditionals */ - unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); - } + /* RSEQ ABI V2 only operations */ + if (rseq_v2(t)) + rseq_slice_clear_user(rseq, efault); } rseq_slice_clear_grant(t); @@ -612,6 +625,14 @@ static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct t * interrupts disabled */ guard(pagefault)(); + /* + * This optimization is only valid when the task registered for the + * optimized RSEQ_ABI_V2 variant. Some legacy users rely on the original + * RSEQ implementation behaviour which unconditionally updated the IDs. + * rseq_sched_switch_event() ensures that legacy registrations always + * have both sched_switch and ids_changed set, which is compatible with + * the historical TIF_NOTIFY_RESUME behaviour. + */ if (likely(!t->rseq.event.ids_changed)) { struct rseq __user *rseq = t->rseq.usrptr; /* @@ -623,11 +644,9 @@ static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct t scoped_user_rw_access(rseq, efault) { unsafe_get_user(csaddr, &rseq->rseq_cs, efault); - /* Open coded, so it's in the same user access region */ - if (rseq_slice_extension_enabled()) { - /* Unconditionally clear it, no point in conditionals */ - unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); - } + /* RSEQ ABI V2 only operations */ + if (rseq_v2(t)) + rseq_slice_clear_user(rseq, efault); } rseq_slice_clear_grant(t); diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index 0b42045988db..a469c1870849 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -9,6 +9,12 @@ #ifdef CONFIG_RSEQ struct rseq; +/* + * rseq_event::has_rseq contains the ABI version number so preserving it + * in AND operations requires a mask. + */ +#define RSEQ_HAS_RSEQ_VERSION_MASK 0xff + /** * struct rseq_event - Storage for rseq related event management * @all: Compound to initialize and clear the data efficiently @@ -17,7 +23,8 @@ struct rseq; * exit to user * @ids_changed: Indicator that IDs need to be updated * @user_irq: True on interrupt entry from user mode - * @has_rseq: True if the task has a rseq pointer installed + * @has_rseq: Greater than 0 if the task has a rseq pointer installed. + * Contains the RSEQ version number * @error: Compound error code for the slow path to analyze * @fatal: User space data corrupted or invalid * @slowpath: Indicator that slow path processing via TIF_NOTIFY_RESUME -- cgit v1.2.3 From 82f572449cfe75f12ea985986da60e11f308f77d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 26 Apr 2026 16:21:02 +0200 Subject: rseq: Implement read only ABI enforcement for optimized RSEQ V2 mode The optimized RSEQ V2 mode requires that user space adheres to the ABI specification and does not modify the read-only fields cpu_id_start, cpu_id, node_id and mm_cid behind the kernel's back. While the kernel does not rely on these fields, the adherence to this is a fundamental prerequisite to allow multiple entities, e.g. libraries, in an application to utilize the full potential of RSEQ without stepping on each other toes. Validate this adherence on every update of these fields. If the kernel detects that user space modified the fields, the application is force terminated. Fixes: d6200245c75e ("rseq: Allow registering RSEQ with slice extension") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Link: https://patch.msgid.link/20260428224427.845230956%40kernel.org Cc: stable@vger.kernel.org --- include/linux/rseq_entry.h | 83 +++++++++++++++++----------------------------- include/linux/rseq_types.h | 4 ++- 2 files changed, 33 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index 934db41ec782..2d0295df5107 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -248,7 +248,6 @@ static __always_inline bool rseq_grant_slice_extension(unsigned long ti_work, un #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */ bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); -bool rseq_debug_validate_ids(struct task_struct *t); static __always_inline void rseq_note_user_irq_entry(void) { @@ -368,43 +367,6 @@ efault: return false; } -/* - * On debug kernels validate that user space did not mess with it if the - * debug branch is enabled. - */ -bool rseq_debug_validate_ids(struct task_struct *t) -{ - struct rseq __user *rseq = t->rseq.usrptr; - u32 cpu_id, uval, node_id; - - /* - * On the first exit after registering the rseq region CPU ID is - * RSEQ_CPU_ID_UNINITIALIZED and node_id in user space is 0! - */ - node_id = t->rseq.ids.cpu_id != RSEQ_CPU_ID_UNINITIALIZED ? - cpu_to_node(t->rseq.ids.cpu_id) : 0; - - scoped_user_read_access(rseq, efault) { - unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault); - if (cpu_id != t->rseq.ids.cpu_id) - goto die; - unsafe_get_user(uval, &rseq->cpu_id, efault); - if (uval != cpu_id) - goto die; - unsafe_get_user(uval, &rseq->node_id, efault); - if (uval != node_id) - goto die; - unsafe_get_user(uval, &rseq->mm_cid, efault); - if (uval != t->rseq.ids.mm_cid) - goto die; - } - return true; -die: - t->rseq.event.fatal = true; -efault: - return false; -} - #endif /* RSEQ_BUILD_SLOW_PATH */ /* @@ -514,20 +476,32 @@ efault: * faults in task context are fatal too. */ static rseq_inline -bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, - u32 node_id, u64 *csaddr) +bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, u64 *csaddr) { struct rseq __user *rseq = t->rseq.usrptr; - if (static_branch_unlikely(&rseq_debug_enabled)) { - if (!rseq_debug_validate_ids(t)) - return false; - } - scoped_user_rw_access(rseq, efault) { + /* Validate the R/O fields for debug and optimized mode */ + if (static_branch_unlikely(&rseq_debug_enabled) || rseq_v2(t)) { + u32 cpu_id, uval; + + unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault); + if (cpu_id != t->rseq.ids.cpu_id) + goto die; + unsafe_get_user(uval, &rseq->cpu_id, efault); + if (uval != cpu_id) + goto die; + unsafe_get_user(uval, &rseq->node_id, efault); + if (uval != t->rseq.ids.node_id) + goto die; + unsafe_get_user(uval, &rseq->mm_cid, efault); + if (uval != t->rseq.ids.mm_cid) + goto die; + } + unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault); unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault); - unsafe_put_user(node_id, &rseq->node_id, efault); + unsafe_put_user(ids->node_id, &rseq->node_id, efault); unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault); if (csaddr) unsafe_get_user(*csaddr, &rseq->rseq_cs, efault); @@ -539,10 +513,13 @@ bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, rseq_slice_clear_grant(t); /* Cache the new values */ - t->rseq.ids.cpu_cid = ids->cpu_cid; + t->rseq.ids = *ids; rseq_stat_inc(rseq_stats.ids); rseq_trace_update(t, ids); return true; + +die: + t->rseq.event.fatal = true; efault: return false; } @@ -552,11 +529,11 @@ efault: * is in a critical section. */ static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs, - struct rseq_ids *ids, u32 node_id) + struct rseq_ids *ids) { u64 csaddr; - if (!rseq_set_ids_get_csaddr(t, ids, node_id, &csaddr)) + if (!rseq_set_ids_get_csaddr(t, ids, &csaddr)) return false; /* @@ -659,12 +636,12 @@ static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct t } struct rseq_ids ids = { - .cpu_id = task_cpu(t), - .mm_cid = task_mm_cid(t), + .cpu_id = task_cpu(t), + .mm_cid = task_mm_cid(t), + .node_id = cpu_to_node(ids.cpu_id), }; - u32 node_id = cpu_to_node(ids.cpu_id); - return rseq_update_usr(t, regs, &ids, node_id); + return rseq_update_usr(t, regs, &ids); efault: return false; } diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index a469c1870849..85739a63e85e 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -66,8 +66,9 @@ struct rseq_event { * compiler emit a single compare on 64-bit * @cpu_id: The CPU ID which was written last to user space * @mm_cid: The MM CID which was written last to user space + * @node_id: The node ID which was written last to user space * - * @cpu_id and @mm_cid are updated when the data is written to user space. + * @cpu_id, @mm_cid and @node_id are updated when the data is written to user space. */ struct rseq_ids { union { @@ -77,6 +78,7 @@ struct rseq_ids { u32 mm_cid; }; }; + u32 node_id; }; /** -- cgit v1.2.3 From 1f7305d87aa23db2579df222eba504a333c2c978 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 5 May 2026 17:52:03 +0100 Subject: KVM: arm64: Work around C1-Pro erratum 4193714 for protected guests C1-Pro cores with SME have an erratum where TLBI+DSB does not complete all outstanding SME accesses. Instead a DSB needs to be executed on the affected CPUs. The implication is that pages cannot be unmapped from the host Stage 2 and then provided to a protected guest or to the hypervisor. Host SME accesses may still complete after this point. This erratum breaks pKVM's guarantees, and the workaround is hard to implement as EL2 and EL1 share a security state meaning EL1 can mask IPIs sent by EL2, leading to interrupt blackouts. Instead, do this in EL3. This has the advantage of a separate security state, meaning lower EL cannot mask the IPI. It is also simpler for EL3 to know about CPUs that are off or in PSCI's CPU_SUSPEND. Add the needed hook to host_stage2_set_owner_metadata_locked(). This covers the cases where the host loses access to a page: __pkvm_host_donate_guest() __pkvm_guest_unshare_host() host_stage2_set_owner_locked() when owner_id == PKVM_ID_HYP Since pKVM relies on the firmware call for correctness, check for the firmware counterpart during protected KVM initialisation and fail the pKVM initialisation if it is missing. Signed-off-by: James Morse Co-developed-by: Catalin Marinas Signed-off-by: Catalin Marinas Cc: Mark Rutland Cc: Marc Zyngier Cc: Oliver Upton Cc: Will Deacon Cc: Vincent Donnefort Cc: Lorenzo Pieralisi Cc: Sudeep Holla Link: https://patch.msgid.link/20260505165205.2690919-1-catalin.marinas@arm.com Signed-off-by: Marc Zyngier --- include/linux/arm-smccc.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 50b47eba7d01..e7195750d21b 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -105,6 +105,12 @@ ARM_SMCCC_SMC_32, \ 0, 0x3fff) +/* C1-Pro erratum 4193714: SME DVMSync early acknowledgement */ +#define ARM_SMCCC_CPU_WORKAROUND_4193714 \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_CPU, 0x10) + #define ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID \ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ ARM_SMCCC_SMC_32, \ -- cgit v1.2.3 From 57c347a2e2473bfb5c1f1132a3209c55efbe640b Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 30 Apr 2026 08:11:02 -0700 Subject: platform/x86: intel: Add notifiers support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In some cases a driver using services of vsec_tpmi driver requires some processing before vsec_tpmi exits. For example a children using debugfs can't use debugfs as this will be deleted by the vsec_tpmi driver. This is the case when unbind using PCI driver interface. In this case the remove callback of vsec_tpmi driver is called first, then remove callback of its children. Add support of blocking chain notifiers support. Notify on successful probe and before clean up in the remove callback. Fixes: 811f67c51636 ("platform/x86/intel/tpmi: Add new auxiliary driver for performance limits") Signed-off-by: Srinivas Pandruvada Cc: Stable@vger.kernel.org Link: https://patch.msgid.link/20260430151103.1549733-3-srinivas.pandruvada@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/intel_tpmi.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/intel_tpmi.h b/include/linux/intel_tpmi.h index 94c06bf214fb..15f02422e9ca 100644 --- a/include/linux/intel_tpmi.h +++ b/include/linux/intel_tpmi.h @@ -28,6 +28,12 @@ enum intel_tpmi_id { TPMI_INFO_ID = 0x81, /* Special ID for PCI BDF and Package ID information */ }; +#define TPMI_CORE_INIT 0 +#define TPMI_CORE_EXIT 1 + +int tpmi_register_notifier(struct notifier_block *nb); +int tpmi_unregister_notifier(struct notifier_block *nb); + struct oobmsm_plat_info *tpmi_get_platform_data(struct auxiliary_device *auxdev); struct resource *tpmi_get_resource_at_index(struct auxiliary_device *auxdev, int index); int tpmi_get_resource_count(struct auxiliary_device *auxdev); -- cgit v1.2.3 From b62eb8dcf2c47d4d676a434efbd57c4f776f7829 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 May 2026 12:07:14 +0200 Subject: netfilter: x_tables: allocate hook ops while under mutex arp/ip(6)t_register_table() add the table to the per-netns list via xt_register_table() before allocating the per-netns hook ops copy via kmemdup_array(). This leaves a window where the table is visible in the list with ops=NULL. If the pernet exit happens runs concurrently the pre_exit callback finds the table via xt_find_table() and passes the NULL ops pointer to nf_unregister_net_hooks(), causing a NULL dereference: general protection fault in nf_unregister_net_hooks+0xbc/0x150 RIP: nf_unregister_net_hooks (net/netfilter/core.c:613) Call Trace: ipt_unregister_table_pre_exit iptable_mangle_net_pre_exit ops_pre_exit_list cleanup_net Fix by moving the ops allocation into the xtables core so the table is never in the list without valid ops. Also ensure the table is no longer processing packets before its torn down on error unwind. nf_register_net_hooks might have published at least one hook; call synchronize_rcu() if there was an error. audit log register message gets deferred until all operations have passed, this avoids need to emit another ureg message in case of error unwinding. Based on earlier patch by Tristan Madani. Fixes: f9006acc8dfe5 ("netfilter: arp_tables: pass table pointer via nf_hook_ops") Fixes: ee177a54413a ("netfilter: ip6_tables: pass table pointer via nf_hook_ops") Fixes: ae689334225f ("netfilter: ip_tables: pass table pointer via nf_hook_ops") Link: https://lore.kernel.org/netfilter-devel/20260429175613.1459342-1-tristmd@gmail.com/ Signed-off-by: Tristan Madani Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index a81b46af5118..cb4b694dd9e4 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -305,6 +305,7 @@ struct xt_counters *xt_counters_alloc(unsigned int counters); struct xt_table *xt_register_table(struct net *net, const struct xt_table *table, + const struct nf_hook_ops *template_ops, struct xt_table_info *bootstrap, struct xt_table_info *newinfo); void *xt_unregister_table(struct xt_table *table); -- cgit v1.2.3 From 527d6931473b75d90e38942aae6537d1a527f1fd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 May 2026 12:07:15 +0200 Subject: netfilter: x_tables: add and use xt_unregister_table_pre_exit Remove the copypasted variants of _pre_exit and add one single function in the xtables core. ebtables is not compatible with x_tables and therefore unchanged. This is a preparation patch to reduce noise in the followup bug fixes. Reviewed-by: Tristan Madani Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 1 + include/linux/netfilter_arp/arp_tables.h | 1 - include/linux/netfilter_ipv4/ip_tables.h | 1 - include/linux/netfilter_ipv6/ip6_tables.h | 1 - 4 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index cb4b694dd9e4..74486714ae20 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -309,6 +309,7 @@ struct xt_table *xt_register_table(struct net *net, struct xt_table_info *bootstrap, struct xt_table_info *newinfo); void *xt_unregister_table(struct xt_table *table); +void xt_unregister_table_pre_exit(struct net *net, u8 af, const char *name); struct xt_table_info *xt_replace_table(struct xt_table *table, unsigned int num_counters, diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h index a40aaf645fa4..05631a25e622 100644 --- a/include/linux/netfilter_arp/arp_tables.h +++ b/include/linux/netfilter_arp/arp_tables.h @@ -53,7 +53,6 @@ int arpt_register_table(struct net *net, const struct xt_table *table, const struct arpt_replace *repl, const struct nf_hook_ops *ops); void arpt_unregister_table(struct net *net, const char *name); -void arpt_unregister_table_pre_exit(struct net *net, const char *name); extern unsigned int arpt_do_table(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h index 132b0e4a6d4d..13593391d605 100644 --- a/include/linux/netfilter_ipv4/ip_tables.h +++ b/include/linux/netfilter_ipv4/ip_tables.h @@ -26,7 +26,6 @@ int ipt_register_table(struct net *net, const struct xt_table *table, const struct ipt_replace *repl, const struct nf_hook_ops *ops); -void ipt_unregister_table_pre_exit(struct net *net, const char *name); void ipt_unregister_table_exit(struct net *net, const char *name); /* Standard entry. */ diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h index 8b8885a73c76..c6d5b927830d 100644 --- a/include/linux/netfilter_ipv6/ip6_tables.h +++ b/include/linux/netfilter_ipv6/ip6_tables.h @@ -27,7 +27,6 @@ extern void *ip6t_alloc_initial_table(const struct xt_table *); int ip6t_register_table(struct net *net, const struct xt_table *table, const struct ip6t_replace *repl, const struct nf_hook_ops *ops); -void ip6t_unregister_table_pre_exit(struct net *net, const char *name); void ip6t_unregister_table_exit(struct net *net, const char *name); extern unsigned int ip6t_do_table(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); -- cgit v1.2.3 From b4597d5fd7d2f8cebfffd40dffb5e003cc78964c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 May 2026 12:07:17 +0200 Subject: netfilter: x_tables: add and use xtables_unregister_table_exit Previous change added xtables_unregister_table_pre_exit to detach the table from the packetpath and to unlink it from the active table list. In case of rmmod, userspace that is doing set/getsockopt for this table will not be able to re-instantiate the table: 1. The larval table has been removed already 2. existing instantiated table is no longer on the xt pernet table list. This adds the second stage helper: unlink the table from the dying list, free the hook ops (if any) and do the audit notification. It replaces xt_unregister_table(). Fixes: fdacd57c79b7 ("netfilter: x_tables: never register tables by default") Reported-by: Tristan Madani Reviewed-by: Tristan Madani Closes: https://lore.kernel.org/netfilter-devel/20260429175613.1459342-1-tristmd@gmail.com/ Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 74486714ae20..5a1c5c336fa4 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -308,8 +308,8 @@ struct xt_table *xt_register_table(struct net *net, const struct nf_hook_ops *template_ops, struct xt_table_info *bootstrap, struct xt_table_info *newinfo); -void *xt_unregister_table(struct xt_table *table); void xt_unregister_table_pre_exit(struct net *net, u8 af, const char *name); +struct xt_table *xt_unregister_table_exit(struct net *net, u8 af, const char *name); struct xt_table_info *xt_replace_table(struct xt_table *table, unsigned int num_counters, -- cgit v1.2.3 From 35b535db69589ea0025ec3f06df08f2e3faad26f Mon Sep 17 00:00:00 2001 From: Tejas Upadhyay Date: Fri, 8 May 2026 12:25:45 +0530 Subject: drm/buddy: Integrate lockdep annotations for gpu buddy manager gpu_buddy APIs are expected to be called with the driver-provided lock held, but there is no runtime enforcement of this contract. Add lockdep annotations to catch locking violations early. Introduce gpu_buddy_driver_set_lock() for the driver to register the lock that protects the buddy manager. Add gpu_buddy_driver_lock_held() assertions to all exported gpu_buddy and drm_buddy APIs that access/modify the manager state. The lock_dep_map field is only compiled in when CONFIG_LOCKDEP is enabled, adding zero overhead to production builds. Wire up xe_ttm_vram_mgr to register its mutex with the buddy manager after initialization. Assisted-by: Copilot:claude-opus-4.6 Suggested-by: Matthew Brost Signed-off-by: Tejas Upadhyay Reviewed-by: Matthew Auld Signed-off-by: Arunpravin Paneer Selvam Link: https://patch.msgid.link/20260508065544.4049240-2-tejas.upadhyay@intel.com --- include/linux/gpu_buddy.h | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpu_buddy.h b/include/linux/gpu_buddy.h index 5fa917ba5450..71941a039648 100644 --- a/include/linux/gpu_buddy.h +++ b/include/linux/gpu_buddy.h @@ -154,6 +154,7 @@ struct gpu_buddy_block { * @avail: Total free space currently available for allocation in bytes. * @clear_avail: Free space available in the clear tree (zeroed memory) in bytes. * This is a subset of @avail. + * @lock_dep_map: Annotates gpu_buddy API with a driver provided lock. */ struct gpu_buddy { /* private: */ @@ -179,8 +180,48 @@ struct gpu_buddy { u64 size; u64 avail; u64 clear_avail; +#ifdef CONFIG_LOCKDEP + struct lockdep_map *lock_dep_map; +#endif }; +#ifdef CONFIG_LOCKDEP +/** + * gpu_buddy_driver_set_lock() - Set the lock protecting accesses to GPU BUDDY + * @mm: Pointer to GPU buddy structure. + * @lock: the lock used to protect the gpu buddy. The locking primitive + * must contain a dep_map field. + * + * Call this to annotate gpu_buddy APIs which access/modify gpu_buddy manager + */ +#define gpu_buddy_driver_set_lock(mm, lock) \ + do { \ + struct gpu_buddy *__mm = (mm); \ + if (!WARN(__mm->lock_dep_map, "GPU BUDDY MM lock should be set only once.")) \ + __mm->lock_dep_map = &(lock)->dep_map; \ + } while (0) +#else +#define gpu_buddy_driver_set_lock(mm, lock) do { (void)(mm); (void)(lock); } while (0) +#endif + +#ifdef CONFIG_LOCKDEP +/** + * gpu_buddy_driver_lock_held() - Assert GPU BUDDY manager lock is held + * @mm: Pointer to the GPU BUDDY structure. + * + * Ensure driver lock is held. + */ +static inline void gpu_buddy_driver_lock_held(struct gpu_buddy *mm) +{ + if (mm->lock_dep_map) + lockdep_assert(lock_is_held_type(mm->lock_dep_map, 0)); +} +#else +static inline void gpu_buddy_driver_lock_held(struct gpu_buddy *mm) +{ +} +#endif + static inline u64 gpu_buddy_block_offset(const struct gpu_buddy_block *block) { -- cgit v1.2.3 From 411c1cf430392c905e39f12bc305dd994da0b426 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 8 May 2026 15:20:23 +0100 Subject: arm64/entry: Fix arm64-specific rseq brokenness Mathias Stearn reports that since v6.19, there are two big issues affecting rseq: (1) On arm64 specifically, rseq critical sections aren't aborted when they should be. (2) The 'cpu_id_start' field is no longer written by the kernel in all cases it used to be, including some cases where TCMalloc depends on the kernel clobbering the field. This patch fixes issue #1. This patch DOES NOT fix issue #2, which will need to be addressed by other patches. The arm64-specific brokenness is a result of commits: 2fc0e4b4126c ("rseq: Record interrupt from user space") 39a167560a61 ("rseq: Optimize event setting") The first commit failed to add a call to rseq_note_user_irq_entry() on arm64. Thus arm64 never sets rseq_event::user_irq to record that it may be necessary to abort an active rseq critical section upon return to userspace. On its own, this commit had no functional impact as the value of rseq_event::user_irq was not consumed. The second commit relied upon rseq_event::user_irq to determine whether or not to bother to perform rseq work when returning to userspace. As rseq_event::user_irq wasn't set on arm64, this work would be skipped, and consequently an active rseq critical section would not be aborted. Fix this by giving arm64 syscall-specific entry/exit paths, and performing the relevant logic in syscall and non-syscall paths, including calling rseq_note_user_irq_entry() for non-syscall entry. Currently arm64 cannot use syscall_enter_from_user_mode(), syscall_exit_to_user_mode(), and irqentry_exit_to_user_mode(), due to ordering constraints with exception masking, and risk of ABI breakage for syscall tracing/audit/etc. For the moment the entry/exit logic is left as arm64-specific, directly using enter_from_user_mode() and exit_to_user_mode(), but mirroring the generic code. I intend to follow up with refactoring/cleanup, as we did for kernel mode entry paths in commit: 041aa7a85390 ("entry: Split preemption from irqentry_exit_to_kernel_mode()") ... which will allow arm64 to use the GENERIC_IRQ_ENTRY functions directly. Fixes: 39a167560a61 ("rseq: Optimize event setting") Reported-by: Mathias Stearn Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Acked-by: Catalin Marinas Link: https://lore.kernel.org/regressions/CAHnCjA25b+nO2n5CeifknSKHssJpPrjnf+dtr7UgzRw4Zgu=oA@mail.gmail.com/ Link: https://patch.msgid.link/20260508142023.3268622-1-mark.rutland@arm.com --- include/linux/irq-entry-common.h | 8 -------- include/linux/rseq_entry.h | 19 ------------------- 2 files changed, 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index 167fba7dbf04..1fabf0f5ea8e 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -218,14 +218,6 @@ static __always_inline void __exit_to_user_mode_validate(void) lockdep_sys_exit(); } -/* Temporary workaround to keep ARM64 alive */ -static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *regs) -{ - __exit_to_user_mode_prepare(regs, EXIT_TO_USER_MODE_WORK); - rseq_exit_to_user_mode_legacy(); - __exit_to_user_mode_validate(); -} - /** * syscall_exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required * @regs: Pointer to pt_regs on entry stack diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index 2d0295df5107..63bc72086e75 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -749,24 +749,6 @@ static __always_inline void rseq_irqentry_exit_to_user_mode(void) ev->events = 0; } -/* Required to keep ARM64 working */ -static __always_inline void rseq_exit_to_user_mode_legacy(void) -{ - struct rseq_event *ev = ¤t->rseq.event; - - rseq_stat_inc(rseq_stats.exit); - - if (static_branch_unlikely(&rseq_debug_enabled)) - WARN_ON_ONCE(ev->sched_switch); - - /* - * Ensure that event (especially user_irq) is cleared when the - * interrupt did not result in a schedule and therefore the - * rseq processing did not clear it. - */ - ev->events = 0; -} - void __rseq_debug_syscall_return(struct pt_regs *regs); static __always_inline void rseq_debug_syscall_return(struct pt_regs *regs) @@ -782,7 +764,6 @@ static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned } static inline void rseq_syscall_exit_to_user_mode(void) { } static inline void rseq_irqentry_exit_to_user_mode(void) { } -static inline void rseq_exit_to_user_mode_legacy(void) { } static inline void rseq_debug_syscall_return(struct pt_regs *regs) { } static inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; } #endif /* !CONFIG_RSEQ */ -- cgit v1.2.3 From cceb8fa9cb2cf98e31d81ecf6353b6ba5ac57744 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 10 May 2026 10:08:16 -1000 Subject: sched_ext: Replace SCX_TASK_OFF_TASKS flag with SCX_TASK_DEAD state SCX_TASK_OFF_TASKS marked tasks already through sched_ext_dead() so cgroup task iteration would skip them. This can be expressed better with a task state. Replace the flag with SCX_TASK_DEAD. scx_disable_and_exit_task() resets state to NONE on its way out, so sched_ext_dead() now sets DEAD after the wrapper returns. The validation matrix grows NONE -> DEAD, warns on DEAD -> NONE, and tightens READY's predecessor to INIT or ENABLED so the new DEAD value cannot silently transition to READY. Prepares for the following enable vs dead race fix. Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index adb9a4de068a..9f1a326ad03e 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -101,24 +101,25 @@ enum scx_ent_flags { SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ SCX_TASK_SUB_INIT = 1 << 4, /* task being initialized for a sub sched */ SCX_TASK_IMMED = 1 << 5, /* task is on local DSQ with %SCX_ENQ_IMMED */ - SCX_TASK_OFF_TASKS = 1 << 6, /* removed from scx_tasks by sched_ext_dead() */ /* - * Bits 8 and 9 are used to carry task state: + * Bits 8 to 10 are used to carry task state: * * NONE ops.init_task() not called yet * INIT ops.init_task() succeeded, but task can be cancelled * READY fully initialized, but not in sched_ext * ENABLED fully initialized and in sched_ext + * DEAD terminal state set by sched_ext_dead() */ - SCX_TASK_STATE_SHIFT = 8, /* bits 8 and 9 are used to carry task state */ - SCX_TASK_STATE_BITS = 2, + SCX_TASK_STATE_SHIFT = 8, + SCX_TASK_STATE_BITS = 3, SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT, SCX_TASK_NONE = 0 << SCX_TASK_STATE_SHIFT, SCX_TASK_INIT = 1 << SCX_TASK_STATE_SHIFT, SCX_TASK_READY = 2 << SCX_TASK_STATE_SHIFT, SCX_TASK_ENABLED = 3 << SCX_TASK_STATE_SHIFT, + SCX_TASK_DEAD = 4 << SCX_TASK_STATE_SHIFT, /* * Bits 12 and 13 are used to carry reenqueue reason. In addition to -- cgit v1.2.3 From c941d7391f258d5d06e0f7e962a52f99a547a83e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 10 May 2026 10:08:16 -1000 Subject: sched_ext: Close root-enable vs sched_ext_dead() race with SCX_TASK_INIT_BEGIN scx_root_enable_workfn() drops the iter rq lock for ops.init_task() and a TASK_DEAD @p can fall through sched_ext_dead() in that window. The race hits when sched_ext_dead() observes SCX_TASK_INIT (the intermediate state before @p->scx.sched is published) and dereferences NULL via SCX_HAS_OP(NULL, exit_task), or observes SCX_TASK_NONE during the unlocked init window and skips cleanup so exit_task() never runs. Add SCX_TASK_INIT_BEGIN. The enable path writes NONE -> INIT_BEGIN under the iter rq lock, then takes the rq lock again after init to walk INIT_BEGIN -> INIT -> READY. sched_ext_dead() that wins the rq-lock race observes INIT_BEGIN and sets DEAD without calling into ops; the post-init recheck unwinds via scx_sub_init_cancel_task(). scx_fork() runs single-threaded against sched_ext_dead() (the task is not on scx_tasks until scx_post_fork() adds it) so its INIT_BEGIN -> INIT walk needs no rq-lock pairing; it rolls back to NONE on ops.init_task() failure. The validation matrix grows the INIT_BEGIN row and the INIT_BEGIN -> DEAD edge; INIT now requires INIT_BEGIN as the predecessor. scx_sub_disable()'s migration writes INIT_BEGIN as a synthetic predecessor to satisfy the tightened verification. The sub-sched paths still race with sched_ext_dead() during the unlocked init window. This will be fixed by the next patch. Reported-by: zhidao su Link: https://lore.kernel.org/all/20260429133155.3825247-1-suzhidao@xiaomi.com/ Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 9f1a326ad03e..2129e18ada58 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -106,6 +106,7 @@ enum scx_ent_flags { * Bits 8 to 10 are used to carry task state: * * NONE ops.init_task() not called yet + * INIT_BEGIN ops.init_task() in flight; see sched_ext_dead() * INIT ops.init_task() succeeded, but task can be cancelled * READY fully initialized, but not in sched_ext * ENABLED fully initialized and in sched_ext @@ -116,10 +117,11 @@ enum scx_ent_flags { SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT, SCX_TASK_NONE = 0 << SCX_TASK_STATE_SHIFT, - SCX_TASK_INIT = 1 << SCX_TASK_STATE_SHIFT, - SCX_TASK_READY = 2 << SCX_TASK_STATE_SHIFT, - SCX_TASK_ENABLED = 3 << SCX_TASK_STATE_SHIFT, - SCX_TASK_DEAD = 4 << SCX_TASK_STATE_SHIFT, + SCX_TASK_INIT_BEGIN = 1 << SCX_TASK_STATE_SHIFT, + SCX_TASK_INIT = 2 << SCX_TASK_STATE_SHIFT, + SCX_TASK_READY = 3 << SCX_TASK_STATE_SHIFT, + SCX_TASK_ENABLED = 4 << SCX_TASK_STATE_SHIFT, + SCX_TASK_DEAD = 5 << SCX_TASK_STATE_SHIFT, /* * Bits 12 and 13 are used to carry reenqueue reason. In addition to -- cgit v1.2.3 From 657b594b2084b39a4bc6d8493aa2140cb00cea49 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 7 May 2026 16:46:29 +0900 Subject: fprobe: Fix unregister_fprobe() to wait for RCU grace period Commit 4346ba1604093 ("fprobe: Rewrite fprobe on function-graph tracer") changed fprobe to register struct fprobe to an rcu-hlist, but it forgot to wait for RCU GP. Thus there can be use-after-free if the fprobe is released right after unregistering. This can be happened on fprobe event and sample module code. To fix this issue, add synchronize_rcu() in unregister_fprobe(). Note that BPF is OK because fprobe is used as a part of bpf_kprobe_multi_link. This unregisters its fprobe in bpf_kprobe_multi_link_release() and it is deallocated via bpf_kprobe_multi_link_dealloc(), which is invoked from bpf_link_defer_dealloc_rcu_gp() RCU callback. For BPF, this also introduced unregister_fprobe_async() which does NOT wait for RCU grace priod. Link: https://lore.kernel.org/all/177813998919.256460.2809243930741138224.stgit@mhiramat.tok.corp.google.com/ Fixes: 4346ba1604093 ("fprobe: Rewrite fprobe on function-graph tracer") Signed-off-by: Masami Hiramatsu (Google) --- include/linux/fprobe.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h index 0a3bcd1718f3..be1b38c981d4 100644 --- a/include/linux/fprobe.h +++ b/include/linux/fprobe.h @@ -94,6 +94,7 @@ int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num); int register_fprobe_syms(struct fprobe *fp, const char **syms, int num); int unregister_fprobe(struct fprobe *fp); +int unregister_fprobe_async(struct fprobe *fp); bool fprobe_is_registered(struct fprobe *fp); int fprobe_count_ips_from_filter(const char *filter, const char *notfilter); #else @@ -113,6 +114,10 @@ static inline int unregister_fprobe(struct fprobe *fp) { return -EOPNOTSUPP; } +static inline int unregister_fprobe_async(struct fprobe *fp) +{ + return -EOPNOTSUPP; +} static inline bool fprobe_is_registered(struct fprobe *fp) { return false; -- cgit v1.2.3 From dec85d2fbd20de3711a71e65397dfdb40c3fa953 Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Wed, 6 May 2026 09:37:02 +0000 Subject: irqchip/gic-v5: Move LPI allocation into the LPI domain The IPI and ITS MSI domains currently allocate and release LPIs directly, then pass the selected LPI ID to the parent LPI domain. This leaks the LPI domain's allocation policy into its child domains and forces each child to duplicate part of the parent domain's teardown. Make the LPI domain allocate LPIs in its .alloc() callback and release them in a matching .free() callback. Child domains can then request a parent interrupt without passing an implementation-specific LPI ID, and the LPI lifetime is tied to the domain that owns the LPI namespace. Remove the gicv5_alloc_lpi() and gicv5_free_lpi() wrappers now that no external caller needs to manage LPIs directly. This is a preparatory change for an actual leakage problem in the allocation code and therefore tagged with the same Fixes tag. Fixes: 0f0101325876 ("irqchip/gic-v5: Add GICv5 LPI/IPI support") Signed-off-by: Sascha Bischoff Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Reviewed-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260506093634.382062-2-sascha.bischoff@arm.com --- include/linux/irqchip/arm-gic-v5.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h index 40d2fce68294..f78787e654f4 100644 --- a/include/linux/irqchip/arm-gic-v5.h +++ b/include/linux/irqchip/arm-gic-v5.h @@ -425,9 +425,6 @@ struct gicv5_its_itt_cfg { void gicv5_init_lpis(u32 max); void gicv5_deinit_lpis(void); -int gicv5_alloc_lpi(void); -void gicv5_free_lpi(u32 lpi); - void __init gicv5_its_of_probe(struct device_node *parent); void __init gicv5_its_acpi_probe(void); #endif -- cgit v1.2.3 From 4314a44564eb1565349fed7a4192344c5f46fc85 Mon Sep 17 00:00:00 2001 From: Yazhou Tang Date: Wed, 6 May 2026 17:47:12 +0800 Subject: bpf: Fix out-of-bounds read in bpf_patch_call_args() The interpreters_args array only accommodates stack depths up to MAX_BPF_STACK (512 bytes). However, do_misc_fixups() may allow a larger stack depth if JIT is requested. If JIT compilation later fails and falls back to the interpreter, the verifier invokes bpf_patch_call_args() with this oversized stack depth. This causes a load-time out-of-bounds (OOB) read when calculating the interpreter function pointer index. Fix this by changing bpf_patch_call_args() to return an int and explicitly rejecting the JIT fallback (returning -EINVAL) if the stack depth exceeds MAX_BPF_STACK. Fixes: 1ea47e01ad6e ("bpf: add support for bpf_call to interpreter") Co-developed-by: Tianci Cao Signed-off-by: Tianci Cao Co-developed-by: Shenghao Yuan Signed-off-by: Shenghao Yuan Signed-off-by: Yazhou Tang Acked-by: Xu Kuohai Link: https://lore.kernel.org/r/20260506094714.419842-2-tangyazhou@zju.edu.cn Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 01e203964892..52b30e9ea431 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2917,7 +2917,7 @@ int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size, int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size); #ifndef CONFIG_BPF_JIT_ALWAYS_ON -void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); +int bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); #endif struct btf *bpf_get_btf_vmlinux(void); -- cgit v1.2.3 From 58a8f3e2501dc14b8e00e883d6aaf0600a239da7 Mon Sep 17 00:00:00 2001 From: Yazhou Tang Date: Wed, 6 May 2026 17:47:13 +0800 Subject: bpf: Fix s16 truncation for large bpf-to-bpf call offsets Currently, the BPF instruction set allows bpf-to-bpf calls (or internal calls, pseudo calls) to use a 32-bit imm field to represent the relative jump offset. However, when JIT is disabled or falls back to the interpreter, the verifier invokes bpf_patch_call_args() to rewrite the call instruction. In this function, the 32-bit imm is downcast to s16 and stored in the off field. void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) { stack_depth = max_t(u32, stack_depth, 1); insn->off = (s16) insn->imm; insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] - __bpf_call_base_args; insn->code = BPF_JMP | BPF_CALL_ARGS; } If the original imm exceeds the s16 range (i.e., a jump offset greater than 32767 instructions), this downcast silently truncates the offset, resulting in an incorrect call target. Fix this by: 1. In bpf_patch_call_args(), keeping the imm field unchanged and using the off field to store the index of the interpreter function. 2. In ___bpf_prog_run() for the JMP_CALL_ARGS case, retrieving the interpreter function pointer from the interpreters_args array using the off field as the index, and passing the original imm to calculate the last argument of the interpreter function. After these changes, the truncation issue is resolved, and __bpf_call_base_args is also no longer needed and can be removed, which makes the code cleaner. Performance: In ___bpf_prog_run() for the JMP_CALL_ARGS case, changing the retrieval of the interpreter function pointer from pointer addition to direct array indexing improves performance. The possible reason is that the latter has better instruction-level parallelism. See the v5 discussion [1] for more details. [1] https://lore.kernel.org/bpf/f120c3c4-6999-414a-b514-518bb64b4758@zju.edu.cn/ To avoid requiring bpftool changes, keep the new imm/off encoding internal and restore the legacy xlated dump layout in bpf_insn_prepare_dump(). For bpf-to-bpf call offsets that do not fit in s16, export off as 0 instead of a truncated and misleading value. Fixes: 1ea47e01ad6e ("bpf: add support for bpf_call to interpreter") Fixes: 7105e828c087 ("bpf: allow for correlation of maps and helpers in dump") Suggested-by: Xu Kuohai Suggested-by: Puranjay Mohan Co-developed-by: Tianci Cao Signed-off-by: Tianci Cao Co-developed-by: Shenghao Yuan Signed-off-by: Shenghao Yuan Signed-off-by: Yazhou Tang Link: https://lore.kernel.org/r/20260506094714.419842-3-tangyazhou@zju.edu.cn Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++++++ include/linux/filter.h | 3 --- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 52b30e9ea431..cd191c5fdb0a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2918,6 +2918,12 @@ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, u32 ua #ifndef CONFIG_BPF_JIT_ALWAYS_ON int bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); +s32 bpf_call_args_imm(s16 idx); +#else +static inline s32 bpf_call_args_imm(s16 idx) +{ + return 0; +} #endif struct btf *bpf_get_btf_vmlinux(void); diff --git a/include/linux/filter.h b/include/linux/filter.h index 1ec6d5ba64cc..88a241aac36a 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1151,9 +1151,6 @@ bool sk_filter_charge(struct sock *sk, struct sk_filter *fp); void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); -#define __bpf_call_base_args \ - ((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \ - (void *)__bpf_call_base) struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); -- cgit v1.2.3 From 5dd74441cbf42c22e874450eb6a6bbb19390a216 Mon Sep 17 00:00:00 2001 From: Guopeng Zhang Date: Sat, 9 May 2026 18:20:31 +0800 Subject: cgroup/cpuset: Reserve DL bandwidth only for root-domain moves cpuset_can_attach() currently adds the bandwidth of all migrating SCHED_DEADLINE tasks to sum_migrate_dl_bw. If the source and destination cpuset effective CPU masks do not overlap, the whole sum is then reserved in the destination root domain. set_cpus_allowed_dl(), however, subtracts bandwidth from the source root domain only when the affinity change really moves the task between root domains. A DL task can move between cpusets that are still in the same root domain, so including that task in sum_migrate_dl_bw can reserve destination bandwidth without a matching source-side subtraction. Share the root-domain move test with set_cpus_allowed_dl(). Keep nr_migrate_dl_tasks counting all migrating deadline tasks for cpuset DL task accounting, but add to sum_migrate_dl_bw only for tasks that need a root-domain bandwidth move. Keep using the destination cpuset effective CPU mask and leave the broader can_attach()/attach() transaction model unchanged. Fixes: 2ef269ef1ac0 ("cgroup/cpuset: Free DL BW in case can_attach() fails") Cc: stable@vger.kernel.org # v6.10+ Signed-off-by: Guopeng Zhang Reviewed-by: Waiman Long Acked-by: Juri Lelli Tested-by: Juri Lelli Signed-off-by: Tejun Heo --- include/linux/sched/deadline.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index 1198138cb839..273538200a44 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -33,6 +33,15 @@ struct root_domain; extern void dl_add_task_root_domain(struct task_struct *p); extern void dl_clear_root_domain(struct root_domain *rd); extern void dl_clear_root_domain_cpu(int cpu); +/* + * Return whether moving DL task @p to @new_mask requires moving DL + * bandwidth accounting between root domains. This helper is specific to + * DL bandwidth move accounting semantics and is shared by + * cpuset_can_attach() and set_cpus_allowed_dl() so both paths use the + * same source root-domain test. + */ +extern bool dl_task_needs_bw_move(struct task_struct *p, + const struct cpumask *new_mask); extern u64 dl_cookie; extern bool dl_bw_visited(int cpu, u64 cookie); -- cgit v1.2.3 From b5782e2d462c028096f922abca46318cec890670 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 12 May 2026 13:33:40 +0100 Subject: netfs: Fix missing barriers when accessing stream->subrequests locklessly The list of subrequests attached to stream->subrequests is accessed without locks by netfs_collect_read_results() and netfs_collect_write_results(), and then they access subreq->flags without taking a barrier after getting the subreq pointer from the list. Relatedly, the functions that build the list don't use any sort of write barrier when constructing the list to make sure that the NETFS_SREQ_IN_PROGRESS flag is perceived to be set first if no lock is taken. Fix this by: (1) Add a new list_add_tail_release() function that uses a release barrier to set the pointer to the new member of the list. (2) Add a new list_first_entry_or_null_acquire() function that uses an acquire barrier to read the pointer to the first member in a list (or return NULL). (3) Use list_add_tail_release() when adding a subreq to ->subrequests. (4) Use list_first_entry_or_null_acquire() when initially accessing the front of the list (when an item is removed, the pointer to the new front iterm is obtained under the same lock). Fixes: e2d46f2ec332 ("netfs: Change the read result collector to only use one work item") Fixes: 288ace2f57c9 ("netfs: New writeback implementation") Link: https://sashiko.dev/#/patchset/20260326104544.509518-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260512123404.719402-4-dhowells@redhat.com cc: Paulo Alcantara cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/list.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index 00ea8e5fb88b..09d979976b3b 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -191,6 +191,29 @@ static inline void list_add_tail(struct list_head *new, struct list_head *head) __list_add(new, head->prev, head); } +/** + * list_add_tail_release - add a new entry with release barrier + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head, using a release barrier to set + * the ->next pointer that points to it. This is useful for implementing + * queues, in particular one that the elements will be walked through forwards + * locklessly. + */ +static inline void list_add_tail_release(struct list_head *new, + struct list_head *head) +{ + struct list_head *prev = head->prev; + + if (__list_add_valid(new, prev, head)) { + new->next = head; + new->prev = prev; + head->prev = new; + smp_store_release(&prev->next, new); + } +} + /* * Delete a list entry by making the prev/next entries * point to each other. @@ -644,6 +667,20 @@ static inline void list_splice_tail_init(struct list_head *list, pos__ != head__ ? list_entry(pos__, type, member) : NULL; \ }) +/** + * list_first_entry_or_null_acquire - get the first element from a list with barrier + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_head within the struct. + * + * Note that if the list is empty, it returns NULL. + */ +#define list_first_entry_or_null_acquire(ptr, type, member) ({ \ + struct list_head *head__ = (ptr); \ + struct list_head *pos__ = smp_load_acquire(&head__->next); \ + pos__ != head__ ? list_entry(pos__, type, member) : NULL; \ +}) + /** * list_last_entry_or_null - get the last element from a list * @ptr: the list head to take the element from. -- cgit v1.2.3 From 2c8f4742bb76117d735f92a3932d85239b16c494 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 12 May 2026 13:33:42 +0100 Subject: netfs: Fix potential for tearing in ->remote_i_size and ->zero_point Fix potential tearing in using ->remote_i_size and ->zero_point by copying i_size_read() and i_size_write() and using the same seqcount as for i_size. We need to make sure that netfslib and the filesystems that use it always hold i_lock whilst updating any of the sizes to prevent i_size_seqcount from getting corrupted. Fixes: 4058f742105e ("netfs: Keep track of the actual remote file size") Fixes: 100ccd18bb41 ("netfs: Optimise away reads above the point at which there can be no data") Closes: https://sashiko.dev/#/patchset/20260414082004.3756080-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260512123404.719402-6-dhowells@redhat.com cc: Paulo Alcantara cc: Matthew Wilcox cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 293 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 284 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index ba17ac5bf356..4fd1d796ad73 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -62,8 +62,8 @@ struct netfs_inode { struct fscache_cookie *cache; #endif struct mutex wb_lock; /* Writeback serialisation */ - loff_t remote_i_size; /* Size of the remote file */ - loff_t zero_point; /* Size after which we assume there's no data + loff_t _remote_i_size; /* Size of the remote file */ + loff_t _zero_point; /* Size after which we assume there's no data * on the server */ atomic_t io_count; /* Number of outstanding reqs */ unsigned long flags; @@ -474,6 +474,254 @@ static inline struct netfs_inode *netfs_inode(struct inode *inode) return container_of(inode, struct netfs_inode, inode); } +/** + * netfs_read_remote_i_size - Read remote_i_size safely + * @inode: The inode to access + * + * Read remote_i_size safely without the potential for tearing on 32-bit + * arches. + * + * NOTE: in a 32bit arch with a preemptable kernel and an UP compile the + * i_size_read/write must be atomic with respect to the local cpu (unlike with + * preempt disabled), but they don't need to be atomic with respect to other + * cpus like in true SMP (so they need either to either locally disable irq + * around the read or for example on x86 they can be still implemented as a + * cmpxchg8b without the need of the lock prefix). For SMP compiles and 64bit + * archs it makes no difference if preempt is enabled or not. + */ +static inline unsigned long long netfs_read_remote_i_size(const struct inode *inode) +{ + const struct netfs_inode *ictx = container_of(inode, struct netfs_inode, inode); + unsigned long long remote_i_size; + +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + unsigned int seq; + + do { + seq = read_seqcount_begin(&inode->i_size_seqcount); + remote_i_size = ictx->_remote_i_size; + } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + remote_i_size = ictx->_remote_i_size; + preempt_enable(); +#else + /* Pairs with smp_store_release() in netfs_write_remote_i_size() */ + remote_i_size = smp_load_acquire(&ictx->_remote_i_size); +#endif + return remote_i_size; +} + +/* + * netfs_write_remote_i_size - Set remote_i_size safely + * @inode: The inode to access + * @remote_i_size: The new value for the size of the file on the server + * + * Set remote_i_size safely without the potential for tearing on 32-bit arches. + * + * Context: The caller must hold inode->i_lock. + * + * NOTE: unlike netfs_read_remote_i_size(), netfs_write_remote_i_size() does + * need locking around it (normally i_rwsem), otherwise on 32bit/SMP an update + * of i_size_seqcount can be lost, resulting in subsequent i_size_read() calls + * spinning forever. + */ +static inline void netfs_write_remote_i_size(struct inode *inode, + unsigned long long remote_i_size) +{ + struct netfs_inode *ictx = netfs_inode(inode); + +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + write_seqcount_begin(&inode->i_size_seqcount); + ictx->_remote_i_size = remote_i_size; + write_seqcount_end(&inode->i_size_seqcount); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + ictx->_remote_i_size = remote_i_size; + preempt_enable(); +#else + /* + * Pairs with smp_load_acquire() in netfs_read_remote_i_size() to + * ensure changes related to inode size (such as page contents) are + * visible before we see the changed inode size. + */ + smp_store_release(&ictx->_remote_i_size, remote_i_size); +#endif +} + +/** + * netfs_read_zero_point - Read zero_point safely + * @inode: The inode to access + * + * Read zero_point safely without the potential for tearing on 32-bit + * arches. + * + * NOTE: in a 32bit arch with a preemptable kernel and an UP compile the + * i_size_read/write must be atomic with respect to the local cpu (unlike with + * preempt disabled), but they don't need to be atomic with respect to other + * cpus like in true SMP (so they need either to either locally disable irq + * around the read or for example on x86 they can be still implemented as a + * cmpxchg8b without the need of the lock prefix). For SMP compiles and 64bit + * archs it makes no difference if preempt is enabled or not. + */ +static inline unsigned long long netfs_read_zero_point(const struct inode *inode) +{ + struct netfs_inode *ictx = container_of(inode, struct netfs_inode, inode); + unsigned long long zero_point; + +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + unsigned int seq; + + do { + seq = read_seqcount_begin(&inode->i_size_seqcount); + zero_point = ictx->_zero_point; + } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + zero_point = ictx->_zero_point; + preempt_enable(); +#else + /* Pairs with smp_store_release() in netfs_write_zero_point() */ + zero_point = smp_load_acquire(&ictx->_zero_point); +#endif + return zero_point; +} + +/* + * netfs_write_zero_point - Set zero_point safely + * @inode: The inode to access + * @zero_point: The new value for the point beyond which the server has no data + * + * Set zero_point safely without the potential for tearing on 32-bit arches. + * + * Context: The caller must hold inode->i_lock. + * + * NOTE: unlike netfs_read_zero_point(), netfs_write_zero_point() does need + * locking around it (normally i_rwsem), otherwise on 32bit/SMP an update of + * i_size_seqcount can be lost, resulting in subsequent read calls spinning + * forever. + */ +static inline void netfs_write_zero_point(struct inode *inode, + unsigned long long zero_point) +{ + struct netfs_inode *ictx = netfs_inode(inode); + +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + write_seqcount_begin(&inode->i_size_seqcount); + ictx->_zero_point = zero_point; + write_seqcount_end(&inode->i_size_seqcount); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + ictx->_zero_point = zero_point; + preempt_enable(); +#else + /* + * Pairs with smp_load_acquire() in netfs_read_zero_point() to + * ensure changes related to inode size (such as page contents) are + * visible before we see the changed inode size. + */ + smp_store_release(&ictx->_zero_point, zero_point); +#endif +} + +/** + * netfs_read_sizes - Read remote_i_size and zero_point safely + * @inode: The inode to access + * @i_size: Where to return the local file size. + * @remote_i_size: Where to return the size of the file on the server + * @zero_point: Where to return the the point beyond which the server has no data + * + * Read remote_i_size and zero_point safely without the potential for tearing + * on 32-bit arches. + * + * NOTE: in a 32bit arch with a preemptable kernel and an UP compile the + * i_size_read/write must be atomic with respect to the local cpu (unlike with + * preempt disabled), but they don't need to be atomic with respect to other + * cpus like in true SMP (so they need either to either locally disable irq + * around the read or for example on x86 they can be still implemented as a + * cmpxchg8b without the need of the lock prefix). For SMP compiles and 64bit + * archs it makes no difference if preempt is enabled or not. + */ +static inline void netfs_read_sizes(const struct inode *inode, + unsigned long long *i_size, + unsigned long long *remote_i_size, + unsigned long long *zero_point) +{ + const struct netfs_inode *ictx = container_of(inode, struct netfs_inode, inode); +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + unsigned int seq; + + do { + seq = read_seqcount_begin(&inode->i_size_seqcount); + *i_size = inode->i_size; + *remote_i_size = ictx->_remote_i_size; + *zero_point = ictx->_zero_point; + } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + *i_size = inode->i_size; + *remote_i_size = ictx->_remote_i_size; + *zero_point = ictx->_zero_point; + preempt_enable(); +#else + /* Pairs with smp_store_release() in i_size_write() */ + *i_size = smp_load_acquire(&inode->i_size); + /* Pairs with smp_store_release() in netfs_write_remote_i_size() */ + *remote_i_size = smp_load_acquire(&ictx->_remote_i_size); + /* Pairs with smp_store_release() in netfs_write_zero_point() */ + *zero_point = smp_load_acquire(&ictx->_zero_point); +#endif +} + +/* + * netfs_write_sizes - Set i_size, remote_i_size and zero_point safely + * @inode: The inode to access + * @i_size: The new value for the local size of the file + * @remote_i_size: The new value for the size of the file on the server + * @zero_point: The new value for the point beyond which the server has no data + * + * Set both remote_i_size and zero_point safely without the potential for + * tearing on 32-bit arches. + * + * Context: The caller must hold inode->i_lock. + * + * NOTE: unlike netfs_read_zero_point(), netfs_write_zero_point() does need + * locking around it (normally i_rwsem), otherwise on 32bit/SMP an update of + * i_size_seqcount can be lost, resulting in subsequent read calls spinning + * forever. + */ +static inline void netfs_write_sizes(struct inode *inode, + unsigned long long i_size, + unsigned long long remote_i_size, + unsigned long long zero_point) +{ + struct netfs_inode *ictx = netfs_inode(inode); + +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + write_seqcount_begin(&inode->i_size_seqcount); + inode->i_size = i_size; + ictx->_remote_i_size = remote_i_size; + ictx->_zero_point = zero_point; + write_seqcount_end(&inode->i_size_seqcount); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + inode->i_size = i_size; + ictx->_remote_i_size = remote_i_size; + ictx->_zero_point = zero_point; + preempt_enable(); +#else + /* + * Pairs with smp_load_acquire() in i_size_read(), + * netfs_read_remote_i_size() and netfs_read_zero_point() to ensure + * changes related to inode size (such as page contents) are visible + * before we see the changed inode size. + */ + smp_store_release(&inode->i_size, i_size); + smp_store_release(&ictx->_remote_i_size, remote_i_size); + smp_store_release(&ictx->_zero_point, zero_point); +#endif +} + /** * netfs_inode_init - Initialise a netfslib inode context * @ctx: The netfs inode to initialise @@ -488,8 +736,8 @@ static inline void netfs_inode_init(struct netfs_inode *ctx, bool use_zero_point) { ctx->ops = ops; - ctx->remote_i_size = i_size_read(&ctx->inode); - ctx->zero_point = LLONG_MAX; + ctx->_remote_i_size = i_size_read(&ctx->inode); + ctx->_zero_point = LLONG_MAX; ctx->flags = 0; atomic_set(&ctx->io_count, 0); #if IS_ENABLED(CONFIG_FSCACHE) @@ -498,7 +746,7 @@ static inline void netfs_inode_init(struct netfs_inode *ctx, mutex_init(&ctx->wb_lock); /* ->releasepage() drives zero_point */ if (use_zero_point) { - ctx->zero_point = ctx->remote_i_size; + ctx->_zero_point = ctx->_remote_i_size; mapping_set_release_always(ctx->inode.i_mapping); } } @@ -511,13 +759,40 @@ static inline void netfs_inode_init(struct netfs_inode *ctx, * * Inform the netfs lib that a file got resized so that it can adjust its state. */ -static inline void netfs_resize_file(struct netfs_inode *ctx, loff_t new_i_size, +static inline void netfs_resize_file(struct netfs_inode *ictx, + unsigned long long new_i_size, bool changed_on_server) { +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + struct inode *inode = &ictx->inode; + + preempt_disable(); + write_seqcount_begin(&inode->i_size_seqcount); + if (changed_on_server) + ictx->_remote_i_size = new_i_size; + if (new_i_size < ictx->_zero_point) + ictx->_zero_point = new_i_size; + write_seqcount_end(&inode->i_size_seqcount); + preempt_enable(); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); if (changed_on_server) - ctx->remote_i_size = new_i_size; - if (new_i_size < ctx->zero_point) - ctx->zero_point = new_i_size; + ictx->_remote_i_size = new_i_size; + if (new_i_size < ictx->_zero_point) + ictx->_zero_point = new_i_size; + preempt_enable(); +#else + /* + * Pairs with smp_load_acquire() in netfs_read_remote_i_size and + * netfs_read_zero_point() to ensure changes related to inode size + * (such as page contents) are visible before we see the changed inode + * size. + */ + if (changed_on_server) + smp_store_release(&ictx->_remote_i_size, new_i_size); + if (new_i_size < ictx->_zero_point) + smp_store_release(&ictx->_zero_point, new_i_size); +#endif } /** -- cgit v1.2.3 From dbe556972100fabb8e5a1b3d2163831ff07b1e8e Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 12 May 2026 13:33:56 +0100 Subject: netfs: Fix potential UAF in netfs_unlock_abandoned_read_pages() netfs_unlock_abandoned_read_pages(rreq) accesses the index of the folios it is wanting to unlock and compares that to rreq->no_unlock_folio so that it doesn't unlock a folio being read for netfs_perform_write() or netfs_write_begin(). However, given that netfs_unlock_abandoned_read_pages() is called _after_ NETFS_RREQ_IN_PROGRESS is cleared, the one folio that it's not allowed to dereference is the one specified by ->no_unlock_folio as ownership immediately reverts to the caller. Fix this by storing the folio pointer instead and using that rather than the index. Also fix netfs_unlock_read_folio() where the same applies. Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Closes: https://sashiko.dev/#/patchset/20260414082004.3756080-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260512123404.719402-20-dhowells@redhat.com cc: Paulo Alcantara cc: Viacheslav Dubeyko cc: Matthew Wilcox cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 4fd1d796ad73..243c0f737938 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -252,7 +252,7 @@ struct netfs_io_request { unsigned long long collected_to; /* Point we've collected to */ unsigned long long cleaned_to; /* Position we've cleaned folios to */ unsigned long long abandon_to; /* Position to abandon folios to */ - pgoff_t no_unlock_folio; /* Don't unlock this folio after read */ + const struct folio *no_unlock_folio; /* Don't unlock this folio after read */ unsigned int direct_bv_count; /* Number of elements in direct_bv[] */ unsigned int debug_id; unsigned int rsize; /* Maximum read size (0 for none) */ -- cgit v1.2.3 From 2c85c61d1332e1e16f020d76951baf167dcb6f7a Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Mon, 4 May 2026 10:47:22 +0200 Subject: HID: pass the buffer size to hid_report_raw_event commit 0a3fe972a7cb ("HID: core: Mitigate potential OOB by removing bogus memset()") enforced the provided data to be at least the size of the declared buffer in the report descriptor to prevent a buffer overflow. However, we can try to be smarter by providing both the buffer size and the data size, meaning that hid_report_raw_event() can make better decision whether we should plaining reject the buffer (buffer overflow attempt) or if we can safely memset it to 0 and pass it to the rest of the stack. Fixes: 0a3fe972a7cb ("HID: core: Mitigate potential OOB by removing bogus memset()") Cc: stable@vger.kernel.org Signed-off-by: Benjamin Tissoires Acked-by: Johan Hovold Reviewed-by: Greg Kroah-Hartman Signed-off-by: Jiri Kosina --- include/linux/hid.h | 4 ++-- include/linux/hid_bpf.h | 14 +++++++++----- 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index 442a80d79e89..ac432a2ef415 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1298,8 +1298,8 @@ static inline u32 hid_report_len(struct hid_report *report) return DIV_ROUND_UP(report->size, 8) + (report->id > 0); } -int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, - int interrupt); +int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 *data, + size_t bufsize, u32 size, int interrupt); /* HID quirks API */ unsigned long hid_lookup_quirk(const struct hid_device *hdev); diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h index a2e47dbcf82c..19fffa4574a4 100644 --- a/include/linux/hid_bpf.h +++ b/include/linux/hid_bpf.h @@ -72,8 +72,8 @@ struct hid_ops { int (*hid_hw_output_report)(struct hid_device *hdev, __u8 *buf, size_t len, u64 source, bool from_bpf); int (*hid_input_report)(struct hid_device *hid, enum hid_report_type type, - u8 *data, u32 size, int interrupt, u64 source, bool from_bpf, - bool lock_already_taken); + u8 *data, size_t bufsize, u32 size, int interrupt, u64 source, + bool from_bpf, bool lock_already_taken); struct module *owner; const struct bus_type *bus_type; }; @@ -200,7 +200,8 @@ struct hid_bpf { #ifdef CONFIG_HID_BPF u8 *dispatch_hid_bpf_device_event(struct hid_device *hid, enum hid_report_type type, u8 *data, - u32 *size, int interrupt, u64 source, bool from_bpf); + size_t *buf_size, u32 *size, int interrupt, u64 source, + bool from_bpf); int dispatch_hid_bpf_raw_requests(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, u32 size, enum hid_report_type rtype, @@ -215,8 +216,11 @@ int hid_bpf_device_init(struct hid_device *hid); const u8 *call_hid_bpf_rdesc_fixup(struct hid_device *hdev, const u8 *rdesc, unsigned int *size); #else /* CONFIG_HID_BPF */ static inline u8 *dispatch_hid_bpf_device_event(struct hid_device *hid, enum hid_report_type type, - u8 *data, u32 *size, int interrupt, - u64 source, bool from_bpf) { return data; } + u8 *data, size_t *buf_size, u32 *size, + int interrupt, u64 source, bool from_bpf) +{ + return data; +} static inline int dispatch_hid_bpf_raw_requests(struct hid_device *hdev, unsigned char reportnum, u8 *buf, u32 size, enum hid_report_type rtype, -- cgit v1.2.3 From 206342541fc887ae919774a43942dc883161fece Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Mon, 4 May 2026 10:47:23 +0200 Subject: HID: core: introduce hid_safe_input_report() hid_input_report() is used in too many places to have a commit that doesn't cross subsystem borders. Instead of changing the API, introduce a new one when things matters in the transport layers: - usbhid - i2chid This effectively revert to the old behavior for those two transport layers. Fixes: 0a3fe972a7cb ("HID: core: Mitigate potential OOB by removing bogus memset()") Cc: stable@vger.kernel.org Signed-off-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- include/linux/hid.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index ac432a2ef415..bfb9859f391e 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1030,6 +1030,8 @@ struct hid_field *hid_find_field(struct hid_device *hdev, unsigned int report_ty int hid_set_field(struct hid_field *, unsigned, __s32); int hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, int interrupt); +int hid_safe_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, + size_t bufsize, u32 size, int interrupt); struct hid_field *hidinput_get_led_field(struct hid_device *hid); unsigned int hidinput_count_leds(struct hid_device *hid); __s32 hidinput_calc_abs_res(const struct hid_field *field, __u16 code); -- cgit v1.2.3 From 32d5019ed3b6ff4439cb075fb275f655c8a2059c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 May 2026 07:01:47 +0200 Subject: block: pass a minsize argument to bio_iov_iter_bounce When bouncing for block size > PAGE_SIZE file systems that require file system block size alignment (e.g. zoned XFS), the bio needs to be big enough to fit an entire block. Fixes: 8dd5e7c75d7b ("block: add helpers to bounce buffer an iov_iter into bios") Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20260507050153.1298375-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 97d747320b35..dc17780d6c1e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -475,7 +475,8 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); -int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen); +int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen, + size_t minsize); void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty); extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, -- cgit v1.2.3 From df733ddc263dbe5f471e7c80c8b669532f56bf76 Mon Sep 17 00:00:00 2001 From: Matt Evans Date: Mon, 11 May 2026 07:46:42 -0700 Subject: vfio/pci: Make VFIO_PCI_OFFSET_TO_INDEX() return unsigned VFIO_PCI_OFFSET_TO_INDEX() is used in several places with a signed parameter (e.g. loff_t). Because it makes no sense for a BAR/resource index to be negative, enforce this in the macro. This fixes at least one current issue, where vfio_pci_ioeventfd() uses this macro with an unvalidated signed loff_t returned into a signed type, leading to a possible negative array access. This instance does test against an out-of-bounds positive value, so treating the index as unsigned fixes this issue. Fixes: 89e1f7d4c66d8 ("vfio: Add PCI device driver") Signed-off-by: Matt Evans Link: https://lore.kernel.org/r/20260511144642.2926799-1-mattev@meta.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 2ebba746c18f..89165b769e5c 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -21,7 +21,7 @@ #define VFIO_PCI_CORE_H #define VFIO_PCI_OFFSET_SHIFT 40 -#define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) +#define VFIO_PCI_OFFSET_TO_INDEX(off) ((u64)(off) >> VFIO_PCI_OFFSET_SHIFT) #define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) #define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) -- cgit v1.2.3 From 6a288a4ddb4a994490505ab5f41c445f8e6b6467 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Tue, 21 Apr 2026 17:39:07 +0200 Subject: mm/page_alloc: fix initialization of tags of the huge zero folio with init_on_free __GFP_ZEROTAGS semantics are currently a bit weird, but effectively this flag is only ever set alongside __GFP_ZERO and __GFP_SKIP_KASAN. If we run with init_on_free, we will zero out pages during __free_pages_prepare(), to skip zeroing on the allocation path. However, when allocating with __GFP_ZEROTAG set, post_alloc_hook() will consequently not only skip clearing page content, but also skip clearing tag memory. Not clearing tags through __GFP_ZEROTAGS is irrelevant for most pages that will get mapped to user space through set_pte_at() later: set_pte_at() and friends will detect that the tags have not been initialized yet (PG_mte_tagged not set), and initialize them. However, for the huge zero folio, which will be mapped through a PMD marked as special, this initialization will not be performed, ending up exposing whatever tags were still set for the pages. The docs (Documentation/arch/arm64/memory-tagging-extension.rst) state that allocation tags are set to 0 when a page is first mapped to user space. That no longer holds with the huge zero folio when init_on_free is enabled. Fix it by decoupling __GFP_ZEROTAGS from __GFP_ZERO, passing to tag_clear_highpages() whether we want to also clear page content. Invert the meaning of the tag_clear_highpages() return value to have clearer semantics. Reproduced with the huge zero folio by modifying the check_buffer_fill arm64/mte selftest to use a 2 MiB area, after making sure that pages have a non-0 tag set when freeing (note that, during boot, we will not actually initialize tags, but only set KASAN_TAG_KERNEL in the page flags). $ ./check_buffer_fill 1..20 ... not ok 17 Check initial tags with private mapping, sync error mode and mmap memory not ok 18 Check initial tags with private mapping, sync error mode and mmap/mprotect memory ... This code needs more cleanups; we'll tackle that next, like decoupling __GFP_ZEROTAGS from __GFP_SKIP_KASAN. [akpm@linux-foundation.org: s/__GPF_ZERO/__GFP_ZERO/, per David] Link: https://lore.kernel.org/20260421-zerotags-v2-1-05cb1035482e@kernel.org Fixes: adfb6609c680 ("mm/huge_memory: initialise the tags of the huge zero folio") Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Catalin Marinas Tested-by: Lance Yang Cc: Brendan Jackman Cc: Dev Jain Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Mark Brown Cc: Michal Hocko Cc: Mike Rapoport Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Will Deacon Cc: Zi Yan Cc: Signed-off-by: Andrew Morton --- include/linux/gfp_types.h | 10 +++++----- include/linux/highmem.h | 7 ++++--- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 6c75df30a281..cd4972a7c97c 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -273,11 +273,11 @@ enum { * * %__GFP_ZERO returns a zeroed page on success. * - * %__GFP_ZEROTAGS zeroes memory tags at allocation time if the memory itself - * is being zeroed (either via __GFP_ZERO or via init_on_alloc, provided that - * __GFP_SKIP_ZERO is not set). This flag is intended for optimization: setting - * memory tags at the same time as zeroing memory has minimal additional - * performance impact. + * %__GFP_ZEROTAGS zeroes memory tags at allocation time. Setting memory tags at + * the same time as zeroing memory (e.g., with __GFP_ZERO) has minimal + * additional performance impact. However, __GFP_ZEROTAGS also zeroes the tags + * even if memory is not getting zeroed at allocation time (e.g., + * with init_on_free). * * %__GFP_SKIP_KASAN makes KASAN skip unpoisoning on page allocation. * Used for userspace and vmalloc pages; the latter are unpoisoned by diff --git a/include/linux/highmem.h b/include/linux/highmem.h index af03db851a1d..d7aac9de1c8a 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -347,10 +347,11 @@ static inline void clear_highpage_kasan_tagged(struct page *page) #ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGES -/* Return false to let people know we did not initialize the pages */ -static inline bool tag_clear_highpages(struct page *page, int numpages) +/* Returns true if the caller has to initialize the pages */ +static inline bool tag_clear_highpages(struct page *page, int numpages, + bool clear_pages) { - return false; + return clear_pages; } #endif -- cgit v1.2.3 From 31e62c2ebbfdc3fe3dbdf5e02c92a9dc67087a3a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 13 May 2026 11:37:18 -0700 Subject: ptrace: slightly saner 'get_dumpable()' logic The 'dumpability' of a task is fundamentally about the memory image of the task - the concept comes from whether it can core dump or not - and makes no sense when you don't have an associated mm. And almost all users do in fact use it only for the case where the task has a mm pointer. But we have one odd special case: ptrace_may_access() uses 'dumpable' to check various other things entirely independently of the MM (typically explicitly using flags like PTRACE_MODE_READ_FSCREDS). Including for threads that no longer have a VM (and maybe never did, like most kernel threads). It's not what this flag was designed for, but it is what it is. The ptrace code does check that the uid/gid matches, so you do have to be uid-0 to see kernel thread details, but this means that the traditional "drop capabilities" model doesn't make any difference for this all. Make it all make a *bit* more sense by saying that if you don't have a MM pointer, we'll use a cached "last dumpability" flag if the thread ever had a MM (it will be zero for kernel threads since it is never set), and require a proper CAP_SYS_PTRACE capability to override. Reported-by: Qualys Security Advisory Cc: Oleg Nesterov Cc: Kees Cook Signed-off-by: Linus Torvalds --- include/linux/sched.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 368c7b4d7cb5..ee06cba5c6f5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1002,6 +1002,9 @@ struct task_struct { unsigned sched_rt_mutex:1; #endif + /* Save user-dumpable when mm goes away */ + unsigned user_dumpable:1; + /* Bit to tell TOMOYO we're in execve(): */ unsigned in_execve:1; unsigned in_iowait:1; -- cgit v1.2.3 From 3d562d35a044ae798cab421c65a116f8cedfa5d4 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sun, 17 May 2026 09:55:28 +0200 Subject: bpf: Check global subprog exception paths Global subprogs are verified independently and are not descended into when their callers are symbolically executed. This means a caller can hold references or locks across a global subprog call that may throw, while the verifier only checks the non-exceptional return path at the call site. Record whether a subprog might throw in the CFG summary pass, alongside the existing might_sleep and packet-data-changing summaries, and propagate that effect through reachable callees. When a global subprog is marked as possibly throwing, push the normal continuation and validate the exceptional path immediately at the call site, avoiding a synthetic exception state and associated special case in the pruning checks. Fixes: f18b03fabaa9 ("bpf: Implement BPF exceptions") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260517075530.3461166-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b148f816f25b..185b2aa43a42 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -729,6 +729,7 @@ struct bpf_subprog_info { */ s16 fastcall_stack_off; bool has_tail_call: 1; + bool might_throw: 1; bool tail_call_reachable: 1; bool has_ld_abs: 1; bool is_cb: 1; @@ -1308,6 +1309,7 @@ void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask); bool bpf_subprog_is_global(const struct bpf_verifier_env *env, int subprog); int bpf_find_subprog(struct bpf_verifier_env *env, int off); +bool bpf_is_throw_kfunc(struct bpf_insn *insn); int bpf_compute_const_regs(struct bpf_verifier_env *env); int bpf_prune_dead_branches(struct bpf_verifier_env *env); int bpf_check_cfg(struct bpf_verifier_env *env); -- cgit v1.2.3 From f233124fb36cd57ef09f96d517a38ab4b902e15e Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 14 May 2026 09:39:01 +0200 Subject: ata: libata-scsi: do not use the deferred QC feature on PMPs with CBS When using Port Multipliers (PMPs) with Command-Based Switching (CBS), you can only issue commands to one link at a time. For PMPs with CBS, there is already code to handle commands being sent to different links in sata_pmp_qc_defer_cmd_switch() using ap->excl_link. sata_sil24 also makes use of ap->excl_link. A user on the list reported that commit 0ea84089dbf6 ("ata: libata-scsi: avoid Non-NCQ command starvation") broke PMPs with CBS. The commit introduced code that stores a deferred qc in ap->deferred_qc, to later be issued via a workqueue. It turns out that this change is incompatible with the existing ap->excl_link handling used by PMPs with CBS. Thus, modify sata_pmp_qc_defer_cmd_switch() and sil24_qc_defer() to return ATA_DEFER_LINK_EXCL, and make sure that the deferred QC handling via workqueue is not used for this return value. This way, PMPs with CBS will work once again. Note that the starvation referenced in commit 0ea84089dbf6 ("ata: libata-scsi: avoid Non-NCQ command starvation") can only happen on libsas ports, and libsas does not support Port Multipliers, thus there is no harm of reverting back to the previous way of deferring commands for PMPs with CBS. Non-libsas ports connected to anything but a PMP with CBS (e.g. a normal drive or a PMP with FBS) will continue using the deferred workqueue, since it does result in lower completion latencies for non-NCQ commands, even though the workqueue is not strictly needed to avoid starvation for non-libsas ports. If we want to modify the scope of the workqueue issuing to also handle PMPs with CBS, then we should ensure that we can save both NCQ and non-NCQ commands in ap->deferred_qc, while also removing the existing PMP CBS handling using ap->excl_link, such that we don't duplicate features. While at it, also add a comment explaining how the ap->excl_link mechanism works. Fixes: 0ea84089dbf6 ("ata: libata-scsi: avoid Non-NCQ command starvation") Tested-by: Tommy Kelly Reported-by: Tommy Kelly Closes: https://lore.kernel.org/linux-ide/ce09cc21-a8e9-4845-b205-35411e22fba9@tkel.ly/ Reviewed-by: Damien Le Moal Signed-off-by: Niklas Cassel --- include/linux/libata.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 5c085ef4eda7..360776016b50 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -371,6 +371,7 @@ enum { /* return values for ->qc_defer */ ATA_DEFER_LINK = 1, ATA_DEFER_PORT = 2, + ATA_DEFER_LINK_EXCL = 3, /* desc_len for ata_eh_info and context */ ATA_EH_DESC_LEN = 80, -- cgit v1.2.3 From 759e8756da00aa115d504a18155b1d1ee1cc12e8 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 14 May 2026 09:39:02 +0200 Subject: ata: libata-scsi: do not needlessly defer commands when using PMP with FBS The ACS specification does not allow a non-NCQ command to be issued while an NCQ command is outstanding. Commit 0ea84089dbf6 ("ata: libata-scsi: avoid Non-NCQ command starvation") introduced a feature where a deferred non-NCQ command gets issued from a workqueue. The design stores a single non-NCQ command per port. However, when using Port Multipliers (PMPs), specifically PMPs that support FIS-Based Switching (FBS), non-NCQ and NCQ commands can be mixed on the same port, just not for the same link, see e.g. ata_std_qc_defer() which is, and always has operated on a per-link basis. Therefore, move the deferred_qc from struct ata_port to struct ata_link. This way, when using a PMP with FBS, we will not needlessly defer commands to all other links, just because one link issued a non-NCQ command while having an NCQ command outstanding. Only commands for that specific link will be deferred. This is in line with how PMPs with FBS worked before commit 0ea84089dbf6 ("ata: libata-scsi: avoid Non-NCQ command starvation"). Fixes: 0ea84089dbf6 ("ata: libata-scsi: avoid Non-NCQ command starvation") Tested-by: Tommy Kelly Reviewed-by: Damien Le Moal Signed-off-by: Niklas Cassel --- include/linux/libata.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 360776016b50..127229fbd1a6 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -855,6 +855,9 @@ struct ata_link { unsigned int sata_spd; /* current SATA PHY speed */ enum ata_lpm_policy lpm_policy; + struct work_struct deferred_qc_work; + struct ata_queued_cmd *deferred_qc; + /* record runtime error info, protected by host_set lock */ struct ata_eh_info eh_info; /* EH context */ @@ -900,9 +903,6 @@ struct ata_port { u64 qc_active; int nr_active_links; /* #links with active qcs */ - struct work_struct deferred_qc_work; - struct ata_queued_cmd *deferred_qc; - struct ata_link link; /* host default link */ struct ata_link *slave_link; /* see ata_slave_link_init() */ -- cgit v1.2.3 From 8817005efbdfdf5d4e4814cb5dc52b53d12917d7 Mon Sep 17 00:00:00 2001 From: Qing Ming Date: Sat, 16 May 2026 15:08:49 +0800 Subject: cgroup/rstat: validate cpu before css_rstat_cpu() access css_rstat_updated() is exposed as a BPF kfunc and accepts a caller-provided cpu argument. The function uses cpu for per-cpu rstat lookups without checking whether it refers to a valid possible CPU. A BPF iter/cgroup program with CAP_BPF and CAP_PERFMON can pass an invalid cpu value. On an unfixed UBSCAN_BOUNDS test kernel, cpu == 0x7fffffff triggers: UBSAN: array-index-out-of-bounds in kernel/cgroup/rstat.c:31:9 index 2147483647 is out of range for type 'long unsigned int [64]' Call Trace: css_rstat_updated bpf_iter_run_prog cgroup_iter_seq_show bpf_seq_read Add cpu validation to the BPF-facing css_rstat_updated() kfunc and move the common implementation to __css_rstat_updated() for in-kernel callers. Fixes: a319185be9f5 ("cgroup: bpf: enable bpf programs to integrate with rstat") Signed-off-by: Qing Ming Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index e52160e85af4..e011dc43fcf1 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -776,6 +776,7 @@ static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) /* * cgroup scalable recursive statistics. */ +void __css_rstat_updated(struct cgroup_subsys_state *css, int cpu); void css_rstat_updated(struct cgroup_subsys_state *css, int cpu); void css_rstat_flush(struct cgroup_subsys_state *css); -- cgit v1.2.3 From 8939562b16052c75b908d3c5f968bffb526fc6e9 Mon Sep 17 00:00:00 2001 From: Rong Tao Date: Mon, 18 May 2026 15:02:08 +0800 Subject: efi: efi.h: Remove extra semicolon Remove extra semicolons from comments. Signed-off-by: Rong Tao Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 72e76ec54641..ccbc35479684 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -61,7 +61,7 @@ typedef void *efi_handle_t; /* * The UEFI spec and EDK2 reference implementation both define EFI_GUID as - * struct { u32 a; u16; b; u16 c; u8 d[8]; }; and so the implied alignment + * struct { u32 a; u16 b; u16 c; u8 d[8]; }; and so the implied alignment * is 32 bits not 8 bits like our guid_t. In some cases (i.e., on 32-bit ARM), * this means that firmware services invoked by the kernel may assume that * efi_guid_t* arguments are 32-bit aligned, and use memory accessors that -- cgit v1.2.3 From 0cb5a74faa3bdcfa3b18735d554e12c0f615e35d Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Mon, 18 May 2026 15:44:57 +0200 Subject: net: airoha: Fix NPU RX DMA descriptor bits In an internal review from Airoha, it was notice that the RX DMA descriptor bits and mask are wrong. These values probably refer to an old NPU firmware never published. The previous value works correctly but it was reported that in some specific condition in mixed scenario with both Ethernet and WiFi offload it's possible that RX DMA descriptor signal wrong value with the problem to the RX ring or packets getting dropped. To handle these specific scenario, apply the new suggested bits mask from Airoha. Correct functionality of both AN7581 NPU and MT7996 variant were verified and confirmed working. Fixes: a7fc8c641cab ("net: airoha: Fix npu rx DMA definitions") Signed-off-by: Christian Marangi Acked-by: Lorenzo Bianconi Link: https://patch.msgid.link/20260518134530.3683-1-ansuelsmth@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/soc/airoha/airoha_offload.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soc/airoha/airoha_offload.h b/include/linux/soc/airoha/airoha_offload.h index d01ef4a6b3d7..7589fccfeef6 100644 --- a/include/linux/soc/airoha/airoha_offload.h +++ b/include/linux/soc/airoha/airoha_offload.h @@ -71,9 +71,9 @@ static inline void airoha_ppe_dev_check_skb(struct airoha_ppe_dev *dev, #define NPU_RX1_DESC_NUM 512 /* CTRL */ -#define NPU_RX_DMA_DESC_LAST_MASK BIT(27) -#define NPU_RX_DMA_DESC_LEN_MASK GENMASK(26, 14) -#define NPU_RX_DMA_DESC_CUR_LEN_MASK GENMASK(13, 1) +#define NPU_RX_DMA_DESC_LAST_MASK BIT(29) +#define NPU_RX_DMA_DESC_LEN_MASK GENMASK(28, 15) +#define NPU_RX_DMA_DESC_CUR_LEN_MASK GENMASK(14, 1) #define NPU_RX_DMA_DESC_DONE_MASK BIT(0) /* INFO */ #define NPU_RX_DMA_PKT_COUNT_MASK GENMASK(31, 29) -- cgit v1.2.3 From 988369959e9cc6dad1c3260ebaeb7a3bb5653ab2 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Wed, 20 May 2026 16:43:37 +0200 Subject: video/hdmi: Add common TMDS character rate constants Several DRM drivers already define their own constants for minimum and maximum TMDS character rates. By defining common rate constants in a shared header, drivers can just use them instead of having driver local define macros or use magic numbers. The values defined in the header correspond to maximum TMDS character rates defined by each HDMI specification version: - HDMI_TMDS_CHAR_RATE_MIN_HZ: 25 MHz (minimum for all versions) - HDMI_1_0_TMDS_CHAR_RATE_MAX_HZ: 165 MHz (HDMI 1.0 maximum) - HDMI_1_3_TMDS_CHAR_RATE_MAX_HZ: 340 MHz (HDMI 1.3 maximum) - HDMI_2_0_TMDS_CHAR_RATE_MAX_HZ: 600 MHz (HDMI 2.0 maximum) Suggested-by: Maxime Ripard Reviewed-by: Heiko Stuebner Reviewed-by: Neil Armstrong Reviewed-by: Dmitry Baryshkov Reviewed-by: Maxime Ripard Link: https://patch.msgid.link/20260520144424.1633354-2-javierm@redhat.com Signed-off-by: Javier Martinez Canillas --- include/linux/hdmi.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hdmi.h b/include/linux/hdmi.h index 96bda41d9148..8dab78e1f61b 100644 --- a/include/linux/hdmi.h +++ b/include/linux/hdmi.h @@ -50,6 +50,12 @@ enum hdmi_infoframe_type { HDMI_INFOFRAME_TYPE_DRM = 0x87, }; +/* HDMI spec maximum TMDS character rates, in Hz */ +#define HDMI_TMDS_CHAR_RATE_MIN_HZ 25000000 +#define HDMI_1_0_TMDS_CHAR_RATE_MAX_HZ 165000000 +#define HDMI_1_3_TMDS_CHAR_RATE_MAX_HZ 340000000 +#define HDMI_2_0_TMDS_CHAR_RATE_MAX_HZ 600000000 + #define HDMI_IEEE_OUI 0x000c03 #define HDMI_FORUM_IEEE_OUI 0xc45dd8 #define HDMI_INFOFRAME_HEADER_SIZE 4 -- cgit v1.2.3 From 215c90ee656114f5e8c32408228d97082f8e0eef Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 6 May 2026 13:57:00 +0200 Subject: device property: set fwnode->secondary to NULL in fwnode_init() If a firmware node is allocated on the stack (for instance: temporary software node whose life-time we control) or on the heap - but using a non-zeroing allocation function - and initialized using fwnode_init(), its secondary pointer will contain uninitalized memory which likely will be neither NULL nor IS_ERR() and so may end up being dereferenced (for example: in dev_to_swnode()). Set fwnode->secondary to NULL on initialization. Cc: stable Fixes: 01bb86b380a3 ("driver core: Add fwnode_init()") Signed-off-by: Bartosz Golaszewski Reviewed-by: Rafael J. Wysocki (Intel) Reviewed-by: Andy Shevchenko Reviewed-by: Sakari Ailus Link: https://patch.msgid.link/20260506115701.23035-1-bartosz.golaszewski@oss.qualcomm.com Signed-off-by: Greg Kroah-Hartman --- include/linux/fwnode.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 80b38fbf2121..31df7608737e 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -208,6 +208,7 @@ struct fwnode_operations { static inline void fwnode_init(struct fwnode_handle *fwnode, const struct fwnode_operations *ops) { + fwnode->secondary = NULL; fwnode->ops = ops; INIT_LIST_HEAD(&fwnode->consumers); INIT_LIST_HEAD(&fwnode->suppliers); -- cgit v1.2.3