From da142f3d373a6ddaca0119615a8db2175ddc4121 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 15:26:55 -0800 Subject: KVM: Remove subtle "struct kvm_stats_desc" pseudo-overlay Remove KVM's internal pseudo-overlay of kvm_stats_desc, which subtly aliases the flexible name[] in the uAPI definition with a fixed-size array of the same name. The unusual embedded structure results in compiler warnings due to -Wflex-array-member-not-at-end, and also necessitates an extra level of dereferencing in KVM. To avoid the "overlay", define the uAPI structure to have a fixed-size name when building for the kernel. Opportunistically clean up the indentation for the stats macros, and replace spaces with tabs. No functional change intended. Reported-by: Gustavo A. R. Silva Closes: https://lore.kernel.org/all/aPfNKRpLfhmhYqfP@kspp Acked-by: Marc Zyngier Acked-by: Christian Borntraeger [..] Acked-by: Anup Patel Reviewed-by: Bibo Mao Acked-by: Gustavo A. R. Silva Link: https://patch.msgid.link/20251205232655.445294-1-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 83 ++++++++++++++++++++---------------------------- 1 file changed, 35 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d93f75b05ae2..7428d9949382 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1927,56 +1927,43 @@ enum kvm_stat_kind { struct kvm_stat_data { struct kvm *kvm; - const struct _kvm_stats_desc *desc; + const struct kvm_stats_desc *desc; enum kvm_stat_kind kind; }; -struct _kvm_stats_desc { - struct kvm_stats_desc desc; - char name[KVM_STATS_NAME_SIZE]; -}; - -#define STATS_DESC_COMMON(type, unit, base, exp, sz, bsz) \ - .flags = type | unit | base | \ - BUILD_BUG_ON_ZERO(type & ~KVM_STATS_TYPE_MASK) | \ - BUILD_BUG_ON_ZERO(unit & ~KVM_STATS_UNIT_MASK) | \ - BUILD_BUG_ON_ZERO(base & ~KVM_STATS_BASE_MASK), \ - .exponent = exp, \ - .size = sz, \ +#define STATS_DESC_COMMON(type, unit, base, exp, sz, bsz) \ + .flags = type | unit | base | \ + BUILD_BUG_ON_ZERO(type & ~KVM_STATS_TYPE_MASK) | \ + BUILD_BUG_ON_ZERO(unit & ~KVM_STATS_UNIT_MASK) | \ + BUILD_BUG_ON_ZERO(base & ~KVM_STATS_BASE_MASK), \ + .exponent = exp, \ + .size = sz, \ .bucket_size = bsz -#define VM_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ - { \ - { \ - STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ - .offset = offsetof(struct kvm_vm_stat, generic.stat) \ - }, \ - .name = #stat, \ - } -#define VCPU_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ - { \ - { \ - STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ - .offset = offsetof(struct kvm_vcpu_stat, generic.stat) \ - }, \ - .name = #stat, \ - } -#define VM_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ - { \ - { \ - STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ - .offset = offsetof(struct kvm_vm_stat, stat) \ - }, \ - .name = #stat, \ - } -#define VCPU_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ - { \ - { \ - STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ - .offset = offsetof(struct kvm_vcpu_stat, stat) \ - }, \ - .name = #stat, \ - } +#define VM_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ +{ \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ + .offset = offsetof(struct kvm_vm_stat, generic.stat), \ + .name = #stat, \ +} +#define VCPU_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ +{ \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ + .offset = offsetof(struct kvm_vcpu_stat, generic.stat), \ + .name = #stat, \ +} +#define VM_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ +{ \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ + .offset = offsetof(struct kvm_vm_stat, stat), \ + .name = #stat, \ +} +#define VCPU_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ +{ \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ + .offset = offsetof(struct kvm_vcpu_stat, stat), \ + .name = #stat, \ +} /* SCOPE: VM, VM_GENERIC, VCPU, VCPU_GENERIC */ #define STATS_DESC(SCOPE, stat, type, unit, base, exp, sz, bsz) \ SCOPE##_STATS_DESC(stat, type, unit, base, exp, sz, bsz) @@ -2053,7 +2040,7 @@ struct _kvm_stats_desc { STATS_DESC_IBOOLEAN(VCPU_GENERIC, blocking) ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header, - const struct _kvm_stats_desc *desc, + const struct kvm_stats_desc *desc, void *stats, size_t size_stats, char __user *user_buffer, size_t size, loff_t *offset); @@ -2098,9 +2085,9 @@ static inline void kvm_stats_log_hist_update(u64 *data, size_t size, u64 value) extern const struct kvm_stats_header kvm_vm_stats_header; -extern const struct _kvm_stats_desc kvm_vm_stats_desc[]; +extern const struct kvm_stats_desc kvm_vm_stats_desc[]; extern const struct kvm_stats_header kvm_vcpu_stats_header; -extern const struct _kvm_stats_desc kvm_vcpu_stats_desc[]; +extern const struct kvm_stats_desc kvm_vcpu_stats_desc[]; #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER static inline int mmu_invalidate_retry(struct kvm *kvm, unsigned long mmu_seq) -- cgit v1.2.3 From ef246da8e63c486780dca4d9b4d79589cbebf5e5 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sat, 24 Jan 2026 21:14:13 +0200 Subject: dma-buf: Rename .move_notify() callback to a clearer identifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename the .move_notify() callback to .invalidate_mappings() to make its purpose explicit and highlight that it is responsible for invalidating existing mappings. Suggested-by: Christian König Reviewed-by: Christian König Signed-off-by: Leon Romanovsky Link: https://lore.kernel.org/r/20260124-dmabuf-revoke-v5-1-f98fca917e96@nvidia.com Signed-off-by: Christian König --- include/linux/dma-buf.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 91f4939db89b..d9ee4499b37d 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -407,7 +407,7 @@ struct dma_buf { * through the device. * * - Dynamic importers should set fences for any access that they can't - * disable immediately from their &dma_buf_attach_ops.move_notify + * disable immediately from their &dma_buf_attach_ops.invalidate_mappings * callback. * * IMPORTANT: @@ -446,7 +446,7 @@ struct dma_buf_attach_ops { bool allow_peer2peer; /** - * @move_notify: [optional] notification that the DMA-buf is moving + * @invalidate_mappings: [optional] notification that the DMA-buf is moving * * If this callback is provided the framework can avoid pinning the * backing store while mappings exists. @@ -463,7 +463,7 @@ struct dma_buf_attach_ops { * New mappings can be created after this callback returns, and will * point to the new location of the DMA-buf. */ - void (*move_notify)(struct dma_buf_attachment *attach); + void (*invalidate_mappings)(struct dma_buf_attachment *attach); }; /** -- cgit v1.2.3 From 95308225e5baeaae1e313816059c59a0036ab6b2 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sat, 24 Jan 2026 21:14:14 +0200 Subject: dma-buf: Rename dma_buf_move_notify() to dma_buf_invalidate_mappings() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Along with renaming the .move_notify() callback, rename the corresponding dma-buf core function. This makes the expected behavior clear to exporters calling this function. Signed-off-by: Leon Romanovsky Reviewed-by: Christian König Link: https://lore.kernel.org/r/20260124-dmabuf-revoke-v5-2-f98fca917e96@nvidia.com Signed-off-by: Christian König --- include/linux/dma-buf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index d9ee4499b37d..d0470af8887e 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -588,7 +588,7 @@ struct sg_table *dma_buf_map_attachment(struct dma_buf_attachment *, enum dma_data_direction); void dma_buf_unmap_attachment(struct dma_buf_attachment *, struct sg_table *, enum dma_data_direction); -void dma_buf_move_notify(struct dma_buf *dma_buf); +void dma_buf_invalidate_mappings(struct dma_buf *dma_buf); int dma_buf_begin_cpu_access(struct dma_buf *dma_buf, enum dma_data_direction dir); int dma_buf_end_cpu_access(struct dma_buf *dma_buf, -- cgit v1.2.3 From 2bcbc706dfa02ae50118173a6f6d8a12e735480c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Fri, 19 Dec 2025 11:41:54 +0100 Subject: dma-buf: add dma_fence_was_initialized function v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some driver use fence->ops to test if a fence was initialized or not. The problem is that this utilizes internal behavior of the dma_fence implementation. So better abstract that into a function. v2: use a flag instead of testing fence->ops, rename the function, move to the beginning of the patch set. Signed-off-by: Christian König Reviewed-by: Tvrtko Ursulin Link: https://lore.kernel.org/r/20260120105655.7134-2-christian.koenig@amd.com --- include/linux/dma-fence.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index d4c92fd35092..9c4d25289239 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -48,6 +48,7 @@ struct seq_file; * atomic ops (bit_*), so taking the spinlock will not be needed most * of the time. * + * DMA_FENCE_FLAG_INITIALIZED_BIT - fence was initialized * DMA_FENCE_FLAG_SIGNALED_BIT - fence is already signaled * DMA_FENCE_FLAG_TIMESTAMP_BIT - timestamp recorded for fence signaling * DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT - enable_signaling might have been called @@ -98,6 +99,7 @@ struct dma_fence { }; enum dma_fence_flag_bits { + DMA_FENCE_FLAG_INITIALIZED_BIT, DMA_FENCE_FLAG_SEQNO64_BIT, DMA_FENCE_FLAG_SIGNALED_BIT, DMA_FENCE_FLAG_TIMESTAMP_BIT, @@ -263,6 +265,19 @@ void dma_fence_release(struct kref *kref); void dma_fence_free(struct dma_fence *fence); void dma_fence_describe(struct dma_fence *fence, struct seq_file *seq); +/** + * dma_fence_was_initialized - test if fence was initialized + * @fence: fence to test + * + * Return: True if fence was ever initialized, false otherwise. Works correctly + * only when memory backing the fence structure is zero initialized on + * allocation. + */ +static inline bool dma_fence_was_initialized(struct dma_fence *fence) +{ + return fence && test_bit(DMA_FENCE_FLAG_INITIALIZED_BIT, &fence->flags); +} + /** * dma_fence_put - decreases refcount of the fence * @fence: fence to reduce refcount of -- cgit v1.2.3 From 4a9671a03f2be13acde0cb15c5208767a9cc56e4 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Fri, 6 Feb 2026 08:52:38 +1000 Subject: gpu: Move DRM buddy allocator one level up (part one) Move the DRM buddy allocator one level up so that it can be used by GPU drivers (example, nova-core) that have usecases other than DRM (such as VFIO vGPU support). Modify the API, structures and Kconfigs to use "gpu_buddy" terminology. Adapt the drivers and tests to use the new API. The commit cannot be split due to bisectability, however no functional change is intended. Verified by running K-UNIT tests and build tested various configurations. Signed-off-by: Joel Fernandes Reviewed-by: Dave Airlie [airlied: I've split this into two so git can find copies easier. I've also just nuked drm_random library, that stuff needs to be done elsewhere and only the buddy tests seem to be using it]. Signed-off-by: Dave Airlie --- include/linux/gpu_buddy.h | 171 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 171 insertions(+) create mode 100644 include/linux/gpu_buddy.h (limited to 'include/linux') diff --git a/include/linux/gpu_buddy.h b/include/linux/gpu_buddy.h new file mode 100644 index 000000000000..b909fa8f810a --- /dev/null +++ b/include/linux/gpu_buddy.h @@ -0,0 +1,171 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2021 Intel Corporation + */ + +#ifndef __DRM_BUDDY_H__ +#define __DRM_BUDDY_H__ + +#include +#include +#include +#include +#include + +struct drm_printer; + +#define DRM_BUDDY_RANGE_ALLOCATION BIT(0) +#define DRM_BUDDY_TOPDOWN_ALLOCATION BIT(1) +#define DRM_BUDDY_CONTIGUOUS_ALLOCATION BIT(2) +#define DRM_BUDDY_CLEAR_ALLOCATION BIT(3) +#define DRM_BUDDY_CLEARED BIT(4) +#define DRM_BUDDY_TRIM_DISABLE BIT(5) + +struct drm_buddy_block { +#define DRM_BUDDY_HEADER_OFFSET GENMASK_ULL(63, 12) +#define DRM_BUDDY_HEADER_STATE GENMASK_ULL(11, 10) +#define DRM_BUDDY_ALLOCATED (1 << 10) +#define DRM_BUDDY_FREE (2 << 10) +#define DRM_BUDDY_SPLIT (3 << 10) +#define DRM_BUDDY_HEADER_CLEAR GENMASK_ULL(9, 9) +/* Free to be used, if needed in the future */ +#define DRM_BUDDY_HEADER_UNUSED GENMASK_ULL(8, 6) +#define DRM_BUDDY_HEADER_ORDER GENMASK_ULL(5, 0) + u64 header; + + struct drm_buddy_block *left; + struct drm_buddy_block *right; + struct drm_buddy_block *parent; + + void *private; /* owned by creator */ + + /* + * While the block is allocated by the user through drm_buddy_alloc*, + * the user has ownership of the link, for example to maintain within + * a list, if so desired. As soon as the block is freed with + * drm_buddy_free* ownership is given back to the mm. + */ + union { + struct rb_node rb; + struct list_head link; + }; + + struct list_head tmp_link; +}; + +/* Order-zero must be at least SZ_4K */ +#define DRM_BUDDY_MAX_ORDER (63 - 12) + +/* + * Binary Buddy System. + * + * Locking should be handled by the user, a simple mutex around + * drm_buddy_alloc* and drm_buddy_free* should suffice. + */ +struct drm_buddy { + /* Maintain a free list for each order. */ + struct rb_root **free_trees; + + /* + * Maintain explicit binary tree(s) to track the allocation of the + * address space. This gives us a simple way of finding a buddy block + * and performing the potentially recursive merge step when freeing a + * block. Nodes are either allocated or free, in which case they will + * also exist on the respective free list. + */ + struct drm_buddy_block **roots; + + /* + * Anything from here is public, and remains static for the lifetime of + * the mm. Everything above is considered do-not-touch. + */ + unsigned int n_roots; + unsigned int max_order; + + /* Must be at least SZ_4K */ + u64 chunk_size; + u64 size; + u64 avail; + u64 clear_avail; +}; + +static inline u64 +drm_buddy_block_offset(const struct drm_buddy_block *block) +{ + return block->header & DRM_BUDDY_HEADER_OFFSET; +} + +static inline unsigned int +drm_buddy_block_order(struct drm_buddy_block *block) +{ + return block->header & DRM_BUDDY_HEADER_ORDER; +} + +static inline unsigned int +drm_buddy_block_state(struct drm_buddy_block *block) +{ + return block->header & DRM_BUDDY_HEADER_STATE; +} + +static inline bool +drm_buddy_block_is_allocated(struct drm_buddy_block *block) +{ + return drm_buddy_block_state(block) == DRM_BUDDY_ALLOCATED; +} + +static inline bool +drm_buddy_block_is_clear(struct drm_buddy_block *block) +{ + return block->header & DRM_BUDDY_HEADER_CLEAR; +} + +static inline bool +drm_buddy_block_is_free(struct drm_buddy_block *block) +{ + return drm_buddy_block_state(block) == DRM_BUDDY_FREE; +} + +static inline bool +drm_buddy_block_is_split(struct drm_buddy_block *block) +{ + return drm_buddy_block_state(block) == DRM_BUDDY_SPLIT; +} + +static inline u64 +drm_buddy_block_size(struct drm_buddy *mm, + struct drm_buddy_block *block) +{ + return mm->chunk_size << drm_buddy_block_order(block); +} + +int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 chunk_size); + +void drm_buddy_fini(struct drm_buddy *mm); + +struct drm_buddy_block * +drm_get_buddy(struct drm_buddy_block *block); + +int drm_buddy_alloc_blocks(struct drm_buddy *mm, + u64 start, u64 end, u64 size, + u64 min_page_size, + struct list_head *blocks, + unsigned long flags); + +int drm_buddy_block_trim(struct drm_buddy *mm, + u64 *start, + u64 new_size, + struct list_head *blocks); + +void drm_buddy_reset_clear(struct drm_buddy *mm, bool is_clear); + +void drm_buddy_free_block(struct drm_buddy *mm, struct drm_buddy_block *block); + +void drm_buddy_free_list(struct drm_buddy *mm, + struct list_head *objects, + unsigned int flags); + +void drm_buddy_print(struct drm_buddy *mm, struct drm_printer *p); +void drm_buddy_block_print(struct drm_buddy *mm, + struct drm_buddy_block *block, + struct drm_printer *p); +#endif -- cgit v1.2.3 From ba110db8e1bc206c13fd7d985e79b033f53bfdea Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Fri, 6 Feb 2026 08:52:38 +1000 Subject: gpu: Move DRM buddy allocator one level up (part two) Move the DRM buddy allocator one level up so that it can be used by GPU drivers (example, nova-core) that have usecases other than DRM (such as VFIO vGPU support). Modify the API, structures and Kconfigs to use "gpu_buddy" terminology. Adapt the drivers and tests to use the new API. The commit cannot be split due to bisectability, however no functional change is intended. Verified by running K-UNIT tests and build tested various configurations. Signed-off-by: Joel Fernandes Reviewed-by: Dave Airlie [airlied: I've split this into two so git can find copies easier. I've also just nuked drm_random library, that stuff needs to be done elsewhere and only the buddy tests seem to be using it]. Signed-off-by: Dave Airlie --- include/linux/gpu_buddy.h | 124 ++++++++++++++++++++++++---------------------- 1 file changed, 65 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpu_buddy.h b/include/linux/gpu_buddy.h index b909fa8f810a..07ac65db6d2e 100644 --- a/include/linux/gpu_buddy.h +++ b/include/linux/gpu_buddy.h @@ -3,8 +3,8 @@ * Copyright © 2021 Intel Corporation */ -#ifndef __DRM_BUDDY_H__ -#define __DRM_BUDDY_H__ +#ifndef __GPU_BUDDY_H__ +#define __GPU_BUDDY_H__ #include #include @@ -12,38 +12,45 @@ #include #include -struct drm_printer; - -#define DRM_BUDDY_RANGE_ALLOCATION BIT(0) -#define DRM_BUDDY_TOPDOWN_ALLOCATION BIT(1) -#define DRM_BUDDY_CONTIGUOUS_ALLOCATION BIT(2) -#define DRM_BUDDY_CLEAR_ALLOCATION BIT(3) -#define DRM_BUDDY_CLEARED BIT(4) -#define DRM_BUDDY_TRIM_DISABLE BIT(5) - -struct drm_buddy_block { -#define DRM_BUDDY_HEADER_OFFSET GENMASK_ULL(63, 12) -#define DRM_BUDDY_HEADER_STATE GENMASK_ULL(11, 10) -#define DRM_BUDDY_ALLOCATED (1 << 10) -#define DRM_BUDDY_FREE (2 << 10) -#define DRM_BUDDY_SPLIT (3 << 10) -#define DRM_BUDDY_HEADER_CLEAR GENMASK_ULL(9, 9) +#define GPU_BUDDY_RANGE_ALLOCATION BIT(0) +#define GPU_BUDDY_TOPDOWN_ALLOCATION BIT(1) +#define GPU_BUDDY_CONTIGUOUS_ALLOCATION BIT(2) +#define GPU_BUDDY_CLEAR_ALLOCATION BIT(3) +#define GPU_BUDDY_CLEARED BIT(4) +#define GPU_BUDDY_TRIM_DISABLE BIT(5) + +enum gpu_buddy_free_tree { + GPU_BUDDY_CLEAR_TREE = 0, + GPU_BUDDY_DIRTY_TREE, + GPU_BUDDY_MAX_FREE_TREES, +}; + +#define for_each_free_tree(tree) \ + for ((tree) = 0; (tree) < GPU_BUDDY_MAX_FREE_TREES; (tree)++) + +struct gpu_buddy_block { +#define GPU_BUDDY_HEADER_OFFSET GENMASK_ULL(63, 12) +#define GPU_BUDDY_HEADER_STATE GENMASK_ULL(11, 10) +#define GPU_BUDDY_ALLOCATED (1 << 10) +#define GPU_BUDDY_FREE (2 << 10) +#define GPU_BUDDY_SPLIT (3 << 10) +#define GPU_BUDDY_HEADER_CLEAR GENMASK_ULL(9, 9) /* Free to be used, if needed in the future */ -#define DRM_BUDDY_HEADER_UNUSED GENMASK_ULL(8, 6) -#define DRM_BUDDY_HEADER_ORDER GENMASK_ULL(5, 0) +#define GPU_BUDDY_HEADER_UNUSED GENMASK_ULL(8, 6) +#define GPU_BUDDY_HEADER_ORDER GENMASK_ULL(5, 0) u64 header; - struct drm_buddy_block *left; - struct drm_buddy_block *right; - struct drm_buddy_block *parent; + struct gpu_buddy_block *left; + struct gpu_buddy_block *right; + struct gpu_buddy_block *parent; void *private; /* owned by creator */ /* - * While the block is allocated by the user through drm_buddy_alloc*, + * While the block is allocated by the user through gpu_buddy_alloc*, * the user has ownership of the link, for example to maintain within * a list, if so desired. As soon as the block is freed with - * drm_buddy_free* ownership is given back to the mm. + * gpu_buddy_free* ownership is given back to the mm. */ union { struct rb_node rb; @@ -54,15 +61,15 @@ struct drm_buddy_block { }; /* Order-zero must be at least SZ_4K */ -#define DRM_BUDDY_MAX_ORDER (63 - 12) +#define GPU_BUDDY_MAX_ORDER (63 - 12) /* * Binary Buddy System. * * Locking should be handled by the user, a simple mutex around - * drm_buddy_alloc* and drm_buddy_free* should suffice. + * gpu_buddy_alloc* and gpu_buddy_free* should suffice. */ -struct drm_buddy { +struct gpu_buddy { /* Maintain a free list for each order. */ struct rb_root **free_trees; @@ -73,7 +80,7 @@ struct drm_buddy { * block. Nodes are either allocated or free, in which case they will * also exist on the respective free list. */ - struct drm_buddy_block **roots; + struct gpu_buddy_block **roots; /* * Anything from here is public, and remains static for the lifetime of @@ -90,82 +97,81 @@ struct drm_buddy { }; static inline u64 -drm_buddy_block_offset(const struct drm_buddy_block *block) +gpu_buddy_block_offset(const struct gpu_buddy_block *block) { - return block->header & DRM_BUDDY_HEADER_OFFSET; + return block->header & GPU_BUDDY_HEADER_OFFSET; } static inline unsigned int -drm_buddy_block_order(struct drm_buddy_block *block) +gpu_buddy_block_order(struct gpu_buddy_block *block) { - return block->header & DRM_BUDDY_HEADER_ORDER; + return block->header & GPU_BUDDY_HEADER_ORDER; } static inline unsigned int -drm_buddy_block_state(struct drm_buddy_block *block) +gpu_buddy_block_state(struct gpu_buddy_block *block) { - return block->header & DRM_BUDDY_HEADER_STATE; + return block->header & GPU_BUDDY_HEADER_STATE; } static inline bool -drm_buddy_block_is_allocated(struct drm_buddy_block *block) +gpu_buddy_block_is_allocated(struct gpu_buddy_block *block) { - return drm_buddy_block_state(block) == DRM_BUDDY_ALLOCATED; + return gpu_buddy_block_state(block) == GPU_BUDDY_ALLOCATED; } static inline bool -drm_buddy_block_is_clear(struct drm_buddy_block *block) +gpu_buddy_block_is_clear(struct gpu_buddy_block *block) { - return block->header & DRM_BUDDY_HEADER_CLEAR; + return block->header & GPU_BUDDY_HEADER_CLEAR; } static inline bool -drm_buddy_block_is_free(struct drm_buddy_block *block) +gpu_buddy_block_is_free(struct gpu_buddy_block *block) { - return drm_buddy_block_state(block) == DRM_BUDDY_FREE; + return gpu_buddy_block_state(block) == GPU_BUDDY_FREE; } static inline bool -drm_buddy_block_is_split(struct drm_buddy_block *block) +gpu_buddy_block_is_split(struct gpu_buddy_block *block) { - return drm_buddy_block_state(block) == DRM_BUDDY_SPLIT; + return gpu_buddy_block_state(block) == GPU_BUDDY_SPLIT; } static inline u64 -drm_buddy_block_size(struct drm_buddy *mm, - struct drm_buddy_block *block) +gpu_buddy_block_size(struct gpu_buddy *mm, + struct gpu_buddy_block *block) { - return mm->chunk_size << drm_buddy_block_order(block); + return mm->chunk_size << gpu_buddy_block_order(block); } -int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 chunk_size); +int gpu_buddy_init(struct gpu_buddy *mm, u64 size, u64 chunk_size); -void drm_buddy_fini(struct drm_buddy *mm); +void gpu_buddy_fini(struct gpu_buddy *mm); -struct drm_buddy_block * -drm_get_buddy(struct drm_buddy_block *block); +struct gpu_buddy_block * +gpu_get_buddy(struct gpu_buddy_block *block); -int drm_buddy_alloc_blocks(struct drm_buddy *mm, +int gpu_buddy_alloc_blocks(struct gpu_buddy *mm, u64 start, u64 end, u64 size, u64 min_page_size, struct list_head *blocks, unsigned long flags); -int drm_buddy_block_trim(struct drm_buddy *mm, +int gpu_buddy_block_trim(struct gpu_buddy *mm, u64 *start, u64 new_size, struct list_head *blocks); -void drm_buddy_reset_clear(struct drm_buddy *mm, bool is_clear); +void gpu_buddy_reset_clear(struct gpu_buddy *mm, bool is_clear); -void drm_buddy_free_block(struct drm_buddy *mm, struct drm_buddy_block *block); +void gpu_buddy_free_block(struct gpu_buddy *mm, struct gpu_buddy_block *block); -void drm_buddy_free_list(struct drm_buddy *mm, +void gpu_buddy_free_list(struct gpu_buddy *mm, struct list_head *objects, unsigned int flags); -void drm_buddy_print(struct drm_buddy *mm, struct drm_printer *p); -void drm_buddy_block_print(struct drm_buddy *mm, - struct drm_buddy_block *block, - struct drm_printer *p); +void gpu_buddy_print(struct gpu_buddy *mm); +void gpu_buddy_block_print(struct gpu_buddy *mm, + struct gpu_buddy_block *block); #endif -- cgit v1.2.3 From a69d1ab971a624c6f112cea61536569d579c3215 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Tue, 10 Feb 2026 12:56:53 +0100 Subject: mm: Fix a hmm_range_fault() livelock / starvation problem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If hmm_range_fault() fails a folio_trylock() in do_swap_page, trying to acquire the lock of a device-private folio for migration, to ram, the function will spin until it succeeds grabbing the lock. However, if the process holding the lock is depending on a work item to be completed, which is scheduled on the same CPU as the spinning hmm_range_fault(), that work item might be starved and we end up in a livelock / starvation situation which is never resolved. This can happen, for example if the process holding the device-private folio lock is stuck in migrate_device_unmap()->lru_add_drain_all() sinc lru_add_drain_all() requires a short work-item to be run on all online cpus to complete. A prerequisite for this to happen is: a) Both zone device and system memory folios are considered in migrate_device_unmap(), so that there is a reason to call lru_add_drain_all() for a system memory folio while a folio lock is held on a zone device folio. b) The zone device folio has an initial mapcount > 1 which causes at least one migration PTE entry insertion to be deferred to try_to_migrate(), which can happen after the call to lru_add_drain_all(). c) No or voluntary only preemption. This all seems pretty unlikely to happen, but indeed is hit by the "xe_exec_system_allocator" igt test. Resolve this by waiting for the folio to be unlocked if the folio_trylock() fails in do_swap_page(). Rename migration_entry_wait_on_locked() to softleaf_entry_wait_unlock() and update its documentation to indicate the new use-case. Future code improvements might consider moving the lru_add_drain_all() call in migrate_device_unmap() to be called *after* all pages have migration entries inserted. That would eliminate also b) above. v2: - Instead of a cond_resched() in hmm_range_fault(), eliminate the problem by waiting for the folio to be unlocked in do_swap_page() (Alistair Popple, Andrew Morton) v3: - Add a stub migration_entry_wait_on_locked() for the !CONFIG_MIGRATION case. (Kernel Test Robot) v4: - Rename migrate_entry_wait_on_locked() to softleaf_entry_wait_on_locked() and update docs (Alistair Popple) v5: - Add a WARN_ON_ONCE() for the !CONFIG_MIGRATION version of softleaf_entry_wait_on_locked(). - Modify wording around function names in the commit message (Andrew Morton) Suggested-by: Alistair Popple Fixes: 1afaeb8293c9 ("mm/migrate: Trylock device page in do_swap_page") Cc: Ralph Campbell Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Leon Romanovsky Cc: Andrew Morton Cc: Matthew Brost Cc: John Hubbard Cc: Alistair Popple Cc: linux-mm@kvack.org Cc: Signed-off-by: Thomas Hellström Cc: # v6.15+ Reviewed-by: John Hubbard #v3 Reviewed-by: Alistair Popple Link: https://patch.msgid.link/20260210115653.92413-1-thomas.hellstrom@linux.intel.com --- include/linux/migrate.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 26ca00c325d9..d5af2b7f577b 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -65,7 +65,7 @@ bool isolate_folio_to_list(struct folio *folio, struct list_head *list); int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src); -void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) +void softleaf_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) __releases(ptl); void folio_migrate_flags(struct folio *newfolio, struct folio *folio); int folio_migrate_mapping(struct address_space *mapping, @@ -97,6 +97,14 @@ static inline int set_movable_ops(const struct movable_operations *ops, enum pag return -ENOSYS; } +static inline void softleaf_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) + __releases(ptl) +{ + WARN_ON_ONCE(1); + + spin_unlock(ptl); +} + #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_NUMA_BALANCING -- cgit v1.2.3 From 022ac075088366b62e130da5e1b200bc93a47191 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 12 Feb 2026 13:34:22 -0800 Subject: bpf: use reg->var_off instead of reg->off for pointers This commit consolidates static and varying pointer offset tracking logic. All offsets are now represented solely using `.var_off` and min/max fields. The reasons are twofold: - This simplifies pointer tracking code, as each relevant function needs to check the `.var_off` field anyway. - It makes it easier to widen pointer registers for the purpose of loop convergence checks, by forgoing the `regsafe()` logic demanding `.off` fields to be identical. The changes are spread across many functions and are hard to group into smaller patches. Some of the logical changes include: - Checks in __check_ptr_off_reg() are reordered so that the tnum_is_const() check is done before operating on reg->var_off.value. - check_packet_access() now uses check_mem_region_access() to handle possible 'off' overflow cases. - In check_helper_mem_access() utility functions like check_packet_access() are now called with 'off=0', as these utility functions now account for the complete register offset range. - In check_reg_type() a call to __check_ptr_off_reg() is added before a call to btf_struct_ids_match(). This prevents btf_struct_ids_match() from potentially working on non-constant reg->var_off.value. - regsafe() is relaxed to avoid comparing '.off' field for pointers. As a precaution, the changes are verified in [1] by adding a pass checking that no pointer has non-zero '.off' field on each do_check_insn() iteration. [1] https://github.com/eddyz87/bpf/tree/ptrs-off-migration Notable selftests changes: - `.var_off` value changed because it now combines static and varying offsets. Affected tests: - linked_list/incorrect_node_var_off - linked_list/incorrect_head_var_off2 - verifier_align/packet_variable_offset - Overflowing `smax_value` bound leads to a pointer with big negative or positive offset to be rejected immediately (previously overflowing `rX += const` instruction updated `.off` field avoiding the overflow). Affected tests: - verifier_align/dubious_pointer_arithmetic - verifier_bounds/var_off_insn_off_test1 - Invalid access to packet now reports full offset inside a packet. Affected tests: - verifier_direct_packet_access/test23_x_pkt_ptr_4 - A change in check_mem_region_access() behavior: when register `.smin_value` is negative, it reports "rX min value is negative..." before calling into __check_mem_access() which reports "invalid access to ...". In the tests below, the `.off` field was negative, while `.smin_value` remained positive. This is no longer the case after the changes in this commit. Affected tests: - verifier_gotox/jump_table_invalid_mem_acceess_neg - verifier_helper_packet_access/test15_cls_helper_fail_sub - verifier_helper_value_access/imm_out_of_bound_2 - verifier_helper_value_access/reg_out_of_bound_2 - verifier_meta_access/meta_access_test2 - verifier_value_ptr_arith/known_scalar_from_different_maps - lower_oob_arith_test_1 - value_ptr_known_scalar_3 - access_value_ptr_known_scalar - Usage of check_mem_region_access() instead of __check_mem_access() in check_packet_access() changes the reported message from "rX offset is outside ..." to "rX min/max value is outside ...". Affected tests: - verifier_xdp_direct_packet_access/* - In check_func_arg_reg_off() the check for zero offset now operates on `.var_off` field instead of `.off` field. For tests where the pattern looks like `kfunc(reg_with_var_off, ...)`, this changes the reported error: - previously the error "variable ... access ... disallowed" was reported by __check_ptr_off_reg(); - now "R1 must have zero offset ..." is reported by check_func_arg_reg_off() itself. Affected tests: - verifier/calls.c "calls: invalid kfunc call: PTR_TO_BTF_ID with variable offset" Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260212-ptrs-off-migration-v2-2-00820e4d3438@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index ef8e45a362d9..a97bdbf3a07b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -38,8 +38,7 @@ struct bpf_reg_state { /* Ordering of fields matters. See states_equal() */ enum bpf_reg_type type; /* - * Fixed part of pointer offset, pointer types only. - * Or constant delta between "linked" scalars with the same ID. + * Constant delta between "linked" scalars with the same ID. */ s32 off; union { -- cgit v1.2.3 From 3d91c618aca403a7f7d2064272f528a97b849475 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 12 Feb 2026 13:34:24 -0800 Subject: bpf: rename bpf_reg_state->off to bpf_reg_state->delta This field is now used only for linked scalar registers tracking. Rename it to 'delta' to better describe it's purpose: constant delta between "linked" scalars with the same ID. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260212-ptrs-off-migration-v2-4-00820e4d3438@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index a97bdbf3a07b..c1e30096ea7b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -40,7 +40,7 @@ struct bpf_reg_state { /* * Constant delta between "linked" scalars with the same ID. */ - s32 off; + s32 delta; union { /* valid when type == PTR_TO_PACKET */ int range; @@ -145,9 +145,9 @@ struct bpf_reg_state { * Upper bit of ID is used to remember relationship between "linked" * registers. Example: * r1 = r2; both will have r1->id == r2->id == N - * r1 += 10; r1->id == N | BPF_ADD_CONST and r1->off == 10 + * r1 += 10; r1->id == N | BPF_ADD_CONST and r1->delta == 10 * r3 = r2; both will have r3->id == r2->id == N - * w3 += 10; r3->id == N | BPF_ADD_CONST32 and r3->off == 10 + * w3 += 10; r3->id == N | BPF_ADD_CONST32 and r3->delta == 10 */ #define BPF_ADD_CONST64 (1U << 31) #define BPF_ADD_CONST32 (1U << 30) -- cgit v1.2.3 From a235d3bcd28ba2d472f5b4cb6b259baeab75bafd Mon Sep 17 00:00:00 2001 From: Kundan Kumar Date: Fri, 13 Feb 2026 11:16:31 +0530 Subject: writeback: prep helpers for dirty-limit and writeback accounting Add helper APIs needed by filesystems to avoid poking into writeback internals. Reviewed-by: Jan Kara Reviewed-by: Jeff Layton Reviewed-by: Andreas Gruenbacher Suggested-by: Christoph Hellwig Signed-off-by: Kundan Kumar Signed-off-by: Anuj Gupta Link: https://patch.msgid.link/20260213054634.79785-2-kundan.kumar@samsung.com Signed-off-by: Christian Brauner --- include/linux/backing-dev.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 0c8342747cab..5b7d12b40d5e 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -136,6 +136,19 @@ static inline bool mapping_can_writeback(struct address_space *mapping) return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK; } +/* Must not be used by file systems that support cgroup writeback */ +static inline int bdi_wb_dirty_exceeded(struct backing_dev_info *bdi) +{ + return bdi->wb.dirty_exceeded; +} + +/* Must not be used by file systems that support cgroup writeback */ +static inline void bdi_wb_stat_mod(struct inode *inode, enum wb_stat_item item, + s64 amount) +{ + wb_stat_mod(&inode_to_bdi(inode)->wb, item, amount); +} + #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, -- cgit v1.2.3 From 0d799df5b147e08828887fe7299efd7a9e0eea40 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 19 Feb 2026 07:50:01 +0100 Subject: fs: mark bool_names static The bool_names array is only used in fs_parser.c so mark it static. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260219065014.3550402-2-hch@lst.de Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs_parser.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h index 5e8a3b546033..ac8253cca2bc 100644 --- a/include/linux/fs_parser.h +++ b/include/linux/fs_parser.h @@ -84,8 +84,6 @@ extern int fs_lookup_param(struct fs_context *fc, extern int lookup_constant(const struct constant_table tbl[], const char *name, int not_found); -extern const struct constant_table bool_names[]; - #ifdef CONFIG_VALIDATE_FS_PARSER extern bool fs_validate_description(const char *name, const struct fs_parameter_spec *desc); -- cgit v1.2.3 From 8823db29744fceda9f94e674f74deea446c620b3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 19 Feb 2026 07:50:02 +0100 Subject: fs: remove fsparam_blob / fs_param_is_blob These are not used anywhere even after the fs_context conversion is finished, so remove them. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260219065014.3550402-3-hch@lst.de Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs_parser.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h index ac8253cca2bc..961562b101c5 100644 --- a/include/linux/fs_parser.h +++ b/include/linux/fs_parser.h @@ -27,7 +27,7 @@ typedef int fs_param_type(struct p_log *, * The type of parameter expected. */ fs_param_type fs_param_is_bool, fs_param_is_u32, fs_param_is_s32, fs_param_is_u64, - fs_param_is_enum, fs_param_is_string, fs_param_is_blob, fs_param_is_blockdev, + fs_param_is_enum, fs_param_is_string, fs_param_is_blockdev, fs_param_is_path, fs_param_is_fd, fs_param_is_uid, fs_param_is_gid, fs_param_is_file_or_string; @@ -125,7 +125,6 @@ static inline bool fs_validate_description(const char *name, #define fsparam_enum(NAME, OPT, array) __fsparam(fs_param_is_enum, NAME, OPT, 0, array) #define fsparam_string(NAME, OPT) \ __fsparam(fs_param_is_string, NAME, OPT, 0, NULL) -#define fsparam_blob(NAME, OPT) __fsparam(fs_param_is_blob, NAME, OPT, 0, NULL) #define fsparam_bdev(NAME, OPT) __fsparam(fs_param_is_blockdev, NAME, OPT, 0, NULL) #define fsparam_path(NAME, OPT) __fsparam(fs_param_is_path, NAME, OPT, 0, NULL) #define fsparam_fd(NAME, OPT) __fsparam(fs_param_is_fd, NAME, OPT, 0, NULL) -- cgit v1.2.3 From d2f2f7cf8e898f7b80fe031a263df9c7de94b0b7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 19 Feb 2026 07:50:03 +0100 Subject: fs: remove fsparam_path / fs_param_is_path These are not used anywhere even after the fs_context conversion is finished, so remove them. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260219065014.3550402-4-hch@lst.de Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs_parser.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h index 961562b101c5..98b83708f92b 100644 --- a/include/linux/fs_parser.h +++ b/include/linux/fs_parser.h @@ -28,7 +28,7 @@ typedef int fs_param_type(struct p_log *, */ fs_param_type fs_param_is_bool, fs_param_is_u32, fs_param_is_s32, fs_param_is_u64, fs_param_is_enum, fs_param_is_string, fs_param_is_blockdev, - fs_param_is_path, fs_param_is_fd, fs_param_is_uid, fs_param_is_gid, + fs_param_is_fd, fs_param_is_uid, fs_param_is_gid, fs_param_is_file_or_string; /* @@ -126,7 +126,6 @@ static inline bool fs_validate_description(const char *name, #define fsparam_string(NAME, OPT) \ __fsparam(fs_param_is_string, NAME, OPT, 0, NULL) #define fsparam_bdev(NAME, OPT) __fsparam(fs_param_is_blockdev, NAME, OPT, 0, NULL) -#define fsparam_path(NAME, OPT) __fsparam(fs_param_is_path, NAME, OPT, 0, NULL) #define fsparam_fd(NAME, OPT) __fsparam(fs_param_is_fd, NAME, OPT, 0, NULL) #define fsparam_file_or_string(NAME, OPT) \ __fsparam(fs_param_is_file_or_string, NAME, OPT, 0, NULL) -- cgit v1.2.3 From 6b3e458806e34f1142592f786d3eb0ebac209cc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Noack?= Date: Thu, 19 Feb 2026 16:43:35 +0100 Subject: HID: Document memory allocation properties of report_fixup() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The memory pointer returned by the report_fixup() hook does not get freed by the caller. Instead, report_fixup() must return (in return value and *rsize) a memory buffer with at least the same lifetime as the input buffer (defined by rdesc and original *rsize). This is usually achieved using one of the following techniques: * Returning a pointer and size to a sub-portion of the input buffer * Returning a pointer to a static buffer * Allocating a buffer with a devm_*() function, which will automatically get freed when the device is removed. Signed-off-by: Günther Noack Signed-off-by: Benjamin Tissoires --- include/linux/hid.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index dce862cafbbd..2990b9f94cb5 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -836,6 +836,12 @@ struct hid_usage_id { * raw_event and event should return negative on error, any other value will * pass the event on to .event() typically return 0 for success. * + * report_fixup must return a report descriptor pointer whose lifetime is at + * least that of the input rdesc. This is usually done by mutating the input + * rdesc and returning it or a sub-portion of it. In case a new buffer is + * allocated and returned, the implementation of report_fixup is responsible for + * freeing it later. + * * input_mapping shall return a negative value to completely ignore this usage * (e.g. doubled or invalid usage), zero to continue with parsing of this * usage by generic code (no special handling needed) or positive to skip -- cgit v1.2.3 From 95cef38e70250234a254e6228eb7342b6deaaffa Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 17 Feb 2026 16:56:18 +0100 Subject: firmware: google: Export coreboot table entries Move types for coreboot table entries to . Allows drivers in other subsystems to use these structures. Signed-off-by: Thomas Zimmermann Acked-by: Tzung-Bi Shih Acked-by: Julius Werner Link: https://patch.msgid.link/20260217155836.96267-9-tzimmermann@suse.de --- include/linux/coreboot.h | 66 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 include/linux/coreboot.h (limited to 'include/linux') diff --git a/include/linux/coreboot.h b/include/linux/coreboot.h new file mode 100644 index 000000000000..48705b439c6e --- /dev/null +++ b/include/linux/coreboot.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * coreboot.h + * + * Coreboot device and driver interfaces. + * + * Copyright 2014 Gerd Hoffmann + * Copyright 2017 Google Inc. + * Copyright 2017 Samuel Holland + */ + +#ifndef _LINUX_COREBOOT_H +#define _LINUX_COREBOOT_H + +#include + +/* List of coreboot entry structures that is used */ + +#define CB_TAG_FRAMEBUFFER 0x12 +#define LB_TAG_CBMEM_ENTRY 0x31 + +/* Generic */ +struct coreboot_table_entry { + u32 tag; + u32 size; +}; + +/* Points to a CBMEM entry */ +struct lb_cbmem_ref { + u32 tag; + u32 size; + + u64 cbmem_addr; +}; + +/* Corresponds to LB_TAG_CBMEM_ENTRY */ +struct lb_cbmem_entry { + u32 tag; + u32 size; + + u64 address; + u32 entry_size; + u32 id; +}; + +/* Describes framebuffer setup by coreboot */ +struct lb_framebuffer { + u32 tag; + u32 size; + + u64 physical_address; + u32 x_resolution; + u32 y_resolution; + u32 bytes_per_line; + u8 bits_per_pixel; + u8 red_mask_pos; + u8 red_mask_size; + u8 green_mask_pos; + u8 green_mask_size; + u8 blue_mask_pos; + u8 blue_mask_size; + u8 reserved_mask_pos; + u8 reserved_mask_size; +}; + +#endif /* _LINUX_COREBOOT_H */ -- cgit v1.2.3 From 27fc52b5505a3acca96b884a4bf1345344e5a566 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 17 Feb 2026 16:56:19 +0100 Subject: firmware: google: Pack structures for coreboot table entries Pack the fields in the coreboot table entries. These entries are part of the coreboot ABI, so they don't follow regular calling conventions. Fields of type u64 are aligned to boundaries of 4 bytes instead of 8. [1] So far this has not been a problem. In the future, padding bytes should be added where explicit alignment is required. Signed-off-by: Thomas Zimmermann Link: https://github.com/coreboot/coreboot/blob/main/payloads/libpayload/include/coreboot_tables.h#L96 # [1] Suggested-by: Julius Werner Acked-by: Julius Werner Acked-by: Tzung-Bi Shih Link: https://patch.msgid.link/20260217155836.96267-10-tzimmermann@suse.de --- include/linux/coreboot.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/coreboot.h b/include/linux/coreboot.h index 48705b439c6e..5746b99a070d 100644 --- a/include/linux/coreboot.h +++ b/include/linux/coreboot.h @@ -12,8 +12,11 @@ #ifndef _LINUX_COREBOOT_H #define _LINUX_COREBOOT_H +#include #include +typedef __aligned(4) u64 cb_u64; + /* List of coreboot entry structures that is used */ #define CB_TAG_FRAMEBUFFER 0x12 @@ -30,7 +33,7 @@ struct lb_cbmem_ref { u32 tag; u32 size; - u64 cbmem_addr; + cb_u64 cbmem_addr; }; /* Corresponds to LB_TAG_CBMEM_ENTRY */ @@ -38,7 +41,7 @@ struct lb_cbmem_entry { u32 tag; u32 size; - u64 address; + cb_u64 address; u32 entry_size; u32 id; }; @@ -48,7 +51,7 @@ struct lb_framebuffer { u32 tag; u32 size; - u64 physical_address; + cb_u64 physical_address; u32 x_resolution; u32 y_resolution; u32 bytes_per_line; -- cgit v1.2.3 From a29a1f0ec8d69ee917a9d4c84b844df0decff0ef Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 17 Feb 2026 16:56:21 +0100 Subject: drm/sysfb: corebootdrm: Add DRM driver for coreboot framebuffers Add corebootdrm, a DRM driver for coreboot framebuffers. The driver supports a pre-initialized framebuffer with various packed RGB formats. The driver code is fairly small and uses the same logic as the other sysfb drivers. Most of the implementation comes from existing sysfb helpers. Until now, coreboot relied on simpledrm or simplefb for boot-up graphics output. Initialize the platform device for corebootdrm in the same place in framebuffer_probe(). With a later commit, the simple-framebuffer should be removed. v4: - sort include statements (Tzung-Bi) v3: - comment on _HAS_LFB semantics (Tzung-Bi) - fix typo in commit description (Tzung-Bi) - comment on simple-framebuffer being obsolete for coreboot v2: - reimplement as platform driver - limit resources and mappings to known framebuffer memory; no page alignment - create corebootdrm device from coreboot framebuffer code Signed-off-by: Thomas Zimmermann Reviewed-by: Javier Martinez Canillas Acked-by: Julius Werner Acked-by: Tzung-Bi Shih # coreboot Link: https://patch.msgid.link/20260217155836.96267-12-tzimmermann@suse.de --- include/linux/coreboot.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/coreboot.h b/include/linux/coreboot.h index 5746b99a070d..885da106fee3 100644 --- a/include/linux/coreboot.h +++ b/include/linux/coreboot.h @@ -13,6 +13,7 @@ #define _LINUX_COREBOOT_H #include +#include #include typedef __aligned(4) u64 cb_u64; @@ -66,4 +67,11 @@ struct lb_framebuffer { u8 reserved_mask_size; }; +/* + * True if the coreboot-provided data is large enough to hold information + * on the linear framebuffer. False otherwise. + */ +#define LB_FRAMEBUFFER_HAS_LFB(__fb) \ + ((__fb)->size >= offsetofend(struct lb_framebuffer, reserved_mask_size)) + #endif /* _LINUX_COREBOOT_H */ -- cgit v1.2.3 From 058fc04b8587ad07a86dfa8f99d8d99db0a55443 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 17 Feb 2026 16:56:22 +0100 Subject: drm/sysfb: corebootdrm: Support panel orientation Add fields and constants for coreboot framebuffer orientation. Set corebootdrm's DRM connector state from the values. Not all firmware provides orientation, so make it optional. Systems without, continue to use unknown orientation. v3: - comment on _HAS_ORIENTATION semantics (Tzung-Bi) Signed-off-by: Thomas Zimmermann Reviewed-by: Javier Martinez Canillas Acked-by: Julius Werner Acked-by: Tzung-Bi Shih # coreboot Link: https://patch.msgid.link/20260217155836.96267-13-tzimmermann@suse.de --- include/linux/coreboot.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/coreboot.h b/include/linux/coreboot.h index 885da106fee3..5d40ca7a1d89 100644 --- a/include/linux/coreboot.h +++ b/include/linux/coreboot.h @@ -47,6 +47,11 @@ struct lb_cbmem_entry { u32 id; }; +#define LB_FRAMEBUFFER_ORIENTATION_NORMAL 0 +#define LB_FRAMEBUFFER_ORIENTATION_BOTTOM_UP 1 +#define LB_FRAMEBUFFER_ORIENTATION_LEFT_UP 2 +#define LB_FRAMEBUFFER_ORIENTATION_RIGHT_UP 3 + /* Describes framebuffer setup by coreboot */ struct lb_framebuffer { u32 tag; @@ -65,6 +70,7 @@ struct lb_framebuffer { u8 blue_mask_size; u8 reserved_mask_pos; u8 reserved_mask_size; + u8 orientation; }; /* @@ -74,4 +80,11 @@ struct lb_framebuffer { #define LB_FRAMEBUFFER_HAS_LFB(__fb) \ ((__fb)->size >= offsetofend(struct lb_framebuffer, reserved_mask_size)) +/* + * True if the coreboot-provided data is large enough to hold information + * on the display orientation. False otherwise. + */ +#define LB_FRAMEBUFFER_HAS_ORIENTATION(__fb) \ + ((__fb)->size >= offsetofend(struct lb_framebuffer, orientation)) + #endif /* _LINUX_COREBOOT_H */ -- cgit v1.2.3 From dc652a33cf08ecd7c9935bf9168a1a27c9a246f0 Mon Sep 17 00:00:00 2001 From: Brian Masney Date: Fri, 12 Dec 2025 08:41:42 +0900 Subject: clk: remove round_rate() clk ops The round_rate() clk ops is deprecated, and all in tree drivers have been converted, so let's go ahead and remove any references to the round_rate() clk ops. Signed-off-by: Brian Masney --- include/linux/clk-provider.h | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 630705a47129..1cda2c78dffa 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -136,10 +136,6 @@ struct clk_duty { * 0. Returns the calculated rate. Optional, but recommended - if * this op is not set then clock rate will be initialized to 0. * - * @round_rate: Given a target rate as input, returns the closest rate actually - * supported by the clock. The parent rate is an input/output - * parameter. - * * @determine_rate: Given a target rate as input, returns the closest rate * actually supported by the clock, and optionally the parent clock * that should be used to provide the clock rate. @@ -163,13 +159,13 @@ struct clk_duty { * * @set_rate: Change the rate of this clock. The requested rate is specified * by the second argument, which should typically be the return - * of .round_rate call. The third argument gives the parent rate - * which is likely helpful for most .set_rate implementation. + * of .determine_rate call. The third argument gives the parent + * rate which is likely helpful for most .set_rate implementation. * Returns 0 on success, -EERROR otherwise. * * @set_rate_and_parent: Change the rate and the parent of this clock. The * requested rate is specified by the second argument, which - * should typically be the return of .round_rate call. The + * should typically be the return of clk_round_rate() call. The * third argument gives the parent rate which is likely helpful * for most .set_rate_and_parent implementation. The fourth * argument gives the parent index. This callback is optional (and @@ -244,8 +240,6 @@ struct clk_ops { void (*restore_context)(struct clk_hw *hw); unsigned long (*recalc_rate)(struct clk_hw *hw, unsigned long parent_rate); - long (*round_rate)(struct clk_hw *hw, unsigned long rate, - unsigned long *parent_rate); int (*determine_rate)(struct clk_hw *hw, struct clk_rate_request *req); int (*set_parent)(struct clk_hw *hw, u8 index); @@ -679,7 +673,7 @@ struct clk_div_table { * @lock: register lock * * Clock with an adjustable divider affecting its output frequency. Implements - * .recalc_rate, .set_rate and .round_rate + * .recalc_rate, .set_rate and .determine_rate * * @flags: * CLK_DIVIDER_ONE_BASED - by default the divisor is the value read from the @@ -1126,7 +1120,7 @@ void of_fixed_factor_clk_setup(struct device_node *node); * * Clock with a fixed multiplier and divider. The output frequency is the * parent clock rate divided by div and multiplied by mult. - * Implements .recalc_rate, .set_rate, .round_rate and .recalc_accuracy + * Implements .recalc_rate, .set_rate, .determine_rate and .recalc_accuracy * * Flags: * * CLK_FIXED_FACTOR_FIXED_ACCURACY - Use the value in @acc instead of the @@ -1254,7 +1248,7 @@ void clk_hw_unregister_fractional_divider(struct clk_hw *hw); * @lock: register lock * * Clock with an adjustable multiplier affecting its output frequency. - * Implements .recalc_rate, .set_rate and .round_rate + * Implements .recalc_rate, .set_rate and .determine_rate * * @flags: * CLK_MULTIPLIER_ZERO_BYPASS - By default, the multiplier is the value read -- cgit v1.2.3 From 4b5231d608d00749a2346a3dd11bd6d05c0662e3 Mon Sep 17 00:00:00 2001 From: Brian Masney Date: Thu, 8 Jan 2026 16:16:44 -0500 Subject: clk: divider: remove divider_ro_round_rate_parent() There are no remaining users of divider_ro_round_rate_parent(), so let's go ahead and remove it. Signed-off-by: Brian Masney --- include/linux/clk-provider.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 1cda2c78dffa..0d31077749fb 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -737,10 +737,6 @@ long divider_round_rate_parent(struct clk_hw *hw, struct clk_hw *parent, unsigned long rate, unsigned long *prate, const struct clk_div_table *table, u8 width, unsigned long flags); -long divider_ro_round_rate_parent(struct clk_hw *hw, struct clk_hw *parent, - unsigned long rate, unsigned long *prate, - const struct clk_div_table *table, u8 width, - unsigned long flags, unsigned int val); int divider_determine_rate(struct clk_hw *hw, struct clk_rate_request *req, const struct clk_div_table *table, u8 width, unsigned long flags); @@ -1440,17 +1436,6 @@ static inline long divider_round_rate(struct clk_hw *hw, unsigned long rate, rate, prate, table, width, flags); } -static inline long divider_ro_round_rate(struct clk_hw *hw, unsigned long rate, - unsigned long *prate, - const struct clk_div_table *table, - u8 width, unsigned long flags, - unsigned int val) -{ - return divider_ro_round_rate_parent(hw, clk_hw_get_parent(hw), - rate, prate, table, width, flags, - val); -} - /* * FIXME clock api without lock protection */ -- cgit v1.2.3 From d4851759742c1322f498021dab882d322fc34a1d Mon Sep 17 00:00:00 2001 From: Brian Masney Date: Thu, 8 Jan 2026 16:16:45 -0500 Subject: clk: divider: remove divider_round_rate() and divider_round_rate_parent() There are no remaining users of divider_round_rate() and divider_round_rate_parent(), so let's go ahead and remove them. Signed-off-by: Brian Masney --- include/linux/clk-provider.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 0d31077749fb..4d21602d7dbd 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -733,10 +733,6 @@ extern const struct clk_ops clk_divider_ro_ops; unsigned long divider_recalc_rate(struct clk_hw *hw, unsigned long parent_rate, unsigned int val, const struct clk_div_table *table, unsigned long flags, unsigned long width); -long divider_round_rate_parent(struct clk_hw *hw, struct clk_hw *parent, - unsigned long rate, unsigned long *prate, - const struct clk_div_table *table, - u8 width, unsigned long flags); int divider_determine_rate(struct clk_hw *hw, struct clk_rate_request *req, const struct clk_div_table *table, u8 width, unsigned long flags); @@ -1427,15 +1423,6 @@ static inline void __clk_hw_set_clk(struct clk_hw *dst, struct clk_hw *src) dst->core = src->core; } -static inline long divider_round_rate(struct clk_hw *hw, unsigned long rate, - unsigned long *prate, - const struct clk_div_table *table, - u8 width, unsigned long flags) -{ - return divider_round_rate_parent(hw, clk_hw_get_parent(hw), - rate, prate, table, width, flags); -} - /* * FIXME clock api without lock protection */ -- cgit v1.2.3 From 5cab6d386bd30c3bb4efceb05b25842a6f144693 Mon Sep 17 00:00:00 2001 From: Sanjay Yadav Date: Thu, 12 Feb 2026 14:55:29 +0530 Subject: drm/buddy: Add kernel-doc for allocator structures and flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add missing kernel-doc for GPU buddy allocator flags, gpu_buddy_block, and gpu_buddy. The documentation covers block header fields, allocator roots, free trees, and allocation flags such as RANGE, TOPDOWN, CONTIGUOUS, CLEAR, and TRIM_DISABLE. Private members are marked with kernel-doc private markers and documented with regular comments. No functional changes. v2: - Corrected GPU_BUDDY_CLEAR_TREE and GPU_BUDDY_DIRTY_TREE index values (Arun) - Rebased after DRM buddy allocator moved to drivers/gpu/ - Updated commit message v3: - Document reserved bits 8:6 in header layout (Arun) - Fix checkpatch warning Cc: Christian König Cc: Arunpravin Paneer Selvam Suggested-by: Matthew Auld Signed-off-by: Sanjay Yadav Reviewed-by: Arunpravin Paneer Selvam Signed-off-by: Arunpravin Paneer Selvam Link: https://patch.msgid.link/20260212092527.718455-5-sanjay.kumar.yadav@intel.com --- include/linux/gpu_buddy.h | 123 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 103 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpu_buddy.h b/include/linux/gpu_buddy.h index 07ac65db6d2e..bf2a42256536 100644 --- a/include/linux/gpu_buddy.h +++ b/include/linux/gpu_buddy.h @@ -12,11 +12,58 @@ #include #include +/** + * GPU_BUDDY_RANGE_ALLOCATION - Allocate within a specific address range + * + * When set, allocation is restricted to the range [start, end) specified + * in gpu_buddy_alloc_blocks(). Without this flag, start/end are ignored + * and allocation can use any free space. + */ #define GPU_BUDDY_RANGE_ALLOCATION BIT(0) + +/** + * GPU_BUDDY_TOPDOWN_ALLOCATION - Allocate from top of address space + * + * Allocate starting from high addresses and working down. Useful for + * separating different allocation types (e.g., kernel vs userspace) + * to reduce fragmentation. + */ #define GPU_BUDDY_TOPDOWN_ALLOCATION BIT(1) + +/** + * GPU_BUDDY_CONTIGUOUS_ALLOCATION - Require physically contiguous blocks + * + * The allocation must be satisfied with a single contiguous block. + * If the requested size cannot be allocated contiguously, the + * allocation fails with -ENOSPC. + */ #define GPU_BUDDY_CONTIGUOUS_ALLOCATION BIT(2) + +/** + * GPU_BUDDY_CLEAR_ALLOCATION - Prefer pre-cleared (zeroed) memory + * + * Attempt to allocate from the clear tree first. If insufficient clear + * memory is available, falls back to dirty memory. Useful when the + * caller needs zeroed memory and wants to avoid GPU clear operations. + */ #define GPU_BUDDY_CLEAR_ALLOCATION BIT(3) + +/** + * GPU_BUDDY_CLEARED - Mark returned blocks as cleared + * + * Used with gpu_buddy_free_list() to indicate that the memory being + * freed has been cleared (zeroed). The blocks will be placed in the + * clear tree for future GPU_BUDDY_CLEAR_ALLOCATION requests. + */ #define GPU_BUDDY_CLEARED BIT(4) + +/** + * GPU_BUDDY_TRIM_DISABLE - Disable automatic block trimming + * + * By default, if an allocation is smaller than the allocated block, + * excess memory is trimmed and returned to the free pool. This flag + * disables trimming, keeping the full power-of-two block size. + */ #define GPU_BUDDY_TRIM_DISABLE BIT(5) enum gpu_buddy_free_tree { @@ -28,7 +75,28 @@ enum gpu_buddy_free_tree { #define for_each_free_tree(tree) \ for ((tree) = 0; (tree) < GPU_BUDDY_MAX_FREE_TREES; (tree)++) +/** + * struct gpu_buddy_block - Block within a buddy allocator + * + * Each block in the buddy allocator is represented by this structure. + * Blocks are organized in a binary tree where each parent block can be + * split into two children (left and right buddies). The allocator manages + * blocks at various orders (power-of-2 sizes) from chunk_size up to the + * largest contiguous region. + * + * @private: Private data owned by the allocator user (e.g., driver-specific data) + * @link: List node for user ownership while block is allocated + */ struct gpu_buddy_block { +/* private: */ + /* + * Header bit layout: + * - Bits 63:12: block offset within the address space + * - Bits 11:10: state (ALLOCATED, FREE, or SPLIT) + * - Bit 9: clear bit (1 if memory is zeroed) + * - Bits 8:6: reserved + * - Bits 5:0: order (log2 of size relative to chunk_size) + */ #define GPU_BUDDY_HEADER_OFFSET GENMASK_ULL(63, 12) #define GPU_BUDDY_HEADER_STATE GENMASK_ULL(11, 10) #define GPU_BUDDY_ALLOCATED (1 << 10) @@ -43,7 +111,7 @@ struct gpu_buddy_block { struct gpu_buddy_block *left; struct gpu_buddy_block *right; struct gpu_buddy_block *parent; - +/* public: */ void *private; /* owned by creator */ /* @@ -53,43 +121,58 @@ struct gpu_buddy_block { * gpu_buddy_free* ownership is given back to the mm. */ union { +/* private: */ struct rb_node rb; +/* public: */ struct list_head link; }; - +/* private: */ struct list_head tmp_link; }; /* Order-zero must be at least SZ_4K */ #define GPU_BUDDY_MAX_ORDER (63 - 12) -/* - * Binary Buddy System. +/** + * struct gpu_buddy - GPU binary buddy allocator + * + * The buddy allocator provides efficient power-of-two memory allocation + * with fast allocation and free operations. It is commonly used for GPU + * memory management where allocations can be split into power-of-two + * block sizes. * - * Locking should be handled by the user, a simple mutex around - * gpu_buddy_alloc* and gpu_buddy_free* should suffice. + * Locking should be handled by the user; a simple mutex around + * gpu_buddy_alloc_blocks() and gpu_buddy_free_block()/gpu_buddy_free_list() + * should suffice. + * + * @n_roots: Number of root blocks in the roots array. + * @max_order: Maximum block order (log2 of largest block size / chunk_size). + * @chunk_size: Minimum allocation granularity in bytes. Must be at least SZ_4K. + * @size: Total size of the address space managed by this allocator in bytes. + * @avail: Total free space currently available for allocation in bytes. + * @clear_avail: Free space available in the clear tree (zeroed memory) in bytes. + * This is a subset of @avail. */ struct gpu_buddy { - /* Maintain a free list for each order. */ - struct rb_root **free_trees; - +/* private: */ /* - * Maintain explicit binary tree(s) to track the allocation of the - * address space. This gives us a simple way of finding a buddy block - * and performing the potentially recursive merge step when freeing a - * block. Nodes are either allocated or free, in which case they will - * also exist on the respective free list. + * Array of red-black trees for free block management. + * Indexed as free_trees[clear/dirty][order] where: + * - Index 0 (GPU_BUDDY_CLEAR_TREE): blocks with zeroed content + * - Index 1 (GPU_BUDDY_DIRTY_TREE): blocks with unknown content + * Each tree holds free blocks of the corresponding order. */ - struct gpu_buddy_block **roots; - + struct rb_root **free_trees; /* - * Anything from here is public, and remains static for the lifetime of - * the mm. Everything above is considered do-not-touch. + * Array of root blocks representing the top-level blocks of the + * binary tree(s). Multiple roots exist when the total size is not + * a power of two, with each root being the largest power-of-two + * that fits in the remaining space. */ + struct gpu_buddy_block **roots; +/* public: */ unsigned int n_roots; unsigned int max_order; - - /* Must be at least SZ_4K */ u64 chunk_size; u64 size; u64 avail; -- cgit v1.2.3 From df8c7892e06efa5df2aa780a338f33a4f666370b Mon Sep 17 00:00:00 2001 From: Sanjay Yadav Date: Thu, 12 Feb 2026 14:55:30 +0530 Subject: drm/buddy: Move internal helpers to buddy.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move gpu_buddy_block_state(), gpu_buddy_block_is_allocated(), and gpu_buddy_block_is_split() from gpu_buddy.h to gpu_buddy.c as static functions since they have no external callers. Remove gpu_get_buddy() as it was an unused exported wrapper around the internal __get_buddy(). No functional changes. v2: - Rebased after DRM buddy allocator moved to drivers/gpu/ - Keep gpu_buddy_block_is_free() in header since it's now used by drm_buddy.c - Updated commit message Cc: Christian König Cc: Arunpravin Paneer Selvam Suggested-by: Matthew Auld Signed-off-by: Sanjay Yadav Reviewed-by: Arunpravin Paneer Selvam Signed-off-by: Arunpravin Paneer Selvam Link: https://patch.msgid.link/20260212092527.718455-6-sanjay.kumar.yadav@intel.com --- include/linux/gpu_buddy.h | 25 ++----------------------- 1 file changed, 2 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpu_buddy.h b/include/linux/gpu_buddy.h index bf2a42256536..f1fb6eff604a 100644 --- a/include/linux/gpu_buddy.h +++ b/include/linux/gpu_buddy.h @@ -191,16 +191,10 @@ gpu_buddy_block_order(struct gpu_buddy_block *block) return block->header & GPU_BUDDY_HEADER_ORDER; } -static inline unsigned int -gpu_buddy_block_state(struct gpu_buddy_block *block) -{ - return block->header & GPU_BUDDY_HEADER_STATE; -} - static inline bool -gpu_buddy_block_is_allocated(struct gpu_buddy_block *block) +gpu_buddy_block_is_free(struct gpu_buddy_block *block) { - return gpu_buddy_block_state(block) == GPU_BUDDY_ALLOCATED; + return (block->header & GPU_BUDDY_HEADER_STATE) == GPU_BUDDY_FREE; } static inline bool @@ -209,18 +203,6 @@ gpu_buddy_block_is_clear(struct gpu_buddy_block *block) return block->header & GPU_BUDDY_HEADER_CLEAR; } -static inline bool -gpu_buddy_block_is_free(struct gpu_buddy_block *block) -{ - return gpu_buddy_block_state(block) == GPU_BUDDY_FREE; -} - -static inline bool -gpu_buddy_block_is_split(struct gpu_buddy_block *block) -{ - return gpu_buddy_block_state(block) == GPU_BUDDY_SPLIT; -} - static inline u64 gpu_buddy_block_size(struct gpu_buddy *mm, struct gpu_buddy_block *block) @@ -232,9 +214,6 @@ int gpu_buddy_init(struct gpu_buddy *mm, u64 size, u64 chunk_size); void gpu_buddy_fini(struct gpu_buddy *mm); -struct gpu_buddy_block * -gpu_get_buddy(struct gpu_buddy_block *block); - int gpu_buddy_alloc_blocks(struct gpu_buddy *mm, u64 start, u64 end, u64 size, u64 min_page_size, -- cgit v1.2.3 From 8167d7f674648cfd428ed49773522f9df5c4fdfd Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 15 Feb 2026 21:44:18 -0800 Subject: soundwire: sdw.h: repair names and format of kernel-doc comments Fix all kernel-doc warnings in sdw.h: Warning: include/linux/soundwire/sdw.h:538 cannot understand function prototype: 'enum sdw_reg_bank' Warning: include/linux/soundwire/sdw.h:779 struct member 'port_num' not described in 'sdw_transport_params' Warning: include/linux/soundwire/sdw.h:792 struct member 'port_num' not described in 'sdw_enable_ch' Warning: include/linux/soundwire/sdw.h:892 cannot understand function prototype: 'struct sdw_port_config' Warning: include/linux/soundwire/sdw.h:906 cannot understand function prototype: 'struct sdw_stream_config' Warning: include/linux/soundwire/sdw.h:925 cannot understand function prototype: 'enum sdw_stream_state' Warning: include/linux/soundwire/sdw.h:942 cannot understand function prototype: 'struct sdw_stream_params' Warning: include/linux/soundwire/sdw.h:960 cannot understand function prototype: 'struct sdw_stream_runtime' Warning: include/linux/soundwire/sdw.h:1047 struct member 'bpt_stream_refcount' not described in 'sdw_bus' Signed-off-by: Randy Dunlap Reviewed-by: Charles Keepax Link: https://patch.msgid.link/20260216054418.2766846-1-rdunlap@infradead.org Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index f462717acf20..6147eb1fb210 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -532,7 +532,7 @@ struct sdw_slave_intr_status { }; /** - * sdw_reg_bank - SoundWire register banks + * enum sdw_reg_bank - SoundWire register banks * @SDW_BANK0: Soundwire register bank 0 * @SDW_BANK1: Soundwire register bank 1 */ @@ -751,7 +751,7 @@ struct sdw_port_params { * struct sdw_transport_params: Data Port Transport Parameters * * @blk_grp_ctrl_valid: Port implements block group control - * @num: Port number + * @port_num: Port number * @blk_grp_ctrl: Block group control value * @sample_interval: Sample interval * @offset1: Blockoffset of the payload data @@ -782,7 +782,7 @@ struct sdw_transport_params { /** * struct sdw_enable_ch: Enable/disable Data Port channel * - * @num: Port number + * @port_num: Port number * @ch_mask: Active channel mask * @enable: Enable (true) /disable (false) channel */ @@ -885,7 +885,7 @@ void sdw_bus_master_delete(struct sdw_bus *bus); void sdw_show_ping_status(struct sdw_bus *bus, bool sync_delay); /** - * sdw_port_config: Master or Slave Port configuration + * struct sdw_port_config: Master or Slave Port configuration * * @num: Port number * @ch_mask: channels mask for port @@ -896,7 +896,7 @@ struct sdw_port_config { }; /** - * sdw_stream_config: Master or Slave stream configuration + * struct sdw_stream_config: Master or Slave stream configuration * * @frame_rate: Audio frame rate of the stream, in Hz * @ch_count: Channel count of the stream @@ -913,7 +913,7 @@ struct sdw_stream_config { }; /** - * sdw_stream_state: Stream states + * enum sdw_stream_state: Stream states * * @SDW_STREAM_ALLOCATED: New stream allocated. * @SDW_STREAM_CONFIGURED: Stream configured @@ -934,7 +934,7 @@ enum sdw_stream_state { }; /** - * sdw_stream_params: Stream parameters + * struct sdw_stream_params: Stream parameters * * @rate: Sampling frequency, in Hz * @ch_count: Number of channels @@ -947,7 +947,7 @@ struct sdw_stream_params { }; /** - * sdw_stream_runtime: Runtime stream parameters + * struct sdw_stream_runtime: Runtime stream parameters * * @name: SoundWire stream name * @params: Stream parameters @@ -983,7 +983,7 @@ struct sdw_stream_runtime { * @defer_msg: Defer message * @params: Current bus parameters * @stream_refcount: number of streams currently using this bus - * @btp_stream_refcount: number of BTP streams currently using this bus (should + * @bpt_stream_refcount: number of BTP streams currently using this bus (should * be zero or one, multiple streams per link is not supported). * @bpt_stream: pointer stored to handle BTP streams. * @ops: Master callback ops -- cgit v1.2.3 From 88440208c6074e639a7ccc038c6a7ed4b6f8bb99 Mon Sep 17 00:00:00 2001 From: Tomas Melin Date: Tue, 10 Feb 2026 10:53:34 +0000 Subject: iio: industrialio-backend: support backend capabilities MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Not all backends support the full set of capabilities provided by the industrialio-backend framework. Capability bits can be used in frontends and backends for checking for a certain feature set, or if using related functions can be expected to fail. Capability bits should be set by a compatible backend and provided when registering the backend. Reviewed-by: Nuno Sá Reviewed-by: David Lechner Signed-off-by: Tomas Melin Signed-off-by: Jonathan Cameron --- include/linux/iio/backend.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h index 7f815f3fed6a..4d15c2a9802c 100644 --- a/include/linux/iio/backend.h +++ b/include/linux/iio/backend.h @@ -84,6 +84,27 @@ enum iio_backend_filter_type { IIO_BACKEND_FILTER_TYPE_MAX }; +/** + * enum iio_backend_capabilities - Backend capabilities + * Backend capabilities can be used by frontends to check if a given + * functionality is supported by the backend. This is useful for frontend + * devices which are expected to work with alternative backend + * implementations. Capabilities are loosely coupled with operations, + * meaning that a capability requires certain operations to be implemented + * by the backend. A capability might be mapped to a single operation or + * multiple operations. + * + * @IIO_BACKEND_CAP_CALIBRATION: Backend supports digital interface + * calibration. Calibration procedure is device specific. + * @IIO_BACKEND_CAP_BUFFER: Support for IIO buffer interface. + * @IIO_BACKEND_CAP_ENABLE: Backend can be explicitly enabled/disabled. + */ +enum iio_backend_capabilities { + IIO_BACKEND_CAP_CALIBRATION = BIT(0), + IIO_BACKEND_CAP_BUFFER = BIT(1), + IIO_BACKEND_CAP_ENABLE = BIT(2), +}; + /** * struct iio_backend_ops - operations structure for an iio_backend * @enable: Enable backend. @@ -179,10 +200,12 @@ struct iio_backend_ops { * struct iio_backend_info - info structure for an iio_backend * @name: Backend name. * @ops: Backend operations. + * @caps: Backend capabilities. (bitmask of enum iio_backend_capabilities). */ struct iio_backend_info { const char *name; const struct iio_backend_ops *ops; + u32 caps; }; int iio_backend_chan_enable(struct iio_backend *back, unsigned int chan); @@ -235,6 +258,7 @@ int iio_backend_read_raw(struct iio_backend *back, long mask); int iio_backend_extend_chan_spec(struct iio_backend *back, struct iio_chan_spec *chan); +bool iio_backend_has_caps(struct iio_backend *back, u32 caps); void *iio_backend_get_priv(const struct iio_backend *conv); struct iio_backend *devm_iio_backend_get(struct device *dev, const char *name); struct iio_backend *devm_iio_backend_fwnode_get(struct device *dev, -- cgit v1.2.3 From 8b65eb52d93e4e496bd26e6867152344554eb39e Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 17 Feb 2026 11:15:10 -0800 Subject: locking/mutex: Rename mutex_init_lockep() Typo, this wants to be _lockdep(). Fixes: 51d7a054521d ("locking/mutex: Redo __mutex_init() to reduce generated code size") Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260217191512.1180151-2-dave@stgolabs.net --- include/linux/mutex.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index ecaa0440f6ec..8126da959088 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -87,12 +87,12 @@ do { \ struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) #ifdef CONFIG_DEBUG_LOCK_ALLOC -void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_key *key); +void mutex_init_lockdep(struct mutex *lock, const char *name, struct lock_class_key *key); static inline void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { - mutex_init_lockep(lock, name, key); + mutex_init_lockdep(lock, name, key); } #else extern void mutex_init_generic(struct mutex *lock); -- cgit v1.2.3 From babcde3be8c9148aa60a14b17831e8f249854963 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 17 Feb 2026 11:15:11 -0800 Subject: locking/mutex: Fix wrong comment for CONFIG_DEBUG_LOCK_ALLOC ... that endif block should be CONFIG_DEBUG_LOCK_ALLOC, not CONFIG_LOCKDEP. Fixes: 51d7a054521d ("locking/mutex: Redo __mutex_init() to reduce generated code size") Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260217191512.1180151-3-dave@stgolabs.net --- include/linux/mutex.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 8126da959088..f57d2a97da57 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -146,7 +146,7 @@ static inline void __mutex_init(struct mutex *lock, const char *name, { mutex_rt_init_generic(lock); } -#endif /* !CONFIG_LOCKDEP */ +#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ #endif /* CONFIG_PREEMPT_RT */ #ifdef CONFIG_DEBUG_MUTEXES -- cgit v1.2.3 From 50214dc4382055352fb1d7b9779550dabf5059e5 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 17 Feb 2026 11:15:12 -0800 Subject: locking/mutex: Add killable flavor to guard definitions The mutex guard family defines _try and _intr variants but is missing the killable one. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260217191512.1180151-4-dave@stgolabs.net --- include/linux/mutex.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index f57d2a97da57..2f648ee204e7 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -253,6 +253,7 @@ extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) __cond_a DEFINE_LOCK_GUARD_1(mutex, struct mutex, mutex_lock(_T->lock), mutex_unlock(_T->lock)) DEFINE_LOCK_GUARD_1_COND(mutex, _try, mutex_trylock(_T->lock)) DEFINE_LOCK_GUARD_1_COND(mutex, _intr, mutex_lock_interruptible(_T->lock), _RET == 0) +DEFINE_LOCK_GUARD_1_COND(mutex, _kill, mutex_lock_killable(_T->lock), _RET == 0) DEFINE_LOCK_GUARD_1(mutex_init, struct mutex, mutex_init(_T->lock), /* */) DECLARE_LOCK_GUARD_1_ATTRS(mutex, __acquires(_T), __releases(*(struct mutex **)_T)) @@ -261,6 +262,8 @@ DECLARE_LOCK_GUARD_1_ATTRS(mutex_try, __acquires(_T), __releases(*(struct mutex #define class_mutex_try_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(mutex_try, _T) DECLARE_LOCK_GUARD_1_ATTRS(mutex_intr, __acquires(_T), __releases(*(struct mutex **)_T)) #define class_mutex_intr_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(mutex_intr, _T) +DECLARE_LOCK_GUARD_1_ATTRS(mutex_kill, __acquires(_T), __releases(*(struct mutex **)_T)) +#define class_mutex_kill_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(mutex_kill, _T) DECLARE_LOCK_GUARD_1_ATTRS(mutex_init, __acquires(_T), __releases(*(struct mutex **)_T)) #define class_mutex_init_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(mutex_init, _T) -- cgit v1.2.3 From 263ff314cc5602599d481b0912a381555fcbad28 Mon Sep 17 00:00:00 2001 From: Avri Altman Date: Fri, 28 Nov 2025 07:20:11 +0200 Subject: mmc: core: Add quirk for incorrect manufacturing date Some eMMC vendors need to report manufacturing dates beyond 2025 but are reluctant to update the EXT_CSD revision from 8 to 9. Changing the Updating the EXT_CSD revision may involve additional testing or qualification steps with customers. To ease this transition and avoid a full re-qualification process, a workaround is needed. This patch introduces a temporary quirk that re-purposes the year codes corresponding to 2010, 2011, and 2012 to represent the years 2026, 2027, and 2028, respectively. This solution is only valid for this three-year period. After 2028, vendors must update their firmware to set EXT_CSD_REV=9 to continue reporting the correct manufacturing date in compliance with the JEDEC standard. The `MMC_QUIRK_BROKEN_MDT` is introduced and enabled for all Sandisk devices to handle this behavior. Signed-off-by: Avri Altman Signed-off-by: Ulf Hansson --- include/linux/mmc/card.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index e9e964c20e53..4722dd7e46ce 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -329,6 +329,7 @@ struct mmc_card { #define MMC_QUIRK_BROKEN_CACHE_FLUSH (1<<16) /* Don't flush cache until the write has occurred */ #define MMC_QUIRK_BROKEN_SD_POWEROFF_NOTIFY (1<<17) /* Disable broken SD poweroff notify support */ #define MMC_QUIRK_NO_UHS_DDR50_TUNING (1<<18) /* Disable DDR50 tuning */ +#define MMC_QUIRK_BROKEN_MDT (1<<19) /* Wrong manufacturing year */ bool written_flag; /* Indicates eMMC has been written since power on */ bool reenable_cmdq; /* Re-enable Command Queue */ -- cgit v1.2.3 From c0b68bc25efeaca8a1ef3a0cfab5fbf5f81d002d Mon Sep 17 00:00:00 2001 From: Jeff Chen Date: Tue, 13 Jan 2026 11:15:17 +0800 Subject: mmc: sdio: add NXP vendor and IW61x device IDs Add NXP's SDIO vendor ID (0x0471) and IW61x device ID (0x0205) to sdio_ids.h for future support of NXP Wi-Fi chips over SDIO. Signed-off-by: Jeff Chen Signed-off-by: Ulf Hansson --- include/linux/mmc/sdio_ids.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index 673cbdf43453..39ac2b612e4a 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -116,6 +116,9 @@ #define SDIO_VENDOR_ID_MICROCHIP_WILC 0x0296 #define SDIO_DEVICE_ID_MICROCHIP_WILC1000 0x5347 +#define SDIO_VENDOR_ID_NXP 0x0471 +#define SDIO_DEVICE_ID_NXP_IW61X 0x0205 + #define SDIO_VENDOR_ID_REALTEK 0x024c #define SDIO_DEVICE_ID_REALTEK_RTW8723BS 0xb723 #define SDIO_DEVICE_ID_REALTEK_RTW8821BS 0xb821 -- cgit v1.2.3 From d6bf2e64dec87322f2b11565ddb59c0e967f96e3 Mon Sep 17 00:00:00 2001 From: Luke Wang Date: Wed, 4 Feb 2026 11:40:03 +0800 Subject: mmc: core: Optimize time for secure erase/trim for some Kingston eMMCs Kingston eMMC IY2964 and IB2932 takes a fixed ~2 seconds for each secure erase/trim operation regardless of size - that is, a single secure erase/trim operation of 1MB takes the same time as 1GB. With default calculated 3.5MB max discard size, secure erase 1GB requires ~300 separate operations taking ~10 minutes total. Add a card quirk, MMC_QUIRK_FIXED_SECURE_ERASE_TRIM_TIME, to set maximum secure erase size for those devices. This allows 1GB secure erase to complete in a single operation, reducing time from 10 minutes to just 2 seconds. Signed-off-by: Luke Wang Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- include/linux/mmc/card.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 4722dd7e46ce..9dc4750296af 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -330,6 +330,7 @@ struct mmc_card { #define MMC_QUIRK_BROKEN_SD_POWEROFF_NOTIFY (1<<17) /* Disable broken SD poweroff notify support */ #define MMC_QUIRK_NO_UHS_DDR50_TUNING (1<<18) /* Disable DDR50 tuning */ #define MMC_QUIRK_BROKEN_MDT (1<<19) /* Wrong manufacturing year */ +#define MMC_QUIRK_FIXED_SECURE_ERASE_TRIM_TIME (1<<20) /* Secure erase/trim time is fixed regardless of size */ bool written_flag; /* Indicates eMMC has been written since power on */ bool reenable_cmdq; /* Re-enable Command Queue */ -- cgit v1.2.3 From 94d709be8c0dc875dfc9ebb64d3b8093d0790c15 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 16 Feb 2026 14:31:57 +0100 Subject: xattr: add rcu_head and rhash_head to struct simple_xattr In preparation for converting simple_xattrs from rbtree to rhashtable, add rhash_head and rcu_head members to struct simple_xattr. The rhashtable implementation will use rhash_head for hash table linkage and RCU-based lockless reads, requiring that replaced or removed xattr entries be freed via call_rcu() rather than immediately. Add simple_xattr_free_rcu() which schedules RCU-deferred freeing of an xattr entry. This will be used by callers of simple_xattr_set() once they switch to the rhashtable-based xattr store. No functional changes. Link: https://patch.msgid.link/20260216-work-xattr-socket-v1-1-c2efa4f74cb7@kernel.org Acked-by: Darrick J. Wong Signed-off-by: Christian Brauner --- include/linux/xattr.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 296b5ee5c979..fdbd2095414a 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -112,6 +113,8 @@ struct simple_xattrs { struct simple_xattr { struct rb_node rb_node; + struct rhash_head hash_node; + struct rcu_head rcu; char *name; size_t size; char value[] __counted_by(size); @@ -122,6 +125,7 @@ void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space); size_t simple_xattr_space(const char *name, size_t size); struct simple_xattr *simple_xattr_alloc(const void *value, size_t size); void simple_xattr_free(struct simple_xattr *xattr); +void simple_xattr_free_rcu(struct simple_xattr *xattr); int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size); struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, -- cgit v1.2.3 From b32c4a213698ab351b44da2fd1b2a5976c7fa033 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 16 Feb 2026 14:31:58 +0100 Subject: xattr: add rhashtable-based simple_xattr infrastructure Add rhashtable support to the simple_xattr subsystem while keeping the existing rbtree code fully functional. This allows consumers to be migrated one at a time without breaking any intermediate build. struct simple_xattrs gains a dispatch flag and a union holding either the rbtree (rb_root + rwlock) or rhashtable state: struct simple_xattrs { bool use_rhashtable; union { struct { struct rb_root rb_root; rwlock_t lock; }; struct rhashtable ht; }; }; simple_xattrs_init() continues to set up the rbtree path for existing embedded-struct callers. Add simple_xattrs_alloc() which dynamically allocates a simple_xattrs and initializes the rhashtable path. This is the entry point for consumers switching to pointer-based lazy allocation. The five core functions (get, set, list, add, free) dispatch based on the use_rhashtable flag. Existing callers continue to use the rbtree path unchanged. As each consumer is converted it will switch to simple_xattrs_alloc() and the rhashtable path. Once all consumers are converted a follow-up patch will remove the rbtree code. Link: https://patch.msgid.link/20260216-work-xattr-socket-v1-2-c2efa4f74cb7@kernel.org Acked-by: Darrick J. Wong Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/xattr.h | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/xattr.h b/include/linux/xattr.h index fdbd2095414a..832a44358661 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -107,8 +107,14 @@ static inline const char *xattr_prefix(const struct xattr_handler *handler) } struct simple_xattrs { - struct rb_root rb_root; - rwlock_t lock; + bool use_rhashtable; + union { + struct { + struct rb_root rb_root; + rwlock_t lock; + }; + struct rhashtable ht; + }; }; struct simple_xattr { @@ -121,6 +127,9 @@ struct simple_xattr { }; void simple_xattrs_init(struct simple_xattrs *xattrs); +struct simple_xattrs *simple_xattrs_alloc(void); +struct simple_xattrs *simple_xattrs_lazy_alloc(struct simple_xattrs **xattrsp, + const void *value, int flags); void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space); size_t simple_xattr_space(const char *name, size_t size); struct simple_xattr *simple_xattr_alloc(const void *value, size_t size); @@ -137,4 +146,16 @@ void simple_xattr_add(struct simple_xattrs *xattrs, struct simple_xattr *new_xattr); int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name); +DEFINE_CLASS(simple_xattr, + struct simple_xattr *, + if (!IS_ERR_OR_NULL(_T)) simple_xattr_free(_T), + simple_xattr_alloc(value, size), + const void *value, size_t size) + +DEFINE_CLASS(simple_xattrs, + struct simple_xattrs *, + if (!IS_ERR_OR_NULL(_T)) { simple_xattrs_free(_T, NULL); kfree(_T); }, + simple_xattrs_alloc(), + void) + #endif /* _LINUX_XATTR_H */ -- cgit v1.2.3 From 52b364fed6e1578e551fee20c76fecb3fc0e10ed Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 16 Feb 2026 14:31:59 +0100 Subject: shmem: adapt to rhashtable-based simple_xattrs with lazy allocation Adapt tmpfs/shmem to use the rhashtable-based xattr path and switch from an embedded struct to pointer-based lazy allocation. Change shmem_inode_info.xattrs from embedded 'struct simple_xattrs' to a pointer 'struct simple_xattrs *', initialized to NULL. This avoids the rhashtable overhead for every tmpfs inode, which helps when a lot of inodes exist. The xattr store is allocated on first use: - shmem_initxattrs(): Allocates via simple_xattrs_alloc() when security modules set initial xattrs during inode creation. - shmem_xattr_handler_set(): Allocates on first setxattr, with a short-circuit for removal when no xattrs are stored yet. All read paths (shmem_xattr_handler_get, shmem_listxattr) check for NULL xattrs pointer and return -ENODATA or 0 respectively. Replaced xattr entries are freed via simple_xattr_free_rcu() to allow concurrent RCU readers to finish. shmem_evict_inode() conditionally frees the xattr store only when allocated. Also change simple_xattr_add() from void to int to propagate rhashtable insertion failures. shmem_initxattrs() is the only caller. Link: https://patch.msgid.link/20260216-work-xattr-socket-v1-3-c2efa4f74cb7@kernel.org Acked-by: Darrick J. Wong Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/shmem_fs.h | 2 +- include/linux/xattr.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index a8273b32e041..f6a2d3402d76 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -48,7 +48,7 @@ struct shmem_inode_info { }; struct timespec64 i_crtime; /* file creation time */ struct shared_policy policy; /* NUMA memory alloc policy */ - struct simple_xattrs xattrs; /* list of xattrs */ + struct simple_xattrs *xattrs; /* list of xattrs */ pgoff_t fallocend; /* highest fallocate endindex */ unsigned int fsflags; /* for FS_IOC_[SG]ETFLAGS */ atomic_t stop_eviction; /* hold when working on inode */ diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 832a44358661..6e619e185e90 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -142,8 +142,8 @@ struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, size_t size, int flags); ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size); -void simple_xattr_add(struct simple_xattrs *xattrs, - struct simple_xattr *new_xattr); +int simple_xattr_add(struct simple_xattrs *xattrs, + struct simple_xattr *new_xattr); int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name); DEFINE_CLASS(simple_xattr, -- cgit v1.2.3 From f4cc3ab824d6772a48ca9d9c74ac623b3309985d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Tue, 7 Oct 2025 14:06:05 +0200 Subject: dma-buf: protected fence ops by RCU v8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fence ops of a dma_fence currently need to life as long as the dma_fence is alive. This means that the module which originally issued a dma_fence can't unload unless all fences are freed up. As first step to solve this issue protect the fence ops by RCU. While it is counter intuitive to protect a constant function pointer table by RCU it allows modules to wait for an RCU grace period before they unload, to make sure that nobody is executing their functions any more. This patch has not much functional change, but only adds the RCU handling for the static checker to test. v2: make one the now duplicated lockdep warnings a comment instead. v3: Add more documentation to ->wait and ->release callback. v4: fix typo in documentation v5: rebased on drm-tip v6: improve code comments v7: improve commit message and code comments v8: fix sparse rcu warnings Signed-off-by: Christian König Reviewed-by: Tvrtko Ursulin Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/r/20260219160822.1529-2-christian.koenig@amd.com --- include/linux/dma-fence.h | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index 9c4d25289239..fa3cfe3e98ac 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -67,7 +67,7 @@ struct seq_file; */ struct dma_fence { spinlock_t *lock; - const struct dma_fence_ops *ops; + const struct dma_fence_ops __rcu *ops; /* * We clear the callback list on kref_put so that by the time we * release the fence it is unused. No one should be adding to the @@ -220,6 +220,10 @@ struct dma_fence_ops { * timed out. Can also return other error values on custom implementations, * which should be treated as if the fence is signaled. For example a hardware * lockup could be reported like that. + * + * Implementing this callback prevents the fence from detaching after + * signaling and so it is necessary for the module providing the + * dma_fence_ops to stay loaded as long as the dma_fence exists. */ signed long (*wait)(struct dma_fence *fence, bool intr, signed long timeout); @@ -231,6 +235,13 @@ struct dma_fence_ops { * Can be called from irq context. This callback is optional. If it is * NULL, then dma_fence_free() is instead called as the default * implementation. + * + * Implementing this callback prevents the fence from detaching after + * signaling and so it is necessary for the module providing the + * dma_fence_ops to stay loaded as long as the dma_fence exists. + * + * If the callback is implemented the memory backing the dma_fence + * object must be freed RCU safe. */ void (*release)(struct dma_fence *fence); @@ -454,13 +465,19 @@ dma_fence_test_signaled_flag(struct dma_fence *fence) static inline bool dma_fence_is_signaled_locked(struct dma_fence *fence) { + const struct dma_fence_ops *ops; + if (dma_fence_test_signaled_flag(fence)) return true; - if (fence->ops->signaled && fence->ops->signaled(fence)) { + rcu_read_lock(); + ops = rcu_dereference(fence->ops); + if (ops->signaled && ops->signaled(fence)) { + rcu_read_unlock(); dma_fence_signal_locked(fence); return true; } + rcu_read_unlock(); return false; } @@ -484,13 +501,19 @@ dma_fence_is_signaled_locked(struct dma_fence *fence) static inline bool dma_fence_is_signaled(struct dma_fence *fence) { + const struct dma_fence_ops *ops; + if (dma_fence_test_signaled_flag(fence)) return true; - if (fence->ops->signaled && fence->ops->signaled(fence)) { + rcu_read_lock(); + ops = rcu_dereference(fence->ops); + if (ops->signaled && ops->signaled(fence)) { + rcu_read_unlock(); dma_fence_signal(fence); return true; } + rcu_read_unlock(); return false; } @@ -695,7 +718,7 @@ extern const struct dma_fence_ops dma_fence_chain_ops; */ static inline bool dma_fence_is_array(struct dma_fence *fence) { - return fence->ops == &dma_fence_array_ops; + return rcu_access_pointer(fence->ops) == &dma_fence_array_ops; } /** @@ -706,7 +729,7 @@ static inline bool dma_fence_is_array(struct dma_fence *fence) */ static inline bool dma_fence_is_chain(struct dma_fence *fence) { - return fence->ops == &dma_fence_chain_ops; + return rcu_access_pointer(fence->ops) == &dma_fence_chain_ops; } /** -- cgit v1.2.3 From 541c8f2468b933acc5d129e84bd264923675a66e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Wed, 8 Oct 2025 18:12:46 +0200 Subject: dma-buf: detach fence ops on signal v3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When neither a release nor a wait backend ops is specified it is possible to let the dma_fence live on independently of the module who issued it. This makes it possible to unload drivers and only wait for all their fences to signal. v2: fix typo in comment v3: fix sparse rcu warnings Signed-off-by: Christian König Reviewed-by: Tvrtko Ursulin Reviewed-by: Philipp Stanner Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/r/20260219160822.1529-3-christian.koenig@amd.com --- include/linux/dma-fence.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index fa3cfe3e98ac..9ff2c4a09cdc 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -472,7 +472,7 @@ dma_fence_is_signaled_locked(struct dma_fence *fence) rcu_read_lock(); ops = rcu_dereference(fence->ops); - if (ops->signaled && ops->signaled(fence)) { + if (ops && ops->signaled && ops->signaled(fence)) { rcu_read_unlock(); dma_fence_signal_locked(fence); return true; @@ -508,7 +508,7 @@ dma_fence_is_signaled(struct dma_fence *fence) rcu_read_lock(); ops = rcu_dereference(fence->ops); - if (ops->signaled && ops->signaled(fence)) { + if (ops && ops->signaled && ops->signaled(fence)) { rcu_read_unlock(); dma_fence_signal(fence); return true; -- cgit v1.2.3 From 3e5067931b5df667f5350fafe4410554e228e53e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Thu, 9 Oct 2025 10:40:06 +0200 Subject: dma-buf: abstract fence locking v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add dma_fence_lock_irqsafe() and dma_fence_unlock_irqrestore() wrappers and mechanically apply them everywhere. Just a pre-requisite cleanup for a follow up patch. v2: add some missing i915 bits, add abstraction for lockdep assertion as well v3: one more suggestion by Tvrtko Signed-off-by: Christian König Reviewed-by: Tvrtko Ursulin Link: https://lore.kernel.org/r/20260219160822.1529-4-christian.koenig@amd.com --- include/linux/dma-fence.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index 9ff2c4a09cdc..85d6eac9fa85 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -377,6 +377,44 @@ dma_fence_get_rcu_safe(struct dma_fence __rcu **fencep) } while (1); } +/** + * dma_fence_spinlock - return pointer to the spinlock protecting the fence + * @fence: the fence to get the lock from + * + * Return the pointer to the extern lock. + */ +static inline spinlock_t *dma_fence_spinlock(struct dma_fence *fence) +{ + return fence->lock; +} + +/** + * dma_fence_lock_irqsave - irqsave lock the fence + * @fence: the fence to lock + * @flags: where to store the CPU flags. + * + * Lock the fence, preventing it from changing to the signaled state. + */ +#define dma_fence_lock_irqsave(fence, flags) \ + spin_lock_irqsave(fence->lock, flags) + +/** + * dma_fence_unlock_irqrestore - unlock the fence and irqrestore + * @fence: the fence to unlock + * @flags the CPU flags to restore + * + * Unlock the fence, allowing it to change it's state to signaled again. + */ +#define dma_fence_unlock_irqrestore(fence, flags) \ + spin_unlock_irqrestore(fence->lock, flags) + +/** + * dma_fence_assert_held - lockdep assertion that fence is locked + * @fence: the fence which should be locked + */ +#define dma_fence_assert_held(fence) \ + lockdep_assert_held(dma_fence_spinlock(fence)); + #ifdef CONFIG_LOCKDEP bool dma_fence_begin_signalling(void); void dma_fence_end_signalling(bool cookie); -- cgit v1.2.3 From 1f32f310a13c9fb67a9993ab67f596b3f960206f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Thu, 9 Oct 2025 10:40:06 +0200 Subject: dma-buf: inline spinlock for fence protection v5 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement per-fence spinlocks, allowing implementations to not give an external spinlock to protect the fence internal state. Instead a spinlock embedded into the fence structure itself is used in this case. Shared spinlocks have the problem that implementations need to guarantee that the lock lives at least as long all fences referencing them. Using a per-fence spinlock allows completely decoupling spinlock producer and consumer life times, simplifying the handling in most use cases. v2: improve naming, coverage and function documentation v3: fix one additional locking in the selftests v4: separate out some changes to make the patch smaller, fix one amdgpu crash found by CI systems v5: improve comments Signed-off-by: Christian König Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/r/20260219160822.1529-5-christian.koenig@amd.com --- include/linux/dma-fence.h | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index 85d6eac9fa85..3dc93f068bf6 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -34,7 +34,8 @@ struct seq_file; * @ops: dma_fence_ops associated with this fence * @rcu: used for releasing fence with kfree_rcu * @cb_list: list of all callbacks to call - * @lock: spin_lock_irqsave used for locking + * @extern_lock: external spin_lock_irqsave used for locking (deprecated) + * @inline_lock: alternative internal spin_lock_irqsave used for locking * @context: execution context this fence belongs to, returned by * dma_fence_context_alloc() * @seqno: the sequence number of this fence inside the execution context, @@ -49,6 +50,7 @@ struct seq_file; * of the time. * * DMA_FENCE_FLAG_INITIALIZED_BIT - fence was initialized + * DMA_FENCE_FLAG_INLINE_LOCK_BIT - use inline spinlock instead of external one * DMA_FENCE_FLAG_SIGNALED_BIT - fence is already signaled * DMA_FENCE_FLAG_TIMESTAMP_BIT - timestamp recorded for fence signaling * DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT - enable_signaling might have been called @@ -66,7 +68,10 @@ struct seq_file; * been completed, or never called at all. */ struct dma_fence { - spinlock_t *lock; + union { + spinlock_t *extern_lock; + spinlock_t inline_lock; + }; const struct dma_fence_ops __rcu *ops; /* * We clear the callback list on kref_put so that by the time we @@ -100,6 +105,7 @@ struct dma_fence { enum dma_fence_flag_bits { DMA_FENCE_FLAG_INITIALIZED_BIT, + DMA_FENCE_FLAG_INLINE_LOCK_BIT, DMA_FENCE_FLAG_SEQNO64_BIT, DMA_FENCE_FLAG_SIGNALED_BIT, DMA_FENCE_FLAG_TIMESTAMP_BIT, @@ -381,11 +387,12 @@ dma_fence_get_rcu_safe(struct dma_fence __rcu **fencep) * dma_fence_spinlock - return pointer to the spinlock protecting the fence * @fence: the fence to get the lock from * - * Return the pointer to the extern lock. + * Return either the pointer to the embedded or the external spin lock. */ static inline spinlock_t *dma_fence_spinlock(struct dma_fence *fence) { - return fence->lock; + return test_bit(DMA_FENCE_FLAG_INLINE_LOCK_BIT, &fence->flags) ? + &fence->inline_lock : fence->extern_lock; } /** @@ -396,7 +403,7 @@ static inline spinlock_t *dma_fence_spinlock(struct dma_fence *fence) * Lock the fence, preventing it from changing to the signaled state. */ #define dma_fence_lock_irqsave(fence, flags) \ - spin_lock_irqsave(fence->lock, flags) + spin_lock_irqsave(dma_fence_spinlock(fence), flags) /** * dma_fence_unlock_irqrestore - unlock the fence and irqrestore @@ -406,7 +413,7 @@ static inline spinlock_t *dma_fence_spinlock(struct dma_fence *fence) * Unlock the fence, allowing it to change it's state to signaled again. */ #define dma_fence_unlock_irqrestore(fence, flags) \ - spin_unlock_irqrestore(fence->lock, flags) + spin_unlock_irqrestore(dma_fence_spinlock(fence), flags) /** * dma_fence_assert_held - lockdep assertion that fence is locked -- cgit v1.2.3 From 5943243914b9fed8e26edcb9d45421721a5e3576 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Thu, 9 Oct 2025 16:18:53 +0200 Subject: dma-buf: use inline lock for the dma-fence-array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using the inline lock is now the recommended way for dma_fence implementations. So use this approach for the framework's internal fences as well. Also saves about 4 bytes for the external spinlock. Signed-off-by: Christian König Reviewed-by: Tvrtko Ursulin Reviewed-by: Philipp Stanner Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/r/20260219160822.1529-8-christian.koenig@amd.com --- include/linux/dma-fence-array.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dma-fence-array.h b/include/linux/dma-fence-array.h index 079b3dec0a16..370b3d2bba37 100644 --- a/include/linux/dma-fence-array.h +++ b/include/linux/dma-fence-array.h @@ -38,7 +38,6 @@ struct dma_fence_array_cb { struct dma_fence_array { struct dma_fence base; - spinlock_t lock; unsigned num_fences; atomic_t num_pending; struct dma_fence **fences; -- cgit v1.2.3 From a408c0ca0c411ca1ead995bdae3112a806c87556 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Thu, 9 Oct 2025 16:32:33 +0200 Subject: dma-buf: use inline lock for the dma-fence-chain MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using the inline lock is now the recommended way for dma_fence implementations. So use this approach for the framework's internal fences as well. Also saves about 4 bytes for the external spinlock. Signed-off-by: Christian König Reviewed-by: Tvrtko Ursulin Reviewed-by: Philipp Stanner Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/r/20260219160822.1529-9-christian.koenig@amd.com --- include/linux/dma-fence-chain.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dma-fence-chain.h b/include/linux/dma-fence-chain.h index 5cd3ba53b4a1..df3beadf1515 100644 --- a/include/linux/dma-fence-chain.h +++ b/include/linux/dma-fence-chain.h @@ -46,7 +46,6 @@ struct dma_fence_chain { */ struct irq_work work; }; - spinlock_t lock; }; -- cgit v1.2.3 From 2a7b7652b1bb3fadc3bd47d622bfb127a93ab6b0 Mon Sep 17 00:00:00 2001 From: Leif Skunberg Date: Tue, 10 Feb 2026 14:21:29 +0100 Subject: platform/x86: int3472: Handle GPIO type 0x10 (DOVDD) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Lenovo ThinkPad X1 Fold 16 Gen 1 has an OV5675 sensor (ACPI HID OVTI5675) behind an INT3472 discrete PMIC controller. The INT3472 _DSM returns GPIO type 0x10 for one of the pins, which controls the DOVDD (digital I/O power) regulator enable. Type 0x10 is not currently handled by the driver, causing the GPIO to be ignored with a warning. Add INT3472_GPIO_TYPE_DOVDD (0x10) and handle it as a regulator with con_id "dovdd" to match the supply name used by sensor drivers (e.g. ov5675). Also increase GPIO_SUPPLY_NAME_LENGTH from 5 to 6 to accommodate the "dovdd" name (5 chars + null terminator). Signed-off-by: Leif Skunberg Reviewed-by: Hans de Goede Link: https://patch.msgid.link/20260210132129.17943-1-diamondback@cohunt.app Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/x86/int3472.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/int3472.h b/include/linux/platform_data/x86/int3472.h index b1b837583d54..dbe745dc88d5 100644 --- a/include/linux/platform_data/x86/int3472.h +++ b/include/linux/platform_data/x86/int3472.h @@ -26,6 +26,7 @@ #define INT3472_GPIO_TYPE_POWER_ENABLE 0x0b #define INT3472_GPIO_TYPE_CLK_ENABLE 0x0c #define INT3472_GPIO_TYPE_PRIVACY_LED 0x0d +#define INT3472_GPIO_TYPE_DOVDD 0x10 #define INT3472_GPIO_TYPE_HANDSHAKE 0x12 #define INT3472_GPIO_TYPE_HOTPLUG_DETECT 0x13 @@ -33,8 +34,8 @@ #define INT3472_MAX_SENSOR_GPIOS 3 #define INT3472_MAX_REGULATORS 3 -/* E.g. "avdd\0" */ -#define GPIO_SUPPLY_NAME_LENGTH 5 +/* E.g. "dovdd\0" */ +#define GPIO_SUPPLY_NAME_LENGTH 6 /* 12 chars for acpi_dev_name() + "-", e.g. "ABCD1234:00-" */ #define GPIO_REGULATOR_NAME_LENGTH (12 + GPIO_SUPPLY_NAME_LENGTH) /* lower- and upper-case mapping */ -- cgit v1.2.3 From 9fe89f022c05d99c052d6bc088b82d4ff83bf463 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 27 Jan 2026 16:17:48 +0100 Subject: sched/fair: More complex proportional newidle balance It turns out that a few workloads (easyWave, fio) have a fairly low success rate on newidle balance, but still benefit greatly from having it anyway. Luckliky these workloads have a faily low newidle rate, so the cost if doing the newidle is relatively low, even if unsuccessfull. Add a simple rate based part to the newidle ratio compute, such that low rate newidle will still have a high newidle ratio. This cures the easyWave and fio workloads while not affecting the schbench numbers either (which have a very high newidle rate). Reported-by: Mario Roy Reported-by: "Mohamed Abuelfotoh, Hazem" Signed-off-by: Peter Zijlstra (Intel) Tested-by: Mario Roy Tested-by: "Mohamed Abuelfotoh, Hazem" Link: https://patch.msgid.link/20260127151748.GA1079264@noisy.programming.kicks-ass.net --- include/linux/sched/topology.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 45c0022b91ce..a1e1032426dc 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -95,6 +95,7 @@ struct sched_domain { unsigned int newidle_call; unsigned int newidle_success; unsigned int newidle_ratio; + u64 newidle_stamp; u64 max_newidle_lb_cost; unsigned long last_decay_max_lb_cost; -- cgit v1.2.3 From be6d4c9e9d714ebbf358be41332726a0f94b9ffa Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sat, 31 Jan 2026 07:34:16 +0200 Subject: dma-buf: Add dma_buf_attach_revocable() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some exporters need a flow to synchronously revoke access to the DMA-buf by importers. Once revoke is completed the importer is not permitted to touch the memory otherwise they may get IOMMU faults, AERs, or worse. DMA-buf today defines a revoke flow, for both pinned and dynamic importers, which is broadly: dma_resv_lock(dmabuf->resv, NULL); // Prevent new mappings from being established priv->revoked = true; // Tell all importers to eventually unmap dma_buf_invalidate_mappings(dmabuf); // Wait for any inprogress fences on the old mapping dma_resv_wait_timeout(dmabuf->resv, DMA_RESV_USAGE_BOOKKEEP, false, MAX_SCHEDULE_TIMEOUT); dma_resv_unlock(dmabuf->resv, NULL); // Wait for all importers to complete unmap wait_for_completion(&priv->unmapped_comp); This works well, and an importer that continues to access the DMA-buf after unmapping it is very buggy. However, the final wait for unmap is effectively unbounded. Several importers do not support invalidate_mappings() at all and won't unmap until userspace triggers it. This unbounded wait is not suitable for exporters like VFIO and RDMA tha need to issue revoke as part of their normal operations. Add dma_buf_attach_revocable() to allow exporters to determine the difference between importers that can complete the above in bounded time, and those that can't. It can be called inside the exporter's attach op to reject incompatible importers. Document these details about how dma_buf_invalidate_mappings() works and what the required sequence is to achieve a full revocation. Signed-off-by: Leon Romanovsky Reviewed-by: Christian König Signed-off-by: Christian König Link: https://lore.kernel.org/r/20260131-dmabuf-revoke-v7-6-463d956bd527@nvidia.com --- include/linux/dma-buf.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index e744b8f9bfad..166933b82e27 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -456,12 +456,8 @@ struct dma_buf_attach_ops { * called with this lock held as well. This makes sure that no mapping * is created concurrently with an ongoing move operation. * - * Mappings stay valid and are not directly affected by this callback. - * But the DMA-buf can now be in a different physical location, so all - * mappings should be destroyed and re-created as soon as possible. - * - * New mappings can be created after this callback returns, and will - * point to the new location of the DMA-buf. + * See the kdoc for dma_buf_invalidate_mappings() for details on the + * required behavior. */ void (*invalidate_mappings)(struct dma_buf_attachment *attach); }; @@ -579,6 +575,7 @@ struct sg_table *dma_buf_map_attachment(struct dma_buf_attachment *, void dma_buf_unmap_attachment(struct dma_buf_attachment *, struct sg_table *, enum dma_data_direction); void dma_buf_invalidate_mappings(struct dma_buf *dma_buf); +bool dma_buf_attach_revocable(struct dma_buf_attachment *attach); int dma_buf_begin_cpu_access(struct dma_buf *dma_buf, enum dma_data_direction dir); int dma_buf_end_cpu_access(struct dma_buf *dma_buf, -- cgit v1.2.3 From ebf1ccff79c43f860cbd2f9d6cfab9a462d0cb2d Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 18 Feb 2026 09:32:17 +0100 Subject: sched_ext: Fix ops.dequeue() semantics Currently, ops.dequeue() is only invoked when the sched_ext core knows that a task resides in BPF-managed data structures, which causes it to miss scheduling property change events. In addition, ops.dequeue() callbacks are completely skipped when tasks are dispatched to non-local DSQs from ops.select_cpu(). As a result, BPF schedulers cannot reliably track task state. Fix this by guaranteeing that each task entering the BPF scheduler's custody triggers exactly one ops.dequeue() call when it leaves that custody, whether the exit is due to a dispatch (regular or via a core scheduling pick) or to a scheduling property change (e.g. sched_setaffinity(), sched_setscheduler(), set_user_nice(), NUMA balancing, etc.). BPF scheduler custody concept: a task is considered to be in the BPF scheduler's custody when the scheduler is responsible for managing its lifecycle. This includes tasks dispatched to user-created DSQs or stored in the BPF scheduler's internal data structures from ops.enqueue(). Custody ends when the task is dispatched to a terminal DSQ (such as the local DSQ or %SCX_DSQ_GLOBAL), selected by core scheduling, or removed due to a property change. Tasks directly dispatched to terminal DSQs bypass the BPF scheduler entirely and are never in its custody. Terminal DSQs include: - Local DSQs (%SCX_DSQ_LOCAL or %SCX_DSQ_LOCAL_ON): per-CPU queues where tasks go directly to execution. - Global DSQ (%SCX_DSQ_GLOBAL): the built-in fallback queue where the BPF scheduler is considered "done" with the task. As a result, ops.dequeue() is not invoked for tasks directly dispatched to terminal DSQs. To identify dequeues triggered by scheduling property changes, introduce the new ops.dequeue() flag %SCX_DEQ_SCHED_CHANGE: when this flag is set, the dequeue was caused by a scheduling property change. New ops.dequeue() semantics: - ops.dequeue() is invoked exactly once when the task leaves the BPF scheduler's custody, in one of the following cases: a) regular dispatch: a task dispatched to a user DSQ or stored in internal BPF data structures is moved to a terminal DSQ (ops.dequeue() called without any special flags set), b) core scheduling dispatch: core-sched picks task before dispatch (ops.dequeue() called with %SCX_DEQ_CORE_SCHED_EXEC flag set), c) property change: task properties modified before dispatch, (ops.dequeue() called with %SCX_DEQ_SCHED_CHANGE flag set). This allows BPF schedulers to: - reliably track task ownership and lifecycle, - maintain accurate accounting of managed tasks, - update internal state when tasks change properties. Cc: Tejun Heo Cc: Emil Tsalapatis Cc: Kuba Piecuch Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index bcb962d5ee7d..4601e5ecb43c 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -84,6 +84,7 @@ struct scx_dispatch_q { /* scx_entity.flags */ enum scx_ent_flags { SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */ + SCX_TASK_IN_CUSTODY = 1 << 1, /* in custody, needs ops.dequeue() when leaving */ SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */ SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ -- cgit v1.2.3 From baff45179e90276a14acb9dffce17ff517708453 Mon Sep 17 00:00:00 2001 From: Jishnu Prakash Date: Fri, 30 Jan 2026 17:24:20 +0530 Subject: iio: adc: Add support for QCOM PMIC5 Gen3 ADC The ADC architecture on PMIC5 Gen3 is similar to that on PMIC5 Gen2, with all SW communication to ADC going through PMK8550 which communicates with other PMICs through PBS. One major difference is that the register interface used here is that of an SDAM (Shared Direct Access Memory) peripheral present on PMK8550. There may be more than one SDAM used for ADC5 Gen3 and each has eight channels, which may be used for either immediate reads (same functionality as previous PMIC5 and PMIC5 Gen2 ADC peripherals) or recurring measurements (same as ADC_TM functionality). By convention, we reserve the first channel of the first SDAM for all immediate reads and use the remaining channels across all SDAMs for ADC_TM monitoring functionality. Add support for PMIC5 Gen3 ADC driver for immediate read functionality. ADC_TM is implemented as an auxiliary thermal driver under this ADC driver. Signed-off-by: Jishnu Prakash Signed-off-by: Jonathan Cameron --- include/linux/iio/adc/qcom-adc5-gen3-common.h | 211 ++++++++++++++++++++++++++ 1 file changed, 211 insertions(+) create mode 100644 include/linux/iio/adc/qcom-adc5-gen3-common.h (limited to 'include/linux') diff --git a/include/linux/iio/adc/qcom-adc5-gen3-common.h b/include/linux/iio/adc/qcom-adc5-gen3-common.h new file mode 100644 index 000000000000..6303eaa6640b --- /dev/null +++ b/include/linux/iio/adc/qcom-adc5-gen3-common.h @@ -0,0 +1,211 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + * + * Code used in the main and auxiliary Qualcomm PMIC voltage ADCs + * of type ADC5 Gen3. + */ + +#ifndef QCOM_ADC5_GEN3_COMMON_H +#define QCOM_ADC5_GEN3_COMMON_H + +#include +#include +#include +#include +#include +#include +#include + +#define ADC5_GEN3_HS 0x45 +#define ADC5_GEN3_HS_BUSY BIT(7) +#define ADC5_GEN3_HS_READY BIT(0) + +#define ADC5_GEN3_STATUS1 0x46 +#define ADC5_GEN3_STATUS1_CONV_FAULT BIT(7) +#define ADC5_GEN3_STATUS1_THR_CROSS BIT(6) +#define ADC5_GEN3_STATUS1_EOC BIT(0) + +#define ADC5_GEN3_TM_EN_STS 0x47 +#define ADC5_GEN3_TM_HIGH_STS 0x48 +#define ADC5_GEN3_TM_LOW_STS 0x49 + +#define ADC5_GEN3_EOC_STS 0x4a +#define ADC5_GEN3_EOC_CHAN_0 BIT(0) + +#define ADC5_GEN3_EOC_CLR 0x4b +#define ADC5_GEN3_TM_HIGH_STS_CLR 0x4c +#define ADC5_GEN3_TM_LOW_STS_CLR 0x4d +#define ADC5_GEN3_CONV_ERR_CLR 0x4e +#define ADC5_GEN3_CONV_ERR_CLR_REQ BIT(0) + +#define ADC5_GEN3_SID 0x4f +#define ADC5_GEN3_SID_MASK GENMASK(3, 0) + +#define ADC5_GEN3_PERPH_CH 0x50 +#define ADC5_GEN3_CHAN_CONV_REQ BIT(7) + +#define ADC5_GEN3_TIMER_SEL 0x51 +#define ADC5_GEN3_TIME_IMMEDIATE 0x1 + +#define ADC5_GEN3_DIG_PARAM 0x52 +#define ADC5_GEN3_DIG_PARAM_CAL_SEL_MASK GENMASK(5, 4) +#define ADC5_GEN3_DIG_PARAM_DEC_RATIO_SEL_MASK GENMASK(3, 2) + +#define ADC5_GEN3_FAST_AVG 0x53 +#define ADC5_GEN3_FAST_AVG_CTL_EN BIT(7) +#define ADC5_GEN3_FAST_AVG_CTL_SAMPLES_MASK GENMASK(2, 0) + +#define ADC5_GEN3_ADC_CH_SEL_CTL 0x54 +#define ADC5_GEN3_DELAY_CTL 0x55 +#define ADC5_GEN3_HW_SETTLE_DELAY_MASK GENMASK(3, 0) + +#define ADC5_GEN3_CH_EN 0x56 +#define ADC5_GEN3_HIGH_THR_INT_EN BIT(1) +#define ADC5_GEN3_LOW_THR_INT_EN BIT(0) + +#define ADC5_GEN3_LOW_THR0 0x57 +#define ADC5_GEN3_LOW_THR1 0x58 +#define ADC5_GEN3_HIGH_THR0 0x59 +#define ADC5_GEN3_HIGH_THR1 0x5a + +#define ADC5_GEN3_CH_DATA0(channel) (0x5c + (channel) * 2) +#define ADC5_GEN3_CH_DATA1(channel) (0x5d + (channel) * 2) + +#define ADC5_GEN3_CONV_REQ 0xe5 +#define ADC5_GEN3_CONV_REQ_REQ BIT(0) + +#define ADC5_GEN3_VIRTUAL_SID_MASK GENMASK(15, 8) +#define ADC5_GEN3_CHANNEL_MASK GENMASK(7, 0) +#define ADC5_GEN3_V_CHAN(x) \ + (FIELD_PREP(ADC5_GEN3_VIRTUAL_SID_MASK, (x).sid) | (x).channel) + +/* ADC channels for PMIC5 Gen3 */ +#define ADC5_GEN3_REF_GND 0x00 +#define ADC5_GEN3_1P25VREF 0x01 +#define ADC5_GEN3_DIE_TEMP 0x03 +#define ADC5_GEN3_USB_SNS_V_16 0x11 +#define ADC5_GEN3_VIN_DIV16_MUX 0x12 +#define ADC5_GEN3_VPH_PWR 0x8e +#define ADC5_GEN3_VBAT_SNS_QBG 0x8f +/* 100k pull-up channels */ +#define ADC5_GEN3_AMUX1_THM_100K_PU 0x44 +#define ADC5_GEN3_AMUX2_THM_100K_PU 0x45 +#define ADC5_GEN3_AMUX3_THM_100K_PU 0x46 +#define ADC5_GEN3_AMUX4_THM_100K_PU 0x47 +#define ADC5_GEN3_AMUX5_THM_100K_PU 0x48 +#define ADC5_GEN3_AMUX6_THM_100K_PU 0x49 +#define ADC5_GEN3_AMUX1_GPIO_100K_PU 0x4a +#define ADC5_GEN3_AMUX2_GPIO_100K_PU 0x4b +#define ADC5_GEN3_AMUX3_GPIO_100K_PU 0x4c +#define ADC5_GEN3_AMUX4_GPIO_100K_PU 0x4d + +#define ADC5_MAX_CHANNEL 0xc0 + +enum adc5_cal_method { + ADC5_NO_CAL = 0, + ADC5_RATIOMETRIC_CAL, + ADC5_ABSOLUTE_CAL, +}; + +enum adc5_time_select { + MEAS_INT_DISABLE = 0, + MEAS_INT_IMMEDIATE, + MEAS_INT_50MS, + MEAS_INT_100MS, + MEAS_INT_1S, + MEAS_INT_NONE, +}; + +/** + * struct adc5_sdam_data - data per SDAM allocated for adc usage + * @base_addr: base address for the ADC SDAM peripheral. + * @irq_name: ADC IRQ name. + * @irq: ADC IRQ number. + */ +struct adc5_sdam_data { + u16 base_addr; + const char *irq_name; + int irq; +}; + +/** + * struct adc5_device_data - Top-level ADC device data + * @regmap: ADC peripheral register map field. + * @base: array of SDAM data. + * @num_sdams: number of ADC SDAM peripherals. + */ +struct adc5_device_data { + struct regmap *regmap; + struct adc5_sdam_data *base; + int num_sdams; +}; + +/** + * struct adc5_channel_common_prop - ADC channel properties (common to ADC and TM). + * @channel: channel number, refer to the channel list. + * @cal_method: calibration method. + * @decimation: sampling rate supported for the channel. + * @sid: ID of PMIC owning the channel. + * @label: Channel name used in device tree. + * @prescale: channel scaling performed on the input signal. + * @hw_settle_time_us: the time between AMUX being configured and the + * start of conversion in uS. + * @avg_samples: ability to provide single result from the ADC + * that is an average of multiple measurements. + * @scale_fn_type: Represents the scaling function to convert voltage + * physical units desired by the client for the channel. + */ +struct adc5_channel_common_prop { + unsigned int channel; + enum adc5_cal_method cal_method; + unsigned int decimation; + unsigned int sid; + const char *label; + unsigned int prescale; + unsigned int hw_settle_time_us; + unsigned int avg_samples; + enum vadc_scale_fn_type scale_fn_type; +}; + +/** + * struct tm5_aux_dev_wrapper - wrapper structure around TM auxiliary device + * @aux_dev: TM auxiliary device structure. + * @dev_data: Top-level ADC device data. + * @tm_props: Array of common ADC channel properties for TM channels. + * @n_tm_channels: number of TM channels. + */ +struct tm5_aux_dev_wrapper { + struct auxiliary_device aux_dev; + struct adc5_device_data *dev_data; + struct adc5_channel_common_prop *tm_props; + unsigned int n_tm_channels; +}; + +int adc5_gen3_read(struct adc5_device_data *adc, unsigned int sdam_index, + u16 offset, u8 *data, int len); + +int adc5_gen3_write(struct adc5_device_data *adc, unsigned int sdam_index, + u16 offset, u8 *data, int len); + +int adc5_gen3_poll_wait_hs(struct adc5_device_data *adc, + unsigned int sdam_index); + +void adc5_gen3_update_dig_param(struct adc5_channel_common_prop *prop, + u8 *data); + +int adc5_gen3_status_clear(struct adc5_device_data *adc, + int sdam_index, u16 offset, u8 *val, int len); + +void adc5_gen3_mutex_lock(struct device *dev); +void adc5_gen3_mutex_unlock(struct device *dev); +int adc5_gen3_get_scaled_reading(struct device *dev, + struct adc5_channel_common_prop *common_props, + int *val); +int adc5_gen3_therm_code_to_temp(struct device *dev, + struct adc5_channel_common_prop *common_props, + u16 code, int *val); +void adc5_gen3_register_tm_event_notifier(struct device *dev, + void (*handler)(struct auxiliary_device *)); + +#endif /* QCOM_ADC5_GEN3_COMMON_H */ -- cgit v1.2.3 From 3d35d41169d000f4fbf3c23999b8443e1173efce Mon Sep 17 00:00:00 2001 From: Fabio Baltieri Date: Mon, 23 Feb 2026 13:25:55 -0800 Subject: Input: export input_default_setkeycode Export input_default_setkeycode so that a driver can set a custom setkeycode handler to take some driver specific action but still call the default handler at some point. Signed-off-by: Fabio Baltieri Reviewed-by: Tzung-Bi Shih Link: https://patch.msgid.link/20260222003717.471977-1-dmitry.torokhov@gmail.com Signed-off-by: Dmitry Torokhov --- include/linux/input.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/input.h b/include/linux/input.h index 7d7cb0593a63..06ca62328db1 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -517,6 +517,10 @@ INPUT_GENERATE_ABS_ACCESSORS(res, resolution) int input_scancode_to_scalar(const struct input_keymap_entry *ke, unsigned int *scancode); +int input_default_setkeycode(struct input_dev *dev, + const struct input_keymap_entry *ke, + unsigned int *old_keycode); + int input_get_keycode(struct input_dev *dev, struct input_keymap_entry *ke); int input_set_keycode(struct input_dev *dev, const struct input_keymap_entry *ke); -- cgit v1.2.3 From 4ced4cf5c9d172d91f181df3accdf949d3761aab Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 17 Feb 2026 18:01:05 +0000 Subject: binfmt_elf_fdpic: fix AUXV size calculation for ELF_HWCAP3 and ELF_HWCAP4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 4e6e8c2b757f ("binfmt_elf: Wire up AT_HWCAP3 at AT_HWCAP4") added support for AT_HWCAP3 and AT_HWCAP4, but it missed updating the AUX vector size calculation in create_elf_fdpic_tables() and AT_VECTOR_SIZE_BASE in include/linux/auxvec.h. Similar to the fix for AT_HWCAP2 in commit c6a09e342f8e ("binfmt_elf_fdpic: fix AUXV size calculation when ELF_HWCAP2 is defined"), this omission leads to a mismatch between the reserved space and the actual number of AUX entries, eventually triggering a kernel BUG_ON(csp != sp). Fix this by incrementing nitems when ELF_HWCAP3 or ELF_HWCAP4 are defined and updating AT_VECTOR_SIZE_BASE. Cc: Mark Brown Cc: Max Filippov Reviewed-by: Michal Koutný Reviewed-by: Mark Brown Reviewed-by: Cyrill Gorcunov Reviewed-by: Alexander Mikhalitsyn Fixes: 4e6e8c2b757f ("binfmt_elf: Wire up AT_HWCAP3 at AT_HWCAP4") Signed-off-by: Andrei Vagin Link: https://patch.msgid.link/20260217180108.1420024-2-avagin@google.com Signed-off-by: Kees Cook --- include/linux/auxvec.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/auxvec.h b/include/linux/auxvec.h index 407f7005e6d6..8bcb9b726262 100644 --- a/include/linux/auxvec.h +++ b/include/linux/auxvec.h @@ -4,6 +4,6 @@ #include -#define AT_VECTOR_SIZE_BASE 22 /* NEW_AUX_ENT entries in auxiliary table */ +#define AT_VECTOR_SIZE_BASE 24 /* NEW_AUX_ENT entries in auxiliary table */ /* number of "#define AT_.*" above, minus {AT_NULL, AT_IGNORE, AT_NOTELF} */ #endif /* _LINUX_AUXVEC_H */ -- cgit v1.2.3 From fa4f81a8c15d4018eb2053b093bf1584777e80d4 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 19 Feb 2026 10:47:03 +0900 Subject: ata: libata-scsi: make ata_scsi_simulate() static ata_scsi_simulate() is called only from libata-scsi.c. Move this function definition as a static function before its call in __ata_scsi_queuecmd() and remove its declaration from include/linux/libata.h. No functional changes. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke --- include/linux/libata.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 00346ce3af5e..db87c99e4189 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1205,7 +1205,6 @@ extern unsigned int ata_do_dev_read_id(struct ata_device *dev, struct ata_taskfile *tf, __le16 *id); extern void ata_qc_complete(struct ata_queued_cmd *qc); extern u64 ata_qc_get_active(struct ata_port *ap); -extern void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd); extern int ata_std_bios_param(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]); -- cgit v1.2.3 From ecfa23b486b22844855844202424bc1966cebb33 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Tue, 17 Feb 2026 12:26:15 +0100 Subject: jiffies: Remove unused __jiffy_arch_data The __jiffy_arch_data definition was added in 2017 by commit 60b0a8c3d248 ("frv: declare jiffies to be located in the .data section") for the needs of the frv port. The frv support was removed in 2018 by commit fd8773f9f544 ("arch: remove frv port") and no other architecture has required __jiffy_arch_data. Therefore, remove this unused definition. Signed-off-by: Petr Pavlu Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260217112638.1525094-1-petr.pavlu@suse.com --- include/linux/jiffies.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index fdef2c155c27..1a393d160420 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -67,10 +67,6 @@ extern void register_refined_jiffies(long clock_tick_rate); /* USER_TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ #define USER_TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ) -#ifndef __jiffy_arch_data -#define __jiffy_arch_data -#endif - /* * The 64-bit value is not atomic on 32-bit systems - you MUST NOT read it * without sampling the sequence number in jiffies_lock. @@ -83,7 +79,7 @@ extern void register_refined_jiffies(long clock_tick_rate); * See arch/ARCH/kernel/vmlinux.lds.S */ extern u64 __cacheline_aligned_in_smp jiffies_64; -extern unsigned long volatile __cacheline_aligned_in_smp __jiffy_arch_data jiffies; +extern unsigned long volatile __cacheline_aligned_in_smp jiffies; #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void); -- cgit v1.2.3 From 38ab6557234d8629407a824be90e82514d6129a0 Mon Sep 17 00:00:00 2001 From: Sander Vanheule Date: Fri, 20 Feb 2026 17:01:11 +0100 Subject: regmap: sort header includes Sort the included headers to make spotting duplicates easier and avoid discussions on where to add new includes. Signed-off-by: Sander Vanheule Link: https://patch.msgid.link/20260220160112.543391-1-sander@svanheule.net Signed-off-by: Mark Brown --- include/linux/regmap.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index caff2240bdab..c8a6a05bdba1 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -10,15 +10,15 @@ * Author: Mark Brown */ -#include -#include -#include +#include #include #include -#include -#include -#include #include +#include +#include +#include +#include +#include struct module; struct clk; -- cgit v1.2.3 From 37983fad7f3ef296fa0504c8e945987459dc5487 Mon Sep 17 00:00:00 2001 From: Sander Vanheule Date: Fri, 20 Feb 2026 17:01:12 +0100 Subject: regmap: define cleanup helper for regmap_field For temporary field allocation, the user has to perform manual cleanup, or rely on devm_regmap_field_alloc() to (eventually) clean up the allocated resources when an error occurs. Add a cleanup helper that takes care of freeing the allocated regmap_field whenever it goes out of scope. This can simplify this example: struct regmap_field *field = regmap_field_alloc(...); if (IS_ERR(field)) return PTR_ERR(field); int err = regmap_field_read(...); if (err) goto out; /* some logic that may also error */ err = regmap_field_write(...); out: regmap_field_free(field); return err; into the shorter: struct regmap_field *field __free(regmap_field) = regmap_field_alloc(...); if (IS_ERR(field)) return PTR_ERR(field); int err = regmap_field_read(...); if (err) return err; /* some logic that may also error */ return regmap_field_write(...); Signed-off-by: Sander Vanheule Link: https://patch.msgid.link/20260220160112.543391-2-sander@svanheule.net Signed-off-by: Mark Brown --- include/linux/regmap.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index c8a6a05bdba1..f1c5cb63c171 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -11,6 +11,7 @@ */ #include +#include #include #include #include @@ -1460,6 +1461,8 @@ struct regmap_field *regmap_field_alloc(struct regmap *regmap, struct reg_field reg_field); void regmap_field_free(struct regmap_field *field); +DEFINE_FREE(regmap_field, struct regmap_field *, if (_T) regmap_field_free(_T)) + struct regmap_field *devm_regmap_field_alloc(struct device *dev, struct regmap *regmap, struct reg_field reg_field); void devm_regmap_field_free(struct device *dev, struct regmap_field *field); -- cgit v1.2.3 From aa8671af0c380c15989b88325a2d5a6c5341771d Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 24 Feb 2026 12:10:43 +0100 Subject: PCI/PTM: Drop pci_enable_ptm() granularity parameter No pci_enable_ptm() callers supply the "granularity" pointer where the clock granularity would be returned. Drop the unused pci_enable_ptm() parameter. Signed-off-by: Mika Westerberg [bhelgaas: commit log] Signed-off-by: Bjorn Helgaas Reviewed-by: Andy Shevchenko Link: https://patch.msgid.link/20260224111044.3487873-5-mika.westerberg@linux.intel.com --- include/linux/pci.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 1c270f1d5123..8aaa72dcb164 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1975,11 +1975,11 @@ struct pci_ptm_debugfs { }; #ifdef CONFIG_PCIE_PTM -int pci_enable_ptm(struct pci_dev *dev, u8 *granularity); +int pci_enable_ptm(struct pci_dev *dev); void pci_disable_ptm(struct pci_dev *dev); bool pcie_ptm_enabled(struct pci_dev *dev); #else -static inline int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) +static inline int pci_enable_ptm(struct pci_dev *dev) { return -EINVAL; } static inline void pci_disable_ptm(struct pci_dev *dev) { } static inline bool pcie_ptm_enabled(struct pci_dev *dev) -- cgit v1.2.3 From e02902dd493bf9c9b05353c761737ac514ad7a5c Mon Sep 17 00:00:00 2001 From: Antoniu Miclaus Date: Mon, 23 Feb 2026 18:21:01 +0200 Subject: spi: add devm_spi_new_ancillary_device() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a devres-managed version of spi_new_ancillary_device() that automatically unregisters the ancillary SPI device when the parent device is removed. This follows the same devm_add_action_or_reset() pattern used by the other managed SPI functions (devm_spi_optimize_message, devm_spi_register_controller, etc.) and eliminates the need for drivers to open-code their own devm cleanup callbacks for ancillary devices. Acked-by: Nuno Sá Signed-off-by: Antoniu Miclaus Link: https://patch.msgid.link/20260223162110.156746-3-antoniu.miclaus@analog.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index af7cfee7b8f6..1c9aab627583 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -387,6 +387,7 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv) } extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 chip_select); +extern struct spi_device *devm_spi_new_ancillary_device(struct spi_device *spi, u8 chip_select); /* Use a define to avoid include chaining to get THIS_MODULE */ #define spi_register_driver(driver) \ -- cgit v1.2.3 From 477174ac35c510d0ed3043f5bd4fba25546a21ce Mon Sep 17 00:00:00 2001 From: David Carlier Date: Tue, 24 Feb 2026 05:56:37 +0000 Subject: sched_ext: Optimize sched_ext_entity layout for cache locality Reorder struct sched_ext_entity to place ops_state, ddsp_dsq_id, and ddsp_enq_flags immediately after dsq. These fields are accessed together in the do_enqueue_task() and finish_dispatch() hot paths but were previously spread across three different cache lines. Grouping them on the same cache line reduces cache misses on every enqueue and dispatch operation. Signed-off-by: David Carlier Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 4601e5ecb43c..0150b3fe6230 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -163,6 +163,9 @@ struct scx_dsq_list_node { */ struct sched_ext_entity { struct scx_dispatch_q *dsq; + atomic_long_t ops_state; + u64 ddsp_dsq_id; + u64 ddsp_enq_flags; struct scx_dsq_list_node dsq_list; /* dispatch order */ struct rb_node dsq_priq; /* p->scx.dsq_vtime order */ u32 dsq_seq; @@ -174,7 +177,6 @@ struct sched_ext_entity { s32 selected_cpu; u32 kf_mask; /* see scx_kf_mask above */ struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */ - atomic_long_t ops_state; struct list_head runnable_node; /* rq->scx.runnable_list */ unsigned long runnable_at; @@ -182,8 +184,6 @@ struct sched_ext_entity { #ifdef CONFIG_SCHED_CORE u64 core_sched_at; /* see scx_prio_less() */ #endif - u64 ddsp_dsq_id; - u64 ddsp_enq_flags; /* BPF scheduler modifiable fields */ -- cgit v1.2.3 From cd0aa651535092afd5d776bfe94e4fdf750f89c3 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 23 Feb 2026 09:34:41 +0000 Subject: net: stmmac: pass interface mode into fix_mac_speed() method Pass the current interface mode reported by phylink into the fix_mac_speed() method. This will be used by qcom-ethqos for its "SGMII" configuration. Reviewed-by: Maxime Chevallier Tested-by: Mohd Ayaan Anwar Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1vuSKv-0000000AScG-1zv6@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 32352a216567..b96ae9dadfab 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -256,7 +256,8 @@ struct plat_stmmacenet_data { int (*set_phy_intf_sel)(void *priv, u8 phy_intf_sel); int (*set_clk_tx_rate)(void *priv, struct clk *clk_tx_i, phy_interface_t interface, int speed); - void (*fix_mac_speed)(void *priv, int speed, unsigned int mode); + void (*fix_mac_speed)(void *priv, phy_interface_t interface, + int speed, unsigned int mode); int (*fix_soc_reset)(struct stmmac_priv *priv); int (*serdes_powerup)(struct net_device *ndev, void *priv); void (*serdes_powerdown)(struct net_device *ndev, void *priv); -- cgit v1.2.3 From 46a9d97069cab311738c950d0fcef85a459c7b8f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 24 Feb 2026 11:06:38 +0900 Subject: ata: libata-eh: avoid unnecessary calls to ata_scsi_port_error_handler() When handling SCSI command timeouts, if we had no actual command timeouts (either because the command was a deferred qc or the completion path won the race with ata_scsi_cmd_error_handler()), we do not need to go through a port error handling, as there was in fact no errors at all. Modify ata_scsi_cmd_error_handler() to return the number of commands that timed out and use this return value in ata_scsi_error() to call ata_scsi_port_error_handler() only if we had command timeouts, or if the port EH has already been scheduled due to failed commands. Otherwise, simply call scsi_eh_flush_done_q() to finish the completed commands without running the full port error handling. Signed-off-by: Damien Le Moal Reviewed-by: Martin K. Petersen Reviewed-by: Niklas Cassel Signed-off-by: Niklas Cassel --- include/linux/libata.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index db87c99e4189..5c085ef4eda7 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1225,7 +1225,8 @@ extern int ata_ncq_prio_enable(struct ata_port *ap, struct scsi_device *sdev, extern struct ata_device *ata_dev_pair(struct ata_device *adev); int ata_set_mode(struct ata_link *link, struct ata_device **r_failed_dev); extern void ata_scsi_port_error_handler(struct Scsi_Host *host, struct ata_port *ap); -extern void ata_scsi_cmd_error_handler(struct Scsi_Host *host, struct ata_port *ap, struct list_head *eh_q); +int ata_scsi_cmd_error_handler(struct Scsi_Host *host, struct ata_port *ap, + struct list_head *eh_q); /* * SATA specific code - drivers/ata/libata-sata.c -- cgit v1.2.3 From d9d5e1bdd18074ea27985c777ddc3a8a0b007468 Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Mon, 16 Feb 2026 00:22:16 +0900 Subject: dmaengine: dw-edma: Add virtual IRQ for interrupt-emulation doorbells Interrupt emulation can assert the dw-edma IRQ line without updating the DONE/ABORT bits. With the shared read/write/common IRQ handlers, the driver cannot reliably distinguish such an emulated interrupt from a real one and leaving a level IRQ asserted may wedge the line. Allocate a dedicated, requestable Linux virtual IRQ (db_irq) for interrupt emulation and attach an irq_chip whose .irq_ack runs the core-specific deassert sequence (.ack_emulated_irq()). The physical dw-edma interrupt handlers raise this virtual IRQ via generic_handle_irq(), ensuring emulated IRQs are always deasserted. Export the virtual IRQ number (db_irq) and the doorbell register offset (db_offset) via struct dw_edma_chip so platform users can expose interrupt emulation as a doorbell. Without this, a single interrupt-emulation write can leave the level IRQ line asserted and cause the generic IRQ layer to disable it. Signed-off-by: Koichiro Den Reviewed-by: Frank Li Link: https://patch.msgid.link/20260215152216.3393561-3-den@valinux.co.jp Signed-off-by: Vinod Koul --- include/linux/dma/edma.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma/edma.h b/include/linux/dma/edma.h index 270b5458aecf..9da53c75e49b 100644 --- a/include/linux/dma/edma.h +++ b/include/linux/dma/edma.h @@ -73,6 +73,8 @@ enum dw_edma_chip_flags { * @ll_region_rd: DMA descriptor link list memory for read channel * @dt_region_wr: DMA data memory for write channel * @dt_region_rd: DMA data memory for read channel + * @db_irq: Virtual IRQ dedicated to interrupt emulation + * @db_offset: Offset from DMA register base * @mf: DMA register map format * @dw: struct dw_edma that is filled by dw_edma_probe() */ @@ -94,6 +96,10 @@ struct dw_edma_chip { struct dw_edma_region dt_region_wr[EDMA_MAX_WR_CH]; struct dw_edma_region dt_region_rd[EDMA_MAX_RD_CH]; + /* interrupt emulation */ + int db_irq; + resource_size_t db_offset; + enum dw_edma_map_format mf; struct dw_edma *dw; -- cgit v1.2.3 From 0289ada4a31661016a0611a41a4886bb958e9985 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Mon, 9 Feb 2026 12:44:33 +0000 Subject: coresight: Fix memory leak in coresight_alloc_device_name() The memory leak detector reports: echo clear > /sys/kernel/debug/kmemleak modprobe coresight_funnel rmmod coresight_funnel # Scan memory leak and report it echo scan > /sys/kernel/debug/kmemleak cat /sys/kernel/debug/kmemleak unreferenced object 0xffff0008020c7200 (size 64): comm "modprobe", pid 410, jiffies 4295333721 hex dump (first 32 bytes): d8 da fe 7e 09 00 ff ff e8 2e ff 7e 09 00 ff ff ...~.......~.... b0 6c ff 7e 09 00 ff ff 30 83 00 7f 09 00 ff ff .l.~....0....... backtrace (crc 4116a690): kmemleak_alloc+0xd8/0xf8 __kmalloc_node_track_caller_noprof+0x2c8/0x6f0 krealloc_node_align_noprof+0x13c/0x2c8 coresight_alloc_device_name+0xe4/0x158 [coresight] 0xffffd327ecef8394 0xffffd327ecef85ec amba_probe+0x118/0x1c8 really_probe+0xc8/0x3f0 __driver_probe_device+0x88/0x190 driver_probe_device+0x44/0x120 __driver_attach+0x100/0x238 bus_for_each_dev+0x84/0xf0 driver_attach+0x2c/0x40 bus_add_driver+0x128/0x258 driver_register+0x64/0x138 __amba_driver_register+0x2c/0x48 The memory leak is caused by not freeing the device list that maintains device indices. This device list preserves stable device indices across unbind and rebind device operations, so it does not share the same lifetime as a device instances and must only be freed when the module is unloaded. Some modules do not implement a module exit callback because they are registered using module_platform_driver(). As a result, the device list cannot be released during module exit for those modules. Fix this by moving the device list into the core layer. As a general solution, instead of maintaining a static list in each driver, drivers now allocate device lists via coresight_allocate_device_list() and device indices via coresight_allocate_device_idx(). The list is released only when the core module is unloaded by calling coresight_release_device_list(), avoiding the leak. Fixes: 0f5f9b6ba9e1 ("coresight: Use platform agnostic names") Reviewed-by: James Clark Signed-off-by: Leo Yan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20260209-arm_coresight_refactor_dev_register-v4-1-62d6042f76f7@arm.com --- include/linux/coresight.h | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 2b48be97fcd0..2131febebee9 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -306,24 +306,19 @@ struct coresight_device { * coresight_dev_list - Mapping for devices to "name" index for device * names. * + * @node: Node on the global device index list. * @nr_idx: Number of entries already allocated. * @pfx: Prefix pattern for device name. * @fwnode_list: Array of fwnode_handles associated with each allocated * index, upto nr_idx entries. */ struct coresight_dev_list { + struct list_head node; int nr_idx; - const char *pfx; + char *pfx; struct fwnode_handle **fwnode_list; }; -#define DEFINE_CORESIGHT_DEVLIST(var, dev_pfx) \ -static struct coresight_dev_list (var) = { \ - .pfx = dev_pfx, \ - .nr_idx = 0, \ - .fwnode_list = NULL, \ -} - #define to_coresight_device(d) container_of(d, struct coresight_device, dev) /** @@ -663,8 +658,7 @@ void coresight_clear_self_claim_tag(struct csdev_access *csa); void coresight_clear_self_claim_tag_unlocked(struct csdev_access *csa); void coresight_disclaim_device(struct coresight_device *csdev); void coresight_disclaim_device_unlocked(struct coresight_device *csdev); -char *coresight_alloc_device_name(struct coresight_dev_list *devs, - struct device *dev); +char *coresight_alloc_device_name(const char *prefix, struct device *dev); bool coresight_loses_context_with_cpu(struct device *dev); -- cgit v1.2.3 From 59509da0cb51dc48e4edc57d7d3ef1d424c58fc9 Mon Sep 17 00:00:00 2001 From: Amit Kumar Mahapatra Date: Wed, 4 Feb 2026 09:32:17 +0100 Subject: mtd: Move struct mtd_concat definition to header file To enable a more generic approach for concatenating MTD devices, struct mtd_concat should be accessible beyond the mtdconcat driver. Therefore, the definition is being moved to a header file. Signed-off-by: Amit Kumar Mahapatra Signed-off-by: Luca Ceresoli Signed-off-by: Miquel Raynal --- include/linux/mtd/concat.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/concat.h b/include/linux/mtd/concat.h index d6f653e07426..b42d9af87c4e 100644 --- a/include/linux/mtd/concat.h +++ b/include/linux/mtd/concat.h @@ -9,6 +9,18 @@ #define MTD_CONCAT_H +/* + * Our storage structure: + * Subdev points to an array of pointers to struct mtd_info objects + * which is allocated along with this structure + * + */ +struct mtd_concat { + struct mtd_info mtd; + int num_subdev; + struct mtd_info **subdev; +}; + struct mtd_info *mtd_concat_create( struct mtd_info *subdev[], /* subdevices to concatenate */ int num_devs, /* number of subdevices */ -- cgit v1.2.3 From 43db6366fc2de02050e66389f5628d3fdc9af10a Mon Sep 17 00:00:00 2001 From: Amit Kumar Mahapatra Date: Wed, 4 Feb 2026 09:32:18 +0100 Subject: mtd: Add driver for concatenating devices Introducing CONFIG_MTD_VIRT_CONCAT to separate the legacy flow from the new approach, where only the concatenated partition is registered as an MTD device, while the individual partitions that form it are not registered independently, as they are typically not required by the user. CONFIG_MTD_VIRT_CONCAT is a boolean configuration option that depends on CONFIG_MTD_PARTITIONED_MASTER. When enabled, it allows flash nodes to be exposed as individual MTD devices along with the other partitions. The solution focuses on fixed-partitions description only as it depends on device boundaries. It supports multiple sets of concatenated devices, each comprising two or more partitions. flash@0 { reg = <0>; partitions { compatible = "fixed-partitions"; part0@0 { part-concat-next = <&flash0_part1>; label = "part0_0"; reg = <0x0 0x800000>; }; flash0_part1: part1@800000 { label = "part0_1"; reg = <800000 0x800000>; }; part2@1000000 { part-concat-next = <&flash1_part0>; label = "part0_2"; reg = <0x800000 0x800000>; }; }; }; flash@1 { reg = <1>; partitions { compatible = "fixed-partitions"; flash1_part0: part1@0 { label = "part1_0"; reg = <0x0 0x800000>; }; part1@800000 { label = "part1_1"; reg = <0x800000 0x800000>; }; }; }; The partitions that gets created are flash@0 part0_0-part0_1-concat flash@1 part1_1 part0_2-part1_0-concat Suggested-by: Bernhard Frauendienst Suggested-by: Miquel Raynal Signed-off-by: Amit Kumar Mahapatra Signed-off-by: Luca Ceresoli Signed-off-by: Miquel Raynal --- include/linux/mtd/concat.h | 51 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/concat.h b/include/linux/mtd/concat.h index b42d9af87c4e..2cd9d48958a8 100644 --- a/include/linux/mtd/concat.h +++ b/include/linux/mtd/concat.h @@ -28,5 +28,54 @@ struct mtd_info *mtd_concat_create( void mtd_concat_destroy(struct mtd_info *mtd); -#endif +/** + * mtd_virt_concat_node_create - Create a component for concatenation + * + * Returns a positive number representing the no. of devices found for + * concatenation, or a negative error code. + * + * List all the devices for concatenations found in DT and create a + * component for concatenation. + */ +int mtd_virt_concat_node_create(void); + +/** + * mtd_virt_concat_add - add mtd_info object to the list of subdevices for concatenation + * @mtd: pointer to new MTD device info structure + * + * Returns true if the mtd_info object is added successfully else returns false. + * + * The mtd_info object is added to the list of subdevices for concatenation. + * It returns true if a match is found, and false if all subdevices have + * already been added or if the mtd_info object does not match any of the + * intended MTD devices. + */ +bool mtd_virt_concat_add(struct mtd_info *mtd); +/** + * mtd_virt_concat_create_join - Create and register the concatenated MTD device + * + * Returns 0 on succes, or a negative error code. + * + * Creates and registers the concatenated MTD device + */ +int mtd_virt_concat_create_join(void); + +/** + * mtd_virt_concat_destroy - Remove the concat that includes a specific mtd device + * as one of its components. + * @mtd: pointer to MTD device info structure. + * + * Returns 0 on succes, or a negative error code. + * + * If the mtd_info object is part of a concatenated device, all other MTD devices + * within that concat are registered individually. The concatenated device is then + * removed, along with its concatenation component. + * + */ +int mtd_virt_concat_destroy(struct mtd_info *mtd); + +void mtd_virt_concat_destroy_joins(void); +void mtd_virt_concat_destroy_items(void); + +#endif -- cgit v1.2.3 From 43479bb3703f17da6cdfaa2a7f4b93db9c6908bc Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 5 Feb 2026 19:49:15 +0100 Subject: mtd: spinand: Clean the flags section Mention that we are declaring the main SPI NAND flags with a comment. Align the values with tabs. Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 6a024cf1c53a..58abd306ebe3 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -477,8 +477,9 @@ struct spinand_ecc_info { const struct mtd_ooblayout_ops *ooblayout; }; -#define SPINAND_HAS_QE_BIT BIT(0) -#define SPINAND_HAS_CR_FEAT_BIT BIT(1) +/* SPI NAND flags */ +#define SPINAND_HAS_QE_BIT BIT(0) +#define SPINAND_HAS_CR_FEAT_BIT BIT(1) #define SPINAND_HAS_PROG_PLANE_SELECT_BIT BIT(2) #define SPINAND_HAS_READ_PLANE_SELECT_BIT BIT(3) #define SPINAND_NO_RAW_ACCESS BIT(4) -- cgit v1.2.3 From 28aaa9c39945b7925a1cc1d513c8f21ed38f5e4f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 26 Feb 2026 10:43:55 +0100 Subject: kthread: consolidate kthread exit paths to prevent use-after-free Guillaume reported crashes via corrupted RCU callback function pointers during KUnit testing. The crash was traced back to the pidfs rhashtable conversion which replaced the 24-byte rb_node with an 8-byte rhash_head in struct pid, shrinking it from 160 to 144 bytes. struct kthread (without CONFIG_BLK_CGROUP) is also 144 bytes. With CONFIG_SLAB_MERGE_DEFAULT and SLAB_HWCACHE_ALIGN both round up to 192 bytes and share the same slab cache. struct pid.rcu.func and struct kthread.affinity_node both sit at offset 0x78. When a kthread exits via make_task_dead() it bypasses kthread_exit() and misses the affinity_node cleanup. free_kthread_struct() frees the memory while the node is still linked into the global kthread_affinity_list. A subsequent list_del() by another kthread writes through dangling list pointers into the freed and reused memory, corrupting the pid's rcu.func pointer. Instead of patching free_kthread_struct() to handle the missed cleanup, consolidate all kthread exit paths. Turn kthread_exit() into a macro that calls do_exit() and add kthread_do_exit() which is called from do_exit() for any task with PF_KTHREAD set. This guarantees that kthread-specific cleanup always happens regardless of the exit path - make_task_dead(), direct do_exit(), or kthread_exit(). Replace __to_kthread() with a new tsk_is_kthread() accessor in the public header. Export do_exit() since module code using the kthread_exit() macro now needs it directly. Reported-by: Guillaume Tucker Tested-by: Guillaume Tucker Tested-by: Mark Brown Tested-by: David Gow Cc: Link: https://lore.kernel.org/all/20260224-mittlerweile-besessen-2738831ae7f6@brauner Co-developed-by: Linus Torvalds Fixes: 4d13f4304fa4 ("kthread: Implement preferred affinity") Signed-off-by: Linus Torvalds Signed-off-by: Christian Brauner --- include/linux/kthread.h | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index c92c1149ee6e..a01a474719a7 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -7,6 +7,24 @@ struct mm_struct; +/* opaque kthread data */ +struct kthread; + +/* + * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will + * always remain a kthread. For kthreads p->worker_private always + * points to a struct kthread. For tasks that are not kthreads + * p->worker_private is used to point to other things. + * + * Return NULL for any task that is not a kthread. + */ +static inline struct kthread *tsk_is_kthread(struct task_struct *p) +{ + if (p->flags & PF_KTHREAD) + return p->worker_private; + return NULL; +} + __printf(4, 5) struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, @@ -98,9 +116,10 @@ void *kthread_probe_data(struct task_struct *k); int kthread_park(struct task_struct *k); void kthread_unpark(struct task_struct *k); void kthread_parkme(void); -void kthread_exit(long result) __noreturn; +#define kthread_exit(result) do_exit(result) void kthread_complete_and_exit(struct completion *, long) __noreturn; int kthreads_update_housekeeping(void); +void kthread_do_exit(struct kthread *, long); int kthreadd(void *unused); extern struct task_struct *kthreadd_task; -- cgit v1.2.3 From 15c9ed1d8286dc0297f01347dc74f5a8cbc173de Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Tue, 24 Feb 2026 09:50:52 +0800 Subject: pppoe: remove kernel-mode relay support The kernel-mode PPPoE relay feature and its two associated ioctls (PPPOEIOCSFWD and PPPOEIOCDFWD) are not used by any existing userspace PPPoE implementations. The most commonly-used package, RP-PPPoE [1], handles the relaying entirely in userspace. This legacy code has remained in the driver since its introduction in kernel 2.3.99-pre7 for over two decades, but has served no practical purpose. Remove the unused relay code. [1] https://dianne.skoll.ca/projects/rp-pppoe/ Signed-off-by: Qingfang Deng Acked-by: Arnd Bergmann Reviewed-by: Guillaume Nault Link: https://patch.msgid.link/20260224015053.42472-1-dqfext@gmail.com Signed-off-by: Paolo Abeni --- include/linux/if_pppox.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h index db45d6f1c4f4..8bbf676c2a85 100644 --- a/include/linux/if_pppox.h +++ b/include/linux/if_pppox.h @@ -25,8 +25,6 @@ struct pppoe_opt { struct net_device *dev; /* device associated with socket*/ int ifindex; /* ifindex of device associated with socket */ struct pppoe_addr pa; /* what this socket is bound to*/ - struct sockaddr_pppox relay; /* what socket data will be - relayed to (PPPoE relaying) */ struct work_struct padt_work;/* Work item for handling PADT */ }; @@ -53,7 +51,6 @@ struct pppox_sock { #define pppoe_dev proto.pppoe.dev #define pppoe_ifindex proto.pppoe.ifindex #define pppoe_pa proto.pppoe.pa -#define pppoe_relay proto.pppoe.relay static inline struct pppox_sock *pppox_sk(struct sock *sk) { @@ -80,14 +77,11 @@ extern void pppox_unbind_sock(struct sock *sk);/* delete ppp-channel binding */ extern int pppox_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); extern int pppox_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); -#define PPPOEIOCSFWD32 _IOW(0xB1 ,0, compat_size_t) - /* PPPoX socket states */ enum { PPPOX_NONE = 0, /* initial state */ PPPOX_CONNECTED = 1, /* connection established ==TCP_ESTABLISHED */ PPPOX_BOUND = 2, /* bound to ppp device */ - PPPOX_RELAY = 4, /* forwarding is enabled */ PPPOX_DEAD = 16 /* dead, useless, please clean me up!*/ }; -- cgit v1.2.3 From 1ae2f435350ec05224a39995c3a680aa6fdae5a5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 23 Feb 2026 16:29:37 +0100 Subject: ACPI: x86: cmos_rtc: Create a CMOS RTC platform device Make the CMOS RTC ACPI scan handler create a platform device that will be used subsequently by rtc-cmos for driver binding on x86 systems with ACPI and update add_rtc_cmos() to skip registering a fallback platform device for the CMOS RTC when the above one has been registered. Signed-off-by: Rafael J. Wysocki Acked-by: Dave Hansen # x86 Link: https://patch.msgid.link/1962427.tdWV9SEqCh@rafael.j.wysocki --- include/linux/acpi.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 4d2f0bed7a06..2bdb801cee01 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -791,6 +791,8 @@ const char *acpi_get_subsystem_id(acpi_handle handle); int acpi_mrrm_max_mem_region(void); #endif +extern bool cmos_rtc_platform_device_present; + #else /* !CONFIG_ACPI */ #define acpi_disabled 1 @@ -1116,6 +1118,8 @@ static inline int acpi_mrrm_max_mem_region(void) return 1; } +#define cmos_rtc_platform_device_present false + #endif /* !CONFIG_ACPI */ #ifdef CONFIG_ACPI_HMAT -- cgit v1.2.3 From 2a78e42104444f948698f1225deaf515e9b7224d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 23 Feb 2026 16:30:21 +0100 Subject: ACPI: x86/rtc-cmos: Use platform device for driver binding Modify the rtc-cmos driver to bind to a platform device on systems with ACPI via acpi_match_table and advertise the CMOST RTC ACPI device IDs for driver auto-loading. Note that adding the requisite device IDs to it and exposing them via MODULE_DEVICE_TABLE() is sufficient for this purpose. Since the ACPI device IDs in question are the same as for the CMOS RTC ACPI scan handler, put them into a common header file and use the definition from there in both places. Additionally, to prevent a PNP device from being created for the CMOS RTC if a platform one is present already, make is_cmos_rtc_device() check cmos_rtc_platform_device_present introduced previously. Signed-off-by: Rafael J. Wysocki Acked-by: Alexandre Belloni Link: https://patch.msgid.link/13969123.uLZWGnKmhe@rafael.j.wysocki --- include/linux/acpi.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 2bdb801cee01..5ecdcdaf31aa 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -791,6 +791,12 @@ const char *acpi_get_subsystem_id(acpi_handle handle); int acpi_mrrm_max_mem_region(void); #endif +#define ACPI_CMOS_RTC_IDS \ + { "PNP0B00", }, \ + { "PNP0B01", }, \ + { "PNP0B02", }, \ + { "", } + extern bool cmos_rtc_platform_device_present; #else /* !CONFIG_ACPI */ -- cgit v1.2.3 From c8e9b1d9febc83ee94944695a07cfd40a1b29743 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 25 Feb 2026 21:12:20 -0800 Subject: dmaengine: fsl-edma: fix all kernel-doc warnings Use the correct kernel-doc format and struct member names to eliminate these kernel-doc warnings: Warning: include/linux/platform_data/dma-mcf-edma.h:35 struct member 'dma_channels' not described in 'mcf_edma_platform_data' Warning: include/linux/platform_data/dma-mcf-edma.h:35 struct member 'slave_map' not described in 'mcf_edma_platform_data' Warning: include/linux/platform_data/dma-mcf-edma.h:35 struct member 'slavecnt' not described in 'mcf_edma_platform_data' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260226051220.548566-1-rdunlap@infradead.org Signed-off-by: Vinod Koul --- include/linux/platform_data/dma-mcf-edma.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/dma-mcf-edma.h b/include/linux/platform_data/dma-mcf-edma.h index d718ccfa3421..0b31af66a1ac 100644 --- a/include/linux/platform_data/dma-mcf-edma.h +++ b/include/linux/platform_data/dma-mcf-edma.h @@ -26,8 +26,9 @@ bool mcf_edma_filter_fn(struct dma_chan *chan, void *param); /** * struct mcf_edma_platform_data - platform specific data for eDMA engine * - * @ver The eDMA module version. - * @dma_channels The number of eDMA channels. + * @dma_channels: The number of eDMA channels. + * @slave_map: Slave device map + * @slavecnt: Number of entries in @slave_map */ struct mcf_edma_platform_data { int dma_channels; -- cgit v1.2.3 From b2d51bc1601c762c63f19c119589a0a0c44bc8ec Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 26 Feb 2026 10:20:23 +0100 Subject: gpio: generic: Don't use 'proxy' headers Update header inclusions to follow IWYU (Include What You Use) principle. Signed-off-by: Andy Shevchenko Reviewed-by: Linus Walleij Link: https://patch.msgid.link/20260226092023.4096921-1-andriy.shevchenko@linux.intel.com Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/generic.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gpio/generic.h b/include/linux/gpio/generic.h index ff566dc9c3cb..de43c06c83ef 100644 --- a/include/linux/gpio/generic.h +++ b/include/linux/gpio/generic.h @@ -3,9 +3,15 @@ #ifndef __LINUX_GPIO_GENERIC_H #define __LINUX_GPIO_GENERIC_H +#include +#include #include -#include +#include +#include #include +#include + +#include struct device; -- cgit v1.2.3 From fa4a3a95139e7293c1333a33bd7b19e7261e3bd0 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 23 Feb 2026 18:20:06 +0100 Subject: gpio: introduce a header for symbols shared by suppliers and consumers GPIO_LINE_DIRECTION_IN/OUT definitions are used both in supplier (GPIO controller drivers) as well as consumer code. In order to not force the consumers to include gpio/driver.h or - even worse - to redefine these values, create a new header file - gpio/defs.h - and move them over there. Include this header from both gpio/consumer.h and gpio/driver.h. Reviewed-by: Linus Walleij Suggested-by: Andy Shevchenko Reviewed-by: Andy Shevchenko Link: https://patch.msgid.link/20260223172006.204268-1-bartosz.golaszewski@oss.qualcomm.com Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/consumer.h | 2 ++ include/linux/gpio/defs.h | 9 +++++++++ include/linux/gpio/driver.h | 5 ++--- 3 files changed, 13 insertions(+), 3 deletions(-) create mode 100644 include/linux/gpio/defs.h (limited to 'include/linux') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 0d8408582918..3efb5cb1e1d1 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -6,6 +6,8 @@ #include #include +#include "defs.h" + struct acpi_device; struct device; struct fwnode_handle; diff --git a/include/linux/gpio/defs.h b/include/linux/gpio/defs.h new file mode 100644 index 000000000000..b69fd7c041b2 --- /dev/null +++ b/include/linux/gpio/defs.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __LINUX_GPIO_DEFS_H +#define __LINUX_GPIO_DEFS_H + +#define GPIO_LINE_DIRECTION_IN 1 +#define GPIO_LINE_DIRECTION_OUT 0 + +#endif /* __LINUX_GPIO_DEFS_H */ diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index fabe2baf7b50..5f5ddcbfa445 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -20,6 +20,8 @@ #include #endif +#include "defs.h" + struct device; struct irq_chip; struct irq_data; @@ -42,9 +44,6 @@ union gpio_irq_fwspec { #endif }; -#define GPIO_LINE_DIRECTION_IN 1 -#define GPIO_LINE_DIRECTION_OUT 0 - /** * struct gpio_irq_chip - GPIO interrupt controller */ -- cgit v1.2.3 From 3350c2b3f2b8a3b985a020a4ef4f2f050a4b6a1d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 25 Feb 2026 21:12:29 -0800 Subject: platform_data/mlxreg: mlxreg.h: fix all kernel-doc warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the correct kernel-doc format & notation to eliminate kernel-doc warnings: Warning: include/linux/platform_data/mlxreg.h:24 Enum value 'MLX_WDT_TYPE1' not described in enum 'mlxreg_wdt_type' Warning: include/linux/platform_data/mlxreg.h:24 Enum value 'MLX_WDT_TYPE2' not described in enum 'mlxreg_wdt_type' Warning: include/linux/platform_data/mlxreg.h:24 Enum value 'MLX_WDT_TYPE3' not described in enum 'mlxreg_wdt_type' Warning: include/linux/platform_data/mlxreg.h:37 bad line: PHYs ready / unready state; Warning: include/linux/platform_data/mlxreg.h:153 struct member 'np' not described in 'mlxreg_core_data' Warning: include/linux/platform_data/mlxreg.h:153 struct member 'hpdev' not described in 'mlxreg_core_data' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260226051232.549537-1-rdunlap@infradead.org Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/mlxreg.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/mlxreg.h b/include/linux/platform_data/mlxreg.h index f6cca7a035c7..50b6be57da66 100644 --- a/include/linux/platform_data/mlxreg.h +++ b/include/linux/platform_data/mlxreg.h @@ -13,10 +13,10 @@ /** * enum mlxreg_wdt_type - type of HW watchdog * - * TYPE1 HW watchdog implementation exist in old systems. - * All new systems have TYPE2 HW watchdog. - * TYPE3 HW watchdog can exist on all systems with new CPLD. - * TYPE3 is selected by WD capability bit. + * @MLX_WDT_TYPE1: HW watchdog implementation in old systems. + * @MLX_WDT_TYPE2: All new systems have TYPE2 HW watchdog. + * @MLX_WDT_TYPE3: HW watchdog that can exist on all systems with new CPLD. + * TYPE3 is selected by WD capability bit. */ enum mlxreg_wdt_type { MLX_WDT_TYPE1, @@ -35,7 +35,7 @@ enum mlxreg_wdt_type { * @MLXREG_HOTPLUG_LC_SYNCED: entry for line card synchronization events, coming * after hardware-firmware synchronization handshake; * @MLXREG_HOTPLUG_LC_READY: entry for line card ready events, indicating line card - PHYs ready / unready state; + * PHYs ready / unready state; * @MLXREG_HOTPLUG_LC_ACTIVE: entry for line card active events, indicating firmware * availability / unavailability for the ports on line card; * @MLXREG_HOTPLUG_LC_THERMAL: entry for line card thermal shutdown events, positive @@ -123,8 +123,8 @@ struct mlxreg_hotplug_device { * @reg_pwr: attribute power register; * @reg_ena: attribute enable register; * @mode: access mode; - * @np - pointer to node platform associated with attribute; - * @hpdev - hotplug device data; + * @np: pointer to node platform associated with attribute; + * @hpdev: hotplug device data; * @notifier: pointer to event notifier block; * @health_cntr: dynamic device health indication counter; * @attached: true if device has been attached after good health indication; -- cgit v1.2.3 From 0a93d30861617ecf207dcc4c6c736435fac36dae Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:35:42 +0100 Subject: hrtimer: Provide a static branch based hrtimer_hres_enabled() The scheduler evaluates this via hrtimer_is_hres_active() every time it has to update HRTICK. This needs to follow three pointers, which is expensive. Provide a static branch based mechanism to avoid that. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163429.136503358@kernel.org --- include/linux/hrtimer.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 74adbd4e7003..c9ca105ba009 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -153,17 +153,22 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer) } #ifdef CONFIG_HIGH_RES_TIMERS +extern unsigned int hrtimer_resolution; struct clock_event_device; extern void hrtimer_interrupt(struct clock_event_device *dev); -extern unsigned int hrtimer_resolution; +extern struct static_key_false hrtimer_highres_enabled_key; -#else +static inline bool hrtimer_highres_enabled(void) +{ + return static_branch_likely(&hrtimer_highres_enabled_key); +} +#else /* CONFIG_HIGH_RES_TIMERS */ #define hrtimer_resolution (unsigned int)LOW_RES_NSEC - -#endif +static inline bool hrtimer_highres_enabled(void) { return false; } +#endif /* !CONFIG_HIGH_RES_TIMERS */ static inline ktime_t __hrtimer_expires_remaining_adjusted(const struct hrtimer *timer, ktime_t now) -- cgit v1.2.3 From c3a92213eb3dd8ea6f664d16a08eda800e34eaad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:35:47 +0100 Subject: sched: Use hrtimer_highres_enabled() Use the static branch based variant and thereby avoid following three pointers. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163429.203610956@kernel.org --- include/linux/hrtimer.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index c9ca105ba009..b5003856fd60 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -146,12 +146,6 @@ static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) return ktime_sub(timer->node.expires, hrtimer_cb_get_time(timer)); } -static inline int hrtimer_is_hres_active(struct hrtimer *timer) -{ - return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? - timer->base->cpu_base->hres_active : 0; -} - #ifdef CONFIG_HIGH_RES_TIMERS extern unsigned int hrtimer_resolution; struct clock_event_device; -- cgit v1.2.3 From b7dd64778aa3f89de9afa1e81171cfe110ddc525 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 Feb 2026 17:36:01 +0100 Subject: hrtimer: Provide LAZY_REARM mode The hrtick timer is frequently rearmed before expiry and most of the time the new expiry is past the armed one. As this happens on every context switch it becomes expensive with scheduling heavy work loads especially in virtual machines as the "hardware" reprogamming implies a VM exit. Add a lazy rearm mode flag which skips the reprogamming if: 1) The timer was the first expiring timer before the rearm 2) The new expiry time is farther out than the armed time This avoids a massive amount of reprogramming operations of the hrtick timer for the price of eventually taking the alredy armed interrupt for nothing. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163429.408524456@kernel.org --- include/linux/hrtimer.h | 8 ++++++++ include/linux/hrtimer_types.h | 3 +++ 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index b5003856fd60..c924bb2498db 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -31,6 +31,13 @@ * soft irq context * HRTIMER_MODE_HARD - Timer callback function will be executed in * hard irq context even on PREEMPT_RT. + * HRTIMER_MODE_LAZY_REARM - Avoid reprogramming if the timer was the + * first expiring timer and is moved into the + * future. Special mode for the HRTICK timer to + * avoid extensive reprogramming of the hardware, + * which is expensive in virtual machines. Risks + * a pointless expiry, but that's better than + * reprogramming on every context switch, */ enum hrtimer_mode { HRTIMER_MODE_ABS = 0x00, @@ -38,6 +45,7 @@ enum hrtimer_mode { HRTIMER_MODE_PINNED = 0x02, HRTIMER_MODE_SOFT = 0x04, HRTIMER_MODE_HARD = 0x08, + HRTIMER_MODE_LAZY_REARM = 0x10, HRTIMER_MODE_ABS_PINNED = HRTIMER_MODE_ABS | HRTIMER_MODE_PINNED, HRTIMER_MODE_REL_PINNED = HRTIMER_MODE_REL | HRTIMER_MODE_PINNED, diff --git a/include/linux/hrtimer_types.h b/include/linux/hrtimer_types.h index 8fbbb6bdf7a1..64381c64cdbd 100644 --- a/include/linux/hrtimer_types.h +++ b/include/linux/hrtimer_types.h @@ -33,6 +33,8 @@ enum hrtimer_restart { * @is_soft: Set if hrtimer will be expired in soft interrupt context. * @is_hard: Set if hrtimer will be expired in hard interrupt context * even on RT. + * @is_lazy: Set if the timer is frequently rearmed to avoid updates + * of the clock event device * * The hrtimer structure must be initialized by hrtimer_setup() */ @@ -45,6 +47,7 @@ struct hrtimer { u8 is_rel; u8 is_soft; u8 is_hard; + u8 is_lazy; }; #endif /* _LINUX_HRTIMER_TYPES_H */ -- cgit v1.2.3 From 70802807398c65f5a49b2baec87e1f6c8db43de6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:36:15 +0100 Subject: clockevents: Remove redundant CLOCK_EVT_FEAT_KTIME The only real usecase for this is the hrtimer based broadcast device. No point in using two different feature flags for this. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163429.609049777@kernel.org --- include/linux/clockchips.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index b0df28ddd394..5e8f7819f6a6 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -45,7 +45,6 @@ enum clock_event_state { */ # define CLOCK_EVT_FEAT_PERIODIC 0x000001 # define CLOCK_EVT_FEAT_ONESHOT 0x000002 -# define CLOCK_EVT_FEAT_KTIME 0x000004 /* * x86(64) specific (mis)features: -- cgit v1.2.3 From 2e27beeb66e43f3b84aef5a07e486a5d50695c06 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:36:20 +0100 Subject: timekeeping: Allow inlining clocksource::read() On some architectures clocksource::read() boils down to a single instruction, so the indirect function call is just a massive overhead especially with speculative execution mitigations in effect. Allow architectures to enable conditional inlining of that read to avoid that by: - providing a static branch to switch to the inlined variant - disabling the branch before clocksource changes - enabling the branch after a clocksource change, when the clocksource indicates in a feature flag that it is the one which provides the inlined variant This is intentionally not a static call as that would only remove the indirect call, but not the rest of the overhead. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163429.675151545@kernel.org --- include/linux/clocksource.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 65b7c41471c3..54366d5c4d19 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -149,6 +149,8 @@ struct clocksource { #define CLOCK_SOURCE_SUSPEND_NONSTOP 0x80 #define CLOCK_SOURCE_RESELECT 0x100 #define CLOCK_SOURCE_VERIFY_PERCPU 0x200 +#define CLOCK_SOURCE_CAN_INLINE_READ 0x400 + /* simplify initialization of mask field */ #define CLOCKSOURCE_MASK(bits) GENMASK_ULL((bits) - 1, 0) -- cgit v1.2.3 From cd38bdb8e696a1a1eb12fc6662a6e420977aacfd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:36:40 +0100 Subject: timekeeping: Provide infrastructure for coupled clockevents Some architectures have clockevent devices which are coupled to the system clocksource by implementing a less than or equal comparator which compares the programmed absolute expiry time against the underlying time counter. Well known examples are TSC/TSC deadline timer and the S390 TOD clocksource/comparator. While the concept is nice it has some downsides: 1) The clockevents core code is strictly based on relative expiry times as that's the most common case for clockevent device hardware. That requires to convert the absolute expiry time provided by the caller (hrtimers, NOHZ code) to a relative expiry time by reading and substracting the current time. The clockevent::set_next_event() callback must then read the counter again to convert the relative expiry back into a absolute one. 2) The conversion factors from nanoseconds to counter clock cycles are set up when the clockevent is registered. When NTP applies corrections then the clockevent conversion factors can deviate from the clocksource conversion substantially which either results in timers firing late or in the worst case early. The early expiry then needs to do a reprogam with a short delta. In most cases this is papered over by the fact that the read in the set_next_event() callback happens after the read which is used to calculate the delta. So the tendency is that timers expire mostly late. All of this can be avoided by providing support for these devices in the core code: 1) The timekeeping core keeps track of the last update to the clocksource by storing the base nanoseconds and the corresponding clocksource counter value. That's used to keep the conversion math for reading the time within 64-bit in the common case. This information can be used to avoid both reads of the underlying clocksource in the clockevents reprogramming path: delta = expiry - base_ns; cycles = base_cycles + ((delta * clockevent::mult) >> clockevent::shift); The resulting cycles value can be directly used to program the comparator. 2) As #1 does not longer provide the "compensation" through the second read the deviation of the clocksource and clockevent conversions caused by NTP become more prominent. This can be cured by letting the timekeeping core compute and store the reverse conversion factors when the clocksource cycles to nanoseconds factors are modified by NTP: CS::MULT (1 << NS_TO_CYC_SHIFT) --------------- = ---------------------- (1 << CS:SHIFT) NS_TO_CYC_MULT Ergo: NS_TO_CYC_MULT = (1 << (CS::SHIFT + NS_TO_CYC_SHIFT)) / CS::MULT The NS_TO_CYC_SHIFT value is calculated when the clocksource is installed so that it aims for a one hour maximum sleep time. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163429.944763521@kernel.org --- include/linux/clocksource.h | 1 + include/linux/timekeeper_internal.h | 8 ++++++++ 2 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 54366d5c4d19..25774fc5b53d 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -150,6 +150,7 @@ struct clocksource { #define CLOCK_SOURCE_RESELECT 0x100 #define CLOCK_SOURCE_VERIFY_PERCPU 0x200 #define CLOCK_SOURCE_CAN_INLINE_READ 0x400 +#define CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT 0x800 /* simplify initialization of mask field */ #define CLOCKSOURCE_MASK(bits) GENMASK_ULL((bits) - 1, 0) diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index b8ae89ea28ab..e36d11e33e0c 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -72,6 +72,10 @@ struct tk_read_base { * @id: The timekeeper ID * @tkr_raw: The readout base structure for CLOCK_MONOTONIC_RAW * @raw_sec: CLOCK_MONOTONIC_RAW time in seconds + * @cs_id: The ID of the current clocksource + * @cs_ns_to_cyc_mult: Multiplicator for nanoseconds to cycles conversion + * @cs_ns_to_cyc_shift: Shift value for nanoseconds to cycles conversion + * @cs_ns_to_cyc_maxns: Maximum nanoseconds to cyles conversion range * @clock_was_set_seq: The sequence number of clock was set events * @cs_was_changed_seq: The sequence number of clocksource change events * @clock_valid: Indicator for valid clock @@ -159,6 +163,10 @@ struct timekeeper { u64 raw_sec; /* Cachline 3 and 4 (timekeeping internal variables): */ + enum clocksource_ids cs_id; + u32 cs_ns_to_cyc_mult; + u32 cs_ns_to_cyc_shift; + u64 cs_ns_to_cyc_maxns; unsigned int clock_was_set_seq; u8 cs_was_changed_seq; u8 clock_valid; -- cgit v1.2.3 From 89f951a1e8ad781e7ac70eccddab0e0c270485f9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:36:45 +0100 Subject: clockevents: Provide support for clocksource coupled comparators Some clockevent devices are coupled to the system clocksource by implementing a less than or equal comparator which compares the programmed absolute expiry time against the underlying time counter. The timekeeping core provides a function to convert and absolute CLOCK_MONOTONIC based expiry time to a absolute clock cycles time which can be directly fed into the comparator. That spares two time reads in the next event progamming path, one to convert the absolute nanoseconds time to a delta value and the other to convert the delta value back to a absolute time value suitable for the comparator. Provide a new clocksource callback which takes the absolute cycle value and wire it up in clockevents_program_event(). Similar to clocksources allow architectures to inline the rearm operation. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163430.010425428@kernel.org --- include/linux/clockchips.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 5e8f7819f6a6..92d90220c0d4 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -43,8 +43,9 @@ enum clock_event_state { /* * Clock event features */ -# define CLOCK_EVT_FEAT_PERIODIC 0x000001 -# define CLOCK_EVT_FEAT_ONESHOT 0x000002 +# define CLOCK_EVT_FEAT_PERIODIC 0x000001 +# define CLOCK_EVT_FEAT_ONESHOT 0x000002 +# define CLOCK_EVT_FEAT_CLOCKSOURCE_COUPLED 0x000004 /* * x86(64) specific (mis)features: @@ -100,6 +101,7 @@ struct clock_event_device { void (*event_handler)(struct clock_event_device *); int (*set_next_event)(unsigned long evt, struct clock_event_device *); int (*set_next_ktime)(ktime_t expires, struct clock_event_device *); + void (*set_next_coupled)(u64 cycles, struct clock_event_device *); ktime_t next_event; u64 max_delta_ns; u64 min_delta_ns; @@ -107,6 +109,7 @@ struct clock_event_device { u32 shift; enum clock_event_state state_use_accessors; unsigned int features; + enum clocksource_ids cs_id; unsigned long retries; int (*set_state_periodic)(struct clock_event_device *); -- cgit v1.2.3 From 7d27eafe54659d19cef10dab4520cbcdfb17b0e3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:37:18 +0100 Subject: hrtimer: Replace the bitfield in hrtimer_cpu_base Use bool for the various flags as that creates better code in the hot path. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163430.475262618@kernel.org --- include/linux/hrtimer_defs.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index 02b010df6570..f9fbf9a48f59 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -83,11 +83,11 @@ struct hrtimer_cpu_base { unsigned int cpu; unsigned int active_bases; unsigned int clock_was_set_seq; - unsigned int hres_active : 1, - in_hrtirq : 1, - hang_detected : 1, - softirq_activated : 1, - online : 1; + bool hres_active; + bool in_hrtirq; + bool hang_detected; + bool softirq_activated; + bool online; #ifdef CONFIG_HIGH_RES_TIMERS unsigned int nr_events; unsigned short nr_retries; -- cgit v1.2.3 From 22f011be7aaa77ca8f502b9dd07b7334f9965d18 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:37:23 +0100 Subject: hrtimer: Convert state and properties to boolean All 'u8' flags are true booleans, so make it entirely clear that these can only contain true or false. This is especially true for hrtimer::state, which has a historical leftover of using the state with bitwise operations. That was used in the early hrtimer implementation with several bits, but then converted to a boolean state. But that conversion missed to replace the bit OR and bit check operations all over the place, which creates suboptimal code. As of today 'state' is a misnomer because it's only purpose is to reflect whether the timer is enqueued into the RB-tree or not. Rename it to 'is_queued' and make all operations on it boolean. This reduces text size from 8926 to 8732 bytes. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163430.542427240@kernel.org --- include/linux/hrtimer.h | 31 ++----------------------------- include/linux/hrtimer_types.h | 12 ++++++------ 2 files changed, 8 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index c924bb2498db..4ad4a454b4c5 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -63,33 +63,6 @@ enum hrtimer_mode { HRTIMER_MODE_REL_PINNED_HARD = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_HARD, }; -/* - * Values to track state of the timer - * - * Possible states: - * - * 0x00 inactive - * 0x01 enqueued into rbtree - * - * The callback state is not part of the timer->state because clearing it would - * mean touching the timer after the callback, this makes it impossible to free - * the timer from the callback function. - * - * Therefore we track the callback state in: - * - * timer->base->cpu_base->running == timer - * - * On SMP it is possible to have a "callback function running and enqueued" - * status. It happens for example when a posix timer expired and the callback - * queued a signal. Between dropping the lock which protects the posix timer - * and reacquiring the base lock of the hrtimer, another CPU can deliver the - * signal and rearm the timer. - * - * All state transitions are protected by cpu_base->lock. - */ -#define HRTIMER_STATE_INACTIVE 0x00 -#define HRTIMER_STATE_ENQUEUED 0x01 - /** * struct hrtimer_sleeper - simple sleeper structure * @timer: embedded timer structure @@ -300,8 +273,8 @@ extern bool hrtimer_active(const struct hrtimer *timer); */ static inline bool hrtimer_is_queued(struct hrtimer *timer) { - /* The READ_ONCE pairs with the update functions of timer->state */ - return !!(READ_ONCE(timer->state) & HRTIMER_STATE_ENQUEUED); + /* The READ_ONCE pairs with the update functions of timer->is_queued */ + return READ_ONCE(timer->is_queued); } /* diff --git a/include/linux/hrtimer_types.h b/include/linux/hrtimer_types.h index 64381c64cdbd..0e22bc91d00f 100644 --- a/include/linux/hrtimer_types.h +++ b/include/linux/hrtimer_types.h @@ -28,7 +28,7 @@ enum hrtimer_restart { * was armed. * @function: timer expiry callback function * @base: pointer to the timer base (per cpu and per clock) - * @state: state information (See bit values above) + * @is_queued: Indicates whether a timer is enqueued or not * @is_rel: Set if the timer was armed relative * @is_soft: Set if hrtimer will be expired in soft interrupt context. * @is_hard: Set if hrtimer will be expired in hard interrupt context @@ -43,11 +43,11 @@ struct hrtimer { ktime_t _softexpires; enum hrtimer_restart (*__private function)(struct hrtimer *); struct hrtimer_clock_base *base; - u8 state; - u8 is_rel; - u8 is_soft; - u8 is_hard; - u8 is_lazy; + bool is_queued; + bool is_rel; + bool is_soft; + bool is_hard; + bool is_lazy; }; #endif /* _LINUX_HRTIMER_TYPES_H */ -- cgit v1.2.3 From 9e07a9c980eaa93fd1bba722d31eeb4bf0cbbfb4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:37:53 +0100 Subject: hrtimer: Rename hrtimer_cpu_base::in_hrtirq to deferred_rearm The upcoming deferred rearming scheme has the same effect as the deferred rearming when the hrtimer interrupt is executing. So it can reuse the in_hrtirq flag, but when it gets deferred beyond the hrtimer interrupt path, then the name does not make sense anymore. Rename it to deferred_rearm upfront to keep the actual functional change separate from the mechanical rename churn. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163430.935623347@kernel.org --- include/linux/hrtimer_defs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index f9fbf9a48f59..2c3bdbd562d2 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -53,7 +53,7 @@ enum hrtimer_base_type { * @active_bases: Bitfield to mark bases with active timers * @clock_was_set_seq: Sequence counter of clock was set events * @hres_active: State of high resolution mode - * @in_hrtirq: hrtimer_interrupt() is currently executing + * @deferred_rearm: A deferred rearm is pending * @hang_detected: The last hrtimer interrupt detected a hang * @softirq_activated: displays, if the softirq is raised - update of softirq * related settings is not required then. @@ -84,7 +84,7 @@ struct hrtimer_cpu_base { unsigned int active_bases; unsigned int clock_was_set_seq; bool hres_active; - bool in_hrtirq; + bool deferred_rearm; bool hang_detected; bool softirq_activated; bool online; -- cgit v1.2.3 From a43b4856bc039675165a50d9ef5f41b28520f0f4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 Feb 2026 17:37:58 +0100 Subject: hrtimer: Prepare stubs for deferred rearming The hrtimer interrupt expires timers and at the end of the interrupt it rearms the clockevent device for the next expiring timer. That's obviously correct, but in the case that a expired timer set NEED_RESCHED the return from interrupt ends up in schedule(). If HRTICK is enabled then schedule() will modify the hrtick timer, which causes another reprogramming of the hardware. That can be avoided by deferring the rearming to the return from interrupt path and if the return results in a immediate schedule() invocation then it can be deferred until the end of schedule(). To make this correct the affected code parts need to be made aware of this. Provide empty stubs for the deferred rearming mechanism, so that the relevant code changes for entry, softirq and scheduler can be split up into separate changes independent of the actual enablement in the hrtimer code. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.000891171@kernel.org --- include/linux/hrtimer.h | 1 + include/linux/hrtimer_rearm.h | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 include/linux/hrtimer_rearm.h (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 4ad4a454b4c5..c087b7142330 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -13,6 +13,7 @@ #define _LINUX_HRTIMER_H #include +#include #include #include #include diff --git a/include/linux/hrtimer_rearm.h b/include/linux/hrtimer_rearm.h new file mode 100644 index 000000000000..6293076c03a6 --- /dev/null +++ b/include/linux/hrtimer_rearm.h @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef _LINUX_HRTIMER_REARM_H +#define _LINUX_HRTIMER_REARM_H + +#ifdef CONFIG_HRTIMER_REARM_DEFERRED +static __always_inline void __hrtimer_rearm_deferred(void) { } +static __always_inline void hrtimer_rearm_deferred(void) { } +static __always_inline void hrtimer_rearm_deferred_tif(unsigned long tif_work) { } +static __always_inline bool +hrtimer_rearm_deferred_user_irq(unsigned long *tif_work, const unsigned long tif_mask) { return false; } +static __always_inline bool hrtimer_test_and_clear_rearm_deferred(void) { return false; } +#else /* CONFIG_HRTIMER_REARM_DEFERRED */ +static __always_inline void __hrtimer_rearm_deferred(void) { } +static __always_inline void hrtimer_rearm_deferred(void) { } +static __always_inline void hrtimer_rearm_deferred_tif(unsigned long tif_work) { } +static __always_inline bool +hrtimer_rearm_deferred_user_irq(unsigned long *tif_work, const unsigned long tif_mask) { return false; } +static __always_inline bool hrtimer_test_and_clear_rearm_deferred(void) { return false; } +#endif /* !CONFIG_HRTIMER_REARM_DEFERRED */ + +#endif -- cgit v1.2.3 From 0e98eb14814ef669e07ca6effaa03df2e57ef956 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 Feb 2026 17:38:03 +0100 Subject: entry: Prepare for deferred hrtimer rearming The hrtimer interrupt expires timers and at the end of the interrupt it rearms the clockevent device for the next expiring timer. That's obviously correct, but in the case that a expired timer sets NEED_RESCHED the return from interrupt ends up in schedule(). If HRTICK is enabled then schedule() will modify the hrtick timer, which causes another reprogramming of the hardware. That can be avoided by deferring the rearming to the return from interrupt path and if the return results in a immediate schedule() invocation then it can be deferred until the end of schedule(), which avoids multiple rearms and re-evaluation of the timer wheel. As this is only relevant for interrupt to user return split the work masks up and hand them in as arguments from the relevant exit to user functions, which allows the compiler to optimize the deferred handling out for the syscall exit to user case. Add the rearm checks to the approritate places in the exit to user loop and the interrupt return to kernel path, so that the rearming is always guaranteed. In the return to user space path this is handled in the same way as TIF_RSEQ to avoid extra instructions in the fast path, which are truly hurtful for device interrupt heavy work loads as the extra instructions and conditionals while benign at first sight accumulate quickly into measurable regressions. The return from syscall path is completely unaffected due to the above mentioned split so syscall heavy workloads wont have any extra burden. For now this is just placing empty stubs at the right places which are all optimized out by the compiler until the actual functionality is in place. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.066469985@kernel.org --- include/linux/irq-entry-common.h | 25 +++++++++++++++++++------ include/linux/rseq_entry.h | 16 +++++++++++++--- 2 files changed, 32 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index d26d1b1bcbfb..b976946b3cdb 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -3,6 +3,7 @@ #define __LINUX_IRQENTRYCOMMON_H #include +#include #include #include #include @@ -33,6 +34,14 @@ _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | _TIF_RSEQ | \ ARCH_EXIT_TO_USER_MODE_WORK) +#ifdef CONFIG_HRTIMER_REARM_DEFERRED +# define EXIT_TO_USER_MODE_WORK_SYSCALL (EXIT_TO_USER_MODE_WORK) +# define EXIT_TO_USER_MODE_WORK_IRQ (EXIT_TO_USER_MODE_WORK | _TIF_HRTIMER_REARM) +#else +# define EXIT_TO_USER_MODE_WORK_SYSCALL (EXIT_TO_USER_MODE_WORK) +# define EXIT_TO_USER_MODE_WORK_IRQ (EXIT_TO_USER_MODE_WORK) +#endif + /** * arch_enter_from_user_mode - Architecture specific sanity check for user mode regs * @regs: Pointer to currents pt_regs @@ -203,6 +212,7 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work /** * __exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required * @regs: Pointer to pt_regs on entry stack + * @work_mask: Which TIF bits need to be evaluated * * 1) check that interrupts are disabled * 2) call tick_nohz_user_enter_prepare() @@ -212,7 +222,8 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work * * Don't invoke directly, use the syscall/irqentry_ prefixed variants below */ -static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs) +static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs, + const unsigned long work_mask) { unsigned long ti_work; @@ -222,8 +233,10 @@ static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs) tick_nohz_user_enter_prepare(); ti_work = read_thread_flags(); - if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) - ti_work = exit_to_user_mode_loop(regs, ti_work); + if (unlikely(ti_work & work_mask)) { + if (!hrtimer_rearm_deferred_user_irq(&ti_work, work_mask)) + ti_work = exit_to_user_mode_loop(regs, ti_work); + } arch_exit_to_user_mode_prepare(regs, ti_work); } @@ -239,7 +252,7 @@ static __always_inline void __exit_to_user_mode_validate(void) /* Temporary workaround to keep ARM64 alive */ static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *regs) { - __exit_to_user_mode_prepare(regs); + __exit_to_user_mode_prepare(regs, EXIT_TO_USER_MODE_WORK); rseq_exit_to_user_mode_legacy(); __exit_to_user_mode_validate(); } @@ -253,7 +266,7 @@ static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *reg */ static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) { - __exit_to_user_mode_prepare(regs); + __exit_to_user_mode_prepare(regs, EXIT_TO_USER_MODE_WORK_SYSCALL); rseq_syscall_exit_to_user_mode(); __exit_to_user_mode_validate(); } @@ -267,7 +280,7 @@ static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *re */ static __always_inline void irqentry_exit_to_user_mode_prepare(struct pt_regs *regs) { - __exit_to_user_mode_prepare(regs); + __exit_to_user_mode_prepare(regs, EXIT_TO_USER_MODE_WORK_IRQ); rseq_irqentry_exit_to_user_mode(); __exit_to_user_mode_validate(); } diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index cbc4a791618b..17956e119e81 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -40,6 +40,7 @@ DECLARE_PER_CPU(struct rseq_stats, rseq_stats); #endif /* !CONFIG_RSEQ_STATS */ #ifdef CONFIG_RSEQ +#include #include #include #include @@ -110,7 +111,7 @@ static __always_inline void rseq_slice_clear_grant(struct task_struct *t) t->rseq.slice.state.granted = false; } -static __always_inline bool rseq_grant_slice_extension(bool work_pending) +static __always_inline bool __rseq_grant_slice_extension(bool work_pending) { struct task_struct *curr = current; struct rseq_slice_ctrl usr_ctrl; @@ -215,11 +216,20 @@ efault: return false; } +static __always_inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) +{ + if (unlikely(__rseq_grant_slice_extension(ti_work & mask))) { + hrtimer_rearm_deferred_tif(ti_work); + return true; + } + return false; +} + #else /* CONFIG_RSEQ_SLICE_EXTENSION */ static inline bool rseq_slice_extension_enabled(void) { return false; } static inline bool rseq_arm_slice_extension_timer(void) { return false; } static inline void rseq_slice_clear_grant(struct task_struct *t) { } -static inline bool rseq_grant_slice_extension(bool work_pending) { return false; } +static inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; } #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */ bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); @@ -778,7 +788,7 @@ static inline void rseq_syscall_exit_to_user_mode(void) { } static inline void rseq_irqentry_exit_to_user_mode(void) { } static inline void rseq_exit_to_user_mode_legacy(void) { } static inline void rseq_debug_syscall_return(struct pt_regs *regs) { } -static inline bool rseq_grant_slice_extension(bool work_pending) { return false; } +static inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; } #endif /* !CONFIG_RSEQ */ #endif /* _LINUX_RSEQ_ENTRY_H */ -- cgit v1.2.3 From 15dd3a9488557d3e6ebcecacab79f4e56b69ab54 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 Feb 2026 17:38:18 +0100 Subject: hrtimer: Push reprogramming timers into the interrupt return path Currently hrtimer_interrupt() runs expired timers, which can re-arm themselves, after which it computes the next expiration time and re-programs the hardware. However, things like HRTICK, a highres timer driving preemption, cannot re-arm itself at the point of running, since the next task has not been determined yet. The schedule() in the interrupt return path will switch to the next task, which then causes a new hrtimer to be programmed. This then results in reprogramming the hardware at least twice, once after running the timers, and once upon selecting the new task. Notably, *both* events happen in the interrupt. By pushing the hrtimer reprogram all the way into the interrupt return path, it runs after schedule() picks the new task and the double reprogram can be avoided. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.273488269@kernel.org --- include/linux/hrtimer_rearm.h | 72 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 67 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer_rearm.h b/include/linux/hrtimer_rearm.h index 6293076c03a6..a6f2e5d5e1c7 100644 --- a/include/linux/hrtimer_rearm.h +++ b/include/linux/hrtimer_rearm.h @@ -3,12 +3,74 @@ #define _LINUX_HRTIMER_REARM_H #ifdef CONFIG_HRTIMER_REARM_DEFERRED -static __always_inline void __hrtimer_rearm_deferred(void) { } -static __always_inline void hrtimer_rearm_deferred(void) { } -static __always_inline void hrtimer_rearm_deferred_tif(unsigned long tif_work) { } +#include + +void __hrtimer_rearm_deferred(void); + +/* + * This is purely CPU local, so check the TIF bit first to avoid the overhead of + * the atomic test_and_clear_bit() operation for the common case where the bit + * is not set. + */ +static __always_inline bool hrtimer_test_and_clear_rearm_deferred_tif(unsigned long tif_work) +{ + lockdep_assert_irqs_disabled(); + + if (unlikely(tif_work & _TIF_HRTIMER_REARM)) { + clear_thread_flag(TIF_HRTIMER_REARM); + return true; + } + return false; +} + +#define TIF_REARM_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | _TIF_HRTIMER_REARM) + +/* Invoked from the exit to user before invoking exit_to_user_mode_loop() */ static __always_inline bool -hrtimer_rearm_deferred_user_irq(unsigned long *tif_work, const unsigned long tif_mask) { return false; } -static __always_inline bool hrtimer_test_and_clear_rearm_deferred(void) { return false; } +hrtimer_rearm_deferred_user_irq(unsigned long *tif_work, const unsigned long tif_mask) +{ + /* Help the compiler to optimize the function out for syscall returns */ + if (!(tif_mask & _TIF_HRTIMER_REARM)) + return false; + /* + * Rearm the timer if none of the resched flags is set before going into + * the loop which re-enables interrupts. + */ + if (unlikely((*tif_work & TIF_REARM_MASK) == _TIF_HRTIMER_REARM)) { + clear_thread_flag(TIF_HRTIMER_REARM); + __hrtimer_rearm_deferred(); + /* Don't go into the loop if HRTIMER_REARM was the only flag */ + *tif_work &= ~TIF_HRTIMER_REARM; + return !*tif_work; + } + return false; +} + +/* Invoked from the time slice extension decision function */ +static __always_inline void hrtimer_rearm_deferred_tif(unsigned long tif_work) +{ + if (hrtimer_test_and_clear_rearm_deferred_tif(tif_work)) + __hrtimer_rearm_deferred(); +} + +/* + * This is to be called on all irqentry_exit() paths that will enable + * interrupts. + */ +static __always_inline void hrtimer_rearm_deferred(void) +{ + hrtimer_rearm_deferred_tif(read_thread_flags()); +} + +/* + * Invoked from the scheduler on entry to __schedule() so it can defer + * rearming after the load balancing callbacks which might change hrtick. + */ +static __always_inline bool hrtimer_test_and_clear_rearm_deferred(void) +{ + return hrtimer_test_and_clear_rearm_deferred_tif(read_thread_flags()); +} + #else /* CONFIG_HRTIMER_REARM_DEFERRED */ static __always_inline void __hrtimer_rearm_deferred(void) { } static __always_inline void hrtimer_rearm_deferred(void) { } -- cgit v1.2.3 From b95c4442b02162904e9012e670b602ebeb3c6c1b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:38:23 +0100 Subject: hrtimer: Avoid re-evaluation when nothing changed Most times there is no change between hrtimer_interrupt() deferring the rearm and the invocation of hrtimer_rearm_deferred(). In those cases it's a pointless exercise to re-evaluate the next expiring timer. Cache the required data and use it if nothing changed. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.338569372@kernel.org --- include/linux/hrtimer_defs.h | 53 ++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index 2c3bdbd562d2..b6846efec210 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -47,32 +47,31 @@ enum hrtimer_base_type { /** * struct hrtimer_cpu_base - the per cpu clock bases - * @lock: lock protecting the base and associated clock bases - * and timers - * @cpu: cpu number - * @active_bases: Bitfield to mark bases with active timers - * @clock_was_set_seq: Sequence counter of clock was set events - * @hres_active: State of high resolution mode - * @deferred_rearm: A deferred rearm is pending - * @hang_detected: The last hrtimer interrupt detected a hang - * @softirq_activated: displays, if the softirq is raised - update of softirq - * related settings is not required then. - * @nr_events: Total number of hrtimer interrupt events - * @nr_retries: Total number of hrtimer interrupt retries - * @nr_hangs: Total number of hrtimer interrupt hangs - * @max_hang_time: Maximum time spent in hrtimer_interrupt - * @softirq_expiry_lock: Lock which is taken while softirq based hrtimer are - * expired - * @online: CPU is online from an hrtimers point of view - * @timer_waiters: A hrtimer_cancel() invocation waits for the timer - * callback to finish. - * @expires_next: absolute time of the next event, is required for remote - * hrtimer enqueue; it is the total first expiry time (hard - * and soft hrtimer are taken into account) - * @next_timer: Pointer to the first expiring timer - * @softirq_expires_next: Time to check, if soft queues needs also to be expired - * @softirq_next_timer: Pointer to the first expiring softirq based timer - * @clock_base: array of clock bases for this cpu + * @lock: lock protecting the base and associated clock bases and timers + * @cpu: cpu number + * @active_bases: Bitfield to mark bases with active timers + * @clock_was_set_seq: Sequence counter of clock was set events + * @hres_active: State of high resolution mode + * @deferred_rearm: A deferred rearm is pending + * @deferred_needs_update: The deferred rearm must re-evaluate the first timer + * @hang_detected: The last hrtimer interrupt detected a hang + * @softirq_activated: displays, if the softirq is raised - update of softirq + * related settings is not required then. + * @nr_events: Total number of hrtimer interrupt events + * @nr_retries: Total number of hrtimer interrupt retries + * @nr_hangs: Total number of hrtimer interrupt hangs + * @max_hang_time: Maximum time spent in hrtimer_interrupt + * @softirq_expiry_lock: Lock which is taken while softirq based hrtimer are expired + * @online: CPU is online from an hrtimers point of view + * @timer_waiters: A hrtimer_cancel() waiters for the timer callback to finish. + * @expires_next: Absolute time of the next event, is required for remote + * hrtimer enqueue; it is the total first expiry time (hard + * and soft hrtimer are taken into account) + * @next_timer: Pointer to the first expiring timer + * @softirq_expires_next: Time to check, if soft queues needs also to be expired + * @softirq_next_timer: Pointer to the first expiring softirq based timer + * @deferred_expires_next: Cached expires next value for deferred rearm + * @clock_base: Array of clock bases for this cpu * * Note: next_timer is just an optimization for __remove_hrtimer(). * Do not dereference the pointer because it is not reliable on @@ -85,6 +84,7 @@ struct hrtimer_cpu_base { unsigned int clock_was_set_seq; bool hres_active; bool deferred_rearm; + bool deferred_needs_update; bool hang_detected; bool softirq_activated; bool online; @@ -102,6 +102,7 @@ struct hrtimer_cpu_base { struct hrtimer *next_timer; ktime_t softirq_expires_next; struct hrtimer *softirq_next_timer; + ktime_t deferred_expires_next; struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; call_single_data_t csd; } ____cacheline_aligned; -- cgit v1.2.3 From eddffab8282e388dddf032f3295fcec87eb08095 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:38:28 +0100 Subject: hrtimer: Keep track of first expiring timer per clock base Evaluating the next expiry time of all clock bases is cache line expensive as the expiry time of the first expiring timer is not cached in the base and requires to access the timer itself, which is definitely in a different cache line. It's way more efficient to keep track of the expiry time on enqueue and dequeue operations as the relevant data is already in the cache at that point. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.404839710@kernel.org --- include/linux/hrtimer_defs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index b6846efec210..fb38df4c0b64 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -19,6 +19,7 @@ * timer to a base on another cpu. * @clockid: clock id for per_cpu support * @seq: seqcount around __run_hrtimer + * @expires_next: Absolute time of the next event in this clock base * @running: pointer to the currently running hrtimer * @active: red black tree root node for the active timers * @offset: offset of this clock to the monotonic base @@ -28,6 +29,7 @@ struct hrtimer_clock_base { unsigned int index; clockid_t clockid; seqcount_raw_spinlock_t seq; + ktime_t expires_next; struct hrtimer *running; struct timerqueue_head active; ktime_t offset; -- cgit v1.2.3 From 671047943dce5af24e023bca3c5cc244d7565f5a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:38:47 +0100 Subject: rbtree: Provide rbtree with links Some RB tree users require quick access to the next and the previous node, e.g. to check whether a modification of the node results in a change of the nodes position in the tree. If the node position does not change, then the modification can happen in place without going through a full enqueue requeue cycle. A upcoming use case for this are the timer queues of the hrtimer subsystem as they can optimize for timers which are frequently rearmed while enqueued. This can be obviously achieved with rb_next() and rb_prev(), but those turned out to be quite expensive for hotpath operations depending on the tree depth. Add a linked RB tree variant where add() and erase() maintain the links between the nodes. Like the cached variant it provides a pointer to the left most node in the root. It intentionally does not use a [h]list head as there is no real need for true list operations as the list is strictly coupled to the tree and and cannot be manipulated independently. It sets the nodes previous pointer to NULL for the left most node and the next pointer to NULL for the right most node. This allows a quick check especially for the left most node without consulting the list head address, which creates better code. Aside of the rb_leftmost cached pointer this could trivially provide a rb_rightmost pointer as well, but there is no usage for that (yet). Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.668401024@kernel.org --- include/linux/rbtree.h | 81 +++++++++++++++++++++++++++++++++++++++----- include/linux/rbtree_types.h | 16 +++++++++ 2 files changed, 88 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 4091e978aef2..48acdc3889dd 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -35,10 +35,15 @@ #define RB_CLEAR_NODE(node) \ ((node)->__rb_parent_color = (unsigned long)(node)) +#define RB_EMPTY_LINKED_NODE(lnode) RB_EMPTY_NODE(&(lnode)->node) +#define RB_CLEAR_LINKED_NODE(lnode) ({ \ + RB_CLEAR_NODE(&(lnode)->node); \ + (lnode)->prev = (lnode)->next = NULL; \ +}) extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); - +extern bool rb_erase_linked(struct rb_node_linked *, struct rb_root_linked *); /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_next(const struct rb_node *); @@ -213,15 +218,10 @@ rb_add_cached(struct rb_node *node, struct rb_root_cached *tree, return leftmost ? node : NULL; } -/** - * rb_add() - insert @node into @tree - * @node: node to insert - * @tree: tree to insert @node into - * @less: operator defining the (partial) node order - */ static __always_inline void -rb_add(struct rb_node *node, struct rb_root *tree, - bool (*less)(struct rb_node *, const struct rb_node *)) +__rb_add(struct rb_node *node, struct rb_root *tree, + bool (*less)(struct rb_node *, const struct rb_node *), + void (*linkop)(struct rb_node *, struct rb_node *, struct rb_node **)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; @@ -234,10 +234,73 @@ rb_add(struct rb_node *node, struct rb_root *tree, link = &parent->rb_right; } + linkop(node, parent, link); rb_link_node(node, parent, link); rb_insert_color(node, tree); } +#define __node_2_linked_node(_n) \ + rb_entry((_n), struct rb_node_linked, node) + +static inline void +rb_link_linked_node(struct rb_node *node, struct rb_node *parent, struct rb_node **link) +{ + if (!parent) + return; + + struct rb_node_linked *nnew = __node_2_linked_node(node); + struct rb_node_linked *npar = __node_2_linked_node(parent); + + if (link == &parent->rb_left) { + nnew->prev = npar->prev; + nnew->next = npar; + npar->prev = nnew; + if (nnew->prev) + nnew->prev->next = nnew; + } else { + nnew->next = npar->next; + nnew->prev = npar; + npar->next = nnew; + if (nnew->next) + nnew->next->prev = nnew; + } +} + +/** + * rb_add_linked() - insert @node into the leftmost linked tree @tree + * @node: node to insert + * @tree: linked tree to insert @node into + * @less: operator defining the (partial) node order + * + * Returns @true when @node is the new leftmost, @false otherwise. + */ +static __always_inline bool +rb_add_linked(struct rb_node_linked *node, struct rb_root_linked *tree, + bool (*less)(struct rb_node *, const struct rb_node *)) +{ + __rb_add(&node->node, &tree->rb_root, less, rb_link_linked_node); + if (!node->prev) + tree->rb_leftmost = node; + return !node->prev; +} + +/* Empty linkop function which is optimized away by the compiler */ +static __always_inline void +rb_link_noop(struct rb_node *n, struct rb_node *p, struct rb_node **l) { } + +/** + * rb_add() - insert @node into @tree + * @node: node to insert + * @tree: tree to insert @node into + * @less: operator defining the (partial) node order + */ +static __always_inline void +rb_add(struct rb_node *node, struct rb_root *tree, + bool (*less)(struct rb_node *, const struct rb_node *)) +{ + __rb_add(node, tree, less, rb_link_noop); +} + /** * rb_find_add_cached() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert diff --git a/include/linux/rbtree_types.h b/include/linux/rbtree_types.h index 45b6ecde3665..3c7ae53e8139 100644 --- a/include/linux/rbtree_types.h +++ b/include/linux/rbtree_types.h @@ -9,6 +9,12 @@ struct rb_node { } __attribute__((aligned(sizeof(long)))); /* The alignment might seem pointless, but allegedly CRIS needs it */ +struct rb_node_linked { + struct rb_node node; + struct rb_node_linked *prev; + struct rb_node_linked *next; +}; + struct rb_root { struct rb_node *rb_node; }; @@ -28,7 +34,17 @@ struct rb_root_cached { struct rb_node *rb_leftmost; }; +/* + * Leftmost tree with links. This would allow a trivial rb_rightmost update, + * but that has been omitted due to the lack of users. + */ +struct rb_root_linked { + struct rb_root rb_root; + struct rb_node_linked *rb_leftmost; +}; + #define RB_ROOT (struct rb_root) { NULL, } #define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL } +#define RB_ROOT_LINKED (struct rb_root_linked) { {NULL, }, NULL } #endif -- cgit v1.2.3 From 1339eeb73d6b99cf3aa9981f3f91d6ac4a49c72e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:38:52 +0100 Subject: timerqueue: Provide linked timerqueue The hrtimer subsystem wants to peak ahead to the next and previous timer to evaluated whether a to be rearmed timer can stay at the same position in the RB tree with the new expiry time. The linked RB tree provides the infrastructure for this as it maintains links to the previous and next nodes for each entry in the tree. Provide timerqueue wrappers around that. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.734827095@kernel.org --- include/linux/timerqueue.h | 56 ++++++++++++++++++++++++++++++++++------ include/linux/timerqueue_types.h | 15 ++++++++--- 2 files changed, 60 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h index d306d9dd2207..7d0aaa766580 100644 --- a/include/linux/timerqueue.h +++ b/include/linux/timerqueue.h @@ -5,12 +5,11 @@ #include #include -extern bool timerqueue_add(struct timerqueue_head *head, - struct timerqueue_node *node); -extern bool timerqueue_del(struct timerqueue_head *head, - struct timerqueue_node *node); -extern struct timerqueue_node *timerqueue_iterate_next( - struct timerqueue_node *node); +bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node); +bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node); +struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node); + +bool timerqueue_linked_add(struct timerqueue_linked_head *head, struct timerqueue_linked_node *node); /** * timerqueue_getnext - Returns the timer with the earliest expiration time @@ -19,8 +18,7 @@ extern struct timerqueue_node *timerqueue_iterate_next( * * Returns a pointer to the timer node that has the earliest expiration time. */ -static inline -struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head) +static inline struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head) { struct rb_node *leftmost = rb_first_cached(&head->rb_root); @@ -41,4 +39,46 @@ static inline void timerqueue_init_head(struct timerqueue_head *head) { head->rb_root = RB_ROOT_CACHED; } + +/* Timer queues with linked nodes */ + +static __always_inline +struct timerqueue_linked_node *timerqueue_linked_first(struct timerqueue_linked_head *head) +{ + return rb_entry_safe(head->rb_root.rb_leftmost, struct timerqueue_linked_node, node); +} + +static __always_inline +struct timerqueue_linked_node *timerqueue_linked_next(struct timerqueue_linked_node *node) +{ + return rb_entry_safe(node->node.next, struct timerqueue_linked_node, node); +} + +static __always_inline +struct timerqueue_linked_node *timerqueue_linked_prev(struct timerqueue_linked_node *node) +{ + return rb_entry_safe(node->node.prev, struct timerqueue_linked_node, node); +} + +static __always_inline +bool timerqueue_linked_del(struct timerqueue_linked_head *head, struct timerqueue_linked_node *node) +{ + return rb_erase_linked(&node->node, &head->rb_root); +} + +static __always_inline void timerqueue_linked_init(struct timerqueue_linked_node *node) +{ + RB_CLEAR_LINKED_NODE(&node->node); +} + +static __always_inline bool timerqueue_linked_node_queued(struct timerqueue_linked_node *node) +{ + return !RB_EMPTY_LINKED_NODE(&node->node); +} + +static __always_inline void timerqueue_linked_init_head(struct timerqueue_linked_head *head) +{ + head->rb_root = RB_ROOT_LINKED; +} + #endif /* _LINUX_TIMERQUEUE_H */ diff --git a/include/linux/timerqueue_types.h b/include/linux/timerqueue_types.h index dc298d0923e3..be2218b147c4 100644 --- a/include/linux/timerqueue_types.h +++ b/include/linux/timerqueue_types.h @@ -6,12 +6,21 @@ #include struct timerqueue_node { - struct rb_node node; - ktime_t expires; + struct rb_node node; + ktime_t expires; }; struct timerqueue_head { - struct rb_root_cached rb_root; + struct rb_root_cached rb_root; +}; + +struct timerqueue_linked_node { + struct rb_node_linked node; + ktime_t expires; +}; + +struct timerqueue_linked_head { + struct rb_root_linked rb_root; }; #endif /* _LINUX_TIMERQUEUE_TYPES_H */ -- cgit v1.2.3 From b7418e6e9b87b849af4df93d527ff83498d1e4c3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:38:57 +0100 Subject: hrtimer: Use linked timerqueue To prepare for optimizing the rearming of enqueued timers, switch to the linked timerqueue. That allows to check whether the new expiry time changes the position of the timer in the RB tree or not, by checking the new expiry time against the previous and the next timers expiry. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.806643179@kernel.org --- include/linux/hrtimer_defs.h | 16 ++++++++-------- include/linux/hrtimer_types.h | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index fb38df4c0b64..0f851b2432c3 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -25,14 +25,14 @@ * @offset: offset of this clock to the monotonic base */ struct hrtimer_clock_base { - struct hrtimer_cpu_base *cpu_base; - unsigned int index; - clockid_t clockid; - seqcount_raw_spinlock_t seq; - ktime_t expires_next; - struct hrtimer *running; - struct timerqueue_head active; - ktime_t offset; + struct hrtimer_cpu_base *cpu_base; + unsigned int index; + clockid_t clockid; + seqcount_raw_spinlock_t seq; + ktime_t expires_next; + struct hrtimer *running; + struct timerqueue_linked_head active; + ktime_t offset; } __hrtimer_clock_base_align; enum hrtimer_base_type { diff --git a/include/linux/hrtimer_types.h b/include/linux/hrtimer_types.h index 0e22bc91d00f..b5dacc8271a4 100644 --- a/include/linux/hrtimer_types.h +++ b/include/linux/hrtimer_types.h @@ -17,7 +17,7 @@ enum hrtimer_restart { /** * struct hrtimer - the basic hrtimer structure - * @node: timerqueue node, which also manages node.expires, + * @node: Linked timerqueue node, which also manages node.expires, * the absolute expiry time in the hrtimers internal * representation. The time is related to the clock on * which the timer is based. Is setup by adding @@ -39,15 +39,15 @@ enum hrtimer_restart { * The hrtimer structure must be initialized by hrtimer_setup() */ struct hrtimer { - struct timerqueue_node node; - ktime_t _softexpires; - enum hrtimer_restart (*__private function)(struct hrtimer *); + struct timerqueue_linked_node node; struct hrtimer_clock_base *base; bool is_queued; bool is_rel; bool is_soft; bool is_hard; bool is_lazy; + ktime_t _softexpires; + enum hrtimer_restart (*__private function)(struct hrtimer *); }; #endif /* _LINUX_HRTIMER_TYPES_H */ -- cgit v1.2.3 From 38e18d825f7281fdc16d3241df5115ce6eaeaf79 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 25 Feb 2026 10:32:41 -0800 Subject: locking: Fix rwlock and spinlock lock context annotations Fix two incorrect rwlock_t lock context annotations. Add the raw_spinlock_t lock context annotations that are missing. Fixes: f16a802d402d ("locking/rwlock, spinlock: Support Clang's context analysis") Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Acked-by: Marco Elver Link: https://patch.msgid.link/20260225183244.4035378-2-bvanassche@acm.org --- include/linux/rwlock.h | 4 ++-- include/linux/rwlock_api_smp.h | 6 ++++-- include/linux/spinlock.h | 3 ++- 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h index 3390d21c95dd..21ceefc4a49f 100644 --- a/include/linux/rwlock.h +++ b/include/linux/rwlock.h @@ -30,10 +30,10 @@ do { \ #ifdef CONFIG_DEBUG_SPINLOCK extern void do_raw_read_lock(rwlock_t *lock) __acquires_shared(lock); - extern int do_raw_read_trylock(rwlock_t *lock); + extern int do_raw_read_trylock(rwlock_t *lock) __cond_acquires_shared(true, lock); extern void do_raw_read_unlock(rwlock_t *lock) __releases_shared(lock); extern void do_raw_write_lock(rwlock_t *lock) __acquires(lock); - extern int do_raw_write_trylock(rwlock_t *lock); +extern int do_raw_write_trylock(rwlock_t *lock) __cond_acquires(true, lock); extern void do_raw_write_unlock(rwlock_t *lock) __releases(lock); #else # define do_raw_read_lock(rwlock) do {__acquire_shared(lock); arch_read_lock(&(rwlock)->raw_lock); } while (0) diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h index 61a852609eab..9e02a5f28cd1 100644 --- a/include/linux/rwlock_api_smp.h +++ b/include/linux/rwlock_api_smp.h @@ -23,7 +23,7 @@ void __lockfunc _raw_write_lock_bh(rwlock_t *lock) __acquires(lock); void __lockfunc _raw_read_lock_irq(rwlock_t *lock) __acquires_shared(lock); void __lockfunc _raw_write_lock_irq(rwlock_t *lock) __acquires(lock); unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock) - __acquires(lock); + __acquires_shared(lock); unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock) __acquires(lock); int __lockfunc _raw_read_trylock(rwlock_t *lock) __cond_acquires_shared(true, lock); @@ -36,7 +36,7 @@ void __lockfunc _raw_read_unlock_irq(rwlock_t *lock) __releases_shared(lock); void __lockfunc _raw_write_unlock_irq(rwlock_t *lock) __releases(lock); void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) - __releases(lock); + __releases_shared(lock); void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) __releases(lock); @@ -116,6 +116,7 @@ _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) #endif static inline int __raw_read_trylock(rwlock_t *lock) + __cond_acquires_shared(true, lock) { preempt_disable(); if (do_raw_read_trylock(lock)) { @@ -127,6 +128,7 @@ static inline int __raw_read_trylock(rwlock_t *lock) } static inline int __raw_write_trylock(rwlock_t *lock) + __cond_acquires(true, lock) { preempt_disable(); if (do_raw_write_trylock(lock)) { diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index e1e2f144af9b..241277cd34cf 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -178,7 +178,7 @@ do { \ #ifdef CONFIG_DEBUG_SPINLOCK extern void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock); - extern int do_raw_spin_trylock(raw_spinlock_t *lock); + extern int do_raw_spin_trylock(raw_spinlock_t *lock) __cond_acquires(true, lock); extern void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock); #else static inline void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock) @@ -189,6 +189,7 @@ static inline void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock) } static inline int do_raw_spin_trylock(raw_spinlock_t *lock) + __cond_acquires(true, lock) { int ret = arch_spin_trylock(&(lock)->raw_lock); -- cgit v1.2.3 From 39be7b21af24d1d2ed3b18caac57dd219fef226e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 25 Feb 2026 10:32:42 -0800 Subject: signal: Fix the lock_task_sighand() annotation lock_task_sighand() may return NULL. Make this clear in its lock context annotation. Fixes: 04e49d926f43 ("sched: Enable context analysis for core.c and fair.c") Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Acked-by: Marco Elver Link: https://patch.msgid.link/20260225183244.4035378-3-bvanassche@acm.org --- include/linux/sched/signal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index a22248aebcf9..a4835a7de07e 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -739,7 +739,7 @@ static inline int thread_group_empty(struct task_struct *p) extern struct sighand_struct *lock_task_sighand(struct task_struct *task, unsigned long *flags) - __acquires(&task->sighand->siglock); + __cond_acquires(nonnull, &task->sighand->siglock); static inline void unlock_task_sighand(struct task_struct *task, unsigned long *flags) -- cgit v1.2.3 From 3dcef70e41ab13483803c536ddea8d5f1803ee25 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 25 Feb 2026 10:32:43 -0800 Subject: ww-mutex: Fix the ww_acquire_ctx function annotations The ww_acquire_done() call is optional. Reflect this in the annotations of ww_acquire_done(). Fixes: 47907461e4f6 ("locking/ww_mutex: Support Clang's context analysis") Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Acked-by: Maarten Lankhorst Acked-by: Marco Elver Link: https://patch.msgid.link/20260225183244.4035378-4-bvanassche@acm.org --- include/linux/ww_mutex.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h index 85b1fff02fde..0c95ead5a297 100644 --- a/include/linux/ww_mutex.h +++ b/include/linux/ww_mutex.h @@ -181,7 +181,7 @@ static inline void ww_acquire_init(struct ww_acquire_ctx *ctx, * data structures. */ static inline void ww_acquire_done(struct ww_acquire_ctx *ctx) - __releases(ctx) __acquires_shared(ctx) __no_context_analysis + __must_hold(ctx) { #ifdef DEBUG_WW_MUTEXES lockdep_assert_held(ctx); @@ -199,7 +199,7 @@ static inline void ww_acquire_done(struct ww_acquire_ctx *ctx) * mutexes have been released with ww_mutex_unlock. */ static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx) - __releases_shared(ctx) __no_context_analysis + __releases(ctx) __no_context_analysis { #ifdef CONFIG_DEBUG_LOCK_ALLOC mutex_release(&ctx->first_lock_dep_map, _THIS_IP_); -- cgit v1.2.3 From 4a91a33f15c634fb3477d122bdf1eef098d77ee3 Mon Sep 17 00:00:00 2001 From: Mallesh Koujalagi Date: Fri, 27 Feb 2026 14:24:01 +0530 Subject: workqueue: Update documentation as per system_percpu_wq naming Update documentation to use "per-CPU workqueue" instead of "global workqueue" to match the system_wq to system_percpu_wq rename. The workqueue behavior remains unchanged; this just aligns terminology with the clearer naming. Fixes: a2be943b46b4 ("workqueue: replace use of system_wq with system_percpu_wq") Signed-off-by: Mallesh Koujalagi Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index a4749f56398f..fc5744402a66 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -712,14 +712,14 @@ static inline bool schedule_work_on(int cpu, struct work_struct *work) } /** - * schedule_work - put work task in global workqueue + * schedule_work - put work task in per-CPU workqueue * @work: job to be done * - * Returns %false if @work was already on the kernel-global workqueue and + * Returns %false if @work was already on the system per-CPU workqueue and * %true otherwise. * - * This puts a job in the kernel-global workqueue if it was not already - * queued and leaves it in the same position on the kernel-global + * This puts a job in the system per-CPU workqueue if it was not already + * queued and leaves it in the same position on the system per-CPU * workqueue otherwise. * * Shares the same memory-ordering properties of queue_work(), cf. the @@ -796,12 +796,12 @@ extern void __warn_flushing_systemwide_wq(void) }) /** - * schedule_delayed_work_on - queue work in global workqueue on CPU after delay + * schedule_delayed_work_on - queue work in per-CPU workqueue on CPU after delay * @cpu: cpu to use * @dwork: job to be done * @delay: number of jiffies to wait * - * After waiting for a given time this puts a job in the kernel-global + * After waiting for a given time this puts a job in the system per-CPU * workqueue on the specified CPU. */ static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, @@ -811,11 +811,11 @@ static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, } /** - * schedule_delayed_work - put work task in global workqueue after delay + * schedule_delayed_work - put work task in per-CPU workqueue after delay * @dwork: job to be done * @delay: number of jiffies to wait or 0 for immediate execution * - * After waiting for a given time this puts a job in the kernel-global + * After waiting for a given time this puts a job in the system per-CPU * workqueue. */ static inline bool schedule_delayed_work(struct delayed_work *dwork, -- cgit v1.2.3 From e1092d5e15e6a9b168bf830af9a26d7ea17cd57d Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 24 Feb 2026 12:10:44 +0100 Subject: PCI/PTM: Do not enable PTM automatically for Root and Switch Upstream Ports Currently we enable PTM automatically for Root and Switch Upstream Ports if the advertised capabilities support the relevant role. However, there are a few issues with this. First of all, if there is no Endpoint that actually needs the PTM functionality, this is just wasting link bandwidth. There are just a couple of drivers calling pci_ptm_enable() in the tree. Secondly, we do the enablement in pci_ptm_init() that is called pretty early for the Switch Upstream Port before Downstream Ports are even enumerated. Since the Upstream Port configuration affects the whole Switch, enabling it this early might cause PTM requests to be sent. We actually do see effects of this: pcieport 0000:00:07.1: pciehp: Slot(6-1): Card present pcieport 0000:00:07.1: pciehp: Slot(6-1): Link Up pci 0000:2c:00.0: [8086:5786] type 01 class 0x060400 PCIe Switch Upstream Port ... pci 0000:2c:00.0: PTM enabled, 4ns granularity At this point we have only enumerated the Switch Upstream Port and now PTM got enabled which immediately triggers a flood of errors: pcieport 0000:00:07.1: AER: Multiple Uncorrectable (Non-Fatal) error message received from 0000:00:07.1 pcieport 0000:00:07.1: PCIe Bus Error: severity=Uncorrectable (Non-Fatal), type=Transaction Layer, (Receiver ID) pcieport 0000:00:07.1: device [8086:d44f] error status/mask=00200000/00000000 pcieport 0000:00:07.1: [21] ACSViol (First) pcieport 0000:00:07.1: AER: TLP Header: 0x34000000 0x00000052 0x00000000 0x00000000 pcieport 0000:00:07.1: AER: device recovery successful pcieport 0000:00:07.1: AER: Uncorrectable (Non-Fatal) error message received from 0000:00:07.1 In the above TLP Header the Requester ID is 0 which causes an error as we have ACS Source Validation enabled. Change the PTM enablement to happen at the time pci_enable_ptm() is called. It will try to enable PTM first for upstream devices before enabling for the Endpoint itself. For disable path we need to keep count of how many times PTM has been enabled and disable it only on the last, so change the dev->ptm_enabled to a counter (and rename it to dev->ptm_enable_cnt analogous to dev->pci_enable_cnt). Signed-off-by: Mika Westerberg Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/20260224111044.3487873-6-mika.westerberg@linux.intel.com --- include/linux/pci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 8aaa72dcb164..c620d4b6c52e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -518,7 +518,7 @@ struct pci_dev { unsigned int ptm_root:1; unsigned int ptm_responder:1; unsigned int ptm_requester:1; - unsigned int ptm_enabled:1; + atomic_t ptm_enable_cnt; u8 ptm_granularity; #endif #ifdef CONFIG_PCI_MSI -- cgit v1.2.3 From e6b899f08066e744f89df16ceb782e06868bd148 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 26 Feb 2026 14:50:09 +0100 Subject: nsfs: tighten permission checks for ns iteration ioctls Even privileged services should not necessarily be able to see other privileged service's namespaces so they can't leak information to each other. Use may_see_all_namespaces() helper that centralizes this policy until the nstree adapts. Link: https://patch.msgid.link/20260226-work-visibility-fixes-v1-1-d2c2853313bd@kernel.org Fixes: a1d220d9dafa ("nsfs: iterate through mount namespaces") Reviewed-by: Jeff Layton Cc: stable@kernel.org # v6.12+ Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 825f5865bfc5..c8e227a3f9e2 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -55,6 +55,8 @@ static __always_inline bool is_ns_init_id(const struct ns_common *ns) #define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) +bool may_see_all_namespaces(void); + static __always_inline __must_check int __ns_ref_active_read(const struct ns_common *ns) { return atomic_read(&ns->__ns_ref_active); -- cgit v1.2.3 From 11c0663a595801b6e6f7a937adec8532706ef486 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jens=20Emil=20Schulz=20=C3=98stergaard?= Date: Thu, 26 Feb 2026 09:24:19 +0100 Subject: net: phy: micrel: Add support for lan9645x internal phy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LAN9645X is a family of switch chips with 5 internal copper phys. The internal PHY is based on parts of LAN8832. This is a low-power, single port triple-speed (10BASE-T/100BASE-TX/1000BASE-T) ethernet physical layer transceiver (PHY) that supports transmission and reception of data on standard CAT-5, as well as CAT-5e and CAT-6 Unshielded Twisted Pair (UTP) cables. Add support for the internal PHY of the lan9645x chip family. Reviewed-by: Steen Hegelund Reviewed-by: Daniel Machon Signed-off-by: Jens Emil Schulz Østergaard Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20260226-phy_micrel_add_support_for_lan9645x_internal_phy-v3-1-1fe82379962b@microchip.com Signed-off-by: Jakub Kicinski --- include/linux/micrel_phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h index ca691641788b..9c6f9817383f 100644 --- a/include/linux/micrel_phy.h +++ b/include/linux/micrel_phy.h @@ -33,6 +33,7 @@ #define PHY_ID_LAN8804 0x00221670 #define PHY_ID_LAN8841 0x00221650 #define PHY_ID_LAN8842 0x002216C0 +#define PHY_ID_LAN9645X 0x002216D0 #define PHY_ID_KSZ886X 0x00221430 #define PHY_ID_KSZ8863 0x00221435 -- cgit v1.2.3 From 6466441a5ecd1c1168264e4c322bae455579b156 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Feb 2026 04:12:13 +0000 Subject: net: inline skb_add_rx_frag_netmem() This critical helper (via skb_add_rx_frag()) is mostly used from drivers rx fast path. It is time to inline it, this actually saves space in vmlinux: size vmlinux.old vmlinux text data bss dec hex filename 37350766 23092977 4846992 65290735 3e441ef vmlinux.old 37350600 23092977 4846992 65290569 3e44149 vmlinux Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260226041213.1892561-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index daa4e4944ce3..9cc98f850f1d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2682,8 +2682,17 @@ static inline void skb_fill_page_desc_noacc(struct sk_buff *skb, int i, shinfo->nr_frags = i + 1; } -void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem, - int off, int size, unsigned int truesize); +static inline void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, + netmem_ref netmem, int off, + int size, unsigned int truesize) +{ + DEBUG_NET_WARN_ON_ONCE(size > truesize); + + skb_fill_netmem_desc(skb, i, netmem, off, size); + skb->len += size; + skb->data_len += size; + skb->truesize += truesize; +} static inline void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, int size, -- cgit v1.2.3 From 15fba71533bcdfaa8eeba69a5a5a2927afdf664a Mon Sep 17 00:00:00 2001 From: Valentin Spreckels Date: Thu, 26 Feb 2026 20:54:09 +0100 Subject: net: usb: r8152: add TRENDnet TUC-ET2G The TRENDnet TUC-ET2G is a RTL8156 based usb ethernet adapter. Add its vendor and product IDs. Signed-off-by: Valentin Spreckels Link: https://patch.msgid.link/20260226195409.7891-2-valentin@spreckels.dev Signed-off-by: Jakub Kicinski --- include/linux/usb/r8152.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/usb/r8152.h b/include/linux/usb/r8152.h index 2ca60828f28b..1502b2a355f9 100644 --- a/include/linux/usb/r8152.h +++ b/include/linux/usb/r8152.h @@ -32,6 +32,7 @@ #define VENDOR_ID_DLINK 0x2001 #define VENDOR_ID_DELL 0x413c #define VENDOR_ID_ASUS 0x0b05 +#define VENDOR_ID_TRENDNET 0x20f4 #if IS_REACHABLE(CONFIG_USB_RTL8152) extern u8 rtl8152_get_version(struct usb_interface *intf); -- cgit v1.2.3 From 8021729acf21f4bf3c43866b8919b68968028478 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 25 Feb 2026 21:12:57 -0800 Subject: iio: tsl2772: fix all kernel-doc warnings Use the correct kernel-doc notation for struct members to eliminate kernel-doc warnings: Warning: include/linux/platform_data/tsl2772.h:88 struct member 'prox_diode' not described in 'tsl2772_settings' Warning: include/linux/platform_data/tsl2772.h:88 struct member 'prox_power' not described in 'tsl2772_settings' Signed-off-by: Randy Dunlap Reviewed-by: Andy Shevchenko Signed-off-by: Jonathan Cameron --- include/linux/platform_data/tsl2772.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/tsl2772.h b/include/linux/platform_data/tsl2772.h index f8ade15a35e2..f042e82b39c3 100644 --- a/include/linux/platform_data/tsl2772.h +++ b/include/linux/platform_data/tsl2772.h @@ -61,9 +61,9 @@ struct tsl2772_lux { * @prox_pulse_count: Number if proximity emitter pulses. * @prox_max_samples_cal: The number of samples that are taken when performing * a proximity calibration. - * @prox_diode Which diode(s) to use for driving the external + * @prox_diode: Which diode(s) to use for driving the external * LED(s) for proximity sensing. - * @prox_power The amount of power to use for the external LED(s). + * @prox_power: The amount of power to use for the external LED(s). */ struct tsl2772_settings { int als_time; -- cgit v1.2.3 From a2be37eedb52ea26938fa4cc9de1ff84963c57ad Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 24 Feb 2026 11:42:04 +0100 Subject: firmware: exynos-acpm: Drop fake 'const' on handle pointer All the functions operating on the 'handle' pointer are claiming it is a pointer to const thus they should not modify the handle. In fact that's a false statement, because first thing these functions do is drop the cast to const with container_of: struct acpm_info *acpm = handle_to_acpm_info(handle); And with such cast the handle is easily writable with simple: acpm->handle.ops.pmic_ops.read_reg = NULL; The code is not correct logically, either, because functions like acpm_get_by_node() and acpm_handle_put() are meant to modify the handle reference counting, thus they must modify the handle. Modification here happens anyway, even if the reference counting is stored in the container which the handle is part of. The code does not have actual visible bug, but incorrect 'const' annotations could lead to incorrect compiler decisions. Fixes: a88927b534ba ("firmware: add Exynos ACPM protocol driver") Cc: stable@vger.kernel.org Signed-off-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20260224104203.42950-2-krzysztof.kozlowski@oss.qualcomm.com Signed-off-by: Krzysztof Kozlowski --- .../linux/firmware/samsung/exynos-acpm-protocol.h | 40 +++++++++------------- 1 file changed, 17 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firmware/samsung/exynos-acpm-protocol.h b/include/linux/firmware/samsung/exynos-acpm-protocol.h index 2091da965a5a..13f17dc4443b 100644 --- a/include/linux/firmware/samsung/exynos-acpm-protocol.h +++ b/include/linux/firmware/samsung/exynos-acpm-protocol.h @@ -14,30 +14,24 @@ struct acpm_handle; struct device_node; struct acpm_dvfs_ops { - int (*set_rate)(const struct acpm_handle *handle, - unsigned int acpm_chan_id, unsigned int clk_id, - unsigned long rate); - unsigned long (*get_rate)(const struct acpm_handle *handle, + int (*set_rate)(struct acpm_handle *handle, unsigned int acpm_chan_id, + unsigned int clk_id, unsigned long rate); + unsigned long (*get_rate)(struct acpm_handle *handle, unsigned int acpm_chan_id, unsigned int clk_id); }; struct acpm_pmic_ops { - int (*read_reg)(const struct acpm_handle *handle, - unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, - u8 *buf); - int (*bulk_read)(const struct acpm_handle *handle, - unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, - u8 count, u8 *buf); - int (*write_reg)(const struct acpm_handle *handle, - unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, - u8 value); - int (*bulk_write)(const struct acpm_handle *handle, - unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, - u8 count, const u8 *buf); - int (*update_reg)(const struct acpm_handle *handle, - unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, - u8 value, u8 mask); + int (*read_reg)(struct acpm_handle *handle, unsigned int acpm_chan_id, + u8 type, u8 reg, u8 chan, u8 *buf); + int (*bulk_read)(struct acpm_handle *handle, unsigned int acpm_chan_id, + u8 type, u8 reg, u8 chan, u8 count, u8 *buf); + int (*write_reg)(struct acpm_handle *handle, unsigned int acpm_chan_id, + u8 type, u8 reg, u8 chan, u8 value); + int (*bulk_write)(struct acpm_handle *handle, unsigned int acpm_chan_id, + u8 type, u8 reg, u8 chan, u8 count, const u8 *buf); + int (*update_reg)(struct acpm_handle *handle, unsigned int acpm_chan_id, + u8 type, u8 reg, u8 chan, u8 value, u8 mask); }; struct acpm_ops { @@ -56,12 +50,12 @@ struct acpm_handle { struct device; #if IS_ENABLED(CONFIG_EXYNOS_ACPM_PROTOCOL) -const struct acpm_handle *devm_acpm_get_by_node(struct device *dev, - struct device_node *np); +struct acpm_handle *devm_acpm_get_by_node(struct device *dev, + struct device_node *np); #else -static inline const struct acpm_handle *devm_acpm_get_by_node(struct device *dev, - struct device_node *np) +static inline struct acpm_handle *devm_acpm_get_by_node(struct device *dev, + struct device_node *np) { return NULL; } -- cgit v1.2.3 From 2a76a626670b2ef391da37f457e8e51f168432a6 Mon Sep 17 00:00:00 2001 From: Taha Ed-Dafili <0rayn.dev@gmail.com> Date: Thu, 26 Feb 2026 15:11:03 +0000 Subject: iio: core: Add IIO_EV_INFO_SCALE to event info Implement support for IIO_EV_INFO_SCALE in the internal enum iio_event_info to allow proper ABI compliance. This allows drivers (like the ADXL345) to expose event scale attributes using the standard IIO ABI rather than manual device attributes. Signed-off-by: Taha Ed-Dafili <0rayn.dev@gmail.com> Reviewed-by: David Lechner Signed-off-by: Jonathan Cameron --- include/linux/iio/types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h index 34eebad12d2c..4e3099defc1d 100644 --- a/include/linux/iio/types.h +++ b/include/linux/iio/types.h @@ -21,6 +21,7 @@ enum iio_event_info { IIO_EV_INFO_TAP2_MIN_DELAY, IIO_EV_INFO_RUNNING_PERIOD, IIO_EV_INFO_RUNNING_COUNT, + IIO_EV_INFO_SCALE, }; #define IIO_VAL_INT 1 -- cgit v1.2.3 From d3b693a13b39bce16e284e1c737874966b3a96de Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 28 Feb 2026 17:47:43 -0800 Subject: spi: spi-mem: clean up kernel-doc in spi-mem.h Eliminate all kernel-doc warnings in spi-mem.h: - add missing struct member descriptions - don't use "struct" for function descrptions Warning: include/linux/spi/spi-mem.h:202 struct member 'cmd' not described in 'spi_mem_op' Warning: include/linux/spi/spi-mem.h:202 struct member 'addr' not described in 'spi_mem_op' Warning: include/linux/spi/spi-mem.h:202 struct member 'dummy' not described in 'spi_mem_op' Warning: include/linux/spi/spi-mem.h:202 struct member 'data' not described in 'spi_mem_op' Warning: include/linux/spi/spi-mem.h:286 Incorrect use of kernel-doc format: * struct spi_mem_get_drvdata() - get driver private data attached to a SPI mem Warning: include/linux/spi/spi-mem.h:298 Incorrect use of kernel-doc format: * struct spi_controller_mem_ops - SPI memory operations Warning: include/linux/spi/spi-mem.h:362 expecting prototype for struct spi_mem_set_drvdata. Prototype was for struct spi_controller_mem_ops instead Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260301014743.3133167-1-rdunlap@infradead.org Signed-off-by: Mark Brown --- include/linux/spi/spi-mem.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h index 5774e554c0f0..37f709784350 100644 --- a/include/linux/spi/spi-mem.h +++ b/include/linux/spi/spi-mem.h @@ -130,11 +130,13 @@ enum spi_mem_data_dir { /** * struct spi_mem_op - describes a SPI memory operation + * @cmd: the complete command * @cmd.nbytes: number of opcode bytes (only 1 or 2 are valid). The opcode is * sent MSB-first. * @cmd.buswidth: number of IO lines used to transmit the command * @cmd.opcode: operation opcode * @cmd.dtr: whether the command opcode should be sent in DTR mode or not + * @addr: the address attributes * @addr.nbytes: number of address bytes to send. Can be zero if the operation * does not need to send an address * @addr.buswidth: number of IO lines used to transmit the address cycles @@ -143,10 +145,12 @@ enum spi_mem_data_dir { * Note that only @addr.nbytes are taken into account in this * address value, so users should make sure the value fits in the * assigned number of bytes. + * @dummy: data for dummy operation * @dummy.nbytes: number of dummy bytes to send after an opcode or address. Can * be zero if the operation does not require dummy bytes * @dummy.buswidth: number of IO lanes used to transmit the dummy bytes * @dummy.dtr: whether the dummy bytes should be sent in DTR mode or not + * @data: the data attributes * @data.buswidth: number of IO lanes used to send/receive the data * @data.dtr: whether the data should be sent in DTR mode or not * @data.ecc: whether error correction is required or not @@ -273,7 +277,7 @@ struct spi_mem { }; /** - * struct spi_mem_set_drvdata() - attach driver private data to a SPI mem + * spi_mem_set_drvdata() - attach driver private data to a SPI mem * device * @mem: memory device * @data: data to attach to the memory device @@ -284,7 +288,7 @@ static inline void spi_mem_set_drvdata(struct spi_mem *mem, void *data) } /** - * struct spi_mem_get_drvdata() - get driver private data attached to a SPI mem + * spi_mem_get_drvdata() - get driver private data attached to a SPI mem * device * @mem: memory device * -- cgit v1.2.3 From 73942a6ea26bd7e02b7c260b8b7aa942397be894 Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Tue, 24 Feb 2026 16:18:05 +0000 Subject: firmware: cs_dsp: Add API to hibernate the DSP For some parts, the DSP is kept running when in low power mode (hibernation), leaving the firmware ALSA controls enabled, but the registers are inaccessible. Attempts to access volatile firmware controls whilst in this state would produce errors in the kernel log due to a regmap_raw_read() into DSP registers whilst the regmap is in cache_only. To remove this error log, add a hibernating flag to indicate that the controls are inaccessible, so we no longer try to read or write to the registers whilst the regmap is in cache_only. This would still produce an error when trying to read or write to these controls, but this would be a different error (-EPERM instead of -EBUSY), and would not produce a spurious error log in the kernel. Upon wake from hibernation, the control caches are re-synced to the hardware, if the DSP is running. Signed-off-by: Stefan Binding Link: https://patch.msgid.link/20260224161821.93365-2-sbinding@opensource.cirrus.com Reviewed-by: Richard Fitzgerald Signed-off-by: Mark Brown --- include/linux/firmware/cirrus/cs_dsp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/cirrus/cs_dsp.h b/include/linux/firmware/cirrus/cs_dsp.h index 0ec1cdc5585d..4e3baa557068 100644 --- a/include/linux/firmware/cirrus/cs_dsp.h +++ b/include/linux/firmware/cirrus/cs_dsp.h @@ -179,6 +179,7 @@ struct cs_dsp { bool booted; bool running; + bool hibernating; struct list_head ctl_list; @@ -354,4 +355,6 @@ int cs_dsp_chunk_write(struct cs_dsp_chunk *ch, int nbits, u32 val); int cs_dsp_chunk_flush(struct cs_dsp_chunk *ch); int cs_dsp_chunk_read(struct cs_dsp_chunk *ch, int nbits); +void cs_dsp_hibernate(struct cs_dsp *dsp, bool hibernating); + #endif -- cgit v1.2.3 From bd77375097357b46af00db1316ceab5e82ccbc8b Mon Sep 17 00:00:00 2001 From: Kavita Kavita Date: Fri, 27 Feb 2026 00:25:51 +0530 Subject: wifi: cfg80211: add support for IEEE 802.1X Authentication Protocol Add an extended feature flag NL80211_EXT_FEATURE_IEEE8021X_AUTH to allow a driver to indicate support for the IEEE 802.1X authentication protocol in non-AP STA mode, as defined in "IEEE P802.11bi/D4.0, 12.16.5". In case of SME in userspace, the Authentication frame body is prepared in userspace while the driver finalizes the Authentication frame once it receives the required fields and elements. The driver indicates support for IEEE 802.1X authentication using the extended feature flag so that userspace can initiate IEEE 802.1X authentication. When the feature flag is set, process IEEE 802.1X Authentication frames from userspace in non-AP STA mode. If the flag is not set, reject IEEE 802.1X Authentication frames. Define a new authentication type NL80211_AUTHTYPE_IEEE8021X for IEEE 802.1X authentication. Signed-off-by: Kavita Kavita Link: https://patch.msgid.link/20260226185553.1516290-4-kavita.kavita@oss.qualcomm.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 0aa2fb8f88de..1bf806f85372 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1358,6 +1358,7 @@ struct ieee80211_tdls_data { #define WLAN_AUTH_FILS_SK 4 #define WLAN_AUTH_FILS_SK_PFS 5 #define WLAN_AUTH_FILS_PK 6 +#define WLAN_AUTH_IEEE8021X 8 #define WLAN_AUTH_EPPKE 9 #define WLAN_AUTH_LEAP 128 -- cgit v1.2.3 From 9347878b1513beee1a26bb249f5dc8326d450f75 Mon Sep 17 00:00:00 2001 From: Kavita Kavita Date: Fri, 27 Feb 2026 00:25:52 +0530 Subject: wifi: mac80211: Add support for IEEE 802.1X authentication protocol in non-AP STA mode Add support for the IEEE 802.1X authentication protocol in non-AP STA mode, as specified in "IEEE P802.11bi/D4.0, 12.16.5". IEEE 802.1X authentication involves multiple Authentication frame exchanges, with the non-AP STA and AP alternating transaction sequence numbers. The number of Authentication frame exchanges depends on the EAP method in use. For IEEE 802.1X authentication, process only Authentication frames with the expected transaction sequence number. For IEEE 802.1X Authentication, Table 9-71 specifies that the Encapsulation Length field as specified in Clause 9.4.1.82 shall be present in all IEEE 802.1X Authentication frames. Drop the frame in the mac80211 if the Encapsulation Length field is missing. After receiving the final Authentication frame with status code WLAN_STATUS_8021X_AUTH_SUCCESS from the AP, mac80211 marks the state as authenticated, as it indicates the EAP handshake has completed successfully over the Authentication frames as specified in Clause 12.16.5. In the PMKSA caching case, only two Authentication frames are exchanged if the AP identifies a valid PMKSA, then as specified in Clause 12.16.8.3, the AP shall set the Status Code to WLAN_STATUS_SUCCESS in the final Authentication frame and must not include an encapsulated EAPOL PDU. This frame will be the final Authentication frame from the AP when PMKSA caching is enabled, and mac80211 marks the state as authenticated. In case of authentication success or failure, forward the Authentication frame to userspace(e.g. wpa_supplicant), and let userspace validate the Authentication frame from the AP as per the specification. Signed-off-by: Kavita Kavita Link: https://patch.msgid.link/20260226185553.1516290-5-kavita.kavita@oss.qualcomm.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 1bf806f85372..3651b2e6c518 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1508,6 +1508,7 @@ enum ieee80211_statuscode { WLAN_STATUS_SAE_PK = 127, WLAN_STATUS_DENIED_TID_TO_LINK_MAPPING = 133, WLAN_STATUS_PREF_TID_TO_LINK_MAPPING_SUGGESTED = 134, + WLAN_STATUS_8021X_AUTH_SUCCESS = 153, }; -- cgit v1.2.3 From 8a9ebe8c3ca4c5bdad8f010656f4c2155da589fd Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 28 Feb 2026 17:48:22 -0800 Subject: gpio: timberdale: repair kernel-doc comments Use a ':' after struct member names to avoid kernel-doc warnings: Warning: include/linux/timb_gpio.h:22 struct member 'gpio_base' not described in 'timbgpio_platform_data' Warning: include/linux/timb_gpio.h:22 struct member 'nr_pins' not described in 'timbgpio_platform_data' Warning: include/linux/timb_gpio.h:22 struct member 'irq_base' not described in 'timbgpio_platform_data' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260301014822.3133268-1-rdunlap@infradead.org Signed-off-by: Bartosz Golaszewski --- include/linux/timb_gpio.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timb_gpio.h b/include/linux/timb_gpio.h index 3faf5a6bb13e..74f5e73bf6db 100644 --- a/include/linux/timb_gpio.h +++ b/include/linux/timb_gpio.h @@ -9,10 +9,10 @@ /** * struct timbgpio_platform_data - Platform data of the Timberdale GPIO driver - * @gpio_base The number of the first GPIO pin, set to -1 for + * @gpio_base: The number of the first GPIO pin, set to -1 for * dynamic number allocation. - * @nr_pins Number of pins that is supported by the hardware (1-32) - * @irq_base If IRQ is supported by the hardware, this is the base + * @nr_pins: Number of pins that is supported by the hardware (1-32) + * @irq_base: If IRQ is supported by the hardware, this is the base * number of IRQ:s. One IRQ per pin will be used. Set to * -1 if IRQ:s is not supported. */ -- cgit v1.2.3 From 189645ba9cd9c1eed45151aacaae4347c1eb86a7 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 28 Feb 2026 17:48:11 -0800 Subject: gpio: nomadik: repair some kernel-doc comments Avoid these kernel-doc warnings by: - adding short descriptions for enums - using correct (matching) struct names in kernel-doc short descriptions - using the correct struct member name for @nfunctions Warning: include/linux/gpio/gpio-nomadik.h:116 missing initial short description on line: * enum prcm_gpiocr_reg_index Warning: include/linux/gpio/gpio-nomadik.h:125 missing initial short description on line: * enum prcm_gpiocr_altcx_index Warning: include/linux/gpio/gpio-nomadik.h:146 expecting prototype for struct prcm_gpio_altcx. Prototype was for struct prcm_gpiocr_altcx instead Warning: include/linux/gpio/gpio-nomadik.h:156 expecting prototype for struct prcm_gpio_altcx_pin_desc. Prototype was for struct prcm_gpiocr_altcx_pin_desc instead Warning: include/linux/gpio/gpio-nomadik.h:212 struct member 'nfunctions' not described in 'nmk_pinctrl_soc_data' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260301014811.3133250-1-rdunlap@infradead.org Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/gpio-nomadik.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/gpio-nomadik.h b/include/linux/gpio/gpio-nomadik.h index 592a774a53cd..8061b9826361 100644 --- a/include/linux/gpio/gpio-nomadik.h +++ b/include/linux/gpio/gpio-nomadik.h @@ -114,8 +114,7 @@ struct nmk_gpio_chip { } /** - * enum prcm_gpiocr_reg_index - * Used to reference an PRCM GPIOCR register address. + * enum prcm_gpiocr_reg_index - Used to reference a PRCM GPIOCR register address. */ enum prcm_gpiocr_reg_index { PRCM_IDX_GPIOCR1, @@ -123,8 +122,7 @@ enum prcm_gpiocr_reg_index { PRCM_IDX_GPIOCR3 }; /** - * enum prcm_gpiocr_altcx_index - * Used to reference an Other alternate-C function. + * enum prcm_gpiocr_altcx_index - Used to reference an Other alternate-C function. */ enum prcm_gpiocr_altcx_index { PRCM_IDX_GPIOCR_ALTC1, @@ -135,7 +133,7 @@ enum prcm_gpiocr_altcx_index { }; /** - * struct prcm_gpio_altcx - Other alternate-C function + * struct prcm_gpiocr_altcx - Other alternate-C function * @used: other alternate-C function availability * @reg_index: PRCM GPIOCR register index used to control the function * @control_bit: PRCM GPIOCR bit used to control the function @@ -147,7 +145,7 @@ struct prcm_gpiocr_altcx { } __packed; /** - * struct prcm_gpio_altcx_pin_desc - Other alternate-C pin + * struct prcm_gpiocr_altcx_pin_desc - Other alternate-C pin * @pin: The pin number * @altcx: array of other alternate-C[1-4] functions */ @@ -193,7 +191,7 @@ struct nmk_pingroup { * numbering. * @npins: The number of entries in @pins. * @functions: The functions supported on this SoC. - * @nfunction: The number of entries in @functions. + * @nfunctions: The number of entries in @functions. * @groups: An array describing all pin groups the pin SoC supports. * @ngroups: The number of entries in @groups. * @altcx_pins: The pins that support Other alternate-C function on this SoC -- cgit v1.2.3 From 25ab7b6f34c74ea555b4489b57f7219612991433 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 16 Feb 2026 14:32:02 +0100 Subject: xattr: remove rbtree-based simple_xattr infrastructure Now that all consumers (shmem, kernfs, pidfs) have been converted to use the rhashtable-based simple_xattrs with pointer-based lazy allocation, remove the legacy rbtree code path. The rhashtable implementation provides O(1) average-case lookup with RCU-based lockless reads, replacing the O(log n) rbtree with reader-writer spinlock contention. Link: https://patch.msgid.link/20260216-work-xattr-socket-v1-6-c2efa4f74cb7@kernel.org Acked-by: Darrick J. Wong Signed-off-by: Christian Brauner --- include/linux/xattr.h | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 6e619e185e90..3b5a5fd684eb 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -107,18 +107,10 @@ static inline const char *xattr_prefix(const struct xattr_handler *handler) } struct simple_xattrs { - bool use_rhashtable; - union { - struct { - struct rb_root rb_root; - rwlock_t lock; - }; - struct rhashtable ht; - }; + struct rhashtable ht; }; struct simple_xattr { - struct rb_node rb_node; struct rhash_head hash_node; struct rcu_head rcu; char *name; @@ -126,7 +118,7 @@ struct simple_xattr { char value[] __counted_by(size); }; -void simple_xattrs_init(struct simple_xattrs *xattrs); +int simple_xattrs_init(struct simple_xattrs *xattrs); struct simple_xattrs *simple_xattrs_alloc(void); struct simple_xattrs *simple_xattrs_lazy_alloc(struct simple_xattrs **xattrsp, const void *value, int flags); -- cgit v1.2.3 From 4fbe9e78bb415dd632ff63a9f620af0be58ef820 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 16 Feb 2026 14:32:05 +0100 Subject: xattr: move user limits for xattrs to generic infra Link: https://patch.msgid.link/20260216-work-xattr-socket-v1-9-c2efa4f74cb7@kernel.org Acked-by: Darrick J. Wong Signed-off-by: Christian Brauner --- include/linux/kernfs.h | 2 -- include/linux/xattr.h | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index b5a5f32fdfd1..d8f57f0af5e4 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -99,8 +99,6 @@ enum kernfs_node_type { #define KERNFS_TYPE_MASK 0x000f #define KERNFS_FLAG_MASK ~KERNFS_TYPE_MASK -#define KERNFS_MAX_USER_XATTRS 128 -#define KERNFS_USER_XATTR_SIZE_LIMIT (128 << 10) enum kernfs_node_flag { KERNFS_ACTIVATED = 0x0010, diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 3b5a5fd684eb..8b6601367eae 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -118,6 +118,20 @@ struct simple_xattr { char value[] __counted_by(size); }; +#define SIMPLE_XATTR_MAX_NR 128 +#define SIMPLE_XATTR_MAX_SIZE (128 << 10) + +struct simple_xattr_limits { + atomic_t nr_xattrs; /* current user.* xattr count */ + atomic_t xattr_size; /* current total user.* value bytes */ +}; + +static inline void simple_xattr_limits_init(struct simple_xattr_limits *limits) +{ + atomic_set(&limits->nr_xattrs, 0); + atomic_set(&limits->xattr_size, 0); +} + int simple_xattrs_init(struct simple_xattrs *xattrs); struct simple_xattrs *simple_xattrs_alloc(void); struct simple_xattrs *simple_xattrs_lazy_alloc(struct simple_xattrs **xattrsp, @@ -132,6 +146,10 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, const char *name, const void *value, size_t size, int flags); +int simple_xattr_set_limited(struct simple_xattrs *xattrs, + struct simple_xattr_limits *limits, + const char *name, const void *value, + size_t size, int flags); ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size); int simple_xattr_add(struct simple_xattrs *xattrs, -- cgit v1.2.3 From cc2f5e2aeb6c69556837e45756b3ddded98b3898 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 28 Feb 2026 17:48:02 -0800 Subject: pinctrl: pinconf-generic: fix an enum name description Correct an enum name in a kernel-doc comment to avoid kernel-doc warnings: Warning: include/linux/pinctrl/pinconf-generic.h:161 Enum value 'PIN_CONFIG_SKEW_DELAY_OUTPUT_PS' not described in enum 'pin_config_param' Warning: include/linux/pinctrl/pinconf-generic.h:161 Excess enum value '@PIN_CONFIG_SKEW_DELAY_OUPUT_PS' description in 'pin_config_param' Signed-off-by: Randy Dunlap Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinconf-generic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index 89277808ea61..531dc3e9b3f7 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -115,7 +115,7 @@ struct pinctrl_map; * @PIN_CONFIG_SKEW_DELAY_INPUT_PS: if the pin has independent values for the * programmable skew rate (on inputs) and latch delay (on outputs), then * this parameter specifies the clock skew only. The argument is in ps. - * @PIN_CONFIG_SKEW_DELAY_OUPUT_PS: if the pin has independent values for the + * @PIN_CONFIG_SKEW_DELAY_OUTPUT_PS: if the pin has independent values for the * programmable skew rate (on inputs) and latch delay (on outputs), then * this parameter specifies the latch delay only. The argument is in ps. * @PIN_CONFIG_SLEEP_HARDWARE_STATE: indicate this is sleep related state. -- cgit v1.2.3 From 94798081732abfb5748471d5c3cced6ff187fa36 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 13 Feb 2026 18:52:43 -0800 Subject: driver core: platform: add kerneldoc to struct platform_device_info Add kernel documentation for struct platform_device_info and its individual members. While at it remove an extra indent level from the structure definition. Signed-off-by: Dmitry Torokhov Reviewed-by: Bartosz Golaszewski Link: https://patch.msgid.link/20260214025246.2095239-2-dmitry.torokhov@gmail.com Signed-off-by: Danilo Krummrich --- include/linux/platform_device.h | 53 ++++++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 813da101b5bf..5f54217930e1 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -118,22 +118,53 @@ extern int platform_get_irq_byname_optional(struct platform_device *dev, const char *name); extern int platform_add_devices(struct platform_device **, int); +/** + * struct platform_device_info - set of parameters for creating a platform device + * @parent: parent device for the new platform device. + * @fwnode: firmware node associated with the device. + * @of_node_reused: indicates that device tree node associated with the device + * is shared with another device, typically its ancestor. Setting this to + * %true prevents the device from being matched via the OF match table, + * and stops the device core from automatically binding pinctrl + * configuration to avoid disrupting the other device. + * @name: name of the device. + * @id: instance ID of the device. Use %PLATFORM_DEVID_NONE if there is only + * one instance of the device, or %PLATFORM_DEVID_AUTO to let the + * kernel automatically assign a unique instance ID. + * @res: set of resources to attach to the device. + * @num_res: number of entries in @res. + * @data: device-specific data for this platform device. + * @size_data: size of device-specific data. + * @dma_mask: DMA mask for the device. + * @properties: a set of software properties for the device. If provided, + * a managed software node will be automatically created and + * assigned to the device. The properties array must be terminated + * with a sentinel entry. + * + * This structure is used to hold information needed to create and register + * a platform device using platform_device_register_full(). + * + * platform_device_register_full() makes deep copies of @name, @res, @data and + * @properties, so the caller does not need to keep them after registration. + * If the registration is performed during initialization, these can be marked + * as __initconst. + */ struct platform_device_info { - struct device *parent; - struct fwnode_handle *fwnode; - bool of_node_reused; + struct device *parent; + struct fwnode_handle *fwnode; + bool of_node_reused; - const char *name; - int id; + const char *name; + int id; - const struct resource *res; - unsigned int num_res; + const struct resource *res; + unsigned int num_res; - const void *data; - size_t size_data; - u64 dma_mask; + const void *data; + size_t size_data; + u64 dma_mask; - const struct property_entry *properties; + const struct property_entry *properties; }; extern struct platform_device *platform_device_register_full( const struct platform_device_info *pdevinfo); -- cgit v1.2.3 From 0fc434bc2c45fceb9356f2138911db0f454b8ca6 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 13 Feb 2026 18:52:44 -0800 Subject: driver core: platform: allow attaching software nodes when creating devices Extend platform_device_info structure with an optional pointer to a software node to be used as a secondary firmware node for the device being created. If software node has not been registered yet it will be automatically registered. This reduces boilerplate needed when switching legacy board code to static device properties/GPIO references. Signed-off-by: Dmitry Torokhov Reviewed-by: Bartosz Golaszewski Link: https://patch.msgid.link/20260214025246.2095239-3-dmitry.torokhov@gmail.com Signed-off-by: Danilo Krummrich --- include/linux/platform_device.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 5f54217930e1..754e4bf2771a 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -136,10 +136,14 @@ extern int platform_add_devices(struct platform_device **, int); * @data: device-specific data for this platform device. * @size_data: size of device-specific data. * @dma_mask: DMA mask for the device. + * @swnode: a secondary software node to be attached to the device. The node + * will be automatically registered and its lifetime tied to the platform + * device if it is not registered yet. * @properties: a set of software properties for the device. If provided, * a managed software node will be automatically created and * assigned to the device. The properties array must be terminated - * with a sentinel entry. + * with a sentinel entry. Specifying both @properties and @swnode is not + * allowed. * * This structure is used to hold information needed to create and register * a platform device using platform_device_register_full(). @@ -164,6 +168,7 @@ struct platform_device_info { size_t size_data; u64 dma_mask; + const struct software_node *swnode; const struct property_entry *properties; }; extern struct platform_device *platform_device_register_full( -- cgit v1.2.3 From b570f37a2ce480be26c665345c5514686a8a0274 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Tue, 10 Feb 2026 12:56:53 +0100 Subject: mm: Fix a hmm_range_fault() livelock / starvation problem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If hmm_range_fault() fails a folio_trylock() in do_swap_page, trying to acquire the lock of a device-private folio for migration, to ram, the function will spin until it succeeds grabbing the lock. However, if the process holding the lock is depending on a work item to be completed, which is scheduled on the same CPU as the spinning hmm_range_fault(), that work item might be starved and we end up in a livelock / starvation situation which is never resolved. This can happen, for example if the process holding the device-private folio lock is stuck in migrate_device_unmap()->lru_add_drain_all() sinc lru_add_drain_all() requires a short work-item to be run on all online cpus to complete. A prerequisite for this to happen is: a) Both zone device and system memory folios are considered in migrate_device_unmap(), so that there is a reason to call lru_add_drain_all() for a system memory folio while a folio lock is held on a zone device folio. b) The zone device folio has an initial mapcount > 1 which causes at least one migration PTE entry insertion to be deferred to try_to_migrate(), which can happen after the call to lru_add_drain_all(). c) No or voluntary only preemption. This all seems pretty unlikely to happen, but indeed is hit by the "xe_exec_system_allocator" igt test. Resolve this by waiting for the folio to be unlocked if the folio_trylock() fails in do_swap_page(). Rename migration_entry_wait_on_locked() to softleaf_entry_wait_unlock() and update its documentation to indicate the new use-case. Future code improvements might consider moving the lru_add_drain_all() call in migrate_device_unmap() to be called *after* all pages have migration entries inserted. That would eliminate also b) above. v2: - Instead of a cond_resched() in hmm_range_fault(), eliminate the problem by waiting for the folio to be unlocked in do_swap_page() (Alistair Popple, Andrew Morton) v3: - Add a stub migration_entry_wait_on_locked() for the !CONFIG_MIGRATION case. (Kernel Test Robot) v4: - Rename migrate_entry_wait_on_locked() to softleaf_entry_wait_on_locked() and update docs (Alistair Popple) v5: - Add a WARN_ON_ONCE() for the !CONFIG_MIGRATION version of softleaf_entry_wait_on_locked(). - Modify wording around function names in the commit message (Andrew Morton) Suggested-by: Alistair Popple Fixes: 1afaeb8293c9 ("mm/migrate: Trylock device page in do_swap_page") Cc: Ralph Campbell Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Leon Romanovsky Cc: Andrew Morton Cc: Matthew Brost Cc: John Hubbard Cc: Alistair Popple Cc: linux-mm@kvack.org Cc: Signed-off-by: Thomas Hellström Cc: # v6.15+ Reviewed-by: John Hubbard #v3 Reviewed-by: Alistair Popple Link: https://patch.msgid.link/20260210115653.92413-1-thomas.hellstrom@linux.intel.com (cherry picked from commit a69d1ab971a624c6f112cea61536569d579c3215) Signed-off-by: Rodrigo Vivi --- include/linux/migrate.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 26ca00c325d9..d5af2b7f577b 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -65,7 +65,7 @@ bool isolate_folio_to_list(struct folio *folio, struct list_head *list); int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src); -void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) +void softleaf_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) __releases(ptl); void folio_migrate_flags(struct folio *newfolio, struct folio *folio); int folio_migrate_mapping(struct address_space *mapping, @@ -97,6 +97,14 @@ static inline int set_movable_ops(const struct movable_operations *ops, enum pag return -ENOSYS; } +static inline void softleaf_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) + __releases(ptl) +{ + WARN_ON_ONCE(1); + + spin_unlock(ptl); +} + #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_NUMA_BALANCING -- cgit v1.2.3 From af4e9ef3d78420feb8fe58cd9a1ab80c501b3c08 Mon Sep 17 00:00:00 2001 From: David Laight Date: Mon, 2 Mar 2026 13:27:51 +0000 Subject: uaccess: Fix scoped_user_read_access() for 'pointer to const' If a 'const struct foo __user *ptr' is used for the address passed to scoped_user_read_access() then you get a warning/error uaccess.h:691:1: error: initialization discards 'const' qualifier from pointer target type [-Werror=discarded-qualifiers] for the void __user *_tmpptr = __scoped_user_access_begin(mode, uptr, size, elbl) assignment. Fix by using 'auto' for both _tmpptr and the redeclaration of uptr. Replace the CLASS() with explicit __cleanup() functions on uptr. Fixes: e497310b4ffb ("uaccess: Provide scoped user access regions") Signed-off-by: David Laight Reviewed-and-tested-by: Christophe Leroy (CS GROUP) Signed-off-by: Linus Torvalds --- include/linux/uaccess.h | 54 ++++++++++++++++++------------------------------- 1 file changed, 20 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 1f3804245c06..809e4f7dfdbd 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -647,36 +647,22 @@ static inline void user_access_restore(unsigned long flags) { } /* Define RW variant so the below _mode macro expansion works */ #define masked_user_rw_access_begin(u) masked_user_access_begin(u) #define user_rw_access_begin(u, s) user_access_begin(u, s) -#define user_rw_access_end() user_access_end() /* Scoped user access */ -#define USER_ACCESS_GUARD(_mode) \ -static __always_inline void __user * \ -class_user_##_mode##_begin(void __user *ptr) \ -{ \ - return ptr; \ -} \ - \ -static __always_inline void \ -class_user_##_mode##_end(void __user *ptr) \ -{ \ - user_##_mode##_access_end(); \ -} \ - \ -DEFINE_CLASS(user_ ##_mode## _access, void __user *, \ - class_user_##_mode##_end(_T), \ - class_user_##_mode##_begin(ptr), void __user *ptr) \ - \ -static __always_inline class_user_##_mode##_access_t \ -class_user_##_mode##_access_ptr(void __user *scope) \ -{ \ - return scope; \ -} -USER_ACCESS_GUARD(read) -USER_ACCESS_GUARD(write) -USER_ACCESS_GUARD(rw) -#undef USER_ACCESS_GUARD +/* Cleanup wrapper functions */ +static __always_inline void __scoped_user_read_access_end(const void *p) +{ + user_read_access_end(); +}; +static __always_inline void __scoped_user_write_access_end(const void *p) +{ + user_write_access_end(); +}; +static __always_inline void __scoped_user_rw_access_end(const void *p) +{ + user_access_end(); +}; /** * __scoped_user_access_begin - Start a scoped user access @@ -750,13 +736,13 @@ USER_ACCESS_GUARD(rw) * * Don't use directly. Use scoped_masked_user_$MODE_access() instead. */ -#define __scoped_user_access(mode, uptr, size, elbl) \ -for (bool done = false; !done; done = true) \ - for (void __user *_tmpptr = __scoped_user_access_begin(mode, uptr, size, elbl); \ - !done; done = true) \ - for (CLASS(user_##mode##_access, scope)(_tmpptr); !done; done = true) \ - /* Force modified pointer usage within the scope */ \ - for (const typeof(uptr) uptr = _tmpptr; !done; done = true) +#define __scoped_user_access(mode, uptr, size, elbl) \ +for (bool done = false; !done; done = true) \ + for (auto _tmpptr = __scoped_user_access_begin(mode, uptr, size, elbl); \ + !done; done = true) \ + /* Force modified pointer usage within the scope */ \ + for (const auto uptr __cleanup(__scoped_user_##mode##_access_end) = \ + _tmpptr; !done; done = true) /** * scoped_user_read_access_size - Start a scoped user read access with given size -- cgit v1.2.3 From ba51cf9fcf511df8c7026feda4b7d65999d3517c Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Thu, 26 Feb 2026 15:52:12 +0200 Subject: net/mlx5: Drop MR cache related code Following mlx5_ib move to using FRMR pools, drop all unused code of MR cache. Signed-off-by: Michael Guralnik Reviewed-by: Yishai Hadas Signed-off-by: Edward Srouji Link: https://patch.msgid.link/20260226-frmr_pools-v4-7-95360b54f15e@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 04dcd09f7517..27d64f09683f 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -705,23 +705,12 @@ struct mlx5_st; enum { MLX5_PROF_MASK_QP_SIZE = (u64)1 << 0, - MLX5_PROF_MASK_MR_CACHE = (u64)1 << 1, -}; - -enum { - MKEY_CACHE_LAST_STD_ENTRY = 20, - MLX5_IMR_KSM_CACHE_ENTRY, - MAX_MKEY_CACHE_ENTRIES }; struct mlx5_profile { u64 mask; u8 log_max_qp; u8 num_cmd_caches; - struct { - int size; - int limit; - } mr_cache[MAX_MKEY_CACHE_ENTRIES]; }; struct mlx5_hca_cap { -- cgit v1.2.3 From 56bd57e7b161f75535df91b229b0b2c64c6e5581 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Sat, 28 Feb 2026 14:02:22 -0600 Subject: iio: add IIO_DECLARE_QUATERNION() macro Add a new IIO_DECLARE_QUATERNION() macro that is used to declare the field in an IIO buffer struct that contains a quaternion vector. Quaternions are currently the only IIO data type that uses the .repeat feature of struct iio_scan_type. This has an implicit rule that the element in the buffer must be aligned to the entire size of the repeated element. This macro will make that requirement explicit. Since this is the only user, we just call the macro IIO_DECLARE_QUATERNION() instead of something more generic. Signed-off-by: David Lechner Reviewed-by: Andy Shevchenko Cc: Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index a9ecff191bd9..2c91b7659ce9 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -931,6 +931,18 @@ static inline void *iio_device_get_drvdata(const struct iio_dev *indio_dev) #define IIO_DECLARE_DMA_BUFFER_WITH_TS(type, name, count) \ __IIO_DECLARE_BUFFER_WITH_TS(type, name, count) __aligned(IIO_DMA_MINALIGN) +/** + * IIO_DECLARE_QUATERNION() - Declare a quaternion element + * @type: element type of the individual vectors + * @name: identifier name + * + * Quaternions are a vector composed of 4 elements (W, X, Y, Z). Use this macro + * to declare a quaternion element in a struct to ensure proper alignment in + * an IIO buffer. + */ +#define IIO_DECLARE_QUATERNION(type, name) \ + type name[4] __aligned(sizeof(type) * 4) + struct iio_dev *iio_device_alloc(struct device *parent, int sizeof_priv); /* The information at the returned address is guaranteed to be cacheline aligned */ -- cgit v1.2.3 From fe3c03b84ae69f34992a5e72cbb8384b9ebad738 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 1 Mar 2026 16:51:54 -0800 Subject: cred: fix kernel-doc warnings in cred.h Use the correct function parameter names, function names, or kernel-doc format, and add function return comment sections to avoid kernel-doc warnings: Warning: include/linux/cred.h:43 function parameter 'gi' not described in 'get_group_info' Warning: include/linux/cred.h:43 No description found for return value of 'get_group_info' Warning: include/linux/cred.h:213 No description found for return value of 'get_cred_many' Warning: include/linux/cred.h:260 function parameter '_cred' not described in 'put_cred_many' Warning: include/linux/cred.h:260 expecting prototype for put_cred(). Prototype was for put_cred_many() instead Signed-off-by: Randy Dunlap [PM: subject tweak] Signed-off-by: Paul Moore --- include/linux/cred.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index ed1609d78cd7..c6676265a985 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -33,12 +33,14 @@ struct group_info { /** * get_group_info - Get a reference to a group info structure - * @group_info: The group info to reference + * @gi: The group info to reference * * This gets a reference to a set of supplementary groups. * * If the caller is accessing a task's credentials, they must hold the RCU read * lock when reading. + * + * Returns: @gi */ static inline struct group_info *get_group_info(struct group_info *gi) { @@ -209,6 +211,8 @@ DEFINE_CLASS(override_creds, * usage count. The purpose of this is to attempt to catch at compile time the * accidental alteration of a set of credentials that should be considered * immutable. + * + * Returns: @cred when the references are acquired, NULL otherwise. */ static inline const struct cred *get_cred_many(const struct cred *cred, int nr) { @@ -246,8 +250,8 @@ static inline const struct cred *get_cred_rcu(const struct cred *cred) } /** - * put_cred - Release a reference to a set of credentials - * @cred: The credentials to release + * put_cred_many - Release a reference to a set of credentials + * @_cred: The credentials to release * @nr: Number of references to release * * Release a reference to a set of credentials, deleting them when the last ref -- cgit v1.2.3 From 0b16e69d17d8c35c5c9d5918bf596c75a44655d3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 24 Feb 2026 17:20:36 -0800 Subject: KVM: x86: Use scratch field in MMIO fragment to hold small write values When exiting to userspace to service an emulated MMIO write, copy the to-be-written value to a scratch field in the MMIO fragment if the size of the data payload is 8 bytes or less, i.e. can fit in a single chunk, instead of pointing the fragment directly at the source value. This fixes a class of use-after-free bugs that occur when the emulator initiates a write using an on-stack, local variable as the source, the write splits a page boundary, *and* both pages are MMIO pages. Because KVM's ABI only allows for physically contiguous MMIO requests, accesses that split MMIO pages are separated into two fragments, and are sent to userspace one at a time. When KVM attempts to complete userspace MMIO in response to KVM_RUN after the first fragment, KVM will detect the second fragment and generate a second userspace exit, and reference the on-stack variable. The issue is most visible if the second KVM_RUN is performed by a separate task, in which case the stack of the initiating task can show up as truly freed data. ================================================================== BUG: KASAN: use-after-free in complete_emulated_mmio+0x305/0x420 Read of size 1 at addr ffff888009c378d1 by task syz-executor417/984 CPU: 1 PID: 984 Comm: syz-executor417 Not tainted 5.10.0-182.0.0.95.h2627.eulerosv2r13.x86_64 #3 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0xbe/0xfd print_address_description.constprop.0+0x19/0x170 __kasan_report.cold+0x6c/0x84 kasan_report+0x3a/0x50 check_memory_region+0xfd/0x1f0 memcpy+0x20/0x60 complete_emulated_mmio+0x305/0x420 kvm_arch_vcpu_ioctl_run+0x63f/0x6d0 kvm_vcpu_ioctl+0x413/0xb20 __se_sys_ioctl+0x111/0x160 do_syscall_64+0x30/0x40 entry_SYSCALL_64_after_hwframe+0x67/0xd1 RIP: 0033:0x42477d Code: <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007faa8e6890e8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00000000004d7338 RCX: 000000000042477d RDX: 0000000000000000 RSI: 000000000000ae80 RDI: 0000000000000005 RBP: 00000000004d7330 R08: 00007fff28d546df R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000004d733c R13: 0000000000000000 R14: 000000000040a200 R15: 00007fff28d54720 The buggy address belongs to the page: page:0000000029f6a428 refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x9c37 flags: 0xfffffc0000000(node=0|zone=1|lastcpupid=0x1fffff) raw: 000fffffc0000000 0000000000000000 ffffea0000270dc8 0000000000000000 raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888009c37780: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff888009c37800: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff >ffff888009c37880: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ^ ffff888009c37900: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff888009c37980: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ================================================================== The bug can also be reproduced with a targeted KVM-Unit-Test by hacking KVM to fill a large on-stack variable in complete_emulated_mmio(), i.e. by overwrite the data value with garbage. Limit the use of the scratch fields to 8-byte or smaller accesses, and to just writes, as larger accesses and reads are not affected thanks to implementation details in the emulator, but add a sanity check to ensure those details don't change in the future. Specifically, KVM never uses on-stack variables for accesses larger that 8 bytes, e.g. uses an operand in the emulator context, and *all* reads are buffered through the mem_read cache. Note! Using the scratch field for reads is not only unnecessary, it's also extremely difficult to handle correctly. As above, KVM buffers all reads through the mem_read cache, and heavily relies on that behavior when re-emulating the instruction after a userspace MMIO read exit. If a read splits a page, the first page is NOT an MMIO page, and the second page IS an MMIO page, then the MMIO fragment needs to point at _just_ the second chunk of the destination, i.e. its position in the mem_read cache. Taking the "obvious" approach of copying the fragment value into the destination when re-emulating the instruction would clobber the first chunk of the destination, i.e. would clobber the data that was read from guest memory. Fixes: f78146b0f923 ("KVM: Fix page-crossing MMIO") Suggested-by: Yashu Zhang Reported-by: Yashu Zhang Closes: https://lore.kernel.org/all/369eaaa2b3c1425c85e8477066391bc7@huawei.com Cc: stable@vger.kernel.org Tested-by: Tom Lendacky Tested-by: Rick Edgecombe Link: https://patch.msgid.link/20260225012049.920665-2-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 34759a262b28..abb309372035 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -318,7 +318,8 @@ static inline bool kvm_vcpu_can_poll(ktime_t cur, ktime_t stop) struct kvm_mmio_fragment { gpa_t gpa; void *data; - unsigned len; + u64 val; + unsigned int len; }; struct kvm_vcpu { -- cgit v1.2.3 From 44a2ec96d374806ee74454ea915615536a76b152 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 27 Feb 2026 09:53:13 +0000 Subject: net: stmmac: remove plat_dat->port_node There are repeated instances of: fwnode = priv->plat->port_node; if (!fwnode) fwnode = dev_fwnode(priv->device); However, the only place that ->port_node is set is stmmac_probe_config_dt(): struct device_node *np = pdev->dev.of_node; ... /* PHYLINK automatically parses the phy-handle property */ plat->port_node = of_fwnode_handle(np); which is equivalent to dev_fwnode(&pdev->dev) and, as priv->device will be &pdev->dev, is also equivalent to dev_fwnode(priv->device). Thus, plat_dat->port_node doesn't provide any extra benefit over using dev_fwnode(priv->device) directly. There is one case where port_node is used directly, which can be found in stmmac_pcs_setup(). This may cause a change of behaviour as PCI drivers do not populate plat_dat->port_node, but dev_fwnode(priv->device) may be valid. PCI-based stmmac should be tested. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1vvuX3-0000000Avme-3oej@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index b96ae9dadfab..77e51eaa5ec5 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -225,7 +225,6 @@ struct plat_stmmacenet_data { phy_interface_t phy_interface; struct stmmac_mdio_bus_data *mdio_bus_data; struct device_node *phy_node; - struct fwnode_handle *port_node; struct device_node *mdio_node; struct stmmac_dma_cfg *dma_cfg; struct stmmac_safety_feature_cfg *safety_feat_cfg; -- cgit v1.2.3 From 1558705afbb293549fdedd539682bc5240e1964b Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 27 Feb 2026 09:53:59 +0000 Subject: net: stmmac: make dma_cfg mixed/fixed burst boolean struct stmmac_dma_cfg mixed_burst/fixed_burst members are both boolean in nature - of_property_read_bool() are used to read these from DT, and they are only tested for non-zero values. Use bool to avoid unnecessary padding in this structure. Update dwmac-intel to initialise these using true rather than '1', and remove the '0' initialisers as the struct is already zero initialised on allocation. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1vvuXn-0000000AvnX-4A1u@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 77e51eaa5ec5..2fc169c7117e 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -97,8 +97,8 @@ struct stmmac_dma_cfg { int txpbl; int rxpbl; bool pblx8; - int fixed_burst; - int mixed_burst; + bool fixed_burst; + bool mixed_burst; bool aal; bool eame; bool multi_msi_en; -- cgit v1.2.3 From 4480d5fa1f6ebe7dfc546e14371d63c8b915a82d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 28 Feb 2026 22:17:31 +0000 Subject: ipmr/ip6mr: Convert net->ipv[46].ipmr_seq to atomic_t. We will no longer hold RTNL for ipmr_mfc_add() and ipmr_mfc_delete(). MFC entry can be loosely connected with VIF by its index for mrt->vif_table[] (stored in mfc_parent), but the two tables are not synchronised. i.e. Even if VIF 1 is removed, MFC for VIF 1 is not automatically removed. The only field that the MFC/VIF interfaces share is net->ipv[46].ipmr_seq, which is protected by RTNL. Adding a new mutex for both just to protect a single field is overkill. Let's convert the field to atomic_t. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260228221800.1082070-14-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/mroute_base.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 0075f6e5c3da..0baa6f994da9 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -76,7 +76,7 @@ static inline int mr_call_vif_notifiers(struct net *net, struct vif_device *vif, struct net_device *vif_dev, unsigned short vif_index, u32 tb_id, - unsigned int *ipmr_seq) + atomic_t *ipmr_seq) { struct vif_entry_notifier_info info = { .info = { @@ -89,7 +89,7 @@ static inline int mr_call_vif_notifiers(struct net *net, }; ASSERT_RTNL(); - (*ipmr_seq)++; + atomic_inc(ipmr_seq); return call_fib_notifiers(net, event_type, &info.info); } @@ -198,7 +198,7 @@ static inline int mr_call_mfc_notifiers(struct net *net, unsigned short family, enum fib_event_type event_type, struct mr_mfc *mfc, u32 tb_id, - unsigned int *ipmr_seq) + atomic_t *ipmr_seq) { struct mfc_entry_notifier_info info = { .info = { @@ -209,7 +209,7 @@ static inline int mr_call_mfc_notifiers(struct net *net, }; ASSERT_RTNL(); - (*ipmr_seq)++; + atomic_inc(ipmr_seq); return call_fib_notifiers(net, event_type, &info.info); } -- cgit v1.2.3 From bddafc06ca5ee1be4d10061f7954c6d6be5dc1d8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 28 Feb 2026 22:17:33 +0000 Subject: ipmr: Don't hold RTNL for ipmr_rtm_route(). ipmr_mfc_add() and ipmr_mfc_delete() are already protected by a dedicated mutex. rtm_to_ipmr_mfcc() calls __ipmr_get_table(), __dev_get_by_index(), amd ipmr_find_vif(). Once __dev_get_by_index() is converted to dev_get_by_index_rcu(), we can move the other two functions under that same RCU section and drop RTNL for ipmr_rtm_route(). Let's do that conversion and drop ASSERT_RTNL() in mr_call_mfc_notifiers(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260228221800.1082070-16-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/mroute_base.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 0baa6f994da9..cf3374580f74 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -208,7 +208,6 @@ static inline int mr_call_mfc_notifiers(struct net *net, .tb_id = tb_id }; - ASSERT_RTNL(); atomic_inc(ipmr_seq); return call_fib_notifiers(net, event_type, &info.info); } -- cgit v1.2.3 From c69855ada28656fdd7e197b6e24cd40a04fe14d3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 28 Feb 2026 14:08:45 -0800 Subject: atm: atmdev: add function parameter names and description kernel-doc reports function parameters not described for parameters that are not named. Add parameter names for these functions and then describe the function parameters in kernel-doc format. Fixes these warnings: Warning: include/linux/atmdev.h:316 function parameter '' not described in 'register_atm_ioctl' Warning: include/linux/atmdev.h:321 function parameter '' not described in 'deregister_atm_ioctl' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260228220845.2978547-1-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- include/linux/atmdev.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/atmdev.h b/include/linux/atmdev.h index 70807c679f1a..82a32526df64 100644 --- a/include/linux/atmdev.h +++ b/include/linux/atmdev.h @@ -309,17 +309,19 @@ struct atm_ioctl { /** * register_atm_ioctl - register handler for ioctl operations + * @ioctl: ioctl handler to register * * Special (non-device) handlers of ioctl's should * register here. If you're a normal device, you should * set .ioctl in your atmdev_ops instead. */ -void register_atm_ioctl(struct atm_ioctl *); +void register_atm_ioctl(struct atm_ioctl *ioctl); /** * deregister_atm_ioctl - remove the ioctl handler + * @ioctl: ioctl handler to deregister */ -void deregister_atm_ioctl(struct atm_ioctl *); +void deregister_atm_ioctl(struct atm_ioctl *ioctl); /* register_atmdevice_notifier - register atm_dev notify events -- cgit v1.2.3 From 710f5c76580306cdb9ec51fac8fcf6a8faff7821 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Feb 2026 17:26:03 +0000 Subject: indirect_call_wrapper: do not reevaluate function pointer We have an increasing number of READ_ONCE(xxx->function) combined with INDIRECT_CALL_[1234]() helpers. Unfortunately this forces INDIRECT_CALL_[1234]() to read xxx->function many times, which is not what we wanted. Fix these macros so that xxx->function value is not reloaded. $ scripts/bloat-o-meter -t vmlinux.0 vmlinux add/remove: 0/0 grow/shrink: 1/65 up/down: 122/-1084 (-962) Function old new delta ip_push_pending_frames 59 181 +122 ip6_finish_output 687 681 -6 __udp_enqueue_schedule_skb 1078 1072 -6 ioam6_output 2319 2312 -7 xfrm4_rcv_encap_finish2 64 56 -8 xfrm4_output 297 289 -8 vrf_ip_local_out 278 270 -8 vrf_ip6_local_out 278 270 -8 seg6_input_finish 64 56 -8 rpl_output 700 692 -8 ipmr_forward_finish 124 116 -8 ip_forward_finish 143 135 -8 ip6mr_forward2_finish 100 92 -8 ip6_forward_finish 73 65 -8 input_action_end_bpf 1091 1083 -8 dst_input 52 44 -8 __xfrm6_output 801 793 -8 __xfrm4_output 83 75 -8 bpf_input 500 491 -9 __tcp_check_space 530 521 -9 input_action_end_dt6 291 280 -11 vti6_tnl_xmit 1634 1622 -12 bpf_xmit 1203 1191 -12 rpl_input 497 483 -14 rawv6_send_hdrinc 1355 1341 -14 ndisc_send_skb 1030 1016 -14 ipv6_srh_rcv 1377 1363 -14 ip_send_unicast_reply 1253 1239 -14 ip_rcv_finish 226 212 -14 ip6_rcv_finish 300 286 -14 input_action_end_x_core 205 191 -14 input_action_end_x 355 341 -14 input_action_end_t 205 191 -14 input_action_end_dx6_finish 127 113 -14 input_action_end_dx4_finish 373 359 -14 input_action_end_dt4 426 412 -14 input_action_end_core 186 172 -14 input_action_end_b6_encap 292 278 -14 input_action_end_b6 198 184 -14 igmp6_send 1332 1318 -14 ip_sublist_rcv 864 848 -16 ip6_sublist_rcv 1091 1075 -16 ipv6_rpl_srh_rcv 1937 1920 -17 xfrm_policy_queue_process 1246 1228 -18 seg6_output_core 903 885 -18 mld_sendpack 856 836 -20 NF_HOOK 756 736 -20 vti_tunnel_xmit 1447 1426 -21 input_action_end_dx6 664 642 -22 input_action_end 1502 1480 -22 sock_sendmsg_nosec 134 111 -23 ip6mr_forward2 388 364 -24 sock_recvmsg_nosec 134 109 -25 seg6_input_core 836 810 -26 ip_send_skb 172 146 -26 ip_local_out 140 114 -26 ip6_local_out 140 114 -26 __sock_sendmsg 162 136 -26 __ip_queue_xmit 1196 1170 -26 __ip_finish_output 405 379 -26 ipmr_queue_fwd_xmit 373 346 -27 sock_recvmsg 173 145 -28 ip6_xmit 1635 1607 -28 xfrm_output_resume 1418 1389 -29 ip_build_and_send_pkt 625 591 -34 dst_output 504 432 -72 Total: Before=25217686, After=25216724, chg -0.00% Fixes: 283c16a2dfd3 ("indirect call wrappers: helpers to speed-up indirect calls of builtin") Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260227172603.1700433-1-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/indirect_call_wrapper.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/indirect_call_wrapper.h b/include/linux/indirect_call_wrapper.h index 35227d47cfc9..dc272b514a01 100644 --- a/include/linux/indirect_call_wrapper.h +++ b/include/linux/indirect_call_wrapper.h @@ -16,22 +16,26 @@ */ #define INDIRECT_CALL_1(f, f1, ...) \ ({ \ - likely(f == f1) ? f1(__VA_ARGS__) : f(__VA_ARGS__); \ + typeof(f) __f1 = (f); \ + likely(__f1 == f1) ? f1(__VA_ARGS__) : __f1(__VA_ARGS__); \ }) #define INDIRECT_CALL_2(f, f2, f1, ...) \ ({ \ - likely(f == f2) ? f2(__VA_ARGS__) : \ - INDIRECT_CALL_1(f, f1, __VA_ARGS__); \ + typeof(f) __f2 = (f); \ + likely(__f2 == f2) ? f2(__VA_ARGS__) : \ + INDIRECT_CALL_1(__f2, f1, __VA_ARGS__); \ }) #define INDIRECT_CALL_3(f, f3, f2, f1, ...) \ ({ \ - likely(f == f3) ? f3(__VA_ARGS__) : \ - INDIRECT_CALL_2(f, f2, f1, __VA_ARGS__); \ + typeof(f) __f3 = (f); \ + likely(__f3 == f3) ? f3(__VA_ARGS__) : \ + INDIRECT_CALL_2(__f3, f2, f1, __VA_ARGS__); \ }) #define INDIRECT_CALL_4(f, f4, f3, f2, f1, ...) \ ({ \ - likely(f == f4) ? f4(__VA_ARGS__) : \ - INDIRECT_CALL_3(f, f3, f2, f1, __VA_ARGS__); \ + typeof(f) __f4 = (f); \ + likely(__f4 == f4) ? f4(__VA_ARGS__) : \ + INDIRECT_CALL_3(__f4, f3, f2, f1, __VA_ARGS__); \ }) #define INDIRECT_CALLABLE_DECLARE(f) f -- cgit v1.2.3 From 9de68394a61528d40f575c3e6719cc75c56f62c3 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Mon, 2 Mar 2026 01:25:44 +0100 Subject: Revert "driver core: enforce device_lock for driver_match_device()" This reverts commit dc23806a7c47 ("driver core: enforce device_lock for driver_match_device()") and commit 289b14592cef ("driver core: fix inverted "locked" suffix of driver_match_device()"). While technically correct, there is a major downside to this approach: When a device is already present in the system and a driver is registered on the same bus, we iterate over all devices registered on this bus to see if one of them matches. If we come across an already bound one where the corresponding driver crashed while holding the device lock (e.g. in probe()) we can't make any progress anymore. However, drivers are typically the least tested code in the kernel and hence it is a case that is likely to happen regularly. Besides hurting developer ergonomics, it potentially decreases chances of shutting things down cleanly and obtaining logs in production environments as well [1]. This came up in the context of a firewire bug, which only in combination with the reverted commit, caused the machine to hang [2]. Additionally, it was observed in [3]. Thus, revert commit dc23806a7c47 ("driver core: enforce device_lock for driver_match_device()") and add a brief note clarifying that an implementer of struct bus_type must not expect match() to be called with the device lock held. Link: https://lore.kernel.org/driver-core/DGRGTIRHA62X.3RY09D9SOK77P@kernel.org/ [1] Link: https://lore.kernel.org/all/67f655bb-4d81-4609-b008-68d200255dd2@davidgow.net/ [2] Link: https://lore.kernel.org/lkml/CALbr=LZ4v7N=tO1vgOsyj9AS+XuNbn6kG-QcF+PacdMjSo0iyw@mail.gmail.com/ [3] Reported-by: Linus Torvalds Closes: https://lore.kernel.org/driver-core/CAHk-=wgJ_L1C=HjcYJotg_zrZEmiLFJaoic+PWthjuQrutrfJw@mail.gmail.com/ Reviewed-by: Gui-Dong Han Acked-by: Greg Kroah-Hartman Link: https://patch.msgid.link/20260302002545.19389-1-dakr@kernel.org [ Add additional Link: reference. - Danilo ] Signed-off-by: Danilo Krummrich --- include/linux/device/bus.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h index 99c3c83ea520..63de5f053c33 100644 --- a/include/linux/device/bus.h +++ b/include/linux/device/bus.h @@ -35,6 +35,8 @@ struct fwnode_handle; * otherwise. It may also return error code if determining that * the driver supports the device is not possible. In case of * -EPROBE_DEFER it will queue the device for deferred probing. + * Note: This callback may be invoked with or without the device + * lock held. * @uevent: Called when a device is added, removed, or a few other things * that generate uevents to add the environment variables. * @probe: Called when a new device or driver add to this bus, and callback -- cgit v1.2.3 From 83419c8fdbbc1dacd12fa614c5a3561e498aac5f Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Sat, 28 Feb 2026 13:47:55 -0500 Subject: bpf: Factor out program return value calculation Factor the return value range calculation logic in check_return_code out of the function in preparation for separating the return value validation logic for BPF_EXIT and bpf_throw(). No functional changes. The change made in return_retval_code's handling of PROG_TRACING program types (not error'ing on the default case) is a no-op because the match on the program attach type is exhaustive. Acked-by: Eduard Zingerman Signed-off-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260228184759.108145-2-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c1e30096ea7b..090aa26d1c98 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -265,6 +265,7 @@ struct bpf_reference_state { struct bpf_retval_range { s32 minval; s32 maxval; + bool return_32bit; }; /* state of the program: -- cgit v1.2.3 From 2864fb6aa947703d290b52b1b030b0b74d0a6128 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Draszik?= Date: Mon, 2 Mar 2026 13:32:08 +0000 Subject: power: supply: max17042: initial support for Maxim MAX77759 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Maxim MAX77759 is a companion PMIC intended for use in mobile phones and tablets. It is used on Google Pixel 6 and 6 Pro (oriole and raven). Amongst others, it contains a fuel gauge that is similar to the ones supported by this driver. The fuel gauge can measure battery charge and discharge current, battery voltage, battery temperature, and the Type C connector's temperature. The MAX77759 incorporates the Maxim ModelGauge m5 algorithm. It, as well as previous generations like m3 on max17047/max17050, requires the host to save/restore some register values across power cycles to maintain full accuracy. Extending the driver for such support is out of scope in this initial commit. Reviewed-by: Peter Griffin Signed-off-by: André Draszik Link: https://patch.msgid.link/20260302-max77759-fg-v3-9-3c5f01dbda23@linaro.org Signed-off-by: Sebastian Reichel --- include/linux/power/max17042_battery.h | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/power/max17042_battery.h b/include/linux/power/max17042_battery.h index c417abd2ab70..05097f08ea36 100644 --- a/include/linux/power/max17042_battery.h +++ b/include/linux/power/max17042_battery.h @@ -105,7 +105,7 @@ enum max17042_register { MAX17042_OCV = 0xEE, - MAX17042_OCVInternal = 0xFB, /* MAX17055 VFOCV */ + MAX17042_OCVInternal = 0xFB, /* MAX17055/77759 VFOCV */ MAX17042_VFSOC = 0xFF, }; @@ -156,7 +156,7 @@ enum max17055_register { MAX17055_AtAvCap = 0xDF, }; -/* Registers specific to max17047/50/55 */ +/* Registers specific to max17047/50/55/77759 */ enum max17047_register { MAX17047_QRTbl00 = 0x12, MAX17047_FullSOCThr = 0x13, @@ -167,12 +167,32 @@ enum max17047_register { MAX17047_QRTbl30 = 0x42, }; +enum max77759_register { + MAX77759_AvgTA0 = 0x26, + MAX77759_AtTTF = 0x33, + MAX77759_Tconvert = 0x34, + MAX77759_AvgCurrent0 = 0x3B, + MAX77759_THMHOT = 0x40, + MAX77759_CTESample = 0x41, + MAX77759_ISys = 0x43, + MAX77759_AvgVCell0 = 0x44, + MAX77759_RlxSOC = 0x47, + MAX77759_AvgISys = 0x4B, + MAX77759_QH0 = 0x4C, + MAX77759_MixAtFull = 0x4F, + MAX77759_VSys = 0xB1, + MAX77759_TAlrtTh2 = 0xB2, + MAX77759_VByp = 0xB3, + MAX77759_IIn = 0xD0, +}; + enum max170xx_chip_type { MAXIM_DEVICE_TYPE_UNKNOWN = 0, MAXIM_DEVICE_TYPE_MAX17042, MAXIM_DEVICE_TYPE_MAX17047, MAXIM_DEVICE_TYPE_MAX17050, MAXIM_DEVICE_TYPE_MAX17055, + MAXIM_DEVICE_TYPE_MAX77759, MAXIM_DEVICE_TYPE_NUM }; -- cgit v1.2.3 From 83a86e27c34d06ec2dc117fb293e80f78402df49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Draszik?= Date: Mon, 2 Mar 2026 13:32:09 +0000 Subject: power: supply: max17042: consider task period (max77759) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Several (register) values reported by the fuel gauge depend on its internal task period and it needs to be taken into account when calculating results. All relevant example formulas in the data sheet assume the default task period (of 5760) and final results need to be adjusted based on the task period in effect. Update the code as and where necessary. Reviewed-by: Peter Griffin Signed-off-by: André Draszik Link: https://patch.msgid.link/20260302-max77759-fg-v3-10-3c5f01dbda23@linaro.org Signed-off-by: Sebastian Reichel --- include/linux/power/max17042_battery.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/power/max17042_battery.h b/include/linux/power/max17042_battery.h index 05097f08ea36..d5b08313cf11 100644 --- a/include/linux/power/max17042_battery.h +++ b/include/linux/power/max17042_battery.h @@ -17,6 +17,7 @@ #define MAX17042_DEFAULT_VMAX (4500) /* LiHV cell max */ #define MAX17042_DEFAULT_TEMP_MIN (0) /* For sys without temp sensor */ #define MAX17042_DEFAULT_TEMP_MAX (700) /* 70 degrees Celcius */ +#define MAX17042_DEFAULT_TASK_PERIOD (5760) /* Consider RepCap which is less then 10 units below FullCAP full */ #define MAX17042_FULL_THRESHOLD 10 -- cgit v1.2.3 From 48f7a50c027dd2abb9e7b8a6ecc8e531d87f2c21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 17 Feb 2026 14:11:11 +0100 Subject: stop_machine: Fix the documentation for a NULL cpus argument MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A recent refactoring of the kernel-docs for stop machine changed the description of the cpus parameter from "NULL = any online cpu" to "NULL = run on each online CPU". However the callback is only executed on a single CPU, not all of them. The old wording was a bit ambiguous and could have been read both ways. Reword the documentation to be correct again and hopefully also clearer. Fixes: fc6f89dc7078 ("stop_machine: Improve kernel-doc function-header comments") Signed-off-by: Thomas Weißschuh Signed-off-by: Paul E. McKenney Reviewed-by: Sebastian Andrzej Siewior --- include/linux/stop_machine.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 72820503514c..01011113d226 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -99,7 +99,7 @@ static inline void print_stop_info(const char *log_lvl, struct task_struct *task * stop_machine: freeze the machine on all CPUs and run this function * @fn: the function to run * @data: the data ptr to pass to @fn() - * @cpus: the cpus to run @fn() on (NULL = run on each online CPU) + * @cpus: the cpus to run @fn() on (NULL = one unspecified online CPU) * * Description: This causes a thread to be scheduled on every CPU, which * will run with interrupts disabled. Each CPU specified by @cpus will @@ -133,7 +133,7 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); * stop_machine_cpuslocked: freeze the machine on all CPUs and run this function * @fn: the function to run * @data: the data ptr to pass to @fn() - * @cpus: the cpus to run @fn() on (NULL = run on each online CPU) + * @cpus: the cpus to run @fn() on (NULL = one unspecified online CPU) * * Same as above. Avoids nested calls to cpus_read_lock(). * -- cgit v1.2.3 From e39bb9e02b68942f8e9359d2a3efe7d37ae6be0e Mon Sep 17 00:00:00 2001 From: Qing Wang Date: Fri, 27 Feb 2026 10:58:42 +0800 Subject: tracing: Fix WARN_ON in tracing_buffers_mmap_close When a process forks, the child process copies the parent's VMAs but the user_mapped reference count is not incremented. As a result, when both the parent and child processes exit, tracing_buffers_mmap_close() is called twice. On the second call, user_mapped is already 0, causing the function to return -ENODEV and triggering a WARN_ON. Normally, this isn't an issue as the memory is mapped with VM_DONTCOPY set. But this is only a hint, and the application can call madvise(MADVISE_DOFORK) which resets the VM_DONTCOPY flag. When the application does that, it can trigger this issue on fork. Fix it by incrementing the user_mapped reference count without re-mapping the pages in the VMA's open callback. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Vincent Donnefort Cc: Lorenzo Stoakes Link: https://patch.msgid.link/20260227025842.1085206-1-wangqing7171@gmail.com Fixes: cf9f0f7c4c5bb ("tracing: Allow user-space mapping of the ring-buffer") Reported-by: syzbot+3b5dd2030fe08afdf65d@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=3b5dd2030fe08afdf65d Tested-by: syzbot+3b5dd2030fe08afdf65d@syzkaller.appspotmail.com Signed-off-by: Qing Wang Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 876358cfe1b1..d862fa610270 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -248,6 +248,7 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node); int ring_buffer_map(struct trace_buffer *buffer, int cpu, struct vm_area_struct *vma); +void ring_buffer_map_dup(struct trace_buffer *buffer, int cpu); int ring_buffer_unmap(struct trace_buffer *buffer, int cpu); int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu); #endif /* _LINUX_RING_BUFFER_H */ -- cgit v1.2.3 From 1ac252ad036cdb18f5fb7f76bb6061adfed9cedf Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 3 Mar 2026 23:04:04 +0200 Subject: rculist_bl: add hlist_bl_for_each_entry_continue_rcu Change the old hlist_bl_first_rcu to hlist_bl_first_rcu_dereference to indicate that it is a RCU dereference. Add hlist_bl_next_rcu and hlist_bl_first_rcu to use RCU pointers and use them to fix sparse warnings. Add hlist_bl_for_each_entry_continue_rcu. Signed-off-by: Julian Anastasov Signed-off-by: Florian Westphal --- include/linux/rculist_bl.h | 49 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rculist_bl.h b/include/linux/rculist_bl.h index 0b952d06eb0b..36363b876e53 100644 --- a/include/linux/rculist_bl.h +++ b/include/linux/rculist_bl.h @@ -8,21 +8,31 @@ #include #include +/* return the first ptr or next element in an RCU protected list */ +#define hlist_bl_first_rcu(head) \ + (*((struct hlist_bl_node __rcu **)(&(head)->first))) +#define hlist_bl_next_rcu(node) \ + (*((struct hlist_bl_node __rcu **)(&(node)->next))) + static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h, struct hlist_bl_node *n) { LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) != LIST_BL_LOCKMASK); - rcu_assign_pointer(h->first, + rcu_assign_pointer(hlist_bl_first_rcu(h), (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK)); } -static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h) -{ - return (struct hlist_bl_node *) - ((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) & ~LIST_BL_LOCKMASK); -} +#define hlist_bl_first_rcu_dereference(head) \ +({ \ + struct hlist_bl_head *__head = (head); \ + \ + (struct hlist_bl_node *) \ + ((unsigned long)rcu_dereference_check(hlist_bl_first_rcu(__head), \ + hlist_bl_is_locked(__head)) & \ + ~LIST_BL_LOCKMASK); \ +}) /** * hlist_bl_del_rcu - deletes entry from hash list without re-initialization @@ -73,7 +83,7 @@ static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n, { struct hlist_bl_node *first; - /* don't need hlist_bl_first_rcu because we're under lock */ + /* don't need hlist_bl_first_rcu* because we're under lock */ first = hlist_bl_first(h); n->next = first; @@ -93,9 +103,30 @@ static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n, * */ #define hlist_bl_for_each_entry_rcu(tpos, pos, head, member) \ - for (pos = hlist_bl_first_rcu(head); \ + for (pos = hlist_bl_first_rcu_dereference(head); \ pos && \ ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \ - pos = rcu_dereference_raw(pos->next)) + pos = rcu_dereference_raw(hlist_bl_next_rcu(pos))) + +/** + * hlist_bl_for_each_entry_continue_rcu - continue iteration over list of given + * type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct hlist_bl_node to use as a loop cursor. + * @member: the name of the hlist_bl_node within the struct. + * + * Continue to iterate over list of given type, continuing after + * the current position which must have been in the list when the RCU read + * lock was taken. + * This would typically require either that you obtained the node from a + * previous walk of the list in the same RCU read-side critical section, or + * that you held some sort of non-RCU reference (such as a reference count) + * to keep the node alive *and* in the list. + */ +#define hlist_bl_for_each_entry_continue_rcu(tpos, pos, member) \ + for (pos = rcu_dereference_raw(hlist_bl_next_rcu(&(tpos)->member)); \ + pos && \ + ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \ + pos = rcu_dereference_raw(hlist_bl_next_rcu(pos))) #endif -- cgit v1.2.3 From 44d93cf1abb6a85d65c3b4b027c82d44263de6a5 Mon Sep 17 00:00:00 2001 From: Karthikeyan Kathirvel Date: Wed, 4 Mar 2026 14:23:42 +0530 Subject: wifi: UHR: define DPS/DBE/P-EDCA elements and fix size parsing Add UHR Operation and Capability definitions and parsing helpers: - Define ieee80211_uhr_dps_info, ieee80211_uhr_dbe_info, ieee80211_uhr_p_edca_info with masks. - Update ieee80211_uhr_oper_size_ok() to account for optional DPS/DBE/P-EDCA blocks. - Move NPCA pointer position after DPS Operation Parameter if it is present in ieee80211_uhr_oper_size_ok(). - Move NPCA pointer position after DPS info if it is present in ieee80211_uhr_npca_info(). Signed-off-by: Karthikeyan Kathirvel Link: https://patch.msgid.link/20260304085343.1093993-2-karthikeyan.kathirvel@oss.qualcomm.com Signed-off-by: Johannes Berg --- include/linux/ieee80211-uhr.h | 271 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 265 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211-uhr.h b/include/linux/ieee80211-uhr.h index 132acced7d79..9729d23e4766 100644 --- a/include/linux/ieee80211-uhr.h +++ b/include/linux/ieee80211-uhr.h @@ -29,11 +29,216 @@ struct ieee80211_uhr_operation { #define IEEE80211_UHR_NPCA_PARAMS_MOPLEN 0x00400000 #define IEEE80211_UHR_NPCA_PARAMS_DIS_SUBCH_BMAP_PRES 0x00800000 +/** + * struct ieee80211_uhr_npca_info - npca operation information + * + * This structure is the "NPCA Operation Parameters field format" of "UHR + * Operation Element" fields as described in P802.11bn_D1.3 + * subclause 9.4.2.353. See Figure 9-aa4. + * + * Refer to IEEE80211_UHR_NPCA* + * @params: + * NPCA Primary Channel - NPCA primary channel + * NPCA_Min Duration Threshold - Minimum duration of inter-BSS activity + * NPCA Switching Delay - + * Time needed by an NPCA AP to switch from the + * BSS primary channel to the NPCA primary channel + * in the unit of 4 µs. + * NPCA Switching Back Delay - + * Time to switch from the NPCA primary channel + * to the BSS primary channel in the unit of 4 µs. + * NPCA Initial QSRC - + * Initialize the EDCAF QSRC[AC] variables + * when an NPCA STA in the BSS + * switches to NPCA operation. + * NPCA MOPLEN - + * Indicates which conditions can be used to + * initiate an NPCA operation, + * 1 -> both PHYLEN NPCA operation and MOPLEN + * NPCA operation are + * permitted in the BSS + * 0 -> only PHYLEN NPCA operation is allowed in the BSS. + * NPCA Disabled Subchannel Bitmap Present - + * Indicates whether the NPCA Disabled Subchannel + * Bitmap field is present. A 1 in this field indicates that + * the NPCA Disabled Subchannel Bitmap field is present + * @dis_subch_bmap: + * A bit in the bitmap that lies within the BSS bandwidth is set + * to 1 to indicate that the corresponding 20 MHz subchannel is + * punctured and is set to 0 to indicate that the corresponding + * 20 MHz subchannel is not punctured. A bit in the bitmap that + * falls outside of the BSS bandwidth is reserved. This field is + * present when the value of the NPCA Disabled Subchannel Bitmap + * Field Present field is equal to 1, and not present, otherwise + */ struct ieee80211_uhr_npca_info { __le32 params; __le16 dis_subch_bmap[]; } __packed; +#define IEEE80211_UHR_DPS_PADDING_DELAY 0x0000003F +#define IEEE80211_UHR_DPS_TRANSITION_DELAY 0x00003F00 +#define IEEE80211_UHR_DPS_ICF_REQUIRED 0x00010000 +#define IEEE80211_UHR_DPS_PARAMETERIZED_FLAG 0x00020000 +#define IEEE80211_UHR_DPS_LC_MODE_BW 0x001C0000 +#define IEEE80211_UHR_DPS_LC_MODE_NSS 0x01E00000 +#define IEEE80211_UHR_DPS_LC_MODE_MCS 0x1E000000 +#define IEEE80211_UHR_DPS_MOBILE_AP_DPS_STATIC_HCM 0x20000000 + +/** + * struct ieee80211_uhr_dps_info - DPS operation information + * + * This structure is the "DPS Operation Parameter field" of "UHR + * Operation Element" fields as described in P802.11bn_D1.3 + * subclause 9.4.1.87. See Figure 9-207u. + * + * Refer to IEEE80211_UHR_DPS* + * @params: + * DPS Padding Delay - + * Indicates the minimum MAC padding + * duration that is required by a DPS STA + * in an ICF to cause the STA to transition + * from the lower capability mode to the + * higher capability mode. The DPS Padding + * Delay field is in units of 4 µs. + * DPS Transition Delay - + * Indicates the amount of time required by a + * DPS STA to transition from the higher + * capability mode to the lower capability + * mode. The DPS Transition Delay field is in + * units of 4 µs. + * ICF Required - + * Indicates when the DPS assisting STA needs + * to transmit an ICF frame to the peer DPS STA + * before performing the frame exchanges with + * the peer DPS STA in a TXOP. + * 1 -> indicates that the transmission of the + * ICF frame to the peer DPS STA prior to + * any frame exchange is needed. + * 0 -> ICF transmission before the frame + * exchanges with the peer DPS STA is only + * needed if the frame exchange is performed + * in the HC mode. + * Parameterized Flag - + * 0 -> indicates that only 20 MHz, 1 SS, + * non-HT PPDU format with the data + * rate of 6, 12, and 24 Mb/s as the + * default mode are supported by the + * DPS STA in the LC mode + * 1 -> indicates that a bandwidth up to the + * bandwidth indicated in the LC Mode + * Bandwidth field, a number of spatial + * streams up to the NSS indicated in + * the LC Mode Nss field, and an MCS up + * to the MCS indicated in the LC Mode + * MCS fields are supported by the DPS + * STA in the LC mode as the + * parameterized mode. + * LC Mode Bandwidth - + * Indicates the maximum bandwidth supported + * by the STA in the LC mode. + * LC Mode NSS - + * Indicates the maximum number of the spatial + * streams supported by the STA in the LC mode. + * LC Mode MCS - + * Indicates the highest MCS supported by the STA + * in the LC mode. + * Mobile AP DPS Static HCM - + * 1 -> indicates that it will remain in the DPS high + * capability mode until the next TBTT on that + * link. + * 0 -> otherwise. + */ +struct ieee80211_uhr_dps_info { + __le32 params; +} __packed; + +#define IEEE80211_UHR_DBE_OPER_BANDWIDTH 0x07 +#define IEEE80211_UHR_DBE_OPER_DIS_SUBCHANNEL_BITMAP_PRES 0x08 + +/** + * enum ieee80211_uhr_dbe_oper_bw - DBE Operational Bandwidth + * + * Encoding for the DBE Operational Bandwidth field in the UHR Operation + * element (DBE Operation Parameters). + * + * @IEEE80211_UHR_DBE_OPER_BW_40: 40 MHz operational DBE bandwidth + * @IEEE80211_UHR_DBE_OPER_BW_80: 80 MHz operational DBE bandwidth + * @IEEE80211_UHR_DBE_OPER_BW_160: 160 MHz operational DBE bandwidth + * @IEEE80211_UHR_DBE_OPER_BW_320_1: 320-1 MHz operational DBE bandwidth + * @IEEE80211_UHR_DBE_OPER_BW_320_2: 320-2 MHz operational DBE bandwidth + */ +enum ieee80211_uhr_dbe_oper_bw { + IEEE80211_UHR_DBE_OPER_BW_40 = 1, + IEEE80211_UHR_DBE_OPER_BW_80 = 2, + IEEE80211_UHR_DBE_OPER_BW_160 = 3, + IEEE80211_UHR_DBE_OPER_BW_320_1 = 4, + IEEE80211_UHR_DBE_OPER_BW_320_2 = 5, +}; + +/** + * struct ieee80211_uhr_dbe_info - DBE operation information + * + * This structure is the "DBE Operation Parameters field" of + * "UHR Operation Element" fields as described in P802.11bn_D1.3 + * subclause 9.4.2.353. See Figure 9-aa6. + * + * Refer to IEEE80211_UHR_DBE_OPER* + * @params: + * B0-B2 - DBE Operational Bandwidth field, see + * "enum ieee80211_uhr_dbe_oper_bw" for values. + * Value 0 is reserved. + * Value 1 indicates 40 MHz operational DBE bandwidth. + * Value 2 indicates 80 MHz operational DBE bandwidth. + * Value 3 indicates 160 MHz operational DBE bandwidth. + * Value 4 indicates 320-1 MHz operational DBE bandwidth. + * Value 5 indicates 320-2 MHz operational DBE bandwidth. + * Values 6 to 7 are reserved. + * B3 - DBE Disabled Subchannel Bitmap Present. + * @dis_subch_bmap: DBE Disabled Subchannel Bitmap field is set to indicate + * disabled 20 MHz subchannels within the DBE Bandwidth. + */ +struct ieee80211_uhr_dbe_info { + u8 params; + __le16 dis_subch_bmap[]; +} __packed; + +#define IEEE80211_UHR_P_EDCA_ECWMIN 0x0F +#define IEEE80211_UHR_P_EDCA_ECWMAX 0xF0 +#define IEEE80211_UHR_P_EDCA_AIFSN 0x000F +#define IEEE80211_UHR_P_EDCA_CW_DS 0x0030 +#define IEEE80211_UHR_P_EDCA_PSRC_THRESHOLD 0x01C0 +#define IEEE80211_UHR_P_EDCA_QSRC_THRESHOLD 0x0600 + +/** + * struct ieee80211_uhr_p_edca_info - P-EDCA operation information + * + * This structure is the "P-EDCA Operation Parameters field" of + * "UHR Operation Element" fields as described in P802.11bn_D1.3 + * subclause 9.4.2.353. See Figure 9-aa5. + * + * Refer to IEEE80211_UHR_P_EDCA* + * @p_edca_ec: P-EDCA ECWmin and ECWmax. + * These fields indicate the CWmin and CWmax values used by a + * P-EDCA STA during P-EDCA contention. + * @params: AIFSN, CW DS, PSRC threshold, and QSRC threshold. + * - The AIFSN field indicates the AIFSN value used by a P-EDCA STA + * during P-EDCA contention. + * - The CW DS field indicates the value used for randomization of the + * transmission slot of the DS-CTS frame. The value 3 is reserved. + * The value 0 indicates that randomization is not enabled. + * - The P-EDCA PSRC threshold field indicates the maximum number of + * allowed consecutive DS-CTS transmissions. The value 0 and values + * greater than 4 are reserved. + * - The P-EDCA QSRC threshold field indicates the value of the + * QSRC[AC_VO] counter required to start P-EDCA contention. The + * value 0 is reserved. + */ +struct ieee80211_uhr_p_edca_info { + u8 p_edca_ec; + __le16 params; +} __packed; + static inline bool ieee80211_uhr_oper_size_ok(const u8 *data, u8 len, bool beacon) { @@ -47,19 +252,52 @@ static inline bool ieee80211_uhr_oper_size_ok(const u8 *data, u8 len, if (beacon) return true; - /* FIXME: DPS, DBE, P-EDCA (consider order, also relative to NPCA) */ + /* DPS Operation Parameters (fixed 4 bytes) */ + if (oper->params & cpu_to_le16(IEEE80211_UHR_OPER_PARAMS_DPS_ENA)) { + needed += sizeof(struct ieee80211_uhr_dps_info); + if (len < needed) + return false; + } + /* NPCA Operation Parameters (fixed 4 bytes + optional 2 bytes) */ if (oper->params & cpu_to_le16(IEEE80211_UHR_OPER_PARAMS_NPCA_ENA)) { const struct ieee80211_uhr_npca_info *npca = - (const void *)oper->variable; + (const void *)(data + needed); needed += sizeof(*npca); - if (len < needed) return false; - if (npca->params & cpu_to_le32(IEEE80211_UHR_NPCA_PARAMS_DIS_SUBCH_BMAP_PRES)) + if (npca->params & + cpu_to_le32(IEEE80211_UHR_NPCA_PARAMS_DIS_SUBCH_BMAP_PRES)) { needed += sizeof(npca->dis_subch_bmap[0]); + if (len < needed) + return false; + } + } + + /* P-EDCA Operation Parameters (fixed 3 bytes) */ + if (oper->params & cpu_to_le16(IEEE80211_UHR_OPER_PARAMS_PEDCA_ENA)) { + needed += sizeof(struct ieee80211_uhr_p_edca_info); + if (len < needed) + return false; + } + + /* DBE Operation Parameters (fixed 1 byte + optional 2 bytes) */ + if (oper->params & cpu_to_le16(IEEE80211_UHR_OPER_PARAMS_DBE_ENA)) { + const struct ieee80211_uhr_dbe_info *dbe = + (const void *)(data + needed); + + needed += sizeof(*dbe); + if (len < needed) + return false; + + if (dbe->params & + IEEE80211_UHR_DBE_OPER_DIS_SUBCHANNEL_BITMAP_PRES) { + needed += sizeof(dbe->dis_subch_bmap[0]); + if (len < needed) + return false; + } } return len >= needed; @@ -72,12 +310,15 @@ static inline bool ieee80211_uhr_oper_size_ok(const u8 *data, u8 len, static inline const struct ieee80211_uhr_npca_info * ieee80211_uhr_npca_info(const struct ieee80211_uhr_operation *oper) { + const u8 *pos = oper->variable; + if (!(oper->params & cpu_to_le16(IEEE80211_UHR_OPER_PARAMS_NPCA_ENA))) return NULL; - /* FIXME: DPS */ + if (oper->params & cpu_to_le16(IEEE80211_UHR_OPER_PARAMS_DPS_ENA)) + pos += sizeof(struct ieee80211_uhr_dps_info); - return (const void *)oper->variable; + return (const void *)pos; } static inline const __le16 * @@ -131,6 +372,24 @@ ieee80211_uhr_npca_dis_subch_bitmap(const struct ieee80211_uhr_operation *oper) #define IEEE80211_UHR_MAC_CAP_DBE_EHT_MCS_MAP_160_PRES 0x08 #define IEEE80211_UHR_MAC_CAP_DBE_EHT_MCS_MAP_320_PRES 0x10 +/** + * enum ieee80211_uhr_dbe_max_supported_bw - DBE Maximum Supported Bandwidth + * + * As per spec P802.11bn_D1.3 "Table 9-bb5—Encoding of the DBE Maximum + * Supported Bandwidth field". + * + * @IEEE80211_UHR_DBE_MAX_BW_40: Indicates 40 MHz DBE max supported bw + * @IEEE80211_UHR_DBE_MAX_BW_80: Indicates 80 MHz DBE max supported bw + * @IEEE80211_UHR_DBE_MAX_BW_160: Indicates 160 MHz DBE max supported bw + * @IEEE80211_UHR_DBE_MAX_BW_320: Indicates 320 MHz DBE max supported bw + */ +enum ieee80211_uhr_dbe_max_supported_bw { + IEEE80211_UHR_DBE_MAX_BW_40 = 1, + IEEE80211_UHR_DBE_MAX_BW_80 = 2, + IEEE80211_UHR_DBE_MAX_BW_160 = 3, + IEEE80211_UHR_DBE_MAX_BW_320 = 4, +}; + struct ieee80211_uhr_cap_mac { u8 mac_cap[5]; } __packed; -- cgit v1.2.3 From 4059172b2a78a71d15d8fcd8d3fd8ea1ba65d25b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 13 Feb 2026 17:26:47 -0800 Subject: KVM: x86: Move kvm_rebooting to x86 Move kvm_rebooting, which is only read by x86, to KVM x86 so that it can be moved again to core x86 code. Add a "shutdown" arch hook to facilate setting the flag in KVM x86, along with a pile of comments to provide more context around what KVM x86 is doing and why. Reviewed-by: Chao Gao Acked-by: Dave Hansen Tested-by: Chao Gao Reviewed-by: Dan Williams Tested-by: Sagi Shahar Link: https://patch.msgid.link/20260214012702.2368778-2-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 34759a262b28..7c4ebd5210ec 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1627,6 +1627,13 @@ static inline void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) {} #endif #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING +/* + * kvm_arch_shutdown() is invoked immediately prior to forcefully disabling + * hardware virtualization on all CPUs via IPI function calls (in preparation + * for shutdown or reboot), e.g. to allow arch code to prepare for disabling + * virtualization while KVM may be actively running vCPUs. + */ +void kvm_arch_shutdown(void); /* * kvm_arch_{enable,disable}_virtualization() are called on one CPU, under * kvm_usage_lock, immediately after/before 0=>1 and 1=>0 transitions of @@ -2313,7 +2320,6 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING extern bool enable_virt_at_load; -extern bool kvm_rebooting; #endif extern unsigned int halt_poll_ns; -- cgit v1.2.3 From d30372d0b7e637475c79a785d055f4eb8c863656 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 13 Feb 2026 17:27:01 -0800 Subject: KVM: Bury kvm_{en,dis}able_virtualization() in kvm_main.c once more Now that TDX handles doing VMXON without KVM's involvement, bury the top-level APIs to enable and disable virtualization back in kvm_main.c. No functional change intended. Reviewed-by: Dan Williams Reviewed-by: Chao Gao Tested-by: Chao Gao Tested-by: Sagi Shahar Link: https://patch.msgid.link/20260214012702.2368778-16-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7c4ebd5210ec..fbd549bdf052 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2613,12 +2613,4 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, struct kvm_pre_fault_memory *range); #endif -#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING -int kvm_enable_virtualization(void); -void kvm_disable_virtualization(void); -#else -static inline int kvm_enable_virtualization(void) { return 0; } -static inline void kvm_disable_virtualization(void) { } -#endif - #endif -- cgit v1.2.3 From 2d28ed588f8d7d0d41b0a4fad7f0d05e4bbf1797 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 24 Feb 2026 16:24:34 -0800 Subject: Revert "ptdesc: remove references to folios from __pagetable_ctor() and pagetable_dtor()" This change swapped out mod_node_page_state for lruvec_stat_add_folio. But, these two APIs are not interchangeable: the lruvec version also increments memcg stats, in addition to "global" pgdat stats. So after this change, the "pagetables" memcg stat in memory.stat always yields "0", which is a userspace visible regression. I tried to look for a refactor where we add a variant of lruvec_stat_mod_folio which takes a pgdat and a memcg instead of a folio, to try to adhere to the spirit of the original patch. But at the end of the day this just means we have to call folio_memcg(ptdesc_folio(ptdesc)) anyway, which doesn't really accomplish much. This regression is visible in master as well as 6.18 stable, so CC stable too. Link: https://lkml.kernel.org/r/20260225002434.2953895-1-axelrasmussen@google.com Fixes: f0c92726e89f ("ptdesc: remove references to folios from __pagetable_ctor() and pagetable_dtor()") Signed-off-by: Axel Rasmussen Acked-by: Shakeel Butt Acked-by: Johannes Weiner Reviewed-by: Vishal Moola (Oracle) Cc: David Hildenbrand Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Roman Gushchin Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5be3d8a8f806..abb4963c1f06 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3514,26 +3514,21 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} #endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ -static inline unsigned long ptdesc_nr_pages(const struct ptdesc *ptdesc) -{ - return compound_nr(ptdesc_page(ptdesc)); -} - static inline void __pagetable_ctor(struct ptdesc *ptdesc) { - pg_data_t *pgdat = NODE_DATA(memdesc_nid(ptdesc->pt_flags)); + struct folio *folio = ptdesc_folio(ptdesc); - __SetPageTable(ptdesc_page(ptdesc)); - mod_node_page_state(pgdat, NR_PAGETABLE, ptdesc_nr_pages(ptdesc)); + __folio_set_pgtable(folio); + lruvec_stat_add_folio(folio, NR_PAGETABLE); } static inline void pagetable_dtor(struct ptdesc *ptdesc) { - pg_data_t *pgdat = NODE_DATA(memdesc_nid(ptdesc->pt_flags)); + struct folio *folio = ptdesc_folio(ptdesc); ptlock_free(ptdesc); - __ClearPageTable(ptdesc_page(ptdesc)); - mod_node_page_state(pgdat, NR_PAGETABLE, -ptdesc_nr_pages(ptdesc)); + __folio_clear_pgtable(folio); + lruvec_stat_sub_folio(folio, NR_PAGETABLE); } static inline void pagetable_dtor_free(struct ptdesc *ptdesc) -- cgit v1.2.3 From 7392f8e4ea632622b2cd2086675ba022db238b3a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 1 Mar 2026 16:52:29 -0800 Subject: uaccess: correct kernel-doc parameter format Use the correct kernel-doc function parameter format to avoid kernel-doc warnings: Warning: include/linux/uaccess.h:814 function parameter 'uptr' not described in 'scoped_user_rw_access_size' Warning: include/linux/uaccess.h:826 function parameter 'uptr' not described in 'scoped_user_rw_access' Link: https://lkml.kernel.org/r/20260302005229.3471955-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton --- include/linux/uaccess.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 1f3804245c06..001cfef21b61 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -806,7 +806,7 @@ for (bool done = false; !done; done = true) \ /** * scoped_user_rw_access_size - Start a scoped user read/write access with given size - * @uptr Pointer to the user space address to read from and write to + * @uptr: Pointer to the user space address to read from and write to * @size: Size of the access starting from @uptr * @elbl: Error label to goto when the access region is rejected * @@ -817,7 +817,7 @@ for (bool done = false; !done; done = true) \ /** * scoped_user_rw_access - Start a scoped user read/write access - * @uptr Pointer to the user space address to read from and write to + * @uptr: Pointer to the user space address to read from and write to * @elbl: Error label to goto when the access region is rejected * * The size of the access starting from @uptr is determined via sizeof(*@uptr)). -- cgit v1.2.3 From 599b4e290c8766b19378d85d4310c6ec8f90ade4 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 1 Mar 2026 16:52:22 -0800 Subject: mm/mmu_notifier: clean up mmu_notifier.h kernel-doc Eliminate kernel-doc warnings in mmu_notifier.h: - add a missing struct short description - use the correct format for function parameters - add missing function return comment sections Warning: include/linux/mmu_notifier.h:236 missing initial short description on line: * struct mmu_interval_notifier_ops Warning: include/linux/mmu_notifier.h:325 function parameter 'interval_sub' not described in 'mmu_interval_set_seq' Warning: include/linux/mmu_notifier.h:325 function parameter 'cur_seq' not described in 'mmu_interval_set_seq' Warning: include/linux/mmu_notifier.h:346 function parameter 'interval_sub' not described in 'mmu_interval_read_retry' Warning: include/linux/mmu_notifier.h:346 function parameter 'seq' not described in 'mmu_interval_read_retry' Warning: include/linux/mmu_notifier.h:346 No description found for return value of 'mmu_interval_read_retry' Warning: include/linux/mmu_notifier.h:370 function parameter 'interval_sub' not described in 'mmu_interval_check_retry' Warning: include/linux/mmu_notifier.h:370 function parameter 'seq' not described in 'mmu_interval_check_retry' Warning: include/linux/mmu_notifier.h:370 No description found for return value of 'mmu_interval_check_retry' Link: https://lkml.kernel.org/r/20260302005222.3470783-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Reviewed-by: Jason Gunthorpe Cc: David Hildenbrand Cc: "Liam R. Howlett" Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Randy Dunlap Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mmu_notifier.h | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 07a2bbaf86e9..8450e18a87c2 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -234,7 +234,7 @@ struct mmu_notifier { }; /** - * struct mmu_interval_notifier_ops + * struct mmu_interval_notifier_ops - callback for range notification * @invalidate: Upon return the caller must stop using any SPTEs within this * range. This function can sleep. Return false only if sleeping * was required but mmu_notifier_range_blockable(range) is false. @@ -309,8 +309,8 @@ void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub); /** * mmu_interval_set_seq - Save the invalidation sequence - * @interval_sub - The subscription passed to invalidate - * @cur_seq - The cur_seq passed to the invalidate() callback + * @interval_sub: The subscription passed to invalidate + * @cur_seq: The cur_seq passed to the invalidate() callback * * This must be called unconditionally from the invalidate callback of a * struct mmu_interval_notifier_ops under the same lock that is used to call @@ -329,8 +329,8 @@ mmu_interval_set_seq(struct mmu_interval_notifier *interval_sub, /** * mmu_interval_read_retry - End a read side critical section against a VA range - * interval_sub: The subscription - * seq: The return of the paired mmu_interval_read_begin() + * @interval_sub: The subscription + * @seq: The return of the paired mmu_interval_read_begin() * * This MUST be called under a user provided lock that is also held * unconditionally by op->invalidate() when it calls mmu_interval_set_seq(). @@ -338,7 +338,7 @@ mmu_interval_set_seq(struct mmu_interval_notifier *interval_sub, * Each call should be paired with a single mmu_interval_read_begin() and * should be used to conclude the read side. * - * Returns true if an invalidation collided with this critical section, and + * Returns: true if an invalidation collided with this critical section, and * the caller should retry. */ static inline bool @@ -350,20 +350,21 @@ mmu_interval_read_retry(struct mmu_interval_notifier *interval_sub, /** * mmu_interval_check_retry - Test if a collision has occurred - * interval_sub: The subscription - * seq: The return of the matching mmu_interval_read_begin() + * @interval_sub: The subscription + * @seq: The return of the matching mmu_interval_read_begin() * * This can be used in the critical section between mmu_interval_read_begin() - * and mmu_interval_read_retry(). A return of true indicates an invalidation - * has collided with this critical region and a future - * mmu_interval_read_retry() will return true. - * - * False is not reliable and only suggests a collision may not have - * occurred. It can be called many times and does not have to hold the user - * provided lock. + * and mmu_interval_read_retry(). * * This call can be used as part of loops and other expensive operations to * expedite a retry. + * It can be called many times and does not have to hold the user + * provided lock. + * + * Returns: true indicates an invalidation has collided with this critical + * region and a future mmu_interval_read_retry() will return true. + * False is not reliable and only suggests a collision may not have + * occurred. */ static inline bool mmu_interval_check_retry(struct mmu_interval_notifier *interval_sub, -- cgit v1.2.3 From c66e0f453d1afa82534383c58d503238a43fa76c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Mar 2026 01:27:47 +0000 Subject: net: use ktime_t in struct scm_timestamping_internal Instead of using struct timespec64 in scm_timestamping_internal, use ktime_t, saving 24 bytes in kernel stack. This makes tcp_update_recv_tstamps() small enough to be inlined. The ktime_t -> timespec64 conversions happen after socket lock has been released in tcp_recvmsg(), and only if the application requested them. $ scripts/bloat-o-meter -t vmlinux.0 vmlinux add/remove: 0/2 grow/shrink: 5/4 up/down: 146/-277 (-131) Function old new delta tcp_zerocopy_receive 2383 2425 +42 mptcp_recvmsg 1565 1607 +42 tcp_recvmsg_locked 3797 3823 +26 put_cmsg_scm_timestamping64 131 149 +18 put_cmsg_scm_timestamping 131 149 +18 __pfx_tcp_update_recv_tstamps 16 - -16 do_tcp_getsockopt 4024 4006 -18 tcp_recv_timestamp 474 430 -44 tcp_zc_handle_leftover 417 371 -46 __sock_recv_timestamp 1087 1031 -56 tcp_update_recv_tstamps 97 - -97 Total: Before=25223788, After=25223657, chg -0.00% Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Reviewed-by: Jason Xing Link: https://patch.msgid.link/20260304012747.881644-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/socket.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index ec715ad4bf25..ec4a0a025793 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -415,7 +415,7 @@ struct __kernel_timespec; struct old_timespec32; struct scm_timestamping_internal { - struct timespec64 ts[3]; + ktime_t ts[3]; }; extern void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_internal *tss); -- cgit v1.2.3 From 01b7768578a68abe597cfb36ebe0fc47c9305f88 Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Wed, 25 Feb 2026 16:19:31 +0200 Subject: net/mlx5: Add TLP emulation device capabilities Introduce the hardware structures and definitions needed for the driver support of TLP emulation in mlx5_ifc. Signed-off-by: Maher Sanalla Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 775cb0c56865..a3948b36820d 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1389,6 +1389,26 @@ struct mlx5_ifc_virtio_emulation_cap_bits { u8 reserved_at_1c0[0x640]; }; +struct mlx5_ifc_tlp_dev_emu_capabilities_bits { + u8 reserved_at_0[0x20]; + + u8 reserved_at_20[0x13]; + u8 log_tlp_rsp_gw_page_stride[0x5]; + u8 reserved_at_38[0x8]; + + u8 reserved_at_40[0xc0]; + + u8 reserved_at_100[0xc]; + u8 tlp_rsp_gw_num_pages[0x4]; + u8 reserved_at_110[0x10]; + + u8 reserved_at_120[0xa0]; + + u8 tlp_rsp_gw_pages_bar_offset[0x40]; + + u8 reserved_at_200[0x600]; +}; + enum { MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_1_BYTE = 0x0, MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_2_BYTES = 0x2, @@ -1961,7 +1981,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 log_max_rqt[0x5]; u8 reserved_at_390[0x3]; u8 log_max_rqt_size[0x5]; - u8 reserved_at_398[0x1]; + u8 tlp_device_emulation_manager[0x1]; u8 vnic_env_cnt_bar_uar_access[0x1]; u8 vnic_env_cnt_odp_page_fault[0x1]; u8 log_max_tis_per_sq[0x5]; @@ -3830,6 +3850,7 @@ union mlx5_ifc_hca_cap_union_bits { struct mlx5_ifc_tls_cap_bits tls_cap; struct mlx5_ifc_device_mem_cap_bits device_mem_cap; struct mlx5_ifc_virtio_emulation_cap_bits virtio_emulation_cap; + struct mlx5_ifc_tlp_dev_emu_capabilities_bits tlp_dev_emu_capabilities; struct mlx5_ifc_macsec_cap_bits macsec_cap; struct mlx5_ifc_crypto_cap_bits crypto_cap; struct mlx5_ifc_ipsec_cap_bits ipsec_cap; -- cgit v1.2.3 From 385a06f74ff7a03e3fb0b15fb87cfeb052d75073 Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Wed, 25 Feb 2026 16:19:32 +0200 Subject: net/mlx5: Expose TLP emulation capabilities Expose and query TLP device emulation caps on driver load. Signed-off-by: Maher Sanalla Signed-off-by: Leon Romanovsky --- include/linux/mlx5/device.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index b37fe39cef27..25c6b42140b2 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1259,6 +1259,7 @@ enum mlx5_cap_type { MLX5_CAP_PORT_SELECTION = 0x25, MLX5_CAP_ADV_VIRTUALIZATION = 0x26, MLX5_CAP_ADV_RDMA = 0x28, + MLX5_CAP_TLP_EMULATION = 0x2a, /* NUM OF CAP Types */ MLX5_CAP_NUM }; @@ -1481,6 +1482,14 @@ enum mlx5_qcam_feature_groups { MLX5_GET64(virtio_emulation_cap, \ (mdev)->caps.hca[MLX5_CAP_VDPA_EMULATION]->cur, cap) +#define MLX5_CAP_DEV_TLP_EMULATION(mdev, cap)\ + MLX5_GET(tlp_dev_emu_capabilities, \ + (mdev)->caps.hca[MLX5_CAP_TLP_EMULATION]->cur, cap) + +#define MLX5_CAP64_DEV_TLP_EMULATION(mdev, cap)\ + MLX5_GET64(tlp_dev_emu_capabilities, \ + (mdev)->caps.hca[MLX5_CAP_TLP_EMULATION]->cur, cap) + #define MLX5_CAP_IPSEC(mdev, cap)\ MLX5_GET(ipsec_cap, (mdev)->caps.hca[MLX5_CAP_IPSEC]->cur, cap) -- cgit v1.2.3 From b824c3e16c1904bf80df489e293d1e3cbf98896d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 2 Mar 2026 17:26:31 +0100 Subject: net: Provide a PREEMPT_RT specific check for netdev_queue::_xmit_lock After acquiring netdev_queue::_xmit_lock the number of the CPU owning the lock is recorded in netdev_queue::xmit_lock_owner. This works as long as the BH context is not preemptible. On PREEMPT_RT the softirq context is preemptible and without the softirq-lock it is possible to have multiple user in __dev_queue_xmit() submitting a skb on the same CPU. This is fine in general but this means also that the current CPU is recorded as netdev_queue::xmit_lock_owner. This in turn leads to the recursion alert and the skb is dropped. Instead checking the for CPU number, that owns the lock, PREEMPT_RT can check if the lockowner matches the current task. Add netif_tx_owned() which returns true if the current context owns the lock by comparing the provided CPU number with the recorded number. This resembles the current check by negating the condition (the current check returns true if the lock is not owned). On PREEMPT_RT use rt_mutex_owner() to return the lock owner and compare the current task against it. Use the new helper in __dev_queue_xmit() and netif_local_xmit_active() which provides a similar check. Update comments regarding pairing READ_ONCE(). Reported-by: Bert Karwatzki Closes: https://lore.kernel.org/all/20260216134333.412332-1-spasswolf@web.de Fixes: 3253cb49cbad4 ("softirq: Allow to drop the softirq-BKL lock on PREEMPT_RT") Signed-off-by: Sebastian Andrzej Siewior Reported-by: Bert Karwatzki Signed-off-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20260302162631.uGUyIqDT@linutronix.de Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d4e6e00bb90a..67e25f6d15a4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4711,7 +4711,7 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits) static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu) { spin_lock(&txq->_xmit_lock); - /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + /* Pairs with READ_ONCE() in netif_tx_owned() */ WRITE_ONCE(txq->xmit_lock_owner, cpu); } @@ -4729,7 +4729,7 @@ static inline void __netif_tx_release(struct netdev_queue *txq) static inline void __netif_tx_lock_bh(struct netdev_queue *txq) { spin_lock_bh(&txq->_xmit_lock); - /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + /* Pairs with READ_ONCE() in netif_tx_owned() */ WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id()); } @@ -4738,7 +4738,7 @@ static inline bool __netif_tx_trylock(struct netdev_queue *txq) bool ok = spin_trylock(&txq->_xmit_lock); if (likely(ok)) { - /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + /* Pairs with READ_ONCE() in netif_tx_owned() */ WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id()); } return ok; @@ -4746,14 +4746,14 @@ static inline bool __netif_tx_trylock(struct netdev_queue *txq) static inline void __netif_tx_unlock(struct netdev_queue *txq) { - /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + /* Pairs with READ_ONCE() in netif_tx_owned() */ WRITE_ONCE(txq->xmit_lock_owner, -1); spin_unlock(&txq->_xmit_lock); } static inline void __netif_tx_unlock_bh(struct netdev_queue *txq) { - /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + /* Pairs with READ_ONCE() in netif_tx_owned() */ WRITE_ONCE(txq->xmit_lock_owner, -1); spin_unlock_bh(&txq->_xmit_lock); } @@ -4846,6 +4846,23 @@ static inline void netif_tx_disable(struct net_device *dev) local_bh_enable(); } +#ifndef CONFIG_PREEMPT_RT +static inline bool netif_tx_owned(struct netdev_queue *txq, unsigned int cpu) +{ + /* Other cpus might concurrently change txq->xmit_lock_owner + * to -1 or to their cpu id, but not to our id. + */ + return READ_ONCE(txq->xmit_lock_owner) == cpu; +} + +#else +static inline bool netif_tx_owned(struct netdev_queue *txq, unsigned int cpu) +{ + return rt_mutex_owner(&txq->_xmit_lock.lock) == current; +} + +#endif + static inline void netif_addr_lock(struct net_device *dev) { unsigned char nest_level = 0; -- cgit v1.2.3 From 90503f9ffee927c3abdc94a4862d13ae71ea9442 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 12 Feb 2026 15:30:39 -0800 Subject: powercap: intel_rapl: Use unit conversion macros from units.h Replace hardcoded numeric constants with standard unit conversion macros from linux/units.h for better code clarity and self-documentation. Add MICROJOULE_PER_JOULE and NANOJOULE_PER_JOULE to units.h to support energy unit conversions, following the existing pattern for power units. No functional changes. Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/20260212233044.329790-8-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- include/linux/units.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/units.h b/include/linux/units.h index 80d57c50b9e3..c6d78988613a 100644 --- a/include/linux/units.h +++ b/include/linux/units.h @@ -57,6 +57,9 @@ #define MICROWATT_PER_MILLIWATT 1000UL #define MICROWATT_PER_WATT 1000000UL +#define MICROJOULE_PER_JOULE 1000000UL +#define NANOJOULE_PER_JOULE 1000000000UL + #define BYTES_PER_KBIT (KILO / BITS_PER_BYTE) #define BYTES_PER_MBIT (MEGA / BITS_PER_BYTE) #define BYTES_PER_GBIT (GIGA / BITS_PER_BYTE) -- cgit v1.2.3 From d7ca7d1488cc916dbf0a6a594abbda81d4eaeee9 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 12 Feb 2026 15:30:40 -0800 Subject: powercap: intel_rapl: Allow interface drivers to configure rapl_defaults RAPL default settings vary across different RAPL interfaces (MSR, TPMI, MMIO). Currently, these defaults are stored in the common RAPL driver, which requires interface-specific handling logic and makes the common layer unnecessarily complex. There is no strong reason for the common code to own these defaults, since they are inherently interface-specific. To prepare for moving default configuration into the individual interface drivers, 1. Move struct rapl_defaults into a shared header so that interface drivers can directly populate their own default settings. 2. Change the @defaults field in struct rapl_if_priv from void * to const struct rapl_defaults * to improve type safety and readability and update the common driver to use the typed defaults structure. 3. Update all internal getter functions and local pointers to use const struct rapl_defaults * to maintain const-correctness. 4. Rename and export the common helper functions (check_unit, set_floor_freq, compute_time_window) so interface drivers may reuse or override them as appropriate. No functional changes. This is a preparatory refactoring to allow interface drivers to supply their own RAPL default settings. Co-developed-by: Zhang Rui Signed-off-by: Zhang Rui Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/20260212233044.329790-9-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- include/linux/intel_rapl.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index fa1f328d6712..6d694099a3ad 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -128,6 +128,16 @@ struct reg_action { int err; }; +struct rapl_defaults { + u8 floor_freq_reg_addr; + int (*check_unit)(struct rapl_domain *rd); + void (*set_floor_freq)(struct rapl_domain *rd, bool mode); + u64 (*compute_time_window)(struct rapl_domain *rd, u64 val, bool to_raw); + unsigned int dram_domain_energy_unit; + unsigned int psys_domain_energy_unit; + bool spr_psys_bits; +}; + /** * struct rapl_if_priv: private data for different RAPL interfaces * @control_type: Each RAPL interface must have its own powercap @@ -142,7 +152,7 @@ struct reg_action { * registers. * @write_raw: Callback for writing RAPL interface specific * registers. - * @defaults: internal pointer to interface default settings + * @defaults: pointer to default settings * @rpi: internal pointer to interface primitive info */ struct rapl_if_priv { @@ -154,7 +164,7 @@ struct rapl_if_priv { int limits[RAPL_DOMAIN_MAX]; int (*read_raw)(int id, struct reg_action *ra, bool pmu_ctx); int (*write_raw)(int id, struct reg_action *ra); - void *defaults; + const struct rapl_defaults *defaults; void *rpi; }; @@ -211,6 +221,9 @@ void rapl_remove_package_cpuslocked(struct rapl_package *rp); struct rapl_package *rapl_find_package_domain(int id, struct rapl_if_priv *priv, bool id_is_cpu); struct rapl_package *rapl_add_package(int id, struct rapl_if_priv *priv, bool id_is_cpu); void rapl_remove_package(struct rapl_package *rp); +int rapl_default_check_unit(struct rapl_domain *rd); +void rapl_default_set_floor_freq(struct rapl_domain *rd, bool mode); +u64 rapl_default_compute_time_window(struct rapl_domain *rd, u64 value, bool to_raw); #ifdef CONFIG_PERF_EVENTS int rapl_package_add_pmu(struct rapl_package *rp); -- cgit v1.2.3 From cc39325f927850473d3a84b029ae6f9b508e9bd1 Mon Sep 17 00:00:00 2001 From: Mohsin Bashir Date: Mon, 2 Mar 2026 15:01:45 -0800 Subject: net: ethtool: Track pause storm events With TX pause enabled, if a device is unable to pass packets up to the stack (e.g., CPU is hanged), the device can cause pause storm. Given that devices can have native support to protect the neighbor from such flooding, such events need some tracking. This support is to track TX pause storm events for better observability. Reviewed-by: Oleksij Rempel Signed-off-by: Jakub Kicinski Signed-off-by: Mohsin Bashir Link: https://patch.msgid.link/20260302230149.1580195-2-mohsin.bashr@gmail.com Signed-off-by: Paolo Abeni --- include/linux/ethtool.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 798abec67a1b..83c375840835 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -512,12 +512,14 @@ struct ethtool_eth_ctrl_stats { * * Equivalent to `30.3.4.3 aPAUSEMACCtrlFramesReceived` * from the standard. + * @tx_pause_storm_events: TX pause storm event count (see ethtool.yaml). */ struct ethtool_pause_stats { enum ethtool_mac_stats_src src; struct_group(stats, u64 tx_pause_frames; u64 rx_pause_frames; + u64 tx_pause_storm_events; ); }; -- cgit v1.2.3 From 31a6a07eefeb4c84bd6730fbe9e95fd9221712cf Mon Sep 17 00:00:00 2001 From: Coiby Xu Date: Fri, 13 Feb 2026 09:28:46 +0800 Subject: integrity: Make arch_ima_get_secureboot integrity-wide EVM and other LSMs need the ability to query the secure boot status of the system, without directly calling the IMA arch_ima_get_secureboot function. Refactor the secure boot status check into a general function named arch_get_secureboot. Reported-and-suggested-by: Mimi Zohar Suggested-by: Roberto Sassu Signed-off-by: Coiby Xu Acked-by: Ard Biesheuvel Signed-off-by: Mimi Zohar --- include/linux/ima.h | 7 +------ include/linux/secure_boot.h | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 6 deletions(-) create mode 100644 include/linux/secure_boot.h (limited to 'include/linux') diff --git a/include/linux/ima.h b/include/linux/ima.h index abf8923f8fc5..8e08baf16c2f 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -11,6 +11,7 @@ #include #include #include +#include #include struct linux_binprm; @@ -73,14 +74,8 @@ int ima_validate_range(phys_addr_t phys, size_t size); #endif #ifdef CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT -extern bool arch_ima_get_secureboot(void); extern const char * const *arch_get_ima_policy(void); #else -static inline bool arch_ima_get_secureboot(void) -{ - return false; -} - static inline const char * const *arch_get_ima_policy(void) { return NULL; diff --git a/include/linux/secure_boot.h b/include/linux/secure_boot.h new file mode 100644 index 000000000000..3ded3f03655c --- /dev/null +++ b/include/linux/secure_boot.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2026 Red Hat, Inc. All Rights Reserved. + * + * Author: Coiby Xu + */ + +#ifndef _LINUX_SECURE_BOOT_H +#define _LINUX_SECURE_BOOT_H + +#include + +/* + * Returns true if the platform secure boot is enabled. + * Returns false if disabled or not supported. + */ +bool arch_get_secureboot(void); + +#endif /* _LINUX_SECURE_BOOT_H */ -- cgit v1.2.3 From 0ec959cf4b5a609d7f27bf84064ef5372e30ab80 Mon Sep 17 00:00:00 2001 From: Coiby Xu Date: Tue, 30 Sep 2025 10:26:56 +0800 Subject: evm: fix security.evm for a file with IMA signature When both IMA and EVM fix modes are enabled, accessing a file with IMA signature but missing EVM HMAC won't cause security.evm to be fixed. Add a function evm_fix_hmac which will be explicitly called to fix EVM HMAC for this case. Suggested-by: Mimi Zohar Signed-off-by: Coiby Xu Signed-off-by: Mimi Zohar --- include/linux/evm.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/evm.h b/include/linux/evm.h index ddece4a6b25d..913f4573b203 100644 --- a/include/linux/evm.h +++ b/include/linux/evm.h @@ -18,6 +18,8 @@ extern enum integrity_status evm_verifyxattr(struct dentry *dentry, const char *xattr_name, void *xattr_value, size_t xattr_value_len); +int evm_fix_hmac(struct dentry *dentry, const char *xattr_name, + const char *xattr_value, size_t xattr_value_len); int evm_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, struct xattr *xattrs, int *xattr_count); @@ -51,6 +53,12 @@ static inline enum integrity_status evm_verifyxattr(struct dentry *dentry, { return INTEGRITY_UNKNOWN; } + +static inline int evm_fix_hmac(struct dentry *dentry, const char *xattr_name, + const char *xattr_value, size_t xattr_value_len) +{ + return -EOPNOTSUPP; +} #endif static inline int evm_inode_init_security(struct inode *inode, struct inode *dir, -- cgit v1.2.3 From fab0c75d500fd23de6ea1b30e44635418a6dae65 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 4 Mar 2026 10:10:22 -0800 Subject: x86/cpu: Add platform ID to CPU matching structure The existing x86_match_cpu() infrastructure can be used to match a bunch of attributes of a CPU: vendor, family, model, steppings and CPU features. But, there's one more attribute that's missing and unable to be matched against: the platform ID, enumerated on Intel CPUs in MSR_IA32_PLATFORM_ID. It is a little more obscure and is only queried during microcode loading. This is because Intel sometimes has CPUs with identical family/model/stepping but which need different microcode. These CPUs are differentiated with the platform ID. Add a field in 'struct x86_cpu_id' for the platform ID. Similar to the stepping field, make the new field a mask of platform IDs. Some examples: 0x01: matches only platform ID 0x0 0x02: matches only platform ID 0x1 0x03: matches platform IDs 0x0 or 0x1 0x80: matches only platform ID 0x7 0xff: matches all 8 possible platform IDs Since the mask is only a byte wide, it nestles in next to another u8 and does not even increase the size of 'struct x86_cpu_id'. Reserve the all 0's value as the wildcard (X86_PLATFORM_ANY). This avoids forcing changes to existing 'struct x86_cpu_id' users. They can just continue to fill the field with 0's and their matching will work exactly as before. Note: If someone is ever looking for space in 'struct x86_cpu_id', this new field could probably get stuck over in ->driver_data for the one user that there is. Signed-off-by: Dave Hansen Reviewed-by: Sohil Mehta Link: https://patch.msgid.link/20260304181022.058DF07C@davehans-spike.ostc.intel.com --- include/linux/mod_devicetable.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 5b1725fe9707..23ff24080dfd 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -691,6 +691,7 @@ struct x86_cpu_id { __u16 feature; /* bit index */ /* Solely for kernel-internal use: DO NOT EXPORT to userspace! */ __u16 flags; + __u8 platform_mask; __u8 type; kernel_ulong_t driver_data; }; @@ -702,6 +703,7 @@ struct x86_cpu_id { #define X86_STEPPING_ANY 0 #define X86_STEP_MIN 0 #define X86_STEP_MAX 0xf +#define X86_PLATFORM_ANY 0x0 #define X86_FEATURE_ANY 0 /* Same as FPU, you can't test for that */ #define X86_CPU_TYPE_ANY 0 -- cgit v1.2.3 From de70eef32e10883fe74f6df635c616785b24b867 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 25 Feb 2026 21:13:09 -0800 Subject: ARM: omap: fix all kernel-doc warnings Use the correct struct member names to avoid kernel-doc warnings: Warning: include/linux/platform_data/voltage-omap.h:27 struct member 'volt_nominal' not described in 'omap_volt_data' Warning: include/linux/platform_data/voltage-omap.h:27 struct member 'vp_errgain' not described in 'omap_volt_data' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260226051309.556228-1-rdunlap@infradead.org Signed-off-by: Kevin Hilman --- include/linux/platform_data/voltage-omap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/voltage-omap.h b/include/linux/platform_data/voltage-omap.h index 6d74e507dbd2..2b48f2b0135d 100644 --- a/include/linux/platform_data/voltage-omap.h +++ b/include/linux/platform_data/voltage-omap.h @@ -10,14 +10,14 @@ /** * struct omap_volt_data - Omap voltage specific data. - * @voltage_nominal: The possible voltage value in uV + * @volt_nominal: The possible voltage value in uV * @sr_efuse_offs: The offset of the efuse register(from system * control module base address) from where to read * the n-target value for the smartreflex module. * @sr_errminlimit: Error min limit value for smartreflex. This value * differs at differnet opp and thus is linked * with voltage. - * @vp_errorgain: Error gain value for the voltage processor. This + * @vp_errgain: Error gain value for the voltage processor. This * field also differs according to the voltage/opp. */ struct omap_volt_data { -- cgit v1.2.3 From d4d8c6e6fd2a1c5144339884ca5f66e654ad54a5 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 3 Mar 2026 23:54:16 +0000 Subject: tcp: Initialise ehash secrets during connect() and listen(). inet_ehashfn() and inet6_ehashfn() initialise random secrets on the first call by net_get_random_once(). While the init part is patched out using static keys, with CONFIG_STACKPROTECTOR_STRONG=y, this causes a compiler to generate a stack canary due to an automatic variable, unsigned long ___flags, in the DO_ONCE() macro being passed to __do_once_start(). With FDO, this is visible in __inet_lookup_established() and __inet6_lookup_established() too. Let's initialise the secrets by get_random_sleepable_once() in the slow paths: inet_hash() for listen(), and inet_hash_connect() and inet6_hash_connect() for connect(). Note that IPv6 listener will initialise both IPv4 & IPv6 secrets in inet_hash() for IPv4-mapped IPv6 address. With the patch, the stack size is reduced by 16 bytes (___flags + a stack canary) and NOPs for the static key go away. Before: __inet6_lookup_established() ... push %rbx sub $0x38,%rsp # stack is 56 bytes mov %edx,%ebx # sport mov %gs:0x299419f(%rip),%rax # load stack canary mov %rax,0x30(%rsp) and store it onto stack mov 0x440(%rdi),%r15 # net->ipv4.tcp_death_row.hashinfo nop 32: mov %r8d,%ebp # hnum shl $0x10,%ebp # hnum << 16 nop 3d: mov 0x70(%rsp),%r14d # sdif or %ebx,%ebp # INET_COMBINED_PORTS(sport, hnum) mov 0x11a8382(%rip),%eax # inet6_ehashfn() ... After: __inet6_lookup_established() ... push %rbx sub $0x28,%rsp # stack is 40 bytes mov 0x60(%rsp),%ebp # sdif mov %r8d,%r14d # hnum shl $0x10,%r14d # hnum << 16 or %edx,%r14d # INET_COMBINED_PORTS(sport, hnum) mov 0x440(%rdi),%rax # net->ipv4.tcp_death_row.hashinfo mov 0x1194f09(%rip),%r10d # inet6_ehashfn() ... Suggested-by: Eric Dumazet Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260303235424.3877267-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/net.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index f58b38ab37f8..a8e818de95b3 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -304,6 +304,8 @@ do { \ #define net_get_random_once(buf, nbytes) \ get_random_once((buf), (nbytes)) +#define net_get_random_sleepable_once(buf, nbytes) \ + get_random_sleepable_once((buf), (nbytes)) /* * E.g. XFS meta- & log-data is in slab pages, or bcache meta -- cgit v1.2.3 From 5b30afc20b3fea29b9beb83c6415c4ff06f774aa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 4 Mar 2026 11:26:47 -1000 Subject: cgroup: Expose some cgroup helpers Expose the following through cgroup.h: - cgroup_on_dfl() - cgroup_is_dead() - cgroup_for_each_live_child() - cgroup_for_each_live_descendant_pre() - cgroup_for_each_live_descendant_post() Until now, these didn't need to be exposed because controllers only cared about the css hierarchy. The planned sched_ext hierarchical scheduler support will be based on the default cgroup hierarchy, which is in line with the existing BPF cgroup support, and thus needs these exposed. Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 65 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index bc892e3b37ee..e52160e85af4 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -42,6 +42,14 @@ struct kernel_clone_args; #ifdef CONFIG_CGROUPS +/* + * To avoid confusing the compiler (and generating warnings) with code + * that attempts to access what would be a 0-element array (i.e. sized + * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this + * constant expression can be added. + */ +#define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0) + enum css_task_iter_flags { CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */ CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */ @@ -76,6 +84,7 @@ enum cgroup_lifetime_events { extern struct file_system_type cgroup_fs_type; extern struct cgroup_root cgrp_dfl_root; extern struct css_set init_css_set; +extern struct mutex cgroup_mutex; extern spinlock_t css_set_lock; extern struct blocking_notifier_head cgroup_lifetime_notifier; @@ -103,6 +112,8 @@ extern struct blocking_notifier_head cgroup_lifetime_notifier; #define cgroup_subsys_on_dfl(ss) \ static_branch_likely(&ss ## _on_dfl_key) +bool cgroup_on_dfl(const struct cgroup *cgrp); + bool css_has_online_children(struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup, @@ -274,6 +285,32 @@ void css_task_iter_end(struct css_task_iter *it); for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ (pos) = css_next_descendant_post((pos), (css))) +/* iterate over child cgrps, lock should be held throughout iteration */ +#define cgroup_for_each_live_child(child, cgrp) \ + list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + cgroup_is_dead(child); })) \ + ; \ + else + +/* walk live descendants in pre order */ +#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \ + css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + (dsct) = (d_css)->cgroup; \ + cgroup_is_dead(dsct); })) \ + ; \ + else + +/* walk live descendants in postorder */ +#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \ + css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + (dsct) = (d_css)->cgroup; \ + cgroup_is_dead(dsct); })) \ + ; \ + else + /** * cgroup_taskset_for_each - iterate cgroup_taskset * @task: the loop cursor @@ -336,6 +373,27 @@ static inline u64 cgroup_id(const struct cgroup *cgrp) return cgrp->kn->id; } +/** + * cgroup_css - obtain a cgroup's css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest (%NULL returns @cgrp->self) + * + * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This + * function must be called either under cgroup_mutex or rcu_read_lock() and + * the caller is responsible for pinning the returned css if it wants to + * keep accessing it outside the said locks. This function may return + * %NULL if @cgrp doesn't have @subsys_id enabled. + */ +static inline struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + if (CGROUP_HAS_SUBSYS_CONFIG && ss) + return rcu_dereference_check(cgrp->subsys[ss->id], + lockdep_is_held(&cgroup_mutex)); + else + return &cgrp->self; +} + /** * css_is_dying - test whether the specified css is dying * @css: target css @@ -372,6 +430,11 @@ static inline bool css_is_self(struct cgroup_subsys_state *css) return false; } +static inline bool cgroup_is_dead(const struct cgroup *cgrp) +{ + return !(cgrp->self.flags & CSS_ONLINE); +} + static inline void cgroup_get(struct cgroup *cgrp) { css_get(&cgrp->self); @@ -387,8 +450,6 @@ static inline void cgroup_put(struct cgroup *cgrp) css_put(&cgrp->self); } -extern struct mutex cgroup_mutex; - static inline void cgroup_lock(void) { mutex_lock(&cgroup_mutex); -- cgit v1.2.3 From 336faf5d9115ca6982b82cf122e68738ea8c4173 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 25 Feb 2026 09:16:53 +1100 Subject: VFS: make lookup_one_qstr_excl() static. lookup_one_qstr_excl() is no longer used outside of namei.c, so make it static. Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Link: https://patch.msgid.link/20260224222542.3458677-9-neilb@ownmail.net Signed-off-by: Christian Brauner --- include/linux/namei.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/namei.h b/include/linux/namei.h index 58600cf234bc..c7a7288cdd25 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -54,9 +54,6 @@ extern int path_pts(struct path *path); extern int user_path_at(int, const char __user *, unsigned, struct path *); -struct dentry *lookup_one_qstr_excl(const struct qstr *name, - struct dentry *base, - unsigned int flags); extern int kern_path(const char *, unsigned, struct path *); struct dentry *kern_path_parent(const char *name, struct path *parent); -- cgit v1.2.3 From 08e6183ed2568e733e05e7e1c9de737d91c21155 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 26 Feb 2026 18:36:07 +0100 Subject: wifi: move action code from per-type frame structs The action code actually serves to identify the type of action frame, so it really isn't part of the per-type structure. Pull it out and have it in the general action frame format. In theory, whether or not the action code is present in this way is up to each category, but all categories that are defined right now all have that value. While at it, and since this change requires changing all users, remove the 'u' and make it an anonymous union in this case, so that all code using this changes. Change IEEE80211_MIN_ACTION_SIZE to take an argument which says how much of the frame is needed, e.g. category, action_code or the specific frame type that's defined in the union. Again this also ensures that all code is updated. In some cases, fix bugs where the SKB length was checked after having accessed beyond the checked length, in particular in FTM code, e.g. ieee80211_is_ftm(). Link: https://patch.msgid.link/20260226183607.67e71846b59e.I9a24328e3ffcaae179466a935f1c3345029f9961@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 83 +++++++++++++++++------------------------------ 1 file changed, 30 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 3651b2e6c518..aea360e90cb1 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1046,31 +1046,28 @@ struct ieee80211_mgmt { } __packed probe_resp; struct { u8 category; + u8 action_code; union { struct { - u8 action_code; u8 dialog_token; u8 status_code; u8 variable[]; } __packed wme_action; struct{ - u8 action_code; + u8 no_fixed_fields[0]; u8 variable[]; } __packed chan_switch; struct{ - u8 action_code; struct ieee80211_ext_chansw_ie data; u8 variable[]; } __packed ext_chan_switch; struct{ - u8 action_code; u8 dialog_token; u8 element_id; u8 length; struct ieee80211_msrment_ie msr_elem; } __packed measurement; struct{ - u8 action_code; u8 dialog_token; __le16 capab; __le16 timeout; @@ -1079,7 +1076,6 @@ struct ieee80211_mgmt { u8 variable[]; } __packed addba_req; struct{ - u8 action_code; u8 dialog_token; __le16 status; __le16 capab; @@ -1088,54 +1084,45 @@ struct ieee80211_mgmt { u8 variable[]; } __packed addba_resp; struct{ - u8 action_code; __le16 params; __le16 reason_code; } __packed delba; struct { - u8 action_code; + u8 no_fixed_fields[0]; u8 variable[]; } __packed self_prot; struct{ - u8 action_code; + u8 no_fixed_fields[0]; u8 variable[]; } __packed mesh_action; struct { - u8 action; u8 trans_id[WLAN_SA_QUERY_TR_ID_LEN]; } __packed sa_query; struct { - u8 action; u8 smps_control; } __packed ht_smps; struct { - u8 action_code; u8 chanwidth; } __packed ht_notify_cw; struct { - u8 action_code; u8 dialog_token; __le16 capability; u8 variable[]; } __packed tdls_discover_resp; struct { - u8 action_code; u8 operating_mode; } __packed vht_opmode_notif; struct { - u8 action_code; u8 membership[WLAN_MEMBERSHIP_LEN]; u8 position[WLAN_USER_POSITION_LEN]; } __packed vht_group_notif; struct { - u8 action_code; u8 dialog_token; u8 tpc_elem_id; u8 tpc_elem_length; struct ieee80211_tpc_report_ie tpc; } __packed tpc_report; struct { - u8 action_code; u8 dialog_token; u8 follow_up; u8 tod[6]; @@ -1145,11 +1132,10 @@ struct ieee80211_mgmt { u8 variable[]; } __packed ftm; struct { - u8 action_code; + u8 no_fixed_fields[0]; u8 variable[]; } __packed s1g; struct { - u8 action_code; u8 dialog_token; u8 follow_up; u32 tod; @@ -1158,41 +1144,37 @@ struct ieee80211_mgmt { u8 max_toa_error; } __packed wnm_timing_msr; struct { - u8 action_code; u8 dialog_token; u8 variable[]; } __packed ttlm_req; struct { - u8 action_code; u8 dialog_token; __le16 status_code; u8 variable[]; } __packed ttlm_res; struct { - u8 action_code; + u8 no_fixed_fields[0]; + /* no variable fields either */ } __packed ttlm_tear_down; struct { - u8 action_code; u8 dialog_token; u8 variable[]; } __packed ml_reconf_req; struct { - u8 action_code; u8 dialog_token; u8 count; u8 variable[]; } __packed ml_reconf_resp; struct { - u8 action_code; + u8 no_fixed_fields[0]; u8 variable[]; } __packed epcs; struct { - u8 action_code; u8 dialog_token; u8 control; u8 variable[]; } __packed eml_omn; - } u; + }; } __packed action; DECLARE_FLEX_ARRAY(u8, body); /* Generic frame body */ } u; @@ -1210,8 +1192,7 @@ struct ieee80211_mgmt { #define BSS_MEMBERSHIP_SELECTOR_MIN BSS_MEMBERSHIP_SELECTOR_UHR_PHY -/* mgmt header + 1 byte category code */ -#define IEEE80211_MIN_ACTION_SIZE offsetof(struct ieee80211_mgmt, u.action.u) +#define IEEE80211_MIN_ACTION_SIZE(type) offsetofend(struct ieee80211_mgmt, u.action.type) /* Management MIC information element (IEEE 802.11w) for CMAC */ @@ -2391,7 +2372,7 @@ static inline bool ieee80211_is_bufferable_mmpdu(struct sk_buff *skb) if (!ieee80211_is_action(fc)) return false; - if (skb->len < offsetofend(typeof(*mgmt), u.action.u.ftm.action_code)) + if (skb->len < IEEE80211_MIN_ACTION_SIZE(action_code)) return true; /* action frame - additionally check for non-bufferable FTM */ @@ -2400,8 +2381,8 @@ static inline bool ieee80211_is_bufferable_mmpdu(struct sk_buff *skb) mgmt->u.action.category != WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION) return true; - if (mgmt->u.action.u.ftm.action_code == WLAN_PUB_ACTION_FTM_REQUEST || - mgmt->u.action.u.ftm.action_code == WLAN_PUB_ACTION_FTM_RESPONSE) + if (mgmt->u.action.action_code == WLAN_PUB_ACTION_FTM_REQUEST || + mgmt->u.action.action_code == WLAN_PUB_ACTION_FTM_RESPONSE) return false; return true; @@ -2451,7 +2432,7 @@ static inline bool _ieee80211_is_robust_mgmt_frame(struct ieee80211_hdr *hdr) */ static inline bool ieee80211_is_robust_mgmt_frame(struct sk_buff *skb) { - if (skb->len < IEEE80211_MIN_ACTION_SIZE) + if (skb->len < IEEE80211_MIN_ACTION_SIZE(category)) return false; return _ieee80211_is_robust_mgmt_frame((void *)skb->data); } @@ -2467,7 +2448,7 @@ static inline bool ieee80211_is_public_action(struct ieee80211_hdr *hdr, { struct ieee80211_mgmt *mgmt = (void *)hdr; - if (len < IEEE80211_MIN_ACTION_SIZE) + if (len < IEEE80211_MIN_ACTION_SIZE(category)) return false; if (!ieee80211_is_action(hdr->frame_control)) return false; @@ -2485,13 +2466,14 @@ static inline bool ieee80211_is_public_action(struct ieee80211_hdr *hdr, static inline bool ieee80211_is_protected_dual_of_public_action(struct sk_buff *skb) { + struct ieee80211_mgmt *mgmt = (void *)skb->data; u8 action; if (!ieee80211_is_public_action((void *)skb->data, skb->len) || - skb->len < IEEE80211_MIN_ACTION_SIZE + 1) + skb->len < IEEE80211_MIN_ACTION_SIZE(action_code)) return false; - action = *(u8 *)(skb->data + IEEE80211_MIN_ACTION_SIZE); + action = mgmt->u.action.action_code; return action != WLAN_PUB_ACTION_20_40_BSS_COEX && action != WLAN_PUB_ACTION_DSE_REG_LOC_ANN && @@ -2530,7 +2512,7 @@ static inline bool _ieee80211_is_group_privacy_action(struct ieee80211_hdr *hdr) */ static inline bool ieee80211_is_group_privacy_action(struct sk_buff *skb) { - if (skb->len < IEEE80211_MIN_ACTION_SIZE) + if (skb->len < IEEE80211_MIN_ACTION_SIZE(category)) return false; return _ieee80211_is_group_privacy_action((void *)skb->data); } @@ -2626,8 +2608,7 @@ static inline bool ieee80211_action_contains_tpc(struct sk_buff *skb) if (!ieee80211_is_action(mgmt->frame_control)) return false; - if (skb->len < IEEE80211_MIN_ACTION_SIZE + - sizeof(mgmt->u.action.u.tpc_report)) + if (skb->len < IEEE80211_MIN_ACTION_SIZE(tpc_report)) return false; /* @@ -2646,12 +2627,11 @@ static inline bool ieee80211_action_contains_tpc(struct sk_buff *skb) return false; /* both spectrum mgmt and link measurement have same action code */ - if (mgmt->u.action.u.tpc_report.action_code != - WLAN_ACTION_SPCT_TPC_RPRT) + if (mgmt->u.action.action_code != WLAN_ACTION_SPCT_TPC_RPRT) return false; - if (mgmt->u.action.u.tpc_report.tpc_elem_id != WLAN_EID_TPC_REPORT || - mgmt->u.action.u.tpc_report.tpc_elem_length != + if (mgmt->u.action.tpc_report.tpc_elem_id != WLAN_EID_TPC_REPORT || + mgmt->u.action.tpc_report.tpc_elem_length != sizeof(struct ieee80211_tpc_report_ie)) return false; @@ -2667,16 +2647,15 @@ static inline bool ieee80211_is_timing_measurement(struct sk_buff *skb) { struct ieee80211_mgmt *mgmt = (void *)skb->data; - if (skb->len < IEEE80211_MIN_ACTION_SIZE) + if (skb->len < IEEE80211_MIN_ACTION_SIZE(wnm_timing_msr)) return false; if (!ieee80211_is_action(mgmt->frame_control)) return false; if (mgmt->u.action.category == WLAN_CATEGORY_WNM_UNPROTECTED && - mgmt->u.action.u.wnm_timing_msr.action_code == - WLAN_UNPROTECTED_WNM_ACTION_TIMING_MEASUREMENT_RESPONSE && - skb->len >= offsetofend(typeof(*mgmt), u.action.u.wnm_timing_msr)) + mgmt->u.action.action_code == + WLAN_UNPROTECTED_WNM_ACTION_TIMING_MEASUREMENT_RESPONSE) return true; return false; @@ -2691,15 +2670,13 @@ static inline bool ieee80211_is_ftm(struct sk_buff *skb) { struct ieee80211_mgmt *mgmt = (void *)skb->data; - if (!ieee80211_is_public_action((void *)mgmt, skb->len)) + if (skb->len < IEEE80211_MIN_ACTION_SIZE(ftm)) return false; - if (mgmt->u.action.u.ftm.action_code == - WLAN_PUB_ACTION_FTM_RESPONSE && - skb->len >= offsetofend(typeof(*mgmt), u.action.u.ftm)) - return true; + if (!ieee80211_is_public_action((void *)mgmt, skb->len)) + return false; - return false; + return mgmt->u.action.action_code == WLAN_PUB_ACTION_FTM_RESPONSE; } struct element { -- cgit v1.2.3 From 9aa84d5c6c99480c523aeb7a6ce93b6635f3e290 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 4 Mar 2026 14:41:48 +0100 Subject: wifi: ieee80211: fix UHR operation DBE vs. P-EDCA order Draft P802.11bn_D1.3 switched the order here to align with the order of the fields. Adjust the code accordingly. Link: https://patch.msgid.link/20260304144148.ce45942294e1.I22ab3f16e6376a19c3953cf81dd67105ea8e529d@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211-uhr.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211-uhr.h b/include/linux/ieee80211-uhr.h index 9729d23e4766..d199f3ebdba0 100644 --- a/include/linux/ieee80211-uhr.h +++ b/include/linux/ieee80211-uhr.h @@ -12,8 +12,8 @@ #define IEEE80211_UHR_OPER_PARAMS_DPS_ENA 0x0001 #define IEEE80211_UHR_OPER_PARAMS_NPCA_ENA 0x0002 -#define IEEE80211_UHR_OPER_PARAMS_DBE_ENA 0x0004 -#define IEEE80211_UHR_OPER_PARAMS_PEDCA_ENA 0x0008 +#define IEEE80211_UHR_OPER_PARAMS_PEDCA_ENA 0x0004 +#define IEEE80211_UHR_OPER_PARAMS_DBE_ENA 0x0008 struct ieee80211_uhr_operation { __le16 params; -- cgit v1.2.3 From 98acd4c1d9f7dc9c426e840c16e81b57315ff84b Mon Sep 17 00:00:00 2001 From: Ria Thomas Date: Thu, 5 Mar 2026 14:43:04 +0530 Subject: wifi: mac80211: add support for NDP ADDBA/DELBA for S1G S1G defines use of NDP Block Ack (BA) for aggregation, requiring negotiation of NDP ADDBA/DELBA action frames. If the S1G recipient supports HT-immediate block ack, the sender must send an NDP ADDBA Request indicating it expects only NDP BlockAck frames for the agreement. Introduce support for NDP ADDBA and DELBA exchange in mac80211. The implementation negotiates the BA mechanism during setup based on station capabilities and driver support (IEEE80211_HW_SUPPORTS_NDP_BLOCKACK). If negotiation fails due to mismatched expectations, a rejection with status code WLAN_STATUS_REJECTED_NDP_BLOCK_ACK_SUGGESTED is returned as per IEEE 802.11-2024. Trace sample: IEEE 802.11 Wireless Management Fixed parameters Category code: Block Ack (3) Action code: NDP ADDBA Request (0x80) Dialog token: 0x01 Block Ack Parameters: 0x1003, A-MSDUs, Block Ack Policy .... .... .... ...1 = A-MSDUs: Permitted in QoS Data MPDUs .... .... .... ..1. = Block Ack Policy: Immediate Block Ack .... .... ..00 00.. = Traffic Identifier: 0x0 0001 0000 00.. .... = Number of Buffers (1 Buffer = 2304 Bytes): 64 Block Ack Timeout: 0x0000 Block Ack Starting Sequence Control (SSC): 0x0010 .... .... .... 0000 = Fragment: 0 0000 0000 0001 .... = Starting Sequence Number: 1 IEEE 802.11 Wireless Management Fixed parameters Category code: Block Ack (3) Action code: NDP ADDBA Response (0x81) Dialog token: 0x02 Status code: BlockAck negotiation refused because, due to buffer constraints and other unspecified reasons, the recipient prefers to generate only NDP BlockAck frames (0x006d) Block Ack Parameters: 0x1002, Block Ack Policy .... .... .... ...0 = A-MSDUs: Not Permitted .... .... .... ..1. = Block Ack Policy: Immediate Block Ack .... .... ..00 00.. = Traffic Identifier: 0x0 0001 0000 00.. .... = Number of Buffers (1 Buffer = 2304 Bytes): 64 Block Ack Timeout: 0x0000 Signed-off-by: Ria Thomas Link: https://patch.msgid.link/20260305091304.310990-1-ria.thomas@morsemicro.com Signed-off-by: Johannes Berg --- include/linux/ieee80211-ht.h | 3 +++ include/linux/ieee80211.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211-ht.h b/include/linux/ieee80211-ht.h index 21bbf470540f..7612b72f9c7c 100644 --- a/include/linux/ieee80211-ht.h +++ b/include/linux/ieee80211-ht.h @@ -281,6 +281,9 @@ enum ieee80211_back_actioncode { WLAN_ACTION_ADDBA_REQ = 0, WLAN_ACTION_ADDBA_RESP = 1, WLAN_ACTION_DELBA = 2, + WLAN_ACTION_NDP_ADDBA_REQ = 128, + WLAN_ACTION_NDP_ADDBA_RESP = 129, + WLAN_ACTION_NDP_DELBA = 130, }; /* BACK (block-ack) parties */ diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index aea360e90cb1..52db36120314 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1482,6 +1482,8 @@ enum ieee80211_statuscode { WLAN_STATUS_REJECT_DSE_BAND = 96, WLAN_STATUS_DENIED_WITH_SUGGESTED_BAND_AND_CHANNEL = 99, WLAN_STATUS_DENIED_DUE_TO_SPECTRUM_MANAGEMENT = 103, + /* 802.11ah */ + WLAN_STATUS_REJECTED_NDP_BLOCK_ACK_SUGGESTED = 109, /* 802.11ai */ WLAN_STATUS_FILS_AUTHENTICATION_FAILURE = 112, WLAN_STATUS_UNKNOWN_AUTHENTICATION_SERVER = 113, -- cgit v1.2.3 From 96fefcabf340fcf8b3208dcd8685961955a66040 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 4 Mar 2026 10:32:31 -0500 Subject: vfs: widen inode hash/lookup functions to u64 Change the inode hash/lookup VFS API functions to accept u64 parameters instead of unsigned long for inode numbers and hash values. This is preparation for widening i_ino itself to u64, which will allow filesystems to store full 64-bit inode numbers on 32-bit architectures. Since unsigned long implicitly widens to u64 on all architectures, this change is backward-compatible with all existing callers. Reviewed-by: Jan Kara Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20260304-iino-u64-v3-1-2257ad83d372@kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- include/linux/fs.h | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 8b3dd145b25e..b4f5e5fdbe4b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2934,33 +2934,31 @@ static inline int inode_generic_drop(struct inode *inode) } extern void d_mark_dontcache(struct inode *inode); -extern struct inode *ilookup5_nowait(struct super_block *sb, - unsigned long hashval, int (*test)(struct inode *, void *), - void *data, bool *isnew); -extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval, - int (*test)(struct inode *, void *), void *data); -extern struct inode *ilookup(struct super_block *sb, unsigned long ino); - -extern struct inode *inode_insert5(struct inode *inode, unsigned long hashval, - int (*test)(struct inode *, void *), - int (*set)(struct inode *, void *), - void *data); -struct inode *iget5_locked(struct super_block *, unsigned long, +struct inode *ilookup5_nowait(struct super_block *sb, u64 hashval, + int (*test)(struct inode *, void *), void *data, + bool *isnew); +struct inode *ilookup5(struct super_block *sb, u64 hashval, + int (*test)(struct inode *, void *), void *data); +struct inode *ilookup(struct super_block *sb, u64 ino); + +struct inode *inode_insert5(struct inode *inode, u64 hashval, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *data); +struct inode *iget5_locked(struct super_block *, u64, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); -struct inode *iget5_locked_rcu(struct super_block *, unsigned long, +struct inode *iget5_locked_rcu(struct super_block *, u64, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); -extern struct inode * iget_locked(struct super_block *, unsigned long); -extern struct inode *find_inode_nowait(struct super_block *, - unsigned long, - int (*match)(struct inode *, - unsigned long, void *), - void *data); -extern struct inode *find_inode_rcu(struct super_block *, unsigned long, - int (*)(struct inode *, void *), void *); -extern struct inode *find_inode_by_ino_rcu(struct super_block *, unsigned long); -extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); +struct inode *iget_locked(struct super_block *, u64); +struct inode *find_inode_nowait(struct super_block *, u64, + int (*match)(struct inode *, u64, void *), + void *data); +struct inode *find_inode_rcu(struct super_block *, u64, + int (*)(struct inode *, void *), void *); +struct inode *find_inode_by_ino_rcu(struct super_block *, u64); +int insert_inode_locked4(struct inode *, u64, + int (*test)(struct inode *, void *), void *); extern int insert_inode_locked(struct inode *); #ifdef CONFIG_DEBUG_LOCK_ALLOC extern void lockdep_annotate_inode_mutex_key(struct inode *inode); @@ -3015,7 +3013,7 @@ int setattr_should_drop_sgid(struct mnt_idmap *idmap, */ #define alloc_inode_sb(_sb, _cache, _gfp) kmem_cache_alloc_lru(_cache, &_sb->s_inode_lru, _gfp) -extern void __insert_inode_hash(struct inode *, unsigned long hashval); +void __insert_inode_hash(struct inode *, u64 hashval); static inline void insert_inode_hash(struct inode *inode) { __insert_inode_hash(inode, inode->i_ino); -- cgit v1.2.3 From 125dfa218134df7cc112667e92984de9d8cd0bf6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 4 Mar 2026 10:32:32 -0500 Subject: audit: widen ino fields to u64 inode->i_ino is being widened from unsigned long to u64. The audit subsystem uses unsigned long ino in struct fields, function parameters, and local variables that store inode numbers from arbitrary filesystems. On 32-bit platforms this truncates inode numbers that exceed 32 bits, which will cause incorrect audit log entries and broken watch/mark comparisons. Widen all audit ino fields, parameters, and locals to u64, and update the inode format string from %lu to %llu to match. Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20260304-iino-u64-v3-2-2257ad83d372@kernel.org Acked-by: Paul Moore Signed-off-by: Christian Brauner --- include/linux/audit.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index b642b5faca65..b915aaa7ed73 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -15,7 +15,7 @@ #include #include -#define AUDIT_INO_UNSET ((unsigned long)-1) +#define AUDIT_INO_UNSET ((u64)-1) #define AUDIT_DEV_UNSET ((dev_t)-1) struct audit_sig_info { -- cgit v1.2.3 From 0b2600f81cefcdfcda58d50df7be8fd48ada8ce2 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 4 Mar 2026 10:32:42 -0500 Subject: treewide: change inode->i_ino from unsigned long to u64 On 32-bit architectures, unsigned long is only 32 bits wide, which causes 64-bit inode numbers to be silently truncated. Several filesystems (NFS, XFS, BTRFS, etc.) can generate inode numbers that exceed 32 bits, and this truncation can lead to inode number collisions and other subtle bugs on 32-bit systems. Change the type of inode->i_ino from unsigned long to u64 to ensure that inode numbers are always represented as 64-bit values regardless of architecture. Update all format specifiers treewide from %lu/%lx to %llu/%llx to match the new type, along with corresponding local variable types. This is the bulk treewide conversion. Earlier patches in this series handled trace events separately to allow trace field reordering for better struct packing on 32-bit. Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20260304-iino-u64-v3-12-2257ad83d372@kernel.org Acked-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Chuck Lever Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index b4f5e5fdbe4b..e820c14f9c5a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -783,7 +783,7 @@ struct inode { #endif /* Stat data, not accessed from path walking */ - unsigned long i_ino; + u64 i_ino; /* * Filesystems may only read i_nlink directly. They shall use the * following functions for modification: -- cgit v1.2.3 From ebeca1f930eac8f11f815d58eb38fa5d07e7c16e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2026 07:58:03 -1000 Subject: sched_ext: Introduce cgroup sub-sched support A system often runs multiple workloads especially in multi-tenant server environments where a system is split into partitions servicing separate more-or-less independent workloads each requiring an application-specific scheduler. To support such and other use cases, sched_ext is in the process of growing multiple scheduler support. When partitioning a system in terms of CPUs for such use cases, an oft-taken approach is hard partitioning the system using cpuset. While it would be possible to tie sched_ext multiple scheduler support to cpuset partitions, such an approach would have fundamental limitations stemming from the lack of dynamism and flexibility. Users often don't care which specific CPUs are assigned to which workload and want to take advantage of optimizations which are enabled by running workloads on a larger machine - e.g. opportunistic over-commit, improving latency critical workload characteristics while maintaining bandwidth fairness, employing control mechanisms based on different criteria than on-CPU time for e.g. flexible memory bandwidth isolation, packing similar parts from different workloads on same L3s to improve cache efficiency, and so on. As this sort of dynamic behaviors are impossible or difficult to implement with hard partitioning, sched_ext is implementing cgroup sub-sched support where schedulers can be attached to the cgroup hierarchy and a parent scheduler is responsible for controlling the CPUs that each child can use at any given moment. This makes CPU distribution dynamically controlled by BPF allowing high flexibility. This patch adds the skeletal sched_ext cgroup sub-sched support: - sched_ext_ops.sub_cgroup_id and .sub_attach/detach() are added. Non-zero sub_cgroup_id indicates that the scheduler is to be attached to the identified cgroup. A sub-sched is attached to the cgroup iff the nearest ancestor scheduler implements .sub_attach() and grants the attachment. Max nesting depth is limited by SCX_SUB_MAX_DEPTH. - When a scheduler exits, all its descendant schedulers are exited together. Also, cgroup.scx_sched added which points to the effective scheduler instance for the cgroup. This is updated on scheduler init/exit and inherited on cgroup online. When a cgroup is offlined, the attached scheduler is automatically exited. - Sub-sched support is gated on CONFIG_EXT_SUB_SCHED which is automatically enabled if both SCX and cgroups are enabled. Sub-sched support is not tied to the CPU controller but rather the cgroup hierarchy itself. This is intentional as the support for cpu.weight and cpu.max based resource control is orthogonal to sub-sched support. Note that CONFIG_CGROUPS around cgroup subtree iteration support for scx_task_iter is replaced with CONFIG_EXT_SUB_SCHED for consistency. - This allows loading sub-scheds and most framework operations such as propagating disable down the hierarchy work. However, sub-scheds are not operational yet and all tasks stay with the root sched. This will serve as the basis for building up full sub-sched support. - DSQs point to the scx_sched they belong to. - scx_qmap is updated to allow attachment of sub-scheds and also serving as sub-scheds. - scx_is_descendant() is added but not yet used in this patch. It is used by later changes in the series and placed here as this is where the function belongs. Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/cgroup-defs.h | 4 ++++ include/linux/sched/ext.h | 3 +++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index bb92f5c169ca..dd61767cf9bb 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -624,6 +625,9 @@ struct cgroup { #ifdef CONFIG_BPF_SYSCALL struct bpf_local_storage __rcu *bpf_cgrp_storage; #endif +#ifdef CONFIG_EXT_SUB_SCHED + struct scx_sched __rcu *scx_sched; +#endif /* All ancestors including self */ union { diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 0150b3fe6230..fa4349b319e6 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -78,6 +78,7 @@ struct scx_dispatch_q { u64 id; struct rhash_head hash_node; struct llist_node free_node; + struct scx_sched *sched; struct rcu_head rcu; }; @@ -157,6 +158,8 @@ struct scx_dsq_list_node { .priv = (__priv), \ } +struct scx_sched; + /* * The following is embedded in task_struct and contains all fields necessary * for a task to be scheduled by SCX. -- cgit v1.2.3 From 88234b075c3fc23d57406e1867523b6aba783ebf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2026 07:58:03 -1000 Subject: sched_ext: Introduce scx_task_sched[_rcu]() In preparation of multiple scheduler support, add p->scx.sched which points to the scx_sched instance that the task is scheduled by, which is currently always scx_root. Add scx_task_sched[_rcu]() accessors which return the associated scx_sched of the specified task and replace the raw scx_root dereferences with it where applicable. scx_task_on_sched() is also added to test whether a given task is on the specified sched. As scx_root is still the only scheduler, this shouldn't introduce user-visible behavior changes. Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index fa4349b319e6..3213e31c7979 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -165,6 +165,13 @@ struct scx_sched; * for a task to be scheduled by SCX. */ struct sched_ext_entity { +#ifdef CONFIG_CGROUPS + /* + * Associated scx_sched. Updated either during fork or while holding + * both p->pi_lock and rq lock. + */ + struct scx_sched __rcu *sched; +#endif struct scx_dispatch_q *dsq; atomic_long_t ops_state; u64 ddsp_dsq_id; -- cgit v1.2.3 From 337ec00b1d9c676f637651c2cefddb8612b867ee Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2026 07:58:04 -1000 Subject: sched_ext: Implement cgroup sub-sched enabling and disabling The preceding changes implemented the framework to support cgroup sub-scheds and updated scheduling paths and kfuncs so that they have minimal but working support for sub-scheds. However, actual sub-sched enabling/disabling hasn't been implemented yet and all tasks stayed on scx_root. Implement cgroup sub-sched enabling and disabling to actually activate sub-scheds: - Both enable and disable operations bypass only the tasks in the subtree of the child being enabled or disabled to limit disruptions. - When enabling, all candidate tasks are first initialized for the child sched. Once that succeeds, the tasks are exited for the parent and then switched over to the child. This adds a bit of complication but guarantees that child scheduler failures are always contained. - Disabling works the same way in the other direction. However, when the parent may fail to initialize a task, disabling is propagated up to the parent. While this means that a parent sched fail due to a child sched event, the failure can only originate from the parent itself (its ops.init_task()). The only effect a malfunctioning child can have on the parent is attempting to move the tasks back to the parent. After this change, although not all the necessary mechanisms are in place yet, sub-scheds can take control of their tasks and schedule them. v2: Fix missing scx_cgroup_unlock()/percpu_up_write() in abort path (Cheng-Yang Chou). Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 3213e31c7979..f354d7d34306 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -88,6 +88,7 @@ enum scx_ent_flags { SCX_TASK_IN_CUSTODY = 1 << 1, /* in custody, needs ops.dequeue() when leaving */ SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */ SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ + SCX_TASK_SUB_INIT = 1 << 4, /* task being initialized for a sub sched */ SCX_TASK_STATE_SHIFT = 8, /* bit 8 and 9 are used to carry scx_task_state */ SCX_TASK_STATE_BITS = 2, -- cgit v1.2.3 From 3cd963fa915c494a6d0da0287bd10cb6f2204f9e Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 5 Mar 2026 10:42:57 +0000 Subject: net: stmmac: mdio_bus_data->default_an_inband is boolean default_an_inband is declared as an unsigned int, but is set to true/ false and is assigned to phylink_config's member of the same name which is a bool. Declare this also as a bool for consistency. Signed-off-by: Russell King (Oracle) Reviewed-by: Maxime Chevallier Tested-by: Maxime Chevallier Link: https://patch.msgid.link/E1vy6AT-0000000BtxD-2qm7@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 2fc169c7117e..678d03d6d3bd 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -86,10 +86,10 @@ struct stmmac_priv; struct stmmac_mdio_bus_data { unsigned int phy_mask; unsigned int pcs_mask; - unsigned int default_an_inband; int *irqs; int probed_phy_irq; bool needs_reset; + bool default_an_inband; }; struct stmmac_dma_cfg { -- cgit v1.2.3 From e4fd855c52ec5af34d920206190be29919fadca3 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 5 Mar 2026 10:43:02 +0000 Subject: net: stmmac: make pcs_mask and phy_mask u32 The PCS and PHY masks are passed to the mdio bus layer as phy_mask to prevent bus addresses between 0 and 31 inclusive being scanned, and this is declared as u32. Also declare these as u32 in stmmac for type consistency. Since this is a u32, use BIT_U32() rather than BIT() to generate values for these fields. Signed-off-by: Russell King (Oracle) Reviewed-by: Maxime Chevallier Tested-by: Maxime Chevallier Link: https://patch.msgid.link/E1vy6AY-0000000BtxJ-3smT@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 678d03d6d3bd..965ada809fdf 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -84,8 +84,8 @@ struct stmmac_priv; /* Platfrom data for platform device structure's platform_data field */ struct stmmac_mdio_bus_data { - unsigned int phy_mask; - unsigned int pcs_mask; + u32 phy_mask; + u32 pcs_mask; int *irqs; int probed_phy_irq; bool needs_reset; -- cgit v1.2.3 From 55f854dd5bdd8e19b936a00ef1f8d776ac32c7b0 Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Wed, 4 Mar 2026 14:43:38 +0100 Subject: qmi_wwan: allow max_mtu above hard_mtu to control rx_urb_size Commit c7159e960f14 ("usbnet: limit max_mtu based on device's hard_mtu") capped net->max_mtu to the device's hard_mtu in usbnet_probe(). While this correctly prevents oversized packets on standard USB network devices, it breaks the qmi_wwan driver. qmi_wwan relies on userspace (e.g. ModemManager) setting a large MTU on the wwan0 interface to configure rx_urb_size via usbnet_change_mtu(). QMI modems negotiate USB transfer sizes of 16,383 or 32,767 bytes, and the USB receive buffers must be sized accordingly. With max_mtu capped to hard_mtu (~1500 bytes), userspace can no longer raise the MTU, the receive buffers remain small, and download speeds drop from >300 Mbps to ~0.8 Mbps. Introduce a FLAG_NOMAXMTU driver flag that allows individual usbnet drivers to opt out of the max_mtu cap. Set this flag in qmi_wwan's driver_info structures to restore the previous behavior for QMI devices, while keeping the safety fix in place for all other usbnet drivers. Fixes: c7159e960f14 ("usbnet: limit max_mtu based on device's hard_mtu") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/lkml/CAPh3n803k8JcBPV5qEzUB-oKzWkAs-D5CU7z=Vd_nLRCr5ZqQg@mail.gmail.com/ Reported-by: Koen Vandeputte Tested-by: Daniele Palmas Signed-off-by: Laurent Vivier Link: https://patch.msgid.link/20260304134338.1785002-1-lvivier@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/usb/usbnet.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index b0e84896e6ac..bbf799ccf3b3 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -132,6 +132,7 @@ struct driver_info { #define FLAG_MULTI_PACKET 0x2000 #define FLAG_RX_ASSEMBLE 0x4000 /* rx packets may span >1 frames */ #define FLAG_NOARP 0x8000 /* device can't do ARP */ +#define FLAG_NOMAXMTU 0x10000 /* allow max_mtu above hard_mtu */ /* init device ... can sleep, or cause probe() failure */ int (*bind)(struct usbnet *, struct usb_interface *); -- cgit v1.2.3 From 260d27b3aec9f30d68f9f3cacc674655897eb745 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 4 Mar 2026 21:17:28 +0100 Subject: net: phy: remove phy_attach 378e6523ebb1 ("net: bcmgenet: remove unused platform code") removed the last user of phy_attach(). So remove this function. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/8812176a-e319-4e9f-815d-99ea339df8b2@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 6f9979a26892..e9b0d7427b0e 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2152,8 +2152,6 @@ int phy_suspend(struct phy_device *phydev); int phy_resume(struct phy_device *phydev); int __phy_resume(struct phy_device *phydev); int phy_loopback(struct phy_device *phydev, bool enable, int speed); -struct phy_device *phy_attach(struct net_device *dev, const char *bus_id, - phy_interface_t interface); struct phy_device *phy_find_next(struct mii_bus *bus, struct phy_device *pos); int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, u32 flags, phy_interface_t interface); -- cgit v1.2.3 From c9429bf56405a326845a8a35357b5bdf1dc4558c Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 24 Feb 2026 19:29:54 +0000 Subject: rhashtable: consolidate hash computation in rht_key_get_hash() The else-if and else branches in rht_key_get_hash() both compute a hash using either params.hashfn or jhash, differing only in the source of key_len (params.key_len vs ht->p.key_len). Merge the two branches into one by using the ternary `params.key_len ?: ht->p.key_len` to select the key length, removing the duplicated logic. This also improves the performance of the else branch which previously always used jhash and never fell through to jhash2. This branch is going to be used by BPF resizable hashmap, which wraps rhashtable: https://lore.kernel.org/bpf/20260205-rhash-v1-0-30dd6d63c462@meta.com/ Signed-off-by: Mykyta Yatsenko Signed-off-by: Herbert Xu --- include/linux/rhashtable.h | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 133ccb39137a..0480509a6339 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -129,10 +129,10 @@ static __always_inline unsigned int rht_key_get_hash(struct rhashtable *ht, unsigned int hash; /* params must be equal to ht->p if it isn't constant. */ - if (!__builtin_constant_p(params.key_len)) + if (!__builtin_constant_p(params.key_len)) { hash = ht->p.hashfn(key, ht->key_len, hash_rnd); - else if (params.key_len) { - unsigned int key_len = params.key_len; + } else { + unsigned int key_len = params.key_len ? : ht->p.key_len; if (params.hashfn) hash = params.hashfn(key, key_len, hash_rnd); @@ -140,13 +140,6 @@ static __always_inline unsigned int rht_key_get_hash(struct rhashtable *ht, hash = jhash(key, key_len, hash_rnd); else hash = jhash2(key, key_len / sizeof(u32), hash_rnd); - } else { - unsigned int key_len = ht->p.key_len; - - if (params.hashfn) - hash = params.hashfn(key, key_len, hash_rnd); - else - hash = jhash(key, key_len, hash_rnd); } return hash; -- cgit v1.2.3 From 30b0515342db48ac9ffd9999648de0f7ca1d6a87 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Mar 2026 05:29:50 -1000 Subject: sched_ext: Add per-CPU data to DSQs Add per-CPU data structure to dispatch queues. Each DSQ now has a percpu scx_dsq_pcpu which contains a back-pointer to the DSQ. This will be used by future changes to implement per-CPU reenqueue tracking for user DSQs. init_dsq() now allocates the percpu data and can fail, so it returns an error code. All callers are updated to handle failures. exit_dsq() is added to free the percpu data and is called from all DSQ cleanup paths. In scx_bpf_create_dsq(), init_dsq() is called before rcu_read_lock() since alloc_percpu() requires GFP_KERNEL context, and dsq->sched is set afterwards. v2: Fix err_free_pcpu to only exit_dsq() initialized bypass DSQs (Andrea Righi). Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index f354d7d34306..98cc1f41b91e 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -62,6 +62,10 @@ enum scx_dsq_id_flags { SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, }; +struct scx_dsq_pcpu { + struct scx_dispatch_q *dsq; +}; + /* * A dispatch queue (DSQ) can be either a FIFO or p->scx.dsq_vtime ordered * queue. A built-in DSQ is always a FIFO. The built-in local DSQs are used to @@ -79,6 +83,7 @@ struct scx_dispatch_q { struct rhash_head hash_node; struct llist_node free_node; struct scx_sched *sched; + struct scx_dsq_pcpu __percpu *pcpu; struct rcu_head rcu; }; -- cgit v1.2.3 From 35250720d6ed1e83e0d1e12b7e8bf7b8316d7d58 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Mar 2026 05:29:50 -1000 Subject: sched_ext: Factor out nldsq_cursor_next_task() and nldsq_cursor_lost_task() Factor out cursor-based DSQ iteration from bpf_iter_scx_dsq_next() into nldsq_cursor_next_task() and the task-lost check from scx_dsq_move() into nldsq_cursor_lost_task() to prepare for reuse. As ->priv is only used to record dsq->seq for cursors, update INIT_DSQ_LIST_CURSOR() to take the DSQ pointer and set ->priv from dsq->seq so that users don't have to read it manually. Move scx_dsq_iter_flags enum earlier so nldsq_cursor_next_task() can use SCX_DSQ_ITER_REV. bypass_lb_cpu() now sets cursor.priv to dsq->seq but doesn't use it. Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 98cc1f41b91e..303f57dfb947 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -157,11 +157,11 @@ struct scx_dsq_list_node { u32 priv; /* can be used by iter cursor */ }; -#define INIT_DSQ_LIST_CURSOR(__node, __flags, __priv) \ +#define INIT_DSQ_LIST_CURSOR(__cursor, __dsq, __flags) \ (struct scx_dsq_list_node) { \ - .node = LIST_HEAD_INIT((__node).node), \ + .node = LIST_HEAD_INIT((__cursor).node), \ .flags = SCX_DSQ_LNODE_ITER_CURSOR | (__flags), \ - .priv = (__priv), \ + .priv = READ_ONCE((__dsq)->seq), \ } struct scx_sched; -- cgit v1.2.3 From 84b1a0ea0b7c23dec240783a592e480780efe459 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Mar 2026 05:29:50 -1000 Subject: sched_ext: Implement scx_bpf_dsq_reenq() for user DSQs scx_bpf_dsq_reenq() currently only supports local DSQs. Extend it to support user-defined DSQs by adding a deferred re-enqueue mechanism similar to the local DSQ handling. Add per-cpu deferred_reenq_user_node/flags to scx_dsq_pcpu and deferred_reenq_users list to scx_rq. When scx_bpf_dsq_reenq() is called on a user DSQ, the DSQ's per-cpu node is added to the current rq's deferred list. process_deferred_reenq_users() then iterates the DSQ using the cursor helpers and re-enqueues each task. Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 303f57dfb947..e77504faa0bc 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -62,8 +62,14 @@ enum scx_dsq_id_flags { SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, }; +struct scx_deferred_reenq_user { + struct list_head node; + u64 flags; +}; + struct scx_dsq_pcpu { struct scx_dispatch_q *dsq; + struct scx_deferred_reenq_user deferred_reenq_user; }; /* -- cgit v1.2.3 From 7203d77d6e04f83f7b78838eed099d9cac31700b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Mar 2026 05:29:50 -1000 Subject: sched_ext: Simplify task state handling Task states (NONE, INIT, READY, ENABLED) were defined in a separate enum with unshifted values and then shifted when stored in scx_entity.flags. Simplify by defining them as pre-shifted values directly in scx_ent_flags and removing the separate scx_task_state enum. This removes the need for shifting when reading/writing state values. scx_get_task_state() now returns the masked flags value directly. scx_set_task_state() accepts the pre-shifted state value. scx_dump_task() shifts down for display to maintain readable output. No functional changes. Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index e77504faa0bc..e822b374b17f 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -93,7 +93,7 @@ struct scx_dispatch_q { struct rcu_head rcu; }; -/* scx_entity.flags */ +/* sched_ext_entity.flags */ enum scx_ent_flags { SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */ SCX_TASK_IN_CUSTODY = 1 << 1, /* in custody, needs ops.dequeue() when leaving */ @@ -101,21 +101,25 @@ enum scx_ent_flags { SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ SCX_TASK_SUB_INIT = 1 << 4, /* task being initialized for a sub sched */ - SCX_TASK_STATE_SHIFT = 8, /* bit 8 and 9 are used to carry scx_task_state */ + /* + * Bits 8 and 9 are used to carry task state: + * + * NONE ops.init_task() not called yet + * INIT ops.init_task() succeeded, but task can be cancelled + * READY fully initialized, but not in sched_ext + * ENABLED fully initialized and in sched_ext + */ + SCX_TASK_STATE_SHIFT = 8, /* bits 8 and 9 are used to carry task state */ SCX_TASK_STATE_BITS = 2, SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT, - SCX_TASK_CURSOR = 1 << 31, /* iteration cursor, not a task */ -}; - -/* scx_entity.flags & SCX_TASK_STATE_MASK */ -enum scx_task_state { - SCX_TASK_NONE, /* ops.init_task() not called yet */ - SCX_TASK_INIT, /* ops.init_task() succeeded, but task can be cancelled */ - SCX_TASK_READY, /* fully initialized, but not in sched_ext */ - SCX_TASK_ENABLED, /* fully initialized and in sched_ext */ + SCX_TASK_NONE = 0 << SCX_TASK_STATE_SHIFT, + SCX_TASK_INIT = 1 << SCX_TASK_STATE_SHIFT, + SCX_TASK_READY = 2 << SCX_TASK_STATE_SHIFT, + SCX_TASK_ENABLED = 3 << SCX_TASK_STATE_SHIFT, - SCX_TASK_NR_STATES, + /* iteration cursor, not a task */ + SCX_TASK_CURSOR = 1 << 31, }; /* scx_entity.dsq_flags */ -- cgit v1.2.3 From ce897abc21b2d5e74981ff2b848f3a08a580d50a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Mar 2026 05:29:50 -1000 Subject: sched_ext: Add SCX_TASK_REENQ_REASON flags SCX_ENQ_REENQ indicates that a task is being re-enqueued but doesn't tell the BPF scheduler why. Add SCX_TASK_REENQ_REASON flags using bits 12-13 of p->scx.flags to communicate the reason during ops.enqueue(): - NONE: Not being reenqueued - KFUNC: Reenqueued by scx_bpf_dsq_reenq() and friends More reasons will be added. Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index e822b374b17f..60a4f65d0174 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -118,6 +118,21 @@ enum scx_ent_flags { SCX_TASK_READY = 2 << SCX_TASK_STATE_SHIFT, SCX_TASK_ENABLED = 3 << SCX_TASK_STATE_SHIFT, + /* + * Bits 12 and 13 are used to carry reenqueue reason. In addition to + * %SCX_ENQ_REENQ flag, ops.enqueue() can also test for + * %SCX_TASK_REENQ_REASON_NONE to distinguish reenqueues. + * + * NONE not being reenqueued + * KFUNC reenqueued by scx_bpf_dsq_reenq() and friends + */ + SCX_TASK_REENQ_REASON_SHIFT = 12, + SCX_TASK_REENQ_REASON_BITS = 2, + SCX_TASK_REENQ_REASON_MASK = ((1 << SCX_TASK_REENQ_REASON_BITS) - 1) << SCX_TASK_REENQ_REASON_SHIFT, + + SCX_TASK_REENQ_NONE = 0 << SCX_TASK_REENQ_REASON_SHIFT, + SCX_TASK_REENQ_KFUNC = 1 << SCX_TASK_REENQ_REASON_SHIFT, + /* iteration cursor, not a task */ SCX_TASK_CURSOR = 1 << 31, }; -- cgit v1.2.3 From 1954c4f012206147c34acda8da04f827aa7d3ee3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Mar 2026 20:07:15 +0000 Subject: eventpoll: Convert epoll_put_uevent() to scoped user access Saves two function calls, and one stac/clac pair. stac/clac is rather expensive on older cpus like Zen 2. A synthetic network stress test gives a ~1.5% increase of pps on AMD Zen 2. Signed-off-by: Eric Dumazet Cc: Christophe Leroy Cc: Dave Hansen Cc: Kuniyuki Iwashima Signed-off-by: Linus Torvalds --- include/linux/eventpoll.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index ccb478eb174b..ea9ca0e4172a 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -82,11 +82,14 @@ static inline struct epoll_event __user * epoll_put_uevent(__poll_t revents, __u64 data, struct epoll_event __user *uevent) { - if (__put_user(revents, &uevent->events) || - __put_user(data, &uevent->data)) - return NULL; - + scoped_user_write_access_size(uevent, sizeof(*uevent), efault) { + unsafe_put_user(revents, &uevent->events, efault); + unsafe_put_user(data, &uevent->data, efault); + } return uevent+1; + +efault: + return NULL; } #endif -- cgit v1.2.3 From 1ea4b473504b6dc6a0d21c298519aff2d52433c9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 5 Mar 2026 19:55:41 +0000 Subject: locking/rwsem: Remove the list_head from struct rw_semaphore Instead of embedding a list_head in struct rw_semaphore, store a pointer to the first waiter. The list of waiters remains a doubly linked list so we can efficiently add to the tail of the list, remove from the front (or middle) of the list. Some of the list manipulation becomes more complicated, but it's a reasonable tradeoff on the slow paths to shrink some core data structures like struct inode. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260305195545.3707590-2-willy@infradead.org --- include/linux/rwsem.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 9bf1d93d3d7b..e7829531c4ba 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -57,7 +57,7 @@ context_lock_struct(rw_semaphore) { struct optimistic_spin_queue osq; /* spinner MCS lock */ #endif raw_spinlock_t wait_lock; - struct list_head wait_list; + struct rwsem_waiter *first_waiter; #ifdef CONFIG_DEBUG_RWSEMS void *magic; #endif @@ -106,7 +106,7 @@ static inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore * .owner = ATOMIC_LONG_INIT(0), \ __RWSEM_OPT_INIT(name) \ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),\ - .wait_list = LIST_HEAD_INIT((name).wait_list), \ + .first_waiter = NULL, \ __RWSEM_DEBUG_INIT(name) \ __RWSEM_DEP_MAP_INIT(name) } @@ -129,9 +129,9 @@ do { \ * rwsem to see if somebody from an incompatible type is wanting access to the * lock. */ -static inline int rwsem_is_contended(struct rw_semaphore *sem) +static inline bool rwsem_is_contended(struct rw_semaphore *sem) { - return !list_empty(&sem->wait_list); + return sem->first_waiter != NULL; } #if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER) -- cgit v1.2.3 From b9bdd4b6840454ef87f61b6506c9635c57a81650 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 5 Mar 2026 19:55:42 +0000 Subject: locking/semaphore: Remove the list_head from struct semaphore Instead of embedding a list_head in struct semaphore, store a pointer to the first waiter. The list of waiters remains a doubly linked list so we can efficiently add to the tail of the list and remove from the front (or middle) of the list. Some of the list manipulation becomes more complicated, but it's a reasonable tradeoff on the slow paths to shrink data structures which embed a semaphore. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260305195545.3707590-3-willy@infradead.org --- include/linux/semaphore.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/semaphore.h b/include/linux/semaphore.h index 89706157e622..a4c8651ef021 100644 --- a/include/linux/semaphore.h +++ b/include/linux/semaphore.h @@ -15,7 +15,7 @@ struct semaphore { raw_spinlock_t lock; unsigned int count; - struct list_head wait_list; + struct semaphore_waiter *first_waiter; #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER unsigned long last_holder; @@ -33,7 +33,7 @@ struct semaphore { { \ .lock = __RAW_SPIN_LOCK_UNLOCKED((name).lock), \ .count = n, \ - .wait_list = LIST_HEAD_INIT((name).wait_list) \ + .first_waiter = NULL \ __LAST_HOLDER_SEMAPHORE_INITIALIZER \ } -- cgit v1.2.3 From 25500ba7e77ce9d3d9b5a1929d41a2ee2e23f6fe Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 5 Mar 2026 19:55:43 +0000 Subject: locking/mutex: Remove the list_head from struct mutex Instead of embedding a list_head in struct mutex, store a pointer to the first waiter. The list of waiters remains a doubly linked list so we can efficiently add to the tail of the list, remove from the front (or middle) of the list. Some of the list manipulation becomes more complicated, but it's a reasonable tradeoff on the slow paths to shrink data structures which embed a mutex like struct file. Some of the debug checks have to be deleted because there's no equivalent to checking them in the new scheme (eg an empty waiter->list now means that it is the only waiter, not that the waiter is no longer on the list). Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260305195545.3707590-4-willy@infradead.org --- include/linux/mutex.h | 2 +- include/linux/mutex_types.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 2f648ee204e7..c471b129f703 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -79,7 +79,7 @@ do { \ #define __MUTEX_INITIALIZER(lockname) \ { .owner = ATOMIC_LONG_INIT(0) \ , .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(lockname.wait_lock) \ - , .wait_list = LIST_HEAD_INIT(lockname.wait_list) \ + , .first_waiter = NULL \ __DEBUG_MUTEX_INITIALIZER(lockname) \ __DEP_MAP_MUTEX_INITIALIZER(lockname) } diff --git a/include/linux/mutex_types.h b/include/linux/mutex_types.h index 80975935ec48..a8f119f81177 100644 --- a/include/linux/mutex_types.h +++ b/include/linux/mutex_types.h @@ -44,7 +44,7 @@ context_lock_struct(mutex) { #ifdef CONFIG_MUTEX_SPIN_ON_OWNER struct optimistic_spin_queue osq; /* Spinner MCS lock */ #endif - struct list_head wait_list; + struct mutex_waiter *first_waiter; #ifdef CONFIG_DEBUG_MUTEXES void *magic; #endif -- cgit v1.2.3 From 07574b8ebaac7927e2355b4f343b03b50e04494c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 20 Jan 2026 13:40:30 +0100 Subject: compiler-context-analysys: Add __cond_releases() Useful for things like unlock fastpaths, which on success release the lock. Suggested-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Marco Elver Link: https://patch.msgid.link/20260121111213.634625032@infradead.org --- include/linux/compiler-context-analysis.h | 32 +++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler-context-analysis.h b/include/linux/compiler-context-analysis.h index 00c074a2ccb0..a9317571e6af 100644 --- a/include/linux/compiler-context-analysis.h +++ b/include/linux/compiler-context-analysis.h @@ -320,6 +320,38 @@ static inline void _context_unsafe_alias(void **p) { } */ #define __releases(...) __releases_ctx_lock(__VA_ARGS__) +/* + * Clang's analysis does not care precisely about the value, only that it is + * either zero or non-zero. So the __cond_acquires() interface might be + * misleading if we say that @ret is the value returned if acquired. Instead, + * provide symbolic variants which we translate. + */ +#define __cond_acquires_impl_not_true(x, ...) __try_acquires##__VA_ARGS__##_ctx_lock(0, x) +#define __cond_acquires_impl_not_false(x, ...) __try_acquires##__VA_ARGS__##_ctx_lock(1, x) +#define __cond_acquires_impl_not_nonzero(x, ...) __try_acquires##__VA_ARGS__##_ctx_lock(0, x) +#define __cond_acquires_impl_not_0(x, ...) __try_acquires##__VA_ARGS__##_ctx_lock(1, x) +#define __cond_acquires_impl_not_nonnull(x, ...) __try_acquires##__VA_ARGS__##_ctx_lock(0, x) +#define __cond_acquires_impl_not_NULL(x, ...) __try_acquires##__VA_ARGS__##_ctx_lock(1, x) + +/** + * __cond_releases() - function attribute, function conditionally + * releases a context lock exclusively + * @ret: abstract value returned by function if context lock releases + * @x: context lock instance pointer + * + * Function attribute declaring that the function conditionally releases the + * given context lock instance @x exclusively. The associated context(s) must + * be active on entry. The function return value @ret denotes when the context + * lock is released. + * + * @ret may be one of: true, false, nonzero, 0, nonnull, NULL. + * + * NOTE: clang does not have a native attribute for this; instead implement + * it as an unconditional release and a conditional acquire for the + * inverted condition -- which is semantically equivalent. + */ +#define __cond_releases(ret, x) __releases(x) __cond_acquires_impl_not_##ret(x) + /** * __acquire() - function to acquire context lock exclusively * @x: context lock instance pointer -- cgit v1.2.3 From 5c4326231cde36fd5e90c41e403df9fac6238f4b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 20 Jan 2026 10:06:08 +0100 Subject: locking/mutex: Add context analysis Add compiler context analysis annotations. Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260121111213.745353747@infradead.org --- include/linux/mutex.h | 2 +- include/linux/mutex_types.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index c471b129f703..734048c02f4f 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -183,7 +183,7 @@ static inline int __must_check __devm_mutex_init(struct device *dev, struct mute */ #ifdef CONFIG_DEBUG_LOCK_ALLOC extern void mutex_lock_nested(struct mutex *lock, unsigned int subclass) __acquires(lock); -extern void _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock); +extern void _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock) __acquires(lock); extern int __must_check mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) __cond_acquires(0, lock); extern int __must_check _mutex_lock_killable(struct mutex *lock, diff --git a/include/linux/mutex_types.h b/include/linux/mutex_types.h index a8f119f81177..24ed599fdda8 100644 --- a/include/linux/mutex_types.h +++ b/include/linux/mutex_types.h @@ -44,7 +44,7 @@ context_lock_struct(mutex) { #ifdef CONFIG_MUTEX_SPIN_ON_OWNER struct optimistic_spin_queue osq; /* Spinner MCS lock */ #endif - struct mutex_waiter *first_waiter; + struct mutex_waiter *first_waiter __guarded_by(&wait_lock); #ifdef CONFIG_DEBUG_MUTEXES void *magic; #endif -- cgit v1.2.3 From 90bb681dcdf7e69c90b56a18f06c0389a0810b92 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 20 Jan 2026 18:17:50 +0100 Subject: locking/rtmutex: Add context analysis Add compiler context analysis annotations. Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260121111213.851599178@infradead.org --- include/linux/rtmutex.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index ede4c6bf6f22..78e7e588817c 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -22,8 +22,8 @@ extern int max_lock_depth; struct rt_mutex_base { raw_spinlock_t wait_lock; - struct rb_root_cached waiters; - struct task_struct *owner; + struct rb_root_cached waiters __guarded_by(&wait_lock); + struct task_struct *owner __guarded_by(&wait_lock); }; #define __RT_MUTEX_BASE_INITIALIZER(rtbasename) \ @@ -41,7 +41,7 @@ struct rt_mutex_base { */ static inline bool rt_mutex_base_is_locked(struct rt_mutex_base *lock) { - return READ_ONCE(lock->owner) != NULL; + return data_race(READ_ONCE(lock->owner) != NULL); } #ifdef CONFIG_RT_MUTEXES @@ -49,7 +49,7 @@ static inline bool rt_mutex_base_is_locked(struct rt_mutex_base *lock) static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock) { - unsigned long owner = (unsigned long) READ_ONCE(lock->owner); + unsigned long owner = (unsigned long) data_race(READ_ONCE(lock->owner)); return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS); } -- cgit v1.2.3 From 739690915ce1f017223ef4e6f3cc966ccfa3c861 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 6 Mar 2026 10:43:56 +0100 Subject: locking/rwsem: Add context analysis Add compiler context analysis annotations. Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260306101417.GT1282955@noisy.programming.kicks-ass.net --- include/linux/rwsem.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index e7829531c4ba..6a1a7bae5f81 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -57,7 +57,7 @@ context_lock_struct(rw_semaphore) { struct optimistic_spin_queue osq; /* spinner MCS lock */ #endif raw_spinlock_t wait_lock; - struct rwsem_waiter *first_waiter; + struct rwsem_waiter *first_waiter __guarded_by(&wait_lock); #ifdef CONFIG_DEBUG_RWSEMS void *magic; #endif @@ -131,7 +131,7 @@ do { \ */ static inline bool rwsem_is_contended(struct rw_semaphore *sem) { - return sem->first_waiter != NULL; + return data_race(sem->first_waiter != NULL); } #if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER) -- cgit v1.2.3 From 57df858a46f0a4cc104716e0ec88864e5c386ca4 Mon Sep 17 00:00:00 2001 From: Jassi Brar Date: Mon, 9 Feb 2026 17:36:19 -0600 Subject: mailbox: add API to query available TX queue slots Clients sometimes need to know whether the mailbox TX queue has room before posting a new message. Rather than exposing internal queue state through a struct field, provide a proper accessor function that returns the number of available slots for a given channel. This lets clients choose to back off when the queue is full instead of hitting the -ENOBUFS error path and the misleading "Try increasing MBOX_TX_QUEUE_LEN" warning. Tested-by: Tanmay Shah Signed-off-by: Jassi Brar --- include/linux/mailbox_client.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mailbox_client.h b/include/linux/mailbox_client.h index c6eea9afb943..e5997120f45c 100644 --- a/include/linux/mailbox_client.h +++ b/include/linux/mailbox_client.h @@ -45,6 +45,7 @@ int mbox_send_message(struct mbox_chan *chan, void *mssg); int mbox_flush(struct mbox_chan *chan, unsigned long timeout); void mbox_client_txdone(struct mbox_chan *chan, int r); /* atomic */ bool mbox_client_peek_data(struct mbox_chan *chan); /* atomic */ +unsigned int mbox_chan_tx_slots_available(struct mbox_chan *chan); /* atomic */ void mbox_free_channel(struct mbox_chan *chan); /* may sleep */ #endif /* __MAILBOX_CLIENT_H */ -- cgit v1.2.3 From 27ab4f1e4909a674dfd03058fb9802cae2343a36 Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Thu, 26 Feb 2026 12:25:54 +0530 Subject: soundwire: amd: refactor bandwidth calculation logic For current platforms(ACP6.3/ACP7.0/ACP7.1/ACP7.2), AMD SoundWire manager doesn't have banked registers for data port programming on Manager's side. Need to use fixed block offsets, hstart & hstop for manager ports. Earlier amd manager driver has support for 12 MHz as a bus clock frequency where frame rate is 48000 and number of bits is 500, frame shape as 50 x 10 with fixed block offset mapping based on port number. Got a new requirement to support 6 MHz as a bus clock frequency. For 6 MHz bus clock frequency amd manager driver needs to support two different frame shapes i.e number of bits as 250 with frame rate as 48000 and frame shape as 125 x 2 and for the second combination number of bits as 500 where frame rate is 24000 and frame shape is 50 x 10. Few SoundWire peripherals doesn't support 125 x 2 as a frame shape for 6 MHz bus clock frequency. They have explicit requirement for the frame shape. In this scenario, amd manager driver needs to use 50 x 10 as a frame shape where frame rate is 24000. Based on the platform and SoundWire topology for 6Mhz support frame shape will be decided which is part of SoundWire manager DisCo tables. For current platforms, amd manager driver supports only two bus clock frequencies(12 MHz & 6 MHz). Refactor bandwidth logic to support different bus clock frequencies. Signed-off-by: Vijendar Mukunda Reviewed-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20260226065638.1251771-3-Vijendar.Mukunda@amd.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_amd.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_amd.h b/include/linux/soundwire/sdw_amd.h index fe31773d5210..470360a2723c 100644 --- a/include/linux/soundwire/sdw_amd.h +++ b/include/linux/soundwire/sdw_amd.h @@ -66,8 +66,10 @@ struct sdw_amd_dai_runtime { * @status: peripheral devices status array * @num_din_ports: number of input ports * @num_dout_ports: number of output ports + * @max_ports: total number of input ports and output ports * @cols_index: Column index in frame shape * @rows_index: Rows index in frame shape + * @port_offset_map: dynamic array to map port block offset * @instance: SoundWire manager instance * @quirks: SoundWire manager quirks * @wake_en_mask: wake enable mask per SoundWire manager @@ -92,10 +94,12 @@ struct amd_sdw_manager { int num_din_ports; int num_dout_ports; + int max_ports; int cols_index; int rows_index; + int *port_offset_map; u32 instance; u32 quirks; u32 wake_en_mask; -- cgit v1.2.3 From 493740d790cce709d285cd1022d16d05439b7d5b Mon Sep 17 00:00:00 2001 From: Arunpravin Paneer Selvam Date: Fri, 6 Mar 2026 11:31:54 +0530 Subject: drm/buddy: Improve offset-aligned allocation handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Large alignment requests previously forced the buddy allocator to search by alignment order, which often caused higher-order free blocks to be split even when a suitably aligned smaller region already existed within them. This led to excessive fragmentation, especially for workloads requesting small sizes with large alignment constraints. This change prioritizes the requested allocation size during the search and uses an augmented RB-tree field (subtree_max_alignment) to efficiently locate free blocks that satisfy both size and offset-alignment requirements. As a result, the allocator can directly select an aligned sub-region without splitting larger blocks unnecessarily. A practical example is the VKCTS test dEQP-VK.memory.allocation.basic.size_8KiB.reverse.count_4000, which repeatedly allocates 8 KiB buffers with a 256 KiB alignment. Previously, such allocations caused large blocks to be split aggressively, despite smaller aligned regions being sufficient. With this change, those aligned regions are reused directly, significantly reducing fragmentation. This improvement is visible in the amdgpu VRAM buddy allocator state (/sys/kernel/debug/dri/1/amdgpu_vram_mm). After the change, higher-order blocks are preserved and the number of low-order fragments is substantially reduced. Before: order- 5 free: 1936 MiB, blocks: 15490 order- 4 free: 967 MiB, blocks: 15486 order- 3 free: 483 MiB, blocks: 15485 order- 2 free: 241 MiB, blocks: 15486 order- 1 free: 241 MiB, blocks: 30948 After: order- 5 free: 493 MiB, blocks: 3941 order- 4 free: 246 MiB, blocks: 3943 order- 3 free: 123 MiB, blocks: 4101 order- 2 free: 61 MiB, blocks: 4101 order- 1 free: 61 MiB, blocks: 8018 By avoiding unnecessary splits, this change improves allocator efficiency and helps maintain larger contiguous free regions under heavy offset-aligned allocation workloads. v2:(Matthew) - Update augmented information along the path to the inserted node. v3: - Move the patch to gpu/buddy.c file. v4:(Matthew) - Use the helper instead of calling _ffs directly - Remove gpu_buddy_block_order(block) >= order check and drop order - Drop !node check as all callers handle this already - Return larger than any other possible alignment for __ffs64(0) - Replace __ffs with __ffs64 v5:(Matthew) - Drop subtree_max_alignment initialization at gpu_block_alloc() Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König Reviewed-by: Matthew Auld Link: https://patch.msgid.link/20260306060155.2114-1-Arunpravin.PaneerSelvam@amd.com --- include/linux/gpu_buddy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpu_buddy.h b/include/linux/gpu_buddy.h index f1fb6eff604a..5fa917ba5450 100644 --- a/include/linux/gpu_buddy.h +++ b/include/linux/gpu_buddy.h @@ -11,6 +11,7 @@ #include #include #include +#include /** * GPU_BUDDY_RANGE_ALLOCATION - Allocate within a specific address range @@ -128,6 +129,7 @@ struct gpu_buddy_block { }; /* private: */ struct list_head tmp_link; + unsigned int subtree_max_alignment; }; /* Order-zero must be at least SZ_4K */ -- cgit v1.2.3 From 5f88899ec7531e1680b1003f32584d7da5922902 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Tue, 3 Mar 2026 10:25:00 +0000 Subject: dmaengine: Document cyclic transfer for dmaengine_prep_peripheral_dma_vec() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Document that the DMA_PREP_REPEAT flag can be used with the dmaengine_prep_peripheral_dma_vec() to mark a transfer as cyclic similar to dmaengine_prep_dma_cyclic(). Reviewed-by: Frank Li Signed-off-by: Nuno Sá Link: https://patch.msgid.link/20260303-axi-dac-cyclic-support-v2-1-0db27b4be95a@analog.com Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 99efe2b9b4ea..b3d251c9734e 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -996,7 +996,8 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_slave_single( * @vecs: The array of DMA vectors that should be transferred * @nents: The number of DMA vectors in the array * @dir: Specifies the direction of the data transfer - * @flags: DMA engine flags + * @flags: DMA engine flags - DMA_PREP_REPEAT can be used to mark a cyclic + * DMA transfer */ static inline struct dma_async_tx_descriptor *dmaengine_prep_peripheral_dma_vec( struct dma_chan *chan, const struct dma_vec *vecs, size_t nents, -- cgit v1.2.3 From 70fbea9f1a44d80a4c573c225f119022d6e21360 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 28 Feb 2026 17:12:13 -0800 Subject: dmaengine: ti-cppi5: fix all kernel-doc warnings Add missing struct member, function parameter, and enum value descriptions. Add missing function Returns: sections. Use correct function name in kernel-doc to avoid mismatched prototypes. These repair all kernel-doc warnings in ti-cppi5.h: Warning: include/linux/dma/ti-cppi5.h:27 struct member 'pkt_info1' not described in 'cppi5_desc_hdr_t' Warning: include/linux/dma/ti-cppi5.h:27 struct member 'pkt_info2' not described in 'cppi5_desc_hdr_t' Warning: include/linux/dma/ti-cppi5.h:50 struct member 'epib' not described in 'cppi5_host_desc_t' Warning: include/linux/dma/ti-cppi5.h:142 struct member 'epib' not described in 'cppi5_monolithic_desc_t' Warning: include/linux/dma/ti-cppi5.h:413 function parameter 'pkt_len' not described in 'cppi5_hdesc_set_pktlen' Warning: include/linux/dma/ti-cppi5.h:436 function parameter 'ps_flags' not described in 'cppi5_hdesc_set_psflags' Warning: include/linux/dma/ti-cppi5.h:509 function parameter 'hbuf_desc' not described in 'cppi5_hdesc_link_hbdesc' Warning: include/linux/dma/ti-cppi5.h:839 struct member 'dicnt3' not described in 'cppi5_tr_type15_t' Warning: include/linux/dma/ti-cppi5.h:970 function parameter 'desc_hdr' not described in 'cppi5_trdesc_init' Warning: include/linux/dma/ti-cppi5.h:184 No description found for return value of 'cppi5_desc_is_tdcm' Warning: include/linux/dma/ti-cppi5.h:198 No description found for return value of 'cppi5_desc_get_type' Warning: include/linux/dma/ti-cppi5.h:210 No description found for return value of 'cppi5_desc_get_errflags' Warning: include/linux/dma/ti-cppi5.h:448 expecting prototype for cppi5_hdesc_get_errflags(). Prototype was for cppi5_hdesc_get_pkttype() instead Warning: include/linux/dma/ti-cppi5.h:460 expecting prototype for cppi5_hdesc_get_errflags(). Prototype was for cppi5_hdesc_set_pkttype() instead Warning: include/linux/dma/ti-cppi5.h:1053 expecting prototype for cppi5_tr_cflag_set(). Prototype was for cppi5_tr_csf_set() instead Warning: include/linux/dma/ti-cppi5.h:651 Enum value 'CPPI5_TR_TYPE_MAX' not described in enum 'cppi5_tr_types' Warning: include/linux/dma/ti-cppi5.h:676 Enum value 'CPPI5_TR_EVENT_SIZE_MAX' not described in enum 'cppi5_tr_event_size' Warning: include/linux/dma/ti-cppi5.h:693 Enum value 'CPPI5_TR_TRIGGER_MAX' not described in enum 'cppi5_tr_trigger' Warning: include/linux/dma/ti-cppi5.h:714 Enum value 'CPPI5_TR_TRIGGER_TYPE_MAX' not described in enum 'cppi5_tr_trigger_type' Warning: include/linux/dma/ti-cppi5.h:890 Enum value 'CPPI5_TR_RESPONSE_STATUS_MAX' not described in enum 'cppi5_tr_resp_status_type' Warning: include/linux/dma/ti-cppi5.h:906 Enum value 'CPPI5_TR_RESPONSE_STATUS_SUBMISSION_MAX' not described in enum 'cppi5_tr_resp_status_submission' Warning: include/linux/dma/ti-cppi5.h:934 Enum value 'CPPI5_TR_RESPONSE_STATUS_UNSUPPORTED_MAX' not described in enum 'cppi5_tr_resp_status_unsupported' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260301011213.3063688-1-rdunlap@infradead.org Signed-off-by: Vinod Koul --- include/linux/dma/ti-cppi5.h | 53 +++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma/ti-cppi5.h b/include/linux/dma/ti-cppi5.h index c53c0f6e3b1a..3fe19b75ddf7 100644 --- a/include/linux/dma/ti-cppi5.h +++ b/include/linux/dma/ti-cppi5.h @@ -16,8 +16,8 @@ * struct cppi5_desc_hdr_t - Descriptor header, present in all types of * descriptors * @pkt_info0: Packet info word 0 (n/a in Buffer desc) - * @pkt_info0: Packet info word 1 (n/a in Buffer desc) - * @pkt_info0: Packet info word 2 (n/a in Buffer desc) + * @pkt_info1: Packet info word 1 (n/a in Buffer desc) + * @pkt_info2: Packet info word 2 (n/a in Buffer desc) * @src_dst_tag: Packet info word 3 (n/a in Buffer desc) */ struct cppi5_desc_hdr_t { @@ -35,7 +35,7 @@ struct cppi5_desc_hdr_t { * @buf_info1: word 8: Buffer valid data length * @org_buf_len: word 9: Original buffer length * @org_buf_ptr: word 10/11: Original buffer pointer - * @epib[0]: Extended Packet Info Data (optional, 4 words), and/or + * @epib: Extended Packet Info Data (optional, 4 words), and/or * Protocol Specific Data (optional, 0-128 bytes in * multiples of 4), and/or * Other Software Data (0-N bytes, optional) @@ -132,7 +132,7 @@ struct cppi5_desc_epib_t { /** * struct cppi5_monolithic_desc_t - Monolithic-mode packet descriptor * @hdr: Descriptor header - * @epib[0]: Extended Packet Info Data (optional, 4 words), and/or + * @epib: Extended Packet Info Data (optional, 4 words), and/or * Protocol Specific Data (optional, 0-128 bytes in * multiples of 4), and/or * Other Software Data (0-N bytes, optional) @@ -179,7 +179,7 @@ static inline void cppi5_desc_dump(void *desc, u32 size) * cppi5_desc_is_tdcm - check if the paddr indicates Teardown Complete Message * @paddr: Physical address of the packet popped from the ring * - * Returns true if the address indicates TDCM + * Returns: true if the address indicates TDCM */ static inline bool cppi5_desc_is_tdcm(dma_addr_t paddr) { @@ -190,7 +190,7 @@ static inline bool cppi5_desc_is_tdcm(dma_addr_t paddr) * cppi5_desc_get_type - get descriptor type * @desc_hdr: packet descriptor/TR header * - * Returns descriptor type: + * Returns: descriptor type: * CPPI5_INFO0_DESC_TYPE_VAL_HOST * CPPI5_INFO0_DESC_TYPE_VAL_MONO * CPPI5_INFO0_DESC_TYPE_VAL_TR @@ -205,7 +205,7 @@ static inline u32 cppi5_desc_get_type(struct cppi5_desc_hdr_t *desc_hdr) * cppi5_desc_get_errflags - get Error Flags from Desc * @desc_hdr: packet/TR descriptor header * - * Returns Error Flags from Packet/TR Descriptor + * Returns: Error Flags from Packet/TR Descriptor */ static inline u32 cppi5_desc_get_errflags(struct cppi5_desc_hdr_t *desc_hdr) { @@ -307,7 +307,7 @@ static inline void cppi5_desc_set_tags_ids(struct cppi5_desc_hdr_t *desc_hdr, * @psdata_size: PSDATA size * @sw_data_size: SWDATA size * - * Returns required Host Packet Descriptor size + * Returns: required Host Packet Descriptor size * 0 - if PSDATA > CPPI5_INFO0_HDESC_PSDATA_MAX_SIZE */ static inline u32 cppi5_hdesc_calc_size(bool epib, u32 psdata_size, @@ -381,6 +381,8 @@ cppi5_hdesc_update_psdata_size(struct cppi5_host_desc_t *desc, u32 psdata_size) /** * cppi5_hdesc_get_psdata_size - get PSdata size in bytes * @desc: Host packet descriptor + * + * Returns: PSdata size in bytes */ static inline u32 cppi5_hdesc_get_psdata_size(struct cppi5_host_desc_t *desc) { @@ -398,7 +400,7 @@ static inline u32 cppi5_hdesc_get_psdata_size(struct cppi5_host_desc_t *desc) * cppi5_hdesc_get_pktlen - get Packet Length from HDesc * @desc: Host packet descriptor * - * Returns Packet Length from Host Packet Descriptor + * Returns: Packet Length from Host Packet Descriptor */ static inline u32 cppi5_hdesc_get_pktlen(struct cppi5_host_desc_t *desc) { @@ -408,6 +410,7 @@ static inline u32 cppi5_hdesc_get_pktlen(struct cppi5_host_desc_t *desc) /** * cppi5_hdesc_set_pktlen - set Packet Length in HDesc * @desc: Host packet descriptor + * @pkt_len: Packet length to set */ static inline void cppi5_hdesc_set_pktlen(struct cppi5_host_desc_t *desc, u32 pkt_len) @@ -420,7 +423,7 @@ static inline void cppi5_hdesc_set_pktlen(struct cppi5_host_desc_t *desc, * cppi5_hdesc_get_psflags - get Protocol Specific Flags from HDesc * @desc: Host packet descriptor * - * Returns Protocol Specific Flags from Host Packet Descriptor + * Returns: Protocol Specific Flags from Host Packet Descriptor */ static inline u32 cppi5_hdesc_get_psflags(struct cppi5_host_desc_t *desc) { @@ -431,6 +434,7 @@ static inline u32 cppi5_hdesc_get_psflags(struct cppi5_host_desc_t *desc) /** * cppi5_hdesc_set_psflags - set Protocol Specific Flags in HDesc * @desc: Host packet descriptor + * @ps_flags: Protocol Specific flags to set */ static inline void cppi5_hdesc_set_psflags(struct cppi5_host_desc_t *desc, u32 ps_flags) @@ -442,8 +446,10 @@ static inline void cppi5_hdesc_set_psflags(struct cppi5_host_desc_t *desc, } /** - * cppi5_hdesc_get_errflags - get Packet Type from HDesc + * cppi5_hdesc_get_pkttype - get Packet Type from HDesc * @desc: Host packet descriptor + * + * Returns: Packet type */ static inline u32 cppi5_hdesc_get_pkttype(struct cppi5_host_desc_t *desc) { @@ -452,7 +458,7 @@ static inline u32 cppi5_hdesc_get_pkttype(struct cppi5_host_desc_t *desc) } /** - * cppi5_hdesc_get_errflags - set Packet Type in HDesc + * cppi5_hdesc_set_pkttype - set Packet Type in HDesc * @desc: Host packet descriptor * @pkt_type: Packet Type */ @@ -501,7 +507,7 @@ static inline void cppi5_hdesc_reset_to_original(struct cppi5_host_desc_t *desc) /** * cppi5_hdesc_link_hbdesc - link Host Buffer Descriptor to HDesc * @desc: Host Packet Descriptor - * @buf_desc: Host Buffer Descriptor physical address + * @hbuf_desc: Host Buffer Descriptor physical address * * add and link Host Buffer Descriptor to HDesc */ @@ -527,7 +533,7 @@ static inline void cppi5_hdesc_reset_hbdesc(struct cppi5_host_desc_t *desc) * cppi5_hdesc_epib_present - check if EPIB present * @desc_hdr: packet descriptor/TR header * - * Returns true if EPIB present in the packet + * Returns: true if EPIB present in the packet */ static inline bool cppi5_hdesc_epib_present(struct cppi5_desc_hdr_t *desc_hdr) { @@ -538,7 +544,7 @@ static inline bool cppi5_hdesc_epib_present(struct cppi5_desc_hdr_t *desc_hdr) * cppi5_hdesc_get_psdata - Get pointer on PSDATA * @desc: Host packet descriptor * - * Returns pointer on PSDATA in HDesc. + * Returns: pointer on PSDATA in HDesc. * NULL - if ps_data placed at the start of data buffer. */ static inline void *cppi5_hdesc_get_psdata(struct cppi5_host_desc_t *desc) @@ -568,7 +574,7 @@ static inline void *cppi5_hdesc_get_psdata(struct cppi5_host_desc_t *desc) * cppi5_hdesc_get_swdata - Get pointer on swdata * @desc: Host packet descriptor * - * Returns pointer on SWDATA in HDesc. + * Returns: pointer on SWDATA in HDesc. * NOTE. It's caller responsibility to be sure hdesc actually has swdata. */ static inline void *cppi5_hdesc_get_swdata(struct cppi5_host_desc_t *desc) @@ -648,6 +654,7 @@ enum cppi5_tr_types { CPPI5_TR_TYPE11, /* type12-14: Reserved */ CPPI5_TR_TYPE15 = 15, + /* private: */ CPPI5_TR_TYPE_MAX }; @@ -673,6 +680,7 @@ enum cppi5_tr_event_size { CPPI5_TR_EVENT_SIZE_ICNT1_DEC, CPPI5_TR_EVENT_SIZE_ICNT2_DEC, CPPI5_TR_EVENT_SIZE_ICNT3_DEC, + /* private: */ CPPI5_TR_EVENT_SIZE_MAX }; @@ -690,6 +698,7 @@ enum cppi5_tr_trigger { CPPI5_TR_TRIGGER_GLOBAL0, CPPI5_TR_TRIGGER_GLOBAL1, CPPI5_TR_TRIGGER_LOCAL_EVENT, + /* private: */ CPPI5_TR_TRIGGER_MAX }; @@ -711,6 +720,7 @@ enum cppi5_tr_trigger_type { CPPI5_TR_TRIGGER_TYPE_ICNT2_DEC, CPPI5_TR_TRIGGER_TYPE_ICNT3_DEC, CPPI5_TR_TRIGGER_TYPE_ALL, + /* private: */ CPPI5_TR_TRIGGER_TYPE_MAX }; @@ -815,7 +825,7 @@ struct cppi5_tr_type3_t { * destination * @dicnt1: Total loop iteration count for level 1 for destination * @dicnt2: Total loop iteration count for level 2 for destination - * @sicnt3: Total loop iteration count for level 3 (outermost) for + * @dicnt3: Total loop iteration count for level 3 (outermost) for * destination */ struct cppi5_tr_type15_t { @@ -887,6 +897,7 @@ enum cppi5_tr_resp_status_type { CPPI5_TR_RESPONSE_STATUS_UNSUPPORTED_ERR, CPPI5_TR_RESPONSE_STATUS_TRANSFER_EXCEPTION, CPPI5_TR_RESPONSE_STATUS__TEARDOWN_FLUSH, + /* private: */ CPPI5_TR_RESPONSE_STATUS_MAX }; @@ -903,6 +914,7 @@ enum cppi5_tr_resp_status_submission { CPPI5_TR_RESPONSE_STATUS_SUBMISSION_ICNT0, CPPI5_TR_RESPONSE_STATUS_SUBMISSION_FIFO_FULL, CPPI5_TR_RESPONSE_STATUS_SUBMISSION_OWN, + /* private: */ CPPI5_TR_RESPONSE_STATUS_SUBMISSION_MAX }; @@ -931,6 +943,7 @@ enum cppi5_tr_resp_status_unsupported { CPPI5_TR_RESPONSE_STATUS_UNSUPPORTED_DFMT, CPPI5_TR_RESPONSE_STATUS_UNSUPPORTED_SECTR, CPPI5_TR_RESPONSE_STATUS_UNSUPPORTED_AMODE_SPECIFIC, + /* private: */ CPPI5_TR_RESPONSE_STATUS_UNSUPPORTED_MAX }; @@ -939,7 +952,7 @@ enum cppi5_tr_resp_status_unsupported { * @tr_count: number of TR records * @tr_size: Nominal size of TR record (max) [16, 32, 64, 128] * - * Returns required TR Descriptor size + * Returns: required TR Descriptor size */ static inline size_t cppi5_trdesc_calc_size(u32 tr_count, u32 tr_size) { @@ -955,7 +968,7 @@ static inline size_t cppi5_trdesc_calc_size(u32 tr_count, u32 tr_size) /** * cppi5_trdesc_init - Init TR Descriptor - * @desc: TR Descriptor + * @desc_hdr: TR Descriptor * @tr_count: number of TR records * @tr_size: Nominal size of TR record (max) [16, 32, 64, 128] * @reload_idx: Absolute index to jump to on the 2nd and following passes @@ -1044,7 +1057,7 @@ static inline void cppi5_tr_set_trigger(cppi5_tr_flags_t *flags, } /** - * cppi5_tr_cflag_set - Update the Configuration specific flags + * cppi5_tr_csf_set - Update the Configuration specific flags * @flags: Pointer to the TR's flags * @csf: Configuration specific flags * -- cgit v1.2.3 From 7b84a00dd3528d980f1f35fd2c5015f72dc3f62a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 28 Feb 2026 17:12:03 -0800 Subject: dmaengine: qcom: qcom-gpi-dma.h: fix all kernel-doc warnings Add missing enum descriptions and spell one struct member correctly to avoid kernel-doc warnings: Warning: include/linux/dma/qcom-gpi-dma.h:15 Enum value 'SPI_TX' not described in enum 'spi_transfer_cmd' Warning: include/linux/dma/qcom-gpi-dma.h:15 Enum value 'SPI_RX' not described in enum 'spi_transfer_cmd' Warning: include/linux/dma/qcom-gpi-dma.h:15 Enum value 'SPI_DUPLEX' not described in enum 'spi_transfer_cmd' Warning: include/linux/dma/qcom-gpi-dma.h:80 struct member 'multi_msg' not described in 'gpi_i2c_config' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260301011203.3062658-1-rdunlap@infradead.org Signed-off-by: Vinod Koul --- include/linux/dma/qcom-gpi-dma.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dma/qcom-gpi-dma.h b/include/linux/dma/qcom-gpi-dma.h index 6680dd1a43c6..332be28427e4 100644 --- a/include/linux/dma/qcom-gpi-dma.h +++ b/include/linux/dma/qcom-gpi-dma.h @@ -8,6 +8,9 @@ /** * enum spi_transfer_cmd - spi transfer commands + * @SPI_TX: SPI peripheral TX command + * @SPI_RX: SPI peripheral RX command + * @SPI_DUPLEX: SPI peripheral Duplex command */ enum spi_transfer_cmd { SPI_TX = 1, @@ -64,7 +67,7 @@ enum i2c_op { * @set_config: set peripheral config * @rx_len: receive length for buffer * @op: i2c cmd - * @muli-msg: is part of multi i2c r-w msgs + * @multi_msg: is part of multi i2c r-w msgs */ struct gpi_i2c_config { u8 set_config; -- cgit v1.2.3 From 4d94ce88c77e74830a5b9d02ecb8286039ffa494 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 25 Feb 2026 09:17:00 +1100 Subject: VFS: unexport lock_rename(), lock_rename_child(), unlock_rename() These three function are now only used in namei.c, so they don't need to be exported. Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Link: https://patch.msgid.link/20260224222542.3458677-16-neilb@ownmail.net Signed-off-by: Christian Brauner --- include/linux/namei.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/namei.h b/include/linux/namei.h index c7a7288cdd25..2ad6dd9987b9 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -165,9 +165,6 @@ extern int follow_down_one(struct path *); extern int follow_down(struct path *path, unsigned int flags); extern int follow_up(struct path *); -extern struct dentry *lock_rename(struct dentry *, struct dentry *); -extern struct dentry *lock_rename_child(struct dentry *, struct dentry *); -extern void unlock_rename(struct dentry *, struct dentry *); int start_renaming(struct renamedata *rd, int lookup_flags, struct qstr *old_last, struct qstr *new_last); int start_renaming_dentry(struct renamedata *rd, int lookup_flags, -- cgit v1.2.3 From 5645f805927c9bd4443e6143e487ef3ffea34aaf Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 6 Mar 2026 14:22:00 +0100 Subject: gpio: Document line value semantics It is not clearly documented that the GPIO driver API expect the driver to get/set the physical level of the GPIO line and the consumer API will get/set the logic level. Document this in relevant places. Reported-by: David Jander Signed-off-by: Linus Walleij Link: https://patch.msgid.link/20260306-gpio-doc-levels-v1-1-19928739e400@kernel.org Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 5f5ddcbfa445..17511434ed07 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -343,11 +343,17 @@ struct gpio_irq_chip { * @direction_output: configures signal "offset" as output, returns 0 on * success or a negative error number. This can be omitted on input-only * or output-only gpio chips. - * @get: returns value for signal "offset", 0=low, 1=high, or negative error + * @get: returns value for signal "offset", 0=low, 1=high, or negative error. + * the low and high values are defined as physical low on the line + * in/out to the connector such as a physical pad, pin or rail. The GPIO + * library has internal logic to handle lines that are active low, such + * as indicated by overstrike or #name in a schematic, and the driver + * should not try to second-guess the logic value of a line. * @get_multiple: reads values for multiple signals defined by "mask" and * stores them in "bits", returns 0 on success or negative error * @set: assigns output value for signal "offset", returns 0 on success or - * negative error value + * negative error value. The output value follows the same semantic + * rules as for @get. * @set_multiple: assigns output values for multiple signals defined by * "mask", returns 0 on success or negative error value * @set_config: optional hook for all kinds of settings. Uses the same -- cgit v1.2.3 From ad9d28e68f4f9d15b9bde15e1ab79a4f85eff60e Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Fri, 6 Mar 2026 18:22:47 +0100 Subject: reset: gpio: simplify fallback device matching The of_args field of struct reset_controller_dev was introduced to allow the reset-gpio driver to pass the phandle arguments back to reset core. The thing is: it doesn't even have to do it. The core sets the platform data of the auxiliary device *AND* has access to it later on during the lookup. This means the field is unneeded and all can happen entirely in reset core. Remove the field from the public header and don't set it in reset-gpio.c. Retrieve the platform data in reset core when needed instead. Reviewed-by: Philipp Zabel Signed-off-by: Bartosz Golaszewski Signed-off-by: Philipp Zabel --- include/linux/reset-controller.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/reset-controller.h b/include/linux/reset-controller.h index 46514cb1b9e0..aa95b460fdf8 100644 --- a/include/linux/reset-controller.h +++ b/include/linux/reset-controller.h @@ -35,9 +35,6 @@ struct of_phandle_args; * @reset_control_head: head of internal list of requested reset controls * @dev: corresponding driver model device struct * @of_node: corresponding device tree node as phandle target - * @of_args: for reset-gpios controllers: corresponding phandle args with - * of_node and GPIO number complementing of_node; either this or - * of_node should be present * @of_reset_n_cells: number of cells in reset line specifiers * @of_xlate: translation function to translate from specifier as found in the * device tree to id as given to the reset control ops, defaults @@ -51,7 +48,6 @@ struct reset_controller_dev { struct list_head reset_control_head; struct device *dev; struct device_node *of_node; - const struct of_phandle_args *of_args; int of_reset_n_cells; int (*of_xlate)(struct reset_controller_dev *rcdev, const struct of_phandle_args *reset_spec); -- cgit v1.2.3 From 44a0acb2caca3bfd0ca459fbf0b19be75f1819c0 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Fri, 6 Mar 2026 18:22:53 +0100 Subject: reset: protect struct reset_controller_dev with its own mutex Currently we use a single, global mutex - misleadingly names reset_list_mutex - to protect the global list of reset devices, per-controller list of reset control handles and also internal fields of struct reset_control. Locking can be made a lot more fine-grained if we use a separate mutex for serializing operations on the list AND accessing the reset controller device. Signed-off-by: Bartosz Golaszewski Reviewed-by: Philipp Zabel Signed-off-by: Philipp Zabel --- include/linux/reset-controller.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/reset-controller.h b/include/linux/reset-controller.h index aa95b460fdf8..185d2a9bd7cd 100644 --- a/include/linux/reset-controller.h +++ b/include/linux/reset-controller.h @@ -3,6 +3,7 @@ #define _LINUX_RESET_CONTROLLER_H_ #include +#include struct reset_controller_dev; @@ -40,6 +41,7 @@ struct of_phandle_args; * device tree to id as given to the reset control ops, defaults * to :c:func:`of_reset_simple_xlate`. * @nr_resets: number of reset controls in this reset controller device + * @lock: protects the reset control list from concurrent access */ struct reset_controller_dev { const struct reset_control_ops *ops; @@ -52,6 +54,7 @@ struct reset_controller_dev { int (*of_xlate)(struct reset_controller_dev *rcdev, const struct of_phandle_args *reset_spec); unsigned int nr_resets; + struct mutex lock; }; #if IS_ENABLED(CONFIG_RESET_CONTROLLER) -- cgit v1.2.3 From ba8dbbb14b7e6734afbb5ba37d0679831aa3d590 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Fri, 6 Mar 2026 18:22:56 +0100 Subject: reset: convert the core API to using firmware nodes In order to simplify the commit converting the internals of reset core to using firmware nodes, first convert the user-facing API. Modify the signature of the core consumer functions but leave the specialized wrappers as is to avoid modifying users for now. No functional change intended. Reviewed-by: Philipp Zabel Signed-off-by: Bartosz Golaszewski Signed-off-by: Philipp Zabel --- include/linux/reset.h | 43 ++++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/reset.h b/include/linux/reset.h index 44f9e3415f92..9c391cf0c822 100644 --- a/include/linux/reset.h +++ b/include/linux/reset.h @@ -5,10 +5,12 @@ #include #include #include +#include #include struct device; struct device_node; +struct fwnode_handle; struct reset_control; /** @@ -84,7 +86,7 @@ int reset_control_bulk_deassert(int num_rstcs, struct reset_control_bulk_data *r int reset_control_bulk_acquire(int num_rstcs, struct reset_control_bulk_data *rstcs); void reset_control_bulk_release(int num_rstcs, struct reset_control_bulk_data *rstcs); -struct reset_control *__of_reset_control_get(struct device_node *node, +struct reset_control *__fwnode_reset_control_get(struct fwnode_handle *fwnode, const char *id, int index, enum reset_control_flags flags); struct reset_control *__reset_control_get(struct device *dev, const char *id, int index, enum reset_control_flags flags); @@ -103,7 +105,8 @@ int __devm_reset_control_bulk_get(struct device *dev, int num_rstcs, struct reset_control *devm_reset_control_array_get(struct device *dev, enum reset_control_flags flags); -struct reset_control *of_reset_control_array_get(struct device_node *np, enum reset_control_flags); +struct reset_control *fwnode_reset_control_array_get(struct fwnode_handle *fwnode, + enum reset_control_flags); int reset_control_get_count(struct device *dev); @@ -152,8 +155,8 @@ static inline int __device_reset(struct device *dev, bool optional) return optional ? 0 : -ENOTSUPP; } -static inline struct reset_control *__of_reset_control_get( - struct device_node *node, +static inline struct reset_control *__fwnode_reset_control_get( + struct fwnode_handle *fwnode, const char *id, int index, enum reset_control_flags flags) { bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL; @@ -242,7 +245,7 @@ devm_reset_control_array_get(struct device *dev, enum reset_control_flags flags) } static inline struct reset_control * -of_reset_control_array_get(struct device_node *np, enum reset_control_flags flags) +fwnode_reset_control_array_get(struct fwnode_handle *fwnode, enum reset_control_flags flags) { bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL; @@ -500,7 +503,8 @@ reset_control_bulk_get_optional_shared(struct device *dev, int num_rstcs, static inline struct reset_control *of_reset_control_get_exclusive( struct device_node *node, const char *id) { - return __of_reset_control_get(node, id, 0, RESET_CONTROL_EXCLUSIVE); + return __fwnode_reset_control_get(of_fwnode_handle(node), id, 0, + RESET_CONTROL_EXCLUSIVE); } /** @@ -520,7 +524,8 @@ static inline struct reset_control *of_reset_control_get_exclusive( static inline struct reset_control *of_reset_control_get_optional_exclusive( struct device_node *node, const char *id) { - return __of_reset_control_get(node, id, 0, RESET_CONTROL_OPTIONAL_EXCLUSIVE); + return __fwnode_reset_control_get(of_fwnode_handle(node), id, 0, + RESET_CONTROL_OPTIONAL_EXCLUSIVE); } /** @@ -545,7 +550,8 @@ static inline struct reset_control *of_reset_control_get_optional_exclusive( static inline struct reset_control *of_reset_control_get_shared( struct device_node *node, const char *id) { - return __of_reset_control_get(node, id, 0, RESET_CONTROL_SHARED); + return __fwnode_reset_control_get(of_fwnode_handle(node), id, 0, + RESET_CONTROL_SHARED); } /** @@ -562,7 +568,8 @@ static inline struct reset_control *of_reset_control_get_shared( static inline struct reset_control *of_reset_control_get_exclusive_by_index( struct device_node *node, int index) { - return __of_reset_control_get(node, NULL, index, RESET_CONTROL_EXCLUSIVE); + return __fwnode_reset_control_get(of_fwnode_handle(node), NULL, index, + RESET_CONTROL_EXCLUSIVE); } /** @@ -590,7 +597,8 @@ static inline struct reset_control *of_reset_control_get_exclusive_by_index( static inline struct reset_control *of_reset_control_get_shared_by_index( struct device_node *node, int index) { - return __of_reset_control_get(node, NULL, index, RESET_CONTROL_SHARED); + return __fwnode_reset_control_get(of_fwnode_handle(node), NULL, index, + RESET_CONTROL_SHARED); } /** @@ -1032,30 +1040,35 @@ devm_reset_control_array_get_optional_shared(struct device *dev) static inline struct reset_control * of_reset_control_array_get_exclusive(struct device_node *node) { - return of_reset_control_array_get(node, RESET_CONTROL_EXCLUSIVE); + return fwnode_reset_control_array_get(of_fwnode_handle(node), + RESET_CONTROL_EXCLUSIVE); } static inline struct reset_control * of_reset_control_array_get_exclusive_released(struct device_node *node) { - return of_reset_control_array_get(node, RESET_CONTROL_EXCLUSIVE_RELEASED); + return fwnode_reset_control_array_get(of_fwnode_handle(node), + RESET_CONTROL_EXCLUSIVE_RELEASED); } static inline struct reset_control * of_reset_control_array_get_shared(struct device_node *node) { - return of_reset_control_array_get(node, RESET_CONTROL_SHARED); + return fwnode_reset_control_array_get(of_fwnode_handle(node), + RESET_CONTROL_SHARED); } static inline struct reset_control * of_reset_control_array_get_optional_exclusive(struct device_node *node) { - return of_reset_control_array_get(node, RESET_CONTROL_OPTIONAL_EXCLUSIVE); + return fwnode_reset_control_array_get(of_fwnode_handle(node), + RESET_CONTROL_OPTIONAL_EXCLUSIVE); } static inline struct reset_control * of_reset_control_array_get_optional_shared(struct device_node *node) { - return of_reset_control_array_get(node, RESET_CONTROL_OPTIONAL_SHARED); + return fwnode_reset_control_array_get(of_fwnode_handle(node), + RESET_CONTROL_OPTIONAL_SHARED); } #endif -- cgit v1.2.3 From 9035073d0ef1de813c6335239250248bfe0a64aa Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Fri, 6 Mar 2026 18:22:57 +0100 Subject: reset: convert reset core to using firmware nodes With everything else now in place, we can convert the remaining parts of the reset subsystem to becoming fwnode-agnostic - meaning it will work with all kinds of firmware nodes, not only devicetree. To that end: extend struct reset_controller_dev with fields taking information relevant for using firmware nodes (which mirrors what we already do for OF-nodes) and limit using of_ APIs only to where it's absolutely necessary (mostly around the of_xlate callback). For backward compatibility of existing drivers we still support OF-nodes but firmware nodes become the preferred method. Signed-off-by: Bartosz Golaszewski Reviewed-by: Philipp Zabel Signed-off-by: Philipp Zabel --- include/linux/reset-controller.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/reset-controller.h b/include/linux/reset-controller.h index 185d2a9bd7cd..52a5a4e81f18 100644 --- a/include/linux/reset-controller.h +++ b/include/linux/reset-controller.h @@ -5,6 +5,8 @@ #include #include +struct fwnode_handle; +struct fwnode_reference_args; struct reset_controller_dev; /** @@ -38,8 +40,12 @@ struct of_phandle_args; * @of_node: corresponding device tree node as phandle target * @of_reset_n_cells: number of cells in reset line specifiers * @of_xlate: translation function to translate from specifier as found in the - * device tree to id as given to the reset control ops, defaults - * to :c:func:`of_reset_simple_xlate`. + * device tree to id as given to the reset control ops + * @fwnode: firmware node associated with this device + * @fwnode_reset_n_cells: number of cells in reset line specifiers + * @fwnode_xlate: translation function to translate from firmware specifier to + * id as given to the reset control ops, defaults to + * :c:func:`fwnode_reset_simple_xlate` * @nr_resets: number of reset controls in this reset controller device * @lock: protects the reset control list from concurrent access */ @@ -53,6 +59,10 @@ struct reset_controller_dev { int of_reset_n_cells; int (*of_xlate)(struct reset_controller_dev *rcdev, const struct of_phandle_args *reset_spec); + struct fwnode_handle *fwnode; + int fwnode_reset_n_cells; + int (*fwnode_xlate)(struct reset_controller_dev *rcdev, + const struct fwnode_reference_args *reset_spec); unsigned int nr_resets; struct mutex lock; }; -- cgit v1.2.3 From b6420bd5aa0c374331bad6c0fa2eb5f0f87cf5a0 Mon Sep 17 00:00:00 2001 From: Jialu Xu Date: Sat, 7 Mar 2026 11:06:26 +0800 Subject: gpio: remove of_get_named_gpio() and All in-tree consumers have been converted to the descriptor-based API. Remove the deprecated of_get_named_gpio() helper, delete the header, and drop the corresponding entry from MAINTAINERS. Also remove the completed TODO item for this cleanup. Signed-off-by: Jialu Xu Reviewed-by: Linus Walleij Link: https://patch.msgid.link/02ABDA1F9E3FAF1F+20260307030623.3495092-6-xujialu@vimux.org Signed-off-by: Bartosz Golaszewski --- include/linux/of_gpio.h | 38 -------------------------------------- 1 file changed, 38 deletions(-) delete mode 100644 include/linux/of_gpio.h (limited to 'include/linux') diff --git a/include/linux/of_gpio.h b/include/linux/of_gpio.h deleted file mode 100644 index d0f66a5e1b2a..000000000000 --- a/include/linux/of_gpio.h +++ /dev/null @@ -1,38 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * OF helpers for the GPIO API - * - * Copyright (c) 2007-2008 MontaVista Software, Inc. - * - * Author: Anton Vorontsov - */ - -#ifndef __LINUX_OF_GPIO_H -#define __LINUX_OF_GPIO_H - -#include -#include -#include /* FIXME: Shouldn't be here */ -#include - -struct device_node; - -#ifdef CONFIG_OF_GPIO - -extern int of_get_named_gpio(const struct device_node *np, - const char *list_name, int index); - -#else /* CONFIG_OF_GPIO */ - -#include - -/* Drivers may not strictly depend on the GPIO support, so let them link. */ -static inline int of_get_named_gpio(const struct device_node *np, - const char *propname, int index) -{ - return -ENOSYS; -} - -#endif /* CONFIG_OF_GPIO */ - -#endif /* __LINUX_OF_GPIO_H */ -- cgit v1.2.3 From 9840bb66e7e5dffd72b03201318f154a10b06b4a Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 7 Mar 2026 14:54:31 -0500 Subject: vfs: remove externs from fs.h on functions modified by i_ino widening Christoph says, in response to one of the patches in the i_ino widening series, which changes the prototype of several functions in fs.h: "Can you please drop all these pointless externs while you're at it?" Remove extern keyword from functions touched by that patch (and a few that happened to be nearby). Also add missing argument names to declarations that lacked them. Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20260307-iino-u64-v2-1-a1a1696e0653@kernel.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 62 +++++++++++++++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e820c14f9c5a..23f36a2613a3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2912,27 +2912,27 @@ static inline bool name_contains_dotdot(const char *name) #include /* needed for stackable file system support */ -extern loff_t default_llseek(struct file *file, loff_t offset, int whence); +loff_t default_llseek(struct file *file, loff_t offset, int whence); -extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence); +loff_t vfs_llseek(struct file *file, loff_t offset, int whence); -extern int inode_init_always_gfp(struct super_block *, struct inode *, gfp_t); +int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp); static inline int inode_init_always(struct super_block *sb, struct inode *inode) { return inode_init_always_gfp(sb, inode, GFP_NOFS); } -extern void inode_init_once(struct inode *); -extern void address_space_init_once(struct address_space *mapping); -extern struct inode * igrab(struct inode *); -extern ino_t iunique(struct super_block *, ino_t); -extern int inode_needs_sync(struct inode *inode); -extern int inode_just_drop(struct inode *inode); +void inode_init_once(struct inode *inode); +void address_space_init_once(struct address_space *mapping); +struct inode *igrab(struct inode *inode); +ino_t iunique(struct super_block *sb, ino_t max_reserved); +int inode_needs_sync(struct inode *inode); +int inode_just_drop(struct inode *inode); static inline int inode_generic_drop(struct inode *inode) { return !inode->i_nlink || inode_unhashed(inode); } -extern void d_mark_dontcache(struct inode *inode); +void d_mark_dontcache(struct inode *inode); struct inode *ilookup5_nowait(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), void *data, @@ -2944,31 +2944,31 @@ struct inode *ilookup(struct super_block *sb, u64 ino); struct inode *inode_insert5(struct inode *inode, u64 hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data); -struct inode *iget5_locked(struct super_block *, u64, +struct inode *iget5_locked(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), - int (*set)(struct inode *, void *), void *); -struct inode *iget5_locked_rcu(struct super_block *, u64, + int (*set)(struct inode *, void *), void *data); +struct inode *iget5_locked_rcu(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), - int (*set)(struct inode *, void *), void *); -struct inode *iget_locked(struct super_block *, u64); -struct inode *find_inode_nowait(struct super_block *, u64, + int (*set)(struct inode *, void *), void *data); +struct inode *iget_locked(struct super_block *sb, u64 ino); +struct inode *find_inode_nowait(struct super_block *sb, u64 hashval, int (*match)(struct inode *, u64, void *), void *data); -struct inode *find_inode_rcu(struct super_block *, u64, - int (*)(struct inode *, void *), void *); -struct inode *find_inode_by_ino_rcu(struct super_block *, u64); -int insert_inode_locked4(struct inode *, u64, - int (*test)(struct inode *, void *), void *); -extern int insert_inode_locked(struct inode *); +struct inode *find_inode_rcu(struct super_block *sb, u64 hashval, + int (*test)(struct inode *, void *), void *data); +struct inode *find_inode_by_ino_rcu(struct super_block *sb, u64 ino); +int insert_inode_locked4(struct inode *inode, u64 hashval, + int (*test)(struct inode *, void *), void *data); +int insert_inode_locked(struct inode *inode); #ifdef CONFIG_DEBUG_LOCK_ALLOC -extern void lockdep_annotate_inode_mutex_key(struct inode *inode); +void lockdep_annotate_inode_mutex_key(struct inode *inode); #else static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { }; #endif -extern void unlock_new_inode(struct inode *); -extern void discard_new_inode(struct inode *); -extern unsigned int get_next_ino(void); -extern void evict_inodes(struct super_block *sb); +void unlock_new_inode(struct inode *inode); +void discard_new_inode(struct inode *inode); +unsigned int get_next_ino(void); +void evict_inodes(struct super_block *sb); void dump_mapping(const struct address_space *); /* @@ -3013,21 +3013,21 @@ int setattr_should_drop_sgid(struct mnt_idmap *idmap, */ #define alloc_inode_sb(_sb, _cache, _gfp) kmem_cache_alloc_lru(_cache, &_sb->s_inode_lru, _gfp) -void __insert_inode_hash(struct inode *, u64 hashval); +void __insert_inode_hash(struct inode *inode, u64 hashval); static inline void insert_inode_hash(struct inode *inode) { __insert_inode_hash(inode, inode->i_ino); } -extern void __remove_inode_hash(struct inode *); +void __remove_inode_hash(struct inode *inode); static inline void remove_inode_hash(struct inode *inode) { if (!inode_unhashed(inode) && !hlist_fake(&inode->i_hash)) __remove_inode_hash(inode); } -extern void inode_sb_list_add(struct inode *inode); -extern void inode_lru_list_add(struct inode *inode); +void inode_sb_list_add(struct inode *inode); +void inode_lru_list_add(struct inode *inode); int generic_file_mmap(struct file *, struct vm_area_struct *); int generic_file_mmap_prepare(struct vm_area_desc *desc); -- cgit v1.2.3 From a6fe20d67dc7d512f9b5dc11c5777fb1e1ff70ce Mon Sep 17 00:00:00 2001 From: Maciej Strozek Date: Fri, 6 Mar 2026 15:28:10 +0000 Subject: mfd: cs42l43: Add support for the B variant Introducing CS42L43B codec, a variant of CS42L43 which can be driven by the same driver. Changes in CS42L43 driver specific for CS42L43B: - Decimator 1 and 2 are dedicated to ADC, can't be selected for PDM - Decimators 3 and 4 are connected to PDM1 - Added Decimator 5 and 6 for PDM2 - Supports SoundWire Clock Gearing - Updated ROM requiring no patching - Reduced RAM space - Each ISRC has 4 decimators now Signed-off-by: Maciej Strozek Acked-by: Lee Jones Reviewed-by: Charles Keepax Link: https://patch.msgid.link/20260306152829.3130530-4-mstrozek@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/mfd/cs42l43-regs.h | 76 ++++++++++++++++++++++++++++++++++++++++ include/linux/mfd/cs42l43.h | 1 + 2 files changed, 77 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/cs42l43-regs.h b/include/linux/mfd/cs42l43-regs.h index c39a49269cb7..68831f113589 100644 --- a/include/linux/mfd/cs42l43-regs.h +++ b/include/linux/mfd/cs42l43-regs.h @@ -1181,4 +1181,80 @@ /* CS42L43_FW_MISSION_CTRL_MM_MCU_CFG_REG */ #define CS42L43_FW_MISSION_CTRL_MM_MCU_CFG_DISABLE_VAL 0xF05AA50F +/* CS42L43B VARIANT REGISTERS */ +#define CS42L43B_DEVID_VAL 0x0042A43B + +#define CS42L43B_DECIM_VOL_CTRL_CH1_CH2 0x00008280 +#define CS42L43B_DECIM_VOL_CTRL_CH3_CH4 0x00008284 + +#define CS42L43B_DECIM_VOL_CTRL_CH5_CH6 0x00008290 +#define CS42L43B_DECIM_VOL_CTRL_UPDATE 0x0000829C + +#define CS42L43B_DECIM_HPF_WNF_CTRL5 0x000082A0 +#define CS42L43B_DECIM_HPF_WNF_CTRL6 0x000082A4 + +#define CS42L43B_SWIRE_DP3_CH3_INPUT 0x0000C320 +#define CS42L43B_SWIRE_DP3_CH4_INPUT 0x0000C330 +#define CS42L43B_SWIRE_DP4_CH3_INPUT 0x0000C340 +#define CS42L43B_SWIRE_DP4_CH4_INPUT 0x0000C350 + +#define CS42L43B_ISRC1DEC3_INPUT1 0x0000C780 +#define CS42L43B_ISRC1DEC4_INPUT1 0x0000C790 +#define CS42L43B_ISRC2DEC3_INPUT1 0x0000C7A0 +#define CS42L43B_ISRC2DEC4_INPUT1 0x0000C7B0 + +#define CS42L43B_FW_MISSION_CTRL_NEED_CONFIGS 0x00117E00 +#define CS42L43B_FW_MISSION_CTRL_HAVE_CONFIGS 0x00117E04 +#define CS42L43B_FW_MISSION_CTRL_PATCH_START_ADDR_REG 0x00117E08 +#define CS42L43B_FW_MISSION_CTRL_MM_CTRL_SELECTION 0x00117E0C +#define CS42L43B_FW_MISSION_CTRL_MM_MCU_CFG_REG 0x00117E10 + +#define CS42L43B_MCU_SW_REV 0x00117314 +#define CS42L43B_PATCH_START_ADDR 0x00117318 +#define CS42L43B_CONFIG_SELECTION 0x0011731C +#define CS42L43B_NEED_CONFIGS 0x00117320 +#define CS42L43B_BOOT_STATUS 0x00117330 + +#define CS42L43B_FW_MISSION_CTRL_NEED_CONFIGS 0x00117E00 +#define CS42L43B_FW_MISSION_CTRL_HAVE_CONFIGS 0x00117E04 +#define CS42L43B_FW_MISSION_CTRL_PATCH_START_ADDR_REG 0x00117E08 +#define CS42L43B_FW_MISSION_CTRL_MM_CTRL_SELECTION 0x00117E0C +#define CS42L43B_FW_MISSION_CTRL_MM_MCU_CFG_REG 0x00117E10 + +#define CS42L43B_MCU_RAM_MAX 0x00117FFF + +/* CS42L43B_DECIM_DECIM_VOL_CTRL_CH5_CH6 */ +#define CS42L43B_DECIM6_MUTE_MASK 0x80000000 +#define CS42L43B_DECIM6_MUTE_SHIFT 31 +#define CS42L43B_DECIM6_VOL_MASK 0x3FC00000 +#define CS42L43B_DECIM6_VOL_SHIFT 22 +#define CS42L43B_DECIM6_PATH1_VOL_FALL_RATE_MASK 0x00380000 +#define CS42L43B_DECIM6_PATH1_VOL_FALL_RATE_SHIFT 19 +#define CS42L43B_DECIM6_PATH1_VOL_RISE_RATE_MASK 0x00070000 +#define CS42L43B_DECIM6_PATH1_VOL_RISE_RATE_SHIFT 16 +#define CS42L43B_DECIM5_MUTE_MASK 0x00008000 +#define CS42L43B_DECIM5_MUTE_SHIFT 15 +#define CS42L43B_DECIM5_VOL_MASK 0x00003FC0 +#define CS42L43B_DECIM5_VOL_SHIFT 6 +#define CS42L43B_DECIM5_PATH1_VOL_FALL_RATE_MASK 0x00000038 +#define CS42L43B_DECIM5_PATH1_VOL_FALL_RATE_SHIFT 3 +#define CS42L43B_DECIM5_PATH1_VOL_RISE_RATE_MASK 0x00000007 +#define CS42L43B_DECIM5_PATH1_VOL_RISE_RATE_SHIFT 0 + +/* CS42L43B_DECIM_VOL_CTRL_UPDATE */ +#define CS42L43B_DECIM6_PATH1_VOL_TRIG_MASK 0x00000800 +#define CS42L43B_DECIM6_PATH1_VOL_TRIG_SHIFT 11 +#define CS42L43B_DECIM5_PATH1_VOL_TRIG_MASK 0x00000100 +#define CS42L43B_DECIM5_PATH1_VOL_TRIG_SHIFT 8 +#define CS42L43B_DECIM4_VOL_UPDATE_MASK 0x00000020 +#define CS42L43B_DECIM4_VOL_UPDATE_SHIFT 5 + +/* CS42L43_ISRC1_CTRL..CS42L43_ISRC2_CTRL */ +#define CS42L43B_ISRC_DEC4_EN_MASK 0x00000008 +#define CS42L43B_ISRC_DEC4_EN_SHIFT 3 +#define CS42L43B_ISRC_DEC4_EN_WIDTH 1 +#define CS42L43B_ISRC_DEC3_EN_MASK 0x00000004 +#define CS42L43B_ISRC_DEC3_EN_SHIFT 2 +#define CS42L43B_ISRC_DEC3_EN_WIDTH 1 + #endif /* CS42L43_CORE_REGS_H */ diff --git a/include/linux/mfd/cs42l43.h b/include/linux/mfd/cs42l43.h index 2239d8585e78..ff0f7e365a19 100644 --- a/include/linux/mfd/cs42l43.h +++ b/include/linux/mfd/cs42l43.h @@ -98,6 +98,7 @@ struct cs42l43 { bool sdw_pll_active; bool attached; bool hw_lock; + long variant_id; }; #endif /* CS42L43_CORE_EXT_H */ -- cgit v1.2.3 From 3a005126c9d7f30093627a6f329656c358e16b3a Mon Sep 17 00:00:00 2001 From: Frank Li Date: Wed, 25 Feb 2026 16:41:37 -0500 Subject: dmaengine: of_dma: Add devm_of_dma_controller_register() Add a managed API, devm_of_dma_controller_register(), to simplify DMA engine controller registration by automatically handling resource cleanup. Signed-off-by: Frank Li Link: https://patch.msgid.link/20260225-mxsdma-module-v3-1-8f798b13baa6@nxp.com Signed-off-by: Vinod Koul --- include/linux/of_dma.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of_dma.h b/include/linux/of_dma.h index fd706cdf255c..16b08234d03b 100644 --- a/include/linux/of_dma.h +++ b/include/linux/of_dma.h @@ -38,6 +38,26 @@ extern int of_dma_controller_register(struct device_node *np, void *data); extern void of_dma_controller_free(struct device_node *np); +static void __of_dma_controller_free(void *np) +{ + of_dma_controller_free(np); +} + +static inline int +devm_of_dma_controller_register(struct device *dev, struct device_node *np, + struct dma_chan *(*of_dma_xlate) + (struct of_phandle_args *, struct of_dma *), + void *data) +{ + int ret; + + ret = of_dma_controller_register(np, of_dma_xlate, data); + if (ret) + return ret; + + return devm_add_action_or_reset(dev, __of_dma_controller_free, np); +} + extern int of_dma_router_register(struct device_node *np, void *(*of_dma_route_allocate) (struct of_phandle_args *, struct of_dma *), @@ -64,6 +84,15 @@ static inline void of_dma_controller_free(struct device_node *np) { } +static inline int +devm_of_dma_controller_register(struct device *dev, struct device_node *np, + struct dma_chan *(*of_dma_xlate) + (struct of_phandle_args *, struct of_dma *), + void *data) +{ + return -ENODEV; +} + static inline int of_dma_router_register(struct device_node *np, void *(*of_dma_route_allocate) (struct of_phandle_args *, struct of_dma *), -- cgit v1.2.3 From 9ded47ad003f09a94b6a710b5c47f4aa5ceb7429 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 24 Feb 2026 09:25:54 +0100 Subject: fbdev: defio: Disconnect deferred I/O from the lifetime of struct fb_info Hold state of deferred I/O in struct fb_deferred_io_state. Allocate an instance as part of initializing deferred I/O and remove it only after the final mapping has been closed. If the fb_info and the contained deferred I/O meanwhile goes away, clear struct fb_deferred_io_state.info to invalidate the mapping. Any access will then result in a SIGBUS signal. Fixes a long-standing problem, where a device hot-unplug happens while user space still has an active mapping of the graphics memory. The hot- unplug frees the instance of struct fb_info. Accessing the memory will operate on undefined state. Signed-off-by: Thomas Zimmermann Fixes: 60b59beafba8 ("fbdev: mm: Deferred IO support") Cc: Helge Deller Cc: linux-fbdev@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: stable@vger.kernel.org # v2.6.22+ Signed-off-by: Helge Deller --- include/linux/fb.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index 6d4a58084fd5..aed17567fe50 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -218,13 +218,14 @@ struct fb_deferred_io { unsigned long delay; bool sort_pagereflist; /* sort pagelist by offset */ int open_count; /* number of opened files; protected by fb_info lock */ - struct mutex lock; /* mutex that protects the pageref list */ struct list_head pagereflist; /* list of pagerefs for touched pages */ struct address_space *mapping; /* page cache object for fb device */ /* callback */ struct page *(*get_page)(struct fb_info *info, unsigned long offset); void (*deferred_io)(struct fb_info *info, struct list_head *pagelist); }; + +struct fb_deferred_io_state; #endif /* @@ -487,6 +488,7 @@ struct fb_info { unsigned long npagerefs; struct fb_deferred_io_pageref *pagerefs; struct fb_deferred_io *fbdefio; + struct fb_deferred_io_state *fbdefio_state; #endif const struct fb_ops *fbops; -- cgit v1.2.3 From 648bfb62da4e7a970f6b153bb8cdab1703877fcd Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 24 Feb 2026 09:25:56 +0100 Subject: fbdev: defio: Move variable state into struct fb_deferred_io_state Move variable fields from struct fb_deferred_io into struct fb_deferred_io_state. These fields are internal to the defio code and should not be exposed to drivers. At some later point, struct fb_defered_io might become const in all defio code. Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- include/linux/fb.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index aed17567fe50..2791777f3a50 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -217,9 +217,6 @@ struct fb_deferred_io { /* delay between mkwrite and deferred handler */ unsigned long delay; bool sort_pagereflist; /* sort pagelist by offset */ - int open_count; /* number of opened files; protected by fb_info lock */ - struct list_head pagereflist; /* list of pagerefs for touched pages */ - struct address_space *mapping; /* page cache object for fb device */ /* callback */ struct page *(*get_page)(struct fb_info *info, unsigned long offset); void (*deferred_io)(struct fb_info *info, struct list_head *pagelist); -- cgit v1.2.3 From 02fe86e5fc22faee9b29e761d9fdd17fdfe583e6 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 24 Feb 2026 09:25:57 +0100 Subject: fbdev: defio: Move pageref array to struct fb_deferred_io_state The pageref array stores all pageref structures for a device's defio helpers. Move it into struct fb_deferred_io_state to not expose it to drivers. Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- include/linux/fb.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index 2791777f3a50..b27943719fab 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -482,8 +482,6 @@ struct fb_info { #ifdef CONFIG_FB_DEFERRED_IO struct delayed_work deferred_work; - unsigned long npagerefs; - struct fb_deferred_io_pageref *pagerefs; struct fb_deferred_io *fbdefio; struct fb_deferred_io_state *fbdefio_state; #endif -- cgit v1.2.3 From 993bcaf32c494014f56357f6cdd87fdfaa4e4d11 Mon Sep 17 00:00:00 2001 From: Josua Mayer Date: Thu, 26 Feb 2026 15:21:11 +0200 Subject: mux: Add helper functions for getting optional and selected mux-state In-tree phy-can-transceiver and phy_rcar_gen3_usb2 have already implemented local versions of devm_mux_state_get_optional. The omap-i2c driver gets and selects an optional mux in its probe function without using any helper. Add new helper functions covering both aforementioned use-cases: - mux_control_get_optional: Get a mux-control if specified in dt, return NULL otherwise. - devm_mux_state_get_optional: Get a mux-state if specified in dt, return NULL otherwise. - devm_mux_state_get_selected: Get and select a mux-state specified in dt, return error otherwise. - devm_mux_state_get_optional_selected: Get and select a mux-state if specified in dt, return error or NULL. Existing mux_get helper function is changed to take an extra argument indicating whether the mux is optional. In this case no error is printed, and NULL returned in case of ENOENT. Calling code is adapted to handle NULL return case, and to pass optional argument as required. To support automatic deselect for _selected helper, a new structure is created storing an exit pointer similar to clock core which is called on release. To facilitate code sharing between optional/mandatory/selected helpers, a new internal helper function is added to handle quiet (optional) and verbose (mandatory) errors, as well as storing the correct callback for devm release: __devm_mux_state_get Due to this structure devm_mux_state_get_*_selected can no longer print a useful error message when select fails. Instead callers should print errors where needed. Commit e153fdea9db04 ("phy: can-transceiver: Re-instate "mux-states" property presence check") noted that "mux_get() always prints an error message in case of an error, including when the property is not present, confusing the user." The first error message covers the case that a mux name is not matched in dt. The second error message is based on of_parse_phandle_with_args return value. In optional case no error is printed and NULL is returned. This ensures that the new helper functions will not confuse the user either. With the addition of optional helper functions it became clear that drivers should compile and link even if CONFIG_MULTIPLEXER was not enabled. Add stubs for all symbols exported by mux core. Acked-by: Wolfram Sang Signed-off-by: Josua Mayer Signed-off-by: Ulf Hansson --- include/linux/mux/consumer.h | 108 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 104 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mux/consumer.h b/include/linux/mux/consumer.h index 2e25c838f831..a961861a503b 100644 --- a/include/linux/mux/consumer.h +++ b/include/linux/mux/consumer.h @@ -16,6 +16,8 @@ struct device; struct mux_control; struct mux_state; +#if IS_ENABLED(CONFIG_MULTIPLEXER) + unsigned int mux_control_states(struct mux_control *mux); int __must_check mux_control_select_delay(struct mux_control *mux, unsigned int state, @@ -54,11 +56,109 @@ int mux_control_deselect(struct mux_control *mux); int mux_state_deselect(struct mux_state *mstate); struct mux_control *mux_control_get(struct device *dev, const char *mux_name); +struct mux_control *mux_control_get_optional(struct device *dev, const char *mux_name); void mux_control_put(struct mux_control *mux); -struct mux_control *devm_mux_control_get(struct device *dev, - const char *mux_name); -struct mux_state *devm_mux_state_get(struct device *dev, - const char *mux_name); +struct mux_control *devm_mux_control_get(struct device *dev, const char *mux_name); +struct mux_state *devm_mux_state_get(struct device *dev, const char *mux_name); +struct mux_state *devm_mux_state_get_optional(struct device *dev, const char *mux_name); +struct mux_state *devm_mux_state_get_selected(struct device *dev, const char *mux_name); +struct mux_state *devm_mux_state_get_optional_selected(struct device *dev, const char *mux_name); + +#else + +static inline unsigned int mux_control_states(struct mux_control *mux) +{ + return 0; +} +static inline int __must_check mux_control_select_delay(struct mux_control *mux, + unsigned int state, unsigned int delay_us) +{ + return -EOPNOTSUPP; +} +static inline int __must_check mux_state_select_delay(struct mux_state *mstate, + unsigned int delay_us) +{ + return -EOPNOTSUPP; +} +static inline int __must_check mux_control_try_select_delay(struct mux_control *mux, + unsigned int state, + unsigned int delay_us) +{ + return -EOPNOTSUPP; +} +static inline int __must_check mux_state_try_select_delay(struct mux_state *mstate, + unsigned int delay_us) +{ + return -EOPNOTSUPP; +} + +static inline int __must_check mux_control_select(struct mux_control *mux, + unsigned int state) +{ + return -EOPNOTSUPP; +} + +static inline int __must_check mux_state_select(struct mux_state *mstate) +{ + return -EOPNOTSUPP; +} + +static inline int __must_check mux_control_try_select(struct mux_control *mux, + unsigned int state) +{ + return -EOPNOTSUPP; +} + +static inline int __must_check mux_state_try_select(struct mux_state *mstate) +{ + return -EOPNOTSUPP; +} + +static inline int mux_control_deselect(struct mux_control *mux) +{ + return -EOPNOTSUPP; +} +static inline int mux_state_deselect(struct mux_state *mstate) +{ + return -EOPNOTSUPP; +} + +static inline struct mux_control *mux_control_get(struct device *dev, const char *mux_name) +{ + return ERR_PTR(-EOPNOTSUPP); +} +static inline struct mux_control *mux_control_get_optional(struct device *dev, + const char *mux_name) +{ + return NULL; +} +static inline void mux_control_put(struct mux_control *mux) {} + +static inline struct mux_control *devm_mux_control_get(struct device *dev, const char *mux_name) +{ + return ERR_PTR(-EOPNOTSUPP); +} +static inline struct mux_state *devm_mux_state_get(struct device *dev, const char *mux_name) +{ + return ERR_PTR(-EOPNOTSUPP); +} +static inline struct mux_state *devm_mux_state_get_optional(struct device *dev, + const char *mux_name) +{ + return NULL; +} +static inline struct mux_state *devm_mux_state_get_selected(struct device *dev, + const char *mux_name) +{ + return ERR_PTR(-EOPNOTSUPP); +} +static inline struct mux_state *devm_mux_state_get_optional_selected(struct device *dev, + const char *mux_name) +{ + return NULL; +} + +#endif /* CONFIG_MULTIPLEXER */ #endif /* _LINUX_MUX_CONSUMER_H */ -- cgit v1.2.3 From 7ea25eaad5ae3a6c837a3df9bdb822194f002565 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Feb 2026 05:20:01 -0800 Subject: block: factor out a bio_integrity_action helper Split the logic to see if a bio needs integrity metadata from bio_integrity_prep into a reusable helper than can be called from file system code. Signed-off-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. Petersen Reviewed-by: Darrick J. Wong Tested-by: Anuj Gupta Signed-off-by: Jens Axboe --- include/linux/bio-integrity.h | 5 ++--- include/linux/blk-integrity.h | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 21e4652dcfd2..276cbbdd2c9d 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -78,7 +78,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter); int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta); void bio_integrity_unmap_user(struct bio *bio); -bool bio_integrity_prep(struct bio *bio); +void bio_integrity_prep(struct bio *bio, unsigned int action); void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); void bio_integrity_trim(struct bio *bio); int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask); @@ -104,9 +104,8 @@ static inline void bio_integrity_unmap_user(struct bio *bio) { } -static inline bool bio_integrity_prep(struct bio *bio) +static inline void bio_integrity_prep(struct bio *bio, unsigned int action) { - return true; } static inline int bio_integrity_clone(struct bio *bio, struct bio *bio_src, diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index c15b1ac62765..fd3f3c8c0fcd 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -180,4 +180,27 @@ static inline struct bio_vec rq_integrity_vec(struct request *rq) } #endif /* CONFIG_BLK_DEV_INTEGRITY */ +enum bio_integrity_action { + BI_ACT_BUFFER = (1u << 0), /* allocate buffer */ + BI_ACT_CHECK = (1u << 1), /* generate / verify PI */ + BI_ACT_ZERO = (1u << 2), /* zero buffer */ +}; + +/** + * bio_integrity_action - return the integrity action needed for a bio + * @bio: bio to operate on + * + * Returns the mask of integrity actions (BI_ACT_*) that need to be performed + * for @bio. + */ +unsigned int __bio_integrity_action(struct bio *bio); +static inline unsigned int bio_integrity_action(struct bio *bio) +{ + if (!blk_get_integrity(bio->bi_bdev->bd_disk)) + return 0; + if (bio_integrity(bio)) + return 0; + return __bio_integrity_action(bio); +} + #endif /* _LINUX_BLK_INTEGRITY_H */ -- cgit v1.2.3 From a936655697cd8d1bab2fd5189e2c33dd6356a266 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Feb 2026 05:20:02 -0800 Subject: block: factor out a bio_integrity_setup_default helper Add a helper to set the seed and check flag based on useful defaults from the profile. Note that this includes a small behavior change, as we now only set the seed if any action is set, which is fine as nothing will look at it otherwise. Signed-off-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. Petersen Reviewed-by: Darrick J. Wong Tested-by: Anuj Gupta Signed-off-by: Jens Axboe --- include/linux/bio-integrity.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 276cbbdd2c9d..232b86b9bbcb 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -143,5 +143,6 @@ static inline int bio_integrity_add_page(struct bio *bio, struct page *page, void bio_integrity_alloc_buf(struct bio *bio, bool zero_buffer); void bio_integrity_free_buf(struct bio_integrity_payload *bip); +void bio_integrity_setup_default(struct bio *bio); #endif /* _LINUX_BIO_INTEGRITY_H */ -- cgit v1.2.3 From 7afe93946dff63aa57c6db81f5eb43ac8233364e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Feb 2026 05:20:03 -0800 Subject: block: add a bdev_has_integrity_csum helper Factor out a helper to see if the block device has an integrity checksum from bdev_stable_writes so that it can be reused for other checks. Signed-off-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. Petersen Reviewed-by: Darrick J. Wong Tested-by: Anuj Gupta Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d463b9b5a0a5..dec0acaed6e6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1477,14 +1477,18 @@ static inline bool bdev_synchronous(struct block_device *bdev) return bdev->bd_disk->queue->limits.features & BLK_FEAT_SYNCHRONOUS; } -static inline bool bdev_stable_writes(struct block_device *bdev) +static inline bool bdev_has_integrity_csum(struct block_device *bdev) { - struct request_queue *q = bdev_get_queue(bdev); + struct queue_limits *lim = bdev_limits(bdev); - if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && - q->limits.integrity.csum_type != BLK_INTEGRITY_CSUM_NONE) - return true; - return q->limits.features & BLK_FEAT_STABLE_WRITES; + return IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && + lim->integrity.csum_type != BLK_INTEGRITY_CSUM_NONE; +} + +static inline bool bdev_stable_writes(struct block_device *bdev) +{ + return bdev_has_integrity_csum(bdev) || + (bdev_limits(bdev)->features & BLK_FEAT_STABLE_WRITES); } static inline bool blk_queue_write_cache(struct request_queue *q) -- cgit v1.2.3 From 8c56ef10150ed7650cf4105539242c94c156148c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Feb 2026 05:20:05 -0800 Subject: block: make max_integrity_io_size public File systems that generate integrity will need this, so move it out of the block private or blk-mq specific headers. Signed-off-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. Petersen Reviewed-by: Darrick J. Wong Tested-by: Anuj Gupta Signed-off-by: Jens Axboe --- include/linux/blk-integrity.h | 5 ----- include/linux/blkdev.h | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index fd3f3c8c0fcd..ea6d7d322ae3 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -8,11 +8,6 @@ struct request; -/* - * Maximum contiguous integrity buffer allocation. - */ -#define BLK_INTEGRITY_MAX_SIZE SZ_2M - enum blk_integrity_flags { BLK_INTEGRITY_NOVERIFY = 1 << 0, BLK_INTEGRITY_NOGENERATE = 1 << 1, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index dec0acaed6e6..11857ae13d10 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1881,6 +1881,24 @@ static inline int bio_split_rw_at(struct bio *bio, return bio_split_io_at(bio, lim, segs, max_bytes, lim->dma_alignment); } +/* + * Maximum contiguous integrity buffer allocation. + */ +#define BLK_INTEGRITY_MAX_SIZE SZ_2M + +/* + * Maximum size of I/O that needs a block layer integrity buffer. Limited + * by the number of intervals for which we can fit the integrity buffer into + * the buffer size. Because the buffer is a single segment it is also limited + * by the maximum segment size. + */ +static inline unsigned int max_integrity_io_size(struct queue_limits *lim) +{ + return min_t(unsigned int, lim->max_segment_size, + (BLK_INTEGRITY_MAX_SIZE / lim->integrity.metadata_size) << + lim->integrity.interval_exp); +} + #define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { } #endif /* _LINUX_BLKDEV_H */ -- cgit v1.2.3 From 0bde8a12b5540572a7fd6d2867bee6de15e4f289 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Feb 2026 05:20:06 -0800 Subject: block: add fs_bio_integrity helpers Add a set of helpers for file system initiated integrity information. These include mempool backed allocations and verifying based on a passed in sector and size which is often available from file system completion routines. Signed-off-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. Petersen Reviewed-by: Darrick J. Wong Tested-by: Anuj Gupta Signed-off-by: Jens Axboe --- include/linux/bio-integrity.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 232b86b9bbcb..af5178434ec6 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -145,4 +145,10 @@ void bio_integrity_alloc_buf(struct bio *bio, bool zero_buffer); void bio_integrity_free_buf(struct bio_integrity_payload *bip); void bio_integrity_setup_default(struct bio *bio); +unsigned int fs_bio_integrity_alloc(struct bio *bio); +void fs_bio_integrity_free(struct bio *bio); +void fs_bio_integrity_generate(struct bio *bio); +int fs_bio_integrity_verify(struct bio *bio, sector_t sector, + unsigned int size); + #endif /* _LINUX_BIO_INTEGRITY_H */ -- cgit v1.2.3 From a9aa6045abde87b94168c3ba034b953417e27272 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Feb 2026 05:20:07 -0800 Subject: block: pass a maxlen argument to bio_iov_iter_bounce Allow the file system to limit the size processed in a single bounce operation. This is needed when generating integrity data so that the size of a single integrity segment can't overflow. Signed-off-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. Petersen Reviewed-by: Darrick J. Wong Tested-by: Anuj Gupta Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 36a3f2275ecd..9693a0d6fefe 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -474,7 +474,7 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); -int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter); +int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen); void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty); extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, -- cgit v1.2.3 From b9e0180b2e6a48532eb80e5cd8793157196586cf Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:43 +0100 Subject: fbdev: Declare src parameter of fb_pad_ helpers as constant Fbdev's padding helpers do not modify the source buffer. Declare the parameter as 'const'. Fbcon's font-rendering code calls these helpers with the font data. Declaring src as const will allow for making the font data constant as well. While at it, also remove the extern qualifier from the function declarations in the header file. Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- include/linux/fb.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index b27943719fab..5178a33c752c 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -602,9 +602,9 @@ extern int register_framebuffer(struct fb_info *fb_info); extern void unregister_framebuffer(struct fb_info *fb_info); extern int devm_register_framebuffer(struct device *dev, struct fb_info *fb_info); extern char* fb_get_buffer_offset(struct fb_info *info, struct fb_pixmap *buf, u32 size); -extern void fb_pad_unaligned_buffer(u8 *dst, u32 d_pitch, u8 *src, u32 idx, - u32 height, u32 shift_high, u32 shift_low, u32 mod); -extern void fb_pad_aligned_buffer(u8 *dst, u32 d_pitch, u8 *src, u32 s_pitch, u32 height); +void fb_pad_unaligned_buffer(u8 *dst, u32 d_pitch, const u8 *src, u32 idx, u32 height, + u32 shift_high, u32 shift_low, u32 mod); +void fb_pad_aligned_buffer(u8 *dst, u32 d_pitch, const u8 *src, u32 s_pitch, u32 height); extern void fb_set_suspend(struct fb_info *info, int state); extern int fb_get_color_depth(struct fb_var_screeninfo *var, struct fb_fix_screeninfo *fix); @@ -630,8 +630,8 @@ static inline struct device *dev_of_fbinfo(const struct fb_info *info) #endif } -static inline void __fb_pad_aligned_buffer(u8 *dst, u32 d_pitch, - u8 *src, u32 s_pitch, u32 height) +static inline void __fb_pad_aligned_buffer(u8 *dst, u32 d_pitch, const u8 *src, u32 s_pitch, + u32 height) { u32 i, j; -- cgit v1.2.3 From 982f8b002aadef2b5169147b3a60a9eb62f908df Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:44 +0100 Subject: vt: Remove trailing whitespaces Fix coding style. No functional changes. Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- include/linux/console_struct.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h index 13b35637bd5a..ebdb9750d348 100644 --- a/include/linux/console_struct.h +++ b/include/linux/console_struct.h @@ -120,7 +120,7 @@ struct vc_data { unsigned short vc_complement_mask; /* [#] Xor mask for mouse pointer */ unsigned short vc_s_complement_mask; /* Saved mouse pointer mask */ unsigned long vc_pos; /* Cursor address */ - /* fonts */ + /* fonts */ unsigned short vc_hi_font_mask; /* [#] Attribute set for upper 256 chars of font or 0 if not supported */ struct console_font vc_font; /* Current VC font set */ unsigned short vc_video_erase_char; /* Background erase character */ -- cgit v1.2.3 From 61912c607fa9955dcf3fc018b227baa98a6776dc Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:45 +0100 Subject: vt: Store font in struct vc_font Replace struct console_font with struct vc_font for the type of the vc_font field of struct vc_data. Struct console_font is UAPI, which prevents further changes. Hence a new data type is required. Struct console_font has a documented vertical pitch of 32 bytes. This is not the case after the font data has been loaded into the kernel. Changing the type of vc_font addresses this inconsistency. The font data is now declared as constant, as it might come from the kernel's read-only section. There's some fallout throughout the console code where non-const variables refer to it. Fix them. A later update will declare the font data to a dedicated data type. v3: - fix typos Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- include/linux/console_struct.h | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h index ebdb9750d348..ea0cdf4278a3 100644 --- a/include/linux/console_struct.h +++ b/include/linux/console_struct.h @@ -13,8 +13,9 @@ #ifndef _LINUX_CONSOLE_STRUCT_H #define _LINUX_CONSOLE_STRUCT_H -#include +#include #include +#include #include struct uni_pagedict; @@ -58,6 +59,30 @@ struct vc_state { bool reverse; }; +/** + * struct vc_font - Describes a font + * @width: The width of a single glyph in bits + * @height: The height of a single glyph in scanlines + * @charcount: The number of glyphs in the font + * @data: The raw font data + * + * Font data is organized as an array of glyphs. Each glyph is a bitmap with + * set bits indicating the foreground color. Unset bits indicate background + * color. The fields @width and @height store a single glyph's number of + * horizontal bits and vertical scanlines. If width is not a multiple of 8, + * there are trailing bits to fill up the byte. These bits should not be drawn. + * + * The field @data points to the first glyph's first byte. The value @charcount + * gives the number of glyphs in the font. There are no empty scanlines between + * two adjacent glyphs. + */ +struct vc_font { + unsigned int width; + unsigned int height; + unsigned int charcount; + const unsigned char *data; +}; + /* * Example: vc_data of a console that was scrolled 3 lines down. * @@ -122,7 +147,7 @@ struct vc_data { unsigned long vc_pos; /* Cursor address */ /* fonts */ unsigned short vc_hi_font_mask; /* [#] Attribute set for upper 256 chars of font or 0 if not supported */ - struct console_font vc_font; /* Current VC font set */ + struct vc_font vc_font; /* Current VC font set */ unsigned short vc_video_erase_char; /* Background erase character */ /* VT terminal data */ unsigned int vc_state; /* Escape sequence parser state */ -- cgit v1.2.3 From e370d84b79ad28ecf9a9e1dad967aa64dbfbd8d8 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:46 +0100 Subject: vt: Calculate font-buffer size with vc_font_size() In fbcon, fbcon_resize() computes the size of the font buffer from the values stored in vc_font. Move these calculations to the dedicated helpers vc_font_pitch() and vc_font_size(). Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- include/linux/console_struct.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h index ea0cdf4278a3..771cba16cb54 100644 --- a/include/linux/console_struct.h +++ b/include/linux/console_struct.h @@ -83,6 +83,34 @@ struct vc_font { const unsigned char *data; }; +/** + * vc_font_pitch - Calculates the number of bytes between two adjacent scanlines + * @font: The VC font + * + * Returns: + * The number of bytes between two adjacent scanlines in the font data + */ +static inline unsigned int vc_font_pitch(const struct vc_font *font) +{ + return DIV_ROUND_UP(font->width, 8); +} + +/** + * vc_font_size - Calculates the size of the font data in bytes + * @font: The VC font + * + * vc_font_size() calculates the number of bytes of font data in the + * font specified by @font. The function calculates the size from the + * font parameters. + * + * Returns: + * The size of the font data in bytes. + */ +static inline unsigned int vc_font_size(const struct vc_font *font) +{ + return font->height * vc_font_pitch(font) * font->charcount; +} + /* * Example: vc_data of a console that was scrolled 3 lines down. * -- cgit v1.2.3 From 773ac24c44614ad1d5a96258534160c4a0d48d72 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:48 +0100 Subject: lib/fonts: Remove FNTCHARCNT() The character count in the font data is unused. The internal fonts also do not set it. Remove FNTCHARCNT(). Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- include/linux/font.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/font.h b/include/linux/font.h index fd8625cd76b2..d929c5fa32ca 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -68,7 +68,6 @@ extern const struct font_desc *get_default_font(int xres, int yres, /* Extra word getters */ #define REFCOUNT(fd) (((int *)(fd))[-1]) #define FNTSIZE(fd) (((int *)(fd))[-2]) -#define FNTCHARCNT(fd) (((int *)(fd))[-3]) #define FNTSUM(fd) (((int *)(fd))[-4]) #define FONT_EXTRA_WORDS 4 -- cgit v1.2.3 From 04bd5abc8cbebc1bd7e02471a8e3af51b8aad029 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:49 +0100 Subject: lib/fonts: Store font data as font_data_t; update consoles Store font data as pointer to font_data_t instead of unsigned char. Update consoles. Pointers to font data refer to the raw data. There is a hidden header before the data that contains additional state. Document the existing layout and semantics of font_data_t. The data field in struct vc_font can be used by any console. Therefore it still points to plain data without the additional header. Fbcon sets its value from struct fbcon_display.fontdata. Hence, update the size test in fbcon_resize() to use struct fbcon_display.fontdata instead of struct vc_font.data. v3: - fix typos (Helge) v2: - 'Font lookup' -> 'Font description' in Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- include/linux/font.h | 47 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/font.h b/include/linux/font.h index d929c5fa32ca..746a0996a018 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -13,12 +13,57 @@ #include +/* + * font_data_t and helpers + */ + +/** + * font_data_t - Raw font data + * + * Values of type font_data_t store a pointer to raw font data. The format + * is monochrome. Each bit sets a pixel of a stored glyph. Font data does + * not store geometry information for the individual glyphs. Users of the + * font have to store glyph size, pitch and character count separately. + * + * Font data in font_data_t is not equivalent to raw u8. Each pointer stores + * an additional hidden header before the font data. The layout is + * + * +------+-----------------------------+ + * | -16 | CRC32 Checksum (optional) | + * | -12 | | + * | -8 | Number of data bytes | + * | -4 | Reference count | + * +------+-----------------------------+ + * | 0 | Data buffer | + * | ... | | + * +------+-----------------------------+ + * + * Use helpers to access font_data_t. Use font_data_buf() to get the stored data. + */ +typedef const unsigned char font_data_t; + +/** + * font_data_buf() - Returns the font data as raw bytes + * @fd: The font data + * + * Returns: + * The raw font data. The provided buffer is read-only. + */ +static inline const unsigned char *font_data_buf(font_data_t *fd) +{ + return (const unsigned char *)fd; +} + +/* + * Font description + */ + struct font_desc { int idx; const char *name; unsigned int width, height; unsigned int charcount; - const void *data; + font_data_t *data; int pref; }; -- cgit v1.2.3 From e2e000a0b22036a72474088d3399097ff47ace02 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:50 +0100 Subject: lib/fonts: Read font size with font_data_size() Add font_data_size() and update consoles to use it. Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- include/linux/font.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/font.h b/include/linux/font.h index 746a0996a018..5b8557813c5c 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -54,6 +54,8 @@ static inline const unsigned char *font_data_buf(font_data_t *fd) return (const unsigned char *)fd; } +unsigned int font_data_size(font_data_t *fd); + /* * Font description */ -- cgit v1.2.3 From 1de371b1f1b02dc82da598f9707089229699a604 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:51 +0100 Subject: lib/fonts: Manage font-data lifetime with font_data_get/_put() Add font_data_get() and font_data_put(). Update consoles to use them over REFCOUNT() and plain kfree(). Newly allocated font data starts with a reference count of 1. Loading the font puts the previously loaded font. If the reference count reaches zero, font_data_put() frees the font data. The kernel stores a refcount of zero for internal font data. Invoking font_data_get() and font_data_put() tests this internally and returns success without further operation. From the caller's perspective, getting and putting works the same for all font data. Fbcon used the userfont flag distinguish between internal fonts and fonts loaded by user space. Only the latter where refcounted. With the new helper's automatic handling of internal font data, remove the userfont flag from fbcon. Newport_con uses a default font, FONT_DATA, until user space loads custom font data. Remove all special cases for FONT_DATA, as the get and put calls' read-only handling also covers this case. v3: - fix module linker error wrt font symbols (Nathan, Arnd) - fix typos Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- include/linux/font.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/font.h b/include/linux/font.h index 5b8557813c5c..dd319d0f0201 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -54,6 +54,8 @@ static inline const unsigned char *font_data_buf(font_data_t *fd) return (const unsigned char *)fd; } +void font_data_get(font_data_t *fd); +bool font_data_put(font_data_t *fd); unsigned int font_data_size(font_data_t *fd); /* -- cgit v1.2.3 From 1e3c49aa03fbfeea595ca0c9a4ebaf1e9cc078af Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:52 +0100 Subject: lib/fonts: Compare font data for equality with font_data_is_equal() Add font_data_is_equal() and update consoles to use it. Font data is equal if it has the same size and contains the same values on all bytes. Only fbcon uses a crc32 checksum. If set in both operands the checksums have to be equal. The new helper also guarantees to not compare internal fonts against fonts from user space. Internal fonts cannot be ref-counted, so making them equal to user-space fonts with the same byte sequence results in undefined behavior. The test only compares data buffers. Their interpretation is up each console. Therefore remove a width test in fbcon_set_font(). v3: - rebase onto font_data_{get,put}() Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- include/linux/font.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/font.h b/include/linux/font.h index dd319d0f0201..58bf3c64cabb 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -57,6 +57,7 @@ static inline const unsigned char *font_data_buf(font_data_t *fd) void font_data_get(font_data_t *fd); bool font_data_put(font_data_t *fd); unsigned int font_data_size(font_data_t *fd); +bool font_data_is_equal(font_data_t *lhs, font_data_t *rhs); /* * Font description -- cgit v1.2.3 From 514d0de7cf403144d3e6c5b9fabb1ce4c15974ca Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:53 +0100 Subject: lib/fonts: Create font_data_t from struct console_font with font_data_import() Add font_data_import() and update consoles to use it. The implementation of font_data_import() is based on code from fbcon, which supports overflow checks and crc32 checksums. Fbcon uses the crc32 checksum. Newport_con now implements the same overflow checks as fbcon. As before, this console does not support checksums, which are optional. Newport_con can now also handle input font data with a vertical pitch other than 32 bytes. (The vertical pitch is the offset between two glyphs in the font data.) As an internal change, remove the const qualifier from the data field if struct font_data. This allows font_data_import() to write the data without type casting. For all users of the font data via font_data_t, the stored data is still read only. v3: - fix typos Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- include/linux/font.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/font.h b/include/linux/font.h index 58bf3c64cabb..3eb4818402c5 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -13,6 +13,8 @@ #include +struct console_font; + /* * font_data_t and helpers */ @@ -54,6 +56,8 @@ static inline const unsigned char *font_data_buf(font_data_t *fd) return (const unsigned char *)fd; } +font_data_t *font_data_import(const struct console_font *font, unsigned int vpitch, + u32 (*calc_csum)(u32, const void *, size_t)); void font_data_get(font_data_t *fd); bool font_data_put(font_data_t *fd); unsigned int font_data_size(font_data_t *fd); @@ -124,7 +128,7 @@ extern const struct font_desc *get_default_font(int xres, int yres, struct font_data { unsigned int extra[FONT_EXTRA_WORDS]; - const unsigned char data[]; + unsigned char data[]; } __packed; #endif /* _VIDEO_FONT_H */ -- cgit v1.2.3 From c37bd7c8d36f760c064de2639423866dc0270997 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:54 +0100 Subject: lib/fonts: Store font data for user space with font_data_export() Add font_data_export() and update consoles to use it. The helper font_data_export() is based on code in fbcon_get_font(). It extends the size of a single glyph to match the requested vpitch, which us usually 32 bytes for fonts from user space. Internal fonts have a pitch according to the glyph's height. The implementation of font_data_export() differs in several ways from the original code. The original implementation distinguished between different pitches of the font data. This is not necessary as the pitch is a parameter in the copying. There was also special handling for a font pitch of 3 bytes, which got expanded to 4 bytes (with trailing bits on each scanline). The logic originated from long before git history exists even in the historical tree. So it is not clear why this was implemented. It is not what user space expects. The setfont utitlity loads font with 3-bytes pitches and expects to read such fonts with a 3-byte pitch. For any font width, the font pitch is always the width extended to the next multiple of 8. See [1] for the user-space font-reading code. With the changes to handling the font pitches, font_data_export() replaces the original code's various special cases with a single copying logic. v3: - fix typos (Helge) Signed-off-by: Thomas Zimmermann Link: https://github.com/legionus/kbd/blob/v2.9.0/src/libkfont/kdfontop.c#L73 # [1] Signed-off-by: Helge Deller --- include/linux/font.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/font.h b/include/linux/font.h index 3eb4818402c5..d80db66a5c17 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -62,6 +62,7 @@ void font_data_get(font_data_t *fd); bool font_data_put(font_data_t *fd); unsigned int font_data_size(font_data_t *fd); bool font_data_is_equal(font_data_t *lhs, font_data_t *rhs); +int font_data_export(font_data_t *fd, struct console_font *font, unsigned int vpitch); /* * Font description -- cgit v1.2.3 From db65872b38dc9f18a62669d6ae1e4ec7868a85a9 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:55 +0100 Subject: lib/fonts: Remove internal symbols and macros from public header file Define access macros for font_data_t in fonts.c. Define struct font_data and declare most of the font symbols in the internal header font.h, where they can only be seen by the font code. Also move font indices into internal font.h. They appear to be unused though. There is m86k assembly code that operates on the idx field, so leave them in place for now. List all built-in fonts in a separate section in the public header file. v2: - do not add config guards around font symbols (Helge) - keep declaration of built-in fonts in public header Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- include/linux/font.h | 57 ++++++++++++++++------------------------------------ 1 file changed, 17 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/font.h b/include/linux/font.h index d80db66a5c17..5401f07dd6ce 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -77,36 +77,6 @@ struct font_desc { int pref; }; -#define VGA8x8_IDX 0 -#define VGA8x16_IDX 1 -#define PEARL8x8_IDX 2 -#define VGA6x11_IDX 3 -#define FONT7x14_IDX 4 -#define FONT10x18_IDX 5 -#define SUN8x16_IDX 6 -#define SUN12x22_IDX 7 -#define ACORN8x8_IDX 8 -#define MINI4x6_IDX 9 -#define FONT6x10_IDX 10 -#define TER16x32_IDX 11 -#define FONT6x8_IDX 12 -#define TER10x18_IDX 13 - -extern const struct font_desc font_vga_8x8, - font_vga_8x16, - font_pearl_8x8, - font_vga_6x11, - font_7x14, - font_10x18, - font_sun_8x16, - font_sun_12x22, - font_acorn_8x8, - font_mini_4x6, - font_6x10, - font_ter_16x32, - font_6x8, - font_ter_10x18; - /* Find a font with a specific name */ extern const struct font_desc *find_font(const char *name); @@ -120,16 +90,23 @@ extern const struct font_desc *get_default_font(int xres, int yres, /* Max. length for the name of a predefined font */ #define MAX_FONT_NAME 32 -/* Extra word getters */ -#define REFCOUNT(fd) (((int *)(fd))[-1]) -#define FNTSIZE(fd) (((int *)(fd))[-2]) -#define FNTSUM(fd) (((int *)(fd))[-4]) - -#define FONT_EXTRA_WORDS 4 +/* + * Built-in fonts + */ -struct font_data { - unsigned int extra[FONT_EXTRA_WORDS]; - unsigned char data[]; -} __packed; +extern const struct font_desc font_10x18; +extern const struct font_desc font_6x10; +extern const struct font_desc font_6x8; +extern const struct font_desc font_7x14; +extern const struct font_desc font_acorn_8x8; +extern const struct font_desc font_mini_4x6; +extern const struct font_desc font_pearl_8x8; +extern const struct font_desc font_sun_12x22; +extern const struct font_desc font_sun_8x16; +extern const struct font_desc font_ter_10x18; +extern const struct font_desc font_ter_16x32; +extern const struct font_desc font_vga_6x11; +extern const struct font_desc font_vga_8x16; +extern const struct font_desc font_vga_8x8; #endif /* _VIDEO_FONT_H */ -- cgit v1.2.3 From 2e67fabd8b77c4f482df9b211bca1b495c6c2c24 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:49 +0000 Subject: ring-buffer: Introduce ring-buffer remotes Add ring-buffer remotes to support entities outside of the kernel (such as firmware or a hypervisor) that writes events into a ring-buffer using the tracefs format Require a description of the ring-buffer pages (struct trace_buffer_desc) and callbacks (swap_reader_page and reset) to set up the ring-buffer on the kernel side. Expect the remote entity to maintain and update the meta-page. Link: https://patch.msgid.link/20260309162516.2623589-4-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 58 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index d862fa610270..994f52b34344 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -251,4 +251,62 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, void ring_buffer_map_dup(struct trace_buffer *buffer, int cpu); int ring_buffer_unmap(struct trace_buffer *buffer, int cpu); int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu); + +struct ring_buffer_desc { + int cpu; + unsigned int nr_page_va; /* excludes the meta page */ + unsigned long meta_va; + unsigned long page_va[] __counted_by(nr_page_va); +}; + +struct trace_buffer_desc { + int nr_cpus; + size_t struct_len; + char __data[]; /* list of ring_buffer_desc */ +}; + +static inline struct ring_buffer_desc *__next_ring_buffer_desc(struct ring_buffer_desc *desc) +{ + size_t len = struct_size(desc, page_va, desc->nr_page_va); + + return (struct ring_buffer_desc *)((void *)desc + len); +} + +static inline struct ring_buffer_desc *__first_ring_buffer_desc(struct trace_buffer_desc *desc) +{ + return (struct ring_buffer_desc *)(&desc->__data[0]); +} + +static inline size_t trace_buffer_desc_size(size_t buffer_size, unsigned int nr_cpus) +{ + unsigned int nr_pages = max(DIV_ROUND_UP(buffer_size, PAGE_SIZE), 2UL) + 1; + struct ring_buffer_desc *rbdesc; + + return size_add(offsetof(struct trace_buffer_desc, __data), + size_mul(nr_cpus, struct_size(rbdesc, page_va, nr_pages))); +} + +#define for_each_ring_buffer_desc(__pdesc, __cpu, __trace_pdesc) \ + for (__pdesc = __first_ring_buffer_desc(__trace_pdesc), __cpu = 0; \ + (__cpu) < (__trace_pdesc)->nr_cpus; \ + (__cpu)++, __pdesc = __next_ring_buffer_desc(__pdesc)) + +struct ring_buffer_remote { + struct trace_buffer_desc *desc; + int (*swap_reader_page)(unsigned int cpu, void *priv); + int (*reset)(unsigned int cpu, void *priv); + void *priv; +}; + +int ring_buffer_poll_remote(struct trace_buffer *buffer, int cpu); + +struct trace_buffer * +__ring_buffer_alloc_remote(struct ring_buffer_remote *remote, + struct lock_class_key *key); + +#define ring_buffer_alloc_remote(remote) \ +({ \ + static struct lock_class_key __key; \ + __ring_buffer_alloc_remote(remote, &__key); \ +}) #endif /* _LINUX_RING_BUFFER_H */ -- cgit v1.2.3 From 96e43537af5461b26f50904c6055046ba65d742f Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:51 +0000 Subject: tracing: Introduce trace remotes A trace remote relies on ring-buffer remotes to read and control compatible tracing buffers, written by entity such as firmware or hypervisor. Add a Tracefs directory remotes/ that contains all instances of trace remotes. Each instance follows the same hierarchy as any other to ease the support by existing user-space tools. This currently does not provide any event support, which will come later. Link: https://patch.msgid.link/20260309162516.2623589-6-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_remote.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 include/linux/trace_remote.h (limited to 'include/linux') diff --git a/include/linux/trace_remote.h b/include/linux/trace_remote.h new file mode 100644 index 000000000000..65b7e7b8267c --- /dev/null +++ b/include/linux/trace_remote.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LINUX_TRACE_REMOTE_H +#define _LINUX_TRACE_REMOTE_H + +#include + +/** + * struct trace_remote_callbacks - Callbacks used by Tracefs to control the remote + * @load_trace_buffer: Called before Tracefs accesses the trace buffer for the first + * time. Must return a &trace_buffer_desc + * (most likely filled with trace_remote_alloc_buffer()) + * @unload_trace_buffer: + * Called once Tracefs has no use for the trace buffer + * (most likely call trace_remote_free_buffer()) + * @enable_tracing: Called on Tracefs tracing_on. It is expected from the + * remote to allow writing. + * @swap_reader_page: Called when Tracefs consumes a new page from a + * ring-buffer. It is expected from the remote to isolate a + * new reader-page from the @cpu ring-buffer. + */ +struct trace_remote_callbacks { + struct trace_buffer_desc *(*load_trace_buffer)(unsigned long size, void *priv); + void (*unload_trace_buffer)(struct trace_buffer_desc *desc, void *priv); + int (*enable_tracing)(bool enable, void *priv); + int (*swap_reader_page)(unsigned int cpu, void *priv); +}; + +int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv); + +int trace_remote_alloc_buffer(struct trace_buffer_desc *desc, size_t desc_size, size_t buffer_size, + const struct cpumask *cpumask); + +void trace_remote_free_buffer(struct trace_buffer_desc *desc); + +#endif -- cgit v1.2.3 From 9af4ab0e11e336e2671d303ffcc6578e3546d9fc Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:52 +0000 Subject: tracing: Add reset to trace remotes Allow to reset the trace remote buffer by writing to the Tracefs "trace" file. This is similar to the regular Tracefs interface. Link: https://patch.msgid.link/20260309162516.2623589-7-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_remote.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/trace_remote.h b/include/linux/trace_remote.h index 65b7e7b8267c..10ca03dc192b 100644 --- a/include/linux/trace_remote.h +++ b/include/linux/trace_remote.h @@ -17,6 +17,8 @@ * remote to allow writing. * @swap_reader_page: Called when Tracefs consumes a new page from a * ring-buffer. It is expected from the remote to isolate a + * @reset: Called on `echo 0 > trace`. It is expected from the + * remote to reset all ring-buffer pages. * new reader-page from the @cpu ring-buffer. */ struct trace_remote_callbacks { @@ -24,6 +26,7 @@ struct trace_remote_callbacks { void (*unload_trace_buffer)(struct trace_buffer_desc *desc, void *priv); int (*enable_tracing)(bool enable, void *priv); int (*swap_reader_page)(unsigned int cpu, void *priv); + int (*reset)(unsigned int cpu, void *priv); }; int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv); -- cgit v1.2.3 From bf2ba0f8ca1af14aaaa765cbb93caf564d383aad Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:54 +0000 Subject: tracing: Add init callback to trace remotes Add a .init call back so the trace remote callers can add entries to the tracefs directory. Link: https://patch.msgid.link/20260309162516.2623589-9-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_remote.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/trace_remote.h b/include/linux/trace_remote.h index 10ca03dc192b..090c58b7d92b 100644 --- a/include/linux/trace_remote.h +++ b/include/linux/trace_remote.h @@ -3,10 +3,13 @@ #ifndef _LINUX_TRACE_REMOTE_H #define _LINUX_TRACE_REMOTE_H +#include #include /** * struct trace_remote_callbacks - Callbacks used by Tracefs to control the remote + * @init: Called once the remote has been registered. Allows the + * caller to extend the Tracefs remote directory * @load_trace_buffer: Called before Tracefs accesses the trace buffer for the first * time. Must return a &trace_buffer_desc * (most likely filled with trace_remote_alloc_buffer()) @@ -22,6 +25,7 @@ * new reader-page from the @cpu ring-buffer. */ struct trace_remote_callbacks { + int (*init)(struct dentry *d, void *priv); struct trace_buffer_desc *(*load_trace_buffer)(unsigned long size, void *priv); void (*unload_trace_buffer)(struct trace_buffer_desc *desc, void *priv); int (*enable_tracing)(bool enable, void *priv); -- cgit v1.2.3 From 072529158e604cc964feb78dcf094c6975828146 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:55 +0000 Subject: tracing: Add events to trace remotes An event is predefined point in the writer code that allows to log data. Following the same scheme as kernel events, add remote events, described to user-space within the events/ tracefs directory found in the corresponding trace remote. Remote events are expected to be described during the trace remote registration. Add also a .enable_event callback for trace_remote to toggle the event logging, if supported. Link: https://patch.msgid.link/20260309162516.2623589-10-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_remote.h | 7 ++++++- include/linux/trace_remote_event.h | 23 +++++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 include/linux/trace_remote_event.h (limited to 'include/linux') diff --git a/include/linux/trace_remote.h b/include/linux/trace_remote.h index 090c58b7d92b..fcd1d46ea466 100644 --- a/include/linux/trace_remote.h +++ b/include/linux/trace_remote.h @@ -5,6 +5,7 @@ #include #include +#include /** * struct trace_remote_callbacks - Callbacks used by Tracefs to control the remote @@ -23,6 +24,8 @@ * @reset: Called on `echo 0 > trace`. It is expected from the * remote to reset all ring-buffer pages. * new reader-page from the @cpu ring-buffer. + * @enable_event: Called on events/event_name/enable. It is expected from + * the remote to allow the writing event @id. */ struct trace_remote_callbacks { int (*init)(struct dentry *d, void *priv); @@ -31,9 +34,11 @@ struct trace_remote_callbacks { int (*enable_tracing)(bool enable, void *priv); int (*swap_reader_page)(unsigned int cpu, void *priv); int (*reset)(unsigned int cpu, void *priv); + int (*enable_event)(unsigned short id, bool enable, void *priv); }; -int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv); +int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv, + struct remote_event *events, size_t nr_events); int trace_remote_alloc_buffer(struct trace_buffer_desc *desc, size_t desc_size, size_t buffer_size, const struct cpumask *cpumask); diff --git a/include/linux/trace_remote_event.h b/include/linux/trace_remote_event.h new file mode 100644 index 000000000000..a4449008a075 --- /dev/null +++ b/include/linux/trace_remote_event.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LINUX_TRACE_REMOTE_EVENTS_H +#define _LINUX_TRACE_REMOTE_EVENTS_H + +struct trace_remote; +struct trace_event_fields; + +struct remote_event_hdr { + unsigned short id; +}; + +#define REMOTE_EVENT_NAME_MAX 30 +struct remote_event { + char name[REMOTE_EVENT_NAME_MAX]; + unsigned short id; + bool enabled; + struct trace_remote *remote; + struct trace_event_fields *fields; + char *print_fmt; + void (*print)(void *evt, struct trace_seq *seq); +}; +#endif -- cgit v1.2.3 From 5f3efd1dcebc35d44cce39630ae00980a45d9247 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:57 +0000 Subject: tracing: Add helpers to create trace remote events Declaring remote events can be cumbersome let's add a set of macros to simplify developers life. The declaration of a remote event is very similar to kernel's events: REMOTE_EVENT(name, id, RE_STRUCT( re_field(u64 foo) ), RE_PRINTK("foo=%llu", __entry->foo) ) Link: https://patch.msgid.link/20260309162516.2623589-12-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_remote_event.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/trace_remote_event.h b/include/linux/trace_remote_event.h index a4449008a075..c8ae1e1f5e72 100644 --- a/include/linux/trace_remote_event.h +++ b/include/linux/trace_remote_event.h @@ -5,6 +5,7 @@ struct trace_remote; struct trace_event_fields; +struct trace_seq; struct remote_event_hdr { unsigned short id; @@ -20,4 +21,13 @@ struct remote_event { char *print_fmt; void (*print)(void *evt, struct trace_seq *seq); }; + +#define RE_STRUCT(__args...) __args +#define re_field(__type, __field) __type __field; + +#define REMOTE_EVENT_FORMAT(__name, __struct) \ + struct remote_event_format_##__name { \ + struct remote_event_hdr hdr; \ + __struct \ + } #endif -- cgit v1.2.3 From 93ae1b76fff9e745f870a2f2cd32f472328c4a8f Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:58 +0000 Subject: ring-buffer: Export buffer_data_page and macros In preparation for allowing the writing of ring-buffer compliant pages outside of ring_buffer.c, move buffer_data_page and timestamps encoding macros into the publicly available ring_buffer_types.h. Link: https://patch.msgid.link/20260309162516.2623589-13-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer_types.h | 41 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 include/linux/ring_buffer_types.h (limited to 'include/linux') diff --git a/include/linux/ring_buffer_types.h b/include/linux/ring_buffer_types.h new file mode 100644 index 000000000000..54577021a49d --- /dev/null +++ b/include/linux/ring_buffer_types.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_RING_BUFFER_TYPES_H +#define _LINUX_RING_BUFFER_TYPES_H + +#include + +#define TS_SHIFT 27 +#define TS_MASK ((1ULL << TS_SHIFT) - 1) +#define TS_DELTA_TEST (~TS_MASK) + +/* + * We need to fit the time_stamp delta into 27 bits. + */ +static inline bool test_time_stamp(u64 delta) +{ + return !!(delta & TS_DELTA_TEST); +} + +#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) + +#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) +#define RB_ALIGNMENT 4U +#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) +#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ + +#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS +# define RB_FORCE_8BYTE_ALIGNMENT 0 +# define RB_ARCH_ALIGNMENT RB_ALIGNMENT +#else +# define RB_FORCE_8BYTE_ALIGNMENT 1 +# define RB_ARCH_ALIGNMENT 8U +#endif + +#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT) + +struct buffer_data_page { + u64 time_stamp; /* page time stamp */ + local_t commit; /* write committed index */ + unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */ +}; +#endif -- cgit v1.2.3 From 34e5b958bdad0f9cf16306368bbc2dc5b2a50143 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:59 +0000 Subject: tracing: Introduce simple_ring_buffer Add a simple implementation of the kernel ring-buffer. This intends to be used later by ring-buffer remotes such as the pKVM hypervisor, hence the need for a cut down version (write only) without any dependency. Link: https://patch.msgid.link/20260309162516.2623589-14-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/simple_ring_buffer.h | 57 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 include/linux/simple_ring_buffer.h (limited to 'include/linux') diff --git a/include/linux/simple_ring_buffer.h b/include/linux/simple_ring_buffer.h new file mode 100644 index 000000000000..2c4c0ae336bc --- /dev/null +++ b/include/linux/simple_ring_buffer.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SIMPLE_RING_BUFFER_H +#define _LINUX_SIMPLE_RING_BUFFER_H + +#include +#include +#include +#include + +/* + * Ideally those struct would stay private but the caller needs to know + * the allocation size for simple_ring_buffer_init(). + */ +struct simple_buffer_page { + struct list_head link; + struct buffer_data_page *page; + u64 entries; + u32 write; + u32 id; +}; + +struct simple_rb_per_cpu { + struct simple_buffer_page *tail_page; + struct simple_buffer_page *reader_page; + struct simple_buffer_page *head_page; + struct simple_buffer_page *bpages; + struct trace_buffer_meta *meta; + u32 nr_pages; + +#define SIMPLE_RB_UNAVAILABLE 0 +#define SIMPLE_RB_READY 1 +#define SIMPLE_RB_WRITING 2 + u32 status; + + u64 last_overrun; + u64 write_stamp; + + struct simple_rb_cbs *cbs; +}; + +int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_buffer_page *bpages, + const struct ring_buffer_desc *desc); + +void simple_ring_buffer_unload(struct simple_rb_per_cpu *cpu_buffer); + +void *simple_ring_buffer_reserve(struct simple_rb_per_cpu *cpu_buffer, unsigned long length, + u64 timestamp); + +void simple_ring_buffer_commit(struct simple_rb_per_cpu *cpu_buffer); + +int simple_ring_buffer_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable); + +int simple_ring_buffer_reset(struct simple_rb_per_cpu *cpu_buffer); + +int simple_ring_buffer_swap_reader_page(struct simple_rb_per_cpu *cpu_buffer); + +#endif -- cgit v1.2.3 From 635923081c792c830fb87e680d6dd5f348926b3f Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:25:03 +0000 Subject: tracing: load/unload page callbacks for simple_ring_buffer Add load/unload callback used for each admitted page in the ring-buffer. This will be later useful for the pKVM hypervisor which uses a different VA space and need to dynamically map/unmap the ring-buffer pages. Link: https://patch.msgid.link/20260309162516.2623589-18-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/simple_ring_buffer.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/simple_ring_buffer.h b/include/linux/simple_ring_buffer.h index 2c4c0ae336bc..21aec556293e 100644 --- a/include/linux/simple_ring_buffer.h +++ b/include/linux/simple_ring_buffer.h @@ -54,4 +54,12 @@ int simple_ring_buffer_reset(struct simple_rb_per_cpu *cpu_buffer); int simple_ring_buffer_swap_reader_page(struct simple_rb_per_cpu *cpu_buffer); +int simple_ring_buffer_init_mm(struct simple_rb_per_cpu *cpu_buffer, + struct simple_buffer_page *bpages, + const struct ring_buffer_desc *desc, + void *(*load_page)(unsigned long va), + void (*unload_page)(void *va)); + +void simple_ring_buffer_unload_mm(struct simple_rb_per_cpu *cpu_buffer, + void (*unload_page)(void *)); #endif -- cgit v1.2.3 From c116737e972ea74f4468a1bd0703d623a3c0ee4a Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Mon, 9 Mar 2026 14:15:28 +0100 Subject: workqueue: Add system_dfl_long_wq for long unbound works Currently there are users of queue_delayed_work() who specify system_long_wq, the per-cpu workqueue. This workqueue should be used for long per-cpu works, but queue_delayed_work() queue the work using: queue_delayed_work_on(WORK_CPU_UNBOUND, ...); This would end up calling __queue_delayed_work() that does: if (housekeeping_enabled(HK_TYPE_TIMER)) { // [....] } else { if (likely(cpu == WORK_CPU_UNBOUND)) add_timer_global(timer); else add_timer_on(timer, cpu); } So when cpu == WORK_CPU_UNBOUND the timer is global and is not using a specific CPU. Later, when __queue_work() is called: if (req_cpu == WORK_CPU_UNBOUND) { if (wq->flags & WQ_UNBOUND) cpu = wq_select_unbound_cpu(raw_smp_processor_id()); else cpu = raw_smp_processor_id(); } Because the wq is not unbound, it takes the CPU where the timer fired and enqueue the work on that CPU. The consequence of all of this is that the work can run anywhere, depending on where the timer fired. Introduce system_dfl_long_wq in order to change, in a future step, users that are still calling: queue_delayed_work(system_long_wq, ...); with the new system_dfl_long_wq instead, so that the work may benefit from scheduler task placement. Signed-off-by: Marco Crivellari Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index fc5744402a66..8e0855d56e74 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -440,6 +440,9 @@ enum wq_consts { * system_long_wq is similar to system_percpu_wq but may host long running * works. Queue flushing might take relatively long. * + * system_dfl_long_wq is similar to system_dfl_wq but it may host long running + * works. + * * system_dfl_wq is unbound workqueue. Workers are not bound to * any specific CPU, not concurrency managed, and all queued works are * executed immediately as long as max_active limit is not reached and @@ -468,6 +471,7 @@ extern struct workqueue_struct *system_power_efficient_wq; extern struct workqueue_struct *system_freezable_power_efficient_wq; extern struct workqueue_struct *system_bh_wq; extern struct workqueue_struct *system_bh_highpri_wq; +extern struct workqueue_struct *system_dfl_long_wq; void workqueue_softirq_action(bool highpri); void workqueue_softirq_dead(unsigned int cpu); @@ -783,6 +787,8 @@ extern void __warn_flushing_systemwide_wq(void) _wq == system_highpri_wq) || \ (__builtin_constant_p(_wq == system_long_wq) && \ _wq == system_long_wq) || \ + (__builtin_constant_p(_wq == system_dfl_long_wq) && \ + _wq == system_dfl_long_wq) || \ (__builtin_constant_p(_wq == system_dfl_wq) && \ _wq == system_dfl_wq) || \ (__builtin_constant_p(_wq == system_freezable_wq) && \ -- cgit v1.2.3 From 878004e2852bc22ce0687c5597d6fe3909fb59f3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 6 Mar 2026 14:10:32 -0800 Subject: iopoll: fix function parameter names in read_poll_timeout_atomic() Correct the function parameter names to avoid kernel-doc warnings and to emphasize this function is atomic (non-sleeping). Warning: include/linux/iopoll.h:169 function parameter 'sleep_us' not described in 'read_poll_timeout_atomic' Warning: ../include/linux/iopoll.h:169 function parameter 'sleep_before_read' not described in 'read_poll_timeout_atomic' Fixes: 9df8043a546d ("iopoll: Generalize read_poll_timeout() into poll_timeout_us()") Signed-off-by: Randy Dunlap Reviewed-by: Jani Nikula Link: https://patch.msgid.link/20260306221033.2357305-1-rdunlap@infradead.org Signed-off-by: Jani Nikula --- include/linux/iopoll.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iopoll.h b/include/linux/iopoll.h index bdd2e0652bc3..53edd69acb9b 100644 --- a/include/linux/iopoll.h +++ b/include/linux/iopoll.h @@ -159,7 +159,7 @@ * * This macro does not rely on timekeeping. Hence it is safe to call even when * timekeeping is suspended, at the expense of an underestimation of wall clock - * time, which is rather minimal with a non-zero delay_us. + * time, which is rather minimal with a non-zero @delay_us. * * When available, you'll probably want to use one of the specialized * macros defined below rather than this macro directly. @@ -167,9 +167,9 @@ * Returns: 0 on success and -ETIMEDOUT upon a timeout. In either * case, the last read value at @args is stored in @val. */ -#define read_poll_timeout_atomic(op, val, cond, sleep_us, timeout_us, \ - sleep_before_read, args...) \ - poll_timeout_us_atomic((val) = op(args), cond, sleep_us, timeout_us, sleep_before_read) +#define read_poll_timeout_atomic(op, val, cond, delay_us, timeout_us, \ + delay_before_read, args...) \ + poll_timeout_us_atomic((val) = op(args), cond, delay_us, timeout_us, delay_before_read) /** * readx_poll_timeout - Periodically poll an address until a condition is met or a timeout occurs -- cgit v1.2.3 From aca086ff27c3f67e81617e4b063d1126544a4f19 Mon Sep 17 00:00:00 2001 From: Ondrej Kozina Date: Fri, 6 Feb 2026 15:17:58 +0100 Subject: sed-opal: add IOC_OPAL_REACTIVATE_LSP. This adds the 'Reactivate' method as described in the "TCG Storage Opal SSC Feature Set: Single User Mode" document (ch. 3.1.1.1). The method enables switching an already active SED OPAL2 device, with appropriate firmware support for Single User Mode (SUM), to or from SUM. Signed-off-by: Ondrej Kozina Reviewed-and-tested-by: Milan Broz Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/sed-opal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 80f33a93f944..2ae5e6b0ac21 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -53,6 +53,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_DISCOVERY: case IOC_OPAL_REVERT_LSP: case IOC_OPAL_SET_SID_PW: + case IOC_OPAL_REACTIVATE_LSP: return true; } return false; -- cgit v1.2.3 From 8e3d34a7ce7386b01947dd649bd24775544e4d3e Mon Sep 17 00:00:00 2001 From: Ondrej Kozina Date: Fri, 6 Feb 2026 15:18:00 +0100 Subject: sed-opal: add IOC_OPAL_LR_SET_START_LEN ioctl. This ioctl is used to set up locking range start (offset) and locking range length attributes only. In Single User Mode (SUM), if the RangeStartRangeLengthPolicy parameter is set in the 'Reactivate' method, only Admin authority maintains the locking range length and start (offset) attributes of Locking objects set up for SUM. All other attributes from struct opal_user_lr_setup (RLE - read locking enabled, WLE - write locking enabled) shall remain in possession of the User authority associated with the Locking object set for SUM. Therefore, we need a separate function for setting up locking range start and locking range length because it may require two different authorities (and sessions) if the RangeStartRangeLengthPolicy attribute is set. With the IOC_OPAL_LR_SET_START_LEN ioctl, the opal_user_lr_setup members 'RLE' and 'WLE' of the ioctl argument are ignored. Signed-off-by: Ondrej Kozina Reviewed-and-tested-by: Milan Broz Signed-off-by: Jens Axboe --- include/linux/sed-opal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 2ae5e6b0ac21..a0df6819b0a9 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -54,6 +54,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_REVERT_LSP: case IOC_OPAL_SET_SID_PW: case IOC_OPAL_REACTIVATE_LSP: + case IOC_OPAL_LR_SET_START_LEN: return true; } return false; -- cgit v1.2.3 From a441a9d22433fea561de131e27fff41715c2d186 Mon Sep 17 00:00:00 2001 From: Ondrej Kozina Date: Fri, 6 Feb 2026 15:18:01 +0100 Subject: sed-opal: add IOC_OPAL_ENABLE_DISABLE_LR. This ioctl is used to set up RLE (read lock enabled) and WLE (write lock enabled) parameters of the Locking object. In Single User Mode (SUM), if the RangeStartRangeLengthPolicy parameter is set in the 'Reactivate' method, only Admin authority maintains the locking range length and start (offset) attributes of Locking objects set up for SUM. All other attributes from struct opal_user_lr_setup (RLE - read locking enabled, WLE - write locking enabled) shall remain in possession of the User authority associated with the Locking object set for SUM. With the IOC_OPAL_ENABLE_DISABLE_LR ioctl, the opal_user_lr_setup members 'range_start' and 'range_length' of the ioctl argument are ignored. Signed-off-by: Ondrej Kozina Reviewed-and-tested-by: Milan Broz Signed-off-by: Jens Axboe --- include/linux/sed-opal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index a0df6819b0a9..1d63479838cf 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -55,6 +55,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_SET_SID_PW: case IOC_OPAL_REACTIVATE_LSP: case IOC_OPAL_LR_SET_START_LEN: + case IOC_OPAL_ENABLE_DISABLE_LR: return true; } return false; -- cgit v1.2.3 From 0cc9293bccb234552b81c3ebc074f5839f019e01 Mon Sep 17 00:00:00 2001 From: Ondrej Kozina Date: Fri, 6 Feb 2026 15:18:03 +0100 Subject: sed-opal: add IOC_OPAL_GET_SUM_STATUS ioctl. This adds a function for retrieving the set of Locking objects enabled for Single User Mode (SUM) and the value of the RangeStartRangeLengthPolicy parameter. It retrieves data from the LockingInfo table, specifically the columns SingleUserModeRanges and RangeStartLengthPolicy, which were added according to the TCG Opal Feature Set: Single User Mode, as described in chapters 4.4.3.1 and 4.4.3.2. Signed-off-by: Ondrej Kozina Reviewed-and-tested-by: Milan Broz Signed-off-by: Jens Axboe --- include/linux/sed-opal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 1d63479838cf..aa006edb612b 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -56,6 +56,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_REACTIVATE_LSP: case IOC_OPAL_LR_SET_START_LEN: case IOC_OPAL_ENABLE_DISABLE_LR: + case IOC_OPAL_GET_SUM_STATUS: return true; } return false; -- cgit v1.2.3 From 0ee8ab5d4dc51704be1157470f3df8090629f9fc Mon Sep 17 00:00:00 2001 From: Bill Wendling Date: Wed, 25 Feb 2026 20:51:05 +0000 Subject: block: annotate struct request_queue with __counted_by_ptr The queue_hw_ctx field in struct request_queue is an array of pointers to struct blk_mq_hw_ctx. The number of elements in this array is tracked by the nr_hw_queues field. The array is allocated in __blk_mq_realloc_hw_ctxs() using kcalloc_node() with set->nr_hw_queues elements. q->nr_hw_queues is subsequently updated to set->nr_hw_queues. When growing the array, the new array is assigned to queue_hw_ctx before nr_hw_queues is updated. This is safe because nr_hw_queues (the old smaller count) is used for bounds checking, which is within the new larger allocation. When shrinking the array, nr_hw_queues is updated to the smaller value, while queue_hw_ctx retains the larger allocation. This is also safe as the count is within the allocation bounds. Annotating queue_hw_ctx with __counted_by_ptr(nr_hw_queues) allows the compiler (with kSAN) to verify that accesses to queue_hw_ctx are within the valid range defined by nr_hw_queues. This patch was generated by CodeMender and reviewed by Bill Wendling. Tested by running blktests. Reviewed-by: Daniel Wagner Signed-off-by: Bill Wendling [axboe: massage commit message] Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d463b9b5a0a5..540c2c6c9afd 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -502,7 +502,7 @@ struct request_queue { /* hw dispatch queues */ unsigned int nr_hw_queues; - struct blk_mq_hw_ctx * __rcu *queue_hw_ctx; + struct blk_mq_hw_ctx * __rcu *queue_hw_ctx __counted_by_ptr(nr_hw_queues); struct percpu_ref q_usage_counter; struct lock_class_key io_lock_cls_key; -- cgit v1.2.3 From b7cbc30e93e3a64ea058230f6d0c764d6d80276f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 27 Feb 2026 22:19:48 +0900 Subject: block: rename struct gendisk zone_wplugs_lock field Rename struct gendisk zone_wplugs_lock field to zone_wplugs_hash_lock to clearly indicates that this is the spinlock used for manipulating the hash table of zone write plugs. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 540c2c6c9afd..a49a1e38c6e7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -200,7 +200,7 @@ struct gendisk { u8 __rcu *zones_cond; unsigned int zone_wplugs_hash_bits; atomic_t nr_zone_wplugs; - spinlock_t zone_wplugs_lock; + spinlock_t zone_wplugs_hash_lock; struct mempool *zone_wplugs_pool; struct hlist_head *zone_wplugs_hash; struct workqueue_struct *zone_wplugs_wq; -- cgit v1.2.3 From 1365b6904fd050bf22ab9f3df375a396de5837a1 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 27 Feb 2026 22:19:49 +0900 Subject: block: allow submitting all zone writes from a single context In order to maintain sequential write patterns per zone with zoned block devices, zone write plugging issues only a single write BIO per zone at any time. This works well but has the side effect that when large sequential write streams are issued by the user and these streams cross zone boundaries, the device ends up receiving a discontiguous set of write commands for different zones. The same also happens when a user writes simultaneously at high queue depth multiple zones: the device does not see all sequential writes per zone and receives discontiguous writes to different zones. While this does not affect the performance of solid state zoned block devices, when using an SMR HDD, this pattern change from sequential writes to discontiguous writes to different zones significantly increases head seek which results in degraded write throughput. In order to reduce this seek overhead for rotational media devices, introduce a per disk zone write plugs kernel thread to issue all write BIOs to zones. This single zone write issuing context is enabled for any zoned block device that has a request queue flagged with the new QUEUE_ZONED_QD1_WRITES flag. The flag QUEUE_ZONED_QD1_WRITES is visible as the sysfs queue attribute zoned_qd1_writes for zoned devices. For regular block devices, this attribute is not visible. For zoned block devices, a user can override the default value set to force the global write maximum queue depth of 1 for a zoned block device, or clear this attribute to fallback to the default behavior of zone write plugging which limits writes to QD=1 per sequential zone. Writing to a zoned block device flagged with QUEUE_ZONED_QD1_WRITES is implemented using a list of zone write plugs that have a non-empty BIO list. Listed zone write plugs are processed by the disk zone write plugs worker kthread in FIFO order, and all BIOs of a zone write plug are all processed before switching to the next listed zone write plug. A newly submitted BIO for a non-FULL zone write plug that is not yet listed causes the addition of the zone write plug at the end of the disk list of zone write plugs. Since the write BIOs queued in a zone write plug BIO list are necessarilly sequential, for rotational media, using the single zone write plugs kthread to issue all BIOs maintains a sequential write pattern and thus reduces seek overhead and improves write throughput. This processing essentially result in always writing to HDDs at QD=1, which is not an issue for HDDs operating with write caching enabled. Performance with write cache disabled is also not degraded thanks to the efficient write handling of modern SMR HDDs. A disk list of zone write plugs is defined using the new struct gendisk zone_wplugs_list, and accesses to this list is protected using the zone_wplugs_list_lock spinlock. The per disk kthread (zone_wplugs_worker) code is implemented by the function disk_zone_wplugs_worker(). A reference on listed zone write plugs is always held until all BIOs of the zone write plug are processed by the worker kthread. BIO issuing at QD=1 is driven using a completion structure (zone_wplugs_worker_bio_done) and calls to blk_io_wait(). With this change, performance when sequentially writing the zones of a 30 TB SMR SATA HDD connected to an AHCI adapter changes as follows (1MiB direct I/Os, results in MB/s unit): +--------------------+ | Write BW (MB/s) | +------------------+----------+---------+ | Sequential write | Baseline | Patched | | Queue Depth | 6.19-rc8 | | +------------------+----------+---------+ | 1 | 244 | 245 | | 2 | 244 | 245 | | 4 | 245 | 245 | | 8 | 242 | 245 | | 16 | 222 | 246 | | 32 | 211 | 245 | | 64 | 193 | 244 | | 128 | 112 | 246 | +------------------+----------+---------+ With the current code (baseline), as the sequential write stream crosses a zone boundary, higher queue depth creates a gap between the last IO to the previous zone and the first IOs to the following zones, causing head seeks and degrading performance. Using the disk zone write plugs worker thread, this pattern disappears and the maximum throughput of the drive is maintained, leading to over 100% improvements in throughput for high queue depth write. Using 16 fio jobs all writing to randomly chosen zones at QD=32 with 1 MiB direct IOs, write throughput also increases significantly. +--------------------+ | Write BW (MB/s) | +------------------+----------+---------+ | Random write | Baseline | Patched | | Number of zones | 6.19-rc7 | | +------------------+----------+---------+ | 1 | 191 | 192 | | 2 | 101 | 128 | | 4 | 115 | 123 | | 8 | 90 | 120 | | 16 | 64 | 115 | | 32 | 58 | 105 | | 64 | 56 | 101 | | 128 | 55 | 99 | +------------------+----------+---------+ Tests using XFS shows that buffered write speed with 8 jobs writing files increases by 12% to 35% depending on the workload. +--------------------+ | Write BW (MB/s) | +------------------+----------+---------+ | Workload | Baseline | Patched | | | 6.19-rc7 | | +------------------+----------+---------+ | 256MiB file size | 212 | 238 | +------------------+----------+---------+ | 4MiB .. 128 MiB | 213 | 243 | | random file size | | | +------------------+----------+---------+ | 2MiB .. 8 MiB | 179 | 242 | | random file size | | | +------------------+----------+---------+ Performance gains are even more significant when using an HBA that limits the maximum size of commands to a small value, e.g. HBAs controlled with the mpi3mr driver limit commands to a maximum of 1 MiB. In such case, the write throughput gains are over 40%. +--------------------+ | Write BW (MB/s) | +------------------+----------+---------+ | Workload | Baseline | Patched | | | 6.19-rc7 | | +------------------+----------+---------+ | 256MiB file size | 175 | 245 | +------------------+----------+---------+ | 4MiB .. 128 MiB | 174 | 244 | | random file size | | | +------------------+----------+---------+ | 2MiB .. 8 MiB | 171 | 243 | | random file size | | | +------------------+----------+---------+ Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a49a1e38c6e7..ef6457487d23 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -204,6 +205,10 @@ struct gendisk { struct mempool *zone_wplugs_pool; struct hlist_head *zone_wplugs_hash; struct workqueue_struct *zone_wplugs_wq; + spinlock_t zone_wplugs_list_lock; + struct list_head zone_wplugs_list; + struct task_struct *zone_wplugs_worker; + struct completion zone_wplugs_worker_bio_done; #endif /* CONFIG_BLK_DEV_ZONED */ #if IS_ENABLED(CONFIG_CDROM) @@ -668,6 +673,7 @@ enum { QUEUE_FLAG_NO_ELV_SWITCH, /* can't switch elevator any more */ QUEUE_FLAG_QOS_ENABLED, /* qos is enabled */ QUEUE_FLAG_BIO_ISSUE_TIME, /* record bio->issue_time_ns */ + QUEUE_FLAG_ZONED_QD1_WRITES, /* Limit zoned devices writes to QD=1 */ QUEUE_FLAG_MAX }; @@ -707,6 +713,8 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); test_bit(QUEUE_FLAG_DISABLE_WBT_DEF, &(q)->queue_flags) #define blk_queue_no_elv_switch(q) \ test_bit(QUEUE_FLAG_NO_ELV_SWITCH, &(q)->queue_flags) +#define blk_queue_zoned_qd1_writes(q) \ + test_bit(QUEUE_FLAG_ZONED_QD1_WRITES, &(q)->queue_flags) extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); -- cgit v1.2.3 From ecd92cfec5349876d6a80f8188ea98c5920094b6 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 26 Feb 2026 16:54:48 +0900 Subject: block: remove bdev_nonrot() bdev_nonrot() is simply the negative return value of bdev_rot(). So replace all call sites of bdev_nonrot() with calls to bdev_rot() and remove bdev_nonrot(). Signed-off-by: Damien Le Moal Reviewed-by: Martin K. Petersen Reviewed-by: Paul Menzel Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ef6457487d23..8d93d8e356d8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1475,11 +1475,6 @@ static inline bool bdev_rot(struct block_device *bdev) return blk_queue_rot(bdev_get_queue(bdev)); } -static inline bool bdev_nonrot(struct block_device *bdev) -{ - return !bdev_rot(bdev); -} - static inline bool bdev_synchronous(struct block_device *bdev) { return bdev->bd_disk->queue->limits.features & BLK_FEAT_SYNCHRONOUS; -- cgit v1.2.3 From 588e7c048d7d2bfcbe7776ee0888ee248adf01d1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 2 Mar 2026 06:18:09 -0800 Subject: ext4, fscrypt: merge fscrypt_mergeable_bio_bh into io_submit_need_new_bio ext4 already has the inode and folio and can't have a NULL folio->mapping in this path. Open code fscrypt_mergeable_bio_bh in io_submit_need_new_bio based on these simplifying assumptions. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20260302141922.370070-5-hch@lst.de Signed-off-by: Eric Biggers --- include/linux/fscrypt.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 516aba5b858b..6af3c1907adc 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -876,9 +876,6 @@ void fscrypt_set_bio_crypt_ctx_bh(struct bio *bio, bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, u64 next_lblk); -bool fscrypt_mergeable_bio_bh(struct bio *bio, - const struct buffer_head *next_bh); - bool fscrypt_dio_supported(struct inode *inode); u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks); @@ -906,12 +903,6 @@ static inline bool fscrypt_mergeable_bio(struct bio *bio, return true; } -static inline bool fscrypt_mergeable_bio_bh(struct bio *bio, - const struct buffer_head *next_bh) -{ - return true; -} - static inline bool fscrypt_dio_supported(struct inode *inode) { return !fscrypt_needs_contents_encryption(inode); -- cgit v1.2.3 From a18b1ab81654b06e7ff402e5d0b85249e9504bcb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 2 Mar 2026 06:18:10 -0800 Subject: fscrypt: move fscrypt_set_bio_crypt_ctx_bh to buffer.c fscrypt_set_bio_crypt_ctx_bh is only used by submit_bh_wbc now. Move it there and merge bh_get_inode_and_lblk_num into it. Note that this does not add ifdefs for fscrypt as the compiler will optimize away the dead code if it is not built in. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20260302141922.370070-6-hch@lst.de Signed-off-by: Eric Biggers --- include/linux/fscrypt.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 6af3c1907adc..26561b7994e0 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -869,10 +869,6 @@ void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, u64 first_lblk, gfp_t gfp_mask); -void fscrypt_set_bio_crypt_ctx_bh(struct bio *bio, - const struct buffer_head *first_bh, - gfp_t gfp_mask); - bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, u64 next_lblk); @@ -891,11 +887,6 @@ static inline void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, u64 first_lblk, gfp_t gfp_mask) { } -static inline void fscrypt_set_bio_crypt_ctx_bh( - struct bio *bio, - const struct buffer_head *first_bh, - gfp_t gfp_mask) { } - static inline bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, u64 next_lblk) -- cgit v1.2.3 From 22be86a23c5956254b752e4e98f0ef2799565a41 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 2 Mar 2026 06:18:12 -0800 Subject: fscrypt: pass a byte offset to fscrypt_mergeable_bio Logical offsets into an inode are usually expressed as bytes in the VFS. Switch fscrypt_mergeable_bio to that convention. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20260302141922.370070-8-hch@lst.de Signed-off-by: Eric Biggers --- include/linux/fscrypt.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 26561b7994e0..98fb14660d40 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -870,7 +870,7 @@ void fscrypt_set_bio_crypt_ctx(struct bio *bio, gfp_t gfp_mask); bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, - u64 next_lblk); + loff_t pos); bool fscrypt_dio_supported(struct inode *inode); @@ -889,7 +889,7 @@ static inline void fscrypt_set_bio_crypt_ctx(struct bio *bio, static inline bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, - u64 next_lblk) + loff_t pos) { return true; } -- cgit v1.2.3 From 3c7eaa775d8e008135646bd4b7aa7db7c5e40a0e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 2 Mar 2026 06:18:13 -0800 Subject: fscrypt: pass a byte offset to fscrypt_set_bio_crypt_ctx Logical offsets into an inode are usually expressed as bytes in the VFS. Switch fscrypt_set_bio_crypt_ctx to that convention. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20260302141922.370070-9-hch@lst.de Signed-off-by: Eric Biggers --- include/linux/fscrypt.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 98fb14660d40..90f75fe0e1c9 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -865,9 +865,8 @@ static inline void fscrypt_set_ops(struct super_block *sb, bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode); -void fscrypt_set_bio_crypt_ctx(struct bio *bio, - const struct inode *inode, u64 first_lblk, - gfp_t gfp_mask); +void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, + loff_t pos, gfp_t gfp_mask); bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, loff_t pos); @@ -885,7 +884,7 @@ static inline bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode) static inline void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, - u64 first_lblk, gfp_t gfp_mask) { } + loff_t pos, gfp_t gfp_mask) { } static inline bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, -- cgit v1.2.3 From cd7db2e7dfeef99c901156f58ab4a38256b0c3f1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 2 Mar 2026 06:18:16 -0800 Subject: fscrypt: pass a byte offset to fscrypt_zeroout_range Logical offsets into an inode are usually expressed as bytes in the VFS. Switch fscrypt_zeroout_range to that convention. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20260302141922.370070-12-hch@lst.de Signed-off-by: Eric Biggers --- include/linux/fscrypt.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 90f75fe0e1c9..9fc15e1fbe57 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -450,7 +450,7 @@ u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name); /* bio.c */ bool fscrypt_decrypt_bio(struct bio *bio); -int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, +int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, sector_t pblk, unsigned int len); /* hooks.c */ @@ -755,7 +755,7 @@ static inline bool fscrypt_decrypt_bio(struct bio *bio) return true; } -static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, +static inline int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, sector_t pblk, unsigned int len) { return -EOPNOTSUPP; -- cgit v1.2.3 From fb87ab4ad3d0df2397648e5ce2384de26463c183 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 2 Mar 2026 06:18:17 -0800 Subject: fscrypt: pass a byte length to fscrypt_zeroout_range Range lengths are usually expressed as bytes in the VFS, switch fscrypt_zeroout_range to this convention. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20260302141922.370070-13-hch@lst.de Signed-off-by: Eric Biggers --- include/linux/fscrypt.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 9fc15e1fbe57..90ac62fda926 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -450,8 +450,8 @@ u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name); /* bio.c */ bool fscrypt_decrypt_bio(struct bio *bio); -int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, - sector_t pblk, unsigned int len); +int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, sector_t pblk, + u64 len); /* hooks.c */ int fscrypt_file_open(struct inode *inode, struct file *filp); @@ -756,7 +756,7 @@ static inline bool fscrypt_decrypt_bio(struct bio *bio) } static inline int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, - sector_t pblk, unsigned int len) + sector_t pblk, u64 len) { return -EOPNOTSUPP; } -- cgit v1.2.3 From 5ca1a1f017ea0f0e0bcb6ec52064735f2ac1c393 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 2 Mar 2026 06:18:18 -0800 Subject: fscrypt: pass a real sector_t to fscrypt_zeroout_range While the pblk argument to fscrypt_zeroout_range is declared as a sector_t, it actually is interpreted as a logical block size unit, which is highly unusual. Switch to passing the 512 byte units that sector_t is defined for. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20260302141922.370070-14-hch@lst.de Signed-off-by: Eric Biggers --- include/linux/fscrypt.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 90ac62fda926..54712ec61ffb 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -450,8 +450,8 @@ u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name); /* bio.c */ bool fscrypt_decrypt_bio(struct bio *bio); -int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, sector_t pblk, - u64 len); +int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, + sector_t sector, u64 len); /* hooks.c */ int fscrypt_file_open(struct inode *inode, struct file *filp); @@ -756,7 +756,7 @@ static inline bool fscrypt_decrypt_bio(struct bio *bio) } static inline int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, - sector_t pblk, u64 len) + sector_t sector, u64 len) { return -EOPNOTSUPP; } -- cgit v1.2.3 From 102c8b26b54e363f85c4c86099ca049a0a76bb58 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 17 Feb 2026 08:08:35 -0800 Subject: PCI: Allow all bus devices to use the same slot A PCIe hotplug slot applies to the entire secondary bus. Thus, pciehp only allocates a single hotplug_slot for the bridge to that bus. The existing PCI slot, though, would only match to functions on device 0, meaning any devices beyond that, e.g., ARI functions, are not matched to any slot even though they share it. A slot reset will break all the missing devices because the handling skips them. For example, ARI devices with more than 8 functions fail because their state is not properly handled, nor is the attached driver notified of the reset. In the best case, the device will appear unresponsive to the driver, resulting in unexpected errors. A worse possibility may panic the kernel if in-flight transactions trigger hardware reported errors like this real observation: vfio-pci 0000:01:00.0: resetting vfio-pci 0000:01:00.0: reset done {1}[Hardware Error]: Error 1, type: fatal {1}[Hardware Error]: section_type: PCIe error {1}[Hardware Error]: port_type: 0, PCIe end point {1}[Hardware Error]: version: 0.2 {1}[Hardware Error]: command: 0x0140, status: 0x0010 {1}[Hardware Error]: device_id: 0000:01:01.0 {1}[Hardware Error]: slot: 0 {1}[Hardware Error]: secondary_bus: 0x00 {1}[Hardware Error]: vendor_id: 0x1d9b, device_id: 0x0207 {1}[Hardware Error]: class_code: 020000 {1}[Hardware Error]: bridge: secondary_status: 0x0000, control: 0x0000 {1}[Hardware Error]: aer_cor_status: 0x00008000, aer_cor_mask: 0x00002000 {1}[Hardware Error]: aer_uncor_status: 0x00010000, aer_uncor_mask: 0x00100000 {1}[Hardware Error]: aer_uncor_severity: 0x006f6030 {1}[Hardware Error]: TLP Header: 0a412800 00192080 60000004 00000004 GHES: Fatal hardware error but panic disabled Kernel panic - not syncing: GHES: Fatal hardware error Allow a slot to be created to claim all devices on a bus, not just a matching device. This is done by introducing a sentinel value, named PCI_SLOT_ALL_DEVICES, which then has the PCI slot match to any device on the bus. This fixes slot resets for pciehp. Since 0xff already has special meaning, the chosen value for this new feature is 0xfe. This will not clash with any actual slot number since they are limited to 5 bits. Signed-off-by: Keith Busch Signed-off-by: Bjorn Helgaas Reviewed-by: Dan Williams Link: https://patch.msgid.link/20260217160836.2709885-3-kbusch@meta.com --- include/linux/pci.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 1c270f1d5123..5ae2dfdb2d6f 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -72,12 +72,20 @@ /* return bus from PCI devid = ((u16)bus_number) << 8) | devfn */ #define PCI_BUS_NUM(x) (((x) >> 8) & 0xff) +/* + * PCI_SLOT_ALL_DEVICES indicates a slot that covers all devices on the bus. + * Used for PCIe hotplug where the physical slot is the entire secondary bus, + * and, if ARI Forwarding is enabled, functions may appear to be on multiple + * devices. + */ +#define PCI_SLOT_ALL_DEVICES 0xfe + /* pci_slot represents a physical slot */ struct pci_slot { struct pci_bus *bus; /* Bus this slot is on */ struct list_head list; /* Node in list of slots */ struct hotplug_slot *hotplug; /* Hotplug info (move here) */ - unsigned char number; /* PCI_SLOT(pci_dev->devfn) */ + unsigned char number; /* Device nr, or PCI_SLOT_ALL_DEVICES */ struct kobject kobj; }; -- cgit v1.2.3 From abb0eb0b033a0a8980eb9215e02626e4801ead3f Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Fri, 6 Mar 2026 17:36:49 +0800 Subject: ppp: simplify input error handling Currently, ppp_input_error() indicates an error by allocating a 0-length skb and calling ppp_do_recv(). It takes an error code argument, which is stored in skb->cb, but not used by ppp_receive_frame(). Simplify the error handling by removing the unused parameter and the unnecessary skb allocation. Instead, call ppp_receive_error() directly from ppp_input_error() under the recv lock, and the length check in ppp_receive_frame() can be removed. Signed-off-by: Qingfang Deng Signed-off-by: Jakub Kicinski --- include/linux/ppp_channel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ppp_channel.h b/include/linux/ppp_channel.h index f73fbea0dbc2..ca8ad03eeef0 100644 --- a/include/linux/ppp_channel.h +++ b/include/linux/ppp_channel.h @@ -55,7 +55,7 @@ extern void ppp_input(struct ppp_channel *, struct sk_buff *); /* Called by the channel when an input error occurs, indicating that we may have missed a packet. */ -extern void ppp_input_error(struct ppp_channel *, int code); +extern void ppp_input_error(struct ppp_channel *); /* Attach a channel to a given PPP unit in specified net. */ extern int ppp_register_net_channel(struct net *, struct ppp_channel *); -- cgit v1.2.3 From 4b78c9cbd8f1fbb9517aee48b372646f4cf05442 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 8 Mar 2026 12:23:02 +0000 Subject: tcp: move tp->chrono_type next tp->chrono_stat[] chrono_type is currently in tcp_sock_read_txrx group, which is supposed to hold read-mostly fields. But chrono_type is mostly written in tx path, it should be moved to tcp_sock_write_tx group, close to other chrono fields (chrono_stat[], chrono_start). Note this adds holes, but data locality is far more important. Use a full u8 for the time being, compiler can generate more efficient code. Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20260308122302.2895067-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index f72eef31fa23..c44cf9ae8d16 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -228,8 +228,7 @@ struct tcp_sock { u32 sacked_out; /* SACK'd packets */ u16 tcp_header_len; /* Bytes of tcp header to send */ u8 scaling_ratio; /* see tcp_win_from_space() */ - u8 chrono_type : 2, /* current chronograph type */ - repair : 1, + u8 repair : 1, tcp_usec_ts : 1, /* TSval values in usec */ is_sack_reneg:1, /* in recovery from loss with SACK reneg? */ is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */ @@ -264,6 +263,7 @@ struct tcp_sock { * total number of data bytes sent. */ u32 snd_sml; /* Last byte of the most recently transmitted small packet */ + u8 chrono_type; /* current chronograph type */ u32 chrono_start; /* Start time in jiffies of a TCP chrono */ u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ -- cgit v1.2.3 From 4d25c7d68896b4002c4ab5cd646775392bb7fbb4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Feb 2026 05:20:09 -0800 Subject: iomap: pass the iomap_iter to ->submit_read This provides additional context for file systems. Rename the fuse instance to match the method name while we're at it. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260223132021.292832-10-hch@lst.de Tested-by: Anuj Gupta Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 99b7209dabd7..6fbe121e2adf 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -512,7 +512,8 @@ struct iomap_read_ops { * * This is optional. */ - void (*submit_read)(struct iomap_read_folio_ctx *ctx); + void (*submit_read)(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx); }; /* -- cgit v1.2.3 From 5f4fe046cb3c84eed719f7becbe822000e1a589e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Feb 2026 05:20:11 -0800 Subject: iomap: allow file systems to hook into buffered read bio submission File systems such as btrfs have additional operations with bios such as verifying data checksums. Allow file systems to hook into submission of the bio to allow for this processing by replacing the direct submit_bio call in iomap_read_alloc_bio with a call into ->submit_read and exporting iomap_read_alloc_bio. Also add a new field to struct iomap_read_folio_ctx to track the file logic offset of the current read context. Based on a patch from Goldwyn Rodrigues . Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260223132021.292832-12-hch@lst.de Tested-by: Anuj Gupta Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 6fbe121e2adf..b2b9e649a3b8 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -493,6 +493,7 @@ struct iomap_read_folio_ctx { struct folio *cur_folio; struct readahead_control *rac; void *read_ctx; + loff_t read_ctx_file_offset; }; struct iomap_read_ops { @@ -599,6 +600,9 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, extern struct bio_set iomap_ioend_bioset; #ifdef CONFIG_BLOCK +int iomap_bio_read_folio_range(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx, size_t plen); + extern const struct iomap_read_ops iomap_bio_read_ops; static inline void iomap_bio_read_folio(struct folio *folio, -- cgit v1.2.3 From 57287771fa8d77841149bf847b629f29acbad35b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Feb 2026 05:20:13 -0800 Subject: iomap: add a bioset pointer to iomap_read_folio_ops Optionally allocate the bio from the bioset provided in iomap_read_folio_ops. If no bioset is provided, fs_bio_set is still used, which is the standard bioset for file systems. Based on a patch from Goldwyn Rodrigues . Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260223132021.292832-14-hch@lst.de Tested-by: Anuj Gupta Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index b2b9e649a3b8..387a1174522f 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -515,6 +515,12 @@ struct iomap_read_ops { */ void (*submit_read)(const struct iomap_iter *iter, struct iomap_read_folio_ctx *ctx); + + /* + * Optional, allows filesystem to specify own bio_set, so new bio's + * can be allocated from the provided bio_set. + */ + struct bio_set *bio_set; }; /* -- cgit v1.2.3 From 0b10a370529cbd7b918c1eef43d409e43d9e0b78 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Feb 2026 05:20:15 -0800 Subject: iomap: support T10 protection information Add support for generating / verifying protection information in iomap. This is done by hooking into the bio submission and then using the generic PI helpers. Compared to just using the block layer auto PI this extends the protection envelope and also prepares for eventually passing through PI from userspace at least for direct I/O. To generate or verify PI, the file system needs to set the IOMAP_F_INTEGRITY flag on the iomap for the request, and ensure the ioends are used for all integrity I/O. Additionally the file system must defer read I/O completions to user context so that the guard tag validation isn't run from interrupt context. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260223132021.292832-16-hch@lst.de Tested-by: Anuj Gupta Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 387a1174522f..531f9ebdeeae 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -65,6 +65,8 @@ struct vm_fault; * * IOMAP_F_ATOMIC_BIO indicates that (write) I/O will be issued as an atomic * bio, i.e. set REQ_ATOMIC. + * + * IOMAP_F_INTEGRITY indicates that the filesystems handles integrity metadata. */ #define IOMAP_F_NEW (1U << 0) #define IOMAP_F_DIRTY (1U << 1) @@ -79,6 +81,11 @@ struct vm_fault; #define IOMAP_F_BOUNDARY (1U << 6) #define IOMAP_F_ANON_WRITE (1U << 7) #define IOMAP_F_ATOMIC_BIO (1U << 8) +#ifdef CONFIG_BLK_DEV_INTEGRITY +#define IOMAP_F_INTEGRITY (1U << 9) +#else +#define IOMAP_F_INTEGRITY 0 +#endif /* CONFIG_BLK_DEV_INTEGRITY */ /* * Flag reserved for file system specific usage -- cgit v1.2.3 From 6f1a9140ecda3baba3d945b9a6155af4268aafc4 Mon Sep 17 00:00:00 2001 From: Weiming Shi Date: Sat, 7 Mar 2026 00:01:34 +0800 Subject: net: add xmit recursion limit to tunnel xmit functions Tunnel xmit functions (iptunnel_xmit, ip6tunnel_xmit) lack their own recursion limit. When a bond device in broadcast mode has GRE tap interfaces as slaves, and those GRE tunnels route back through the bond, multicast/broadcast traffic triggers infinite recursion between bond_xmit_broadcast() and ip_tunnel_xmit()/ip6_tnl_xmit(), causing kernel stack overflow. The existing XMIT_RECURSION_LIMIT (8) in the no-qdisc path is not sufficient because tunnel recursion involves route lookups and full IP output, consuming much more stack per level. Use a lower limit of 4 (IP_TUNNEL_RECURSION_LIMIT) to prevent overflow. Add recursion detection using dev_xmit_recursion helpers directly in iptunnel_xmit() and ip6tunnel_xmit() to cover all IPv4/IPv6 tunnel paths including UDP encapsulated tunnels (VXLAN, Geneve, etc.). Move dev_xmit_recursion helpers from net/core/dev.h to public header include/linux/netdevice.h so they can be used by tunnel code. BUG: KASAN: stack-out-of-bounds in blake2s.constprop.0+0xe7/0x160 Write of size 32 at addr ffff88810033fed0 by task kworker/0:1/11 Workqueue: mld mld_ifc_work Call Trace: __build_flow_key.constprop.0 (net/ipv4/route.c:515) ip_rt_update_pmtu (net/ipv4/route.c:1073) iptunnel_xmit (net/ipv4/ip_tunnel_core.c:84) ip_tunnel_xmit (net/ipv4/ip_tunnel.c:847) gre_tap_xmit (net/ipv4/ip_gre.c:779) dev_hard_start_xmit (net/core/dev.c:3887) sch_direct_xmit (net/sched/sch_generic.c:347) __dev_queue_xmit (net/core/dev.c:4802) bond_dev_queue_xmit (drivers/net/bonding/bond_main.c:312) bond_xmit_broadcast (drivers/net/bonding/bond_main.c:5279) bond_start_xmit (drivers/net/bonding/bond_main.c:5530) dev_hard_start_xmit (net/core/dev.c:3887) __dev_queue_xmit (net/core/dev.c:4841) ip_finish_output2 (net/ipv4/ip_output.c:237) ip_output (net/ipv4/ip_output.c:438) iptunnel_xmit (net/ipv4/ip_tunnel_core.c:86) gre_tap_xmit (net/ipv4/ip_gre.c:779) dev_hard_start_xmit (net/core/dev.c:3887) sch_direct_xmit (net/sched/sch_generic.c:347) __dev_queue_xmit (net/core/dev.c:4802) bond_dev_queue_xmit (drivers/net/bonding/bond_main.c:312) bond_xmit_broadcast (drivers/net/bonding/bond_main.c:5279) bond_start_xmit (drivers/net/bonding/bond_main.c:5530) dev_hard_start_xmit (net/core/dev.c:3887) __dev_queue_xmit (net/core/dev.c:4841) ip_finish_output2 (net/ipv4/ip_output.c:237) ip_output (net/ipv4/ip_output.c:438) iptunnel_xmit (net/ipv4/ip_tunnel_core.c:86) ip_tunnel_xmit (net/ipv4/ip_tunnel.c:847) gre_tap_xmit (net/ipv4/ip_gre.c:779) dev_hard_start_xmit (net/core/dev.c:3887) sch_direct_xmit (net/sched/sch_generic.c:347) __dev_queue_xmit (net/core/dev.c:4802) bond_dev_queue_xmit (drivers/net/bonding/bond_main.c:312) bond_xmit_broadcast (drivers/net/bonding/bond_main.c:5279) bond_start_xmit (drivers/net/bonding/bond_main.c:5530) dev_hard_start_xmit (net/core/dev.c:3887) __dev_queue_xmit (net/core/dev.c:4841) mld_sendpack mld_ifc_work process_one_work worker_thread Fixes: 745e20f1b626 ("net: add a recursion limit in xmit path") Reported-by: Xiang Mei Signed-off-by: Weiming Shi Link: https://patch.msgid.link/20260306160133.3852900-2-bestswngs@gmail.com Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 67e25f6d15a4..ae269a2e7f4d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3576,17 +3576,49 @@ struct page_pool_bh { }; DECLARE_PER_CPU(struct page_pool_bh, system_page_pool); +#define XMIT_RECURSION_LIMIT 8 + #ifndef CONFIG_PREEMPT_RT static inline int dev_recursion_level(void) { return this_cpu_read(softnet_data.xmit.recursion); } + +static inline bool dev_xmit_recursion(void) +{ + return unlikely(__this_cpu_read(softnet_data.xmit.recursion) > + XMIT_RECURSION_LIMIT); +} + +static inline void dev_xmit_recursion_inc(void) +{ + __this_cpu_inc(softnet_data.xmit.recursion); +} + +static inline void dev_xmit_recursion_dec(void) +{ + __this_cpu_dec(softnet_data.xmit.recursion); +} #else static inline int dev_recursion_level(void) { return current->net_xmit.recursion; } +static inline bool dev_xmit_recursion(void) +{ + return unlikely(current->net_xmit.recursion > XMIT_RECURSION_LIMIT); +} + +static inline void dev_xmit_recursion_inc(void) +{ + current->net_xmit.recursion++; +} + +static inline void dev_xmit_recursion_dec(void) +{ + current->net_xmit.recursion--; +} #endif void __netif_schedule(struct Qdisc *q); -- cgit v1.2.3 From fa655a9ca73f7df32b8ca4d14ce11742f9578288 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 3 Mar 2026 22:31:01 +0100 Subject: nvme: Annotate struct nvme_dhchap_key with __counted_by Add the __counted_by() compiler attribute to the flexible array member 'key' to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and CONFIG_FORTIFY_SOURCE. Reviewed-by: Christoph Hellwig Signed-off-by: Thorsten Blum Signed-off-by: Keith Busch --- include/linux/nvme-auth.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h index 60e069a6757f..e75c29c51464 100644 --- a/include/linux/nvme-auth.h +++ b/include/linux/nvme-auth.h @@ -11,7 +11,7 @@ struct nvme_dhchap_key { size_t len; u8 hash; - u8 key[]; + u8 key[] __counted_by(len); }; u32 nvme_auth_get_seqnum(void); -- cgit v1.2.3 From 22fd7f7fed2ae3702f90d1985c326354e86b9c75 Mon Sep 17 00:00:00 2001 From: Muhammad Amirul Asyraf Mohamad Jamian Date: Thu, 5 Mar 2026 01:31:51 -0800 Subject: firmware: stratix10-svc: Add Multi SVC clients support In the current implementation, SVC client drivers such as socfpga-hwmon, intel_fcs, stratix10-soc, stratix10-rsu each send an SMC command that triggers a single thread in the stratix10-svc driver. Upon receiving a callback, the initiating client driver sends a stratix10-svc-done signal, terminating the thread without waiting for other pending SMC commands to complete. This leads to a timeout issue in the firmware SVC mailbox service when multiple client drivers send SMC commands concurrently. To resolve this issue, a dedicated thread is now created per channel. The stratix10-svc driver will support up to the number of channels defined by SVC_NUM_CHANNEL. Thread synchronization is handled using a mutex to prevent simultaneous issuance of SMC commands by multiple threads. SVC_NUM_DATA_IN_FIFO is reduced from 32 to 8, since each channel now has its own dedicated FIFO and the SDM processes commands one at a time. 8 entries per channel is sufficient while keeping the total aggregate capacity the same (4 channels x 8 = 32 entries). Additionally, a thread task is now validated before invoking kthread_stop when the user aborts, ensuring safe termination. Timeout values have also been adjusted to accommodate the increased load from concurrent client driver activity. Fixes: 7ca5ce896524 ("firmware: add Intel Stratix10 service layer driver") Cc: stable@vger.kernel.org Signed-off-by: Ang Tien Sung Signed-off-by: Fong, Yan Kei Signed-off-by: Muhammad Amirul Asyraf Mohamad Jamian Link: https://lore.kernel.org/all/20260305093151.2678-1-muhammad.amirul.asyraf.mohamad.jamian@altera.com Signed-off-by: Dinh Nguyen --- include/linux/firmware/intel/stratix10-svc-client.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firmware/intel/stratix10-svc-client.h b/include/linux/firmware/intel/stratix10-svc-client.h index d290060f4c73..91013161e9db 100644 --- a/include/linux/firmware/intel/stratix10-svc-client.h +++ b/include/linux/firmware/intel/stratix10-svc-client.h @@ -68,12 +68,12 @@ * timeout value used in Stratix10 FPGA manager driver. * timeout value used in RSU driver */ -#define SVC_RECONFIG_REQUEST_TIMEOUT_MS 300 -#define SVC_RECONFIG_BUFFER_TIMEOUT_MS 720 -#define SVC_RSU_REQUEST_TIMEOUT_MS 300 +#define SVC_RECONFIG_REQUEST_TIMEOUT_MS 5000 +#define SVC_RECONFIG_BUFFER_TIMEOUT_MS 5000 +#define SVC_RSU_REQUEST_TIMEOUT_MS 2000 #define SVC_FCS_REQUEST_TIMEOUT_MS 2000 #define SVC_COMPLETED_TIMEOUT_MS 30000 -#define SVC_HWMON_REQUEST_TIMEOUT_MS 300 +#define SVC_HWMON_REQUEST_TIMEOUT_MS 2000 struct stratix10_svc_chan; -- cgit v1.2.3 From 1b891f4c852817b3afd231712ab7e171932e1eb1 Mon Sep 17 00:00:00 2001 From: "Derek J. Clark" Date: Tue, 10 Mar 2026 07:29:19 +0000 Subject: include: device.h: Add named device attributes Adds DEVICE_ATTR_[RW|RO|WO]_NAMED macros for adding attributes that reuse the same sysfs name in a driver under separate subdirectories. When dealing with some devices it can be useful to be able to reuse the same name for similar attributes under a different subdirectory. For example, a single logical HID endpoint may provide a configuration interface for multiple physical devices. In such a case it is useful to provide symmetrical attribute names under different subdirectories on the configuration device. The Lenovo Legion Go is one such device, providing configuration to a detachable left controller, detachable right controller, the wireless transmission dongle, and the MCU. It is therefore beneficial to treat each of these as individual devices in the driver, providing a subdirectory for each physical device in the sysfs. As some attributes are reused by each physical device, it provides a much cleaner interface if the same driver can reuse the same attribute name in sysfs while uniquely distinguishing the store/show functions in the driver, rather than repeat string portions. Example new WO attrs: ATTRS{left_handle/reset}=="(not readable)" ATTRS{right_handle/reset}=="(not readable)" ATTRS{tx_dongle/reset}=="(not readable)" vs old WO attrs in a subdir: ATTRS{left_handle/left_handle_reset}=="(not readable)" ATTRS{right_handle/right_handle_reset}=="(not readable)" ATTRS{tx_dongle/tx_dongle_reset}=="(not readable)" or old WO attrs with no subdir: ATTRS{left_handle_reset}=="(not readable)" ATTRS{right_handle_reset}=="(not readable)" ATTRS{tx_dongle_reset}=="(not readable)" While the third option is usable, it doesn't logically break up the physical devices and creates a device directory with over 80 attributes once all attrs are defined. Reviewed-by: Mark Pearson Signed-off-by: Derek J. Clark Acked-by: Greg Kroah-Hartman Signed-off-by: Jiri Kosina --- include/linux/device.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 0be95294b6e6..381463baed6d 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -189,6 +189,22 @@ ssize_t device_show_string(struct device *dev, struct device_attribute *attr, #define DEVICE_ATTR_ADMIN_RW(_name) \ struct device_attribute dev_attr_##_name = __ATTR_RW_MODE(_name, 0600) +/** + * DEVICE_ATTR_RW_NAMED - Define a read-write device attribute with a sysfs name + * that differs from the function name. + * @_name: Attribute function preface + * @_attrname: Attribute name as it wil be exposed in the sysfs. + * + * Like DEVICE_ATTR_RW(), but allows for reusing names under separate paths in + * the same driver. + */ +#define DEVICE_ATTR_RW_NAMED(_name, _attrname) \ + struct device_attribute dev_attr_##_name = { \ + .attr = { .name = _attrname, .mode = 0644 }, \ + .show = _name##_show, \ + .store = _name##_store, \ + } + /** * DEVICE_ATTR_RO - Define a readable device attribute. * @_name: Attribute name. @@ -207,6 +223,21 @@ ssize_t device_show_string(struct device *dev, struct device_attribute *attr, #define DEVICE_ATTR_ADMIN_RO(_name) \ struct device_attribute dev_attr_##_name = __ATTR_RO_MODE(_name, 0400) +/** + * DEVICE_ATTR_RO_NAMED - Define a read-only device attribute with a sysfs name + * that differs from the function name. + * @_name: Attribute function preface + * @_attrname: Attribute name as it wil be exposed in the sysfs. + * + * Like DEVICE_ATTR_RO(), but allows for reusing names under separate paths in + * the same driver. + */ +#define DEVICE_ATTR_RO_NAMED(_name, _attrname) \ + struct device_attribute dev_attr_##_name = { \ + .attr = { .name = _attrname, .mode = 0444 }, \ + .show = _name##_show, \ + } + /** * DEVICE_ATTR_WO - Define an admin-only writable device attribute. * @_name: Attribute name. @@ -216,6 +247,21 @@ ssize_t device_show_string(struct device *dev, struct device_attribute *attr, #define DEVICE_ATTR_WO(_name) \ struct device_attribute dev_attr_##_name = __ATTR_WO(_name) +/** + * DEVICE_ATTR_WO_NAMED - Define a read-only device attribute with a sysfs name + * that differs from the function name. + * @_name: Attribute function preface + * @_attrname: Attribute name as it wil be exposed in the sysfs. + * + * Like DEVICE_ATTR_WO(), but allows for reusing names under separate paths in + * the same driver. + */ +#define DEVICE_ATTR_WO_NAMED(_name, _attrname) \ + struct device_attribute dev_attr_##_name = { \ + .attr = { .name = _attrname, .mode = 0200 }, \ + .store = _name##_store, \ + } + /** * DEVICE_ULONG_ATTR - Define a device attribute backed by an unsigned long. * @_name: Attribute name. -- cgit v1.2.3 From 6ca9029c823b7853e980585e757343e0e84227cd Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 10 Mar 2026 07:29:27 +0000 Subject: HID: Include firmware version in the uevent Userspace software fwupd probes some HID devices when the daemon starts up to determine the current firmware version in order to be able to offer updated firmware if the manufacturer has made it available. In order to do this fwupd will detach the existing kernel driver if one is present, send a HID command and then reattach the kernel driver. This can be problematic if the user is using the HID device at the time that fwupd probes the hardware and can cause a few frames of input to be dropped. In some cases HID drivers already have a command to look up the firmware version, and so if that is exported to userspace fwupd can discover it and avoid needing to detach the kernel driver until it's time to update the device. Introduce a new member in the struct hid_device for the version and export a new uevent variable HID_FIRMWARE_VERSION that will display the version that HID drivers obtained. Reviewed-by: Derek J. Clark Reviewed-by: Mark Pearson Cc: Richard Hughes Signed-off-by: Mario Limonciello Signed-off-by: Jiri Kosina --- include/linux/hid.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index 2990b9f94cb5..b0b70c05049d 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -698,6 +698,7 @@ struct hid_device { char name[128]; /* Device name */ char phys[64]; /* Device physical location */ char uniq[64]; /* Device unique identifier (serial #) */ + u64 firmware_version; /* Firmware version */ void *driver_data; -- cgit v1.2.3 From 1dfc9d60a69ec148e1cb709256617d86e5f0e8f8 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 5 Mar 2026 22:45:40 +0100 Subject: workqueue: devres: Add device-managed allocate workqueue Add a Resource-managed version of alloc_workqueue() to fix common problem of drivers mixing devm() calls with destroy_workqueue. Such naive and discouraged driver approach leads to difficult to debug bugs when the driver: 1. Allocates workqueue in standard way and destroys it in driver remove() callback, 2. Sets work struct with devm_work_autocancel(), 3. Registers interrupt handler with devm_request_threaded_irq(). Which leads to following unbind/removal path: 1. destroy_workqueue() via driver remove(), Any interrupt coming now would still execute the interrupt handler, which queues work on destroyed workqueue. 2. devm_irq_release(), 3. devm_work_drop() -> cancel_work_sync() on destroyed workqueue. devm_alloc_workqueue() has two benefits: 1. Solves above problem of mix-and-match devres and non-devres code in driver, 2. Simplify any sane drivers which were correctly using alloc_workqueue() + devm_add_action_or_reset(). Signed-off-by: Krzysztof Kozlowski Acked-by: Tejun Heo Reviewed-by: Andy Shevchenko Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index a4749f56398f..f8d235aef10d 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -512,6 +512,26 @@ __printf(1, 4) struct workqueue_struct * alloc_workqueue_noprof(const char *fmt, unsigned int flags, int max_active, ...); #define alloc_workqueue(...) alloc_hooks(alloc_workqueue_noprof(__VA_ARGS__)) +/** + * devm_alloc_workqueue - Resource-managed allocate a workqueue + * @dev: Device to allocate workqueue for + * @fmt: printf format for the name of the workqueue + * @flags: WQ_* flags + * @max_active: max in-flight work items, 0 for default + * @...: args for @fmt + * + * Resource managed workqueue, see alloc_workqueue() for details. + * + * The workqueue will be automatically destroyed on driver detach. Typically + * this should be used in drivers already relying on devm interafaces. + * + * RETURNS: + * Pointer to the allocated workqueue on success, %NULL on failure. + */ +__printf(2, 5) struct workqueue_struct * +devm_alloc_workqueue(struct device *dev, const char *fmt, unsigned int flags, + int max_active, ...); + #ifdef CONFIG_LOCKDEP /** * alloc_workqueue_lockdep_map - allocate a workqueue with user-defined lockdep_map @@ -568,6 +588,8 @@ alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active, */ #define alloc_ordered_workqueue(fmt, flags, args...) \ alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) +#define devm_alloc_ordered_workqueue(dev, fmt, flags, args...) \ + devm_alloc_workqueue(dev, fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) #define create_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM | WQ_PERCPU, 1, (name)) -- cgit v1.2.3 From 5a8103a6fb0ae9cf99c0271b17474468d6bae2b2 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 4 Mar 2026 18:21:57 +0100 Subject: genirq: Document interaction between and DT binding defines Document that the DT binding definitions in shadow the first six IRQ_TYPE_* definitions in . The values must be the same anyway, so this is harmless (as long as the latter is included first when both are included), but it is good to document this explicitly. Signed-off-by: Geert Uytterhoeven Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/fbcc65dcee6c5437fab5ef18d21766bb4effb7cb.1772644406.git.geert+renesas@glider.be --- include/linux/irq.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 951acbdb9f84..efa514ee562f 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -35,6 +35,10 @@ enum irqchip_irq_state; * * Bits 0-7 are the same as the IRQF_* bits in linux/interrupt.h * + * Note that the first 6 definitions are shadowed by C preprocessor definitions + * in include/dt-bindings/interrupt-controller/irq.h. This is not an issue, as + * the actual values must be the same, due to being part of the stable DT ABI. + * * IRQ_TYPE_NONE - default, unspecified type * IRQ_TYPE_EDGE_RISING - rising edge triggered * IRQ_TYPE_EDGE_FALLING - falling edge triggered -- cgit v1.2.3 From 360160f75592bdc85edba8fe78fb20d90924c7e8 Mon Sep 17 00:00:00 2001 From: Ricardo Robaina Date: Mon, 9 Mar 2026 10:05:33 -0300 Subject: audit: handle unknown status requests in audit_receive_msg() Currently, audit_receive_msg() ignores unknown status bits in AUDIT_SET requests, incorrectly returning success to newer user space tools querying unsupported features. This breaks forward compatibility. Fix this by defining AUDIT_STATUS_ALL and returning -EINVAL if any unrecognized bits are set (s.mask & ~AUDIT_STATUS_ALL). This ensures invalid requests are safely rejected, allowing user space to reliably test for and gracefully handle feature detection on older kernels. Suggested-by: Steve Grubb Signed-off-by: Ricardo Robaina [PM: subject line tweak] Signed-off-by: Paul Moore --- include/linux/audit.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index b642b5faca65..d79218bf075a 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -15,6 +15,15 @@ #include #include +#define AUDIT_STATUS_ALL (AUDIT_STATUS_ENABLED | \ + AUDIT_STATUS_FAILURE | \ + AUDIT_STATUS_PID | \ + AUDIT_STATUS_RATE_LIMIT | \ + AUDIT_STATUS_BACKLOG_LIMIT | \ + AUDIT_STATUS_BACKLOG_WAIT_TIME | \ + AUDIT_STATUS_LOST | \ + AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL) + #define AUDIT_INO_UNSET ((unsigned long)-1) #define AUDIT_DEV_UNSET ((dev_t)-1) -- cgit v1.2.3 From 6ffd853b0b10e1e292cef0bfd0997986471254de Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 1 Mar 2026 16:51:44 -0800 Subject: build_bug.h: correct function parameters names in kernel-doc Use the correct function (or macro) names to avoid kernel-doc warnings: Warning: include/linux/build_bug.h:38 function parameter 'cond' not described in 'BUILD_BUG_ON_MSG' Warning: include/linux/build_bug.h:38 function parameter 'msg' not described in 'BUILD_BUG_ON_MSG' Warning: include/linux/build_bug.h:76 function parameter 'expr' not described in 'static_assert' Link: https://lkml.kernel.org/r/20260302005144.3467019-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/build_bug.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/build_bug.h b/include/linux/build_bug.h index 2cfbb4c65c78..d3dc5dc5f916 100644 --- a/include/linux/build_bug.h +++ b/include/linux/build_bug.h @@ -32,7 +32,8 @@ /** * BUILD_BUG_ON_MSG - break compile if a condition is true & emit supplied * error message. - * @condition: the condition which the compiler should know is false. + * @cond: the condition which the compiler should know is false. + * @msg: build-time error message * * See BUILD_BUG_ON for description. */ @@ -60,6 +61,7 @@ /** * static_assert - check integer constant expression at build time + * @expr: expression to be checked * * static_assert() is a wrapper for the C11 _Static_assert, with a * little macro magic to make the message optional (defaulting to the -- cgit v1.2.3 From 7a6387dec8cee5a237dc5092269e97028f5a983b Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 9 Mar 2026 09:39:18 +0000 Subject: net: stmmac: provide plat_dat->dma_cfg in stmmac_plat_dat_alloc() plat_dat->dma_cfg is unconditionally required for the operation of the driver, so it would make sense to allocate it along with the plat_dat. On Arm64, sizeof(*plat_dat) has recently shrunk from 880 to 816 bytes and sizeof(*plat_dat->dma_cfg) has shrunk from 32 to 20 bytes. Given that dma_cfg is required, and it is now less than a cache line, It doesn't make sense to allocate this separateny, so place it at the end of struct plat_stmmacenet_data, and set plat_dat->dma_cfg to point at that to avoid mass changes. Signed-off-by: Russell King (Oracle) Reviewed-by: Mohd Ayaan Anwar Tested-by: Mohd Ayaan Anwar Link: https://patch.msgid.link/E1vzX54-0000000CVrw-2jfu@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 965ada809fdf..919196713c05 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -306,5 +306,6 @@ struct plat_stmmacenet_data { int msi_tx_base_vec; const struct dwmac4_addrs *dwmac4_addrs; unsigned int flags; + struct stmmac_dma_cfg __dma_cfg; }; #endif -- cgit v1.2.3 From c3d08424e025aaac8fb54134f76e611ef919cd08 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 9 Mar 2026 09:39:23 +0000 Subject: net: stmmac: convert plat_stmmacenet_data booleans to type bool Convert members of struct plat_stmmacenet_data that are booleans to type 'bool' and ensure their initialisers are true/false. Move the has_xxx for the GMAC cores together, and move the COE members to the end of the list of bool to avoid unused holes in the struct. Signed-off-by: Russell King (Oracle) Reviewed-by: Mohd Ayaan Anwar Tested-by: Mohd Ayaan Anwar Link: https://patch.msgid.link/E1vzX59-0000000CVs2-3MHc@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 919196713c05..9420da96a4ff 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -229,14 +229,14 @@ struct plat_stmmacenet_data { struct stmmac_dma_cfg *dma_cfg; struct stmmac_safety_feature_cfg *safety_feat_cfg; int clk_csr; - int enh_desc; - int tx_coe; + bool enh_desc; + bool tx_coe; + bool bugged_jumbo; + bool pmt; + bool force_sf_dma_mode; + bool force_thresh_dma_mode; + bool riwt_off; int rx_coe; - int bugged_jumbo; - int pmt; - int force_sf_dma_mode; - int force_thresh_dma_mode; - int riwt_off; int max_speed; int maxmtu; int multicast_filter_bins; -- cgit v1.2.3 From 3357642e65e9454c3da64b62c0ed987ee4010008 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 9 Mar 2026 09:39:28 +0000 Subject: net: stmmac: reorder structs to reduce memory consumption Reorder some of the stmmac structures to allow them to pack better, thereby using less memory. On aarch64, sizeof(struct stmmac_priv) was 880, and with this change becomes 816, saving 64 bytes, which is an 8% saving. Signed-off-by: Russell King (Oracle) Tested-by: Mohd Ayaan Anwar Link: https://patch.msgid.link/E1vzX5E-0000000CVs8-40w4@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 9420da96a4ff..411cdd3ea034 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -108,37 +108,37 @@ struct stmmac_dma_cfg { #define AXI_BLEN 7 struct stmmac_axi { - bool axi_lpi_en; - bool axi_xit_frm; u32 axi_wr_osr_lmt; u32 axi_rd_osr_lmt; - bool axi_kbbe; u32 axi_blen_regval; + bool axi_lpi_en; + bool axi_xit_frm; + bool axi_kbbe; bool axi_fb; bool axi_mb; bool axi_rb; }; struct stmmac_rxq_cfg { - u8 mode_to_use; u32 chan; + u32 prio; + u8 mode_to_use; u8 pkt_route; bool use_prio; - u32 prio; }; struct stmmac_txq_cfg { u32 weight; - bool coe_unsupported; - u8 mode_to_use; /* Credit Base Shaper parameters */ u32 send_slope; u32 idle_slope; u32 high_credit; u32 low_credit; - bool use_prio; u32 prio; int tbs_en; + bool use_prio; + bool coe_unsupported; + u8 mode_to_use; }; struct stmmac_safety_feature_cfg { -- cgit v1.2.3 From 94808793fed71ee47741df0923d353024b6904ff Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 9 Mar 2026 09:39:34 +0000 Subject: net: stmmac: use u8 for ?x_queues_to_use and number_?x_queues The maximum number of queues is a compile time constant of only eight. This makes using a 32-bit quantity wastefulf. Instead, use u8 for these and their associated variables. When reading the DT properties, saturdate at U8_MAX. Provided the core provides DMA capabilities to describe the number of queues, this will be capped by stmmac_hw_init() with a warning. Signed-off-by: Russell King (Oracle) Tested-by: Mohd Ayaan Anwar Link: https://patch.msgid.link/E1vzX5K-0000000CVsE-0J0Y@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 411cdd3ea034..03fd85060a73 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -244,8 +244,8 @@ struct plat_stmmacenet_data { int tx_fifo_size; int rx_fifo_size; u32 host_dma_width; - u32 rx_queues_to_use; - u32 tx_queues_to_use; + u8 rx_queues_to_use; + u8 tx_queues_to_use; u8 rx_sched_algorithm; u8 tx_sched_algorithm; struct stmmac_rxq_cfg rx_queues_cfg[MTL_MAX_RX_QUEUES]; -- cgit v1.2.3 From 758ed85aadd0668c66cb359c63f384992b10938c Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 9 Mar 2026 09:39:39 +0000 Subject: net: stmmac: use u8 for host_dma_width and similar struct members We aren't going to see >= 256-bit address busses soon, so reduce host_dma_width and associated other struct members that initialise this from u32 to u8. Signed-off-by: Russell King (Oracle) Acked-by: Mohd Ayaan Anwar # qcom-ethqos Tested-by: Mohd Ayaan Anwar Link: https://patch.msgid.link/E1vzX5P-0000000CVsK-0iwX@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 03fd85060a73..11886189bf51 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -243,7 +243,7 @@ struct plat_stmmacenet_data { int unicast_filter_entries; int tx_fifo_size; int rx_fifo_size; - u32 host_dma_width; + u8 host_dma_width; u8 rx_queues_to_use; u8 tx_queues_to_use; u8 rx_sched_algorithm; -- cgit v1.2.3 From 9fe167ab790b10c9eb9ef82f46a03c83f9953b61 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 9 Mar 2026 09:39:44 +0000 Subject: net: stmmac: add documentation for stmmac_dma_cfg members Add documentation of each of the struct stmmac_dma_cfg members. dche remains undocumented as I don't have documentation that covers this. Signed-off-by: Russell King (Oracle) Tested-by: Mohd Ayaan Anwar Link: https://patch.msgid.link/E1vzX5U-0000000CVsQ-162V@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 11886189bf51..1af3a5e197c9 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -93,16 +93,37 @@ struct stmmac_mdio_bus_data { }; struct stmmac_dma_cfg { + /* pbl: programmable burst limit + * txpbl: transmit programmable burst limit + * rxpbl: receive programmable burst limit + * If txpbl or rxpbl are zero, the value of pbl will be substituted. + * Range 0 - 63. + */ int pbl; int txpbl; int rxpbl; + /* pblx8: multiplies pbl, txpbl, rxpbl by a factor of 8 for dwmac >= + * 3.50a, or a factor of 4 for previous versions. + */ bool pblx8; + /* fixed_burst: + * when set, AXI bursts defined by axi_blen_regval are permitted. + * AHB uses SINGLE, INCR4, INCR8 or INCR16 during burst transfers. + * when clear, AXI and AHB use SINGLE or INCR bursts. + */ bool fixed_burst; + /* mixed_burst: + * when set and fixed_burst is clear, AHB uses INCR for bursts > 16 + * and SINGLE or INCRx for bursts <= 16. + */ bool mixed_burst; + /* aal: address aligned bursts for AHB and AXI master interface */ bool aal; + bool dche; bool eame; + /* multi_msi_en: stmmac core internal */ bool multi_msi_en; - bool dche; + /* atds: stmmac core internal */ bool atds; }; -- cgit v1.2.3 From 315bab9411f3bd3465a47a64a3e44323bfab60be Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 9 Mar 2026 09:39:49 +0000 Subject: net: stmmac: add documentation for clocks Add documentation covering stmmac_clk, pclk, clk_ptp_ref and clk_tx_i in the hope that this will help understand what each of these clocks are for. There is confusion around stmmac_clk and pclk which can't be easily resolved today as the Imagination Technologies Pistachio board that pclk was introduced for has no public documentation and is likely now obsolete. So the origins of pclk are lost to the winds of time. Signed-off-by: Russell King (Oracle) Tested-by: Mohd Ayaan Anwar Link: https://patch.msgid.link/E1vzX5Z-0000000CVsb-1XTm@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 1af3a5e197c9..937985276e6b 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -300,10 +300,41 @@ struct plat_stmmacenet_data { struct phylink_pcs *(*select_pcs)(struct stmmac_priv *priv, phy_interface_t interface); void *bsp_priv; + + /* stmmac clocks: + * stmmac_clk: CSR clock (which can be hclk_i, clk_csr_i, aclk_i, + * or clk_app_i depending on GMAC configuration). This clock + * generates the MDC clock. + * + * pclk: introduced for Imagination Technologies Pistachio board - + * see 5f9755d26fbf ("stmmac: Add an optional register interface + * clock"). This is probably used for cases where separate clocks + * are provided for the host interface and register interface. In + * this case, as the MDC clock is derived from stmmac_clk, pclk + * can only really be the "application clock" for the "host + * interface" and not the "register interface" aka CSR clock as + * it is never used when determining the divider for the MDC + * clock. + * + * clk_ptp_ref: optional PTP reference clock (clk_ptp_ref_i). When + * present, this clock increments the timestamp value. Otherwise, + * the rate of stmmac_clk will be used. + * + * clk_tx_i: MAC transmit clock, which will be 2.5MHz for 10M, + * 25MHz for 100M, or 125MHz for 1G irrespective of the interface + * mode. For the DWMAC PHY interface modes: + * + * GMII/MII PHY's transmit clock for 10M (2.5MHz) or 100M (25MHz), + * or 125MHz local clock for 1G mode + * RMII 50MHz RMII clock divided by 2 or 20. + * RGMII 125MHz local clock divided by 1, 5, or 50. + * SGMII 125MHz SerDes clock divided by 1, 5, or 50. + * TBI/RTBI 125MHz SerDes clock + */ struct clk *stmmac_clk; struct clk *pclk; struct clk *clk_ptp_ref; - struct clk *clk_tx_i; /* clk_tx_i to MAC core */ + struct clk *clk_tx_i; unsigned long clk_ptp_rate; unsigned long clk_ref_rate; struct clk_bulk_data *clks; -- cgit v1.2.3 From 7aba71dbc41641e43a79fb9f6fac91719094b4fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Thu, 5 Mar 2026 10:39:06 +0100 Subject: mm/mmu_notifier: Allow two-pass struct mmu_interval_notifiers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GPU use-cases for mmu_interval_notifiers with hmm often involve starting a gpu operation and then waiting for it to complete. These operations are typically context preemption or TLB flushing. With single-pass notifiers per GPU this doesn't scale in multi-gpu scenarios. In those scenarios we'd want to first start preemption- or TLB flushing on all GPUs and as a second pass wait for them to complete. One can do this on per-driver basis multiplexing per-driver notifiers but that would mean sharing the notifier "user" lock across all GPUs and that doesn't scale well either, so adding support for multi-pass in the core appears to be the right choice. Implement two-pass capability in the mmu_interval_notifier. Use a linked list for the final passes to minimize the impact for use-cases that don't need the multi-pass functionality by avoiding a second interval tree walk, and to be able to easily pass data between the two passes. v1: - Restrict to two passes (Jason Gunthorpe) - Improve on documentation (Jason Gunthorpe) - Improve on function naming (Alistair Popple) v2: - Include the invalidate_finish() callback in the struct mmu_interval_notifier_ops. - Update documentation (GitHub Copilot:claude-sonnet-4.6) - Use lockless list for list management. v3: - Update kerneldoc for the struct mmu_interval_notifier_finish::list member (Matthew Brost) - Add a WARN_ON_ONCE() checking for NULL invalidate_finish() op if if invalidate_start() is non-NULL. (Matthew Brost) v4: - Addressed documentation review comments by David Hildenbrand. Cc: Matthew Brost Cc: Christian König Cc: David Hildenbrand Cc: Lorenzo Stoakes Cc: Liam R. Howlett Cc: Vlastimil Babka Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Michal Hocko Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Simona Vetter Cc: Dave Airlie Cc: Alistair Popple Cc: Cc: Cc: Assisted-by: GitHub Copilot:claude-sonnet-4.6 # Documentation only. Signed-off-by: Thomas Hellström Acked-by: David Hildenbrand (Arm) Reviewed-by: Maarten Lankhorst Link: https://patch.msgid.link/20260305093909.43623-2-thomas.hellstrom@linux.intel.com --- include/linux/mmu_notifier.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index d1094c2d5fb6..b60673a8e0bb 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -233,16 +233,58 @@ struct mmu_notifier { unsigned int users; }; +/** + * struct mmu_interval_notifier_finish - mmu_interval_notifier two-pass abstraction + * @link: Lockless list link for the notifiers pending pass list + * @notifier: The mmu_interval_notifier for which the finish pass is called. + * + * Allocate, typically using GFP_NOWAIT in the interval notifier's start pass. + * Note that with a large number of notifiers implementing two passes, + * allocation with GFP_NOWAIT will become increasingly likely to fail, so consider + * implementing a small pool instead of using kmalloc() allocations. + * + * If the implementation needs to pass data between the start and the finish passes, + * the recommended way is to embed struct mmu_interval_notifier_finish into a larger + * structure that also contains the data needed to be shared. Keep in mind that + * a notifier callback can be invoked in parallel, and each invocation needs its + * own struct mmu_interval_notifier_finish. + * + * If allocation fails, then the &mmu_interval_notifier_ops->invalidate_start op + * needs to implements the full notifier functionality. Please refer to its + * documentation. + */ +struct mmu_interval_notifier_finish { + struct llist_node link; + struct mmu_interval_notifier *notifier; +}; + /** * struct mmu_interval_notifier_ops * @invalidate: Upon return the caller must stop using any SPTEs within this * range. This function can sleep. Return false only if sleeping * was required but mmu_notifier_range_blockable(range) is false. + * @invalidate_start: Similar to @invalidate, but intended for two-pass notifier + * callbacks where the call to @invalidate_start is the first + * pass and any struct mmu_interval_notifier_finish pointer + * returned in the @finish parameter describes the finish pass. + * If *@finish is %NULL on return, then no final pass will be + * called, and @invalidate_start needs to implement the full + * notifier, behaving like @invalidate. The value of *@finish + * is guaranteed to be %NULL at function entry. + * @invalidate_finish: Called as the second pass for any notifier that returned + * a non-NULL *@finish from @invalidate_start. The @finish + * pointer passed here is the same one returned by + * @invalidate_start. */ struct mmu_interval_notifier_ops { bool (*invalidate)(struct mmu_interval_notifier *interval_sub, const struct mmu_notifier_range *range, unsigned long cur_seq); + bool (*invalidate_start)(struct mmu_interval_notifier *interval_sub, + const struct mmu_notifier_range *range, + unsigned long cur_seq, + struct mmu_interval_notifier_finish **finish); + void (*invalidate_finish)(struct mmu_interval_notifier_finish *finish); }; struct mmu_interval_notifier { -- cgit v1.2.3 From 05988dba11791ccbb458254484826b32f17f4ad2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 4 Mar 2026 08:49:00 +0100 Subject: vdso/datastore: Allocate data pages dynamically MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allocating the data pages as part of the kernel image does not work on SPARC. The MMU will raise a fault when userspace tries to access them. Allocate the data pages through the page allocator instead. Unused pages in the vDSO VMA are still allocated to keep the virtual addresses aligned. Switch the mapping from PFNs to 'struct page' as that is required for dynamically allocated pages. This also aligns the allocation of the datapages with the code pages and is a prerequisite for mlockall() support. VM_MIXEDMAP is necessary for the call to vmf_insert_page() in the timens prefault path to work. The data pages need to be order-0, non-compound pages so that the mapping to userspace and the different orderings work. These pages are also used by the timekeeping, random pool and architecture initialization code. Some of these are running before the page allocator is available. To keep these subsytems working without changes, introduce early, statically data storage which will then replaced by the real one as soon as that is available. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Reviewed-by: Christophe Leroy (CS GROUP) Link: https://patch.msgid.link/20260304-vdso-sparc64-generic-2-v6-3-d8eb3b0e1410@linutronix.de --- include/linux/vdso_datastore.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vdso_datastore.h b/include/linux/vdso_datastore.h index a91fa24b06e0..0b530428db71 100644 --- a/include/linux/vdso_datastore.h +++ b/include/linux/vdso_datastore.h @@ -2,9 +2,15 @@ #ifndef _LINUX_VDSO_DATASTORE_H #define _LINUX_VDSO_DATASTORE_H +#ifdef CONFIG_HAVE_GENERIC_VDSO #include extern const struct vm_special_mapping vdso_vvar_mapping; struct vm_area_struct *vdso_install_vvar_mapping(struct mm_struct *mm, unsigned long addr); +void __init vdso_setup_data_pages(void); +#else /* !CONFIG_HAVE_GENERIC_VDSO */ +static inline void vdso_setup_data_pages(void) { } +#endif /* CONFIG_HAVE_GENERIC_VDSO */ + #endif /* _LINUX_VDSO_DATASTORE_H */ -- cgit v1.2.3 From c453b9abb4f422461c1493ef74d63af0961a2d30 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 4 Mar 2026 08:49:11 +0100 Subject: clocksource: Remove ARCH_CLOCKSOURCE_DATA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After sparc64, there are no remaining users of ARCH_CLOCKSOURCE_DATA and it can just be removed. Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Tested-by: Andreas Larsson Reviewed-by: Andreas Larsson Acked-by: John Stultz Link: https://patch.msgid.link/20260304-vdso-sparc64-generic-2-v6-14-d8eb3b0e1410@linutronix.de [Thomas: drop sparc64 bits from the patch] --- include/linux/clocksource.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 65b7c41471c3..12d853b18832 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -25,8 +25,7 @@ struct clocksource_base; struct clocksource; struct module; -#if defined(CONFIG_ARCH_CLOCKSOURCE_DATA) || \ - defined(CONFIG_GENERIC_GETTIMEOFDAY) +#if defined(CONFIG_GENERIC_GETTIMEOFDAY) #include #endif @@ -106,9 +105,6 @@ struct clocksource { u64 max_idle_ns; u32 maxadj; u32 uncertainty_margin; -#ifdef CONFIG_ARCH_CLOCKSOURCE_DATA - struct arch_clocksource_data archdata; -#endif u64 max_cycles; u64 max_raw_delta; const char *name; -- cgit v1.2.3 From b2e48c429ec54715d16fefa719dd2fbded2e65be Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Mar 2026 21:28:53 +0100 Subject: sched/mmcid: Prevent CID stalls due to concurrent forks A newly forked task is accounted as MMCID user before the task is visible in the process' thread list and the global task list. This creates the following problem: CPU1 CPU2 fork() sched_mm_cid_fork(tnew1) tnew1->mm.mm_cid_users++; tnew1->mm_cid.cid = getcid() -> preemption fork() sched_mm_cid_fork(tnew2) tnew2->mm.mm_cid_users++; // Reaches the per CPU threshold mm_cid_fixup_tasks_to_cpus() for_each_other(current, p) .... As tnew1 is not visible yet, this fails to fix up the already allocated CID of tnew1. As a consequence a subsequent schedule in might fail to acquire a (transitional) CID and the machine stalls. Move the invocation of sched_mm_cid_fork() after the new task becomes visible in the thread and the task list to prevent this. This also makes it symmetrical vs. exit() where the task is removed as CID user before the task is removed from the thread and task lists. Fixes: fbd0e71dc370 ("sched/mmcid: Provide CID ownership mode fixup functions") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Tested-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260310202525.969061974@kernel.org --- include/linux/sched.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index a7b4a980eb2f..5a5d3dbc9cdf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2354,7 +2354,6 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo #ifdef CONFIG_SCHED_MM_CID void sched_mm_cid_before_execve(struct task_struct *t); void sched_mm_cid_after_execve(struct task_struct *t); -void sched_mm_cid_fork(struct task_struct *t); void sched_mm_cid_exit(struct task_struct *t); static __always_inline int task_mm_cid(struct task_struct *t) { @@ -2363,7 +2362,6 @@ static __always_inline int task_mm_cid(struct task_struct *t) #else static inline void sched_mm_cid_before_execve(struct task_struct *t) { } static inline void sched_mm_cid_after_execve(struct task_struct *t) { } -static inline void sched_mm_cid_fork(struct task_struct *t) { } static inline void sched_mm_cid_exit(struct task_struct *t) { } static __always_inline int task_mm_cid(struct task_struct *t) { -- cgit v1.2.3 From 192d852129b1b7c4f0ddbab95d0de1efd5ee1405 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Mar 2026 21:29:09 +0100 Subject: sched/mmcid: Avoid full tasklist walks Chasing vfork()'ed tasks on a CID ownership mode switch requires a full task list walk, which is obviously expensive on large systems. Avoid that by keeping a list of tasks using a mm MMCID entity in mm::mm_cid and walk this list instead. This removes the proven to be flaky counting logic and avoids a full task list walk in the case of vfork()'ed tasks. Fixes: fbd0e71dc370 ("sched/mmcid: Provide CID ownership mode fixup functions") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Tested-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260310202526.183824481@kernel.org --- include/linux/rseq_types.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index da5fa6f40294..0b42045988db 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -133,10 +133,12 @@ struct rseq_data { }; * @active: MM CID is active for the task * @cid: The CID associated to the task either permanently or * borrowed from the CPU + * @node: Queued in the per MM MMCID list */ struct sched_mm_cid { unsigned int active; unsigned int cid; + struct hlist_node node; }; /** @@ -157,6 +159,7 @@ struct mm_cid_pcpu { * @work: Regular work to handle the affinity mode change case * @lock: Spinlock to protect against affinity setting which can't take @mutex * @mutex: Mutex to serialize forks and exits related to this mm + * @user_list: List of the MM CID users of a MM * @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map * is growth only. * @users: The number of tasks sharing this MM. Separate from mm::mm_users @@ -177,13 +180,14 @@ struct mm_mm_cid { raw_spinlock_t lock; struct mutex mutex; + struct hlist_head user_list; /* Low frequency modified */ unsigned int nr_cpus_allowed; unsigned int users; unsigned int pcpu_thrs; unsigned int update_deferred; -}____cacheline_aligned_in_smp; +} ____cacheline_aligned; #else /* CONFIG_SCHED_MM_CID */ struct mm_mm_cid { }; struct sched_mm_cid { }; -- cgit v1.2.3 From 282b8eec8a4eab9a3ff3addf6dad2ce699594fe8 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Tue, 10 Feb 2026 13:22:08 +0100 Subject: net: cdc-ncm: cleanup device descriptor Flags are boolean values, hence they should be typed as bool, not as u8. Signed-off-by: Oliver Neukum Link: https://patch.msgid.link/20260210122208.29244-1-oneukum@suse.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/cdc_ncm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/cdc_ncm.h b/include/linux/usb/cdc_ncm.h index 4ac082a63173..97ef37a1ff4a 100644 --- a/include/linux/usb/cdc_ncm.h +++ b/include/linux/usb/cdc_ncm.h @@ -118,8 +118,8 @@ struct cdc_ncm_ctx { u32 timer_interval; u32 max_ndp_size; - u8 is_ndp16; - u8 filtering_supported; + bool is_ndp16; + bool filtering_supported; union { struct usb_cdc_ncm_ndp16 *delayed_ndp16; struct usb_cdc_ncm_ndp32 *delayed_ndp32; -- cgit v1.2.3 From 227312b4a65c373d5d8b4683b7fc36203fedc516 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 28 Feb 2026 15:52:58 +0100 Subject: HID: input: Add HID_BATTERY_QUIRK_DYNAMIC for Elan touchscreens Elan touchscreens have a HID-battery device for the stylus which is always there even if there is no stylus. This is causing upower to report an empty battery for the stylus and some desktop-environments will show a notification about this, which is quite annoying. Because of this the HID-battery is being ignored on all Elan I2c and USB touchscreens, but this causes there to be no battery reporting for the stylus at all. This adds a new HID_BATTERY_QUIRK_DYNAMIC and uses these for the Elan touchscreens. This new quirks causes the present value of the battery to start at 0, which will make userspace ignore it and only sets present to 1 after receiving a battery input report which only happens when the stylus gets in range. Reported-by: ggrundik@gmail.com Closes: https://bugzilla.kernel.org/show_bug.cgi?id=221118 Signed-off-by: Hans de Goede Reviewed-by: Sebastian Reichel Signed-off-by: Jiri Kosina --- include/linux/hid.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index 2990b9f94cb5..31324609af4d 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -682,6 +682,7 @@ struct hid_device { __s32 battery_charge_status; enum hid_battery_status battery_status; bool battery_avoid_query; + bool battery_present; ktime_t battery_ratelimit_time; #endif -- cgit v1.2.3 From b558a9cc107287bd49bd9256e5d965afa80acfd6 Mon Sep 17 00:00:00 2001 From: Amit Sunil Dhamne Date: Mon, 23 Feb 2026 20:05:38 +0000 Subject: usb: typec: tcpm: add support for Sink Cap Extended msg response Add support for responding to Sink Cap Extended msg request. To achieve this, include parsing support for DT properties related to Sink Cap Extended. The request for Sink Cap Ext is a control message while the response is an extended message (chunked). As the Sink Caps Extended Data Block size (24 Byte) is less than MaxExtendedMsgChunkLen (26 Byte), a single chunk is sufficient to complete this AMS. Supporting sink cap extended messages while responding to a Get_Sink_Caps_Extended request when port is in Sink role is required in order to be compliant with at least USB PD Rev3.1 Ver1.8. Signed-off-by: Amit Sunil Dhamne Reviewed-by: Badhri Jagan Sridharan Reviewed-by: Heikki Krogerus Link: https://patch.msgid.link/20260223-skedb-v2-2-60675765bc7e@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/pd.h | 82 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 80 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/pd.h b/include/linux/usb/pd.h index 6ccd1b2af993..5a98983195cb 100644 --- a/include/linux/usb/pd.h +++ b/include/linux/usb/pd.h @@ -34,7 +34,8 @@ enum pd_ctrl_msg_type { PD_CTRL_FR_SWAP = 19, PD_CTRL_GET_PPS_STATUS = 20, PD_CTRL_GET_COUNTRY_CODES = 21, - /* 22-23 Reserved */ + PD_CTRL_GET_SINK_CAP_EXT = 22, + /* 23 Reserved */ PD_CTRL_GET_REVISION = 24, /* 25-31 Reserved */ }; @@ -72,7 +73,8 @@ enum pd_ext_msg_type { PD_EXT_PPS_STATUS = 12, PD_EXT_COUNTRY_INFO = 13, PD_EXT_COUNTRY_CODES = 14, - /* 15-31 Reserved */ + PD_EXT_SINK_CAP_EXT = 15, + /* 16-31 Reserved */ }; #define PD_REV10 0x0 @@ -205,6 +207,72 @@ struct pd_message { }; } __packed; +/* + * count_chunked_data_objs - Helper to calculate number of Data Objects on a 4 + * byte boundary. + * @size: Size of data block for extended message. Should *not* include extended + * header size. + */ +static inline u8 count_chunked_data_objs(u32 size) +{ + size += offsetof(struct pd_chunked_ext_message_data, data); + return ((size / 4) + (size % 4 ? 1 : 0)); +} + +/* Sink Caps Extended Data Block Version */ +#define SKEDB_VER_1_0 1 + +/* Sink Caps Extended Sink Modes */ +#define SINK_MODE_PPS BIT(0) +#define SINK_MODE_VBUS BIT(1) +#define SINK_MODE_AC_SUPPLY BIT(2) +#define SINK_MODE_BATT BIT(3) +#define SINK_MODE_BATT_UL BIT(4) /* Unlimited battery power supply */ +#define SINK_MODE_AVS BIT(5) + +/** + * struct sink_caps_ext_msg - Sink extended capability PD message + * @vid: Vendor ID + * @pid: Product ID + * @xid: Value assigned by USB-IF for product + * @fw: Firmware version + * @hw: Hardware version + * @skedb_ver: Sink Caps Extended Data Block (SKEDB) Version + * @load_step: Indicates the load step slew rate. + * @load_char: Sink overload characteristics + * @compliance: Types of sources the sink has been tested & certified on + * @touch_temp: Indicates the IEC standard to which the touch temperature + * conforms to (if applicable). + * @batt_info: Indicates number batteries and hot swappable ports + * @modes: Charging caps & power sources supported + * @spr_min_pdp: Sink Minimum PDP for SPR mode + * @spr_op_pdp: Sink Operational PDP for SPR mode + * @spr_max_pdp: Sink Maximum PDP for SPR mode + * @epr_min_pdp: Sink Minimum PDP for EPR mode + * @epr_op_pdp: Sink Operational PDP for EPR mode + * @epr_max_pdp: Sink Maximum PDP for EPR mode + */ +struct sink_caps_ext_msg { + __le16 vid; + __le16 pid; + __le32 xid; + u8 fw; + u8 hw; + u8 skedb_ver; + u8 load_step; + __le16 load_char; + u8 compliance; + u8 touch_temp; + u8 batt_info; + u8 modes; + u8 spr_min_pdp; + u8 spr_op_pdp; + u8 spr_max_pdp; + u8 epr_min_pdp; + u8 epr_op_pdp; + u8 epr_max_pdp; +} __packed; + /* PDO: Power Data Object */ #define PDO_MAX_OBJECTS 7 @@ -329,6 +397,11 @@ enum pd_apdo_type { #define PDO_SPR_AVS_APDO_9V_TO_15V_MAX_CURR GENMASK(19, 10) /* 10mA unit */ #define PDO_SPR_AVS_APDO_15V_TO_20V_MAX_CURR GENMASK(9, 0) /* 10mA unit */ +/* SPR AVS has two different current ranges 9V - 15V, 15V - 20V */ +#define SPR_AVS_TIER1_MIN_VOLT_MV 9000 +#define SPR_AVS_TIER1_MAX_VOLT_MV 15000 +#define SPR_AVS_TIER2_MAX_VOLT_MV 20000 + static inline enum pd_pdo_type pdo_type(u32 pdo) { return (pdo >> PDO_TYPE_SHIFT) & PDO_TYPE_MASK; @@ -339,6 +412,11 @@ static inline unsigned int pdo_fixed_voltage(u32 pdo) return ((pdo >> PDO_FIXED_VOLT_SHIFT) & PDO_VOLT_MASK) * 50; } +static inline unsigned int pdo_fixed_current(u32 pdo) +{ + return ((pdo >> PDO_FIXED_CURR_SHIFT) & PDO_CURR_MASK) * 10; +} + static inline unsigned int pdo_min_voltage(u32 pdo) { return ((pdo >> PDO_VAR_MIN_VOLT_SHIFT) & PDO_VOLT_MASK) * 50; -- cgit v1.2.3 From 416909962e7cdf29fd01ac523c953f37708df93d Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 17 Feb 2026 22:07:47 -0500 Subject: USB: usbcore: Introduce usb_bulk_msg_killable() The synchronous message API in usbcore (usb_control_msg(), usb_bulk_msg(), and so on) uses uninterruptible waits. However, drivers may call these routines in the context of a user thread, which means it ought to be possible to at least kill them. For this reason, introduce a new usb_bulk_msg_killable() function which behaves the same as usb_bulk_msg() except for using wait_for_completion_killable_timeout() instead of wait_for_completion_timeout(). The same can be done later for usb_control_msg() later on, if it turns out to be needed. Signed-off-by: Alan Stern Suggested-by: Oliver Neukum Link: https://lore.kernel.org/linux-usb/3acfe838-6334-4f6d-be7c-4bb01704b33d@rowland.harvard.edu/ Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") CC: stable@vger.kernel.org Link: https://patch.msgid.link/248628b4-cc83-4e81-a620-3ce4e0376d41@rowland.harvard.edu Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index fbfcc70b07fb..57ceeb02a7cb 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1868,8 +1868,9 @@ extern int usb_control_msg(struct usb_device *dev, unsigned int pipe, extern int usb_interrupt_msg(struct usb_device *usb_dev, unsigned int pipe, void *data, int len, int *actual_length, int timeout); extern int usb_bulk_msg(struct usb_device *usb_dev, unsigned int pipe, - void *data, int len, int *actual_length, - int timeout); + void *data, int len, int *actual_length, int timeout); +extern int usb_bulk_msg_killable(struct usb_device *usb_dev, unsigned int pipe, + void *data, int len, int *actual_length, int timeout); /* wrappers around usb_control_msg() for the most common standard requests */ int usb_control_msg_send(struct usb_device *dev, __u8 endpoint, __u8 request, -- cgit v1.2.3 From 1015c27a5e1a63efae2b18a9901494474b4d1dc3 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 17 Feb 2026 22:10:32 -0500 Subject: USB: core: Limit the length of unkillable synchronous timeouts The usb_control_msg(), usb_bulk_msg(), and usb_interrupt_msg() APIs in usbcore allow unlimited timeout durations. And since they use uninterruptible waits, this leaves open the possibility of hanging a task for an indefinitely long time, with no way to kill it short of unplugging the target device. To prevent this sort of problem, enforce a maximum limit on the length of these unkillable timeouts. The limit chosen here, somewhat arbitrarily, is 60 seconds. On many systems (although not all) this is short enough to avoid triggering the kernel's hung-task detector. In addition, clear up the ambiguity of negative timeout values by treating them the same as 0, i.e., using the maximum allowed timeout. Signed-off-by: Alan Stern Link: https://lore.kernel.org/linux-usb/3acfe838-6334-4f6d-be7c-4bb01704b33d@rowland.harvard.edu/ Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") CC: stable@vger.kernel.org Link: https://patch.msgid.link/15fc9773-a007-47b0-a703-df89a8cf83dd@rowland.harvard.edu Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 57ceeb02a7cb..04277af4bb9d 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1862,6 +1862,9 @@ void usb_free_noncoherent(struct usb_device *dev, size_t size, * SYNCHRONOUS CALL SUPPORT * *-------------------------------------------------------------------*/ +/* Maximum value allowed for timeout in synchronous routines below */ +#define USB_MAX_SYNCHRONOUS_TIMEOUT 60000 /* ms */ + extern int usb_control_msg(struct usb_device *dev, unsigned int pipe, __u8 request, __u8 requesttype, __u16 value, __u16 index, void *data, __u16 size, int timeout); -- cgit v1.2.3 From 9f6a983cfa22ac662c86e60816d3a357d4b551e9 Mon Sep 17 00:00:00 2001 From: Jie Deng Date: Fri, 27 Feb 2026 16:49:31 +0800 Subject: usb: core: new quirk to handle devices with zero configurations Some USB devices incorrectly report bNumConfigurations as 0 in their device descriptor, which causes the USB core to reject them during enumeration. logs: usb 1-2: device descriptor read/64, error -71 usb 1-2: no configurations usb 1-2: can't read configurations, error -22 However, these devices actually work correctly when treated as having a single configuration. Add a new quirk USB_QUIRK_FORCE_ONE_CONFIG to handle such devices. When this quirk is set, assume the device has 1 configuration instead of failing with -EINVAL. This quirk is applied to the device with VID:PID 5131:2007 which exhibits this behavior. Signed-off-by: Jie Deng Link: https://patch.msgid.link/20260227084931.1527461-1-dengjie03@kylinos.cn Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/quirks.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/quirks.h b/include/linux/usb/quirks.h index 2f7bd2fdc616..b3cc7beab4a3 100644 --- a/include/linux/usb/quirks.h +++ b/include/linux/usb/quirks.h @@ -78,4 +78,7 @@ /* skip BOS descriptor request */ #define USB_QUIRK_NO_BOS BIT(17) +/* Device claims zero configurations, forcing to 1 */ +#define USB_QUIRK_FORCE_ONE_CONFIG BIT(18) + #endif /* __LINUX_USB_QUIRKS_H */ -- cgit v1.2.3 From e19eaffc5213fdd6179e849d3032929fae0d8c2c Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Thu, 5 Mar 2026 14:44:10 -0800 Subject: mtd: concat: replace alloc + calloc with 1 alloc A flex array can be used to reduce the allocation to 1. And actually mtdconcat was using the pointer + 1 trick to point to the overallocated area. Better alternatives exist. Signed-off-by: Rosen Penev Signed-off-by: Miquel Raynal --- include/linux/mtd/concat.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/concat.h b/include/linux/mtd/concat.h index 2cd9d48958a8..f8d4d6ac1fc1 100644 --- a/include/linux/mtd/concat.h +++ b/include/linux/mtd/concat.h @@ -18,7 +18,7 @@ struct mtd_concat { struct mtd_info mtd; int num_subdev; - struct mtd_info **subdev; + struct mtd_info *subdev[]; }; struct mtd_info *mtd_concat_create( -- cgit v1.2.3 From 77dd8adabbc8ff845177b460de48b9d2cd579966 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 9 Mar 2026 13:52:22 +0100 Subject: efi: Drop unused efi_range_is_wc() function efi_range_is_wc() has no callers, so remove it. Reviewed-by: Ilias Apalodimas Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 664898d09ff5..72e76ec54641 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -832,27 +832,6 @@ extern int __init parse_efi_signature_list( const void *data, size_t size, efi_element_handler_t (*get_handler_for_guid)(const efi_guid_t *)); -/** - * efi_range_is_wc - check the WC bit on an address range - * @start: starting kvirt address - * @len: length of range - * - * Consult the EFI memory map and make sure it's ok to set this range WC. - * Returns true or false. - */ -static inline int efi_range_is_wc(unsigned long start, unsigned long len) -{ - unsigned long i; - - for (i = 0; i < len; i += (1UL << EFI_PAGE_SHIFT)) { - unsigned long paddr = __pa(start + i); - if (!(efi_mem_attributes(paddr) & EFI_MEMORY_WC)) - return 0; - } - /* The range checked out */ - return 1; -} - /* * We play games with efi_enabled so that the compiler will, if * possible, remove EFI-related code altogether. -- cgit v1.2.3 From 96189080265e6bb5dde3a4afbaf947af493e3f82 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 9 Mar 2026 14:21:37 -0600 Subject: io_uring: ensure ctx->rings is stable for task work flags manipulation If DEFER_TASKRUN | SETUP_TASKRUN is used and task work is added while the ring is being resized, it's possible for the OR'ing of IORING_SQ_TASKRUN to happen in the small window of swapping into the new rings and the old rings being freed. Prevent this by adding a 2nd ->rings pointer, ->rings_rcu, which is protected by RCU. The task work flags manipulation is inside RCU already, and if the resize ring freeing is done post an RCU synchronize, then there's no need to add locking to the fast path of task work additions. Note: this is only done for DEFER_TASKRUN, as that's the only setup mode that supports ring resizing. If this ever changes, then they too need to use the io_ctx_mark_taskrun() helper. Link: https://lore.kernel.org/io-uring/20260309062759.482210-1-naup96721@gmail.com/ Cc: stable@vger.kernel.org Fixes: 79cfe9e59c2a ("io_uring/register: add IORING_REGISTER_RESIZE_RINGS") Reported-by: Hao-Yu Yang Suggested-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 3e4a82a6f817..dd1420bfcb73 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -388,6 +388,7 @@ struct io_ring_ctx { * regularly bounce b/w CPUs. */ struct { + struct io_rings __rcu *rings_rcu; struct llist_head work_llist; struct llist_head retry_llist; unsigned long check_cq; -- cgit v1.2.3 From 12ae2c81b21cfaa193db2faf035d495807edc3a7 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 26 Feb 2026 14:50:59 +0100 Subject: clone: add CLONE_AUTOREAP Add a new clone3() flag CLONE_AUTOREAP that makes a child process auto-reap on exit without ever becoming a zombie. This is a per-process property in contrast to the existing auto-reap mechanism via SA_NOCLDWAIT or SIG_IGN for SIGCHLD which applies to all children of a given parent. Currently the only way to automatically reap children is to set SA_NOCLDWAIT or SIG_IGN on SIGCHLD. This is a parent-scoped property affecting all children which makes it unsuitable for libraries or applications that need selective auto-reaping of specific children while still being able to wait() on others. CLONE_AUTOREAP stores an autoreap flag in the child's signal_struct. When the child exits do_notify_parent() checks this flag and causes exit_notify() to transition the task directly to EXIT_DEAD. Since the flag lives on the child it survives reparenting: if the original parent exits and the child is reparented to a subreaper or init the child still auto-reaps when it eventually exits. CLONE_AUTOREAP can be combined with CLONE_PIDFD to allow the parent to monitor the child's exit via poll() and retrieve exit status via PIDFD_GET_INFO. Without CLONE_PIDFD it provides a fire-and-forget pattern where the parent simply doesn't care about the child's exit status. No exit signal is delivered so exit_signal must be zero. CLONE_AUTOREAP is rejected in combination with CLONE_PARENT. If a CLONE_AUTOREAP child were to clone(CLONE_PARENT) the new grandchild would inherit exit_signal == 0 from the autoreap parent's group leader but without signal->autoreap. This grandchild would become a zombie that never sends a signal and is never autoreaped - confusing and arguably broken behavior. The flag is not inherited by the autoreap process's own children. Each child that should be autoreaped must be explicitly created with CLONE_AUTOREAP. Link: https://github.com/uapi-group/kernel-features/issues/45 Link: https://patch.msgid.link/20260226-work-pidfs-autoreap-v5-1-d148b984a989@kernel.org Reviewed-by: Oleg Nesterov Signed-off-by: Christian Brauner --- include/linux/sched/signal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index a22248aebcf9..f842c86b806f 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -132,6 +132,7 @@ struct signal_struct { */ unsigned int is_child_subreaper:1; unsigned int has_child_subreaper:1; + unsigned int autoreap:1; #ifdef CONFIG_POSIX_TIMERS -- cgit v1.2.3 From 4ef420b3450026b56807e5d53001f80eb495403c Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 10 Mar 2026 18:01:01 -0700 Subject: cgroup: replace global cgroup_file_kn_lock with per-cgroup_file lock Replace the global cgroup_file_kn_lock with a per-cgroup_file spinlock to eliminate cross-cgroup contention as it is not really protecting data shared between different cgroups. The lock is initialized in cgroup_add_file() alongside timer_setup(). No lock acquisition is needed during initialization since the cgroup directory is being populated under cgroup_mutex and no concurrent accessors exist at that point. Reported-by: Jakub Kicinski Signed-off-by: Shakeel Butt Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index bb92f5c169ca..ba26b5d05ce3 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -167,6 +167,7 @@ struct cgroup_file { struct kernfs_node *kn; unsigned long notified_at; struct timer_list notify_timer; + spinlock_t lock; }; /* -- cgit v1.2.3 From 94a4b1f959989de9c54d43c3a102fb1ee92e1414 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 7 Mar 2026 17:50:53 -0300 Subject: ipv6: move the disable_ipv6_mod knob to core code From: Jakub Kicinski Make sure disable_ipv6_mod itself is not part of the IPv6 module, in case core code wants to refer to it. We will remove support for IPv6=m soon, this change helps make fixes we commit before that less messy. Link: https://patch.msgid.link/20260307-net-nd_tbl_fixes-v4-1-e2677e85628c@suse.com Signed-off-by: Jakub Kicinski --- include/linux/ipv6.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 443053a76dcf..a7421382a916 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -333,7 +333,12 @@ struct tcp6_timewait_sock { }; #if IS_ENABLED(CONFIG_IPV6) -bool ipv6_mod_enabled(void); +extern int disable_ipv6_mod; + +static inline bool ipv6_mod_enabled(void) +{ + return disable_ipv6_mod == 0; +} static inline struct ipv6_pinfo *inet6_sk(const struct sock *__sk) { -- cgit v1.2.3 From 49b76317592ecbaefd0969d51d02019966cc994b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 1 Mar 2026 16:52:37 -0800 Subject: sched/wait: correct kernel-doc descriptions Use the correct function name and function parameter name to avoid these kernel-doc warnings: Warning: include/linux/wait_bit.h:424 expecting prototype for wait_var_event_killable(). Prototype was for wait_var_event_interruptible() instead Warning: include/linux/wait_bit.h:508 function parameter 'lock' not described in 'wait_var_event_mutex' Signed-off-by: Randy Dunlap Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260302005237.3473095-1-rdunlap@infradead.org --- include/linux/wait_bit.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h index 9e29d79fc790..ace7379d627d 100644 --- a/include/linux/wait_bit.h +++ b/include/linux/wait_bit.h @@ -406,7 +406,7 @@ do { \ schedule()) /** - * wait_var_event_killable - wait for a variable to be updated and notified + * wait_var_event_interruptible - wait for a variable to be updated and notified * @var: the address of variable being waited on * @condition: the condition to wait for * @@ -492,7 +492,7 @@ do { \ * wait_var_event_mutex - wait for a variable to be updated under a mutex * @var: the address of the variable being waited on * @condition: condition to wait for - * @mutex: the mutex which protects updates to the variable + * @lock: the mutex which protects updates to the variable * * Wait for a condition which can only be reliably tested while holding * a mutex. The variables assessed in the condition will normal be -- cgit v1.2.3 From 14de1552a4e3fece78bb20314887e70888c9d448 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 11 Mar 2026 16:14:55 -0700 Subject: include/linux/local_lock_internal.h: Make this header file again compatible with sparse There are two versions of the __this_cpu_local_lock() definitions in include/linux/local_lock_internal.h: one version that relies on the Clang overloading functionality and another version that does not. Select the latter version when using sparse. This patch fixes the following errors reported by sparse: include/linux/local_lock_internal.h:331:40: sparse: sparse: multiple definitions for function '__this_cpu_local_lock' include/linux/local_lock_internal.h:325:37: sparse: the previous one is here Closes: https://lore.kernel.org/oe-kbuild-all/202603062334.wgI5htP0-lkp@intel.com/ Fixes: d3febf16dee2 ("locking/local_lock: Support Clang's context analysis") Reported-by: kernel test robot Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Marco Elver Link: https://patch.msgid.link/20260311231455.1961413-1-bvanassche@acm.org --- include/linux/local_lock_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h index eff711bf973f..234be7f12c15 100644 --- a/include/linux/local_lock_internal.h +++ b/include/linux/local_lock_internal.h @@ -315,7 +315,7 @@ do { \ #endif /* CONFIG_PREEMPT_RT */ -#if defined(WARN_CONTEXT_ANALYSIS) +#if defined(WARN_CONTEXT_ANALYSIS) && !defined(__CHECKER__) /* * Because the compiler only knows about the base per-CPU variable, use this * helper function to make the compiler think we lock/unlock the @base variable, -- cgit v1.2.3 From 754e38d2d1aeeadddac5220f34e07cf263502a46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh=20=28Schneider=20Electric=29?= Date: Wed, 11 Mar 2026 11:15:11 +0100 Subject: tracing: Use explicit array size instead of sentinel elements in symbol printing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sentinel value added by the wrapper macros __print_symbolic() et al prevents the callers from adding their own trailing comma. This makes constructing symbol list dynamically based on kconfig values tedious. Drop the sentinel elements, so callers can either specify the trailing comma or not, just like in regular array initializers. Signed-off-by: Thomas Weißschuh (Schneider Electric) Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260311-hrtimer-cleanups-v1-2-095357392669@linutronix.de --- include/linux/trace_events.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 37eb2f0f3dd8..40a43a4c7caf 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -22,20 +22,23 @@ union bpf_attr; const char *trace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long flags, - const struct trace_print_flags *flag_array); + const struct trace_print_flags *flag_array, + size_t flag_array_size); const char *trace_print_symbols_seq(struct trace_seq *p, unsigned long val, - const struct trace_print_flags *symbol_array); + const struct trace_print_flags *symbol_array, + size_t symbol_array_size); #if BITS_PER_LONG == 32 const char *trace_print_flags_seq_u64(struct trace_seq *p, const char *delim, unsigned long long flags, - const struct trace_print_flags_u64 *flag_array); + const struct trace_print_flags_u64 *flag_array, + size_t flag_array_size); const char *trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, - const struct trace_print_flags_u64 - *symbol_array); + const struct trace_print_flags_u64 *symbol_array, + size_t symbol_array_size); #endif struct trace_iterator; -- cgit v1.2.3 From 8ef2807042d0886a85bbcb0aba1a2a277680dc4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh=20=28Schneider=20Electric=29?= Date: Wed, 11 Mar 2026 11:15:15 +0100 Subject: hrtimer: Remove hrtimer_get_expires_ns() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are no users left. Signed-off-by: Thomas Weißschuh (Schneider Electric) Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260311-hrtimer-cleanups-v1-6-095357392669@linutronix.de --- include/linux/hrtimer.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index c087b7142330..9ced498fefaa 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -116,11 +116,6 @@ static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer) return timer->_softexpires; } -static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer) -{ - return ktime_to_ns(timer->node.expires); -} - ktime_t hrtimer_cb_get_time(const struct hrtimer *timer); static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) -- cgit v1.2.3 From b94c076dd949426d09e5d415304acb3f951d9069 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh=20=28Schneider=20Electric=29?= Date: Wed, 11 Mar 2026 11:15:17 +0100 Subject: hrtimer: Drop spurious space in 'enum hrtimer_base_type' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This spurious space makes grepping for the enum definition annoying. Remove it. Signed-off-by: Thomas Weißschuh (Schneider Electric) Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260311-hrtimer-cleanups-v1-8-095357392669@linutronix.de --- include/linux/hrtimer_defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index 0f851b2432c3..e6d4dc1b61e0 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -35,7 +35,7 @@ struct hrtimer_clock_base { ktime_t offset; } __hrtimer_clock_base_align; -enum hrtimer_base_type { +enum hrtimer_base_type { HRTIMER_BASE_MONOTONIC, HRTIMER_BASE_REALTIME, HRTIMER_BASE_BOOTTIME, -- cgit v1.2.3 From f12ef5cb4e035e15f0c324c41ff402441578ffda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh=20=28Schneider=20Electric=29?= Date: Wed, 11 Mar 2026 11:15:19 +0100 Subject: hrtimer: Mark index and clockid of clock base as const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These fields are initialized once and are never supposed to change. Mark them as const to make this explicit. Signed-off-by: Thomas Weißschuh (Schneider Electric) Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260311-hrtimer-cleanups-v1-10-095357392669@linutronix.de --- include/linux/hrtimer_defs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index e6d4dc1b61e0..a03240c0b14f 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -26,8 +26,8 @@ */ struct hrtimer_clock_base { struct hrtimer_cpu_base *cpu_base; - unsigned int index; - clockid_t clockid; + const unsigned int index; + const clockid_t clockid; seqcount_raw_spinlock_t seq; ktime_t expires_next; struct hrtimer *running; -- cgit v1.2.3 From f27fc117cf8fba56e0619694e685f9bca9b9cb82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh=20=28Schneider=20Electric=29?= Date: Wed, 11 Mar 2026 11:15:20 +0100 Subject: hrtimer: Remove trailing comma after HRTIMER_MAX_CLOCK_BASES MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HRTIMER_MAX_CLOCK_BASES is required to stay the last value of the enum. Drop the trailing comma so no new members are added after it by mistake. Signed-off-by: Thomas Weißschuh (Schneider Electric) Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260311-hrtimer-cleanups-v1-11-095357392669@linutronix.de --- include/linux/hrtimer_defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index a03240c0b14f..52ed9e46ff13 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -44,7 +44,7 @@ enum hrtimer_base_type { HRTIMER_BASE_REALTIME_SOFT, HRTIMER_BASE_BOOTTIME_SOFT, HRTIMER_BASE_TAI_SOFT, - HRTIMER_MAX_CLOCK_BASES, + HRTIMER_MAX_CLOCK_BASES }; /** -- cgit v1.2.3 From 6f459eda8b60382efa0da2ca025c26a2018adc87 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 10 Mar 2026 12:44:51 +0000 Subject: tcp: add tcp_release_cb_cond() helper Majority of tcp_release_cb() calls do nothing at all. Provide tcp_release_cb_cond() helper so that release_sock() can avoid these calls. Also hint the compiler that __release_sock() and wake_up() are rarely called. $ scripts/bloat-o-meter -t vmlinux.old vmlinux.new add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-77 (-77) Function old new delta release_sock 258 181 -77 Total: Before=25235790, After=25235713, chg -0.00% Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260310124451.2280968-1-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/tcp.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index c44cf9ae8d16..bcebc4f07532 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -548,6 +548,13 @@ enum tsq_flags { TCPF_ACK_DEFERRED = BIT(TCP_ACK_DEFERRED), }; +/* Flags of interest for tcp_release_cb() */ +#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \ + TCPF_WRITE_TIMER_DEFERRED | \ + TCPF_DELACK_TIMER_DEFERRED | \ + TCPF_MTU_REDUCED_DEFERRED | \ + TCPF_ACK_DEFERRED) + #define tcp_sk(ptr) container_of_const(ptr, struct tcp_sock, inet_conn.icsk_inet.sk) /* Variant of tcp_sk() upgrading a const sock to a read/write tcp socket. -- cgit v1.2.3 From c670267ff50d5f9beb486f0203cdede580a99ae3 Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Fri, 6 Feb 2026 14:20:03 +0800 Subject: tty: constify tty_ldisc_ops tty_ldisc_ops is not modified once registered, so make it const. Signed-off-by: Qingfang Deng Link: https://patch.msgid.link/20260206062004.1273890-1-dqfext@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_ldisc.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h index c5cccc3fc1e8..d227a58e3e49 100644 --- a/include/linux/tty_ldisc.h +++ b/include/linux/tty_ldisc.h @@ -266,7 +266,7 @@ struct tty_ldisc_ops { }; struct tty_ldisc { - struct tty_ldisc_ops *ops; + const struct tty_ldisc_ops *ops; struct tty_struct *tty; }; @@ -281,8 +281,8 @@ struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *); void tty_ldisc_flush(struct tty_struct *tty); -int tty_register_ldisc(struct tty_ldisc_ops *new_ldisc); -void tty_unregister_ldisc(struct tty_ldisc_ops *ldisc); +int tty_register_ldisc(const struct tty_ldisc_ops *new_ldisc); +void tty_unregister_ldisc(const struct tty_ldisc_ops *ldisc); int tty_set_ldisc(struct tty_struct *tty, int disc); #endif /* _LINUX_TTY_LDISC_H */ -- cgit v1.2.3 From 24728b93fafe0949b5353e1a7b3a94175fe26d6e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 10 Mar 2026 22:23:47 -0700 Subject: serdev: serdev.h: clean up kernel-doc comments Correct kernel-doc comment format and add a missing to avoid kernel-doc warnings: Warning: include/linux/serdev.h:49 struct member 'write_comp' not described in 'serdev_device' Warning: include/linux/serdev.h:49 struct member 'write_lock' not described in 'serdev_device' Warning: include/linux/serdev.h:68 struct member 'shutdown' not described in 'serdev_device_driver' Warning: include/linux/serdev.h:134 function parameter 'serdev' not described in 'serdev_device_put' Warning: include/linux/serdev.h:162 function parameter 'ctrl' not described in 'serdev_controller_put' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260311052347.305612-1-rdunlap@infradead.org Signed-off-by: Greg Kroah-Hartman --- include/linux/serdev.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serdev.h b/include/linux/serdev.h index 5654c58eb73c..090c93c08045 100644 --- a/include/linux/serdev.h +++ b/include/linux/serdev.h @@ -37,8 +37,8 @@ struct serdev_device_ops { * @nr: Device number on serdev bus. * @ctrl: serdev controller managing this device. * @ops: Device operations. - * @write_comp Completion used by serdev_device_write() internally - * @write_lock Lock to serialize access when writing data + * @write_comp: Completion used by serdev_device_write() internally + * @write_lock: Lock to serialize access when writing data */ struct serdev_device { struct device dev; @@ -60,6 +60,7 @@ static inline struct serdev_device *to_serdev_device(struct device *d) * structure. * @probe: binds this driver to a serdev device. * @remove: unbinds this driver from the serdev device. + * @shutdown: shut down this serdev device. */ struct serdev_device_driver { struct device_driver driver; @@ -129,7 +130,7 @@ static inline void serdev_device_set_drvdata(struct serdev_device *serdev, void /** * serdev_device_put() - decrement serdev device refcount - * @serdev serdev device. + * @serdev: serdev device. */ static inline void serdev_device_put(struct serdev_device *serdev) { @@ -157,7 +158,7 @@ static inline void serdev_controller_set_drvdata(struct serdev_controller *ctrl, /** * serdev_controller_put() - decrement controller refcount - * @ctrl serdev controller. + * @ctrl: serdev controller. */ static inline void serdev_controller_put(struct serdev_controller *ctrl) { -- cgit v1.2.3 From eb3b0d92c9c39890592cca6647601fe5c631efea Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Fri, 13 Feb 2026 16:50:39 +0800 Subject: tty: tty_port: add workqueue to flip TTY buffer On the embedded platform, certain critical data, such as IMU data, is transmitted through UART. The tty_flip_buffer_push() interface in the TTY layer uses system_dfl_wq to handle the flipping of the TTY buffer. Although the unbound workqueue can create new threads on demand and wake up the kworker thread on an idle CPU, it may be preempted by real-time tasks or other high-prio tasks. flush_to_ldisc() needs to wake up the relevant data handle thread. When executing __wake_up_common_lock(), it calls spin_lock_irqsave(), which does not disable preemption but disables migration in RT-Linux. This prevents the kworker thread from being migrated to other cores by CPU's balancing logic, resulting in long delays. The call trace is as follows: __wake_up_common_lock __wake_up ep_poll_callback __wake_up_common __wake_up_common_lock __wake_up n_tty_receive_buf_common n_tty_receive_buf2 tty_ldisc_receive_buf tty_port_default_receive_buf flush_to_ldisc In our system, the processing interval for each frame of IMU data transmitted via UART can experience significant jitter due to this issue. Instead of the expected 10 to 15 ms frame processing interval, we see spikes up to 30 to 35 ms. Moreover, in just one or two hours, there can be 2 to 3 occurrences of such high jitter, which is quite frequent. This jitter exceeds the software's tolerable limit of 20 ms. Introduce flip_wq in tty_port which can be set by tty_port_link_wq() or as default linked to default workqueue allocated when tty_register_driver(). The default workqueue is allocated with flag WQ_SYSFS, so that cpumask and nice can be set dynamically. The execution timing of tty_port_link_wq() is not clearly restricted. The newly added function tty_port_link_driver_wq() checks whether the flip_wq of the tty_port has already been assigned when linking the default tty_driver's workqueue to the port. After the user has set a custom workqueue for a certain tty_port using tty_port_link_wq(), the system will only use this custom workqueue, even if tty_driver does not have %TTY_DRIVER_NO_WORKQUEUE flag. When tty_port register device, flip_wq link operation is done by tty_port_link_driver_wq(), but for in-memory devices the link operation cannot cover all the cases. Although tty_port_install() is dedicated for in-memory devices lik PTY to link port allocated on demand, the logic of tty_port_install() is so simple that people may not call it, vc_cons[0].d->port is one such case. We check the buf.flip_wq when flip TTY buffer, if buf.flip_wq of TTY port is NULL, use system_dfl_wq as a backup. To avoid naming conflict of the default tty_driver's workqueue, using '"%s-%s", driver->name, driver->driver_name' as the workqueue name. In cases where driver_name is not specified and therefore is NULL, the workqueue is not created. Drivers that do not define driver_name are potentially in-memory devices like vty, which generally do not require special workqueue settings. Even with the combination of name and driver_name, the workqueue names can still be duplicated, as many tty serial drivers use "ttyS" as dev_name and "serial" as driver_name. I modified the conflicting driver_name of these drivers by appending a suffix of _xx based on the corresponding .c file. If this modification is not made, it could not only lead to duplicate workqueue names but also result in duplicate entries for the /proc/tty/driver/ nodes. Introduce %TTY_DRIVER_NO_WORKQUEUE flag meaning not to create the default single tty_driver workqueue. Two reasons why need to introduce the %TTY_DRIVER_NO_WORKQUEUE flag: 1. If the WQ_SYSFS parameter is enabled, workqueue_sysfs_register() will fail when trying to create a workqueue with the same name. The pty is an example of this; if both CONFIG_LEGACY_PTYS and CONFIG_UNIX98_PTYS are enabled, the call to tty_register_driver() in unix98_pty_init() will fail. 2. Different TTY ports may be used for different tasks, which may require separate core binding control via workqueues. In this case, the workqueue created by default in the TTY driver is unnecessary. Enabling this flag prevents the creation of this redundant workqueue. After applying this patch, we can set the related UART TTY flip buffer workqueue by sysfs. We set the cpumask to CPU cores associated with the IMU tasks, and set the nice to -20. Testing has shown significant improvement in the previously described issue, with almost no stuttering occurring anymore. Tested-by: Tommaso Merciai Tested-by: Marek Szyprowski Signed-off-by: Xin Zhao Link: https://patch.msgid.link/20260213085039.3274704-1-jackzxcui1989@163.com Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_buffer.h | 1 + include/linux/tty_driver.h | 7 +++++++ include/linux/tty_port.h | 13 +++++++++++++ 3 files changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tty_buffer.h b/include/linux/tty_buffer.h index 31125e3be3c5..48adcb0e8ff3 100644 --- a/include/linux/tty_buffer.h +++ b/include/linux/tty_buffer.h @@ -34,6 +34,7 @@ static inline u8 *flag_buf_ptr(struct tty_buffer *b, unsigned int ofs) struct tty_bufhead { struct tty_buffer *head; /* Queue head */ + struct workqueue_struct *flip_wq; struct work_struct work; struct mutex lock; atomic_t priority; diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 188ee9b768eb..1f2896e56e77 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -69,6 +69,10 @@ struct serial_struct; * Do not create numbered ``/dev`` nodes. For example, create * ``/dev/ttyprintk`` and not ``/dev/ttyprintk0``. Applicable only when a * driver for a single tty device is being allocated. + * + * @TTY_DRIVER_NO_WORKQUEUE: + * Do not create workqueue when tty_register_driver(). Whenever set, flip + * buffer workqueue can be set by tty_port_link_wq() for every port. */ enum tty_driver_flag { TTY_DRIVER_INSTALLED = BIT(0), @@ -79,6 +83,7 @@ enum tty_driver_flag { TTY_DRIVER_HARDWARE_BREAK = BIT(5), TTY_DRIVER_DYNAMIC_ALLOC = BIT(6), TTY_DRIVER_UNNUMBERED_NODE = BIT(7), + TTY_DRIVER_NO_WORKQUEUE = BIT(8), }; enum tty_driver_type { @@ -506,6 +511,7 @@ struct tty_operations { * @flags: tty driver flags (%TTY_DRIVER_) * @proc_entry: proc fs entry, used internally * @other: driver of the linked tty; only used for the PTY driver + * @flip_wq: workqueue to queue flip buffer work on * @ttys: array of active &struct tty_struct, set by tty_standard_install() * @ports: array of &struct tty_port; can be set during initialization by * tty_port_link_device() and similar @@ -539,6 +545,7 @@ struct tty_driver { unsigned long flags; struct proc_dir_entry *proc_entry; struct tty_driver *other; + struct workqueue_struct *flip_wq; /* * Pointer to the tty data structures diff --git a/include/linux/tty_port.h b/include/linux/tty_port.h index 660c254f1efe..d2a7882c0b58 100644 --- a/include/linux/tty_port.h +++ b/include/linux/tty_port.h @@ -138,6 +138,7 @@ struct tty_port { kernel */ void tty_port_init(struct tty_port *port); +void tty_port_link_wq(struct tty_port *port, struct workqueue_struct *flip_wq); void tty_port_link_device(struct tty_port *port, struct tty_driver *driver, unsigned index); struct device *tty_port_register_device(struct tty_port *port, @@ -165,6 +166,18 @@ static inline struct tty_port *tty_port_get(struct tty_port *port) return NULL; } +/* + * Never overwrite the workqueue set by tty_port_link_wq(). + * No effect when %TTY_DRIVER_NO_WORKQUEUE is set, as driver->flip_wq is + * %NULL. + */ +static inline void tty_port_link_driver_wq(struct tty_port *port, + struct tty_driver *driver) +{ + if (!port->buf.flip_wq) + tty_port_link_wq(port, driver->flip_wq); +} + /* If the cts flow control is enabled, return true. */ static inline bool tty_port_cts_enabled(const struct tty_port *port) { -- cgit v1.2.3 From 8324a54f604da18f21070702a8ad82ab2062787b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 3 Feb 2026 19:10:45 +0200 Subject: serial: 8250: Add serial8250_handle_irq_locked() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 8250_port exports serial8250_handle_irq() to HW specific 8250 drivers. It takes port's lock within but a HW specific 8250 driver may want to take port's lock itself, do something, and then call the generic handler in 8250_port but to do that, the caller has to release port's lock for no good reason. Introduce serial8250_handle_irq_locked() which a HW specific driver can call while already holding port's lock. As this is new export, put it straight into a namespace (where all 8250 exports should eventually be moved). Tested-by: Bandal, Shankar Tested-by: Murthy, Shanth Cc: stable Reviewed-by: Andy Shevchenko Signed-off-by: Ilpo Järvinen Link: https://patch.msgid.link/20260203171049.4353-4-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_8250.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index 01efdce0fda0..a95b2d143d24 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -195,6 +195,7 @@ void serial8250_do_set_mctrl(struct uart_port *port, unsigned int mctrl); void serial8250_do_set_divisor(struct uart_port *port, unsigned int baud, unsigned int quot); int fsl8250_handle_irq(struct uart_port *port); +void serial8250_handle_irq_locked(struct uart_port *port, unsigned int iir); int serial8250_handle_irq(struct uart_port *port, unsigned int iir); u16 serial8250_rx_chars(struct uart_8250_port *up, u16 lsr); void serial8250_read_char(struct uart_8250_port *up, u16 lsr); -- cgit v1.2.3 From 59621105ffca7a33955f56bc7dee0923992f5832 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 23 Feb 2026 14:37:16 +0100 Subject: of: provide of_machine_read_compatible() Provide a helper function allowing users to read the compatible string of the machine, hiding the access to the root node. Reviewed-by: Christophe Leroy (CS GROUP) Signed-off-by: Bartosz Golaszewski Reviewed-by: Rob Herring (Arm) Link: https://patch.msgid.link/20260223-soc-of-root-v2-1-b45da45903c8@oss.qualcomm.com Signed-off-by: Greg Kroah-Hartman --- include/linux/of.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index be6ec4916adf..7df971d52b55 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -426,6 +426,8 @@ static inline bool of_machine_is_compatible(const char *compat) return of_machine_compatible_match(compats); } +int of_machine_read_compatible(const char **compatible, unsigned int index); + extern int of_add_property(struct device_node *np, struct property *prop); extern int of_remove_property(struct device_node *np, struct property *prop); extern int of_update_property(struct device_node *np, struct property *newprop); @@ -851,6 +853,12 @@ static inline int of_machine_is_compatible(const char *compat) return 0; } +static inline int of_machine_read_compatible(const char **compatible, + unsigned int index) +{ + return -ENOSYS; +} + static inline int of_add_property(struct device_node *np, struct property *prop) { return 0; -- cgit v1.2.3 From c86d3b7b847cc9b32a17117cfd71679e4315fd9f Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 23 Feb 2026 14:37:17 +0100 Subject: of: provide of_machine_read_model() Provide a helper function allowing users to read the model string of the machine, hiding the access to the root node. Signed-off-by: Bartosz Golaszewski Reviewed-by: Rob Herring (Arm) Link: https://patch.msgid.link/20260223-soc-of-root-v2-2-b45da45903c8@oss.qualcomm.com Signed-off-by: Greg Kroah-Hartman --- include/linux/of.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 7df971d52b55..2b95777f16f6 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -427,6 +427,7 @@ static inline bool of_machine_is_compatible(const char *compat) } int of_machine_read_compatible(const char **compatible, unsigned int index); +int of_machine_read_model(const char **model); extern int of_add_property(struct device_node *np, struct property *prop); extern int of_remove_property(struct device_node *np, struct property *prop); @@ -859,6 +860,11 @@ static inline int of_machine_read_compatible(const char **compatible, return -ENOSYS; } +static inline int of_machine_read_model(const char **model) +{ + return -ENOSYS; +} + static inline int of_add_property(struct device_node *np, struct property *prop) { return 0; -- cgit v1.2.3 From 030706e954c10749da8c75464c6b02cb30cb00aa Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 23 Feb 2026 14:37:19 +0100 Subject: base: soc: rename and export soc_device_get_machine() Some SoC drivers reimplement the functionality of soc_device_get_machine(). Make this function accessible through the sys_soc.h header and rename it to a more descriptive name. Reviewed-by: Christophe Leroy (CS GROUP) Signed-off-by: Bartosz Golaszewski Reviewed-by: Rob Herring (Arm) Link: https://patch.msgid.link/20260223-soc-of-root-v2-4-b45da45903c8@oss.qualcomm.com Signed-off-by: Greg Kroah-Hartman --- include/linux/sys_soc.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sys_soc.h b/include/linux/sys_soc.h index d9b3cf0f410c..f19f5cec18e2 100644 --- a/include/linux/sys_soc.h +++ b/include/linux/sys_soc.h @@ -37,6 +37,16 @@ void soc_device_unregister(struct soc_device *soc_dev); */ struct device *soc_device_to_device(struct soc_device *soc); +/** + * soc_attr_read_machine - retrieve the machine model and store it in + * the soc_device_attribute structure + * @soc_dev_attr: SoC attribute structure to store the model in + * + * Returns: + * 0 on success, negative error number on failure. + */ +int soc_attr_read_machine(struct soc_device_attribute *soc_dev_attr); + #ifdef CONFIG_SOC_BUS const struct soc_device_attribute *soc_device_match( const struct soc_device_attribute *matches); -- cgit v1.2.3 From bb729bf1d6fdf5c2087c1651165c74cef0da1742 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Tue, 10 Mar 2026 23:57:53 +0800 Subject: driver core: Add conditional guard support for device_lock() Introduce conditional guard version of device_lock() for scenarios that require conditional device lock holding. Suggested-by: Dan Williams Reviewed-by: Dan Williams Acked-by: Greg Kroah-Hartman Signed-off-by: Li Ming Link: https://patch.msgid.link/20260310-fix_access_endpoint_without_drv_check-v1-1-94fe919a0b87@zohomail.com Signed-off-by: Danilo Krummrich --- include/linux/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 0be95294b6e6..4fafee80524b 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -911,6 +911,7 @@ static inline void device_unlock(struct device *dev) } DEFINE_GUARD(device, struct device *, device_lock(_T), device_unlock(_T)) +DEFINE_GUARD_COND(device, _intr, device_lock_interruptible(_T), _RET == 0) static inline void device_lock_assert(struct device *dev) { -- cgit v1.2.3 From 8431c602f551549f082bbfa67f3003f2d8e3e132 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 11 Mar 2026 12:31:10 +0000 Subject: ip_tunnel: adapt iptunnel_xmit_stats() to NETDEV_PCPU_STAT_DSTATS Blamed commits forgot that vxlan/geneve use udp_tunnel[6]_xmit_skb() which call iptunnel_xmit_stats(). iptunnel_xmit_stats() was assuming tunnels were only using NETDEV_PCPU_STAT_TSTATS. @syncp offset in pcpu_sw_netstats and pcpu_dstats is different. 32bit kernels would either have corruptions or freezes if the syncp sequence was overwritten. This patch also moves pcpu_stat_type closer to dev->{t,d}stats to avoid a potential cache line miss since iptunnel_xmit_stats() needs to read it. Fixes: 6fa6de302246 ("geneve: Handle stats using NETDEV_PCPU_STAT_DSTATS.") Fixes: be226352e8dc ("vxlan: Handle stats using NETDEV_PCPU_STAT_DSTATS.") Signed-off-by: Eric Dumazet Reviewed-by: Guillaume Nault Link: https://patch.msgid.link/20260311123110.1471930-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ae269a2e7f4d..d7aac6f185bc 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2155,6 +2155,7 @@ struct net_device { unsigned long state; unsigned int flags; unsigned short hard_header_len; + enum netdev_stat_type pcpu_stat_type:8; netdev_features_t features; struct inet6_dev __rcu *ip6_ptr; __cacheline_group_end(net_device_read_txrx); @@ -2404,8 +2405,6 @@ struct net_device { void *ml_priv; enum netdev_ml_priv_type ml_priv_type; - enum netdev_stat_type pcpu_stat_type:8; - #if IS_ENABLED(CONFIG_GARP) struct garp_port __rcu *garp_port; #endif -- cgit v1.2.3 From e416e7fa417b2d2604c1526a2f9cc38da7ced166 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 10 Mar 2026 22:29:53 -0700 Subject: tee: clean up tee_core.h kernel-doc Use the correct struct member name and function parameter name in kernel-doc comments. Move a macro that was between a struct's documentation and its declaration. These eliminate the following kernel-doc warnings: Warning: include/linux/tee_core.h:73 struct member 'c_no_users' not described in 'tee_device' Warning: include/linux/tee_core.h:132 #define TEE_DESC_PRIVILEGED 0x1; error: Cannot parse struct or union! Warning: include/linux/tee_core.h:257 function parameter 'connection_data' not described in 'tee_session_calc_client_uuid' Warning: include/linux/tee_core.h:320 function parameter 'teedev' not described in 'tee_get_drvdata' Signed-off-by: Randy Dunlap Reviewed-by: Sumit Garg Signed-off-by: Jens Wiklander --- include/linux/tee_core.h | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tee_core.h b/include/linux/tee_core.h index ee5f0bd41f43..f993d5118edd 100644 --- a/include/linux/tee_core.h +++ b/include/linux/tee_core.h @@ -50,7 +50,7 @@ enum tee_dma_heap_id { * @dev: embedded basic device structure * @cdev: embedded cdev * @num_users: number of active users of this device - * @c_no_user: completion used when unregistering the device + * @c_no_users: completion used when unregistering the device * @mutex: mutex protecting @num_users and @idr * @idr: register of user space shared memory objects allocated or * registered on this device @@ -132,6 +132,7 @@ struct tee_driver_ops { /* Size for TEE revision string buffer used by get_tee_revision(). */ #define TEE_REVISION_STR_SIZE 128 +#define TEE_DESC_PRIVILEGED 0x1 /** * struct tee_desc - Describes the TEE driver to the subsystem * @name: name of driver @@ -139,7 +140,6 @@ struct tee_driver_ops { * @owner: module providing the driver * @flags: Extra properties of driver, defined by TEE_DESC_* below */ -#define TEE_DESC_PRIVILEGED 0x1 struct tee_desc { const char *name; const struct tee_driver_ops *ops; @@ -187,7 +187,7 @@ struct tee_protmem_pool_ops { * Allocates a new struct tee_device instance. The device is * removed by tee_device_unregister(). * - * @returns a pointer to a 'struct tee_device' or an ERR_PTR on failure + * @returns: a pointer to a 'struct tee_device' or an ERR_PTR on failure */ struct tee_device *tee_device_alloc(const struct tee_desc *teedesc, struct device *dev, @@ -201,7 +201,7 @@ struct tee_device *tee_device_alloc(const struct tee_desc *teedesc, * tee_device_unregister() need to be called to remove the @teedev if * this function fails. * - * @returns < 0 on failure + * @returns: < 0 on failure */ int tee_device_register(struct tee_device *teedev); @@ -254,14 +254,14 @@ void tee_device_set_dev_groups(struct tee_device *teedev, * tee_session_calc_client_uuid() - Calculates client UUID for session * @uuid: Resulting UUID * @connection_method: Connection method for session (TEE_IOCTL_LOGIN_*) - * @connectuon_data: Connection data for opening session + * @connection_data: Connection data for opening session * * Based on connection method calculates UUIDv5 based client UUID. * * For group based logins verifies that calling process has specified * credentials. * - * @return < 0 on failure + * @returns: < 0 on failure */ int tee_session_calc_client_uuid(uuid_t *uuid, u32 connection_method, const u8 connection_data[TEE_IOCTL_UUID_LEN]); @@ -295,7 +295,7 @@ struct tee_shm_pool_ops { * @paddr: Physical address of start of pool * @size: Size in bytes of the pool * - * @returns pointer to a 'struct tee_shm_pool' or an ERR_PTR on failure. + * @returns: pointer to a 'struct tee_shm_pool' or an ERR_PTR on failure. */ struct tee_shm_pool *tee_shm_pool_alloc_res_mem(unsigned long vaddr, phys_addr_t paddr, size_t size, @@ -318,14 +318,16 @@ static inline void tee_shm_pool_free(struct tee_shm_pool *pool) * @paddr: Physical address of start of pool * @size: Size in bytes of the pool * - * @returns pointer to a 'struct tee_protmem_pool' or an ERR_PTR on failure. + * @returns: pointer to a 'struct tee_protmem_pool' or an ERR_PTR on failure. */ struct tee_protmem_pool *tee_protmem_static_pool_alloc(phys_addr_t paddr, size_t size); /** * tee_get_drvdata() - Return driver_data pointer - * @returns the driver_data pointer supplied to tee_register(). + * @teedev: Pointer to the tee_device + * + * @returns: the driver_data pointer supplied to tee_register(). */ void *tee_get_drvdata(struct tee_device *teedev); @@ -334,7 +336,7 @@ void *tee_get_drvdata(struct tee_device *teedev); * TEE driver * @ctx: The TEE context for shared memory allocation * @size: Shared memory allocation size - * @returns a pointer to 'struct tee_shm' on success or an ERR_PTR on failure + * @returns: a pointer to 'struct tee_shm' on success or an ERR_PTR on failure */ struct tee_shm *tee_shm_alloc_priv_buf(struct tee_context *ctx, size_t size); @@ -354,7 +356,7 @@ void tee_dyn_shm_free_helper(struct tee_shm *shm, /** * tee_shm_is_dynamic() - Check if shared memory object is of the dynamic kind * @shm: Shared memory handle - * @returns true if object is dynamic shared memory + * @returns: true if object is dynamic shared memory */ static inline bool tee_shm_is_dynamic(struct tee_shm *shm) { @@ -370,7 +372,7 @@ void tee_shm_put(struct tee_shm *shm); /** * tee_shm_get_id() - Get id of a shared memory object * @shm: Shared memory handle - * @returns id + * @returns: id */ static inline int tee_shm_get_id(struct tee_shm *shm) { @@ -382,7 +384,7 @@ static inline int tee_shm_get_id(struct tee_shm *shm) * count * @ctx: Context owning the shared memory * @id: Id of shared memory object - * @returns a pointer to 'struct tee_shm' on success or an ERR_PTR on failure + * @returns: a pointer to 'struct tee_shm' on success or an ERR_PTR on failure */ struct tee_shm *tee_shm_get_from_id(struct tee_context *ctx, int id); @@ -402,7 +404,7 @@ static inline bool tee_param_is_memref(struct tee_param *param) * teedev_open() - Open a struct tee_device * @teedev: Device to open * - * @return a pointer to struct tee_context on success or an ERR_PTR on failure. + * @returns: pointer to struct tee_context on success or an ERR_PTR on failure. */ struct tee_context *teedev_open(struct tee_device *teedev); -- cgit v1.2.3 From 5eb608319bb56464674a71b4a66ea65c6c435d64 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 27 Jan 2026 17:56:01 -0500 Subject: vt: save/restore unicode screen buffer for alternate screen The alternate screen support added by commit 23743ba64709 ("vt: add support for smput/rmput escape codes") only saves and restores the regular screen buffer (vc_origin), but completely ignores the corresponding unicode screen buffer (vc_uni_lines) creating a messed-up display. Add vc_saved_uni_lines to save the unicode screen buffer when entering the alternate screen, and restore it when leaving. Also ensure proper cleanup in reset_terminal() and vc_deallocate(). Fixes: 23743ba64709 ("vt: add support for smput/rmput escape codes") Cc: stable Signed-off-by: Nicolas Pitre Link: https://patch.msgid.link/5o2p6qp3-91pq-0p17-or02-1oors4417ns7@onlyvoer.pbz Signed-off-by: Greg Kroah-Hartman --- include/linux/console_struct.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h index 13b35637bd5a..d5ca855116df 100644 --- a/include/linux/console_struct.h +++ b/include/linux/console_struct.h @@ -160,6 +160,7 @@ struct vc_data { struct uni_pagedict **uni_pagedict_loc; /* [!] Location of uni_pagedict variable for this console */ u32 **vc_uni_lines; /* unicode screen content */ u16 *vc_saved_screen; + u32 **vc_saved_uni_lines; unsigned int vc_saved_cols; unsigned int vc_saved_rows; /* additional information is in vt_kern.h */ -- cgit v1.2.3 From fe2511adb1fc1814df06ca11e0d8a92f792e4029 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 1 Mar 2026 13:30:17 +0100 Subject: sysfs: constify group arrays in function arguments Constify the groups array argument where applicable. This allows to pass constant arrays as arguments. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/17035265-8882-4101-b7a7-16b3eb94f8b5@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 99b775f3ff46..9777e9445dd5 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -445,15 +445,15 @@ void sysfs_delete_link(struct kobject *dir, struct kobject *targ, int __must_check sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp); int __must_check sysfs_create_groups(struct kobject *kobj, - const struct attribute_group **groups); + const struct attribute_group *const *groups); int __must_check sysfs_update_groups(struct kobject *kobj, - const struct attribute_group **groups); + const struct attribute_group *const *groups); int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_remove_groups(struct kobject *kobj, - const struct attribute_group **groups); + const struct attribute_group *const *groups); int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group); void sysfs_remove_file_from_group(struct kobject *kobj, @@ -486,7 +486,7 @@ int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid); int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, const char *name, kuid_t kuid, kgid_t kgid); int sysfs_groups_change_owner(struct kobject *kobj, - const struct attribute_group **groups, + const struct attribute_group *const *groups, kuid_t kuid, kgid_t kgid); int sysfs_group_change_owner(struct kobject *kobj, const struct attribute_group *groups, kuid_t kuid, @@ -629,13 +629,13 @@ static inline int sysfs_create_group(struct kobject *kobj, } static inline int sysfs_create_groups(struct kobject *kobj, - const struct attribute_group **groups) + const struct attribute_group *const *groups) { return 0; } static inline int sysfs_update_groups(struct kobject *kobj, - const struct attribute_group **groups) + const struct attribute_group *const *groups) { return 0; } @@ -652,7 +652,7 @@ static inline void sysfs_remove_group(struct kobject *kobj, } static inline void sysfs_remove_groups(struct kobject *kobj, - const struct attribute_group **groups) + const struct attribute_group *const *groups) { } @@ -733,7 +733,7 @@ static inline int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t k } static inline int sysfs_groups_change_owner(struct kobject *kobj, - const struct attribute_group **groups, + const struct attribute_group *const *groups, kuid_t kuid, kgid_t kgid) { return 0; -- cgit v1.2.3 From ece5283706aff6791a37894bafbb0c134a94c0f3 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 1 Mar 2026 13:31:02 +0100 Subject: driver: core: constify groups array argument in device_add_groups and device_remove_groups Now that sysfs_create_groups() and sysfs_remove_groups() allow to pass constant groups arrays, we can constify the groups array argument also here. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/8ea2d6d1-0adb-4d7f-92bc-751e93ce08d6@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 0be95294b6e6..48a0444ccc1e 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1131,9 +1131,9 @@ device_create_with_groups(const struct class *cls, struct device *parent, dev_t void device_destroy(const struct class *cls, dev_t devt); int __must_check device_add_groups(struct device *dev, - const struct attribute_group **groups); + const struct attribute_group *const *groups); void device_remove_groups(struct device *dev, - const struct attribute_group **groups); + const struct attribute_group *const *groups); static inline int __must_check device_add_group(struct device *dev, const struct attribute_group *grp) -- cgit v1.2.3 From 10f874dc92b3f3bf96470d997bdf157b289c9d4c Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 1 Mar 2026 13:31:56 +0100 Subject: driver core: make struct class groups members constant arrays Constify the groups arrays, allowing to assign constant arrays. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/7ff56b07-09ca-4948-b98f-5bd37ceef21e@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/device/class.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device/class.h b/include/linux/device/class.h index 65880e60c720..2079239a5aa5 100644 --- a/include/linux/device/class.h +++ b/include/linux/device/class.h @@ -50,8 +50,8 @@ struct fwnode_handle; struct class { const char *name; - const struct attribute_group **class_groups; - const struct attribute_group **dev_groups; + const struct attribute_group *const *class_groups; + const struct attribute_group *const *dev_groups; int (*dev_uevent)(const struct device *dev, struct kobj_uevent_env *env); char *(*devnode)(const struct device *dev, umode_t *mode); -- cgit v1.2.3 From cb0caadb64ca0894c4a24e1a34841f260d462f90 Mon Sep 17 00:00:00 2001 From: Shayne Chen Date: Fri, 13 Mar 2026 14:21:49 +0800 Subject: wifi: ieee80211: fix definition of EHT-MCS 15 in MRU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to the definition in IEEE Std 802.11be-2024, Table 9-417r, each bit indicates support for the transmission and reception of EHT-MCS 15 in: - B0: 52+26-tone and 106+26-tone MRUs. - B1: a 484+242-tone MRU if 80 MHz is supported. - B2: a 996+484-tone MRU and a 996+484+242-tone MRU if 160 MHz is supported. - B3: a 3×996-tone MRU if 320 MHz is supported. Fixes: 6239da18d2f9 ("wifi: mac80211: adjust EHT capa when lowering bandwidth") Signed-off-by: Shayne Chen Link: https://patch.msgid.link/20260313062150.3165433-1-shayne.chen@mediatek.com Signed-off-by: Johannes Berg --- include/linux/ieee80211-eht.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211-eht.h b/include/linux/ieee80211-eht.h index f8e9f5d36d2a..a97b1d01f3ac 100644 --- a/include/linux/ieee80211-eht.h +++ b/include/linux/ieee80211-eht.h @@ -251,8 +251,8 @@ struct ieee80211_eht_operation_info { #define IEEE80211_EHT_PHY_CAP5_SUPP_EXTRA_EHT_LTF 0x40 #define IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK 0x07 -#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_80MHZ 0x08 -#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_160MHZ 0x30 +#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_80MHZ 0x10 +#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_160MHZ 0x20 #define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_320MHZ 0x40 #define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK 0x78 #define IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP 0x80 -- cgit v1.2.3 From 7caedbb5ade345df0eec0bf01035c780919a9f56 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 9 Mar 2026 13:37:02 -0700 Subject: integrity: Eliminate weak definition of arch_get_secureboot() security/integrity/secure_boot.c contains a single __weak function, which breaks recordmcount when building with clang: $ make -skj"$(nproc)" ARCH=powerpc LLVM=1 ppc64_defconfig security/integrity/secure_boot.o Cannot find symbol for section 2: .text. security/integrity/secure_boot.o: failed Introduce a Kconfig symbol, CONFIG_HAVE_ARCH_GET_SECUREBOOT, to indicate that an architecture provides a definition of arch_get_secureboot(). Provide a static inline stub when this symbol is not defined to achieve the same effect as the __weak function, allowing secure_boot.c to be removed altogether. Move the s390 definition of arch_get_secureboot() out of the CONFIG_KEXEC_FILE block to ensure it is always available, as it does not actually depend on KEXEC_FILE. Reported-by: Arnd Bergmann Fixes: 31a6a07eefeb ("integrity: Make arch_ima_get_secureboot integrity-wide") Signed-off-by: Nathan Chancellor Acked-by: Arnd Bergmann Signed-off-by: Mimi Zohar --- include/linux/secure_boot.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/secure_boot.h b/include/linux/secure_boot.h index 3ded3f03655c..d17e92351567 100644 --- a/include/linux/secure_boot.h +++ b/include/linux/secure_boot.h @@ -10,10 +10,14 @@ #include +#ifdef CONFIG_HAVE_ARCH_GET_SECUREBOOT /* * Returns true if the platform secure boot is enabled. * Returns false if disabled or not supported. */ bool arch_get_secureboot(void); +#else +static inline bool arch_get_secureboot(void) { return false; } +#endif #endif /* _LINUX_SECURE_BOOT_H */ -- cgit v1.2.3 From 97e6fabee5dcb2d86d4ff45f20606b8a73181f74 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 13 Mar 2026 13:53:03 +0100 Subject: driver core: auxiliary bus: Introduce dev_is_auxiliary() Introduce dev_is_auxiliary() in analogy with dev_is_platform() to facilitate subsequent changes. Signed-off-by: Rafael J. Wysocki Reviewed-by: Danilo Krummrich Reviewed-by: Greg Kroah-Hartman Link: https://patch.msgid.link/5079467.GXAFRqVoOG@rafael.j.wysocki --- include/linux/auxiliary_bus.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/auxiliary_bus.h b/include/linux/auxiliary_bus.h index 4086afd0cc6b..bc09b55e3682 100644 --- a/include/linux/auxiliary_bus.h +++ b/include/linux/auxiliary_bus.h @@ -271,6 +271,8 @@ struct auxiliary_device *__devm_auxiliary_device_create(struct device *dev, __devm_auxiliary_device_create(dev, KBUILD_MODNAME, devname, \ platform_data, 0) +bool dev_is_auxiliary(struct device *dev); + /** * module_auxiliary_driver() - Helper macro for registering an auxiliary driver * @__auxiliary_driver: auxiliary driver struct -- cgit v1.2.3 From 98d709cba3193f0bec54da4cd76ef499ea2f1ef7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Mar 2026 09:43:22 -1000 Subject: sched_ext: Implement SCX_ENQ_IMMED Add SCX_ENQ_IMMED enqueue flag for local DSQ insertions. Once a task is dispatched with IMMED, it either gets on the CPU immediately and stays on it, or gets reenqueued back to the BPF scheduler. It will never linger on a local DSQ behind other tasks or on a CPU taken by a higher-priority class. rq_is_open() uses rq->next_class to determine whether the rq is available, and wakeup_preempt_scx() triggers reenqueue when a higher-priority class task arrives. These capture all higher class preemptions. Combined with reenqueue points in the dispatch path, all cases where an IMMED task would not execute immediately are covered. SCX_TASK_IMMED persists in p->scx.flags until the next fresh enqueue, so the guarantee survives SAVE/RESTORE cycles. If preempted while running, put_prev_task_scx() reenqueues through ops.enqueue() with SCX_TASK_REENQ_PREEMPTED instead of silently placing the task back on the local DSQ. This enables tighter scheduling latency control by preventing tasks from piling up on local DSQs. It also enables opportunistic CPU sharing across sub-schedulers - without this, a sub-scheduler can stuff the local DSQ of a shared CPU, making it difficult for others to use. v2: - Rewrite is_curr_done() as rq_is_open() using rq->next_class and implement wakeup_preempt_scx() to achieve complete coverage of all cases where IMMED tasks could get stranded. - Track IMMED persistently in p->scx.flags and reenqueue preempted-while-running tasks through ops.enqueue(). - Bound deferred reenq cycles (SCX_REENQ_LOCAL_MAX_REPEAT). - Misc renames, documentation. Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 60a4f65d0174..602dc83cab36 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -100,6 +100,7 @@ enum scx_ent_flags { SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */ SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ SCX_TASK_SUB_INIT = 1 << 4, /* task being initialized for a sub sched */ + SCX_TASK_IMMED = 1 << 5, /* task is on local DSQ with %SCX_ENQ_IMMED */ /* * Bits 8 and 9 are used to carry task state: @@ -125,6 +126,8 @@ enum scx_ent_flags { * * NONE not being reenqueued * KFUNC reenqueued by scx_bpf_dsq_reenq() and friends + * IMMED reenqueued due to failed ENQ_IMMED + * PREEMPTED preempted while running */ SCX_TASK_REENQ_REASON_SHIFT = 12, SCX_TASK_REENQ_REASON_BITS = 2, @@ -132,6 +135,8 @@ enum scx_ent_flags { SCX_TASK_REENQ_NONE = 0 << SCX_TASK_REENQ_REASON_SHIFT, SCX_TASK_REENQ_KFUNC = 1 << SCX_TASK_REENQ_REASON_SHIFT, + SCX_TASK_REENQ_IMMED = 2 << SCX_TASK_REENQ_REASON_SHIFT, + SCX_TASK_REENQ_PREEMPTED = 3 << SCX_TASK_REENQ_REASON_SHIFT, /* iteration cursor, not a task */ SCX_TASK_CURSOR = 1 << 31, -- cgit v1.2.3 From 82b6c1b542ea0530318c6f2a880d884eb4dce49f Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 2 Mar 2026 17:29:05 +0100 Subject: of: Add of_machine_get_match() helper Currently, there are two helpers to match the root compatible value against an of_device_id array: - of_machine_device_match() returns true if a match is found, - of_machine_get_match_data() returns the match data if a match is found. However, there is no helper that returns the actual of_device_id structure corresponding to the match, leading to code duplication in various drivers. Fix this by reworking of_machine_device_match() to return the actual match structure, and renaming it to of_machine_get_match(). Retain the old of_machine_device_match() functionality using a cheap static inline wrapper around the new of_machine_get_match() helper. Signed-off-by: Geert Uytterhoeven Acked-by: Viresh Kumar Link: https://patch.msgid.link/14e1c03d443b1a5f210609ec3a1ebbaeab8fb3d9.1772468323.git.geert+renesas@glider.be Signed-off-by: Rob Herring (Arm) --- include/linux/of.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index be6ec4916adf..b4d7d33b0ceb 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -410,7 +410,7 @@ extern int of_alias_get_id(const struct device_node *np, const char *stem); extern int of_alias_get_highest_id(const char *stem); bool of_machine_compatible_match(const char *const *compats); -bool of_machine_device_match(const struct of_device_id *matches); +const struct of_device_id *of_machine_get_match(const struct of_device_id *matches); const void *of_machine_get_match_data(const struct of_device_id *matches); /** @@ -866,9 +866,9 @@ static inline bool of_machine_compatible_match(const char *const *compats) return false; } -static inline bool of_machine_device_match(const struct of_device_id *matches) +static inline const struct of_device_id *of_machine_get_match(const struct of_device_id *matches) { - return false; + return NULL; } static inline const void * @@ -976,6 +976,11 @@ static inline int of_numa_init(void) } #endif +static inline bool of_machine_device_match(const struct of_device_id *matches) +{ + return of_machine_get_match(matches) != NULL; +} + static inline struct device_node *of_find_matching_node( struct device_node *from, const struct of_device_id *matches) -- cgit v1.2.3 From d7eafe655b741dfc241d5b920f6d2cea45b568d9 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Sun, 1 Mar 2026 06:13:16 +0800 Subject: dma-mapping: Separate DMA sync issuing and completion waiting Currently, arch_sync_dma_for_cpu and arch_sync_dma_for_device always wait for the completion of each DMA buffer. That is, issuing the DMA sync and waiting for completion is done in a single API call. For scatter-gather lists with multiple entries, this means issuing and waiting is repeated for each entry, which can hurt performance. Architectures like ARM64 may be able to issue all DMA sync operations for all entries first and then wait for completion together. To address this, arch_sync_dma_for_* now batches DMA operations and performs a flush afterward. On ARM64, the flush is implemented with a dsb instruction in arch_sync_dma_flush(). On other architectures, arch_sync_dma_flush() is currently a nop. Cc: Leon Romanovsky Cc: Catalin Marinas Cc: Will Deacon Cc: Marek Szyprowski Cc: Robin Murphy Cc: Ada Couprie Diaz Cc: Ard Biesheuvel Cc: Marc Zyngier Cc: Anshuman Khandual Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Joerg Roedel Cc: Stefano Stabellini Cc: Oleksandr Tyshchenko Cc: Tangquan Zheng Reviewed-by: Juergen Gross # drivers/xen/swiotlb-xen.c Tested-by: Xueyuan Chen Signed-off-by: Barry Song Reviewed-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260228221316.59934-1-21cnbao@gmail.com --- include/linux/dma-map-ops.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 60b63756df82..8a07df5a9ef6 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -361,6 +361,12 @@ static inline void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, } #endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */ +#ifndef CONFIG_ARCH_HAS_BATCHED_DMA_SYNC +static inline void arch_sync_dma_flush(void) +{ +} +#endif + #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL void arch_sync_dma_for_cpu_all(void); #else -- cgit v1.2.3 From 74f0cca1100b6d1f1ea28178435aff8078d06603 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 11 Mar 2026 05:19:56 +0000 Subject: udp: Remove UDPLITE_SEND_CSCOV and UDPLITE_RECV_CSCOV. UDP-Lite supports variable-length checksum and has two socket options, UDPLITE_SEND_CSCOV and UDPLITE_RECV_CSCOV, to control the checksum coverage. Let's remove the support. setsockopt(UDPLITE_SEND_CSCOV / UDPLITE_RECV_CSCOV) was only available for UDP-Lite and returned -ENOPROTOOPT for UDP. Now, the options are handled in ip_setsockopt() and ipv6_setsockopt(), which still return the same error. getsockopt(UDPLITE_SEND_CSCOV / UDPLITE_RECV_CSCOV) was available for UDP and always returned 0, meaning full checksum, but now -ENOPROTOOPT is returned. Given that getsockopt() is meaningless for UDP and even the options are not defined under include/uapi/, this should not be a problem. $ man 7 udplite ... BUGS Where glibc support is missing, the following definitions are needed: #define IPPROTO_UDPLITE 136 #define UDPLITE_SEND_CSCOV 10 #define UDPLITE_RECV_CSCOV 11 Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260311052020.1213705-10-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/udp.h | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index 1cbf6b4d3aab..ce56ebcee5cb 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -40,8 +40,6 @@ enum { UDP_FLAGS_ACCEPT_FRAGLIST, UDP_FLAGS_ACCEPT_L4, UDP_FLAGS_ENCAP_ENABLED, /* This socket enabled encap */ - UDP_FLAGS_UDPLITE_SEND_CC, /* set via udplite setsockopt */ - UDP_FLAGS_UDPLITE_RECV_CC, /* set via udplite setsockopt */ }; /* per NUMA structure for lockless producer usage. */ @@ -74,11 +72,7 @@ struct udp_sock { */ __u16 len; /* total length of pending frames */ __u16 gso_size; - /* - * Fields specific to UDP-Lite. - */ - __u16 pcslen; - __u16 pcrlen; + /* * For encapsulation sockets. */ @@ -236,8 +230,6 @@ static inline void udp_allow_gso(struct sock *sk) hlist_nulls_for_each_entry_rcu(__up, node, list, udp_lrpa_node) #endif -#define IS_UDPLITE(__sk) (unlikely(__sk->sk_protocol == IPPROTO_UDPLITE)) - static inline struct sock *udp_tunnel_sk(const struct net *net, bool is_ipv6) { #if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) -- cgit v1.2.3 From 203247c5cb972af5d46bdb7d41ef40078048810b Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 13 Mar 2026 07:47:00 -0700 Subject: blk-integrity: support arbitrary buffer alignment A bio segment may have partial interval block data with the rest continuing into the next segments because direct-io data payloads only need to align in memory to the device's DMA limits. At the same time, the protection information may also be split in multiple segments. The most likely way that may happen is if two requests merge, or if we're directly using the io_uring user metadata. The generate/verify, however, only ever accessed the first bip_vec. Further, it may be possible to unalign the protection fields from the user space buffer, or if there are odd additional opaque bytes in front or in back of the protection information metadata region. Change up the iteration to allow spanning multiple segments. This patch is mostly a re-write of the protection information handling to allow any arbitrary alignments, so it's probably easier to review the end result rather than the diff. Many controllers are not able to handle interval data composed of multiple segments when PI is used, so this patch introduces a new integrity limit that a low level driver can set to notify that it is capable, default to false. The nvme driver is the first one to enable it in this patch. Everyone else will force DMA alignment to the logical block size as before to ensure interval data is always aligned within a single segment. Reviewed-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Link: https://patch.msgid.link/20260313144701.1221652-2-kbusch@meta.com Signed-off-by: Jens Axboe --- include/linux/blk-integrity.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index ea6d7d322ae3..b1b530613c34 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -14,6 +14,7 @@ enum blk_integrity_flags { BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2, BLK_INTEGRITY_REF_TAG = 1 << 3, BLK_INTEGRITY_STACKED = 1 << 4, + BLK_SPLIT_INTERVAL_CAPABLE = 1 << 5, }; const char *blk_integrity_profile_name(struct blk_integrity *bi); -- cgit v1.2.3 From 0e24d17bd9668f9dad78ede6a0e8f13dab176682 Mon Sep 17 00:00:00 2001 From: Simon Baatz Date: Mon, 9 Mar 2026 09:02:26 +0100 Subject: tcp: implement RFC 7323 window retraction receiver requirements By default, the Linux TCP implementation does not shrink the advertised window (RFC 7323 calls this "window retraction") with the following exceptions: - When an incoming segment cannot be added due to the receive buffer running out of memory. Since commit 8c670bdfa58e ("tcp: correct handling of extreme memory squeeze") a zero window will be advertised in this case. It turns out that reaching the required memory pressure is easy when window scaling is in use. In the simplest case, sending a sufficient number of segments smaller than the scale factor to a receiver that does not read data is enough. - Commit b650d953cd39 ("tcp: enforce receive buffer memory limits by allowing the tcp window to shrink") addressed the "eating memory" problem by introducing a sysctl knob that allows shrinking the window before running out of memory. However, RFC 7323 does not only state that shrinking the window is necessary in some cases, it also formulates requirements for TCP implementations when doing so (Section 2.4). This commit addresses the receiver-side requirements: After retracting the window, the peer may have a snd_nxt that lies within a previously advertised window but is now beyond the retracted window. This means that all incoming segments (including pure ACKs) will be rejected until the application happens to read enough data to let the peer's snd_nxt be in window again (which may be never). To comply with RFC 7323, the receiver MUST honor any segment that would have been in window for any ACK sent by the receiver and, when window scaling is in effect, SHOULD track the maximum window sequence number it has advertised. This patch tracks that maximum window sequence number rcv_mwnd_seq throughout the connection and uses it in tcp_sequence() when deciding whether a segment is acceptable. rcv_mwnd_seq is updated together with rcv_wup and rcv_wnd in tcp_select_window(). If we count tcp_sequence() as fast path, it is read in the fast path. Therefore, rcv_mwnd_seq is put into rcv_wnd's cacheline group. The logic for handling received data in tcp_data_queue() is already sufficient and does not need to be updated. Signed-off-by: Simon Baatz Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260309-tcp_rfc7323_retract_wnd_rfc-v3-1-4c7f96b1ec69@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index bcebc4f07532..6982f10e826b 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -316,6 +316,9 @@ struct tcp_sock { */ u32 app_limited; /* limited until "delivered" reaches this val */ u32 rcv_wnd; /* Current receiver window */ + u32 rcv_mwnd_seq; /* Maximum window sequence number (RFC 7323, + * section 2.4, receiver requirements) + */ u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ /* * Options received (usually on last packet, some only on SYN packets). -- cgit v1.2.3 From f807b5b9b89eb9220d034115c272c312251cbcac Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 12 Mar 2026 12:13:52 +0000 Subject: net: stmmac: avoid passing pci_dev The pci_dev is only used to provide the ethtool bus_info using pci_name(priv->plat->pdev). This is the same as dev_name(priv->device). Thus, rather than passing the pci_dev, make use of what we already have. To avoid unexpectedly exposing the device name through ethtool where it wasn't provided before, add a flag priv->plat->provide_bus_info to enable this, which only dwmac-intel needs to set. Signed-off-by: Russell King (Oracle) Reviewed-by: Simon Horman Link: https://patch.msgid.link/E1w0evI-0000000CzY7-1fyo@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 937985276e6b..72febd246bdb 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -348,7 +348,7 @@ struct plat_stmmacenet_data { int rss_en; int mac_port_sel_speed; u8 vlan_fail_q; - struct pci_dev *pdev; + bool provide_bus_info; int int_snapshot_num; int msi_mac_vec; int msi_wol_vec; -- cgit v1.2.3 From 6df1459605cedd2112ebf660c77f42bb87d5c306 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 9 Mar 2026 18:03:31 +0100 Subject: net: phy: make mdio_device.c part of libphy This patch - makes mdio_device.c part of libphy - makes mdio_device_(un)register_reset() static - moves mdiobus_(un)register_device() from mdio_bus.c to mdio_device.c, stops exporting both functions and makes them private to phylib This further decouples the MDIO consumer functionality from libphy. Note: This makes MDIO driver registration part of phylib, therefore adjust Kconfig dependencies where needed. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/c6dbf9b3-3ca0-434b-ad3a-71fe602ab809@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/mdio.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 5d1203b9af20..f4f9d9609448 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -688,8 +688,6 @@ static inline int mdiodev_c45_write(struct mdio_device *mdiodev, u32 devad, val); } -int mdiobus_register_device(struct mdio_device *mdiodev); -int mdiobus_unregister_device(struct mdio_device *mdiodev); bool mdiobus_is_registered_device(struct mii_bus *bus, int addr); struct phy_device *mdiobus_get_phy(struct mii_bus *bus, int addr); -- cgit v1.2.3 From c4399af5e55658e832779b256d8458323011f983 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 9 Mar 2026 18:06:00 +0100 Subject: net: phy: move remaining provider code to mdio_bus_provider.c This moves definition of mdio_bus class and bus_type to the provider side, what allows to make them private to libphy. As a prerequisite MDIO statistics handling is moved to the provider side as well. Note: This patch causes a checkpatch error "Macros with complex values should be enclosed in parentheses" for MDIO_BUS_STATS_ADDR_ATTR_GROUP. I consider this a false positive here, in addition the patch just moves existing code. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/47b85676-b349-4aa0-a5ef-cd37769a4c69@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index e9b0d7427b0e..5de4b172cd0b 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2446,9 +2446,6 @@ int __phy_hwtstamp_set(struct phy_device *phydev, struct phy_port *phy_get_sfp_port(struct phy_device *phydev); -extern const struct bus_type mdio_bus_type; -extern const struct class mdio_bus_class; - /** * phy_module_driver() - Helper macro for registering PHY drivers * @__phy_drivers: array of PHY drivers to register -- cgit v1.2.3 From 2a8c8a03f306e21a0ea74c93d4332119557f4575 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 12 Mar 2026 11:04:07 +0100 Subject: net/mlx5: Add a shared devlink instance for PFs on same chip Use the previously introduced shared devlink infrastructure to create a shared devlink instance for mlx5 PFs that reside on the same physical chip. The shared instance is identified by the chip's serial number extracted from PCI VPD (V3 keyword, with fallback to serial number for older devices). Each PF that probes calls mlx5_shd_init() which extracts the chip serial number and uses devlink_shd_get() to get or create the shared instance. When a PF is removed, mlx5_shd_uninit() calls devlink_shd_put() to release the reference. The shared instance is automatically destroyed when the last PF is removed. Make the PF devlink instances nested in this shared devlink instance, allowing userspace to identify which PFs belong to the same physical chip. Example: pci/0000:08:00.0: index 0 nested_devlink: auxiliary/mlx5_core.eth.0 devlink_index/1: index 1 nested_devlink: pci/0000:08:00.0 pci/0000:08:00.1 auxiliary/mlx5_core.eth.0: index 2 pci/0000:08:00.1: index 3 nested_devlink: auxiliary/mlx5_core.eth.1 auxiliary/mlx5_core.eth.1: index 4 Signed-off-by: Jiri Pirko Link: https://patch.msgid.link/20260312100407.551173-14-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 04dcd09f7517..1268fcf35ec7 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -798,6 +798,7 @@ struct mlx5_core_dev { enum mlx5_wc_state wc_state; /* sync write combining state */ struct mutex wc_state_lock; + struct devlink *shd; }; struct mlx5_db { -- cgit v1.2.3 From 0834d6f4abd0ca35b5706d267a6e4b78303a95de Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 12 Mar 2026 14:02:29 +0100 Subject: PCI: endpoint: Do not mark the BAR succeeding a 64-bit BAR as BAR_RESERVED A BAR that can only be configured as a 64-bit BAR by an EPC driver is marked as such using the "only_64bit" flag. Currently, the documentation says that an EPC driver should explicitly mark the BAR succeeding an "only_64bit" BAR as BAR_RESERVED. However, a 64-bit BAR will always take up two BARs. It is thus redundant to mark both BARs. pci_epc_get_next_free_bar() already skips the BAR succeeding a "only_64bit" BAR, regardless if the succeeding BAR is marked as BAR_RESERVED or not. Thus, drop the BAR_RESERVED for a BAR succeeding a "only_64bit" BAR. No functional changes. Suggested-by: Manivannan Sadhasivam Signed-off-by: Niklas Cassel Signed-off-by: Manivannan Sadhasivam Link: https://patch.msgid.link/20260312130229.2282001-13-cassel@kernel.org --- include/linux/pci-epc.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index c021c7af175f..c981ea7d52c0 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -206,8 +206,7 @@ enum pci_epc_bar_type { * @fixed_size: the fixed size, only applicable if type is BAR_FIXED_MASK. * @only_64bit: if true, an EPF driver is not allowed to choose if this BAR * should be configured as 32-bit or 64-bit, the EPF driver must - * configure this BAR as 64-bit. Additionally, the BAR succeeding - * this BAR must be set to type BAR_RESERVED. + * configure this BAR as 64-bit. * * only_64bit should not be set on a BAR of type BAR_RESERVED. * (If BARx is a 64-bit BAR that an EPF driver is not allowed to -- cgit v1.2.3 From 27ce1d8ecb9b9ae025b9e9e199845624bc950998 Mon Sep 17 00:00:00 2001 From: Manikanta Maddireddy Date: Thu, 12 Mar 2026 14:02:30 +0100 Subject: PCI: endpoint: Allow only_64bit on BAR_RESERVED Remove the documentation that forbids setting only_64bit on a BAR of type BAR_RESERVED. When a reserved BAR is 64-bit by default, setting only_64bit is the most accurate description. If we later add support to disable a reserved BAR (e.g. disable_bar() for BARs that were never set via set_bar()), the implementation will need to clear the adjacent BAR (upper 32 bits) as well; having only_64bit set documents that requirement. Signed-off-by: Manikanta Maddireddy Signed-off-by: Niklas Cassel Signed-off-by: Manivannan Sadhasivam Link: https://patch.msgid.link/20260312130229.2282001-14-cassel@kernel.org --- include/linux/pci-epc.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index c981ea7d52c0..5c59f5606869 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -207,11 +207,6 @@ enum pci_epc_bar_type { * @only_64bit: if true, an EPF driver is not allowed to choose if this BAR * should be configured as 32-bit or 64-bit, the EPF driver must * configure this BAR as 64-bit. - * - * only_64bit should not be set on a BAR of type BAR_RESERVED. - * (If BARx is a 64-bit BAR that an EPF driver is not allowed to - * touch, then both BARx and BARx+1 must be set to type - * BAR_RESERVED.) */ struct pci_epc_bar_desc { enum pci_epc_bar_type type; -- cgit v1.2.3 From f51644eb40a73677fcd0c92d8174eddde5d0be0e Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Thu, 12 Mar 2026 14:02:31 +0100 Subject: PCI: endpoint: Describe reserved subregions within BARs Some endpoint controllers expose platform-owned, fixed register windows within a BAR that EPF drivers must not reprogram (e.g. a BAR marked BAR_RESERVED). Even in that case, EPF drivers may need to reference a well-defined subset of that BAR, e.g. to reuse an integrated DMA controller MMIO window as a doorbell target. Introduce struct pci_epc_bar_rsvd_region and extend struct pci_epc_bar_desc so EPC drivers can advertise such fixed subregions in a controller-agnostic way. No functional change for existing users. Signed-off-by: Koichiro Den Signed-off-by: Niklas Cassel Signed-off-by: Manivannan Sadhasivam Tested-by: Manikanta Maddireddy Tested-by: Koichiro Den Reviewed-by: Frank Li Link: https://patch.msgid.link/20260312130229.2282001-15-cassel@kernel.org --- include/linux/pci-epc.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index 5c59f5606869..ebcdf70aa9b9 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -200,6 +200,30 @@ enum pci_epc_bar_type { BAR_RESERVED, }; +/** + * enum pci_epc_bar_rsvd_region_type - type of a fixed subregion behind a BAR + * @PCI_EPC_BAR_RSVD_DMA_CTRL_MMIO: Integrated DMA controller MMIO window + * + * BARs marked BAR_RESERVED are owned by the SoC/EPC hardware and must not be + * reprogrammed by EPF drivers. Some of them still expose fixed subregions that + * EPFs may want to reference (e.g. embedded doorbell fallback). + */ +enum pci_epc_bar_rsvd_region_type { + PCI_EPC_BAR_RSVD_DMA_CTRL_MMIO = 0, +}; + +/** + * struct pci_epc_bar_rsvd_region - fixed subregion behind a BAR + * @type: reserved region type + * @offset: offset within the BAR aperture + * @size: size of the reserved region + */ +struct pci_epc_bar_rsvd_region { + enum pci_epc_bar_rsvd_region_type type; + resource_size_t offset; + resource_size_t size; +}; + /** * struct pci_epc_bar_desc - hardware description for a BAR * @type: the type of the BAR @@ -207,11 +231,15 @@ enum pci_epc_bar_type { * @only_64bit: if true, an EPF driver is not allowed to choose if this BAR * should be configured as 32-bit or 64-bit, the EPF driver must * configure this BAR as 64-bit. + * @nr_rsvd_regions: number of fixed subregions described for BAR_RESERVED + * @rsvd_regions: fixed subregions behind BAR_RESERVED */ struct pci_epc_bar_desc { enum pci_epc_bar_type type; u64 fixed_size; bool only_64bit; + u8 nr_rsvd_regions; + const struct pci_epc_bar_rsvd_region *rsvd_regions; }; /** -- cgit v1.2.3 From 33642e9e36dc084e4fc9245a266c9843bc8303b9 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 12 Mar 2026 14:02:33 +0100 Subject: PCI: endpoint: Introduce pci_epc_bar_type BAR_DISABLED Add a pci_epc_bar_type BAR_DISABLED to more clearly differentiate from BAR_RESERVED. This BAR type will only be used to describe a BAR that the EPC driver should disable, and will thus never be available to an EPF driver. (Unlike BAR_RESERVED, which will never be disabled by default by an EPC driver.) Co-developed-by: Manikanta Maddireddy Signed-off-by: Manikanta Maddireddy Signed-off-by: Niklas Cassel Signed-off-by: Manivannan Sadhasivam Tested-by: Koichiro Den Tested-by: Manikanta Maddireddy Reviewed-by: Frank Li Reviewed-by: Manikanta Maddireddy Link: https://patch.msgid.link/20260312130229.2282001-17-cassel@kernel.org --- include/linux/pci-epc.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index ebcdf70aa9b9..334c2b7578d0 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -191,13 +191,21 @@ struct pci_epc { * @BAR_RESIZABLE: The BAR implements the PCI-SIG Resizable BAR Capability. * NOTE: An EPC driver can currently only set a single supported * size. - * @BAR_RESERVED: The BAR should not be touched by an EPF driver. + * @BAR_RESERVED: Used for HW-backed BARs (e.g. MSI-X table, DMA regs). The BAR + * should not be disabled by an EPC driver. The BAR should not be + * reprogrammed by an EPF driver. An EPF driver is allowed to + * disable the BAR if absolutely necessary. (However, right now + * there is no EPC operation to disable a BAR that has not been + * programmed using pci_epc_set_bar().) + * @BAR_DISABLED: The BAR should be disabled by an EPC driver. The BAR will be + * unavailable to an EPF driver. */ enum pci_epc_bar_type { BAR_PROGRAMMABLE = 0, BAR_FIXED, BAR_RESIZABLE, BAR_RESERVED, + BAR_DISABLED, }; /** -- cgit v1.2.3 From 45c2a55d13c698ba6a281315676934c44225b034 Mon Sep 17 00:00:00 2001 From: Unnathi Chalicheemala Date: Thu, 5 Mar 2026 19:12:05 -0800 Subject: soc: qcom: llcc: Add per-slice counter and common llcc slice descriptor Fix incorrect slice activation/deactivation accounting by replacing the bitmap-based activation tracking with per-slice atomic reference counters. This resolves mismatches that occur when multiple client drivers vote for the same slice or when llcc_slice_getd() is called multiple times. As part of this fix, simplify slice descriptor handling by eliminating dynamic allocation. llcc_slice_getd() now returns a pointer to a preallocated descriptor, removing the need for repeated allocation/free cycles and ensuring consistent reference tracking across all users. Signed-off-by: Unnathi Chalicheemala Signed-off-by: Francisco Munoz Ruiz Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20260305-external_llcc_changes1set-v1-1-6347e52e648e@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- include/linux/soc/qcom/llcc-qcom.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h index 8243ab3a12a8..227125d84318 100644 --- a/include/linux/soc/qcom/llcc-qcom.h +++ b/include/linux/soc/qcom/llcc-qcom.h @@ -91,10 +91,12 @@ * struct llcc_slice_desc - Cache slice descriptor * @slice_id: llcc slice id * @slice_size: Size allocated for the llcc slice + * @refcount: Atomic counter to track activate/deactivate calls */ struct llcc_slice_desc { u32 slice_id; size_t slice_size; + refcount_t refcount; }; /** @@ -152,11 +154,10 @@ struct llcc_edac_reg_offset { * @edac_reg_offset: Offset of the LLCC EDAC registers * @lock: mutex associated with each slice * @cfg_size: size of the config data table - * @max_slices: max slices as read from device tree * @num_banks: Number of llcc banks - * @bitmap: Bit map to track the active slice ids * @ecc_irq: interrupt for llcc cache error detection and reporting * @ecc_irq_configured: 'True' if firmware has already configured the irq propagation + * @desc: Array pointer of pre-allocated LLCC slice descriptors * @version: Indicates the LLCC version */ struct llcc_drv_data { @@ -167,12 +168,11 @@ struct llcc_drv_data { const struct llcc_edac_reg_offset *edac_reg_offset; struct mutex lock; u32 cfg_size; - u32 max_slices; u32 num_banks; - unsigned long *bitmap; int ecc_irq; bool ecc_irq_configured; u32 version; + struct llcc_slice_desc *desc; }; #if IS_ENABLED(CONFIG_QCOM_LLCC) -- cgit v1.2.3 From 641f6fda143b879da1515f821ee475073678cf2a Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Thu, 29 Jan 2026 20:53:20 +0530 Subject: soc: qcom: pd-mapper: Fix element length in servreg_loc_pfr_req_ei It looks element length declared in servreg_loc_pfr_req_ei for reason not matching servreg_loc_pfr_req's reason field due which we could observe decoding error on PD crash. qmi_decode_string_elem: String len 81 >= Max Len 65 Fix this by matching with servreg_loc_pfr_req's reason field. Fixes: 1ebcde047c54 ("soc: qcom: add pd-mapper implementation") Signed-off-by: Mukesh Ojha Reviewed-by: Dmitry Baryshkov Tested-by: Nikita Travkin Link: https://lore.kernel.org/r/20260129152320.3658053-2-mukesh.ojha@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- include/linux/soc/qcom/pdr.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/pdr.h b/include/linux/soc/qcom/pdr.h index 83a8ea612e69..2b7691e47c2a 100644 --- a/include/linux/soc/qcom/pdr.h +++ b/include/linux/soc/qcom/pdr.h @@ -5,6 +5,7 @@ #include #define SERVREG_NAME_LENGTH 64 +#define SERVREG_PFR_LENGTH 256 struct pdr_service; struct pdr_handle; -- cgit v1.2.3 From e4ee7621d732162ea2ec714ae76dac2f70519417 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 10 Mar 2026 00:03:30 +0100 Subject: soc: qcom: qmi: Enumerate the service IDs of QMI The QMI framework proposes a set of services which are defined by an integer identifier. The different QMI client lookup for the services via this identifier. Moreover, the function qmi_add_lookup() and qmi_add_server() must match the service ID but the code in different places set the same value but with a different macro name. These macros are spreaded across the different subsystems implementing the protocols associated with a service. It would make more sense to define them in the QMI header for the sake of consistency and clarity. This change use an unified naming for the services and enumerate the ones implemented in the Linux kernel. More services can come later and put the service ID in this same header. Signed-off-by: Daniel Lezcano Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20260309230346.3584252-2-daniel.lezcano@oss.qualcomm.com [bjorn: Lower case hex constants] Signed-off-by: Bjorn Andersson --- include/linux/soc/qcom/qmi.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/qmi.h b/include/linux/soc/qcom/qmi.h index 291cdc7ef49c..b9dcb437a0be 100644 --- a/include/linux/soc/qcom/qmi.h +++ b/include/linux/soc/qcom/qmi.h @@ -92,6 +92,18 @@ struct qmi_elem_info { #define QMI_ERR_INCOMPATIBLE_STATE_V01 90 #define QMI_ERR_NOT_SUPPORTED_V01 94 +/* + * Enumerate the IDs of the QMI services + */ +#define QMI_SERVICE_ID_TEST 0x0f /* 15 */ +#define QMI_SERVICE_ID_SSCTL 0x2b /* 43 */ +#define QMI_SERVICE_ID_IPA 0x31 /* 49 */ +#define QMI_SERVICE_ID_SERVREG_LOC 0x40 /* 64 */ +#define QMI_SERVICE_ID_SERVREG_NOTIF 0x42 /* 66 */ +#define QMI_SERVICE_ID_WLFW 0x45 /* 69 */ +#define QMI_SERVICE_ID_SLIMBUS 0x301 /* 769 */ +#define QMI_SERVICE_ID_USB_AUDIO_STREAM 0x41d /* 1053 */ + /** * struct qmi_response_type_v01 - common response header (decoded) * @result: result of the transaction -- cgit v1.2.3 From dea046e7f46f2357124a465e058c92cac3e351c5 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 9 Mar 2026 13:42:41 +0100 Subject: gpio: remove machine hogs With no more users, remove legacy machine hog API from the kernel. Reviewed-by: Linus Walleij Reviewed-by: Andy Shevchenko Link: https://patch.msgid.link/20260309-gpio-hog-fwnode-v2-5-4e61f3dbf06a@oss.qualcomm.com Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/machine.h | 33 --------------------------------- 1 file changed, 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/machine.h b/include/linux/gpio/machine.h index 44e5f162973e..5eb88f5d0630 100644 --- a/include/linux/gpio/machine.h +++ b/include/linux/gpio/machine.h @@ -46,23 +46,6 @@ struct gpiod_lookup_table { struct gpiod_lookup table[]; }; -/** - * struct gpiod_hog - GPIO line hog table - * @chip_label: name of the chip the GPIO belongs to - * @chip_hwnum: hardware number (i.e. relative to the chip) of the GPIO - * @line_name: consumer name for the hogged line - * @lflags: bitmask of gpio_lookup_flags GPIO_* values - * @dflags: GPIO flags used to specify the direction and value - */ -struct gpiod_hog { - struct list_head list; - const char *chip_label; - u16 chip_hwnum; - const char *line_name; - unsigned long lflags; - int dflags; -}; - /* * Helper for lookup tables with just one single lookup for a device. */ @@ -95,24 +78,10 @@ static struct gpiod_lookup_table _name = { \ .flags = _flags, \ } -/* - * Simple definition of a single GPIO hog in an array. - */ -#define GPIO_HOG(_chip_label, _chip_hwnum, _line_name, _lflags, _dflags) \ -(struct gpiod_hog) { \ - .chip_label = _chip_label, \ - .chip_hwnum = _chip_hwnum, \ - .line_name = _line_name, \ - .lflags = _lflags, \ - .dflags = _dflags, \ -} - #ifdef CONFIG_GPIOLIB void gpiod_add_lookup_table(struct gpiod_lookup_table *table); void gpiod_add_lookup_tables(struct gpiod_lookup_table **tables, size_t n); void gpiod_remove_lookup_table(struct gpiod_lookup_table *table); -void gpiod_add_hogs(struct gpiod_hog *hogs); -void gpiod_remove_hogs(struct gpiod_hog *hogs); #else /* ! CONFIG_GPIOLIB */ static inline void gpiod_add_lookup_table(struct gpiod_lookup_table *table) {} @@ -120,8 +89,6 @@ static inline void gpiod_add_lookup_tables(struct gpiod_lookup_table **tables, size_t n) {} static inline void gpiod_remove_lookup_table(struct gpiod_lookup_table *table) {} -static inline void gpiod_add_hogs(struct gpiod_hog *hogs) {} -static inline void gpiod_remove_hogs(struct gpiod_hog *hogs) {} #endif /* CONFIG_GPIOLIB */ #endif /* __LINUX_GPIO_MACHINE_H */ -- cgit v1.2.3 From a25f48fd920b557e6ad02f692f690520c82f5914 Mon Sep 17 00:00:00 2001 From: Alban Bedel Date: Wed, 11 Mar 2026 15:31:20 +0100 Subject: gpio: kempld: Implement the interrupt controller Add a GPIO IRQ chip implementation for the kempld GPIO controller. Of note is only how the parent IRQ is obtained. The IRQ for the GPIO controller can be configured in the BIOS, along with the IRQ for the I2C controller. These IRQ are returned by ACPI but this information is only usable if both IRQ are configured. When only one is configured, only one is returned making it impossible to know which one it is. Luckily the BIOS will set the configured IRQ in the PLD registers, so it can be read from there instead, and that also work on platforms without ACPI. The vendor driver allowed to override the IRQ using a module parameters, so there are boards in field which used this parameter instead of properly configuring the BIOS. This implementation provides this as well for compatibility. Signed-off-by: Alban Bedel Link: https://patch.msgid.link/20260311143120.2179347-5-alban.bedel@lht.dlh.de Signed-off-by: Bartosz Golaszewski --- include/linux/mfd/kempld.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mfd/kempld.h b/include/linux/mfd/kempld.h index 643c096b93ac..2dbd80abfd1d 100644 --- a/include/linux/mfd/kempld.h +++ b/include/linux/mfd/kempld.h @@ -37,6 +37,7 @@ #define KEMPLD_SPEC_GET_MINOR(x) (x & 0x0f) #define KEMPLD_SPEC_GET_MAJOR(x) ((x >> 4) & 0x0f) #define KEMPLD_IRQ_GPIO 0x35 +#define KEMPLD_IRQ_GPIO_MASK 0x0f #define KEMPLD_IRQ_I2C 0x36 #define KEMPLD_CFG 0x37 #define KEMPLD_CFG_GPIO_I2C_MUX (1 << 0) -- cgit v1.2.3 From 428c56525bf5dbc3bd5e30014df1f5213f8bd7c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Fri, 13 Mar 2026 09:22:18 +0100 Subject: jump_label: use ATOMIC_INIT() for initialization of .enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently ATOMIC_INIT() is not used because in the past that macro was provided by linux/atomic.h which is not usable from linux/jump_label.h. However since commit 7ca8cf5347f7 ("locking/atomic: Move ATOMIC_INIT into linux/types.h") the macro only requires linux/types.h. Remove the now unnecessary workaround and the associated assertions. Signed-off-by: Thomas Weißschuh Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260313-jump_label-cleanup-v2-1-35d3c0bde549@linutronix.de --- include/linux/jump_label.h | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index fdb79dd1ebd8..e494b360d36d 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -238,18 +238,11 @@ extern void static_key_enable_cpuslocked(struct static_key *key); extern void static_key_disable_cpuslocked(struct static_key *key); extern enum jump_label_type jump_label_init_type(struct jump_entry *entry); -/* - * We should be using ATOMIC_INIT() for initializing .enabled, but - * the inclusion of atomic.h is problematic for inclusion of jump_label.h - * in 'low-level' headers. Thus, we are initializing .enabled with a - * raw value, but have added a BUILD_BUG_ON() to catch any issues in - * jump_label_init() see: kernel/jump_label.c. - */ #define STATIC_KEY_INIT_TRUE \ - { .enabled = { 1 }, \ + { .enabled = ATOMIC_INIT(1), \ { .type = JUMP_TYPE_TRUE } } #define STATIC_KEY_INIT_FALSE \ - { .enabled = { 0 }, \ + { .enabled = ATOMIC_INIT(0), \ { .type = JUMP_TYPE_FALSE } } #else /* !CONFIG_JUMP_LABEL */ -- cgit v1.2.3 From acb38872d4cbec5b6825345d9d757e21d2d9d953 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Fri, 13 Mar 2026 09:22:19 +0100 Subject: jump_label: remove workaround for old compilers in initializations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The extra braces for the initialization of the anonymous union members were added in commit cd8d860dcce9 ("jump_label: Fix anonymous union initialization") to compensate for limitations in gcc < 4.6. Versions of gcc this old are not supported anymore, so drop the workaround. Signed-off-by: Thomas Weißschuh Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260313-jump_label-cleanup-v2-2-35d3c0bde549@linutronix.de --- include/linux/jump_label.h | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index e494b360d36d..b9c7b0ebf7b9 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -87,13 +87,6 @@ struct static_key { atomic_t enabled; #ifdef CONFIG_JUMP_LABEL /* - * Note: - * To make anonymous unions work with old compilers, the static - * initialization of them requires brackets. This creates a dependency - * on the order of the struct with the initializers. If any fields - * are added, STATIC_KEY_INIT_TRUE and STATIC_KEY_INIT_FALSE may need - * to be modified. - * * bit 0 => 1 if key is initially true * 0 if initially false * bit 1 => 1 if points to struct static_key_mod @@ -240,10 +233,10 @@ extern enum jump_label_type jump_label_init_type(struct jump_entry *entry); #define STATIC_KEY_INIT_TRUE \ { .enabled = ATOMIC_INIT(1), \ - { .type = JUMP_TYPE_TRUE } } + .type = JUMP_TYPE_TRUE } #define STATIC_KEY_INIT_FALSE \ { .enabled = ATOMIC_INIT(0), \ - { .type = JUMP_TYPE_FALSE } } + .type = JUMP_TYPE_FALSE } #else /* !CONFIG_JUMP_LABEL */ -- cgit v1.2.3 From 2deccd5c862a0337a691bcfaa87919b4216e6103 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 9 Mar 2026 17:40:42 +0100 Subject: cleanup: Optimize guards Andrew reported that a guard() conversion of zone_lock increased the code size unnecessarily. It turns out the unconditional __GUARD_IS_ERR() is to blame. As explored earlier [1], __GUARD_IS_ERR(), similar to IS_ERR_OR_NULL(), generates somewhat sub-optimal code. However, looking at things again, it is possible to avoid doing the __GUARD_IS_ERR() unconditionally. Revert the normal destructors to a simple NULL test and only add the IS_ERR bit to COND guards. This cures the reported overhead; as compiled by GCC-16: page_alloc.o: pre: Total: Before=45299, After=45371, chg +0.16% post: Total: Before=45299, After=45026, chg -0.60% [1] https://lkml.kernel.org/r/20250513085001.GC25891@noisy.programming.kicks-ass.net Reported-by: Andrew Morton Signed-off-by: Peter Zijlstra (Intel) Tested-by: Dan Williams Link: https://patch.msgid.link/20260309164516.GE606826@noisy.programming.kicks-ass.net --- include/linux/cleanup.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index dbc4162921e9..ea95ca4bc11c 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -286,15 +286,18 @@ static __always_inline _type class_##_name##_constructor(_init_args) \ __no_context_analysis \ { _type t = _init; return t; } -#define EXTEND_CLASS(_name, ext, _init, _init_args...) \ -typedef lock_##_name##_t lock_##_name##ext##_t; \ +#define EXTEND_CLASS_COND(_name, ext, _cond, _init, _init_args...) \ +typedef lock_##_name##_t lock_##_name##ext##_t; \ typedef class_##_name##_t class_##_name##ext##_t; \ -static __always_inline void class_##_name##ext##_destructor(class_##_name##_t *p) \ -{ class_##_name##_destructor(p); } \ +static __always_inline void class_##_name##ext##_destructor(class_##_name##_t *_T) \ +{ if (_cond) return; class_##_name##_destructor(_T); } \ static __always_inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ __no_context_analysis \ { class_##_name##_t t = _init; return t; } +#define EXTEND_CLASS(_name, ext, _init, _init_args...) \ + EXTEND_CLASS_COND(_name, ext, 0, _init, _init_args) + #define CLASS(_name, var) \ class_##_name##_t var __cleanup(class_##_name##_destructor) = \ class_##_name##_constructor @@ -394,12 +397,12 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond __DEFINE_GUARD_LOCK_PTR(_name, _T) #define DEFINE_GUARD(_name, _type, _lock, _unlock) \ - DEFINE_CLASS(_name, _type, if (!__GUARD_IS_ERR(_T)) { _unlock; }, ({ _lock; _T; }), _type _T); \ + DEFINE_CLASS(_name, _type, if (_T) { _unlock; }, ({ _lock; _T; }), _type _T); \ DEFINE_CLASS_IS_GUARD(_name) #define DEFINE_GUARD_COND_4(_name, _ext, _lock, _cond) \ __DEFINE_CLASS_IS_CONDITIONAL(_name##_ext, true); \ - EXTEND_CLASS(_name, _ext, \ + EXTEND_CLASS_COND(_name, _ext, __GUARD_IS_ERR(*_T), \ ({ void *_t = _T; int _RET = (_lock); if (_T && !(_cond)) _t = ERR_PTR(_RET); _t; }), \ class_##_name##_t _T) \ static __always_inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ @@ -488,7 +491,7 @@ typedef struct { \ static __always_inline void class_##_name##_destructor(class_##_name##_t *_T) \ __no_context_analysis \ { \ - if (!__GUARD_IS_ERR(_T->lock)) { _unlock; } \ + if (_T->lock) { _unlock; } \ } \ \ __DEFINE_GUARD_LOCK_PTR(_name, &_T->lock) @@ -565,7 +568,7 @@ __DEFINE_LOCK_GUARD_0(_name, _lock) #define DEFINE_LOCK_GUARD_1_COND_4(_name, _ext, _lock, _cond) \ __DEFINE_CLASS_IS_CONDITIONAL(_name##_ext, true); \ - EXTEND_CLASS(_name, _ext, \ + EXTEND_CLASS_COND(_name, _ext, __GUARD_IS_ERR(_T->lock), \ ({ class_##_name##_t _t = { .lock = l }, *_T = &_t;\ int _RET = (_lock); \ if (_T->lock && !(_cond)) _T->lock = ERR_PTR(_RET);\ -- cgit v1.2.3 From 756a0e011cfca0b45a48464aa25b05d9a9c2fb0b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 13 Mar 2026 10:15:07 -0700 Subject: locking: Fix rwlock support in Architecture support for rwlocks must be available whether or not CONFIG_DEBUG_SPINLOCK has been defined. Move the definitions of the arch_{read,write}_{lock,trylock,unlock}() macros such that these become visbile if CONFIG_DEBUG_SPINLOCK=n. This patch prepares for converting do_raw_{read,write}_trylock() into inline functions. Without this patch that conversion triggers a build failure for UP architectures, e.g. arm-ep93xx. I used the following kernel configuration to build the kernel for that architecture: CONFIG_ARCH_MULTIPLATFORM=y CONFIG_ARCH_MULTI_V7=n CONFIG_ATAGS=y CONFIG_MMU=y CONFIG_ARCH_MULTI_V4T=y CONFIG_CPU_LITTLE_ENDIAN=y CONFIG_ARCH_EP93XX=y Fixes: fb1c8f93d869 ("[PATCH] spinlock consolidation") Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260313171510.230998-2-bvanassche@acm.org --- include/linux/spinlock_up.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h index 1e84e71ca495..3a50976471d7 100644 --- a/include/linux/spinlock_up.h +++ b/include/linux/spinlock_up.h @@ -48,16 +48,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) lock->slock = 1; } -/* - * Read-write spinlocks. No debug version. - */ -#define arch_read_lock(lock) do { barrier(); (void)(lock); } while (0) -#define arch_write_lock(lock) do { barrier(); (void)(lock); } while (0) -#define arch_read_trylock(lock) ({ barrier(); (void)(lock); 1; }) -#define arch_write_trylock(lock) ({ barrier(); (void)(lock); 1; }) -#define arch_read_unlock(lock) do { barrier(); (void)(lock); } while (0) -#define arch_write_unlock(lock) do { barrier(); (void)(lock); } while (0) - #else /* DEBUG_SPINLOCK */ #define arch_spin_is_locked(lock) ((void)(lock), 0) /* for sched/core.c and kernel_lock.c: */ @@ -68,4 +58,14 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) #define arch_spin_is_contended(lock) (((void)(lock), 0)) +/* + * Read-write spinlocks. No debug version. + */ +#define arch_read_lock(lock) do { barrier(); (void)(lock); } while (0) +#define arch_write_lock(lock) do { barrier(); (void)(lock); } while (0) +#define arch_read_trylock(lock) ({ barrier(); (void)(lock); 1; }) +#define arch_write_trylock(lock) ({ barrier(); (void)(lock); 1; }) +#define arch_read_unlock(lock) do { barrier(); (void)(lock); } while (0) +#define arch_write_unlock(lock) do { barrier(); (void)(lock); } while (0) + #endif /* __LINUX_SPINLOCK_UP_H */ -- cgit v1.2.3 From c4d3b8c77d85082d32250c505beb1d9e46ee47ee Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 13 Mar 2026 10:15:08 -0700 Subject: locking: Add lock context support in do_raw_{read,write}_trylock() Convert do_raw_{read,write}_trylock() from macros into inline functions and annotate these inline functions with __cond_acquires_shared() or __cond_acquires() as appropriate. This change is necessary to build kernel drivers or subsystems that use rwlock synchronization objects with lock context analysis enabled. The return type 'int' matches the return type for CONFIG_DEBUG_SPINLOCK=y. Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260313171510.230998-3-bvanassche@acm.org --- include/linux/rwlock.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h index 21ceefc4a49f..4e67cd934d8f 100644 --- a/include/linux/rwlock.h +++ b/include/linux/rwlock.h @@ -37,10 +37,20 @@ extern int do_raw_write_trylock(rwlock_t *lock) __cond_acquires(true, lock); extern void do_raw_write_unlock(rwlock_t *lock) __releases(lock); #else # define do_raw_read_lock(rwlock) do {__acquire_shared(lock); arch_read_lock(&(rwlock)->raw_lock); } while (0) -# define do_raw_read_trylock(rwlock) arch_read_trylock(&(rwlock)->raw_lock) +static inline int do_raw_read_trylock(rwlock_t *rwlock) + __cond_acquires_shared(true, rwlock) + __no_context_analysis +{ + return arch_read_trylock(&(rwlock)->raw_lock); +} # define do_raw_read_unlock(rwlock) do {arch_read_unlock(&(rwlock)->raw_lock); __release_shared(lock); } while (0) # define do_raw_write_lock(rwlock) do {__acquire(lock); arch_write_lock(&(rwlock)->raw_lock); } while (0) -# define do_raw_write_trylock(rwlock) arch_write_trylock(&(rwlock)->raw_lock) +static inline int do_raw_write_trylock(rwlock_t *rwlock) + __cond_acquires(true, rwlock) + __no_context_analysis +{ + return arch_write_trylock(&(rwlock)->raw_lock); +} # define do_raw_write_unlock(rwlock) do {arch_write_unlock(&(rwlock)->raw_lock); __release(lock); } while (0) #endif -- cgit v1.2.3 From cb15d8e6cbe8d085ac585016deb2e1e0107b99e5 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 14 Mar 2026 23:56:49 +0100 Subject: ASoC: codec: arizona: Convert to use GPIO descriptors This converts the Arizona driver to use GPIO descriptors exclusively, deletes the legacy code path an updates the in-tree user of legacy GPIO. The GPIO lines for mic detect polarity and headphone ID detection are made exclusively descriptor-oriented. The headphone ID detection could actually only be used by the legacy GPIO code, but I converted it to use a descriptor if someone would actually need it so we don't just drop useful code. The compatible "wlf,hpdet-id-gpio" is not in the device tree bindings and only intended to be used by software nodes if any. If someone insists I can try to add a binding for it, but I doubt there is any real user so it seems pointless. Signed-off-by: Linus Walleij Reviewed-by: Charles Keepax Reviewed-by: Bartosz Golaszewski Acked-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20260314-asoc-arizona-v1-1-ecc9a165307c@kernel.org Signed-off-by: Mark Brown --- include/linux/mfd/arizona/pdata.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/arizona/pdata.h b/include/linux/mfd/arizona/pdata.h index f72e6d4b14a7..d465dcd8c90a 100644 --- a/include/linux/mfd/arizona/pdata.h +++ b/include/linux/mfd/arizona/pdata.h @@ -117,11 +117,6 @@ struct arizona_pdata { /** Check for line output with HPDET method */ bool hpdet_acc_id_line; -#ifdef CONFIG_GPIOLIB_LEGACY - /** GPIO used for mic isolation with HPDET */ - int hpdet_id_gpio; -#endif - /** Channel to use for headphone detection */ unsigned int hpdet_channel; @@ -131,11 +126,6 @@ struct arizona_pdata { /** Extra debounce timeout used during initial mic detection (ms) */ unsigned int micd_detect_debounce; -#ifdef CONFIG_GPIOLIB_LEGACY - /** GPIO for mic detection polarity */ - int micd_pol_gpio; -#endif - /** Mic detect ramp rate */ unsigned int micd_bias_start_time; -- cgit v1.2.3 From f8e761655997cc0ee434fb5f35570d2e93d3a707 Mon Sep 17 00:00:00 2001 From: Alexei Lazar Date: Mon, 9 Mar 2026 11:34:27 +0200 Subject: net/mlx5: Add IFC bits for shared headroom pool PBMC support Add hardware interface definitions for shared headroom pool (SHP) in port buffer management: - shp_pbmc_pbsr_support: capability bit in PCAM enhanced features indicating device support for shared headroom pool in PBMC/PBSR. - shared_headroom_pool: buffer entry in PBMC register (pbmc_reg_bits) for the shared headroom pool configuration, reusing the bufferx layout; reduce trailing reserved region accordingly. Signed-off-by: Alexei Lazar Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260309093435.1850724-2-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index a3948b36820d..a76c54bf1927 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -10845,7 +10845,9 @@ struct mlx5_ifc_pcam_enhanced_features_bits { u8 fec_200G_per_lane_in_pplm[0x1]; u8 reserved_at_1e[0x2a]; u8 fec_100G_per_lane_in_pplm[0x1]; - u8 reserved_at_49[0xa]; + u8 reserved_at_49[0x2]; + u8 shp_pbmc_pbsr_support[0x1]; + u8 reserved_at_4c[0x7]; u8 buffer_ownership[0x1]; u8 resereved_at_54[0x14]; u8 fec_50G_per_lane_in_pplm[0x1]; @@ -12090,8 +12092,9 @@ struct mlx5_ifc_pbmc_reg_bits { u8 port_buffer_size[0x10]; struct mlx5_ifc_bufferx_reg_bits buffer[10]; + struct mlx5_ifc_bufferx_reg_bits shared_headroom_pool; - u8 reserved_at_2e0[0x80]; + u8 reserved_at_320[0x40]; }; struct mlx5_ifc_sbpr_reg_bits { -- cgit v1.2.3 From 691dffc7255e740bc3df1c68b50b36786aadeb3a Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Mon, 9 Mar 2026 11:34:28 +0200 Subject: net/mlx5: Add silent mode set/query and VHCA RX IFC bits Update the mlx5 IFC headers with newly defined capability and command-layout bits: - Add silent_mode_query and rename silent_mode to silent_mode_set cap fields. - Add forward_vhca_rx and MLX5_IFC_FLOW_DESTINATION_TYPE_VHCA_RX. - Expose silent mode fields in the L2 table query command structures. Update the SD support check to use the new capability name (silent_mode_set) to match the updated IFC definition. Signed-off-by: Shay Drory Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260309093435.1850724-3-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index a76c54bf1927..8fa4fb3d36cf 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -469,7 +469,8 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 table_miss_action_domain[0x1]; u8 termination_table[0x1]; u8 reformat_and_fwd_to_table[0x1]; - u8 reserved_at_1a[0x2]; + u8 forward_vhca_rx[0x1]; + u8 reserved_at_1b[0x1]; u8 ipsec_encrypt[0x1]; u8 ipsec_decrypt[0x1]; u8 sw_owner_v2[0x1]; @@ -2012,12 +2013,14 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 disable_local_lb_mc[0x1]; u8 log_min_hairpin_wq_data_sz[0x5]; u8 reserved_at_3e8[0x1]; - u8 silent_mode[0x1]; + u8 silent_mode_set[0x1]; u8 vhca_state[0x1]; u8 log_max_vlan_list[0x5]; u8 reserved_at_3f0[0x3]; u8 log_max_current_mc_list[0x5]; - u8 reserved_at_3f8[0x3]; + u8 reserved_at_3f8[0x1]; + u8 silent_mode_query[0x1]; + u8 reserved_at_3fa[0x1]; u8 log_max_current_uc_list[0x5]; u8 general_obj_types[0x40]; @@ -2279,6 +2282,7 @@ enum mlx5_ifc_flow_destination_type { MLX5_IFC_FLOW_DESTINATION_TYPE_VPORT = 0x0, MLX5_IFC_FLOW_DESTINATION_TYPE_FLOW_TABLE = 0x1, MLX5_IFC_FLOW_DESTINATION_TYPE_TIR = 0x2, + MLX5_IFC_FLOW_DESTINATION_TYPE_VHCA_RX = 0x4, MLX5_IFC_FLOW_DESTINATION_TYPE_FLOW_SAMPLER = 0x6, MLX5_IFC_FLOW_DESTINATION_TYPE_UPLINK = 0x8, MLX5_IFC_FLOW_DESTINATION_TYPE_TABLE_TYPE = 0xA, @@ -6265,7 +6269,9 @@ struct mlx5_ifc_query_l2_table_entry_out_bits { u8 reserved_at_40[0xa0]; - u8 reserved_at_e0[0x13]; + u8 reserved_at_e0[0x11]; + u8 silent_mode[0x1]; + u8 reserved_at_f2[0x1]; u8 vlan_valid[0x1]; u8 vlan[0xc]; @@ -6281,7 +6287,10 @@ struct mlx5_ifc_query_l2_table_entry_in_bits { u8 reserved_at_20[0x10]; u8 op_mod[0x10]; - u8 reserved_at_40[0x60]; + u8 reserved_at_40[0x40]; + + u8 silent_mode_query[0x1]; + u8 reserved_at_81[0x1f]; u8 reserved_at_a0[0x8]; u8 table_index[0x18]; -- cgit v1.2.3 From 971b28accc09436fe6a6d5afd667dcbfb3ed7e03 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Mon, 9 Mar 2026 11:34:32 +0200 Subject: net/mlx5: LAG, replace mlx5_get_dev_index with LAG sequence number Introduce mlx5_lag_get_dev_seq() which returns a device's sequence number within the LAG: master is always 0, remaining devices numbered sequentially. This provides a stable index for peer flow tracking and vport ordering without depending on native_port_num. Replace mlx5_get_dev_index() usage in en_tc.c (peer flow array indexing) and ib_rep.c (vport index ordering) with the new API. Signed-off-by: Shay Drory Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260309093435.1850724-7-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/lag.h | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 include/linux/mlx5/lag.h (limited to 'include/linux') diff --git a/include/linux/mlx5/lag.h b/include/linux/mlx5/lag.h new file mode 100644 index 000000000000..d370dfd19055 --- /dev/null +++ b/include/linux/mlx5/lag.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#ifndef __MLX5_LAG_API_H__ +#define __MLX5_LAG_API_H__ + +struct mlx5_core_dev; + +int mlx5_lag_get_dev_seq(struct mlx5_core_dev *dev); + +#endif /* __MLX5_LAG_API_H__ */ -- cgit v1.2.3 From 0bc9059fab6365feaf95cc9a796a3d381915a70f Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Mon, 9 Mar 2026 11:34:33 +0200 Subject: net/mlx5: Add VHCA RX flow destination support for FW steering Introduce MLX5_FLOW_DESTINATION_TYPE_VHCA_RX as a new flow steering destination type. Wire the new destination through flow steering command setup by mapping it to MLX5_IFC_FLOW_DESTINATION_TYPE_VHCA_RX and passing the vhca id, extend forward-destination validation to accept it, and teach the flow steering tracepoint formatter to print rx_vhca_id. Signed-off-by: Shay Drory Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260309093435.1850724-8-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/fs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 9cadb1d5e6df..02064424e868 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -55,6 +55,7 @@ enum mlx5_flow_destination_type { MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM, MLX5_FLOW_DESTINATION_TYPE_RANGE, MLX5_FLOW_DESTINATION_TYPE_TABLE_TYPE, + MLX5_FLOW_DESTINATION_TYPE_VHCA_RX, }; enum { @@ -189,6 +190,9 @@ struct mlx5_flow_destination { u32 ft_num; struct mlx5_flow_table *ft; struct mlx5_fc *counter; + struct { + u16 id; + } vhca; struct { u16 num; u16 vhca_id; -- cgit v1.2.3 From d6c9b4de8109a3b4ca9c6c6b7c5fbc42cfeff9ae Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Mon, 9 Mar 2026 11:34:34 +0200 Subject: {net/RDMA}/mlx5: Add LAG demux table API and vport demux rules Downstream patches will introduce SW-only LAG (e.g. shared_fdb without HW LAG). In this mode the firmware cannot create the LAG demux table, but vport demuxing is still required. Move LAG demux flow-table ownership to the LAG layer and introduce APIs to init/cleanup the demux table and add/delete per-vport rules. Adjust the RDMA driver to use the new APIs. In this mode, the LAG layer will create a flow group that matches vport metadata. Vports that are not native to the LAG master eswitch add the demux rule during IB representor load and remove it on unload. The demux rule forward traffic from said vports to their native eswitch manager via a new dest type - MLX5_FLOW_DESTINATION_TYPE_VHCA_RX. Signed-off-by: Shay Drory Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260309093435.1850724-9-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/fs.h | 6 +++--- include/linux/mlx5/lag.h | 10 ++++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 02064424e868..d8f3b7ef319e 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -252,9 +252,9 @@ mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns, struct mlx5_flow_table * mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns, struct mlx5_flow_table_attr *ft_attr, u16 vport); -struct mlx5_flow_table *mlx5_create_lag_demux_flow_table( - struct mlx5_flow_namespace *ns, - int prio, u32 level); +struct mlx5_flow_table * +mlx5_create_lag_demux_flow_table(struct mlx5_flow_namespace *ns, + struct mlx5_flow_table_attr *ft_attr); int mlx5_destroy_flow_table(struct mlx5_flow_table *ft); /* inbox should be set with the following values: diff --git a/include/linux/mlx5/lag.h b/include/linux/mlx5/lag.h index d370dfd19055..ab9f754664e5 100644 --- a/include/linux/mlx5/lag.h +++ b/include/linux/mlx5/lag.h @@ -4,8 +4,18 @@ #ifndef __MLX5_LAG_API_H__ #define __MLX5_LAG_API_H__ +#include + struct mlx5_core_dev; +struct mlx5_flow_table; +struct mlx5_flow_table_attr; +int mlx5_lag_demux_init(struct mlx5_core_dev *dev, + struct mlx5_flow_table_attr *ft_attr); +void mlx5_lag_demux_cleanup(struct mlx5_core_dev *dev); +int mlx5_lag_demux_rule_add(struct mlx5_core_dev *dev, u16 vport_num, + int vport_index); +void mlx5_lag_demux_rule_del(struct mlx5_core_dev *dev, int vport_index); int mlx5_lag_get_dev_seq(struct mlx5_core_dev *dev); #endif /* __MLX5_LAG_API_H__ */ -- cgit v1.2.3 From 4dd2115f43594da5271a1aa34fde6719b4259047 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 9 Mar 2026 11:34:35 +0200 Subject: net/mlx5: Expose MLX5_UMR_ALIGN definition Expose HW constant value in a shared header, to be used by core/EN drivers. Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260309093435.1850724-10-tariqt@nvidia.com Reviewed-by: Dragos Tatulea Signed-off-by: Leon Romanovsky --- include/linux/mlx5/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 25c6b42140b2..07a25f264292 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -293,6 +293,7 @@ enum { MLX5_UMR_INLINE = (1 << 7), }; +#define MLX5_UMR_ALIGN (2048) #define MLX5_UMR_FLEX_ALIGNMENT 0x40 #define MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_FLEX_ALIGNMENT / sizeof(struct mlx5_mtt)) #define MLX5_UMR_KLM_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_FLEX_ALIGNMENT / sizeof(struct mlx5_klm)) -- cgit v1.2.3 From f1a424e21c15993db0f9594cda17ef5d516ab3e9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 14 Mar 2026 08:41:04 -0600 Subject: io_uring: switch struct io_ring_ctx internal bitfields to flags Bitfields cannot be set and checked atomically, and this makes it more clear that these are indeed in shared storage and must be checked and set in a sane fashion. This is in preparation for annotating a few of the known racy, but harmless, flags checking. No intended functional changes in this patch. Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index dd1420bfcb73..0b3f08adc217 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -268,24 +268,30 @@ struct io_alloc_cache { unsigned int init_clear; }; +enum { + IO_RING_F_DRAIN_NEXT = BIT(0), + IO_RING_F_OP_RESTRICTED = BIT(1), + IO_RING_F_REG_RESTRICTED = BIT(2), + IO_RING_F_OFF_TIMEOUT_USED = BIT(3), + IO_RING_F_DRAIN_ACTIVE = BIT(4), + IO_RING_F_HAS_EVFD = BIT(5), + /* all CQEs should be posted only by the submitter task */ + IO_RING_F_TASK_COMPLETE = BIT(6), + IO_RING_F_LOCKLESS_CQ = BIT(7), + IO_RING_F_SYSCALL_IOPOLL = BIT(8), + IO_RING_F_POLL_ACTIVATED = BIT(9), + IO_RING_F_DRAIN_DISABLED = BIT(10), + IO_RING_F_COMPAT = BIT(11), + IO_RING_F_IOWQ_LIMITS_SET = BIT(12), +}; + struct io_ring_ctx { /* const or read-mostly hot data */ struct { + /* ring setup flags */ unsigned int flags; - unsigned int drain_next: 1; - unsigned int op_restricted: 1; - unsigned int reg_restricted: 1; - unsigned int off_timeout_used: 1; - unsigned int drain_active: 1; - unsigned int has_evfd: 1; - /* all CQEs should be posted only by the submitter task */ - unsigned int task_complete: 1; - unsigned int lockless_cq: 1; - unsigned int syscall_iopoll: 1; - unsigned int poll_activated: 1; - unsigned int drain_disabled: 1; - unsigned int compat: 1; - unsigned int iowq_limits_set : 1; + /* internal state flags IO_RING_F_* flags , mostly read-only */ + unsigned int int_flags; struct task_struct *submitter_task; struct io_rings *rings; -- cgit v1.2.3 From 9165dc4fa969b64c2d4396ee4e1546a719978dd1 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Mon, 2 Mar 2026 10:29:10 -0700 Subject: io_uring: add REQ_F_IOPOLL A subsequent commit will allow uring_cmds to files that don't implement ->uring_cmd_iopoll() to be issued to IORING_SETUP_IOPOLL io_urings. This means the ctx's IORING_SETUP_IOPOLL flag isn't sufficient to determine whether a given request needs to be iopolled. Introduce a request flag REQ_F_IOPOLL set in ->issue() if a request needs to be iopolled to completion. Set the flag in io_rw_init_file() and io_uring_cmd() for requests issued to IORING_SETUP_IOPOLL ctxs. Use the request flag instead of IORING_SETUP_IOPOLL in places dealing with a specific request. A future possibility would be to add an option to enable/disable iopoll in the io_uring SQE instead of determining it from IORING_SETUP_IOPOLL. Signed-off-by: Caleb Sander Mateos Reviewed-by: Kanchan Joshi Reviewed-by: Anuj Gupta Link: https://patch.msgid.link/20260302172914.2488599-2-csander@purestorage.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 0b3f08adc217..4dbd7083dd54 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -550,6 +550,7 @@ enum { REQ_F_HAS_METADATA_BIT, REQ_F_IMPORT_BUFFER_BIT, REQ_F_SQE_COPIED_BIT, + REQ_F_IOPOLL_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -641,6 +642,8 @@ enum { REQ_F_IMPORT_BUFFER = IO_REQ_FLAG(REQ_F_IMPORT_BUFFER_BIT), /* ->sqe_copy() has been called, if necessary */ REQ_F_SQE_COPIED = IO_REQ_FLAG(REQ_F_SQE_COPIED_BIT), + /* request must be iopolled to completion (set in ->issue()) */ + REQ_F_IOPOLL = IO_REQ_FLAG(REQ_F_IOPOLL_BIT), }; struct io_tw_req { -- cgit v1.2.3 From 033af2b3eb19c5ed96825572105bca3611635ada Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 26 Feb 2026 12:48:38 +0000 Subject: io_uring: introduce callback driven main loop The io_uring_enter() has a fixed order of execution: it submits requests, waits for completions, and returns to the user. Allow to optionally replace it with a custom loop driven by a callback called loop_step. The basic requirements to the callback is that it should be able to submit requests, wait for completions, parse them and repeat. Most of the communication including parameter passing can be implemented via shared memory. The callback should return IOU_LOOP_CONTINUE to continue execution or IOU_LOOP_STOP to return to the user space. Note that the kernel may decide to prematurely terminate it as well, e.g. in case the process was signalled or killed. The hook takes a structure with parameters. It can be used to ask the kernel to wait for CQEs by setting cq_wait_idx to the CQE index it wants to wait for. Spurious wake ups are possible and even likely, the callback is expected to handle it. There will be more parameters in the future like timeout. It can be used with kernel callbacks, for example, as a slow path deprecation mechanism overwiting SQEs and emulating the wanted behaviour, however it's more useful together with BPF programs implemented in following patches. Note that keeping it separately from the normal io_uring wait loop makes things much simpler and cleaner. It keeps it in one place instead of spreading a bunch of checks in different places including disabling the submission path. It holds the lock by default, which is a better fit for BPF synchronisation and the loop execution model. It nicely avoids existing quirks like forced wake ups on timeout request completion. And it should be easier to implement new features. Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/a2d369aa1c9dd23ad7edac9220cffc563abcaed6.1772109579.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 4dbd7083dd54..344b634b8989 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -41,6 +41,8 @@ enum io_uring_cmd_flags { IO_URING_F_COMPAT = (1 << 12), }; +struct iou_loop_params; + struct io_wq_work_node { struct io_wq_work_node *next; }; @@ -361,6 +363,9 @@ struct io_ring_ctx { struct io_alloc_cache rw_cache; struct io_alloc_cache cmd_cache; + int (*loop_step)(struct io_ring_ctx *ctx, + struct iou_loop_params *); + /* * Any cancelable uring_cmd is added to this list in * ->uring_cmd() by io_uring_cmd_insert_cancelable() -- cgit v1.2.3 From 98f37634b12b17ad5c56db8fb63cf9d7dc55d74c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 26 Feb 2026 12:48:41 +0000 Subject: io_uring/bpf-ops: implement bpf ops registration Implement BPF struct ops registration. It's registered off the BPF path, and can be removed by BPF as well as io_uring. To protect it, introduce a global lock synchronising registration. ctx->uring_lock can be nested under it. ctx->bpf_ops is write protected by both locks and so it's safe to read it under either of them. Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/1f46bffd76008de49cbafa2ad77d348810a4f69e.1772109579.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 344b634b8989..28e5dbdac55b 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -8,6 +8,9 @@ #include #include +struct iou_loop_params; +struct io_uring_bpf_ops; + enum { /* * A hint to not wake right away but delay until there are enough of @@ -488,6 +491,8 @@ struct io_ring_ctx { DECLARE_HASHTABLE(napi_ht, 4); #endif + struct io_uring_bpf_ops *bpf_ops; + /* * Protection for resize vs mmap races - both the mmap and resize * side will need to grab this lock, to prevent either side from -- cgit v1.2.3 From b7405dcf7385445e10821777143f18c3ce20fa04 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 15 Mar 2026 10:41:52 +0000 Subject: bonding: prevent potential infinite loop in bond_header_parse() bond_header_parse() can loop if a stack of two bonding devices is setup, because skb->dev always points to the hierarchy top. Add new "const struct net_device *dev" parameter to (struct header_ops)->parse() method to make sure the recursion is bounded, and that the final leaf parse method is called. Fixes: 950803f72547 ("bonding: fix type confusion in bond_setup_by_slave()") Signed-off-by: Eric Dumazet Reviewed-by: Jiayuan Chen Tested-by: Jiayuan Chen Cc: Jay Vosburgh Cc: Andrew Lunn Link: https://patch.msgid.link/20260315104152.1436867-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/etherdevice.h | 3 ++- include/linux/if_ether.h | 3 ++- include/linux/netdevice.h | 6 ++++-- 3 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 9a1eacf35d37..df8f88f63a70 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -42,7 +42,8 @@ extern const struct header_ops eth_header_ops; int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned len); -int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr); +int eth_header_parse(const struct sk_buff *skb, const struct net_device *dev, + unsigned char *haddr); int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 type); void eth_header_cache_update(struct hh_cache *hh, const struct net_device *dev, diff --git a/include/linux/if_ether.h b/include/linux/if_ether.h index 61b7335aa037..ca9afa824aa4 100644 --- a/include/linux/if_ether.h +++ b/include/linux/if_ether.h @@ -40,7 +40,8 @@ static inline struct ethhdr *inner_eth_hdr(const struct sk_buff *skb) return (struct ethhdr *)skb_inner_mac_header(skb); } -int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr); +int eth_header_parse(const struct sk_buff *skb, const struct net_device *dev, + unsigned char *haddr); extern ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d7aac6f185bc..7ca01eb3f7d2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -311,7 +311,9 @@ struct header_ops { int (*create) (struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len); - int (*parse)(const struct sk_buff *skb, unsigned char *haddr); + int (*parse)(const struct sk_buff *skb, + const struct net_device *dev, + unsigned char *haddr); int (*cache)(const struct neighbour *neigh, struct hh_cache *hh, __be16 type); void (*cache_update)(struct hh_cache *hh, const struct net_device *dev, @@ -3445,7 +3447,7 @@ static inline int dev_parse_header(const struct sk_buff *skb, if (!dev->header_ops || !dev->header_ops->parse) return 0; - return dev->header_ops->parse(skb, haddr); + return dev->header_ops->parse(skb, dev, haddr); } static inline __be16 dev_parse_header_protocol(const struct sk_buff *skb) -- cgit v1.2.3 From 37a23d6f11938cd59927e3307b9b301624df8e8f Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Wed, 11 Mar 2026 21:59:21 -0700 Subject: bus: mhi: host: Use kzalloc_flex Change kzalloc + kzalloc to just kzalloc with a flexible array member. Add __counted_by for extra runtime analysis when requested. Move counting assignment immediately after allocation as required by __counted_by. Move mhi_buf definition as a complete definition as needed for flex arrays. It's not a pointer anymore. Signed-off-by: Rosen Penev [mani: squashed https://lore.kernel.org/mhi/20260317-mhi-invalid-free-mhi-buffers-v1-1-8418a3ad604f@oss.qualcomm.com] Signed-off-by: Manivannan Sadhasivam Link: https://patch.msgid.link/20260312045921.7663-1-rosenp@gmail.com --- include/linux/mhi.h | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mhi.h b/include/linux/mhi.h index 88ccb3e14f48..fb3ba639f4f8 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -85,17 +85,33 @@ enum mhi_ch_type { MHI_CH_TYPE_INBOUND_COALESCED = 3, }; +/** + * struct mhi_buf - MHI Buffer description + * @buf: Virtual address of the buffer + * @name: Buffer label. For offload channel, configurations name must be: + * ECA - Event context array data + * CCA - Channel context array data + * @dma_addr: IOMMU address of the buffer + * @len: # of bytes + */ +struct mhi_buf { + void *buf; + const char *name; + dma_addr_t dma_addr; + size_t len; +}; + /** * struct image_info - Firmware and RDDM table * @mhi_buf: Buffer for firmware and RDDM table * @entries: # of entries in table */ struct image_info { - struct mhi_buf *mhi_buf; /* private: from internal.h */ struct bhi_vec_entry *bhi_vec; /* public: */ u32 entries; + struct mhi_buf mhi_buf[] __counted_by(entries); }; /** @@ -488,22 +504,6 @@ struct mhi_result { int transaction_status; }; -/** - * struct mhi_buf - MHI Buffer description - * @buf: Virtual address of the buffer - * @name: Buffer label. For offload channel, configurations name must be: - * ECA - Event context array data - * CCA - Channel context array data - * @dma_addr: IOMMU address of the buffer - * @len: # of bytes - */ -struct mhi_buf { - void *buf; - const char *name; - dma_addr_t dma_addr; - size_t len; -}; - /** * struct mhi_driver - Structure representing a MHI client driver * @probe: CB function for client driver probe function -- cgit v1.2.3 From e71e00127110dedc6a9e746178282b4dac97ed96 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 27 Feb 2026 11:25:36 -0400 Subject: iommupt: Add the RISC-V page table format The RISC-V format is a fairly simple 5 level page table not unlike the x86 one. It has optional support for a single contiguous page size of 64k (16 x 4k). The specification describes a 32-bit format, the general code can support it via a #define but the iommu side implementation has been left off until a user comes. Tested-by: Vincent Chen Acked-by: Paul Walmsley # arch/riscv Reviewed-by: Tomasz Jeznach Tested-by: Tomasz Jeznach Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- include/linux/generic_pt/common.h | 16 ++++++++++++++++ include/linux/generic_pt/iommu.h | 11 +++++++++++ 2 files changed, 27 insertions(+) (limited to 'include/linux') diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h index 6a9a1acb5aad..fc5d0b5edadc 100644 --- a/include/linux/generic_pt/common.h +++ b/include/linux/generic_pt/common.h @@ -175,6 +175,22 @@ enum { PT_FEAT_VTDSS_FORCE_WRITEABLE, }; +struct pt_riscv_32 { + struct pt_common common; +}; + +struct pt_riscv_64 { + struct pt_common common; +}; + +enum { + /* + * Support the 64k contiguous page size following the Svnapot extension. + */ + PT_FEAT_RISCV_SVNAPOT_64K = PT_FEAT_FMT_START, + +}; + struct pt_x86_64 { struct pt_common common; }; diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 9eefbb74efd0..49d9addb98c5 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -275,6 +275,17 @@ struct pt_iommu_vtdss_hw_info { IOMMU_FORMAT(vtdss, vtdss_pt); +struct pt_iommu_riscv_64_cfg { + struct pt_iommu_cfg common; +}; + +struct pt_iommu_riscv_64_hw_info { + u64 ppn; + u8 fsc_iosatp_mode; +}; + +IOMMU_FORMAT(riscv_64, riscv_64pt); + struct pt_iommu_x86_64_cfg { struct pt_iommu_cfg common; /* 4 is a 57 bit 5 level table */ -- cgit v1.2.3 From 99fb8afa16add85ed016baee9735231bca0c32b4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 27 Feb 2026 15:30:10 -0400 Subject: iommupt: Directly call iommupt's unmap_range() The common algorithm in iommupt does not require the iommu_pgsize() calculations, it can directly unmap any arbitrary range. Add a new function pointer to directly call an iommupt unmap_range op and make __iommu_unmap() call it directly. Gives about a 5% gain on single page unmappings. The function pointer is run through pt_iommu_ops instead of iommu_domain_ops to discourage using it outside iommupt. All drivers with their own page tables should continue to use the simplified map/unmap_pages() style interfaces. Reviewed-by: Samiullah Khawaja Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Reviewed-by: Lu Baolu Signed-off-by: Joerg Roedel --- include/linux/generic_pt/iommu.h | 37 +++++++++++++++++++++++++++++++------ include/linux/iommu.h | 1 + 2 files changed, 32 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 49d9addb98c5..0da971134a37 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -66,6 +66,13 @@ struct pt_iommu { struct device *iommu_device; }; +static inline struct pt_iommu *iommupt_from_domain(struct iommu_domain *domain) +{ + if (!IS_ENABLED(CONFIG_IOMMU_PT) || !domain->is_iommupt) + return NULL; + return container_of(domain, struct pt_iommu, domain); +} + /** * struct pt_iommu_info - Details about the IOMMU page table * @@ -80,6 +87,29 @@ struct pt_iommu_info { }; struct pt_iommu_ops { + /** + * @unmap_range: Make a range of IOVA empty/not present + * @iommu_table: Table to manipulate + * @iova: IO virtual address to start + * @len: Length of the range starting from @iova + * @iotlb_gather: Gather struct that must be flushed on return + * + * unmap_range() will remove a translation created by map_range(). It + * cannot subdivide a mapping created by map_range(), so it should be + * called with IOVA ranges that match those passed to map_pages. The + * IOVA range can aggregate contiguous map_range() calls so long as no + * individual range is split. + * + * Context: The caller must hold a write range lock that includes + * the whole range. + * + * Returns: Number of bytes of VA unmapped. iova + res will be the + * point unmapping stopped. + */ + size_t (*unmap_range)(struct pt_iommu *iommu_table, dma_addr_t iova, + dma_addr_t len, + struct iommu_iotlb_gather *iotlb_gather); + /** * @set_dirty: Make the iova write dirty * @iommu_table: Table to manipulate @@ -198,10 +228,6 @@ struct pt_iommu_cfg { unsigned long iova, phys_addr_t paddr, \ size_t pgsize, size_t pgcount, \ int prot, gfp_t gfp, size_t *mapped); \ - size_t pt_iommu_##fmt##_unmap_pages( \ - struct iommu_domain *domain, unsigned long iova, \ - size_t pgsize, size_t pgcount, \ - struct iommu_iotlb_gather *iotlb_gather); \ int pt_iommu_##fmt##_read_and_clear_dirty( \ struct iommu_domain *domain, unsigned long iova, size_t size, \ unsigned long flags, struct iommu_dirty_bitmap *dirty); \ @@ -223,8 +249,7 @@ struct pt_iommu_cfg { */ #define IOMMU_PT_DOMAIN_OPS(fmt) \ .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \ - .map_pages = &pt_iommu_##fmt##_map_pages, \ - .unmap_pages = &pt_iommu_##fmt##_unmap_pages + .map_pages = &pt_iommu_##fmt##_map_pages #define IOMMU_PT_DIRTY_OPS(fmt) \ .read_and_clear_dirty = &pt_iommu_##fmt##_read_and_clear_dirty diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 54b8b48c762e..7ca648c01336 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -223,6 +223,7 @@ enum iommu_domain_cookie_type { struct iommu_domain { unsigned type; enum iommu_domain_cookie_type cookie_type; + bool is_iommupt; const struct iommu_domain_ops *ops; const struct iommu_dirty_ops *dirty_ops; const struct iommu_ops *owner; /* Whose domain_alloc we came from */ -- cgit v1.2.3 From d6c65b0fd6218bd21ed0be7a8d3218e8f6dc91de Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 27 Feb 2026 15:30:11 -0400 Subject: iommupt: Avoid rewalking during map Currently the core code provides a simplified interface to drivers where it fragments a requested multi-page map into single page size steps after doing all the calculations to figure out what page size is appropriate. Each step rewalks the page tables from the start. Since iommupt has a single implementation of the mapping algorithm it can internally compute each step as it goes while retaining its current position in the walk. Add a new function pt_pgsz_count() which computes the same page size fragement of a large mapping operations. Compute the next fragment when all the leaf entries of the current fragement have been written, then continue walking from the current point. The function pointer is run through pt_iommu_ops instead of iommu_domain_ops to discourage using it outside iommupt. All drivers with their own page tables should continue to use the simplified map_pages() style interfaces. Reviewed-by: Samiullah Khawaja Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Reviewed-by: Lu Baolu Signed-off-by: Joerg Roedel --- include/linux/generic_pt/iommu.h | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 0da971134a37..dd0edd02a48a 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -87,6 +87,33 @@ struct pt_iommu_info { }; struct pt_iommu_ops { + /** + * @map_range: Install translation for an IOVA range + * @iommu_table: Table to manipulate + * @iova: IO virtual address to start + * @paddr: Physical/Output address to start + * @len: Length of the range starting from @iova + * @prot: A bitmap of IOMMU_READ/WRITE/CACHE/NOEXEC/MMIO + * @gfp: GFP flags for any memory allocations + * + * The range starting at IOVA will have paddr installed into it. The + * rage is automatically segmented into optimally sized table entries, + * and can have any valid alignment. + * + * On error the caller will probably want to invoke unmap on the range + * from iova up to the amount indicated by @mapped to return the table + * back to an unchanged state. + * + * Context: The caller must hold a write range lock that includes + * the whole range. + * + * Returns: -ERRNO on failure, 0 on success. The number of bytes of VA + * that were mapped are added to @mapped, @mapped is not zerod first. + */ + int (*map_range)(struct pt_iommu *iommu_table, dma_addr_t iova, + phys_addr_t paddr, dma_addr_t len, unsigned int prot, + gfp_t gfp, size_t *mapped); + /** * @unmap_range: Make a range of IOVA empty/not present * @iommu_table: Table to manipulate @@ -224,10 +251,6 @@ struct pt_iommu_cfg { #define IOMMU_PROTOTYPES(fmt) \ phys_addr_t pt_iommu_##fmt##_iova_to_phys(struct iommu_domain *domain, \ dma_addr_t iova); \ - int pt_iommu_##fmt##_map_pages(struct iommu_domain *domain, \ - unsigned long iova, phys_addr_t paddr, \ - size_t pgsize, size_t pgcount, \ - int prot, gfp_t gfp, size_t *mapped); \ int pt_iommu_##fmt##_read_and_clear_dirty( \ struct iommu_domain *domain, unsigned long iova, size_t size, \ unsigned long flags, struct iommu_dirty_bitmap *dirty); \ @@ -248,8 +271,7 @@ struct pt_iommu_cfg { * iommu_pt */ #define IOMMU_PT_DOMAIN_OPS(fmt) \ - .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \ - .map_pages = &pt_iommu_##fmt##_map_pages + .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys #define IOMMU_PT_DIRTY_OPS(fmt) \ .read_and_clear_dirty = &pt_iommu_##fmt##_read_and_clear_dirty -- cgit v1.2.3 From 45c6a2dc7ec8339052666b06065c521a10cc29bb Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 1 Mar 2026 16:52:14 -0800 Subject: iommu/io-pgtable: fix all kernel-doc warnings in io-pgtable.h Avoid kernel-doc warnings in io-pgtable.h: - use the correct struct member names or kernel-doc format - add a missing struct member description - add a missing function return comment section Warning: include/linux/io-pgtable.h:187 struct member 'coherent_walk' not described in 'io_pgtable_cfg' Warning: include/linux/io-pgtable.h:187 struct member 'arm_lpae_s1_cfg' not described in 'io_pgtable_cfg' Warning: include/linux/io-pgtable.h:187 struct member 'arm_lpae_s2_cfg' not described in 'io_pgtable_cfg' Warning: include/linux/io-pgtable.h:187 struct member 'arm_v7s_cfg' not described in 'io_pgtable_cfg' Warning: include/linux/io-pgtable.h:187 struct member 'arm_mali_lpae_cfg' not described in 'io_pgtable_cfg' Warning: include/linux/io-pgtable.h:187 struct member 'apple_dart_cfg' not described in 'io_pgtable_cfg' Warning: include/linux/io-pgtable.h:187 struct member 'amd' not described in 'io_pgtable_cfg' Warning: include/linux/io-pgtable.h:223 struct member 'read_and_clear_dirty' not described in 'io_pgtable_ops' Warning: include/linux/io-pgtable.h:237 No description found for return value of 'alloc_io_pgtable_ops' Signed-off-by: Randy Dunlap Reviewed-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- include/linux/io-pgtable.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 7a1516011ccf..e19872e37e06 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -53,7 +53,7 @@ struct iommu_flush_ops { * tables. * @ias: Input address (iova) size, in bits. * @oas: Output address (paddr) size, in bits. - * @coherent_walk A flag to indicate whether or not page table walks made + * @coherent_walk: A flag to indicate whether or not page table walks made * by the IOMMU are coherent with the CPU caches. * @tlb: TLB management callbacks for this set of tables. * @iommu_dev: The device representing the DMA configuration for the @@ -136,6 +136,7 @@ struct io_pgtable_cfg { void (*free)(void *cookie, void *pages, size_t size); /* Low-level data specific to the table format */ + /* private: */ union { struct { u64 ttbr; @@ -203,6 +204,9 @@ struct arm_lpae_io_pgtable_walk_data { * @unmap_pages: Unmap a range of virtually contiguous pages of the same size. * @iova_to_phys: Translate iova to physical address. * @pgtable_walk: (optional) Perform a page table walk for a given iova. + * @read_and_clear_dirty: Record dirty info per IOVA. If an IOVA is dirty, + * clear its dirty state from the PTE unless the + * IOMMU_DIRTY_NO_CLEAR flag is passed in. * * These functions map directly onto the iommu_ops member functions with * the same names. @@ -231,7 +235,9 @@ struct io_pgtable_ops { * the configuration actually provided by the allocator (e.g. the * pgsize_bitmap may be restricted). * @cookie: An opaque token provided by the IOMMU driver and passed back to - * the callback routines in cfg->tlb. + * the callback routines. + * + * Returns: Pointer to the &struct io_pgtable_ops for this set of page tables. */ struct io_pgtable_ops *alloc_io_pgtable_ops(enum io_pgtable_fmt fmt, struct io_pgtable_cfg *cfg, -- cgit v1.2.3 From a82efb8747d1b8a7c0a377dc79c2aac204eae788 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Tue, 17 Mar 2026 11:16:02 +0000 Subject: iommu: Add device ATS supported capability PCIe ATS may be disabled by platform firmware, root complex limitations, or kernel policy even when a device advertises the ATS capability in its PCI configuration space. Add a new IOMMU_CAP_PCI_ATS_SUPPORTED capability to allow IOMMU drivers to report the effective ATS decision for a device. When this capability is true for a device, ATS may be enabled for that device, but it does not imply that ATS is currently enabled. A subsequent patch will extend iommufd to expose the effective ATS status to userspace. Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Signed-off-by: Shameer Kolothum Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7ca648c01336..a904821ed169 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -272,6 +272,8 @@ enum iommu_cap { */ IOMMU_CAP_DEFERRED_FLUSH, IOMMU_CAP_DIRTY_TRACKING, /* IOMMU supports dirty tracking */ + /* ATS is supported and may be enabled for this device */ + IOMMU_CAP_PCI_ATS_SUPPORTED, }; /* These are the possible reserved region types */ -- cgit v1.2.3 From 2727d44f5d5bc3f8e55a6a0ccf24d8105a5a400e Mon Sep 17 00:00:00 2001 From: Kit Dallege Date: Sun, 15 Mar 2026 18:09:31 +0100 Subject: writeback: fix kernel-doc function name mismatch for wb_put_many() The kernel-doc comment says wb_put but the actual function is wb_put_many. Fix the name to match. Assisted-by: Claude:claude-opus-4-6 Signed-off-by: Kit Dallege Link: https://patch.msgid.link/20260315170931.65852-1-xaum.io@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/backing-dev-defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index c88fd4d37d1f..a06b93446d10 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -237,7 +237,7 @@ static inline void wb_get(struct bdi_writeback *wb) } /** - * wb_put - decrement a wb's refcount + * wb_put_many - decrement a wb's refcount * @wb: bdi_writeback to put * @nr: number of references to put */ -- cgit v1.2.3 From 9577c74c96f88d807d1ba005adbf5952e7127e55 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Thu, 12 Mar 2026 18:51:41 -0700 Subject: platform/x86/intel/vsec: Make driver_data info const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Treat PCI id->driver_data (intel_vsec_platform_info) as read-only by making vsec_priv->info a const pointer and updating all function signatures to accept const intel_vsec_platform_info *. This improves const-correctness and clarifies that the platform info data from the driver_data table is not meant to be modified at runtime. No functional changes intended. Signed-off-by: David E. Box Reviewed-by: Michael J. Ruhl Link: https://patch.msgid.link/20260313015202.3660072-3-david.e.box@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/intel_vsec.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/intel_vsec.h b/include/linux/intel_vsec.h index 1a0f357c2427..d551174b0049 100644 --- a/include/linux/intel_vsec.h +++ b/include/linux/intel_vsec.h @@ -200,13 +200,13 @@ static inline struct intel_vsec_device *auxdev_to_ivdev(struct auxiliary_device #if IS_ENABLED(CONFIG_INTEL_VSEC) int intel_vsec_register(struct pci_dev *pdev, - struct intel_vsec_platform_info *info); + const struct intel_vsec_platform_info *info); int intel_vsec_set_mapping(struct oobmsm_plat_info *plat_info, struct intel_vsec_device *vsec_dev); struct oobmsm_plat_info *intel_vsec_get_mapping(struct pci_dev *pdev); #else static inline int intel_vsec_register(struct pci_dev *pdev, - struct intel_vsec_platform_info *info) + const struct intel_vsec_platform_info *info) { return -ENODEV; } -- cgit v1.2.3 From c62fd96a04e4a7b847448f97ecfe9f3fe706e7b3 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Thu, 12 Mar 2026 18:51:42 -0700 Subject: platform/x86/intel/vsec: Decouple add/link helpers from PCI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This refactor prepares for adding ACPI-enumerated PMT endpoints. While intel_vsec is bound to PCI today, some helpers are used by code that will also register PMT endpoints from non-PCI (ACPI) paths. Clean up PCI-specific plumbing where it isn’t strictly required and rely on generic struct device where possible. Signed-off-by: David E. Box Reviewed-by: Ilpo Järvinen Reviewed-by: Michael J. Ruhl Link: https://patch.msgid.link/20260313015202.3660072-4-david.e.box@linux.intel.com Signed-off-by: Ilpo Järvinen --- include/linux/intel_vsec.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/intel_vsec.h b/include/linux/intel_vsec.h index d551174b0049..49a746ec0128 100644 --- a/include/linux/intel_vsec.h +++ b/include/linux/intel_vsec.h @@ -184,7 +184,7 @@ struct pmt_feature_group { struct telemetry_region regions[]; }; -int intel_vsec_add_aux(struct pci_dev *pdev, struct device *parent, +int intel_vsec_add_aux(struct device *parent, struct intel_vsec_device *intel_vsec_dev, const char *name); -- cgit v1.2.3 From 353042d54d82f6c46449f0ee38c244b5a13c1fe4 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Thu, 12 Mar 2026 18:51:43 -0700 Subject: platform/x86/intel/vsec: Switch exported helpers from pci_dev to device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Preparatory refactor for ACPI-enumerated PMT endpoints. Several exported PMT/VSEC interfaces and structs carried struct pci_dev * even though callers only need a generic struct device. Move those to struct device * so the same APIs work for PCI and ACPI parents. Acked-by: Rodrigo Vivi Signed-off-by: David E. Box Link: https://patch.msgid.link/20260313015202.3660072-5-david.e.box@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/intel_vsec.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/intel_vsec.h b/include/linux/intel_vsec.h index 49a746ec0128..4eecb2a6bac4 100644 --- a/include/linux/intel_vsec.h +++ b/include/linux/intel_vsec.h @@ -29,6 +29,7 @@ #define INTEL_DVSEC_TABLE_OFFSET(x) ((x) & GENMASK(31, 3)) #define TABLE_OFFSET_SHIFT 3 +struct device; struct pci_dev; struct resource; @@ -82,14 +83,14 @@ enum intel_vsec_quirks { * struct pmt_callbacks - Callback infrastructure for PMT devices * @read_telem: when specified, called by client driver to access PMT * data (instead of direct copy). - * * pdev: PCI device reference for the callback's use + * * dev: device reference for the callback's use * * guid: ID of data to acccss * * data: buffer for the data to be copied * * off: offset into the requested buffer * * count: size of buffer */ struct pmt_callbacks { - int (*read_telem)(struct pci_dev *pdev, u32 guid, u64 *data, loff_t off, u32 count); + int (*read_telem)(struct device *dev, u32 guid, u64 *data, loff_t off, u32 count); }; struct vsec_feature_dependency { @@ -122,7 +123,7 @@ struct intel_vsec_platform_info { /** * struct intel_vsec_device - Auxbus specific device information * @auxdev: auxbus device struct for auxbus access - * @pcidev: pci device associated with the device + * @dev: struct device associated with the device * @resource: any resources shared by the parent * @ida: id reference * @num_resources: number of resources @@ -135,7 +136,7 @@ struct intel_vsec_platform_info { */ struct intel_vsec_device { struct auxiliary_device auxdev; - struct pci_dev *pcidev; + struct device *dev; struct resource *resource; struct ida *ida; int num_resources; @@ -199,13 +200,13 @@ static inline struct intel_vsec_device *auxdev_to_ivdev(struct auxiliary_device } #if IS_ENABLED(CONFIG_INTEL_VSEC) -int intel_vsec_register(struct pci_dev *pdev, +int intel_vsec_register(struct device *dev, const struct intel_vsec_platform_info *info); int intel_vsec_set_mapping(struct oobmsm_plat_info *plat_info, struct intel_vsec_device *vsec_dev); struct oobmsm_plat_info *intel_vsec_get_mapping(struct pci_dev *pdev); #else -static inline int intel_vsec_register(struct pci_dev *pdev, +static inline int intel_vsec_register(struct device *dev, const struct intel_vsec_platform_info *info) { return -ENODEV; -- cgit v1.2.3 From 22fa2ebc11a164e1ea529da6c356e3e01aef8ac8 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Thu, 12 Mar 2026 18:51:45 -0700 Subject: platform/x86/intel/vsec: Plumb ACPI PMT discovery tables through vsec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some platforms expose PMT discovery via ACPI instead of PCI BARs. Add a generic discovery source flag and carry ACPI discovery entries alongside the existing PCI resource path so PMT clients can consume either. Changes: - Add enum intel_vsec_disc_source { _PCI, _ACPI }. - Extend intel_vsec_platform_info and intel_vsec_device with source enum and ACPI discovery table pointer/ - When src==ACPI, skip BAR resource setup and copy the ACPI discovery entries into the aux device. No user-visible behavior change yet; this only wires ACPI data through vsec in preparation for ACPI-enumerated PMT clients. Signed-off-by: David E. Box Link: https://patch.msgid.link/20260313015202.3660072-7-david.e.box@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/intel_vsec.h | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/intel_vsec.h b/include/linux/intel_vsec.h index 4eecb2a6bac4..1fe5665a9d02 100644 --- a/include/linux/intel_vsec.h +++ b/include/linux/intel_vsec.h @@ -33,6 +33,11 @@ struct device; struct pci_dev; struct resource; +enum intel_vsec_disc_source { + INTEL_VSEC_DISC_PCI, /* PCI, default */ + INTEL_VSEC_DISC_ACPI, /* ACPI */ +}; + enum intel_vsec_id { VSEC_ID_TELEMETRY = 2, VSEC_ID_WATCHER = 3, @@ -103,6 +108,10 @@ struct vsec_feature_dependency { * @parent: parent device in the auxbus chain * @headers: list of headers to define the PMT client devices to create * @deps: array of feature dependencies + * @acpi_disc: ACPI discovery tables, each entry is two QWORDs + * in little-endian format as defined by the PMT ACPI spec. + * Valid only when @provider == INTEL_VSEC_DISC_ACPI. + * @src: source of discovery table data * @priv_data: private data, usable by parent devices, currently a callback * @caps: bitmask of PMT capabilities for the given headers * @quirks: bitmask of VSEC device quirks @@ -113,6 +122,8 @@ struct intel_vsec_platform_info { struct device *parent; struct intel_vsec_header **headers; const struct vsec_feature_dependency *deps; + u32 (*acpi_disc)[4]; + enum intel_vsec_disc_source src; void *priv_data; unsigned long caps; unsigned long quirks; @@ -124,7 +135,12 @@ struct intel_vsec_platform_info { * struct intel_vsec_device - Auxbus specific device information * @auxdev: auxbus device struct for auxbus access * @dev: struct device associated with the device - * @resource: any resources shared by the parent + * @resource: PCI discovery resources (BAR windows), one per discovery + * instance. Valid only when @src == INTEL_VSEC_DISC_PCI + * @acpi_disc: ACPI discovery tables, each entry is two QWORDs + * in little-endian format as defined by the PMT ACPI spec. + * Valid only when @src == INTEL_VSEC_DISC_ACPI. + * @src: source of discovery table data * @ida: id reference * @num_resources: number of resources * @id: xarray id @@ -138,6 +154,8 @@ struct intel_vsec_device { struct auxiliary_device auxdev; struct device *dev; struct resource *resource; + u32 (*acpi_disc)[4]; + enum intel_vsec_disc_source src; struct ida *ida; int num_resources; int id; /* xa */ -- cgit v1.2.3 From cb3d1049f4ea77d5ad93f17d8ac1f2ed4da70501 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Tue, 3 Mar 2026 12:53:18 +0100 Subject: driver core: generalize driver_override in struct device Currently, there are 12 busses (including platform and PCI) that duplicate the driver_override logic for their individual devices. All of them seem to be prone to the bug described in [1]. While this could be solved for every bus individually using a separate lock, solving this in the driver-core generically results in less (and cleaner) changes overall. Thus, move driver_override to struct device, provide corresponding accessors for busses and handle locking with a separate lock internally. In particular, add device_set_driver_override(), device_has_driver_override(), device_match_driver_override() and generalize the sysfs store() and show() callbacks via a driver_override feature flag in struct bus_type. Until all busses have migrated, keep driver_set_override() in place. Note that we can't use the device lock for the reasons described in [2]. Link: https://bugzilla.kernel.org/show_bug.cgi?id=220789 [1] Link: https://lore.kernel.org/driver-core/DGRGTIRHA62X.3RY09D9SOK77P@kernel.org/ [2] Tested-by: Gui-Dong Han Co-developed-by: Gui-Dong Han Signed-off-by: Gui-Dong Han Reviewed-by: Greg Kroah-Hartman Link: https://patch.msgid.link/20260303115720.48783-2-dakr@kernel.org [ Use dev->bus instead of sp->bus for consistency; fix commit message to refer to the struct bus_type's driver_override feature flag. - Danilo ] Signed-off-by: Danilo Krummrich --- include/linux/device.h | 54 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/device/bus.h | 4 ++++ 2 files changed, 58 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 0be95294b6e6..e65d564f01cd 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -483,6 +483,8 @@ struct device_physical_location { * on. This shrinks the "Board Support Packages" (BSPs) and * minimizes board-specific #ifdefs in drivers. * @driver_data: Private pointer for driver specific info. + * @driver_override: Driver name to force a match. Do not touch directly; use + * device_set_driver_override() instead. * @links: Links to suppliers and consumers of this device. * @power: For device power management. * See Documentation/driver-api/pm/devices.rst for details. @@ -576,6 +578,10 @@ struct device { core doesn't touch it */ void *driver_data; /* Driver data, set and get with dev_set_drvdata/dev_get_drvdata */ + struct { + const char *name; + spinlock_t lock; + } driver_override; struct mutex mutex; /* mutex to synchronize calls to * its driver. */ @@ -701,6 +707,54 @@ struct device_link { #define kobj_to_dev(__kobj) container_of_const(__kobj, struct device, kobj) +int __device_set_driver_override(struct device *dev, const char *s, size_t len); + +/** + * device_set_driver_override() - Helper to set or clear driver override. + * @dev: Device to change + * @s: NUL-terminated string, new driver name to force a match, pass empty + * string to clear it ("" or "\n", where the latter is only for sysfs + * interface). + * + * Helper to set or clear driver override of a device. + * + * Returns: 0 on success or a negative error code on failure. + */ +static inline int device_set_driver_override(struct device *dev, const char *s) +{ + return __device_set_driver_override(dev, s, s ? strlen(s) : 0); +} + +/** + * device_has_driver_override() - Check if a driver override has been set. + * @dev: device to check + * + * Returns true if a driver override has been set for this device. + */ +static inline bool device_has_driver_override(struct device *dev) +{ + guard(spinlock)(&dev->driver_override.lock); + return !!dev->driver_override.name; +} + +/** + * device_match_driver_override() - Match a driver against the device's driver_override. + * @dev: device to check + * @drv: driver to match against + * + * Returns > 0 if a driver override is set and matches the given driver, 0 if a + * driver override is set but does not match, or < 0 if a driver override is not + * set at all. + */ +static inline int device_match_driver_override(struct device *dev, + const struct device_driver *drv) +{ + guard(spinlock)(&dev->driver_override.lock); + if (dev->driver_override.name) + return !strcmp(dev->driver_override.name, drv->name); + return -1; +} + /** * device_iommu_mapped - Returns true when the device DMA is translated * by an IOMMU diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h index 63de5f053c33..c1b463cd6464 100644 --- a/include/linux/device/bus.h +++ b/include/linux/device/bus.h @@ -65,6 +65,9 @@ struct fwnode_handle; * this bus. * @pm: Power management operations of this bus, callback the specific * device driver's pm-ops. + * @driver_override: Set to true if this bus supports the driver_override + * mechanism, which allows userspace to force a specific + * driver to bind to a device via a sysfs attribute. * @need_parent_lock: When probing or removing a device on this bus, the * device core should lock the device's parent. * @@ -106,6 +109,7 @@ struct bus_type { const struct dev_pm_ops *pm; + bool driver_override; bool need_parent_lock; }; -- cgit v1.2.3 From 2b38efc05bf7a8568ec74bfffea0f5cfa62bc01d Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Tue, 3 Mar 2026 12:53:21 +0100 Subject: driver core: platform: use generic driver_override infrastructure When a driver is probed through __driver_attach(), the bus' match() callback is called without the device lock held, thus accessing the driver_override field without a lock, which can cause a UAF. Fix this by using the driver-core driver_override infrastructure taking care of proper locking internally. Note that calling match() from __driver_attach() without the device lock held is intentional. [1] Link: https://lore.kernel.org/driver-core/DGRGTIRHA62X.3RY09D9SOK77P@kernel.org/ [1] Reported-by: Gui-Dong Han Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220789 Fixes: 3d713e0e382e ("driver core: platform: add device binding path 'driver_override'") Reviewed-by: Greg Kroah-Hartman Link: https://patch.msgid.link/20260303115720.48783-5-dakr@kernel.org Signed-off-by: Danilo Krummrich --- include/linux/platform_device.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 813da101b5bf..ed1d50d1c3c1 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -31,11 +31,6 @@ struct platform_device { struct resource *resource; const struct platform_device_id *id_entry; - /* - * Driver name to force a match. Do not set directly, because core - * frees it. Use driver_set_override() to set or clear it. - */ - const char *driver_override; /* MFD cell pointer */ struct mfd_cell *mfd_cell; -- cgit v1.2.3 From bb8539e0e60916ef3ed4a92eb2f3cfd8e34061ef Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Mon, 16 Mar 2026 17:28:23 +0800 Subject: ppp: require callers of ppp_dev_name() to hold RCU ppp_dev_name() holds the RCU read lock internally to protect pch->ppp. However, as it returns netdev->name to the caller, the caller should also hold either RCU or RTNL lock to prevent the netdev from being freed. The only two references of the function is in the L2TP driver, both of which already hold RCU. So remove the internal RCU lock and document that callers must hold RCU. Signed-off-by: Qingfang Deng Reviewed-by: Breno Leitao Link: https://patch.msgid.link/20260316092824.479149-1-dqfext@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/ppp_channel.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ppp_channel.h b/include/linux/ppp_channel.h index ca8ad03eeef0..2f63e9a6cc88 100644 --- a/include/linux/ppp_channel.h +++ b/include/linux/ppp_channel.h @@ -72,7 +72,9 @@ extern int ppp_channel_index(struct ppp_channel *); /* Get the unit number associated with a channel, or -1 if none */ extern int ppp_unit_number(struct ppp_channel *); -/* Get the device name associated with a channel, or NULL if none */ +/* Get the device name associated with a channel, or NULL if none. + * Caller must hold RCU read lock. + */ extern char *ppp_dev_name(struct ppp_channel *); /* -- cgit v1.2.3 From b520c4eef83dd406591431f936de0908c3ed7fb9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Mar 2026 17:11:30 +0100 Subject: block: split bio_alloc_bioset more clearly into a fast and slowpath bio_alloc_bioset tries non-waiting slab allocations first for the bio and bvec array, but does so in a somewhat convoluted way. Restructure the function so that it first open codes these slab allocations, and then falls back to the mempools with the original gfp mask. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni -ck Reviewed-by: Martin K. Petersen Link: https://patch.msgid.link/20260316161144.1607877-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 9693a0d6fefe..984844d2870b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -350,8 +350,7 @@ extern void bioset_exit(struct bio_set *); extern int biovec_init_pool(mempool_t *pool, int pool_entries); struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, - blk_opf_t opf, gfp_t gfp_mask, - struct bio_set *bs); + blk_opf_t opf, gfp_t gfp, struct bio_set *bs); struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask); extern void bio_put(struct bio *); -- cgit v1.2.3 From 10febd397591d93f42adb743c2c664041e7f1bcb Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Thu, 12 Mar 2026 04:44:30 +0000 Subject: sched/topology: Remove sched_domain_shared allocation with sd_data Now that "sd->shared" assignments are using the sched_domain_shared objects allocated with s_data, remove the sd_data based allocations. Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Link: https://patch.msgid.link/20260312044434.1974-6-kprateek.nayak@amd.com --- include/linux/sched/topology.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index a1e1032426dc..51c29581f15e 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -172,7 +172,6 @@ typedef int (*sched_domain_flags_f)(void); struct sd_data { struct sched_domain *__percpu *sd; - struct sched_domain_shared *__percpu *sds; struct sched_group *__percpu *sg; struct sched_group_capacity *__percpu *sgc; }; -- cgit v1.2.3 From 8ca12326f592f7554acf2788ecb1c5c954dcf31c Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 16 Mar 2026 00:36:22 +0100 Subject: PM: EM: Switch to rcu_dereference_all() in wakeup path em_cpu_energy() is part of the EAS (Fair) task wakeup path. Now that rcu_read_{,un}lock() have been removed from find_energy_efficient_cpu() switch to rcu_dereference_all() and check for rcu_read_lock_any_held() in em_cpu_energy() as well. In EAS (Fair) task wakeup path is a preempt/IRQ disabled region, so rcu_read_{,un}lock() can be removed. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Link: https://patch.msgid.link/5b1228b7-5949-4a45-9f62-e8ce936de694@arm.com --- include/linux/energy_model.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index e7497f804644..c909a8ba22e8 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -248,7 +248,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, struct em_perf_state *ps; int i; - WARN_ONCE(!rcu_read_lock_held(), "EM: rcu read lock needed\n"); + lockdep_assert(rcu_read_lock_any_held()); if (!sum_util) return 0; @@ -267,7 +267,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, * Find the lowest performance state of the Energy Model above the * requested performance. */ - em_table = rcu_dereference(pd->em_table); + em_table = rcu_dereference_all(pd->em_table); i = em_pd_get_efficient_state(em_table->state, pd, max_util); ps = &em_table->state[i]; -- cgit v1.2.3 From b7560798466a07d9c3fb011698e92c335ab28baf Mon Sep 17 00:00:00 2001 From: Devendra K Verma Date: Wed, 18 Mar 2026 12:34:03 +0530 Subject: dmaengine: dw-edma: Add non-LL mode AMD MDB IP supports Linked List (LL) mode as well as non-LL mode. The current code does not have the mechanisms to enable the DMA transactions using the non-LL mode. The following two cases are added with this patch: - For the AMD (Xilinx) only, when a valid physical base address of the device side DDR is not configured, then the IP can still be used in non-LL mode. For all the channels DMA transactions will be using the non-LL mode only. This, the default non-LL mode, is not applicable for Synopsys IP with the current code addition. - If the default mode is LL-mode, for both AMD (Xilinx) and Synosys, and if user wants to use non-LL mode then user can do so via configuring the peripheral_config param of dma_slave_config. Signed-off-by: Devendra K Verma Reviewed-by: Frank Li Link: https://patch.msgid.link/20260318070403.1634706-3-devendra.verma@amd.com Signed-off-by: Vinod Koul --- include/linux/dma/edma.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/dma/edma.h b/include/linux/dma/edma.h index 9da53c75e49b..1fafd5b0e315 100644 --- a/include/linux/dma/edma.h +++ b/include/linux/dma/edma.h @@ -103,6 +103,7 @@ struct dw_edma_chip { enum dw_edma_map_format mf; struct dw_edma *dw; + bool cfg_non_ll; }; /* Export to the platform drivers */ -- cgit v1.2.3 From de9e2b3d88af36411301c049a1b049f3e4fe0757 Mon Sep 17 00:00:00 2001 From: Cristian Ciocaltea Date: Tue, 3 Mar 2026 21:24:17 +0200 Subject: uapi: Provide DIV_ROUND_CLOSEST() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently DIV_ROUND_CLOSEST() is only available for the kernel via include/linux/math.h. Expose it to userland as well by adding __KERNEL_DIV_ROUND_CLOSEST() as a common definition in uapi. Additionally, ensure it allows building ISO C applications by switching from the 'typeof' GNU extension to the ISO-friendly __typeof__. Reviewed-by: Nícolas F. R. A. Prado Tested-by: Diederik de Haas Acked-by: Andy Shevchenko Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Cristian Ciocaltea Link: https://patch.msgid.link/20260303-rk3588-bgcolor-v8-1-fee377037ad1@collabora.com Signed-off-by: Daniel Stone --- include/linux/math.h | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/math.h b/include/linux/math.h index 6dc1d1d32fbc..1e8fb3efbc8c 100644 --- a/include/linux/math.h +++ b/include/linux/math.h @@ -89,23 +89,7 @@ } \ ) -/* - * Divide positive or negative dividend by positive or negative divisor - * and round to closest integer. Result is undefined for negative - * divisors if the dividend variable type is unsigned and for negative - * dividends if the divisor variable type is unsigned. - */ -#define DIV_ROUND_CLOSEST(x, divisor)( \ -{ \ - typeof(x) __x = x; \ - typeof(divisor) __d = divisor; \ - (((typeof(x))-1) > 0 || \ - ((typeof(divisor))-1) > 0 || \ - (((__x) > 0) == ((__d) > 0))) ? \ - (((__x) + ((__d) / 2)) / (__d)) : \ - (((__x) - ((__d) / 2)) / (__d)); \ -} \ -) +#define DIV_ROUND_CLOSEST __KERNEL_DIV_ROUND_CLOSEST /* * Same as above but for u64 dividends. divisor must be a 32-bit * number. -- cgit v1.2.3 From f8a5f6934f30b9ee334256347dd70a7ba0f8be7f Mon Sep 17 00:00:00 2001 From: Aldo Conte Date: Wed, 11 Mar 2026 17:33:20 +0100 Subject: usb: typec: Document priority and mode_selection fields in struct typec_altmode The fields 'priority' and 'mode_selection' in struct typec_altmode are missing from the kernel-doc comment, which results in warnings when building the documentation with 'make htmldocs'. WARNING: ./include/linux/usb/typec_altmode.h:44 struct member 'priority' not described in 'typec_altmode' WARNING: ./include/linux/usb/typec_altmode.h:44 struct member 'mode_selection' not described in 'typec_altmode' Document both fields to keep the kernel-doc comment aligned with the structure definition. Signed-off-by: Aldo Conte Reviewed-by: Heikki Krogerus Link: https://patch.msgid.link/20260311163320.61534-1-aldocontelk@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/typec_altmode.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/typec_altmode.h b/include/linux/usb/typec_altmode.h index 0513d333b797..b90cc5cfff8d 100644 --- a/include/linux/usb/typec_altmode.h +++ b/include/linux/usb/typec_altmode.h @@ -26,6 +26,9 @@ struct typec_altmode_ops; * @mode: Index of the Mode * @vdo: VDO returned by Discover Modes USB PD command * @active: Tells has the mode been entered or not + * @priority: Priority used by the automatic alternate mode selection process + * @mode_selection: Whether entry to this alternate mode is managed by the + * automatic alternate mode selection process or by the specific driver * @desc: Optional human readable description of the mode * @ops: Operations vector from the driver * @cable_ops: Cable operations vector from the driver. -- cgit v1.2.3 From a43dd4f6f91ed1a1d16595cb0c550b283e9b2298 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 16 Mar 2026 15:03:00 +0000 Subject: power: supply: Add PD SPR AVS support to USB type enum Add two new members to the power_supply_usb_type to represent the USB Power Delivery (PD) Standard Power Range (SPR) Adjustable Voltage Supply (AVS) charging types: POWER_SUPPLY_USB_TYPE_PD_SPR_AVS: For devices supporting only the PD SPR AVS type. POWER_SUPPLY_USB_TYPE_PD_PPS_SPR_AVS: For devices that support both PD Programmable Power Supply (PPS) and PD SPR AVS. Signed-off-by: Badhri Jagan Sridharan Link: https://patch.msgid.link/20260316150301.3892223-3-badhri@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/power_supply.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 360ffdf272da..7a5e4c3242a0 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -210,6 +210,9 @@ enum power_supply_usb_type { POWER_SUPPLY_USB_TYPE_PD, /* Power Delivery Port */ POWER_SUPPLY_USB_TYPE_PD_DRP, /* PD Dual Role Port */ POWER_SUPPLY_USB_TYPE_PD_PPS, /* PD Programmable Power Supply */ + /* PD Standard Power Range Adjustable Voltage Supply */ + POWER_SUPPLY_USB_TYPE_PD_SPR_AVS, + POWER_SUPPLY_USB_TYPE_PD_PPS_SPR_AVS, /* Supports both PD PPS + SPR AVS */ POWER_SUPPLY_USB_TYPE_APPLE_BRICK_ID, /* Apple Charging Method */ }; -- cgit v1.2.3 From d3d959404e6c72e09db8de8893a970edf0ac565d Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 16 Mar 2026 15:03:01 +0000 Subject: tcpm: Implement sink support for PD SPR AVS negotiation Add support to enable TCPM to negotiate with USB PD Standard Power Range Adjustable Voltage Supply (SPR AVS) when acting as a power sink. * Added support to the tcpm power supply properties, allowing userspace to enable and control the dynamic limits (voltage and current) specific to the SPR AVS contract. * Implemented tcpm_pd_select_spr_avs_apdo() to select the appropriate APDO and validate the requested voltage/current against both the Source and Sink capabilities. * Implemented tcpm_pd_build_spr_avs_request() to construct the Request Data Object (RDO) for SPR AVS. * Added SNK_NEGOTIATE_SPR_AVS_CAPABILITIES state to the state machine to handle negotiation for SPR AVS. * Updated the SNK_TRANSITION_SINK state to implement the SPR AVS-specific VBUS transition rules, including reducing current draw to PD_I_SNK_STBY_MA for large voltage changes, as required by USB PD spec. Log stub captured when enabling AVS: $ echo 3 > /sys/class/power_supply/tcpm-source-psy-1-0025/online $ cat /d/usb/tcpm-1-0025/log [ 358.895775] request to set AVS online [ 358.895792] AMS POWER_NEGOTIATION start [ 358.895806] state change SNK_READY -> AMS_START [rev3 POWER_NEGOTIATION] [ 358.895850] state change AMS_START -> SNK_NEGOTIATE_SPR_AVS_CAPABILITIES [rev3 POWER_NEGOTIATION] [ 358.895866] SPR AVS src_pdo_index:4 snk_pdo_index:2 req_op_curr_ma roundup:2200 req_out_volt_mv roundup:9000 [ 358.895880] Requesting APDO SPR AVS 4: 9000 mV, 2200 mA [ 358.896405] set_auto_vbus_discharge_threshold mode:0 pps_active:n vbus:0 pps_apdo_min_volt:0 ret:0 [ 358.896422] PD TX, header: 0x1a82 [ 358.900158] PD TX complete, status: 0 [ 358.900205] pending state change SNK_NEGOTIATE_SPR_AVS_CAPABILITIES -> HARD_RESET_SEND @ 60 ms [rev3 POWER_NEGOTIATION] [ 358.904832] PD RX, header: 0x1a3 [1] [ 358.904854] state change SNK_NEGOTIATE_SPR_AVS_CAPABILITIES -> SNK_TRANSITION_SINK [rev3 POWER_NEGOTIATION] [ 358.904888] pending state change SNK_TRANSITION_SINK -> HARD_RESET_SEND @ 700 ms [rev3 POWER_NEGOTIATION] [ 359.021530] PD RX, header: 0x3a6 [1] [ 359.021546] Setting voltage/current limit 9000 mV 2200 mA [ 359.023035] set_auto_vbus_discharge_threshold mode:3 pps_active:n vbus:9000 pps_apdo_min_volt:0 ret:0 [ 359.023053] state change SNK_TRANSITION_SINK -> SNK_READY [rev3 POWER_NEGOTIATION] [ 359.023090] AMS POWER_NEGOTIATION finished $ cat /sys/class/power_supply/tcpm-source-psy-1-0025/online 3 Log stub captured when increasing voltage: $ echo 9100000 > /sys/class/power_supply/tcpm-source-psy-1-0025/voltage_now $ cat /d/usb/tcpm-1-0025/log [ 632.116714] AMS POWER_NEGOTIATION start [ 632.116728] state change SNK_READY -> AMS_START [rev3 POWER_NEGOTIATION] [ 632.116779] state change AMS_START -> SNK_NEGOTIATE_SPR_AVS_CAPABILITIES [rev3 POWER_NEGOTIATION] [ 632.116798] SPR AVS src_pdo_index:4 snk_pdo_index:2 req_op_curr_ma roundup:2200 req_out_volt_mv roundup:9100 [ 632.116811] Requesting APDO SPR AVS 4: 9100 mV, 2200 mA [ 632.117315] set_auto_vbus_discharge_threshold mode:0 pps_active:n vbus:0 pps_apdo_min_volt:0 ret:0 [ 632.117328] PD TX, header: 0x1c82 [ 632.121007] PD TX complete, status: 0 [ 632.121052] pending state change SNK_NEGOTIATE_SPR_AVS_CAPABILITIES -> HARD_RESET_SEND @ 60 ms [rev3 POWER_NEGOTIATION] [ 632.124572] PD RX, header: 0x5a3 [1] [ 632.124594] state change SNK_NEGOTIATE_SPR_AVS_CAPABILITIES -> SNK_TRANSITION_SINK [rev3 POWER_NEGOTIATION] [ 632.124623] pending state change SNK_TRANSITION_SINK -> HARD_RESET_SEND @ 700 ms [rev3 POWER_NEGOTIATION] [ 632.149256] PD RX, header: 0x7a6 [1] [ 632.149271] Setting voltage/current limit 9100 mV 2200 mA [ 632.150770] set_auto_vbus_discharge_threshold mode:3 pps_active:n vbus:9100 pps_apdo_min_volt:0 ret:0 [ 632.150787] state change SNK_TRANSITION_SINK -> SNK_READY [rev3 POWER_NEGOTIATION] [ 632.150823] AMS POWER_NEGOTIATION finished $ cat /sys/class/power_supply/tcpm-source-psy-1-0025/voltage_now 9100000 Signed-off-by: Badhri Jagan Sridharan Reviewed-by: Amit Sunil Dhamne Acked-by: Heikki Krogerus Link: https://patch.msgid.link/20260316150301.3892223-4-badhri@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/pd.h | 32 +++++++++++++++++++++++++++++--- include/linux/usb/tcpm.h | 2 +- 2 files changed, 30 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/pd.h b/include/linux/usb/pd.h index 5a98983195cb..337a5485af7c 100644 --- a/include/linux/usb/pd.h +++ b/include/linux/usb/pd.h @@ -398,9 +398,30 @@ enum pd_apdo_type { #define PDO_SPR_AVS_APDO_15V_TO_20V_MAX_CURR GENMASK(9, 0) /* 10mA unit */ /* SPR AVS has two different current ranges 9V - 15V, 15V - 20V */ -#define SPR_AVS_TIER1_MIN_VOLT_MV 9000 -#define SPR_AVS_TIER1_MAX_VOLT_MV 15000 -#define SPR_AVS_TIER2_MAX_VOLT_MV 20000 +#define SPR_AVS_TIER1_MIN_VOLT_MV 9000 +#define SPR_AVS_TIER1_MAX_VOLT_MV 15000 +#define SPR_AVS_TIER2_MAX_VOLT_MV 20000 + +#define SPR_AVS_AVS_SMALL_STEP_V 1 +/* vAvsStep - 100mv */ +#define SPR_AVS_VOLT_MV_STEP 100 +/* SPR AVS RDO Operating Current is in 50mA step */ +#define RDO_SPR_AVS_CURR_MA_STEP 50 +/* SPR AVS RDO Output voltage is in 25mV step */ +#define RDO_SPR_AVS_OUT_VOLT_MV_STEP 25 + +#define RDO_SPR_AVS_VOLT GENMASK(20, 9) +#define RDO_SPR_AVS_CURR GENMASK(6, 0) + +#define RDO_SPR_AVS_OUT_VOLT(mv) \ + FIELD_PREP(RDO_SPR_AVS_VOLT, ((mv) / RDO_SPR_AVS_OUT_VOLT_MV_STEP)) + +#define RDO_SPR_AVS_OP_CURR(ma) \ + FIELD_PREP(RDO_SPR_AVS_CURR, ((ma) / RDO_SPR_AVS_CURR_MA_STEP)) + +#define RDO_AVS(idx, out_mv, op_ma, flags) \ + (RDO_OBJ(idx) | (flags) | \ + RDO_SPR_AVS_OUT_VOLT(out_mv) | RDO_SPR_AVS_OP_CURR(op_ma)) static inline enum pd_pdo_type pdo_type(u32 pdo) { @@ -660,6 +681,11 @@ static inline unsigned int rdo_max_power(u32 rdo) #define PD_P_SNK_STDBY_MW 2500 /* 2500 mW */ +#define PD_I_SNK_STBY_MA 500 /* 500 mA */ + +#define PD_T_AVS_SRC_TRANS_SMALL 50 /* 50 ms */ +#define PD_T_AVS_SRC_TRANS_LARGE 700 /* 700 ms */ + #if IS_ENABLED(CONFIG_TYPEC) struct usb_power_delivery; diff --git a/include/linux/usb/tcpm.h b/include/linux/usb/tcpm.h index b22e659f81ba..93079450bba0 100644 --- a/include/linux/usb/tcpm.h +++ b/include/linux/usb/tcpm.h @@ -31,7 +31,7 @@ enum typec_cc_polarity { /* Time to wait for TCPC to complete transmit */ #define PD_T_TCPC_TX_TIMEOUT 100 /* in ms */ #define PD_ROLE_SWAP_TIMEOUT (MSEC_PER_SEC * 10) -#define PD_PPS_CTRL_TIMEOUT (MSEC_PER_SEC * 10) +#define PD_AUG_PSY_CTRL_TIMEOUT (MSEC_PER_SEC * 10) enum tcpm_transmit_status { TCPC_TX_SUCCESS = 0, -- cgit v1.2.3 From 16c1e8385b3bb65d412d7a60107f8894587c63fa Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Mar 2026 15:25:44 -0400 Subject: cpufreq: optimize policy_is_shared() The switch to cpumask_nth() over cpumask_weight(), as it may return earlier - as soon as the function counts the required number of CPUs. Signed-off-by: Yury Norov Acked-by: Viresh Kumar Reviewed-by: Zhongqiu Han Link: https://patch.msgid.link/20260314192544.605914-1-ynorov@nvidia.com Signed-off-by: Rafael J. Wysocki --- include/linux/cpufreq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index cc894fc38971..8ca2bcb3d7ae 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -232,7 +232,7 @@ static inline bool policy_is_inactive(struct cpufreq_policy *policy) static inline bool policy_is_shared(struct cpufreq_policy *policy) { - return cpumask_weight(policy->cpus) > 1; + return cpumask_nth(1, policy->cpus) < nr_cpumask_bits; } #ifdef CONFIG_CPU_FREQ -- cgit v1.2.3 From deffe1edba626d474fef38007c03646ca5876a0e Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Fri, 13 Mar 2026 14:48:02 +0100 Subject: module: Fix freeing of charp module parameters when CONFIG_SYSFS=n When setting a charp module parameter, the param_set_charp() function allocates memory to store a copy of the input value. Later, when the module is potentially unloaded, the destroy_params() function is called to free this allocated memory. However, destroy_params() is available only when CONFIG_SYSFS=y, otherwise only a dummy variant is present. In the unlikely case that the kernel is configured with CONFIG_MODULES=y and CONFIG_SYSFS=n, this results in a memory leak of charp values when a module is unloaded. Fix this issue by making destroy_params() always available when CONFIG_MODULES=y. Rename the function to module_destroy_params() to clarify that it is intended for use by the module loader. Fixes: e180a6b7759a ("param: fix charp parameters set via sysfs") Signed-off-by: Petr Pavlu Signed-off-by: Sami Tolvanen --- include/linux/moduleparam.h | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 7d22d4c4ea2e..8667f72503d9 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -426,14 +426,9 @@ extern char *parse_args(const char *name, void *arg, parse_unknown_fn unknown); /* Called by module remove. */ -#ifdef CONFIG_SYSFS -extern void destroy_params(const struct kernel_param *params, unsigned num); -#else -static inline void destroy_params(const struct kernel_param *params, - unsigned num) -{ -} -#endif /* !CONFIG_SYSFS */ +#ifdef CONFIG_MODULES +void module_destroy_params(const struct kernel_param *params, unsigned int num); +#endif /* All the helper functions */ /* The macros to do compile-time type checking stolen from Jakub -- cgit v1.2.3 From 65f535501e2a3378629b8650eca553920de5e5a2 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Fri, 13 Mar 2026 14:48:03 +0100 Subject: module: Clean up parse_args() arguments * Use the preferred `unsigned int` over plain `unsigned` for the `num` parameter. * Synchronize the parameter names in moduleparam.h with the ones used by the implementation in params.c. Signed-off-by: Petr Pavlu Signed-off-by: Sami Tolvanen --- include/linux/moduleparam.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 8667f72503d9..604bc6e9f3a1 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -417,12 +417,12 @@ extern bool parameqn(const char *name1, const char *name2, size_t n); typedef int (*parse_unknown_fn)(char *param, char *val, const char *doing, void *arg); /* Called on module insert or kernel boot */ -extern char *parse_args(const char *name, +extern char *parse_args(const char *doing, char *args, const struct kernel_param *params, - unsigned num, - s16 level_min, - s16 level_max, + unsigned int num, + s16 min_level, + s16 max_level, void *arg, parse_unknown_fn unknown); /* Called by module remove. */ -- cgit v1.2.3 From 44a063c00fb13cf1f2e8a53a2ab10b232a44954b Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Fri, 13 Mar 2026 14:48:04 +0100 Subject: module: Remove extern keyword from param prototypes The external function declarations do not need the "extern" keyword. Remove it to align with the Linux kernel coding style and to silence the associated checkpatch warnings. Signed-off-by: Petr Pavlu Signed-off-by: Sami Tolvanen --- include/linux/moduleparam.h | 89 ++++++++++++++++++++++----------------------- 1 file changed, 44 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 604bc6e9f3a1..075f28585074 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -317,8 +317,8 @@ struct kparam_array name, &__param_ops_##name, arg, perm, -1, 0) #ifdef CONFIG_SYSFS -extern void kernel_param_lock(struct module *mod); -extern void kernel_param_unlock(struct module *mod); +void kernel_param_lock(struct module *mod); +void kernel_param_unlock(struct module *mod); #else static inline void kernel_param_lock(struct module *mod) { @@ -398,7 +398,7 @@ static inline void kernel_param_unlock(struct module *mod) * Returns: true if the two parameter names are equal. * Dashes (-) are considered equal to underscores (_). */ -extern bool parameq(const char *name1, const char *name2); +bool parameq(const char *name1, const char *name2); /** * parameqn - checks if two parameter names match @@ -412,18 +412,18 @@ extern bool parameq(const char *name1, const char *name2); * are equal. * Dashes (-) are considered equal to underscores (_). */ -extern bool parameqn(const char *name1, const char *name2, size_t n); +bool parameqn(const char *name1, const char *name2, size_t n); typedef int (*parse_unknown_fn)(char *param, char *val, const char *doing, void *arg); /* Called on module insert or kernel boot */ -extern char *parse_args(const char *doing, - char *args, - const struct kernel_param *params, - unsigned int num, - s16 min_level, - s16 max_level, - void *arg, parse_unknown_fn unknown); +char *parse_args(const char *doing, + char *args, + const struct kernel_param *params, + unsigned int num, + s16 min_level, + s16 max_level, + void *arg, parse_unknown_fn unknown); /* Called by module remove. */ #ifdef CONFIG_MODULES @@ -437,78 +437,77 @@ void module_destroy_params(const struct kernel_param *params, unsigned int num); static inline type __always_unused *__check_##name(void) { return(p); } extern const struct kernel_param_ops param_ops_byte; -extern int param_set_byte(const char *val, const struct kernel_param *kp); -extern int param_get_byte(char *buffer, const struct kernel_param *kp); +int param_set_byte(const char *val, const struct kernel_param *kp); +int param_get_byte(char *buffer, const struct kernel_param *kp); #define param_check_byte(name, p) __param_check(name, p, unsigned char) extern const struct kernel_param_ops param_ops_short; -extern int param_set_short(const char *val, const struct kernel_param *kp); -extern int param_get_short(char *buffer, const struct kernel_param *kp); +int param_set_short(const char *val, const struct kernel_param *kp); +int param_get_short(char *buffer, const struct kernel_param *kp); #define param_check_short(name, p) __param_check(name, p, short) extern const struct kernel_param_ops param_ops_ushort; -extern int param_set_ushort(const char *val, const struct kernel_param *kp); -extern int param_get_ushort(char *buffer, const struct kernel_param *kp); +int param_set_ushort(const char *val, const struct kernel_param *kp); +int param_get_ushort(char *buffer, const struct kernel_param *kp); #define param_check_ushort(name, p) __param_check(name, p, unsigned short) extern const struct kernel_param_ops param_ops_int; -extern int param_set_int(const char *val, const struct kernel_param *kp); -extern int param_get_int(char *buffer, const struct kernel_param *kp); +int param_set_int(const char *val, const struct kernel_param *kp); +int param_get_int(char *buffer, const struct kernel_param *kp); #define param_check_int(name, p) __param_check(name, p, int) extern const struct kernel_param_ops param_ops_uint; -extern int param_set_uint(const char *val, const struct kernel_param *kp); -extern int param_get_uint(char *buffer, const struct kernel_param *kp); +int param_set_uint(const char *val, const struct kernel_param *kp); +int param_get_uint(char *buffer, const struct kernel_param *kp); int param_set_uint_minmax(const char *val, const struct kernel_param *kp, unsigned int min, unsigned int max); #define param_check_uint(name, p) __param_check(name, p, unsigned int) extern const struct kernel_param_ops param_ops_long; -extern int param_set_long(const char *val, const struct kernel_param *kp); -extern int param_get_long(char *buffer, const struct kernel_param *kp); +int param_set_long(const char *val, const struct kernel_param *kp); +int param_get_long(char *buffer, const struct kernel_param *kp); #define param_check_long(name, p) __param_check(name, p, long) extern const struct kernel_param_ops param_ops_ulong; -extern int param_set_ulong(const char *val, const struct kernel_param *kp); -extern int param_get_ulong(char *buffer, const struct kernel_param *kp); +int param_set_ulong(const char *val, const struct kernel_param *kp); +int param_get_ulong(char *buffer, const struct kernel_param *kp); #define param_check_ulong(name, p) __param_check(name, p, unsigned long) extern const struct kernel_param_ops param_ops_ullong; -extern int param_set_ullong(const char *val, const struct kernel_param *kp); -extern int param_get_ullong(char *buffer, const struct kernel_param *kp); +int param_set_ullong(const char *val, const struct kernel_param *kp); +int param_get_ullong(char *buffer, const struct kernel_param *kp); #define param_check_ullong(name, p) __param_check(name, p, unsigned long long) extern const struct kernel_param_ops param_ops_hexint; -extern int param_set_hexint(const char *val, const struct kernel_param *kp); -extern int param_get_hexint(char *buffer, const struct kernel_param *kp); +int param_set_hexint(const char *val, const struct kernel_param *kp); +int param_get_hexint(char *buffer, const struct kernel_param *kp); #define param_check_hexint(name, p) param_check_uint(name, p) extern const struct kernel_param_ops param_ops_charp; -extern int param_set_charp(const char *val, const struct kernel_param *kp); -extern int param_get_charp(char *buffer, const struct kernel_param *kp); -extern void param_free_charp(void *arg); +int param_set_charp(const char *val, const struct kernel_param *kp); +int param_get_charp(char *buffer, const struct kernel_param *kp); +void param_free_charp(void *arg); #define param_check_charp(name, p) __param_check(name, p, char *) /* We used to allow int as well as bool. We're taking that away! */ extern const struct kernel_param_ops param_ops_bool; -extern int param_set_bool(const char *val, const struct kernel_param *kp); -extern int param_get_bool(char *buffer, const struct kernel_param *kp); +int param_set_bool(const char *val, const struct kernel_param *kp); +int param_get_bool(char *buffer, const struct kernel_param *kp); #define param_check_bool(name, p) __param_check(name, p, bool) extern const struct kernel_param_ops param_ops_bool_enable_only; -extern int param_set_bool_enable_only(const char *val, - const struct kernel_param *kp); +int param_set_bool_enable_only(const char *val, const struct kernel_param *kp); /* getter is the same as for the regular bool */ #define param_check_bool_enable_only param_check_bool extern const struct kernel_param_ops param_ops_invbool; -extern int param_set_invbool(const char *val, const struct kernel_param *kp); -extern int param_get_invbool(char *buffer, const struct kernel_param *kp); +int param_set_invbool(const char *val, const struct kernel_param *kp); +int param_get_invbool(char *buffer, const struct kernel_param *kp); #define param_check_invbool(name, p) __param_check(name, p, bool) /* An int, which can only be set like a bool (though it shows as an int). */ extern const struct kernel_param_ops param_ops_bint; -extern int param_set_bint(const char *val, const struct kernel_param *kp); +int param_set_bint(const char *val, const struct kernel_param *kp); #define param_get_bint param_get_int #define param_check_bint param_check_int @@ -615,19 +614,19 @@ enum hwparam_type { extern const struct kernel_param_ops param_array_ops; extern const struct kernel_param_ops param_ops_string; -extern int param_set_copystring(const char *val, const struct kernel_param *); -extern int param_get_string(char *buffer, const struct kernel_param *kp); +int param_set_copystring(const char *val, const struct kernel_param *kp); +int param_get_string(char *buffer, const struct kernel_param *kp); /* for exporting parameters in /sys/module/.../parameters */ struct module; #if defined(CONFIG_SYSFS) && defined(CONFIG_MODULES) -extern int module_param_sysfs_setup(struct module *mod, - const struct kernel_param *kparam, - unsigned int num_params); +int module_param_sysfs_setup(struct module *mod, + const struct kernel_param *kparam, + unsigned int num_params); -extern void module_param_sysfs_remove(struct module *mod); +void module_param_sysfs_remove(struct module *mod); #else static inline int module_param_sysfs_setup(struct module *mod, const struct kernel_param *kparam, -- cgit v1.2.3 From 306c36a76da2d6d2b5e91db925d41a9a8d77dbfd Mon Sep 17 00:00:00 2001 From: Josh Law Date: Wed, 18 Mar 2026 15:59:12 +0000 Subject: bootconfig: constify xbc_calc_checksum() data parameter xbc_calc_checksum() only reads the data buffer, so mark the parameter as const void * and the internal pointer as const unsigned char *. Link: https://lore.kernel.org/all/20260318155919.78168-7-objecting@objecting.org/ Signed-off-by: Josh Law Signed-off-by: Masami Hiramatsu (Google) --- include/linux/bootconfig.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index 25df9260d206..23a96c5edcf3 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -36,9 +36,9 @@ bool __init cmdline_has_extra_options(void); * The checksum will be used with the BOOTCONFIG_MAGIC and the size for * embedding the bootconfig in the initrd image. */ -static inline __init uint32_t xbc_calc_checksum(void *data, uint32_t size) +static inline __init uint32_t xbc_calc_checksum(const void *data, uint32_t size) { - unsigned char *p = data; + const unsigned char *p = data; uint32_t ret = 0; while (size--) -- cgit v1.2.3 From 6eb255d019b810614c5cbd99b9ef281b7b9361e3 Mon Sep 17 00:00:00 2001 From: Josh Law Date: Wed, 18 Mar 2026 15:59:19 +0000 Subject: lib/bootconfig: change xbc_node_index() return type to uint16_t lib/bootconfig.c:136:21: warning: conversion from 'long int' to 'int' may change value [-Wconversion] lib/bootconfig.c:308:33: warning: conversion from 'int' to 'uint16_t' may change value [-Wconversion] lib/bootconfig.c:467:37: warning: conversion from 'int' to 'uint16_t' may change value [-Wconversion] lib/bootconfig.c:469:40: warning: conversion from 'int' to 'uint16_t' may change value [-Wconversion] lib/bootconfig.c:472:54: warning: conversion from 'int' to 'uint16_t' may change value [-Wconversion] lib/bootconfig.c:476:45: warning: conversion from 'int' to 'uint16_t' may change value [-Wconversion] xbc_node_index() returns the position of a node in the xbc_nodes array, which has at most XBC_NODE_MAX (8192) entries, well within uint16_t range. Every caller stores the result in a uint16_t field (node->parent, node->child, node->next, or the keys[] array in compose_key_after), so the int return type causes narrowing warnings at all six call sites. Change the return type to uint16_t and add an explicit cast on the pointer subtraction to match the storage width and eliminate the warnings. Link: https://lore.kernel.org/all/20260318155919.78168-14-objecting@objecting.org/ Signed-off-by: Josh Law Signed-off-by: Masami Hiramatsu (Google) --- include/linux/bootconfig.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index 23a96c5edcf3..692a5acc2ffc 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -66,7 +66,7 @@ struct xbc_node { /* Node tree access raw APIs */ struct xbc_node * __init xbc_root_node(void); -int __init xbc_node_index(struct xbc_node *node); +uint16_t __init xbc_node_index(struct xbc_node *node); struct xbc_node * __init xbc_node_get_parent(struct xbc_node *node); struct xbc_node * __init xbc_node_get_child(struct xbc_node *node); struct xbc_node * __init xbc_node_get_next(struct xbc_node *node); -- cgit v1.2.3 From 9c6b4009da5991db6bf02a47a578885a04a5e3f6 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 16 Mar 2026 11:04:03 +0100 Subject: net: mdio-gpio: remove linux/mdio-gpio.h The three defines from the linux/mdio-gpio.h header are only used in the mdio-gpio module. There's no reason to have them in a public header. Move them into the driver and remove mdio-gpio.h. Signed-off-by: Bartosz Golaszewski Link: https://patch.msgid.link/20260316-gpio-mdio-hdr-cleanup-v1-1-2df696f74728@oss.qualcomm.com Signed-off-by: Jakub Kicinski --- include/linux/mdio-gpio.h | 9 --------- 1 file changed, 9 deletions(-) delete mode 100644 include/linux/mdio-gpio.h (limited to 'include/linux') diff --git a/include/linux/mdio-gpio.h b/include/linux/mdio-gpio.h deleted file mode 100644 index cea443a672cb..000000000000 --- a/include/linux/mdio-gpio.h +++ /dev/null @@ -1,9 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __LINUX_MDIO_GPIO_H -#define __LINUX_MDIO_GPIO_H - -#define MDIO_GPIO_MDC 0 -#define MDIO_GPIO_MDIO 1 -#define MDIO_GPIO_MDO 2 - -#endif -- cgit v1.2.3 From 356d4fbcf3defaff0f98d2b6b54f3b26f0ff189d Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 16 Mar 2026 11:04:04 +0100 Subject: net: mdio-gpio: remove linux/platform_data/mdio-gpio.h Nobody defines struct mdio_gpio_platform_data. Remove platform data support from mdio-gpio and drop the header. Signed-off-by: Bartosz Golaszewski Link: https://patch.msgid.link/20260316-gpio-mdio-hdr-cleanup-v1-2-2df696f74728@oss.qualcomm.com Signed-off-by: Jakub Kicinski --- include/linux/platform_data/mdio-gpio.h | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100644 include/linux/platform_data/mdio-gpio.h (limited to 'include/linux') diff --git a/include/linux/platform_data/mdio-gpio.h b/include/linux/platform_data/mdio-gpio.h deleted file mode 100644 index 13874fa6e767..000000000000 --- a/include/linux/platform_data/mdio-gpio.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * MDIO-GPIO bus platform data structure - */ - -#ifndef __LINUX_MDIO_GPIO_PDATA_H -#define __LINUX_MDIO_GPIO_PDATA_H - -struct mdio_gpio_platform_data { - u32 phy_mask; - u32 phy_ignore_ta_mask; -}; - -#endif /* __LINUX_MDIO_GPIO_PDATA_H */ -- cgit v1.2.3 From dc3d720e12f602059490c1ab2bfee84a7465998f Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Tue, 17 Mar 2026 12:18:05 -0700 Subject: net: ethtool: add ethtool COALESCE_RX_CQE_FRAMES/NSECS Add two parameters for drivers supporting Rx CQE coalescing / descriptor writeback. ETHTOOL_A_COALESCE_RX_CQE_FRAMES: Maximum number of frames that can be coalesced into a CQE or writeback. ETHTOOL_A_COALESCE_RX_CQE_NSECS: Max time in nanoseconds after the first packet arrival in a coalesced CQE or writeback to be sent. Signed-off-by: Haiyang Zhang Link: https://patch.msgid.link/20260317191826.1346111-2-haiyangz@linux.microsoft.com Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 83c375840835..656d465bcd06 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -332,6 +332,8 @@ struct kernel_ethtool_coalesce { u32 tx_aggr_max_bytes; u32 tx_aggr_max_frames; u32 tx_aggr_time_usecs; + u32 rx_cqe_frames; + u32 rx_cqe_nsecs; }; /** @@ -380,7 +382,9 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, #define ETHTOOL_COALESCE_TX_AGGR_TIME_USECS BIT(26) #define ETHTOOL_COALESCE_RX_PROFILE BIT(27) #define ETHTOOL_COALESCE_TX_PROFILE BIT(28) -#define ETHTOOL_COALESCE_ALL_PARAMS GENMASK(28, 0) +#define ETHTOOL_COALESCE_RX_CQE_FRAMES BIT(29) +#define ETHTOOL_COALESCE_RX_CQE_NSECS BIT(30) +#define ETHTOOL_COALESCE_ALL_PARAMS GENMASK(30, 0) #define ETHTOOL_COALESCE_USECS \ (ETHTOOL_COALESCE_RX_USECS | ETHTOOL_COALESCE_TX_USECS) -- cgit v1.2.3 From a8eed0ba6a4b2f1803ecdfa9f11a4818cf87c474 Mon Sep 17 00:00:00 2001 From: Shardul Bankar Date: Wed, 18 Mar 2026 13:08:22 +0530 Subject: hfsplus: refactor b-tree map page access and add node-type validation In HFS+ b-trees, the node allocation bitmap is stored across multiple records. The first chunk resides in the b-tree Header Node at record index 2, while all subsequent chunks are stored in dedicated Map Nodes at record index 0. This structural quirk forces callers like hfs_bmap_alloc() and hfs_bmap_free() to duplicate boilerplate code to validate offsets, correct lengths, and map the underlying pages via kmap_local_page(). There is also currently no strict node-type validation before reading these records, leaving the allocator vulnerable if a corrupted image points a map linkage to an Index or Leaf node. Introduce a unified bit-level API to encapsulate the map record access: 1. A new `struct hfs_bmap_ctx` to cleanly pass state and safely handle page math across all architectures. 2. `hfs_bmap_get_map_page()`: Automatically validates node types (HFS_NODE_HEADER vs HFS_NODE_MAP), infers the correct record index, handles page-boundary math, and returns the unmapped `struct page *` directly to the caller to avoid asymmetric mappings. 3. `hfs_bmap_clear_bit()`: A clean wrapper that internally handles page mapping/unmapping for single-bit operations. Refactor hfs_bmap_alloc() and hfs_bmap_free() to utilize this new API. This deduplicates the allocator logic, hardens the map traversal against fuzzed images, and provides the exact abstractions needed for upcoming mount-time validation checks. Signed-off-by: Shardul Bankar Reviewed-by: Viacheslav Dubeyko Tested-by: Viacheslav Dubeyko Signed-off-by: Viacheslav Dubeyko Link: https://lore.kernel.org/r/20260318073823.3933718-2-shardul.b@mpiricsoftware.com Signed-off-by: Viacheslav Dubeyko --- include/linux/hfs_common.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hfs_common.h b/include/linux/hfs_common.h index dadb5e0aa8a3..be24c687858e 100644 --- a/include/linux/hfs_common.h +++ b/include/linux/hfs_common.h @@ -510,6 +510,8 @@ struct hfs_btree_header_rec { #define HFSPLUS_NODE_MXSZ 32768 #define HFSPLUS_ATTR_TREE_NODE_SIZE 8192 #define HFSPLUS_BTREE_HDR_NODE_RECS_COUNT 3 +#define HFSPLUS_BTREE_HDR_MAP_REC_INDEX 2 /* Map (bitmap) record in Header node */ +#define HFSPLUS_BTREE_MAP_NODE_REC_INDEX 0 /* Map record in Map Node */ #define HFSPLUS_BTREE_HDR_USER_BYTES 128 /* btree key type */ -- cgit v1.2.3 From c888c4c834c9afc18879fde91ad405be21b7425d Mon Sep 17 00:00:00 2001 From: Svyatoslav Ryhel Date: Tue, 3 Mar 2026 10:42:28 +0200 Subject: gpu: host1x: convert MIPI to use operation function pointers Convert existing MIPI code to use operation function pointers, a necessary step for supporting Tegra20/Tegra30 SoCs. All common MIPI configuration that is SoC-independent remains in mipi.c, while all SoC-specific code is moved to tegra114-mipi.c (The naming matches the first SoC generation with a dedicated calibration block). Shared structures and function calls are placed into tegra-mipi-cal.h. Tested-by: Luca Ceresoli # tegra20, parallel camera Signed-off-by: Svyatoslav Ryhel Acked-by: Mikko Perttunen Signed-off-by: Hans Verkuil --- include/linux/host1x.h | 10 -------- include/linux/tegra-mipi-cal.h | 57 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 10 deletions(-) create mode 100644 include/linux/tegra-mipi-cal.h (limited to 'include/linux') diff --git a/include/linux/host1x.h b/include/linux/host1x.h index 5e7a63143a4a..1f5f55917d1c 100644 --- a/include/linux/host1x.h +++ b/include/linux/host1x.h @@ -453,16 +453,6 @@ void host1x_client_unregister(struct host1x_client *client); int host1x_client_suspend(struct host1x_client *client); int host1x_client_resume(struct host1x_client *client); -struct tegra_mipi_device; - -struct tegra_mipi_device *tegra_mipi_request(struct device *device, - struct device_node *np); -void tegra_mipi_free(struct tegra_mipi_device *device); -int tegra_mipi_enable(struct tegra_mipi_device *device); -int tegra_mipi_disable(struct tegra_mipi_device *device); -int tegra_mipi_start_calibration(struct tegra_mipi_device *device); -int tegra_mipi_finish_calibration(struct tegra_mipi_device *device); - /* host1x memory contexts */ struct host1x_memory_context { diff --git a/include/linux/tegra-mipi-cal.h b/include/linux/tegra-mipi-cal.h new file mode 100644 index 000000000000..2a540b50f65d --- /dev/null +++ b/include/linux/tegra-mipi-cal.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __TEGRA_MIPI_CAL_H_ +#define __TEGRA_MIPI_CAL_H_ + +struct tegra_mipi_device { + const struct tegra_mipi_ops *ops; + struct platform_device *pdev; + unsigned long pads; +}; + +/** + * Operations for Tegra MIPI calibration device + */ +struct tegra_mipi_ops { + /** + * @enable: + * + * Enable MIPI calibration device + */ + int (*enable)(struct tegra_mipi_device *device); + + /** + * @disable: + * + * Disable MIPI calibration device + */ + int (*disable)(struct tegra_mipi_device *device); + + /** + * @start_calibration: + * + * Start MIPI calibration + */ + int (*start_calibration)(struct tegra_mipi_device *device); + + /** + * @finish_calibration: + * + * Finish MIPI calibration + */ + int (*finish_calibration)(struct tegra_mipi_device *device); +}; + +int devm_tegra_mipi_add_provider(struct device *device, struct device_node *np, + const struct tegra_mipi_ops *ops); + +struct tegra_mipi_device *tegra_mipi_request(struct device *device, + struct device_node *np); +void tegra_mipi_free(struct tegra_mipi_device *device); + +int tegra_mipi_enable(struct tegra_mipi_device *device); +int tegra_mipi_disable(struct tegra_mipi_device *device); +int tegra_mipi_start_calibration(struct tegra_mipi_device *device); +int tegra_mipi_finish_calibration(struct tegra_mipi_device *device); + +#endif /* __TEGRA_MIPI_CAL_H_ */ -- cgit v1.2.3 From 7a3ac62473f2bd213557e41aaab7a8f144037dfd Mon Sep 17 00:00:00 2001 From: Lucas Zampieri Date: Sat, 14 Mar 2026 01:05:29 +0000 Subject: HID: input: Introduce struct hid_battery and refactor battery code Introduce struct hid_battery to encapsulate individual battery state, preparing for future multi-battery support. The new structure contains all battery-related fields previously stored directly in hid_device (capacity, min, max, report_type, report_id, charge_status, etc.). The hid_device->battery pointer type changes from struct power_supply* to struct hid_battery*, and all battery functions are refactored accordingly. A hid_get_battery() helper is added for external drivers, with hid-apple.c and hid-magicmouse.c updated to use the new API. The hid-input-test.c KUnit tests are also updated for the new structure. No functional changes for single-battery devices. Signed-off-by: Lucas Zampieri Signed-off-by: Benjamin Tissoires --- include/linux/hid.h | 52 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index 31324609af4d..e4e2a5643bda 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -634,6 +634,36 @@ enum hid_battery_status { HID_BATTERY_REPORTED, /* Device sent unsolicited battery strength report */ }; +/** + * struct hid_battery - represents a single battery power supply + * @dev: pointer to the parent hid_device + * @ps: the power supply instance + * @min: minimum battery value from HID descriptor + * @max: maximum battery value from HID descriptor + * @report_type: HID report type (input/feature) + * @report_id: HID report ID for this battery + * @charge_status: current charging status + * @status: battery reporting status + * @capacity: current battery capacity (0-100) + * @avoid_query: if true, avoid querying battery (e.g., for stylus) + * @present: if true, battery is present (may be dynamic) + * @ratelimit_time: rate limiting for battery reports + */ +struct hid_battery { + struct hid_device *dev; + struct power_supply *ps; + __s32 min; + __s32 max; + __s32 report_type; + __s32 report_id; + __s32 charge_status; + enum hid_battery_status status; + __s32 capacity; + bool avoid_query; + bool present; + ktime_t ratelimit_time; +}; + struct hid_driver; struct hid_ll_driver; @@ -670,20 +700,9 @@ struct hid_device { #ifdef CONFIG_HID_BATTERY_STRENGTH /* * Power supply information for HID devices which report - * battery strength. power_supply was successfully registered if - * battery is non-NULL. + * battery strength. battery is non-NULL if successfully registered. */ - struct power_supply *battery; - __s32 battery_capacity; - __s32 battery_min; - __s32 battery_max; - __s32 battery_report_type; - __s32 battery_report_id; - __s32 battery_charge_status; - enum hid_battery_status battery_status; - bool battery_avoid_query; - bool battery_present; - ktime_t battery_ratelimit_time; + struct hid_battery *battery; #endif unsigned long status; /* see STAT flags above */ @@ -744,6 +763,13 @@ static inline void hid_set_drvdata(struct hid_device *hdev, void *data) dev_set_drvdata(&hdev->dev, data); } +#ifdef CONFIG_HID_BATTERY_STRENGTH +static inline struct hid_battery *hid_get_battery(struct hid_device *hdev) +{ + return hdev->battery; +} +#endif + #define HID_GLOBAL_STACK_SIZE 4 #define HID_COLLECTION_STACK_SIZE 4 -- cgit v1.2.3 From 4a58ae85c3f9b142ffba023d0f976978ade57d1b Mon Sep 17 00:00:00 2001 From: Lucas Zampieri Date: Sat, 14 Mar 2026 01:05:30 +0000 Subject: HID: input: Add support for multiple batteries per device Add support for HID devices that report multiple batteries, each identified by its report ID. The hid_device->battery pointer is replaced with a batteries list. Batteries are named using the pattern hid-{uniq}-battery-{report_id}. The hid_get_battery() helper returns the first battery in the list for backwards compatibility with single-battery drivers. Signed-off-by: Lucas Zampieri Signed-off-by: Benjamin Tissoires --- include/linux/hid.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index e4e2a5643bda..fb1a3f3ad9fa 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -648,6 +648,7 @@ enum hid_battery_status { * @avoid_query: if true, avoid querying battery (e.g., for stylus) * @present: if true, battery is present (may be dynamic) * @ratelimit_time: rate limiting for battery reports + * @list: list node for linking into hid_device's battery list */ struct hid_battery { struct hid_device *dev; @@ -662,6 +663,7 @@ struct hid_battery { bool avoid_query; bool present; ktime_t ratelimit_time; + struct list_head list; }; struct hid_driver; @@ -700,9 +702,10 @@ struct hid_device { #ifdef CONFIG_HID_BATTERY_STRENGTH /* * Power supply information for HID devices which report - * battery strength. battery is non-NULL if successfully registered. + * battery strength. Each battery is tracked separately in the + * batteries list. */ - struct hid_battery *battery; + struct list_head batteries; #endif unsigned long status; /* see STAT flags above */ @@ -766,7 +769,9 @@ static inline void hid_set_drvdata(struct hid_device *hdev, void *data) #ifdef CONFIG_HID_BATTERY_STRENGTH static inline struct hid_battery *hid_get_battery(struct hid_device *hdev) { - return hdev->battery; + if (list_empty(&hdev->batteries)) + return NULL; + return list_first_entry(&hdev->batteries, struct hid_battery, list); } #endif -- cgit v1.2.3 From a1e97ce80d9f41d0bb83951d758ff6fe49f3de60 Mon Sep 17 00:00:00 2001 From: Yang Xiuwei Date: Tue, 17 Mar 2026 15:22:25 +0800 Subject: bsg: add io_uring command support to generic layer Add an io_uring command handler to the generic BSG layer. The new .uring_cmd file operation validates io_uring features and delegates handling to a per-queue bsg_uring_cmd_fn callback. Extend bsg_register_queue() so transport drivers can register both sg_io and io_uring command handlers. Signed-off-by: Yang Xiuwei Reviewed-by: Bart Van Assche Link: https://patch.msgid.link/20260317072226.2598233-3-yangxiuwei@kylinos.cn Signed-off-by: Jens Axboe --- include/linux/bsg.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bsg.h b/include/linux/bsg.h index ee2df73edf83..162730bfc2d8 100644 --- a/include/linux/bsg.h +++ b/include/linux/bsg.h @@ -7,13 +7,17 @@ struct bsg_device; struct device; struct request_queue; +struct io_uring_cmd; typedef int (bsg_sg_io_fn)(struct request_queue *, struct sg_io_v4 *hdr, bool open_for_write, unsigned int timeout); +typedef int (bsg_uring_cmd_fn)(struct request_queue *q, struct io_uring_cmd *ioucmd, + unsigned int issue_flags, bool open_for_write); + struct bsg_device *bsg_register_queue(struct request_queue *q, struct device *parent, const char *name, - bsg_sg_io_fn *sg_io_fn); + bsg_sg_io_fn *sg_io_fn, bsg_uring_cmd_fn *uring_cmd_fn); void bsg_unregister_queue(struct bsg_device *bcd); #endif /* _LINUX_BSG_H */ -- cgit v1.2.3 From f656807150e3e1c6f76cab918e5adfad6d881d58 Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Thu, 19 Mar 2026 15:52:34 +0000 Subject: KVM: arm64: gic-v5: Detect implemented PPIs on boot As part of booting the system and initialising KVM, create and populate a mask of the implemented PPIs. This mask allows future PPI operations (such as save/restore or state, or syncing back into the shadow state) to only consider PPIs that are actually implemented on the host. The set of implemented virtual PPIs matches the set of implemented physical PPIs for a GICv5 host. Therefore, this mask represents all PPIs that could ever by used by a GICv5-based guest on a specific host, albeit pre-filtered by what we support in KVM (see next paragraph). Only architected PPIs are currently supported in KVM with GICv5. Moreover, as KVM only supports a subset of all possible PPIS (Timers, PMU, GICv5 SW_PPI) the PPI mask only includes these PPIs, if present. The timers are always assumed to be present; if we have KVM we have EL2, which means that we have the EL1 & EL2 Timer PPIs. If we have a PMU (v3), then the PMUIRQ is present. The GICv5 SW_PPI is always assumed to be present. Signed-off-by: Sascha Bischoff Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260319154937.3619520-12-sascha.bischoff@arm.com Signed-off-by: Marc Zyngier --- include/linux/irqchip/arm-gic-v5.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h index b78488df6c98..b1566a7c93ec 100644 --- a/include/linux/irqchip/arm-gic-v5.h +++ b/include/linux/irqchip/arm-gic-v5.h @@ -24,6 +24,28 @@ #define GICV5_HWIRQ_TYPE_LPI UL(0x2) #define GICV5_HWIRQ_TYPE_SPI UL(0x3) +/* + * Architected PPIs + */ +#define GICV5_ARCH_PPI_S_DB_PPI 0x0 +#define GICV5_ARCH_PPI_RL_DB_PPI 0x1 +#define GICV5_ARCH_PPI_NS_DB_PPI 0x2 +#define GICV5_ARCH_PPI_SW_PPI 0x3 +#define GICV5_ARCH_PPI_HACDBSIRQ 0xf +#define GICV5_ARCH_PPI_CNTHVS 0x13 +#define GICV5_ARCH_PPI_CNTHPS 0x14 +#define GICV5_ARCH_PPI_PMBIRQ 0x15 +#define GICV5_ARCH_PPI_COMMIRQ 0x16 +#define GICV5_ARCH_PPI_PMUIRQ 0x17 +#define GICV5_ARCH_PPI_CTIIRQ 0x18 +#define GICV5_ARCH_PPI_GICMNT 0x19 +#define GICV5_ARCH_PPI_CNTHP 0x1a +#define GICV5_ARCH_PPI_CNTV 0x1b +#define GICV5_ARCH_PPI_CNTHV 0x1c +#define GICV5_ARCH_PPI_CNTPS 0x1d +#define GICV5_ARCH_PPI_CNTP 0x1e +#define GICV5_ARCH_PPI_TRBIRQ 0x1f + /* * Tables attributes */ -- cgit v1.2.3 From 9b8e3d4ca0e734dd13dc261c5f888b359f8f5983 Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Thu, 19 Mar 2026 15:54:08 +0000 Subject: KVM: arm64: gic-v5: Implement GICv5 load/put and save/restore This change introduces GICv5 load/put. Additionally, it plumbs in save/restore for: * PPIs (ICH_PPI_x_EL2 regs) * ICH_VMCR_EL2 * ICH_APR_EL2 * ICC_ICSR_EL1 A GICv5-specific enable bit is added to struct vgic_vmcr as this differs from previous GICs. On GICv5-native systems, the VMCR only contains the enable bit (driven by the guest via ICC_CR0_EL1.EN) and the priority mask (PCR). A struct gicv5_vpe is also introduced. This currently only contains a single field - bool resident - which is used to track if a VPE is currently running or not, and is used to avoid a case of double load or double put on the WFI path for a vCPU. This struct will be extended as additional GICv5 support is merged, specifically for VPE doorbells. Co-authored-by: Timothy Hayes Signed-off-by: Timothy Hayes Signed-off-by: Sascha Bischoff Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260319154937.3619520-18-sascha.bischoff@arm.com Signed-off-by: Marc Zyngier --- include/linux/irqchip/arm-gic-v5.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h index b1566a7c93ec..40d2fce68294 100644 --- a/include/linux/irqchip/arm-gic-v5.h +++ b/include/linux/irqchip/arm-gic-v5.h @@ -387,6 +387,11 @@ int gicv5_spi_irq_set_type(struct irq_data *d, unsigned int type); int gicv5_irs_iste_alloc(u32 lpi); void gicv5_irs_syncr(void); +/* Embedded in kvm.arch */ +struct gicv5_vpe { + bool resident; +}; + struct gicv5_its_devtab_cfg { union { struct { -- cgit v1.2.3 From 37a25294682d28ef3bd131566602450a72c4d839 Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Thu, 19 Mar 2026 15:58:48 +0000 Subject: KVM: arm64: gic-v5: Introduce kvm_arm_vgic_v5_ops and register them Only the KVM_DEV_ARM_VGIC_GRP_CTRL->KVM_DEV_ARM_VGIC_CTRL_INIT op is currently supported. All other ops are stubbed out. Co-authored-by: Timothy Hayes Signed-off-by: Timothy Hayes Signed-off-by: Sascha Bischoff Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260319154937.3619520-36-sascha.bischoff@arm.com Signed-off-by: Marc Zyngier --- include/linux/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6b76e7a6f4c2..779d9ed85cbf 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2366,6 +2366,7 @@ void kvm_unregister_device_ops(u32 type); extern struct kvm_device_ops kvm_mpic_ops; extern struct kvm_device_ops kvm_arm_vgic_v2_ops; extern struct kvm_device_ops kvm_arm_vgic_v3_ops; +extern struct kvm_device_ops kvm_arm_vgic_v5_ops; #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT -- cgit v1.2.3 From 50ff3f404617c5d15832fec3711978104c4c9efd Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 17 Mar 2026 18:17:49 +0200 Subject: vfio: Add support for VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2 Currently, existing VFIO_MIG_GET_PRECOPY_INFO implementations don't assign info.flags before copy_to_user(). Because they copy the struct in from userspace first, this effectively echoes userspace-provided flags back as output, preventing the field from being used to report new reliable data from the drivers. Add support for a new device feature named VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2. On SET, enables the v2 pre_copy_info behaviour, where the vfio_precopy_info.flags is a valid output field. Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20260317161753.18964-3-yishaih@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index e90859956514..7c1d33283e04 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -52,6 +52,7 @@ struct vfio_device { struct vfio_device_set *dev_set; struct list_head dev_set_list; unsigned int migration_flags; + u8 precopy_info_v2; struct kvm *kvm; /* Members below here are private, not for driver use */ -- cgit v1.2.3 From c995498636c704641c9e809c31b59445b48f7adc Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 17 Mar 2026 18:17:50 +0200 Subject: vfio: Adapt drivers to use the core helper vfio_check_precopy_ioctl Introduce a core helper function for VFIO_MIG_GET_PRECOPY_INFO and adapt all drivers to use it. It centralizes the common code and ensures that output flags are cleared on entry, in case user opts in to VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2. This preventing any unintended echoing of userspace data back to userspace. Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20260317161753.18964-4-yishaih@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 7c1d33283e04..50b474334a19 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -16,6 +16,7 @@ #include #include #include +#include struct kvm; struct iommufd_ctx; @@ -285,6 +286,44 @@ static inline int vfio_check_feature(u32 flags, size_t argsz, u32 supported_ops, return 1; } +/** + * vfio_check_precopy_ioctl - Validate user input for the VFIO_MIG_GET_PRECOPY_INFO ioctl + * @vdev: The vfio device + * @cmd: Cmd from the ioctl + * @arg: Arg from the ioctl + * @info: Driver pointer to hold the userspace input to the ioctl + * + * For use in a driver's get_precopy_info. Checks that the inputs to the + * VFIO_MIG_GET_PRECOPY_INFO ioctl are correct. + + * Returns 0 on success, otherwise errno. + */ + +static inline int +vfio_check_precopy_ioctl(struct vfio_device *vdev, unsigned int cmd, + unsigned long arg, struct vfio_precopy_info *info) +{ + unsigned long minsz; + + if (cmd != VFIO_MIG_GET_PRECOPY_INFO) + return -ENOTTY; + + minsz = offsetofend(struct vfio_precopy_info, dirty_bytes); + + if (copy_from_user(info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info->argsz < minsz) + return -EINVAL; + + /* keep v1 behaviour as is for compatibility reasons */ + if (vdev->precopy_info_v2) + /* flags are output, set its initial value to 0 */ + info->flags = 0; + + return 0; +} + struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev, const struct vfio_device_ops *ops); #define vfio_alloc_device(dev_struct, member, dev, ops) \ -- cgit v1.2.3 From 4bee09a5dbd14e3369926b14b4ee14e22ebfc1f6 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 17 Mar 2026 18:17:51 +0200 Subject: net/mlx5: Add IFC bits for migration state Add the relevant IFC bits for querying an extra migration state from the device. Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20260317161753.18964-5-yishaih@nvidia.com Signed-off-by: Alex Williamson --- include/linux/mlx5/mlx5_ifc.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 775cb0c56865..1c8922c58c8f 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2173,7 +2173,8 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 sf_eq_usage[0x1]; u8 reserved_at_d3[0x5]; u8 multiplane[0x1]; - u8 reserved_at_d9[0x7]; + u8 migration_state[0x1]; + u8 reserved_at_da[0x6]; u8 cross_vhca_object_to_object_supported[0x20]; @@ -13280,13 +13281,24 @@ struct mlx5_ifc_query_vhca_migration_state_in_bits { u8 reserved_at_60[0x20]; }; +enum { + MLX5_QUERY_VHCA_MIG_STATE_UNINITIALIZED = 0x0, + MLX5_QUERY_VHCA_MIG_STATE_OPER_MIGRATION_IDLE = 0x1, + MLX5_QUERY_VHCA_MIG_STATE_OPER_MIGRATION_READY = 0x2, + MLX5_QUERY_VHCA_MIG_STATE_OPER_MIGRATION_DIRTY = 0x3, + MLX5_QUERY_VHCA_MIG_STATE_OPER_MIGRATION_INIT = 0x4, +}; + struct mlx5_ifc_query_vhca_migration_state_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; u8 syndrome[0x20]; - u8 reserved_at_40[0x40]; + u8 reserved_at_40[0x20]; + + u8 migration_state[0x4]; + u8 reserved_at_64[0x1c]; u8 required_umem_size[0x20]; -- cgit v1.2.3 From 418eab7a6f3c002d8e64d6e95ec27118017019af Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Mar 2026 14:29:20 -0600 Subject: io_uring/kbuf: propagate BUF_MORE through early buffer commit path When io_should_commit() returns true (eg for non-pollable files), buffer commit happens at buffer selection time and sel->buf_list is set to NULL. When __io_put_kbufs() generates CQE flags at completion time, it calls __io_put_kbuf_ring() which finds a NULL buffer_list and hence cannot determine whether the buffer was consumed or not. This means that IORING_CQE_F_BUF_MORE is never set for non-pollable input with incrementally consumed buffers. Likewise for io_buffers_select(), which always commits upfront and discards the return value of io_kbuf_commit(). Add REQ_F_BUF_MORE to store the result of io_kbuf_commit() during early commit. Then __io_put_kbuf_ring() can check this flag and set IORING_F_BUF_MORE accordingy. Reported-by: Martin Michaelis Cc: stable@vger.kernel.org Fixes: ae98dbf43d75 ("io_uring/kbuf: add support for incremental buffer consumption") Link: https://github.com/axboe/liburing/issues/1553 Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index dd1420bfcb73..214fdbd49052 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -541,6 +541,7 @@ enum { REQ_F_BL_NO_RECYCLE_BIT, REQ_F_BUFFERS_COMMIT_BIT, REQ_F_BUF_NODE_BIT, + REQ_F_BUF_MORE_BIT, REQ_F_HAS_METADATA_BIT, REQ_F_IMPORT_BUFFER_BIT, REQ_F_SQE_COPIED_BIT, @@ -626,6 +627,8 @@ enum { REQ_F_BUFFERS_COMMIT = IO_REQ_FLAG(REQ_F_BUFFERS_COMMIT_BIT), /* buf node is valid */ REQ_F_BUF_NODE = IO_REQ_FLAG(REQ_F_BUF_NODE_BIT), + /* incremental buffer consumption, more space available */ + REQ_F_BUF_MORE = IO_REQ_FLAG(REQ_F_BUF_MORE_BIT), /* request has read/write metadata assigned */ REQ_F_HAS_METADATA = IO_REQ_FLAG(REQ_F_HAS_METADATA_BIT), /* -- cgit v1.2.3 From 3fe1dcbc2d20c5dbc581c0bb458e05365bfffcf7 Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Sat, 7 Mar 2026 03:00:09 -0600 Subject: module: expose imported namespaces via sysfs Add /sys/module/*/import_ns to expose imported namespaces for currently loaded modules. The file contains one namespace per line and only exists for modules that import at least one namespace. Previously, the only way for userspace to inspect the symbol namespaces a module imports is to locate the .ko on disk and invoke modinfo(8) to decompress/parse the metadata. The kernel validated namespaces at load time, but it was otherwise discarded. Exposing this data via sysfs provides a runtime mechanism to verify which namespaces are being used by modules. For example, this allows userspace to audit driver API access in Android GKI, which uses symbol namespaces to restrict vendor drivers from using specific kernel interfaces (e.g., direct filesystem access). Signed-off-by: Nicholas Sielicki [Sami: Updated the commit message to explain motivation.] Signed-off-by: Sami Tolvanen --- include/linux/module.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 14f391b186c6..60ed1c3e0ed9 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -413,6 +413,7 @@ struct module { struct module_attribute *modinfo_attrs; const char *version; const char *srcversion; + const char *imported_namespaces; struct kobject *holders_dir; /* Exported symbols */ -- cgit v1.2.3 From e61af3ca893363838f263f3528476f6e1faed9aa Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Wed, 18 Mar 2026 12:10:37 -0700 Subject: hsi: hsi_core: use kzalloc_flex Simplifies allocations by using a flexible array member in this struct. Add __counted_by to get extra runtime analysis. Signed-off-by: Rosen Penev Link: https://patch.msgid.link/20260318191037.5661-1-rosenp@gmail.com Signed-off-by: Sebastian Reichel --- include/linux/hsi/hsi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hsi/hsi.h b/include/linux/hsi/hsi.h index 6ca92bff02c6..ea6bef9b6012 100644 --- a/include/linux/hsi/hsi.h +++ b/include/linux/hsi/hsi.h @@ -271,7 +271,7 @@ struct hsi_controller { struct module *owner; unsigned int id; unsigned int num_ports; - struct hsi_port **port; + struct hsi_port *port[] __counted_by(num_ports); }; #define to_hsi_controller(dev) container_of(dev, struct hsi_controller, device) -- cgit v1.2.3 From 9bb0a4d6a4433b75274204b083dac8e515d2007d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 16 Mar 2026 21:06:47 +0200 Subject: dma-mapping: Clarify valid conditions for CPU cache line overlap Rename the DMA_ATTR_CPU_CACHE_CLEAN attribute to better reflect that it is debugging aid to inform DMA core code that CPU cache line overlaps are allowed, and refine the documentation describing its use. Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260316-dma-debug-overlap-v3-3-1dde90a7f08b@nvidia.com --- include/linux/dma-mapping.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 29973baa0581..da44394b3a1a 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -80,11 +80,11 @@ #define DMA_ATTR_MMIO (1UL << 10) /* - * DMA_ATTR_CPU_CACHE_CLEAN: Indicates the CPU will not dirty any cacheline - * overlapping this buffer while it is mapped for DMA. All mappings sharing - * a cacheline must have this attribute for this to be considered safe. + * DMA_ATTR_DEBUGGING_IGNORE_CACHELINES: Indicates the CPU cache line can be + * overlapped. All mappings sharing a cacheline must have this attribute for + * this to be considered safe. */ -#define DMA_ATTR_CPU_CACHE_CLEAN (1UL << 11) +#define DMA_ATTR_DEBUGGING_IGNORE_CACHELINES (1UL << 11) /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can -- cgit v1.2.3 From e6a58fa2556203a7f6731b4071705dc81cca5ca5 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 16 Mar 2026 21:06:48 +0200 Subject: dma-mapping: Introduce DMA require coherency attribute The mapping buffers which carry this attribute require DMA coherent system. This means that they can't take SWIOTLB path, can perform CPU cache overlap and doesn't perform cache flushing. Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260316-dma-debug-overlap-v3-4-1dde90a7f08b@nvidia.com --- include/linux/dma-mapping.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index da44394b3a1a..482b919f040f 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -86,6 +86,13 @@ */ #define DMA_ATTR_DEBUGGING_IGNORE_CACHELINES (1UL << 11) +/* + * DMA_ATTR_REQUIRE_COHERENT: Indicates that DMA coherency is required. + * All mappings that carry this attribute can't work with SWIOTLB and cache + * flushing. + */ +#define DMA_ATTR_REQUIRE_COHERENT (1UL << 12) + /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can * be given to a device to use as a DMA source or target. It is specific to a -- cgit v1.2.3 From 1613462be621ad5103ec338a7b0ca0746ec4e5f1 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 14 Oct 2025 13:28:15 +0200 Subject: xen/privcmd: add boot control for restricted usage in domU When running in an unprivileged domU under Xen, the privcmd driver is restricted to allow only hypercalls against a target domain, for which the current domU is acting as a device model. Add a boot parameter "unrestricted" to allow all hypercalls (the hypervisor will still refuse destructive hypercalls affecting other guests). Make this new parameter effective only in case the domU wasn't started using secure boot, as otherwise hypercalls targeting the domU itself might result in violating the secure boot functionality. This is achieved by adding another lockdown reason, which can be tested to not being set when applying the "unrestricted" option. This is part of XSA-482 Signed-off-by: Juergen Gross --- V2: - new patch --- include/linux/security.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 83a646d72f6f..ee88dd2d2d1f 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -145,6 +145,7 @@ enum lockdown_reason { LOCKDOWN_BPF_WRITE_USER, LOCKDOWN_DBG_WRITE_KERNEL, LOCKDOWN_RTAS_ERROR_INJECTION, + LOCKDOWN_XEN_USER_ACTIONS, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_KCORE, LOCKDOWN_KPROBES, -- cgit v1.2.3 From d9794c0600f95b226b6672c5b364e44c80d660c5 Mon Sep 17 00:00:00 2001 From: Kit Dallege Date: Sun, 15 Mar 2026 18:10:01 +0100 Subject: dma-mapping: fix false kernel-doc comment marker Change /** to /* for the DMA attributes list comment in dma-mapping.h. The comment is not a kernel-doc structured comment and should not use the kernel-doc opening marker. Assisted-by: Claude:claude-opus-4-6 Signed-off-by: Kit Dallege Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260315171001.66010-1-xaum.io@gmail.com --- include/linux/dma-mapping.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 29973baa0581..0c2807e50bdf 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -9,7 +9,7 @@ #include #include -/** +/* * List of possible attributes associated with a DMA mapping. The semantics * of each attribute should be defined in Documentation/core-api/dma-attributes.rst. */ -- cgit v1.2.3 From 763aacf86f1baefb134c70813aa8c72d1675d738 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Mar 2026 10:01:54 +0100 Subject: clocksource: Rewrite watchdog code completely The clocksource watchdog code has over time reached the state of an impenetrable maze of duct tape and staples. The original design, which was made in the context of systems far smaller than today, is based on the assumption that the to be monitored clocksource (TSC) can be trivially compared against a known to be stable clocksource (HPET/ACPI-PM timer). Over the years it turned out that this approach has major flaws: - Long delays between watchdog invocations can result in wrap arounds of the reference clocksource - Scalability of the reference clocksource readout can degrade on large multi-socket systems due to interconnect congestion This was addressed with various heuristics which degraded the accuracy of the watchdog to the point that it fails to detect actual TSC problems on older hardware which exposes slow inter CPU drifts due to firmware manipulating the TSC to hide SMI time. To address this and bring back sanity to the watchdog, rewrite the code completely with a different approach: 1) Restrict the validation against a reference clocksource to the boot CPU, which is usually the CPU/Socket closest to the legacy block which contains the reference source (HPET/ACPI-PM timer). Validate that the reference readout is within a bound latency so that the actual comparison against the TSC stays within 500ppm as long as the clocks are stable. 2) Compare the TSCs of the other CPUs in a round robin fashion against the boot CPU in the same way the TSC synchronization on CPU hotplug works. This still can suffer from delayed reaction of the remote CPU to the SMP function call and the latency of the control variable cache line. But this latency is not affecting correctness. It only affects the accuracy. With low contention the readout latency is in the low nanoseconds range, which detects even slight skews between CPUs. Under high contention this becomes obviously less accurate, but still detects slow skews reliably as it solely relies on subsequent readouts being monotonically increasing. It just can take slightly longer to detect the issue. 3) Rewrite the watchdog test so it tests the various mechanisms one by one and validating the result against the expectation. Signed-off-by: Thomas Gleixner Tested-by: Borislav Petkov (AMD) Tested-by: Daniel J Blueman Reviewed-by: Jiri Wiesner Reviewed-by: Daniel J Blueman Link: https://patch.msgid.link/20260123231521.926490888@kernel.org Link: https://patch.msgid.link/87h5qeomm5.ffs@tglx --- include/linux/clocksource.h | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 25774fc5b53d..ccf5c0ca26b7 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -44,8 +44,6 @@ struct module; * @shift: Cycle to nanosecond divisor (power of two) * @max_idle_ns: Maximum idle time permitted by the clocksource (nsecs) * @maxadj: Maximum adjustment value to mult (~11%) - * @uncertainty_margin: Maximum uncertainty in nanoseconds per half second. - * Zero says to use default WATCHDOG_THRESHOLD. * @archdata: Optional arch-specific data * @max_cycles: Maximum safe cycle value which won't overflow on * multiplication @@ -105,7 +103,6 @@ struct clocksource { u32 shift; u64 max_idle_ns; u32 maxadj; - u32 uncertainty_margin; #ifdef CONFIG_ARCH_CLOCKSOURCE_DATA struct arch_clocksource_data archdata; #endif @@ -133,6 +130,7 @@ struct clocksource { struct list_head wd_list; u64 cs_last; u64 wd_last; + unsigned int wd_cpu; #endif struct module *owner; }; @@ -142,15 +140,18 @@ struct clocksource { */ #define CLOCK_SOURCE_IS_CONTINUOUS 0x01 #define CLOCK_SOURCE_MUST_VERIFY 0x02 +#define CLOCK_SOURCE_CALIBRATED 0x04 #define CLOCK_SOURCE_WATCHDOG 0x10 #define CLOCK_SOURCE_VALID_FOR_HRES 0x20 #define CLOCK_SOURCE_UNSTABLE 0x40 #define CLOCK_SOURCE_SUSPEND_NONSTOP 0x80 #define CLOCK_SOURCE_RESELECT 0x100 -#define CLOCK_SOURCE_VERIFY_PERCPU 0x200 -#define CLOCK_SOURCE_CAN_INLINE_READ 0x400 -#define CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT 0x800 +#define CLOCK_SOURCE_CAN_INLINE_READ 0x200 +#define CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT 0x400 + +#define CLOCK_SOURCE_WDTEST 0x800 +#define CLOCK_SOURCE_WDTEST_PERCPU 0x1000 /* simplify initialization of mask field */ #define CLOCKSOURCE_MASK(bits) GENMASK_ULL((bits) - 1, 0) @@ -301,21 +302,6 @@ static inline void timer_probe(void) {} #define TIMER_ACPI_DECLARE(name, table_id, fn) \ ACPI_DECLARE_PROBE_ENTRY(timer, name, table_id, 0, NULL, 0, fn) -static inline unsigned int clocksource_get_max_watchdog_retry(void) -{ - /* - * When system is in the boot phase or under heavy workload, there - * can be random big latencies during the clocksource/watchdog - * read, so allow retries to filter the noise latency. As the - * latency's frequency and maximum value goes up with the number of - * CPUs, scale the number of retries with the number of online - * CPUs. - */ - return (ilog2(num_online_cpus()) / 2) + 1; -} - -void clocksource_verify_percpu(struct clocksource *cs); - /** * struct clocksource_base - hardware abstraction for clock on which a clocksource * is based -- cgit v1.2.3 From 3d6bb84f6bb3f4c05fc47fd02ce75ce3032a4ce1 Mon Sep 17 00:00:00 2001 From: Yuto Ohnuki Date: Thu, 26 Feb 2026 20:18:58 +0000 Subject: fs: remove stale and duplicate forward declarations Remove the following unnecessary forward declarations from fs.h, which improves maintainability. - struct hd_geometry: became unused in fs.h when block_device_operations was moved to blkdev.h in commit 08f858512151 ("[PATCH] move block_device_operations to blkdev.h"). The forward declaration is now added to blkdev.h where it is actually used. - struct iovec: became unused when aio_read/aio_write were removed in commit 8436318205b9 ("->aio_read and ->aio_write removed") - struct iov_iter: duplicate forward declaration. This removes the redundant second declaration, added in commit 293bc9822fa9 ("new methods: ->read_iter() and ->write_iter()") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202512301303.s7YWTZHA-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202512302139.Wl0soAlz-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202512302105.pmzYfmcV-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202512302125.FNgHwu5z-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202512302108.nIV8r5ES-lkp@intel.com/ Signed-off-by: Yuto Ohnuki Link: https://patch.msgid.link/20260226201857.27310-2-ytohnuki@amazon.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/blkdev.h | 1 + include/linux/fs.h | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 99ef8cd7673c..54cdd71aab07 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -38,6 +38,7 @@ struct blk_flush_queue; struct kiocb; struct pr_ops; struct rq_qos; +struct hd_geometry; struct blk_report_zones_args; struct blk_queue_stats; struct blk_stat_callback; diff --git a/include/linux/fs.h b/include/linux/fs.h index a2af5ddd5323..280d43c9f04a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -55,8 +55,6 @@ struct bdi_writeback; struct bio; struct io_comp_batch; struct fiemap_extent_info; -struct hd_geometry; -struct iovec; struct kiocb; struct kobject; struct pipe_inode_info; @@ -1917,7 +1915,6 @@ struct dir_context { */ #define COPY_FILE_SPLICE (1 << 0) -struct iov_iter; struct io_uring_cmd; struct offset_ctx; -- cgit v1.2.3 From 76f9377cd2ab7a9220c25d33940d9ca20d368172 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 19 Mar 2026 17:51:45 -0700 Subject: writeback: don't block sync for filesystems with no data integrity guarantees Add a SB_I_NO_DATA_INTEGRITY superblock flag for filesystems that cannot guarantee data persistence on sync (eg fuse). For superblocks with this flag set, sync kicks off writeback of dirty inodes but does not wait for the flusher threads to complete the writeback. This replaces the per-inode AS_NO_DATA_INTEGRITY mapping flag added in commit f9a49aa302a0 ("fs/writeback: skip AS_NO_DATA_INTEGRITY mappings in wait_sb_inodes()"). The flag belongs at the superblock level because data integrity is a filesystem-wide property, not a per-inode one. Having this flag at the superblock level also allows us to skip having to iterate every dirty inode in wait_sb_inodes() only to skip each inode individually. Prior to this commit, mappings with no data integrity guarantees skipped waiting on writeback completion but still waited on the flusher threads to finish initiating the writeback. Waiting on the flusher threads is unnecessary. This commit kicks off writeback but does not wait on the flusher threads. This change properly addresses a recent report [1] for a suspend-to-RAM hang seen on fuse-overlayfs that was caused by waiting on the flusher threads to finish: Workqueue: pm_fs_sync pm_fs_sync_work_fn Call Trace: __schedule+0x457/0x1720 schedule+0x27/0xd0 wb_wait_for_completion+0x97/0xe0 sync_inodes_sb+0xf8/0x2e0 __iterate_supers+0xdc/0x160 ksys_sync+0x43/0xb0 pm_fs_sync_work_fn+0x17/0xa0 process_one_work+0x193/0x350 worker_thread+0x1a1/0x310 kthread+0xfc/0x240 ret_from_fork+0x243/0x280 ret_from_fork_asm+0x1a/0x30 On fuse this is problematic because there are paths that may cause the flusher thread to block (eg if systemd freezes the user session cgroups first, which freezes the fuse daemon, before invoking the kernel suspend. The kernel suspend triggers ->write_node() which on fuse issues a synchronous setattr request, which cannot be processed since the daemon is frozen. Or if the daemon is buggy and cannot properly complete writeback, initiating writeback on a dirty folio already under writeback leads to writeback_get_folio() -> folio_prepare_writeback() -> unconditional wait on writeback to finish, which will cause a hang). This commit restores fuse to its prior behavior before tmp folios were removed, where sync was essentially a no-op. [1] https://lore.kernel.org/linux-fsdevel/CAJnrk1a-asuvfrbKXbEwwDSctvemF+6zfhdnuzO65Pt8HsFSRw@mail.gmail.com/T/#m632c4648e9cafc4239299887109ebd880ac6c5c1 Fixes: 0c58a97f919c ("fuse: remove tmp folio for writebacks and internal rb tree") Reported-by: John Cc: stable@vger.kernel.org Signed-off-by: Joanne Koong Link: https://patch.msgid.link/20260320005145.2483161-2-joannelkoong@gmail.com Reviewed-by: Jan Kara Reviewed-by: David Hildenbrand (Arm) Signed-off-by: Christian Brauner --- include/linux/fs/super_types.h | 1 + include/linux/pagemap.h | 11 ----------- 2 files changed, 1 insertion(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h index fa7638b81246..383050e7fdf5 100644 --- a/include/linux/fs/super_types.h +++ b/include/linux/fs/super_types.h @@ -338,5 +338,6 @@ struct super_block { #define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */ #define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */ #define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */ +#define SB_I_NO_DATA_INTEGRITY 0x00008000 /* fs cannot guarantee data persistence on sync */ #endif /* _LINUX_FS_SUPER_TYPES_H */ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index ec442af3f886..31a848485ad9 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -210,7 +210,6 @@ enum mapping_flags { AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM = 9, AS_KERNEL_FILE = 10, /* mapping for a fake kernel file that shouldn't account usage to user cgroups */ - AS_NO_DATA_INTEGRITY = 11, /* no data integrity guarantees */ /* Bits 16-25 are used for FOLIO_ORDER */ AS_FOLIO_ORDER_BITS = 5, AS_FOLIO_ORDER_MIN = 16, @@ -346,16 +345,6 @@ static inline bool mapping_writeback_may_deadlock_on_reclaim(const struct addres return test_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags); } -static inline void mapping_set_no_data_integrity(struct address_space *mapping) -{ - set_bit(AS_NO_DATA_INTEGRITY, &mapping->flags); -} - -static inline bool mapping_no_data_integrity(const struct address_space *mapping) -{ - return test_bit(AS_NO_DATA_INTEGRITY, &mapping->flags); -} - static inline gfp_t mapping_gfp_mask(const struct address_space *mapping) { return mapping->gfp_mask; -- cgit v1.2.3 From 26f775a054c3cda86ad465a64141894a90a9e145 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 19 Mar 2026 07:52:17 -0700 Subject: mm/damon/core: avoid use of half-online-committed context One major usage of damon_call() is online DAMON parameters update. It is done by calling damon_commit_ctx() inside the damon_call() callback function. damon_commit_ctx() can fail for two reasons: 1) invalid parameters and 2) internal memory allocation failures. In case of failures, the damon_ctx that attempted to be updated (commit destination) can be partially updated (or, corrupted from a perspective), and therefore shouldn't be used anymore. The function only ensures the damon_ctx object can safely deallocated using damon_destroy_ctx(). The API callers are, however, calling damon_commit_ctx() only after asserting the parameters are valid, to avoid damon_commit_ctx() fails due to invalid input parameters. But it can still theoretically fail if the internal memory allocation fails. In the case, DAMON may run with the partially updated damon_ctx. This can result in unexpected behaviors including even NULL pointer dereference in case of damos_commit_dests() failure [1]. Such allocation failure is arguably too small to fail, so the real world impact would be rare. But, given the bad consequence, this needs to be fixed. Avoid such partially-committed (maybe-corrupted) damon_ctx use by saving the damon_commit_ctx() failure on the damon_ctx object. For this, introduce damon_ctx->maybe_corrupted field. damon_commit_ctx() sets it when it is failed. kdamond_call() checks if the field is set after each damon_call_control->fn() is executed. If it is set, ignore remaining callback requests and return. All kdamond_call() callers including kdamond_fn() also check the maybe_corrupted field right after kdamond_call() invocations. If the field is set, break the kdamond_fn() main loop so that DAMON sill doesn't use the context that might be corrupted. [sj@kernel.org: let kdamond_call() with cancel regardless of maybe_corrupted] Link: https://lkml.kernel.org/r/20260320031553.2479-1-sj@kernel.org Link: https://sashiko.dev/#/patchset/20260319145218.86197-1-sj%40kernel.org Link: https://lkml.kernel.org/r/20260319145218.86197-1-sj@kernel.org Link: https://lore.kernel.org/20260319043309.97966-1-sj@kernel.org [1] Fixes: 3301f1861d34 ("mm/damon/sysfs: handle commit command using damon_call()") Signed-off-by: SeongJae Park Cc: [6.15+] Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index a4fea23da857..be3d198043ff 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -810,6 +810,12 @@ struct damon_ctx { struct damos_walk_control *walk_control; struct mutex walk_control_lock; + /* + * indicate if this may be corrupted. Currentonly this is set only for + * damon_commit_ctx() failure. + */ + bool maybe_corrupted; + /* Working thread of the given DAMON context */ struct task_struct *kdamond; /* Protects @kdamond field access */ -- cgit v1.2.3 From 3414c809777e37855063347f5fbd23ff03e1c9fb Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 11 Mar 2026 22:13:23 -0700 Subject: hwrng: core - avoid kernel-doc warnings Mark internal fields as "private:" so that kernel-doc comments are not needed for them, eliminating kernel-doc warnings: Warning: include/linux/hw_random.h:54 struct member 'list' not described in 'hwrng' Warning: include/linux/hw_random.h:54 struct member 'ref' not described in 'hwrng' Warning: include/linux/hw_random.h:54 struct member 'cleanup_work' not described in 'hwrng' Warning: include/linux/hw_random.h:54 struct member 'cleanup_done' not described in 'hwrng' Warning: include/linux/hw_random.h:54 struct member 'dying' not described in 'hwrng' Signed-off-by: Randy Dunlap Signed-off-by: Herbert Xu --- include/linux/hw_random.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h index b77bc55a4cf3..1d3c1927986e 100644 --- a/include/linux/hw_random.h +++ b/include/linux/hw_random.h @@ -46,7 +46,7 @@ struct hwrng { unsigned long priv; unsigned short quality; - /* internal. */ + /* private: internal. */ struct list_head list; struct kref ref; struct work_struct cleanup_work; -- cgit v1.2.3 From b44c7129f1e3cd0e6233c7cb2d88f917d92f213d Mon Sep 17 00:00:00 2001 From: Zongyu Wu Date: Fri, 13 Mar 2026 17:40:39 +0800 Subject: crypto: hisilicon - add device load query functionality to debugfs The accelerator device supports usage statistics. This patch enables obtaining the accelerator's usage through the "dev_usage" file. The returned number expressed as a percentage as a percentage. Signed-off-by: Zongyu Wu Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 51a6dc2b97e9..8a581b5bbbcd 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -102,6 +102,12 @@ #define QM_MIG_REGION_SEL 0x100198 #define QM_MIG_REGION_EN BIT(0) +#define QM_MAX_CHANNEL_NUM 8 +#define QM_CHANNEL_USAGE_OFFSET 0x1100 +#define QM_MAX_DEV_USAGE 100 +#define QM_DEV_USAGE_RATE 100 +#define QM_CHANNEL_ADDR_INTRVL 0x4 + /* uacce mode of the driver */ #define UACCE_MODE_NOUACCE 0 /* don't use uacce */ #define UACCE_MODE_SVA 1 /* use uacce sva mode */ @@ -359,6 +365,11 @@ struct qm_rsv_buf { struct qm_dma qcdma; }; +struct qm_channel { + int channel_num; + const char *channel_name[QM_MAX_CHANNEL_NUM]; +}; + struct hisi_qm { enum qm_hw_ver ver; enum qm_fun_type fun_type; @@ -433,6 +444,7 @@ struct hisi_qm { struct qm_err_isolate isolate_data; struct hisi_qm_cap_tables cap_tables; + struct qm_channel channel_data; }; struct hisi_qp_status { -- cgit v1.2.3 From c8c4a2972f83c8b68ff03b43cecdb898939ff851 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Fri, 13 Mar 2026 11:24:33 -0400 Subject: padata: Put CPU offline callback in ONLINE section to allow failure syzbot reported the following warning: DEAD callback error for CPU1 WARNING: kernel/cpu.c:1463 at _cpu_down+0x759/0x1020 kernel/cpu.c:1463, CPU#0: syz.0.1960/14614 at commit 4ae12d8bd9a8 ("Merge tag 'kbuild-fixes-7.0-2' of git://git.kernel.org/pub/scm/linux/kernel/git/kbuild/linux") which tglx traced to padata_cpu_dead() given it's the only sub-CPUHP_TEARDOWN_CPU callback that returns an error. Failure isn't allowed in hotplug states before CPUHP_TEARDOWN_CPU so move the CPU offline callback to the ONLINE section where failure is possible. Fixes: 894c9ef9780c ("padata: validate cpumask without removed CPU during offline") Reported-by: syzbot+123e1b70473ce213f3af@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/69af0a05.050a0220.310d8.002f.GAE@google.com/ Debugged-by: Thomas Gleixner Signed-off-by: Daniel Jordan Signed-off-by: Herbert Xu --- include/linux/cpuhotplug.h | 1 - include/linux/padata.h | 8 ++++---- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 62cd7b35a29c..22ba327ec227 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -92,7 +92,6 @@ enum cpuhp_state { CPUHP_NET_DEV_DEAD, CPUHP_IOMMU_IOVA_DEAD, CPUHP_AP_ARM_CACHE_B15_RAC_DEAD, - CPUHP_PADATA_DEAD, CPUHP_AP_DTPM_CPU_DEAD, CPUHP_RANDOM_PREPARE, CPUHP_WORKQUEUE_PREP, diff --git a/include/linux/padata.h b/include/linux/padata.h index 765f2778e264..b6232bea6edf 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -149,23 +149,23 @@ struct padata_mt_job { /** * struct padata_instance - The overall control structure. * - * @cpu_online_node: Linkage for CPU online callback. - * @cpu_dead_node: Linkage for CPU offline callback. + * @cpuhp_node: Linkage for CPU hotplug callbacks. * @parallel_wq: The workqueue used for parallel work. * @serial_wq: The workqueue used for serial work. * @pslist: List of padata_shell objects attached to this instance. * @cpumask: User supplied cpumasks for parallel and serial works. + * @validate_cpumask: Internal cpumask used to validate @cpumask during hotplug. * @kobj: padata instance kernel object. * @lock: padata instance lock. * @flags: padata flags. */ struct padata_instance { - struct hlist_node cpu_online_node; - struct hlist_node cpu_dead_node; + struct hlist_node cpuhp_node; struct workqueue_struct *parallel_wq; struct workqueue_struct *serial_wq; struct list_head pslist; struct padata_cpumask cpumask; + cpumask_var_t validate_cpumask; struct kobject kobj; struct mutex lock; u8 flags; -- cgit v1.2.3 From e8b83499b4cbc8b989f7cd6aaa893b669326e93c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 11 Mar 2026 22:13:45 -0700 Subject: iio: st_sensors: correct kernel-doc issues Use the proper kernel-doc format and struct member names to avoid kernel-doc warnings: Warning: include/linux/iio/common/st_sensors.h:184 struct member 'int1' not described in 'st_sensor_data_ready_irq' Warning: ../include/linux/iio/common/st_sensors.h:184 struct member 'int2' not described in 'st_sensor_data_ready_irq' Warning: ../include/linux/iio/common/st_sensors.h:184 struct member 'stat_drdy' not described in 'st_sensor_data_ready_irq' Warning: ../include/linux/iio/common/st_sensors.h:184 struct member 'ig1' not described in 'st_sensor_data_ready_irq' Warning: ../include/linux/iio/common/st_sensors.h:219 struct member 'num_ch' not described in 'st_sensor_settings' Warning: ../include/linux/iio/common/st_sensors.h:263 struct member 'num_data_channels' not described in 'st_sensor_data' Signed-off-by: Randy Dunlap Reviewed-by: Andy Shevchenko Signed-off-by: Jonathan Cameron --- include/linux/iio/common/st_sensors.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h index f9ae5cdd884f..1ba496f0fea5 100644 --- a/include/linux/iio/common/st_sensors.h +++ b/include/linux/iio/common/st_sensors.h @@ -160,12 +160,12 @@ struct st_sensor_int_drdy { /** * struct st_sensor_data_ready_irq - ST sensor device data-ready interrupt - * struct int1 - data-ready configuration register for INT1 pin. - * struct int2 - data-ready configuration register for INT2 pin. + * @int1: data-ready configuration register for INT1 pin. + * @int2: data-ready configuration register for INT2 pin. * @addr_ihl: address to enable/disable active low on the INT lines. * @mask_ihl: mask to enable/disable active low on the INT lines. - * struct stat_drdy - status register of DRDY (data ready) interrupt. - * struct ig1 - represents the Interrupt Generator 1 of sensors. + * @stat_drdy: status register of DRDY (data ready) interrupt. + * @ig1: represents the Interrupt Generator 1 of sensors. * @en_addr: address of the enable ig1 register. * @en_mask: mask to write the on/off value for enable. */ @@ -190,6 +190,7 @@ struct st_sensor_data_ready_irq { * @wai_addr: The address of WhoAmI register. * @sensors_supported: List of supported sensors by struct itself. * @ch: IIO channels for the sensor. + * @num_ch: Number of IIO channels in @ch * @odr: Output data rate register and ODR list available. * @pw: Power register of the sensor. * @enable_axis: Enable one or more axis of the sensor. @@ -228,7 +229,7 @@ struct st_sensor_settings { * @regmap: Pointer to specific sensor regmap configuration. * @enabled: Status of the sensor (false->off, true->on). * @odr: Output data rate of the sensor [Hz]. - * num_data_channels: Number of data channels used in buffer. + * @num_data_channels: Number of data channels used in buffer. * @drdy_int_pin: Redirect DRDY on pin 1 (1) or pin 2 (2). * @int_pin_open_drain: Set the interrupt/DRDY to open drain. * @irq: the IRQ number. -- cgit v1.2.3 From c0368933dd3d4a8210a07a0c95c471421fbf7523 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 19 Mar 2026 14:22:10 +0200 Subject: mlx5: Remove redundant iseg base iseg_base and base_addr both point to BAR0, making iseg_base redundant. Remove iseg_base and rely on base_addr instead, reducing the size of struct mlx5_core_dev. Signed-off-by: Parav Pandit Reviewed-by: Shay Drori Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260319122211.27384-2-tariqt@nvidia.com Reviewed-by: Joe Damato Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 04dcd09f7517..b8b5af78284d 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -755,7 +755,6 @@ struct mlx5_core_dev { } caps; struct mlx5_timeouts *timeouts; u64 sys_image_guid; - phys_addr_t iseg_base; struct mlx5_init_seg __iomem *iseg; phys_addr_t bar_addr; enum mlx5_device_state state; -- cgit v1.2.3 From 26469110c750c8179560637dd813e5d65b8148d2 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Thu, 19 Mar 2026 14:22:11 +0200 Subject: net/mlx5: Add vhca_id_type bit to alias context Add vhca_id_type bit to alias context which allows indicating the vhca_id_type to be passed at vhca_id_to_be_accessed, which can be either HW or SW, note that SW_VHCA_ID must be used to allow alias to work properly after migration. Signed-off-by: Patrisious Haddad Reviewed-by: Leon Romanovsky Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260319122211.27384-3-tariqt@nvidia.com Reviewed-by: Joe Damato Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 8fa4fb3d36cf..2400b4c38c77 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1968,7 +1968,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_360[0x3]; u8 log_max_rq[0x5]; - u8 reserved_at_368[0x3]; + u8 ft_alias_sw_vhca_id[0x1]; + u8 reserved_at_369[0x2]; u8 log_max_sq[0x5]; u8 reserved_at_370[0x3]; u8 log_max_tir[0x5]; @@ -6957,7 +6958,9 @@ struct mlx5_ifc_create_match_definer_out_bits { struct mlx5_ifc_alias_context_bits { u8 vhca_id_to_be_accessed[0x10]; - u8 reserved_at_10[0xd]; + u8 reserved_at_10[0xb]; + u8 vhca_id_type[0x1]; + u8 reserved_at_1c[0x1]; u8 status[0x3]; u8 object_id_to_be_accessed[0x20]; u8 reserved_at_40[0x40]; -- cgit v1.2.3 From 00a5d1e71c928edb1e7de211a82e87858105dd47 Mon Sep 17 00:00:00 2001 From: Tzuyi Chang Date: Tue, 17 Mar 2026 19:54:05 +0800 Subject: pinctrl: pinconf-generic: Add properties 'input-threshold-voltage-microvolt' Add a new generic pin configuration parameter PIN_CONFIG_INPUT_VOLTAGE_UV. This parameter is used to specify the input voltage level of a pin in microvolts, which corresponds to the 'input-voltage-microvolt' property in Device Tree. Reviewed-by: Linus Walleij Signed-off-by: Tzuyi Chang Co-developed-by: Yu-Chun Lin Signed-off-by: Yu-Chun Lin Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinconf-generic.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index 531dc3e9b3f7..a5d4b2d8633a 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -83,6 +83,8 @@ struct pinctrl_map; * schmitt-trigger mode is disabled. * @PIN_CONFIG_INPUT_SCHMITT_UV: this will configure an input pin to run in * schmitt-trigger mode. The argument is in uV. + * @PIN_CONFIG_INPUT_VOLTAGE_UV: this will configure the input voltage level of + * the pin. The argument is specified in microvolts. * @PIN_CONFIG_MODE_LOW_POWER: this will configure the pin for low power * operation, if several modes of operation are supported these can be * passed in the argument on a custom form, else just use argument 1 @@ -145,6 +147,7 @@ enum pin_config_param { PIN_CONFIG_INPUT_SCHMITT, PIN_CONFIG_INPUT_SCHMITT_ENABLE, PIN_CONFIG_INPUT_SCHMITT_UV, + PIN_CONFIG_INPUT_VOLTAGE_UV, PIN_CONFIG_MODE_LOW_POWER, PIN_CONFIG_MODE_PWM, PIN_CONFIG_LEVEL, -- cgit v1.2.3 From e43dce8a0bc09083ea1145a1a0c61d83cbe72d97 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Mar 2026 08:01:44 +0100 Subject: fs: fix archiecture-specific compat_ftruncate64 The "small" argument to do_sys_ftruncate indicates if > 32-bit size should be reject, but all the arch-specific compat ftruncate64 implementations get this wrong. Merge do_sys_ftruncate and ksys_ftruncate, replace the integer as boolean small flag with a descriptive one about LFS semantics, and use it correctly in the architecture-specific ftruncate64 implementations. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Fixes: 3dd681d944f6 ("arm64: 32-bit (compat) applications support") Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260323070205.2939118-2-hch@lst.de Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/syscalls.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 02bd6ddb6278..8787b3511c86 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1283,12 +1283,8 @@ static inline long ksys_lchown(const char __user *filename, uid_t user, AT_SYMLINK_NOFOLLOW); } -int do_sys_ftruncate(unsigned int fd, loff_t length, int small); - -static inline long ksys_ftruncate(unsigned int fd, loff_t length) -{ - return do_sys_ftruncate(fd, length, 1); -} +#define FTRUNCATE_LFS (1u << 0) /* allow truncating > 32-bit */ +int ksys_ftruncate(unsigned int fd, loff_t length, unsigned int flags); int do_sys_truncate(const char __user *pathname, loff_t length); -- cgit v1.2.3 From e8767a3134ca69a307b37f7f58ca088dbee6eb82 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Mar 2026 08:01:46 +0100 Subject: fs: remove do_sys_truncate do_sys_truncate ist only used to implement ksys_truncate and the native truncate syscalls. Merge do_sys_truncate into ksys_truncate and return int from it as it only returns 0 or negative errnos. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260323070205.2939118-4-hch@lst.de Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/syscalls.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 8787b3511c86..f5639d5ac331 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1285,13 +1285,7 @@ static inline long ksys_lchown(const char __user *filename, uid_t user, #define FTRUNCATE_LFS (1u << 0) /* allow truncating > 32-bit */ int ksys_ftruncate(unsigned int fd, loff_t length, unsigned int flags); - -int do_sys_truncate(const char __user *pathname, loff_t length); - -static inline long ksys_truncate(const char __user *pathname, loff_t length) -{ - return do_sys_truncate(pathname, length); -} +int ksys_truncate(const char __user *pathname, loff_t length); static inline unsigned int ksys_personality(unsigned int personality) { -- cgit v1.2.3 From a676643709115816d3ce7e50aa5b5a4af1ee6c45 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Wed, 25 Feb 2026 18:58:32 -0500 Subject: bitmap: drop __find_nth_andnot_bit() Remove find_nth_andnot_bit() leftovers. CC: Rasmus Villemoes CC: Andrew Morton CC: Shaopeng Tan Fixes: b0c85e99458a ("cpumask: Remove unnecessary cpumask_nth_andnot()") Signed-off-by: Yury Norov --- include/linux/find.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/find.h b/include/linux/find.h index 9d720ad92bc1..6c2be8ca615d 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -22,8 +22,6 @@ extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long si unsigned long __find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n); unsigned long __find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n); -unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, - unsigned long size, unsigned long n); unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size, unsigned long n); -- cgit v1.2.3 From bf31ddc14f8c6bcd4987c31fe2bc9e93e433b41a Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 22 Dec 2025 14:11:37 -0500 Subject: bitmap: add bitmap_weight_from() The function calculates a Hamming weight of a bitmap starting from an arbitrary bit. Signed-off-by: Yury Norov --- include/linux/bitmap.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index b0395e4ccf90..9c0d1de44350 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -57,6 +57,7 @@ struct device; * bitmap_weight(src, nbits) Hamming Weight: number set bits * bitmap_weight_and(src1, src2, nbits) Hamming Weight of and'ed bitmap * bitmap_weight_andnot(src1, src2, nbits) Hamming Weight of andnot'ed bitmap + * bitmap_weight_from(src, start, end) Hamming Weight starting from @start * bitmap_set(dst, pos, nbits) Set specified bit area * bitmap_clear(dst, pos, nbits) Clear specified bit area * bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area @@ -479,6 +480,38 @@ unsigned long bitmap_weight_andnot(const unsigned long *src1, return __bitmap_weight_andnot(src1, src2, nbits); } +/** + * bitmap_weight_from - Hamming weight for a memory region + * @bitmap: The base address + * @start: The bitnumber to starts weighting + * @end: the bitmap size in bits + * + * Returns the number of set bits in the region. If @start >= @end, + * return >= end. + */ +static __always_inline +unsigned long bitmap_weight_from(const unsigned long *bitmap, + unsigned int start, unsigned int end) +{ + unsigned long w; + + if (unlikely(start >= end)) + return end; + + if (small_const_nbits(end)) + return hweight_long(*bitmap & GENMASK(end - 1, start)); + + bitmap += start / BITS_PER_LONG; + /* Opencode round_down() to not include math.h */ + end -= start & ~(BITS_PER_LONG - 1); + start %= BITS_PER_LONG; + w = bitmap_weight(bitmap, end); + if (start) + w -= hweight_long(*bitmap & BITMAP_LAST_WORD_MASK(start)); + + return w; +} + static __always_inline void bitmap_set(unsigned long *map, unsigned int start, unsigned int nbits) { -- cgit v1.2.3 From 18c48899653fa7a04120537c228031b5c7e4e9d6 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Tue, 10 Mar 2026 16:53:02 -0400 Subject: lib: crypto: fix comments for count_leading_zeros() count_leading_zeros() is based on fls(), which is defined for x == 0, contrary to __ffs() family. The comment in crypto/mpi erroneously states that the function may return undef in such case. Fix the comment together with the outdated function signature, and now that COUNT_LEADING_ZEROS_0 is not referenced in the codebase, get rid of it too. Reviewed-by: Andy Shevchenko Acked-by: Eric Biggers Signed-off-by: Yury Norov --- include/linux/count_zeros.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/count_zeros.h b/include/linux/count_zeros.h index 5b8ff5ac660d..4e5680327ece 100644 --- a/include/linux/count_zeros.h +++ b/include/linux/count_zeros.h @@ -18,7 +18,7 @@ * * If the MSB of @x is set, the result is 0. * If only the LSB of @x is set, then the result is BITS_PER_LONG-1. - * If @x is 0 then the result is COUNT_LEADING_ZEROS_0. + * If @x is 0 then the result is BITS_PER_LONG. */ static inline int count_leading_zeros(unsigned long x) { @@ -28,8 +28,6 @@ static inline int count_leading_zeros(unsigned long x) return BITS_PER_LONG - fls64(x); } -#define COUNT_LEADING_ZEROS_0 BITS_PER_LONG - /** * count_trailing_zeros - Count the number of zeros from the LSB forwards * @x: The value -- cgit v1.2.3 From 4e64c91b813f666dffc3962a815a8a50b7d6f468 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 12 Mar 2026 19:08:16 -0400 Subject: lib: count_zeros: fix 32/64-bit inconsistency in count_trailing_zeros() Based on 'sizeof(x) == 4' condition, in 32-bit case the function is wired to ffs(), while in 64-bit case to __ffs(). The difference is substantial: ffs(x) == __ffs(x) + 1. Also, ffs(0) == 0, while __ffs(0) is undefined. The 32-bit behaviour is inconsistent with the function description, so it needs to get fixed. There are 9 individual users for the function in 6 different subsystems. Some arches and drivers are 64-bit only: - arch/loongarch/kvm/intc/eiointc.c; - drivers/hv/mshv_vtl_main.c; - kernel/liveupdate/kexec_handover.c; The others are: - ib_umem_find_best_pgsz(): as per comment, __ffs() should be correct; - rzv2m_csi_reg_write_bit(): ARCH_RENESAS only, unclear; - lz77_match_len(): CIFS_COMPRESSION only, unclear, experimental; IB and CIFS are explicitly OK with the change. The attached patch gets rid of 32-bit explicit support, so that both 32- and 64-bit versions rely on __ffs(). CC: K. Y. Srinivasan CC: Haiyang Zhang CC: Mark Brown CC: Steve French CC: Alexander Graf CC: Mike Rapoport CC: Pasha Tatashin Acked-by: Leon Romanovsky Signed-off-by: Yury Norov --- include/linux/count_zeros.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/count_zeros.h b/include/linux/count_zeros.h index 4e5680327ece..5034a30b5c7c 100644 --- a/include/linux/count_zeros.h +++ b/include/linux/count_zeros.h @@ -10,6 +10,8 @@ #include +#define COUNT_TRAILING_ZEROS_0 (-1) + /** * count_leading_zeros - Count the number of zeros from the MSB back * @x: The value @@ -40,12 +42,7 @@ static inline int count_leading_zeros(unsigned long x) */ static inline int count_trailing_zeros(unsigned long x) { -#define COUNT_TRAILING_ZEROS_0 (-1) - - if (sizeof(x) == 4) - return ffs(x); - else - return (x != 0) ? __ffs(x) : COUNT_TRAILING_ZEROS_0; + return (x != 0) ? __ffs(x) : COUNT_TRAILING_ZEROS_0; } #endif /* _LINUX_BITOPS_COUNT_ZEROS_H_ */ -- cgit v1.2.3 From be56db15fcce8bb8bd85f236382276d52ce73d08 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 23 Mar 2026 12:17:12 -0400 Subject: lib: count_zeros: unify count_{leading,trailing}_zeros() The 'leading' helper returns BITS_PER_LONG if x == 0, while 'trailing' one returns COUNT_TRAILING_ZEROS_0, which turns to be -1. None of the current users explicitly check the returned value for COUNT_TRAILING_ZEROS_0, except the loongarch, which tests implicitly for the '>= 0'. So, align count_trailing_zeros() with the count_leading_zeros(), and simplify the loongarch handling. Reviewed-by: Andy Shevchenko Signed-off-by: Yury Norov --- include/linux/count_zeros.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/count_zeros.h b/include/linux/count_zeros.h index 5034a30b5c7c..b72ba3faa036 100644 --- a/include/linux/count_zeros.h +++ b/include/linux/count_zeros.h @@ -10,8 +10,6 @@ #include -#define COUNT_TRAILING_ZEROS_0 (-1) - /** * count_leading_zeros - Count the number of zeros from the MSB back * @x: The value @@ -38,11 +36,11 @@ static inline int count_leading_zeros(unsigned long x) * * If the LSB of @x is set, the result is 0. * If only the MSB of @x is set, then the result is BITS_PER_LONG-1. - * If @x is 0 then the result is COUNT_TRAILING_ZEROS_0. + * If @x is 0 then the result is BITS_PER_LONG. */ static inline int count_trailing_zeros(unsigned long x) { - return (x != 0) ? __ffs(x) : COUNT_TRAILING_ZEROS_0; + return x ? __ffs(x) : BITS_PER_LONG; } #endif /* _LINUX_BITOPS_COUNT_ZEROS_H_ */ -- cgit v1.2.3 From 7b52b262f8a8cd96dac33721389a884420c18365 Mon Sep 17 00:00:00 2001 From: Kit Dallege Date: Sun, 15 Mar 2026 16:34:14 +0100 Subject: bitops: fix kernel-doc parameter name for parity8() The kernel-doc comment for parity8() documents the parameter as @value but the actual parameter name is @val. Fix the mismatch. Assisted-by: Claude Signed-off-by: Kit Dallege Signed-off-by: Yury Norov --- include/linux/bitops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index ea7898cc5903..1fe46703792f 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -230,7 +230,7 @@ static inline int get_count_order_long(unsigned long l) /** * parity8 - get the parity of an u8 value - * @value: the value to be examined + * @val: the value to be examined * * Determine the parity of the u8 argument. * -- cgit v1.2.3 From 473e470f16f98569d59adc11c4a318780fb68fe9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 3 Feb 2026 17:45:29 +0100 Subject: tracing: move __printf() attribute on __ftrace_vbprintk() The sunrpc change to use trace_printk() for debugging caused a new warning for every instance of dprintk() in some configurations, when -Wformat-security is enabled: fs/nfs/getroot.c: In function 'nfs_get_root': fs/nfs/getroot.c:90:17: error: format not a string literal and no format arguments [-Werror=format-security] 90 | nfs_errorf(fc, "NFS: Couldn't getattr on root"); I've been slowly chipping away at those warnings over time with the intention of enabling them by default in the future. While I could not figure out why this only happens for this one instance, I see that the __trace_bprintk() function is always called with a local variable as the format string, rather than a literal. Move the __printf(2,3) annotation on this function from the declaration to the caller. As this is can only be validated for literals, the attribute on the declaration causes the warnings every time, but removing it entirely introduces a new warning on the __ftrace_vbprintk() definition. The format strings still get checked because the underlying literal keeps getting passed into __trace_printk() in the "else" branch, which is not taken but still evaluated for compile-time warnings. Cc: Masami Hiramatsu Cc: Anna Schumaker Cc: Chuck Lever Cc: Simon Horman Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Yury Norov Cc: Randy Dunlap Link: https://patch.msgid.link/20260203164545.3174910-1-arnd@kernel.org Fixes: ec7d8e68ef0e ("sunrpc: add a Kconfig option to redirect dfprintk() output to trace buffer") Acked-by: Jeff Layton Acked-by: Steven Rostedt (Google) Signed-off-by: Arnd Bergmann Acked-by: Andy Shevchenko Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_printk.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/trace_printk.h b/include/linux/trace_printk.h index bb5874097f24..2670ec7f4262 100644 --- a/include/linux/trace_printk.h +++ b/include/linux/trace_printk.h @@ -107,7 +107,6 @@ do { \ __trace_printk(_THIS_IP_, fmt, ##args); \ } while (0) -extern __printf(2, 3) int __trace_bprintk(unsigned long ip, const char *fmt, ...); extern __printf(2, 3) -- cgit v1.2.3 From 9a475dc71c38d6abc42ba722ace4a72372876d91 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 18 Mar 2026 16:06:11 +0000 Subject: net: stmmac: move default_an_inband to plat_stmmacenet_data Move the default_an_inband flag from struct mdio_bus_data to struct plat_stmmacenet_data. This is to allow platforms that do not use the integrated MDIO bus to enable inband mode. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1w2tPP-0000000DYAX-0TKw@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 72febd246bdb..565bb394b194 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -89,7 +89,6 @@ struct stmmac_mdio_bus_data { int *irqs; int probed_phy_irq; bool needs_reset; - bool default_an_inband; }; struct stmmac_dma_cfg { @@ -250,6 +249,7 @@ struct plat_stmmacenet_data { struct stmmac_dma_cfg *dma_cfg; struct stmmac_safety_feature_cfg *safety_feat_cfg; int clk_csr; + bool default_an_inband; bool enh_desc; bool tx_coe; bool bugged_jumbo; -- cgit v1.2.3 From 68cff4fff61fb69ef7bb1f6302d4766822a395cc Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 18 Mar 2026 16:06:26 +0000 Subject: net: stmmac: add BASE-X support to integrated PCS The integrated PCS supports 802.3z (BASE-X) modes when the Synopsys IP is coupled with an appropriate SerDes to provide the electrical interface. The PCS presents a TBI interface to the SerDes for this. Thus, the BASE-X related registers are only present when TBI mode is supported. dwmac-qcom-ethqos added support for using 2.5G with the integrated PCS by calling dwmac_ctrl_ane() directly. Add support for the following to the integrated PCS: - 1000BASE-X protocol unconditionally. - 2500BASE-X if the coupled SerDes supports 2.5G speed. - The above without autonegotiation. - If the PCS supports TBI, then optional BASE-X autonegotiation for each of the above. Reviewed-by: Maxime Chevallier Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1w2tPe-0000000DYAp-1qpV@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 565bb394b194..5b2bece81448 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -212,6 +212,7 @@ enum dwmac_core_type { #define STMMAC_FLAG_EN_TX_LPI_CLK_PHY_CAP BIT(12) #define STMMAC_FLAG_HWTSTAMP_CORRECT_LATENCY BIT(13) #define STMMAC_FLAG_KEEP_PREAMBLE_BEFORE_SFD BIT(14) +#define STMMAC_FLAG_SERDES_SUPPORTS_2500M BIT(15) struct mac_device_info; -- cgit v1.2.3 From eb37011395f12138056a4d124159f1a8436662d3 Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Fri, 20 Mar 2026 15:56:03 +0800 Subject: net: add netdev_from_priv() helper Add a helper to get netdev from private data pointer, so drivers won't have to store redundant netdev in priv. Signed-off-by: Qingfang Deng Link: https://patch.msgid.link/20260320075605.490832-1-dqfext@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7ca01eb3f7d2..6882b41bb3e8 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2765,6 +2765,17 @@ static inline void *netdev_priv(const struct net_device *dev) return (void *)dev->priv; } +/** + * netdev_from_priv() - get network device from priv + * @priv: network device private data + * + * Returns: net_device to which @priv belongs + */ +static inline struct net_device *netdev_from_priv(const void *priv) +{ + return container_of(priv, struct net_device, priv); +} + /* Set the sysfs physical device reference for the network logical device * if set prior to registration will cause a symlink during initialization. */ -- cgit v1.2.3 From 9027497a25e3c92b5053b2643e0c18f910865625 Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Fri, 20 Mar 2026 15:56:04 +0800 Subject: team: use netdev_from_priv() Use the new netdev_from_priv() helper to access the net device from struct team. Signed-off-by: Qingfang Deng Link: https://patch.msgid.link/20260320075605.490832-2-dqfext@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/if_team.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_team.h b/include/linux/if_team.h index ce97d891cf72..ccb5327de26d 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -186,7 +186,6 @@ struct team_mode { #define TEAM_MODE_PRIV_SIZE (sizeof(long) * TEAM_MODE_PRIV_LONGS) struct team { - struct net_device *dev; /* associated netdevice */ struct team_pcpu_stats __percpu *pcpu_stats; const struct header_ops *header_ops_cache; @@ -232,7 +231,7 @@ static inline int team_dev_queue_xmit(struct team *team, struct team_port *port, skb_set_queue_mapping(skb, qdisc_skb_cb(skb)->slave_dev_queue_mapping); skb->dev = port->dev; - if (unlikely(netpoll_tx_running(team->dev))) { + if (unlikely(netpoll_tx_running(netdev_from_priv(team)))) { team_netpoll_send_skb(port, skb); return 0; } -- cgit v1.2.3 From 0475f9e779b456f934adbc44eeb98e3080a1893f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Fri, 20 Mar 2026 09:58:21 +0100 Subject: ethtool: Track user-provided RSS indirection table size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Track the number of indirection table entries the user originally provided (context 0/default as well!). Replace IFF_RXFH_CONFIGURED with rss_indir_user_size: the flag is redundant now that user_size captures the same information. Add ethtool_rxfh_indir_lost() for drivers that must reset the indirection table. Convert bnxt and mlx5 to use it. Signed-off-by: Björn Töpel Link: https://patch.msgid.link/20260320085826.1957255-2-bjorn@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 7 +++++++ include/linux/netdevice.h | 7 +------ 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 656d465bcd06..34ca9261de82 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -176,6 +176,8 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings) * struct ethtool_rxfh_context - a custom RSS context configuration * @indir_size: Number of u32 entries in indirection table * @key_size: Size of hash key, in bytes + * @indir_user_size: number of user provided entries for the + * indirection table * @priv_size: Size of driver private data, in bytes * @hfunc: RSS hash function identifier. One of the %ETH_RSS_HASH_* * @input_xfrm: Defines how the input data is transformed. Valid values are one @@ -186,6 +188,7 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings) struct ethtool_rxfh_context { u32 indir_size; u32 key_size; + u32 indir_user_size; u16 priv_size; u8 hfunc; u8 input_xfrm; @@ -214,6 +217,7 @@ static inline u8 *ethtool_rxfh_context_key(struct ethtool_rxfh_context *ctx) } void ethtool_rxfh_context_lost(struct net_device *dev, u32 context_id); +void ethtool_rxfh_indir_lost(struct net_device *dev); struct link_mode_info { int speed; @@ -1337,12 +1341,15 @@ int ethtool_virtdev_set_link_ksettings(struct net_device *dev, * @rss_ctx: XArray of custom RSS contexts * @rss_lock: Protects entries in @rss_ctx. May be taken from * within RTNL. + * @rss_indir_user_size: Number of user provided entries for the default + * (context 0) indirection table. * @wol_enabled: Wake-on-LAN is enabled * @module_fw_flash_in_progress: Module firmware flashing is in progress. */ struct ethtool_netdev_state { struct xarray rss_ctx; struct mutex rss_lock; + u32 rss_indir_user_size; unsigned wol_enabled:1; unsigned module_fw_flash_in_progress:1; }; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6882b41bb3e8..e15367373f7c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1716,7 +1716,6 @@ struct net_device_ops { * @IFF_OPENVSWITCH: device is a Open vSwitch master * @IFF_L3MDEV_SLAVE: device is enslaved to an L3 master device * @IFF_TEAM: device is a team device - * @IFF_RXFH_CONFIGURED: device has had Rx Flow indirection table configured * @IFF_PHONY_HEADROOM: the headroom value is controlled by an external * entity (i.e. the master device for bridged veth) * @IFF_MACSEC: device is a MACsec device @@ -1752,7 +1751,6 @@ enum netdev_priv_flags { IFF_OPENVSWITCH = 1<<20, IFF_L3MDEV_SLAVE = 1<<21, IFF_TEAM = 1<<22, - IFF_RXFH_CONFIGURED = 1<<23, IFF_PHONY_HEADROOM = 1<<24, IFF_MACSEC = 1<<25, IFF_NO_RX_HANDLER = 1<<26, @@ -5580,10 +5578,7 @@ static inline bool netif_is_lag_port(const struct net_device *dev) return netif_is_bond_slave(dev) || netif_is_team_port(dev); } -static inline bool netif_is_rxfh_configured(const struct net_device *dev) -{ - return dev->priv_flags & IFF_RXFH_CONFIGURED; -} +bool netif_is_rxfh_configured(const struct net_device *dev); static inline bool netif_is_failover(const struct net_device *dev) { -- cgit v1.2.3 From 02bcb20083b2780772cfb66cd426f31940296783 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Fri, 20 Mar 2026 09:58:22 +0100 Subject: ethtool: Add RSS indirection table resize helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The core locks ctx->indir_size when an RSS context is created. Some NICs (e.g. bnxt) change their indirection table size based on the channel count, because the hardware table is a shared resource. This forces drivers to reject channel changes when RSS contexts exist. Add driver helpers to resize indirection tables: ethtool_rxfh_indir_can_resize() checks whether the default context indirection table can be resized. ethtool_rxfh_indir_resize() resizes the default context table in place. Folding (shrink) requires the table to be periodic at the new size; non-periodic tables are rejected. Unfolding (grow) replicates the existing pattern. Sizes must be multiples of each other. ethtool_rxfh_ctxs_can_resize() validates all non-default RSS contexts can be resized. ethtool_rxfh_ctxs_resize() applies the resize. Signed-off-by: Björn Töpel Link: https://patch.msgid.link/20260320085826.1957255-3-bjorn@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 34ca9261de82..1cb0740ba331 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -218,6 +218,12 @@ static inline u8 *ethtool_rxfh_context_key(struct ethtool_rxfh_context *ctx) void ethtool_rxfh_context_lost(struct net_device *dev, u32 context_id); void ethtool_rxfh_indir_lost(struct net_device *dev); +bool ethtool_rxfh_indir_can_resize(struct net_device *dev, const u32 *tbl, + u32 old_size, u32 new_size); +void ethtool_rxfh_indir_resize(struct net_device *dev, u32 *tbl, + u32 old_size, u32 new_size); +int ethtool_rxfh_ctxs_can_resize(struct net_device *dev, u32 new_indir_size); +void ethtool_rxfh_ctxs_resize(struct net_device *dev, u32 new_indir_size); struct link_mode_info { int speed; -- cgit v1.2.3 From e379dce8af11d8d6040b4348316a499bfd174bfb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Mar 2026 10:36:27 +0100 Subject: sched/topology: Fix sched_domain_span() Commit 8e8e23dea43e ("sched/topology: Compute sd_weight considering cpuset partitions") ends up relying on the fact that structure initialization should not touch the flexible array. However, the official GCC specification for "Arrays of Length Zero" [*] says: Although the size of a zero-length array is zero, an array member of this kind may increase the size of the enclosing type as a result of tail padding. Additionally, structure initialization will zero tail padding. With the end result that since offsetof(*type, member) < sizeof(*type), array initialization will clobber the flex array. Luckily, the way flexible array sizes are calculated is: sizeof(*type) + count * sizeof(*type->member) This means we have the complete size of the flex array *outside* of sizeof(*type), so use that instead of relying on the broken flex array definition. [*] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html Fixes: 8e8e23dea43e ("sched/topology: Compute sd_weight considering cpuset partitions") Reported-by: Nathan Chancellor Debugged-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Tested-by: Jon Hunter Tested-by: Chen Yu Tested-by: K Prateek Nayak Tested-by: Nathan Chancellor Link: https://patch.msgid.link/20260323093627.GY3738010@noisy.programming.kicks-ass.net --- include/linux/sched/topology.h | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 51c29581f15e..36553e14866d 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -142,18 +142,30 @@ struct sched_domain { unsigned int span_weight; /* - * Span of all CPUs in this domain. + * See sched_domain_span(), on why flex arrays are broken. * - * NOTE: this field is variable length. (Allocated dynamically - * by attaching extra space to the end of the structure, - * depending on how many CPUs the kernel has booted up with) - */ unsigned long span[]; + */ }; static inline struct cpumask *sched_domain_span(struct sched_domain *sd) { - return to_cpumask(sd->span); + /* + * Turns out that C flexible arrays are fundamentally broken since it + * is allowed for offsetof(*sd, span) < sizeof(*sd), this means that + * structure initialzation *sd = { ... }; which writes every byte + * inside sizeof(*type), will over-write the start of the flexible + * array. + * + * Luckily, the way we allocate sched_domain is by: + * + * sizeof(*sd) + cpumask_size() + * + * this means that we have sufficient space for the whole flex array + * *outside* of sizeof(*sd). So use that, and avoid using sd->span. + */ + unsigned long *bitmap = (void *)sd + sizeof(*sd); + return to_cpumask(bitmap); } extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], -- cgit v1.2.3 From 38ec410b99a5ee6566f75650ce3d4fd632940fd0 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Fri, 20 Mar 2026 10:18:17 +0800 Subject: virtio-net: correct hdr_len handling for VIRTIO_NET_F_GUEST_HDRLEN The commit be50da3e9d4a ("net: virtio_net: implement exact header length guest feature") introduces support for the VIRTIO_NET_F_GUEST_HDRLEN feature in virtio-net. This feature requires virtio-net to set hdr_len to the actual header length of the packet when transmitting, the number of bytes from the start of the packet to the beginning of the transport-layer payload. However, in practice, hdr_len was being set using skb_headlen(skb), which is clearly incorrect. This commit fixes that issue. Fixes: be50da3e9d4a ("net: virtio_net: implement exact header length guest feature") Signed-off-by: Xuan Zhuo Link: https://patch.msgid.link/20260320021818.111741-2-xuanzhuo@linux.alibaba.com Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- include/linux/virtio_net.h | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 75dabb763c65..361b60c8be68 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -207,6 +207,23 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, return __virtio_net_hdr_to_skb(skb, hdr, little_endian, hdr->gso_type); } +/* This function must be called after virtio_net_hdr_from_skb(). */ +static inline void __virtio_net_set_hdrlen(const struct sk_buff *skb, + struct virtio_net_hdr *hdr, + bool little_endian) +{ + u16 hdr_len; + + hdr_len = skb_transport_offset(skb); + + if (hdr->gso_type == VIRTIO_NET_HDR_GSO_UDP_L4) + hdr_len += sizeof(struct udphdr); + else + hdr_len += tcp_hdrlen(skb); + + hdr->hdr_len = __cpu_to_virtio16(little_endian, hdr_len); +} + static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb, struct virtio_net_hdr *hdr, bool little_endian, @@ -385,7 +402,8 @@ virtio_net_hdr_tnl_from_skb(const struct sk_buff *skb, bool tnl_hdr_negotiated, bool little_endian, int vlan_hlen, - bool has_data_valid) + bool has_data_valid, + bool feature_hdrlen) { struct virtio_net_hdr *hdr = (struct virtio_net_hdr *)vhdr; unsigned int inner_nh, outer_th; @@ -394,9 +412,17 @@ virtio_net_hdr_tnl_from_skb(const struct sk_buff *skb, tnl_gso_type = skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM); - if (!tnl_gso_type) - return virtio_net_hdr_from_skb(skb, hdr, little_endian, - has_data_valid, vlan_hlen); + if (!tnl_gso_type) { + ret = virtio_net_hdr_from_skb(skb, hdr, little_endian, + has_data_valid, vlan_hlen); + if (ret) + return ret; + + if (feature_hdrlen && hdr->hdr_len) + __virtio_net_set_hdrlen(skb, hdr, little_endian); + + return ret; + } /* Tunnel support not negotiated but skb ask for it. */ if (!tnl_hdr_negotiated) -- cgit v1.2.3 From 6c860dc02a8e60b438e26940227dfa641fcdb66a Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Fri, 20 Mar 2026 10:18:18 +0800 Subject: virtio-net: correct hdr_len handling for tunnel gso The commit a2fb4bc4e2a6a03 ("net: implement virtio helpers to handle UDP GSO tunneling.") introduces support for the UDP GSO tunnel feature in virtio-net. The virtio spec says: If the \field{gso_type} has the VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV4 bit or VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6 bit set, \field{hdr_len} accounts for all the headers up to and including the inner transport. The commit did not update the hdr_len to include the inner transport. I observed that the "hdr_len" is 116 for this packet: 17:36:18.241105 52:55:00:d1:27:0a > 2e:2c:df:46:a9:e1, ethertype IPv4 (0x0800), length 2912: (tos 0x0, ttl 64, id 45197, offset 0, flags [none], proto UDP (17), length 2898) 192.168.122.100.50613 > 192.168.122.1.4789: [bad udp cksum 0x8106 -> 0x26a0!] VXLAN, flags [I] (0x08), vni 1 fa:c3:ba:82:05:ee > ce:85:0c:31:77:e5, ethertype IPv4 (0x0800), length 2862: (tos 0x0, ttl 64, id 14678, offset 0, flags [DF], proto TCP (6), length 2848) 192.168.3.1.49880 > 192.168.3.2.9898: Flags [P.], cksum 0x9266 (incorrect -> 0xaa20), seq 515667:518463, ack 1, win 64, options [nop,nop,TS val 2990048824 ecr 2798801412], length 2796 116 = 14(mac) + 20(ip) + 8(udp) + 8(vxlan) + 14(inner mac) + 20(inner ip) + 32(innner tcp) Fixes: a2fb4bc4e2a6a03 ("net: implement virtio helpers to handle UDP GSO tunneling.") Signed-off-by: Xuan Zhuo Link: https://patch.msgid.link/20260320021818.111741-3-xuanzhuo@linux.alibaba.com Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- include/linux/virtio_net.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 361b60c8be68..f36d21b5bc19 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -224,6 +224,22 @@ static inline void __virtio_net_set_hdrlen(const struct sk_buff *skb, hdr->hdr_len = __cpu_to_virtio16(little_endian, hdr_len); } +/* This function must be called after virtio_net_hdr_from_skb(). */ +static inline void __virtio_net_set_tnl_hdrlen(const struct sk_buff *skb, + struct virtio_net_hdr *hdr) +{ + u16 hdr_len; + + hdr_len = skb_inner_transport_offset(skb); + + if (hdr->gso_type == VIRTIO_NET_HDR_GSO_UDP_L4) + hdr_len += sizeof(struct udphdr); + else + hdr_len += inner_tcp_hdrlen(skb); + + hdr->hdr_len = __cpu_to_virtio16(true, hdr_len); +} + static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb, struct virtio_net_hdr *hdr, bool little_endian, @@ -440,6 +456,9 @@ virtio_net_hdr_tnl_from_skb(const struct sk_buff *skb, if (ret) return ret; + if (feature_hdrlen && hdr->hdr_len) + __virtio_net_set_tnl_hdrlen(skb, hdr); + if (skb->protocol == htons(ETH_P_IPV6)) hdr->gso_type |= VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6; else -- cgit v1.2.3 From 96b76f7bc575ac6c69090f4642e424b04fb6784c Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Mon, 23 Mar 2026 22:01:10 +0300 Subject: pinctrl: introduce pinctrl_gpio_get_config() This is a counterpart of pinctrl_gpio_set_config(), which will be used to implement the ->get() interface in a GPIO driver for SCMI. This also requires that we create a stub so pin_config_get_for_pin() can build when CONFIG_PINCONF is disabled. Signed-off-by: AKASHI Takahiro Signed-off-by: Dan Carpenter Reviewed-by: Linus Walleij Reviewed-by: Andy Shevchenko Signed-off-by: Linus Walleij --- include/linux/pinctrl/consumer.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h index 63ce16191eb9..11b8f0b8da0c 100644 --- a/include/linux/pinctrl/consumer.h +++ b/include/linux/pinctrl/consumer.h @@ -35,6 +35,8 @@ int pinctrl_gpio_direction_output(struct gpio_chip *gc, unsigned int offset); int pinctrl_gpio_set_config(struct gpio_chip *gc, unsigned int offset, unsigned long config); +int pinctrl_gpio_get_config(struct gpio_chip *gc, unsigned int offset, + unsigned long *config); struct pinctrl * __must_check pinctrl_get(struct device *dev); void pinctrl_put(struct pinctrl *p); @@ -101,6 +103,13 @@ pinctrl_gpio_direction_output(struct gpio_chip *gc, unsigned int offset) return 0; } +static inline int +pinctrl_gpio_get_config(struct gpio_chip *gc, unsigned int offset, + unsigned long *config) +{ + return 0; +} + static inline int pinctrl_gpio_set_config(struct gpio_chip *gc, unsigned int offset, unsigned long config) -- cgit v1.2.3 From a21c1e961de28b95099a9ca2c3774b2eee1a33bb Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 19 Mar 2026 14:52:38 +0100 Subject: compiler: Simplify generic RELOC_HIDE() When enabling Context Analysis (CONTEXT_ANALYSIS := y) in arch/x86/kvm code, Clang's Thread Safety Analysis failed to recognize that identical per_cpu() accesses refer to the same lock: | CC [M] arch/x86/kvm/vmx/posted_intr.o | arch/x86/kvm/vmx/posted_intr.c:186:2: error: releasing raw_spinlock '__ptr + __per_cpu_offset[vcpu->cpu]' that was not held [-Werror,-Wthread-safety-analysis] | 186 | raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); | | ^ | ./include/linux/spinlock.h:276:32: note: expanded from macro 'raw_spin_unlock' | 276 | #define raw_spin_unlock(lock) _raw_spin_unlock(lock) | | ^ | arch/x86/kvm/vmx/posted_intr.c:207:1: error: raw_spinlock '__ptr + __per_cpu_offset[vcpu->cpu]' is still held at the end of function [-Werror,-Wthread-safety-analysis] | 207 | } | | ^ | arch/x86/kvm/vmx/posted_intr.c:182:2: note: raw_spinlock acquired here | 182 | raw_spin_lock_nested(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu), | | ^ | ./include/linux/spinlock.h:235:2: note: expanded from macro 'raw_spin_lock_nested' | 235 | _raw_spin_lock(((void)(subclass), (lock))) | | ^ | 2 errors generated. This occurred because the default RELOC_HIDE() implementation (used by the per-CPU macros) is a statement expression containing an intermediate 'unsigned long' variable (this version appears to predate Git history). While the analysis strips away inner casts when resolving pointer aliases, it stops when encountering intermediate non-pointer variables (this is Thread Safety Analysis specific and irrelevant for codegen). This prevents the analysis from concluding that the pointers passed to e.g. raw_spin_lock() and raw_spin_unlock() were identical when per-CPU accessors are used. Simplify RELOC_HIDE() to a single expression. This preserves the intent of obfuscating UB-introducing out-of-bounds pointer calculations from the compiler via the 'unsigned long' cast, but allows the alias analysis to successfully resolve the pointers. Using a recent Clang version, I observe that generated code remains the same for vmlinux; the intermediate variable was already being optimized away (for any respectable modern compiler, not doing so would be an optimizer bug). Note that GCC provides its own version of RELOC_HIDE(), so this change only affects Clang builds. Add a test case to lib/test_context-analysis.c to catch any regressions. Reported-by: Bart Van Assche Reported-by: Sean Christopherson Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Nathan Chancellor Link: https://lore.kernel.org/all/e3946223-4543-4a76-a328-9c6865e95192@acm.org/ Link: https://patch.msgid.link/20260319135245.1420780-1-elver@google.com --- include/linux/compiler.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index af16624b29fd..cb2f6050bdf7 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -149,10 +149,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #endif #ifndef RELOC_HIDE -# define RELOC_HIDE(ptr, off) \ - ({ unsigned long __ptr; \ - __ptr = (unsigned long) (ptr); \ - (typeof(ptr)) (__ptr + (off)); }) +# define RELOC_HIDE(ptr, off) ((typeof(ptr))((unsigned long)(ptr) + (off))) #endif #define absolute_pointer(val) RELOC_HIDE((void *)(val), 0) -- cgit v1.2.3 From cc34d77dd48708d810c12bfd6f5bf03304f6c824 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Tue, 24 Mar 2026 01:59:15 +0100 Subject: spi: use generic driver_override infrastructure When a driver is probed through __driver_attach(), the bus' match() callback is called without the device lock held, thus accessing the driver_override field without a lock, which can cause a UAF. Fix this by using the driver-core driver_override infrastructure taking care of proper locking internally. Note that calling match() from __driver_attach() without the device lock held is intentional. [1] Also note that we do not enable the driver_override feature of struct bus_type, as SPI - in contrast to most other buses - passes "" to sysfs_emit() when the driver_override pointer is NULL. Thus, printing "\n" instead of "(null)\n". Link: https://lore.kernel.org/driver-core/DGRGTIRHA62X.3RY09D9SOK77P@kernel.org/ [1] Reported-by: Gui-Dong Han Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220789 Fixes: 5039563e7c25 ("spi: Add driver_override SPI device attribute") Signed-off-by: Danilo Krummrich Link: https://patch.msgid.link/20260324005919.2408620-12-dakr@kernel.org Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index af7cfee7b8f6..0dc671c07d3a 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -159,10 +159,6 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, * @modalias: Name of the driver to use with this device, or an alias * for that name. This appears in the sysfs "modalias" attribute * for driver coldplugging, and in uevents used for hotplugging - * @driver_override: If the name of a driver is written to this attribute, then - * the device will bind to the named driver and only the named driver. - * Do not set directly, because core frees it; use driver_set_override() to - * set or clear it. * @pcpu_statistics: statistics for the spi_device * @word_delay: delay to be inserted between consecutive * words of a transfer @@ -224,7 +220,6 @@ struct spi_device { void *controller_state; void *controller_data; char modalias[SPI_NAME_SIZE]; - const char *driver_override; /* The statistics */ struct spi_statistics __percpu *pcpu_statistics; -- cgit v1.2.3 From a02327413acc141a887fe77b89656e88bcc4f412 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 14 Mar 2026 14:45:55 -0700 Subject: bpf: Remove inclusions of crypto/sha1.h Since commit 603b44162325 ("bpf: Update the bpf_prog_calc_tag to use SHA256") made BPF program tags use SHA-256 instead of SHA-1, the header no longer needs to be included. Remove the relevant inclusions so that they no longer unnecessarily come up in searches for which kernel code is still using the obsolete SHA-1 algorithm. Since net/ipv6/addrconf.c was relying on the transitive inclusion of (for an unrelated purpose) via , make it include explicitly in order to keep that file building. Signed-off-by: Eric Biggers Acked-by: Paul Chaignon Link: https://lore.kernel.org/r/20260314214555.112386-1-ebiggers@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 44d7ae95ddbc..e40d4071a345 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -21,7 +21,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3 From bd882ffdd48a200ca2faa7c3e690ecf765784b16 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 23 Mar 2026 16:38:32 +0800 Subject: f2fs: call f2fs_handle_critical_error() to set cp_error flag f2fs_handle_page_eio() is the only left place we set CP_ERROR_FLAG directly, it missed to update superblock.s_stop_reason, let's call f2fs_handle_critical_error() instead to fix that. Introduce STOP_CP_REASON_READ_{META,NODE,DATA} stop_cp_reason enum variable to indicate which kind of data we failed to read. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index dc41722fcc9d..829a59399dac 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -80,6 +80,9 @@ enum stop_cp_reason { STOP_CP_REASON_NO_SEGMENT, STOP_CP_REASON_CORRUPTED_FREE_BITMAP, STOP_CP_REASON_CORRUPTED_NID, + STOP_CP_REASON_READ_META, + STOP_CP_REASON_READ_NODE, + STOP_CP_REASON_READ_DATA, STOP_CP_REASON_MAX, }; -- cgit v1.2.3 From 1b164b876c36c3eb5561dd9b37702b04401b0166 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Mar 2026 10:21:25 -1000 Subject: cgroup: Wait for dying tasks to leave on rmdir a72f73c4dd9b ("cgroup: Don't expose dead tasks in cgroup") hid PF_EXITING tasks from cgroup.procs so that systemd doesn't see tasks that have already been reaped via waitpid(). However, the populated counter (nr_populated_csets) is only decremented when the task later passes through cgroup_task_dead() in finish_task_switch(). This means cgroup.procs can appear empty while the cgroup is still populated, causing rmdir to fail with -EBUSY. Fix this by making cgroup_rmdir() wait for dying tasks to fully leave. If the cgroup is populated but all remaining tasks have PF_EXITING set (the task iterator returns none due to the existing filter), wait for a kick from cgroup_task_dead() and retry. The wait is brief as tasks are removed from the cgroup's css_set between PF_EXITING assertion in do_exit() and cgroup_task_dead() in finish_task_switch(). v2: cgroup_is_populated() true to false transition happens under css_set_lock not cgroup_mutex, so retest under css_set_lock before sleeping to avoid missed wakeups (Sebastian). Fixes: a72f73c4dd9b ("cgroup: Don't expose dead tasks in cgroup") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202603222104.2c81684e-lkp@intel.com Reported-by: Sebastian Andrzej Siewior Signed-off-by: Tejun Heo Reviewed-by: Sebastian Andrzej Siewior Cc: Bert Karwatzki Cc: Michal Koutny Cc: cgroups@vger.kernel.org --- include/linux/cgroup-defs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index bb92f5c169ca..7f87399938fa 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -609,6 +609,9 @@ struct cgroup { /* used to wait for offlining of csses */ wait_queue_head_t offline_waitq; + /* used by cgroup_rmdir() to wait for dying tasks to leave */ + wait_queue_head_t dying_populated_waitq; + /* used to schedule release agent */ struct work_struct release_agent_work; -- cgit v1.2.3 From 8988913aacee82e5401bf3b96839731982dcbde7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 5 Mar 2026 10:31:38 +0100 Subject: module: Drop unused signature types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only PKCS#7 signatures are used today. Remove the unused enum values. As this enum is used in on-disk data, preserve the numeric value. Signed-off-by: Thomas Weißschuh Reviewed-by: Petr Pavlu Reviewed-by: Nicolas Schier Signed-off-by: Sami Tolvanen --- include/linux/module_signature.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module_signature.h b/include/linux/module_signature.h index 7eb4b00381ac..820cc1473383 100644 --- a/include/linux/module_signature.h +++ b/include/linux/module_signature.h @@ -15,9 +15,7 @@ #define MODULE_SIG_STRING "~Module signature appended~\n" enum pkey_id_type { - PKEY_ID_PGP, /* OpenPGP generated key ID */ - PKEY_ID_X509, /* X.509 arbitrary subjectKeyIdentifier */ - PKEY_ID_PKCS7, /* Signature in PKCS#7 message */ + PKEY_ID_PKCS7 = 2, /* Signature in PKCS#7 message */ }; /* -- cgit v1.2.3 From acd87264af525dba6e9355310e8acdf066a5f6b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 5 Mar 2026 10:31:39 +0100 Subject: module: Give 'enum pkey_id_type' a more specific name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This enum originates in generic cryptographic code and has a very generic name. Nowadays it is only used for module signatures. As this enum is going to be exposed in a UAPI header, give it a more specific name for clarity and consistency. Signed-off-by: Thomas Weißschuh Reviewed-by: Petr Pavlu Reviewed-by: Nicolas Schier Signed-off-by: Sami Tolvanen --- include/linux/module_signature.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module_signature.h b/include/linux/module_signature.h index 820cc1473383..c3a05d4cfe67 100644 --- a/include/linux/module_signature.h +++ b/include/linux/module_signature.h @@ -14,8 +14,8 @@ /* In stripped ARM and x86-64 modules, ~ is surprisingly rare. */ #define MODULE_SIG_STRING "~Module signature appended~\n" -enum pkey_id_type { - PKEY_ID_PKCS7 = 2, /* Signature in PKCS#7 message */ +enum module_signature_type { + MODULE_SIGNATURE_TYPE_PKCS7 = 2, /* Signature in PKCS#7 message */ }; /* @@ -31,7 +31,7 @@ enum pkey_id_type { struct module_signature { u8 algo; /* Public-key crypto algorithm [0] */ u8 hash; /* Digest algorithm [0] */ - u8 id_type; /* Key identifier type [PKEY_ID_PKCS7] */ + u8 id_type; /* Key identifier type [enum module_signature_type] */ u8 signer_len; /* Length of signer's name [0] */ u8 key_id_len; /* Length of key identifier [0] */ u8 __pad[3]; -- cgit v1.2.3 From 2ae4ea2d9aaf25cb74fbc23450b1b8f0a5b7aa89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 5 Mar 2026 10:31:40 +0100 Subject: module: Give MODULE_SIG_STRING a more descriptive name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The purpose of the constant it is not entirely clear from its name. As this constant is going to be exposed in a UAPI header, give it a more specific name for clarity. As all its users call it 'marker', use that wording in the constant itself. Signed-off-by: Thomas Weißschuh Reviewed-by: Petr Pavlu Reviewed-by: Nicolas Schier Signed-off-by: Sami Tolvanen --- include/linux/module_signature.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/module_signature.h b/include/linux/module_signature.h index c3a05d4cfe67..915549c779dc 100644 --- a/include/linux/module_signature.h +++ b/include/linux/module_signature.h @@ -12,7 +12,7 @@ #include /* In stripped ARM and x86-64 modules, ~ is surprisingly rare. */ -#define MODULE_SIG_STRING "~Module signature appended~\n" +#define MODULE_SIGNATURE_MARKER "~Module signature appended~\n" enum module_signature_type { MODULE_SIGNATURE_TYPE_PKCS7 = 2, /* Signature in PKCS#7 message */ -- cgit v1.2.3 From f9909cf0a2dcc9e99377f3fcc965ccd93e518e34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 5 Mar 2026 10:31:41 +0100 Subject: module: Move 'struct module_signature' to UAPI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This structure definition is used outside the kernel proper. For example in kmod and the kernel build environment. To allow reuse, move it to a new UAPI header. While it is not a true UAPI, it is a common practice to have non-UAPI interface definitions in the kernel's UAPI headers. Signed-off-by: Thomas Weißschuh Reviewed-by: Petr Pavlu Reviewed-by: Nicolas Schier Signed-off-by: Sami Tolvanen --- include/linux/module_signature.h | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module_signature.h b/include/linux/module_signature.h index 915549c779dc..db335d46787f 100644 --- a/include/linux/module_signature.h +++ b/include/linux/module_signature.h @@ -10,33 +10,7 @@ #define _LINUX_MODULE_SIGNATURE_H #include - -/* In stripped ARM and x86-64 modules, ~ is surprisingly rare. */ -#define MODULE_SIGNATURE_MARKER "~Module signature appended~\n" - -enum module_signature_type { - MODULE_SIGNATURE_TYPE_PKCS7 = 2, /* Signature in PKCS#7 message */ -}; - -/* - * Module signature information block. - * - * The constituents of the signature section are, in order: - * - * - Signer's name - * - Key identifier - * - Signature data - * - Information block - */ -struct module_signature { - u8 algo; /* Public-key crypto algorithm [0] */ - u8 hash; /* Digest algorithm [0] */ - u8 id_type; /* Key identifier type [enum module_signature_type] */ - u8 signer_len; /* Length of signer's name [0] */ - u8 key_id_len; /* Length of key identifier [0] */ - u8 __pad[3]; - __be32 sig_len; /* Length of signature data */ -}; +#include int mod_check_sig(const struct module_signature *ms, size_t file_len, const char *name); -- cgit v1.2.3 From c291cfac49a67debdad766eedc450ef613f41b2d Mon Sep 17 00:00:00 2001 From: Kit Dallege Date: Sun, 15 Mar 2026 18:09:41 +0100 Subject: entry: Add missing kernel-doc for arch_ptrace_report_syscall functions Document @regs and @step parameters for arch_ptrace_report_syscall_entry() and arch_ptrace_report_syscall_exit() that were missing from the kernel-doc comments. Signed-off-by: Kit Dallege Signed-off-by: Thomas Gleixner Assisted-by: Claude:claude-opus-4-6 Link: https://patch.msgid.link/20260315170941.65913-1-xaum.io@gmail.com --- include/linux/entry-common.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index f83ca0abf2cd..d223246401bc 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -48,6 +48,7 @@ /** * arch_ptrace_report_syscall_entry - Architecture specific ptrace_report_syscall_entry() wrapper + * @regs: Pointer to the register state at syscall entry * * Invoked from syscall_trace_enter() to wrap ptrace_report_syscall_entry(). * @@ -205,6 +206,8 @@ static __always_inline bool report_single_step(unsigned long work) /** * arch_ptrace_report_syscall_exit - Architecture specific ptrace_report_syscall_exit() + * @regs: Pointer to the register state at syscall exit + * @step: Indicates a single-step exit rather than a normal syscall exit * * This allows architecture specific ptrace_report_syscall_exit() * implementations. If not defined by the architecture this falls back to -- cgit v1.2.3 From 8add6d87dc69c0620c7e60bdc6be6b3b0092d9fa Mon Sep 17 00:00:00 2001 From: Xuyang Dong Date: Tue, 3 Mar 2026 16:06:55 +0800 Subject: clk: divider: Add devm_clk_hw_register_divider_parent_data Add the devres variant of clk_hw_register_divider_parent_data() for registering a divider clock with parent clk data instead of parent name. Reviewed-by: Brian Masney Signed-off-by: Xuyang Dong Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 630705a47129..64967ac1b1df 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -947,6 +947,26 @@ struct clk *clk_register_divider_table(struct device *dev, const char *name, (parent_hw), NULL, (flags), (reg), \ (shift), (width), (clk_divider_flags), \ NULL, (lock)) +/** + * devm_clk_hw_register_divider_parent_data - register a divider clock with the + * clock framework + * @dev: device registering this clock + * @name: name of this clock + * @parent_data: parent clk data + * @flags: framework-specific flags + * @reg: register address to adjust divider + * @shift: number of bits to shift the bitfield + * @width: width of the bitfield + * @clk_divider_flags: divider-specific flags for this clock + * @lock: shared register lock for this clock + */ +#define devm_clk_hw_register_divider_parent_data(dev, name, parent_data, \ + flags, reg, shift, width, \ + clk_divider_flags, lock) \ + __devm_clk_hw_register_divider((dev), NULL, (name), NULL, NULL, \ + (parent_data), (flags), (reg), (shift), \ + (width), (clk_divider_flags), NULL, \ + (lock)) /** * devm_clk_hw_register_divider_table - register a table based divider clock * with the clock framework (devres variant) -- cgit v1.2.3 From 37beb42560165869838e7d91724f3e629db64129 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Tue, 3 Mar 2026 15:08:38 +0000 Subject: randomize_kstack: Maintain kstack_offset per task kstack_offset was previously maintained per-cpu, but this caused a couple of issues. So let's instead make it per-task. Issue 1: add_random_kstack_offset() and choose_random_kstack_offset() expected and required to be called with interrupts and preemption disabled so that it could manipulate per-cpu state. But arm64, loongarch and risc-v are calling them with interrupts and preemption enabled. I don't _think_ this causes any functional issues, but it's certainly unexpected and could lead to manipulating the wrong cpu's state, which could cause a minor performance degradation due to bouncing the cache lines. By maintaining the state per-task those functions can safely be called in preemptible context. Issue 2: add_random_kstack_offset() is called before executing the syscall and expands the stack using a previously chosen random offset. choose_random_kstack_offset() is called after executing the syscall and chooses and stores a new random offset for the next syscall. With per-cpu storage for this offset, an attacker could force cpu migration during the execution of the syscall and prevent the offset from being updated for the original cpu such that it is predictable for the next syscall on that cpu. By maintaining the state per-task, this problem goes away because the per-task random offset is updated after the syscall regardless of which cpu it is executing on. Fixes: 39218ff4c625 ("stack: Optionally randomize kernel stack offset each syscall") Closes: https://lore.kernel.org/all/dd8c37bc-795f-4c7a-9086-69e584d8ab24@arm.com/ Cc: stable@vger.kernel.org Acked-by: Mark Rutland Signed-off-by: Ryan Roberts Link: https://patch.msgid.link/20260303150840.3789438-2-ryan.roberts@arm.com Signed-off-by: Kees Cook --- include/linux/randomize_kstack.h | 26 +++++++++++++++----------- include/linux/sched.h | 4 ++++ 2 files changed, 19 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/randomize_kstack.h b/include/linux/randomize_kstack.h index 1d982dbdd0d0..5d3916ca747c 100644 --- a/include/linux/randomize_kstack.h +++ b/include/linux/randomize_kstack.h @@ -9,7 +9,6 @@ DECLARE_STATIC_KEY_MAYBE(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT, randomize_kstack_offset); -DECLARE_PER_CPU(u32, kstack_offset); /* * Do not use this anywhere else in the kernel. This is used here because @@ -50,15 +49,14 @@ DECLARE_PER_CPU(u32, kstack_offset); * add_random_kstack_offset - Increase stack utilization by previously * chosen random offset * - * This should be used in the syscall entry path when interrupts and - * preempt are disabled, and after user registers have been stored to - * the stack. For testing the resulting entropy, please see: - * tools/testing/selftests/lkdtm/stack-entropy.sh + * This should be used in the syscall entry path after user registers have been + * stored to the stack. Preemption may be enabled. For testing the resulting + * entropy, please see: tools/testing/selftests/lkdtm/stack-entropy.sh */ #define add_random_kstack_offset() do { \ if (static_branch_maybe(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT, \ &randomize_kstack_offset)) { \ - u32 offset = raw_cpu_read(kstack_offset); \ + u32 offset = current->kstack_offset; \ u8 *ptr = __kstack_alloca(KSTACK_OFFSET_MAX(offset)); \ /* Keep allocation even after "ptr" loses scope. */ \ asm volatile("" :: "r"(ptr) : "memory"); \ @@ -69,9 +67,9 @@ DECLARE_PER_CPU(u32, kstack_offset); * choose_random_kstack_offset - Choose the random offset for the next * add_random_kstack_offset() * - * This should only be used during syscall exit when interrupts and - * preempt are disabled. This position in the syscall flow is done to - * frustrate attacks from userspace attempting to learn the next offset: + * This should only be used during syscall exit. Preemption may be enabled. This + * position in the syscall flow is done to frustrate attacks from userspace + * attempting to learn the next offset: * - Maximize the timing uncertainty visible from userspace: if the * offset is chosen at syscall entry, userspace has much more control * over the timing between choosing offsets. "How long will we be in @@ -85,14 +83,20 @@ DECLARE_PER_CPU(u32, kstack_offset); #define choose_random_kstack_offset(rand) do { \ if (static_branch_maybe(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT, \ &randomize_kstack_offset)) { \ - u32 offset = raw_cpu_read(kstack_offset); \ + u32 offset = current->kstack_offset; \ offset = ror32(offset, 5) ^ (rand); \ - raw_cpu_write(kstack_offset, offset); \ + current->kstack_offset = offset; \ } \ } while (0) + +static inline void random_kstack_task_init(struct task_struct *tsk) +{ + tsk->kstack_offset = 0; +} #else /* CONFIG_RANDOMIZE_KSTACK_OFFSET */ #define add_random_kstack_offset() do { } while (0) #define choose_random_kstack_offset(rand) do { } while (0) +#define random_kstack_task_init(tsk) do { } while (0) #endif /* CONFIG_RANDOMIZE_KSTACK_OFFSET */ #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index a7b4a980eb2f..8358e430dd7f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1592,6 +1592,10 @@ struct task_struct { unsigned long prev_lowest_stack; #endif +#ifdef CONFIG_RANDOMIZE_KSTACK_OFFSET + u32 kstack_offset; +#endif + #ifdef CONFIG_X86_MCE void __user *mce_vaddr; __u64 mce_kflags; -- cgit v1.2.3 From a96ef5848cb096226bf6aff31a90d8b136d99b71 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Tue, 3 Mar 2026 15:08:39 +0000 Subject: randomize_kstack: Unify random source across arches Previously different architectures were using random sources of differing strength and cost to decide the random kstack offset. A number of architectures (loongarch, powerpc, s390, x86) were using their timestamp counter, at whatever the frequency happened to be. Other arches (arm64, riscv) were using entropy from the crng via get_random_u16(). There have been concerns that in some cases the timestamp counters may be too weak, because they can be easily guessed or influenced by user space. And get_random_u16() has been shown to be too costly for the level of protection kstack offset randomization provides. So let's use a common, architecture-agnostic source of entropy; a per-cpu prng, seeded at boot-time from the crng. This has a few benefits: - We can remove choose_random_kstack_offset(); That was only there to try to make the timestamp counter value a bit harder to influence from user space [*]. - The architecture code is simplified. All it has to do now is call add_random_kstack_offset() in the syscall path. - The strength of the randomness can be reasoned about independently of the architecture. - Arches previously using get_random_u16() now have much faster syscall paths, see below results. [*] Additionally, this gets rid of some redundant work on s390 and x86. Before this patch, those architectures called choose_random_kstack_offset() under arch_exit_to_user_mode_prepare(), which is also called for exception returns to userspace which were *not* syscalls (e.g. regular interrupts). Getting rid of choose_random_kstack_offset() avoids a small amount of redundant work for the non-syscall cases. In some configurations, add_random_kstack_offset() will now call instrumentable code, so for a couple of arches, I have moved the call a bit later to the first point where instrumentation is allowed. This doesn't impact the efficacy of the mechanism. There have been some claims that a prng may be less strong than the timestamp counter if not regularly reseeded. But the prng has a period of about 2^113. So as long as the prng state remains secret, it should not be possible to guess. If the prng state can be accessed, we have bigger problems. Additionally, we are only consuming 6 bits to randomize the stack, so there are only 64 possible random offsets. I assert that it would be trivial for an attacker to brute force by repeating their attack and waiting for the random stack offset to be the desired one. The prng approach seems entirely proportional to this level of protection. Performance data are provided below. The baseline is v6.18 with rndstack on for each respective arch. (I)/(R) indicate statistically significant improvement/regression. arm64 platform is AWS Graviton3 (m7g.metal). x86_64 platform is AWS Sapphire Rapids (m7i.24xlarge): +-----------------+--------------+---------------+---------------+ | Benchmark | Result Class | per-cpu-prng | per-cpu-prng | | | | arm64 (metal) | x86_64 (VM) | +=================+==============+===============+===============+ | syscall/getpid | mean (ns) | (I) -9.50% | (I) -17.65% | | | p99 (ns) | (I) -59.24% | (I) -24.41% | | | p99.9 (ns) | (I) -59.52% | (I) -28.52% | +-----------------+--------------+---------------+---------------+ | syscall/getppid | mean (ns) | (I) -9.52% | (I) -19.24% | | | p99 (ns) | (I) -59.25% | (I) -25.03% | | | p99.9 (ns) | (I) -59.50% | (I) -28.17% | +-----------------+--------------+---------------+---------------+ | syscall/invalid | mean (ns) | (I) -10.31% | (I) -18.56% | | | p99 (ns) | (I) -60.79% | (I) -20.06% | | | p99.9 (ns) | (I) -61.04% | (I) -25.04% | +-----------------+--------------+---------------+---------------+ I tested an earlier version of this change on x86 bare metal and it showed a smaller but still significant improvement. The bare metal system wasn't available this time around so testing was done in a VM instance. I'm guessing the cost of rdtsc is higher for VMs. Acked-by: Mark Rutland Signed-off-by: Ryan Roberts Link: https://patch.msgid.link/20260303150840.3789438-3-ryan.roberts@arm.com Signed-off-by: Kees Cook --- include/linux/randomize_kstack.h | 52 +++++++++++++--------------------------- include/linux/sched.h | 4 ---- 2 files changed, 17 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/include/linux/randomize_kstack.h b/include/linux/randomize_kstack.h index 5d3916ca747c..024fc20e7762 100644 --- a/include/linux/randomize_kstack.h +++ b/include/linux/randomize_kstack.h @@ -6,6 +6,7 @@ #include #include #include +#include DECLARE_STATIC_KEY_MAYBE(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT, randomize_kstack_offset); @@ -45,9 +46,22 @@ DECLARE_STATIC_KEY_MAYBE(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT, #define KSTACK_OFFSET_MAX(x) ((x) & 0b1111111100) #endif +DECLARE_PER_CPU(struct rnd_state, kstack_rnd_state); + +static __always_inline u32 get_kstack_offset(void) +{ + struct rnd_state *state; + u32 rnd; + + state = &get_cpu_var(kstack_rnd_state); + rnd = prandom_u32_state(state); + put_cpu_var(kstack_rnd_state); + + return rnd; +} + /** - * add_random_kstack_offset - Increase stack utilization by previously - * chosen random offset + * add_random_kstack_offset - Increase stack utilization by a random offset. * * This should be used in the syscall entry path after user registers have been * stored to the stack. Preemption may be enabled. For testing the resulting @@ -56,47 +70,15 @@ DECLARE_STATIC_KEY_MAYBE(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT, #define add_random_kstack_offset() do { \ if (static_branch_maybe(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT, \ &randomize_kstack_offset)) { \ - u32 offset = current->kstack_offset; \ + u32 offset = get_kstack_offset(); \ u8 *ptr = __kstack_alloca(KSTACK_OFFSET_MAX(offset)); \ /* Keep allocation even after "ptr" loses scope. */ \ asm volatile("" :: "r"(ptr) : "memory"); \ } \ } while (0) -/** - * choose_random_kstack_offset - Choose the random offset for the next - * add_random_kstack_offset() - * - * This should only be used during syscall exit. Preemption may be enabled. This - * position in the syscall flow is done to frustrate attacks from userspace - * attempting to learn the next offset: - * - Maximize the timing uncertainty visible from userspace: if the - * offset is chosen at syscall entry, userspace has much more control - * over the timing between choosing offsets. "How long will we be in - * kernel mode?" tends to be more difficult to predict than "how long - * will we be in user mode?" - * - Reduce the lifetime of the new offset sitting in memory during - * kernel mode execution. Exposure of "thread-local" memory content - * (e.g. current, percpu, etc) tends to be easier than arbitrary - * location memory exposure. - */ -#define choose_random_kstack_offset(rand) do { \ - if (static_branch_maybe(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT, \ - &randomize_kstack_offset)) { \ - u32 offset = current->kstack_offset; \ - offset = ror32(offset, 5) ^ (rand); \ - current->kstack_offset = offset; \ - } \ -} while (0) - -static inline void random_kstack_task_init(struct task_struct *tsk) -{ - tsk->kstack_offset = 0; -} #else /* CONFIG_RANDOMIZE_KSTACK_OFFSET */ #define add_random_kstack_offset() do { } while (0) -#define choose_random_kstack_offset(rand) do { } while (0) -#define random_kstack_task_init(tsk) do { } while (0) #endif /* CONFIG_RANDOMIZE_KSTACK_OFFSET */ #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 8358e430dd7f..a7b4a980eb2f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1592,10 +1592,6 @@ struct task_struct { unsigned long prev_lowest_stack; #endif -#ifdef CONFIG_RANDOMIZE_KSTACK_OFFSET - u32 kstack_offset; -#endif - #ifdef CONFIG_X86_MCE void __user *mce_vaddr; __u64 mce_kflags; -- cgit v1.2.3 From 2cdaff22ed26f1e619aa2b43f27bb84f2c6ef8f8 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Wed, 25 Mar 2026 02:55:48 +0100 Subject: dma-mapping: add missing `inline` for `dma_free_attrs` Under an UML build for an upcoming series [1], I got `-Wstatic-in-inline` for `dma_free_attrs`: BINDGEN rust/bindings/bindings_generated.rs - due to target missing In file included from rust/helpers/helpers.c:59: rust/helpers/dma.c:17:2: warning: static function 'dma_free_attrs' is used in an inline function with external linkage [-Wstatic-in-inline] 17 | dma_free_attrs(dev, size, cpu_addr, dma_handle, attrs); | ^ rust/helpers/dma.c:12:1: note: use 'static' to give inline function 'rust_helper_dma_free_attrs' internal linkage 12 | __rust_helper void rust_helper_dma_free_attrs(struct device *dev, size_t size, | ^ | static The issue is that `dma_free_attrs` was not marked `inline` when it was introduced alongside the rest of the stubs. Thus mark it. Fixes: ed6ccf10f24b ("dma-mapping: properly stub out the DMA API for !CONFIG_HAS_DMA") Closes: https://lore.kernel.org/rust-for-linux/20260322194616.89847-1-ojeda@kernel.org/ [1] Signed-off-by: Miguel Ojeda Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260325015548.70912-1-ojeda@kernel.org --- include/linux/dma-mapping.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 482b919f040f..99ef042ecdb4 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -255,8 +255,8 @@ static inline void *dma_alloc_attrs(struct device *dev, size_t size, { return NULL; } -static void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, - dma_addr_t dma_handle, unsigned long attrs) +static inline void dma_free_attrs(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_handle, unsigned long attrs) { } static inline void *dmam_alloc_attrs(struct device *dev, size_t size, -- cgit v1.2.3 From 2c8fe1f14240d75f2002e16b2b69c5c2d27ed41c Mon Sep 17 00:00:00 2001 From: Oliver Collyer Date: Fri, 26 Dec 2025 06:57:18 +0000 Subject: media: uvcvideo: Add support for P010 pixel format Add support for the P010 (10-bit Y/UV 4:2:0) pixel format to the uvcvideo driver. This format is exposed by USB capture devices such as the Magewell USB Capture HDMI 4K Pro when capturing HDR10 content. P010 stores 10-bit Y and interleaved UV samples in 16-bit little-endian words, with data in the upper 10 bits and zeros in the lower 6 bits. This requires 2 bytes per sample, so bytesperline is wWidth * 2. V4L2_PIX_FMT_P010 was added to the V4L2 core in commit 5374d8fb75f3 ("media: Add P010 video format"). Based on the community DKMS patch from awawa-dev/P010_for_V4L2. Link: https://github.com/awawa-dev/P010_for_V4L2 Signed-off-by: Oliver Collyer Reviewed-by: Laurent Pinchart Link: https://patch.msgid.link/20251226065718.95504-1-ovcollyer@mac.com Signed-off-by: Laurent Pinchart Signed-off-by: Hans Verkuil --- include/linux/usb/uvc.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/uvc.h b/include/linux/usb/uvc.h index ea92ac623a45..05bfebab42b6 100644 --- a/include/linux/usb/uvc.h +++ b/include/linux/usb/uvc.h @@ -138,6 +138,9 @@ #define UVC_GUID_FORMAT_M420 \ { 'M', '4', '2', '0', 0x00, 0x00, 0x10, 0x00, \ 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_P010 \ + { 'P', '0', '1', '0', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} #define UVC_GUID_FORMAT_H264 \ { 'H', '2', '6', '4', 0x00, 0x00, 0x10, 0x00, \ -- cgit v1.2.3 From e93ab401da4b2e2c1b8ef2424de2f238d51c8b2d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 27 Feb 2026 14:22:16 +0100 Subject: quota: Fix race of dquot_scan_active() with quota deactivation dquot_scan_active() can race with quota deactivation in quota_release_workfn() like: CPU0 (quota_release_workfn) CPU1 (dquot_scan_active) ============================== ============================== spin_lock(&dq_list_lock); list_replace_init( &releasing_dquots, &rls_head); /* dquot X on rls_head, dq_count == 0, DQ_ACTIVE_B still set */ spin_unlock(&dq_list_lock); synchronize_srcu(&dquot_srcu); spin_lock(&dq_list_lock); list_for_each_entry(dquot, &inuse_list, dq_inuse) { /* finds dquot X */ dquot_active(X) -> true atomic_inc(&X->dq_count); } spin_unlock(&dq_list_lock); spin_lock(&dq_list_lock); dquot = list_first_entry(&rls_head); WARN_ON_ONCE(atomic_read(&dquot->dq_count)); The problem is not only a cosmetic one as under memory pressure the caller of dquot_scan_active() can end up working on freed dquot. Fix the problem by making sure the dquot is removed from releasing list when we acquire a reference to it. Fixes: 869b6ea1609f ("quota: Fix slow quotaoff") Reported-by: Sam Sun Link: https://lore.kernel.org/all/CAEkJfYPTt3uP1vAYnQ5V2ZWn5O9PLhhGi5HbOcAzyP9vbXyjeg@mail.gmail.com Signed-off-by: Jan Kara --- include/linux/quotaops.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index c334f82ed385..f9c0f9d7c9d9 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -44,14 +44,7 @@ int dquot_initialize(struct inode *inode); bool dquot_initialize_needed(struct inode *inode); void dquot_drop(struct inode *inode); struct dquot *dqget(struct super_block *sb, struct kqid qid); -static inline struct dquot *dqgrab(struct dquot *dquot) -{ - /* Make sure someone else has active reference to dquot */ - WARN_ON_ONCE(!atomic_read(&dquot->dq_count)); - WARN_ON_ONCE(!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)); - atomic_inc(&dquot->dq_count); - return dquot; -} +struct dquot *dqgrab(struct dquot *dquot); static inline bool dquot_is_busy(struct dquot *dquot) { -- cgit v1.2.3 From 239cd6a417b989708da4b39a71f925897ec87287 Mon Sep 17 00:00:00 2001 From: Manikandan Muralidharan Date: Mon, 23 Feb 2026 15:49:17 +0530 Subject: mfd: atmel-hlcdc: Fetch LVDS PLL clock for LVDS display The XLCDC IP supports parallel RGB, MIPI DSI and LVDS Display. The LCD Generic clock (sys_clk) is used for Parallel RGB and MIPI displays, while the LVDS PLL clock (lvds_pll_clk) is used for LVDS displays.Since both the clocks cannot co-exist together in the DT for a given display, this patch tries sys_clk first (RGB/MIPI), fallback to lvds_pll_clk (LVDS). Signed-off-by: Manikandan Muralidharan Signed-off-by: Dharma Balasubiramani Link: https://patch.msgid.link/20260223101920.284697-2-manikandan.m@microchip.com Signed-off-by: Lee Jones --- include/linux/mfd/atmel-hlcdc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mfd/atmel-hlcdc.h b/include/linux/mfd/atmel-hlcdc.h index 80d675a03b39..07c2081867fd 100644 --- a/include/linux/mfd/atmel-hlcdc.h +++ b/include/linux/mfd/atmel-hlcdc.h @@ -75,6 +75,7 @@ */ struct atmel_hlcdc { struct regmap *regmap; + struct clk *lvds_pll_clk; struct clk *periph_clk; struct clk *sys_clk; struct clk *slow_clk; -- cgit v1.2.3 From 0735e3007c1be6cb40372c403a69200d0929c8d7 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 18 Feb 2026 11:48:01 +0100 Subject: mfd: lpc_ich: Expose the GPIO controller cell's software node One of the users of this driver - meraki-mx100 - abuses the software node API by setting up a dummy software node without any logical link to this GPIO controller and uses the fact that the GPIO core matches the controller's label against the swnode's name to make the lookup work. We want to remove this behavior from GPIOLIB in favor of actual matching of firmware nodes but that would break this user. To facilitate that: create a software node for the GPIO controller cell and expose its address in the provided MFD header. Signed-off-by: Bartosz Golaszewski Acked-by: Andy Shevchenko Link: https://patch.msgid.link/20260218-meraki-swnodes-v2-1-92c521da241c@oss.qualcomm.com Signed-off-by: Lee Jones --- include/linux/mfd/lpc_ich.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/lpc_ich.h b/include/linux/mfd/lpc_ich.h index 1fbda1f8967d..1819aa743c5c 100644 --- a/include/linux/mfd/lpc_ich.h +++ b/include/linux/mfd/lpc_ich.h @@ -37,4 +37,6 @@ struct lpc_ich_info { u8 use_gpio; }; +extern const struct software_node lpc_ich_gpio_swnode; + #endif -- cgit v1.2.3 From a09506820afa391e0a8ecc4b05c954f21e50b1de Mon Sep 17 00:00:00 2001 From: Akari Tsuyukusa Date: Mon, 2 Mar 2026 23:00:45 +0900 Subject: mfd: mt6397: Properly fix CID of MT6328, MT6331 and MT6332 CIDs set for MT6328, MT6331 and MT6332 are not appropriate. Many Android downstream kernels define CID as below, MT6328: #define PMIC6328_E1_CID_CODE 0x2810 #define PMIC6328_E2_CID_CODE 0x2820 #define PMIC6328_E3_CID_CODE 0x2830 MT6331/MT6332: #define PMIC6331_E1_CID_CODE 0x3110 #define PMIC6331_E2_CID_CODE 0x3120 #define PMIC6331_E3_CID_CODE 0x3130 #define PMIC6332_E1_CID_CODE 0x3210 #define PMIC6332_E2_CID_CODE 0x3220 #define PMIC6332_E3_CID_CODE 0x3230 The current configuration incorrectly uses the revision code as the CID. Therefore, the driver cannot detect the same PMIC of different revisions. (E1/E2 for MT6328, E1/E3 for MT6331/MT6332) Based on these, the CID of MT6328, MT6331 and MT6332 should be corrected. Additionally, the incorrect MT6331/MT6332 CID overlaps with the MT6320's actual CID: #define PMIC6320_E1_CID_CODE 0x1020 #define PMIC6320_E2_CID_CODE 0x2020 This causes a conflict in the switch-case statement of mt6397-irq.c, this prevents adding support for MT6320. Signed-off-by: Akari Tsuyukusa Reviewed-by: AngeloGioacchino Del Regno Link: https://patch.msgid.link/20260302140045.651727-1-akkun11.open@gmail.com Signed-off-by: Lee Jones --- include/linux/mfd/mt6397/core.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/mt6397/core.h b/include/linux/mfd/mt6397/core.h index b774c3a4bb62..340fc72e22aa 100644 --- a/include/linux/mfd/mt6397/core.h +++ b/include/linux/mfd/mt6397/core.h @@ -12,9 +12,9 @@ enum chip_id { MT6323_CHIP_ID = 0x23, - MT6328_CHIP_ID = 0x30, - MT6331_CHIP_ID = 0x20, - MT6332_CHIP_ID = 0x20, + MT6328_CHIP_ID = 0x28, + MT6331_CHIP_ID = 0x31, + MT6332_CHIP_ID = 0x32, MT6357_CHIP_ID = 0x57, MT6358_CHIP_ID = 0x58, MT6359_CHIP_ID = 0x59, -- cgit v1.2.3 From ee63402eb41a4ffcac72490b3e93de606de8d394 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 9 Mar 2026 14:42:20 -0700 Subject: mfd: congatec: Fix kernel-doc struct member names Correct the struct member names to avoid kernel-doc warnings: Warning: include/linux/mfd/cgbc.h:38 struct member 'version' not described in 'cgbc_device_data' Warning: ../include/linux/mfd/cgbc.h:38 struct member 'lock' not described in 'cgbc_device_data' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260309214223.749088-2-rdunlap@infradead.org Signed-off-by: Lee Jones --- include/linux/mfd/cgbc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/cgbc.h b/include/linux/mfd/cgbc.h index badbec4c7033..91f501e76c8f 100644 --- a/include/linux/mfd/cgbc.h +++ b/include/linux/mfd/cgbc.h @@ -26,8 +26,8 @@ struct cgbc_version { * @io_cmd: Pointer to the command IO memory * @session: Session id returned by the Board Controller * @dev: Pointer to kernel device structure - * @cgbc_version: Board Controller version structure - * @mutex: Board Controller mutex + * @version: Board Controller version structure + * @lock: Board Controller mutex */ struct cgbc_device_data { void __iomem *io_session; -- cgit v1.2.3 From 69d7fa1b918d0aa0157aef5f71f757916194f099 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 9 Mar 2026 14:42:21 -0700 Subject: mfd: kempld: Fix kernel-doc struct member names Correct the struct member names to avoid kernel-doc warnings: Warning: include/linux/mfd/kempld.h:114 struct member 'gpio_base' not described in 'kempld_platform_data' Warning: include/linux/mfd/kempld.h:114 struct member 'get_hardware_mutex' not described in 'kempld_platform_data' Warning: include/linux/mfd/kempld.h:114 struct member 'release_hardware_mutex' not described in 'kempld_platform_data' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260309214223.749088-3-rdunlap@infradead.org Signed-off-by: Lee Jones --- include/linux/mfd/kempld.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/kempld.h b/include/linux/mfd/kempld.h index 643c096b93ac..30fb40325a09 100644 --- a/include/linux/mfd/kempld.h +++ b/include/linux/mfd/kempld.h @@ -97,10 +97,10 @@ struct kempld_device_data { /** * struct kempld_platform_data - PLD hardware configuration structure * @pld_clock: PLD clock frequency - * @gpio_base GPIO base pin number + * @gpio_base: GPIO base pin number * @ioresource: IO addresses of the PLD - * @get_mutex: PLD specific get_mutex callback - * @release_mutex: PLD specific release_mutex callback + * @get_hardware_mutex: PLD specific get_mutex callback + * @release_hardware_mutex: PLD specific release_mutex callback * @get_info: PLD specific get_info callback * @register_cells: PLD specific register_cells callback */ -- cgit v1.2.3 From 5671125a129e97bdd634cf74137cf109d4420a0b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 9 Mar 2026 14:42:22 -0700 Subject: mfd: rsmu: Remove a empty kernel-doc line kernel-doc format expects a prototype on the line that immediately follows the "/**" line, so drop this empty line. Warning: include/linux/mfd/rsmu.h:21 Cannot find identifier on line: * Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260309214223.749088-4-rdunlap@infradead.org Signed-off-by: Lee Jones --- include/linux/mfd/rsmu.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/rsmu.h b/include/linux/mfd/rsmu.h index 0379aa207428..2f27386a7122 100644 --- a/include/linux/mfd/rsmu.h +++ b/include/linux/mfd/rsmu.h @@ -19,7 +19,6 @@ enum rsmu_type { }; /** - * * struct rsmu_ddata - device data structure for sub devices. * * @dev: i2c/spi device. -- cgit v1.2.3 From 92601fb9d8f61db2ea254965722e379f53d111b5 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 9 Mar 2026 14:42:23 -0700 Subject: mfd: si476x: Fix kernel-doc warnings Add kernel-doc entries for missing fields or correct some typos in names to eliminate kernel-doc warnings: Warning: include/linux/mfd/si476x-core.h:156 struct member 'regmap' not described in 'si476x_core' Warning: include/linux/mfd/si476x-core.h:156 struct member 'power_state' not described in 'si476x_core' Warning: include/linux/mfd/si476x-core.h:156 struct member 'supplies' not described in 'si476x_core' Warning: include/linux/mfd/si476x-core.h:156 struct member 'is_alive' not described in 'si476x_core' Warning: include/linux/mfd/si476x-core.h:156 struct member 'rds_fifo_depth' not described in 'si476x_core' Warning: include/linux/mfd/si476x-core.h:170 function parameter 'core' not described in 'si476x_core_lock' Warning: include/linux/mfd/si476x-core.h:179 function parameter 'core' not described in 'si476x_core_unlock' Warning: include/linux/mfd/si476x-core.h:259 struct member 'firmware' not described in 'si476x_func_info' Warning: include/linux/mfd/si476x-core.h:335 struct member 'rds' not described in 'si476x_rds_status_report' I don't know what the 'ble' field is so I didn't add a kernel-doc comment for it: Warning: include/linux/mfd/si476x-core.h:335 struct member 'ble' not described in 'si476x_rds_status_report' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260309214223.749088-5-rdunlap@infradead.org Signed-off-by: Lee Jones --- include/linux/mfd/si476x-core.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/si476x-core.h b/include/linux/mfd/si476x-core.h index dd95c37ca134..e913b2cdf77d 100644 --- a/include/linux/mfd/si476x-core.h +++ b/include/linux/mfd/si476x-core.h @@ -77,6 +77,7 @@ enum si476x_power_state { * underlying "core" device which all the MFD cell-devices use. * * @client: Actual I2C client used to transfer commands to the chip. + * @regmap: Regmap for accessing the device registers * @chip_id: Last digit of the chip model(E.g. "1" for SI4761) * @cells: MFD cell devices created by this driver. * @cmd_lock: Mutex used to serialize all the requests to the core @@ -100,16 +101,18 @@ enum si476x_power_state { * @stc: Similar to @cts, but for the STC bit of the status value. * @power_up_parameters: Parameters used as argument for POWER_UP * command when the device is started. - * @state: Current power state of the device. - * @supplues: Structure containing handles to all power supplies used + * @power_state: Current power state of the device. + * @supplies: Structure containing handles to all power supplies used * by the device (NULL ones are ignored). * @gpio_reset: GPIO pin connectet to the RSTB pin of the chip. * @pinmux: Chip's configurable pins configuration. * @diversity_mode: Chips role when functioning in diversity mode. + * @is_alive: Chip is initialized and active. * @status_monitor: Polling worker used in polling use case scenarion * (when IRQ is not avalible). * @revision: Chip's running firmware revision number(Used for correct * command set support). + * @rds_fifo_depth: RDS FIFO size: 20 for IRQ mode or 5 for polling mode. */ struct si476x_core { @@ -166,6 +169,7 @@ static inline struct si476x_core *i2c_mfd_cell_to_core(struct device *dev) /** * si476x_core_lock() - lock the core device to get an exclusive access * to it. + * @core: Core device structure */ static inline void si476x_core_lock(struct si476x_core *core) { @@ -175,6 +179,7 @@ static inline void si476x_core_lock(struct si476x_core *core) /** * si476x_core_unlock() - unlock the core device to relinquish an * exclusive access to it. + * @core: Core device structure */ static inline void si476x_core_unlock(struct si476x_core *core) { @@ -246,9 +251,10 @@ static inline int si476x_to_v4l2(struct si476x_core *core, u16 freq) * struct si476x_func_info - structure containing result of the * FUNC_INFO command. * + * @firmware: Firmware version numbers. * @firmware.major: Firmware major number. * @firmware.minor[...]: Firmware minor numbers. - * @patch_id: + * @patch_id: Firmware patch level. * @func: Mode tuner is working in. */ struct si476x_func_info { @@ -318,8 +324,9 @@ enum si476x_smoothmetrics { * @tp: Current channel's TP flag. * @pty: Current channel's PTY code. * @pi: Current channel's PI code. - * @rdsfifoused: Number of blocks remaining in the RDS FIFO (0 if - * empty). + * @rdsfifoused: Number of blocks remaining in the RDS FIFO (0 if empty). + * @ble: + * @rds: RDS data descriptor */ struct si476x_rds_status_report { bool rdstpptyint, rdspiint, rdssyncint, rdsfifoint; -- cgit v1.2.3 From fe0e422cbcf4b7ee35cacc463631092b310a6f6a Mon Sep 17 00:00:00 2001 From: Phil Elwell Date: Sat, 7 Mar 2026 00:41:21 +0100 Subject: mfd: bcm2835-pm: Introduce SoC-specific type identifier Power management blocks across the BCM2835 family share a common base but require variant-specific handling. For instance, the BCM2712 lacks ASB register space, yet it manages the power domain for the V3D graphics block. Add a hardware type identifier to the driver's private data. This allows the driver to distinguish between SoC models and implement custom quirks or features as needed. Signed-off-by: Phil Elwell Co-developed-by: Stanimir Varbanov Signed-off-by: Stanimir Varbanov Signed-off-by: Andrea della Porta Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/c4bb218654e91f312a01b419d3d408e5131f7673.1772839224.git.andrea.porta@suse.com Signed-off-by: Lee Jones --- include/linux/mfd/bcm2835-pm.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/bcm2835-pm.h b/include/linux/mfd/bcm2835-pm.h index f70a810c55f7..d2e17ab1dbfc 100644 --- a/include/linux/mfd/bcm2835-pm.h +++ b/include/linux/mfd/bcm2835-pm.h @@ -5,11 +5,18 @@ #include +enum bcm2835_soc { + BCM2835_PM_SOC_BCM2835, + BCM2835_PM_SOC_BCM2711, + BCM2835_PM_SOC_BCM2712, +}; + struct bcm2835_pm { struct device *dev; void __iomem *base; void __iomem *asb; void __iomem *rpivid_asb; + enum bcm2835_soc soc; }; #endif /* BCM2835_MFD_PM_H */ -- cgit v1.2.3 From 630bbba45cfd3e4f9247cefd3e2cdc03fe40421b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=B6hmwalder?= Date: Tue, 24 Mar 2026 16:29:07 +0100 Subject: drbd: use genl pre_doit/post_doit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every doit handler followed the same pattern: stack-allocate an adm_ctx, call drbd_adm_prepare() at the top, call drbd_adm_finish() at the bottom. This duplicated boilerplate across 25 handlers and made error paths inconsistent, since some handlers could miss sending the reply skb on early-exit paths. The generic netlink framework already provides pre_doit/post_doit hooks for exactly this purpose. An old comment even noted "this would be a good candidate for a pre_doit hook". Use them: - pre_doit heap-allocates adm_ctx, looks up per-command flags from a new drbd_genl_cmd_flags[] table, runs drbd_adm_prepare(), and stores the context in info->user_ptr[0]. - post_doit sends the reply, drops kref references for device/connection/resource, and frees the adm_ctx. - Handlers just receive adm_ctx from info->user_ptr[0], set reply_dh->ret_code, and return. All teardown is in post_doit. - drbd_adm_finish() is removed, superseded by post_doit. Signed-off-by: Christoph Böhmwalder Link: https://patch.msgid.link/20260324152907.2840984-1-christoph.boehmwalder@linbit.com Signed-off-by: Jens Axboe --- include/linux/genl_magic_func.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h index d4da060b7532..6edcac85155e 100644 --- a/include/linux/genl_magic_func.h +++ b/include/linux/genl_magic_func.h @@ -292,6 +292,10 @@ static struct genl_family ZZZ_genl_family __ro_after_init = { #endif .maxattr = ARRAY_SIZE(CONCATENATE(GENL_MAGIC_FAMILY, _tla_nl_policy))-1, .policy = CONCATENATE(GENL_MAGIC_FAMILY, _tla_nl_policy), +#ifdef GENL_MAGIC_FAMILY_PRE_DOIT + .pre_doit = GENL_MAGIC_FAMILY_PRE_DOIT, + .post_doit = GENL_MAGIC_FAMILY_POST_DOIT, +#endif .ops = ZZZ_genl_ops, .n_ops = ARRAY_SIZE(ZZZ_genl_ops), .mcgrps = ZZZ_genl_mcgrps, -- cgit v1.2.3 From 175b45ed343a9c547b5f45293d3ea08d38a7b6f4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 14 Mar 2026 04:12:58 -0700 Subject: srcu: Use raw spinlocks so call_srcu() can be used under preempt_disable() Tree SRCU has used non-raw spinlocks for many years, motivated by a desire to avoid unnecessary real-time latency and the absence of any reason to use raw spinlocks. However, the recent use of SRCU in tracing as the underlying implementation of RCU Tasks Trace means that call_srcu() is invoked from preemption-disabled regions of code, which in turn requires that any locks acquired by call_srcu() or its callees must be raw spinlocks. This commit therefore converts SRCU's spinlocks to raw spinlocks. [boqun: Add Fixes tag] Reported-by: Kumar Kartikeya Dwivedi Fixes: c27cea4416a3 ("rcu: Re-implement RCU Tasks Trace in terms of SRCU-fast") Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng Cc: Sebastian Andrzej Siewior --- include/linux/srcutree.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 958cb7ef41cb..dfb31d11ff05 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -34,7 +34,7 @@ struct srcu_data { /* Values: SRCU_READ_FLAVOR_.* */ /* Update-side state. */ - spinlock_t __private lock ____cacheline_internodealigned_in_smp; + raw_spinlock_t __private lock ____cacheline_internodealigned_in_smp; struct rcu_segcblist srcu_cblist; /* List of callbacks.*/ unsigned long srcu_gp_seq_needed; /* Furthest future GP needed. */ unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ @@ -55,7 +55,7 @@ struct srcu_data { * Node in SRCU combining tree, similar in function to rcu_data. */ struct srcu_node { - spinlock_t __private lock; + raw_spinlock_t __private lock; unsigned long srcu_have_cbs[4]; /* GP seq for children having CBs, but only */ /* if greater than ->srcu_gp_seq. */ unsigned long srcu_data_have_cbs[4]; /* Which srcu_data structs have CBs for given GP? */ @@ -74,7 +74,7 @@ struct srcu_usage { /* First node at each level. */ int srcu_size_state; /* Small-to-big transition state. */ struct mutex srcu_cb_mutex; /* Serialize CB preparation. */ - spinlock_t __private lock; /* Protect counters and size state. */ + raw_spinlock_t __private lock; /* Protect counters and size state. */ struct mutex srcu_gp_mutex; /* Serialize GP work. */ unsigned long srcu_gp_seq; /* Grace-period seq #. */ unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */ @@ -156,7 +156,7 @@ struct srcu_struct { #define __SRCU_USAGE_INIT(name) \ { \ - .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ + .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ .srcu_gp_seq = SRCU_GP_SEQ_INITIAL_VAL, \ .srcu_gp_seq_needed = SRCU_GP_SEQ_INITIAL_VAL_WITH_STATE, \ .srcu_gp_seq_needed_exp = SRCU_GP_SEQ_INITIAL_VAL, \ -- cgit v1.2.3 From 7c405fb3279b39244b260b54f1bd6488689ae235 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Wed, 18 Mar 2026 17:56:21 -0700 Subject: rcu: Use an intermediate irq_work to start process_srcu() Since commit c27cea4416a3 ("rcu: Re-implement RCU Tasks Trace in terms of SRCU-fast") we switched to SRCU in BPF. However as BPF instrument can happen basically everywhere (including where a scheduler lock is held), call_srcu() now needs to avoid acquiring scheduler lock because otherwise it could cause deadlock [1]. Fix this by following what the previous RCU Tasks Trace did: using an irq_work to delay the queuing of the work to start process_srcu(). [boqun: Apply Joel's feedback] [boqun: Apply Andrea's test feedback] Reported-by: Andrea Righi Closes: https://lore.kernel.org/all/abjzvz_tL_siV17s@gpd4/ Fixes: commit c27cea4416a3 ("rcu: Re-implement RCU Tasks Trace in terms of SRCU-fast") Link: https://lore.kernel.org/rcu/3c4c5a29-24ea-492d-aeee-e0d9605b4183@nvidia.com/ [1] Suggested-by: Zqiang Tested-by: Andrea Righi Tested-by: Paul E. McKenney Tested-by: Joel Fernandes Signed-off-by: Boqun Feng --- include/linux/srcutree.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index dfb31d11ff05..be76fa4fc170 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -95,6 +95,7 @@ struct srcu_usage { unsigned long reschedule_jiffies; unsigned long reschedule_count; struct delayed_work work; + struct irq_work irq_work; struct srcu_struct *srcu_ssp; }; -- cgit v1.2.3 From a6fc88b22bc8d12ad52e8412c667ec0f5bf055af Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 23 Mar 2026 20:14:18 -0400 Subject: srcu: Use irq_work to start GP in tiny SRCU Tiny SRCU's srcu_gp_start_if_needed() directly calls schedule_work(), which acquires the workqueue pool->lock. This causes a lockdep splat when call_srcu() is called with a scheduler lock held, due to: call_srcu() [holding pi_lock] srcu_gp_start_if_needed() schedule_work() -> pool->lock workqueue_init() / create_worker() [holding pool->lock] wake_up_process() -> try_to_wake_up() -> pi_lock Also add irq_work_sync() to cleanup_srcu_struct() to prevent a use-after-free if a queued irq_work fires after cleanup begins. Tested with rcutorture SRCU-T and no lockdep warnings. [ Thanks to Boqun for similar fix in patch "rcu: Use an intermediate irq_work to start process_srcu()" ] Signed-off-by: Joel Fernandes Reviewed-by: Paul E. McKenney Signed-off-by: Boqun Feng --- include/linux/srcutiny.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index dec7cbe015aa..905b629e8fa3 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -11,6 +11,7 @@ #ifndef _LINUX_SRCU_TINY_H #define _LINUX_SRCU_TINY_H +#include #include struct srcu_struct { @@ -24,18 +25,21 @@ struct srcu_struct { struct rcu_head *srcu_cb_head; /* Pending callbacks: Head. */ struct rcu_head **srcu_cb_tail; /* Pending callbacks: Tail. */ struct work_struct srcu_work; /* For driving grace periods. */ + struct irq_work srcu_irq_work; /* Defer schedule_work() to irq work. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ }; void srcu_drive_gp(struct work_struct *wp); +void srcu_tiny_irq_work(struct irq_work *irq_work); #define __SRCU_STRUCT_INIT(name, __ignored, ___ignored, ____ignored) \ { \ .srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq), \ .srcu_cb_tail = &name.srcu_cb_head, \ .srcu_work = __WORK_INITIALIZER(name.srcu_work, srcu_drive_gp), \ + .srcu_irq_work = { .func = srcu_tiny_irq_work }, \ __SRCU_DEP_MAP_INIT(name) \ } -- cgit v1.2.3 From 698753e4a51a5a6670d527a3db084c3616253abc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Le=20Goffic?= Date: Tue, 10 Feb 2026 20:05:45 +0100 Subject: bus: firewall: move stm32_firewall header file in include folder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Other driver than RIFSC and ETZPC can implement firewall ops, such as RCC. In order for them to have access to the ops and type of this framework, we need to get the `stm32_firewall.h` file in the include/ folder. Signed-off-by: Clément Le Goffic Signed-off-by: Clément Le Goffic Acked-by: Gatien Chevallier Link: https://lore.kernel.org/r/20260210-b4-firewall-upstream-v8-1-097c1e47af82@gmail.com Signed-off-by: Alexandre Torgue --- include/linux/bus/stm32_firewall.h | 83 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 include/linux/bus/stm32_firewall.h (limited to 'include/linux') diff --git a/include/linux/bus/stm32_firewall.h b/include/linux/bus/stm32_firewall.h new file mode 100644 index 000000000000..e5fac85fe346 --- /dev/null +++ b/include/linux/bus/stm32_firewall.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2023, STMicroelectronics - All Rights Reserved + */ + +#ifndef _STM32_FIREWALL_H +#define _STM32_FIREWALL_H + +#include +#include +#include +#include +#include + +/** + * STM32_PERIPHERAL_FIREWALL: This type of firewall protects peripherals + * STM32_MEMORY_FIREWALL: This type of firewall protects memories/subsets of memory + * zones + * STM32_NOTYPE_FIREWALL: Undefined firewall type + */ + +#define STM32_PERIPHERAL_FIREWALL BIT(1) +#define STM32_MEMORY_FIREWALL BIT(2) +#define STM32_NOTYPE_FIREWALL BIT(3) + +/** + * struct stm32_firewall_controller - Information on firewall controller supplying services + * + * @name: Name of the firewall controller + * @dev: Device reference of the firewall controller + * @mmio: Base address of the firewall controller + * @entry: List entry of the firewall controller list + * @type: Type of firewall + * @max_entries: Number of entries covered by the firewall + * @grant_access: Callback used to grant access for a device access against a + * firewall controller + * @release_access: Callback used to release resources taken by a device when access was + * granted + * @grant_memory_range_access: Callback used to grant access for a device to a given memory region + */ +struct stm32_firewall_controller { + const char *name; + struct device *dev; + void __iomem *mmio; + struct list_head entry; + unsigned int type; + unsigned int max_entries; + + int (*grant_access)(struct stm32_firewall_controller *ctrl, u32 id); + void (*release_access)(struct stm32_firewall_controller *ctrl, u32 id); + int (*grant_memory_range_access)(struct stm32_firewall_controller *ctrl, phys_addr_t paddr, + size_t size); +}; + +/** + * stm32_firewall_controller_register - Register a firewall controller to the STM32 firewall + * framework + * @firewall_controller: Firewall controller to register + * + * Returns 0 in case of success or -ENODEV if no controller was given. + */ +int stm32_firewall_controller_register(struct stm32_firewall_controller *firewall_controller); + +/** + * stm32_firewall_controller_unregister - Unregister a firewall controller from the STM32 + * firewall framework + * @firewall_controller: Firewall controller to unregister + */ +void stm32_firewall_controller_unregister(struct stm32_firewall_controller *firewall_controller); + +/** + * stm32_firewall_populate_bus - Populate device tree nodes that have a correct firewall + * configuration. This is used at boot-time only, as a sanity check + * between device tree and firewalls hardware configurations to + * prevent a kernel crash when a device driver is not granted access + * + * @firewall_controller: Firewall controller which nodes will be populated or not + * + * Returns 0 in case of success or appropriate errno code if error occurred. + */ +int stm32_firewall_populate_bus(struct stm32_firewall_controller *firewall_controller); + +#endif /* _STM32_FIREWALL_H */ -- cgit v1.2.3 From 4ea96cfc728bbc7c1c3051cce9209bf9a6eeb23e Mon Sep 17 00:00:00 2001 From: Gatien Chevallier Date: Thu, 26 Feb 2026 11:30:20 +0100 Subject: bus: stm32_firewall: add stm32_firewall_get_grant_all_access() API Add the stm32_firewall_get_grant_all_access() API to be able to fetch all firewall references in an access-controllers property and try to grant access to all of them. Signed-off-by: Gatien Chevallier Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20260226-debug_bus-v6-5-5d794697798d@foss.st.com Signed-off-by: Alexandre Torgue --- include/linux/bus/stm32_firewall_device.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bus/stm32_firewall_device.h b/include/linux/bus/stm32_firewall_device.h index eaa7a3f54450..6c878f3ca86f 100644 --- a/include/linux/bus/stm32_firewall_device.h +++ b/include/linux/bus/stm32_firewall_device.h @@ -112,6 +112,25 @@ int stm32_firewall_grant_access_by_id(struct stm32_firewall *firewall, u32 subsy */ void stm32_firewall_release_access_by_id(struct stm32_firewall *firewall, u32 subsystem_id); +/** + * stm32_firewall_get_grant_all_access - Allocate and get all the firewall(s) associated to given + * device. Then, try to grant access rights for each element. + * This function is basically a helper function that wraps + * both stm32_firewall_get_firewall() and + * stm32_firewall_grant_access() on all firewall references of + * a device along with the allocation of the array. + * Realease access using stm32_firewall_release_access* APIs + * when done. + * + * @dev: Device performing the checks + * @firewall: Pointer to the array of firewall references to be allocated + * @nb_firewall: Number of allocated elements in @firewall + * + * Returns 0 on success, or appropriate errno code if error occurred. + */ +int stm32_firewall_get_grant_all_access(struct device *dev, struct stm32_firewall **firewall, + int *nb_firewall); + #else /* CONFIG_STM32_FIREWALL */ static inline int stm32_firewall_get_firewall(struct device_node *np, @@ -141,5 +160,12 @@ static inline void stm32_firewall_release_access_by_id(struct stm32_firewall *fi { } +static inline int stm32_firewall_get_grant_all_access(struct device *dev, + struct stm32_firewall **firewall, + int *nb_firewall) +{ + return -ENODEV; +} + #endif /* CONFIG_STM32_FIREWALL */ #endif /* STM32_FIREWALL_DEVICE_H */ -- cgit v1.2.3 From cc5623947f3d86687c39771fcbea641907966d5c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 9 Mar 2026 23:17:26 -0700 Subject: smp: Add missing kernel-doc comments Add missing kernel-doc comments and rearrange the order of others to prevent all kernel-doc warnings. - add function Returns: sections or format existing comments as kernel-doc - add missing function parameter comments - use "/**" for smp_call_function_any() and on_each_cpu_cond_mask() - correct the commented function name for on_each_cpu_cond_mask() - use correct format for function short descriptions - add all kernel-doc comments for smp_call_on_cpu() - remove kernel-doc comments for raw_smp_processor_id() since there is no prototype for it here (other than !SMP) - in smp.h, rearrange some lines so that the kernel-doc comments for smp_processor_id() are immediately before the macro (to prevent kernel-doc warnings) - remove "Returns" from smp_call_function() since it doesn't return a value Signed-off-by: Randy Dunlap Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260310061726.1153764-1-rdunlap@infradead.org --- include/linux/smp.h | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/smp.h b/include/linux/smp.h index 1ebd88026119..6925d15ccaa7 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -73,7 +73,7 @@ static inline void on_each_cpu(smp_call_func_t func, void *info, int wait) } /** - * on_each_cpu_mask(): Run a function on processors specified by + * on_each_cpu_mask() - Run a function on processors specified by * cpumask, which may include the local processor. * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. @@ -239,13 +239,30 @@ static inline int get_boot_cpu_id(void) #endif /* !SMP */ -/** +/* * raw_smp_processor_id() - get the current (unstable) CPU id * - * For then you know what you are doing and need an unstable + * raw_smp_processor_id() is arch-specific/arch-defined and + * may be a macro or a static inline function. + * + * For when you know what you are doing and need an unstable * CPU id. */ +/* + * Allow the architecture to differentiate between a stable and unstable read. + * For example, x86 uses an IRQ-safe asm-volatile read for the unstable but a + * regular asm read for the stable. + */ +#ifndef __smp_processor_id +#define __smp_processor_id() raw_smp_processor_id() +#endif + +#ifdef CONFIG_DEBUG_PREEMPT + extern unsigned int debug_smp_processor_id(void); +# define smp_processor_id() debug_smp_processor_id() + +#else /** * smp_processor_id() - get the current (stable) CPU id * @@ -258,23 +275,10 @@ static inline int get_boot_cpu_id(void) * - preemption is disabled; * - the task is CPU affine. * - * When CONFIG_DEBUG_PREEMPT; we verify these assumption and WARN + * When CONFIG_DEBUG_PREEMPT=y, we verify these assumptions and WARN * when smp_processor_id() is used when the CPU id is not stable. */ -/* - * Allow the architecture to differentiate between a stable and unstable read. - * For example, x86 uses an IRQ-safe asm-volatile read for the unstable but a - * regular asm read for the stable. - */ -#ifndef __smp_processor_id -#define __smp_processor_id() raw_smp_processor_id() -#endif - -#ifdef CONFIG_DEBUG_PREEMPT - extern unsigned int debug_smp_processor_id(void); -# define smp_processor_id() debug_smp_processor_id() -#else # define smp_processor_id() __smp_processor_id() #endif -- cgit v1.2.3 From 897c2beb4a7799154a67942fa85a9678f885f36b Mon Sep 17 00:00:00 2001 From: Viacheslav Dubeyko Date: Mon, 23 Mar 2026 17:39:50 -0700 Subject: hfsplus: fix generic/523 test-case failure The xfstests' test-case generic/523 fails to execute correctly: FSTYP -- hfsplus PLATFORM -- Linux/x86_64 hfsplus-testing-0001 6.15.0-rc4+ #8 SMP PREEMPT_DYNAMIC Thu May 1 16:43:22 PDT 2025 MKFS_OPTIONS -- /dev/loop51 MOUNT_OPTIONS -- /dev/loop51 /mnt/scratch generic/523 - output mismatch (see xfstests-dev/results//generic/523.out.bad) The test-case expects to have '/' in the xattr name. However, HFS+ unicode logic makes conversion of '/' into ':'. In HFS+, a filename can contain '/' because ':' is the separator. The slash is a valid filename character on macOS. But on Linux, / is the path separator and it cannot appear in a filename component. But xattr name can contain any of these symbols. It means that this unicode logic conversion doesn't need to be executed for the case of xattr name. This patch adds distinguishing the regular and xattr names. If we have a regular name, then this conversion of special symbols will be executed. Otherwise, the conversion is skipped for the case of xattr names. sudo ./check -g auto FSTYP -- hfsplus PLATFORM -- Linux/x86_64 hfsplus-testing-0001 7.0.0-rc1+ #24 SMP PREEMPT_DYNAMIC Fri Mar 20 12:36:49 PDT 2026 MKFS_OPTIONS -- /dev/loop51 MOUNT_OPTIONS -- /dev/loop51 /mnt/scratch generic/523 33s ... 25s Closes: https://github.com/hfs-linux-kernel/hfs-linux-kernel/issues/178 cc: John Paul Adrian Glaubitz cc: Yangtao Li cc: linux-fsdevel@vger.kernel.org Signed-off-by: Viacheslav Dubeyko Link: https://lore.kernel.org/r/20260324003949.417048-2-slava@dubeyko.com Signed-off-by: Viacheslav Dubeyko --- include/linux/hfs_common.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hfs_common.h b/include/linux/hfs_common.h index be24c687858e..9e71b9a03b60 100644 --- a/include/linux/hfs_common.h +++ b/include/linux/hfs_common.h @@ -166,6 +166,11 @@ struct hfsplus_attr_unistr { hfsplus_unichr unicode[HFSPLUS_ATTR_MAX_STRLEN]; } __packed; +enum { + HFS_REGULAR_NAME, + HFS_XATTR_NAME, +}; + struct hfs_extent { __be16 block; __be16 count; -- cgit v1.2.3 From 15d6dacdc97dce5ca8a0baf40f5fe8f3dcfef516 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Wed, 18 Mar 2026 14:39:25 +0200 Subject: wifi: ieee80211: Add some missing NAN definitions Add some missing NAN Device capabilities definitions. Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20260318143604.5f6b36d2b208.I7ef571682d5add96eabfcf87f81285893021e851@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211-nan.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211-nan.h b/include/linux/ieee80211-nan.h index d07959bf8a90..ebf28ea651f9 100644 --- a/include/linux/ieee80211-nan.h +++ b/include/linux/ieee80211-nan.h @@ -9,7 +9,7 @@ * Copyright (c) 2006, Michael Wu * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH * Copyright (c) 2016 - 2017 Intel Deutschland GmbH - * Copyright (c) 2018 - 2025 Intel Corporation + * Copyright (c) 2018 - 2026 Intel Corporation */ #ifndef LINUX_IEEE80211_NAN_H @@ -23,6 +23,11 @@ #define NAN_OP_MODE_160MHZ 0x04 #define NAN_OP_MODE_PNDL_SUPPRTED 0x08 +#define NAN_DEV_CAPA_NUM_TX_ANT_POS 0 +#define NAN_DEV_CAPA_NUM_TX_ANT_MASK 0x0f +#define NAN_DEV_CAPA_NUM_RX_ANT_POS 4 +#define NAN_DEV_CAPA_NUM_RX_ANT_MASK 0xf0 + /* NAN Device capabilities, as defined in Wi-Fi Aware (TM) specification * Table 79 */ -- cgit v1.2.3 From 7dd6f81f4ef801b57f6ce7b0eee32aef5c488538 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 25 Mar 2026 21:57:39 +0200 Subject: wifi: mac80211: ignore reserved bits in reconfiguration status The Link ID Info field in the Reconfiguration Status Duple subfield of the Reconfiguration Response frame only uses the lower four bits for the link ID. The upper bits are reserved and should therefore be ignored. Signed-off-by: Benjamin Berg Reviewed-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20260325215404.ab5ccf4bc62e.I9aef8f4fb6f1b06671bb6cf0e2bd4ec6e4c8bda4@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 52db36120314..b5d649db123f 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1194,6 +1194,13 @@ struct ieee80211_mgmt { #define IEEE80211_MIN_ACTION_SIZE(type) offsetofend(struct ieee80211_mgmt, u.action.type) +/* Link Reconfiguration Status Duple field */ +struct ieee80211_ml_reconf_status { + u8 info; + __le16 status; +} __packed; + +#define IEEE80211_ML_RECONF_LINK_ID_MASK 0xf /* Management MIC information element (IEEE 802.11w) for CMAC */ struct ieee80211_mmie { -- cgit v1.2.3 From 677a3d82b6407522d922705209c311e043a17813 Mon Sep 17 00:00:00 2001 From: "Vineeth Pillai (Google)" Date: Mon, 23 Mar 2026 12:00:20 -0400 Subject: tracepoint: Add trace_call__##name() API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add trace_call__##name() as a companion to trace_##name(). When a caller already guards a tracepoint with an explicit enabled check: if (trace_foo_enabled() && cond) trace_foo(args); trace_foo() internally repeats the static_branch_unlikely() test, which the compiler cannot fold since static branches are patched binary instructions. This results in two static-branch evaluations for every guarded call site. trace_call__##name() calls __do_trace_##name() directly, skipping the redundant static-branch re-check. This avoids leaking the internal __do_trace_##name() symbol into call sites while still eliminating the double evaluation: if (trace_foo_enabled() && cond) trace_invoke_foo(args); /* calls __do_trace_foo() directly */ Three locations are updated: - __DECLARE_TRACE: invoke form omits static_branch_unlikely, retains the LOCKDEP RCU-watching assertion. - __DECLARE_TRACE_SYSCALL: same, plus retains might_fault(). - !TRACEPOINTS_ENABLED stub: empty no-op so callers compile cleanly when tracepoints are compiled out. Cc: Dmitry Ilvokhin Cc: Mathieu Desnoyers Cc: Ingo Molnar Cc: Jens Axboe Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Marcelo Ricardo Leitner Cc: Xin Long Cc: Jon Maloy Cc: Aaron Conole Cc: Eelco Chaudron Cc: Ilya Maximets Cc: Jiri Pirko Cc: Oded Gabbay Cc: Koby Elbaz Cc: "Rafael J. Wysocki" Cc: Viresh Kumar Cc: "Gautham R. Shenoy" Cc: Huang Rui Cc: Mario Limonciello Cc: Len Brown Cc: Srinivas Pandruvada Cc: MyungJoo Ham Cc: Kyungmin Park Cc: Chanwoo Choi Cc: Christian König Cc: Sumit Semwal Cc: Eddie James Cc: Andrew Jeffery Cc: Joel Stanley Cc: David Airlie Cc: Simona Vetter Cc: Alex Deucher Cc: Danilo Krummrich Cc: Matthew Brost Cc: Philipp Stanner Cc: Harry Wentland Cc: Leo Li Cc: Jiri Kosina Cc: Benjamin Tissoires Cc: Wolfram Sang Cc: Mark Brown Cc: Michael Hennerich Cc: Nuno Sá Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Cc: Chris Mason Cc: David Sterba Cc: Thomas Gleixner Cc: Andrew Morton Cc: SeongJae Park Cc: Borislav Petkov Cc: Dave Hansen Link: https://patch.msgid.link/20260323160052.17528-2-vineeth@bitbyteword.org Suggested-by: Steven Rostedt Suggested-by: Peter Zijlstra Acked-by: Masami Hiramatsu (Google) Signed-off-by: Vineeth Pillai (Google) Assisted-by: Claude:claude-sonnet-4-6 Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 22ca1c8b54f3..ed969705341f 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -294,6 +294,10 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) WARN_ONCE(!rcu_is_watching(), \ "RCU not watching for tracepoint"); \ } \ + } \ + static inline void trace_call__##name(proto) \ + { \ + __do_trace_##name(args); \ } #define __DECLARE_TRACE_SYSCALL(name, proto, args, data_proto) \ @@ -313,6 +317,11 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) WARN_ONCE(!rcu_is_watching(), \ "RCU not watching for tracepoint"); \ } \ + } \ + static inline void trace_call__##name(proto) \ + { \ + might_fault(); \ + __do_trace_##name(args); \ } /* @@ -398,6 +407,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DECLARE_TRACE_COMMON(name, proto, args, data_proto) \ static inline void trace_##name(proto) \ { } \ + static inline void trace_call__##name(proto) \ + { } \ static inline int \ register_trace_##name(void (*probe)(data_proto), \ void *data) \ -- cgit v1.2.3 From 5f36c9ca33336036a087b270e68e8236c733f448 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 26 Mar 2026 10:54:08 +0100 Subject: fs: Rename generic_file_fsync() to simple_fsync() The implementation is now really basic so rename generic_file_fsync() simple_fsync() and __generic_file_fsync() to simple_fsync_noflush(). Signed-off-by: Jan Kara Link: https://patch.msgid.link/20260326095354.16340-56-jack@suse.cz Signed-off-by: Christian Brauner --- include/linux/fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 8b3dd145b25e..0fc0cb23000e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3295,8 +3295,8 @@ void simple_offset_destroy(struct offset_ctx *octx); extern const struct file_operations simple_offset_dir_operations; -extern int __generic_file_fsync(struct file *, loff_t, loff_t, int); -extern int generic_file_fsync(struct file *, loff_t, loff_t, int); +extern int simple_fsync_noflush(struct file *, loff_t, loff_t, int); +extern int simple_fsync(struct file *, loff_t, loff_t, int); extern int generic_check_addressable(unsigned, u64); -- cgit v1.2.3 From 972b9dd4e4180fbb2352bf2f0e015b7b63f5cca0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 26 Mar 2026 10:54:16 +0100 Subject: fs: Ignore inode metadata buffers in inode_lru_isolate() There are only a few filesystems that use generic tracking of inode metadata buffer heads. As such the logic to reclaim tracked metadata buffer heads in inode_lru_isolate() doesn't bring a benefit big enough to justify intertwining of inode reclaim and metadata buffer head tracking. Just treat tracked metadata buffer heads as any other metadata filesystem has to properly clean up on inode eviction and stop handling it in inode_lru_isolate(). As a result filesystems using generic tracking of metadata buffer heads may now see dirty metadata buffers in their .evict methods more often which can slow down inode reclaim but given these filesystems aren't used in performance demanding setups we should be fine. Signed-off-by: Jan Kara Link: https://patch.msgid.link/20260326095354.16340-64-jack@suse.cz Tested-by: syzbot@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- include/linux/buffer_head.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index b16b88bfbc3e..631bf971efc0 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -517,7 +517,6 @@ void buffer_init(void); bool try_to_free_buffers(struct folio *folio); int inode_has_buffers(struct inode *inode); void invalidate_inode_buffers(struct inode *inode); -int remove_inode_buffers(struct inode *inode); int sync_mapping_buffers(struct address_space *mapping); void invalidate_bh_lrus(void); void invalidate_bh_lrus_cpu(void); @@ -528,9 +527,7 @@ extern int buffer_heads_over_limit; static inline void buffer_init(void) {} static inline bool try_to_free_buffers(struct folio *folio) { return true; } -static inline int inode_has_buffers(struct inode *inode) { return 0; } static inline void invalidate_inode_buffers(struct inode *inode) {} -static inline int remove_inode_buffers(struct inode *inode) { return 1; } static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } static inline void invalidate_bh_lrus(void) {} static inline void invalidate_bh_lrus_cpu(void) {} -- cgit v1.2.3 From 2811f2a82fafff40867b318360cc06143b088a7c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 26 Mar 2026 10:54:18 +0100 Subject: hugetlbfs: Stop using i_private_data Instead of using i_private_data for resv_map pointer add the pointer into hugetlbfs private part of the inode. Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Link: https://patch.msgid.link/20260326095354.16340-66-jack@suse.cz Signed-off-by: Christian Brauner --- include/linux/hugetlb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 65910437be1c..fc5462fe943f 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -518,6 +518,7 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) struct hugetlbfs_inode_info { struct inode vfs_inode; + struct resv_map *resv_map; unsigned int seals; }; -- cgit v1.2.3 From cd336f2e275de14866101d3395c7d2be0a0c1b04 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 26 Mar 2026 10:54:20 +0100 Subject: fs: Remove i_private_data Nobody is using it anymore. Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Link: https://patch.msgid.link/20260326095354.16340-68-jack@suse.cz Tested-by: syzbot@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 0fc0cb23000e..d488459396f4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -465,7 +465,6 @@ extern const struct address_space_operations empty_aops; * @wb_err: The most recent error which has occurred. * @i_private_lock: For use by the owner of the address_space. * @i_private_list: For use by the owner of the address_space. - * @i_private_data: For use by the owner of the address_space. */ struct address_space { struct inode *host; @@ -486,7 +485,6 @@ struct address_space { spinlock_t i_private_lock; struct list_head i_private_list; struct rw_semaphore i_mmap_rwsem; - void * i_private_data; } __attribute__((aligned(sizeof(long)))) __randomize_layout; /* * On most architectures that alignment is already the case; but -- cgit v1.2.3 From 521bea7cec8a79684402d555caab408ed43171d5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 26 Mar 2026 10:54:24 +0100 Subject: fs: Move metadata bhs tracking to a separate struct Instead of tracking metadata bhs for a mapping using i_private_list and i_private_lock create a dedicated mapping_metadata_bhs struct for it. So far this struct is embedded in address_space but that will be switched for per-fs private inode parts later in the series. This also changes the locking from bdev mapping's i_private_lock to a new lock embedded in mapping_metadata_bhs to untangle the i_private_lock locking for maintaining lists of metadata bhs and the locking for looking up / reclaiming bdev's buffer heads. The locking in remove_assoc_map() gets more complex due to this but overall this looks like a reasonable tradeoff. Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Link: https://patch.msgid.link/20260326095354.16340-72-jack@suse.cz Signed-off-by: Christian Brauner --- include/linux/fs.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index d488459396f4..76360b0040e0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -445,6 +445,12 @@ struct address_space_operations { extern const struct address_space_operations empty_aops; +/* Structure for tracking metadata buffer heads associated with the mapping */ +struct mapping_metadata_bhs { + spinlock_t lock; /* Lock protecting bh list */ + struct list_head list; /* The list of bhs (b_assoc_buffers) */ +}; + /** * struct address_space - Contents of a cacheable, mappable object. * @host: Owner, either the inode or the block_device. @@ -484,6 +490,7 @@ struct address_space { errseq_t wb_err; spinlock_t i_private_lock; struct list_head i_private_list; + struct mapping_metadata_bhs i_metadata_bhs; struct rw_semaphore i_mmap_rwsem; } __attribute__((aligned(sizeof(long)))) __randomize_layout; /* -- cgit v1.2.3 From c86f5d25514c2a60fcf5ea0aa11c5d8bd1a313ef Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 26 Mar 2026 10:54:25 +0100 Subject: fs: Make bhs point to mapping_metadata_bhs Make buffer heads point to mapping_metadata_bhs instead of struct address_space. This makes the code more self contained. For the (only) case of IO error handling where we really need to reach struct address_space add a pointer to the mapping from mapping_metadata_bhs. Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Link: https://patch.msgid.link/20260326095354.16340-73-jack@suse.cz Signed-off-by: Christian Brauner --- include/linux/buffer_head.h | 4 ++-- include/linux/fs.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 631bf971efc0..20636599d858 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -73,8 +73,8 @@ struct buffer_head { bh_end_io_t *b_end_io; /* I/O completion */ void *b_private; /* reserved for b_end_io */ struct list_head b_assoc_buffers; /* associated with another mapping */ - struct address_space *b_assoc_map; /* mapping this buffer is - associated with */ + struct mapping_metadata_bhs *b_mmb; /* head of the list of metadata bhs + * this buffer is associated with */ atomic_t b_count; /* users using this buffer_head */ spinlock_t b_uptodate_lock; /* Used by the first bh in a page, to * serialise IO completion of other diff --git a/include/linux/fs.h b/include/linux/fs.h index 76360b0040e0..fa2a812bd718 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -447,6 +447,7 @@ extern const struct address_space_operations empty_aops; /* Structure for tracking metadata buffer heads associated with the mapping */ struct mapping_metadata_bhs { + struct address_space *mapping; /* Mapping bhs are associated with */ spinlock_t lock; /* Lock protecting bh list */ struct list_head list; /* The list of bhs (b_assoc_buffers) */ }; -- cgit v1.2.3 From 025c9af1a20c8353f586c9bfd30705dfe4a277de Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 26 Mar 2026 10:54:26 +0100 Subject: fs: Switch inode_has_buffers() to take mapping_metadata_bhs As part of a move towards placing mapping_metadata_bhs in fs-private inode part, switch inode_has_buffers() to take mapping_metadata_bhs and rename the function to mmb_has_buffers(). Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Link: https://patch.msgid.link/20260326095354.16340-74-jack@suse.cz Signed-off-by: Christian Brauner --- include/linux/buffer_head.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 20636599d858..44094fd476f5 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -515,7 +515,7 @@ bool block_dirty_folio(struct address_space *mapping, struct folio *folio); void buffer_init(void); bool try_to_free_buffers(struct folio *folio); -int inode_has_buffers(struct inode *inode); +bool mmb_has_buffers(struct mapping_metadata_bhs *mmb); void invalidate_inode_buffers(struct inode *inode); int sync_mapping_buffers(struct address_space *mapping); void invalidate_bh_lrus(void); -- cgit v1.2.3 From a8c8122a3dac55d25a1912b8fec9b8cd7366c37a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 26 Mar 2026 10:54:27 +0100 Subject: fs: Provide functions for handling mapping_metadata_bhs directly As part of transition toward moving mapping_metadata_bhs to fs-private part of the inode, provide functions for operations on this list directly instead of going through the inode / mapping. Signed-off-by: Jan Kara Link: https://patch.msgid.link/20260326095354.16340-75-jack@suse.cz Signed-off-by: Christian Brauner --- include/linux/buffer_head.h | 44 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 44094fd476f5..e207dcca7a25 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -205,12 +205,30 @@ struct buffer_head *create_empty_buffers(struct folio *folio, void end_buffer_read_sync(struct buffer_head *bh, int uptodate); void end_buffer_write_sync(struct buffer_head *bh, int uptodate); -/* Things to do with buffers at mapping->private_list */ -void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode); -int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end, - bool datasync); -int generic_buffers_fsync(struct file *file, loff_t start, loff_t end, - bool datasync); +/* Things to do with metadata buffers list */ +void mmb_mark_buffer_dirty(struct buffer_head *bh, struct mapping_metadata_bhs *mmb); +static inline void mark_buffer_dirty_inode(struct buffer_head *bh, + struct inode *inode) +{ + mmb_mark_buffer_dirty(bh, &inode->i_data.i_metadata_bhs); +} +int mmb_fsync_noflush(struct file *file, struct mapping_metadata_bhs *mmb, + loff_t start, loff_t end, bool datasync); +static inline int generic_buffers_fsync_noflush(struct file *file, + loff_t start, loff_t end, + bool datasync) +{ + return mmb_fsync_noflush(file, &file->f_mapping->i_metadata_bhs, + start, end, datasync); +} +int mmb_fsync(struct file *file, struct mapping_metadata_bhs *mmb, + loff_t start, loff_t end, bool datasync); +static inline int generic_buffers_fsync(struct file *file, + loff_t start, loff_t end, bool datasync) +{ + return mmb_fsync(file, &file->f_mapping->i_metadata_bhs, + start, end, datasync); +} void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len); static inline void clean_bdev_bh_alias(struct buffer_head *bh) @@ -515,9 +533,18 @@ bool block_dirty_folio(struct address_space *mapping, struct folio *folio); void buffer_init(void); bool try_to_free_buffers(struct folio *folio); +void mmb_init(struct mapping_metadata_bhs *mmb, struct address_space *mapping); bool mmb_has_buffers(struct mapping_metadata_bhs *mmb); -void invalidate_inode_buffers(struct inode *inode); -int sync_mapping_buffers(struct address_space *mapping); +void mmb_invalidate(struct mapping_metadata_bhs *mmb); +int mmb_sync(struct mapping_metadata_bhs *mmb); +static inline void invalidate_inode_buffers(struct inode *inode) +{ + mmb_invalidate(&inode->i_data.i_metadata_bhs); +} +static inline int sync_mapping_buffers(struct address_space *mapping) +{ + return mmb_sync(&mapping->i_metadata_bhs); +} void invalidate_bh_lrus(void); void invalidate_bh_lrus_cpu(void); bool has_bh_in_lru(int cpu, void *dummy); @@ -527,6 +554,7 @@ extern int buffer_heads_over_limit; static inline void buffer_init(void) {} static inline bool try_to_free_buffers(struct folio *folio) { return true; } +static inline int mmb_sync(struct mapping_metadata_bhs *mmb) { return 0; } static inline void invalidate_inode_buffers(struct inode *inode) {} static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } static inline void invalidate_bh_lrus(void) {} -- cgit v1.2.3 From cb6d109b9ccc374d09812c2387ab826499ee6562 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 26 Mar 2026 10:54:35 +0100 Subject: fs: Drop mapping_metadata_bhs from address space Nobody uses mapping_metadata_bhs in struct address_space anymore. Just remove it and with it all helper functions using it. Signed-off-by: Jan Kara Link: https://patch.msgid.link/20260326095354.16340-83-jack@suse.cz Signed-off-by: Christian Brauner --- include/linux/buffer_head.h | 28 ---------------------------- include/linux/fs.h | 1 - 2 files changed, 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index e207dcca7a25..e4939e33b4b5 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -207,28 +207,10 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate); /* Things to do with metadata buffers list */ void mmb_mark_buffer_dirty(struct buffer_head *bh, struct mapping_metadata_bhs *mmb); -static inline void mark_buffer_dirty_inode(struct buffer_head *bh, - struct inode *inode) -{ - mmb_mark_buffer_dirty(bh, &inode->i_data.i_metadata_bhs); -} int mmb_fsync_noflush(struct file *file, struct mapping_metadata_bhs *mmb, loff_t start, loff_t end, bool datasync); -static inline int generic_buffers_fsync_noflush(struct file *file, - loff_t start, loff_t end, - bool datasync) -{ - return mmb_fsync_noflush(file, &file->f_mapping->i_metadata_bhs, - start, end, datasync); -} int mmb_fsync(struct file *file, struct mapping_metadata_bhs *mmb, loff_t start, loff_t end, bool datasync); -static inline int generic_buffers_fsync(struct file *file, - loff_t start, loff_t end, bool datasync) -{ - return mmb_fsync(file, &file->f_mapping->i_metadata_bhs, - start, end, datasync); -} void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len); static inline void clean_bdev_bh_alias(struct buffer_head *bh) @@ -537,14 +519,6 @@ void mmb_init(struct mapping_metadata_bhs *mmb, struct address_space *mapping); bool mmb_has_buffers(struct mapping_metadata_bhs *mmb); void mmb_invalidate(struct mapping_metadata_bhs *mmb); int mmb_sync(struct mapping_metadata_bhs *mmb); -static inline void invalidate_inode_buffers(struct inode *inode) -{ - mmb_invalidate(&inode->i_data.i_metadata_bhs); -} -static inline int sync_mapping_buffers(struct address_space *mapping) -{ - return mmb_sync(&mapping->i_metadata_bhs); -} void invalidate_bh_lrus(void); void invalidate_bh_lrus_cpu(void); bool has_bh_in_lru(int cpu, void *dummy); @@ -555,8 +529,6 @@ extern int buffer_heads_over_limit; static inline void buffer_init(void) {} static inline bool try_to_free_buffers(struct folio *folio) { return true; } static inline int mmb_sync(struct mapping_metadata_bhs *mmb) { return 0; } -static inline void invalidate_inode_buffers(struct inode *inode) {} -static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } static inline void invalidate_bh_lrus(void) {} static inline void invalidate_bh_lrus_cpu(void) {} static inline bool has_bh_in_lru(int cpu, void *dummy) { return false; } diff --git a/include/linux/fs.h b/include/linux/fs.h index fa2a812bd718..ccfa696253c8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -491,7 +491,6 @@ struct address_space { errseq_t wb_err; spinlock_t i_private_lock; struct list_head i_private_list; - struct mapping_metadata_bhs i_metadata_bhs; struct rw_semaphore i_mmap_rwsem; } __attribute__((aligned(sizeof(long)))) __randomize_layout; /* -- cgit v1.2.3 From f219798ce294e346031022a85670f68eb2dec10e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 26 Mar 2026 10:54:36 +0100 Subject: fs: Drop i_private_list from address_space Nobody is using i_private_list anymore. Remove it. Signed-off-by: Jan Kara Link: https://patch.msgid.link/20260326095354.16340-84-jack@suse.cz Tested-by: syzbot@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index ccfa696253c8..a3bed26d066d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -471,7 +471,6 @@ struct mapping_metadata_bhs { * @flags: Error bits and flags (AS_*). * @wb_err: The most recent error which has occurred. * @i_private_lock: For use by the owner of the address_space. - * @i_private_list: For use by the owner of the address_space. */ struct address_space { struct inode *host; @@ -490,7 +489,6 @@ struct address_space { unsigned long flags; errseq_t wb_err; spinlock_t i_private_lock; - struct list_head i_private_list; struct rw_semaphore i_mmap_rwsem; } __attribute__((aligned(sizeof(long)))) __randomize_layout; /* -- cgit v1.2.3 From 0e764b9d46071668969410ec5429be0e2f38c6d3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 25 Mar 2026 08:20:17 +0000 Subject: netfs: Fix the handling of stream->front by removing it The netfs_io_stream::front member is meant to point to the subrequest currently being collected on a stream, but it isn't actually used this way by direct write (which mostly ignores it). However, there's a tracepoint which looks at it. Further, stream->front is actually redundant with stream->subrequests.next. Fix the potential problem in the direct code by just removing the member and using stream->subrequests.next instead, thereby also simplifying the code. Fixes: a0b4c7a49137 ("netfs: Fix unbuffered/DIO writes to dispatch subrequests in strict sequence") Reported-by: Paulo Alcantara Signed-off-by: David Howells Link: https://patch.msgid.link/4158599.1774426817@warthog.procyon.org.uk Reviewed-by: Paulo Alcantara (Red Hat) cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 72ee7d210a74..ba17ac5bf356 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -140,7 +140,6 @@ struct netfs_io_stream { void (*issue_write)(struct netfs_io_subrequest *subreq); /* Collection tracking */ struct list_head subrequests; /* Contributory I/O operations */ - struct netfs_io_subrequest *front; /* Op being collected */ unsigned long long collected_to; /* Position we've collected results to */ size_t transferred; /* The amount transferred from this stream */ unsigned short error; /* Aggregate error for the stream */ -- cgit v1.2.3 From 935a04923ad293cd89bf6ec23fc4efc9cf1a0142 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Thu, 12 Mar 2026 11:04:36 +0100 Subject: nsproxy: Add FOR_EACH_NS_TYPE() X-macro and CLONE_NS_ALL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce the FOR_EACH_NS_TYPE(X) macro as the single source of truth for the set of (struct type, CLONE_NEW* flag) pairs that define Linux namespace types. Currently, the list of CLONE_NEW* flags is duplicated inline in multiple call sites and would need another copy in each new consumer. This makes it easy to miss one when a new namespace type is added. Derive two things from the X-macro: - CLONE_NS_ALL: Bitmask of all known CLONE_NEW* flags, usable as a validity mask or iteration bound. - ns_common_type(): Rewritten to use the X-macro via a leading-comma _Generic pattern, so the struct-to-flag mapping stays in sync with the flag set automatically. Replace the inline flag enumerations in copy_namespaces(), unshare_nsproxy_namespaces(), check_setns_flags(), and ksys_unshare() with CLONE_NS_ALL. When a new namespace type is added, only FOR_EACH_NS_TYPE needs to be updated; CLONE_NS_ALL, ns_common_type(), and all the call sites pick up the change automatically. Cc: Christian Brauner Cc: Günther Noack Signed-off-by: Mickaël Salaün Link: https://patch.msgid.link/20260312100444.2609563-4-mic@digikod.net Reviewed-by: Christian Brauner Signed-off-by: Christian Brauner --- include/linux/ns/ns_common_types.h | 44 +++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ns/ns_common_types.h b/include/linux/ns/ns_common_types.h index 0014fbc1c626..ea45c54e4435 100644 --- a/include/linux/ns/ns_common_types.h +++ b/include/linux/ns/ns_common_types.h @@ -7,6 +7,7 @@ #include #include #include +#include struct cgroup_namespace; struct dentry; @@ -184,15 +185,38 @@ struct ns_common { struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \ struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL)) -#define ns_common_type(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: CLONE_NEWCGROUP, \ - struct ipc_namespace *: CLONE_NEWIPC, \ - struct mnt_namespace *: CLONE_NEWNS, \ - struct net *: CLONE_NEWNET, \ - struct pid_namespace *: CLONE_NEWPID, \ - struct time_namespace *: CLONE_NEWTIME, \ - struct user_namespace *: CLONE_NEWUSER, \ - struct uts_namespace *: CLONE_NEWUTS) +/* + * FOR_EACH_NS_TYPE - Canonical list of namespace types + * + * Enumerates all (struct type, CLONE_NEW* flag) pairs. This is the + * single source of truth used to derive ns_common_type() and + * CLONE_NS_ALL. When adding a new namespace type, add a single entry + * here; all consumers update automatically. + * + * @X: Callback macro taking (struct_name, clone_flag) as arguments. + */ +#define FOR_EACH_NS_TYPE(X) \ + X(cgroup_namespace, CLONE_NEWCGROUP) \ + X(ipc_namespace, CLONE_NEWIPC) \ + X(mnt_namespace, CLONE_NEWNS) \ + X(net, CLONE_NEWNET) \ + X(pid_namespace, CLONE_NEWPID) \ + X(time_namespace, CLONE_NEWTIME) \ + X(user_namespace, CLONE_NEWUSER) \ + X(uts_namespace, CLONE_NEWUTS) + +/* Bitmask of all known CLONE_NEW* flags. */ +#define _NS_TYPE_FLAG_OR(struct_name, flag) | (flag) +#define CLONE_NS_ALL (0 FOR_EACH_NS_TYPE(_NS_TYPE_FLAG_OR)) + +/* + * ns_common_type - Map a namespace struct pointer to its CLONE_NEW* flag + * + * Uses a leading-comma pattern so the FOR_EACH_NS_TYPE expansion + * produces ", struct foo *: FLAG" entries without a trailing comma. + */ +#define _NS_TYPE_ASSOC(struct_name, flag) , struct struct_name *: (flag) + +#define ns_common_type(__ns) _Generic((__ns)FOR_EACH_NS_TYPE(_NS_TYPE_ASSOC)) #endif /* _LINUX_NS_COMMON_TYPES_H */ -- cgit v1.2.3 From bade44fe546212e142befb69ba22f34944030a99 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Mar 2026 14:01:45 -0400 Subject: tracing: Move snapshot code out of trace.c and into trace_snapshot.c The trace.c file was a dumping ground for most tracing code. Start organizing it better by moving various functions out into their own files. Move all the snapshot code, including the max trace code into its own trace_snapshot.c file. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://patch.msgid.link/20260324140145.36352d6a@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index c242fe49af4c..28b30c6f1031 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -31,7 +31,7 @@ #define ARCH_SUPPORTS_FTRACE_OPS 0 #endif -#ifdef CONFIG_TRACING +#ifdef CONFIG_TRACER_SNAPSHOT extern void ftrace_boot_snapshot(void); #else static inline void ftrace_boot_snapshot(void) { } -- cgit v1.2.3 From 5dc9cf835aba73c882348aa4f99be83b6e45ad9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 26 Mar 2026 12:42:30 +0100 Subject: vdso/timens: Move functions to new file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As a preparation of the untangling of time namespaces and the vDSO, move the glue functions between those subsystems into a new file. While at it, switch the mutex lock and mmap_read_lock() in the vDSO namespace code to guard(). Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260326-vdso-timens-decoupling-v2-1-c82693a7775f@linutronix.de --- include/linux/time_namespace.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index c514d0e5a45c..0421bf1b13d7 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -38,8 +38,6 @@ static inline struct time_namespace *to_time_ns(struct ns_common *ns) return container_of(ns, struct time_namespace, ns); } void __init time_ns_init(void); -extern int vdso_join_timens(struct task_struct *task, - struct time_namespace *ns); extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns); static inline struct time_namespace *get_time_ns(struct time_namespace *ns) @@ -117,12 +115,6 @@ static inline void __init time_ns_init(void) { } -static inline int vdso_join_timens(struct task_struct *task, - struct time_namespace *ns) -{ - return 0; -} - static inline void timens_commit(struct task_struct *tsk, struct time_namespace *ns) { -- cgit v1.2.3 From 1b6c89285d37114d7efe8ab04102a542581cd7da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 26 Mar 2026 12:42:31 +0100 Subject: timens: Remove dependency on the vDSO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, missing time namespace support in the vDSO meant that time namespaces needed to be disabled globally. This was expressed in a hard dependency on the generic vDSO library. This also meant that architectures without any vDSO or only a stub vDSO could not enable time namespaces. Now that all architectures using a real vDSO are using the generic library, that dependency is not necessary anymore. Remove the dependency and let all architectures enable time namespaces. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260326-vdso-timens-decoupling-v2-2-c82693a7775f@linutronix.de --- include/linux/time_namespace.h | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 0421bf1b13d7..c1de21a27c34 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -25,7 +25,9 @@ struct time_namespace { struct ucounts *ucounts; struct ns_common ns; struct timens_offsets offsets; +#ifdef CONFIG_TIME_NS_VDSO struct page *vvar_page; +#endif /* If set prevents changing offsets after any task joined namespace. */ bool frozen_offsets; } __randomize_layout; @@ -38,7 +40,6 @@ static inline struct time_namespace *to_time_ns(struct ns_common *ns) return container_of(ns, struct time_namespace, ns); } void __init time_ns_init(void); -extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns); static inline struct time_namespace *get_time_ns(struct time_namespace *ns) { @@ -51,7 +52,6 @@ struct time_namespace *copy_time_ns(u64 flags, struct time_namespace *old_ns); void free_time_ns(struct time_namespace *ns); void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk); -struct page *find_timens_vvar_page(struct vm_area_struct *vma); static inline void put_time_ns(struct time_namespace *ns) { @@ -115,11 +115,6 @@ static inline void __init time_ns_init(void) { } -static inline void timens_commit(struct task_struct *tsk, - struct time_namespace *ns) -{ -} - static inline struct time_namespace *get_time_ns(struct time_namespace *ns) { return NULL; @@ -146,11 +141,6 @@ static inline void timens_on_fork(struct nsproxy *nsproxy, return; } -static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma) -{ - return NULL; -} - static inline void timens_add_monotonic(struct timespec64 *ts) { } static inline void timens_add_boottime(struct timespec64 *ts) { } @@ -167,4 +157,18 @@ static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim) } #endif +#ifdef CONFIG_TIME_NS_VDSO +extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns); +struct page *find_timens_vvar_page(struct vm_area_struct *vma); +#else /* !CONFIG_TIME_NS_VDSO */ +static inline void timens_commit(struct task_struct *tsk, struct time_namespace *ns) +{ +} + +static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma) +{ + return NULL; +} +#endif /* CONFIG_TIME_NS_VDSO */ + #endif /* _LINUX_TIMENS_H */ -- cgit v1.2.3 From 190a8c48ff623c3d67cb295b4536a660db2012aa Mon Sep 17 00:00:00 2001 From: Hao-Yu Yang Date: Fri, 13 Mar 2026 20:47:56 +0800 Subject: futex: Fix UaF between futex_key_to_node_opt() and vma_replace_policy() During futex_key_to_node_opt() execution, vma->vm_policy is read under speculative mmap lock and RCU. Concurrently, mbind() may call vma_replace_policy() which frees the old mempolicy immediately via kmem_cache_free(). This creates a race where __futex_key_to_node() dereferences a freed mempolicy pointer, causing a use-after-free read of mpol->mode. [ 151.412631] BUG: KASAN: slab-use-after-free in __futex_key_to_node (kernel/futex/core.c:349) [ 151.414046] Read of size 2 at addr ffff888001c49634 by task e/87 [ 151.415969] Call Trace: [ 151.416732] __asan_load2 (mm/kasan/generic.c:271) [ 151.416777] __futex_key_to_node (kernel/futex/core.c:349) [ 151.416822] get_futex_key (kernel/futex/core.c:374 kernel/futex/core.c:386 kernel/futex/core.c:593) Fix by adding rcu to __mpol_put(). Fixes: c042c505210d ("futex: Implement FUTEX2_MPOL") Reported-by: Hao-Yu Yang Suggested-by: Eric Dumazet Signed-off-by: Hao-Yu Yang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Eric Dumazet Acked-by: David Hildenbrand (Arm) Link: https://patch.msgid.link/20260324174418.GB1850007@noisy.programming.kicks-ass.net --- include/linux/mempolicy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 0fe96f3ab3ef..65c732d440d2 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -55,6 +55,7 @@ struct mempolicy { nodemask_t cpuset_mems_allowed; /* relative to these nodes */ nodemask_t user_nodemask; /* nodemask passed by user */ } w; + struct rcu_head rcu; }; /* -- cgit v1.2.3 From abdd23c8849d45c6bdef0ab6facbbc63bddebbe1 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 25 Mar 2026 10:00:17 +0100 Subject: of: reserved_mem: remove fdt node from the structure FDT node is not needed for anything besides the initialization, so it can be simply passed as an argument to the reserved memory region init function. Signed-off-by: Marek Szyprowski Link: https://patch.msgid.link/20260325090023.3175348-2-m.szyprowski@samsung.com Signed-off-by: Rob Herring (Arm) --- include/linux/of_reserved_mem.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_reserved_mem.h b/include/linux/of_reserved_mem.h index f573423359f4..5159938bfe03 100644 --- a/include/linux/of_reserved_mem.h +++ b/include/linux/of_reserved_mem.h @@ -11,7 +11,6 @@ struct resource; struct reserved_mem { const char *name; - unsigned long fdt_node; const struct reserved_mem_ops *ops; phys_addr_t base; phys_addr_t size; @@ -25,7 +24,8 @@ struct reserved_mem_ops { struct device *dev); }; -typedef int (*reservedmem_of_init_fn)(struct reserved_mem *rmem); +typedef int (*reservedmem_of_init_fn)(unsigned long node, + struct reserved_mem *rmem); #ifdef CONFIG_OF_RESERVED_MEM -- cgit v1.2.3 From c640cad6a5382ea08a4e052156cfefc8021c51b7 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 25 Mar 2026 10:00:19 +0100 Subject: of: reserved_mem: switch to ops based OF_DECLARE() Move init function from OF_DECLARE() argument to the given reserved memory region ops structure and then pass that structure to the OF_DECLARE() initializer. This node_init callback is mandatory for the reserved mem driver. Such change makes it possible in the future to add more functions called by the generic code before given memory region is initialized and rmem object is created. Signed-off-by: Marek Szyprowski Link: https://patch.msgid.link/20260325090023.3175348-4-m.szyprowski@samsung.com Signed-off-by: Rob Herring (Arm) --- include/linux/of_reserved_mem.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_reserved_mem.h b/include/linux/of_reserved_mem.h index 5159938bfe03..747a1e73d5dd 100644 --- a/include/linux/of_reserved_mem.h +++ b/include/linux/of_reserved_mem.h @@ -18,19 +18,17 @@ struct reserved_mem { }; struct reserved_mem_ops { + int (*node_init)(unsigned long fdt_node, struct reserved_mem *rmem); int (*device_init)(struct reserved_mem *rmem, struct device *dev); void (*device_release)(struct reserved_mem *rmem, struct device *dev); }; -typedef int (*reservedmem_of_init_fn)(unsigned long node, - struct reserved_mem *rmem); - #ifdef CONFIG_OF_RESERVED_MEM -#define RESERVEDMEM_OF_DECLARE(name, compat, init) \ - _OF_DECLARE(reservedmem, name, compat, init, reservedmem_of_init_fn) +#define RESERVEDMEM_OF_DECLARE(name, compat, ops) \ + _OF_DECLARE(reservedmem, name, compat, ops, struct reserved_mem_ops *) int of_reserved_mem_device_init_by_idx(struct device *dev, struct device_node *np, int idx); @@ -48,8 +46,9 @@ int of_reserved_mem_region_count(const struct device_node *np); #else -#define RESERVEDMEM_OF_DECLARE(name, compat, init) \ - _OF_DECLARE_STUB(reservedmem, name, compat, init, reservedmem_of_init_fn) +#define RESERVEDMEM_OF_DECLARE(name, compat, ops) \ + _OF_DECLARE_STUB(reservedmem, name, compat, ops, \ + struct reserved_mem_ops *) static inline int of_reserved_mem_device_init_by_idx(struct device *dev, struct device_node *np, int idx) -- cgit v1.2.3 From 7fd3981202b9aef8862aa06ca0d75496c0f9681f Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 25 Mar 2026 10:00:20 +0100 Subject: of: reserved_mem: replace CMA quirks by generic methods Add optional reserved memory callbacks to perform region verification and early fixup, then move all CMA related code in of_reserved_mem.c to them. Signed-off-by: Marek Szyprowski Link: https://patch.msgid.link/20260325090023.3175348-5-m.szyprowski@samsung.com Signed-off-by: Rob Herring (Arm) --- include/linux/cma.h | 10 ---------- include/linux/dma-map-ops.h | 3 --- include/linux/of_reserved_mem.h | 3 +++ 3 files changed, 3 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cma.h b/include/linux/cma.h index d0793eaaadaa..8555d38a97b1 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -61,14 +61,4 @@ extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data) extern bool cma_intersects(struct cma *cma, unsigned long start, unsigned long end); extern void cma_reserve_pages_on_error(struct cma *cma); - -#ifdef CONFIG_DMA_CMA -extern bool cma_skip_dt_default_reserved_mem(void); -#else -static inline bool cma_skip_dt_default_reserved_mem(void) -{ - return false; -} -#endif - #endif diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 60b63756df82..55ecd2934225 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -147,9 +147,6 @@ static inline void dma_free_contiguous(struct device *dev, struct page *page, { __free_pages(page, get_order(size)); } -static inline void dma_contiguous_early_fixup(phys_addr_t base, unsigned long size) -{ -} #endif /* CONFIG_DMA_CMA*/ #ifdef CONFIG_DMA_DECLARE_COHERENT diff --git a/include/linux/of_reserved_mem.h b/include/linux/of_reserved_mem.h index 747a1e73d5dd..e8b20b29fa68 100644 --- a/include/linux/of_reserved_mem.h +++ b/include/linux/of_reserved_mem.h @@ -18,6 +18,9 @@ struct reserved_mem { }; struct reserved_mem_ops { + int (*node_validate)(unsigned long fdt_node, phys_addr_t *align); + int (*node_fixup)(unsigned long fdt_node, phys_addr_t base, + phys_addr_t size); int (*node_init)(unsigned long fdt_node, struct reserved_mem *rmem); int (*device_init)(struct reserved_mem *rmem, struct device *dev); -- cgit v1.2.3 From edfaa81d5da5fbfe3c73fece3ca0417a04cc4ba2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 24 Mar 2026 18:56:24 +0200 Subject: resource: Add __resource_contains_unbound() for internal contains checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit __find_resource_space() currently uses resource_contains() but for tentative resources that are not yet crafted into the resource tree. As resource_contains() checks that IORESOURCE_UNSET is not set for either of the resources, the caller has to hack around this problem by clearing the IORESOURCE_UNSET flag (essentially lying to resource_contains()). Instead of the hack, introduce __resource_contains_unbound() for cases like this. Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Tested-by: Xifer Link: https://patch.msgid.link/20260324165633.4583-2-ilpo.jarvinen@linux.intel.com --- include/linux/ioport.h | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 5533a5debf3f..19d5e04564d9 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -304,14 +304,28 @@ static inline unsigned long resource_ext_type(const struct resource *res) { return res->flags & IORESOURCE_EXT_TYPE_BITS; } -/* True iff r1 completely contains r2 */ -static inline bool resource_contains(const struct resource *r1, const struct resource *r2) + +/* + * For checking if @r1 completely contains @r2 for resources that have real + * addresses but are not yet crafted into the resource tree. Normally + * resource_contains() should be used instead of this function as it checks + * also IORESOURCE_UNSET flag. + */ +static inline bool __resource_contains_unbound(const struct resource *r1, + const struct resource *r2) { if (resource_type(r1) != resource_type(r2)) return false; + + return r1->start <= r2->start && r1->end >= r2->end; +} +/* True iff r1 completely contains r2 */ +static inline bool resource_contains(const struct resource *r1, const struct resource *r2) +{ if (r1->flags & IORESOURCE_UNSET || r2->flags & IORESOURCE_UNSET) return false; - return r1->start <= r2->start && r1->end >= r2->end; + + return __resource_contains_unbound(r1, r2); } /* True if any part of r1 overlaps r2 */ -- cgit v1.2.3 From f72e77c33e4b5657af35125e75bab249256030f3 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 17 Mar 2026 09:01:20 -0700 Subject: device property: Make modifications of fwnode "flags" thread safe In various places in the kernel, we modify the fwnode "flags" member by doing either: fwnode->flags |= SOME_FLAG; fwnode->flags &= ~SOME_FLAG; This type of modification is not thread-safe. If two threads are both mucking with the flags at the same time then one can clobber the other. While flags are often modified while under the "fwnode_link_lock", this is not universally true. Create some accessor functions for setting, clearing, and testing the FWNODE flags and move all users to these accessor functions. New accessor functions use set_bit() and clear_bit(), which are thread-safe. Cc: stable@vger.kernel.org Fixes: c2c724c868c4 ("driver core: Add fw_devlink_parse_fwtree()") Reviewed-by: Andy Shevchenko Acked-by: Mark Brown Reviewed-by: Wolfram Sang Signed-off-by: Douglas Anderson Reviewed-by: Rafael J. Wysocki (Intel) Reviewed-by: Saravana Kannan Link: https://patch.msgid.link/20260317090112.v2.1.I0a4d03104ecd5103df3d76f66c8d21b1d15a2e38@changeid [ Fix fwnode_clear_flag() argument alignment, restore dropped blank line in fwnode_dev_initialized(), and remove unnecessary parentheses around fwnode_test_flag() calls. - Danilo ] Signed-off-by: Danilo Krummrich --- include/linux/fwnode.h | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 097be89487bf..80b38fbf2121 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -15,6 +15,7 @@ #define _LINUX_FWNODE_H_ #include +#include #include #include #include @@ -42,12 +43,12 @@ struct device; * suppliers. Only enforce ordering with suppliers that have * drivers. */ -#define FWNODE_FLAG_LINKS_ADDED BIT(0) -#define FWNODE_FLAG_NOT_DEVICE BIT(1) -#define FWNODE_FLAG_INITIALIZED BIT(2) -#define FWNODE_FLAG_NEEDS_CHILD_BOUND_ON_ADD BIT(3) -#define FWNODE_FLAG_BEST_EFFORT BIT(4) -#define FWNODE_FLAG_VISITED BIT(5) +#define FWNODE_FLAG_LINKS_ADDED 0 +#define FWNODE_FLAG_NOT_DEVICE 1 +#define FWNODE_FLAG_INITIALIZED 2 +#define FWNODE_FLAG_NEEDS_CHILD_BOUND_ON_ADD 3 +#define FWNODE_FLAG_BEST_EFFORT 4 +#define FWNODE_FLAG_VISITED 5 struct fwnode_handle { struct fwnode_handle *secondary; @@ -57,7 +58,7 @@ struct fwnode_handle { struct device *dev; struct list_head suppliers; struct list_head consumers; - u8 flags; + unsigned long flags; }; /* @@ -212,16 +213,37 @@ static inline void fwnode_init(struct fwnode_handle *fwnode, INIT_LIST_HEAD(&fwnode->suppliers); } +static inline void fwnode_set_flag(struct fwnode_handle *fwnode, + unsigned int bit) +{ + set_bit(bit, &fwnode->flags); +} + +static inline void fwnode_clear_flag(struct fwnode_handle *fwnode, + unsigned int bit) +{ + clear_bit(bit, &fwnode->flags); +} + +static inline void fwnode_assign_flag(struct fwnode_handle *fwnode, + unsigned int bit, bool value) +{ + assign_bit(bit, &fwnode->flags, value); +} + +static inline bool fwnode_test_flag(struct fwnode_handle *fwnode, + unsigned int bit) +{ + return test_bit(bit, &fwnode->flags); +} + static inline void fwnode_dev_initialized(struct fwnode_handle *fwnode, bool initialized) { if (IS_ERR_OR_NULL(fwnode)) return; - if (initialized) - fwnode->flags |= FWNODE_FLAG_INITIALIZED; - else - fwnode->flags &= ~FWNODE_FLAG_INITIALIZED; + fwnode_assign_flag(fwnode, FWNODE_FLAG_INITIALIZED, initialized); } int fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup, -- cgit v1.2.3 From 57a04a13aac1f247d171c3f3aef93efc69e6979e Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Tue, 24 Mar 2026 22:08:56 +0800 Subject: netdevsim: fix build if SKB_EXTENSIONS=n __skb_ext_put() is not declared if SKB_EXTENSIONS is not enabled, which causes a build error: drivers/net/netdevsim/netdev.c: In function 'nsim_forward_skb': drivers/net/netdevsim/netdev.c:114:25: error: implicit declaration of function '__skb_ext_put'; did you mean 'skb_ext_put'? [-Werror=implicit-function-declaration] 114 | __skb_ext_put(psp_ext); | ^~~~~~~~~~~~~ | skb_ext_put cc1: some warnings being treated as errors Add a stub to fix the build. Fixes: 7d9351435ebb ("netdevsim: drop PSP ext ref on forward failure") Signed-off-by: Qingfang Deng Link: https://patch.msgid.link/20260324140857.783-1-dqfext@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index daa4e4944ce3..2f278ce376b7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -5097,6 +5097,7 @@ static inline bool skb_has_extensions(struct sk_buff *skb) return unlikely(skb->active_extensions); } #else +static inline void __skb_ext_put(struct skb_ext *ext) {} static inline void skb_ext_put(struct sk_buff *skb) {} static inline void skb_ext_reset(struct sk_buff *skb) {} static inline void skb_ext_del(struct sk_buff *skb, int unused) {} -- cgit v1.2.3 From a800398e746f8c9010c626a71d92a05b708f7622 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 24 Mar 2026 10:05:40 +0000 Subject: net: stmmac: remove axi_kbbe, axi_mb and axi_rb members axi_kbbe, axi_mb and axi_rb are all written, but nothing ever reads their values. Remove the code that sets these and the struct members. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1w4ydo-0000000Dlpb-34jd@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 5b2bece81448..eaaee329ef9d 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -133,10 +133,7 @@ struct stmmac_axi { u32 axi_blen_regval; bool axi_lpi_en; bool axi_xit_frm; - bool axi_kbbe; bool axi_fb; - bool axi_mb; - bool axi_rb; }; struct stmmac_rxq_cfg { -- cgit v1.2.3 From 90c5def10bea574b101b7a520c015ca81742183f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 2 Mar 2026 18:22:52 -0400 Subject: iommu: Do not call drivers for empty gathers An empty gather is coded with start=U64_MAX, end=0 and several drivers go on to convert that to a size with: end - start + 1 Which gives 2 for an empty gather. This then causes Weird Stuff to happen (for example an UBSAN splat in VT-d) that is hopefully harmless, but maybe not. Prevent drivers from being called right in iommu_iotlb_sync(). Auditing shows that AMD, Intel, Mediatek and RSIC-V drivers all do things on these empty gathers. Further, there are several callers that can trigger empty gathers, especially in unusual conditions. For example iommu_map_nosync() will call a 0 size unmap on some error paths. Also in VFIO, iommupt and other places. Cc: stable@vger.kernel.org Reported-by: Janusz Krzysztofik Closes: https://lore.kernel.org/r/11145826.aFP6jjVeTY@jkrzyszt-mobl2.ger.corp.intel.com Signed-off-by: Jason Gunthorpe Reviewed-by: Lu Baolu Reviewed-by: Samiullah Khawaja Reviewed-by: Robin Murphy Reviewed-by: Vasant Hegde Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 54b8b48c762e..555597b54083 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -980,7 +980,8 @@ static inline void iommu_flush_iotlb_all(struct iommu_domain *domain) static inline void iommu_iotlb_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *iotlb_gather) { - if (domain->ops->iotlb_sync) + if (domain->ops->iotlb_sync && + likely(iotlb_gather->start < iotlb_gather->end)) domain->ops->iotlb_sync(domain, iotlb_gather); iommu_iotlb_gather_init(iotlb_gather); -- cgit v1.2.3 From d134feeb5df33fbf77f482f52a366a44642dba09 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Thu, 19 Mar 2026 10:29:32 +0100 Subject: printk: add print_hex_dump_devel() Add print_hex_dump_devel() as the hex dump equivalent of pr_devel(), which emits output only when DEBUG is enabled, but keeps call sites compiled otherwise. Suggested-by: Herbert Xu Signed-off-by: Thorsten Blum Reviewed-by: John Ogness Signed-off-by: Herbert Xu --- include/linux/printk.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index 63d516c873b4..54e3c621fec3 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -801,6 +801,19 @@ static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type, } #endif +#if defined(DEBUG) +#define print_hex_dump_devel(prefix_str, prefix_type, rowsize, \ + groupsize, buf, len, ascii) \ + print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, rowsize, \ + groupsize, buf, len, ascii) +#else +static inline void print_hex_dump_devel(const char *prefix_str, int prefix_type, + int rowsize, int groupsize, + const void *buf, size_t len, bool ascii) +{ +} +#endif + /** * print_hex_dump_bytes - shorthand form of print_hex_dump() with default params * @prefix_str: string to prefix each line with; -- cgit v1.2.3 From 7be18a1fa00eab5283b35c13e26c6b76fcaab9ce Mon Sep 17 00:00:00 2001 From: Pavan Chebbi Date: Sat, 14 Mar 2026 08:16:01 -0700 Subject: fwctl/bnxt_en: Move common definitions to include/linux/bnxt/ We have common definitions that are now going to be used by more than one component outside of bnxt (bnxt_re and fwctl) Move bnxt_ulp.h to include/linux/bnxt/ as ulp.h. Link: https://patch.msgid.link/r/20260314151605.932749-2-pavan.chebbi@broadcom.com Reviewed-by: Andy Gospodarek Reviewed-by: Leon Romanovsky Cc: linux-rdma@vger.kernel.org Signed-off-by: Pavan Chebbi Signed-off-by: Jason Gunthorpe --- include/linux/bnxt/ulp.h | 128 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 include/linux/bnxt/ulp.h (limited to 'include/linux') diff --git a/include/linux/bnxt/ulp.h b/include/linux/bnxt/ulp.h new file mode 100644 index 000000000000..3c5b8a53f715 --- /dev/null +++ b/include/linux/bnxt/ulp.h @@ -0,0 +1,128 @@ +/* Broadcom NetXtreme-C/E network driver. + * + * Copyright (c) 2016-2018 Broadcom Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + */ + +#ifndef BNXT_ULP_H +#define BNXT_ULP_H + +#define BNXT_MIN_ROCE_CP_RINGS 2 +#define BNXT_MIN_ROCE_STAT_CTXS 1 + +#define BNXT_MAX_ROCE_MSIX_VF 2 +#define BNXT_MAX_ROCE_MSIX_NPAR_PF 5 +#define BNXT_MAX_ROCE_MSIX 64 + +struct hwrm_async_event_cmpl; +struct bnxt; + +struct bnxt_msix_entry { + u32 vector; + u32 ring_idx; + u32 db_offset; +}; + +struct bnxt_ulp_ops { + /* async_notifier() cannot sleep (in BH context) */ + void (*ulp_async_notifier)(void *, struct hwrm_async_event_cmpl *); + void (*ulp_irq_stop)(void *, bool); + void (*ulp_irq_restart)(void *, struct bnxt_msix_entry *); +}; + +struct bnxt_fw_msg { + void *msg; + int msg_len; + void *resp; + int resp_max_len; + int timeout; +}; + +struct bnxt_ulp { + void *handle; + struct bnxt_ulp_ops __rcu *ulp_ops; + unsigned long *async_events_bmap; + u16 max_async_event_id; + u16 msix_requested; +}; + +struct bnxt_en_dev { + struct net_device *net; + struct pci_dev *pdev; + struct bnxt_msix_entry msix_entries[BNXT_MAX_ROCE_MSIX]; + u32 flags; + #define BNXT_EN_FLAG_ROCEV1_CAP 0x1 + #define BNXT_EN_FLAG_ROCEV2_CAP 0x2 + #define BNXT_EN_FLAG_ROCE_CAP (BNXT_EN_FLAG_ROCEV1_CAP | \ + BNXT_EN_FLAG_ROCEV2_CAP) + #define BNXT_EN_FLAG_ULP_STOPPED 0x8 + #define BNXT_EN_FLAG_VF 0x10 +#define BNXT_EN_VF(edev) ((edev)->flags & BNXT_EN_FLAG_VF) + #define BNXT_EN_FLAG_ROCE_VF_RES_MGMT 0x20 + #define BNXT_EN_FLAG_SW_RES_LMT 0x40 +#define BNXT_EN_SW_RES_LMT(edev) ((edev)->flags & BNXT_EN_FLAG_SW_RES_LMT) + + struct bnxt_ulp *ulp_tbl; + int l2_db_size; /* Doorbell BAR size in + * bytes mapped by L2 + * driver. + */ + int l2_db_size_nc; /* Doorbell BAR size in + * bytes mapped as non- + * cacheable. + */ + int l2_db_offset; /* Doorbell offset in + * bytes within + * l2_db_size_nc. + */ + u16 chip_num; + u16 hw_ring_stats_size; + u16 pf_port_id; + unsigned long en_state; /* Could be checked in + * RoCE driver suspend + * mode only. Will be + * updated in resume. + */ + void __iomem *bar0; + + u16 ulp_num_msix_vec; + u16 ulp_num_ctxs; + + /* serialize ulp operations */ + struct mutex en_dev_lock; +}; + +static inline bool bnxt_ulp_registered(struct bnxt_en_dev *edev) +{ + if (edev && rcu_access_pointer(edev->ulp_tbl->ulp_ops)) + return true; + return false; +} + +int bnxt_get_ulp_msix_num(struct bnxt *bp); +int bnxt_get_ulp_msix_num_in_use(struct bnxt *bp); +void bnxt_set_ulp_msix_num(struct bnxt *bp, int num); +int bnxt_get_ulp_stat_ctxs(struct bnxt *bp); +void bnxt_set_ulp_stat_ctxs(struct bnxt *bp, int num_ctxs); +int bnxt_get_ulp_stat_ctxs_in_use(struct bnxt *bp); +void bnxt_set_dflt_ulp_stat_ctxs(struct bnxt *bp); +void bnxt_ulp_stop(struct bnxt *bp); +void bnxt_ulp_start(struct bnxt *bp, int err); +void bnxt_ulp_sriov_cfg(struct bnxt *bp, int num_vfs); +void bnxt_ulp_irq_stop(struct bnxt *bp); +void bnxt_ulp_irq_restart(struct bnxt *bp, int err); +void bnxt_ulp_async_events(struct bnxt *bp, struct hwrm_async_event_cmpl *cmpl); +void bnxt_rdma_aux_device_uninit(struct bnxt *bp); +void bnxt_rdma_aux_device_del(struct bnxt *bp); +void bnxt_rdma_aux_device_add(struct bnxt *bp); +void bnxt_rdma_aux_device_init(struct bnxt *bp); +int bnxt_register_dev(struct bnxt_en_dev *edev, struct bnxt_ulp_ops *ulp_ops, + void *handle); +void bnxt_unregister_dev(struct bnxt_en_dev *edev); +int bnxt_send_msg(struct bnxt_en_dev *edev, struct bnxt_fw_msg *fw_msg); +void bnxt_register_async_events(struct bnxt_en_dev *edev, + unsigned long *events_bmap, u16 max_id); +#endif -- cgit v1.2.3 From 2c7c85c8c7881d57c5fa1114f4b0dbd7fc53a36f Mon Sep 17 00:00:00 2001 From: Pavan Chebbi Date: Sat, 14 Mar 2026 08:16:02 -0700 Subject: fwctl/bnxt_en: Refactor aux bus functions to be more generic Up until now there was only one auxiliary device that bnxt created and that was for RoCE driver. bnxt fwctl is also going to use an aux bus device that bnxt should create. This requires some nomenclature changes and refactoring of the existing bnxt aux dev functions. Convert 'aux_priv' and 'edev' members of struct bnxt into arrays where each element contains supported auxbus device's data. Move struct bnxt_aux_priv from bnxt.h to ulp.h because that is where it belongs. Make aux bus init/uninit/add/del functions more generic which will loop through all the aux device types. Make bnxt_ulp_start/stop functions (the only other common functions applicable to any aux device) loop through the aux devices to update their config and states. Make callers of bnxt_ulp_start() call it only when there are no errors. Also, as an improvement in code, bnxt_register_dev() can skip unnecessary dereferencing of edev from bp, instead use the edev pointer from the function parameter. Future patches will reuse these functions to add an aux bus device for fwctl. Link: https://patch.msgid.link/r/20260314151605.932749-3-pavan.chebbi@broadcom.com Reviewed-by: Andy Gospodarek Reviewed-by: Leon Romanovsky Signed-off-by: Pavan Chebbi Signed-off-by: Jason Gunthorpe --- include/linux/bnxt/ulp.h | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bnxt/ulp.h b/include/linux/bnxt/ulp.h index 3c5b8a53f715..1a4643c46f86 100644 --- a/include/linux/bnxt/ulp.h +++ b/include/linux/bnxt/ulp.h @@ -10,6 +10,8 @@ #ifndef BNXT_ULP_H #define BNXT_ULP_H +#include + #define BNXT_MIN_ROCE_CP_RINGS 2 #define BNXT_MIN_ROCE_STAT_CTXS 1 @@ -20,6 +22,17 @@ struct hwrm_async_event_cmpl; struct bnxt; +enum bnxt_auxdev_type { + BNXT_AUXDEV_RDMA = 0, + __BNXT_AUXDEV_MAX +}; + +struct bnxt_aux_priv { + struct auxiliary_device aux_dev; + struct bnxt_en_dev *edev; + int id; +}; + struct bnxt_msix_entry { u32 vector; u32 ring_idx; @@ -110,19 +123,21 @@ void bnxt_set_ulp_stat_ctxs(struct bnxt *bp, int num_ctxs); int bnxt_get_ulp_stat_ctxs_in_use(struct bnxt *bp); void bnxt_set_dflt_ulp_stat_ctxs(struct bnxt *bp); void bnxt_ulp_stop(struct bnxt *bp); -void bnxt_ulp_start(struct bnxt *bp, int err); +void bnxt_ulp_start(struct bnxt *bp); void bnxt_ulp_sriov_cfg(struct bnxt *bp, int num_vfs); void bnxt_ulp_irq_stop(struct bnxt *bp); void bnxt_ulp_irq_restart(struct bnxt *bp, int err); void bnxt_ulp_async_events(struct bnxt *bp, struct hwrm_async_event_cmpl *cmpl); -void bnxt_rdma_aux_device_uninit(struct bnxt *bp); -void bnxt_rdma_aux_device_del(struct bnxt *bp); -void bnxt_rdma_aux_device_add(struct bnxt *bp); -void bnxt_rdma_aux_device_init(struct bnxt *bp); +void bnxt_aux_devices_uninit(struct bnxt *bp); +void bnxt_aux_devices_del(struct bnxt *bp); +void bnxt_aux_devices_add(struct bnxt *bp); +void bnxt_aux_devices_init(struct bnxt *bp); int bnxt_register_dev(struct bnxt_en_dev *edev, struct bnxt_ulp_ops *ulp_ops, void *handle); void bnxt_unregister_dev(struct bnxt_en_dev *edev); int bnxt_send_msg(struct bnxt_en_dev *edev, struct bnxt_fw_msg *fw_msg); void bnxt_register_async_events(struct bnxt_en_dev *edev, unsigned long *events_bmap, u16 max_id); +int bnxt_auxdev_id_alloc(struct bnxt *bp); +void bnxt_auxdev_id_free(struct bnxt *bp, int id); #endif -- cgit v1.2.3 From 5102836da8397deb2a3d8b9e574238781ea47390 Mon Sep 17 00:00:00 2001 From: Pavan Chebbi Date: Sat, 14 Mar 2026 08:16:03 -0700 Subject: fwctl/bnxt_en: Create an aux device for fwctl Create an additional auxiliary device to support fwctl. The next patch will create bnxt_fwctl and bind to this device. Link: https://patch.msgid.link/r/20260314151605.932749-4-pavan.chebbi@broadcom.com Reviewed-by: Andy Gospodarek Reviewed-by: Leon Romanovsky Signed-off-by: Pavan Chebbi Signed-off-by: Jason Gunthorpe --- include/linux/bnxt/ulp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bnxt/ulp.h b/include/linux/bnxt/ulp.h index 1a4643c46f86..0851ad3394b0 100644 --- a/include/linux/bnxt/ulp.h +++ b/include/linux/bnxt/ulp.h @@ -24,6 +24,7 @@ struct bnxt; enum bnxt_auxdev_type { BNXT_AUXDEV_RDMA = 0, + BNXT_AUXDEV_FWCTL, __BNXT_AUXDEV_MAX }; -- cgit v1.2.3 From 9100a28c8bb4270744942cf834efcd80f1acda7d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Mar 2026 23:59:39 -0800 Subject: nvme-auth: add NVME_AUTH_MAX_DIGEST_SIZE constant Define a NVME_AUTH_MAX_DIGEST_SIZE constant and use it in the appropriate places. Acked-by: Ard Biesheuvel Acked-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Eric Biggers Signed-off-by: Keith Busch --- include/linux/nvme.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 655d194f8e72..edfebbce6745 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1837,6 +1837,11 @@ enum { NVME_AUTH_HASH_INVALID = 0xff, }; +/* Maximum digest size for any NVME_AUTH_HASH_* value */ +enum { + NVME_AUTH_MAX_DIGEST_SIZE = 64, +}; + /* Defined Diffie-Hellman group identifiers for DH-HMAC-CHAP authentication */ enum { NVME_AUTH_DHGROUP_NULL = 0x00, -- cgit v1.2.3 From bf0e2567a639c455110f9be5db8c92032175e222 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Mar 2026 23:59:41 -0800 Subject: nvme-auth: use proper argument types For input parameters, use pointer to const. This makes it easier to understand which parameters are inputs and which are outputs. In addition, consistently use char for strings and u8 for binary. This makes it easier to understand what is a string and what is binary data. Acked-by: Ard Biesheuvel Acked-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Eric Biggers Signed-off-by: Keith Busch --- include/linux/nvme-auth.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h index 60e069a6757f..a4b248c24ccf 100644 --- a/include/linux/nvme-auth.h +++ b/include/linux/nvme-auth.h @@ -25,27 +25,27 @@ size_t nvme_auth_hmac_hash_len(u8 hmac_id); u8 nvme_auth_hmac_id(const char *hmac_name); u32 nvme_auth_key_struct_size(u32 key_len); -struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret, - u8 key_hash); +struct nvme_dhchap_key *nvme_auth_extract_key(const char *secret, u8 key_hash); void nvme_auth_free_key(struct nvme_dhchap_key *key); struct nvme_dhchap_key *nvme_auth_alloc_key(u32 len, u8 hash); struct nvme_dhchap_key *nvme_auth_transform_key( - struct nvme_dhchap_key *key, char *nqn); -int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key); -int nvme_auth_augmented_challenge(u8 hmac_id, u8 *skey, size_t skey_len, - u8 *challenge, u8 *aug, size_t hlen); + const struct nvme_dhchap_key *key, const char *nqn); +int nvme_auth_generate_key(const char *secret, struct nvme_dhchap_key **ret_key); +int nvme_auth_augmented_challenge(u8 hmac_id, const u8 *skey, size_t skey_len, + const u8 *challenge, u8 *aug, size_t hlen); int nvme_auth_gen_privkey(struct crypto_kpp *dh_tfm, u8 dh_gid); int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm, u8 *host_key, size_t host_key_len); int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm, - u8 *ctrl_key, size_t ctrl_key_len, + const u8 *ctrl_key, size_t ctrl_key_len, u8 *sess_key, size_t sess_key_len); -int nvme_auth_generate_psk(u8 hmac_id, u8 *skey, size_t skey_len, - u8 *c1, u8 *c2, size_t hash_len, +int nvme_auth_generate_psk(u8 hmac_id, const u8 *skey, size_t skey_len, + const u8 *c1, const u8 *c2, size_t hash_len, u8 **ret_psk, size_t *ret_len); -int nvme_auth_generate_digest(u8 hmac_id, u8 *psk, size_t psk_len, - char *subsysnqn, char *hostnqn, u8 **ret_digest); -int nvme_auth_derive_tls_psk(int hmac_id, u8 *psk, size_t psk_len, - u8 *psk_digest, u8 **ret_psk); +int nvme_auth_generate_digest(u8 hmac_id, const u8 *psk, size_t psk_len, + const char *subsysnqn, const char *hostnqn, + char **ret_digest); +int nvme_auth_derive_tls_psk(int hmac_id, const u8 *psk, size_t psk_len, + const char *psk_digest, u8 **ret_psk); #endif /* _NVME_AUTH_H */ -- cgit v1.2.3 From 0beeca72cf21c7c1d9d232148fdeef8e5e242f62 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Mar 2026 23:59:43 -0800 Subject: nvme-auth: rename nvme_auth_generate_key() to nvme_auth_parse_key() This function does not generate a key. It parses the key from the string that the caller passes in. Acked-by: Ard Biesheuvel Acked-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Eric Biggers Signed-off-by: Keith Busch --- include/linux/nvme-auth.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h index a4b248c24ccf..02ca9a716256 100644 --- a/include/linux/nvme-auth.h +++ b/include/linux/nvme-auth.h @@ -30,7 +30,7 @@ void nvme_auth_free_key(struct nvme_dhchap_key *key); struct nvme_dhchap_key *nvme_auth_alloc_key(u32 len, u8 hash); struct nvme_dhchap_key *nvme_auth_transform_key( const struct nvme_dhchap_key *key, const char *nqn); -int nvme_auth_generate_key(const char *secret, struct nvme_dhchap_key **ret_key); +int nvme_auth_parse_key(const char *secret, struct nvme_dhchap_key **ret_key); int nvme_auth_augmented_challenge(u8 hmac_id, const u8 *skey, size_t skey_len, const u8 *challenge, u8 *aug, size_t hlen); int nvme_auth_gen_privkey(struct crypto_kpp *dh_tfm, u8 dh_gid); -- cgit v1.2.3 From 4263ca1cae5cebc09ba95375c4a8927bf4b39d49 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Mar 2026 23:59:45 -0800 Subject: nvme-auth: common: add HMAC helper functions Add some helper functions for computing HMAC-SHA256, HMAC-SHA384, or HMAC-SHA512 values using the crypto library instead of crypto_shash. These will enable some significant simplifications and performance improvements in nvme-auth. Acked-by: Ard Biesheuvel Acked-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Eric Biggers Signed-off-by: Keith Busch --- include/linux/nvme-auth.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h index 02ca9a716256..940d0703eb1d 100644 --- a/include/linux/nvme-auth.h +++ b/include/linux/nvme-auth.h @@ -7,6 +7,7 @@ #define _NVME_AUTH_H #include +#include struct nvme_dhchap_key { size_t len; @@ -23,6 +24,19 @@ const char *nvme_auth_hmac_name(u8 hmac_id); const char *nvme_auth_digest_name(u8 hmac_id); size_t nvme_auth_hmac_hash_len(u8 hmac_id); u8 nvme_auth_hmac_id(const char *hmac_name); +struct nvme_auth_hmac_ctx { + u8 hmac_id; + union { + struct hmac_sha256_ctx sha256; + struct hmac_sha384_ctx sha384; + struct hmac_sha512_ctx sha512; + }; +}; +int nvme_auth_hmac_init(struct nvme_auth_hmac_ctx *hmac, u8 hmac_id, + const u8 *key, size_t key_len); +void nvme_auth_hmac_update(struct nvme_auth_hmac_ctx *hmac, const u8 *data, + size_t data_len); +void nvme_auth_hmac_final(struct nvme_auth_hmac_ctx *hmac, u8 *out); u32 nvme_auth_key_struct_size(u32 key_len); struct nvme_dhchap_key *nvme_auth_extract_key(const char *secret, u8 key_hash); -- cgit v1.2.3 From 844d950bb2cb1fc5b8973369de59cbfb7eecd94d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Mar 2026 23:59:57 -0800 Subject: nvme-auth: common: remove nvme_auth_digest_name() Since nvme_auth_digest_name() is no longer used, remove it and the associated data from the hash_map array. Acked-by: Ard Biesheuvel Acked-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Eric Biggers Signed-off-by: Keith Busch --- include/linux/nvme-auth.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h index 940d0703eb1d..184a1f9510fa 100644 --- a/include/linux/nvme-auth.h +++ b/include/linux/nvme-auth.h @@ -21,7 +21,6 @@ const char *nvme_auth_dhgroup_kpp(u8 dhgroup_id); u8 nvme_auth_dhgroup_id(const char *dhgroup_name); const char *nvme_auth_hmac_name(u8 hmac_id); -const char *nvme_auth_digest_name(u8 hmac_id); size_t nvme_auth_hmac_hash_len(u8 hmac_id); u8 nvme_auth_hmac_id(const char *hmac_name); struct nvme_auth_hmac_ctx { -- cgit v1.2.3 From ac61e869bef13a43d624893559acbac3a4e2a341 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 27 Feb 2026 13:23:46 -0700 Subject: nvme: add preferred I/O size fields to struct nvme_id_ns_nvm A subsequent change will use the NPDGL and NPDAL fields of the NVM Command Set Specific Identify Namespace structure, so add them (and the handful of intervening fields) to struct nvme_id_ns_nvm. Add an assertion that the size is still 4 KB. Signed-off-by: Caleb Sander Mateos Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index edfebbce6745..ec011dce4a97 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -513,9 +513,16 @@ struct nvme_id_ns_nvm { __u8 pic; __u8 rsvd9[3]; __le32 elbaf[64]; - __u8 rsvd268[3828]; + __le32 npdgl; + __le32 nprg; + __le32 npra; + __le32 nors; + __le32 npdal; + __u8 rsvd288[3808]; }; +static_assert(sizeof(struct nvme_id_ns_nvm) == 4096); + enum { NVME_ID_NS_NVM_STS_MASK = 0x7f, NVME_ID_NS_NVM_GUARD_SHIFT = 7, -- cgit v1.2.3 From d3c04a6ea5fd7a3d81f7c80880125108df9a4cbd Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 27 Feb 2026 13:23:48 -0700 Subject: nvme: update nvme_id_ns OPTPERF constants In NVMe verson 2.0 and below, OPTPERF comprises only bit 4 of NSFEAT in the Identify Namespace structure. Since version 2.1, OPTPERF includes both bits 4 and 5 of NSFEAT. Replace the NVME_NS_FEAT_IO_OPT constant with NVME_NS_FEAT_OPTPERF_SHIFT, NVME_NS_FEAT_OPTPERF_MASK, and NVME_NS_FEAT_OPTPERF_MASK_2_1, representing the first bit, pre-2.1 bit width, and post-2.1 bit width of OPTPERF. Update nvme_update_disk_info() to check both OPTPERF bits for controllers that report version 2.1 or newer, as NPWG and NOWS are supported even if only bit 5 is set. Signed-off-by: Caleb Sander Mateos Signed-off-by: Keith Busch --- include/linux/nvme.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index ec011dce4a97..2b66a86d7da6 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -597,7 +597,11 @@ enum { enum { NVME_NS_FEAT_THIN = 1 << 0, NVME_NS_FEAT_ATOMICS = 1 << 1, - NVME_NS_FEAT_IO_OPT = 1 << 4, + NVME_NS_FEAT_OPTPERF_SHIFT = 4, + /* In NVMe version 2.0 and below, OPTPERF is only bit 4 of NSFEAT */ + NVME_NS_FEAT_OPTPERF_MASK = 0x1, + /* Since version 2.1, OPTPERF is bits 4 and 5 of NSFEAT */ + NVME_NS_FEAT_OPTPERF_MASK_2_1 = 0x3, NVME_NS_ATTR_RO = 1 << 0, NVME_NS_FLBAS_LBA_MASK = 0xf, NVME_NS_FLBAS_LBA_UMASK = 0x60, -- cgit v1.2.3 From 09e8f0f93491c6be867f32d4edc0b16fb5da785e Mon Sep 17 00:00:00 2001 From: Alistair Francis Date: Fri, 20 Mar 2026 10:20:44 +1000 Subject: nvme: Add the DHCHAP maximum HD IDs In preperation for using DHCHAP length in upcoming host and target patches let's add the hash and diffie-hellman ID length macros. Reviewed-by: Christoph Hellwig Reviewed-by: Yunje Shin Reviewed-by: Hannes Reinecke Reviewed-by: Chris Leech Signed-off-by: Alistair Francis Signed-off-by: Keith Busch --- include/linux/nvme.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 2b66a86d7da6..041f30931a90 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -2348,4 +2348,8 @@ enum nvme_pr_change_ptpl { #define NVME_PR_IGNORE_KEY (1 << 3) +/* Section 8.3.4.5.2 of the NVMe 2.1 */ +#define NVME_AUTH_DHCHAP_MAX_HASH_IDS 30 +#define NVME_AUTH_DHCHAP_MAX_DH_IDS 30 + #endif /* _LINUX_NVME_H */ -- cgit v1.2.3 From f699bcc8bcdf99565928a7b1fc7ee656f6c81815 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 24 Mar 2026 18:56:25 +0200 Subject: resource: Pass full extent of empty space to resource_alignf callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit __find_resource_space() calculates the full extent of empty space but only passes the aligned space to resource_alignf callback. In some situations, the callback may choose take advantage of the free space before the requested alignment. Pass the full extent of the calculated empty space to resource_alignf callback as an additional parameter. Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Tested-by: Xifer Link: https://patch.msgid.link/20260324165633.4583-3-ilpo.jarvinen@linux.intel.com --- include/linux/ioport.h | 2 ++ include/linux/pci.h | 7 ++++--- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 19d5e04564d9..3c73c9c0d4f7 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -202,6 +202,7 @@ enum { * typedef resource_alignf - Resource alignment callback * @data: Private data used by the callback * @res: Resource candidate range (an empty resource space) + * @empty_res: Empty resource range without alignment applied * @size: The minimum size of the empty space * @align: Alignment from the constraints * @@ -212,6 +213,7 @@ enum { */ typedef resource_size_t (*resource_alignf)(void *data, const struct resource *res, + const struct resource *empty_res, resource_size_t size, resource_size_t align); diff --git a/include/linux/pci.h b/include/linux/pci.h index 1c270f1d5123..ac332ff9da9f 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1206,9 +1206,10 @@ int __must_check pcibios_enable_device(struct pci_dev *, int mask); char *pcibios_setup(char *str); /* Used only when drivers/pci/setup.c is used */ -resource_size_t pcibios_align_resource(void *, const struct resource *, - resource_size_t, - resource_size_t); +resource_size_t pcibios_align_resource(void *data, const struct resource *res, + const struct resource *empty_res, + resource_size_t size, + resource_size_t align); /* Generic PCI functions used internally */ -- cgit v1.2.3 From 9036bd0efcb6162a77f3bf9bacbafba7686c7275 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 24 Mar 2026 18:56:32 +0200 Subject: PCI: Align head space better MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a bridge window contains big and small resource(s), the small resource(s) may not amount to the half of the size of the big resource which would allow calculate_head_align() to shrink the head alignment. This results in always placing the small resource(s) after the big resource. In general, it would be good to be able to place the small resource(s) before the big resource to achieve better utilization of the address space. In the cases where the large resource can only fit at the end of the window, it is even required. However, carrying the information over from pbus_size_mem() and calculate_head_align() to __pci_assign_resource() and pcibios_align_resource() is not easy with the current data structures. A somewhat hacky way to move the non-aligning tail part to the head is possible within pcibios_align_resource(). The free space between the start of the free space span and the aligned start address can be compared with the non-aligning remainder of the size. If the free space is larger than the remainder, placing the remainder before the start address is possible. This relocation should generally work, because PCI resources consist only power-of-2 atoms. Various arch requirements may still need to override the relocation, so the relocation is only applied selectively in such cases. Closes: https://bugzilla.kernel.org/show_bug.cgi?id=221205 Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Tested-by: Xifer Link: https://patch.msgid.link/20260324165633.4583-10-ilpo.jarvinen@linux.intel.com --- include/linux/pci.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index ac332ff9da9f..cedf948dc614 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1210,6 +1210,11 @@ resource_size_t pcibios_align_resource(void *data, const struct resource *res, const struct resource *empty_res, resource_size_t size, resource_size_t align); +resource_size_t pci_align_resource(struct pci_dev *dev, + const struct resource *res, + const struct resource *empty_res, + resource_size_t size, + resource_size_t align); /* Generic PCI functions used internally */ -- cgit v1.2.3 From 09e61daf8e96b9bdb04dd112bdecf9382fd3f919 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:45:51 +0000 Subject: arm_mpam: resctrl: Add boilerplate cpuhp and domain allocation resctrl has its own data structures to describe its resources. We can't use these directly as we play tricks with the 'MBA' resource, picking the MPAM controls or monitors that best apply. We may export the same component as both L3 and MBA. Add mpam_resctrl_res[] as the array of class->resctrl mappings we are exporting, and add the cpuhp hooks that allocated and free the resctrl domain structures. Only the mpam control feature are considered here and monitor support will be added later. While we're here, plumb in a few other obvious things. CONFIG_ARM_CPU_RESCTRL is used to allow this code to be built even though it can't yet be linked against resctrl. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse --- include/linux/arm_mpam.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 7f00c5285a32..2c7d1413a401 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -49,6 +49,9 @@ static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, } #endif +bool resctrl_arch_alloc_capable(void); +bool resctrl_arch_mon_capable(void); + /** * mpam_register_requestor() - Register a requestor with the MPAM driver * @partid_max: The maximum PARTID value the requestor can generate. -- cgit v1.2.3 From 9d2e1a99fae58ce992f147bdf83b5d9089f70b27 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:45:56 +0000 Subject: arm_mpam: resctrl: Add plumbing against arm64 task and cpu hooks arm64 provides helpers for changing a task's and a cpu's mpam partid/pmg values. These are used to back a number of resctrl_arch_ functions. Connect them up. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse --- include/linux/arm_mpam.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 2c7d1413a401..5a78299ec464 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -52,6 +52,11 @@ static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, bool resctrl_arch_alloc_capable(void); bool resctrl_arch_mon_capable(void); +void resctrl_arch_set_cpu_default_closid(int cpu, u32 closid); +void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid); +void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid); +void resctrl_arch_sched_in(struct task_struct *tsk); + /** * mpam_register_requestor() - Register a requestor with the MPAM driver * @partid_max: The maximum PARTID value the requestor can generate. -- cgit v1.2.3 From 6789fb99282c0a8e8e84701b7edf456f4a9e71e2 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:45:57 +0000 Subject: arm_mpam: resctrl: Add CDP emulation Intel RDT's CDP feature allows the cache to use a different control value depending on whether the accesses was for instruction fetch or a data access. MPAM's equivalent feature is the other way up: the CPU assigns a different partid label to traffic depending on whether it was instruction fetch or a data access, which causes the cache to use a different control value based solely on the partid. MPAM can emulate CDP, with the side effect that the alternative partid is seen by all MSC, it can't be enabled per-MSC. Add the resctrl hooks to turn this on or off. Add the helpers that match a closid against a task, which need to be aware that the value written to hardware is not the same as the one resctrl is using. Update the 'arm64_mpam_global_default' variable the arch code uses during context switch to know when the per-cpu value should be used instead. Also, update these per-cpu values and sync the resulting mpam partid/pmg configuration to hardware. resctrl can enable CDP for L2 caches, L3 caches or both. When it is enabled by one and not the other MPAM globally enabled CDP but hides the effect on the other cache resource. This hiding is possible as CPOR is the only supported cache control and that uses a resource bitmap; two partids with the same bitmap act as one. Awkwardly, the MB controls don't implement CDP and CDP can't be hidden as the memory bandwidth control is a maximum per partid which can't be modelled with more partids. If the total maximum is used for both the data and instruction partids then then the maximum may be exceeded and if it is split in two then the one using more bandwidth will hit a lower limit. Hence, hide the MB controls completely if CDP is enabled for any resource. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Cc: Dave Martin Cc: Amit Singh Tomar Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse --- include/linux/arm_mpam.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 5a78299ec464..d329b1dc148b 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -56,6 +56,8 @@ void resctrl_arch_set_cpu_default_closid(int cpu, u32 closid); void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid); void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid); void resctrl_arch_sched_in(struct task_struct *tsk); +bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid); +bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid); /** * mpam_register_requestor() - Register a requestor with the MPAM driver -- cgit v1.2.3 From 3e9b35823aabcb85cc039960256426e50f1fd601 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Fri, 13 Mar 2026 14:46:00 +0000 Subject: arm_mpam: resctrl: Add rmid index helpers Because MPAM's pmg aren't identical to RDT's rmid, resctrl handles some data structures by index. This allows x86 to map indexes to RMID, and MPAM to map them to partid-and-pmg. Add the helpers to do this. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Suggested-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: James Morse --- include/linux/arm_mpam.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index d329b1dc148b..7d23c90f077d 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -58,6 +58,9 @@ void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid); void resctrl_arch_sched_in(struct task_struct *tsk); bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid); bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid); +u32 resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid); +void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid); +u32 resctrl_arch_system_num_rmid_idx(void); /** * mpam_register_requestor() - Register a requestor with the MPAM driver -- cgit v1.2.3 From 2a3c79c61539779a09928893518c8286d7774b54 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:46:06 +0000 Subject: arm_mpam: resctrl: Allow resctrl to allocate monitors When resctrl wants to read a domain's 'QOS_L3_OCCUP', it needs to allocate a monitor on the corresponding resource. Monitors are allocated by class instead of component. Add helpers to allocate a CSU monitor. These helper return an out of range value for MBM counters. Allocating a montitor context is expected to block until hardware resources become available. This only makes sense for QOS_L3_OCCUP as unallocated MBM counters are losing data. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse --- include/linux/arm_mpam.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 7d23c90f077d..e1461e32af75 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -5,6 +5,7 @@ #define __LINUX_ARM_MPAM_H #include +#include #include struct mpam_msc; @@ -62,6 +63,10 @@ u32 resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid); void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid); u32 resctrl_arch_system_num_rmid_idx(void); +struct rdt_resource; +void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, enum resctrl_event_id evtid); +void resctrl_arch_mon_ctx_free(struct rdt_resource *r, enum resctrl_event_id evtid, void *ctx); + /** * mpam_register_requestor() - Register a requestor with the MPAM driver * @partid_max: The maximum PARTID value the requestor can generate. -- cgit v1.2.3 From fb56b29932ca276df268806ad52ed80f40f99a6e Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:46:07 +0000 Subject: arm_mpam: resctrl: Add resctrl_arch_rmid_read() resctrl uses resctrl_arch_rmid_read() to read counters. CDP emulation means the counter may need reading in three different ways. The helpers behind the resctrl_arch_ functions will be re-used for the ABMC equivalent functions. Add the rounding helper for checking monitor values while we're here. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Jesse Chick Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse --- include/linux/arm_mpam.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index e1461e32af75..86d5e326d2bd 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -67,6 +67,11 @@ struct rdt_resource; void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, enum resctrl_event_id evtid); void resctrl_arch_mon_ctx_free(struct rdt_resource *r, enum resctrl_event_id evtid, void *ctx); +static inline unsigned int resctrl_arch_round_mon_val(unsigned int val) +{ + return val; +} + /** * mpam_register_requestor() - Register a requestor with the MPAM driver * @partid_max: The maximum PARTID value the requestor can generate. -- cgit v1.2.3 From efc775eadce2c6e0921c21d9c29a7b6686022281 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:46:09 +0000 Subject: arm_mpam: resctrl: Add empty definitions for assorted resctrl functions A few resctrl features and hooks need to be provided, but aren't needed or supported on MPAM platforms. resctrl has individual hooks to separately enable and disable the closid/partid and rmid/pmg context switching code. For MPAM this is all the same thing, as the value in struct task_struct is used to cache the value that should be written to hardware. arm64's context switching code is enabled once MPAM is usable, but doesn't touch the hardware unless the value has changed. For now event configuration is not supported, and can be turned off by returning 'false' from resctrl_arch_is_evt_configurable(). The new io_alloc feature is not supported either, always return false from the enable helper to indicate and fail the enable. Add this, and empty definitions for the other hooks. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse --- include/linux/arm_mpam.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 86d5e326d2bd..f92a36187a52 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -67,6 +67,15 @@ struct rdt_resource; void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, enum resctrl_event_id evtid); void resctrl_arch_mon_ctx_free(struct rdt_resource *r, enum resctrl_event_id evtid, void *ctx); +/* + * The CPU configuration for MPAM is cheap to write, and is only written if it + * has changed. No need for fine grained enables. + */ +static inline void resctrl_arch_enable_mon(void) { } +static inline void resctrl_arch_disable_mon(void) { } +static inline void resctrl_arch_enable_alloc(void) { } +static inline void resctrl_arch_disable_alloc(void) { } + static inline unsigned int resctrl_arch_round_mon_val(unsigned int val) { return val; -- cgit v1.2.3 From 362a4549f2acfb03390ddfcd91041e65a11a739f Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Fri, 6 Mar 2026 12:14:41 +0900 Subject: NTB: core: Add .get_dma_dev() callback to ntb_dev_ops Some NTB implementations are backed by a PCI function that is not the right struct device to use with DMA API helpers (e.g. due to IOMMU topology, or because the NTB device is virtual). Add an optional .get_dma_dev() callback to struct ntb_dev_ops and provide a helper, ntb_get_dma_dev(), so NTB clients can use the appropriate struct device for DMA allocations and mappings. If the callback is not implemented, ntb_get_dma_dev() returns the current default (ntb->dev.parent). Drivers that implement .get_dma_dev() must return a non-NULL device. Suggested-by: Frank Li Signed-off-by: Koichiro Den Signed-off-by: Manivannan Sadhasivam [bhelgaas: format doc] Signed-off-by: Bjorn Helgaas Reviewed-by: Dave Jiang Link: https://patch.msgid.link/20260306031443.1911860-2-den@valinux.co.jp --- include/linux/ntb.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ntb.h b/include/linux/ntb.h index 8ff9d663096b..879c3e89e026 100644 --- a/include/linux/ntb.h +++ b/include/linux/ntb.h @@ -256,6 +256,7 @@ static inline int ntb_ctx_ops_is_valid(const struct ntb_ctx_ops *ops) * @msg_clear_mask: See ntb_msg_clear_mask(). * @msg_read: See ntb_msg_read(). * @peer_msg_write: See ntb_peer_msg_write(). + * @get_dma_dev: See ntb_get_dma_dev(). */ struct ntb_dev_ops { int (*port_number)(struct ntb_dev *ntb); @@ -329,6 +330,7 @@ struct ntb_dev_ops { int (*msg_clear_mask)(struct ntb_dev *ntb, u64 mask_bits); u32 (*msg_read)(struct ntb_dev *ntb, int *pidx, int midx); int (*peer_msg_write)(struct ntb_dev *ntb, int pidx, int midx, u32 msg); + struct device *(*get_dma_dev)(struct ntb_dev *ntb); }; static inline int ntb_dev_ops_is_valid(const struct ntb_dev_ops *ops) @@ -391,6 +393,8 @@ static inline int ntb_dev_ops_is_valid(const struct ntb_dev_ops *ops) /* !ops->msg_clear_mask == !ops->msg_count && */ !ops->msg_read == !ops->msg_count && !ops->peer_msg_write == !ops->msg_count && + + /* ops->get_dma_dev is optional */ 1; } @@ -1563,6 +1567,26 @@ static inline int ntb_peer_msg_write(struct ntb_dev *ntb, int pidx, int midx, return ntb->ops->peer_msg_write(ntb, pidx, midx, msg); } +/** + * ntb_get_dma_dev() - get the device to use for DMA allocations/mappings + * @ntb: NTB device context. + * + * Return a struct device suitable for DMA API allocations and mappings. + * This is typically the parent of the NTB device, but may be overridden by a + * driver by implementing .get_dma_dev(). + * + * Drivers that implement .get_dma_dev() must return a non-NULL pointer. + * + * Return: device pointer to use for DMA operations. + */ +static inline struct device *ntb_get_dma_dev(struct ntb_dev *ntb) +{ + if (!ntb->ops->get_dma_dev) + return ntb->dev.parent; + + return ntb->ops->get_dma_dev(ntb); +} + /** * ntb_peer_resource_idx() - get a resource index for a given peer idx * @ntb: NTB device context. -- cgit v1.2.3 From fffca572f9ca51607f180a37d0c898404c8f9112 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 26 Mar 2026 15:06:31 +0100 Subject: mpage: Provide variant of mpage_writepages() with own optional folio handler Some filesystems need to treat some folios specially (for example for inodes with inline data). Doing the handling in their .writepages method in a race-free manner results in duplicating some of the writeback internals. So provide generalized version of mpage_writepages() that allows filesystem to provide a handler called for each folio which can handle the folio in a special way. Reviewed-by: Christoph Hellwig Link: https://patch.msgid.link/20260326140635.15895-3-jack@suse.cz Signed-off-by: Jan Kara --- include/linux/mpage.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mpage.h b/include/linux/mpage.h index 1bdc39daac0a..358946990bfa 100644 --- a/include/linux/mpage.h +++ b/include/linux/mpage.h @@ -17,7 +17,14 @@ struct readahead_control; void mpage_readahead(struct readahead_control *, get_block_t get_block); int mpage_read_folio(struct folio *folio, get_block_t get_block); -int mpage_writepages(struct address_space *mapping, - struct writeback_control *wbc, get_block_t get_block); +int __mpage_writepages(struct address_space *mapping, + struct writeback_control *wbc, get_block_t get_block, + int (*write_folio)(struct folio *folio, + struct writeback_control *wbc)); +static inline int mpage_writepages(struct address_space *mapping, + struct writeback_control *wbc, get_block_t get_block) +{ + return __mpage_writepages(mapping, wbc, get_block, NULL); +} #endif -- cgit v1.2.3 From 33eded29319d41fcba5d0257b126a48b449aad47 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 25 Mar 2026 12:36:08 -0700 Subject: dm: provide helper to set stacked limits There are multiple device mappers that set up their stacking limits exactly the same for the logical, physical and minimum IO queue limits. Provide a helper for it. Signed-off-by: Keith Busch Signed-off-by: Mikulas Patocka --- include/linux/device-mapper.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 38f625af6ab4..cd4faaf5d427 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -755,4 +755,11 @@ static inline unsigned long to_bytes(sector_t n) return (n << SECTOR_SHIFT); } +static inline void dm_stack_bs_limits(struct queue_limits *limits, unsigned int bs) +{ + limits->logical_block_size = max(limits->logical_block_size, bs); + limits->physical_block_size = max(limits->physical_block_size, bs); + limits->io_min = max(limits->io_min, bs); +} + #endif /* _LINUX_DEVICE_MAPPER_H */ -- cgit v1.2.3 From d748047af1355df044faf8df0eedf0ef75f87de8 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 25 Mar 2026 13:36:00 -0400 Subject: ptr_ring: disable KCSAN warnings Eric Dumazet reported KCSAN warnings: BUG: KCSAN: data-race in pfifo_fast_dequeue / pfifo_fast_enqueue write to 0xffff88811d5ccc00 of 8 bytes by interrupt on cpu 0: __ptr_ring_zero_tail include/linux/ptr_ring.h:259 [inline] __ptr_ring_discard_one include/linux/ptr_ring.h:291 [inline] __ptr_ring_consume include/linux/ptr_ring.h:311 [inline] __skb_array_consume include/linux/skb_array.h:98 [inline] pfifo_fast_dequeue+0x770/0x8f0 net/sched/sch_generic.c:770 dequeue_skb net/sched/sch_generic.c:297 [inline] qdisc_restart net/sched/sch_generic.c:402 [inline] __qdisc_run+0x189/0xc80 net/sched/sch_generic.c:420 qdisc_run include/net/pkt_sched.h:120 [inline] net_tx_action+0x379/0x590 net/core/dev.c:5793 handle_softirqs+0xb9/0x280 kernel/softirq.c:622 do_softirq+0x45/0x60 kernel/softirq.c:523 __local_bh_enable_ip+0x70/0x80 kernel/softirq.c:450 local_bh_enable include/linux/bottom_half.h:33 [inline] bpf_test_run+0x2db/0x620 net/bpf/test_run.c:426 bpf_prog_test_run_skb+0x9a4/0xef0 net/bpf/test_run.c:1159 bpf_prog_test_run+0x204/0x340 kernel/bpf/syscall.c:4721 __sys_bpf+0x52e/0x7e0 kernel/bpf/syscall.c:6246 __do_sys_bpf kernel/bpf/syscall.c:6341 [inline] __se_sys_bpf kernel/bpf/syscall.c:6339 [inline] __x64_sys_bpf+0x41/0x50 kernel/bpf/syscall.c:6339 x64_sys_call+0x10cb/0x3020 arch/x86/include/generated/asm/syscalls_64.h:322 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0x12c/0x370 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f read to 0xffff88811d5ccc00 of 8 bytes by task 22632 on cpu 1: __ptr_ring_produce include/linux/ptr_ring.h:106 [inline] ptr_ring_produce include/linux/ptr_ring.h:129 [inline] skb_array_produce include/linux/skb_array.h:44 [inline] pfifo_fast_enqueue+0xd5/0x2c0 net/sched/sch_generic.c:741 dev_qdisc_enqueue net/core/dev.c:4144 [inline] __dev_xmit_skb net/core/dev.c:4188 [inline] __dev_queue_xmit+0x6a4/0x1f20 net/core/dev.c:4795 dev_queue_xmit include/linux/netdevice.h:3384 [inline] __bpf_tx_skb net/core/filter.c:2153 [inline] __bpf_redirect_common net/core/filter.c:2197 [inline] __bpf_redirect+0x862/0x990 net/core/filter.c:2204 ____bpf_clone_redirect net/core/filter.c:2487 [inline] bpf_clone_redirect+0x20c/0x290 net/core/filter.c:2450 bpf_prog_53f18857bc887b09+0x22/0x2a bpf_dispatcher_nop_func include/linux/bpf.h:1402 [inline] __bpf_prog_run include/linux/filter.h:723 [inline] bpf_prog_run include/linux/filter.h:730 [inline] bpf_test_run+0x29d/0x620 net/bpf/test_run.c:423 bpf_prog_test_run_skb+0x9a4/0xef0 net/bpf/test_run.c:1159 bpf_prog_test_run+0x204/0x340 kernel/bpf/syscall.c:4721 __sys_bpf+0x52e/0x7e0 kernel/bpf/syscall.c:6246 __do_sys_bpf kernel/bpf/syscall.c:6341 [inline] __se_sys_bpf kernel/bpf/syscall.c:6339 [inline] __x64_sys_bpf+0x41/0x50 kernel/bpf/syscall.c:6339 x64_sys_call+0x10cb/0x3020 arch/x86/include/generated/asm/syscalls_64.h:322 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0x12c/0x370 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f value changed: 0xffff888104a93a00 -> 0x0000000000000000 Reported by Kernel Concurrency Sanitizer on: CPU: 1 UID: 0 PID: 22632 Comm: syz.0.4135 Tainted: G W syzkaller #0 PREEMPT(full) Tainted: [W]=WARN Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/24/2026 There is no race on ring accesses: reading/writing a partial pointer would be fine, because the reading is done by the producer which merely cares about NULL/non NULL. Document and disable the warnings using data_race(). Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/dd3984b3bce9df3591927f927668cb31cc7ecf34.1774460059.git.mst@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/ptr_ring.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index 534531807d95..d2c3629bbe45 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -48,7 +48,7 @@ struct ptr_ring { */ static inline bool __ptr_ring_full(struct ptr_ring *r) { - return r->queue[r->producer]; + return data_race(r->queue[r->producer]); } static inline bool ptr_ring_full(struct ptr_ring *r) @@ -103,7 +103,7 @@ static inline bool ptr_ring_full_bh(struct ptr_ring *r) */ static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr) { - if (unlikely(!r->size) || r->queue[r->producer]) + if (unlikely(!r->size) || data_race(r->queue[r->producer])) return -ENOSPC; /* Make sure the pointer we are storing points to a valid data. */ @@ -194,7 +194,7 @@ static inline void *__ptr_ring_peek(struct ptr_ring *r) static inline bool __ptr_ring_empty(struct ptr_ring *r) { if (likely(r->size)) - return !r->queue[READ_ONCE(r->consumer_head)]; + return !data_race(r->queue[READ_ONCE(r->consumer_head)]); return true; } @@ -256,7 +256,7 @@ static inline void __ptr_ring_zero_tail(struct ptr_ring *r, int consumer_head) * besides the first one until we write out all entries. */ while (likely(head > r->consumer_tail)) - r->queue[--head] = NULL; + data_race(r->queue[--head] = NULL); r->consumer_tail = consumer_head; } -- cgit v1.2.3 From 4c5e7f0fcd592801c9cc18f29f80fbee84eb8669 Mon Sep 17 00:00:00 2001 From: Jinjiang Tu Date: Thu, 19 Mar 2026 09:25:41 +0800 Subject: mm/huge_memory: fix folio isn't locked in softleaf_to_folio() On arm64 server, we found folio that get from migration entry isn't locked in softleaf_to_folio(). This issue triggers when mTHP splitting and zap_nonpresent_ptes() races, and the root cause is lack of memory barrier in softleaf_to_folio(). The race is as follows: CPU0 CPU1 deferred_split_scan() zap_nonpresent_ptes() lock folio split_folio() unmap_folio() change ptes to migration entries __split_folio_to_order() softleaf_to_folio() set flags(including PG_locked) for tail pages folio = pfn_folio(softleaf_to_pfn(entry)) smp_wmb() VM_WARN_ON_ONCE(!folio_test_locked(folio)) prep_compound_page() for tail pages In __split_folio_to_order(), smp_wmb() guarantees page flags of tail pages are visible before the tail page becomes non-compound. smp_wmb() should be paired with smp_rmb() in softleaf_to_folio(), which is missed. As a result, if zap_nonpresent_ptes() accesses migration entry that stores tail pfn, softleaf_to_folio() may see the updated compound_head of tail page before page->flags. This issue will trigger VM_WARN_ON_ONCE() in pfn_swap_entry_folio() because of the race between folio split and zap_nonpresent_ptes() leading to a folio incorrectly undergoing modification without a folio lock being held. This is a BUG_ON() before commit 93976a20345b ("mm: eliminate further swapops predicates"), which in merged in v6.19-rc1. To fix it, add missing smp_rmb() if the softleaf entry is migration entry in softleaf_to_folio() and softleaf_to_page(). [tujinjiang@huawei.com: update function name and comments] Link: https://lkml.kernel.org/r/20260321075214.3305564-1-tujinjiang@huawei.com Link: https://lkml.kernel.org/r/20260319012541.4158561-1-tujinjiang@huawei.com Fixes: e9b61f19858a ("thp: reintroduce split_huge_page()") Signed-off-by: Jinjiang Tu Acked-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Barry Song Cc: Kefeng Wang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Nanyong Sun Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/leafops.h | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/leafops.h b/include/linux/leafops.h index a9ff94b744f2..05673d3529e7 100644 --- a/include/linux/leafops.h +++ b/include/linux/leafops.h @@ -363,6 +363,23 @@ static inline unsigned long softleaf_to_pfn(softleaf_t entry) return swp_offset(entry) & SWP_PFN_MASK; } +static inline void softleaf_migration_sync(softleaf_t entry, + struct folio *folio) +{ + /* + * Ensure we do not race with split, which might alter tail pages into new + * folios and thus result in observing an unlocked folio. + * This matches the write barrier in __split_folio_to_order(). + */ + smp_rmb(); + + /* + * Any use of migration entries may only occur while the + * corresponding page is locked + */ + VM_WARN_ON_ONCE(!folio_test_locked(folio)); +} + /** * softleaf_to_page() - Obtains struct page for PFN encoded within leaf entry. * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true. @@ -374,11 +391,8 @@ static inline struct page *softleaf_to_page(softleaf_t entry) struct page *page = pfn_to_page(softleaf_to_pfn(entry)); VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); - /* - * Any use of migration entries may only occur while the - * corresponding page is locked - */ - VM_WARN_ON_ONCE(softleaf_is_migration(entry) && !PageLocked(page)); + if (softleaf_is_migration(entry)) + softleaf_migration_sync(entry, page_folio(page)); return page; } @@ -394,12 +408,8 @@ static inline struct folio *softleaf_to_folio(softleaf_t entry) struct folio *folio = pfn_folio(softleaf_to_pfn(entry)); VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); - /* - * Any use of migration entries may only occur while the - * corresponding folio is locked. - */ - VM_WARN_ON_ONCE(softleaf_is_migration(entry) && - !folio_test_locked(folio)); + if (softleaf_is_migration(entry)) + softleaf_migration_sync(entry, folio); return folio; } -- cgit v1.2.3 From 187b00a26679ae58a79f56c0024df1e3dbd7dff0 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 25 Mar 2026 23:00:02 +0200 Subject: net: stmmac: provide flag to disable EEE Some platforms have problems when EEE is enabled, and thus need a way to disable stmmac EEE support. Add a flag before the other LPI related flags which tells stmmac to avoid populating the phylink LPI capabilities, which causes phylink to call phy_disable_eee() for any PHY that is attached to the affected phylink instance. iMX8MP is an example - the lpi_intr_o signal is wired to an OR gate along with the main dwmac interrupts. Since lpi_intr_o is synchronous to the receive clock domain, and takes four clock cycles to clear, this leads to interrupt storms as the interrupt remains asserted for some time after the LPI control and status register is read. This problem becomes worse when the receive clock from the PHY stops when the receive path enters LPI state - which means that lpi_intr_o can not deassert until the clock restarts. Since the LPI state of the receive path depends on the link partner, this is out of our control. We could disable RX clock stop at the PHY, but that doesn't get around the slow-to-deassert lpi_intr_o mentioned in the above paragraph. Previously, iMX8MP worked around this by disabling gigabit EEE, but this is insufficient - the problem is also visible at 100M speeds, where the receive clock is slower. There is extensive discussion and investigation in the thread linked below, the result of which is summarised in this commit message. Reported-by: Laurent Pinchart Closes: https://lore.kernel.org/r/20251026122905.29028-1-laurent.pinchart@ideasonboard.com Signed-off-by: Russell King (Oracle) Tested-by: Ovidiu Panait Signed-off-by: Laurent Pinchart Reviewed-by: Laurent Pinchart Reviewed-by: Kieran Bingham Link: https://patch.msgid.link/20260325210003.2752013-2-laurent.pinchart@ideasonboard.com Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index eaaee329ef9d..4430b967abde 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -204,12 +204,13 @@ enum dwmac_core_type { #define STMMAC_FLAG_MULTI_MSI_EN BIT(7) #define STMMAC_FLAG_EXT_SNAPSHOT_EN BIT(8) #define STMMAC_FLAG_INT_SNAPSHOT_EN BIT(9) -#define STMMAC_FLAG_RX_CLK_RUNS_IN_LPI BIT(10) -#define STMMAC_FLAG_EN_TX_LPI_CLOCKGATING BIT(11) -#define STMMAC_FLAG_EN_TX_LPI_CLK_PHY_CAP BIT(12) -#define STMMAC_FLAG_HWTSTAMP_CORRECT_LATENCY BIT(13) -#define STMMAC_FLAG_KEEP_PREAMBLE_BEFORE_SFD BIT(14) -#define STMMAC_FLAG_SERDES_SUPPORTS_2500M BIT(15) +#define STMMAC_FLAG_EEE_DISABLE BIT(10) +#define STMMAC_FLAG_RX_CLK_RUNS_IN_LPI BIT(11) +#define STMMAC_FLAG_EN_TX_LPI_CLOCKGATING BIT(12) +#define STMMAC_FLAG_EN_TX_LPI_CLK_PHY_CAP BIT(13) +#define STMMAC_FLAG_HWTSTAMP_CORRECT_LATENCY BIT(14) +#define STMMAC_FLAG_KEEP_PREAMBLE_BEFORE_SFD BIT(15) +#define STMMAC_FLAG_SERDES_SUPPORTS_2500M BIT(16) struct mac_device_info; -- cgit v1.2.3 From 077ba03600faea5f2aa15afbb83580878cc8b500 Mon Sep 17 00:00:00 2001 From: Mayank Rungta Date: Thu, 12 Mar 2026 16:22:05 -0700 Subject: watchdog/hardlockup: improve buddy system detection timeliness Currently, the buddy system only performs checks every 3rd sample. With a 4-second interval. If a check window is missed, the next check occurs 12 seconds later, potentially delaying hard lockup detection for up to 24 seconds. Modify the buddy system to perform checks at every interval (4s). Introduce a missed-interrupt threshold to maintain the existing grace period while reducing the detection window to 8-12 seconds. Best and worst case detection scenarios: Before (12s check window): - Best case: Lockup occurs after first check but just before heartbeat interval. Detected in ~8s (8s till next check). - Worst case: Lockup occurs just after a check. Detected in ~24s (missed check + 12s till next check + 12s logic). After (4s check window with threshold of 3): - Best case: Lockup occurs just before a check. Detected in ~8s (0s till 1st check + 4s till 2nd + 4s till 3rd). - Worst case: Lockup occurs just after a check. Detected in ~12s (4s till 1st check + 4s till 2nd + 4s till 3rd). Link: https://lkml.kernel.org/r/20260312-hardlockup-watchdog-fixes-v2-4-45bd8a0cc7ed@google.com Signed-off-by: Mayank Rungta Reviewed-by: Douglas Anderson Reviewed-by: Petr Mladek Cc: Ian Rogers Cc: Jonathan Corbet Cc: Li Huafei Cc: Max Kellermann Cc: Shuah Khan Cc: Stephane Erainan Cc: Wang Jinchao Cc: Yunhui Cui Signed-off-by: Andrew Morton --- include/linux/nmi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 207156f2143c..bc1162895f35 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -21,6 +21,7 @@ void lockup_detector_soft_poweroff(void); extern int watchdog_user_enabled; extern int watchdog_thresh; extern unsigned long watchdog_enabled; +extern int watchdog_hardlockup_miss_thresh; extern struct cpumask watchdog_cpumask; extern unsigned long *watchdog_cpumask_bits; -- cgit v1.2.3 From 89e5d7d616009e5fada5da081b1d79cdd59150ab Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 27 Mar 2026 16:10:21 +0100 Subject: mailbox: remove superfluous internal header Quite some controller drivers use the defines from the internal header already. This prevents controller drivers outside the mailbox directory. Move the defines to the public controller header to allow this again as the defines are not strictly internal anyhow. Signed-off-by: Wolfram Sang Reviewed-by: Sudeep Holla Reviewed-by: Daniel Baluta Signed-off-by: Jassi Brar --- include/linux/mailbox_controller.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mailbox_controller.h b/include/linux/mailbox_controller.h index 80a427c7ca29..16fef421c30c 100644 --- a/include/linux/mailbox_controller.h +++ b/include/linux/mailbox_controller.h @@ -3,6 +3,7 @@ #ifndef __MAILBOX_CONTROLLER_H #define __MAILBOX_CONTROLLER_H +#include #include #include #include @@ -11,6 +12,10 @@ struct mbox_chan; +#define TXDONE_BY_IRQ BIT(0) /* controller has remote RTR irq */ +#define TXDONE_BY_POLL BIT(1) /* controller can read status of last TX */ +#define TXDONE_BY_ACK BIT(2) /* S/W ACK received by Client ticks the TX */ + /** * struct mbox_chan_ops - methods to control mailbox channels * @send_data: The API asks the MBOX controller driver, in atomic -- cgit v1.2.3 From c58e9456e30c7098cbcd9f04571992be8a2e4e63 Mon Sep 17 00:00:00 2001 From: Jassi Brar Date: Fri, 27 Mar 2026 17:00:40 -0500 Subject: mailbox: Fix NULL message support in mbox_send_message() The active_req field serves double duty as both the "is a TX in flight" flag (NULL means idle) and the storage for the in-flight message pointer. When a client sends NULL via mbox_send_message(), active_req is set to NULL, which the framework misinterprets as "no active request". This breaks the TX state machine by: - tx_tick() short-circuits on (!mssg), skipping the tx_done callback and the tx_complete completion - txdone_hrtimer() skips the channel entirely since active_req is NULL, so poll-based TX-done detection never fires. Fix this by introducing a MBOX_NO_MSG sentinel value that means "no active request," freeing NULL to be valid message data. The sentinel is defined in the subsystem-internal mailbox.h so that controller drivers within drivers/mailbox/ can reference it, but it is not exposed to clients outside the subsystem. Fifteen in-tree callers send NULL (doorbell-style IPCs on Qualcomm, Tegra, TI, Xilinx, i.MX, SCMI, and PCC platforms). All were audited for regression: - Most already work around the bug via knows_txdone=true with a manual mbox_client_txdone() call, making the framework's tracking irrelevant. These are unaffected. - Poll-based callers (Xilinx zynqmp/r5) are strictly better off: the poll timer now correctly detects NULL-active channels instead of silently skipping them. - irq-qcom-mpm.c was a pre-existing bug -- the only Qualcomm caller that omitted the knows_txdone + mbox_client_txdone() pattern. Fixed in a companion commit ("irqchip/qcom-mpm: Fix missing mailbox TX done acknowledgment"). - No caller sets both a tx_done callback and sends NULL, nor combines tx_block=true with NULL sends, so the newly reachable callback/completion paths are never exercised. Also update tegra-hsp's flush callback, which directly inspects active_req to wait for the channel to drain: the old "!= NULL" check becomes "!= MBOX_NO_MSG", otherwise flush spins until timeout since the sentinel is non-NULL. The only tradeoff is that 'MBOX_NO_MSG' can not be used as a message by clients. Reported-by: Joonwon Kang Reviewed-by: Douglas Anderson Signed-off-by: Jassi Brar --- include/linux/mailbox_controller.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mailbox_controller.h b/include/linux/mailbox_controller.h index 16fef421c30c..e3896b08f22e 100644 --- a/include/linux/mailbox_controller.h +++ b/include/linux/mailbox_controller.h @@ -12,6 +12,9 @@ struct mbox_chan; +/* Sentinel value distinguishing "no active request" from "NULL message data" */ +#define MBOX_NO_MSG ((void *)-1) + #define TXDONE_BY_IRQ BIT(0) /* controller has remote RTR irq */ #define TXDONE_BY_POLL BIT(1) /* controller can read status of last TX */ #define TXDONE_BY_ACK BIT(2) /* S/W ACK received by Client ticks the TX */ -- cgit v1.2.3 From d457072576a6a60ba853b1d815f123da57b48021 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Fri, 27 Mar 2026 13:32:40 -0700 Subject: bpf: Support struct btf_struct_meta via KF_IMPLICIT_ARGS The following kfuncs currently accept void *meta__ign argument: * bpf_obj_new_impl * bpf_obj_drop_impl * bpf_percpu_obj_new_impl * bpf_percpu_obj_drop_impl * bpf_refcount_acquire_impl * bpf_list_push_back_impl * bpf_list_push_front_impl * bpf_rbtree_add_impl The __ign suffix is an indicator for the verifier to skip the argument in check_kfunc_args(). Then, in fixup_kfunc_call() the verifier may set the value of this argument to struct btf_struct_meta * kptr_struct_meta from insn_aux_data. BPF programs must pass a dummy NULL value when calling these kfuncs. Additionally, the list and rbtree _impl kfuncs also accept an implicit u64 argument, which doesn't require __ign suffix because it's a scalar, and BPF programs explicitly pass 0. Add new kfuncs with KF_IMPLICIT_ARGS [1], that correspond to each _impl kfunc accepting meta__ign. The existing _impl kfuncs remain unchanged for backwards compatibility. To support this, add "btf_struct_meta" to the list of recognized implicit argument types in resolve_btfids. Implement is_kfunc_arg_implicit() in the verifier, that determines implicit args by inspecting both a non-_impl BTF prototype of the kfunc. Update the special_kfunc_list in the verifier and relevant checks to support both the old _impl and the new KF_IMPLICIT_ARGS variants of btf_struct_meta users. [1] https://lore.kernel.org/bpf/20260120222638.3976562-1-ihor.solodrai@linux.dev/ Signed-off-by: Ihor Solodrai Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20260327203241.3365046-1-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/btf_ids.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 139bdececdcf..af011db39ab3 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -217,7 +217,7 @@ BTF_SET8_END(name) #else -#define BTF_ID_LIST(name) static u32 __maybe_unused name[64]; +#define BTF_ID_LIST(name) static u32 __maybe_unused name[128]; #define BTF_ID(prefix, name) #define BTF_ID_FLAGS(prefix, name, ...) #define BTF_ID_UNUSED -- cgit v1.2.3 From fde39f7df10b3dc150abb87c4718efba93cbc755 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Wed, 25 Mar 2026 13:08:44 +0100 Subject: ipv6: replace IS_BUILTIN(CONFIG_IPV6) with IS_ENABLED(CONFIG_IPV6) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As IPv6 is built-in only, it does not make sense to continue using IS_BUILTIN(CONFIG_IPV6). Therefore, replace it with IS_ENABLED() when necessary and drop it if it isn't valid anymore. Notice that there is still one instance related to ICMPv6, as it requires more changes it will be handle separately. Signed-off-by: Fernando Fernandez Mancera Tested-by: Ricardo B. Marlière Acked-by: Martin KaFai Lau Link: https://patch.msgid.link/20260325120928.15848-4-fmancera@suse.de Signed-off-by: Jakub Kicinski --- include/linux/indirect_call_wrapper.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/indirect_call_wrapper.h b/include/linux/indirect_call_wrapper.h index dc272b514a01..0e4340ecd857 100644 --- a/include/linux/indirect_call_wrapper.h +++ b/include/linux/indirect_call_wrapper.h @@ -57,7 +57,7 @@ * builtin, this macro simplify dealing with indirect calls with only ipv4/ipv6 * alternatives */ -#if IS_BUILTIN(CONFIG_IPV6) +#if IS_ENABLED(CONFIG_IPV6) #define INDIRECT_CALL_INET(f, f2, f1, ...) \ INDIRECT_CALL_2(f, f2, f1, __VA_ARGS__) #elif IS_ENABLED(CONFIG_INET) -- cgit v1.2.3 From d2042d35f413b7131cc571655bbcb2c049489fe7 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Wed, 25 Mar 2026 13:08:45 +0100 Subject: ipv6: remove dynamic ICMPv6 sender registration infrastructure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As IPv6 is built-in only, there is no need to maintain the sender registration infrastructure used to allow built-in subsystems to send ICMPv6 messages when IPv6 was compiled as a module. Drop the registration mechanism and the __icmpv6_send() sender implementation. While icmpv6_send() users could be converted to icmp6_send() that doesn't seems necessary as none of them are using the force_saddr parameter. Signed-off-by: Fernando Fernandez Mancera Tested-by: Ricardo B. Marlière Link: https://patch.msgid.link/20260325120928.15848-5-fmancera@suse.de Signed-off-by: Jakub Kicinski --- include/linux/icmpv6.h | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/icmpv6.h b/include/linux/icmpv6.h index e3b3b0fa2a8f..2bd9f2157e6c 100644 --- a/include/linux/icmpv6.h +++ b/include/linux/icmpv6.h @@ -15,38 +15,13 @@ static inline struct icmp6hdr *icmp6_hdr(const struct sk_buff *skb) #if IS_ENABLED(CONFIG_IPV6) -typedef void ip6_icmp_send_t(struct sk_buff *skb, u8 type, u8 code, __u32 info, - const struct in6_addr *force_saddr, - const struct inet6_skb_parm *parm); void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct in6_addr *force_saddr, const struct inet6_skb_parm *parm); -#if IS_BUILTIN(CONFIG_IPV6) -static inline void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, - const struct inet6_skb_parm *parm) -{ - icmp6_send(skb, type, code, info, NULL, parm); -} -static inline int inet6_register_icmp_sender(ip6_icmp_send_t *fn) -{ - BUILD_BUG_ON(fn != icmp6_send); - return 0; -} -static inline int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn) -{ - BUILD_BUG_ON(fn != icmp6_send); - return 0; -} -#else -extern void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, - const struct inet6_skb_parm *parm); -extern int inet6_register_icmp_sender(ip6_icmp_send_t *fn); -extern int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn); -#endif static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) { - __icmpv6_send(skb, type, code, info, IP6CB(skb)); + icmp6_send(skb, type, code, info, NULL, IP6CB(skb)); } int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, @@ -58,7 +33,7 @@ void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info); static inline void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) { struct inet6_skb_parm parm = { 0 }; - __icmpv6_send(skb_in, type, code, info, &parm); + icmp6_send(skb_in, type, code, info, NULL, &parm); } #endif -- cgit v1.2.3 From b2c981e7c4653e3c276d5f3a0e012711d3596418 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Wed, 25 Mar 2026 13:08:52 +0100 Subject: netfilter: remove nf_ipv6_ops and use direct function calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As IPv6 is built-in only, nf_ipv6_ops can be removed completely as it is not longer necessary. Convert all nf_ipv6_ops usage to direct function calls instead. In addition, remove the ipv6_netfilter_init/fini() functions as they are not necessary any longer. Signed-off-by: Fernando Fernandez Mancera Tested-by: Ricardo B. Marlière Link: https://patch.msgid.link/20260325120928.15848-12-fmancera@suse.de Signed-off-by: Jakub Kicinski --- include/linux/netfilter_ipv6.h | 102 +++-------------------------------------- 1 file changed, 6 insertions(+), 96 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index 61aa48f46dd7..5ce45b6d890f 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -34,59 +34,13 @@ struct ip6_rt_info { struct nf_queue_entry; struct nf_bridge_frag_data; -/* - * Hook functions for ipv6 to allow xt_* modules to be built-in even - * if IPv6 is a module. - */ -struct nf_ipv6_ops { -#if IS_MODULE(CONFIG_IPV6) - int (*chk_addr)(struct net *net, const struct in6_addr *addr, - const struct net_device *dev, int strict); - int (*route_me_harder)(struct net *net, struct sock *sk, struct sk_buff *skb); - int (*dev_get_saddr)(struct net *net, const struct net_device *dev, - const struct in6_addr *daddr, unsigned int srcprefs, - struct in6_addr *saddr); - int (*route)(struct net *net, struct dst_entry **dst, struct flowi *fl, - bool strict); - u32 (*cookie_init_sequence)(const struct ipv6hdr *iph, - const struct tcphdr *th, u16 *mssp); - int (*cookie_v6_check)(const struct ipv6hdr *iph, - const struct tcphdr *th); -#endif - void (*route_input)(struct sk_buff *skb); - int (*fragment)(struct net *net, struct sock *sk, struct sk_buff *skb, - int (*output)(struct net *, struct sock *, struct sk_buff *)); - int (*reroute)(struct sk_buff *skb, const struct nf_queue_entry *entry); -#if IS_MODULE(CONFIG_IPV6) - int (*br_fragment)(struct net *net, struct sock *sk, - struct sk_buff *skb, - struct nf_bridge_frag_data *data, - int (*output)(struct net *, struct sock *sk, - const struct nf_bridge_frag_data *data, - struct sk_buff *)); -#endif -}; - #ifdef CONFIG_NETFILTER #include -extern const struct nf_ipv6_ops __rcu *nf_ipv6_ops; -static inline const struct nf_ipv6_ops *nf_get_ipv6_ops(void) -{ - return rcu_dereference(nf_ipv6_ops); -} - static inline int nf_ipv6_chk_addr(struct net *net, const struct in6_addr *addr, const struct net_device *dev, int strict) { -#if IS_MODULE(CONFIG_IPV6) - const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops(); - - if (!v6_ops) - return 1; - - return v6_ops->chk_addr(net, addr, dev, strict); -#elif IS_BUILTIN(CONFIG_IPV6) +#if IS_ENABLED(CONFIG_IPV6) return ipv6_chk_addr(net, addr, dev, strict); #else return 1; @@ -99,15 +53,7 @@ int __nf_ip6_route(struct net *net, struct dst_entry **dst, static inline int nf_ip6_route(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict) { -#if IS_MODULE(CONFIG_IPV6) - const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); - - if (v6ops) - return v6ops->route(net, dst, fl, strict); - - return -EHOSTUNREACH; -#endif -#if IS_BUILTIN(CONFIG_IPV6) +#if IS_ENABLED(CONFIG_IPV6) return __nf_ip6_route(net, dst, fl, strict); #else return -EHOSTUNREACH; @@ -129,14 +75,7 @@ static inline int nf_br_ip6_fragment(struct net *net, struct sock *sk, const struct nf_bridge_frag_data *data, struct sk_buff *)) { -#if IS_MODULE(CONFIG_IPV6) - const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops(); - - if (!v6_ops) - return 1; - - return v6_ops->br_fragment(net, sk, skb, data, output); -#elif IS_BUILTIN(CONFIG_IPV6) +#if IS_ENABLED(CONFIG_IPV6) return br_ip6_fragment(net, sk, skb, data, output); #else return 1; @@ -147,14 +86,7 @@ int ip6_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb); static inline int nf_ip6_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb) { -#if IS_MODULE(CONFIG_IPV6) - const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops(); - - if (!v6_ops) - return -EHOSTUNREACH; - - return v6_ops->route_me_harder(net, sk, skb); -#elif IS_BUILTIN(CONFIG_IPV6) +#if IS_ENABLED(CONFIG_IPV6) return ip6_route_me_harder(net, sk, skb); #else return -EHOSTUNREACH; @@ -165,15 +97,8 @@ static inline u32 nf_ipv6_cookie_init_sequence(const struct ipv6hdr *iph, const struct tcphdr *th, u16 *mssp) { -#if IS_ENABLED(CONFIG_SYN_COOKIES) -#if IS_MODULE(CONFIG_IPV6) - const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops(); - - if (v6_ops) - return v6_ops->cookie_init_sequence(iph, th, mssp); -#elif IS_BUILTIN(CONFIG_IPV6) +#if IS_ENABLED(CONFIG_IPV6) && IS_ENABLED(CONFIG_SYN_COOKIES) return __cookie_v6_init_sequence(iph, th, mssp); -#endif #endif return 0; } @@ -181,15 +106,8 @@ static inline u32 nf_ipv6_cookie_init_sequence(const struct ipv6hdr *iph, static inline int nf_cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th) { -#if IS_ENABLED(CONFIG_SYN_COOKIES) -#if IS_MODULE(CONFIG_IPV6) - const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops(); - - if (v6_ops) - return v6_ops->cookie_v6_check(iph, th); -#elif IS_BUILTIN(CONFIG_IPV6) +#if IS_ENABLED(CONFIG_IPV6) && IS_ENABLED(CONFIG_SYN_COOKIES) return __cookie_v6_check(iph, th); -#endif #endif return 0; } @@ -198,14 +116,6 @@ __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u_int8_t protocol); int nf_ip6_check_hbh_len(struct sk_buff *skb, u32 *plen); - -int ipv6_netfilter_init(void); -void ipv6_netfilter_fini(void); - -#else /* CONFIG_NETFILTER */ -static inline int ipv6_netfilter_init(void) { return 0; } -static inline void ipv6_netfilter_fini(void) { return; } -static inline const struct nf_ipv6_ops *nf_get_ipv6_ops(void) { return NULL; } #endif /* CONFIG_NETFILTER */ #endif /*__LINUX_IP6_NETFILTER_H*/ -- cgit v1.2.3 From 55b6dd54c3bcb6edf7ad630a4510759f4b0cf1cd Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 13 Jan 2026 13:37:39 -0500 Subject: nfsd/sunrpc: add svc_rqst->rq_private pointer and remove rq_lease_breaker rq_lease_breaker has always been a NFSv4 specific layering violation in svc_rqst. The reason it's there though is that we need a place that is thread-local, and accessible from the svc_rqst pointer. Add a new rq_private pointer to struct svc_rqst. This is intended for use by the threads that are handling the service. sunrpc code doesn't touch it. In nfsd, define a new struct nfsd_thread_local_info. nfsd declares one of these on the stack and puts a pointer to it in rq_private. Add a new ntli_lease_breaker field to the new struct and convert all of the places that access rq_lease_breaker to use the new field instead. Signed-off-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 4dc14c7a711b..ab8237ba9596 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -175,6 +175,9 @@ static inline unsigned long svc_serv_maxpages(const struct svc_serv *serv) /* * The context of a single thread, including the request currently being * processed. + * + * RPC programs are free to use rq_private to stash thread-local information. + * The sunrpc layer will not access it. */ struct svc_rqst { struct list_head rq_all; /* all threads list */ @@ -251,7 +254,7 @@ struct svc_rqst { unsigned long bc_to_initval; unsigned int bc_to_retries; unsigned int rq_status_counter; /* RPC processing counter */ - void **rq_lease_breaker; /* The v4 client breaking a lease */ + void *rq_private; /* For use by the service thread */ }; /* bits for rq_flags */ -- cgit v1.2.3 From 322ecd01bf8ad7e0da21e174679aff1759e68b2c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 13 Jan 2026 13:37:40 -0500 Subject: nfsd/sunrpc: move rq_cachetype into struct nfsd_thread_local_info The svc_rqst->rq_cachetype field is only accessed by nfsd. Move it into the nfsd_thread_local_info instead. Signed-off-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index ab8237ba9596..62152e4f3bcc 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -218,7 +218,6 @@ struct svc_rqst { u32 rq_vers; /* program version */ u32 rq_proc; /* procedure number */ u32 rq_prot; /* IP protocol */ - int rq_cachetype; /* catering to nfsd */ unsigned long rq_flags; /* flags field */ ktime_t rq_qtime; /* enqueue time */ -- cgit v1.2.3 From 153b9e025308417d167332c93e1bcc11174178de Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:23 -0500 Subject: lockd: Relocate and rename nlm_drop_reply The nlm_drop_reply status code is internal to the kernel's lockd implementation and must never appear on the wire. Its previous location in xdr.h grouped it with legitimate NLM protocol status codes, obscuring this critical distinction. Relocate the definition to lockd.h with a comment block for internal status codes, and rename to nlm__int__drop_reply to make its internal-only nature explicit. This prepares for adding additional internal status codes in subsequent patches. Signed-off-by: Chuck Lever --- include/linux/lockd/lockd.h | 6 ++++++ include/linux/lockd/xdr.h | 2 -- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 330e38776bb2..fdefec39553f 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -38,6 +38,12 @@ */ #define LOCKD_DFLT_TIMEO 10 +/* + * Internal-use status codes, not to be placed on the wire. + * Version handlers translate these to appropriate wire values. + */ +#define nlm__int__drop_reply cpu_to_be32(30000) + /* * Lockd host handle (used both by the client and server personality). */ diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h index 17d53165d9f2..292e4e38d17d 100644 --- a/include/linux/lockd/xdr.h +++ b/include/linux/lockd/xdr.h @@ -33,8 +33,6 @@ struct svc_rqst; #define nlm_lck_blocked cpu_to_be32(NLM_LCK_BLOCKED) #define nlm_lck_denied_grace_period cpu_to_be32(NLM_LCK_DENIED_GRACE_PERIOD) -#define nlm_drop_reply cpu_to_be32(30000) - /* Lock info passed via NLM */ struct nlm_lock { char * caller; -- cgit v1.2.3 From 9e0d0c61940796893e0c2200cdc7be0684218238 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:24 -0500 Subject: lockd: Introduce nlm__int__deadlock The use of CONFIG_LOCKD_V4 in combination with a later cast_status() in the NLMv3 code is difficult to reason about. Instead, replace the use of nlm_deadlock with an implementation-defined status value that version-specific code translates appropriately. The new approach establishes a translation boundary: generic lockd code returns nlm__int__deadlock when posix_lock_file() yields -EDEADLK. Version-specific handlers (svc4proc.c for NLMv4, svcproc.c for NLMv3) translate this internal status to the appropriate wire protocol value. NLMv4 maps to nlm4_deadlock; NLMv3 maps to nlm_lck_denied (since NLMv3 lacks a deadlock-specific status code). Later this modification will also remove the need to include NLMv4 headers in NLMv3 and generic code. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/lockd/lockd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index fdefec39553f..793691912137 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -43,6 +43,7 @@ * Version handlers translate these to appropriate wire values. */ #define nlm__int__drop_reply cpu_to_be32(30000) +#define nlm__int__deadlock cpu_to_be32(30001) /* * Lockd host handle (used both by the client and server personality). -- cgit v1.2.3 From 7db001e03d7a668ca6c3789fee42a24236ca90f6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:25 -0500 Subject: lockd: Have nlm_fopen() return errno values The nlm_fopen() function is part of the API between nfsd and lockd. Currently its return value is an on-the-wire NLM status code. But that forces NFSD to include NLM wire protocol definitions despite having no other dependency on the NLM wire protocol. In addition, a CONFIG_LOCKD_V4 Kconfig symbol appears in the middle of NFSD source code. Refactor: Let's not use on-the-wire values as part of a high-level API between two Linux kernel modules. That's what we have errno for, right? And, instead of simply moving the CONFIG_LOCKD_V4 check, we can get rid of it entirely and let the decision of what actual NLM status code goes on the wire to be left up to NLM version-specific code. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/lockd/bind.h | 8 +++----- include/linux/lockd/lockd.h | 2 ++ 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index c53c81242e72..2f5dd9e943ee 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -26,11 +26,9 @@ struct rpc_clnt; * This is the set of functions for lockd->nfsd communication */ struct nlmsvc_binding { - __be32 (*fopen)(struct svc_rqst *, - struct nfs_fh *, - struct file **, - int mode); - void (*fclose)(struct file *); + int (*fopen)(struct svc_rqst *rqstp, struct nfs_fh *f, + struct file **filp, int flags); + void (*fclose)(struct file *filp); }; extern const struct nlmsvc_binding *nlmsvc_ops; diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 793691912137..195e6ce28f6e 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -44,6 +44,8 @@ */ #define nlm__int__drop_reply cpu_to_be32(30000) #define nlm__int__deadlock cpu_to_be32(30001) +#define nlm__int__stale_fh cpu_to_be32(30002) +#define nlm__int__failed cpu_to_be32(30003) /* * Lockd host handle (used both by the client and server personality). -- cgit v1.2.3 From efb5b15e3b78f5644dd2d4ddec8880e0c9aa5b5f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:26 -0500 Subject: lockd: Relocate nlmsvc_unlock API declarations The nlmsvc_unlock_all_by_sb() and nlmsvc_unlock_all_by_ip() functions are part of lockd's external API, consumed by other kernel subsystems. Their declarations currently reside in linux/lockd/lockd.h alongside internal implementation details, which blurs the boundary between lockd's public interface and its private internals. Moving these declarations to linux/lockd/bind.h groups them with other external API functions and makes the separation explicit. This clarifies which functions are intended for external use and reduces the risk of internal implementation details leaking into the public API surface. Build-tested with allyesconfig; no functional changes. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/lockd/bind.h | 7 +++++++ include/linux/lockd/lockd.h | 6 ------ 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index 2f5dd9e943ee..82eca0a13ccc 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -21,6 +21,7 @@ struct svc_rqst; struct rpc_task; struct rpc_clnt; +struct super_block; /* * This is the set of functions for lockd->nfsd communication @@ -80,4 +81,10 @@ extern int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl, vo extern int lockd_up(struct net *net, const struct cred *cred); extern void lockd_down(struct net *net); +/* + * Cluster failover support + */ +int nlmsvc_unlock_all_by_sb(struct super_block *sb); +int nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr); + #endif /* LINUX_LOCKD_BIND_H */ diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 195e6ce28f6e..0d883f48ec21 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -311,12 +311,6 @@ void nlmsvc_mark_resources(struct net *); void nlmsvc_free_host_resources(struct nlm_host *); void nlmsvc_invalidate_all(void); -/* - * Cluster failover support - */ -int nlmsvc_unlock_all_by_sb(struct super_block *sb); -int nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr); - static inline struct file *nlmsvc_file_file(const struct nlm_file *file) { return file->f_file[O_RDONLY] ? -- cgit v1.2.3 From 840621fd2ff23ada8b9262d90477e75232566e6b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:27 -0500 Subject: NFS: Use nlmclnt_shutdown_rpc_clnt() to safely shut down NLM A race condition exists in shutdown_store() when writing to the sysfs "shutdown" file concurrently with nlm_shutdown_hosts_net(). Without synchronization, the following sequence can occur: 1. shutdown_store() reads server->nlm_host (non-NULL) 2. nlm_shutdown_hosts_net() acquires nlm_host_mutex, calls rpc_shutdown_client(), sets h_rpcclnt to NULL, and potentially frees the host via nlm_gc_hosts() 3. shutdown_store() dereferences the now-stale or freed host Introduce nlmclnt_shutdown_rpc_clnt(), which acquires nlm_host_mutex before accessing h_rpcclnt. This synchronizes with nlm_shutdown_hosts_net() and ensures the rpc_clnt pointer remains valid during the shutdown operation. This change also improves API layering: NFS client code no longer needs to include the internal lockd header to access nlm_host fields. The new helper resides in bind.h alongside other public lockd interfaces. Reported-by: Jeff Layton Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/lockd/bind.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index 82eca0a13ccc..39c124dcb19c 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -57,6 +57,7 @@ struct nlmclnt_initdata { extern struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init); extern void nlmclnt_done(struct nlm_host *host); extern struct rpc_clnt *nlmclnt_rpc_clnt(struct nlm_host *host); +extern void nlmclnt_shutdown_rpc_clnt(struct nlm_host *host); /* * NLM client operations provide a means to modify RPC processing of NLM -- cgit v1.2.3 From f4d5f8caadd858f11b21e8a9e5c85290fc21a568 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:28 -0500 Subject: lockd: Move xdr4.h from include/linux/lockd/ to fs/lockd/ The xdr4.h header declares NLMv4-specific XDR encoder/decoder functions and error codes that are used exclusively within the lockd subsystem. Moving it from include/linux/lockd/ to fs/lockd/ clarifies the intended scope of these declarations and prevents external code from depending on lockd-internal interfaces. This change reduces the public API surface of the lockd module and makes it easier to refactor NLMv4 internals without risk of breaking out-of-tree consumers. The header's contents are implementation details of the NLMv4 wire protocol handling, not a contract with other kernel subsystems. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/lockd/bind.h | 3 --- include/linux/lockd/lockd.h | 7 ++++--- include/linux/lockd/xdr4.h | 43 ------------------------------------------- 3 files changed, 4 insertions(+), 49 deletions(-) delete mode 100644 include/linux/lockd/xdr4.h (limited to 'include/linux') diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index 39c124dcb19c..077da0696f12 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -13,9 +13,6 @@ #include /* need xdr-encoded error codes too, so... */ #include -#ifdef CONFIG_LOCKD_V4 -#include -#endif /* Dummy declarations */ struct svc_rqst; diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 0d883f48ec21..46f244141645 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -22,9 +22,6 @@ #include #include #include -#ifdef CONFIG_LOCKD_V4 -#include -#endif #include #include @@ -235,6 +232,10 @@ int nlmclnt_reclaim(struct nlm_host *, struct file_lock *, struct nlm_rqst *); void nlmclnt_next_cookie(struct nlm_cookie *); +#ifdef CONFIG_LOCKD_V4 +extern const struct rpc_version nlm_version4; +#endif + /* * Host cache */ diff --git a/include/linux/lockd/xdr4.h b/include/linux/lockd/xdr4.h deleted file mode 100644 index 72831e35dca3..000000000000 --- a/include/linux/lockd/xdr4.h +++ /dev/null @@ -1,43 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * linux/include/linux/lockd/xdr4.h - * - * XDR types for the NLM protocol - * - * Copyright (C) 1996 Olaf Kirch - */ - -#ifndef LOCKD_XDR4_H -#define LOCKD_XDR4_H - -#include -#include -#include -#include - -/* error codes new to NLMv4 */ -#define nlm4_deadlock cpu_to_be32(NLM_DEADLCK) -#define nlm4_rofs cpu_to_be32(NLM_ROFS) -#define nlm4_stale_fh cpu_to_be32(NLM_STALE_FH) -#define nlm4_fbig cpu_to_be32(NLM_FBIG) -#define nlm4_failed cpu_to_be32(NLM_FAILED) - -void nlm4svc_set_file_lock_range(struct file_lock *fl, u64 off, u64 len); -bool nlm4svc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr); - -bool nlm4svc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr); - -extern const struct rpc_version nlm_version4; - -#endif /* LOCKD_XDR4_H */ -- cgit v1.2.3 From 4db2f8a016dc9f9b357bfbf5c507c2582bb36730 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:29 -0500 Subject: lockd: Move share.h from include/linux/lockd/ to fs/lockd/ The share.h header defines struct nlm_share and declares the DOS share management functions used by the NLM server to implement NLM_SHARE and NLM_UNSHARE operations. These interfaces are used exclusively within the lockd subsystem. A git grep search confirms no external code references them. Relocating this header from include/linux/lockd/ to fs/lockd/ narrows the public API surface of the lockd module. Out-of-tree code cannot depend on these internal interfaces after this change. Future refactoring of the share management implementation thus requires no consideration of external consumers. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/lockd/lockd.h | 2 ++ include/linux/lockd/share.h | 32 -------------------------------- 2 files changed, 2 insertions(+), 32 deletions(-) delete mode 100644 include/linux/lockd/share.h (limited to 'include/linux') diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 46f244141645..eebcecd12fae 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -155,6 +155,8 @@ struct nlm_rqst { void * a_callback_data; /* sent to nlmclnt_operations callbacks */ }; +struct nlm_share; + /* * This struct describes a file held open by lockd on behalf of * an NFS client. diff --git a/include/linux/lockd/share.h b/include/linux/lockd/share.h deleted file mode 100644 index 1f18a9faf645..000000000000 --- a/include/linux/lockd/share.h +++ /dev/null @@ -1,32 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * linux/include/linux/lockd/share.h - * - * DOS share management for lockd. - * - * Copyright (C) 1996, Olaf Kirch - */ - -#ifndef LINUX_LOCKD_SHARE_H -#define LINUX_LOCKD_SHARE_H - -/* - * DOS share for a specific file - */ -struct nlm_share { - struct nlm_share * s_next; /* linked list */ - struct nlm_host * s_host; /* client host */ - struct nlm_file * s_file; /* shared file */ - struct xdr_netobj s_owner; /* owner handle */ - u32 s_access; /* access mode */ - u32 s_mode; /* deny mode */ -}; - -__be32 nlmsvc_share_file(struct nlm_host *, struct nlm_file *, - struct nlm_args *); -__be32 nlmsvc_unshare_file(struct nlm_host *, struct nlm_file *, - struct nlm_args *); -void nlmsvc_traverse_shares(struct nlm_host *, struct nlm_file *, - nlm_host_match_fn_t); - -#endif /* LINUX_LOCKD_SHARE_H */ -- cgit v1.2.3 From 2c562c6e6715619ce34bb37d8a0a5e40fdcc7a44 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:30 -0500 Subject: lockd: Relocate include/linux/lockd/lockd.h Headers placed in include/linux/ form part of the kernel's internal API and signal to subsystem maintainers that other parts of the kernel may depend on them. By moving lockd.h into fs/lockd/, lockd becomes a more self-contained module whose internal interfaces are clearly distinguished from its public contract with the rest of the kernel. This relocation addresses a long-standing XXX comment in the header itself that acknowledged the file's misplacement. Future changes to lockd internals can now proceed with confidence that external consumers are not inadvertently coupled to implementation details. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/lockd/lockd.h | 401 -------------------------------------------- 1 file changed, 401 deletions(-) delete mode 100644 include/linux/lockd/lockd.h (limited to 'include/linux') diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h deleted file mode 100644 index eebcecd12fae..000000000000 --- a/include/linux/lockd/lockd.h +++ /dev/null @@ -1,401 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * linux/include/linux/lockd/lockd.h - * - * General-purpose lockd include file. - * - * Copyright (C) 1996 Olaf Kirch - */ - -#ifndef LINUX_LOCKD_LOCKD_H -#define LINUX_LOCKD_LOCKD_H - -/* XXX: a lot of this should really be under fs/lockd. */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Version string - */ -#define LOCKD_VERSION "0.5" - -/* - * Default timeout for RPC calls (seconds) - */ -#define LOCKD_DFLT_TIMEO 10 - -/* - * Internal-use status codes, not to be placed on the wire. - * Version handlers translate these to appropriate wire values. - */ -#define nlm__int__drop_reply cpu_to_be32(30000) -#define nlm__int__deadlock cpu_to_be32(30001) -#define nlm__int__stale_fh cpu_to_be32(30002) -#define nlm__int__failed cpu_to_be32(30003) - -/* - * Lockd host handle (used both by the client and server personality). - */ -struct nlm_host { - struct hlist_node h_hash; /* doubly linked list */ - struct sockaddr_storage h_addr; /* peer address */ - size_t h_addrlen; - struct sockaddr_storage h_srcaddr; /* our address (optional) */ - size_t h_srcaddrlen; - struct rpc_clnt *h_rpcclnt; /* RPC client to talk to peer */ - char *h_name; /* remote hostname */ - u32 h_version; /* interface version */ - unsigned short h_proto; /* transport proto */ - unsigned short h_reclaiming : 1, - h_server : 1, /* server side, not client side */ - h_noresvport : 1, - h_inuse : 1; - wait_queue_head_t h_gracewait; /* wait while reclaiming */ - struct rw_semaphore h_rwsem; /* Reboot recovery lock */ - u32 h_state; /* pseudo-state counter */ - u32 h_nsmstate; /* true remote NSM state */ - u32 h_pidcount; /* Pseudopids */ - refcount_t h_count; /* reference count */ - struct mutex h_mutex; /* mutex for pmap binding */ - unsigned long h_nextrebind; /* next portmap call */ - unsigned long h_expires; /* eligible for GC */ - struct list_head h_lockowners; /* Lockowners for the client */ - spinlock_t h_lock; - struct list_head h_granted; /* Locks in GRANTED state */ - struct list_head h_reclaim; /* Locks in RECLAIM state */ - struct nsm_handle *h_nsmhandle; /* NSM status handle */ - char *h_addrbuf; /* address eyecatcher */ - struct net *net; /* host net */ - const struct cred *h_cred; - char nodename[UNX_MAXNODENAME + 1]; - const struct nlmclnt_operations *h_nlmclnt_ops; /* Callback ops for NLM users */ -}; - -/* - * The largest string sm_addrbuf should hold is a full-size IPv6 address - * (no "::" anywhere) with a scope ID. The buffer size is computed to - * hold eight groups of colon-separated four-hex-digit numbers, a - * percent sign, a scope id (at most 32 bits, in decimal), and NUL. - */ -#define NSM_ADDRBUF ((8 * 4 + 7) + (1 + 10) + 1) - -struct nsm_handle { - struct list_head sm_link; - refcount_t sm_count; - char *sm_mon_name; - char *sm_name; - struct sockaddr_storage sm_addr; - size_t sm_addrlen; - unsigned int sm_monitored : 1, - sm_sticky : 1; /* don't unmonitor */ - struct nsm_private sm_priv; - char sm_addrbuf[NSM_ADDRBUF]; -}; - -/* - * Rigorous type checking on sockaddr type conversions - */ -static inline struct sockaddr *nlm_addr(const struct nlm_host *host) -{ - return (struct sockaddr *)&host->h_addr; -} - -static inline struct sockaddr *nlm_srcaddr(const struct nlm_host *host) -{ - return (struct sockaddr *)&host->h_srcaddr; -} - -/* - * Map an fl_owner_t into a unique 32-bit "pid" - */ -struct nlm_lockowner { - struct list_head list; - refcount_t count; - - struct nlm_host *host; - fl_owner_t owner; - uint32_t pid; -}; - -/* - * This is the representation of a blocked client lock. - */ -struct nlm_wait { - struct list_head b_list; /* linked list */ - wait_queue_head_t b_wait; /* where to wait on */ - struct nlm_host *b_host; - struct file_lock *b_lock; /* local file lock */ - __be32 b_status; /* grant callback status */ -}; - -/* - * Memory chunk for NLM client RPC request. - */ -#define NLMCLNT_OHSIZE ((__NEW_UTS_LEN) + 10u) -struct nlm_rqst { - refcount_t a_count; - unsigned int a_flags; /* initial RPC task flags */ - struct nlm_host * a_host; /* host handle */ - struct nlm_args a_args; /* arguments */ - struct nlm_res a_res; /* result */ - struct nlm_block * a_block; - unsigned int a_retries; /* Retry count */ - u8 a_owner[NLMCLNT_OHSIZE]; - void * a_callback_data; /* sent to nlmclnt_operations callbacks */ -}; - -struct nlm_share; - -/* - * This struct describes a file held open by lockd on behalf of - * an NFS client. - */ -struct nlm_file { - struct hlist_node f_list; /* linked list */ - struct nfs_fh f_handle; /* NFS file handle */ - struct file * f_file[2]; /* VFS file pointers, - indexed by O_ flags */ - struct nlm_share * f_shares; /* DOS shares */ - struct list_head f_blocks; /* blocked locks */ - unsigned int f_locks; /* guesstimate # of locks */ - unsigned int f_count; /* reference count */ - struct mutex f_mutex; /* avoid concurrent access */ -}; - -/* - * This is a server block (i.e. a lock requested by some client which - * couldn't be granted because of a conflicting lock). - */ -#define NLM_NEVER (~(unsigned long) 0) -/* timeout on non-blocking call: */ -#define NLM_TIMEOUT (7 * HZ) - -struct nlm_block { - struct kref b_count; /* Reference count */ - struct list_head b_list; /* linked list of all blocks */ - struct list_head b_flist; /* linked list (per file) */ - struct nlm_rqst * b_call; /* RPC args & callback info */ - struct svc_serv * b_daemon; /* NLM service */ - struct nlm_host * b_host; /* host handle for RPC clnt */ - unsigned long b_when; /* next re-xmit */ - unsigned int b_id; /* block id */ - unsigned char b_granted; /* VFS granted lock */ - struct nlm_file * b_file; /* file in question */ - struct cache_req * b_cache_req; /* deferred request handling */ - struct cache_deferred_req * b_deferred_req; - unsigned int b_flags; /* block flags */ -#define B_QUEUED 1 /* lock queued */ -#define B_GOT_CALLBACK 2 /* got lock or conflicting lock */ -#define B_TIMED_OUT 4 /* filesystem too slow to respond */ -}; - -/* - * Global variables - */ -extern const struct rpc_program nlm_program; -extern const struct svc_procedure nlmsvc_procedures[24]; -#ifdef CONFIG_LOCKD_V4 -extern const struct svc_procedure nlmsvc_procedures4[24]; -#endif -extern int nlmsvc_grace_period; -extern unsigned long nlm_timeout; -extern bool nsm_use_hostnames; -extern u32 nsm_local_state; - -extern struct timer_list nlmsvc_retry; - -/* - * Lockd client functions - */ -struct nlm_rqst * nlm_alloc_call(struct nlm_host *host); -int nlm_async_call(struct nlm_rqst *, u32, const struct rpc_call_ops *); -int nlm_async_reply(struct nlm_rqst *, u32, const struct rpc_call_ops *); -void nlmclnt_release_call(struct nlm_rqst *); -void nlmclnt_prepare_block(struct nlm_wait *block, struct nlm_host *host, - struct file_lock *fl); -void nlmclnt_queue_block(struct nlm_wait *block); -__be32 nlmclnt_dequeue_block(struct nlm_wait *block); -int nlmclnt_wait(struct nlm_wait *block, struct nlm_rqst *req, long timeout); -__be32 nlmclnt_grant(const struct sockaddr *addr, - const struct nlm_lock *lock); -void nlmclnt_recovery(struct nlm_host *); -int nlmclnt_reclaim(struct nlm_host *, struct file_lock *, - struct nlm_rqst *); -void nlmclnt_next_cookie(struct nlm_cookie *); - -#ifdef CONFIG_LOCKD_V4 -extern const struct rpc_version nlm_version4; -#endif - -/* - * Host cache - */ -struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, - const size_t salen, - const unsigned short protocol, - const u32 version, - const char *hostname, - int noresvport, - struct net *net, - const struct cred *cred); -void nlmclnt_release_host(struct nlm_host *); -struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, - const char *hostname, - const size_t hostname_len); -void nlmsvc_release_host(struct nlm_host *); -struct rpc_clnt * nlm_bind_host(struct nlm_host *); -void nlm_rebind_host(struct nlm_host *); -struct nlm_host * nlm_get_host(struct nlm_host *); -void nlm_shutdown_hosts(void); -void nlm_shutdown_hosts_net(struct net *net); -void nlm_host_rebooted(const struct net *net, - const struct nlm_reboot *); - -/* - * Host monitoring - */ -int nsm_monitor(const struct nlm_host *host); -void nsm_unmonitor(const struct nlm_host *host); - -struct nsm_handle *nsm_get_handle(const struct net *net, - const struct sockaddr *sap, - const size_t salen, - const char *hostname, - const size_t hostname_len); -struct nsm_handle *nsm_reboot_lookup(const struct net *net, - const struct nlm_reboot *info); -void nsm_release(struct nsm_handle *nsm); - -/* - * This is used in garbage collection and resource reclaim - * A return value != 0 means destroy the lock/block/share - */ -typedef int (*nlm_host_match_fn_t)(void *cur, struct nlm_host *ref); - -/* - * Server-side lock handling - */ -int lock_to_openmode(struct file_lock *); -__be32 nlmsvc_lock(struct svc_rqst *, struct nlm_file *, - struct nlm_host *, struct nlm_lock *, int, - struct nlm_cookie *, int); -__be32 nlmsvc_unlock(struct net *net, struct nlm_file *, struct nlm_lock *); -__be32 nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, - struct nlm_host *host, struct nlm_lock *lock, - struct nlm_lock *conflock); -__be32 nlmsvc_cancel_blocked(struct net *net, struct nlm_file *, struct nlm_lock *); -void nlmsvc_retry_blocked(struct svc_rqst *rqstp); -void nlmsvc_traverse_blocks(struct nlm_host *, struct nlm_file *, - nlm_host_match_fn_t match); -void nlmsvc_grant_reply(struct nlm_cookie *, __be32); -void nlmsvc_release_call(struct nlm_rqst *); -void nlmsvc_locks_init_private(struct file_lock *, struct nlm_host *, pid_t); - -/* - * File handling for the server personality - */ -__be32 nlm_lookup_file(struct svc_rqst *, struct nlm_file **, - struct nlm_lock *); -void nlm_release_file(struct nlm_file *); -void nlmsvc_put_lockowner(struct nlm_lockowner *); -void nlmsvc_release_lockowner(struct nlm_lock *); -void nlmsvc_mark_resources(struct net *); -void nlmsvc_free_host_resources(struct nlm_host *); -void nlmsvc_invalidate_all(void); - -static inline struct file *nlmsvc_file_file(const struct nlm_file *file) -{ - return file->f_file[O_RDONLY] ? - file->f_file[O_RDONLY] : file->f_file[O_WRONLY]; -} - -static inline struct inode *nlmsvc_file_inode(struct nlm_file *file) -{ - return file_inode(nlmsvc_file_file(file)); -} - -static inline bool -nlmsvc_file_cannot_lock(const struct nlm_file *file) -{ - return exportfs_cannot_lock(nlmsvc_file_file(file)->f_path.dentry->d_sb->s_export_op); -} - -static inline int __nlm_privileged_request4(const struct sockaddr *sap) -{ - const struct sockaddr_in *sin = (struct sockaddr_in *)sap; - - if (ntohs(sin->sin_port) > 1023) - return 0; - - return ipv4_is_loopback(sin->sin_addr.s_addr); -} - -#if IS_ENABLED(CONFIG_IPV6) -static inline int __nlm_privileged_request6(const struct sockaddr *sap) -{ - const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; - - if (ntohs(sin6->sin6_port) > 1023) - return 0; - - if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_MAPPED) - return ipv4_is_loopback(sin6->sin6_addr.s6_addr32[3]); - - return ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LOOPBACK; -} -#else /* IS_ENABLED(CONFIG_IPV6) */ -static inline int __nlm_privileged_request6(const struct sockaddr *sap) -{ - return 0; -} -#endif /* IS_ENABLED(CONFIG_IPV6) */ - -/* - * Ensure incoming requests are from local privileged callers. - * - * Return TRUE if sender is local and is connecting via a privileged port; - * otherwise return FALSE. - */ -static inline int nlm_privileged_requester(const struct svc_rqst *rqstp) -{ - const struct sockaddr *sap = svc_addr(rqstp); - - switch (sap->sa_family) { - case AF_INET: - return __nlm_privileged_request4(sap); - case AF_INET6: - return __nlm_privileged_request6(sap); - default: - return 0; - } -} - -/* - * Compare two NLM locks. - * When the second lock is of type F_UNLCK, this acts like a wildcard. - */ -static inline int nlm_compare_locks(const struct file_lock *fl1, - const struct file_lock *fl2) -{ - return file_inode(fl1->c.flc_file) == file_inode(fl2->c.flc_file) - && fl1->c.flc_pid == fl2->c.flc_pid - && fl1->c.flc_owner == fl2->c.flc_owner - && fl1->fl_start == fl2->fl_start - && fl1->fl_end == fl2->fl_end - &&(fl1->c.flc_type == fl2->c.flc_type || fl2->c.flc_type == F_UNLCK); -} - -extern const struct lock_manager_operations nlmsvc_lock_operations; - -#endif /* LINUX_LOCKD_LOCKD_H */ -- cgit v1.2.3 From 236f3171ac690f632e13d391f47c68c3a8519bd2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:31 -0500 Subject: lockd: Remove lockd/debug.h The lockd include structure has unnecessary indirection. The header include/linux/lockd/debug.h is consumed only by fs/lockd/lockd.h, creating an extra compilation dependency and making the code harder to navigate. Fold the debug.h definitions directly into lockd.h and remove the now-redundant header. This reduces the include tree depth and makes the debug-related definitions easier to find when working on lockd internals. Build-tested with lockd built as module and built-in. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/lockd/debug.h | 40 ---------------------------------------- 1 file changed, 40 deletions(-) delete mode 100644 include/linux/lockd/debug.h (limited to 'include/linux') diff --git a/include/linux/lockd/debug.h b/include/linux/lockd/debug.h deleted file mode 100644 index eede2ab5246f..000000000000 --- a/include/linux/lockd/debug.h +++ /dev/null @@ -1,40 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * linux/include/linux/lockd/debug.h - * - * Debugging stuff. - * - * Copyright (C) 1996 Olaf Kirch - */ - -#ifndef LINUX_LOCKD_DEBUG_H -#define LINUX_LOCKD_DEBUG_H - -#include - -/* - * Enable lockd debugging. - * Requires RPC_DEBUG. - */ -#undef ifdebug -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define ifdebug(flag) if (unlikely(nlm_debug & NLMDBG_##flag)) -#else -# define ifdebug(flag) if (0) -#endif - -/* - * Debug flags - */ -#define NLMDBG_SVC 0x0001 -#define NLMDBG_CLIENT 0x0002 -#define NLMDBG_CLNTLOCK 0x0004 -#define NLMDBG_SVCLOCK 0x0008 -#define NLMDBG_MONITOR 0x0010 -#define NLMDBG_CLNTSUBS 0x0020 -#define NLMDBG_SVCSUBS 0x0040 -#define NLMDBG_HOSTCACHE 0x0080 -#define NLMDBG_XDR 0x0100 -#define NLMDBG_ALL 0x7fff - -#endif /* LINUX_LOCKD_DEBUG_H */ -- cgit v1.2.3 From 615384a24b1e6b0f091ebc1dfbf7ec8b4c27fa81 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:32 -0500 Subject: lockd: Move xdr.h from include/linux/lockd/ to fs/lockd/ The lockd subsystem unnecessarily exposes internal NLM XDR type definitions through the global include path. These definitions are not used by any code outside fs/lockd/, making them inappropriate for include/linux/lockd/. Moving xdr.h to fs/lockd/ narrows the API surface and clarifies that these types are internal implementation details. The comment in linux/lockd/bind.h stating xdr.h was needed for "xdr-encoded error codes" is stale: no lockd API consumers use those codes. Forward declarations for struct nfs_fh and struct file_lock are added to bind.h because their definitions were previously pulled in transitively through xdr.h. Additionally, nfs3proc.c and proc.c need explicit includes of filelock.h for FL_CLOSE and for accessing struct file_lock members, respectively. Built and tested with lockd client/server operations. No functional change. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/lockd/bind.h | 5 +- include/linux/lockd/xdr.h | 113 --------------------------------------------- 2 files changed, 2 insertions(+), 116 deletions(-) delete mode 100644 include/linux/lockd/xdr.h (limited to 'include/linux') diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index 077da0696f12..ba9258c96bfd 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -11,10 +11,9 @@ #define LINUX_LOCKD_BIND_H #include -/* need xdr-encoded error codes too, so... */ -#include -/* Dummy declarations */ +struct file_lock; +struct nfs_fh; struct svc_rqst; struct rpc_task; struct rpc_clnt; diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h deleted file mode 100644 index 292e4e38d17d..000000000000 --- a/include/linux/lockd/xdr.h +++ /dev/null @@ -1,113 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * linux/include/linux/lockd/xdr.h - * - * XDR types for the NLM protocol - * - * Copyright (C) 1996 Olaf Kirch - */ - -#ifndef LOCKD_XDR_H -#define LOCKD_XDR_H - -#include -#include -#include -#include - -#define SM_MAXSTRLEN 1024 -#define SM_PRIV_SIZE 16 - -struct nsm_private { - unsigned char data[SM_PRIV_SIZE]; -}; - -struct svc_rqst; - -#define NLM_MAXCOOKIELEN 32 -#define NLM_MAXSTRLEN 1024 - -#define nlm_granted cpu_to_be32(NLM_LCK_GRANTED) -#define nlm_lck_denied cpu_to_be32(NLM_LCK_DENIED) -#define nlm_lck_denied_nolocks cpu_to_be32(NLM_LCK_DENIED_NOLOCKS) -#define nlm_lck_blocked cpu_to_be32(NLM_LCK_BLOCKED) -#define nlm_lck_denied_grace_period cpu_to_be32(NLM_LCK_DENIED_GRACE_PERIOD) - -/* Lock info passed via NLM */ -struct nlm_lock { - char * caller; - unsigned int len; /* length of "caller" */ - struct nfs_fh fh; - struct xdr_netobj oh; - u32 svid; - u64 lock_start; - u64 lock_len; - struct file_lock fl; -}; - -/* - * NLM cookies. Technically they can be 1K, but Linux only uses 8 bytes. - * FreeBSD uses 16, Apple Mac OS X 10.3 uses 20. Therefore we set it to - * 32 bytes. - */ - -struct nlm_cookie -{ - unsigned char data[NLM_MAXCOOKIELEN]; - unsigned int len; -}; - -/* - * Generic lockd arguments for all but sm_notify - */ -struct nlm_args { - struct nlm_cookie cookie; - struct nlm_lock lock; - u32 block; - u32 reclaim; - u32 state; - u32 monitor; - u32 fsm_access; - u32 fsm_mode; -}; - -/* - * Generic lockd result - */ -struct nlm_res { - struct nlm_cookie cookie; - __be32 status; - struct nlm_lock lock; -}; - -/* - * statd callback when client has rebooted - */ -struct nlm_reboot { - char *mon; - unsigned int len; - u32 state; - struct nsm_private priv; -}; - -/* - * Contents of statd callback when monitored host rebooted - */ -#define NLMSVC_XDRSIZE sizeof(struct nlm_args) - -bool nlmsvc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr); - -bool nlmsvc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr); - -#endif /* LOCKD_XDR_H */ -- cgit v1.2.3 From 5829352e568d24dd04ae112128a4f44748d073bc Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:33 -0500 Subject: lockd: Make linux/lockd/nlm.h an internal header The NLM protocol constants and status codes in nlm.h are needed only by lockd's internal implementation. NFS client code and NFSD interact with lockd through the stable API in bind.h and have no direct use for protocol-level definitions. Exposing these definitions globally via bind.h creates unnecessary coupling between lockd internals and its consumers. Moving nlm.h from include/linux/lockd/ to fs/lockd/ clarifies the API boundary: bind.h provides the lockd service interface, while nlm.h remains available only to code within fs/lockd/ that implements the protocol. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/lockd/bind.h | 2 -- include/linux/lockd/nlm.h | 58 ---------------------------------------------- 2 files changed, 60 deletions(-) delete mode 100644 include/linux/lockd/nlm.h (limited to 'include/linux') diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index ba9258c96bfd..b614e0deea72 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -10,8 +10,6 @@ #ifndef LINUX_LOCKD_BIND_H #define LINUX_LOCKD_BIND_H -#include - struct file_lock; struct nfs_fh; struct svc_rqst; diff --git a/include/linux/lockd/nlm.h b/include/linux/lockd/nlm.h deleted file mode 100644 index 6e343ef760dc..000000000000 --- a/include/linux/lockd/nlm.h +++ /dev/null @@ -1,58 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * linux/include/linux/lockd/nlm.h - * - * Declarations for the Network Lock Manager protocol. - * - * Copyright (C) 1996, Olaf Kirch - */ - -#ifndef LINUX_LOCKD_NLM_H -#define LINUX_LOCKD_NLM_H - - -/* Maximum file offset in file_lock.fl_end */ -# define NLM_OFFSET_MAX ((s32) 0x7fffffff) -# define NLM4_OFFSET_MAX ((s64) ((~(u64)0) >> 1)) - -/* Return states for NLM */ -enum { - NLM_LCK_GRANTED = 0, - NLM_LCK_DENIED = 1, - NLM_LCK_DENIED_NOLOCKS = 2, - NLM_LCK_BLOCKED = 3, - NLM_LCK_DENIED_GRACE_PERIOD = 4, -#ifdef CONFIG_LOCKD_V4 - NLM_DEADLCK = 5, - NLM_ROFS = 6, - NLM_STALE_FH = 7, - NLM_FBIG = 8, - NLM_FAILED = 9, -#endif -}; - -#define NLM_PROGRAM 100021 - -#define NLMPROC_NULL 0 -#define NLMPROC_TEST 1 -#define NLMPROC_LOCK 2 -#define NLMPROC_CANCEL 3 -#define NLMPROC_UNLOCK 4 -#define NLMPROC_GRANTED 5 -#define NLMPROC_TEST_MSG 6 -#define NLMPROC_LOCK_MSG 7 -#define NLMPROC_CANCEL_MSG 8 -#define NLMPROC_UNLOCK_MSG 9 -#define NLMPROC_GRANTED_MSG 10 -#define NLMPROC_TEST_RES 11 -#define NLMPROC_LOCK_RES 12 -#define NLMPROC_CANCEL_RES 13 -#define NLMPROC_UNLOCK_RES 14 -#define NLMPROC_GRANTED_RES 15 -#define NLMPROC_NSM_NOTIFY 16 /* statd callback */ -#define NLMPROC_SHARE 20 -#define NLMPROC_UNSHARE 21 -#define NLMPROC_NM_LOCK 22 -#define NLMPROC_FREE_ALL 23 - -#endif /* LINUX_LOCKD_NLM_H */ -- cgit v1.2.3 From adcc59114ccd402259c089b0fea24da5e4974563 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 4 Feb 2026 21:21:50 +0100 Subject: sunrpc: Kill RPC_IFDEBUG() RPC_IFDEBUG() is used in only two places. In one the user of the definition is guarded by ifdeffery, in the second one it's implied due to dprintk() usage. Kill the macro and move the ifdeffery to the regular condition with the variable defined inside, while in the second case add the same conditional and move the respective code there. Reviewed-by: Jeff Layton Signed-off-by: Andy Shevchenko Signed-off-by: Chuck Lever --- include/linux/sunrpc/debug.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h index eb4bd62df319..93d1a11ffbfb 100644 --- a/include/linux/sunrpc/debug.h +++ b/include/linux/sunrpc/debug.h @@ -49,12 +49,10 @@ do { \ } \ } while (0) -# define RPC_IFDEBUG(x) x #else # define ifdebug(fac) if (0) # define dfprintk(fac, fmt, ...) do {} while (0) # define dfprintk_rcu(fac, fmt, ...) do {} while (0) -# define RPC_IFDEBUG(x) #endif /* -- cgit v1.2.3 From 6f57293abb8d087de830dd3f02e66d94b3e59973 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 4 Feb 2026 21:21:51 +0100 Subject: sunrpc: Fix compilation error (`make W=1`) when dprintk() is no-op Clang compiler is not happy about set but unused variables: .../flexfilelayout/flexfilelayoutdev.c:56:9: error: variable 'ret' set but not used [-Werror,-Wunused-but-set-variable] .../flexfilelayout/flexfilelayout.c:1505:6: error: variable 'err' set but not used [-Werror,-Wunused-but-set-variable] .../nfs4proc.c:9244:12: error: variable 'ptr' set but not used [-Werror,-Wunused-but-set-variable] Fix these by forwarding parameters of dprintk() to no_printk(). The positive side-effect is a format-string checker enabled even for the cases when dprintk() is no-op. Fixes: d67ae825a59d ("pnfs/flexfiles: Add the FlexFile Layout Driver") Fixes: fc931582c260 ("nfs41: create_session operation") Acked-by: Geert Uytterhoeven Reviewed-by: Jeff Layton Signed-off-by: Andy Shevchenko Signed-off-by: Chuck Lever --- include/linux/sunrpc/debug.h | 8 ++++++-- include/linux/sunrpc/sched.h | 3 --- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h index 93d1a11ffbfb..ab61bed2f7af 100644 --- a/include/linux/sunrpc/debug.h +++ b/include/linux/sunrpc/debug.h @@ -38,6 +38,8 @@ extern unsigned int nlm_debug; do { \ ifdebug(fac) \ __sunrpc_printk(fmt, ##__VA_ARGS__); \ + else \ + no_printk(fmt, ##__VA_ARGS__); \ } while (0) # define dfprintk_rcu(fac, fmt, ...) \ @@ -46,13 +48,15 @@ do { \ rcu_read_lock(); \ __sunrpc_printk(fmt, ##__VA_ARGS__); \ rcu_read_unlock(); \ + } else { \ + no_printk(fmt, ##__VA_ARGS__); \ } \ } while (0) #else # define ifdebug(fac) if (0) -# define dfprintk(fac, fmt, ...) do {} while (0) -# define dfprintk_rcu(fac, fmt, ...) do {} while (0) +# define dfprintk(fac, fmt, ...) no_printk(fmt, ##__VA_ARGS__) +# define dfprintk_rcu(fac, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif /* diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index ccba79ebf893..0dbdf3722537 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -95,10 +95,7 @@ struct rpc_task { int tk_rpc_status; /* Result of last RPC operation */ unsigned short tk_flags; /* misc flags */ unsigned short tk_timeouts; /* maj timeouts */ - -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) || IS_ENABLED(CONFIG_TRACEPOINTS) unsigned short tk_pid; /* debugging aid */ -#endif unsigned char tk_priority : 2,/* Task priority */ tk_garb_retry : 2, tk_cred_retry : 2; -- cgit v1.2.3 From f52792f484ba2316853736856dde19b7e7458861 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Fri, 13 Feb 2026 10:36:30 -0800 Subject: NFSD: Enforce timeout on layout recall and integrate lease manager fencing When a layout conflict triggers a recall, enforcing a timeout is necessary to prevent excessive nfsd threads from being blocked in __break_lease ensuring the server continues servicing incoming requests efficiently. This patch introduces a new function to lease_manager_operations: lm_breaker_timedout: Invoked when a lease recall times out and is about to be disposed of. This function enables the lease manager to inform the caller whether the file_lease should remain on the flc_list or be disposed of. For the NFSD lease manager, this function now handles layout recall timeouts. If the layout type supports fencing and the client has not been fenced, a fence operation is triggered to prevent the client from accessing the block device. While the fencing operation is in progress, the conflicting file_lease remains on the flc_list until fencing is complete. This guarantees that no other clients can access the file, and the client with exclusive access is properly blocked before disposal. Signed-off-by: Dai Ngo Signed-off-by: Chuck Lever --- include/linux/filelock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/filelock.h b/include/linux/filelock.h index d2c9740e26a8..5f0a2fb31450 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -50,6 +50,7 @@ struct lease_manager_operations { void (*lm_setup)(struct file_lease *, void **); bool (*lm_breaker_owns_lease)(struct file_lease *); int (*lm_open_conflict)(struct file *, int); + bool (*lm_breaker_timedout)(struct file_lease *fl); }; struct lock_manager { -- cgit v1.2.3 From 5bc37b759ec0cdde2c652a2637d704f2d6306617 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:06:53 -0500 Subject: Documentation: Add the RPC language description of NLM version 4 In order to generate source code to encode and decode NLMv4 protocol elements, include a copy of the RPC language description of NLMv4 for xdrgen to process. The language description is an amalgam of RFC 1813 and the Open Group's XNFS specification: https://pubs.opengroup.org/onlinepubs/9629799/chap10.htm The C code committed here was generated from the new nlm4.x file using tools/net/sunrpc/xdrgen/xdrgen. The goals of replacing hand-written XDR functions with ones that are tool-generated are to improve memory safety and make XDR encoding and decoding less brittle to maintain. The xdrgen utility derives both the type definitions and the encode/decode functions directly from protocol specifications, using names and symbols familiar to anyone who knows those specs. Unlike hand-written code that can inadvertently diverge from the specification, xdrgen guarantees that the generated code matches the specification exactly. We would eventually like xdrgen to generate Rust code as well, making the conversion of the kernel's NFS stacks to use Rust just a little easier for us. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/xdrgen/nlm4.h | 233 +++++++++++++++++++++++++++++++++++++ 1 file changed, 233 insertions(+) create mode 100644 include/linux/sunrpc/xdrgen/nlm4.h (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdrgen/nlm4.h b/include/linux/sunrpc/xdrgen/nlm4.h new file mode 100644 index 000000000000..e95e8f105624 --- /dev/null +++ b/include/linux/sunrpc/xdrgen/nlm4.h @@ -0,0 +1,233 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Generated by xdrgen. Manual edits will be lost. */ +/* XDR specification file: ../../Documentation/sunrpc/xdr/nlm4.x */ +/* XDR specification modification time: Thu Dec 25 13:10:19 2025 */ + +#ifndef _LINUX_XDRGEN_NLM4_DEF_H +#define _LINUX_XDRGEN_NLM4_DEF_H + +#include +#include + +enum { LM_MAXSTRLEN = 1024 }; + +enum { LM_MAXNAMELEN = 1025 }; + +enum { MAXNETOBJ_SZ = 1024 }; + +typedef opaque netobj; + +enum fsh4_mode { + fsm_DN = 0, + fsm_DR = 1, + fsm_DW = 2, + fsm_DRW = 3, +}; + +typedef enum fsh4_mode fsh4_mode; + +enum fsh4_access { + fsa_NONE = 0, + fsa_R = 1, + fsa_W = 2, + fsa_RW = 3, +}; + +typedef enum fsh4_access fsh4_access; + +enum { SM_MAXSTRLEN = 1024 }; + +typedef u64 uint64; + +typedef s64 int64; + +typedef u32 uint32; + +typedef s32 int32; + +enum nlm4_stats { + NLM4_GRANTED = 0, + NLM4_DENIED = 1, + NLM4_DENIED_NOLOCKS = 2, + NLM4_BLOCKED = 3, + NLM4_DENIED_GRACE_PERIOD = 4, + NLM4_DEADLCK = 5, + NLM4_ROFS = 6, + NLM4_STALE_FH = 7, + NLM4_FBIG = 8, + NLM4_FAILED = 9, +}; + +typedef __be32 nlm4_stats; + +struct nlm4_holder { + bool exclusive; + int32 svid; + netobj oh; + uint64 l_offset; + uint64 l_len; +}; + +struct nlm4_testrply { + nlm4_stats stat; + union { + struct nlm4_holder holder; + } u; +}; + +struct nlm4_stat { + nlm4_stats stat; +}; + +struct nlm4_res { + netobj cookie; + struct nlm4_stat stat; +}; + +struct nlm4_testres { + netobj cookie; + struct nlm4_testrply stat; +}; + +struct nlm4_lock { + string caller_name; + netobj fh; + netobj oh; + int32 svid; + uint64 l_offset; + uint64 l_len; +}; + +struct nlm4_lockargs { + netobj cookie; + bool block; + bool exclusive; + struct nlm4_lock alock; + bool reclaim; + int32 state; +}; + +struct nlm4_cancargs { + netobj cookie; + bool block; + bool exclusive; + struct nlm4_lock alock; +}; + +struct nlm4_testargs { + netobj cookie; + bool exclusive; + struct nlm4_lock alock; +}; + +struct nlm4_unlockargs { + netobj cookie; + struct nlm4_lock alock; +}; + +struct nlm4_share { + string caller_name; + netobj fh; + netobj oh; + fsh4_mode mode; + fsh4_access access; +}; + +struct nlm4_shareargs { + netobj cookie; + struct nlm4_share share; + bool reclaim; +}; + +struct nlm4_shareres { + netobj cookie; + nlm4_stats stat; + int32 sequence; +}; + +struct nlm4_notify { + string name; + int32 state; +}; + +enum { SM_PRIV_SIZE = 16 }; + +struct nlm4_notifyargs { + struct nlm4_notify notify; + u8 private[SM_PRIV_SIZE]; +}; + +enum { + NLMPROC4_NULL = 0, + NLMPROC4_TEST = 1, + NLMPROC4_LOCK = 2, + NLMPROC4_CANCEL = 3, + NLMPROC4_UNLOCK = 4, + NLMPROC4_GRANTED = 5, + NLMPROC4_TEST_MSG = 6, + NLMPROC4_LOCK_MSG = 7, + NLMPROC4_CANCEL_MSG = 8, + NLMPROC4_UNLOCK_MSG = 9, + NLMPROC4_GRANTED_MSG = 10, + NLMPROC4_TEST_RES = 11, + NLMPROC4_LOCK_RES = 12, + NLMPROC4_CANCEL_RES = 13, + NLMPROC4_UNLOCK_RES = 14, + NLMPROC4_GRANTED_RES = 15, + NLMPROC4_SM_NOTIFY = 16, + NLMPROC4_SHARE = 20, + NLMPROC4_UNSHARE = 21, + NLMPROC4_NM_LOCK = 22, + NLMPROC4_FREE_ALL = 23, +}; + +#ifndef NLM4_PROG +#define NLM4_PROG (100021) +#endif + +#define NLM4_netobj_sz (XDR_unsigned_int + XDR_QUADLEN(MAXNETOBJ_SZ)) +#define NLM4_fsh4_mode_sz (XDR_int) +#define NLM4_fsh4_access_sz (XDR_int) +#define NLM4_uint64_sz \ + (XDR_unsigned_hyper) +#define NLM4_int64_sz \ + (XDR_hyper) +#define NLM4_uint32_sz \ + (XDR_unsigned_long) +#define NLM4_int32_sz \ + (XDR_long) +#define NLM4_nlm4_stats_sz (XDR_int) +#define NLM4_nlm4_holder_sz \ + (XDR_bool + NLM4_int32_sz + NLM4_netobj_sz + NLM4_uint64_sz + NLM4_uint64_sz) +#define NLM4_nlm4_testrply_sz \ + (NLM4_nlm4_stats_sz + NLM4_nlm4_holder_sz) +#define NLM4_nlm4_stat_sz \ + (NLM4_nlm4_stats_sz) +#define NLM4_nlm4_res_sz \ + (NLM4_netobj_sz + NLM4_nlm4_stat_sz) +#define NLM4_nlm4_testres_sz \ + (NLM4_netobj_sz + NLM4_nlm4_testrply_sz) +#define NLM4_nlm4_lock_sz \ + (XDR_unsigned_int + XDR_QUADLEN(LM_MAXSTRLEN) + NLM4_netobj_sz + NLM4_netobj_sz + NLM4_int32_sz + NLM4_uint64_sz + NLM4_uint64_sz) +#define NLM4_nlm4_lockargs_sz \ + (NLM4_netobj_sz + XDR_bool + XDR_bool + NLM4_nlm4_lock_sz + XDR_bool + NLM4_int32_sz) +#define NLM4_nlm4_cancargs_sz \ + (NLM4_netobj_sz + XDR_bool + XDR_bool + NLM4_nlm4_lock_sz) +#define NLM4_nlm4_testargs_sz \ + (NLM4_netobj_sz + XDR_bool + NLM4_nlm4_lock_sz) +#define NLM4_nlm4_unlockargs_sz \ + (NLM4_netobj_sz + NLM4_nlm4_lock_sz) +#define NLM4_nlm4_share_sz \ + (XDR_unsigned_int + XDR_QUADLEN(LM_MAXSTRLEN) + NLM4_netobj_sz + NLM4_netobj_sz + NLM4_fsh4_mode_sz + NLM4_fsh4_access_sz) +#define NLM4_nlm4_shareargs_sz \ + (NLM4_netobj_sz + NLM4_nlm4_share_sz + XDR_bool) +#define NLM4_nlm4_shareres_sz \ + (NLM4_netobj_sz + NLM4_nlm4_stats_sz + NLM4_int32_sz) +#define NLM4_nlm4_notify_sz \ + (XDR_unsigned_int + XDR_QUADLEN(LM_MAXNAMELEN) + NLM4_int32_sz) +#define NLM4_nlm4_notifyargs_sz \ + (NLM4_nlm4_notify_sz + XDR_QUADLEN(SM_PRIV_SIZE)) +#define NLM4_MAX_ARGS_SZ \ + (NLM4_nlm4_lockargs_sz) + +#endif /* _LINUX_XDRGEN_NLM4_DEF_H */ -- cgit v1.2.3 From 17c1d66579ff27a7a8f2f407d1425272ff6fdd8c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 23 Feb 2026 12:09:59 -0500 Subject: sunrpc: convert queue_lock from global spinlock to per-cache-detail lock The global queue_lock serializes all upcall queue operations across every cache_detail instance. Convert it to a per-cache-detail spinlock so that different caches (e.g. auth.unix.ip vs nfsd.fh) no longer contend with each other on queue operations. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/cache.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index e783132e481f..3d32dd1f7b05 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -113,6 +113,7 @@ struct cache_detail { /* fields for communication over channel */ struct list_head queue; + spinlock_t queue_lock; atomic_t writers; /* how many time is /channel open */ time64_t last_close; /* if no writers, when did last close */ -- cgit v1.2.3 From 552d0e17ea042fc4f959c4543cbbd0e54de7a8e9 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 23 Feb 2026 12:10:00 -0500 Subject: sunrpc: convert queue_wait from global to per-cache-detail waitqueue The queue_wait waitqueue is currently a file-scoped global, so a wake_up for one cache_detail wakes pollers on all caches. Convert it to a per-cache-detail field so that only pollers on the relevant cache are woken. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/cache.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index 3d32dd1f7b05..031379efba24 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -16,6 +16,7 @@ #include #include #include +#include /* * Each cache requires: @@ -114,6 +115,7 @@ struct cache_detail { /* fields for communication over channel */ struct list_head queue; spinlock_t queue_lock; + wait_queue_head_t queue_wait; atomic_t writers; /* how many time is /channel open */ time64_t last_close; /* if no writers, when did last close */ -- cgit v1.2.3 From facc4e3c80420e3466003ce09b576e005b56a015 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 23 Feb 2026 12:10:01 -0500 Subject: sunrpc: split cache_detail queue into request and reader lists Replace the single interleaved queue (which mixed cache_request and cache_reader entries distinguished by a ->reader flag) with two dedicated lists: cd->requests for upcall requests and cd->readers for open file handles. Readers now track their position via a monotonically increasing sequence number (next_seqno) rather than by their position in the shared list. Each cache_request is assigned a seqno when enqueued, and a new cache_next_request() helper finds the next request at or after a given seqno. This eliminates the cache_queue wrapper struct entirely, simplifies the reader-skipping loops in cache_read/cache_poll/cache_ioctl/ cache_release, and makes the data flow easier to reason about. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/cache.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index 031379efba24..b1e595c2615b 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -113,9 +113,11 @@ struct cache_detail { int entries; /* fields for communication over channel */ - struct list_head queue; + struct list_head requests; + struct list_head readers; spinlock_t queue_lock; wait_queue_head_t queue_wait; + u64 next_seqno; atomic_t writers; /* how many time is /channel open */ time64_t last_close; /* if no writers, when did last close */ -- cgit v1.2.3 From ee66b9e3e1c69efc986f3932555f07121c3460a7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 26 Feb 2026 09:47:35 -0500 Subject: SUNRPC: Allocate a separate Reply page array struct svc_rqst uses a single dynamically-allocated page array (rq_pages) for both the incoming RPC Call message and the outgoing RPC Reply message. rq_respages is a sliding pointer into rq_pages that each transport receive path must compute based on how many pages the Call consumed. This boundary tracking is a source of confusion and bugs, and prevents an RPC transaction from having both a large Call and a large Reply simultaneously. Allocate rq_respages as its own page array, eliminating the boundary arithmetic. This decouples Call and Reply buffer lifetimes, following the precedent set by rq_bvec (a separate dynamically- allocated array for I/O vectors). Each svc_rqst now pins twice as many pages as before. For a server running 16 threads with a 1MB maximum payload, the additional cost is roughly 16MB of pinned memory. The new dynamic svc thread count facility keeps this overhead minimal on an idle server. A subsequent patch in this series limits per-request repopulation to only the pages released during the previous RPC, avoiding a full-array scan on each call to svc_alloc_arg(). Note: We've considered several alternatives to maintaining a full second array. Each alternative reintroduces either boundary logic complexity or I/O-path allocation pressure. rq_next_page is initialized in svc_alloc_arg() and svc_process() during Reply construction, and in svc_rdma_recvfrom() as a precaution on error paths. Transport receive paths no longer compute it from the Call size. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 47 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 62152e4f3bcc..3b1a98ab5cba 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -134,25 +134,24 @@ enum { extern u32 svc_max_payload(const struct svc_rqst *rqstp); /* - * RPC Requests and replies are stored in one or more pages. - * We maintain an array of pages for each server thread. - * Requests are copied into these pages as they arrive. Remaining - * pages are available to write the reply into. + * RPC Call and Reply messages each have their own page array. + * rq_pages holds the incoming Call message; rq_respages holds + * the outgoing Reply message. Both arrays are sized to + * svc_serv_maxpages() entries and are allocated dynamically. * - * Pages are sent using ->sendmsg with MSG_SPLICE_PAGES so each server thread - * needs to allocate more to replace those used in sending. To help keep track - * of these pages we have a receive list where all pages initialy live, and a - * send list where pages are moved to when there are to be part of a reply. + * Pages are sent using ->sendmsg with MSG_SPLICE_PAGES so each + * server thread needs to allocate more to replace those used in + * sending. * - * We use xdr_buf for holding responses as it fits well with NFS - * read responses (that have a header, and some data pages, and possibly - * a tail) and means we can share some client side routines. + * xdr_buf holds responses; the structure fits NFS read responses + * (header, data pages, optional tail) and enables sharing of + * client-side routines. * - * The xdr_buf.head kvec always points to the first page in the rq_*pages - * list. The xdr_buf.pages pointer points to the second page on that - * list. xdr_buf.tail points to the end of the first page. - * This assumes that the non-page part of an rpc reply will fit - * in a page - NFSd ensures this. lockd also has no trouble. + * The xdr_buf.head kvec always points to the first page in the + * rq_*pages list. The xdr_buf.pages pointer points to the second + * page on that list. xdr_buf.tail points to the end of the first + * page. This assumes that the non-page part of an rpc reply will + * fit in a page - NFSd ensures this. lockd also has no trouble. */ /** @@ -162,10 +161,10 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp); * Returns a count of pages or vectors that can hold the maximum * size RPC message for @serv. * - * Each request/reply pair can have at most one "payload", plus two - * pages, one for the request, and one for the reply. - * nfsd_splice_actor() might need an extra page when a READ payload - * is not page-aligned. + * Each page array can hold at most one payload plus two + * overhead pages (one for the RPC header, one for tail data). + * nfsd_splice_actor() might need an extra page when a READ + * payload is not page-aligned. */ static inline unsigned long svc_serv_maxpages(const struct svc_serv *serv) { @@ -204,11 +203,11 @@ struct svc_rqst { struct xdr_stream rq_res_stream; struct folio *rq_scratch_folio; struct xdr_buf rq_res; - unsigned long rq_maxpages; /* num of entries in rq_pages */ - struct page * *rq_pages; - struct page * *rq_respages; /* points into rq_pages */ + unsigned long rq_maxpages; /* entries per page array */ + struct page * *rq_pages; /* Call buffer pages */ + struct page * *rq_respages; /* Reply buffer pages */ struct page * *rq_next_page; /* next reply page to use */ - struct page * *rq_page_end; /* one past the last page */ + struct page * *rq_page_end; /* one past the last reply page */ struct folio_batch rq_fbatch; struct bio_vec *rq_bvec; -- cgit v1.2.3 From 7ed7504287a627834f2a35ef04e5dfd26d1c8986 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 26 Feb 2026 09:47:38 -0500 Subject: SUNRPC: Track consumed rq_pages entries The rq_pages array holds pages allocated for incoming RPC requests. Two transport receive paths NULL entries in rq_pages to prevent svc_rqst_release_pages() from freeing pages that the transport has taken ownership of: - svc_tcp_save_pages() moves partial request data pages to svsk->sk_pages during multi-fragment TCP reassembly. - svc_rdma_clear_rqst_pages() moves request data pages to head->rc_pages because they are targets of active RDMA Read WRs. A new rq_pages_nfree field in struct svc_rqst records how many entries were NULLed. svc_alloc_arg() uses it to refill only those entries rather than scanning the full rq_pages array. In steady state, the transport NULLs a handful of entries per RPC, so the allocator visits only those entries instead of the full ~259 slots (for 1MB messages). Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 3b1a98ab5cba..c3399cf64524 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -143,6 +143,15 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp); * server thread needs to allocate more to replace those used in * sending. * + * rq_pages request page contract: + * + * Transport receive paths that move request data pages out of + * rq_pages -- TCP multi-fragment reassembly (svc_tcp_save_pages) + * and RDMA Read I/O (svc_rdma_clear_rqst_pages) -- NULL those + * entries to prevent svc_rqst_release_pages() from freeing pages + * still in transport use, and set rq_pages_nfree to the count. + * svc_alloc_arg() refills only that many rq_pages entries. + * * xdr_buf holds responses; the structure fits NFS read responses * (header, data pages, optional tail) and enables sharing of * client-side routines. @@ -204,6 +213,7 @@ struct svc_rqst { struct folio *rq_scratch_folio; struct xdr_buf rq_res; unsigned long rq_maxpages; /* entries per page array */ + unsigned long rq_pages_nfree; /* rq_pages entries NULLed by transport */ struct page * *rq_pages; /* Call buffer pages */ struct page * *rq_respages; /* Reply buffer pages */ struct page * *rq_next_page; /* next reply page to use */ -- cgit v1.2.3 From d7f3efd9ff474867b04e1ea784690f02450a245b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 26 Feb 2026 09:47:39 -0500 Subject: SUNRPC: Optimize rq_respages allocation in svc_alloc_arg svc_alloc_arg() invokes alloc_pages_bulk() with the full rq_maxpages count (~259 for 1MB messages) for the rq_respages array, causing a full-array scan despite most slots holding valid pages. svc_rqst_release_pages() NULLs only the range [rq_respages, rq_next_page) after each RPC, so only that range contains NULL entries. Limit the rq_respages fill in svc_alloc_arg() to that range instead of scanning the full array. svc_init_buffer() initializes rq_next_page to span the entire rq_respages array, so the first svc_alloc_arg() call fills all slots. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index c3399cf64524..669c944eaf7f 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -152,6 +152,10 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp); * still in transport use, and set rq_pages_nfree to the count. * svc_alloc_arg() refills only that many rq_pages entries. * + * For rq_respages, svc_rqst_release_pages() NULLs entries in + * [rq_respages, rq_next_page) after each RPC. svc_alloc_arg() + * refills only that range. + * * xdr_buf holds responses; the structure fits NFS read responses * (header, data pages, optional tail) and enables sharing of * client-side routines. -- cgit v1.2.3 From ccc89b9d1ed233349cfe8d87b842e7351b74d8de Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 27 Feb 2026 09:03:28 -0500 Subject: svcrdma: Add fair queuing for Send Queue access When the Send Queue fills, multiple threads may wait for SQ slots. The previous implementation had no ordering guarantee, allowing starvation when one thread repeatedly acquires slots while others wait indefinitely. Introduce a ticket-based fair queuing system. Each waiter takes a ticket number and is served in FIFO order. This ensures forward progress for all waiters when SQ capacity is constrained. The implementation has two phases: 1. Fast path: attempt to reserve SQ slots without waiting 2. Slow path: take a ticket, wait for turn, then wait for slots The ticket system adds two atomic counters to the transport: - sc_sq_ticket_head: next ticket to issue - sc_sq_ticket_tail: ticket currently being served A dedicated wait queue (sc_sq_ticket_wait) handles ticket ordering, separate from sc_send_wait which handles SQ capacity. This separation ensures that send completions (the high-frequency wake source) wake only the current ticket holder rather than all queued waiters. Ticket handoff wakes only the ticket wait queue, and each ticket holder that exits via connection close propagates the wake to the next waiter in line. When a waiter successfully reserves slots, it advances the tail counter and wakes the next waiter. This creates an orderly handoff that prevents starvation while maintaining good throughput on the fast path when contention is low. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 57f4fd94166a..658b8498177e 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -84,6 +84,9 @@ struct svcxprt_rdma { atomic_t sc_sq_avail; /* SQEs ready to be consumed */ unsigned int sc_sq_depth; /* Depth of SQ */ + atomic_t sc_sq_ticket_head; /* Next ticket to issue */ + atomic_t sc_sq_ticket_tail; /* Ticket currently serving */ + wait_queue_head_t sc_sq_ticket_wait; /* Ticket ordering waitlist */ __be32 sc_fc_credits; /* Forward credits */ u32 sc_max_requests; /* Max requests */ u32 sc_max_bc_requests;/* Backward credits */ @@ -306,6 +309,13 @@ extern void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, struct svc_rdma_recv_ctxt *rctxt, int status); extern void svc_rdma_wake_send_waiters(struct svcxprt_rdma *rdma, int avail); +extern int svc_rdma_sq_wait(struct svcxprt_rdma *rdma, + const struct rpc_rdma_cid *cid, int sqecount); +extern int svc_rdma_post_send_err(struct svcxprt_rdma *rdma, + const struct rpc_rdma_cid *cid, + const struct ib_send_wr *bad_wr, + const struct ib_send_wr *first_wr, + int sqecount, int ret); extern int svc_rdma_sendto(struct svc_rqst *); extern int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset, unsigned int length); -- cgit v1.2.3 From d16f060f3ee297424c0aba047b1d49208adb9318 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 27 Feb 2026 09:03:31 -0500 Subject: svcrdma: Add Write chunk WRs to the RPC's Send WR chain Previously, Write chunk RDMA Writes were posted via a separate ib_post_send() call with their own completion handler. Each Write chunk incurred a doorbell and generated a completion event. Link Write chunk WRs onto the RPC Reply's Send WR chain so that a single ib_post_send() call posts both the RDMA Writes and the Send WR. A single completion event signals that all operations have finished. This reduces both doorbell rate and completion rate, as well as eliminating the latency of a round-trip between the Write chunk completion and the subsequent Send WR posting. The lifecycle of Write chunk resources changes: previously, the svc_rdma_write_done() completion handler released Write chunk resources when RDMA Writes completed. With WR chaining, resources remain live until the Send completion. A new sc_write_info_list tracks Write chunk metadata attached to each Send context, and svc_rdma_write_chunk_release() frees these resources when the Send context is released. The svc_rdma_write_done() handler now handles only error cases. On success it returns immediately since the Send completion handles resource release. On failure (WR flush), it closes the connection to signal to the client that the RPC Reply is incomplete. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 658b8498177e..df6e08aaad57 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -216,6 +216,7 @@ struct svc_rdma_recv_ctxt { */ struct svc_rdma_write_info { struct svcxprt_rdma *wi_rdma; + struct list_head wi_list; const struct svc_rdma_chunk *wi_chunk; @@ -244,7 +245,10 @@ struct svc_rdma_send_ctxt { struct ib_cqe sc_cqe; struct xdr_buf sc_hdrbuf; struct xdr_stream sc_stream; + + struct list_head sc_write_info_list; struct svc_rdma_write_info sc_reply_info; + void *sc_xprt_buf; int sc_page_count; int sc_cur_sge_no; @@ -277,11 +281,14 @@ extern void svc_rdma_cc_init(struct svcxprt_rdma *rdma, extern void svc_rdma_cc_release(struct svcxprt_rdma *rdma, struct svc_rdma_chunk_ctxt *cc, enum dma_data_direction dir); +extern void svc_rdma_write_chunk_release(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt); extern void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt); -extern int svc_rdma_send_write_list(struct svcxprt_rdma *rdma, - const struct svc_rdma_recv_ctxt *rctxt, - const struct xdr_buf *xdr); +extern int svc_rdma_prepare_write_list(struct svcxprt_rdma *rdma, + const struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_send_ctxt *sctxt, + const struct xdr_buf *xdr); extern int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma, const struct svc_rdma_pcl *write_pcl, const struct svc_rdma_pcl *reply_pcl, -- cgit v1.2.3 From 3603bf99062c6d563df4fba3848f829d5401d959 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 28 Feb 2026 14:09:22 -0800 Subject: SUNRPC: xdr.h: fix all kernel-doc warnings Correct a function parameter name (s/page/folio/) and add function return value sections for multiple functions to eliminate kernel-doc warnings: Warning: include/linux/sunrpc/xdr.h:298 function parameter 'folio' not described in 'xdr_set_scratch_folio' Warning: include/linux/sunrpc/xdr.h:337 No description found for return value of 'xdr_stream_remaining' Warning: include/linux/sunrpc/xdr.h:357 No description found for return value of 'xdr_align_size' Warning: include/linux/sunrpc/xdr.h:374 No description found for return value of 'xdr_pad_size' Warning: include/linux/sunrpc/xdr.h:387 No description found for return value of 'xdr_stream_encode_item_present' Signed-off-by: Randy Dunlap Signed-off-by: Chuck Lever --- include/linux/sunrpc/xdr.h | 48 +++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 152597750f55..b639a6fafcbc 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -290,7 +290,7 @@ xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen) /** * xdr_set_scratch_folio - Attach a scratch buffer for decoding data * @xdr: pointer to xdr_stream struct - * @page: an anonymous folio + * @folio: an anonymous folio * * See xdr_set_scratch_buffer(). */ @@ -330,7 +330,7 @@ static inline void xdr_commit_encode(struct xdr_stream *xdr) * xdr_stream_remaining - Return the number of bytes remaining in the stream * @xdr: pointer to struct xdr_stream * - * Return value: + * Returns: * Number of bytes remaining in @xdr before xdr->end */ static inline size_t @@ -350,7 +350,7 @@ ssize_t xdr_stream_encode_opaque_auth(struct xdr_stream *xdr, u32 flavor, * xdr_align_size - Calculate padded size of an object * @n: Size of an object being XDR encoded (in bytes) * - * Return value: + * Returns: * Size (in bytes) of the object including xdr padding */ static inline size_t @@ -368,7 +368,7 @@ xdr_align_size(size_t n) * This implementation avoids the need for conditional * branches or modulo division. * - * Return value: + * Returns: * Size (in bytes) of the needed XDR pad */ static inline size_t xdr_pad_size(size_t n) @@ -380,7 +380,7 @@ static inline size_t xdr_pad_size(size_t n) * xdr_stream_encode_item_present - Encode a "present" list item * @xdr: pointer to xdr_stream * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -399,7 +399,7 @@ static inline ssize_t xdr_stream_encode_item_present(struct xdr_stream *xdr) * xdr_stream_encode_item_absent - Encode a "not present" list item * @xdr: pointer to xdr_stream * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -419,7 +419,7 @@ static inline int xdr_stream_encode_item_absent(struct xdr_stream *xdr) * @p: address in a buffer into which to encode * @n: boolean value to encode * - * Return value: + * Returns: * Address of item following the encoded boolean */ static inline __be32 *xdr_encode_bool(__be32 *p, u32 n) @@ -433,7 +433,7 @@ static inline __be32 *xdr_encode_bool(__be32 *p, u32 n) * @xdr: pointer to xdr_stream * @n: boolean value to encode * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -453,7 +453,7 @@ static inline int xdr_stream_encode_bool(struct xdr_stream *xdr, __u32 n) * @xdr: pointer to xdr_stream * @n: integer to encode * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -474,7 +474,7 @@ xdr_stream_encode_u32(struct xdr_stream *xdr, __u32 n) * @xdr: pointer to xdr_stream * @n: integer to encode * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -495,7 +495,7 @@ xdr_stream_encode_be32(struct xdr_stream *xdr, __be32 n) * @xdr: pointer to xdr_stream * @n: 64-bit integer to encode * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -517,7 +517,7 @@ xdr_stream_encode_u64(struct xdr_stream *xdr, __u64 n) * @ptr: pointer to void pointer * @len: size of object * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -542,7 +542,7 @@ xdr_stream_encode_opaque_inline(struct xdr_stream *xdr, void **ptr, size_t len) * @ptr: pointer to opaque data object * @len: size of object pointed to by @ptr * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -563,7 +563,7 @@ xdr_stream_encode_opaque_fixed(struct xdr_stream *xdr, const void *ptr, size_t l * @ptr: pointer to opaque data object * @len: size of object pointed to by @ptr * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -585,7 +585,7 @@ xdr_stream_encode_opaque(struct xdr_stream *xdr, const void *ptr, size_t len) * @array: array of integers * @array_size: number of elements in @array * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -608,7 +608,7 @@ xdr_stream_encode_uint32_array(struct xdr_stream *xdr, * xdr_item_is_absent - symbolically handle XDR discriminators * @p: pointer to undecoded discriminator * - * Return values: + * Returns: * %true if the following XDR item is absent * %false if the following XDR item is present */ @@ -621,7 +621,7 @@ static inline bool xdr_item_is_absent(const __be32 *p) * xdr_item_is_present - symbolically handle XDR discriminators * @p: pointer to undecoded discriminator * - * Return values: + * Returns: * %true if the following XDR item is present * %false if the following XDR item is absent */ @@ -635,7 +635,7 @@ static inline bool xdr_item_is_present(const __be32 *p) * @xdr: pointer to xdr_stream * @ptr: pointer to a u32 in which to store the result * - * Return values: + * Returns: * %0 on success * %-EBADMSG on XDR buffer overflow */ @@ -656,7 +656,7 @@ xdr_stream_decode_bool(struct xdr_stream *xdr, __u32 *ptr) * @xdr: pointer to xdr_stream * @ptr: location to store integer * - * Return values: + * Returns: * %0 on success * %-EBADMSG on XDR buffer overflow */ @@ -677,7 +677,7 @@ xdr_stream_decode_u32(struct xdr_stream *xdr, __u32 *ptr) * @xdr: pointer to xdr_stream * @ptr: location to store integer * - * Return values: + * Returns: * %0 on success * %-EBADMSG on XDR buffer overflow */ @@ -698,7 +698,7 @@ xdr_stream_decode_be32(struct xdr_stream *xdr, __be32 *ptr) * @xdr: pointer to xdr_stream * @ptr: location to store 64-bit integer * - * Return values: + * Returns: * %0 on success * %-EBADMSG on XDR buffer overflow */ @@ -720,7 +720,7 @@ xdr_stream_decode_u64(struct xdr_stream *xdr, __u64 *ptr) * @ptr: location to store data * @len: size of buffer pointed to by @ptr * - * Return values: + * Returns: * %0 on success * %-EBADMSG on XDR buffer overflow */ @@ -746,7 +746,7 @@ xdr_stream_decode_opaque_fixed(struct xdr_stream *xdr, void *ptr, size_t len) * on @xdr. It is therefore expected that the object it points to should * be processed immediately. * - * Return values: + * Returns: * On success, returns size of object stored in *@ptr * %-EBADMSG on XDR buffer overflow * %-EMSGSIZE if the size of the object would exceed @maxlen @@ -777,7 +777,7 @@ xdr_stream_decode_opaque_inline(struct xdr_stream *xdr, void **ptr, size_t maxle * @array: location to store the integer array or NULL * @array_size: number of elements to store * - * Return values: + * Returns: * On success, returns number of elements stored in @array * %-EBADMSG on XDR buffer overflow * %-EMSGSIZE if the size of the array exceeds @array_size -- cgit v1.2.3 From f995fc377ac7d3757e1d94e6403940c4b8f3d76e Mon Sep 17 00:00:00 2001 From: "Tycho Andersen (AMD)" Date: Tue, 24 Mar 2026 10:13:00 -0600 Subject: crypto/ccp: Implement SNP x86 shutdown The SEV firmware has support to disable SNP during an SNP_SHUTDOWN_EX command. Verify that this support is available and set the flag so that SNP is disabled when it is not being used. In cases where SNP is disabled, skip the call to amd_iommu_snp_disable(), as all of the IOMMU pages have already been made shared. Also skip the panic case, since snp_shutdown() does IPIs. Signed-off-by: Tycho Andersen (AMD) Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Acked-by: Herbert Xu Link: https://patch.msgid.link/20260324161301.1353976-7-tycho@kernel.org --- include/linux/psp-sev.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index 69ffa4b4d1fa..d5099a2baca5 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -829,12 +829,14 @@ struct sev_data_range_list { * * @len: length of the command buffer read by the PSP * @iommu_snp_shutdown: Disable enforcement of SNP in the IOMMU + * @x86_snp_shutdown: Disable SNP on all cores * @rsvd1: reserved */ struct sev_data_snp_shutdown_ex { u32 len; u32 iommu_snp_shutdown:1; - u32 rsvd1:31; + u32 x86_snp_shutdown:1; + u32 rsvd1:30; } __packed; /** @@ -891,6 +893,7 @@ struct snp_feature_info { } __packed; /* Feature bits in ECX */ +#define SNP_X86_SHUTDOWN_SUPPORTED BIT(1) #define SNP_RAPL_DISABLE_SUPPORTED BIT(2) #define SNP_CIPHER_TEXT_HIDING_SUPPORTED BIT(3) #define SNP_AES_256_XTS_POLICY_SUPPORTED BIT(4) -- cgit v1.2.3 From f3b536878a3cf47e5193a96176a3ca2aaf0d848f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 11 Mar 2026 22:14:44 -0700 Subject: powercap: correct kernel-doc function parameter names Use the correct function parameter names in kernel-doc comments to avoid these warnings: Warning: include/linux/powercap.h:254 function parameter 'name' not described in 'powercap_register_control_type' Warning: include/linux/powercap.h:298 function parameter 'nr_constraints' not described in 'powercap_register_zone' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260312051444.685136-1-rdunlap@infradead.org Signed-off-by: Rafael J. Wysocki --- include/linux/powercap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/powercap.h b/include/linux/powercap.h index 3d557bbcd2c7..603419db924c 100644 --- a/include/linux/powercap.h +++ b/include/linux/powercap.h @@ -238,7 +238,7 @@ static inline void *powercap_get_zone_data(struct powercap_zone *power_zone) * Advantage of this parameter is that client can embed * this data in its data structures and allocate in a * single call, preventing multiple allocations. -* @control_type_name: The Name of this control_type, which will be shown +* @name: The Name of this control_type, which will be shown * in the sysfs Interface. * @ops: Callbacks for control type. This parameter is optional. * @@ -277,7 +277,7 @@ int powercap_unregister_control_type(struct powercap_control_type *instance); * @name: A name for this zone. * @parent: A pointer to the parent power zone instance if any or NULL * @ops: Pointer to zone operation callback structure. -* @no_constraints: Number of constraints for this zone +* @nr_constraints: Number of constraints for this zone * @const_ops: Pointer to constraint callback structure * * Register a power zone under a given control type. A power zone must register -- cgit v1.2.3 From 8765715b4e8a1dd24ab5d507c42fc0bcd3d83f5c Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Fri, 13 Mar 2026 11:53:27 -0700 Subject: powercap: intel_rapl: Remove unused AVERAGE_POWER primitive The AVERAGE_POWER primitive and RAPL_PRIMITIVE_DERIVED flag are not used anywhere in the code. Remove them to simplify the primitive handling logic. No functional changes. Co-developed-by: Zhang Rui Signed-off-by: Zhang Rui Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/20260313185333.2370733-2-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- include/linux/intel_rapl.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index 6d694099a3ad..9e6bd654be1f 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -77,7 +77,6 @@ enum rapl_primitives { PSYS_TIME_WINDOW1, PSYS_TIME_WINDOW2, /* below are not raw primitive data */ - AVERAGE_POWER, NR_RAPL_PRIMITIVES, }; -- cgit v1.2.3 From 59eb73b98ae0b12fc9b39c08f0f5a5552cb02d1e Mon Sep 17 00:00:00 2001 From: John Groves Date: Fri, 27 Mar 2026 21:04:22 +0000 Subject: dax: Factor out dax_folio_reset_order() helper Both fs/dax.c:dax_folio_put() and drivers/dax/fsdev.c: fsdev_clear_folio_state() (the latter coming in the next commit after this one) contain nearly identical code to reset a compound DAX folio back to order-0 pages. Factor this out into a shared helper function. The new dax_folio_reset_order() function: - Clears the folio's mapping and share count - Resets compound folio state via folio_reset_order() - Clears PageHead and compound_head for each sub-page - Restores the pgmap pointer for each resulting order-0 folio - Returns the original folio order (for callers that need to advance by that many pages) Two intentional differences from the original dax_folio_put() logic: 1. folio->share is cleared unconditionally. This is correct because the DAX subsystem maintains the invariant that share != 0 only when mapping == NULL (enforced by dax_folio_make_shared()). dax_folio_put() ensures share has reached zero before calling this helper, so the unconditional clear is safe. 2. folio->pgmap is now explicitly restored for order-0 folios. For the dax_folio_put() caller this is a no-op (reads and writes back the same field). It is intentional for the upcoming fsdev_clear_folio_state() caller, which converts previously-compound folios and needs pgmap re-established for all pages regardless of order. This simplifies fsdev_clear_folio_state() from ~50 lines to ~15 lines. Suggested-by: Jonathan Cameron Reviewed-by: Ira Weiny Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Signed-off-by: John Groves Link: https://patch.msgid.link/0100019d311cc6b9-5be7428a-7f16-4774-8f90-a44b88ac5660-000000@email.amazonses.com Signed-off-by: Ira Weiny --- include/linux/dax.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index bf103f317cac..73cfc1a7c8f1 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -153,6 +153,7 @@ static inline void fs_put_dax(struct dax_device *dax_dev, void *holder) #if IS_ENABLED(CONFIG_FS_DAX) int dax_writeback_mapping_range(struct address_space *mapping, struct dax_device *dax_dev, struct writeback_control *wbc); +int dax_folio_reset_order(struct folio *folio); struct page *dax_layout_busy_page(struct address_space *mapping); struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end); -- cgit v1.2.3 From 700ecbc1f5aa02ba9ad68d7be1ef7a9c8eae07e9 Mon Sep 17 00:00:00 2001 From: John Groves Date: Fri, 27 Mar 2026 21:05:03 +0000 Subject: dax: Add dax_set_ops() for setting dax_operations at bind time Add a new dax_set_ops() function that allows drivers to set the dax_operations after the dax_device has been allocated. This is needed for fsdev_dax where the operations need to be set during probe and cleared during unbind. The fsdev driver uses devm_add_action_or_reset() for cleanup consistency, avoiding the complexity of mixing devm-managed resources with manual cleanup in a remove() callback. This ensures cleanup happens automatically in the correct reverse order when the device is unbound. Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Signed-off-by: John Groves Link: https://patch.msgid.link/0100019d311d65a0-b9c1419e-f3a0-4afd-b0bd-848f18ff5950-000000@email.amazonses.com Signed-off-by: Ira Weiny --- include/linux/dax.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 73cfc1a7c8f1..b19bfe0c2fd1 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -243,6 +243,7 @@ static inline void dax_break_layout_final(struct inode *inode) bool dax_alive(struct dax_device *dax_dev); void *dax_get_private(struct dax_device *dax_dev); +int dax_set_ops(struct dax_device *dax_dev, const struct dax_operations *ops); long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, void **kaddr, unsigned long *pfn); size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, -- cgit v1.2.3 From eec38f5d86d27535509c99f02ccc642ceb0c3e2a Mon Sep 17 00:00:00 2001 From: John Groves Date: Fri, 27 Mar 2026 21:05:12 +0000 Subject: dax: Add fs_dax_get() func to prepare dax for fs-dax usage The fs_dax_get() function should be called by fs-dax file systems after opening a fsdev dax device. This adds holder_operations, which provides a memory failure callback path and effects exclusivity between callers of fs_dax_get(). fs_dax_get() is specific to fsdev_dax, so it checks the driver type (which required touching bus.[ch]). fs_dax_get() fails if fsdev_dax is not bound to the memory. This function serves the same role as fs_dax_get_by_bdev(), which dax file systems call after opening the pmem block device. This can't be located in fsdev.c because struct dax_device is opaque there. This will be called by fs/fuse/famfs.c in a subsequent commit. Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Signed-off-by: John Groves Link: https://patch.msgid.link/0100019d311d8750-75395c22-031b-4d5f-aebe-790dca656b87-000000@email.amazonses.com Signed-off-by: Ira Weiny --- include/linux/dax.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index b19bfe0c2fd1..a85e270bfb3c 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -130,7 +130,6 @@ int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk); void dax_remove_host(struct gendisk *disk); struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off, void *holder, const struct dax_holder_operations *ops); -void fs_put_dax(struct dax_device *dax_dev, void *holder); #else static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk) { @@ -145,12 +144,12 @@ static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, { return NULL; } -static inline void fs_put_dax(struct dax_device *dax_dev, void *holder) -{ -} #endif /* CONFIG_BLOCK && CONFIG_FS_DAX */ #if IS_ENABLED(CONFIG_FS_DAX) +void fs_put_dax(struct dax_device *dax_dev, void *holder); +int fs_dax_get(struct dax_device *dax_dev, void *holder, + const struct dax_holder_operations *hops); int dax_writeback_mapping_range(struct address_space *mapping, struct dax_device *dax_dev, struct writeback_control *wbc); int dax_folio_reset_order(struct folio *folio); @@ -164,6 +163,15 @@ dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, void dax_unlock_mapping_entry(struct address_space *mapping, unsigned long index, dax_entry_t cookie); #else +static inline void fs_put_dax(struct dax_device *dax_dev, void *holder) +{ +} + +static inline int fs_dax_get(struct dax_device *dax_dev, void *holder, + const struct dax_holder_operations *hops) +{ + return -EOPNOTSUPP; +} static inline struct page *dax_layout_busy_page(struct address_space *mapping) { return NULL; -- cgit v1.2.3 From 2ae624d5a555d47a735fb3f4d850402859a4db77 Mon Sep 17 00:00:00 2001 From: John Groves Date: Fri, 27 Mar 2026 21:05:21 +0000 Subject: dax: export dax_dev_get() famfs needs to look up a dax_device by dev_t when resolving fmap entries that reference character dax devices. Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Signed-off-by: John Groves Link: https://patch.msgid.link/0100019d311daab5-bb212f0b-4e05-4668-bf53-d76fab56be68-000000@email.amazonses.com Signed-off-by: Ira Weiny --- include/linux/dax.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index a85e270bfb3c..9ef95b136bb8 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -54,6 +54,7 @@ struct dax_device *alloc_dax(void *private, const struct dax_operations *ops); void *dax_holder(struct dax_device *dax_dev); void put_dax(struct dax_device *dax_dev); void kill_dax(struct dax_device *dax_dev); +struct dax_device *dax_dev_get(dev_t devt); void dax_write_cache(struct dax_device *dax_dev, bool wc); bool dax_write_cache_enabled(struct dax_device *dax_dev); bool dax_synchronous(struct dax_device *dax_dev); -- cgit v1.2.3 From 698f54d4eb9034bb4366985659bd77ae471b6c4e Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Wed, 25 Mar 2026 15:55:20 +0100 Subject: usb: translate ENOSPC for user space In case of insufficient bandwidth usb_submit_urb() returns -ENOSPC. Translating this to -EIO is not optimal. There are insufficient resources not an error. EBUSY is a better fit. Signed-off-by: Oliver Neukum Link: https://patch.msgid.link/20260325145537.372993-1-oneukum@suse.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 04277af4bb9d..815f2212936e 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -2075,6 +2075,8 @@ static inline int usb_translate_errors(int error_code) case -ENODEV: case -EOPNOTSUPP: return error_code; + case -ENOSPC: + return -EBUSY; default: return -EIO; } -- cgit v1.2.3 From b422f7c072ac8d9b83c3d22e03709b92626ca88a Mon Sep 17 00:00:00 2001 From: Amit Sunil Dhamne Date: Wed, 25 Mar 2026 22:22:24 +0000 Subject: mfd: max77759: add register bitmasks and modify irq configs for charger MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add register bitmasks for charger function. In addition split the charger IRQs further such that each bit represents an IRQ downstream of charger regmap irq chip. In addition populate the ack_base to offload irq ack to the regmap irq chip framework. Signed-off-by: Amit Sunil Dhamne Reviewed-by: André Draszik Link: https://patch.msgid.link/20260325-max77759-charger-v9-3-4486dd297adc@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/mfd/max77759.h | 166 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 137 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/max77759.h b/include/linux/mfd/max77759.h index c6face34e385..ad1aa4c2b779 100644 --- a/include/linux/mfd/max77759.h +++ b/include/linux/mfd/max77759.h @@ -59,35 +59,65 @@ #define MAX77759_MAXQ_REG_AP_DATAIN0 0xb1 #define MAX77759_MAXQ_REG_UIC_SWRST 0xe0 -#define MAX77759_CHGR_REG_CHG_INT 0xb0 -#define MAX77759_CHGR_REG_CHG_INT2 0xb1 -#define MAX77759_CHGR_REG_CHG_INT_MASK 0xb2 -#define MAX77759_CHGR_REG_CHG_INT2_MASK 0xb3 -#define MAX77759_CHGR_REG_CHG_INT_OK 0xb4 -#define MAX77759_CHGR_REG_CHG_DETAILS_00 0xb5 -#define MAX77759_CHGR_REG_CHG_DETAILS_01 0xb6 -#define MAX77759_CHGR_REG_CHG_DETAILS_02 0xb7 -#define MAX77759_CHGR_REG_CHG_DETAILS_03 0xb8 -#define MAX77759_CHGR_REG_CHG_CNFG_00 0xb9 -#define MAX77759_CHGR_REG_CHG_CNFG_01 0xba -#define MAX77759_CHGR_REG_CHG_CNFG_02 0xbb -#define MAX77759_CHGR_REG_CHG_CNFG_03 0xbc -#define MAX77759_CHGR_REG_CHG_CNFG_04 0xbd -#define MAX77759_CHGR_REG_CHG_CNFG_05 0xbe -#define MAX77759_CHGR_REG_CHG_CNFG_06 0xbf -#define MAX77759_CHGR_REG_CHG_CNFG_07 0xc0 -#define MAX77759_CHGR_REG_CHG_CNFG_08 0xc1 -#define MAX77759_CHGR_REG_CHG_CNFG_09 0xc2 -#define MAX77759_CHGR_REG_CHG_CNFG_10 0xc3 -#define MAX77759_CHGR_REG_CHG_CNFG_11 0xc4 -#define MAX77759_CHGR_REG_CHG_CNFG_12 0xc5 -#define MAX77759_CHGR_REG_CHG_CNFG_13 0xc6 -#define MAX77759_CHGR_REG_CHG_CNFG_14 0xc7 -#define MAX77759_CHGR_REG_CHG_CNFG_15 0xc8 -#define MAX77759_CHGR_REG_CHG_CNFG_16 0xc9 -#define MAX77759_CHGR_REG_CHG_CNFG_17 0xca -#define MAX77759_CHGR_REG_CHG_CNFG_18 0xcb -#define MAX77759_CHGR_REG_CHG_CNFG_19 0xcc +#define MAX77759_CHGR_REG_CHG_INT 0xb0 +#define MAX77759_CHGR_REG_CHG_INT_AICL BIT(7) +#define MAX77759_CHGR_REG_CHG_INT_CHGIN BIT(6) +#define MAX77759_CHGR_REG_CHG_INT_WCIN BIT(5) +#define MAX77759_CHGR_REG_CHG_INT_CHG BIT(4) +#define MAX77759_CHGR_REG_CHG_INT_BAT BIT(3) +#define MAX77759_CHGR_REG_CHG_INT_INLIM BIT(2) +#define MAX77759_CHGR_REG_CHG_INT_THM2 BIT(1) +#define MAX77759_CHGR_REG_CHG_INT_BYP BIT(0) +#define MAX77759_CHGR_REG_CHG_INT2 0xb1 +#define MAX77759_CHGR_REG_CHG_INT2_INSEL BIT(7) +#define MAX77759_CHGR_REG_CHG_INT2_SYS_UVLO1 BIT(6) +#define MAX77759_CHGR_REG_CHG_INT2_SYS_UVLO2 BIT(5) +#define MAX77759_CHGR_REG_CHG_INT2_BAT_OILO BIT(4) +#define MAX77759_CHGR_REG_CHG_INT2_CHG_STA_CC BIT(3) +#define MAX77759_CHGR_REG_CHG_INT2_CHG_STA_CV BIT(2) +#define MAX77759_CHGR_REG_CHG_INT2_CHG_STA_TO BIT(1) +#define MAX77759_CHGR_REG_CHG_INT2_CHG_STA_DONE BIT(0) +#define MAX77759_CHGR_REG_CHG_INT_MASK 0xb2 +#define MAX77759_CHGR_REG_CHG_INT2_MASK 0xb3 +#define MAX77759_CHGR_REG_CHG_INT_OK 0xb4 +#define MAX77759_CHGR_REG_CHG_DETAILS_00 0xb5 +#define MAX77759_CHGR_REG_CHG_DETAILS_00_CHGIN_DTLS GENMASK(6, 5) +#define MAX77759_CHGR_REG_CHG_DETAILS_01 0xb6 +#define MAX77759_CHGR_REG_CHG_DETAILS_01_BAT_DTLS GENMASK(6, 4) +#define MAX77759_CHGR_REG_CHG_DETAILS_01_CHG_DTLS GENMASK(3, 0) +#define MAX77759_CHGR_REG_CHG_DETAILS_02 0xb7 +#define MAX77759_CHGR_REG_CHG_DETAILS_02_CHGIN_STS BIT(5) +#define MAX77759_CHGR_REG_CHG_DETAILS_03 0xb8 +#define MAX77759_CHGR_REG_CHG_CNFG_00 0xb9 +#define MAX77759_CHGR_REG_CHG_CNFG_00_MODE GENMASK(3, 0) +#define MAX77759_CHGR_REG_CHG_CNFG_01 0xba +#define MAX77759_CHGR_REG_CHG_CNFG_02 0xbb +#define MAX77759_CHGR_REG_CHG_CNFG_02_CHGCC GENMASK(5, 0) +#define MAX77759_CHGR_REG_CHG_CNFG_03 0xbc +#define MAX77759_CHGR_REG_CHG_CNFG_04 0xbd +#define MAX77759_CHGR_REG_CHG_CNFG_04_CHG_CV_PRM GENMASK(5, 0) +#define MAX77759_CHGR_REG_CHG_CNFG_05 0xbe +#define MAX77759_CHGR_REG_CHG_CNFG_06 0xbf +#define MAX77759_CHGR_REG_CHG_CNFG_06_CHGPROT GENMASK(3, 2) +#define MAX77759_CHGR_REG_CHG_CNFG_07 0xc0 +#define MAX77759_CHGR_REG_CHG_CNFG_08 0xc1 +#define MAX77759_CHGR_REG_CHG_CNFG_09 0xc2 +#define MAX77759_CHGR_REG_CHG_CNFG_09_CHGIN_ILIM GENMASK(6, 0) +#define MAX77759_CHGR_REG_CHG_CNFG_10 0xc3 +#define MAX77759_CHGR_REG_CHG_CNFG_11 0xc4 +#define MAX77759_CHGR_REG_CHG_CNFG_12 0xc5 +/* Wireless Charging input channel select */ +#define MAX77759_CHGR_REG_CHG_CNFG_12_WCINSEL BIT(6) +/* CHGIN/USB input channel select */ +#define MAX77759_CHGR_REG_CHG_CNFG_12_CHGINSEL BIT(5) +#define MAX77759_CHGR_REG_CHG_CNFG_13 0xc6 +#define MAX77759_CHGR_REG_CHG_CNFG_14 0xc7 +#define MAX77759_CHGR_REG_CHG_CNFG_15 0xc8 +#define MAX77759_CHGR_REG_CHG_CNFG_16 0xc9 +#define MAX77759_CHGR_REG_CHG_CNFG_17 0xca +#define MAX77759_CHGR_REG_CHG_CNFG_18 0xcb +#define MAX77759_CHGR_REG_CHG_CNFG_18_WDTEN BIT(0) +#define MAX77759_CHGR_REG_CHG_CNFG_19 0xcc /* MaxQ opcodes for max77759_maxq_command() */ #define MAX77759_MAXQ_OPCODE_MAXLENGTH (MAX77759_MAXQ_REG_AP_DATAOUT32 - \ @@ -101,6 +131,84 @@ #define MAX77759_MAXQ_OPCODE_USER_SPACE_READ 0x81 #define MAX77759_MAXQ_OPCODE_USER_SPACE_WRITE 0x82 +/* + * enum max77759_chgr_chgin_dtls_status - Charger Input Status + * @MAX77759_CHGR_CHGIN_DTLS_VBUS_UNDERVOLTAGE: + * Charger input voltage (Vchgin) < Under Voltage Threshold (Vuvlo) + * @MAX77759_CHGR_CHGIN_DTLS_VBUS_MARGINAL_VOLTAGE: Vchgin > Vuvlo and + * Vchgin < (Battery Voltage (Vbatt) + system voltage (Vsys)) + * @MAX77759_CHGR_CHGIN_DTLS_VBUS_OVERVOLTAGE: + * Vchgin > Over Voltage threshold (Vovlo) + * @MAX77759_CHGR_CHGIN_DTLS_VBUS_VALID: + * Vchgin > Vuvlo, Vchgin < Vovlo and Vchgin > (Vsys + Vbatt) + */ +enum max77759_chgr_chgin_dtls_status { + MAX77759_CHGR_CHGIN_DTLS_VBUS_UNDERVOLTAGE, + MAX77759_CHGR_CHGIN_DTLS_VBUS_MARGINAL_VOLTAGE, + MAX77759_CHGR_CHGIN_DTLS_VBUS_OVERVOLTAGE, + MAX77759_CHGR_CHGIN_DTLS_VBUS_VALID, +}; + +/* + * enum max77759_chgr_bat_dtls_states - Battery Details + * @MAX77759_CHGR_BAT_DTLS_NO_BATT_CHG_SUSP: No battery and the charger suspended + * @MAX77759_CHGR_BAT_DTLS_DEAD_BATTERY: Vbatt < Vtrickle + * @MAX77759_CHGR_BAT_DTLS_BAT_CHG_TIMER_FAULT: Charging suspended due to timer fault + * @MAX77759_CHGR_BAT_DTLS_BAT_OKAY: Battery okay and Vbatt > Min Sys Voltage (Vsysmin) + * @MAX77759_CHGR_BAT_DTLS_BAT_UNDERVOLTAGE: Battery is okay. Vtrickle < Vbatt < Vsysmin + * @MAX77759_CHGR_BAT_DTLS_BAT_OVERVOLTAGE: Battery voltage > Overvoltage threshold + * @MAX77759_CHGR_BAT_DTLS_BAT_OVERCURRENT: Battery current exceeds overcurrent threshold + * @MAX77759_CHGR_BAT_DTLS_BAT_ONLY_MODE: Battery only mode and battery level not available + */ +enum max77759_chgr_bat_dtls_states { + MAX77759_CHGR_BAT_DTLS_NO_BATT_CHG_SUSP, + MAX77759_CHGR_BAT_DTLS_DEAD_BATTERY, + MAX77759_CHGR_BAT_DTLS_BAT_CHG_TIMER_FAULT, + MAX77759_CHGR_BAT_DTLS_BAT_OKAY, + MAX77759_CHGR_BAT_DTLS_BAT_UNDERVOLTAGE, + MAX77759_CHGR_BAT_DTLS_BAT_OVERVOLTAGE, + MAX77759_CHGR_BAT_DTLS_BAT_OVERCURRENT, + MAX77759_CHGR_BAT_DTLS_BAT_ONLY_MODE, +}; + +/* + * enum max77759_chgr_chg_dtls_states - Charger Details + * @MAX77759_CHGR_CHG_DTLS_PREQUAL: Charger in prequalification mode + * @MAX77759_CHGR_CHG_DTLS_CC: Charger in fast charge const curr mode + * @MAX77759_CHGR_CHG_DTLS_CV: Charger in fast charge const voltage mode + * @MAX77759_CHGR_CHG_DTLS_TO: Charger is in top off mode + * @MAX77759_CHGR_CHG_DTLS_DONE: Charger is done + * @MAX77759_CHGR_CHG_DTLS_RSVD_1: Reserved + * @MAX77759_CHGR_CHG_DTLS_TIMER_FAULT: Charger is in timer fault mode + * @MAX77759_CHGR_CHG_DTLS_SUSP_BATT_THM: Charger is suspended as battery removal detected + * @MAX77759_CHGR_CHG_DTLS_OFF: Charger is off. Input invalid or charger disabled + * @MAX77759_CHGR_CHG_DTLS_RSVD_2: Reserved + * @MAX77759_CHGR_CHG_DTLS_RSVD_3: Reserved + * @MAX77759_CHGR_CHG_DTLS_OFF_WDOG_TIMER: Charger is off as watchdog timer expired + * @MAX77759_CHGR_CHG_DTLS_SUSP_JEITA: Charger is in JEITA control mode + */ +enum max77759_chgr_chg_dtls_states { + MAX77759_CHGR_CHG_DTLS_PREQUAL, + MAX77759_CHGR_CHG_DTLS_CC, + MAX77759_CHGR_CHG_DTLS_CV, + MAX77759_CHGR_CHG_DTLS_TO, + MAX77759_CHGR_CHG_DTLS_DONE, + MAX77759_CHGR_CHG_DTLS_RSVD_1, + MAX77759_CHGR_CHG_DTLS_TIMER_FAULT, + MAX77759_CHGR_CHG_DTLS_SUSP_BATT_THM, + MAX77759_CHGR_CHG_DTLS_OFF, + MAX77759_CHGR_CHG_DTLS_RSVD_2, + MAX77759_CHGR_CHG_DTLS_RSVD_3, + MAX77759_CHGR_CHG_DTLS_OFF_WDOG_TIMER, + MAX77759_CHGR_CHG_DTLS_SUSP_JEITA, +}; + +enum max77759_chgr_mode { + MAX77759_CHGR_MODE_OFF, + MAX77759_CHGR_MODE_CHG_BUCK_ON = 0x5, + MAX77759_CHGR_MODE_OTG_BOOST_ON = 0xA, +}; + /** * struct max77759 - core max77759 internal data structure * -- cgit v1.2.3 From f23388d0f6523cc3a72edf6e78cb11931a07da10 Mon Sep 17 00:00:00 2001 From: Amit Sunil Dhamne Date: Wed, 25 Mar 2026 22:22:25 +0000 Subject: lib/linear_ranges: Add linear_range_get_selector_high_array Add a helper function to find the selector for a given value in a linear range array. The selector should be such that the value it represents should be higher or equal to the given value. Signed-off-by: Amit Sunil Dhamne Reviewed-by: Matti Vaittinen Acked-by: Mark Brown Link: https://patch.msgid.link/20260325-max77759-charger-v9-4-4486dd297adc@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/linear_range.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/linear_range.h b/include/linux/linear_range.h index 2e4f4c3539c0..0f3037f1a94f 100644 --- a/include/linux/linear_range.h +++ b/include/linux/linear_range.h @@ -57,5 +57,8 @@ void linear_range_get_selector_within(const struct linear_range *r, int linear_range_get_selector_low_array(const struct linear_range *r, int ranges, unsigned int val, unsigned int *selector, bool *found); +int linear_range_get_selector_high_array(const struct linear_range *r, + int ranges, unsigned int val, + unsigned int *selector, bool *found); #endif -- cgit v1.2.3 From 68a66a44af6e196ca426d1250104d3018ed9e74b Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sun, 25 Jan 2026 13:30:03 +0200 Subject: soc: qcom: ubwc: add helper to get min_acc length MDSS and GPU drivers use different approaches to get min_acc length. Add helper function that can be used by all the drivers. The helper reflects our current best guess, it blindly copies the approach adopted by the MDSS drivers and it matches current values selected by the GPU driver. Reviewed-by: Bryan O'Donoghue Acked-by: Bjorn Andersson Reviewed-by: Konrad Dybcio Reviewed-by: Dikshita Agarwal Tested-by: Wangao Wang Signed-off-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20260125-iris-ubwc-v4-1-1ff30644ac81@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- include/linux/soc/qcom/ubwc.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/ubwc.h b/include/linux/soc/qcom/ubwc.h index f052e241736c..5bdeca18d54d 100644 --- a/include/linux/soc/qcom/ubwc.h +++ b/include/linux/soc/qcom/ubwc.h @@ -74,4 +74,14 @@ static inline bool qcom_ubwc_get_ubwc_mode(const struct qcom_ubwc_cfg_data *cfg) return ret; } +/* + * This is the best guess, based on the MDSS driver, which worked so far. + */ +static inline bool qcom_ubwc_min_acc_length_64b(const struct qcom_ubwc_cfg_data *cfg) +{ + return cfg->ubwc_enc_version == UBWC_1_0 && + (cfg->ubwc_dec_version == UBWC_2_0 || + cfg->ubwc_dec_version == UBWC_3_0); +} + #endif /* __QCOM_UBWC_H__ */ -- cgit v1.2.3 From b2571ef8d4ec9bb636889a9132090bcc3449792e Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sun, 25 Jan 2026 13:30:04 +0200 Subject: soc: qcom: ubwc: add helpers to get programmable values Currently the database stores macrotile_mode in the data. However it can be derived from the rest of the data: it should be used for UBWC encoding >= 3.0 except for several corner cases (SM8150 and SC8180X). The ubwc_bank_spread field seems to be based on the impreside data we had for the MDSS and DPU programming. In some cases UBWC engine inside the display controller doesn't need to program it, although bank spread is to be enabled. Bank swizzle is also currently stored as is, but it is almost standard (banks 1-3 for UBWC 1.0 and 2-3 for other versions), the only exception being Lemans (it uses only bank 3). Add helpers returning values from the config for now. They will be rewritten later, in a separate series, but having the helper now simplifies refacroring the code later. Tested-by: Wangao Wang Signed-off-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20260125-iris-ubwc-v4-2-1ff30644ac81@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- include/linux/soc/qcom/ubwc.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/ubwc.h b/include/linux/soc/qcom/ubwc.h index 5bdeca18d54d..f5d0e2341261 100644 --- a/include/linux/soc/qcom/ubwc.h +++ b/include/linux/soc/qcom/ubwc.h @@ -84,4 +84,19 @@ static inline bool qcom_ubwc_min_acc_length_64b(const struct qcom_ubwc_cfg_data cfg->ubwc_dec_version == UBWC_3_0); } +static inline bool qcom_ubwc_macrotile_mode(const struct qcom_ubwc_cfg_data *cfg) +{ + return cfg->macrotile_mode; +} + +static inline bool qcom_ubwc_bank_spread(const struct qcom_ubwc_cfg_data *cfg) +{ + return cfg->ubwc_bank_spread; +} + +static inline u32 qcom_ubwc_swizzle(const struct qcom_ubwc_cfg_data *cfg) +{ + return cfg->ubwc_swizzle; +} + #endif /* __QCOM_UBWC_H__ */ -- cgit v1.2.3 From c6f4e552e1eae4a5726230254108213b085e1ae3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 15 Nov 2025 19:07:41 -0800 Subject: rcutorture: Add a textbook-style trivial preemptible RCU This commit adds a trivial textbook implementation of preemptible RCU to rcutorture ("torture_type=trivial-preempt"), similar in spirit to the existing "torture_type=trivial" textbook implementation of non-preemptible RCU. Neither trivial RCU implementation has any value for production use, and are intended only to keep Paul honest in his introductory writings and presentations. [ paulmck: Apply kernel test robot feedback. ] Signed-off-by: Joel Fernandes --- include/linux/sched.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5a5d3dbc9cdf..ffb2ad9716f0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -949,6 +949,10 @@ struct task_struct { struct srcu_ctr __percpu *trc_reader_scp; #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ +#ifdef CONFIG_TRIVIAL_PREEMPT_RCU + int rcu_trivial_preempt_nesting; +#endif /* #ifdef CONFIG_TRIVIAL_PREEMPT_RCU */ + struct sched_info sched_info; struct list_head tasks; -- cgit v1.2.3 From d978d3fc0488691f3b10919594d1d7d465fa568b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 27 Jan 2026 15:24:24 -0800 Subject: srcu: Fix SRCU read flavor macro comments The SRCU_READ_FLAVOR_FAST and SRCU_READ_FLAVOR_FAST_UPDOWN comments need repair. The former fails to not that SRCU-fast can be used in NMI handlers, and the latter says that it goes with srcu_read_lock_fast() when it really goes with srcu_read_lock_fast_updown(). This commit therefore fixes both comments. Signed-off-by: Paul E. McKenney Signed-off-by: Joel Fernandes --- include/linux/srcu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index bb44a0bd7696..81b1938512d5 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -69,8 +69,8 @@ int init_srcu_struct_fast_updown(struct srcu_struct *ssp); #define SRCU_READ_FLAVOR_NORMAL 0x1 // srcu_read_lock(). #define SRCU_READ_FLAVOR_NMI 0x2 // srcu_read_lock_nmisafe(). // 0x4 // SRCU-lite is no longer with us. -#define SRCU_READ_FLAVOR_FAST 0x4 // srcu_read_lock_fast(). -#define SRCU_READ_FLAVOR_FAST_UPDOWN 0x8 // srcu_read_lock_fast(). +#define SRCU_READ_FLAVOR_FAST 0x4 // srcu_read_lock_fast(), also NMI-safe. +#define SRCU_READ_FLAVOR_FAST_UPDOWN 0x8 // srcu_read_lock_fast_updown(). #define SRCU_READ_FLAVOR_ALL (SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_NMI | \ SRCU_READ_FLAVOR_FAST | SRCU_READ_FLAVOR_FAST_UPDOWN) // All of the above. -- cgit v1.2.3 From 4968907016c2a54800a67273b92b3b66245bd372 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Jan 2026 10:28:10 -0800 Subject: srcu: Fix s/they disables/they disable/ typo in srcu_read_unlock_fast() Typo fix in srcu_read_unlock_fast() header comment. Reported-by: Mathieu Desnoyers Signed-off-by: Paul E. McKenney Reviewed-by: Mathieu Desnoyers Signed-off-by: Joel Fernandes --- include/linux/srcutree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index be76fa4fc170..fd1a9270cb9a 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -260,7 +260,7 @@ static inline struct srcu_ctr __percpu *__srcu_ctr_to_ptr(struct srcu_struct *ss * srcu_read_unlock_fast(). * * Note that both this_cpu_inc() and atomic_long_inc() are RCU read-side - * critical sections either because they disables interrupts, because + * critical sections either because they disable interrupts, because * they are a single instruction, or because they are read-modify-write * atomic operations, depending on the whims of the architecture. * This matters because the SRCU-fast grace-period mechanism uses either -- cgit v1.2.3 From ad6ef775cbefffd6c614dfc57429c364192b5de0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 14 Jan 2026 16:18:30 -0800 Subject: rcu-tasks: Document that RCU Tasks Trace grace periods now imply RCU grace periods Now that RCU Tasks Trace is implemented in terms of SRCU-fast, the fact that each SRCU-fast grace period implies at least two RCU grace periods in turn means that each RCU Tasks Trace grace period implies at least two grace periods. This commit therefore updates the documentation accordingly. Reviewed-by: Frederic Weisbecker Reported-by: Alexei Starovoitov Signed-off-by: Paul E. McKenney Signed-off-by: Joel Fernandes --- include/linux/rcupdate.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 04f3f86a4145..18a85c30fd4f 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -208,12 +208,9 @@ static inline void exit_tasks_rcu_finish(void) { } /** * rcu_trace_implies_rcu_gp - does an RCU Tasks Trace grace period imply an RCU grace period? * - * As an accident of implementation, an RCU Tasks Trace grace period also - * acts as an RCU grace period. However, this could change at any time. - * Code relying on this accident must call this function to verify that - * this accident is still happening. - * - * You have been warned! + * Now that RCU Tasks Trace is implemented in terms of SRCU-fast, a + * call to synchronize_rcu_tasks_trace() is guaranteed to imply at least + * one call to synchronize_rcu(). */ static inline bool rcu_trace_implies_rcu_gp(void) { return true; } -- cgit v1.2.3 From 6e39ba4e5a82aa5469b2ac517b74a71accb0540f Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Thu, 26 Mar 2026 21:44:01 +0100 Subject: cpufreq: Add boost_freq_req QoS request The Power Management Quality of Service (PM QoS) allows to aggregate constraints from multiple entities. It is currently used to manage the min/max frequency of a given policy. Frequency constraints can come for instance from: - Thermal framework: acpi_thermal_cpufreq_init() - Firmware: _PPC objects: acpi_processor_ppc_init() - User: by setting policyX/scaling_[min|max]_freq The minimum of the max frequency constraints is used to compute the resulting maximum allowed frequency. When enabling boost frequencies, the same frequency request object (policy->max_freq_req) as to handle requests from users is used. As a result, when setting: - scaling_max_freq - boost The last sysfs file used overwrites the request from the other sysfs file. To avoid this, create a per-policy boost_freq_req to save the boost constraints instead of overwriting the last scaling_max_freq constraint. policy_set_boost() calls the cpufreq set_boost callback. Update the newly added boost_freq_req request from there: - whenever boost is toggled - to cover all possible paths In the existing .set_boost() callbacks: - Don't update policy->max as this is done through the qos notifier cpufreq_notifier_max() which calls cpufreq_set_policy(). - Remove freq_qos_update_request() calls as the qos request is now done in policy_set_boost() and updates the new boost_freq_req $ ## Init state scaling_max_freq:1000000 cpuinfo_max_freq:1000000 $ echo 700000 > scaling_max_freq scaling_max_freq:700000 cpuinfo_max_freq:1000000 $ echo 1 > ../boost scaling_max_freq:1200000 cpuinfo_max_freq:1200000 $ echo 800000 > scaling_max_freq scaling_max_freq:800000 cpuinfo_max_freq:1200000 $ ## Final step: $ ## Without the patches: $ echo 0 > ../boost scaling_max_freq:1000000 cpuinfo_max_freq:1000000 $ ## With the patches: $ echo 0 > ../boost scaling_max_freq:800000 cpuinfo_max_freq:1000000 Note: cpufreq_frequency_table_cpuinfo() updates policy->min and max from: A. cpufreq_boost_set_sw() \-cpufreq_frequency_table_cpuinfo() B. cpufreq_policy_online() \-cpufreq_table_validate_and_sort() \-cpufreq_frequency_table_cpuinfo() Keep these updates as some drivers expect policy->min and max to be set through B. Reviewed-by: Lifeng Zheng Signed-off-by: Pierre Gondois Acked-by: Viresh Kumar Link: https://patch.msgid.link/20260326204404.1401849-3-pierre.gondois@arm.com Signed-off-by: Rafael J. Wysocki --- include/linux/cpufreq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 8ca2bcb3d7ae..b6f6c7d06912 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -81,6 +81,7 @@ struct cpufreq_policy { struct freq_constraints constraints; struct freq_qos_request *min_freq_req; struct freq_qos_request *max_freq_req; + struct freq_qos_request *boost_freq_req; struct cpufreq_frequency_table *freq_table; enum cpufreq_table_sorting freq_table_sorted; -- cgit v1.2.3 From 5de7bcaadf160c1716b20a263cf8f5b06f658959 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 30 Mar 2026 13:11:07 -0700 Subject: x86: rename and clean up __copy_from_user_inatomic_nocache() Similarly to the previous commit, this renames the somewhat confusingly named function. But in this case, it was at least less confusing: the __copy_from_user_inatomic_nocache is indeed copying from user memory, and it is indeed ok to be used in an atomic context, so it will not warn about it. But the previous commit also removed the NTB mis-use of the __copy_from_user_inatomic_nocache() function, and as a result every call-site is now _actually_ doing a real user copy. That means that we can now do the proper user pointer verification too. End result: add proper address checking, remove the double underscores, and change the "nocache" to "nontemporal" to more accurately describe what this x86-only function actually does. It might be worth noting that only the target is non-temporal: the actual user accesses are normal memory accesses. Also worth noting is that non-x86 targets (and on older 32-bit x86 CPU's before XMM2 in the Pentium III) we end up just falling back on a regular user copy, so nothing can actually depend on the non-temporal semantics, but that has always been true. Signed-off-by: Linus Torvalds --- include/linux/uaccess.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 1f3804245c06..4c7d0b815093 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -331,16 +331,21 @@ static inline size_t probe_subpage_writeable(char __user *uaddr, size_t size) #endif /* CONFIG_ARCH_HAS_SUBPAGE_FAULTS */ -#ifndef ARCH_HAS_NOCACHE_UACCESS +#ifndef ARCH_HAS_NONTEMPORAL_UACCESS static inline __must_check unsigned long -__copy_from_user_inatomic_nocache(void *to, const void __user *from, +copy_from_user_inatomic_nontemporal(void *to, const void __user *from, unsigned long n) { + if (can_do_masked_user_access()) + from = mask_user_address(from); + else + if (!access_ok(from, n)) + return n; return __copy_from_user_inatomic(to, from, n); } -#endif /* ARCH_HAS_NOCACHE_UACCESS */ +#endif /* ARCH_HAS_NONTEMPORAL_UACCESS */ extern __must_check int check_zeroed_user(const void __user *from, size_t size); -- cgit v1.2.3 From 555aa178f8d22261d71da74df6267e6e6e97f95a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 27 Mar 2026 17:55:08 +0100 Subject: vfio: unhide vdev->debug_root When debugfs is disabled, the hisilicon driver now fails to build: drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c: In function 'hisi_acc_vfio_debug_init': drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c:1671:62: error: 'struct vfio_device' has no member named 'debug_root' 1671 | vfio_dev_migration = debugfs_lookup("migration", vdev->debug_root); | ^~ The driver otherwise relies on dead-code elimination, but this reference fails. The single struct member is not going to make much of a difference for memory consumption, so just keep this visible unconditionally. Signed-off-by: Arnd Bergmann Fixes: b398f91779b8 ("hisi_acc_vfio_pci: register debugfs for hisilicon migration driver") Link: https://lore.kernel.org/r/20260327165521.3779707-1-arnd@kernel.org Signed-off-by: Alex Williamson --- include/linux/vfio.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 50b474334a19..31b826efba00 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -74,13 +74,11 @@ struct vfio_device { u8 iommufd_attached:1; #endif u8 cdev_opened:1; -#ifdef CONFIG_DEBUG_FS /* * debug_root is a static property of the vfio_device * which must be set prior to registering the vfio_device. */ struct dentry *debug_root; -#endif }; /** -- cgit v1.2.3 From 331e5fd5bfd7aae3ab4eb947367b9d609ebb3fb3 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 26 Mar 2026 10:30:00 +0100 Subject: hwmon: (ina2xx) drop unused platform data Nobody defines struct ina2xx_platform_data. Remove platform data support from the drivers which still have it (it's effectively dead code) and remove the header. Signed-off-by: Bartosz Golaszewski Reviewed-by: Andy Shevchenko Acked-by: Jonathan Cameron Link: https://lore.kernel.org/r/20260326-drop-ina2xx-pdata-v1-1-c159437bb2df@oss.qualcomm.com [groeck: Fixed continuation line alignment] Signed-off-by: Guenter Roeck --- include/linux/platform_data/ina2xx.h | 16 ---------------- 1 file changed, 16 deletions(-) delete mode 100644 include/linux/platform_data/ina2xx.h (limited to 'include/linux') diff --git a/include/linux/platform_data/ina2xx.h b/include/linux/platform_data/ina2xx.h deleted file mode 100644 index 2aa5ee9a9050..000000000000 --- a/include/linux/platform_data/ina2xx.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Driver for Texas Instruments INA219, INA226 power monitor chips - * - * Copyright (C) 2012 Lothar Felten - * - * For further information, see the Documentation/hwmon/ina2xx.rst file. - */ - -/** - * struct ina2xx_platform_data - ina2xx info - * @shunt_uohms shunt resistance in microohms - */ -struct ina2xx_platform_data { - long shunt_uohms; -}; -- cgit v1.2.3 From e7fef85039ccdba67d97b2a09f313aceeb6691c8 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Thu, 26 Mar 2026 13:36:29 +0530 Subject: serdev: Convert to_serdev_*() helpers to macros and use container_of_const() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If these helpers receive the 'const struct device' pointer, then the const qualifier will get dropped, leading to below warning: warning: passing argument 1 of ‘to_serdev_device_driver’ discards 'const' qualifier from pointer target type [-Wdiscarded-qualifiers] This is not an issue as of now, but with the future commits adding serdev device based driver matching, this warning will get triggered. Hence, convert these helpers to macros so that the qualifier get preserved and also use container_of_const() as container_of() is deprecated. Tested-by: Hans de Goede # ThinkPad T14s gen6 (arm64) Reviewed-by: Bartosz Golaszewski Signed-off-by: Manivannan Sadhasivam Reviewed-by: Rob Herring (Arm) Link: https://patch.msgid.link/20260326-pci-m2-e-v7-1-43324a7866e6@oss.qualcomm.com Signed-off-by: Bartosz Golaszewski --- include/linux/serdev.h | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serdev.h b/include/linux/serdev.h index 5654c58eb73c..0c7d3c27d1f8 100644 --- a/include/linux/serdev.h +++ b/include/linux/serdev.h @@ -49,10 +49,7 @@ struct serdev_device { struct mutex write_lock; }; -static inline struct serdev_device *to_serdev_device(struct device *d) -{ - return container_of(d, struct serdev_device, dev); -} +#define to_serdev_device(d) container_of_const(d, struct serdev_device, dev) /** * struct serdev_device_driver - serdev slave device driver @@ -68,10 +65,7 @@ struct serdev_device_driver { void (*shutdown)(struct serdev_device *); }; -static inline struct serdev_device_driver *to_serdev_device_driver(struct device_driver *d) -{ - return container_of(d, struct serdev_device_driver, driver); -} +#define to_serdev_device_driver(d) container_of_const(d, struct serdev_device_driver, driver) enum serdev_parity { SERDEV_PARITY_NONE, @@ -112,10 +106,7 @@ struct serdev_controller { const struct serdev_controller_ops *ops; }; -static inline struct serdev_controller *to_serdev_controller(struct device *d) -{ - return container_of(d, struct serdev_controller, dev); -} +#define to_serdev_controller(d) container_of_const(d, struct serdev_controller, dev) static inline void *serdev_device_get_drvdata(const struct serdev_device *serdev) { -- cgit v1.2.3 From a2b4814190af5944b276c5fd708d95ea146106b3 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Thu, 26 Mar 2026 13:36:30 +0530 Subject: serdev: Add an API to find the serdev controller associated with the devicetree node Add of_find_serdev_controller_by_node() API to find the serdev controller device associated with the devicetree node. Tested-by: Hans de Goede # ThinkPad T14s gen6 (arm64) Reviewed-by: Bartosz Golaszewski Signed-off-by: Manivannan Sadhasivam Reviewed-by: Rob Herring (Arm) Link: https://patch.msgid.link/20260326-pci-m2-e-v7-2-43324a7866e6@oss.qualcomm.com Signed-off-by: Bartosz Golaszewski --- include/linux/serdev.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serdev.h b/include/linux/serdev.h index 0c7d3c27d1f8..188c0ba62d50 100644 --- a/include/linux/serdev.h +++ b/include/linux/serdev.h @@ -334,4 +334,13 @@ static inline bool serdev_acpi_get_uart_resource(struct acpi_resource *ares, } #endif /* CONFIG_ACPI */ +#ifdef CONFIG_OF +struct serdev_controller *of_find_serdev_controller_by_node(struct device_node *node); +#else +static inline struct serdev_controller *of_find_serdev_controller_by_node(struct device_node *node) +{ + return NULL; +} +#endif /* CONFIG_OF */ + #endif /*_LINUX_SERDEV_H */ -- cgit v1.2.3 From 25bd73562941b04cfba1a278d8c84f2b1c69b8e9 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 31 Mar 2026 12:00:10 +0200 Subject: dma: contiguous: Turn heap registration logic around The CMA heap instantiation was initially developed by having the contiguous DMA code call into the CMA heap to create a new instance every time a reserved memory area is probed. Turning the CMA heap into a module would create a dependency of the kernel on a module, which doesn't work. Let's turn the logic around and do the opposite: store all the reserved memory CMA regions into the contiguous DMA code, and provide an iterator for the heap to use when it probes. Signed-off-by: Maxime Ripard Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260331-dma-buf-heaps-as-modules-v4-1-e18fda504419@kernel.org --- include/linux/dma-buf/heaps/cma.h | 16 ---------------- include/linux/dma-map-ops.h | 5 +++++ 2 files changed, 5 insertions(+), 16 deletions(-) delete mode 100644 include/linux/dma-buf/heaps/cma.h (limited to 'include/linux') diff --git a/include/linux/dma-buf/heaps/cma.h b/include/linux/dma-buf/heaps/cma.h deleted file mode 100644 index e751479e21e7..000000000000 --- a/include/linux/dma-buf/heaps/cma.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef DMA_BUF_HEAP_CMA_H_ -#define DMA_BUF_HEAP_CMA_H_ - -struct cma; - -#ifdef CONFIG_DMABUF_HEAPS_CMA -int dma_heap_cma_register_heap(struct cma *cma); -#else -static inline int dma_heap_cma_register_heap(struct cma *cma) -{ - return 0; -} -#endif // CONFIG_DMABUF_HEAPS_CMA - -#endif // DMA_BUF_HEAP_CMA_H_ diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 60b63756df82..c4c93c72ff6f 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -99,6 +99,7 @@ static inline struct cma *dev_get_cma_area(struct device *dev) return dev->cma_area; return dma_contiguous_default_area; } +struct cma *dma_contiguous_get_area_by_idx(unsigned int idx); void dma_contiguous_reserve(phys_addr_t addr_limit); int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, @@ -117,6 +118,10 @@ static inline struct cma *dev_get_cma_area(struct device *dev) { return NULL; } +static inline struct cma *dma_contiguous_get_area_by_idx(unsigned int idx) +{ + return NULL; +} static inline void dma_contiguous_reserve(phys_addr_t limit) { } -- cgit v1.2.3 From b3707be95f045c4e526e419435af29dc9dd1c267 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 31 Mar 2026 12:00:11 +0200 Subject: dma: contiguous: Make dev_get_cma_area() a proper function As we try to enable dma-buf heaps, and the CMA one in particular, to compile as modules, we need to export dev_get_cma_area(). It's currently implemented as an inline function that returns either the content of device->cma_area or dma_contiguous_default_area. Thus, it means we need to export dma_contiguous_default_area, which isn't really something we want any module to have access to. Instead, let's make dev_get_cma_area() a proper function we will be able to export so we can avoid exporting dma_contiguous_default_area. Signed-off-by: Maxime Ripard Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260331-dma-buf-heaps-as-modules-v4-2-e18fda504419@kernel.org --- include/linux/dma-map-ops.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index c4c93c72ff6f..8604106c0c01 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -93,12 +93,7 @@ static inline void set_dma_ops(struct device *dev, #ifdef CONFIG_DMA_CMA extern struct cma *dma_contiguous_default_area; -static inline struct cma *dev_get_cma_area(struct device *dev) -{ - if (dev && dev->cma_area) - return dev->cma_area; - return dma_contiguous_default_area; -} +struct cma *dev_get_cma_area(struct device *dev); struct cma *dma_contiguous_get_area_by_idx(unsigned int idx); void dma_contiguous_reserve(phys_addr_t addr_limit); -- cgit v1.2.3 From 633040f853467a490437ace26d6a5413e64c0dd0 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 31 Mar 2026 12:00:12 +0200 Subject: dma: contiguous: Make dma_contiguous_default_area static Now that dev_get_cma_area() is no longer inline, we don't have any user of dma_contiguous_default_area() outside of contiguous.c so we can make it static. Signed-off-by: Maxime Ripard Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260331-dma-buf-heaps-as-modules-v4-3-e18fda504419@kernel.org --- include/linux/dma-map-ops.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 8604106c0c01..bef279ebeae7 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -91,8 +91,6 @@ static inline void set_dma_ops(struct device *dev, #endif /* CONFIG_ARCH_HAS_DMA_OPS */ #ifdef CONFIG_DMA_CMA -extern struct cma *dma_contiguous_default_area; - struct cma *dev_get_cma_area(struct device *dev); struct cma *dma_contiguous_get_area_by_idx(unsigned int idx); -- cgit v1.2.3 From 499d2d2f4cf9f16634db47b06dee9676611b897f Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Tue, 10 Mar 2026 10:53:49 +0100 Subject: sed-opal: Add STACK_RESET command The TCG Opal device could enter a state where no new session can be created, blocking even Discovery or PSID reset. While a power cycle or waiting for the timeout should work, there is another possibility for recovery: using the Stack Reset command. The Stack Reset command is defined in the TCG Storage Architecture Core Specification and is mandatory for all Opal devices (see Section 3.3.6 of the Opal SSC specification). This patch implements the Stack Reset command. Sending it should clear all active sessions immediately, allowing subsequent commands to run successfully. While it is a TCG transport layer command, the Linux kernel implements only Opal ioctls, so it makes sense to use the IOC_OPAL ioctl interface. The Stack Reset takes no arguments; the response can be success or pending. If the command reports a pending state, userspace can try to repeat it; in this case, the code returns -EBUSY. Signed-off-by: Milan Broz Reviewed-by: Ondrej Kozina Link: https://patch.msgid.link/20260310095349.411287-1-gmazyland@gmail.com Signed-off-by: Jens Axboe --- include/linux/sed-opal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index aa006edb612b..0630430cc01a 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -57,6 +57,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_LR_SET_START_LEN: case IOC_OPAL_ENABLE_DISABLE_LR: case IOC_OPAL_GET_SUM_STATUS: + case IOC_OPAL_STACK_RESET: return true; } return false; -- cgit v1.2.3 From f5587d1b6ec938afb2f74fe399a68020d66923e4 Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Mon, 30 Mar 2026 13:10:00 +0200 Subject: rv: Add Hybrid Automata monitor type Deterministic automata define which events are allowed in every state, but cannot define more sophisticated constraint taking into account the system's environment (e.g. time or other states not producing events). Add the Hybrid Automata monitor type as an extension of Deterministic automata where each state transition is validating a constraint on a finite number of environment variables. Hybrid automata can be used to implement timed automata, where the environment variables are clocks. Also implement the necessary functionality to handle clock constraints (ns or jiffy granularity) on state and events. Reviewed-by: Nam Cao Link: https://lore.kernel.org/r/20260330111010.153663-3-gmonaco@redhat.com Signed-off-by: Gabriele Monaco --- include/linux/rv.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rv.h b/include/linux/rv.h index 58774eb3aecf..0aef9e3c785c 100644 --- a/include/linux/rv.h +++ b/include/linux/rv.h @@ -81,11 +81,49 @@ struct ltl_monitor {}; #endif /* CONFIG_RV_LTL_MONITOR */ +#ifdef CONFIG_RV_HA_MONITOR +/* + * In the future, hybrid automata may rely on multiple + * environment variables, e.g. different clocks started at + * different times or running at different speed. + * For now we support only 1 variable. + */ +#define MAX_HA_ENV_LEN 1 + +/* + * Monitors can pick the preferred timer implementation: + * No timer: if monitors don't have state invariants. + * Timer wheel: lightweight invariants check but far less precise. + * Hrtimer: accurate invariants check with higher overhead. + */ +#define HA_TIMER_NONE 0 +#define HA_TIMER_WHEEL 1 +#define HA_TIMER_HRTIMER 2 + +/* + * Hybrid automaton per-object variables. + */ +struct ha_monitor { + struct da_monitor da_mon; + u64 env_store[MAX_HA_ENV_LEN]; + union { + struct hrtimer hrtimer; + struct timer_list timer; + }; +}; + +#else + +struct ha_monitor { }; + +#endif /* CONFIG_RV_HA_MONITOR */ + #define RV_PER_TASK_MONITOR_INIT (CONFIG_RV_PER_TASK_MONITORS) union rv_task_monitor { struct da_monitor da_mon; struct ltl_monitor ltl_mon; + struct ha_monitor ha_mon; }; #ifdef CONFIG_RV_REACTORS -- cgit v1.2.3 From 4a24127bd6cbf03fb17de8b43f2d8db3f55ca333 Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Mon, 30 Mar 2026 13:10:06 +0200 Subject: rv: Add support for per-object monitors in DA/HA RV deterministic and hybrid automata currently only support global, per-cpu and per-task monitors. It isn't possible to write a model that would follow some different type of object, like a deadline entity or a lock. Define the generic per-object monitor implementation which shares part of the implementation with the per-task monitors. The user needs to provide an id for the object (e.g. pid for tasks) and define the data type for the monitor_target (e.g. struct task_struct * for tasks). Both are supplied to the event handlers, as the id may not be easily available in the target. The monitor storage (e.g. the rv monitor, pointer to the target, etc.) is stored in a hash table indexed by id. Monitor storage objects are automatically allocated unless specified otherwise (e.g. if the creation context is unsafe for allocation). Reviewed-by: Nam Cao Link: https://lore.kernel.org/r/20260330111010.153663-9-gmonaco@redhat.com Signed-off-by: Gabriele Monaco --- include/linux/rv.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/rv.h b/include/linux/rv.h index 0aef9e3c785c..541ba404926a 100644 --- a/include/linux/rv.h +++ b/include/linux/rv.h @@ -13,6 +13,7 @@ #define RV_MON_GLOBAL 0 #define RV_MON_PER_CPU 1 #define RV_MON_PER_TASK 2 +#define RV_MON_PER_OBJ 3 #ifdef CONFIG_RV #include -- cgit v1.2.3 From c85dbddad705babfbddfef182495994f7f5262c9 Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Mon, 30 Mar 2026 13:10:09 +0200 Subject: sched/deadline: Move some utility functions to deadline.h Some utility functions on sched_dl_entity can be useful outside of deadline.c , for instance for modelling, without relying on raw structure fields. Move functions like dl_task_of and dl_is_implicit to deadline.h to make them available outside. Acked-by: Juri Lelli Link: https://lore.kernel.org/r/20260330111010.153663-12-gmonaco@redhat.com Signed-off-by: Gabriele Monaco --- include/linux/sched/deadline.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index c40115d4e34d..1198138cb839 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -37,4 +37,31 @@ extern void dl_clear_root_domain_cpu(int cpu); extern u64 dl_cookie; extern bool dl_bw_visited(int cpu, u64 cookie); +static inline bool dl_server(struct sched_dl_entity *dl_se) +{ + return dl_se->dl_server; +} + +static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) +{ + BUG_ON(dl_server(dl_se)); + return container_of(dl_se, struct task_struct, dl); +} + +/* + * Regarding the deadline, a task with implicit deadline has a relative + * deadline == relative period. A task with constrained deadline has a + * relative deadline <= relative period. + * + * We support constrained deadline tasks. However, there are some restrictions + * applied only for tasks which do not have an implicit deadline. See + * update_dl_entity() to know more about such restrictions. + * + * The dl_is_implicit() returns true if the task has an implicit deadline. + */ +static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) +{ + return dl_se->dl_deadline == dl_se->dl_period; +} + #endif /* _LINUX_SCHED_DEADLINE_H */ -- cgit v1.2.3 From 62ba6d66b22356a6a78fd015c37fd108710dfc32 Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Thu, 26 Mar 2026 19:48:28 -0700 Subject: EDAC/mc: Use kzalloc_flex() Convert struct mem_ctl_info to use flex array and use the new flex array helpers to enable runtime bounds checking, including annotating the array length member with __counted_by() for extra runtime analysis when requested. Move memcpy() after the counter assignment so that it is initialized before the first reference to the flex array, as the new attribute requires. [ bp: Heavily massage commit message. ] Signed-off-by: Rosen Penev Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Yazen Ghannam Reviewed-by: Qiuxu Zhuo Link: https://patch.msgid.link/20260327024828.7377-1-rosenp@gmail.com --- include/linux/edac.h | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/edac.h b/include/linux/edac.h index fa32f2aca22f..deba46b3ee25 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -541,17 +541,6 @@ struct mem_ctl_info { struct csrow_info **csrows; unsigned int nr_csrows, num_cschannel; - /* - * Memory Controller hierarchy - * - * There are basically two types of memory controller: the ones that - * sees memory sticks ("dimms"), and the ones that sees memory ranks. - * All old memory controllers enumerate memories per rank, but most - * of the recent drivers enumerate memories per DIMM, instead. - * When the memory controller is per rank, csbased is true. - */ - unsigned int n_layers; - struct edac_mc_layer *layers; bool csbased; /* @@ -609,6 +598,18 @@ struct mem_ctl_info { u8 fake_inject_layer[EDAC_MAX_LAYERS]; bool fake_inject_ue; u16 fake_inject_count; + + /* + * Memory Controller hierarchy + * + * There are basically two types of memory controller: the ones that + * sees memory sticks ("dimms"), and the ones that sees memory ranks. + * All old memory controllers enumerate memories per rank, but most + * of the recent drivers enumerate memories per DIMM, instead. + * When the memory controller is per rank, csbased is true. + */ + unsigned int n_layers; + struct edac_mc_layer layers[] __counted_by(n_layers); }; #define mci_for_each_dimm(mci, dimm) \ -- cgit v1.2.3 From 00247cbf173a9e1e2304db8e3f9172d36366b255 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 31 Mar 2026 09:37:19 -0700 Subject: refcount: Remove unused __signed_wrap function annotations With CONFIG_UBSAN_INTEGER_WRAP being replaced by Overflow Behavior Types, remove the __signed_wrap function annotation as it is already unused, and any future work here will use OBT annotations instead. Link: https://patch.msgid.link/20260331163725.2765789-1-kees@kernel.org Signed-off-by: Kees Cook --- include/linux/compiler_types.h | 9 +-------- include/linux/refcount.h | 10 +++++----- 2 files changed, 6 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 890076d0974b..e8fd77593b68 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -432,18 +432,11 @@ struct ftrace_likely_data { #define at_least #endif -/* Do not trap wrapping arithmetic within an annotated function. */ -#ifdef CONFIG_UBSAN_INTEGER_WRAP -# define __signed_wrap __attribute__((no_sanitize("signed-integer-overflow"))) -#else -# define __signed_wrap -#endif - /* Section for code which can't be instrumented at all */ #define __noinstr_section(section) \ noinline notrace __attribute((__section__(section))) \ __no_kcsan __no_sanitize_address __no_profile __no_sanitize_coverage \ - __no_sanitize_memory __signed_wrap + __no_sanitize_memory #define noinstr __noinstr_section(".noinstr.text") diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 3da377ffb0c2..ba7657ced281 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -170,7 +170,7 @@ static inline unsigned int refcount_read(const refcount_t *r) return atomic_read(&r->refs); } -static inline __must_check __signed_wrap +static inline __must_check bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp) { int old = refcount_read(r); @@ -212,7 +212,7 @@ static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) return __refcount_add_not_zero(i, r, NULL); } -static inline __must_check __signed_wrap +static inline __must_check bool __refcount_add_not_zero_limited_acquire(int i, refcount_t *r, int *oldp, int limit) { @@ -244,7 +244,7 @@ __refcount_inc_not_zero_limited_acquire(refcount_t *r, int *oldp, int limit) return __refcount_add_not_zero_limited_acquire(1, r, oldp, limit); } -static inline __must_check __signed_wrap +static inline __must_check bool __refcount_add_not_zero_acquire(int i, refcount_t *r, int *oldp) { return __refcount_add_not_zero_limited_acquire(i, r, oldp, INT_MAX); @@ -277,7 +277,7 @@ static inline __must_check bool refcount_add_not_zero_acquire(int i, refcount_t return __refcount_add_not_zero_acquire(i, r, NULL); } -static inline __signed_wrap +static inline void __refcount_add(int i, refcount_t *r, int *oldp) { int old = atomic_fetch_add_relaxed(i, &r->refs); @@ -383,7 +383,7 @@ static inline void refcount_inc(refcount_t *r) __refcount_inc(r, NULL); } -static inline __must_check __signed_wrap +static inline __must_check bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp) { int old = atomic_fetch_sub_release(i, &r->refs); -- cgit v1.2.3 From c76fef7dcd9372e3476d4df5e0a72ed5919a814b Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 31 Mar 2026 23:10:20 +0200 Subject: bpf: Fix grace period wait for tracepoint bpf_link Recently, tracepoints were switched from using disabled preemption (which acts as RCU read section) to SRCU-fast when they are not faultable. This means that to do a proper grace period wait for programs running in such tracepoints, we must use SRCU's grace period wait. This is only for non-faultable tracepoints, faultable ones continue using RCU Tasks Trace. However, bpf_link_free() currently does call_rcu() for all cases when the link is non-sleepable (hence, for tracepoints, non-faultable). Fix this by doing a call_srcu() grace period wait. As far RCU Tasks Trace gp -> RCU gp chaining is concerned, it is deemed unnecessary for tracepoint programs. The link and program are either accessed under RCU Tasks Trace protection, or SRCU-fast protection now. The earlier logic of chaining both RCU Tasks Trace and RCU gp waits was to generalize the logic, even if it conceded an extra RCU gp wait, however that is unnecessary for tracepoints even before this change. In practice no cost was paid since rcu_trace_implies_rcu_gp() was always true. Hence we need not chaining any RCU gp after the SRCU gp. For instance, in the non-faultable raw tracepoint, the RCU read section of the program in __bpf_trace_run() is enclosed in the SRCU gp, likewise for faultable raw tracepoint, the program is under the RCU Tasks Trace protection. Hence, the outermost scope can be waited upon to ensure correctness. Also, sleepable programs cannot be attached to non-faultable tracepoints, so whenever program or link is sleepable, only RCU Tasks Trace protection is being used for the link and prog. Fixes: a46023d5616e ("tracing: Guard __DECLARE_TRACE() use of __DO_TRACE_CALL() with SRCU-fast") Reviewed-by: Sun Jian Reviewed-by: Puranjay Mohan Acked-by: Andrii Nakryiko Signed-off-by: Kumar Kartikeya Dwivedi Acked-by: Steven Rostedt (Google) Link: https://lore.kernel.org/r/20260331211021.1632902-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++++ include/linux/tracepoint.h | 20 ++++++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 05b34a6355b0..35b1e25bd104 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1854,6 +1854,10 @@ struct bpf_link_ops { * target hook is sleepable, we'll go through tasks trace RCU GP and * then "classic" RCU GP; this need for chaining tasks trace and * classic RCU GPs is designated by setting bpf_link->sleepable flag + * + * For non-sleepable tracepoint links we go through SRCU gp instead, + * since RCU is not used in that case. Sleepable tracepoints still + * follow the scheme above. */ void (*dealloc_deferred)(struct bpf_link *link); int (*detach)(struct bpf_link *link); diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 22ca1c8b54f3..1d7f29f5e901 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -122,6 +122,22 @@ static inline bool tracepoint_is_faultable(struct tracepoint *tp) { return tp->ext && tp->ext->faultable; } +/* + * Run RCU callback with the appropriate grace period wait for non-faultable + * tracepoints, e.g., those used in atomic context. + */ +static inline void call_tracepoint_unregister_atomic(struct rcu_head *rcu, rcu_callback_t func) +{ + call_srcu(&tracepoint_srcu, rcu, func); +} +/* + * Run RCU callback with the appropriate grace period wait for faultable + * tracepoints, e.g., those used in syscall context. + */ +static inline void call_tracepoint_unregister_syscall(struct rcu_head *rcu, rcu_callback_t func) +{ + call_rcu_tasks_trace(rcu, func); +} #else static inline void tracepoint_synchronize_unregister(void) { } @@ -129,6 +145,10 @@ static inline bool tracepoint_is_faultable(struct tracepoint *tp) { return false; } +static inline void call_tracepoint_unregister_atomic(struct rcu_head *rcu, rcu_callback_t func) +{ } +static inline void call_tracepoint_unregister_syscall(struct rcu_head *rcu, rcu_callback_t func) +{ } #endif #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS -- cgit v1.2.3 From 10a4eb5882ba16164ece86d99486084f02f148bb Mon Sep 17 00:00:00 2001 From: Siddharth Nayyar Date: Thu, 26 Mar 2026 21:25:02 +0000 Subject: module: define ksym_flags enumeration to represent kernel symbol flags The core architectural issue with kernel symbol flags is our reliance on splitting the main symbol table, ksymtab. To handle a single boolean property, such as GPL-only, all exported symbols are split across two separate tables: __ksymtab and __ksymtab_gpl. This design forces the module loader to perform a separate search on each of these tables for every symbol it needs, for vmlinux and for all previously loaded modules. This approach is fundamentally not scalable. If we were to introduce a second flag, we would need four distinct symbol tables. For n boolean flags, this model requires an exponential growth to 2^n tables, dramatically increasing complexity. Another consequence of this fragmentation is degraded performance. For example, a binary search on the symbol table of vmlinux, that would take only 14 comparison steps (assuming ~2^14 or 16K symbols) in a unified table, can require up to 26 steps when spread across two tables (assuming both tables have ~2^13 symbols). This performance penalty worsens as more flags are added. To address this, symbol flags is an enumeration used to represent flags as a bitset, for example a flag to tell if a symbol is GPL only. The said bitset is introduced in subsequent patches and will contain values of kernel symbol flags. These bitset will then be used to infer flag values rather than fragmenting ksymtab for separating symbols with different flag values, thereby eliminating the need to fragment the ksymtab. Link: https://lore.kernel.org/r/20260326-kflagstab-v5-0-fa0796fe88d9@google.com Signed-off-by: Siddharth Nayyar Reviewed-by: Petr Pavlu [Sami: Updated the commit message to explain the use case for the series.] Signed-off-by: Sami Tolvanen --- include/linux/module_symbol.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/module_symbol.h b/include/linux/module_symbol.h index 77c9895b9ddb..574609aced99 100644 --- a/include/linux/module_symbol.h +++ b/include/linux/module_symbol.h @@ -2,6 +2,11 @@ #ifndef _LINUX_MODULE_SYMBOL_H #define _LINUX_MODULE_SYMBOL_H +/* Kernel symbol flags bitset. */ +enum ksym_flags { + KSYM_FLAG_GPL_ONLY = 1 << 0, +}; + /* This ignores the intensely annoying "mapping symbols" found in ELF files. */ static inline bool is_mapping_symbol(const char *str) { -- cgit v1.2.3 From 16d0e04f546ffba78c74bbfeb57d93147bcaf2c5 Mon Sep 17 00:00:00 2001 From: Siddharth Nayyar Date: Thu, 26 Mar 2026 21:25:04 +0000 Subject: module: populate kflagstab in modpost This patch adds the ability to create entries for kernel symbol flag bitsets in kflagstab. Modpost populates only the GPL-only flag for now. Signed-off-by: Siddharth Nayyar Reviewed-by: Petr Pavlu Signed-off-by: Sami Tolvanen --- include/linux/export-internal.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/export-internal.h b/include/linux/export-internal.h index d445705ac13c..4123c7592404 100644 --- a/include/linux/export-internal.h +++ b/include/linux/export-internal.h @@ -69,4 +69,11 @@ ".long " #crc "\n" \ ".previous" "\n") +#define SYMBOL_FLAGS(sym, flags) \ + asm(" .section \"___kflagstab+" #sym "\",\"a\"" "\n" \ + "__flags_" #sym ":" "\n" \ + " .byte " #flags "\n" \ + " .previous" "\n" \ + ) + #endif /* __LINUX_EXPORT_INTERNAL_H__ */ -- cgit v1.2.3 From 55fcb926b6d8b5cfb40873e4840a69961db1bb69 Mon Sep 17 00:00:00 2001 From: Siddharth Nayyar Date: Thu, 26 Mar 2026 21:25:05 +0000 Subject: module: use kflagstab instead of *_gpl sections Read kflagstab section for vmlinux and modules to determine whether kernel symbols are GPL only. This patch eliminates the need for fragmenting the ksymtab for infering the value of GPL-only symbol flag, henceforth stop populating *_gpl versions of the ksymtab and kcrctab in modpost. Signed-off-by: Siddharth Nayyar Reviewed-by: Petr Pavlu Signed-off-by: Sami Tolvanen --- include/linux/export-internal.h | 21 +++++++++++---------- include/linux/module.h | 1 + 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/export-internal.h b/include/linux/export-internal.h index 4123c7592404..726054614752 100644 --- a/include/linux/export-internal.h +++ b/include/linux/export-internal.h @@ -37,14 +37,14 @@ * section flag requires it. Use '%progbits' instead of '@progbits' since the * former apparently works on all arches according to the binutils source. */ -#define __KSYMTAB(name, sym, sec, ns) \ +#define __KSYMTAB(name, sym, ns) \ asm(" .section \"__ksymtab_strings\",\"aMS\",%progbits,1" "\n" \ "__kstrtab_" #name ":" "\n" \ " .asciz \"" #name "\"" "\n" \ "__kstrtabns_" #name ":" "\n" \ " .asciz \"" ns "\"" "\n" \ " .previous" "\n" \ - " .section \"___ksymtab" sec "+" #name "\", \"a\"" "\n" \ + " .section \"___ksymtab+" #name "\", \"a\"" "\n" \ __KSYM_ALIGN "\n" \ "__ksymtab_" #name ":" "\n" \ __KSYM_REF(sym) "\n" \ @@ -59,15 +59,16 @@ #define KSYM_FUNC(name) name #endif -#define KSYMTAB_FUNC(name, sec, ns) __KSYMTAB(name, KSYM_FUNC(name), sec, ns) -#define KSYMTAB_DATA(name, sec, ns) __KSYMTAB(name, name, sec, ns) +#define KSYMTAB_FUNC(name, ns) __KSYMTAB(name, KSYM_FUNC(name), ns) +#define KSYMTAB_DATA(name, ns) __KSYMTAB(name, name, ns) -#define SYMBOL_CRC(sym, crc, sec) \ - asm(".section \"___kcrctab" sec "+" #sym "\",\"a\"" "\n" \ - ".balign 4" "\n" \ - "__crc_" #sym ":" "\n" \ - ".long " #crc "\n" \ - ".previous" "\n") +#define SYMBOL_CRC(sym, crc) \ + asm(" .section \"___kcrctab+" #sym "\",\"a\"" "\n" \ + " .balign 4" "\n" \ + "__crc_" #sym ":" "\n" \ + " .long " #crc "\n" \ + " .previous" "\n" \ + ) #define SYMBOL_FLAGS(sym, flags) \ asm(" .section \"___kflagstab+" #sym "\",\"a\"" "\n" \ diff --git a/include/linux/module.h b/include/linux/module.h index 60ed1c3e0ed9..917b29332e15 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -419,6 +419,7 @@ struct module { /* Exported symbols */ const struct kernel_symbol *syms; const u32 *crcs; + const u8 *flagstab; unsigned int num_syms; #ifdef CONFIG_ARCH_USES_CFI_TRAPS -- cgit v1.2.3 From b4760ff2a5e4351c59d185967735f59c0b0bd7f6 Mon Sep 17 00:00:00 2001 From: Siddharth Nayyar Date: Thu, 26 Mar 2026 21:25:06 +0000 Subject: module: deprecate usage of *_gpl sections in module loader The *_gpl section are not being used populated by modpost anymore. Hence the module loader doesn't need to find and process these sections in modules. This patch also simplifies symbol finding logic in module loader since *_gpl sections don't have to be searched anymore. Signed-off-by: Siddharth Nayyar Reviewed-by: Petr Pavlu Signed-off-by: Sami Tolvanen --- include/linux/module.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 917b29332e15..7566815fabbe 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -435,9 +435,6 @@ struct module { unsigned int num_kp; /* GPL-only exported symbols. */ - unsigned int num_gpl_syms; - const struct kernel_symbol *gpl_syms; - const u32 *gpl_crcs; bool using_gplonly_symbols; #ifdef CONFIG_MODULE_SIG -- cgit v1.2.3 From 8c0ef7b56d6bbbc53f2d43d99c195144f01b0775 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 11 Mar 2026 22:14:00 -0700 Subject: lis3lv02d: fix kernel-doc warnings Use the correct kernel-doc format to avoid kernel-doc warnings: Warning: include/linux/lis3lv02d.h:125 struct member 'st_min_limits' not described in 'lis3lv02d_platform_data' Warning: include/linux/lis3lv02d.h:125 struct member 'st_max_limits' not described in 'lis3lv02d_platform_data' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260312051400.682991-1-rdunlap@infradead.org Signed-off-by: Greg Kroah-Hartman --- include/linux/lis3lv02d.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lis3lv02d.h b/include/linux/lis3lv02d.h index b72b8cdba765..feb60ba4e30e 100644 --- a/include/linux/lis3lv02d.h +++ b/include/linux/lis3lv02d.h @@ -30,8 +30,8 @@ * @default_rate: Default sampling rate. 0 means reset default * @setup_resources: Interrupt line setup call back function * @release_resources: Interrupt line release call back function - * @st_min_limits[3]: Selftest acceptance minimum values - * @st_max_limits[3]: Selftest acceptance maximum values + * @st_min_limits: Selftest acceptance minimum values (x, y, z) + * @st_max_limits: Selftest acceptance maximum values (x, y, z) * @irq2: Irq line 2 number * * Platform data is used to setup the sensor chip. Meaning of the different -- cgit v1.2.3 From 8b7b85384fad6e21e8a28628e7ebacb5a6329de4 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Mon, 23 Mar 2026 09:20:42 +0200 Subject: memblock: move reserve_bootmem_range() to memblock.c and make it static reserve_bootmem_region() is only called from memmap_init_reserved_pages() and it was in mm/mm_init.c because of its dependecies on static init_deferred_page(). Since init_deferred_page() is not static anymore, move reserve_bootmem_region(), rename it to memmap_init_reserved_range() and make it static. Update the comment describing it to better reflect what the function does and drop bogus comment about reserved pages in free_bootmem_page(). Update memblock test stubs to reflect the core changes. Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: David Hildenbrand (Arm) Link: https://patch.msgid.link/20260323072042.3651061-1-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) --- include/linux/bootmem_info.h | 4 ---- include/linux/mm.h | 3 --- 2 files changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bootmem_info.h b/include/linux/bootmem_info.h index 4c506e76a808..492ceeb1cdf8 100644 --- a/include/linux/bootmem_info.h +++ b/include/linux/bootmem_info.h @@ -44,10 +44,6 @@ static inline void free_bootmem_page(struct page *page) { enum bootmem_type type = bootmem_type(page); - /* - * The reserve_bootmem_region sets the reserved flag on bootmem - * pages. - */ VM_BUG_ON_PAGE(page_ref_count(page) != 2, page); if (type == SECTION_INFO || type == MIX_SECTION_INFO) diff --git a/include/linux/mm.h b/include/linux/mm.h index abb4963c1f06..764d10fdfb5d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3686,9 +3686,6 @@ extern unsigned long free_reserved_area(void *start, void *end, extern void adjust_managed_page_count(struct page *page, long count); -extern void reserve_bootmem_region(phys_addr_t start, - phys_addr_t end, int nid); - /* Free the reserved page into the buddy system, so it gets managed. */ void free_reserved_page(struct page *page); -- cgit v1.2.3 From 87ce9e83ab8be5daf64351cd481ffa6537778e6b Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Mon, 23 Mar 2026 09:48:35 +0200 Subject: memblock, treewide: make memblock_free() handle late freeing It shouldn't be responsibility of memblock users to detect if they free memory allocated from memblock late and should use memblock_free_late(). Make memblock_free() and memblock_phys_free() take care of late memory freeing and drop memblock_free_late(). Link: https://patch.msgid.link/20260323074836.3653702-9-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) --- include/linux/memblock.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 6ec5e9ac0699..6f6c5b5c4a4b 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -172,8 +172,6 @@ void __next_mem_range_rev(u64 *idx, int nid, enum memblock_flags flags, struct memblock_type *type_b, phys_addr_t *out_start, phys_addr_t *out_end, int *out_nid); -void memblock_free_late(phys_addr_t base, phys_addr_t size); - #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP static inline void __next_physmem_range(u64 *idx, struct memblock_type *type, phys_addr_t *out_start, -- cgit v1.2.3 From 36776b7f8a8955b4e75b5d490a75fee0c7a2a7ef Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 31 Mar 2026 17:21:43 +0200 Subject: lib/hexdump: print_hex_dump_bytes() calls print_hex_dump_debug() print_hex_dump_bytes() claims to be a simple wrapper around print_hex_dump(), but it actally calls print_hex_dump_debug(), which means no output is printed if (dynamic) DEBUG is disabled. Update the documentation to match the implementation. Fixes: 091cb0994edd20d6 ("lib/hexdump: make print_hex_dump_bytes() a nop on !DEBUG builds") Signed-off-by: Geert Uytterhoeven Reviewed-by: Petr Mladek Link: https://patch.msgid.link/3d5c3069fd9102ecaf81d044b750cd613eb72a08.1774970392.git.geert+renesas@glider.be Signed-off-by: Petr Mladek --- include/linux/printk.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index 45c663124c9b..dc02e217a9d1 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -803,7 +803,8 @@ static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type, #endif /** - * print_hex_dump_bytes - shorthand form of print_hex_dump() with default params + * print_hex_dump_bytes - shorthand form of print_hex_dump_debug() with default + * params * @prefix_str: string to prefix each line with; * caller supplies trailing spaces for alignment if desired * @prefix_type: controls whether prefix of an offset, address, or none @@ -811,7 +812,7 @@ static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type, * @buf: data blob to dump * @len: number of bytes in the @buf * - * Calls print_hex_dump(), with log level of KERN_DEBUG, + * Calls print_hex_dump_debug(), with log level of KERN_DEBUG, * rowsize of 16, groupsize of 1, and ASCII output included. */ #define print_hex_dump_bytes(prefix_str, prefix_type, buf, len) \ -- cgit v1.2.3 From b7e8590987aa94c9dc51518fad0e58cb887b1db5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 30 Mar 2026 14:16:34 +0200 Subject: netfilter: ipset: use nla_strcmp for IPSET_ATTR_NAME attr IPSET_ATTR_NAME and IPSET_ATTR_NAMEREF are of NLA_STRING type, they cannot be treated like a c-string. They either have to be switched to NLA_NUL_STRING, or the compare operations need to use the nla functions. Fixes: f830837f0eed ("netfilter: ipset: list:set set type support") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index e9f4f845d760..b98331572ad2 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -309,7 +309,7 @@ enum { /* register and unregister set references */ extern ip_set_id_t ip_set_get_byname(struct net *net, - const char *name, struct ip_set **set); + const struct nlattr *name, struct ip_set **set); extern void ip_set_put_byindex(struct net *net, ip_set_id_t index); extern void ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name); extern ip_set_id_t ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index); -- cgit v1.2.3 From 88c4bd90725557796c15878b7cb70066e9e6b5ab Mon Sep 17 00:00:00 2001 From: Michal Wilczynski Date: Thu, 3 Apr 2025 15:10:51 +0200 Subject: firmware: thead: Fix buffer overflow and use standard endian macros Addresses two issues in the TH1520 AON firmware protocol driver: 1. Fix a potential buffer overflow where the code used unsafe pointer arithmetic to access the 'mode' field through the 'resource' pointer with an offset. This was flagged by Smatch static checker as: "buffer overflow 'data' 2 <= 3" 2. Replace custom RPC_SET_BE* and RPC_GET_BE* macros with standard kernel endianness conversion macros (cpu_to_be16, etc.) for better portability and maintainability. The functionality was re-tested with the GPU power-up sequence, confirming the GPU powers up correctly and the driver probes successfully. [ 12.702370] powervr ffef400000.gpu: [drm] loaded firmware powervr/rogue_36.52.104.182_v1.fw [ 12.711043] powervr ffef400000.gpu: [drm] FW version v1.0 (build 6645434 OS) [ 12.719787] [drm] Initialized powervr 1.0.0 for ffef400000.gpu on minor 0 Fixes: e4b3cbd840e5 ("firmware: thead: Add AON firmware protocol driver") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/17a0ccce-060b-4b9d-a3c4-8d5d5823b1c9@stanley.mountain/ Signed-off-by: Michal Wilczynski Reviewed-by: Dan Carpenter Acked-by: Drew Fustini Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- include/linux/firmware/thead/thead,th1520-aon.h | 74 ------------------------- 1 file changed, 74 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firmware/thead/thead,th1520-aon.h b/include/linux/firmware/thead/thead,th1520-aon.h index dae132b66873..d81f5f6f5b90 100644 --- a/include/linux/firmware/thead/thead,th1520-aon.h +++ b/include/linux/firmware/thead/thead,th1520-aon.h @@ -97,80 +97,6 @@ struct th1520_aon_rpc_ack_common { #define RPC_GET_SVC_FLAG_ACK_TYPE(MESG) (((MESG)->svc & 0x40) >> 6) #define RPC_SET_SVC_FLAG_ACK_TYPE(MESG, ACK) ((MESG)->svc |= (ACK) << 6) -#define RPC_SET_BE64(MESG, OFFSET, SET_DATA) \ - do { \ - u8 *data = (u8 *)(MESG); \ - u64 _offset = (OFFSET); \ - u64 _set_data = (SET_DATA); \ - data[_offset + 7] = _set_data & 0xFF; \ - data[_offset + 6] = (_set_data & 0xFF00) >> 8; \ - data[_offset + 5] = (_set_data & 0xFF0000) >> 16; \ - data[_offset + 4] = (_set_data & 0xFF000000) >> 24; \ - data[_offset + 3] = (_set_data & 0xFF00000000) >> 32; \ - data[_offset + 2] = (_set_data & 0xFF0000000000) >> 40; \ - data[_offset + 1] = (_set_data & 0xFF000000000000) >> 48; \ - data[_offset + 0] = (_set_data & 0xFF00000000000000) >> 56; \ - } while (0) - -#define RPC_SET_BE32(MESG, OFFSET, SET_DATA) \ - do { \ - u8 *data = (u8 *)(MESG); \ - u64 _offset = (OFFSET); \ - u64 _set_data = (SET_DATA); \ - data[_offset + 3] = (_set_data) & 0xFF; \ - data[_offset + 2] = (_set_data & 0xFF00) >> 8; \ - data[_offset + 1] = (_set_data & 0xFF0000) >> 16; \ - data[_offset + 0] = (_set_data & 0xFF000000) >> 24; \ - } while (0) - -#define RPC_SET_BE16(MESG, OFFSET, SET_DATA) \ - do { \ - u8 *data = (u8 *)(MESG); \ - u64 _offset = (OFFSET); \ - u64 _set_data = (SET_DATA); \ - data[_offset + 1] = (_set_data) & 0xFF; \ - data[_offset + 0] = (_set_data & 0xFF00) >> 8; \ - } while (0) - -#define RPC_SET_U8(MESG, OFFSET, SET_DATA) \ - do { \ - u8 *data = (u8 *)(MESG); \ - data[OFFSET] = (SET_DATA) & 0xFF; \ - } while (0) - -#define RPC_GET_BE64(MESG, OFFSET, PTR) \ - do { \ - u8 *data = (u8 *)(MESG); \ - u64 _offset = (OFFSET); \ - *(u32 *)(PTR) = \ - (data[_offset + 7] | data[_offset + 6] << 8 | \ - data[_offset + 5] << 16 | data[_offset + 4] << 24 | \ - data[_offset + 3] << 32 | data[_offset + 2] << 40 | \ - data[_offset + 1] << 48 | data[_offset + 0] << 56); \ - } while (0) - -#define RPC_GET_BE32(MESG, OFFSET, PTR) \ - do { \ - u8 *data = (u8 *)(MESG); \ - u64 _offset = (OFFSET); \ - *(u32 *)(PTR) = \ - (data[_offset + 3] | data[_offset + 2] << 8 | \ - data[_offset + 1] << 16 | data[_offset + 0] << 24); \ - } while (0) - -#define RPC_GET_BE16(MESG, OFFSET, PTR) \ - do { \ - u8 *data = (u8 *)(MESG); \ - u64 _offset = (OFFSET); \ - *(u16 *)(PTR) = (data[_offset + 1] | data[_offset + 0] << 8); \ - } while (0) - -#define RPC_GET_U8(MESG, OFFSET, PTR) \ - do { \ - u8 *data = (u8 *)(MESG); \ - *(u8 *)(PTR) = (data[OFFSET]); \ - } while (0) - /* * Defines for SC PM Power Mode */ -- cgit v1.2.3 From f67866701d74c4362b7ea74e7015922ad338375b Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Mon, 19 Jan 2026 15:31:15 +0100 Subject: pmdomain: core: Extend statistics for domain idle states with s2idle data To allow user space to monitor the selection of the domain idle state during s2idle for a CPU PM domain, let's extend the debugfs support in genpd with this information. Suggested-by: Dhruva Gole Reviewed-by: Dhruva Gole Reviewed-by: Kevin Hilman Signed-off-by: Ulf Hansson --- include/linux/pm_domain.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 93ba0143ca47..f6f6d494f728 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -183,6 +183,7 @@ struct genpd_power_state { u64 rejected; u64 above; u64 below; + u64 usage_s2idle; struct fwnode_handle *fwnode; u64 idle_time; void *data; -- cgit v1.2.3 From 1877d3f258cbb57d64e275754fb9b18b089ce72d Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sun, 1 Feb 2026 12:48:59 +0200 Subject: PM: domains: De-constify fields in struct dev_pm_domain_attach_data It doesn't really make sense to keep u32 fields to be marked as const. Having the const fields prevents their modification in the driver. Instead the whole struct can be defined as const, if it is constant. Fixes: 161e16a5e50a ("PM: domains: Add helper functions to attach/detach multiple PM domains") Signed-off-by: Dmitry Baryshkov Signed-off-by: Ulf Hansson --- include/linux/pm_domain.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 93ba0143ca47..38d1814ab8a5 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -49,8 +49,8 @@ struct dev_pm_domain_attach_data { const char * const *pd_names; - const u32 num_pd_names; - const u32 pd_flags; + u32 num_pd_names; + u32 pd_flags; }; struct dev_pm_domain_list { -- cgit v1.2.3 From 9266b4da051a410d9e6c5c0b0ef0c877855aa1b8 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 31 Mar 2026 10:33:46 +0530 Subject: cpufreq: Allocate QoS freq_req objects with policy A recent change exposed a bug in the error path: if freq_qos_add_request(boost_freq_req) fails, min_freq_req may remain a valid pointer even though it was never successfully added. During policy teardown, this leads to an unconditional call to freq_qos_remove_request(), triggering a WARN. The current design allocates all three freq_req objects together, making the lifetime rules unclear and error handling fragile. Simplify this by allocating the QoS freq_req objects at policy allocation time. The policy itself is dynamically allocated, and two of the three requests are always needed anyway. This ensures consistent lifetime management and eliminates the inconsistent state in failure paths. Reported-by: Zhongqiu Han Fixes: 6e39ba4e5a82 ("cpufreq: Add boost_freq_req QoS request") Signed-off-by: Viresh Kumar Reviewed-by: Lifeng Zheng Tested-by: Pierre Gondois Reviewed-by: Zhongqiu Han Link: https://patch.msgid.link/a293f29d841b86c51f34699c6e717e01858d8ada.1774933424.git.viresh.kumar@linaro.org Signed-off-by: Rafael J. Wysocki --- include/linux/cpufreq.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index b6f6c7d06912..9b10eb486ece 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -79,9 +79,9 @@ struct cpufreq_policy { * called, but you're in IRQ context */ struct freq_constraints constraints; - struct freq_qos_request *min_freq_req; - struct freq_qos_request *max_freq_req; - struct freq_qos_request *boost_freq_req; + struct freq_qos_request min_freq_req; + struct freq_qos_request max_freq_req; + struct freq_qos_request boost_freq_req; struct cpufreq_frequency_table *freq_table; enum cpufreq_table_sorting freq_table_sorted; -- cgit v1.2.3 From 04bcbed4cd33495d05ba98857a748e416ab603b7 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Tue, 31 Mar 2026 14:19:46 -0700 Subject: powercap: intel_rapl: Move primitive info to header for interface drivers RAPL primitive information varies across different RAPL interfaces (MSR, TPMI, MMIO). Keeping them in the common code adds no benefit, but requires interface-specific handling logic and makes the common layer unnecessarily complex. Move the primitive info infrastructure to the shared header to allow interface drivers to configure RAPL primitives. Specific changes: 1. Move struct rapl_primitive_info, enum unit_type, and PRIMITIVE_INFO_INIT macro to intel_rapl.h. 2. Change the @rpi field in struct rapl_if_priv from void * to struct rapl_primitive_info * to improve type safety and eliminate unnecessary casts. No functional changes. This is a preparatory refactoring to allow interface drivers to supply their own RAPL primitive settings. Co-developed-by: Zhang Rui Signed-off-by: Zhang Rui Signed-off-by: Kuppuswamy Sathyanarayanan Link: https://patch.msgid.link/20260331211950.3329932-4-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- include/linux/intel_rapl.h | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index 9e6bd654be1f..01f290de3586 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -137,6 +137,34 @@ struct rapl_defaults { bool spr_psys_bits; }; +#define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) { \ + .name = #p, \ + .mask = m, \ + .shift = s, \ + .id = i, \ + .unit = u, \ + .flag = f \ + } + +enum unit_type { + ARBITRARY_UNIT, /* no translation */ + POWER_UNIT, + ENERGY_UNIT, + TIME_UNIT, +}; + +/* per domain data. used to describe individual knobs such that access function + * can be consolidated into one instead of many inline functions. + */ +struct rapl_primitive_info { + const char *name; + u64 mask; + int shift; + enum rapl_domain_reg_id id; + enum unit_type unit; + u32 flag; +}; + /** * struct rapl_if_priv: private data for different RAPL interfaces * @control_type: Each RAPL interface must have its own powercap @@ -152,7 +180,7 @@ struct rapl_defaults { * @write_raw: Callback for writing RAPL interface specific * registers. * @defaults: pointer to default settings - * @rpi: internal pointer to interface primitive info + * @rpi: pointer to interface primitive info */ struct rapl_if_priv { enum rapl_if_type type; @@ -164,7 +192,7 @@ struct rapl_if_priv { int (*read_raw)(int id, struct reg_action *ra, bool pmu_ctx); int (*write_raw)(int id, struct reg_action *ra); const struct rapl_defaults *defaults; - void *rpi; + struct rapl_primitive_info *rpi; }; #ifdef CONFIG_PERF_EVENTS -- cgit v1.2.3 From c3bb8d4f5d802ec1a16f018e82030bccb7a053a4 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Tue, 31 Mar 2026 14:19:50 -0700 Subject: powercap: intel_rapl: Consolidate PL4 and PMU support flags into rapl_defaults Currently, PL4 and MSR-based RAPL PMU support are detected using separate CPU ID tables (pl4_support_ids and pmu_support_ids) in the MSR driver probe path. This creates a maintenance burden since adding a new CPU requires updates in two places: the rapl_ids table and one or both of these capability tables. Consolidate PL4 and PMU capability information directly into struct rapl_defaults by adding msr_pl4_support and msr_pmu_support flags. This allows per-CPU capability to be expressed in a single place alongside other per-CPU defaults, eliminating the duplicate CPU ID tables entirely. No functional changes are intended. Co-developed-by: Zhang Rui Signed-off-by: Zhang Rui Acked-by: Srinivas Pandruvada Signed-off-by: Kuppuswamy Sathyanarayanan Link: https://patch.msgid.link/20260331211950.3329932-8-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- include/linux/intel_rapl.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index 01f290de3586..328004f605c3 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -135,6 +135,8 @@ struct rapl_defaults { unsigned int dram_domain_energy_unit; unsigned int psys_domain_energy_unit; bool spr_psys_bits; + bool msr_pl4_support; + bool msr_pmu_support; }; #define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) { \ -- cgit v1.2.3 From c2de5a5be4d60af5f928a2dd2b0f73e17358e346 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 30 Mar 2026 09:07:55 +0200 Subject: timens: Add a __free() wrapper for put_time_ns() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The wrapper will be used to simplify cleanups of 'struct time_namespace'. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260330-timens-cleanup-v1-1-936e91c9dd30@linutronix.de --- include/linux/time_namespace.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index c1de21a27c34..58bd9728df58 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -8,6 +8,7 @@ #include #include #include +#include struct user_namespace; extern struct user_namespace init_user_ns; @@ -171,4 +172,6 @@ static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma) } #endif /* CONFIG_TIME_NS_VDSO */ +DEFINE_FREE(time_ns, struct time_namespace *, if (_T) put_time_ns(_T)) + #endif /* _LINUX_TIMENS_H */ -- cgit v1.2.3 From c064abc68e009d2cc18416e7132d9c25e03125b6 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Sat, 7 Mar 2026 08:10:20 -0600 Subject: firmware: dmi: Correct an indexing error in dmi.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The entries later in enum dmi_entry_type don't match the SMBIOS specification¹. The entry for type 33: `64-Bit Memory Error Information` is not present and thus the index for all later entries is incorrect. Add it. Also, add missing entry types 43-46, while at it. ¹ Search for "System Management BIOS (SMBIOS) Reference Specification" [ bp: Drop the flaky SMBIOS spec URL. ] Fixes: 93c890dbe5287 ("firmware: Add DMI entry types to the headers") Signed-off-by: Mario Limonciello (AMD) Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Jean Delvare Reviewed-by: Yazen Ghannam Link: https://patch.msgid.link/20260307141024.819807-2-superm1@kernel.org --- include/linux/dmi.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmi.h b/include/linux/dmi.h index 927f8a8b7a1d..2eedf44e6801 100644 --- a/include/linux/dmi.h +++ b/include/linux/dmi.h @@ -60,6 +60,7 @@ enum dmi_entry_type { DMI_ENTRY_OOB_REMOTE_ACCESS, DMI_ENTRY_BIS_ENTRY, DMI_ENTRY_SYSTEM_BOOT, + DMI_ENTRY_64_MEM_ERROR, DMI_ENTRY_MGMT_DEV, DMI_ENTRY_MGMT_DEV_COMPONENT, DMI_ENTRY_MGMT_DEV_THRES, @@ -69,6 +70,10 @@ enum dmi_entry_type { DMI_ENTRY_ADDITIONAL, DMI_ENTRY_ONBOARD_DEV_EXT, DMI_ENTRY_MGMT_CONTROLLER_HOST, + DMI_ENTRY_TPM_DEVICE, + DMI_ENTRY_PROCESSOR_ADDITIONAL, + DMI_ENTRY_FIRMWARE_INVENTORY, + DMI_ENTRY_STRING_PROPERTY, DMI_ENTRY_INACTIVE = 126, DMI_ENTRY_END_OF_TABLE = 127, }; -- cgit v1.2.3 From bc91133e260c8113c1119073c03b93c12aa41738 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Sat, 7 Mar 2026 08:10:24 -0600 Subject: x86/CPU/AMD: Print AGESA string from DMI additional information entry Type 40 entries (Additional Information) are summarized in section 7.41 as part of the SMBIOS specification. Generally, these entries aren't interesting to save. However on some AMD Zen systems, the AGESA version is stored here. This is useful to save to the kernel message logs for debugging. It can be used to cross-reference issues. Implement an iterator for the Additional Information entries. Use this to find and print the AGESA string. Do so in AMD code, since the use case is AMD-specific. [ bp: Match only "AGESA". ] Signed-off-by: Yazen Ghannam Co-developed-by: "Mario Limonciello (AMD)" Signed-off-by: "Mario Limonciello (AMD)" Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Jean Delvare Link: https://patch.msgid.link/20260307141024.819807-6-superm1@kernel.org --- include/linux/dmi.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmi.h b/include/linux/dmi.h index 2eedf44e6801..c8700e6a694d 100644 --- a/include/linux/dmi.h +++ b/include/linux/dmi.h @@ -91,6 +91,21 @@ struct dmi_device { void *device_data; /* Type specific data */ }; +#define DMI_A_INFO_ENT_MIN_SIZE 0x6 +struct dmi_a_info_entry { + u8 length; + u16 handle; + u8 offset; + u8 str_num; + u8 value[]; +} __packed; + +#define DMI_A_INFO_MIN_SIZE 0xB +struct dmi_a_info { + struct dmi_header header; + u8 count; +} __packed; + #ifdef CONFIG_DMI struct dmi_dev_onboard { @@ -120,6 +135,7 @@ extern void dmi_memdev_name(u16 handle, const char **bank, const char **device); extern u64 dmi_memdev_size(u16 handle); extern u8 dmi_memdev_type(u16 handle); extern u16 dmi_memdev_handle(int slot); +const char *dmi_string_nosave(const struct dmi_header *dm, u8 s); #else @@ -153,6 +169,8 @@ static inline u8 dmi_memdev_type(u16 handle) { return 0x0; } static inline u16 dmi_memdev_handle(int slot) { return 0xffff; } static inline const struct dmi_system_id * dmi_first_match(const struct dmi_system_id *list) { return NULL; } +static inline const char * + dmi_string_nosave(const struct dmi_header *dm, u8 s) { return ""; } #endif -- cgit v1.2.3 From 9bf092c97b86af63694d9902b9e14047214ba76d Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Wed, 1 Apr 2026 17:20:41 +0200 Subject: sched: update task_struct->comm comment Since commit 3a3f61ce5e0b ("exec: Make sure task->comm is always NUL-terminated"), __set_task_comm() is unlocked and no longer uses strscpy_pad() - update the stale comment accordingly. Signed-off-by: Thorsten Blum Link: https://patch.msgid.link/20260401152039.724811-4-thorsten.blum@linux.dev Signed-off-by: Kees Cook --- include/linux/sched.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 074ad4ef3d81..43dfbcf1babe 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1158,12 +1158,9 @@ struct task_struct { /* * executable name, excluding path. * - * - normally initialized begin_new_exec() - * - set it with set_task_comm() - * - strscpy_pad() to ensure it is always NUL-terminated and - * zero-padded - * - task_lock() to ensure the operation is atomic and the name is - * fully updated. + * - normally initialized by begin_new_exec() + * - set it with set_task_comm() to ensure it is always + * NUL-terminated and zero-padded */ char comm[TASK_COMM_LEN]; -- cgit v1.2.3 From 9dc42c9070282c81058a875fea5acae057610980 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 1 Apr 2026 06:03:52 -0700 Subject: workqueue: fix typo in WQ_AFFN_SMT comment Fix "poer" -> "per" in the WQ_AFFN_SMT enum comment. Signed-off-by: Breno Leitao Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 9f971912c6be..75634a09576a 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -131,7 +131,7 @@ struct rcu_work { enum wq_affn_scope { WQ_AFFN_DFL, /* use system default */ WQ_AFFN_CPU, /* one pod per CPU */ - WQ_AFFN_SMT, /* one pod poer SMT */ + WQ_AFFN_SMT, /* one pod per SMT */ WQ_AFFN_CACHE, /* one pod per LLC */ WQ_AFFN_NUMA, /* one pod per NUMA node */ WQ_AFFN_SYSTEM, /* one pod across the whole system */ -- cgit v1.2.3 From 5920d046f7ae3bf9cf51b9d915c1fff13d299d84 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 1 Apr 2026 06:03:53 -0700 Subject: workqueue: add WQ_AFFN_CACHE_SHARD affinity scope On systems where many CPUs share one LLC, unbound workqueues using WQ_AFFN_CACHE collapse to a single worker pool, causing heavy spinlock contention on pool->lock. For example, Chuck Lever measured 39% of cycles lost to native_queued_spin_lock_slowpath on a 12-core shared-L3 NFS-over-RDMA system. The existing affinity hierarchy (cpu, smt, cache, numa, system) offers no intermediate option between per-LLC and per-SMT-core granularity. Add WQ_AFFN_CACHE_SHARD, which subdivides each LLC into groups of at most wq_cache_shard_size cores (default 8, tunable via boot parameter). Shards are always split on core (SMT group) boundaries so that Hyper-Threading siblings are never placed in different pods. Cores are distributed across shards as evenly as possible -- for example, 36 cores in a single LLC with max shard size 8 produces 5 shards of 8+7+7+7+7 cores. The implementation follows the same comparator pattern as other affinity scopes: precompute_cache_shard_ids() pre-fills the cpu_shard_id[] array from the already-initialized WQ_AFFN_CACHE and WQ_AFFN_SMT topology, and cpus_share_cache_shard() is passed to init_pod_type(). Benchmark on NVIDIA Grace (72 CPUs, single LLC, 50k items/thread), show cache_shard delivers ~5x the throughput and ~6.5x lower p50 latency compared to cache scope on this 72-core single-LLC system. Suggested-by: Tejun Heo Signed-off-by: Breno Leitao Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 75634a09576a..ab6cb70ca1a5 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -133,6 +133,7 @@ enum wq_affn_scope { WQ_AFFN_CPU, /* one pod per CPU */ WQ_AFFN_SMT, /* one pod per SMT */ WQ_AFFN_CACHE, /* one pod per LLC */ + WQ_AFFN_CACHE_SHARD, /* synthetic sub-LLC shards */ WQ_AFFN_NUMA, /* one pod per NUMA node */ WQ_AFFN_SYSTEM, /* one pod across the whole system */ -- cgit v1.2.3 From d57e74f10461b80c77d1678f646720f616fb8553 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sun, 1 Mar 2026 20:11:55 -0500 Subject: bitmap: introduce bitmap_weighted_xor() The function helps to XOR bitmaps and calculate Hamming weight of the result in one pass. Reviewed-by: Aleksandr Loktionov Reviewed-by: Jacob Keller Signed-off-by: Yury Norov --- include/linux/bitmap.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 9c0d1de44350..b007d54a9036 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -46,6 +46,7 @@ struct device; * bitmap_and(dst, src1, src2, nbits) *dst = *src1 & *src2 * bitmap_or(dst, src1, src2, nbits) *dst = *src1 | *src2 * bitmap_weighted_or(dst, src1, src2, nbits) *dst = *src1 | *src2. Returns Hamming Weight of dst + * bitmap_weighted_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2. Returns Hamming Weight of dst * bitmap_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2 * bitmap_andnot(dst, src1, src2, nbits) *dst = *src1 & ~(*src2) * bitmap_complement(dst, src, nbits) *dst = ~(*src) @@ -169,6 +170,8 @@ void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); unsigned int __bitmap_weighted_or(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); +unsigned int __bitmap_weighted_xor(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int nbits); void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, @@ -353,6 +356,18 @@ unsigned int bitmap_weighted_or(unsigned long *dst, const unsigned long *src1, } } +static __always_inline +unsigned int bitmap_weighted_xor(unsigned long *dst, const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) +{ + if (small_const_nbits(nbits)) { + *dst = *src1 ^ *src2; + return hweight_long(*dst & BITMAP_LAST_WORD_MASK(nbits)); + } else { + return __bitmap_weighted_xor(dst, src1, src2, nbits); + } +} + static __always_inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) -- cgit v1.2.3 From f0548044a02630402d374df195ed3af4cc5e4711 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 25 Mar 2026 20:23:51 +0100 Subject: dma-mapping: introduce DMA_ATTR_CC_SHARED for shared memory Current CC designs don't place a vIOMMU in front of untrusted devices. Instead, the DMA API forces all untrusted device DMA through swiotlb bounce buffers (is_swiotlb_force_bounce()) which copies data into shared memory on behalf of the device. When a caller has already arranged for the memory to be shared via set_memory_decrypted(), the DMA API needs to know so it can map directly using the unencrypted physical address rather than bounce buffering. Following the pattern of DMA_ATTR_MMIO, add DMA_ATTR_CC_SHARED for this purpose. Like the MMIO case, only the caller knows what kind of memory it has and must inform the DMA API for it to work correctly. Signed-off-by: Jiri Pirko Reviewed-by: Jason Gunthorpe Acked-by: Sumit Semwal Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260325192352.437608-2-jiri@resnulli.us --- include/linux/dma-mapping.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 677c51ab7510..db8ab24a54f4 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -92,6 +92,16 @@ * flushing. */ #define DMA_ATTR_REQUIRE_COHERENT (1UL << 12) +/* + * DMA_ATTR_CC_SHARED: Indicates the DMA mapping is shared (decrypted) for + * confidential computing guests. For normal system memory the caller must have + * called set_memory_decrypted(), and pgprot_decrypted must be used when + * creating CPU PTEs for the mapping. The same shared semantic may be passed + * to the vIOMMU when it sets up the IOPTE. For MMIO use together with + * DMA_ATTR_MMIO to indicate shared MMIO. Unless DMA_ATTR_MMIO is provided + * a struct page is required. + */ +#define DMA_ATTR_CC_SHARED (1UL << 13) /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can -- cgit v1.2.3 From 3c7df5079cfc6133d01ae144ae76a980276cc726 Mon Sep 17 00:00:00 2001 From: Amit Sunil Dhamne Date: Wed, 1 Apr 2026 21:02:08 +0000 Subject: mfd: max77759: fix comment style for enums Fix comment style for enums so they're kernel-doc compliant. Signed-off-by: Amit Sunil Dhamne Link: https://patch.msgid.link/20260401-fix-mfd-max77759-usb-next-v1-1-174ec23ad824@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/mfd/max77759.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/max77759.h b/include/linux/mfd/max77759.h index ad1aa4c2b779..ec19be952877 100644 --- a/include/linux/mfd/max77759.h +++ b/include/linux/mfd/max77759.h @@ -131,12 +131,12 @@ #define MAX77759_MAXQ_OPCODE_USER_SPACE_READ 0x81 #define MAX77759_MAXQ_OPCODE_USER_SPACE_WRITE 0x82 -/* +/** * enum max77759_chgr_chgin_dtls_status - Charger Input Status * @MAX77759_CHGR_CHGIN_DTLS_VBUS_UNDERVOLTAGE: * Charger input voltage (Vchgin) < Under Voltage Threshold (Vuvlo) - * @MAX77759_CHGR_CHGIN_DTLS_VBUS_MARGINAL_VOLTAGE: Vchgin > Vuvlo and - * Vchgin < (Battery Voltage (Vbatt) + system voltage (Vsys)) + * @MAX77759_CHGR_CHGIN_DTLS_VBUS_MARGINAL_VOLTAGE: + * Vchgin > Vuvlo and Vchgin < (Battery Voltage (Vbatt) + system voltage (Vsys)) * @MAX77759_CHGR_CHGIN_DTLS_VBUS_OVERVOLTAGE: * Vchgin > Over Voltage threshold (Vovlo) * @MAX77759_CHGR_CHGIN_DTLS_VBUS_VALID: @@ -149,7 +149,7 @@ enum max77759_chgr_chgin_dtls_status { MAX77759_CHGR_CHGIN_DTLS_VBUS_VALID, }; -/* +/** * enum max77759_chgr_bat_dtls_states - Battery Details * @MAX77759_CHGR_BAT_DTLS_NO_BATT_CHG_SUSP: No battery and the charger suspended * @MAX77759_CHGR_BAT_DTLS_DEAD_BATTERY: Vbatt < Vtrickle @@ -171,7 +171,7 @@ enum max77759_chgr_bat_dtls_states { MAX77759_CHGR_BAT_DTLS_BAT_ONLY_MODE, }; -/* +/** * enum max77759_chgr_chg_dtls_states - Charger Details * @MAX77759_CHGR_CHG_DTLS_PREQUAL: Charger in prequalification mode * @MAX77759_CHGR_CHG_DTLS_CC: Charger in fast charge const curr mode -- cgit v1.2.3 From 7b7f2dd913829e06705035dfc41ca25fa6ec68d3 Mon Sep 17 00:00:00 2001 From: Pawel Laszczak Date: Tue, 31 Mar 2026 10:19:11 +0200 Subject: usb: cdnsp: Add support for device-only configuration This patch introduces support for operating the Cadence USBSSP (cdnsp) controller in a peripheral-only mode, bypassing the Dual-Role Device (DRD) logic. The change in BAR indexing (from BAR 2 to BAR 1) is a direct consequence of switching from 64-bit to 32-bit addressing in the Peripheral-only configuration. Tested on PCI platform with Device-only configuration. Platform-side changes are included to support the PCI glue layer's property injection. Signed-off-by: Pawel Laszczak Acked-by: Bjorn Helgaas # pci_ids.h Link: https://patch.msgid.link/20260331-device_only-v1-1-00378b80365c@cadence.com Signed-off-by: Greg Kroah-Hartman --- include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 406abf629be2..a931fb201402 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2424,6 +2424,7 @@ #define PCI_DEVICE_ID_CDNS_USBSS 0x0100 #define PCI_DEVICE_ID_CDNS_USB 0x0120 #define PCI_DEVICE_ID_CDNS_USBSSP 0x0200 +#define PCI_DEVICE_ID_CDNS_UDC_USBSSP 0x0400 #define PCI_VENDOR_ID_ARECA 0x17d3 #define PCI_DEVICE_ID_ARECA_1110 0x1110 -- cgit v1.2.3 From bd3d245b0fef571f93504904df62b8865b1c0d34 Mon Sep 17 00:00:00 2001 From: Guan-Yu Lin Date: Wed, 1 Apr 2026 12:32:17 +0000 Subject: usb: core: use dedicated spinlock for offload state Replace the coarse USB device lock with a dedicated offload_lock spinlock to reduce contention during offload operations. Use offload_pm_locked to synchronize with PM transitions and replace the legacy offload_at_suspend flag. Optimize usb_offload_get/put by switching from auto-resume/suspend to pm_runtime_get_if_active(). This ensures offload state is only modified when the device is already active, avoiding unnecessary power transitions. Cc: stable Fixes: ef82a4803aab ("xhci: sideband: add api to trace sideband usage") Signed-off-by: Guan-Yu Lin Tested-by: Hailong Liu Acked-by: Mathias Nyman Link: https://patch.msgid.link/20260401123238.3790062-2-guanyulin@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 04277af4bb9d..4aab20015851 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -21,6 +21,7 @@ #include /* for struct completion */ #include /* for current && schedule_timeout */ #include /* for struct mutex */ +#include /* for spinlock_t */ #include /* for runtime PM */ struct usb_device; @@ -636,8 +637,9 @@ struct usb3_lpm_parameters { * @do_remote_wakeup: remote wakeup should be enabled * @reset_resume: needs reset instead of resume * @port_is_suspended: the upstream port is suspended (L2 or U3) - * @offload_at_suspend: offload activities during suspend is enabled. + * @offload_pm_locked: prevents offload_usage changes during PM transitions. * @offload_usage: number of offload activities happening on this usb device. + * @offload_lock: protects offload_usage and offload_pm_locked * @slot_id: Slot ID assigned by xHCI * @l1_params: best effor service latency for USB2 L1 LPM state, and L1 timeout. * @u1_params: exit latencies for USB3 U1 LPM state, and hub-initiated timeout. @@ -726,8 +728,9 @@ struct usb_device { unsigned do_remote_wakeup:1; unsigned reset_resume:1; unsigned port_is_suspended:1; - unsigned offload_at_suspend:1; + unsigned offload_pm_locked:1; int offload_usage; + spinlock_t offload_lock; enum usb_link_tunnel_mode tunnel_mode; struct device_link *usb4_link; @@ -849,6 +852,7 @@ static inline void usb_mark_last_busy(struct usb_device *udev) int usb_offload_get(struct usb_device *udev); int usb_offload_put(struct usb_device *udev); bool usb_offload_check(struct usb_device *udev); +void usb_offload_set_pm_locked(struct usb_device *udev, bool locked); #else static inline int usb_offload_get(struct usb_device *udev) @@ -857,6 +861,8 @@ static inline int usb_offload_put(struct usb_device *udev) { return 0; } static inline bool usb_offload_check(struct usb_device *udev) { return false; } +static inline void usb_offload_set_pm_locked(struct usb_device *udev, bool locked) +{ } #endif extern int usb_disable_lpm(struct usb_device *udev); -- cgit v1.2.3 From 408d8af01f3a4d666620029a85e741906ff96f47 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 24 Jan 2026 17:58:48 -0500 Subject: for_each_alias(): helper macro for iterating through dentries of given inode Most of the places using d_alias are loops iterating through all aliases for given inode; introduce a helper macro (for_each_alias(dentry, inode)) and convert open-coded instances of such loop to it. They are easier to read that way and it reduces the noise on the next steps. You _must_ hold inode->i_lock over that thing. Signed-off-by: Al Viro --- include/linux/dcache.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 898c60d21c92..7f1dbc7121d7 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -615,4 +615,8 @@ void set_default_d_op(struct super_block *, const struct dentry_operations *); struct dentry *d_make_persistent(struct dentry *, struct inode *); void d_make_discardable(struct dentry *dentry); +/* inode->i_lock must be held over that */ +#define for_each_alias(dentry, inode) \ + hlist_for_each_entry(dentry, &(inode)->i_dentry, d_u.d_alias) + #endif /* __LINUX_DCACHE_H */ -- cgit v1.2.3 From 2420067cecacb1d1bf6dc39294d0c9f04066ff98 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 27 Jan 2026 22:51:37 -0500 Subject: struct dentry: make ->d_u anonymous Making ->d_rcu and (then) ->d_child overlapping dates back to 2006; anon unions support had been added to gcc only in 4.6 (2011) and the minimal gcc version hadn't been bumped to that until 4.19 (2018). These days there's no reason not to keep that union named. Signed-off-by: Al Viro --- include/linux/dcache.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 7f1dbc7121d7..f939d2ed10a3 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -128,7 +128,7 @@ struct dentry { struct hlist_node d_alias; /* inode alias list */ struct hlist_bl_node d_in_lookup_hash; /* only for in-lookup ones */ struct rcu_head d_rcu; - } d_u; + }; }; /* @@ -617,6 +617,6 @@ void d_make_discardable(struct dentry *dentry); /* inode->i_lock must be held over that */ #define for_each_alias(dentry, inode) \ - hlist_for_each_entry(dentry, &(inode)->i_dentry, d_u.d_alias) + hlist_for_each_entry(dentry, &(inode)->i_dentry, d_alias) #endif /* __LINUX_DCACHE_H */ -- cgit v1.2.3 From 241cb8dee0f83856c728f4fe2c29e331386c92f2 Mon Sep 17 00:00:00 2001 From: Ian Abbott Date: Fri, 30 Jan 2026 16:47:26 +0000 Subject: comedi: add comedi_check_request_region() There is an existing comedi_request_region(dev, start, len) function used by COMEDI drivers for legacy devices to request an I/O port region starting at a specified base address (which must be non-zero) and with a specified length. It uses request_region(). On success, it sets dev->iobase and dev->iolen and returns 0. There is a alternative function __comedi_request_region(dev, start, len) which does the same thing without setting dev->iobase and dev->iolen. Most hardware devices have restrictions on the allowed I/O port base address and alignment, so add new functions comedi_check_request_region(dev, start, len, minstart, maxend, minalign) and __comedi_check_request_region(dev, start, len, minstart, maxend, minalign) to perform these additional checks. Turn the original functions into static inline wrapper functions that call the new functions. Signed-off-by: Ian Abbott Link: https://patch.msgid.link/20260130170416.49994-2-abbotti@mev.co.uk Signed-off-by: Greg Kroah-Hartman --- include/linux/comedi/comedidev.h | 53 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/comedi/comedidev.h b/include/linux/comedi/comedidev.h index 35fdc41845ce..577a08f37aee 100644 --- a/include/linux/comedi/comedidev.h +++ b/include/linux/comedi/comedidev.h @@ -1026,10 +1026,55 @@ int comedi_load_firmware(struct comedi_device *dev, struct device *hw_dev, unsigned long context), unsigned long context); -int __comedi_request_region(struct comedi_device *dev, - unsigned long start, unsigned long len); -int comedi_request_region(struct comedi_device *dev, - unsigned long start, unsigned long len); +int __comedi_check_request_region(struct comedi_device *dev, + unsigned long start, unsigned long len, + unsigned long minstart, unsigned long maxend, + unsigned long minalign); +int comedi_check_request_region(struct comedi_device *dev, + unsigned long start, unsigned long len, + unsigned long minstart, unsigned long maxend, + unsigned long minalign); + +/** + * __comedi_request_region() - Request an I/O region for a legacy driver + * @dev: COMEDI device. + * @start: Base address of the I/O region. + * @len: Length of the I/O region. + * + * Requests the specified I/O port region which must start at a non-zero + * address. + * + * Returns 0 on success, -EINVAL if @start is 0, or -EIO if the request + * fails. + */ +static inline int __comedi_request_region(struct comedi_device *dev, + unsigned long start, + unsigned long len) +{ + return __comedi_check_request_region(dev, start, len, 0, ~0ul, 1); +} + +/** + * comedi_request_region() - Request an I/O region for a legacy driver + * @dev: COMEDI device. + * @start: Base address of the I/O region. + * @len: Length of the I/O region. + * + * Requests the specified I/O port region which must start at a non-zero + * address. + * + * On success, @dev->iobase is set to the base address of the region and + * @dev->iolen is set to its length. + * + * Returns 0 on success, -EINVAL if @start is 0, or -EIO if the request + * fails. + */ +static inline int comedi_request_region(struct comedi_device *dev, + unsigned long start, unsigned long len) +{ + return comedi_check_request_region(dev, start, len, 0, ~0ul, 1); +} + void comedi_legacy_detach(struct comedi_device *dev); int comedi_auto_config(struct device *hardware_device, -- cgit v1.2.3 From 6c561848ee5246f083770aaf39b2666f940d60dd Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Wed, 11 Mar 2026 16:24:59 -0700 Subject: comedi: isadma: use kzalloc_flex Switched struct pointer member to a flexible array member to get rid of kzalloc_objs as there's no need for them to be separately allocated. AAdded __counted_by for extra runtime analysis. Signed-off-by: Rosen Penev Link: https://patch.msgid.link/20260311232459.18407-1-rosenp@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/comedi/comedi_isadma.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/comedi/comedi_isadma.h b/include/linux/comedi/comedi_isadma.h index 9d2b12db7e6e..7514ce222fa6 100644 --- a/include/linux/comedi/comedi_isadma.h +++ b/include/linux/comedi/comedi_isadma.h @@ -48,11 +48,11 @@ struct comedi_isadma_desc { */ struct comedi_isadma { struct device *dev; - struct comedi_isadma_desc *desc; int n_desc; int cur_dma; unsigned int chan; unsigned int chan2; + struct comedi_isadma_desc desc[] __counted_by(n_desc); }; #if IS_ENABLED(CONFIG_ISA_DMA_API) -- cgit v1.2.3 From 25e531b422dc2ac90cdae3b6e74b5cdeb081440d Mon Sep 17 00:00:00 2001 From: Michal Pecio Date: Thu, 2 Apr 2026 16:13:42 +0300 Subject: usb: xhci: Make usb_host_endpoint.hcpriv survive endpoint_disable() xHCI hardware maintains its endpoint state between add_endpoint() and drop_endpoint() calls followed by successful check_bandwidth(). So does the driver. Core may call endpoint_disable() during xHCI endpoint life, so don't clear host_ep->hcpriv then, because this breaks endpoint_reset(). If a driver calls usb_set_interface(), submits URBs which make host sequence state non-zero and calls usb_clear_halt(), the device clears its sequence state but xhci_endpoint_reset() bails out. The next URB malfunctions: USB2 loses one packet, USB3 gets Transaction Error or may not complete at all on some (buggy?) HCs from ASMedia and AMD. This is triggered by uvcvideo on bulk video devices. The code was copied from ehci_endpoint_disable() but it isn't needed here - hcpriv should only be NULL on emulated root hub endpoints. It might prevent resetting and inadvertently enabling a disabled and dropped endpoint, but core shouldn't try to reset dropped endpoints. Document xhci requirements regarding hcpriv. They are currently met. Fixes: 18b74067ac78 ("xhci: Fix use-after-free regression in xhci clear hub TT implementation") Cc: stable@vger.kernel.org Signed-off-by: Michal Pecio Signed-off-by: Mathias Nyman Link: https://patch.msgid.link/20260402131342.2628648-26-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 815f2212936e..779bbfdfa0c7 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -54,7 +54,8 @@ struct ep_device; * @eusb2_isoc_ep_comp: eUSB2 isoc companion descriptor for this endpoint * @urb_list: urbs queued to this endpoint; maintained by usbcore * @hcpriv: for use by HCD; typically holds hardware dma queue head (QH) - * with one or more transfer descriptors (TDs) per urb + * with one or more transfer descriptors (TDs) per urb; must be preserved + * by core while BW is allocated for the endpoint * @ep_dev: ep_device for sysfs info * @extra: descriptors following this endpoint in the configuration * @extralen: how many bytes of "extra" are valid -- cgit v1.2.3 From 2d7ce8eb59ec880774c7500ac949f0100acba521 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 25 Feb 2026 21:12:07 -0800 Subject: misc: apds990x: fix all kernel-doc warnings Move a #define so that it is not between kernel-doc and its struct declaration. Spell one struct member correctly. Warning: include/linux/platform_data/apds990x.h:33 #define APDS_PARAM_SCALE 4096; error: Cannot parse struct or union! Warning: include/linux/platform_data/apds990x.h:62 struct member 'pdrive' not described in 'apds990x_platform_data' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260226051207.547152-1-rdunlap@infradead.org Signed-off-by: Greg Kroah-Hartman --- include/linux/platform_data/apds990x.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/apds990x.h b/include/linux/platform_data/apds990x.h index 94dfbaa365e1..37684f68c04f 100644 --- a/include/linux/platform_data/apds990x.h +++ b/include/linux/platform_data/apds990x.h @@ -31,7 +31,6 @@ * itself. If the GA is zero, driver will use uncovered sensor default values * format: decimal value * APDS_PARAM_SCALE except df which is plain integer. */ -#define APDS_PARAM_SCALE 4096 struct apds990x_chip_factors { int ga; int cf1; @@ -40,11 +39,12 @@ struct apds990x_chip_factors { int irf2; int df; }; +#define APDS_PARAM_SCALE 4096 /** * struct apds990x_platform_data - platform data for apsd990x.c driver * @cf: chip factor data - * @pddrive: IR-led driving current + * @pdrive: IR-led driving current * @ppcount: number of IR pulses used for proximity estimation * @setup_resources: interrupt line setup call back function * @release_resources: interrupt line release call back function -- cgit v1.2.3 From c03791085adcd61fa9b766ab303c7d0941d7378d Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Mon, 16 Mar 2026 08:18:49 +0000 Subject: cpufreq: Pass the policy to cpufreq_driver->adjust_perf() cpufreq_cpu_get() can sleep on PREEMPT_RT in presence of concurrent writer(s), however amd-pstate depends on fetching the cpudata via the policy's driver data which necessitates grabbing the reference. Since schedutil governor can call "cpufreq_driver->update_perf()" during sched_tick/enqueue/dequeue with rq_lock held and IRQs disabled, fetching the policy object using the cpufreq_cpu_get() helper in the scheduler fast-path leads to "BUG: scheduling while atomic" on PREEMPT_RT [1]. Pass the cached cpufreq policy object in sg_policy to the update_perf() instead of just the CPU. The CPU can be inferred using "policy->cpu". The lifetime of cpufreq_policy object outlasts that of the governor and the cpufreq driver (allocated when the CPU is onlined and only reclaimed when the CPU is offlined / the CPU device is removed) which makes it safe to be referenced throughout the governor's lifetime. Closes:https://lore.kernel.org/all/20250731092316.3191-1-spasswolf@web.de/ [1] Fixes: 1d215f0319c2 ("cpufreq: amd-pstate: Add fast switch function for AMD P-State") Reported-by: Bert Karwatzki Acked-by: Viresh Kumar Signed-off-by: K Prateek Nayak Acked-by: Gary Guo # Rust Reviewed-by: Gautham R. Shenoy Reviewed-by: Zhongqiu Han Link: https://lore.kernel.org/r/20260316081849.19368-3-kprateek.nayak@amd.com Signed-off-by: Mario Limonciello (AMD) --- include/linux/cpufreq.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index cc894fc38971..4317c5a312bd 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -372,7 +372,7 @@ struct cpufreq_driver { * conditions) scale invariance can be disabled, which causes the * schedutil governor to fall back to the latter. */ - void (*adjust_perf)(unsigned int cpu, + void (*adjust_perf)(struct cpufreq_policy *policy, unsigned long min_perf, unsigned long target_perf, unsigned long capacity); @@ -617,7 +617,7 @@ struct cpufreq_governor { /* Pass a target to the cpufreq driver */ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq); -void cpufreq_driver_adjust_perf(unsigned int cpu, +void cpufreq_driver_adjust_perf(struct cpufreq_policy *policy, unsigned long min_perf, unsigned long target_perf, unsigned long capacity); -- cgit v1.2.3 From e417ac73d24ae68b8dd6a1b02f9db03a7a5c184b Mon Sep 17 00:00:00 2001 From: Nicolai Buchwitz Date: Wed, 1 Apr 2026 14:38:44 +0200 Subject: net: phy: microchip: add downshift tunable support for LAN88xx Implement the standard ETHTOOL_PHY_DOWNSHIFT tunable for the LAN88xx PHY. This allows runtime configuration of the auto-downshift feature via ethtool: ethtool --set-phy-tunable eth0 downshift on count 3 The LAN88xx PHY supports downshifting from 1000BASE-T to 100BASE-TX after 2-5 failed auto-negotiation attempts. Valid count values are 2, 3, 4 and 5. This is based on an earlier downstream implementation by Phil Elwell. Signed-off-by: Nicolai Buchwitz Reviewed-by: Andrew Lunn Reviewed-by: Russell King (Oracle) Link: https://patch.msgid.link/20260401123848.696766-2-nb@tipi-net.de Signed-off-by: Jakub Kicinski --- include/linux/microchipphy.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/microchipphy.h b/include/linux/microchipphy.h index 517288da19fd..7da956c666a0 100644 --- a/include/linux/microchipphy.h +++ b/include/linux/microchipphy.h @@ -61,6 +61,11 @@ /* Registers specific to the LAN7800/LAN7850 embedded phy */ #define LAN78XX_PHY_LED_MODE_SELECT (0x1D) +/* PHY Control 3 register (page 1) */ +#define LAN78XX_PHY_CTRL3 (0x14) +#define LAN78XX_PHY_CTRL3_AUTO_DOWNSHIFT BIT(4) +#define LAN78XX_PHY_CTRL3_DOWNSHIFT_CTRL_MASK GENMASK(3, 2) + /* DSP registers */ #define PHY_ARDENNES_MMD_DEV_3_PHY_CFG (0x806A) #define PHY_ARDENNES_MMD_DEV_3_PHY_CFG_ZD_DLY_EN_ (0x2000) -- cgit v1.2.3 From ec1d77cb0ee98249142dcd0376d76e7a48ba0b31 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Thu, 2 Apr 2026 17:09:15 +0200 Subject: bpf: Use bpf_verifier_env buffers for reg_set_min_max In a subsequent patch, the regs_refine_cond_op and reg_bounds_sync functions will be called in is_branch_taken instead of reg_set_min_max, to simulate each branch's outcome. Since they will run before we branch out, these two functions will need to work on temporary registers for the two branches. This refactoring patch prepares for that change, by introducing the temporary registers on bpf_verifier_env and using them in reg_set_min_max. This change also allows us to save one fake_reg slot as we don't need to allocate an additional temporary buffer in case of a BPF_K condition. Finally, you may notice that this patch removes the check for "false_reg1 == false_reg2" in reg_set_min_max. That check was introduced in commit d43ad9da8052 ("bpf: Skip bounds adjustment for conditional jumps on same scalar register") to avoid an invariant violation. Given that "env->false_reg1 == env->false_reg2" doesn't make sense and invariant violations are addressed in a subsequent commit, this patch just removes the check. Suggested-by: Eduard Zingerman Co-developed-by: Harishankar Vishwanathan Signed-off-by: Harishankar Vishwanathan Signed-off-by: Paul Chaignon Acked-by: Shung-Hsi Yu Acked-by: Mykyta Yatsenko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/260b0270052944a420e1c56e6a92df4d43cadf03.1775142354.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 090aa26d1c98..b129e0aaee20 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -837,7 +837,9 @@ struct bpf_verifier_env { u64 scratched_stack_slots; u64 prev_log_pos, prev_insn_print_pos; /* buffer used to temporary hold constants as scalar registers */ - struct bpf_reg_state fake_reg[2]; + struct bpf_reg_state fake_reg[1]; + /* buffers used to save updated reg states while simulating branches */ + struct bpf_reg_state true_reg1, true_reg2, false_reg1, false_reg2; /* buffer used to generate temporary string representations, * e.g., in reg_type_str() to generate reg_type string */ -- cgit v1.2.3 From 54e20be48fd4bc1df5f6fbca552b5be8c47dbd18 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Mar 2026 07:16:40 +0100 Subject: xor: split xor.h Keep xor.h for the public API, and split the struct xor_block_template definition that is only needed by the xor.ko core and architecture-specific optimizations into a separate xor_impl.h header. Link: https://lkml.kernel.org/r/20260327061704.3707577-9-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Tested-by: Eric Biggers Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Andreas Larsson Cc: Anton Ivanov Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chris Mason Cc: Christian Borntraeger Cc: Dan Williams Cc: David S. Miller Cc: David Sterba Cc: Heiko Carstens Cc: Herbert Xu Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jason A. Donenfeld Cc: Johannes Berg Cc: Li Nan Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Richard Henderson Cc: Richard Weinberger Cc: Russell King Cc: Song Liu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/raid/xor.h | 22 +--------------------- include/linux/raid/xor_impl.h | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 21 deletions(-) create mode 100644 include/linux/raid/xor_impl.h (limited to 'include/linux') diff --git a/include/linux/raid/xor.h b/include/linux/raid/xor.h index 51b811b62322..02bda8d99534 100644 --- a/include/linux/raid/xor.h +++ b/include/linux/raid/xor.h @@ -7,24 +7,4 @@ extern void xor_blocks(unsigned int count, unsigned int bytes, void *dest, void **srcs); -struct xor_block_template { - struct xor_block_template *next; - const char *name; - int speed; - void (*do_2)(unsigned long, unsigned long * __restrict, - const unsigned long * __restrict); - void (*do_3)(unsigned long, unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict); - void (*do_4)(unsigned long, unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict); - void (*do_5)(unsigned long, unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict); -}; - -#endif +#endif /* _XOR_H */ diff --git a/include/linux/raid/xor_impl.h b/include/linux/raid/xor_impl.h new file mode 100644 index 000000000000..a1890cd66812 --- /dev/null +++ b/include/linux/raid/xor_impl.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _XOR_IMPL_H +#define _XOR_IMPL_H + +struct xor_block_template { + struct xor_block_template *next; + const char *name; + int speed; + void (*do_2)(unsigned long, unsigned long * __restrict, + const unsigned long * __restrict); + void (*do_3)(unsigned long, unsigned long * __restrict, + const unsigned long * __restrict, + const unsigned long * __restrict); + void (*do_4)(unsigned long, unsigned long * __restrict, + const unsigned long * __restrict, + const unsigned long * __restrict, + const unsigned long * __restrict); + void (*do_5)(unsigned long, unsigned long * __restrict, + const unsigned long * __restrict, + const unsigned long * __restrict, + const unsigned long * __restrict, + const unsigned long * __restrict); +}; + +#endif /* _XOR_IMPL_H */ -- cgit v1.2.3 From 35ebc4de105989034f1250e40eb6dbf5e136b04e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Mar 2026 07:16:41 +0100 Subject: xor: remove macro abuse for XOR implementation registrations Drop the pretty confusing historic XOR_TRY_TEMPLATES and XOR_SELECT_TEMPLATE, and instead let the architectures provide a arch_xor_init that calls either xor_register to register candidates or xor_force to force a specific implementation. Link: https://lkml.kernel.org/r/20260327061704.3707577-10-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Tested-by: Eric Biggers Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Andreas Larsson Cc: Anton Ivanov Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chris Mason Cc: Christian Borntraeger Cc: Dan Williams Cc: David S. Miller Cc: David Sterba Cc: Heiko Carstens Cc: Herbert Xu Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jason A. Donenfeld Cc: Johannes Berg Cc: Li Nan Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Richard Henderson Cc: Richard Weinberger Cc: Russell King Cc: Song Liu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/raid/xor_impl.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/raid/xor_impl.h b/include/linux/raid/xor_impl.h index a1890cd66812..6ed4c445ab24 100644 --- a/include/linux/raid/xor_impl.h +++ b/include/linux/raid/xor_impl.h @@ -2,6 +2,8 @@ #ifndef _XOR_IMPL_H #define _XOR_IMPL_H +#include + struct xor_block_template { struct xor_block_template *next; const char *name; @@ -22,4 +24,7 @@ struct xor_block_template { const unsigned long * __restrict); }; +void __init xor_register(struct xor_block_template *tmpl); +void __init xor_force(struct xor_block_template *tmpl); + #endif /* _XOR_IMPL_H */ -- cgit v1.2.3 From e20043b4765cdf7ec8e963d706bb91469cba8cb8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Mar 2026 07:16:53 +0100 Subject: xor: make xor.ko self-contained in lib/raid/ Move the asm/xor.h headers to lib/raid/xor/$(SRCARCH)/xor_arch.h and include/linux/raid/xor_impl.h to lib/raid/xor/xor_impl.h so that the xor.ko module implementation is self-contained in lib/raid/. As this remove the asm-generic mechanism a new kconfig symbol is added to indicate that a architecture-specific implementations exists, and xor_arch.h should be included. Link: https://lkml.kernel.org/r/20260327061704.3707577-22-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Tested-by: Eric Biggers Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Andreas Larsson Cc: Anton Ivanov Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chris Mason Cc: Christian Borntraeger Cc: Dan Williams Cc: David S. Miller Cc: David Sterba Cc: Heiko Carstens Cc: Herbert Xu Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jason A. Donenfeld Cc: Johannes Berg Cc: Li Nan Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Richard Henderson Cc: Richard Weinberger Cc: Russell King Cc: Song Liu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/raid/xor_impl.h | 30 ------------------------------ 1 file changed, 30 deletions(-) delete mode 100644 include/linux/raid/xor_impl.h (limited to 'include/linux') diff --git a/include/linux/raid/xor_impl.h b/include/linux/raid/xor_impl.h deleted file mode 100644 index 6ed4c445ab24..000000000000 --- a/include/linux/raid/xor_impl.h +++ /dev/null @@ -1,30 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _XOR_IMPL_H -#define _XOR_IMPL_H - -#include - -struct xor_block_template { - struct xor_block_template *next; - const char *name; - int speed; - void (*do_2)(unsigned long, unsigned long * __restrict, - const unsigned long * __restrict); - void (*do_3)(unsigned long, unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict); - void (*do_4)(unsigned long, unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict); - void (*do_5)(unsigned long, unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict); -}; - -void __init xor_register(struct xor_block_template *tmpl); -void __init xor_force(struct xor_block_template *tmpl); - -#endif /* _XOR_IMPL_H */ -- cgit v1.2.3 From e420f0a88b24b80302f57965ceb7387aa3f12488 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Mar 2026 07:16:54 +0100 Subject: xor: add a better public API xor_blocks is very annoying to use, because it is limited to 4 + 1 sources / destinations, has an odd argument order and is completely undocumented. Lift the code that loops around it from btrfs and async_tx/async_xor into common code under the name xor_gen and properly document it. [hch@lst.de: make xor_blocks less annoying to use] Link: https://lkml.kernel.org/r/20260327061704.3707577-24-hch@lst.de Link: https://lkml.kernel.org/r/20260327061704.3707577-23-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Tested-by: Eric Biggers Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Andreas Larsson Cc: Anton Ivanov Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chris Mason Cc: Christian Borntraeger Cc: Dan Williams Cc: David S. Miller Cc: David Sterba Cc: Heiko Carstens Cc: Herbert Xu Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jason A. Donenfeld Cc: Johannes Berg Cc: Li Nan Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Richard Henderson Cc: Richard Weinberger Cc: Russell King Cc: Song Liu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/raid/xor.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/raid/xor.h b/include/linux/raid/xor.h index 02bda8d99534..6d9a39fd85dd 100644 --- a/include/linux/raid/xor.h +++ b/include/linux/raid/xor.h @@ -7,4 +7,6 @@ extern void xor_blocks(unsigned int count, unsigned int bytes, void *dest, void **srcs); +void xor_gen(void *dest, void **srcs, unsigned int src_cnt, unsigned int bytes); + #endif /* _XOR_H */ -- cgit v1.2.3 From 80dcf0a7832a5acde0f0701a4dc7b586fc8bcc88 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Mar 2026 07:16:58 +0100 Subject: xor: pass the entire operation to the low-level ops Currently the high-level xor code chunks up all operations into small units for only up to 1 + 4 vectors, and passes it to four different methods. This means the FPU/vector context is entered and left a lot for wide stripes, and a lot of indirect expensive indirect calls are performed. Switch to passing the entire gen_xor request to the low-level ops, and provide a macro to dispatch it to the existing helper. This reduce the number of indirect calls and FPU/vector context switches by a factor approaching nr_stripes / 4, and also reduces source and binary code size. Link: https://lkml.kernel.org/r/20260327061704.3707577-27-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Tested-by: Eric Biggers Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Andreas Larsson Cc: Anton Ivanov Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chris Mason Cc: Christian Borntraeger Cc: Dan Williams Cc: David S. Miller Cc: David Sterba Cc: Heiko Carstens Cc: Herbert Xu Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jason A. Donenfeld Cc: Johannes Berg Cc: Li Nan Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Richard Henderson Cc: Richard Weinberger Cc: Russell King Cc: Song Liu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/raid/xor.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/raid/xor.h b/include/linux/raid/xor.h index 6d9a39fd85dd..870558c9d36e 100644 --- a/include/linux/raid/xor.h +++ b/include/linux/raid/xor.h @@ -2,11 +2,6 @@ #ifndef _XOR_H #define _XOR_H -#define MAX_XOR_BLOCKS 4 - -extern void xor_blocks(unsigned int count, unsigned int bytes, - void *dest, void **srcs); - void xor_gen(void *dest, void **srcs, unsigned int src_cnt, unsigned int bytes); #endif /* _XOR_H */ -- cgit v1.2.3 From fe74eb289163d10b34f8ee571cdc3257306f343f Mon Sep 17 00:00:00 2001 From: Coiby Xu Date: Wed, 25 Feb 2026 14:03:45 +0800 Subject: crash: align the declaration of crash_load_dm_crypt_keys with CONFIG_CRASH_DM_CRYPT This will prevent a compilation failure when CONFIG_CRASH_DUMP is enabled but CONFIG_CRASH_DM_CRYPT is disabled, arch/powerpc/kexec/elf_64.c: In function 'elf64_load': >> arch/powerpc/kexec/elf_64.c:82:23: error: implicit declaration of function 'crash_load_dm_crypt_keys' [-Werror=implicit-function-declaration] 82 | ret = crash_load_dm_crypt_keys(image); | ^~~~~~~~~~~~~~~~~~~~~~~~ cc1: some warnings being treated as errors Link: https://lkml.kernel.org/r/20260225060347.718905-3-coxu@redhat.com Signed-off-by: Coiby Xu Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202602120648.RgQALnnI-lkp@intel.com/ Acked-by: Baoquan He Cc: Arnaud Lefebvre Cc: Christophe Leroy (CS GROUP) Cc: Dave Young Cc: Kairui Song Cc: Krzysztof Kozlowski Cc: Pingfan Liu Cc: Rob Herring Cc: Sourabh Jain Cc: Thomas Staudt Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/crash_core.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index d35726d6a415..c1dee3f971a9 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -34,13 +34,6 @@ static inline void arch_kexec_protect_crashkres(void) { } static inline void arch_kexec_unprotect_crashkres(void) { } #endif -#ifdef CONFIG_CRASH_DM_CRYPT -int crash_load_dm_crypt_keys(struct kimage *image); -ssize_t dm_crypt_keys_read(char *buf, size_t count, u64 *ppos); -#else -static inline int crash_load_dm_crypt_keys(struct kimage *image) {return 0; } -#endif - #ifndef arch_crash_handle_hotplug_event static inline void arch_crash_handle_hotplug_event(struct kimage *image, void *arg) { } #endif @@ -96,4 +89,11 @@ static inline void crash_save_cpu(struct pt_regs *regs, int cpu) {}; static inline int kimage_crash_copy_vmcoreinfo(struct kimage *image) { return 0; }; #endif /* CONFIG_CRASH_DUMP*/ +#ifdef CONFIG_CRASH_DM_CRYPT +int crash_load_dm_crypt_keys(struct kimage *image); +ssize_t dm_crypt_keys_read(char *buf, size_t count, u64 *ppos); +#else +static inline int crash_load_dm_crypt_keys(struct kimage *image) {return 0; } +#endif + #endif /* LINUX_CRASH_CORE_H */ -- cgit v1.2.3 From 49eac82653e13243cec5f8c25e35161b922a9c80 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 3 Apr 2026 14:03:37 +0200 Subject: Revert "usb: cdnsp: Add support for device-only configuration" This reverts commit 7b7f2dd913829e06705035dfc41ca25fa6ec68d3. There was some problems with an earlier cdns3 change, so this one needs to be backed out as well. Cc: Pawel Laszczak Cc: Bjorn Helgaas Reported-by: Peter Chen Link: https://lore.kernel.org/r/ac+LEWMCQpLSnfoD@nchen-desktop Signed-off-by: Greg Kroah-Hartman --- include/linux/pci_ids.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index a931fb201402..406abf629be2 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2424,7 +2424,6 @@ #define PCI_DEVICE_ID_CDNS_USBSS 0x0100 #define PCI_DEVICE_ID_CDNS_USB 0x0120 #define PCI_DEVICE_ID_CDNS_USBSSP 0x0200 -#define PCI_DEVICE_ID_CDNS_UDC_USBSSP 0x0400 #define PCI_VENDOR_ID_ARECA 0x17d3 #define PCI_DEVICE_ID_ARECA_1110 0x1110 -- cgit v1.2.3 From fa4a1ff8ab235a308d8c983827657a69649185fd Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 24 Mar 2026 19:13:19 +0000 Subject: locking: Add task::blocked_lock to serialize blocked_on state So far, we have been able to utilize the mutex::wait_lock for serializing the blocked_on state, but when we move to proxying across runqueues, we will need to add more state and a way to serialize changes to this state in contexts where we don't hold the mutex::wait_lock. So introduce the task::blocked_lock, which nests under the mutex::wait_lock in the locking order, and rework the locking to use it. Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Link: https://patch.msgid.link/20260324191337.1841376-5-jstultz@google.com --- include/linux/sched.h | 48 +++++++++++++++++------------------------------- 1 file changed, 17 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5a5d3dbc9cdf..2eef9bc6daaa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1238,6 +1238,7 @@ struct task_struct { #endif struct mutex *blocked_on; /* lock we're blocked on */ + raw_spinlock_t blocked_lock; #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER /* @@ -2181,57 +2182,42 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock) __must_hold(lock); #ifndef CONFIG_PREEMPT_RT static inline struct mutex *__get_task_blocked_on(struct task_struct *p) { - struct mutex *m = p->blocked_on; - - if (m) - lockdep_assert_held_once(&m->wait_lock); - return m; + lockdep_assert_held_once(&p->blocked_lock); + return p->blocked_on; } static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m) { - struct mutex *blocked_on = READ_ONCE(p->blocked_on); - WARN_ON_ONCE(!m); /* The task should only be setting itself as blocked */ WARN_ON_ONCE(p != current); - /* Currently we serialize blocked_on under the mutex::wait_lock */ - lockdep_assert_held_once(&m->wait_lock); + /* Currently we serialize blocked_on under the task::blocked_lock */ + lockdep_assert_held_once(&p->blocked_lock); /* * Check ensure we don't overwrite existing mutex value * with a different mutex. Note, setting it to the same * lock repeatedly is ok. */ - WARN_ON_ONCE(blocked_on && blocked_on != m); - WRITE_ONCE(p->blocked_on, m); -} - -static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m) -{ - guard(raw_spinlock_irqsave)(&m->wait_lock); - __set_task_blocked_on(p, m); + WARN_ON_ONCE(p->blocked_on && p->blocked_on != m); + p->blocked_on = m; } static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m) { - if (m) { - struct mutex *blocked_on = READ_ONCE(p->blocked_on); - - /* Currently we serialize blocked_on under the mutex::wait_lock */ - lockdep_assert_held_once(&m->wait_lock); - /* - * There may be cases where we re-clear already cleared - * blocked_on relationships, but make sure we are not - * clearing the relationship with a different lock. - */ - WARN_ON_ONCE(blocked_on && blocked_on != m); - } - WRITE_ONCE(p->blocked_on, NULL); + /* Currently we serialize blocked_on under the task::blocked_lock */ + lockdep_assert_held_once(&p->blocked_lock); + /* + * There may be cases where we re-clear already cleared + * blocked_on relationships, but make sure we are not + * clearing the relationship with a different lock. + */ + WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m); + p->blocked_on = NULL; } static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m) { - guard(raw_spinlock_irqsave)(&m->wait_lock); + guard(raw_spinlock_irqsave)(&p->blocked_lock); __clear_task_blocked_on(p, m); } #else -- cgit v1.2.3 From 2d7622669836dcbbb449741b4e6c503ffe005c25 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 24 Mar 2026 19:13:21 +0000 Subject: sched/locking: Add special p->blocked_on==PROXY_WAKING value for proxy return-migration As we add functionality to proxy execution, we may migrate a donor task to a runqueue where it can't run due to cpu affinity. Thus, we must be careful to ensure we return-migrate the task back to a cpu in its cpumask when it becomes unblocked. Peter helpfully provided the following example with pictures: "Suppose we have a ww_mutex cycle: ,-+-* Mutex-1 <-. Task-A ---' | | ,-- Task-B `-> Mutex-2 *-+-' Where Task-A holds Mutex-1 and tries to acquire Mutex-2, and where Task-B holds Mutex-2 and tries to acquire Mutex-1. Then the blocked_on->owner chain will go in circles. Task-A -> Mutex-2 ^ | | v Mutex-1 <- Task-B We need two things: - find_proxy_task() to stop iterating the circle; - the woken task to 'unblock' and run, such that it can back-off and re-try the transaction. Now, the current code [without this patch] does: __clear_task_blocked_on(); wake_q_add(); And surely clearing ->blocked_on is sufficient to break the cycle. Suppose it is Task-B that is made to back-off, then we have: Task-A -> Mutex-2 -> Task-B (no further blocked_on) and it would attempt to run Task-B. Or worse, it could directly pick Task-B and run it, without ever getting into find_proxy_task(). Now, here is a problem because Task-B might not be runnable on the CPU it is currently on; and because !task_is_blocked() we don't get into the proxy paths, so nobody is going to fix this up. Ideally we would have dequeued Task-B alongside of clearing ->blocked_on, but alas, [the lock ordering prevents us from getting the task_rq_lock() and] spoils things." Thus we need more than just a binary concept of the task being blocked on a mutex or not. So allow setting blocked_on to PROXY_WAKING as a special value which specifies the task is no longer blocked, but needs to be evaluated for return migration *before* it can be run. This will then be used in a later patch to handle proxy return-migration. Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Link: https://patch.msgid.link/20260324191337.1841376-7-jstultz@google.com --- include/linux/sched.h | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2eef9bc6daaa..8ec3b6d7d718 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2180,10 +2180,20 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock) __must_hold(lock); }) #ifndef CONFIG_PREEMPT_RT + +/* + * With proxy exec, if a task has been proxy-migrated, it may be a donor + * on a cpu that it can't actually run on. Thus we need a special state + * to denote that the task is being woken, but that it needs to be + * evaluated for return-migration before it is run. So if the task is + * blocked_on PROXY_WAKING, return migrate it before running it. + */ +#define PROXY_WAKING ((struct mutex *)(-1L)) + static inline struct mutex *__get_task_blocked_on(struct task_struct *p) { lockdep_assert_held_once(&p->blocked_lock); - return p->blocked_on; + return p->blocked_on == PROXY_WAKING ? NULL : p->blocked_on; } static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m) @@ -2211,7 +2221,7 @@ static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex * * blocked_on relationships, but make sure we are not * clearing the relationship with a different lock. */ - WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m); + WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m && p->blocked_on != PROXY_WAKING); p->blocked_on = NULL; } @@ -2220,6 +2230,35 @@ static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m) guard(raw_spinlock_irqsave)(&p->blocked_lock); __clear_task_blocked_on(p, m); } + +static inline void __set_task_blocked_on_waking(struct task_struct *p, struct mutex *m) +{ + /* Currently we serialize blocked_on under the task::blocked_lock */ + lockdep_assert_held_once(&p->blocked_lock); + + if (!sched_proxy_exec()) { + __clear_task_blocked_on(p, m); + return; + } + + /* Don't set PROXY_WAKING if blocked_on was already cleared */ + if (!p->blocked_on) + return; + /* + * There may be cases where we set PROXY_WAKING on tasks that were + * already set to waking, but make sure we are not changing + * the relationship with a different lock. + */ + WARN_ON_ONCE(m && p->blocked_on != m && p->blocked_on != PROXY_WAKING); + p->blocked_on = PROXY_WAKING; +} + +static inline void set_task_blocked_on_waking(struct task_struct *p, struct mutex *m) +{ + guard(raw_spinlock_irqsave)(&p->blocked_lock); + __set_task_blocked_on_waking(p, m); +} + #else static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) { @@ -2228,6 +2267,14 @@ static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mute static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) { } + +static inline void __set_task_blocked_on_waking(struct task_struct *p, struct rt_mutex *m) +{ +} + +static inline void set_task_blocked_on_waking(struct task_struct *p, struct rt_mutex *m) +{ +} #endif /* !CONFIG_PREEMPT_RT */ static __always_inline bool need_resched(void) -- cgit v1.2.3 From 4e2866b2baaddfff6069a2f18fc134c1d5a08f2b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 11 Mar 2026 12:18:54 -0400 Subject: SUNRPC: Add svc_rqst_page_release() helper svc_rqst_replace_page() releases displaced pages through a per-rqst folio batch, but exposes the add-or-flush sequence directly. svc_tcp_restore_pages() releases displaced pages individually with put_page(). Introduce svc_rqst_page_release() to encapsulate the batched release mechanism. Convert svc_rqst_replace_page() and svc_tcp_restore_pages() to use it. The latter now benefits from the same batched release that svc_rqst_replace_page() already uses. Reviewed-by: Christoph Hellwig Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 669c944eaf7f..1ebd9c7efa70 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -498,6 +498,21 @@ int svc_generic_rpcbind_set(struct net *net, #define RPC_MAX_ADDRBUFLEN (63U) +/** + * svc_rqst_page_release - release a page associated with an RPC transaction + * @rqstp: RPC transaction context + * @page: page to release + * + * Released pages are batched and freed together, reducing + * allocator pressure under heavy RPC workloads. + */ +static inline void svc_rqst_page_release(struct svc_rqst *rqstp, + struct page *page) +{ + if (!folio_batch_add(&rqstp->rq_fbatch, page_folio(page))) + __folio_batch_release(&rqstp->rq_fbatch); +} + /* * When we want to reduce the size of the reserved space in the response * buffer, we need to take into account the size of any checksum data that -- cgit v1.2.3 From e6898ec751e4d8577b210f8e816ea9f8c2a7158a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 2 Apr 2026 19:44:17 -0700 Subject: bpf: Sort subprogs in topological order after check_cfg() Add a pass that sorts subprogs in topological order so that iterating subprog_topo_order[] walks leaf subprogs first, then their callers. This is computed as a DFS post-order traversal of the CFG. The pass runs after check_cfg() to ensure the CFG has been validated before traversing and after postorder has been computed to avoid walking dead code. Reviewed-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260403024422.87231-3-alexei.starovoitov@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b129e0aaee20..d21541f96ee9 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -787,6 +787,8 @@ struct bpf_verifier_env { const struct bpf_line_info *prev_linfo; struct bpf_verifier_log log; struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 2]; /* max + 2 for the fake and exception subprogs */ + /* subprog indices sorted in topological order: leaves first, callers last */ + int subprog_topo_order[BPF_MAX_SUBPROGS + 2]; union { struct bpf_idmap idmap_scratch; struct bpf_idset idset_scratch; -- cgit v1.2.3 From f1606dd0ac49230f5a5fa1a279210fdf0249c20f Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 2 Apr 2026 19:44:19 -0700 Subject: bpf: Add bpf_compute_const_regs() and bpf_prune_dead_branches() passes Add two passes before the main verifier pass: bpf_compute_const_regs() is a forward dataflow analysis that tracks register values in R0-R9 across the program using fixed-point iteration in reverse postorder. Each register is tracked with a six-state lattice: UNVISITED -> CONST(val) / MAP_PTR(map_index) / MAP_VALUE(map_index, offset) / SUBPROG(num) -> UNKNOWN At merge points, if two paths produce the same state and value for a register, it stays; otherwise it becomes UNKNOWN. The analysis handles: - MOV, ADD, SUB, AND with immediate or register operands - LD_IMM64 for plain constants, map FDs, map values, and subprogs - LDX from read-only maps: constant-folds the load by reading the map value directly via bpf_map_direct_read() Results that fit in 32 bits are stored per-instruction in insn_aux_data and bitmasks. bpf_prune_dead_branches() uses the computed constants to evaluate conditional branches. When both operands of a conditional jump are known constants, the branch outcome is determined statically and the instruction is rewritten to an unconditional jump. The CFG postorder is then recomputed to reflect new control flow. This eliminates dead edges so that subsequent liveness analysis doesn't propagate through dead code. Also add runtime sanity check to validate that precomputed constants match the verifier's tracked state. Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260403024422.87231-5-alexei.starovoitov@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d21541f96ee9..c5e65cdb6328 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -595,6 +595,18 @@ struct bpf_insn_aux_data { u32 scc; /* registers alive before this instruction. */ u16 live_regs_before; + /* + * Bitmask of R0-R9 that hold known values at this instruction. + * const_reg_mask: scalar constants that fit in 32 bits. + * const_reg_map_mask: map pointers, val is map_index into used_maps[]. + * const_reg_subprog_mask: subprog pointers, val is subprog number. + * const_reg_vals[i] holds the 32-bit value for register i. + * Populated by compute_const_regs() pre-pass. + */ + u16 const_reg_mask; + u16 const_reg_map_mask; + u16 const_reg_subprog_mask; + u32 const_reg_vals[10]; }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ @@ -945,6 +957,10 @@ void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab); int mark_chain_precision(struct bpf_verifier_env *env, int regno); +bool bpf_map_is_rdonly(const struct bpf_map *map); +int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val, + bool is_ldsx); + #define BPF_BASE_TYPE_MASK GENMASK(BPF_BASE_TYPE_BITS - 1, 0) /* extract base type from bpf_{arg, return, reg}_type. */ @@ -1088,6 +1104,13 @@ struct bpf_iarray *bpf_insn_successors(struct bpf_verifier_env *env, u32 idx); void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask); bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx); +int bpf_find_subprog(struct bpf_verifier_env *env, int off); +int bpf_compute_const_regs(struct bpf_verifier_env *env); +int bpf_prune_dead_branches(struct bpf_verifier_env *env); +int bpf_compute_postorder(struct bpf_verifier_env *env); +bool bpf_insn_is_cond_jump(u8 code); +bool bpf_is_may_goto_insn(struct bpf_insn *insn); + int bpf_stack_liveness_init(struct bpf_verifier_env *env); void bpf_stack_liveness_free(struct bpf_verifier_env *env); int bpf_update_live_stack(struct bpf_verifier_env *env); -- cgit v1.2.3 From 19dbb1347481105e8aabc7479af35c09a65333a9 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 2 Apr 2026 19:44:20 -0700 Subject: bpf: Move verifier helpers to header Move several helpers to header as preparation for the subsequent stack liveness patches. Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260403024422.87231-6-alexei.starovoitov@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c5e65cdb6328..7bd32a8a45f6 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -879,6 +879,30 @@ static inline struct bpf_subprog_info *subprog_info(struct bpf_verifier_env *env return &env->subprog_info[subprog]; } +struct bpf_call_summary { + u8 num_params; + bool is_void; + bool fastcall; +}; + +static inline bool bpf_helper_call(const struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_CALL) && + insn->src_reg == 0; +} + +static inline bool bpf_pseudo_call(const struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_CALL) && + insn->src_reg == BPF_PSEUDO_CALL; +} + +static inline bool bpf_pseudo_kfunc_call(const struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_CALL) && + insn->src_reg == BPF_PSEUDO_KFUNC_CALL; +} + __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, va_list args); __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, @@ -1111,6 +1135,10 @@ int bpf_compute_postorder(struct bpf_verifier_env *env); bool bpf_insn_is_cond_jump(u8 code); bool bpf_is_may_goto_insn(struct bpf_insn *insn); +void bpf_verbose_insn(struct bpf_verifier_env *env, struct bpf_insn *insn); +bool bpf_get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call, + struct bpf_call_summary *cs); + int bpf_stack_liveness_init(struct bpf_verifier_env *env); void bpf_stack_liveness_free(struct bpf_verifier_env *env); int bpf_update_live_stack(struct bpf_verifier_env *env); -- cgit v1.2.3 From 1a1cadbd5d50b31ae1340c2a9938947719696ca0 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 2 Apr 2026 19:44:21 -0700 Subject: bpf: Add helper and kfunc stack access size resolution The static stack liveness analysis needs to know how many bytes a helper or kfunc accesses through a stack pointer argument, so it can precisely mark the affected stack slots as stack 'def' or 'use'. Add bpf_helper_stack_access_bytes() and bpf_kfunc_stack_access_bytes() which resolve the access size for a given call argument. Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260403024422.87231-7-alexei.starovoitov@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7bd32a8a45f6..36bfd96d4563 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -1138,6 +1138,12 @@ bool bpf_is_may_goto_insn(struct bpf_insn *insn); void bpf_verbose_insn(struct bpf_verifier_env *env, struct bpf_insn *insn); bool bpf_get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call, struct bpf_call_summary *cs); +s64 bpf_helper_stack_access_bytes(struct bpf_verifier_env *env, + struct bpf_insn *insn, int arg, + int insn_idx); +s64 bpf_kfunc_stack_access_bytes(struct bpf_verifier_env *env, + struct bpf_insn *insn, int arg, + int insn_idx); int bpf_stack_liveness_init(struct bpf_verifier_env *env); void bpf_stack_liveness_free(struct bpf_verifier_env *env); -- cgit v1.2.3 From 624bf3440d7214b62c22d698a0a294323f331d5d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 10 Mar 2026 16:48:12 -0700 Subject: KVM: SEV: Disallow LAUNCH_FINISH if vCPUs are actively being created Reject LAUNCH_FINISH for SEV-ES and SNP VMs if KVM is actively creating one or more vCPUs, as KVM needs to process and encrypt each vCPU's VMSA. Letting userspace create vCPUs while LAUNCH_FINISH is in-progress is "fine", at least in the current code base, as kvm_for_each_vcpu() operates on online_vcpus, LAUNCH_FINISH (all SEV+ sub-ioctls) holds kvm->mutex, and fully onlining a vCPU in kvm_vm_ioctl_create_vcpu() is done under kvm->mutex. I.e. there's no difference between an in-progress vCPU and a vCPU that is created entirely after LAUNCH_FINISH. However, given that concurrent LAUNCH_FINISH and vCPU creation can't possibly work (for any reasonable definition of "work"), since userspace can't guarantee whether a particular vCPU will be encrypted or not, disallow the combination as a hardening measure, to reduce the probability of introducing bugs in the future, and to avoid having to reason about the safety of future changes related to LAUNCH_FINISH. Cc: Jethro Beekman Closes: https://lore.kernel.org/all/b31f7c6e-2807-4662-bcdd-eea2c1e132fa@fortanix.com Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260310234829.2608037-5-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 34759a262b28..3c7f8557f7af 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1029,6 +1029,13 @@ static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id) return NULL; } +static inline bool kvm_is_vcpu_creation_in_progress(struct kvm *kvm) +{ + lockdep_assert_held(&kvm->lock); + + return kvm->created_vcpus != atomic_read(&kvm->online_vcpus); +} + void kvm_destroy_vcpus(struct kvm *kvm); int kvm_trylock_all_vcpus(struct kvm *kvm); -- cgit v1.2.3 From 9617b5b62c7cf4284740ba5efdbf083aa5a87e5f Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 2 Apr 2026 16:15:02 +0200 Subject: kernel: ksysfs: initialize kernel_kobj earlier Software nodes depend on kernel_kobj which is initialized pretty late into the boot process - as a core_initcall(). Ahead of moving the software node initialization to driver_init() we must first make kernel_kobj available before it. Make ksysfs_init() visible in a new header - ksysfs.h - and call it in do_basic_setup() right before driver_init(). Signed-off-by: Bartosz Golaszewski Link: https://patch.msgid.link/20260402-nokia770-gpio-swnodes-v5-1-d730db3dd299@oss.qualcomm.com Signed-off-by: Danilo Krummrich --- include/linux/ksysfs.h | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 include/linux/ksysfs.h (limited to 'include/linux') diff --git a/include/linux/ksysfs.h b/include/linux/ksysfs.h new file mode 100644 index 000000000000..c7dc6e18f28e --- /dev/null +++ b/include/linux/ksysfs.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _KSYSFS_H_ +#define _KSYSFS_H_ + +void ksysfs_init(void); + +#endif /* _KSYSFS_H_ */ -- cgit v1.2.3 From 6af36aeb147a06dea47c49859cd6ca5659aeb987 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Fri, 19 Dec 2025 13:18:22 -0500 Subject: lsm: add backing_file LSM hooks Stacked filesystems such as overlayfs do not currently provide the necessary mechanisms for LSMs to properly enforce access controls on the mmap() and mprotect() operations. In order to resolve this gap, a LSM security blob is being added to the backing_file struct and the following new LSM hooks are being created: security_backing_file_alloc() security_backing_file_free() security_mmap_backing_file() The first two hooks are to manage the lifecycle of the LSM security blob in the backing_file struct, while the third provides a new mmap() access control point for the underlying backing file. It is also expected that LSMs will likely want to update their security_file_mprotect() callback to address issues with their mprotect() controls, but that does not require a change to the security_file_mprotect() LSM hook. There are a three other small changes to support these new LSM hooks: * Pass the user file associated with a backing file down to alloc_empty_backing_file() so it can be included in the security_backing_file_alloc() hook. * Add getter and setter functions for the backing_file struct LSM blob as the backing_file struct remains private to fs/file_table.c. * Constify the file struct field in the LSM common_audit_data struct to better support LSMs that need to pass a const file struct pointer into the common LSM audit code. Thanks to Arnd Bergmann for identifying the missing EXPORT_SYMBOL_GPL() and supplying a fixup. Cc: stable@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Cc: linux-unionfs@vger.kernel.org Cc: linux-erofs@lists.ozlabs.org Reviewed-by: Amir Goldstein Reviewed-by: Serge Hallyn Reviewed-by: Christian Brauner Signed-off-by: Paul Moore --- include/linux/backing-file.h | 4 ++-- include/linux/fs.h | 13 +++++++++++++ include/linux/lsm_audit.h | 2 +- include/linux/lsm_hook_defs.h | 5 +++++ include/linux/lsm_hooks.h | 1 + include/linux/security.h | 22 ++++++++++++++++++++++ 6 files changed, 44 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h index 1476a6ed1bfd..c939cd222730 100644 --- a/include/linux/backing-file.h +++ b/include/linux/backing-file.h @@ -18,10 +18,10 @@ struct backing_file_ctx { void (*end_write)(struct kiocb *iocb, ssize_t); }; -struct file *backing_file_open(const struct path *user_path, int flags, +struct file *backing_file_open(const struct file *user_file, int flags, const struct path *real_path, const struct cred *cred); -struct file *backing_tmpfile_open(const struct path *user_path, int flags, +struct file *backing_tmpfile_open(const struct file *user_file, int flags, const struct path *real_parentpath, umode_t mode, const struct cred *cred); ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter, diff --git a/include/linux/fs.h b/include/linux/fs.h index 8b3dd145b25e..d0d0e8f55589 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2475,6 +2475,19 @@ struct file *dentry_create(struct path *path, int flags, umode_t mode, const struct cred *cred); const struct path *backing_file_user_path(const struct file *f); +#ifdef CONFIG_SECURITY +void *backing_file_security(const struct file *f); +void backing_file_set_security(struct file *f, void *security); +#else +static inline void *backing_file_security(const struct file *f) +{ + return NULL; +} +static inline void backing_file_set_security(struct file *f, void *security) +{ +} +#endif /* CONFIG_SECURITY */ + /* * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file * stored in ->vm_file is a backing file whose f_inode is on the underlying diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h index 382c56a97bba..584db296e43b 100644 --- a/include/linux/lsm_audit.h +++ b/include/linux/lsm_audit.h @@ -94,7 +94,7 @@ struct common_audit_data { #endif char *kmod_name; struct lsm_ioctlop_audit *op; - struct file *file; + const struct file *file; struct lsm_ibpkey_audit *ibpkey; struct lsm_ibendport_audit *ibendport; int reason; diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 8c42b4bde09c..b4958167e381 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -191,6 +191,9 @@ LSM_HOOK(int, 0, file_permission, struct file *file, int mask) LSM_HOOK(int, 0, file_alloc_security, struct file *file) LSM_HOOK(void, LSM_RET_VOID, file_release, struct file *file) LSM_HOOK(void, LSM_RET_VOID, file_free_security, struct file *file) +LSM_HOOK(int, 0, backing_file_alloc, struct file *backing_file, + const struct file *user_file) +LSM_HOOK(void, LSM_RET_VOID, backing_file_free, struct file *backing_file) LSM_HOOK(int, 0, file_ioctl, struct file *file, unsigned int cmd, unsigned long arg) LSM_HOOK(int, 0, file_ioctl_compat, struct file *file, unsigned int cmd, @@ -198,6 +201,8 @@ LSM_HOOK(int, 0, file_ioctl_compat, struct file *file, unsigned int cmd, LSM_HOOK(int, 0, mmap_addr, unsigned long addr) LSM_HOOK(int, 0, mmap_file, struct file *file, unsigned long reqprot, unsigned long prot, unsigned long flags) +LSM_HOOK(int, 0, mmap_backing_file, struct vm_area_struct *vma, + struct file *backing_file, struct file *user_file) LSM_HOOK(int, 0, file_mprotect, struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot) LSM_HOOK(int, 0, file_lock, struct file *file, unsigned int cmd) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index d48bf0ad26f4..b4f8cad53ddb 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -104,6 +104,7 @@ struct security_hook_list { struct lsm_blob_sizes { unsigned int lbs_cred; unsigned int lbs_file; + unsigned int lbs_backing_file; unsigned int lbs_ib; unsigned int lbs_inode; unsigned int lbs_sock; diff --git a/include/linux/security.h b/include/linux/security.h index 83a646d72f6f..ad99b35891a6 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -471,11 +471,17 @@ int security_file_permission(struct file *file, int mask); int security_file_alloc(struct file *file); void security_file_release(struct file *file); void security_file_free(struct file *file); +int security_backing_file_alloc(struct file *backing_file, + const struct file *user_file); +void security_backing_file_free(struct file *backing_file); int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int security_file_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg); int security_mmap_file(struct file *file, unsigned long prot, unsigned long flags); +int security_mmap_backing_file(struct vm_area_struct *vma, + struct file *backing_file, + struct file *user_file); int security_mmap_addr(unsigned long addr); int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot); @@ -1140,6 +1146,15 @@ static inline void security_file_release(struct file *file) static inline void security_file_free(struct file *file) { } +static inline int security_backing_file_alloc(struct file *backing_file, + const struct file *user_file) +{ + return 0; +} + +static inline void security_backing_file_free(struct file *backing_file) +{ } + static inline int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -1159,6 +1174,13 @@ static inline int security_mmap_file(struct file *file, unsigned long prot, return 0; } +static inline int security_mmap_backing_file(struct vm_area_struct *vma, + struct file *backing_file, + struct file *user_file) +{ + return 0; +} + static inline int security_mmap_addr(unsigned long addr) { return cap_mmap_addr(addr); -- cgit v1.2.3 From a9b460225e47a3d98296eba71c62ff0ad58a2032 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 2 Apr 2026 15:26:54 +0000 Subject: net: always inline some skb helpers Some performance critical helpers from include/linux/skbuff.h are not inlined by clang. Use __always_inline hint for: - __skb_fill_netmem_desc() - __skb_fill_page_desc() - skb_fill_netmem_desc() - skb_fill_page_desc() - __skb_pull() - pskb_may_pull_reason() - pskb_may_pull() - pskb_pull() - pskb_trim() - skb_orphan() - skb_postpull_rcsum() - skb_header_pointer() - skb_clear_delivery_time() - skb_tstamp_cond() - skb_warn_if_lro() This increases performance and saves ~1200 bytes of text. $ scripts/bloat-o-meter -t vmlinux.old vmlinux.new add/remove: 4/24 grow/shrink: 66/12 up/down: 4104/-5306 (-1202) Function old new delta ip_multipath_l3_keys - 303 +303 tcp_sendmsg_locked 4560 4848 +288 xfrm_input 6240 6455 +215 esp_output_head 1516 1711 +195 skb_try_coalesce 696 866 +170 bpf_prog_test_run_skb 1951 2091 +140 tls_strp_read_copy 528 667 +139 gue_udp_recv 738 871 +133 __ip6_append_data 4159 4279 +120 __bond_xmit_hash 1019 1122 +103 ip6_multipath_l3_keys 394 495 +101 bpf_lwt_seg6_action 1096 1197 +101 input_action_end_dx2 344 442 +98 vxlan_remcsum 487 581 +94 udpv6_queue_rcv_skb 393 480 +87 udp_queue_rcv_skb 385 471 +86 gue_remcsum 453 539 +86 udp_lib_checksum_complete 84 168 +84 vxlan_xmit 2777 2857 +80 nf_reset_ct 456 532 +76 igmp_rcv 1902 1978 +76 mpls_forward 1097 1169 +72 tcp_add_backlog 1226 1292 +66 nfulnl_log_packet 3091 3156 +65 tcp_rcv_established 1966 2026 +60 __strp_recv 1547 1603 +56 eth_type_trans 357 411 +54 bond_flow_ip 392 444 +52 __icmp_send 1584 1630 +46 ip_defrag 1636 1681 +45 tpacket_rcv 2793 2837 +44 refcount_add 132 176 +44 nf_ct_frag6_gather 1959 2003 +44 napi_skb_free_stolen_head 199 240 +41 __pskb_trim - 41 +41 napi_reuse_skb 319 358 +39 icmpv6_rcv 1877 1916 +39 br_handle_frame_finish 1672 1711 +39 ip_rcv_core 841 879 +38 ip_check_defrag 377 415 +38 br_stp_rcv 909 947 +38 qdisc_pkt_len_segs_init 366 399 +33 mld_query_work 2945 2975 +30 bpf_sk_assign_tcp_reqsk 607 637 +30 udp_gro_receive 1657 1686 +29 ip6_rcv_core 1170 1193 +23 ah_input 1176 1197 +21 tun_get_user 5174 5194 +20 llc_rcv 815 834 +19 __pfx_udp_lib_checksum_complete 16 32 +16 __pfx_refcount_add 48 64 +16 __pfx_nf_reset_ct 96 112 +16 __pfx_ip_multipath_l3_keys - 16 +16 __pfx___pskb_trim - 16 +16 packet_sendmsg 5771 5781 +10 esp_output_tail 1460 1470 +10 alloc_skb_with_frags 433 443 +10 xsk_generic_xmit 3477 3486 +9 mptcp_sendmsg_frag 2250 2259 +9 __ip_append_data 4166 4175 +9 __ip6_tnl_rcv 1159 1168 +9 skb_zerocopy 1215 1220 +5 gre_parse_header 1358 1362 +4 __iptunnel_pull_header 405 407 +2 skb_vlan_untag 692 693 +1 psp_dev_rcv 701 702 +1 netkit_xmit 1263 1264 +1 gre_rcv 2776 2777 +1 gre_gso_segment 1521 1522 +1 bpf_skb_net_hdr_pop 535 536 +1 udp6_ufo_fragment 888 884 -4 br_multicast_rcv 9154 9148 -6 snap_rcv 312 305 -7 skb_copy_ubufs 1841 1834 -7 __pfx_skb_tstamp_cond 16 - -16 __pfx_skb_clear_delivery_time 16 - -16 __pfx_pskb_trim 16 - -16 __pfx_pskb_pull 16 - -16 ipv6_gso_segment 1400 1383 -17 ipv6_frag_rcv 2511 2492 -19 erspan_xmit 1221 1190 -31 __pfx_skb_warn_if_lro 32 - -32 __pfx___skb_fill_page_desc 32 - -32 skb_tstamp_cond 42 - -42 pskb_trim 46 - -46 __pfx_skb_postpull_rcsum 48 - -48 tcp_gso_segment 1524 1475 -49 skb_clear_delivery_time 54 - -54 __pfx_skb_fill_page_desc 64 - -64 __pfx_skb_header_pointer 80 - -80 pskb_pull 91 - -91 skb_warn_if_lro 110 - -110 tcp_v6_rcv 3288 3170 -118 __pfx___skb_pull 128 - -128 __pfx_skb_orphan 144 - -144 __pfx_pskb_may_pull 160 - -160 tcp_v4_rcv 3334 3153 -181 __skb_fill_page_desc 231 - -231 udp_rcv 1809 1553 -256 skb_postpull_rcsum 318 - -318 skb_header_pointer 367 - -367 fib_multipath_hash 3399 3018 -381 skb_orphan 513 - -513 skb_fill_page_desc 534 - -534 __skb_pull 568 - -568 pskb_may_pull 604 - -604 Total: Before=29652698, After=29651496, chg -0.00% Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260402152654.1720627-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 46 ++++++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index fbfa9852e82a..26fe18bcfad8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2605,8 +2605,9 @@ static inline void skb_len_add(struct sk_buff *skb, int delta) * * Does not take any additional reference on the fragment. */ -static inline void __skb_fill_netmem_desc(struct sk_buff *skb, int i, - netmem_ref netmem, int off, int size) +static __always_inline void +__skb_fill_netmem_desc(struct sk_buff *skb, int i, netmem_ref netmem, + int off, int size) { struct page *page; @@ -2628,14 +2629,16 @@ static inline void __skb_fill_netmem_desc(struct sk_buff *skb, int i, skb->pfmemalloc = true; } -static inline void __skb_fill_page_desc(struct sk_buff *skb, int i, - struct page *page, int off, int size) +static __always_inline void +__skb_fill_page_desc(struct sk_buff *skb, int i, struct page *page, + int off, int size) { __skb_fill_netmem_desc(skb, i, page_to_netmem(page), off, size); } -static inline void skb_fill_netmem_desc(struct sk_buff *skb, int i, - netmem_ref netmem, int off, int size) +static __always_inline void +skb_fill_netmem_desc(struct sk_buff *skb, int i, netmem_ref netmem, + int off, int size) { __skb_fill_netmem_desc(skb, i, netmem, off, size); skb_shinfo(skb)->nr_frags = i + 1; @@ -2655,8 +2658,9 @@ static inline void skb_fill_netmem_desc(struct sk_buff *skb, int i, * * Does not take any additional reference on the fragment. */ -static inline void skb_fill_page_desc(struct sk_buff *skb, int i, - struct page *page, int off, int size) +static __always_inline void +skb_fill_page_desc(struct sk_buff *skb, int i, struct page *page, + int off, int size) { skb_fill_netmem_desc(skb, i, page_to_netmem(page), off, size); } @@ -2828,7 +2832,7 @@ static inline void *__skb_push(struct sk_buff *skb, unsigned int len) } void *skb_pull(struct sk_buff *skb, unsigned int len); -static inline void *__skb_pull(struct sk_buff *skb, unsigned int len) +static __always_inline void *__skb_pull(struct sk_buff *skb, unsigned int len) { DEBUG_NET_WARN_ON_ONCE(len > INT_MAX); @@ -2853,7 +2857,7 @@ void *skb_pull_data(struct sk_buff *skb, size_t len); void *__pskb_pull_tail(struct sk_buff *skb, int delta); -static inline enum skb_drop_reason +static __always_inline enum skb_drop_reason pskb_may_pull_reason(struct sk_buff *skb, unsigned int len) { DEBUG_NET_WARN_ON_ONCE(len > INT_MAX); @@ -2871,12 +2875,13 @@ pskb_may_pull_reason(struct sk_buff *skb, unsigned int len) return SKB_NOT_DROPPED_YET; } -static inline bool pskb_may_pull(struct sk_buff *skb, unsigned int len) +static __always_inline bool +pskb_may_pull(struct sk_buff *skb, unsigned int len) { return pskb_may_pull_reason(skb, len) == SKB_NOT_DROPPED_YET; } -static inline void *pskb_pull(struct sk_buff *skb, unsigned int len) +static __always_inline void *pskb_pull(struct sk_buff *skb, unsigned int len) { if (!pskb_may_pull(skb, len)) return NULL; @@ -3337,7 +3342,7 @@ static inline int __pskb_trim(struct sk_buff *skb, unsigned int len) return 0; } -static inline int pskb_trim(struct sk_buff *skb, unsigned int len) +static __always_inline int pskb_trim(struct sk_buff *skb, unsigned int len) { skb_might_realloc(skb); return (len < skb->len) ? __pskb_trim(skb, len) : 0; @@ -3380,7 +3385,7 @@ static inline int __skb_grow(struct sk_buff *skb, unsigned int len) * destructor function and make the @skb unowned. The buffer continues * to exist but is no longer charged to its former owner. */ -static inline void skb_orphan(struct sk_buff *skb) +static __always_inline void skb_orphan(struct sk_buff *skb) { if (skb->destructor) { skb->destructor(skb); @@ -4044,8 +4049,8 @@ __skb_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len, * update the CHECKSUM_COMPLETE checksum, or set ip_summed to * CHECKSUM_NONE so that it can be recomputed from scratch. */ -static inline void skb_postpull_rcsum(struct sk_buff *skb, - const void *start, unsigned int len) +static __always_inline void +skb_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len) { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = wsum_negate(csum_partial(start, len, @@ -4304,7 +4309,7 @@ __skb_header_pointer(const struct sk_buff *skb, int offset, int len, return buffer; } -static inline void * __must_check +static __always_inline void * __must_check skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer) { return __skb_header_pointer(skb, offset, len, skb->data, @@ -4476,7 +4481,7 @@ DECLARE_STATIC_KEY_FALSE(netstamp_needed_key); /* It is used in the ingress path to clear the delivery_time. * If needed, set the skb->tstamp to the (rcv) timestamp. */ -static inline void skb_clear_delivery_time(struct sk_buff *skb) +static __always_inline void skb_clear_delivery_time(struct sk_buff *skb) { if (skb->tstamp_type) { skb->tstamp_type = SKB_CLOCK_REALTIME; @@ -4503,7 +4508,8 @@ static inline ktime_t skb_tstamp(const struct sk_buff *skb) return skb->tstamp; } -static inline ktime_t skb_tstamp_cond(const struct sk_buff *skb, bool cond) +static __always_inline ktime_t +skb_tstamp_cond(const struct sk_buff *skb, bool cond) { if (skb->tstamp_type != SKB_CLOCK_MONOTONIC && skb->tstamp) return skb->tstamp; @@ -5293,7 +5299,7 @@ static inline void skb_decrease_gso_size(struct skb_shared_info *shinfo, void __skb_warn_lro_forwarding(const struct sk_buff *skb); -static inline bool skb_warn_if_lro(const struct sk_buff *skb) +static __always_inline bool skb_warn_if_lro(const struct sk_buff *skb) { /* LRO sets gso_size but not gso_type, whereas if GSO is really * wanted then gso_type will be set. */ -- cgit v1.2.3 From 10a4206a24013be4d558d476010cbf2eb4c9fa64 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Tue, 24 Mar 2026 01:59:09 +0100 Subject: PCI: use generic driver_override infrastructure When a driver is probed through __driver_attach(), the bus' match() callback is called without the device lock held, thus accessing the driver_override field without a lock, which can cause a UAF. Fix this by using the driver-core driver_override infrastructure taking care of proper locking internally. Note that calling match() from __driver_attach() without the device lock held is intentional. [1] Link: https://lore.kernel.org/driver-core/DGRGTIRHA62X.3RY09D9SOK77P@kernel.org/ [1] Reported-by: Gui-Dong Han Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220789 Fixes: 782a985d7af2 ("PCI: Introduce new device binding path using pci_dev.driver_override") Acked-by: Bjorn Helgaas Acked-by: Alex Williamson Tested-by: Gui-Dong Han Reviewed-by: Gui-Dong Han Link: https://patch.msgid.link/20260324005919.2408620-6-dakr@kernel.org Signed-off-by: Danilo Krummrich --- include/linux/pci.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 1c270f1d5123..57e9463e4347 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -575,12 +575,6 @@ struct pci_dev { u8 supported_speeds; /* Supported Link Speeds Vector */ phys_addr_t rom; /* Physical address if not from BAR */ size_t romlen; /* Length if not from BAR */ - /* - * Driver name to force a match. Do not set directly, because core - * frees it. Use driver_set_override() to set or clear it. - */ - const char *driver_override; - unsigned long priv_flags; /* Private flags for the PCI driver */ /* These methods index pci_reset_fn_methods[] */ -- cgit v1.2.3 From 8a700b1fc94df4d847a04f14ebc7f8532592b367 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Tue, 24 Mar 2026 01:59:10 +0100 Subject: platform/wmi: use generic driver_override infrastructure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a driver is probed through __driver_attach(), the bus' match() callback is called without the device lock held, thus accessing the driver_override field without a lock, which can cause a UAF. Fix this by using the driver-core driver_override infrastructure taking care of proper locking internally. Note that calling match() from __driver_attach() without the device lock held is intentional. [1] Link: https://lore.kernel.org/driver-core/DGRGTIRHA62X.3RY09D9SOK77P@kernel.org/ [1] Reported-by: Gui-Dong Han Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220789 Fixes: 12046f8c77e0 ("platform/x86: wmi: Add driver_override support") Reviewed-by: Armin Wolf Acked-by: Ilpo Järvinen Link: https://patch.msgid.link/20260324005919.2408620-7-dakr@kernel.org Signed-off-by: Danilo Krummrich --- include/linux/wmi.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wmi.h b/include/linux/wmi.h index 75cb0c7cfe57..14fb644e1701 100644 --- a/include/linux/wmi.h +++ b/include/linux/wmi.h @@ -18,16 +18,12 @@ * struct wmi_device - WMI device structure * @dev: Device associated with this WMI device * @setable: True for devices implementing the Set Control Method - * @driver_override: Driver name to force a match; do not set directly, - * because core frees it; use driver_set_override() to - * set or clear it. * * This represents WMI devices discovered by the WMI driver core. */ struct wmi_device { struct device dev; bool setable; - const char *driver_override; }; /** -- cgit v1.2.3 From 85bb534ff12aab6916058897b39c748940a7a4c6 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Tue, 24 Mar 2026 01:59:12 +0100 Subject: vdpa: use generic driver_override infrastructure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a driver is probed through __driver_attach(), the bus' match() callback is called without the device lock held, thus accessing the driver_override field without a lock, which can cause a UAF. Fix this by using the driver-core driver_override infrastructure taking care of proper locking internally. Note that calling match() from __driver_attach() without the device lock held is intentional. [1] Link: https://lore.kernel.org/driver-core/DGRGTIRHA62X.3RY09D9SOK77P@kernel.org/ [1] Reported-by: Gui-Dong Han Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220789 Fixes: 539fec78edb4 ("vdpa: add driver_override support") Acked-by: Eugenio Pérez Acked-by: Michael S. Tsirkin Link: https://patch.msgid.link/20260324005919.2408620-9-dakr@kernel.org Signed-off-by: Danilo Krummrich --- include/linux/vdpa.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 2bfe3baa63f4..782c42d25db1 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -72,9 +72,6 @@ struct vdpa_mgmt_dev; * struct vdpa_device - representation of a vDPA device * @dev: underlying device * @vmap: the metadata passed to upper layer to be used for mapping - * @driver_override: driver name to force a match; do not set directly, - * because core frees it; use driver_set_override() to - * set or clear it. * @config: the configuration ops for this device. * @map: the map ops for this device * @cf_lock: Protects get and set access to configuration layout. @@ -90,7 +87,6 @@ struct vdpa_mgmt_dev; struct vdpa_device { struct device dev; union virtio_map vmap; - const char *driver_override; const struct vdpa_config_ops *config; const struct virtio_map_ops *map; struct rw_semaphore cf_lock; /* Protects get/set config */ -- cgit v1.2.3 From 15ed91aa84ea7bacef3c24286d5136055b4335a8 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Thu, 2 Apr 2026 20:40:56 +0200 Subject: dpll: add frequency monitoring callback ops Add new callback operations for a dpll device: - freq_monitor_get(..) - to obtain current state of frequency monitor feature from dpll device, - freq_monitor_set(..) - to allow feature configuration. Add new callback operation for a dpll pin: - measured_freq_get(..) - to obtain the measured frequency in mHz. Obtain the feature state value using the get callback and provide it to the user if the device driver implements callbacks. The measured_freq_get pin callback is only invoked when the frequency monitor is enabled. The freq_monitor_get device callback is required when measured_freq_get is provided by the driver. Execute the set callback upon user requests. Reviewed-by: Vadim Fedorenko Signed-off-by: Ivan Vecera Link: https://patch.msgid.link/20260402184057.1890514-3-ivecera@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/dpll.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dpll.h b/include/linux/dpll.h index 2ce295b46b8c..b7277a8b484d 100644 --- a/include/linux/dpll.h +++ b/include/linux/dpll.h @@ -52,6 +52,12 @@ struct dpll_device_ops { int (*phase_offset_avg_factor_get)(const struct dpll_device *dpll, void *dpll_priv, u32 *factor, struct netlink_ext_ack *extack); + int (*freq_monitor_set)(const struct dpll_device *dpll, void *dpll_priv, + enum dpll_feature_state state, + struct netlink_ext_ack *extack); + int (*freq_monitor_get)(const struct dpll_device *dpll, void *dpll_priv, + enum dpll_feature_state *state, + struct netlink_ext_ack *extack); }; struct dpll_pin_ops { @@ -110,6 +116,10 @@ struct dpll_pin_ops { int (*ffo_get)(const struct dpll_pin *pin, void *pin_priv, const struct dpll_device *dpll, void *dpll_priv, s64 *ffo, struct netlink_ext_ack *extack); + int (*measured_freq_get)(const struct dpll_pin *pin, void *pin_priv, + const struct dpll_device *dpll, + void *dpll_priv, u64 *measured_freq, + struct netlink_ext_ack *extack); int (*esync_set)(const struct dpll_pin *pin, void *pin_priv, const struct dpll_device *dpll, void *dpll_priv, u64 freq, struct netlink_ext_ack *extack); -- cgit v1.2.3 From 14a51045e10d3087b8374deef02a9d3a694132d6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 21 Jan 2026 18:17:12 -0500 Subject: get rid of busy-waiting in shrink_dcache_tree() If shrink_dcache_tree() runs into a potential victim that is already dying, it must wait for that dentry to go away. To avoid busy-waiting we need some object to wait on and a way for dentry_unlist() to see that we need to be notified. The obvious place for the object to wait on would be on our stack frame. We will store a pointer to that object (struct completion_list) in victim dentry; if there's more than one thread wanting to wait for the same dentry to finish dying, we'll have their instances linked into a list, with reference in dentry pointing to the head of that list. * new object - struct completion_list. A pair of struct completion and pointer to the next instance. That's what shrink_dcache_tree() will wait on if needed. * add a new member (->waiters, opaque pointer to struct completion_list) to struct dentry. It is defined for negative live dentries that are not in-lookup ones and it will remain NULL for almost all of them. It does not conflict with ->d_rcu (defined for killed dentries), ->d_alias (defined for positive dentries, all live) or ->d_in_lookup_hash (defined for in-lookup dentries, all live negative). That allows to colocate all four members. * make sure that all places where dentry enters the state where ->waiters is defined (live, negative, not-in-lookup) initialize ->waiters to NULL. * if select_collect2() runs into a dentry that is already dying, have its caller insert a local instance of struct completion_list into the head of the list hanging off dentry->waiters and wait for completion. * if dentry_unlist() sees non-NULL ->waiters, have it carefully walk through the completion_list instances in that list, calling complete() for each. For now struct completion_list is local to fs/dcache.c; it's obviously dentry-agnostic, and it can be trivially lifted into linux/completion.h if somebody finds a reason to do so... Signed-off-by: Al Viro --- include/linux/dcache.h | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index f939d2ed10a3..19098253f2dd 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -88,6 +88,7 @@ union shortname_store { #define d_lock d_lockref.lock #define d_iname d_shortname.string +struct completion_list; struct dentry { /* RCU lookup touched fields */ @@ -122,12 +123,23 @@ struct dentry { struct hlist_node d_sib; /* child of parent list */ struct hlist_head d_children; /* our children */ /* - * d_alias and d_rcu can share memory + * the following members can share memory - their uses are + * mutually exclusive. */ union { - struct hlist_node d_alias; /* inode alias list */ - struct hlist_bl_node d_in_lookup_hash; /* only for in-lookup ones */ + /* positives: inode alias list */ + struct hlist_node d_alias; + /* in-lookup ones (all negative, live): hash chain */ + struct hlist_bl_node d_in_lookup_hash; + /* killed ones: (already negative) used to schedule freeing */ struct rcu_head d_rcu; + /* + * live non-in-lookup negatives: used if shrink_dcache_tree() + * races with eviction by another thread and needs to wait for + * this dentry to get killed . Remains NULL for almost all + * negative dentries. + */ + struct completion_list *waiters; }; }; -- cgit v1.2.3 From 5f352433ea39171e19fbb3a7e18d983510176854 Mon Sep 17 00:00:00 2001 From: Manikanta Maddireddy Date: Tue, 24 Mar 2026 13:38:54 +0530 Subject: PCI: endpoint: Add reserved region type for MSI-X Table and PBA Add PCI_EPC_BAR_RSVD_MSIX_TBL_RAM and PCI_EPC_BAR_RSVD_MSIX_PBA_RAM to enum pci_epc_bar_rsvd_region_type so that Endpoint controllers can describe hardware-owned MSI-X Table and PBA (Pending Bit Array) regions behind a BAR_RESERVED BAR. Signed-off-by: Manikanta Maddireddy Signed-off-by: Manivannan Sadhasivam Reviewed-by: Niklas Cassel Link: https://patch.msgid.link/20260324080857.916263-2-mmaddireddy@nvidia.com --- include/linux/pci-epc.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index 334c2b7578d0..1eca1264815b 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -211,6 +211,8 @@ enum pci_epc_bar_type { /** * enum pci_epc_bar_rsvd_region_type - type of a fixed subregion behind a BAR * @PCI_EPC_BAR_RSVD_DMA_CTRL_MMIO: Integrated DMA controller MMIO window + * @PCI_EPC_BAR_RSVD_MSIX_TBL_RAM: MSI-X table structure + * @PCI_EPC_BAR_RSVD_MSIX_PBA_RAM: MSI-X PBA structure * * BARs marked BAR_RESERVED are owned by the SoC/EPC hardware and must not be * reprogrammed by EPF drivers. Some of them still expose fixed subregions that @@ -218,6 +220,8 @@ enum pci_epc_bar_type { */ enum pci_epc_bar_rsvd_region_type { PCI_EPC_BAR_RSVD_DMA_CTRL_MMIO = 0, + PCI_EPC_BAR_RSVD_MSIX_TBL_RAM, + PCI_EPC_BAR_RSVD_MSIX_PBA_RAM, }; /** -- cgit v1.2.3 From 8b155f2e4a91f3507951e6ace4b413688ac28b96 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 3 Apr 2026 12:48:51 -0600 Subject: block: remove unused BVEC_ITER_ALL_INIT This macro no longer has any users, so remove it. Signed-off-by: Caleb Sander Mateos Link: https://patch.msgid.link/20260403184852.2140919-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/linux/bvec.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 06fb60471aaf..d36dd476feda 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -203,15 +203,6 @@ static inline void bvec_iter_advance_single(const struct bio_vec *bv, ((bvl = mp_bvec_iter_bvec((bio_vec), (iter))), 1); \ bvec_iter_advance_single((bio_vec), &(iter), (bvl).bv_len)) -/* for iterating one bio from start to end */ -#define BVEC_ITER_ALL_INIT (struct bvec_iter) \ -{ \ - .bi_sector = 0, \ - .bi_size = UINT_MAX, \ - .bi_idx = 0, \ - .bi_bvec_done = 0, \ -} - static inline struct bio_vec *bvec_init_iter_all(struct bvec_iter_all *iter_all) { iter_all->done = 0; -- cgit v1.2.3 From 6c8dfb0362732bf1e4829867a2a5239fedc592d0 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Tue, 24 Mar 2026 01:59:06 +0100 Subject: bus: fsl-mc: use generic driver_override infrastructure When a driver is probed through __driver_attach(), the bus' match() callback is called without the device lock held, thus accessing the driver_override field without a lock, which can cause a UAF. Fix this by using the driver-core driver_override infrastructure taking care of proper locking internally. Note that calling match() from __driver_attach() without the device lock held is intentional. [1] Tested-by: Ioana Ciornei Acked-by: Ioana Ciornei Acked-by: Christophe Leroy (CS GROUP) Link: https://lore.kernel.org/driver-core/DGRGTIRHA62X.3RY09D9SOK77P@kernel.org/ [1] Reported-by: Gui-Dong Han Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220789 Fixes: 1f86a00c1159 ("bus/fsl-mc: add support for 'driver_override' in the mc-bus") Link: https://patch.msgid.link/20260324005919.2408620-3-dakr@kernel.org Signed-off-by: Danilo Krummrich --- include/linux/fsl/mc.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h index 897d6211c163..1da63f2d7040 100644 --- a/include/linux/fsl/mc.h +++ b/include/linux/fsl/mc.h @@ -178,9 +178,6 @@ struct fsl_mc_obj_desc { * @regions: pointer to array of MMIO region entries * @irqs: pointer to array of pointers to interrupts allocated to this device * @resource: generic resource associated with this MC object device, if any. - * @driver_override: driver name to force a match; do not set directly, - * because core frees it; use driver_set_override() to - * set or clear it. * * Generic device object for MC object devices that are "attached" to a * MC bus. @@ -214,7 +211,6 @@ struct fsl_mc_device { struct fsl_mc_device_irq **irqs; struct fsl_mc_resource *resource; struct device_link *consumer_link; - const char *driver_override; }; #define to_fsl_mc_device(_dev) \ -- cgit v1.2.3 From adfc80dd0d7831335b5105fb3d8747094bf42878 Mon Sep 17 00:00:00 2001 From: Paul Walmsley Date: Sat, 4 Apr 2026 18:40:58 -0600 Subject: prctl: rename branch landing pad implementation functions to be more explicit Per Linus' comments about the unreadability of abbreviations such as "indir_br_lp", rename the three prctl() implementation functions to be more explicit. This involves renaming "indir_br_lp_status" in the function names to "branch_landing_pad_state". While here, add _prctl_ into the function names, following the speculation control prctl implementation functions. Link: https://lore.kernel.org/linux-riscv/CAHk-=whhSLGZAx3N5jJpb4GLFDqH_QvS07D+6BnkPWmCEzTAgw@mail.gmail.com/ Cc: Deepak Gupta Cc: Linus Torvalds Cc: Mark Brown Signed-off-by: Paul Walmsley --- include/linux/cpu.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 8239cd95a005..9b6b0d87fdb0 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -229,8 +229,8 @@ static inline bool cpu_attack_vector_mitigated(enum cpu_attack_vectors v) #define smt_mitigations SMT_MITIGATIONS_OFF #endif -int arch_get_indir_br_lp_status(struct task_struct *t, unsigned long __user *status); -int arch_set_indir_br_lp_status(struct task_struct *t, unsigned long status); -int arch_lock_indir_br_lp_status(struct task_struct *t, unsigned long status); +int arch_prctl_get_branch_landing_pad_state(struct task_struct *t, unsigned long __user *state); +int arch_prctl_set_branch_landing_pad_state(struct task_struct *t, unsigned long state); +int arch_prctl_lock_branch_landing_pad_state(struct task_struct *t); #endif /* _LINUX_CPU_H_ */ -- cgit v1.2.3 From 9ec1e972c3de3106140c18d2a1c7c74795d85a69 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:16 -0500 Subject: maple_tree: introduce maple_copy node and use it in mas_spanning_rebalance() Introduce an internal-memory only node type called maple_copy to facilitate internal copy operations. Use it in mas_spanning_rebalance() for just the leaf nodes. Initially, the maple_copy node is used to configure the source nodes and copy the data into the big_node. The maple_copy contains a list of source entries with start and end offsets. One of the maple_copy entries can be itself with an offset of 0 to 2, representing the data where the store partially overwrites entries, or fully overwrites the entry. The side effect is that the source nodes no longer have to worry about partially copying the existing offset if it is not fully overwritten. This is in preparation of removal of the maple big_node, but for the time being the data is copied to the big node to limit the change size. Link: https://lkml.kernel.org/r/20260130205935.2559335-12-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 7b8aad47121e..9bc7fa89bc2e 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -139,6 +139,7 @@ enum maple_type { maple_leaf_64, maple_range_64, maple_arange_64, + maple_copy, }; enum store_type { @@ -154,6 +155,30 @@ enum store_type { wr_slot_store, }; +struct maple_copy { + struct { + struct maple_node *node; + unsigned long max; + unsigned char start; + unsigned char end; + enum maple_type mt; + } src[4]; + /* Simulated node */ + void __rcu *slot[3]; + unsigned long min; + union { + unsigned long pivot[3]; + struct { + void *_pad[2]; + unsigned long max; + }; + }; + unsigned char end; + + /*Avoid passing these around */ + unsigned char s_count; +}; + /** * DOC: Maple tree flags * @@ -299,6 +324,7 @@ struct maple_node { }; struct maple_range_64 mr64; struct maple_arange_64 ma64; + struct maple_copy cp; }; }; -- cgit v1.2.3 From 6953038cab845f3720ec8d83915f4f083861e195 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:19 -0500 Subject: maple_tree: change initial big node setup in mas_wr_spanning_rebalance() Instead of copying the data into the big node and finding out that the data may need to be moved or appended to, calculate the data space up front (in the maple copy node) and set up another source for the copy. The additional copy source is tracked in the maple state sib (short for sibling), and is put into the maple write states for future operations after the data is in the big node. To facilitate the newly moved node, some initial setup of the maple subtree state are relocated after the potential shift caused by the new way of rebalancing against a sibling. Link: https://lkml.kernel.org/r/20260130205935.2559335-15-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 9bc7fa89bc2e..e99e16ac1c6d 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -177,6 +177,7 @@ struct maple_copy { /*Avoid passing these around */ unsigned char s_count; + unsigned char data; }; /** -- cgit v1.2.3 From 20b20162e1f3b7e60cf0e79116fb2f3bdef3dc5e Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:21 -0500 Subject: maple_tree: add gap support, slot and pivot sizes for maple copy Add plumbing work for using maple copy as a normal node for a source of copy operations. This is needed later. Link: https://lkml.kernel.org/r/20260130205935.2559335-17-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index e99e16ac1c6d..db6a02788902 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -165,6 +165,7 @@ struct maple_copy { } src[4]; /* Simulated node */ void __rcu *slot[3]; + unsigned long gap[3]; unsigned long min; union { unsigned long pivot[3]; -- cgit v1.2.3 From a9c6716e088a1d4badd4fa6797469506bb99ec8b Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:22 -0500 Subject: maple_tree: start using maple copy node for destination Stop using the maple subtree state and big node in favour of using three destinations in the maple copy node. That is, expand the way leaves were handled to all levels of the tree and use the maple copy node to track the new nodes. Extract out the sibling init into the data calculation since this is where the insufficient data can be detected. The remainder of the sibling code to shift the next iteration is moved to the spanning_ascend() function, since it is not always needed. Next introduce the dst_setup() function which will decide how many nodes are needed to contain the data at this level. Using the destination count, populate the copy node's dst array with the new nodes and set d_count to the correct value. Note that this can be tricky in the case of a leaf node with exactly enough room because of the rule against NULLs at the end of leaves. Once the destinations are ready, copy the data by altering the cp_data_write() function to copy from the sources to the destinations directly. This eliminates the use of the big node in this code path. On node completion, node_finalise() will zero out the remaining area and set the metadata, if necessary. spanning_ascend() is used to decide if the operation is complete. It may create a new root, converge into one destination, or continue upwards by ascending the left and right write maple states. One test case setup needed to be tweaked so that the targeted node was surrounded by full nodes. [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20260130205935.2559335-18-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index db6a02788902..0c464eade1d6 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -156,6 +156,17 @@ enum store_type { }; struct maple_copy { + /* + * min, max, and pivots are values + * start, end, split are indexes into arrays + * data is a size + */ + + struct { + struct maple_node *node; + unsigned long max; + enum maple_type mt; + } dst[3]; struct { struct maple_node *node; unsigned long max; @@ -178,7 +189,10 @@ struct maple_copy { /*Avoid passing these around */ unsigned char s_count; + unsigned char d_count; + unsigned char split; unsigned char data; + unsigned char height; }; /** -- cgit v1.2.3 From e4f4fc7aa8b720d934a0bfcea7f8aae4271d308f Mon Sep 17 00:00:00 2001 From: "JP Kobryn (Meta)" Date: Thu, 19 Feb 2026 15:58:46 -0800 Subject: mm: move pgscan, pgsteal, pgrefill to node stats MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are situations where reclaim kicks in on a system with free memory. One possible cause is a NUMA imbalance scenario where one or more nodes are under pressure. It would help if we could easily identify such nodes. Move the pgscan, pgsteal, and pgrefill counters from vm_event_item to node_stat_item to provide per-node reclaim visibility. With these counters as node stats, the values are now displayed in the per-node section of /proc/zoneinfo, which allows for quick identification of the affected nodes. /proc/vmstat continues to report the same counters, aggregated across all nodes. But the ordering of these items within the readout changes as they move from the vm events section to the node stats section. Memcg accounting of these counters is preserved. The relocated counters remain visible in memory.stat alongside the existing aggregate pgscan and pgsteal counters. However, this change affects how the global counters are accumulated. Previously, the global event count update was gated on !cgroup_reclaim(), excluding memcg-based reclaim from /proc/vmstat. Now that mod_lruvec_state() is being used to update the counters, the global counters will include all reclaim. This is consistent with how pgdemote counters are already tracked. Finally, the virtio_balloon driver is updated to use global_node_page_state() to fetch the counters, as they are no longer accessible through the vm_events array. Link: https://lkml.kernel.org/r/20260219235846.161910-1-jp.kobryn@linux.dev Signed-off-by: JP Kobryn Suggested-by: Johannes Weiner Acked-by: Michael S. Tsirkin Reviewed-by: Vlastimil Babka (SUSE) Acked-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Michal Hocko Cc: Alistair Popple Cc: Axel Rasmussen Cc: Byungchul Park Cc: David Hildenbrand Cc: Eugenio Pérez Cc: Gregory Price Cc: "Huang, Ying" Cc: Jason Wang Cc: Joshua Hahn Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mathew Brost Cc: Mike Rapoport Cc: Muchun Song Cc: Qi Zheng Cc: Rakie Kim Cc: Roman Gushchin Cc: Suren Baghdasaryan Cc: Wei Xu Cc: Xuan Zhuo Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 13 +++++++++++++ include/linux/vm_event_item.h | 13 ------------- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3e51190a55e4..546bca95ca40 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -255,6 +255,19 @@ enum node_stat_item { PGDEMOTE_DIRECT, PGDEMOTE_KHUGEPAGED, PGDEMOTE_PROACTIVE, + PGSTEAL_KSWAPD, + PGSTEAL_DIRECT, + PGSTEAL_KHUGEPAGED, + PGSTEAL_PROACTIVE, + PGSTEAL_ANON, + PGSTEAL_FILE, + PGSCAN_KSWAPD, + PGSCAN_DIRECT, + PGSCAN_KHUGEPAGED, + PGSCAN_PROACTIVE, + PGSCAN_ANON, + PGSCAN_FILE, + PGREFILL, #ifdef CONFIG_HUGETLB_PAGE NR_HUGETLB, #endif diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 22a139f82d75..03fe95f5a020 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -38,21 +38,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGFREE, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE, PGFAULT, PGMAJFAULT, PGLAZYFREED, - PGREFILL, PGREUSE, - PGSTEAL_KSWAPD, - PGSTEAL_DIRECT, - PGSTEAL_KHUGEPAGED, - PGSTEAL_PROACTIVE, - PGSCAN_KSWAPD, - PGSCAN_DIRECT, - PGSCAN_KHUGEPAGED, - PGSCAN_PROACTIVE, PGSCAN_DIRECT_THROTTLE, - PGSCAN_ANON, - PGSCAN_FILE, - PGSTEAL_ANON, - PGSTEAL_FILE, #ifdef CONFIG_NUMA PGSCAN_ZONE_RECLAIM_SUCCESS, PGSCAN_ZONE_RECLAIM_FAILED, -- cgit v1.2.3 From 0d6af9bcf383bcdf601e670bb605861b01e318e7 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 18 Feb 2026 04:06:34 +0800 Subject: mm, swap: use the swap table to track the swap count Now all the infrastructures are ready, switch to using the swap table only. This is unfortunately a large patch because the whole old counting mechanism, especially SWP_CONTINUED, has to be gone and switch to the new mechanism together, with no intermediate steps available. The swap table is capable of holding up to SWP_TB_COUNT_MAX - 1 counts in the higher bits of each table entry, so using that, the swap_map can be completely dropped. swap_map also had a limit of SWAP_CONT_MAX. Any value beyond that limit will require a COUNT_CONTINUED page. COUNT_CONTINUED is a bit complex to maintain, so for the swap table, a simpler approach is used: when the count goes beyond SWP_TB_COUNT_MAX - 1, the cluster will have an extend_table allocated, which is a swap cluster-sized array of unsigned int. The counting is basically offloaded there until the count drops below SWP_TB_COUNT_MAX again. Both the swap table and the extend table are cluster-based, so they exhibit good performance and sparsity. To make the switch from swap_map to swap table clean, this commit cleans up and introduces a new set of functions based on the swap table design, for manipulating swap counts: - __swap_cluster_dup_entry, __swap_cluster_put_entry, __swap_cluster_alloc_entry, __swap_cluster_free_entry: Increase/decrease the count of a swap slot, or alloc / free a swap slot. This is the internal routine that does the counting work based on the swap table and handles all the complexities. The caller will need to lock the cluster before calling them. All swap count-related update operations are wrapped by these four helpers. - swap_dup_entries_cluster, swap_put_entries_cluster: Increase/decrease the swap count of one or a set of swap slots in the same cluster range. These two helpers serve as the common routines for folio_dup_swap & swap_dup_entry_direct, or folio_put_swap & swap_put_entries_direct. And use these helpers to replace all existing callers. This helps to simplify the count tracking by a lot, and the swap_map is gone. [ryncsn@gmail.com: fix build] Link: https://lkml.kernel.org/r/aZWuLZi-vYi3vAWe@KASONG-MC4 Link: https://lkml.kernel.org/r/20260218-swap-table-p3-v3-9-f4e34be021a7@tencent.com Signed-off-by: Kairui Song Suggested-by: Chris Li Acked-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kairui Song Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Nhat Pham Signed-off-by: Andrew Morton --- include/linux/swap.h | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 62fc7499b408..0effe3cc50f5 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -208,7 +208,6 @@ enum { SWP_DISCARDABLE = (1 << 2), /* blkdev support discard */ SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */ SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ - SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */ SWP_BLKDEV = (1 << 6), /* its a block device */ SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */ SWP_FS_OPS = (1 << 8), /* swapfile operations go through fs */ @@ -223,16 +222,6 @@ enum { #define SWAP_CLUSTER_MAX_SKIPPED (SWAP_CLUSTER_MAX << 10) #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX -/* Bit flag in swap_map */ -#define COUNT_CONTINUED 0x80 /* Flag swap_map continuation for full count */ - -/* Special value in first swap_map */ -#define SWAP_MAP_MAX 0x3e /* Max count */ -#define SWAP_MAP_BAD 0x3f /* Note page is bad */ - -/* Special value in each swap_map continuation */ -#define SWAP_CONT_MAX 0x7f /* Max count */ - /* * The first page in the swap file is the swap header, which is always marked * bad to prevent it from being allocated as an entry. This also prevents the @@ -264,8 +253,7 @@ struct swap_info_struct { signed short prio; /* swap priority of this type */ struct plist_node list; /* entry in swap_active_head */ signed char type; /* strange name for an index */ - unsigned int max; /* extent of the swap_map */ - unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + unsigned int max; /* size of this swap device */ unsigned long *zeromap; /* kvmalloc'ed bitmap to track zero pages */ struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ struct list_head free_clusters; /* free clusters list */ @@ -284,18 +272,14 @@ struct swap_info_struct { struct completion comp; /* seldom referenced */ spinlock_t lock; /* * protect map scan related fields like - * swap_map, inuse_pages and all cluster - * lists. other fields are only changed + * inuse_pages and all cluster lists. + * Other fields are only changed * at swapon/swapoff, so are protected * by swap_lock. changing flags need * hold this lock and swap_lock. If * both locks need hold, hold swap_lock * first. */ - spinlock_t cont_lock; /* - * protect swap count continuation page - * list. - */ struct work_struct discard_work; /* discard worker */ struct work_struct reclaim_work; /* reclaim worker */ struct list_head discard_clusters; /* discard clusters list */ @@ -451,7 +435,6 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -extern int add_swap_count_continuation(swp_entry_t, gfp_t); int swap_type_of(dev_t device, sector_t offset); int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); @@ -517,11 +500,6 @@ static inline void free_swap_cache(struct folio *folio) { } -static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) -{ - return 0; -} - static inline int swap_dup_entry_direct(swp_entry_t ent) { return 0; -- cgit v1.2.3 From 1beb9b7223d2a1f1872f76a3d29b0a4a3cee4171 Mon Sep 17 00:00:00 2001 From: "Pratyush Yadav (Google)" Date: Mon, 16 Feb 2026 19:59:32 +0100 Subject: memfd: export memfd_{add,get}_seals() Patch series "mm: memfd_luo: preserve file seals", v2. This series adds support for preserving file seals when preserving a memfd using LUO. Patch 1 exports some memfd seal manipulation functions and patch 2 adds support for preserving them. Since it makes changes to the serialized data structure for memfd, it also bumps the version number. This patch (of 2): Support for preserving file seals will be added to memfd preservation using the Live Update Orchestrator (LUO). Export memfd_{add,get}_seals)() so memfd_luo can use them to manipulate the seals. Link: https://lkml.kernel.org/r/20260216185946.1215770-1-pratyush@kernel.org Link: https://lkml.kernel.org/r/20260216185946.1215770-2-pratyush@kernel.org Signed-off-by: Pratyush Yadav (Google) Acked-by: Mike Rapoport (Microsoft) Tested-by: Samiullah Khawaja Cc: Alexander Graf Cc: Baolin Wang Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- include/linux/memfd.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memfd.h b/include/linux/memfd.h index c328a7b356d0..b4fda09dab9f 100644 --- a/include/linux/memfd.h +++ b/include/linux/memfd.h @@ -18,6 +18,8 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx); */ int memfd_check_seals_mmap(struct file *file, vm_flags_t *vm_flags_ptr); struct file *memfd_alloc_file(const char *name, unsigned int flags); +int memfd_get_seals(struct file *file); +int memfd_add_seals(struct file *file, unsigned int seals); #else static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a) { @@ -37,6 +39,16 @@ static inline struct file *memfd_alloc_file(const char *name, unsigned int flags { return ERR_PTR(-EINVAL); } + +static inline int memfd_get_seals(struct file *file) +{ + return -EINVAL; +} + +static inline int memfd_add_seals(struct file *file, unsigned int seals) +{ + return -EINVAL; +} #endif #endif /* __LINUX_MEMFD_H */ -- cgit v1.2.3 From 8a552d68a86ef0e6fb2ff4af13031a5e82c0f1d0 Mon Sep 17 00:00:00 2001 From: "Pratyush Yadav (Google)" Date: Mon, 16 Feb 2026 19:59:33 +0100 Subject: mm: memfd_luo: preserve file seals File seals are used on memfd for making shared memory communication with untrusted peers safer and simpler. Seals provide a guarantee that certain operations won't be allowed on the file such as writes or truncations. Maintaining these guarantees across a live update will help keeping such use cases secure. These guarantees will also be needed for IOMMUFD preservation with LUO. Normally when IOMMUFD maps a memfd, it pins all its pages to make sure any truncation operations on the memfd don't lead to IOMMUFD using freed memory. This doesn't work with LUO since the preserved memfd might have completely different pages after a live update, and mapping them back to the IOMMUFD will cause all sorts of problems. Using and preserving the seals allows IOMMUFD preservation logic to trust the memfd. Since the uABI defines seals as an int, preserve them by introducing a new u32 field. There are currently only 6 possible seals, so the extra bits are unused and provide room for future expansion. Since the seals are uABI, it is safe to use them directly in the ABI. While at it, also add a u32 flags field. It makes sure the struct is nicely aligned, and can be used later to support things like MFD_CLOEXEC. Since the serialization structure is changed, bump the version number to "memfd-v2". It is important to note that the memfd-v2 version only supports seals that existed when this version was defined. This set is defined by MEMFD_LUO_ALL_SEALS. Any new seal might bring a completely different semantic with it and the parser for memfd-v2 cannot be expected to deal with that. If there are any future seals added, they will need another version bump. Link: https://lkml.kernel.org/r/20260216185946.1215770-3-pratyush@kernel.org Signed-off-by: Pratyush Yadav (Google) Tested-by: Samiullah Khawaja Cc: Alexander Graf Cc: Baolin Wang Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: Mike Rapoport (Microsoft) Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- include/linux/kho/abi/memfd.h | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kho/abi/memfd.h b/include/linux/kho/abi/memfd.h index 68cb6303b846..08b10fea2afc 100644 --- a/include/linux/kho/abi/memfd.h +++ b/include/linux/kho/abi/memfd.h @@ -56,10 +56,24 @@ struct memfd_luo_folio_ser { u64 index; } __packed; +/* + * The set of seals this version supports preserving. If support for any new + * seals is needed, add it here and bump version. + */ +#define MEMFD_LUO_ALL_SEALS (F_SEAL_SEAL | \ + F_SEAL_SHRINK | \ + F_SEAL_GROW | \ + F_SEAL_WRITE | \ + F_SEAL_FUTURE_WRITE | \ + F_SEAL_EXEC) + /** * struct memfd_luo_ser - Main serialization structure for a memfd. * @pos: The file's current position (f_pos). * @size: The total size of the file in bytes (i_size). + * @seals: The seals present on the memfd. The seals are uABI so it is safe + * to directly use them in the ABI. + * @flags: Flags for the file. Unused flag bits must be set to 0. * @nr_folios: Number of folios in the folios array. * @folios: KHO vmalloc descriptor pointing to the array of * struct memfd_luo_folio_ser. @@ -67,11 +81,13 @@ struct memfd_luo_folio_ser { struct memfd_luo_ser { u64 pos; u64 size; + u32 seals; + u32 flags; u64 nr_folios; struct kho_vmalloc folios; } __packed; /* The compatibility string for memfd file handler */ -#define MEMFD_LUO_FH_COMPATIBLE "memfd-v1" +#define MEMFD_LUO_FH_COMPATIBLE "memfd-v2" #endif /* _LINUX_KHO_ABI_MEMFD_H */ -- cgit v1.2.3 From c9cb94c6b85a2854ae03c874331b0880ee735441 Mon Sep 17 00:00:00 2001 From: Asier Gutierrez Date: Fri, 13 Feb 2026 14:50:32 +0000 Subject: mm/damon: remove unused target param of get_scheme_score() damon_target is not used by get_scheme_score operations, nor with virtual neither with physical addresses. Link: https://lkml.kernel.org/r/20260213145032.1740407-1-gutierrez.asier@huawei-partners.com Signed-off-by: Asier Gutierrez Reviewed-by: SeongJae Park Cc: Kefeng Wang Cc: Quanmin Yan Cc: ze zuo Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index be3d198043ff..60e6da3012fa 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -647,8 +647,7 @@ struct damon_operations { void (*prepare_access_checks)(struct damon_ctx *context); unsigned int (*check_accesses)(struct damon_ctx *context); int (*get_scheme_score)(struct damon_ctx *context, - struct damon_target *t, struct damon_region *r, - struct damos *scheme); + struct damon_region *r, struct damos *scheme); unsigned long (*apply_scheme)(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme, unsigned long *sz_filter_passed); -- cgit v1.2.3 From 5ad41a38c36474ff59545cb514801d90719555de Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Fri, 13 Feb 2026 15:18:22 +0800 Subject: mm: zswap: add per-memcg stat for incompressible pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: zswap: add per-memcg stat for incompressible pages", v3. In containerized environments, knowing which cgroup is contributing incompressible pages to zswap is essential for effective resource management. This series adds a new per-memcg stat 'zswap_incomp' to track incompressible pages, along with a selftest. This patch (of 2): The global zswap_stored_incompressible_pages counter was added in commit dca4437a5861 ("mm/zswap: store Acked-by: Nhat Pham Acked-by: Shakeel Butt Reviewed-by: Yosry Ahmed Reviewed-by: SeongJae Park Cc: Johannes Weiner Cc: Chengming Zhou Cc: Jonathan Corbet Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Shuah Khan Cc: Tejun Heo Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 70b685a85bf4..5695776f32c8 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -39,6 +39,7 @@ enum memcg_stat_item { MEMCG_KMEM, MEMCG_ZSWAP_B, MEMCG_ZSWAPPED, + MEMCG_ZSWAP_INCOMP, MEMCG_NR_STAT, }; -- cgit v1.2.3 From c5c48345135ff04e039377020df23294d59aa59a Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Wed, 11 Feb 2026 16:54:47 -0500 Subject: mm: name the anonymous MMOP enum as enum mmop Give the MMOP enum (MMOP_OFFLINE, MMOP_ONLINE, etc) a proper type name so the compiler can help catch invalid values being assigned to variables of this type. Leave the existing functions returning int alone to allow for value-or-error pattern to remain unchanged without churn. mmop_default_online_type is left as int because it uses the -1 sentinal value to signal it hasn't been initialized yet. Keep the uint8_t buffer in offline_and_remove_memory() as-is for space efficiency, with an explicit cast when we consume the value. Move the enum definition before the CONFIG_MEMORY_HOTPLUG guard so it is unconditionally available for struct memory_block in memory.h. No functional change. Link: https://lore.kernel.org/linux-mm/3424eba7-523b-4351-abd0-3a888a3e5e61@kernel.org/ Link: https://lkml.kernel.org/r/20260211215447.2194189-1-gourry@gourry.net Signed-off-by: Gregory Price Suggested-by: Jonathan Cameron Suggested-by: "David Hildenbrand (arm)" Reviewed-by: Ben Cheatham Acked-by: David Hildenbrand (Arm) Reviewed-by: Dave Jiang Reviewed-by: Davidlohr Bueso Reviewed-by: Jonathan Cameron Cc: Danilo Krummrich Cc: Greg Kroah-Hartman Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/memory.h | 3 ++- include/linux/memory_hotplug.h | 16 ++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index faeaa921e55b..5bb5599c6b2b 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -19,6 +19,7 @@ #include #include #include +#include #define MIN_MEMORY_BLOCK_SIZE (1UL << SECTION_SIZE_BITS) @@ -77,7 +78,7 @@ enum memory_block_state { struct memory_block { unsigned long start_section_nr; enum memory_block_state state; /* serialized by the dev->lock */ - int online_type; /* for passing data to online routine */ + enum mmop online_type; /* for passing data to online routine */ int nid; /* NID for this memory block */ /* * The single zone of this memory block if all PFNs of this memory block diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index f2f16cdd73ee..e77ef3d7ff73 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -16,11 +16,8 @@ struct resource; struct vmem_altmap; struct dev_pagemap; -#ifdef CONFIG_MEMORY_HOTPLUG -struct page *pfn_to_online_page(unsigned long pfn); - /* Types for control the zone type of onlined and offlined memory */ -enum { +enum mmop { /* Offline the memory. */ MMOP_OFFLINE = 0, /* Online the memory. Zone depends, see default_zone_for_pfn(). */ @@ -31,6 +28,9 @@ enum { MMOP_ONLINE_MOVABLE, }; +#ifdef CONFIG_MEMORY_HOTPLUG +struct page *pfn_to_online_page(unsigned long pfn); + /* Flags for add_memory() and friends to specify memory hotplug details. */ typedef int __bitwise mhp_t; @@ -286,8 +286,8 @@ static inline void __remove_memory(u64 start, u64 size) {} #ifdef CONFIG_MEMORY_HOTPLUG /* Default online_type (MMOP_*) when new memory blocks are added. */ -extern int mhp_get_default_online_type(void); -extern void mhp_set_default_online_type(int online_type); +extern enum mmop mhp_get_default_online_type(void); +extern void mhp_set_default_online_type(enum mmop online_type); extern void __ref free_area_init_core_hotplug(struct pglist_data *pgdat); extern int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); extern int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); @@ -310,8 +310,8 @@ extern void sparse_remove_section(unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); -extern struct zone *zone_for_pfn_range(int online_type, int nid, - struct memory_group *group, unsigned long start_pfn, +extern struct zone *zone_for_pfn_range(enum mmop online_type, + int nid, struct memory_group *group, unsigned long start_pfn, unsigned long nr_pages); extern int arch_create_linear_mapping(int nid, u64 start, u64 size, struct mhp_params *params); -- cgit v1.2.3 From 652d12bc74a075f345f228f8945e05517a38874d Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 11 Feb 2026 12:31:38 +0200 Subject: mm: don't special case !MMU for is_zero_pfn() and my_zero_pfn() Patch series "arch, mm: consolidate empty_zero_page", v3. These patches cleanup handling of ZERO_PAGE() and zero_pfn. This patch (of 4): nommu architectures have empty_zero_page and define ZERO_PAGE() and although they don't really use it to populate page tables, there is no reason to hardwire !MMU implementation of is_zero_pfn() and my_zero_pfn() to 0. Drop #ifdef CONFIG_MMU around implementations of is_zero_pfn() and my_zero_pfn() and remove !MMU version. While on it, make zero_pfn __ro_after_init. Link: https://lkml.kernel.org/r/20260211103141.3215197-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20260211103141.3215197-2-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: David Hildenbrand (Arm) Acked-by: Liam R. Howlett Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David S. Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Cc: Christophe Leroy (CS GROUP) Cc: Dave Hansen Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index a50df42a893f..5e772599d9a5 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1917,7 +1917,6 @@ static inline void pfnmap_setup_cachemode_pfn(unsigned long pfn, pgprot_t *prot) pfnmap_setup_cachemode(pfn, PAGE_SIZE, prot); } -#ifdef CONFIG_MMU #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) { @@ -1940,18 +1939,7 @@ static inline unsigned long my_zero_pfn(unsigned long addr) extern unsigned long zero_pfn; return zero_pfn; } -#endif -#else -static inline int is_zero_pfn(unsigned long pfn) -{ - return 0; -} - -static inline unsigned long my_zero_pfn(unsigned long addr) -{ - return 0; -} -#endif /* CONFIG_MMU */ +#endif /* __HAVE_COLOR_ZERO_PAGE */ #ifdef CONFIG_MMU -- cgit v1.2.3 From 9a1d0c738b45ea8da4e6897099c708e89f43daad Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 11 Feb 2026 12:31:39 +0200 Subject: mm: rename my_zero_pfn() to zero_pfn() my_zero_pfn() is a silly name. Rename zero_pfn variable to zero_page_pfn and my_zero_pfn() function to zero_pfn(). While on it, move extern declarations of zero_page_pfn outside the functions that use it and add a comment about what ZERO_PAGE is. Link: https://lkml.kernel.org/r/20260211103141.3215197-3-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: David Hildenbrand (Arm) Acked-by: Vlastimil Babka Reviewed-by: Liam R. Howlett Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Christophe Leroy (CS GROUP) Cc: Dave Hansen Cc: David S. Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5e772599d9a5..c3a56f6b1ea5 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1917,27 +1917,39 @@ static inline void pfnmap_setup_cachemode_pfn(unsigned long pfn, pgprot_t *prot) pfnmap_setup_cachemode(pfn, PAGE_SIZE, prot); } +/* + * ZERO_PAGE() is global shared page(s) that is always zero. It is used for + * zero-mapped memory areas, CoW etc. + * + * On architectures that __HAVE_COLOR_ZERO_PAGE there are several such pages + * for different ranges in the virtual address space. + * + * zero_page_pfn identifies the first (or the only) pfn for these pages. + */ #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) { - extern unsigned long zero_pfn; - unsigned long offset_from_zero_pfn = pfn - zero_pfn; + extern unsigned long zero_page_pfn; + unsigned long offset_from_zero_pfn = pfn - zero_page_pfn; + return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); } -#define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) +#define zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) #else static inline int is_zero_pfn(unsigned long pfn) { - extern unsigned long zero_pfn; - return pfn == zero_pfn; + extern unsigned long zero_page_pfn; + + return pfn == zero_page_pfn; } -static inline unsigned long my_zero_pfn(unsigned long addr) +static inline unsigned long zero_pfn(unsigned long addr) { - extern unsigned long zero_pfn; - return zero_pfn; + extern unsigned long zero_page_pfn; + + return zero_page_pfn; } #endif /* __HAVE_COLOR_ZERO_PAGE */ -- cgit v1.2.3 From 6215d9f4470fbb48245ffdfade821685e2728c65 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 11 Feb 2026 12:31:40 +0200 Subject: arch, mm: consolidate empty_zero_page Reduce 22 declarations of empty_zero_page to 3 and 23 declarations of ZERO_PAGE() to 4. Every architecture defines empty_zero_page that way or another, but for the most of them it is always a page aligned page in BSS and most definitions of ZERO_PAGE do virt_to_page(empty_zero_page). Move Linus vetted x86 definition of empty_zero_page and ZERO_PAGE() to the core MM and drop these definitions in architectures that do not implement colored zero page (MIPS and s390). ZERO_PAGE() remains a macro because turning it to a wrapper for a static inline causes severe pain in header dependencies. For the most part the change is mechanical, with these being noteworthy: * alpha: aliased empty_zero_page with ZERO_PGE that was also used for boot parameters. Switching to a generic empty_zero_page removes the aliasing and keeps ZERO_PGE for boot parameters only * arm64: uses __pa_symbol() in ZERO_PAGE() so that definition of ZERO_PAGE() is kept intact. * m68k/parisc/um: allocated empty_zero_page from memblock, although they do not support zero page coloring and having it in BSS will work fine. * sparc64 can have empty_zero_page in BSS rather allocate it, but it can't use virt_to_page() for BSS. Keep it's definition of ZERO_PAGE() but instead of allocating it, make mem_map_zero point to empty_zero_page. * sh: used empty_zero_page for boot parameters at the very early boot. Rename the parameters page to boot_params_page and let sh use the generic empty_zero_page. * hexagon: had an amusing comment about empty_zero_page /* A handy thing to have if one has the RAM. Declared in head.S */ that unfortunately had to go :) Link: https://lkml.kernel.org/r/20260211103141.3215197-4-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Helge Deller [parisc] Tested-by: Helge Deller [parisc] Reviewed-by: Christophe Leroy (CS GROUP) Acked-by: Dave Hansen Acked-by: Catalin Marinas Acked-by: Magnus Lindholm [alpha] Acked-by: Dinh Nguyen [nios2] Acked-by: Andreas Larsson [sparc] Acked-by: David Hildenbrand (Arm) Acked-by: Liam R. Howlett Cc: "Borislav Petkov (AMD)" Cc: David S. Miller Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index c3a56f6b1ea5..2a05c3885f85 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1925,6 +1925,9 @@ static inline void pfnmap_setup_cachemode_pfn(unsigned long pfn, pgprot_t *prot) * for different ranges in the virtual address space. * * zero_page_pfn identifies the first (or the only) pfn for these pages. + * + * For architectures that don't __HAVE_COLOR_ZERO_PAGE the zero page lives in + * empty_zero_page in BSS. */ #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) @@ -1951,6 +1954,13 @@ static inline unsigned long zero_pfn(unsigned long addr) return zero_page_pfn; } + +extern uint8_t empty_zero_page[PAGE_SIZE]; + +#ifndef ZERO_PAGE +#define ZERO_PAGE(vaddr) ((void)(vaddr),virt_to_page(empty_zero_page)) +#endif + #endif /* __HAVE_COLOR_ZERO_PAGE */ #ifdef CONFIG_MMU -- cgit v1.2.3 From 26513781d1b3a1e8b4b576ed62751d604a69b374 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 11 Feb 2026 12:31:41 +0200 Subject: mm: cache struct page for empty_zero_page and return it from ZERO_PAGE() For most architectures every invocation of ZERO_PAGE() does virt_to_page(empty_zero_page). But empty_zero_page is in BSS and it is enough to get its struct page once at initialization time and then use it whenever a zero page should be accessed. Add yet another __zero_page variable that will be initialized as virt_to_page(empty_zero_page) for most architectures in a weak arch_setup_zero_pages() function. For architectures that use colored zero pages (MIPS and s390) rename their setup_zero_pages() to arch_setup_zero_pages() and make it global rather than static. For architectures that cannot use virt_to_page() for BSS (arm64 and sparc64) add override of arch_setup_zero_pages(). Link: https://lkml.kernel.org/r/20260211103141.3215197-5-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Catalin Marinas Acked-by: David Hildenbrand (Arm) Acked-by: Liam R. Howlett Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Christophe Leroy (CS GROUP) Cc: Dave Hansen Cc: David S. Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 2a05c3885f85..776993d4567b 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1929,6 +1929,8 @@ static inline void pfnmap_setup_cachemode_pfn(unsigned long pfn, pgprot_t *prot) * For architectures that don't __HAVE_COLOR_ZERO_PAGE the zero page lives in * empty_zero_page in BSS. */ +void arch_setup_zero_pages(void); + #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) { @@ -1956,10 +1958,13 @@ static inline unsigned long zero_pfn(unsigned long addr) } extern uint8_t empty_zero_page[PAGE_SIZE]; +extern struct page *__zero_page; -#ifndef ZERO_PAGE -#define ZERO_PAGE(vaddr) ((void)(vaddr),virt_to_page(empty_zero_page)) -#endif +static inline struct page *_zero_page(unsigned long addr) +{ + return __zero_page; +} +#define ZERO_PAGE(vaddr) _zero_page(vaddr) #endif /* __HAVE_COLOR_ZERO_PAGE */ -- cgit v1.2.3 From 6cc153f90b7cf07db2b49469dfd79141b145036a Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Sat, 21 Feb 2026 17:39:17 +0800 Subject: mm: add folio_test_lazyfree helper Add folio_test_lazyfree() function to identify lazy-free folios to improve code readability. Link: https://lkml.kernel.org/r/20260221093918.1456187-4-vernon2gm@gmail.com Signed-off-by: Vernon Yang Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Lance Yang Reviewed-by: Dev Jain Reviewed-by: Barry Song Cc: Baolin Wang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Nico Pache Cc: Ryan Roberts Cc: Steven Rostedt Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index f7a0e4af0c73..415e9f2ef616 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -724,6 +724,11 @@ static __always_inline bool folio_test_anon(const struct folio *folio) return ((unsigned long)folio->mapping & FOLIO_MAPPING_ANON) != 0; } +static __always_inline bool folio_test_lazyfree(const struct folio *folio) +{ + return folio_test_anon(folio) && !folio_test_swapbacked(folio); +} + static __always_inline bool PageAnonNotKsm(const struct page *page) { unsigned long flags = (unsigned long)page_folio(page)->mapping; -- cgit v1.2.3 From 3f2ad90060f65d6f66414b8a67c569154bafec7b Mon Sep 17 00:00:00 2001 From: Jason Miu Date: Thu, 5 Feb 2026 18:14:27 -0800 Subject: kho: adopt radix tree for preserved memory tracking Patch series "Make KHO Stateless", v9. This series transitions KHO from an xarray-based metadata tracking system with serialization to a radix tree data structure that can be passed directly to the next kernel. The key motivations for this change are to: - Eliminate the need for data serialization before kexec. - Remove the KHO finalize state. - Pass preservation metadata more directly to the next kernel via the FDT. The new approach uses a radix tree to mark preserved pages. A page's physical address and its order are encoded into a single value. The tree is composed of multiple levels of page-sized tables, with leaf nodes being bitmaps where each set bit represents a preserved page. The physical address of the radix tree's root is passed in the FDT, allowing the next kernel to reconstruct the preserved memory map. This series is broken down into the following patches: 1. kho: Adopt radix tree for preserved memory tracking: Replaces the xarray-based tracker with the new radix tree implementation and increments the ABI version. 2. kho: Remove finalize state and clients: Removes the now-obsolete kho_finalize() function and its usage from client code and debugfs. This patch (of 2): Introduce a radix tree implementation for tracking preserved memory pages and switch the KHO memory tracking mechanism to use it. This lays the groundwork for a stateless KHO implementation that eliminates the need for serialization and the associated "finalize" state. This patch introduces the core radix tree data structures and constants to the KHO ABI. It adds the radix tree node and leaf structures, along with documentation for the radix tree key encoding scheme that combines a page's physical address and order. To support broader use by other kernel subsystems, such as hugetlb preservation, the core radix tree manipulation functions are exported as a public API. The xarray-based memory tracking is replaced with this new radix tree implementation. The core KHO preservation and unpreservation functions are wired up to use the radix tree helpers. On boot, the second kernel restores the preserved memory map by walking the radix tree whose root physical address is passed via the FDT. The ABI `compatible` version is bumped to "kho-v2" to reflect the structural changes in the preserved memory map and sub-FDT property names. This includes renaming "fdt" to "preserved-data" to better reflect that preserved state may use formats other than FDT. [ran.xiaokai@zte.com.cn: fix child node parsing for debugfs in/sub_fdts] Link: https://lkml.kernel.org/r/20260309033530.244508-1-ranxiaokai627@163.com Link: https://lkml.kernel.org/r/20260206021428.3386442-1-jasonmiu@google.com Link: https://lkml.kernel.org/r/20260206021428.3386442-2-jasonmiu@google.com Signed-off-by: Jason Miu Signed-off-by: Ran Xiaokai Reviewed-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Baoquan He Cc: Changyuan Lyu Cc: David Matlack Cc: David Rientjes Cc: Jason Gunthorpe Cc: Pratyush Yadav Cc: Ran Xiaokai Signed-off-by: Andrew Morton --- include/linux/kho/abi/kexec_handover.h | 144 +++++++++++++++++++++++++++++---- include/linux/kho_radix_tree.h | 70 ++++++++++++++++ 2 files changed, 199 insertions(+), 15 deletions(-) create mode 100644 include/linux/kho_radix_tree.h (limited to 'include/linux') diff --git a/include/linux/kho/abi/kexec_handover.h b/include/linux/kho/abi/kexec_handover.h index 2201a0d2c159..6b7d8ef550f9 100644 --- a/include/linux/kho/abi/kexec_handover.h +++ b/include/linux/kho/abi/kexec_handover.h @@ -10,8 +10,13 @@ #ifndef _LINUX_KHO_ABI_KEXEC_HANDOVER_H #define _LINUX_KHO_ABI_KEXEC_HANDOVER_H +#include +#include +#include #include +#include + /** * DOC: Kexec Handover ABI * @@ -29,32 +34,32 @@ * compatibility is only guaranteed for kernels supporting the same ABI version. * * FDT Structure Overview: - * The FDT serves as a central registry for physical - * addresses of preserved data structures and sub-FDTs. The first kernel - * populates this FDT with references to memory regions and other FDTs that - * need to persist across the kexec transition. The subsequent kernel then - * parses this FDT to locate and restore the preserved data.:: + * The FDT serves as a central registry for physical addresses of preserved + * data structures. The first kernel populates this FDT with references to + * memory regions and other metadata that need to persist across the kexec + * transition. The subsequent kernel then parses this FDT to locate and + * restore the preserved data.:: * * / { - * compatible = "kho-v1"; + * compatible = "kho-v2"; * * preserved-memory-map = <0x...>; * * { - * fdt = <0x...>; + * preserved-data = <0x...>; * }; * * { - * fdt = <0x...>; + * preserved-data = <0x...>; * }; * ... ... * { - * fdt = <0x...>; + * preserved-data = <0x...>; * }; * }; * * Root KHO Node (/): - * - compatible: "kho-v1" + * - compatible: "kho-v2" * * Indentifies the overall KHO ABI version. * @@ -69,20 +74,20 @@ * is provided by the subsystem that uses KHO for preserving its * data. * - * - fdt: u64 + * - preserved-data: u64 * - * Physical address pointing to a subnode FDT blob that is also + * Physical address pointing to a subnode data blob that is also * being preserved. */ /* The compatible string for the KHO FDT root node. */ -#define KHO_FDT_COMPATIBLE "kho-v1" +#define KHO_FDT_COMPATIBLE "kho-v2" /* The FDT property for the preserved memory map. */ #define KHO_FDT_MEMORY_MAP_PROP_NAME "preserved-memory-map" -/* The FDT property for sub-FDTs. */ -#define KHO_FDT_SUB_TREE_PROP_NAME "fdt" +/* The FDT property for preserved data blobs. */ +#define KHO_FDT_SUB_TREE_PROP_NAME "preserved-data" /** * DOC: Kexec Handover ABI for vmalloc Preservation @@ -160,4 +165,113 @@ struct kho_vmalloc { unsigned short order; }; +/** + * DOC: KHO persistent memory tracker + * + * KHO tracks preserved memory using a radix tree data structure. Each node of + * the tree is exactly a single page. The leaf nodes are bitmaps where each set + * bit is a preserved page of any order. The intermediate nodes are tables of + * physical addresses that point to a lower level node. + * + * The tree hierarchy is shown below:: + * + * root + * +-------------------+ + * | Level 5 | (struct kho_radix_node) + * +-------------------+ + * | + * v + * +-------------------+ + * | Level 4 | (struct kho_radix_node) + * +-------------------+ + * | + * | ... (intermediate levels) + * | + * v + * +-------------------+ + * | Level 0 | (struct kho_radix_leaf) + * +-------------------+ + * + * The tree is traversed using a key that encodes the page's physical address + * (pa) and its order into a single unsigned long value. The encoded key value + * is composed of two parts: the 'order bit' in the upper part and the + * 'shifted physical address' in the lower part.:: + * + * +------------+-----------------------------+--------------------------+ + * | Page Order | Order Bit | Shifted Physical Address | + * +------------+-----------------------------+--------------------------+ + * | 0 | ...000100 ... (at bit 52) | pa >> (PAGE_SHIFT + 0) | + * | 1 | ...000010 ... (at bit 51) | pa >> (PAGE_SHIFT + 1) | + * | 2 | ...000001 ... (at bit 50) | pa >> (PAGE_SHIFT + 2) | + * | ... | ... | ... | + * +------------+-----------------------------+--------------------------+ + * + * Shifted Physical Address: + * The 'shifted physical address' is the physical address normalized for its + * order. It effectively represents the PFN shifted right by the order. + * + * Order Bit: + * The 'order bit' encodes the page order by setting a single bit at a + * specific position. The position of this bit itself represents the order. + * + * For instance, on a 64-bit system with 4KB pages (PAGE_SHIFT = 12), the + * maximum range for the shifted physical address (for order 0) is 52 bits + * (64 - 12). This address occupies bits [0-51]. For order 0, the order bit is + * set at position 52. + * + * The following diagram illustrates how the encoded key value is split into + * indices for the tree levels, with PAGE_SIZE of 4KB:: + * + * 63:60 59:51 50:42 41:33 32:24 23:15 14:0 + * +---------+--------+--------+--------+--------+--------+-----------------+ + * | 0 | Lv 5 | Lv 4 | Lv 3 | Lv 2 | Lv 1 | Lv 0 (bitmap) | + * +---------+--------+--------+--------+--------+--------+-----------------+ + * + * The radix tree stores pages of all orders in a single 6-level hierarchy. It + * efficiently shares higher tree levels, especially due to common zero top + * address bits, allowing a single, efficient algorithm to manage all + * pages. This bitmap approach also offers memory efficiency; for example, a + * 512KB bitmap can cover a 16GB memory range for 0-order pages with PAGE_SIZE = + * 4KB. + * + * The data structures defined here are part of the KHO ABI. Any modification + * to these structures that breaks backward compatibility must be accompanied by + * an update to the "compatible" string. This ensures that a newer kernel can + * correctly interpret the data passed by an older kernel. + */ + +/* + * Defines constants for the KHO radix tree structure, used to track preserved + * memory. These constants govern the indexing, sizing, and depth of the tree. + */ +enum kho_radix_consts { + /* + * The bit position of the order bit (and also the length of the + * shifted physical address) for an order-0 page. + */ + KHO_ORDER_0_LOG2 = 64 - PAGE_SHIFT, + + /* Size of the table in kho_radix_node, in log2 */ + KHO_TABLE_SIZE_LOG2 = const_ilog2(PAGE_SIZE / sizeof(phys_addr_t)), + + /* Number of bits in the kho_radix_leaf bitmap, in log2 */ + KHO_BITMAP_SIZE_LOG2 = PAGE_SHIFT + const_ilog2(BITS_PER_BYTE), + + /* + * The total tree depth is the number of intermediate levels + * and 1 bitmap level. + */ + KHO_TREE_MAX_DEPTH = + DIV_ROUND_UP(KHO_ORDER_0_LOG2 - KHO_BITMAP_SIZE_LOG2, + KHO_TABLE_SIZE_LOG2) + 1, +}; + +struct kho_radix_node { + u64 table[1 << KHO_TABLE_SIZE_LOG2]; +}; + +struct kho_radix_leaf { + DECLARE_BITMAP(bitmap, 1 << KHO_BITMAP_SIZE_LOG2); +}; + #endif /* _LINUX_KHO_ABI_KEXEC_HANDOVER_H */ diff --git a/include/linux/kho_radix_tree.h b/include/linux/kho_radix_tree.h new file mode 100644 index 000000000000..84e918b96e53 --- /dev/null +++ b/include/linux/kho_radix_tree.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LINUX_KHO_RADIX_TREE_H +#define _LINUX_KHO_RADIX_TREE_H + +#include +#include +#include +#include + +/** + * DOC: Kexec Handover Radix Tree + * + * This is a radix tree implementation for tracking physical memory pages + * across kexec transitions. It was developed for the KHO mechanism but is + * designed for broader use by any subsystem that needs to preserve pages. + * + * The radix tree is a multi-level tree where leaf nodes are bitmaps + * representing individual pages. To allow pages of different sizes (orders) + * to be stored efficiently in a single tree, it uses a unique key encoding + * scheme. Each key is an unsigned long that combines a page's physical + * address and its order. + * + * Client code is responsible for allocating the root node of the tree, + * initializing the mutex lock, and managing its lifecycle. It must use the + * tree data structures defined in the KHO ABI, + * `include/linux/kho/abi/kexec_handover.h`. + */ + +struct kho_radix_node; + +struct kho_radix_tree { + struct kho_radix_node *root; + struct mutex lock; /* protects the tree's structure and root pointer */ +}; + +typedef int (*kho_radix_tree_walk_callback_t)(phys_addr_t phys, + unsigned int order); + +#ifdef CONFIG_KEXEC_HANDOVER + +int kho_radix_add_page(struct kho_radix_tree *tree, unsigned long pfn, + unsigned int order); + +void kho_radix_del_page(struct kho_radix_tree *tree, unsigned long pfn, + unsigned int order); + +int kho_radix_walk_tree(struct kho_radix_tree *tree, + kho_radix_tree_walk_callback_t cb); + +#else /* #ifdef CONFIG_KEXEC_HANDOVER */ + +static inline int kho_radix_add_page(struct kho_radix_tree *tree, long pfn, + unsigned int order) +{ + return -EOPNOTSUPP; +} + +static inline void kho_radix_del_page(struct kho_radix_tree *tree, + unsigned long pfn, unsigned int order) { } + +static inline int kho_radix_walk_tree(struct kho_radix_tree *tree, + kho_radix_tree_walk_callback_t cb) +{ + return -EOPNOTSUPP; +} + +#endif /* #ifdef CONFIG_KEXEC_HANDOVER */ + +#endif /* _LINUX_KHO_RADIX_TREE_H */ -- cgit v1.2.3 From b9ec0ed907062a67a7cca2d04e7652aec06a0c35 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 23 Feb 2026 11:01:06 -0500 Subject: mm: vmalloc: streamline vmalloc memory accounting Use a vmstat counter instead of a custom, open-coded atomic. This has the added benefit of making the data available per-node, and prepares for cleaning up the memcg accounting as well. Link: https://lkml.kernel.org/r/20260223160147.3792777-1-hannes@cmpxchg.org Acked-by: Shakeel Butt Signed-off-by: Johannes Weiner Reviewed-by: Roman Gushchin Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Uladzislau Rezki (Sony) Cc: Joshua Hahn Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 1 + include/linux/vmalloc.h | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 546bca95ca40..db41b18a919d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -220,6 +220,7 @@ enum node_stat_item { NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */ NR_FOLL_PIN_ACQUIRED, /* via: pin_user_page(), gup flag: FOLL_PIN */ NR_FOLL_PIN_RELEASED, /* pages returned via unpin_user_page() */ + NR_VMALLOC, NR_KERNEL_STACK_KB, /* measured in KiB */ #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) NR_KERNEL_SCS_KB, /* measured in KiB */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index e8e94f90d686..3b02c0c6b371 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -286,8 +286,6 @@ int unregister_vmap_purge_notifier(struct notifier_block *nb); #ifdef CONFIG_MMU #define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) -unsigned long vmalloc_nr_pages(void); - int vm_area_map_pages(struct vm_struct *area, unsigned long start, unsigned long end, struct page **pages); void vm_area_unmap_pages(struct vm_struct *area, unsigned long start, @@ -304,7 +302,6 @@ static inline void set_vm_flush_reset_perms(void *addr) #else /* !CONFIG_MMU */ #define VMALLOC_TOTAL 0UL -static inline unsigned long vmalloc_nr_pages(void) { return 0; } static inline void set_vm_flush_reset_perms(void *addr) {} #endif /* CONFIG_MMU */ -- cgit v1.2.3 From c466412c73c339e33e83b68770e5b556457c03de Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 23 Feb 2026 11:01:07 -0500 Subject: mm: memcontrol: switch to native NR_VMALLOC vmstat counter Eliminates the custom memcg counter and results in a single, consolidated accounting call in vmalloc code. Link: https://lkml.kernel.org/r/20260223160147.3792777-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Shakeel Butt Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: Roman Gushchin Reviewed-by: Vishal Moola (Oracle) Cc: Joshua Hahn Cc: Michal Hocko Cc: Muchun Song Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5695776f32c8..5173a9f16721 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -35,7 +35,6 @@ enum memcg_stat_item { MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS, MEMCG_SOCK, MEMCG_PERCPU_B, - MEMCG_VMALLOC, MEMCG_KMEM, MEMCG_ZSWAP_B, MEMCG_ZSWAPPED, -- cgit v1.2.3 From 2b8acf8450f577d3785dacfd616630b76dc8f88d Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 25 Feb 2026 16:13:58 +0000 Subject: mm: introduce vm_mmap_shadow_stack() as a helper for VM_SHADOW_STACK mappings Patch series "mm: arch/shstk: Common shadow stack mapping helper and VM_NOHUGEPAGE", v2. A series to extract the common shadow stack mmap into a separate helper for arm64, riscv and x86. This patch (of 5): arm64, riscv and x86 use a similar pattern for mapping the user shadow stack (cloned from x86). Extract this into a helper to facilitate code reuse. The call to do_mmap() from the new helper uses PROT_READ|PROT_WRITE prot bits instead of the PROT_READ with an explicit VM_WRITE vm_flag. The x86 intent was to avoid PROT_WRITE implying normal write since the shadow stack is not writable by normal stores. However, from a kernel perspective, the vma is writeable. Functionally there is no difference. Link: https://lkml.kernel.org/r/20260225161404.3157851-1-catalin.marinas@arm.com Link: https://lkml.kernel.org/r/20260225161404.3157851-2-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Tested-by: Deepak Gupta Reviewed-by: Mark Brown Acked-by: David Hildenbrand (Arm) Reviewed-by: Mike Rapoport (Microsoft) Cc: Albert Ou Cc: Alexandre Ghiti Cc: "Borislav Petkov (AMD)" Cc: "Edgecombe, Rick P" Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "Liam R. Howlett" Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Palmer Dabbelt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Will Deacon Cc: Dave Hansen Cc: Paul Walmsley Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index abb4963c1f06..bb0cfe38ca19 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3903,6 +3903,8 @@ extern int vm_munmap(unsigned long, size_t); extern unsigned long __must_check vm_mmap(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); +extern unsigned long __must_check vm_mmap_shadow_stack(unsigned long addr, + unsigned long len, unsigned long flags); struct vm_unmapped_area_info { #define VM_UNMAPPED_AREA_TOPDOWN 1 -- cgit v1.2.3 From cbf56f9981014ee48ae9b9e2254f31d1642b8f8f Mon Sep 17 00:00:00 2001 From: Tal Zussman Date: Wed, 25 Feb 2026 18:44:25 -0500 Subject: mm: remove stray references to struct pagevec Patch series "mm: Remove stray references to pagevec", v2. struct pagevec was removed in commit 1e0877d58b1e ("mm: remove struct pagevec"). Remove any stray references to it and rename relevant files and macros accordingly. While at it, remove unnecessary #includes of pagevec.h (now folio_batch.h) in .c files. There are probably more of these that could be removed in .h files, but those are more complex to verify. This patch (of 4): struct pagevec was removed in commit 1e0877d58b1e ("mm: remove struct pagevec"). Remove remaining forward declarations and change __folio_batch_release()'s declaration to match its definition. Link: https://lkml.kernel.org/r/20260225-pagevec_cleanup-v2-0-716868cc2d11@columbia.edu Link: https://lkml.kernel.org/r/20260225-pagevec_cleanup-v2-1-716868cc2d11@columbia.edu Signed-off-by: Tal Zussman Reviewed-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand (Arm) Acked-by: Chris Li Acked-by: Zi Yan Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Christian Brauner Cc: Jan Kara Signed-off-by: Andrew Morton --- include/linux/pagevec.h | 2 +- include/linux/swap.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 63be5a451627..007affabf335 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -93,7 +93,7 @@ static inline struct folio *folio_batch_next(struct folio_batch *fbatch) return fbatch->folios[fbatch->i++]; } -void __folio_batch_release(struct folio_batch *pvec); +void __folio_batch_release(struct folio_batch *fbatch); static inline void folio_batch_release(struct folio_batch *fbatch) { diff --git a/include/linux/swap.h b/include/linux/swap.h index 0effe3cc50f5..4b1f13b5bbad 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -20,8 +20,6 @@ struct notifier_block; struct bio; -struct pagevec; - #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff #define SWAP_FLAG_DISCARD 0x10000 /* enable discard for swap */ -- cgit v1.2.3 From 4e1d77a8f382a0ef4dd7732bb1986c8143600def Mon Sep 17 00:00:00 2001 From: Tal Zussman Date: Wed, 25 Feb 2026 18:44:27 -0500 Subject: folio_batch: rename pagevec.h to folio_batch.h struct pagevec was removed in commit 1e0877d58b1e ("mm: remove struct pagevec"). Rename include/linux/pagevec.h to reflect reality and update includes tree-wide. Add the new filename to MAINTAINERS explicitly, as it no longer matches the "include/linux/page[-_]*" pattern in MEMORY MANAGEMENT - CORE. Link: https://lkml.kernel.org/r/20260225-pagevec_cleanup-v2-3-716868cc2d11@columbia.edu Signed-off-by: Tal Zussman Acked-by: David Hildenbrand (Arm) Reviewed-by: Jan Kara Acked-by: Zi Yan Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Chris Li Cc: Christian Brauner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/folio_batch.h | 105 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/folio_queue.h | 2 +- include/linux/iomap.h | 2 +- include/linux/pagevec.h | 105 -------------------------------------------- include/linux/sunrpc/svc.h | 2 +- include/linux/writeback.h | 2 +- 6 files changed, 109 insertions(+), 109 deletions(-) create mode 100644 include/linux/folio_batch.h delete mode 100644 include/linux/pagevec.h (limited to 'include/linux') diff --git a/include/linux/folio_batch.h b/include/linux/folio_batch.h new file mode 100644 index 000000000000..a2f3d3043f7e --- /dev/null +++ b/include/linux/folio_batch.h @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * include/linux/folio_batch.h + * + * In many places it is efficient to batch an operation up against multiple + * folios. A folio_batch is a container which is used for that. + */ + +#ifndef _LINUX_FOLIO_BATCH_H +#define _LINUX_FOLIO_BATCH_H + +#include + +/* 31 pointers + header align the folio_batch structure to a power of two */ +#define PAGEVEC_SIZE 31 + +struct folio; + +/** + * struct folio_batch - A collection of folios. + * + * The folio_batch is used to amortise the cost of retrieving and + * operating on a set of folios. The order of folios in the batch may be + * significant (eg delete_from_page_cache_batch()). Some users of the + * folio_batch store "exceptional" entries in it which can be removed + * by calling folio_batch_remove_exceptionals(). + */ +struct folio_batch { + unsigned char nr; + unsigned char i; + bool percpu_pvec_drained; + struct folio *folios[PAGEVEC_SIZE]; +}; + +/** + * folio_batch_init() - Initialise a batch of folios + * @fbatch: The folio batch. + * + * A freshly initialised folio_batch contains zero folios. + */ +static inline void folio_batch_init(struct folio_batch *fbatch) +{ + fbatch->nr = 0; + fbatch->i = 0; + fbatch->percpu_pvec_drained = false; +} + +static inline void folio_batch_reinit(struct folio_batch *fbatch) +{ + fbatch->nr = 0; + fbatch->i = 0; +} + +static inline unsigned int folio_batch_count(const struct folio_batch *fbatch) +{ + return fbatch->nr; +} + +static inline unsigned int folio_batch_space(const struct folio_batch *fbatch) +{ + return PAGEVEC_SIZE - fbatch->nr; +} + +/** + * folio_batch_add() - Add a folio to a batch. + * @fbatch: The folio batch. + * @folio: The folio to add. + * + * The folio is added to the end of the batch. + * The batch must have previously been initialised using folio_batch_init(). + * + * Return: The number of slots still available. + */ +static inline unsigned folio_batch_add(struct folio_batch *fbatch, + struct folio *folio) +{ + fbatch->folios[fbatch->nr++] = folio; + return folio_batch_space(fbatch); +} + +/** + * folio_batch_next - Return the next folio to process. + * @fbatch: The folio batch being processed. + * + * Use this function to implement a queue of folios. + * + * Return: The next folio in the queue, or NULL if the queue is empty. + */ +static inline struct folio *folio_batch_next(struct folio_batch *fbatch) +{ + if (fbatch->i == fbatch->nr) + return NULL; + return fbatch->folios[fbatch->i++]; +} + +void __folio_batch_release(struct folio_batch *fbatch); + +static inline void folio_batch_release(struct folio_batch *fbatch) +{ + if (folio_batch_count(fbatch)) + __folio_batch_release(fbatch); +} + +void folio_batch_remove_exceptionals(struct folio_batch *fbatch); +#endif /* _LINUX_FOLIO_BATCH_H */ diff --git a/include/linux/folio_queue.h b/include/linux/folio_queue.h index adab609c972e..0d3765fa9d1d 100644 --- a/include/linux/folio_queue.h +++ b/include/linux/folio_queue.h @@ -14,7 +14,7 @@ #ifndef _LINUX_FOLIO_QUEUE_H #define _LINUX_FOLIO_QUEUE_H -#include +#include #include /* diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 99b7209dabd7..4551613cea2f 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -9,7 +9,7 @@ #include #include #include -#include +#include struct address_space; struct fiemap_extent_info; diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h deleted file mode 100644 index 007affabf335..000000000000 --- a/include/linux/pagevec.h +++ /dev/null @@ -1,105 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * include/linux/pagevec.h - * - * In many places it is efficient to batch an operation up against multiple - * folios. A folio_batch is a container which is used for that. - */ - -#ifndef _LINUX_PAGEVEC_H -#define _LINUX_PAGEVEC_H - -#include - -/* 31 pointers + header align the folio_batch structure to a power of two */ -#define PAGEVEC_SIZE 31 - -struct folio; - -/** - * struct folio_batch - A collection of folios. - * - * The folio_batch is used to amortise the cost of retrieving and - * operating on a set of folios. The order of folios in the batch may be - * significant (eg delete_from_page_cache_batch()). Some users of the - * folio_batch store "exceptional" entries in it which can be removed - * by calling folio_batch_remove_exceptionals(). - */ -struct folio_batch { - unsigned char nr; - unsigned char i; - bool percpu_pvec_drained; - struct folio *folios[PAGEVEC_SIZE]; -}; - -/** - * folio_batch_init() - Initialise a batch of folios - * @fbatch: The folio batch. - * - * A freshly initialised folio_batch contains zero folios. - */ -static inline void folio_batch_init(struct folio_batch *fbatch) -{ - fbatch->nr = 0; - fbatch->i = 0; - fbatch->percpu_pvec_drained = false; -} - -static inline void folio_batch_reinit(struct folio_batch *fbatch) -{ - fbatch->nr = 0; - fbatch->i = 0; -} - -static inline unsigned int folio_batch_count(const struct folio_batch *fbatch) -{ - return fbatch->nr; -} - -static inline unsigned int folio_batch_space(const struct folio_batch *fbatch) -{ - return PAGEVEC_SIZE - fbatch->nr; -} - -/** - * folio_batch_add() - Add a folio to a batch. - * @fbatch: The folio batch. - * @folio: The folio to add. - * - * The folio is added to the end of the batch. - * The batch must have previously been initialised using folio_batch_init(). - * - * Return: The number of slots still available. - */ -static inline unsigned folio_batch_add(struct folio_batch *fbatch, - struct folio *folio) -{ - fbatch->folios[fbatch->nr++] = folio; - return folio_batch_space(fbatch); -} - -/** - * folio_batch_next - Return the next folio to process. - * @fbatch: The folio batch being processed. - * - * Use this function to implement a queue of folios. - * - * Return: The next folio in the queue, or NULL if the queue is empty. - */ -static inline struct folio *folio_batch_next(struct folio_batch *fbatch) -{ - if (fbatch->i == fbatch->nr) - return NULL; - return fbatch->folios[fbatch->i++]; -} - -void __folio_batch_release(struct folio_batch *fbatch); - -static inline void folio_batch_release(struct folio_batch *fbatch) -{ - if (folio_batch_count(fbatch)) - __folio_batch_release(fbatch); -} - -void folio_batch_remove_exceptionals(struct folio_batch *fbatch); -#endif /* _LINUX_PAGEVEC_H */ diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 4dc14c7a711b..a11acf5cd63b 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include /* diff --git a/include/linux/writeback.h b/include/linux/writeback.h index e530112c4b3a..62552a2ce5b9 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -11,7 +11,7 @@ #include #include #include -#include +#include struct bio; -- cgit v1.2.3 From 511f04aac469a3ae04f7f2588101020aebb19c90 Mon Sep 17 00:00:00 2001 From: Tal Zussman Date: Wed, 25 Feb 2026 18:44:28 -0500 Subject: folio_batch: rename PAGEVEC_SIZE to FOLIO_BATCH_SIZE struct pagevec no longer exists. Rename the macro appropriately. Link: https://lkml.kernel.org/r/20260225-pagevec_cleanup-v2-4-716868cc2d11@columbia.edu Signed-off-by: Tal Zussman Acked-by: David Hildenbrand (Arm) Reviewed-by: Jan Kara Acked-by: Zi Yan Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Chris Li Cc: Christian Brauner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/folio_batch.h | 6 +++--- include/linux/folio_queue.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/folio_batch.h b/include/linux/folio_batch.h index a2f3d3043f7e..b45946adc50b 100644 --- a/include/linux/folio_batch.h +++ b/include/linux/folio_batch.h @@ -12,7 +12,7 @@ #include /* 31 pointers + header align the folio_batch structure to a power of two */ -#define PAGEVEC_SIZE 31 +#define FOLIO_BATCH_SIZE 31 struct folio; @@ -29,7 +29,7 @@ struct folio_batch { unsigned char nr; unsigned char i; bool percpu_pvec_drained; - struct folio *folios[PAGEVEC_SIZE]; + struct folio *folios[FOLIO_BATCH_SIZE]; }; /** @@ -58,7 +58,7 @@ static inline unsigned int folio_batch_count(const struct folio_batch *fbatch) static inline unsigned int folio_batch_space(const struct folio_batch *fbatch) { - return PAGEVEC_SIZE - fbatch->nr; + return FOLIO_BATCH_SIZE - fbatch->nr; } /** diff --git a/include/linux/folio_queue.h b/include/linux/folio_queue.h index 0d3765fa9d1d..f6d5f1f127c9 100644 --- a/include/linux/folio_queue.h +++ b/include/linux/folio_queue.h @@ -29,12 +29,12 @@ */ struct folio_queue { struct folio_batch vec; /* Folios in the queue segment */ - u8 orders[PAGEVEC_SIZE]; /* Order of each folio */ + u8 orders[FOLIO_BATCH_SIZE]; /* Order of each folio */ struct folio_queue *next; /* Next queue segment or NULL */ struct folio_queue *prev; /* Previous queue segment of NULL */ unsigned long marks; /* 1-bit mark per folio */ unsigned long marks2; /* Second 1-bit mark per folio */ -#if PAGEVEC_SIZE > BITS_PER_LONG +#if FOLIO_BATCH_SIZE > BITS_PER_LONG #error marks is not big enough #endif unsigned int rreq_id; @@ -70,7 +70,7 @@ static inline void folioq_init(struct folio_queue *folioq, unsigned int rreq_id) */ static inline unsigned int folioq_nr_slots(const struct folio_queue *folioq) { - return PAGEVEC_SIZE; + return FOLIO_BATCH_SIZE; } /** -- cgit v1.2.3 From a2c77ec320a99581e8272868ccfa53a7d7a7b168 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:39 +0000 Subject: mm: move MAX_FOLIO_ORDER definition to mmzone.h Patch series "mm: Eliminate fake head pages from vmemmap optimization", v7. This series removes "fake head pages" from the HugeTLB vmemmap optimization (HVO) by changing how tail pages encode their relationship to the head page. It simplifies compound_head() and page_ref_add_unless(). Both are in the hot path. Background ========== HVO reduces memory overhead by freeing vmemmap pages for HugeTLB pages and remapping the freed virtual addresses to a single physical page. Previously, all tail page vmemmap entries were remapped to the first vmemmap page (containing the head struct page), creating "fake heads" - tail pages that appear to have PG_head set when accessed through the deduplicated vmemmap. This required special handling in compound_head() to detect and work around fake heads, adding complexity and overhead to a very hot path. New Approach ============ For architectures/configs where sizeof(struct page) is a power of 2 (the common case), this series changes how position of the head page is encoded in the tail pages. Instead of storing a pointer to the head page, the ->compound_info (renamed from ->compound_head) now stores a mask. The mask can be applied to any tail page's virtual address to compute the head page address. Critically, all tail pages of the same order now have identical compound_info values, regardless of which compound page they belong to. The key insight is that all tail pages of the same order now have identical compound_info values, regardless of which compound page they belong to. In v7, these shared tail pages are allocated per-zone. This ensures that zone information (stored in page->flags) is correct even for shared tail pages, removing the need for the special-casing in page_zonenum() proposed in earlier versions. To support per-zone shared pages for boot-allocated gigantic pages, the vmemmap population is deferred until zones are initialized. This simplifies the logic significantly and allows the removal of vmemmap_undo_hvo(). Benefits ======== 1. Simplified compound_head(): No fake head detection needed, can be implemented in a branchless manner. 2. Simplified page_ref_add_unless(): RCU protection removed since there's no race with fake head remapping. 3. Cleaner architecture: The shared tail pages are truly read-only and contain valid tail page metadata. If sizeof(struct page) is not power-of-2, there are no functional changes. HVO is not supported in this configuration. I had hoped to see performance improvement, but my testing thus far has shown either no change or only a slight improvement within the noise. Series Organization =================== Patch 1: Move MAX_FOLIO_ORDER definition to mmzone.h. Patches 2-4: Refactoring of field names and interfaces. Patches 5-6: Architecture alignment for LoongArch and RISC-V. Patch 7: Mask-based compound_head() implementation. Patch 8: Add memmap alignment checks. Patch 9: Branchless compound_head() optimization. Patch 10: Defer vmemmap population for bootmem hugepages. Patch 11: Refactor vmemmap_walk. Patch 12: x86 vDSO build fix. Patch 13: Eliminate fake heads with per-zone shared tail pages. Patches 14-16: Cleanup of fake head infrastructure. Patch 17: Documentation update. Patch 18: Use compound_head() in page_slab(). This patch (of 17): Move MAX_FOLIO_ORDER definition from mm.h to mmzone.h. This is preparation for adding the vmemmap_tails array to struct zone, which requires MAX_FOLIO_ORDER to be available in mmzone.h. Link: https://lkml.kernel.org/r/20260227194302.274384-1-kas@kernel.org Link: https://lkml.kernel.org/r/20260227194302.274384-2-kas@kernel.org Signed-off-by: Kiryl Shutsemau Acked-by: David Hildenbrand (Red Hat) Acked-by: Zi Yan Acked-by: Muchun Song Acked-by: Usama Arif Reviewed-by: Vlastimil Babka Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: WANG Xuerui Signed-off-by: Andrew Morton --- include/linux/mm.h | 31 ------------------------------- include/linux/mmzone.h | 31 +++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index bb0cfe38ca19..4e999c21d89a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -2479,36 +2478,6 @@ static inline unsigned long folio_nr_pages(const struct folio *folio) return folio_large_nr_pages(folio); } -#if !defined(CONFIG_HAVE_GIGANTIC_FOLIOS) -/* - * We don't expect any folios that exceed buddy sizes (and consequently - * memory sections). - */ -#define MAX_FOLIO_ORDER MAX_PAGE_ORDER -#elif defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) -/* - * Only pages within a single memory section are guaranteed to be - * contiguous. By limiting folios to a single memory section, all folio - * pages are guaranteed to be contiguous. - */ -#define MAX_FOLIO_ORDER PFN_SECTION_SHIFT -#elif defined(CONFIG_HUGETLB_PAGE) -/* - * There is no real limit on the folio size. We limit them to the maximum we - * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect - * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit. - */ -#define MAX_FOLIO_ORDER get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G) -#else -/* - * Without hugetlb, gigantic folios that are bigger than a single PUD are - * currently impossible. - */ -#define MAX_FOLIO_ORDER PUD_ORDER -#endif - -#define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER) - /* * compound_nr() returns the number of pages in this potentially compound * page. compound_nr() can be called on a tail page, and is defined to diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index db41b18a919d..4c481ec77da9 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -23,6 +23,7 @@ #include #include #include +#include #include /* Free memory management - zoned buddy allocator. */ @@ -61,6 +62,36 @@ */ #define PAGE_ALLOC_COSTLY_ORDER 3 +#if !defined(CONFIG_HAVE_GIGANTIC_FOLIOS) +/* + * We don't expect any folios that exceed buddy sizes (and consequently + * memory sections). + */ +#define MAX_FOLIO_ORDER MAX_PAGE_ORDER +#elif defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) +/* + * Only pages within a single memory section are guaranteed to be + * contiguous. By limiting folios to a single memory section, all folio + * pages are guaranteed to be contiguous. + */ +#define MAX_FOLIO_ORDER PFN_SECTION_SHIFT +#elif defined(CONFIG_HUGETLB_PAGE) +/* + * There is no real limit on the folio size. We limit them to the maximum we + * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect + * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit. + */ +#define MAX_FOLIO_ORDER get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G) +#else +/* + * Without hugetlb, gigantic folios that are bigger than a single PUD are + * currently impossible. + */ +#define MAX_FOLIO_ORDER PUD_ORDER +#endif + +#define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER) + enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, -- cgit v1.2.3 From f0369fb13619569ba8564ce8d4fc9d385bbee8a2 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:40 +0000 Subject: mm: change the interface of prep_compound_tail() Instead of passing down the head page and tail page index, pass the tail and head pages directly, as well as the order of the compound page. This is a preparation for changing how the head position is encoded in the tail page. Link: https://lkml.kernel.org/r/20260227194302.274384-3-kas@kernel.org Signed-off-by: Kiryl Shutsemau Reviewed-by: Muchun Song Reviewed-by: Zi Yan Acked-by: David Hildenbrand (arm) Reviewed-by: Vlastimil Babka Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: WANG Xuerui Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 415e9f2ef616..7729a4a28b44 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -870,9 +870,10 @@ static inline bool folio_test_large(const struct folio *folio) return folio_test_head(folio); } -static __always_inline void set_compound_head(struct page *page, struct page *head) +static __always_inline void set_compound_head(struct page *tail, + const struct page *head, unsigned int order) { - WRITE_ONCE(page->compound_head, (unsigned long)head + 1); + WRITE_ONCE(tail->compound_head, (unsigned long)head + 1); } static __always_inline void clear_compound_head(struct page *page) -- cgit v1.2.3 From d50569612c29215c5d1c64f47a65604ed265d2e9 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:41 +0000 Subject: mm: rename the 'compound_head' field in the 'struct page' to 'compound_info' The 'compound_head' field in the 'struct page' encodes whether the page is a tail and where to locate the head page. Bit 0 is set if the page is a tail, and the remaining bits in the field point to the head page. As preparation for changing how the field encodes information about the head page, rename the field to 'compound_info'. Link: https://lkml.kernel.org/r/20260227194302.274384-4-kas@kernel.org Signed-off-by: Kiryl Shutsemau Reviewed-by: Muchun Song Reviewed-by: Zi Yan Acked-by: David Hildenbrand (arm) Reviewed-by: Vlastimil Babka Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: WANG Xuerui Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 20 ++++++++++---------- include/linux/page-flags.h | 18 +++++++++--------- include/linux/types.h | 2 +- 3 files changed, 20 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3cc8ae722886..7bc82a2b889f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -126,14 +126,14 @@ struct page { atomic_long_t pp_ref_count; }; struct { /* Tail pages of compound page */ - unsigned long compound_head; /* Bit zero is set */ + unsigned long compound_info; /* Bit zero is set */ }; struct { /* ZONE_DEVICE pages */ /* - * The first word is used for compound_head or folio + * The first word is used for compound_info or folio * pgmap */ - void *_unused_pgmap_compound_head; + void *_unused_pgmap_compound_info; void *zone_device_data; /* * ZONE_DEVICE private pages are counted as being @@ -409,7 +409,7 @@ struct folio { /* private: avoid cluttering the output */ /* For the Unevictable "LRU list" slot */ struct { - /* Avoid compound_head */ + /* Avoid compound_info */ void *__filler; /* public: */ unsigned int mlock_count; @@ -510,7 +510,7 @@ struct folio { FOLIO_MATCH(flags, flags); FOLIO_MATCH(lru, lru); FOLIO_MATCH(mapping, mapping); -FOLIO_MATCH(compound_head, lru); +FOLIO_MATCH(compound_info, lru); FOLIO_MATCH(__folio_index, index); FOLIO_MATCH(private, private); FOLIO_MATCH(_mapcount, _mapcount); @@ -529,7 +529,7 @@ FOLIO_MATCH(_last_cpupid, _last_cpupid); static_assert(offsetof(struct folio, fl) == \ offsetof(struct page, pg) + sizeof(struct page)) FOLIO_MATCH(flags, _flags_1); -FOLIO_MATCH(compound_head, _head_1); +FOLIO_MATCH(compound_info, _head_1); FOLIO_MATCH(_mapcount, _mapcount_1); FOLIO_MATCH(_refcount, _refcount_1); #undef FOLIO_MATCH @@ -537,13 +537,13 @@ FOLIO_MATCH(_refcount, _refcount_1); static_assert(offsetof(struct folio, fl) == \ offsetof(struct page, pg) + 2 * sizeof(struct page)) FOLIO_MATCH(flags, _flags_2); -FOLIO_MATCH(compound_head, _head_2); +FOLIO_MATCH(compound_info, _head_2); #undef FOLIO_MATCH #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct folio, fl) == \ offsetof(struct page, pg) + 3 * sizeof(struct page)) FOLIO_MATCH(flags, _flags_3); -FOLIO_MATCH(compound_head, _head_3); +FOLIO_MATCH(compound_info, _head_3); #undef FOLIO_MATCH /** @@ -609,8 +609,8 @@ struct ptdesc { #define TABLE_MATCH(pg, pt) \ static_assert(offsetof(struct page, pg) == offsetof(struct ptdesc, pt)) TABLE_MATCH(flags, pt_flags); -TABLE_MATCH(compound_head, pt_list); -TABLE_MATCH(compound_head, _pt_pad_1); +TABLE_MATCH(compound_info, pt_list); +TABLE_MATCH(compound_info, _pt_pad_1); TABLE_MATCH(mapping, __page_mapping); TABLE_MATCH(__folio_index, pt_index); TABLE_MATCH(rcu_head, pt_rcu_head); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 7729a4a28b44..265a798295ff 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -213,7 +213,7 @@ static __always_inline const struct page *page_fixed_fake_head(const struct page /* * Only addresses aligned with PAGE_SIZE of struct page may be fake head * struct page. The alignment check aims to avoid access the fields ( - * e.g. compound_head) of the @page[1]. It can avoid touch a (possibly) + * e.g. compound_info) of the @page[1]. It can avoid touch a (possibly) * cold cacheline in some cases. */ if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) && @@ -223,7 +223,7 @@ static __always_inline const struct page *page_fixed_fake_head(const struct page * because the @page is a compound page composed with at least * two contiguous pages. */ - unsigned long head = READ_ONCE(page[1].compound_head); + unsigned long head = READ_ONCE(page[1].compound_info); if (likely(head & 1)) return (const struct page *)(head - 1); @@ -281,7 +281,7 @@ static __always_inline int page_is_fake_head(const struct page *page) static __always_inline unsigned long _compound_head(const struct page *page) { - unsigned long head = READ_ONCE(page->compound_head); + unsigned long head = READ_ONCE(page->compound_info); if (unlikely(head & 1)) return head - 1; @@ -320,13 +320,13 @@ static __always_inline unsigned long _compound_head(const struct page *page) static __always_inline int PageTail(const struct page *page) { - return READ_ONCE(page->compound_head) & 1 || page_is_fake_head(page); + return READ_ONCE(page->compound_info) & 1 || page_is_fake_head(page); } static __always_inline int PageCompound(const struct page *page) { return test_bit(PG_head, &page->flags.f) || - READ_ONCE(page->compound_head) & 1; + READ_ONCE(page->compound_info) & 1; } #define PAGE_POISON_PATTERN -1l @@ -348,7 +348,7 @@ static const unsigned long *const_folio_flags(const struct folio *folio, { const struct page *page = &folio->page; - VM_BUG_ON_PGFLAGS(page->compound_head & 1, page); + VM_BUG_ON_PGFLAGS(page->compound_info & 1, page); VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page); return &page[n].flags.f; } @@ -357,7 +357,7 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n) { struct page *page = &folio->page; - VM_BUG_ON_PGFLAGS(page->compound_head & 1, page); + VM_BUG_ON_PGFLAGS(page->compound_info & 1, page); VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page); return &page[n].flags.f; } @@ -873,12 +873,12 @@ static inline bool folio_test_large(const struct folio *folio) static __always_inline void set_compound_head(struct page *tail, const struct page *head, unsigned int order) { - WRITE_ONCE(tail->compound_head, (unsigned long)head + 1); + WRITE_ONCE(tail->compound_info, (unsigned long)head + 1); } static __always_inline void clear_compound_head(struct page *page) { - WRITE_ONCE(page->compound_head, 0); + WRITE_ONCE(page->compound_info, 0); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/include/linux/types.h b/include/linux/types.h index 7e71d260763c..608050dbca6a 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -239,7 +239,7 @@ struct ustat { * * This guarantee is important for few reasons: * - future call_rcu_lazy() will make use of lower bits in the pointer; - * - the structure shares storage space in struct page with @compound_head, + * - the structure shares storage space in struct page with @compound_info, * which encode PageTail() in bit 0. The guarantee is needed to avoid * false-positive PageTail(). */ -- cgit v1.2.3 From 67c79a5af051f57339ecf383d3f67e200741ce20 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:42 +0000 Subject: mm: move set/clear_compound_head() next to compound_head() Move set_compound_head() and clear_compound_head() to be adjacent to the compound_head() function in page-flags.h. These functions encode and decode the same compound_info field, so keeping them together makes it easier to verify their logic is consistent, especially when the encoding changes. Link: https://lkml.kernel.org/r/20260227194302.274384-5-kas@kernel.org Signed-off-by: Kiryl Shutsemau Reviewed-by: Muchun Song Reviewed-by: Zi Yan Acked-by: David Hildenbrand (arm) Reviewed-by: Vlastimil Babka Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: WANG Xuerui Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 265a798295ff..5c469d38dd69 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -290,6 +290,17 @@ static __always_inline unsigned long _compound_head(const struct page *page) #define compound_head(page) ((typeof(page))_compound_head(page)) +static __always_inline void set_compound_head(struct page *tail, + const struct page *head, unsigned int order) +{ + WRITE_ONCE(tail->compound_info, (unsigned long)head + 1); +} + +static __always_inline void clear_compound_head(struct page *page) +{ + WRITE_ONCE(page->compound_info, 0); +} + /** * page_folio - Converts from page to folio. * @p: The page. @@ -870,17 +881,6 @@ static inline bool folio_test_large(const struct folio *folio) return folio_test_head(folio); } -static __always_inline void set_compound_head(struct page *tail, - const struct page *head, unsigned int order) -{ - WRITE_ONCE(tail->compound_info, (unsigned long)head + 1); -} - -static __always_inline void clear_compound_head(struct page *page) -{ - WRITE_ONCE(page->compound_info, 0); -} - #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void ClearPageCompound(struct page *page) { -- cgit v1.2.3 From 476849b0fba4450f5adf22196bcff9c24c673bc4 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:43 +0000 Subject: riscv/mm: align vmemmap to maximal folio size The upcoming change to the HugeTLB vmemmap optimization (HVO) requires struct pages of the head page to be naturally aligned with regard to the folio size. Align vmemmap to the newly introduced MAX_FOLIO_VMEMMAP_ALIGN. Link: https://lkml.kernel.org/r/20260227194302.274384-6-kas@kernel.org Signed-off-by: Kiryl Shutsemau Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Hildenbrand (arm) Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: Vlastimil Babka Cc: WANG Xuerui Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4c481ec77da9..0bef68e41f19 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -92,6 +92,17 @@ #define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER) +/* + * HugeTLB Vmemmap Optimization (HVO) requires struct pages of the head page to + * be naturally aligned with regard to the folio size. + * + * HVO which is only active if the size of struct page is a power of 2. + */ +#define MAX_FOLIO_VMEMMAP_ALIGN \ + (IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) && \ + is_power_of_2(sizeof(struct page)) ? \ + MAX_FOLIO_NR_PAGES * sizeof(struct page) : 0) + enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, -- cgit v1.2.3 From 8c846c879e226c312c2c7a7bc1e323779903530f Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:45 +0000 Subject: mm: rework compound_head() for power-of-2 sizeof(struct page) For tail pages, the kernel uses the 'compound_info' field to get to the head page. The bit 0 of the field indicates whether the page is a tail page, and if set, the remaining bits represent a pointer to the head page. For cases when size of struct page is power-of-2, change the encoding of compound_info to store a mask that can be applied to the virtual address of the tail page in order to access the head page. It is possible because struct page of the head page is naturally aligned with regards to order of the page. The significant impact of this modification is that all tail pages of the same order will now have identical 'compound_info', regardless of the compound page they are associated with. This paves the way for eliminating fake heads. The HugeTLB Vmemmap Optimization (HVO) creates fake heads and it is only applied when the sizeof(struct page) is power-of-2. Having identical tail pages allows the same page to be mapped into the vmemmap of all pages, maintaining memory savings without fake heads. If sizeof(struct page) is not power-of-2, there is no functional changes. Limit mask usage to HugeTLB vmemmap optimization (HVO) where it makes a difference. The approach with mask would work in the wider set of conditions, but it requires validating that struct pages are naturally aligned for all orders up to the MAX_FOLIO_ORDER, which can be tricky. Link: https://lkml.kernel.org/r/20260227194302.274384-8-kas@kernel.org Signed-off-by: Kiryl Shutsemau Reviewed-by: Muchun Song Reviewed-by: Zi Yan Acked-by: David Hildenbrand (Arm) Acked-by: Usama Arif Reviewed-by: Vlastimil Babka Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: WANG Xuerui Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 81 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 73 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5c469d38dd69..43876b108f0a 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -198,6 +198,29 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H +/* + * For tail pages, if the size of struct page is power-of-2 ->compound_info + * encodes the mask that converts the address of the tail page address to + * the head page address. + * + * Otherwise, ->compound_info has direct pointer to head pages. + */ +static __always_inline bool compound_info_has_mask(void) +{ + /* + * Limit mask usage to HugeTLB vmemmap optimization (HVO) where it + * makes a difference. + * + * The approach with mask would work in the wider set of conditions, + * but it requires validating that struct pages are naturally aligned + * for all orders up to the MAX_FOLIO_ORDER, which can be tricky. + */ + if (!IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP)) + return false; + + return is_power_of_2(sizeof(struct page)); +} + #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); @@ -207,6 +230,10 @@ DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); */ static __always_inline const struct page *page_fixed_fake_head(const struct page *page) { + /* Fake heads only exists if compound_info_has_mask() is true */ + if (!compound_info_has_mask()) + return page; + if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key)) return page; @@ -223,10 +250,14 @@ static __always_inline const struct page *page_fixed_fake_head(const struct page * because the @page is a compound page composed with at least * two contiguous pages. */ - unsigned long head = READ_ONCE(page[1].compound_info); + unsigned long info = READ_ONCE(page[1].compound_info); + + /* See set_compound_head() */ + if (likely(info & 1)) { + unsigned long p = (unsigned long)page; - if (likely(head & 1)) - return (const struct page *)(head - 1); + return (const struct page *)(p & info); + } } return page; } @@ -281,11 +312,26 @@ static __always_inline int page_is_fake_head(const struct page *page) static __always_inline unsigned long _compound_head(const struct page *page) { - unsigned long head = READ_ONCE(page->compound_info); + unsigned long info = READ_ONCE(page->compound_info); - if (unlikely(head & 1)) - return head - 1; - return (unsigned long)page_fixed_fake_head(page); + /* Bit 0 encodes PageTail() */ + if (!(info & 1)) + return (unsigned long)page_fixed_fake_head(page); + + /* + * If compound_info_has_mask() is false, the rest of compound_info is + * the pointer to the head page. + */ + if (!compound_info_has_mask()) + return info - 1; + + /* + * If compound_info_has_mask() is true the rest of the info encodes + * the mask that converts the address of the tail page to the head page. + * + * No need to clear bit 0 in the mask as 'page' always has it clear. + */ + return (unsigned long)page & info; } #define compound_head(page) ((typeof(page))_compound_head(page)) @@ -293,7 +339,26 @@ static __always_inline unsigned long _compound_head(const struct page *page) static __always_inline void set_compound_head(struct page *tail, const struct page *head, unsigned int order) { - WRITE_ONCE(tail->compound_info, (unsigned long)head + 1); + unsigned int shift; + unsigned long mask; + + if (!compound_info_has_mask()) { + WRITE_ONCE(tail->compound_info, (unsigned long)head | 1); + return; + } + + /* + * If the size of struct page is power-of-2, bits [shift:0] of the + * virtual address of compound head are zero. + * + * Calculate mask that can be applied to the virtual address of + * the tail page to get address of the head page. + */ + shift = order + order_base_2(sizeof(struct page)); + mask = GENMASK(BITS_PER_LONG - 1, shift); + + /* Bit 0 encodes PageTail() */ + WRITE_ONCE(tail->compound_info, mask | 1); } static __always_inline void clear_compound_head(struct page *page) -- cgit v1.2.3 From 209e6d9eb13aaf1b6e0fc6f76afc00d055e5ba12 Mon Sep 17 00:00:00 2001 From: "Kiryl Shutsemau (Meta)" Date: Fri, 27 Feb 2026 19:42:47 +0000 Subject: mm/hugetlb: defer vmemmap population for bootmem hugepages Currently, the vmemmap for bootmem-allocated gigantic pages is populated early in hugetlb_vmemmap_init_early(). However, the zone information is only available after zones are initialized. If it is later discovered that a page spans multiple zones, the HVO mapping must be undone and replaced with a normal mapping using vmemmap_undo_hvo(). Defer the actual vmemmap population to hugetlb_vmemmap_init_late(). At this stage, zones are already initialized, so it can be checked if the page is valid for HVO before deciding how to populate the vmemmap. This allows us to remove vmemmap_undo_hvo() and the complex logic required to rollback HVO mappings. In hugetlb_vmemmap_init_late(), if HVO population fails or if the zones are invalid, fall back to a normal vmemmap population. Postponing population until hugetlb_vmemmap_init_late() also makes zone information available from within vmemmap_populate_hvo(). Link: https://lkml.kernel.org/r/20260227194302.274384-10-kas@kernel.org Signed-off-by: Kiryl Shutsemau (Meta) Acked-by: David Hildenbrand (Arm) Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: Vlastimil Babka Cc: WANG Xuerui Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4e999c21d89a..d7e53532a109 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4481,8 +4481,6 @@ int vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); int vmemmap_populate_hvo(unsigned long start, unsigned long end, int node, unsigned long headsize); -int vmemmap_undo_hvo(unsigned long start, unsigned long end, int node, - unsigned long headsize); void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node, unsigned long headsize); void vmemmap_populate_print_last(void); -- cgit v1.2.3 From 622026e87c4019e609010811757e31193cc23847 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:50 +0000 Subject: mm/hugetlb: remove fake head pages HugeTLB Vmemmap Optimization (HVO) reduces memory usage by freeing most vmemmap pages for huge pages and remapping the freed range to a single page containing the struct page metadata. With the new mask-based compound_info encoding (for power-of-2 struct page sizes), all tail pages of the same order are now identical regardless of which compound page they belong to. This means the tail pages can be truly shared without fake heads. Allocate a single page of initialized tail struct pages per zone per order in the vmemmap_tails[] array in struct zone. All huge pages of that order in the zone share this tail page, mapped read-only into their vmemmap. The head page remains unique per huge page. Redefine MAX_FOLIO_ORDER using ilog2(). The define has to produce a compile-constant as it is used to specify vmemmap_tail array size. For some reason, compiler is not able to solve get_order() at compile-time, but ilog2() works. Avoid PUD_ORDER to define MAX_FOLIO_ORDER as it adds dependency to which generates hard-to-break include loop. This eliminates fake heads while maintaining the same memory savings, and simplifies compound_head() by removing fake head detection. Link: https://lkml.kernel.org/r/20260227194302.274384-13-kas@kernel.org Signed-off-by: Kiryl Shutsemau Reviewed-by: Vlastimil Babka (SUSE) Acked-by: David Hildenbrand (Arm) Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: WANG Xuerui Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 ++- include/linux/mmzone.h | 19 +++++++++++++++++-- 2 files changed, 19 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index d7e53532a109..19619e5efeba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4479,7 +4479,8 @@ int vmemmap_populate_hugepages(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); int vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); -int vmemmap_populate_hvo(unsigned long start, unsigned long end, int node, +int vmemmap_populate_hvo(unsigned long start, unsigned long end, + unsigned int order, struct zone *zone, unsigned long headsize); void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node, unsigned long headsize); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0bef68e41f19..5c3ae0348754 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -81,13 +81,17 @@ * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit. */ -#define MAX_FOLIO_ORDER get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G) +#ifdef CONFIG_64BIT +#define MAX_FOLIO_ORDER (ilog2(SZ_16G) - PAGE_SHIFT) +#else +#define MAX_FOLIO_ORDER (ilog2(SZ_1G) - PAGE_SHIFT) +#endif #else /* * Without hugetlb, gigantic folios that are bigger than a single PUD are * currently impossible. */ -#define MAX_FOLIO_ORDER PUD_ORDER +#define MAX_FOLIO_ORDER (PUD_SHIFT - PAGE_SHIFT) #endif #define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER) @@ -103,6 +107,14 @@ is_power_of_2(sizeof(struct page)) ? \ MAX_FOLIO_NR_PAGES * sizeof(struct page) : 0) +/* + * vmemmap optimization (like HVO) is only possible for page orders that fill + * two or more pages with struct pages. + */ +#define VMEMMAP_TAIL_MIN_ORDER (ilog2(2 * PAGE_SIZE / sizeof(struct page))) +#define __NR_VMEMMAP_TAILS (MAX_FOLIO_ORDER - VMEMMAP_TAIL_MIN_ORDER + 1) +#define NR_VMEMMAP_TAILS (__NR_VMEMMAP_TAILS > 0 ? __NR_VMEMMAP_TAILS : 0) + enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, @@ -1113,6 +1125,9 @@ struct zone { /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP + struct page *vmemmap_tails[NR_VMEMMAP_TAILS]; +#endif } ____cacheline_internodealigned_in_smp; enum pgdat_flags { -- cgit v1.2.3 From 32c440d67e6cd96a715007d0e62eb970b0c49abc Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:51 +0000 Subject: mm: drop fake head checks With fake head pages eliminated in the previous commit, remove the supporting infrastructure: - page_fixed_fake_head(): no longer needed to detect fake heads; - page_is_fake_head(): no longer needed; - page_count_writable(): no longer needed for RCU protection; - RCU read_lock in page_ref_add_unless(): no longer needed; This substantially simplifies compound_head() and page_ref_add_unless(), removing both branches and RCU overhead from these hot paths. RCU was required to serialize allocation of hugetlb page against get_page_unless_zero() and prevent writing to read-only fake head. It is redundant without fake heads. See bd225530a4c7 ("mm/hugetlb_vmemmap: fix race with speculative PFN walkers") for more details. synchronize_rcu() in mm/hugetlb_vmemmap.c will be removed by a separate patch. Link: https://lkml.kernel.org/r/20260227194302.274384-14-kas@kernel.org Signed-off-by: Kiryl Shutsemau Reviewed-by: Muchun Song Acked-by: David Hildenbrand (Arm) Reviewed-by: Vlastimil Babka Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: WANG Xuerui Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 93 ++-------------------------------------------- include/linux/page_ref.h | 8 +--- 2 files changed, 4 insertions(+), 97 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 43876b108f0a..b8eef2181598 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -221,102 +221,15 @@ static __always_inline bool compound_info_has_mask(void) return is_power_of_2(sizeof(struct page)); } -#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); -/* - * Return the real head page struct iff the @page is a fake head page, otherwise - * return the @page itself. See Documentation/mm/vmemmap_dedup.rst. - */ -static __always_inline const struct page *page_fixed_fake_head(const struct page *page) -{ - /* Fake heads only exists if compound_info_has_mask() is true */ - if (!compound_info_has_mask()) - return page; - - if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key)) - return page; - - /* - * Only addresses aligned with PAGE_SIZE of struct page may be fake head - * struct page. The alignment check aims to avoid access the fields ( - * e.g. compound_info) of the @page[1]. It can avoid touch a (possibly) - * cold cacheline in some cases. - */ - if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) && - test_bit(PG_head, &page->flags.f)) { - /* - * We can safely access the field of the @page[1] with PG_head - * because the @page is a compound page composed with at least - * two contiguous pages. - */ - unsigned long info = READ_ONCE(page[1].compound_info); - - /* See set_compound_head() */ - if (likely(info & 1)) { - unsigned long p = (unsigned long)page; - - return (const struct page *)(p & info); - } - } - return page; -} - -static __always_inline bool page_count_writable(const struct page *page, int u) -{ - if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key)) - return true; - - /* - * The refcount check is ordered before the fake-head check to prevent - * the following race: - * CPU 1 (HVO) CPU 2 (speculative PFN walker) - * - * page_ref_freeze() - * synchronize_rcu() - * rcu_read_lock() - * page_is_fake_head() is false - * vmemmap_remap_pte() - * XXX: struct page[] becomes r/o - * - * page_ref_unfreeze() - * page_ref_count() is not zero - * - * atomic_add_unless(&page->_refcount) - * XXX: try to modify r/o struct page[] - * - * The refcount check also prevents modification attempts to other (r/o) - * tail pages that are not fake heads. - */ - if (atomic_read_acquire(&page->_refcount) == u) - return false; - - return page_fixed_fake_head(page) == page; -} -#else -static inline const struct page *page_fixed_fake_head(const struct page *page) -{ - return page; -} - -static inline bool page_count_writable(const struct page *page, int u) -{ - return true; -} -#endif - -static __always_inline int page_is_fake_head(const struct page *page) -{ - return page_fixed_fake_head(page) != page; -} - static __always_inline unsigned long _compound_head(const struct page *page) { unsigned long info = READ_ONCE(page->compound_info); /* Bit 0 encodes PageTail() */ if (!(info & 1)) - return (unsigned long)page_fixed_fake_head(page); + return (unsigned long)page; /* * If compound_info_has_mask() is false, the rest of compound_info is @@ -396,7 +309,7 @@ static __always_inline void clear_compound_head(struct page *page) static __always_inline int PageTail(const struct page *page) { - return READ_ONCE(page->compound_info) & 1 || page_is_fake_head(page); + return READ_ONCE(page->compound_info) & 1; } static __always_inline int PageCompound(const struct page *page) @@ -928,7 +841,7 @@ static __always_inline bool folio_test_head(const struct folio *folio) static __always_inline int PageHead(const struct page *page) { PF_POISONED_CHECK(page); - return test_bit(PG_head, &page->flags.f) && !page_is_fake_head(page); + return test_bit(PG_head, &page->flags.f); } __SETPAGEFLAG(Head, head, PF_ANY) diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 544150d1d5fd..490d0ad6e56d 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -230,13 +230,7 @@ static inline int folio_ref_dec_return(struct folio *folio) static inline bool page_ref_add_unless(struct page *page, int nr, int u) { - bool ret = false; - - rcu_read_lock(); - /* avoid writing to the vmemmap area being remapped */ - if (page_count_writable(page, u)) - ret = atomic_add_unless(&page->_refcount, nr, u); - rcu_read_unlock(); + bool ret = atomic_add_unless(&page->_refcount, nr, u); if (page_ref_tracepoint_active(page_ref_mod_unless)) __page_ref_mod_unless(page, nr, ret); -- cgit v1.2.3 From da3e2d1ca43de56a83a806237b6be7e91cf07052 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:53 +0000 Subject: mm/hugetlb: remove hugetlb_optimize_vmemmap_key static key The hugetlb_optimize_vmemmap_key static key was used to guard fake head detection in compound_head() and related functions. It allowed skipping the fake head checks entirely when HVO was not in use. With fake heads eliminated and the detection code removed, the static key serves no purpose. Remove its definition and all increment/decrement calls. Link: https://lkml.kernel.org/r/20260227194302.274384-16-kas@kernel.org Signed-off-by: Kiryl Shutsemau Reviewed-by: Muchun Song Acked-by: David Hildenbrand (Arm) Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: Vlastimil Babka Cc: WANG Xuerui Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index b8eef2181598..f361bd6c814c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -221,8 +221,6 @@ static __always_inline bool compound_info_has_mask(void) return is_power_of_2(sizeof(struct page)); } -DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); - static __always_inline unsigned long _compound_head(const struct page *page) { unsigned long info = READ_ONCE(page->compound_info); -- cgit v1.2.3 From 66b2a3d9ae460934fef5fd588077730f483e8c8c Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:54 +0000 Subject: mm: remove the branch from compound_head() The compound_head() function is a hot path. For example, the zap path calls it for every leaf page table entry. Rewrite the helper function in a branchless manner to eliminate the risk of CPU branch misprediction. Link: https://lkml.kernel.org/r/20260227194302.274384-17-kas@kernel.org Signed-off-by: Kiryl Shutsemau Reviewed-by: Muchun Song Reviewed-by: Zi Yan Acked-by: David Hildenbrand (Arm) Reviewed-by: Vlastimil Babka Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: WANG Xuerui Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index f361bd6c814c..7223f6f4e2b4 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -224,25 +224,32 @@ static __always_inline bool compound_info_has_mask(void) static __always_inline unsigned long _compound_head(const struct page *page) { unsigned long info = READ_ONCE(page->compound_info); + unsigned long mask; - /* Bit 0 encodes PageTail() */ - if (!(info & 1)) - return (unsigned long)page; + if (!compound_info_has_mask()) { + /* Bit 0 encodes PageTail() */ + if (info & 1) + return info - 1; - /* - * If compound_info_has_mask() is false, the rest of compound_info is - * the pointer to the head page. - */ - if (!compound_info_has_mask()) - return info - 1; + return (unsigned long)page; + } /* * If compound_info_has_mask() is true the rest of the info encodes * the mask that converts the address of the tail page to the head page. * * No need to clear bit 0 in the mask as 'page' always has it clear. + * + * Let's do it in a branchless manner. */ - return (unsigned long)page & info; + + /* Non-tail: -1UL, Tail: 0 */ + mask = (info & 1) - 1; + + /* Non-tail: -1UL, Tail: info */ + mask |= info; + + return (unsigned long)page & mask; } #define compound_head(page) ((typeof(page))_compound_head(page)) -- cgit v1.2.3 From 99573ef4ac30d4eae7a7937f0c9ea351991e3ccc Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 22:29:52 +0100 Subject: mm/pagewalk: drop FW_MIGRATION We removed the last user of FW_MIGRATION in commit 912aa825957f ("Revert "mm/ksm: convert break_ksm() from walk_page_range_vma() to folio_walk""). So let's remove FW_MIGRATION and assign FW_ZEROPAGE bit 0. Including leafops.h is no longer required. While at it, convert "expose_page" to "zeropage", as zeropages are now the only remaining use case for not exposing a page. Link: https://lkml.kernel.org/r/20260227212952.190691-1-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Cc: Lorenzo Stoakes Cc: "Liam R. Howlett" Cc: Vlastimil Babka Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/pagewalk.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index 88e18615dd72..b41d7265c01b 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -148,14 +148,8 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, typedef int __bitwise folio_walk_flags_t; -/* - * Walk migration entries as well. Careful: a large folio might get split - * concurrently. - */ -#define FW_MIGRATION ((__force folio_walk_flags_t)BIT(0)) - /* Walk shared zeropages (small + huge) as well. */ -#define FW_ZEROPAGE ((__force folio_walk_flags_t)BIT(1)) +#define FW_ZEROPAGE ((__force folio_walk_flags_t)BIT(0)) enum folio_walk_level { FW_LEVEL_PTE, -- cgit v1.2.3 From 3d56d7317b271a1a5030ebb135c58aedc4c0fd36 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 27 Feb 2026 04:03:00 +0000 Subject: mm: replace READ_ONCE() in pud_trans_unstable() Replace READ_ONCE() with the existing standard page table accessor for PUD aka pudp_get() in pud_trans_unstable(). This does not create any functional change for platforms that do not override pudp_get(), which still defaults to READ_ONCE(). Link: https://lkml.kernel.org/r/20260227040300.2091901-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: David Hildenbrand (Arm) Acked-by: SeongJae Park Reviewed-by: Lorenzo Stoakes Cc: Mike Rapoport Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 776993d4567b..d2767a4c027b 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -2004,7 +2004,7 @@ static inline int pud_trans_unstable(pud_t *pud) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) - pud_t pudval = READ_ONCE(*pud); + pud_t pudval = pudp_get(pud); if (pud_none(pudval) || pud_trans_huge(pudval)) return 1; -- cgit v1.2.3 From 28266ac94a50e585c267a79d9ef5c2803d4dcd7a Mon Sep 17 00:00:00 2001 From: Gladyshev Ilya Date: Sun, 1 Mar 2026 13:19:39 +0000 Subject: mm: make ref_unless functions unless_zero only There are no users of (folio/page)_ref_add_unless(page, nr, u) with u != 0 [1] and all current users are "internal" for page refcounting API. This allows us to safely drop this parameter and reduce function semantics to the "unless zero" cases only. If needed, these functions for the u!=0 cases can be trivially reintroduced later using the same atomic_add_unless operations as before. [1]: The last user was dropped in v5.18 kernel, commit 27674ef6c73f ("mm: remove the extra ZONE_DEVICE struct page refcount"). There is no trace of discussion as to why this cleanup wasn't done earlier. Link: https://lkml.kernel.org/r/a0c89b49d38c671a0bdd35069d15ee13e08314d2.1772370066.git.gladyshev.ilya1@h-partners.com Co-developed-by: Gorbunov Ivan Signed-off-by: Gorbunov Ivan Signed-off-by: Gladyshev Ilya Acked-by: David Hildenbrand (Arm) Acked-by: Kiryl Shutsemau Acked-by: Zi Yan Reviewed-by: Lorenzo Stoakes Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Will Deacon Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- include/linux/page_ref.h | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 19619e5efeba..08b743aab92a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1506,7 +1506,7 @@ static inline int folio_put_testzero(struct folio *folio) */ static inline bool get_page_unless_zero(struct page *page) { - return page_ref_add_unless(page, 1, 0); + return page_ref_add_unless_zero(page, 1); } static inline struct folio *folio_get_nontail_page(struct page *page) diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 490d0ad6e56d..94d3f0e71c06 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -228,18 +228,18 @@ static inline int folio_ref_dec_return(struct folio *folio) return page_ref_dec_return(&folio->page); } -static inline bool page_ref_add_unless(struct page *page, int nr, int u) +static inline bool page_ref_add_unless_zero(struct page *page, int nr) { - bool ret = atomic_add_unless(&page->_refcount, nr, u); + bool ret = atomic_add_unless(&page->_refcount, nr, 0); if (page_ref_tracepoint_active(page_ref_mod_unless)) __page_ref_mod_unless(page, nr, ret); return ret; } -static inline bool folio_ref_add_unless(struct folio *folio, int nr, int u) +static inline bool folio_ref_add_unless_zero(struct folio *folio, int nr) { - return page_ref_add_unless(&folio->page, nr, u); + return page_ref_add_unless_zero(&folio->page, nr); } /** @@ -255,12 +255,12 @@ static inline bool folio_ref_add_unless(struct folio *folio, int nr, int u) */ static inline bool folio_try_get(struct folio *folio) { - return folio_ref_add_unless(folio, 1, 0); + return folio_ref_add_unless_zero(folio, 1); } static inline bool folio_ref_try_add(struct folio *folio, int count) { - return folio_ref_add_unless(folio, count, 0); + return folio_ref_add_unless_zero(folio, count); } static inline int page_ref_freeze(struct page *page, int count) -- cgit v1.2.3 From de008c9ba5684f14e83bcf86cd45fb0e4e6c4d82 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:33 +0100 Subject: mm/memory: remove "zap_details" parameter from zap_page_range_single() Nobody except memory.c should really set that parameter to non-NULL. So let's just drop it and make unmap_mapping_range_vma() use zap_page_range_single_batched() instead. [david@kernel.org: format on a single line] Link: https://lkml.kernel.org/r/8a27e9ac-2025-4724-a46d-0a7c90894ba7@kernel.org Link: https://lkml.kernel.org/r/20260227200848.114019-3-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: Puranjay Mohan Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 08b743aab92a..6512d70c5852 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2804,11 +2804,10 @@ struct page *vm_normal_page_pud(struct vm_area_struct *vma, unsigned long addr, void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, - unsigned long size, struct zap_details *details); + unsigned long size); static inline void zap_vma_pages(struct vm_area_struct *vma) { - zap_page_range_single(vma, vma->vm_start, - vma->vm_end - vma->vm_start, NULL); + zap_page_range_single(vma, vma->vm_start, vma->vm_end - vma->vm_start); } struct mmu_notifier_range; -- cgit v1.2.3 From 599a59e6037838ea7cd6264d7980ea63de244994 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:35 +0100 Subject: mm/memory: simplify calculation in unmap_mapping_range_tree() Let's simplify the calculation a bit further to make it easier to get, reusing vma_last_pgoff() which we move from interval_tree.c to mm.h. Link: https://lkml.kernel.org/r/20260227200848.114019-5-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6512d70c5852..771d021b7948 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3969,6 +3969,11 @@ static inline unsigned long vma_pages(const struct vm_area_struct *vma) return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } +static inline unsigned long vma_last_pgoff(struct vm_area_struct *vma) +{ + return vma->vm_pgoff + vma_pages(vma) - 1; +} + static inline unsigned long vma_desc_size(const struct vm_area_desc *desc) { return desc->end - desc->start; -- cgit v1.2.3 From a97bc13d15f472c7f8ede1b38660fb55b6dab68d Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:40 +0100 Subject: mm/memory: convert details->even_cows into details->skip_cows The current semantics are confusing: simply because someone specifies an empty zap_detail struct suddenly makes should_zap_cows() behave differently. The default should be to also zap CoW'ed anonymous pages. Really only unmap_mapping_pages() and friends want to skip zapping of these anon folios. So let's invert the meaning; turn the confusing "reclaim_pt" check that overrides other properties in should_zap_cows() into a safety check. Note that the only caller that sets reclaim_pt=true is madvise_dontneed_single_vma(), which wants to zap any pages. Link: https://lkml.kernel.org/r/20260227200848.114019-10-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 771d021b7948..cb4f5fbccaf0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2767,7 +2767,7 @@ extern void pagefault_out_of_memory(void); */ struct zap_details { struct folio *single_folio; /* Locked folio to be unmapped */ - bool even_cows; /* Zap COWed private pages too? */ + bool skip_cows; /* Do not zap COWed private pages */ bool reclaim_pt; /* Need reclaim page tables? */ zap_flags_t zap_flags; /* Extra flags for zapping */ }; -- cgit v1.2.3 From 5f10cbbddc2bd80a5944f1c783830f7ebf648ad2 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:41 +0100 Subject: mm/memory: use __zap_vma_range() in zap_vma_for_reaping() Let's call __zap_vma_range() instead of unmap_page_range() to prepare for further cleanups. To keep the existing behavior, whereby we do not call uprobe_munmap() which could block, add a new "reaping" member to zap_details and use it. Likely we should handle the possible blocking in uprobe_munmap() differently, but for now keep it unchanged. Link: https://lkml.kernel.org/r/20260227200848.114019-11-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index cb4f5fbccaf0..488a144c9161 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2769,6 +2769,7 @@ struct zap_details { struct folio *single_folio; /* Locked folio to be unmapped */ bool skip_cows; /* Do not zap COWed private pages */ bool reclaim_pt; /* Need reclaim page tables? */ + bool reaping; /* Reaping, do not block. */ zap_flags_t zap_flags; /* Extra flags for zapping */ }; -- cgit v1.2.3 From 32bc7fe4a6f4d359b6de96cbc106d2cac695154e Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:43 +0100 Subject: mm: rename zap_vma_pages() to zap_vma() Let's rename it to an even simpler name. While at it, add some simplistic kernel doc. Link: https://lkml.kernel.org/r/20260227200848.114019-13-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 488a144c9161..60c13d40c65c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2806,7 +2806,11 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, unsigned long size); -static inline void zap_vma_pages(struct vm_area_struct *vma) +/** + * zap_vma - zap all page table entries in a vma + * @vma: The vma to zap. + */ +static inline void zap_vma(struct vm_area_struct *vma) { zap_page_range_single(vma, vma->vm_start, vma->vm_end - vma->vm_start); } -- cgit v1.2.3 From 0326440c3545c86b6501c7c636fcf018d6e87b8c Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:45 +0100 Subject: mm: rename zap_page_range_single() to zap_vma_range() Let's rename it to make it better match our new naming scheme. While at it, polish the kerneldoc. [akpm@linux-foundation.org: fix rustfmtcheck] Link: https://lkml.kernel.org/r/20260227200848.114019-15-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: Puranjay Mohan Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 60c13d40c65c..10a5b9ba4eeb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2804,7 +2804,7 @@ struct page *vm_normal_page_pud(struct vm_area_struct *vma, unsigned long addr, void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); -void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, +void zap_vma_range(struct vm_area_struct *vma, unsigned long address, unsigned long size); /** * zap_vma - zap all page table entries in a vma @@ -2812,7 +2812,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, */ static inline void zap_vma(struct vm_area_struct *vma) { - zap_page_range_single(vma, vma->vm_start, vma->vm_end - vma->vm_start); + zap_vma_range(vma, vma->vm_start, vma->vm_end - vma->vm_start); } struct mmu_notifier_range; -- cgit v1.2.3 From 52a9e9cd181fab8b03cf4e982533224697669976 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:46 +0100 Subject: mm: rename zap_vma_ptes() to zap_special_vma_range() zap_vma_ptes() is the only zapping function we export to modules. It's essentially a wrapper around zap_vma_range(), however, with some safety checks: * That the passed range fits fully into the VMA * That it's only used for VM_PFNMAP We will add support for VM_MIXEDMAP next, so use the more-generic term "special vma", although "special" is a bit overloaded. Maybe we'll later just support any VM_SPECIAL flag. While at it, improve the kerneldoc. Link: https://lkml.kernel.org/r/20260227200848.114019-16-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Acked-by: Leon Romanovsky [drivers/infiniband] Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 10a5b9ba4eeb..c516d5177211 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2802,7 +2802,7 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, struct page *vm_normal_page_pud(struct vm_area_struct *vma, unsigned long addr, pud_t pud); -void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, +void zap_special_vma_range(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_vma_range(struct vm_area_struct *vma, unsigned long address, unsigned long size); -- cgit v1.2.3 From 5a970006786a3b10577e762a9a6c0b9353b4e8a4 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 6 Mar 2026 14:43:37 +0800 Subject: mm: use inline helper functions instead of ugly macros Patch series "support batched checking of the young flag for MGLRU", v3. This is a follow-up to the previous work [1], to support batched checking of the young flag for MGLRU. Similarly, batched checking of young flag for large folios can improve performance during large-folio reclamation when MGLRU is enabled. I observed noticeable performance improvements (see patch 5) on an Arm64 machine that supports contiguous PTEs. All mm-selftests are passed. Patch 1 - 3: cleanup patches. Patch 4: add a new generic batched PTE helper: test_and_clear_young_ptes(). Patch 5: support batched young flag checking for MGLRU. Patch 6: implement the Arm64 arch-specific test_and_clear_young_ptes(). This patch (of 6): People have already complained that these *_clear_young_notify() related macros are very ugly, so let's use inline helpers to make them more readable. In addition, we cannot implement these inline helper functions in the mmu_notifier.h file, because some arch-specific files will include the mmu_notifier.h, which introduces header compilation dependencies and causes build errors (e.g., arch/arm64/include/asm/tlbflush.h). Moreover, since these functions are only used in the mm, implementing these inline helpers in the mm/internal.h header seems reasonable. Link: https://lkml.kernel.org/r/cover.1772778858.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/ea14af84e7967ccebb25082c28a8669d6da8fe57.1772778858.git.baolin.wang@linux.alibaba.com Link: https://lore.kernel.org/all/cover.1770645603.git.baolin.wang@linux.alibaba.com/ [1] Signed-off-by: Baolin Wang Reviewed-by: Rik van Riel Reviewed-by: Barry Song Acked-by: David Hildenbrand (Arm) Cc: Axel Rasmussen Cc: Catalin Marinas Cc: Dev Jain Cc: Jann Horn Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Qi Zheng Cc: Ryan Roberts Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Will Deacon Cc: Yuanchu Xie Cc: Alistair Popple Signed-off-by: Andrew Morton --- include/linux/mmu_notifier.h | 54 -------------------------------------------- 1 file changed, 54 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 8450e18a87c2..3705d350c863 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -516,55 +516,6 @@ static inline void mmu_notifier_range_init_owner( range->owner = owner; } -#define clear_flush_young_ptes_notify(__vma, __address, __ptep, __nr) \ -({ \ - int __young; \ - struct vm_area_struct *___vma = __vma; \ - unsigned long ___address = __address; \ - unsigned int ___nr = __nr; \ - __young = clear_flush_young_ptes(___vma, ___address, __ptep, ___nr); \ - __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ - ___address, \ - ___address + \ - ___nr * PAGE_SIZE); \ - __young; \ -}) - -#define pmdp_clear_flush_young_notify(__vma, __address, __pmdp) \ -({ \ - int __young; \ - struct vm_area_struct *___vma = __vma; \ - unsigned long ___address = __address; \ - __young = pmdp_clear_flush_young(___vma, ___address, __pmdp); \ - __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ - ___address, \ - ___address + \ - PMD_SIZE); \ - __young; \ -}) - -#define ptep_clear_young_notify(__vma, __address, __ptep) \ -({ \ - int __young; \ - struct vm_area_struct *___vma = __vma; \ - unsigned long ___address = __address; \ - __young = ptep_test_and_clear_young(___vma, ___address, __ptep);\ - __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \ - ___address + PAGE_SIZE); \ - __young; \ -}) - -#define pmdp_clear_young_notify(__vma, __address, __pmdp) \ -({ \ - int __young; \ - struct vm_area_struct *___vma = __vma; \ - unsigned long ___address = __address; \ - __young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\ - __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \ - ___address + PMD_SIZE); \ - __young; \ -}) - #else /* CONFIG_MMU_NOTIFIER */ struct mmu_notifier_range { @@ -652,11 +603,6 @@ static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) #define mmu_notifier_range_update_to_read_only(r) false -#define clear_flush_young_ptes_notify clear_flush_young_ptes -#define pmdp_clear_flush_young_notify pmdp_clear_flush_young -#define ptep_clear_young_notify ptep_test_and_clear_young -#define pmdp_clear_young_notify pmdp_test_and_clear_young - static inline void mmu_notifier_synchronize(void) { } -- cgit v1.2.3 From 6d7237dda44f24bb0dec5dbd2a0ed6be77bf6ef6 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 6 Mar 2026 14:43:40 +0800 Subject: mm: add a batched helper to clear the young flag for large folios Currently, MGLRU will call ptep_test_and_clear_young_notify() to check and clear the young flag for each PTE sequentially, which is inefficient for large folios reclamation. Moreover, on Arm64 architecture, which supports contiguous PTEs, the Arm64- specific ptep_test_and_clear_young() already implements an optimization to clear the young flags for PTEs within a contiguous range. However, this is not sufficient. Similar to the Arm64 specific clear_flush_young_ptes(), we can extend this to perform batched operations for the entire large folio (which might exceed the contiguous range: CONT_PTE_SIZE). Thus, we can introduce a new batched helper: test_and_clear_young_ptes() and its wrapper test_and_clear_young_ptes_notify() which are consistent with the existing functions, to perform batched checking of the young flags for large folios, which can help improve performance during large folio reclamation when MGLRU is enabled. And it will be overridden by the architecture that implements a more efficient batch operation in the following patches. Link: https://lkml.kernel.org/r/23ec671bfcc06cd24ee0fbff8e329402742274a0.1772778858.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Andrew Morton Cc: Alistair Popple Cc: Axel Rasmussen Cc: Barry Song Cc: Catalin Marinas Cc: David Hildenbrand (Arm) Cc: Dev Jain Cc: Jann Horn Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Qi Zheng Cc: Rik van Riel Cc: Ryan Roberts Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Will Deacon Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index d2767a4c027b..17d961c612fc 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1103,6 +1103,43 @@ static inline int clear_flush_young_ptes(struct vm_area_struct *vma, } #endif +#ifndef test_and_clear_young_ptes +/** + * test_and_clear_young_ptes - Mark PTEs that map consecutive pages of the same + * folio as old + * @vma: The virtual memory area the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear access bit. + * + * May be overridden by the architecture; otherwise, implemented as a simple + * loop over ptep_test_and_clear_young(). + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + * + * Returns: whether any PTE was young. + */ +static inline int test_and_clear_young_ptes(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr) +{ + int young = 0; + + for (;;) { + young |= ptep_test_and_clear_young(vma, addr, ptep); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + } + + return young; +} +#endif + /* * On some architectures hardware does not set page access bit when accessing * memory page, it is responsibility of software setting this bit. It brings -- cgit v1.2.3 From 56e5b60b2114dee967c971f08dd29ef193bd3a2d Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 6 Mar 2026 14:43:41 +0800 Subject: mm: support batched checking of the young flag for MGLRU Use the batched helper test_and_clear_young_ptes_notify() to check and clear the young flag to improve the performance during large folio reclamation when MGLRU is enabled. Meanwhile, we can also support batched checking the young and dirty flag when MGLRU walks the mm's pagetable to update the folios' generation counter. Since MGLRU also checks the PTE dirty bit, use folio_pte_batch_flags() with FPB_MERGE_YOUNG_DIRTY set to detect batches of PTEs for a large folio. Then we can remove the ptep_test_and_clear_young_notify() since it has no users now. Note that we also update the 'young' counter and 'mm_stats[MM_LEAF_YOUNG]' counter with the batched count in the lru_gen_look_around() and walk_pte_range(). However, the batched operations may inflate these two counters, because in a large folio not all PTEs may have been accessed. (Additionally, tracking how many PTEs have been accessed within a large folio is not very meaningful, since the mm core actually tracks access/dirty on a per-folio basis, not per page). The impact analysis is as follows: 1. The 'mm_stats[MM_LEAF_YOUNG]' counter has no functional impact and is mainly for debugging. 2. The 'young' counter is used to decide whether to place the current PMD entry into the bloom filters by suitable_to_scan() (so that next time we can check whether it has been accessed again), which may set the hash bit in the bloom filters for a PMD entry that hasn't seen much access. However, bloom filters inherently allow some error, so this effect appears negligible. Link: https://lkml.kernel.org/r/378f4acf7d07410aa7c2e4b49d56bb165918eb34.1772778858.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Rik van Riel Acked-by: David Hildenbrand (Arm) Cc: Alistair Popple Cc: Axel Rasmussen Cc: Barry Song Cc: Catalin Marinas Cc: Dev Jain Cc: Jann Horn Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Qi Zheng Cc: Ryan Roberts Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Will Deacon Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5c3ae0348754..3f651baf7e2b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -684,7 +684,7 @@ struct lru_gen_memcg { void lru_gen_init_pgdat(struct pglist_data *pgdat); void lru_gen_init_lruvec(struct lruvec *lruvec); -bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw); +bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr); void lru_gen_init_memcg(struct mem_cgroup *memcg); void lru_gen_exit_memcg(struct mem_cgroup *memcg); @@ -703,7 +703,8 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec) { } -static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) +static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, + unsigned int nr) { return false; } -- cgit v1.2.3 From 417607de1f4e6280f646aa42cad5ed84e9228c01 Mon Sep 17 00:00:00 2001 From: Yuvraj Sakshith Date: Tue, 3 Mar 2026 03:30:28 -0800 Subject: mm/page_reporting: add PAGE_REPORTING_ORDER_UNSPECIFIED MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Allow order zero pages in page reporting", v4. Today, page reporting sets page_reporting_order in two ways: (1) page_reporting.page_reporting_order cmdline parameter (2) Driver can pass order while registering itself. In both cases, order zero is ignored by free page reporting because it is used to set page_reporting_order to a default value, like MAX_PAGE_ORDER. In some cases we might want page_reporting_order to be zero. For instance, when virtio-balloon runs inside a guest with tiny memory (say, 16MB), it might not be able to find a order 1 page (or in the worst case order MAX_PAGE_ORDER page) after some uptime. Page reporting should be able to return order zero pages back for optimal memory relinquishment. This patch changes the default fallback value from '0' to '-1' in all possible clients of free page reporting (hv_balloon and virtio-balloon) together with allowing '0' as a valid order in page_reporting_register(). This patch (of 5): Drivers can pass order of pages to be reported while registering itself. Today, this is a magic number, 0. Label this with PAGE_REPORTING_ORDER_UNSPECIFIED and check for it when the driver is being registered. This macro will be used in relevant drivers next. [akpm@linux-foundation.org: tweak whitespace, per David] Link: https://lkml.kernel.org/r/20260303113032.3008371-1-yuvraj.sakshith@oss.qualcomm.com Link: https://lkml.kernel.org/r/20260303113032.3008371-2-yuvraj.sakshith@oss.qualcomm.com Signed-off-by: Yuvraj Sakshith Acked-by: David Hildenbrand (Arm) Reviewed-by: Michael Kelley Acked-by: Michael S. Tsirkin Cc: Brendan Jackman Cc: Dexuan Cui Cc: Eugenio Pérez Cc: Haiyang Zhang Cc: Jason Wang Cc: Johannes Weiner Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Liu Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/page_reporting.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h index fe648dfa3a7c..d1886c657285 100644 --- a/include/linux/page_reporting.h +++ b/include/linux/page_reporting.h @@ -7,6 +7,7 @@ /* This value should always be a power of 2, see page_reporting_cycle() */ #define PAGE_REPORTING_CAPACITY 32 +#define PAGE_REPORTING_ORDER_UNSPECIFIED 0 struct page_reporting_dev_info { /* function that alters pages to make them "reported" */ -- cgit v1.2.3 From 5467c292d07ffcd55a7a66af2259855f49e1dd06 Mon Sep 17 00:00:00 2001 From: Yuvraj Sakshith Date: Tue, 3 Mar 2026 03:30:31 -0800 Subject: mm/page_reporting: change PAGE_REPORTING_ORDER_UNSPECIFIED to -1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PAGE_REPORTING_ORDER_UNSPECIFIED is now set to zero. This means, pages of order zero cannot be reported to a client/driver -- as zero is used to signal a fallback to MAX_PAGE_ORDER. Change PAGE_REPORTING_ORDER_UNSPECIFIED to (-1), so that zero can be used as a valid order with which pages can be reported. Link: https://lkml.kernel.org/r/20260303113032.3008371-5-yuvraj.sakshith@oss.qualcomm.com Signed-off-by: Yuvraj Sakshith Acked-by: David Hildenbrand (Arm) Reviewed-by: Michael Kelley Acked-by: Michael S. Tsirkin Cc: Brendan Jackman Cc: Dexuan Cui Cc: Eugenio Pérez Cc: Haiyang Zhang Cc: Jason Wang Cc: Johannes Weiner Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Liu Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/page_reporting.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h index d1886c657285..9d4ca5c218a0 100644 --- a/include/linux/page_reporting.h +++ b/include/linux/page_reporting.h @@ -7,7 +7,7 @@ /* This value should always be a power of 2, see page_reporting_cycle() */ #define PAGE_REPORTING_CAPACITY 32 -#define PAGE_REPORTING_ORDER_UNSPECIFIED 0 +#define PAGE_REPORTING_ORDER_UNSPECIFIED -1 struct page_reporting_dev_info { /* function that alters pages to make them "reported" */ -- cgit v1.2.3 From e650bb30ca532901da6def04c7d1de72ae59ea4e Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Thu, 5 Mar 2026 10:50:14 +0000 Subject: mm: rename VMA flag helpers to be more readable Patch series "mm: vma flag tweaks". The ongoing work around introducing non-system word VMA flags has introduced a number of helper functions and macros to make life easier when working with these flags and to make conversions from the legacy use of VM_xxx flags more straightforward. This series improves these to reduce confusion as to what they do and to improve consistency and readability. Firstly the series renames vma_flags_test() to vma_flags_test_any() to make it abundantly clear that this function tests whether any of the flags are set (as opposed to vma_flags_test_all()). It then renames vma_desc_test_flags() to vma_desc_test_any() for the same reason. Note that we drop the 'flags' suffix here, as vma_desc_test_any_flags() would be cumbersome and 'test' implies a flag test. Similarly, we rename vma_test_all_flags() to vma_test_all() for consistency. Next, we have a couple of instances (erofs, zonefs) where we are now testing for vma_desc_test_any(desc, VMA_SHARED_BIT) && vma_desc_test_any(desc, VMA_MAYWRITE_BIT). This is silly, so this series introduces vma_desc_test_all() so these callers can instead invoke vma_desc_test_all(desc, VMA_SHARED_BIT, VMA_MAYWRITE_BIT). We then observe that quite a few instances of vma_flags_test_any() and vma_desc_test_any() are in fact only testing against a single flag. Using the _any() variant here is just confusing - 'any' of single item reads strangely and is liable to cause confusion. So in these instances the series reintroduces vma_flags_test() and vma_desc_test() as helpers which test against a single flag. The fact that vma_flags_t is a struct and that vma_flag_t utilises sparse to avoid confusion with vm_flags_t makes it impossible for a user to misuse these helpers without it getting flagged somewhere. The series also updates __mk_vma_flags() and functions invoked by it to explicitly mark them always inline to match expectation and to be consistent with other VMA flag helpers. It also renames vma_flag_set() to vma_flags_set_flag() (a function only used by __mk_vma_flags()) to be consistent with other VMA flag helpers. Finally it updates the VMA tests for each of these changes, and introduces explicit tests for vma_flags_test() and vma_desc_test() to assert that they behave as expected. This patch (of 6): On reflection, it's confusing to have vma_flags_test() and vma_desc_test_flags() test whether any comma-separated VMA flag bit is set, while also having vma_flags_test_all() and vma_test_all_flags() separately test whether all flags are set. Firstly, rename vma_flags_test() to vma_flags_test_any() to eliminate this confusion. Secondly, since the VMA descriptor flag functions are becoming rather cumbersome, prefer vma_desc_test*() to vma_desc_test_flags*(), and also rename vma_desc_test_flags() to vma_desc_test_any(). Finally, rename vma_test_all_flags() to vma_test_all() to keep the VMA-specific helper consistent with the VMA descriptor naming convention and to help avoid confusion vs. vma_flags_test_all(). While we're here, also update whitespace to be consistent in helper functions. Link: https://lkml.kernel.org/r/cover.1772704455.git.ljs@kernel.org Link: https://lkml.kernel.org/r/0f9cb3c511c478344fac0b3b3b0300bb95be95e9.1772704455.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Suggested-by: Pedro Falcato Acked-by: David Hildenbrand (Arm) Reviewed-by: Pedro Falcato Cc: Arnd Bergmann Cc: Babu Moger Cc: Baolin Wang Cc: Chao Yu Cc: Chatre, Reinette Cc: Chunhai Guo Cc: Damien Le Maol Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Hongbo Li Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jeffle Xu Cc: Johannes Thumshirn Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naohiro Aota Cc: Oscar Salvador Cc: Sandeep Dhavale Cc: Suren Baghdasaryan Cc: Vishal Verma Cc: Vlastimil Babka Cc: Yue Hu Signed-off-by: Andrew Morton --- include/linux/dax.h | 4 ++-- include/linux/hugetlb_inline.h | 2 +- include/linux/mm.h | 48 ++++++++++++++++++++++-------------------- 3 files changed, 28 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index bf103f317cac..535019001577 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -69,7 +69,7 @@ static inline bool daxdev_mapping_supported(const struct vm_area_desc *desc, const struct inode *inode, struct dax_device *dax_dev) { - if (!vma_desc_test_flags(desc, VMA_SYNC_BIT)) + if (!vma_desc_test_any(desc, VMA_SYNC_BIT)) return true; if (!IS_DAX(inode)) return false; @@ -115,7 +115,7 @@ static inline bool daxdev_mapping_supported(const struct vm_area_desc *desc, const struct inode *inode, struct dax_device *dax_dev) { - return !vma_desc_test_flags(desc, VMA_SYNC_BIT); + return !vma_desc_test_any(desc, VMA_SYNC_BIT); } static inline size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h index 593f5d4e108b..84afc3c3e2e4 100644 --- a/include/linux/hugetlb_inline.h +++ b/include/linux/hugetlb_inline.h @@ -13,7 +13,7 @@ static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) static inline bool is_vma_hugetlb_flags(const vma_flags_t *flags) { - return vma_flags_test(flags, VMA_HUGETLB_BIT); + return vma_flags_test_any(flags, VMA_HUGETLB_BIT); } #else diff --git a/include/linux/mm.h b/include/linux/mm.h index c516d5177211..ee7671d6c5eb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1062,7 +1062,7 @@ static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits) (const vma_flag_t []){__VA_ARGS__}) /* Test each of to_test flags in flags, non-atomically. */ -static __always_inline bool vma_flags_test_mask(const vma_flags_t *flags, +static __always_inline bool vma_flags_test_any_mask(const vma_flags_t *flags, vma_flags_t to_test) { const unsigned long *bitmap = flags->__vma_flags; @@ -1074,10 +1074,10 @@ static __always_inline bool vma_flags_test_mask(const vma_flags_t *flags, /* * Test whether any specified VMA flag is set, e.g.: * - * if (vma_flags_test(flags, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... } + * if (vma_flags_test_any(flags, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... } */ -#define vma_flags_test(flags, ...) \ - vma_flags_test_mask(flags, mk_vma_flags(__VA_ARGS__)) +#define vma_flags_test_any(flags, ...) \ + vma_flags_test_any_mask(flags, mk_vma_flags(__VA_ARGS__)) /* Test that ALL of the to_test flags are set, non-atomically. */ static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags, @@ -1098,7 +1098,8 @@ static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags, vma_flags_test_all_mask(flags, mk_vma_flags(__VA_ARGS__)) /* Set each of the to_set flags in flags, non-atomically. */ -static __always_inline void vma_flags_set_mask(vma_flags_t *flags, vma_flags_t to_set) +static __always_inline void vma_flags_set_mask(vma_flags_t *flags, + vma_flags_t to_set) { unsigned long *bitmap = flags->__vma_flags; const unsigned long *bitmap_to_set = to_set.__vma_flags; @@ -1115,7 +1116,8 @@ static __always_inline void vma_flags_set_mask(vma_flags_t *flags, vma_flags_t t vma_flags_set_mask(flags, mk_vma_flags(__VA_ARGS__)) /* Clear all of the to-clear flags in flags, non-atomically. */ -static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, vma_flags_t to_clear) +static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, + vma_flags_t to_clear) { unsigned long *bitmap = flags->__vma_flags; const unsigned long *bitmap_to_clear = to_clear.__vma_flags; @@ -1137,8 +1139,8 @@ static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, vma_flags_t * Note: appropriate locks must be held, this function does not acquire them for * you. */ -static inline bool vma_test_all_flags_mask(const struct vm_area_struct *vma, - vma_flags_t flags) +static inline bool vma_test_all_mask(const struct vm_area_struct *vma, + vma_flags_t flags) { return vma_flags_test_all_mask(&vma->flags, flags); } @@ -1146,10 +1148,10 @@ static inline bool vma_test_all_flags_mask(const struct vm_area_struct *vma, /* * Helper macro for checking that ALL specified flags are set in a VMA, e.g.: * - * if (vma_test_all_flags(vma, VMA_READ_BIT, VMA_MAYREAD_BIT) { ... } + * if (vma_test_all(vma, VMA_READ_BIT, VMA_MAYREAD_BIT) { ... } */ -#define vma_test_all_flags(vma, ...) \ - vma_test_all_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) +#define vma_test_all(vma, ...) \ + vma_test_all_mask(vma, mk_vma_flags(__VA_ARGS__)) /* * Helper to set all VMA flags in a VMA. @@ -1158,7 +1160,7 @@ static inline bool vma_test_all_flags_mask(const struct vm_area_struct *vma, * you. */ static inline void vma_set_flags_mask(struct vm_area_struct *vma, - vma_flags_t flags) + vma_flags_t flags) { vma_flags_set_mask(&vma->flags, flags); } @@ -1176,25 +1178,25 @@ static inline void vma_set_flags_mask(struct vm_area_struct *vma, vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) /* Helper to test all VMA flags in a VMA descriptor. */ -static inline bool vma_desc_test_flags_mask(const struct vm_area_desc *desc, - vma_flags_t flags) +static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, + vma_flags_t flags) { - return vma_flags_test_mask(&desc->vma_flags, flags); + return vma_flags_test_any_mask(&desc->vma_flags, flags); } /* * Helper macro for testing VMA flags for an input pointer to a struct * vm_area_desc object describing a proposed VMA, e.g.: * - * if (vma_desc_test_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, + * if (vma_desc_test_any(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT)) { ... } */ -#define vma_desc_test_flags(desc, ...) \ - vma_desc_test_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) +#define vma_desc_test_any(desc, ...) \ + vma_desc_test_any_mask(desc, mk_vma_flags(__VA_ARGS__)) /* Helper to set all VMA flags in a VMA descriptor. */ static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, - vma_flags_t flags) + vma_flags_t flags) { vma_flags_set_mask(&desc->vma_flags, flags); } @@ -1211,7 +1213,7 @@ static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, /* Helper to clear all VMA flags in a VMA descriptor. */ static inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, - vma_flags_t flags) + vma_flags_t flags) { vma_flags_clear_mask(&desc->vma_flags, flags); } @@ -1936,8 +1938,8 @@ static inline bool vma_desc_is_cow_mapping(struct vm_area_desc *desc) { const vma_flags_t *flags = &desc->vma_flags; - return vma_flags_test(flags, VMA_MAYWRITE_BIT) && - !vma_flags_test(flags, VMA_SHARED_BIT); + return vma_flags_test_any(flags, VMA_MAYWRITE_BIT) && + !vma_flags_test_any(flags, VMA_SHARED_BIT); } #ifndef CONFIG_MMU @@ -1956,7 +1958,7 @@ static inline bool is_nommu_shared_mapping(vm_flags_t flags) static inline bool is_nommu_shared_vma_flags(const vma_flags_t *flags) { - return vma_flags_test(flags, VMA_MAYSHARE_BIT, VMA_MAYOVERLAY_BIT); + return vma_flags_test_any(flags, VMA_MAYSHARE_BIT, VMA_MAYOVERLAY_BIT); } #endif -- cgit v1.2.3 From 0b3ed2a495b5c10296d9371502d70ce4398f0c58 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Thu, 5 Mar 2026 10:50:15 +0000 Subject: mm: add vma_desc_test_all() and use it erofs and zonefs are using vma_desc_test_any() twice to check whether all of VMA_SHARED_BIT and VMA_MAYWRITE_BIT are set, this is silly, so add vma_desc_test_all() to test all flags and update erofs and zonefs to use it. While we're here, update the helper function comments to be more consistent. Also add the same to the VMA test headers. Link: https://lkml.kernel.org/r/568c8f8d6a84ff64014f997517cba7a629f7eed6.1772704455.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Vlastimil Babka (SUSE) Acked-by: David Hildenbrand (Arm) Reviewed-by: Pedro Falcato Cc: Arnd Bergmann Cc: Babu Moger Cc: Baolin Wang Cc: Chao Yu Cc: Chatre, Reinette Cc: Chunhai Guo Cc: Damien Le Maol Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Hongbo Li Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jeffle Xu Cc: Johannes Thumshirn Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naohiro Aota Cc: Oscar Salvador Cc: Sandeep Dhavale Cc: Suren Baghdasaryan Cc: Vishal Verma Cc: Yue Hu Signed-off-by: Andrew Morton --- include/linux/mm.h | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index ee7671d6c5eb..f964e4050583 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1177,7 +1177,7 @@ static inline void vma_set_flags_mask(struct vm_area_struct *vma, #define vma_set_flags(vma, ...) \ vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) -/* Helper to test all VMA flags in a VMA descriptor. */ +/* Helper to test any VMA flags in a VMA descriptor. */ static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, vma_flags_t flags) { @@ -1185,8 +1185,8 @@ static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, } /* - * Helper macro for testing VMA flags for an input pointer to a struct - * vm_area_desc object describing a proposed VMA, e.g.: + * Helper macro for testing whether any VMA flags are set in a VMA descriptor, + * e.g.: * * if (vma_desc_test_any(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT)) { ... } @@ -1194,6 +1194,22 @@ static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, #define vma_desc_test_any(desc, ...) \ vma_desc_test_any_mask(desc, mk_vma_flags(__VA_ARGS__)) +/* Helper to test all VMA flags in a VMA descriptor. */ +static inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc, + vma_flags_t flags) +{ + return vma_flags_test_all_mask(&desc->vma_flags, flags); +} + +/* + * Helper macro for testing whether ALL VMA flags are set in a VMA descriptor, + * e.g.: + * + * if (vma_desc_test_all(desc, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... } + */ +#define vma_desc_test_all(desc, ...) \ + vma_desc_test_all_mask(desc, mk_vma_flags(__VA_ARGS__)) + /* Helper to set all VMA flags in a VMA descriptor. */ static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, vma_flags_t flags) @@ -1206,7 +1222,7 @@ static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, * vm_area_desc object describing a proposed VMA, e.g.: * * vma_desc_set_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT, - * VMA_DONTDUMP_BIT); + * VMA_DONTDUMP_BIT); */ #define vma_desc_set_flags(desc, ...) \ vma_desc_set_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) -- cgit v1.2.3 From a5eee1128de526ba199bd4c7be39b849223e5001 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Thu, 5 Mar 2026 10:50:16 +0000 Subject: mm: always inline __mk_vma_flags() and invoked functions Be explicit about __mk_vma_flags() (which is used by the mk_vma_flags() macro) always being inline, as we rely on the compiler to evaluate the loop in this function and determine that it can replace the code with the an equivalent constant value, e.g. that: __mk_vma_flags(2, (const vma_flag_t []){ VMA_WRITE_BIT, VMA_EXEC_BIT }); Can be replaced with: (1UL << VMA_WRITE_BIT) | (1UL << VMA_EXEC_BIT) = (1UL << 1) | (1UL << 2) = 6 Most likely an 'inline' will suffice for this, but be explicit as we can be. Also update all of the functions __mk_vma_flags() ultimately invokes to be always inline too. Note that test_bitmap_const_eval() asserts that the relevant bitmap functions result in build time constant values. Additionally, vma_flag_set() operates on a vma_flags_t type, so it is inconsistently named versus other VMA flags functions. We only use vma_flag_set() in __mk_vma_flags() so we don't need to worry about its new name being rather cumbersome, so rename it to vma_flags_set_flag() to disambiguate it from vma_flags_set(). Also update the VMA test headers to reflect the changes. Link: https://lkml.kernel.org/r/241f49c52074d436edbb9c6a6662a8dc142a8f43.1772704455.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Reviewed-by: Pedro Falcato Cc: Arnd Bergmann Cc: Babu Moger Cc: Baolin Wang Cc: Chao Yu Cc: Chatre, Reinette Cc: Chunhai Guo Cc: Damien Le Maol Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Hongbo Li Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jeffle Xu Cc: Johannes Thumshirn Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naohiro Aota Cc: Oscar Salvador Cc: Sandeep Dhavale Cc: Suren Baghdasaryan Cc: Vishal Verma Cc: Vlastimil Babka Cc: Yue Hu Signed-off-by: Andrew Morton --- include/linux/mm.h | 8 +++++--- include/linux/mm_types.h | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f964e4050583..9dcdf13570fb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1030,21 +1030,23 @@ static inline bool vma_test_atomic_flag(struct vm_area_struct *vma, vma_flag_t b } /* Set an individual VMA flag in flags, non-atomically. */ -static inline void vma_flag_set(vma_flags_t *flags, vma_flag_t bit) +static __always_inline void vma_flags_set_flag(vma_flags_t *flags, + vma_flag_t bit) { unsigned long *bitmap = flags->__vma_flags; __set_bit((__force int)bit, bitmap); } -static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits) +static __always_inline vma_flags_t __mk_vma_flags(size_t count, + const vma_flag_t *bits) { vma_flags_t flags; int i; vma_flags_clear_all(&flags); for (i = 0; i < count; i++) - vma_flag_set(&flags, bits[i]); + vma_flags_set_flag(&flags, bits[i]); return flags; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7bc82a2b889f..f22aecb047b7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1056,7 +1056,7 @@ struct vm_area_struct { } __randomize_layout; /* Clears all bits in the VMA flags bitmap, non-atomically. */ -static inline void vma_flags_clear_all(vma_flags_t *flags) +static __always_inline void vma_flags_clear_all(vma_flags_t *flags) { bitmap_zero(flags->__vma_flags, NUM_VMA_FLAG_BITS); } -- cgit v1.2.3 From 5e6d45d720ca299cc82d84948c4ba622fff64f22 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Thu, 5 Mar 2026 10:50:17 +0000 Subject: mm: reintroduce vma_flags_test() as a singular flag test Since we've now renamed vma_flags_test() to vma_flags_test_any() to be very clear as to what we are in fact testing, we now have the opportunity to bring vma_flags_test() back, but for explicitly testing a single VMA flag. This is useful, as often flag tests are against a single flag, and vma_flags_test_any(flags, VMA_READ_BIT) reads oddly and potentially causes confusion. We use sparse to enforce that users won't accidentally pass vm_flags_t to this function without it being flagged so this should make it harder to get this wrong. Of course, passing vma_flags_t to the function is impossible, as it is a struct. Also update the VMA tests to reflect this change. Link: https://lkml.kernel.org/r/f33f8d7f16c3f3d286a1dc2cba12c23683073134.1772704455.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Reviewed-by: Pedro Falcato Cc: Arnd Bergmann Cc: Babu Moger Cc: Baolin Wang Cc: Chao Yu Cc: Chatre, Reinette Cc: Chunhai Guo Cc: Damien Le Maol Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Hongbo Li Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jeffle Xu Cc: Johannes Thumshirn Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naohiro Aota Cc: Oscar Salvador Cc: Sandeep Dhavale Cc: Suren Baghdasaryan Cc: Vishal Verma Cc: Vlastimil Babka Cc: Yue Hu Signed-off-by: Andrew Morton --- include/linux/mm.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9dcdf13570fb..9392723a5c50 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1050,6 +1050,19 @@ static __always_inline vma_flags_t __mk_vma_flags(size_t count, return flags; } +/* + * Test whether a specific VMA flag is set, e.g.: + * + * if (vma_flags_test(flags, VMA_READ_BIT)) { ... } + */ +static __always_inline bool vma_flags_test(const vma_flags_t *flags, + vma_flag_t bit) +{ + const unsigned long *bitmap = flags->__vma_flags; + + return test_bit((__force int)bit, bitmap); +} + /* * Helper macro which bitwise-or combines the specified input flags into a * vma_flags_t bitmap value. E.g.: @@ -1956,8 +1969,8 @@ static inline bool vma_desc_is_cow_mapping(struct vm_area_desc *desc) { const vma_flags_t *flags = &desc->vma_flags; - return vma_flags_test_any(flags, VMA_MAYWRITE_BIT) && - !vma_flags_test_any(flags, VMA_SHARED_BIT); + return vma_flags_test(flags, VMA_MAYWRITE_BIT) && + !vma_flags_test(flags, VMA_SHARED_BIT); } #ifndef CONFIG_MMU -- cgit v1.2.3 From 0c2aa6635716a5aa19576deef062efab5322072f Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Thu, 5 Mar 2026 10:50:18 +0000 Subject: mm: reintroduce vma_desc_test() as a singular flag test Similar to vma_flags_test(), we have previously renamed vma_desc_test() to vma_desc_test_any(). Now that is in place, we can reintroduce vma_desc_test() to explicitly check for a single VMA flag. As with vma_flags_test(), this is useful as often flag tests are against a single flag, and vma_desc_test_any(flags, VMA_READ_BIT) reads oddly and potentially causes confusion. As with vma_flags_test() a combination of sparse and vma_flags_t being a struct means that users cannot misuse this function without it getting flagged. Also update the VMA tests to reflect this change. Link: https://lkml.kernel.org/r/3a65ca23defb05060333f0586428fe279a484564.1772704455.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Reviewed-by: Pedro Falcato Cc: Arnd Bergmann Cc: Babu Moger Cc: Baolin Wang Cc: Chao Yu Cc: Chatre, Reinette Cc: Chunhai Guo Cc: Damien Le Maol Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Hongbo Li Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jeffle Xu Cc: Johannes Thumshirn Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naohiro Aota Cc: Oscar Salvador Cc: Sandeep Dhavale Cc: Suren Baghdasaryan Cc: Vishal Verma Cc: Vlastimil Babka Cc: Yue Hu Signed-off-by: Andrew Morton --- include/linux/dax.h | 4 ++-- include/linux/mm.h | 11 +++++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 535019001577..10a7cc79aea5 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -69,7 +69,7 @@ static inline bool daxdev_mapping_supported(const struct vm_area_desc *desc, const struct inode *inode, struct dax_device *dax_dev) { - if (!vma_desc_test_any(desc, VMA_SYNC_BIT)) + if (!vma_desc_test(desc, VMA_SYNC_BIT)) return true; if (!IS_DAX(inode)) return false; @@ -115,7 +115,7 @@ static inline bool daxdev_mapping_supported(const struct vm_area_desc *desc, const struct inode *inode, struct dax_device *dax_dev) { - return !vma_desc_test_any(desc, VMA_SYNC_BIT); + return !vma_desc_test(desc, VMA_SYNC_BIT); } static inline size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) diff --git a/include/linux/mm.h b/include/linux/mm.h index 9392723a5c50..63d1f619260e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1192,6 +1192,17 @@ static inline void vma_set_flags_mask(struct vm_area_struct *vma, #define vma_set_flags(vma, ...) \ vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) +/* + * Test whether a specific VMA flag is set in a VMA descriptor, e.g.: + * + * if (vma_desc_test(desc, VMA_READ_BIT)) { ... } + */ +static __always_inline bool vma_desc_test(const struct vm_area_desc *desc, + vma_flag_t bit) +{ + return vma_flags_test(&desc->vma_flags, bit); +} + /* Helper to test any VMA flags in a VMA descriptor. */ static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, vma_flags_t flags) -- cgit v1.2.3 From db359fccf212e7fa3136e6edbed6228475646fd7 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Tue, 24 Feb 2026 14:13:47 +0900 Subject: mm: introduce a new page type for page pool in page type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the condition 'page->pp_magic == PP_SIGNATURE' is used to determine if a page belongs to a page pool. However, with the planned removal of @pp_magic, we should instead leverage the page_type in struct page, such as PGTY_netpp, for this purpose. Introduce and use the page type APIs e.g. PageNetpp(), __SetPageNetpp(), and __ClearPageNetpp() instead, and remove the existing APIs accessing @pp_magic e.g. page_pool_page_is_pp(), netmem_or_pp_magic(), and netmem_clear_pp_magic(). Plus, add @page_type to struct net_iov at the same offset as struct page so as to use the page_type APIs for struct net_iov as well. While at it, reorder @type and @owner in struct net_iov to avoid a hole and increasing the struct size. This work was inspired by the following link: https://lore.kernel.org/all/582f41c0-2742-4400-9c81-0d46bf4e8314@gmail.com/ While at it, move the sanity check for page pool to on the free path. [byungchul@sk.com: gate the sanity check, per Johannes] Link: https://lkml.kernel.org/r/20260316223113.20097-1-byungchul@sk.com Link: https://lkml.kernel.org/r/20260224051347.19621-1-byungchul@sk.com Co-developed-by: Pavel Begunkov Signed-off-by: Pavel Begunkov Signed-off-by: Byungchul Park Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Acked-by: Zi Yan Acked-by: Vlastimil Babka Reviewed-by: Toke Høiland-Jørgensen Acked-by: Mike Rapoport (Microsoft) Acked-by: Johannes Weiner Acked-by: Jakub Kicinski Acked-by: Jesper Dangaard Brouer Acked-by: Ilias Apalodimas Cc: Alexei Starovoitov Cc: Andrew Lunn Cc: Baolin Wang Cc: Brendan Jackman Cc: Christian Brauner Cc: Daniel Borkmann Cc: David S. Miller Cc: David Wei Cc: Dragos Tatulea Cc: Eric Dumazet Cc: John Fastabend Cc: Leon Romanovsky Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Mark Bloch Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mina Almasry Cc: Paolo Abeni Cc: Saeed Mahameed Cc: Simon Horman Cc: Stanislav Fomichev Cc: Stehen Rothwell Cc: Suren Baghdasaryan Cc: Taehee Yoo Cc: Tariq Toukan Cc: Usama Arif Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mm.h | 27 +++------------------------ include/linux/page-flags.h | 6 ++++++ 2 files changed, 9 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 63d1f619260e..c758f4e68727 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4840,10 +4840,9 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); * DMA mapping IDs for page_pool * * When DMA-mapping a page, page_pool allocates an ID (from an xarray) and - * stashes it in the upper bits of page->pp_magic. We always want to be able to - * unambiguously identify page pool pages (using page_pool_page_is_pp()). Non-PP - * pages can have arbitrary kernel pointers stored in the same field as pp_magic - * (since it overlaps with page->lru.next), so we must ensure that we cannot + * stashes it in the upper bits of page->pp_magic. Non-PP pages can have + * arbitrary kernel pointers stored in the same field as pp_magic (since + * it overlaps with page->lru.next), so we must ensure that we cannot * mistake a valid kernel pointer with any of the values we write into this * field. * @@ -4878,26 +4877,6 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); #define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \ PP_DMA_INDEX_SHIFT) -/* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is - * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for - * the head page of compound page and bit 1 for pfmemalloc page, as well as the - * bits used for the DMA index. page_is_pfmemalloc() is checked in - * __page_pool_put_page() to avoid recycling the pfmemalloc page. - */ -#define PP_MAGIC_MASK ~(PP_DMA_INDEX_MASK | 0x3UL) - -#ifdef CONFIG_PAGE_POOL -static inline bool page_pool_page_is_pp(const struct page *page) -{ - return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE; -} -#else -static inline bool page_pool_page_is_pp(const struct page *page) -{ - return false; -} -#endif - #define PAGE_SNAPSHOT_FAITHFUL (1 << 0) #define PAGE_SNAPSHOT_PG_BUDDY (1 << 1) #define PAGE_SNAPSHOT_PG_IDLE (1 << 2) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 7223f6f4e2b4..0e03d816e8b9 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -923,6 +923,7 @@ enum pagetype { PGTY_zsmalloc = 0xf6, PGTY_unaccepted = 0xf7, PGTY_large_kmalloc = 0xf8, + PGTY_netpp = 0xf9, PGTY_mapcount_underflow = 0xff }; @@ -1055,6 +1056,11 @@ PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc) PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted) PAGE_TYPE_OPS(LargeKmalloc, large_kmalloc, large_kmalloc) +/* + * Marks page_pool allocated pages. + */ +PAGE_TYPE_OPS(Netpp, netpp, netpp) + /** * PageHuge - Determine if the page belongs to hugetlbfs * @page: The page to test. -- cgit v1.2.3 From 3802e1d98e92ca6abdd25446b802f405fef83da0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 7 Mar 2026 11:53:52 -0800 Subject: mm/damon: document non-zero length damon_region assumption DAMON regions are assumed to always be non-zero length. There was a confusion [1] about it, probably due to lack of the documentation. Document it. Link: https://lkml.kernel.org/r/20260307195356.203753-5-sj@kernel.org Link: https://lore.kernel.org/20251231070029.79682-1-sj@kernel.org/ [1] Signed-off-by: SeongJae Park Acked-by: wang lian Cc: Brendan Higgins Cc: David Gow Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 60e6da3012fa..7d0265d02954 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -55,6 +55,8 @@ struct damon_size_range { * @list: List head for siblings. * @age: Age of this region. * + * For any use case, @ar should be non-zero positive size. + * * @nr_accesses is reset to zero for every &damon_attrs->aggr_interval and be * increased for every &damon_attrs->sample_interval if an access to the region * during the last sampling interval is found. The update of this field should -- cgit v1.2.3 From 341ffe82a7a3a1e0756b58999405b6df0c2b3e8d Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Mon, 9 Mar 2026 16:18:58 +0100 Subject: mm: move vma_kernel_pagesize() from hugetlb to mm.h Patch series "mm: move vma_(kernel|mmu)_pagesize() out of hugetlb.c", v2. Looking into vma_(kernel|mmu)_pagesize(), I realized that there is one scenario where DAX would not do the right thing when the kernel is not compiled with hugetlb support. Without hugetlb support, vma_(kernel|mmu)_pagesize() will always return PAGE_SIZE instead of using the ->pagesize() result provided by dax-device code. Fix that by moving vma_kernel_pagesize() to core MM code, where it belongs. I don't think this is stable material, but am not 100% sure. Also, move vma_mmu_pagesize() while at it. Remove the unnecessary hugetlb.h inclusion from KVM code. This patch (of 4): In the past, only hugetlb had special "vma_kernel_pagesize()" requirements, so it provided its own implementation. In commit 05ea88608d4e ("mm, hugetlbfs: introduce ->pagesize() to vm_operations_struct") we generalized that approach by providing a vm_ops->pagesize() callback to be used by device-dax. Once device-dax started using that callback in commit c1d53b92b95c ("device-dax: implement ->pagesize() for smaps to report MMUPageSize") it was missed that CONFIG_DEV_DAX does not depend on hugetlb support. So building a kernel with CONFIG_DEV_DAX but without CONFIG_HUGETLBFS would not pick up that value. Fix it by moving vma_kernel_pagesize() to mm.h, providing only a single implementation. While at it, improve the kerneldoc a bit. Ideally, we'd move vma_mmu_pagesize() as well to the header. However, its __weak symbol might be overwritten by a PPC variant in hugetlb code. So let's leave it in there for now, as it really only matters for some hugetlb oddities. This was found by code inspection. Link: https://lkml.kernel.org/r/20260309151901.123947-1-david@kernel.org Link: https://lkml.kernel.org/r/20260309151901.123947-2-david@kernel.org Fixes: c1d53b92b95c ("device-dax: implement ->pagesize() for smaps to report MMUPageSize") Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: Mike Rapoport (Microsoft) Cc: Dan Williams Cc: "Christophe Leroy (CS GROUP)" Cc: Jann Horn Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Paolo Bonzini Cc: Pedro Falcato Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 7 ------- include/linux/mm.h | 20 ++++++++++++++++++++ 2 files changed, 20 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 65910437be1c..44c1848a2c21 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -777,8 +777,6 @@ static inline unsigned long huge_page_size(const struct hstate *h) return (unsigned long)PAGE_SIZE << h->order; } -extern unsigned long vma_kernel_pagesize(struct vm_area_struct *vma); - extern unsigned long vma_mmu_pagesize(struct vm_area_struct *vma); static inline unsigned long huge_page_mask(struct hstate *h) @@ -1177,11 +1175,6 @@ static inline unsigned long huge_page_mask(struct hstate *h) return PAGE_MASK; } -static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) -{ - return PAGE_SIZE; -} - static inline unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) { return PAGE_SIZE; diff --git a/include/linux/mm.h b/include/linux/mm.h index c758f4e68727..e62cea754b0e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1351,6 +1351,26 @@ static inline bool vma_is_shared_maywrite(const struct vm_area_struct *vma) return is_shared_maywrite(&vma->flags); } +/** + * vma_kernel_pagesize - Default page size granularity for this VMA. + * @vma: The user mapping. + * + * The kernel page size specifies in which granularity VMA modifications + * can be performed. Folios in this VMA will be aligned to, and at least + * the size of the number of bytes returned by this function. + * + * The default kernel page size is not affected by Transparent Huge Pages + * being in effect. + * + * Return: The default page size granularity for this VMA. + */ +static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) +{ + if (unlikely(vma->vm_ops && vma->vm_ops->pagesize)) + return vma->vm_ops->pagesize(vma); + return PAGE_SIZE; +} + static inline struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) { -- cgit v1.2.3 From a9496e9e4b7c5785e82000a26b1118b4a1fd85c7 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Mon, 9 Mar 2026 16:18:59 +0100 Subject: mm: move vma_mmu_pagesize() from hugetlb to vma.c vma_mmu_pagesize() is also queried on non-hugetlb VMAs and does not really belong into hugetlb.c. PPC64 provides a custom overwrite with CONFIG_HUGETLB_PAGE, see arch/powerpc/mm/book3s64/slice.c, so we cannot easily make this a static inline function. So let's move it to vma.c and add some proper kerneldoc. To make vma tests happy, add a simple vma_kernel_pagesize() stub in tools/testing/vma/include/custom.h. Link: https://lkml.kernel.org/r/20260309151901.123947-3-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: Mike Rapoport (Microsoft) Cc: "Christophe Leroy (CS GROUP)" Cc: Dan Williams Cc: Jann Horn Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Paolo Bonzini Cc: Pedro Falcato Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 7 ------- include/linux/mm.h | 2 ++ 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 44c1848a2c21..aaf3d472e6b5 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -777,8 +777,6 @@ static inline unsigned long huge_page_size(const struct hstate *h) return (unsigned long)PAGE_SIZE << h->order; } -extern unsigned long vma_mmu_pagesize(struct vm_area_struct *vma); - static inline unsigned long huge_page_mask(struct hstate *h) { return h->mask; @@ -1175,11 +1173,6 @@ static inline unsigned long huge_page_mask(struct hstate *h) return PAGE_MASK; } -static inline unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) -{ - return PAGE_SIZE; -} - static inline unsigned int huge_page_order(struct hstate *h) { return 0; diff --git a/include/linux/mm.h b/include/linux/mm.h index e62cea754b0e..efb8be5d259c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1371,6 +1371,8 @@ static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) return PAGE_SIZE; } +unsigned long vma_mmu_pagesize(struct vm_area_struct *vma); + static inline struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) { -- cgit v1.2.3 From d239462787b072c78eb19fc1f155c3d411256282 Mon Sep 17 00:00:00 2001 From: Anthony Yznaga Date: Tue, 10 Mar 2026 08:58:20 -0700 Subject: mm: prevent droppable mappings from being locked Droppable mappings must not be lockable. There is a check for VMAs with VM_DROPPABLE set in mlock_fixup() along with checks for other types of unlockable VMAs which ensures this when calling mlock()/mlock2(). For mlockall(MCL_FUTURE), the check for unlockable VMAs is different. In apply_mlockall_flags(), if the flags parameter has MCL_FUTURE set, the current task's mm's default VMA flag field mm->def_flags has VM_LOCKED applied to it. VM_LOCKONFAULT is also applied if MCL_ONFAULT is also set. When these flags are set as default in this manner they are cleared in __mmap_complete() for new mappings that do not support mlock. A check for VM_DROPPABLE in __mmap_complete() is missing resulting in droppable mappings created with VM_LOCKED set. To fix this and reduce that chance of similar bugs in the future, introduce and use vma_supports_mlock(). Link: https://lkml.kernel.org/r/20260310155821.17869-1-anthony.yznaga@oracle.com Fixes: 9651fcedf7b9 ("mm: add MAP_DROPPABLE for designating always lazily freeable mappings") Signed-off-by: Anthony Yznaga Suggested-by: David Hildenbrand Acked-by: David Hildenbrand (Arm) Reviewed-by: Pedro Falcato Reviewed-by: Lorenzo Stoakes (Oracle) Tested-by: Lorenzo Stoakes (Oracle) Cc: Jann Horn Cc: Jason A. Donenfeld Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/hugetlb_inline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h index 84afc3c3e2e4..565b473fd135 100644 --- a/include/linux/hugetlb_inline.h +++ b/include/linux/hugetlb_inline.h @@ -30,7 +30,7 @@ static inline bool is_vma_hugetlb_flags(const vma_flags_t *flags) #endif -static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) +static inline bool is_vm_hugetlb_page(const struct vm_area_struct *vma) { return is_vm_hugetlb_flags(vma->vm_flags); } -- cgit v1.2.3 From 8719c59c4b928fc9ad8d8f45ecbdf859660c904c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 Mar 2026 18:05:17 -0700 Subject: mm/damon/core: introduce damos_quota_goal_tuner Patch series "mm/damon: support multiple goal-based quota tuning algorithms". Aim-oriented DAMOS quota auto-tuning uses a single tuning algorithm. The algorithm is designed to find a quota value that should be consistently kept for achieving the aimed goal for long term. It is useful and reliable at automatically operating systems that have dynamic environments in the long term. As always, however, no single algorithm fits all. When the environment has static characteristics or there are control towers in not only the kernel space but also the user space, the algorithm shows some limitations. In such environments, users want kernel work in a more short term deterministic way. Actually there were at least two reports [1,2] of such cases. Extend DAMOS quotas goal to support multiple quota tuning algorithms that users can select. Keep the current algorithm as the default one, to not break the old users. Also give it a name, "consist", as it is designed to "consistently" apply the DAMOS action. And introduce a new tuning algorithm, namely "temporal". It is designed to apply the DAMOS action only temporally, in a deterministic way. In more detail, as long as the goal is under-achieved, it uses the maximum quota available. Once the goal is over-achieved, it sets the quota zero. Tests ===== I confirmed the feature is working as expected using the latest version of DAMON user-space tool, like below. $ # start DAMOS for reclaiming memory aiming 30% free memory $ sudo ./damo/damo start --damos_action pageout \ --damos_quota_goal_tuner temporal \ --damos_quota_goal node_mem_free_bp 30% 0 \ --damos_quota_interval 1s \ --damos_quota_space 100M Note that >=3.1.8 version of DAMON user-space tool supports this feature (--damos_quota_goal_tuner). As expected, DAMOS stops reclaiming memory as soon as the goal amount of free memory is made. When 'consist' tuner is used, the reclamation was continued even after the goal amount of free memory is made, resulting in more than goal amount of free memory, as expected. Patch Sequence ============== First four patches implement the features. Patch 1 extends core API to allow multiple tuners and make the current tuner as the default and only available tuner, namely 'consist'. Patch 2 allows future tuners setting zero effective quota. Patch 3 introduces the second tuner, namely 'temporal'. Patch 4 further extends DAMON sysfs API to let users use that. Three following patches (patches 5-7) update design, usage, and ABI documents, respectively. Final four patches (patches 8-11) are for adding tests. The eighth patch (patch 8) extends the kunit test for online parameters commit for validating the goal_tuner. The ninth and the tenth patches (patches 9-10) extend the testing-purpose DAMON sysfs control helper and DAMON status dumping tool to support the newly added feature. The final eleventh one (patch 11) extends the existing online commit selftest to cover the new feature. This patch (of 11): DAMOS quota goal feature utilizes a single feedback loop based algorithm for automatic tuning of the effective quota. It is useful in dynamic environments that operate systems with only kernels in the long term. But, no one fits all. It is not very easy to control in environments having more controlled characteristics and user-space control towers. We actually got multiple reports [1,2] of use cases that the algorithm is not optimal. Introduce a new field of 'struct damos_quotas', namely 'goal_tuner'. It specifies what tuning algorithm the given scheme should use, and allows DAMON API callers to set it as they want. Nonetheless, this commit introduces no new tuning algorithm but only the interface. This commit hence makes no behavioral change. A new algorithm will be added by the following commit. Link: https://lkml.kernel.org/r/20260310010529.91162-2-sj@kernel.org Link: https://lore.kernel.org/CALa+Y17__d=ZsM1yX+MXx0ozVdsXnFqF4p0g+kATEitrWyZFfg@mail.gmail.com [1] Link: https://lore.kernel.org/20260204022537.814-1-yunjeong.mun@sk.com [2] Signed-off-by: SeongJae Park Cc: Shuah Khan Cc: Brendan Higgins Cc: David Gow Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/damon.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 7d0265d02954..24de35a8395a 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -215,12 +215,21 @@ struct damos_quota_goal { struct list_head list; }; +/** + * enum damos_quota_goal_tuner - Goal-based quota tuning logic. + * @DAMOS_QUOTA_GOAL_TUNER_CONSIST: Aim long term consistent quota. + */ +enum damos_quota_goal_tuner { + DAMOS_QUOTA_GOAL_TUNER_CONSIST, +}; + /** * struct damos_quota - Controls the aggressiveness of the given scheme. * @reset_interval: Charge reset interval in milliseconds. * @ms: Maximum milliseconds that the scheme can use. * @sz: Maximum bytes of memory that the action can be applied. * @goals: Head of quota tuning goals (&damos_quota_goal) list. + * @goal_tuner: Goal-based @esz tuning algorithm to use. * @esz: Effective size quota in bytes. * * @weight_sz: Weight of the region's size for prioritization. @@ -262,6 +271,7 @@ struct damos_quota { unsigned long ms; unsigned long sz; struct list_head goals; + enum damos_quota_goal_tuner goal_tuner; unsigned long esz; unsigned int weight_sz; -- cgit v1.2.3 From af738a6a00c1febb0d543ba6a1400413f824ecf1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 Mar 2026 18:05:19 -0700 Subject: mm/damon/core: introduce DAMOS_QUOTA_GOAL_TUNER_TEMPORAL Introduce a new goal-based DAMOS quota auto-tuning algorithm, namely DAMOS_QUOTA_GOAL_TUNER_TEMPORAL (temporal in short). The algorithm aims to trigger the DAMOS action only for a temporal time, to achieve the goal as soon as possible. For the temporal period, it uses as much quota as allowed. Once the goal is achieved, it sets the quota zero, so effectively makes the scheme be deactivated. Link: https://lkml.kernel.org/r/20260310010529.91162-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 24de35a8395a..e44e2132ccaf 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -218,9 +218,11 @@ struct damos_quota_goal { /** * enum damos_quota_goal_tuner - Goal-based quota tuning logic. * @DAMOS_QUOTA_GOAL_TUNER_CONSIST: Aim long term consistent quota. + * @DAMOS_QUOTA_GOAL_TUNER_TEMPORAL: Aim zero quota asap. */ enum damos_quota_goal_tuner { DAMOS_QUOTA_GOAL_TUNER_CONSIST, + DAMOS_QUOTA_GOAL_TUNER_TEMPORAL, }; /** -- cgit v1.2.3 From d4e981b280454f4368950db6269c6077d66453cf Mon Sep 17 00:00:00 2001 From: Kexin Sun Date: Thu, 12 Mar 2026 13:38:12 +0800 Subject: kasan: update outdated comment kmalloc_large() was renamed kmalloc_large_noprof() by commit 7bd230a26648 ("mm/slab: enable slab allocation tagging for kmalloc and friends"), and subsequently renamed __kmalloc_large_noprof() by commit a0a44d9175b3 ("mm, slab: don't wrap internal functions with alloc_hooks()"), making it an internal implementation detail. Large kmalloc allocations are now performed through the public kmalloc() interface directly, making the reference to KMALLOC_MAX_SIZE also stale (KMALLOC_MAX_CACHE_SIZE would be more accurate). Remove the references to kmalloc_large() and KMALLOC_MAX_SIZE, and rephrase the description for large kmalloc allocations. Link: https://lkml.kernel.org/r/20260312053812.1365-1-kexinsun@smail.nju.edu.cn Signed-off-by: Kexin Sun Suggested-by: Andrey Ryabinin Assisted-by: unnamed:deepseek-v3.2 coccinelle Reviewed-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Julia Lawall Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- include/linux/kasan.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 338a1921a50a..bf233bde68c7 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -352,8 +352,8 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip); * kasan_mempool_unpoison_object(). * * This function operates on all slab allocations including large kmalloc - * allocations (the ones returned by kmalloc_large() or by kmalloc() with the - * size > KMALLOC_MAX_SIZE). + * allocations (i.e. the ones backed directly by the buddy allocator rather + * than kmalloc slab caches). * * Return: true if the allocation can be safely reused; false otherwise. */ @@ -381,8 +381,8 @@ void __kasan_mempool_unpoison_object(void *ptr, size_t size, unsigned long ip); * original tags based on the pointer value. * * This function operates on all slab allocations including large kmalloc - * allocations (the ones returned by kmalloc_large() or by kmalloc() with the - * size > KMALLOC_MAX_SIZE). + * allocations (i.e. the ones backed directly by the buddy allocator rather + * than kmalloc slab caches). */ static __always_inline void kasan_mempool_unpoison_object(void *ptr, size_t size) -- cgit v1.2.3 From 2d1e54aab6fd01f7502af20e125312e06a15bf9c Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Wed, 11 Mar 2026 17:24:37 +0000 Subject: mm: abstract reading sysctl_max_map_count, and READ_ONCE() Concurrent reads and writes of sysctl_max_map_count are possible, so we should READ_ONCE() and WRITE_ONCE(). The sysctl procfs logic already enforces WRITE_ONCE(), so abstract the read side with get_sysctl_max_map_count(). While we're here, also move the field to mm/internal.h and add the getter there since only mm interacts with it, there's no need for anybody else to have access. Finally, update the VMA userland tests to reflect the change. Link: https://lkml.kernel.org/r/0715259eb37cbdfde4f9e5db92a20ec7110a1ce5.1773249037.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Pedro Falcato Cc: Jann Horn Cc: Jianzhou Zhao Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index efb8be5d259c..25ba5816e02b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -207,8 +207,6 @@ static inline void __mm_zero_struct_page(struct page *page) #define MAPCOUNT_ELF_CORE_MARGIN (5) #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) -extern int sysctl_max_map_count; - extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; -- cgit v1.2.3 From eabc2eddb2767e0ed90f98a65744bf4c8e287db7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Mar 2026 22:29:24 -0700 Subject: mm/damon/core: receive addr_unit on damon_set_region_biggest_system_ram_default() damon_find_biggest_system_ram() was not supporting addr_unit in the past. Hence, its caller, damon_set_region_biggest_system_ram_default(), was also not supporting addr_unit. The previous commit has updated the inner function to support addr_unit. There is no more reason to not support addr_unit on damon_set_region_biggest_system_ram_default(). Rather, it makes unnecessary inconsistency on support of addr_unit. Update it to receive addr_unit and handle it inside. Link: https://lkml.kernel.org/r/20260311052927.93921-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Yang yingliang Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index e44e2132ccaf..d9a3babbafc1 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -994,6 +994,7 @@ int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control); int damon_set_region_biggest_system_ram_default(struct damon_target *t, unsigned long *start, unsigned long *end, + unsigned long addr_unit, unsigned long min_region_sz); #endif /* CONFIG_DAMON */ -- cgit v1.2.3 From 0217c7fb4de4a40cee667eb21901f3204effe5ac Mon Sep 17 00:00:00 2001 From: Jianhui Zhou Date: Tue, 10 Mar 2026 19:05:26 +0800 Subject: mm/userfaultfd: fix hugetlb fault mutex hash calculation In mfill_atomic_hugetlb(), linear_page_index() is used to calculate the page index for hugetlb_fault_mutex_hash(). However, linear_page_index() returns the index in PAGE_SIZE units, while hugetlb_fault_mutex_hash() expects the index in huge page units. This mismatch means that different addresses within the same huge page can produce different hash values, leading to the use of different mutexes for the same huge page. This can cause races between faulting threads, which can corrupt the reservation map and trigger the BUG_ON in resv_map_release(). Fix this by introducing hugetlb_linear_page_index(), which returns the page index in huge page granularity, and using it in place of linear_page_index(). Link: https://lkml.kernel.org/r/20260310110526.335749-1-jianhuizzzzz@gmail.com Fixes: a08c7193e4f1 ("mm/filemap: remove hugetlb special casing in filemap.c") Signed-off-by: Jianhui Zhou Reported-by: syzbot+f525fd79634858f478e7@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=f525fd79634858f478e7 Acked-by: SeongJae Park Reviewed-by: David Hildenbrand (Arm) Acked-by: Mike Rapoport (Microsoft) Cc: Jane Chu Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: JonasZhou Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Xu Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index aaf3d472e6b5..9c098a02a09e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -792,6 +792,23 @@ static inline unsigned huge_page_shift(struct hstate *h) return h->order + PAGE_SHIFT; } +/** + * hugetlb_linear_page_index() - linear_page_index() but in hugetlb + * page size granularity. + * @vma: the hugetlb VMA + * @address: the virtual address within the VMA + * + * Return: the page offset within the mapping in huge page units. + */ +static inline pgoff_t hugetlb_linear_page_index(struct vm_area_struct *vma, + unsigned long address) +{ + struct hstate *h = hstate_vma(vma); + + return ((address - vma->vm_start) >> huge_page_shift(h)) + + (vma->vm_pgoff >> huge_page_order(h)); +} + static inline bool order_is_gigantic(unsigned int order) { return order > MAX_PAGE_ORDER; -- cgit v1.2.3 From a91fd9f710490a89713823be3e7790ac59a085f8 Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Wed, 25 Mar 2026 05:40:18 -0600 Subject: mm: consolidate anonymous folio PTE mapping into helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: khugepaged cleanups and mTHP prerequisites", v4. The following series contains cleanups and prerequisites for my work on khugepaged mTHP support [1]. These have been separated out to ease review. The first patch in the series refactors the page fault folio to pte mapping and follows a similar convention as defined by map_anon_folio_pmd_(no)pf(). This not only cleans up the current implementation of do_anonymous_page(), but will allow for reuse later in the khugepaged mTHP implementation. The second patch adds a small is_pmd_order() helper to check if an order is the PMD order. This check is open-coded in a number of places. This patch aims to clean this up and will be used more in the khugepaged mTHP work. The third patch also adds a small DEFINE for (HPAGE_PMD_NR - 1) which is used often across the khugepaged code. The fourth and fifth patch come from the khugepaged mTHP patchset [1]. These two patches include the rename of function prefixes, and the unification of khugepaged and madvise_collapse via a new collapse_single_pmd function. Patch 1: refactor do_anonymous_page into map_anon_folio_pte_(no)pf Patch 2: add is_pmd_order helper Patch 3: Add define for (HPAGE_PMD_NR - 1) Patch 4: Refactor/rename hpage_collapse Patch 5: Refactoring to combine madvise_collapse and khugepaged A big thanks to everyone that has reviewed, tested, and participated in the development process. This patch (of 5): The anonymous page fault handler in do_anonymous_page() open-codes the sequence to map a newly allocated anonymous folio at the PTE level: - construct the PTE entry - add rmap - add to LRU - set the PTEs - update the MMU cache. Introduce two helpers to consolidate this duplicated logic, mirroring the existing map_anon_folio_pmd_nopf() pattern for PMD-level mappings: map_anon_folio_pte_nopf(): constructs the PTE entry, takes folio references, adds anon rmap and LRU. This function also handles the uffd_wp that can occur in the pf variant. The future khugepaged mTHP code calls this to handle mapping the new collapsed mTHP to its folio. map_anon_folio_pte_pf(): extends the nopf variant to handle MM_ANONPAGES counter updates, and mTHP fault allocation statistics for the page fault path. The zero-page read path in do_anonymous_page() is also untangled from the shared setpte label, since it does not allocate a folio and should not share the same mapping sequence as the write path. We can now leave nr_pages undeclared at the function intialization, and use the single page update_mmu_cache function to handle the zero page update. This refactoring will also help reduce code duplication between mm/memory.c and mm/khugepaged.c, and provides a clean API for PTE-level anonymous folio mapping that can be reused by future callers (like khugpeaged mTHP support) Link: https://lkml.kernel.org/r/20260325114022.444081-1-npache@redhat.com Link: https://lkml.kernel.org/r/20260325114022.444081-2-npache@redhat.com Link: https://lore.kernel.org/all/20260122192841.128719-1-npache@redhat.com Signed-off-by: Nico Pache Suggested-by: Lorenzo Stoakes (Oracle) Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Dev Jain Reviewed-by: Lance Yang Acked-by: David Hildenbrand (Arm) Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Baolin Wang Cc: Barry Song Cc: Brendan Jackman Cc: Byungchul Park Cc: Catalin Marinas Cc: David Rientjes Cc: Gregory Price Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Joshua Hahn Cc: Kefeng Wang Cc: Liam Howlett Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Matthew Brost Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Nanyong Sun Cc: Pedro Falcato Cc: Peter Xu Cc: Rafael Aquini Cc: Rakie Kim Cc: Randy Dunlap Cc: Ryan Roberts Cc: Shivank Garg Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Takashi Iwai (SUSE) Cc: Thomas Hellström Cc: Usama Arif Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Wei Yang Cc: Will Deacon Cc: Yang Shi Cc: Zach O'Keefe Cc: Zi Yan Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 25ba5816e02b..16a1ad9a3397 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4916,4 +4916,8 @@ static inline bool snapshot_page_is_faithful(const struct page_snapshot *ps) void snapshot_page(struct page_snapshot *ps, const struct page *page); +void map_anon_folio_pte_nopf(struct folio *folio, pte_t *pte, + struct vm_area_struct *vma, unsigned long addr, + bool uffd_wp); + #endif /* _LINUX_MM_H */ -- cgit v1.2.3 From b90c453d2664ba445383956560581f9db708584f Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Wed, 25 Mar 2026 05:40:19 -0600 Subject: mm: introduce is_pmd_order helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to add mTHP support to khugepaged, we will often be checking if a given order is (or is not) a PMD order. Some places in the kernel already use this check, so lets create a simple helper function to keep the code clean and readable. Link: https://lkml.kernel.org/r/20260325114022.444081-3-npache@redhat.com Signed-off-by: Nico Pache Acked-by: David Hildenbrand (Arm) Reviewed-by: Baolin Wang Reviewed-by: Dev Jain Reviewed-by: Wei Yang Reviewed-by: Lance Yang Reviewed-by: Barry Song Reviewed-by: Zi Yan Reviewed-by: Pedro Falcato Reviewed-by: Lorenzo Stoakes Suggested-by: Lorenzo Stoakes Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Brendan Jackman Cc: Byungchul Park Cc: Catalin Marinas Cc: David Rientjes Cc: Gregory Price Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Joshua Hahn Cc: Kefeng Wang Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Matthew Brost Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Nanyong Sun Cc: Peter Xu Cc: Rafael Aquini Cc: Rakie Kim Cc: Randy Dunlap Cc: Ryan Roberts Cc: Shivank Garg Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Takashi Iwai (SUSE) Cc: Thomas Hellström Cc: Usama Arif Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a4d9f964dfde..bd7f0e1d8094 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -771,6 +771,11 @@ static inline bool pmd_is_huge(pmd_t pmd) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline bool is_pmd_order(unsigned int order) +{ + return order == HPAGE_PMD_ORDER; +} + static inline int split_folio_to_list_to_order(struct folio *folio, struct list_head *list, int new_order) { -- cgit v1.2.3 From 22688ade3b54b2f4f2887c7dad75db6d588ae07c Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 20 Mar 2026 23:13:41 +0100 Subject: mm/sparse: remove sparse_decode_mem_map() section_deactivate() applies to CONFIG_SPARSEMEM_VMEMMAP only. So we can just use pfn_to_page() (after making sure we have the start PFN of the section), and remove sparse_decode_mem_map(). Link: https://lkml.kernel.org/r/20260320-sparsemem_cleanups-v2-9-096addc8800d@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Mike Rapoport (Microsoft) Cc: Axel Rasmussen Cc: Liam Howlett Cc: Michal Hocko Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index e77ef3d7ff73..815e908c4135 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -308,8 +308,6 @@ extern int sparse_add_section(int nid, unsigned long pfn, struct dev_pagemap *pgmap); extern void sparse_remove_section(unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap); -extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, - unsigned long pnum); extern struct zone *zone_for_pfn_range(enum mmop online_type, int nid, struct memory_group *group, unsigned long start_pfn, unsigned long nr_pages); -- cgit v1.2.3 From fead6dcff83b02f8d6dc3c1ebbe4e09c05c54ee5 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 20 Mar 2026 23:13:43 +0100 Subject: mm: prepare to move subsection_map_init() to mm/sparse-vmemmap.c We want to move subsection_map_init() to mm/sparse-vmemmap.c. To prepare for getting rid of subsection_map_init() in mm/sparse.c completely, use a static inline function for !CONFIG_SPARSEMEM_VMEMMAP. While at it, move the declaration to internal.h and rename it to "sparse_init_subsection_map()". Link: https://lkml.kernel.org/r/20260320-sparsemem_cleanups-v2-11-096addc8800d@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Mike Rapoport (Microsoft) Cc: Axel Rasmussen Cc: Liam Howlett Cc: Michal Hocko Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3f651baf7e2b..7cf4a194aea2 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1982,8 +1982,6 @@ struct mem_section_usage { unsigned long pageblock_flags[0]; }; -void subsection_map_init(unsigned long pfn, unsigned long nr_pages); - struct page; struct page_ext; struct mem_section { @@ -2376,7 +2374,6 @@ static inline unsigned long next_present_section_nr(unsigned long section_nr) #define sparse_vmemmap_init_nid_early(_nid) do {} while (0) #define sparse_vmemmap_init_nid_late(_nid) do {} while (0) #define pfn_in_present_section pfn_valid -#define subsection_map_init(_pfn, _nr_pages) do {} while (0) #endif /* CONFIG_SPARSEMEM */ /* -- cgit v1.2.3 From f62a3bf227c95a105fccb5a2062367387cd49430 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 20 Mar 2026 23:13:45 +0100 Subject: mm/sparse: move sparse_init_one_section() to internal.h While at it, convert the BUG_ON to a VM_WARN_ON_ONCE, avoid long lines, and merge sparse_encode_mem_map() into its only caller sparse_init_one_section(). Clarify the comment a bit, pointing at page_to_pfn(). [david@kernel.org: s/VM_WARN_ON/VM_WARN_ON_ONCE/] Link: https://lkml.kernel.org/r/6b04c1a1-74e7-42e8-8523-a40802e5dacc@kernel.org Link: https://lkml.kernel.org/r/20260320-sparsemem_cleanups-v2-13-096addc8800d@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Mike Rapoport (Microsoft) Cc: Axel Rasmussen Cc: Liam Howlett Cc: Michal Hocko Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7cf4a194aea2..ed335567d64e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1988,7 +1988,7 @@ struct mem_section { /* * This is, logically, a pointer to an array of struct * pages. However, it is stored with some other magic. - * (see sparse.c::sparse_init_one_section()) + * (see sparse_init_one_section()) * * Additionally during early boot we encode node id of * the location of the section here to guide allocation. -- cgit v1.2.3 From 738de20c4fafe64290c5086d683254f60e837db6 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 20 Mar 2026 23:13:47 +0100 Subject: mm/sparse: move memory hotplug bits to sparse-vmemmap.c Let's move all memory hoptplug related code to sparse-vmemmap.c. We only have to expose sparse_index_init(). While at it, drop the definition of sparse_index_init() for !CONFIG_SPARSEMEM, which is unused, and place the declaration in internal.h. Link: https://lkml.kernel.org/r/20260320-sparsemem_cleanups-v2-15-096addc8800d@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Mike Rapoport (Microsoft) Cc: Axel Rasmussen Cc: Liam Howlett Cc: Michal Hocko Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ed335567d64e..4a20df132258 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -2370,7 +2370,6 @@ static inline unsigned long next_present_section_nr(unsigned long section_nr) #endif #else -#define sparse_index_init(_sec, _nid) do {} while (0) #define sparse_vmemmap_init_nid_early(_nid) do {} while (0) #define sparse_vmemmap_init_nid_late(_nid) do {} while (0) #define pfn_in_present_section pfn_valid -- cgit v1.2.3 From 6ebf98d71f9b509e833e0af00795ad3723d2f410 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Thu, 19 Mar 2026 09:19:41 +0100 Subject: mm: introduce CONFIG_NUMA_MIGRATION and simplify CONFIG_MIGRATION CONFIG_MEMORY_HOTREMOVE, CONFIG_COMPACTION and CONFIG_CMA all select CONFIG_MIGRATION, because they require it to work (users). Only CONFIG_NUMA_BALANCING and CONFIG_BALLOON_MIGRATION depend on CONFIG_MIGRATION. CONFIG_BALLOON_MIGRATION is not an actual user, but an implementation of migration support, so the dependency is correct (CONFIG_BALLOON_MIGRATION does not make any sense without CONFIG_MIGRATION). However, kconfig-language.rst clearly states "In general use select only for non-visible symbols". So far CONFIG_MIGRATION is user-visible ... and the dependencies rather confusing. The whole reason why CONFIG_MIGRATION is user-visible is because of CONFIG_NUMA: some users might want CONFIG_NUMA but not page migration support. Let's clean all that up by introducing a dedicated CONFIG_NUMA_MIGRATION config option for that purpose only. Make CONFIG_NUMA_BALANCING that so far depended on CONFIG_NUMA && CONFIG_MIGRATION to depend on CONFIG_MIGRATION instead. CONFIG_NUMA_MIGRATION will depend on CONFIG_NUMA && CONFIG_MMU. CONFIG_NUMA_MIGRATION is user-visible and will default to "y". We use that default so new configs will automatically enable it, just like it was the case with CONFIG_MIGRATION. The downside is that some configs that used to have CONFIG_MIGRATION=n might get it re-enabled by CONFIG_NUMA_MIGRATION=y, which shouldn't be a problem. CONFIG_MIGRATION is now a non-visible config option. Any code that select CONFIG_MIGRATION (as before) must depend directly or indirectly on CONFIG_MMU. CONFIG_NUMA_MIGRATION is responsible for any NUMA migration code, which is mempolicy migration code, memory-tiering code, and move_pages() code in migrate.c. CONFIG_NUMA_BALANCING uses its functionality. Note that this implies that with CONFIG_NUMA_MIGRATION=n, move_pages() will not be available even though CONFIG_MIGRATION=y, which is an expected change. In migrate.c, we can remove the CONFIG_NUMA check as both CONFIG_NUMA_MIGRATION and CONFIG_NUMA_BALANCING depend on it. With this change, CONFIG_MIGRATION is an internal config, all users of migration selects CONFIG_MIGRATION, and only CONFIG_BALLOON_MIGRATION depends on it. Link: https://lkml.kernel.org/r/20260319-config_migration-v1-2-42270124966f@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: Zi Yan Reviewed-by: Jonathan Cameron Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alistair Popple Cc: "Borislav Petkov (AMD)" Cc: Byungchul Park Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Gregory Price Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: "Huang, Ying" Cc: Ingo Molnar Cc: Joshua Hahn Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Brost Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Rakie Kim Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/memory-tiers.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 96987d9d95a8..7999c58629ee 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -52,7 +52,7 @@ int mt_perf_to_adistance(struct access_coordinate *perf, int *adist); struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct list_head *memory_types); void mt_put_memory_types(struct list_head *memory_types); -#ifdef CONFIG_MIGRATION +#ifdef CONFIG_NUMA_MIGRATION int next_demotion_node(int node, const nodemask_t *allowed_mask); void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets); bool node_is_toptier(int node); -- cgit v1.2.3 From a6a8c087dce00eac0c6d03e560b0fa3d529afa5f Mon Sep 17 00:00:00 2001 From: Leno Hou Date: Thu, 19 Mar 2026 00:30:49 +0800 Subject: mm/mglru: fix cgroup OOM during MGLRU state switching When the Multi-Gen LRU (MGLRU) state is toggled dynamically, a race condition exists between the state switching and the memory reclaim path. This can lead to unexpected cgroup OOM kills, even when plenty of reclaimable memory is available. Problem Description ================== The issue arises from a "reclaim vacuum" during the transition. 1. When disabling MGLRU, lru_gen_change_state() sets lrugen->enabled to false before the pages are drained from MGLRU lists back to traditional LRU lists. 2. Concurrent reclaimers in shrink_lruvec() see lrugen->enabled as false and skip the MGLRU path. 3. However, these pages might not have reached the traditional LRU lists yet, or the changes are not yet visible to all CPUs due to a lack of synchronization. 4. get_scan_count() subsequently finds traditional LRU lists empty, concludes there is no reclaimable memory, and triggers an OOM kill. A similar race can occur during enablement, where the reclaimer sees the new state but the MGLRU lists haven't been populated via fill_evictable() yet. Solution ======== Introduce a 'switching' state (`lru_switch`) to bridge the transition. When transitioning, the system enters this intermediate state where the reclaimer is forced to attempt both MGLRU and traditional reclaim paths sequentially. This ensures that folios remain visible to at least one reclaim mechanism until the transition is fully materialized across all CPUs. Race & Mitigation ================ A race window exists between checking the 'draining' state and performing the actual list operations. For instance, a reclaimer might observe the draining state as false just before it changes, leading to a suboptimal reclaim path decision. However, this impact is effectively mitigated by the kernel's reclaim retry mechanism (e.g., in do_try_to_free_pages). If a reclaimer pass fails to find eligible folios due to a state transition race, subsequent retries in the loop will observe the updated state and correctly direct the scan to the appropriate LRU lists. This ensures the transient inconsistency does not escalate into a terminal OOM kill. This effectively reduce the race window that previously triggered OOMs under high memory pressure. This fix has been verified on v7.0.0-rc1; dynamic toggling of MGLRU functions correctly without triggering unexpected OOM kills. Link: https://lkml.kernel.org/r/20260319-b4-switch-mglru-v2-v5-1-8898491e5f17@gmail.com Signed-off-by: Leno Hou Acked-by: Yafang Shao Reviewed-by: Barry Song Reviewed-by: Axel Rasmussen Cc: Yuanchu Xie Cc: Wei Xu Cc: Jialing Wang Cc: Yu Zhao Cc: Kairui Song Cc: Bingfang Guo Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index fa2d6ba811b5..2aedcff6a2c1 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -102,6 +102,12 @@ static __always_inline enum lru_list folio_lru_list(const struct folio *folio) #ifdef CONFIG_LRU_GEN +static inline bool lru_gen_switching(void) +{ + DECLARE_STATIC_KEY_FALSE(lru_switch); + + return static_branch_unlikely(&lru_switch); +} #ifdef CONFIG_LRU_GEN_ENABLED static inline bool lru_gen_enabled(void) { @@ -316,6 +322,11 @@ static inline bool lru_gen_enabled(void) return false; } +static inline bool lru_gen_switching(void) +{ + return false; +} + static inline bool lru_gen_in_fault(void) { return false; -- cgit v1.2.3 From a62ca3f40feaaaf0dfc4db1f2edeca5a70f4123d Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Sat, 21 Mar 2026 14:42:49 +0800 Subject: mm: change to return bool for ptep_test_and_clear_young() Patch series "change young flag check functions to return bool", v2. This is a cleanup patchset to change all young flag check functions to return bool, as discussed with David in the previous thread[1]. Since callers only care about whether the young flag was set, returning bool makes the intention clearer. No functional changes intended. This patch (of 6): Callers use ptep_test_and_clear_young() to clear the young flag and check whether it was set. Change the return type to bool to make the intention clearer. Link: https://lkml.kernel.org/r/cover.1774075004.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/57e70efa9703d43959aa645246ea3cbdba14fa17.1774075004.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 17d961c612fc..8e75dc9f7932 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -491,17 +491,17 @@ static inline pgd_t pgdp_get(pgd_t *pgdp) #endif #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, - pte_t *ptep) +static inline bool ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) { pte_t pte = ptep_get(ptep); - int r = 1; + bool young = true; + if (!pte_young(pte)) - r = 0; + young = false; else set_pte_at(vma->vm_mm, address, ptep, pte_mkold(pte)); - return r; + return young; } #endif @@ -1123,10 +1123,10 @@ static inline int clear_flush_young_ptes(struct vm_area_struct *vma, * * Returns: whether any PTE was young. */ -static inline int test_and_clear_young_ptes(struct vm_area_struct *vma, +static inline bool test_and_clear_young_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) { - int young = 0; + bool young = false; for (;;) { young |= ptep_test_and_clear_young(vma, addr, ptep); -- cgit v1.2.3 From 06c4dfa3ced61635895d0e258da8dc63da539f42 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Sat, 21 Mar 2026 14:42:50 +0800 Subject: mm: change to return bool for ptep_clear_flush_young()/clear_flush_young_ptes() The ptep_clear_flush_young() and clear_flush_young_ptes() are used to clear the young flag and flush the TLB, returning whether the young flag was set. Change the return type to bool to make the intention clearer. Link: https://lkml.kernel.org/r/24af5144b96103631594501f77d4525f2475c1be.1774075004.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 8e75dc9f7932..99450a3b0705 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -531,8 +531,8 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -int ptep_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep); +bool ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep); #endif #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH @@ -1086,10 +1086,10 @@ static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr, * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ -static inline int clear_flush_young_ptes(struct vm_area_struct *vma, +static inline bool clear_flush_young_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) { - int young = 0; + bool young = false; for (;;) { young |= ptep_clear_flush_young(vma, addr, ptep); -- cgit v1.2.3 From 42e26354c4ef28772398b1d71b7477834037305c Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Sat, 21 Mar 2026 14:42:51 +0800 Subject: mm: change to return bool for pmdp_test_and_clear_young() Callers use pmdp_test_and_clear_young() to clear the young flag and check whether it was set for this PMD entry. Change the return type to bool to make the intention clearer. Link: https://lkml.kernel.org/r/f1d31307a13365d3d0fed5809727dcc2dd59631b.1774075004.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 99450a3b0705..6db900a5d38b 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -507,25 +507,24 @@ static inline bool ptep_test_and_clear_young(struct vm_area_struct *vma, #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) -static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, - pmd_t *pmdp) +static inline bool pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) { pmd_t pmd = *pmdp; - int r = 1; + bool young = true; + if (!pmd_young(pmd)) - r = 0; + young = false; else set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(pmd)); - return r; + return young; } #else -static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, - pmd_t *pmdp) +static inline bool pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) { BUILD_BUG(); - return 0; + return false; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ #endif -- cgit v1.2.3 From 2d46a397472191a10b0df294d64da542bfd1de57 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Sat, 21 Mar 2026 14:42:52 +0800 Subject: mm: change to return bool for pmdp_clear_flush_young() The pmdp_clear_flush_young() is used to clear the young flag and flush the TLB, returning whether the young flag was set for this PMD entry. Change the return type to bool to make the intention clearer. Link: https://lkml.kernel.org/r/a668b9a974c0d675e7a41f6973bcbe3336e8b373.1774075004.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Ritesh Harjani (IBM) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 6db900a5d38b..cdd68ed3ae1a 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -536,18 +536,18 @@ bool ptep_clear_flush_young(struct vm_area_struct *vma, #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH #ifdef CONFIG_TRANSPARENT_HUGEPAGE -extern int pmdp_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp); +bool pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); #else /* * Despite relevant to THP only, this API is called from generic rmap code * under PageTransHuge(), hence needs a dummy implementation for !THP */ -static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) +static inline bool pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) { BUILD_BUG(); - return 0; + return false; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif -- cgit v1.2.3 From 1fc7dc675e26c43f3219d70a09b9f0c4aa43a13a Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Sat, 21 Mar 2026 14:42:54 +0800 Subject: mm: change to return bool for the MMU notifier's young flag check The MMU notifier young flag check related functions only return whether the young flag was set. Change the return type to bool to make the intention clearer. Link: https://lkml.kernel.org/r/a9ad3fe938002d87358e7bfca264f753ab602561.1774075004.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Ritesh Harjani (IBM) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mmu_notifier.h | 76 ++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 3705d350c863..17f2cdc77dd5 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -97,20 +97,20 @@ struct mmu_notifier_ops { * Start-end is necessary in case the secondary MMU is mapping the page * at a smaller granularity than the primary MMU. */ - int (*clear_flush_young)(struct mmu_notifier *subscription, - struct mm_struct *mm, - unsigned long start, - unsigned long end); + bool (*clear_flush_young)(struct mmu_notifier *subscription, + struct mm_struct *mm, + unsigned long start, + unsigned long end); /* * clear_young is a lightweight version of clear_flush_young. Like the * latter, it is supposed to test-and-clear the young/accessed bitflag * in the secondary pte, but it may omit flushing the secondary tlb. */ - int (*clear_young)(struct mmu_notifier *subscription, - struct mm_struct *mm, - unsigned long start, - unsigned long end); + bool (*clear_young)(struct mmu_notifier *subscription, + struct mm_struct *mm, + unsigned long start, + unsigned long end); /* * test_young is called to check the young/accessed bitflag in @@ -118,9 +118,9 @@ struct mmu_notifier_ops { * frequently used without actually clearing the flag or tearing * down the secondary mapping on the page. */ - int (*test_young)(struct mmu_notifier *subscription, - struct mm_struct *mm, - unsigned long address); + bool (*test_young)(struct mmu_notifier *subscription, + struct mm_struct *mm, + unsigned long address); /* * invalidate_range_start() and invalidate_range_end() must be @@ -376,14 +376,12 @@ mmu_interval_check_retry(struct mmu_interval_notifier *interval_sub, extern void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm); extern void __mmu_notifier_release(struct mm_struct *mm); -extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, - unsigned long start, - unsigned long end); -extern int __mmu_notifier_clear_young(struct mm_struct *mm, - unsigned long start, - unsigned long end); -extern int __mmu_notifier_test_young(struct mm_struct *mm, - unsigned long address); +bool __mmu_notifier_clear_flush_young(struct mm_struct *mm, + unsigned long start, unsigned long end); +bool __mmu_notifier_clear_young(struct mm_struct *mm, + unsigned long start, unsigned long end); +bool __mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address); extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r); extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r); extern void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, @@ -403,30 +401,28 @@ static inline void mmu_notifier_release(struct mm_struct *mm) __mmu_notifier_release(mm); } -static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, - unsigned long start, - unsigned long end) +static inline bool mmu_notifier_clear_flush_young(struct mm_struct *mm, + unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) return __mmu_notifier_clear_flush_young(mm, start, end); - return 0; + return false; } -static inline int mmu_notifier_clear_young(struct mm_struct *mm, - unsigned long start, - unsigned long end) +static inline bool mmu_notifier_clear_young(struct mm_struct *mm, + unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) return __mmu_notifier_clear_young(mm, start, end); - return 0; + return false; } -static inline int mmu_notifier_test_young(struct mm_struct *mm, - unsigned long address) +static inline bool mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) { if (mm_has_notifiers(mm)) return __mmu_notifier_test_young(mm, address); - return 0; + return false; } static inline void @@ -552,24 +548,22 @@ static inline void mmu_notifier_release(struct mm_struct *mm) { } -static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, - unsigned long start, - unsigned long end) +static inline bool mmu_notifier_clear_flush_young(struct mm_struct *mm, + unsigned long start, unsigned long end) { - return 0; + return false; } -static inline int mmu_notifier_clear_young(struct mm_struct *mm, - unsigned long start, - unsigned long end) +static inline bool mmu_notifier_clear_young(struct mm_struct *mm, + unsigned long start, unsigned long end) { - return 0; + return false; } -static inline int mmu_notifier_test_young(struct mm_struct *mm, - unsigned long address) +static inline bool mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) { - return 0; + return false; } static inline void -- cgit v1.2.3 From 54fdcbfe1cbd1d8f06d0c57c8cc43ddcc1cd421c Mon Sep 17 00:00:00 2001 From: Ye Liu Date: Mon, 23 Mar 2026 17:03:04 +0800 Subject: mm: remove unused page_is_file_lru() function The page_is_file_lru() wrapper function is no longer used. The kernel has moved to folio-based APIs, and all callers should use folio_is_file_lru() instead. Remove the obsolete page-based wrapper function. Link: https://lkml.kernel.org/r/20260323090305.798057-1-ye.liu@linux.dev Signed-off-by: Ye Liu Acked-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 2aedcff6a2c1..7fc2ced00f8f 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -30,11 +30,6 @@ static inline int folio_is_file_lru(const struct folio *folio) return !folio_test_swapbacked(folio); } -static inline int page_is_file_lru(struct page *page) -{ - return folio_is_file_lru(page_folio(page)); -} - static __always_inline void __update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, long nr_pages) -- cgit v1.2.3 From 4ff07459db888054f68575646d7fe04f31f1e56d Mon Sep 17 00:00:00 2001 From: Jinjiang Tu Date: Thu, 19 Mar 2026 09:25:41 +0800 Subject: mm/huge_memory: fix folio isn't locked in softleaf_to_folio() On arm64 server, we found folio that get from migration entry isn't locked in softleaf_to_folio(). This issue triggers when mTHP splitting and zap_nonpresent_ptes() races, and the root cause is lack of memory barrier in softleaf_to_folio(). The race is as follows: CPU0 CPU1 deferred_split_scan() zap_nonpresent_ptes() lock folio split_folio() unmap_folio() change ptes to migration entries __split_folio_to_order() softleaf_to_folio() set flags(including PG_locked) for tail pages folio = pfn_folio(softleaf_to_pfn(entry)) smp_wmb() VM_WARN_ON_ONCE(!folio_test_locked(folio)) prep_compound_page() for tail pages In __split_folio_to_order(), smp_wmb() guarantees page flags of tail pages are visible before the tail page becomes non-compound. smp_wmb() should be paired with smp_rmb() in softleaf_to_folio(), which is missed. As a result, if zap_nonpresent_ptes() accesses migration entry that stores tail pfn, softleaf_to_folio() may see the updated compound_head of tail page before page->flags. This issue will trigger VM_WARN_ON_ONCE() in pfn_swap_entry_folio() because of the race between folio split and zap_nonpresent_ptes() leading to a folio incorrectly undergoing modification without a folio lock being held. This is a BUG_ON() before commit 93976a20345b ("mm: eliminate further swapops predicates"), which in merged in v6.19-rc1. To fix it, add missing smp_rmb() if the softleaf entry is migration entry in softleaf_to_folio() and softleaf_to_page(). [tujinjiang@huawei.com: update function name and comments] Link: https://lkml.kernel.org/r/20260321075214.3305564-1-tujinjiang@huawei.com Link: https://lkml.kernel.org/r/20260319012541.4158561-1-tujinjiang@huawei.com Fixes: e9b61f19858a ("thp: reintroduce split_huge_page()") Signed-off-by: Jinjiang Tu Acked-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Barry Song Cc: Kefeng Wang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Nanyong Sun Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/leafops.h | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/leafops.h b/include/linux/leafops.h index a9ff94b744f2..05673d3529e7 100644 --- a/include/linux/leafops.h +++ b/include/linux/leafops.h @@ -363,6 +363,23 @@ static inline unsigned long softleaf_to_pfn(softleaf_t entry) return swp_offset(entry) & SWP_PFN_MASK; } +static inline void softleaf_migration_sync(softleaf_t entry, + struct folio *folio) +{ + /* + * Ensure we do not race with split, which might alter tail pages into new + * folios and thus result in observing an unlocked folio. + * This matches the write barrier in __split_folio_to_order(). + */ + smp_rmb(); + + /* + * Any use of migration entries may only occur while the + * corresponding page is locked + */ + VM_WARN_ON_ONCE(!folio_test_locked(folio)); +} + /** * softleaf_to_page() - Obtains struct page for PFN encoded within leaf entry. * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true. @@ -374,11 +391,8 @@ static inline struct page *softleaf_to_page(softleaf_t entry) struct page *page = pfn_to_page(softleaf_to_pfn(entry)); VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); - /* - * Any use of migration entries may only occur while the - * corresponding page is locked - */ - VM_WARN_ON_ONCE(softleaf_is_migration(entry) && !PageLocked(page)); + if (softleaf_is_migration(entry)) + softleaf_migration_sync(entry, page_folio(page)); return page; } @@ -394,12 +408,8 @@ static inline struct folio *softleaf_to_folio(softleaf_t entry) struct folio *folio = pfn_folio(softleaf_to_pfn(entry)); VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); - /* - * Any use of migration entries may only occur while the - * corresponding folio is locked. - */ - VM_WARN_ON_ONCE(softleaf_is_migration(entry) && - !folio_test_locked(folio)); + if (softleaf_is_migration(entry)) + softleaf_migration_sync(entry, folio); return folio; } -- cgit v1.2.3 From 6bc0987d0b508b3768808efafa1e90041713526b Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:18 +0000 Subject: mm/vma: add vma_flags_empty(), vma_flags_and(), vma_flags_diff_pair() Patch series "mm/vma: convert vm_flags_t to vma_flags_t in vma code", v4. This series converts a lot of the existing use of the legacy vm_flags_t data type to the new vma_flags_t type which replaces it. In order to do so it adds a number of additional helpers: * vma_flags_empty() - Determines whether a vma_flags_t value has no bits set. * vma_flags_and() - Performs a bitwise AND between two vma_flags_t values. * vma_flags_diff_pair() - Determines which flags are not shared between a pair of VMA flags (typically non-constant values) * append_vma_flags() - Similar to mk_vma_flags(), but allows a vma_flags_t value to be specified (typically a constant value) which will be copied and appended to to create a new vma_flags_t value, with additional flags specified to append to it. * vma_flags_same() - Determines if a vma_flags_t value is exactly equal to a set of VMA flags. * vma_flags_same_mask() - Determines if a vma_flags_t value is eactly equal to another vma_flags_t value (typically constant). * vma_flags_same_pair() - Determines if a pair of vma_flags_t values are exactly equal to one another (typically both non-constant). * vma_flags_to_legacy() - Converts a vma_flags_t value to a vm_flags_t value, used to enable more iterative introduction of the use of vma_flags_t. * legacy_to_vma_flags() - Converts a vm_flags_t value to a vma_flags-t value, for the same purpose. * vma_flags_test_single_mask() - Tests whether a vma_flags_t value contain the single flag specified in an input vma_flags_t flag mask, or if that flag mask is empty, is defined to return false. Useful for config-predicated VMA flag mask defines. * vma_test() - Tests whether a VMA's flags contain a specific singular VMA flag. * vma_test_any() - Tests whether a VMA's flags contain any of a set of VMA flags. * vma_test_any_mask() - Tests whether a VMA's flags contain any of the flags specified in another, typically constant, vma_flags_t value. * vma_test_single_mask() - Tests whether a VMA's flags contain the single flag specified in an input vma_flags_t flag mask, or if that flag mask is empty, is defined to return false. Useful for config-predicated VMA flag mask defines. * vma_clear_flags() - Clears a specific set of VMA flags from a vma_flags_t value. * vma_clear_flags_mask() - Clears those flag set in a vma_flags_t value (typically constant) from a (typically not constant) vma_flags_t value. The series mostly focuses on the the VMA specific code, especially that contained in mm/vma.c and mm/vma.h. It updates both brk() and mmap() logic to utils vma_flags_t values as much as is practiaclly possible at this point, changing surrounding logic to be able to do so. It also updates the vma_modify_xxx() functions where they interact with VMA flags directly to use vm_flags_t values where possible. There is extensive testing added in the VMA userland tests to assert that all of these new VMA flag functions work correctly. This patch (of 25): Firstly, add the ability to determine if VMA flags are empty, that is no flags are set in a vma_flags_t value. Next, add the ability to obtain the equivalent of the bitwise and of two vma_flags_t values, via vma_flags_and_mask(). Next, add the ability to obtain the difference between two sets of VMA flags, that is the equivalent to the exclusive bitwise OR of the two sets of flags, via vma_flags_diff_pair(). vma_flags_xxx_mask() typically operates on a pointer to a vma_flags_t value, which is assumed to be an lvalue of some kind (such as a field in a struct or a stack variable) and an rvalue of some kind (typically a constant set of VMA flags obtained e.g. via mk_vma_flags() or equivalent). However vma_flags_diff_pair() is intended to operate on two lvalues, so use the _pair() suffix to make this clear. Finally, update VMA userland tests to add these helpers. We also port bitmap_xor() and __bitmap_xor() to the tools/ headers and source to allow the tests to work with vma_flags_diff_pair(). Link: https://lkml.kernel.org/r/cover.1774034900.git.ljs@kernel.org Link: https://lkml.kernel.org/r/53ab55b7da91425775e42c03177498ad6de88ef4.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 60 ++++++++++++++++++++++++++++++++++++++++-------- include/linux/mm_types.h | 8 +++++++ 2 files changed, 59 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 16a1ad9a3397..7954a7a2b811 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1048,6 +1048,19 @@ static __always_inline vma_flags_t __mk_vma_flags(size_t count, return flags; } +/* + * Helper macro which bitwise-or combines the specified input flags into a + * vma_flags_t bitmap value. E.g.: + * + * vma_flags_t flags = mk_vma_flags(VMA_IO_BIT, VMA_PFNMAP_BIT, + * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT); + * + * The compiler cleverly optimises away all of the work and this ends up being + * equivalent to aggregating the values manually. + */ +#define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \ + (const vma_flag_t []){__VA_ARGS__}) + /* * Test whether a specific VMA flag is set, e.g.: * @@ -1062,17 +1075,30 @@ static __always_inline bool vma_flags_test(const vma_flags_t *flags, } /* - * Helper macro which bitwise-or combines the specified input flags into a - * vma_flags_t bitmap value. E.g.: - * - * vma_flags_t flags = mk_vma_flags(VMA_IO_BIT, VMA_PFNMAP_BIT, - * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT); + * Obtain a set of VMA flags which contain the overlapping flags contained + * within flags and to_and. + */ +static __always_inline vma_flags_t vma_flags_and_mask(const vma_flags_t *flags, + vma_flags_t to_and) +{ + vma_flags_t dst; + unsigned long *bitmap_dst = dst.__vma_flags; + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_to_and = to_and.__vma_flags; + + bitmap_and(bitmap_dst, bitmap, bitmap_to_and, NUM_VMA_FLAG_BITS); + return dst; +} + +/* + * Obtain a set of VMA flags which contains the specified overlapping flags, + * e.g.: * - * The compiler cleverly optimises away all of the work and this ends up being - * equivalent to aggregating the values manually. + * vma_flags_t read_flags = vma_flags_and(&flags, VMA_READ_BIT, + * VMA_MAY_READ_BIT); */ -#define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \ - (const vma_flag_t []){__VA_ARGS__}) +#define vma_flags_and(flags, ...) \ + vma_flags_and_mask(flags, mk_vma_flags(__VA_ARGS__)) /* Test each of to_test flags in flags, non-atomically. */ static __always_inline bool vma_flags_test_any_mask(const vma_flags_t *flags, @@ -1146,6 +1172,22 @@ static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, #define vma_flags_clear(flags, ...) \ vma_flags_clear_mask(flags, mk_vma_flags(__VA_ARGS__)) +/* + * Obtain a VMA flags value containing those flags that are present in flags or + * flags_other but not in both. + */ +static __always_inline vma_flags_t vma_flags_diff_pair(const vma_flags_t *flags, + const vma_flags_t *flags_other) +{ + vma_flags_t dst; + const unsigned long *bitmap_other = flags_other->__vma_flags; + const unsigned long *bitmap = flags->__vma_flags; + unsigned long *bitmap_dst = dst.__vma_flags; + + bitmap_xor(bitmap_dst, bitmap, bitmap_other, NUM_VMA_FLAG_BITS); + return dst; +} + /* * Helper to test that ALL specified flags are set in a VMA. * diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f22aecb047b7..321aa150c1ee 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -870,6 +870,14 @@ typedef struct { #define EMPTY_VMA_FLAGS ((vma_flags_t){ }) +/* Are no flags set in the specified VMA flags? */ +static __always_inline bool vma_flags_empty(const vma_flags_t *flags) +{ + const unsigned long *bitmap = flags->__vma_flags; + + return bitmap_empty(bitmap, NUM_VMA_FLAG_BITS); +} + /* * Describes a VMA that is about to be mmap()'ed. Drivers may choose to * manipulate mutable fields which will cause those fields to be updated in the -- cgit v1.2.3 From 8228e42b5f88aa68708ced277399ee3b59748627 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:20 +0000 Subject: mm/vma: add further vma_flags_t unions In order to utilise the new vma_flags_t type, we currently place it in union with legacy vm_flags fields of type vm_flags_t to make the transition smoother. Add vma_flags_t union entries for mm->def_flags and vmg->vm_flags - mm->def_vma_flags and vmg->vma_flags respectively. Once the conversion is complete, these will be replaced with vma_flags_t entries alone. Also update the VMA tests to reflect the change. Link: https://lkml.kernel.org/r/d507d542c089ba132e9da53f2ff7f80ca117c3b4.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 321aa150c1ee..8ef84849953f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1249,7 +1249,11 @@ struct mm_struct { unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ unsigned long stack_vm; /* VM_STACK */ - vm_flags_t def_flags; + union { + /* Temporary while VMA flags are being converted. */ + vm_flags_t def_flags; + vma_flags_t def_vma_flags; + }; /** * @write_protect_seq: Locked when any thread is write -- cgit v1.2.3 From 7ec1885a7e283caaf6566aedc1eea5988d545f97 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:22 +0000 Subject: mm/vma: use new VMA flags for sticky flags logic Use the new vma_flags_t flags implementation to perform the logic around sticky flags and what flags are ignored on VMA merge. We make use of the new vma_flags_empty(), vma_flags_diff_pair(), and vma_flags_and_mask() functionality. Also update the VMA tests accordingly. Link: https://lkml.kernel.org/r/369574f06360ffa44707047e3b58eb4897345fba.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7954a7a2b811..d7e647e31742 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -540,6 +540,7 @@ enum { /* VMA basic access permission flags */ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) +#define VMA_ACCESS_FLAGS mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT) /* * Special vmas that are non-mergable, non-mlock()able. @@ -585,27 +586,32 @@ enum { * possesses it but the other does not, the merged VMA should nonetheless have * applied to it: * - * VM_SOFTDIRTY - if a VMA is marked soft-dirty, that is has not had its - * references cleared via /proc/$pid/clear_refs, any merged VMA - * should be considered soft-dirty also as it operates at a VMA - * granularity. + * VMA_SOFTDIRTY_BIT - if a VMA is marked soft-dirty, that is has not had its + * references cleared via /proc/$pid/clear_refs, any + * merged VMA should be considered soft-dirty also as it + * operates at a VMA granularity. * - * VM_MAYBE_GUARD - If a VMA may have guard regions in place it implies that - * mapped page tables may contain metadata not described by the - * VMA and thus any merged VMA may also contain this metadata, - * and thus we must make this flag sticky. + * VMA_MAYBE_GUARD_BIT - If a VMA may have guard regions in place it implies + * that mapped page tables may contain metadata not + * described by the VMA and thus any merged VMA may also + * contain this metadata, and thus we must make this flag + * sticky. */ -#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD) +#ifdef CONFIG_MEM_SOFT_DIRTY +#define VMA_STICKY_FLAGS mk_vma_flags(VMA_SOFTDIRTY_BIT, VMA_MAYBE_GUARD_BIT) +#else +#define VMA_STICKY_FLAGS mk_vma_flags(VMA_MAYBE_GUARD_BIT) +#endif /* * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one * of these flags and the other not does not preclude a merge. * - * VM_STICKY - When merging VMAs, VMA flags must match, unless they are - * 'sticky'. If any sticky flags exist in either VMA, we simply - * set all of them on the merged VMA. + * VMA_STICKY_FLAGS - When merging VMAs, VMA flags must match, unless they + * are 'sticky'. If any sticky flags exist in either VMA, + * we simply set all of them on the merged VMA. */ -#define VM_IGNORE_MERGE VM_STICKY +#define VMA_IGNORE_MERGE_FLAGS VMA_STICKY_FLAGS /* * Flags which should result in page tables being copied on fork. These are -- cgit v1.2.3 From e8d464f4a94ccbcae8c9d3137ac5621b57ddd8a1 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:24 +0000 Subject: mm/vma: add append_vma_flags() helper In order to be able to efficiently combine VMA flag masks with additional VMA flag bits we need to extend the concept introduced in mk_vma_flags() and __mk_vma_flags() by allowing the specification of a VMA flag mask to append VMA flag bits to. Update __mk_vma_flags() to allow for this and update mk_vma_flags() accordingly, and also provide append_vma_flags() to allow for the caller to specify which VMA flags mask to append to. Finally, update the VMA flags tests to reflect the change. Link: https://lkml.kernel.org/r/9f928cd4688270002f2c0c3777fcc9b49cc7a8ea.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index d7e647e31742..26cfb2fbe4db 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1042,13 +1042,11 @@ static __always_inline void vma_flags_set_flag(vma_flags_t *flags, __set_bit((__force int)bit, bitmap); } -static __always_inline vma_flags_t __mk_vma_flags(size_t count, - const vma_flag_t *bits) +static __always_inline vma_flags_t __mk_vma_flags(vma_flags_t flags, + size_t count, const vma_flag_t *bits) { - vma_flags_t flags; int i; - vma_flags_clear_all(&flags); for (i = 0; i < count; i++) vma_flags_set_flag(&flags, bits[i]); return flags; @@ -1064,8 +1062,18 @@ static __always_inline vma_flags_t __mk_vma_flags(size_t count, * The compiler cleverly optimises away all of the work and this ends up being * equivalent to aggregating the values manually. */ -#define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \ - (const vma_flag_t []){__VA_ARGS__}) +#define mk_vma_flags(...) __mk_vma_flags(EMPTY_VMA_FLAGS, \ + COUNT_ARGS(__VA_ARGS__), (const vma_flag_t []){__VA_ARGS__}) + +/* + * Helper macro which acts like mk_vma_flags, only appending to a copy of the + * specified flags rather than establishing new flags. E.g.: + * + * vma_flags_t flags = append_vma_flags(VMA_STACK_DEFAULT_FLAGS, VMA_STACK_BIT, + * VMA_ACCOUNT_BIT); + */ +#define append_vma_flags(flags, ...) __mk_vma_flags(flags, \ + COUNT_ARGS(__VA_ARGS__), (const vma_flag_t []){__VA_ARGS__}) /* * Test whether a specific VMA flag is set, e.g.: -- cgit v1.2.3 From 5fb55e951cf591c5e2d45273ceadbdcd0c44932c Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:26 +0000 Subject: mm: unexport vm_brk_flags() and eliminate vm_flags parameter This function is only used by elf_load(), and that is a static function that doesn't need an exported symbol to invoke an internal function, so un-EXPORT_SYMBOLS() it. Also, the vm_flags parameter is unnecessary, as we only ever set VM_EXEC, so simply make this parameter a boolean. While we're here, clean up the mm.h definitions for the various vm_xxx() helpers so we actually specify parameter names and elide the redundant extern's. Link: https://lkml.kernel.org/r/7bada48ddf3f9dbd3e6c4fc50ec2f4de97706f52.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 26cfb2fbe4db..5b85ffc2760c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3991,12 +3991,12 @@ static inline void mm_populate(unsigned long addr, unsigned long len) {} #endif /* This takes the mm semaphore itself */ -extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long); -extern int vm_munmap(unsigned long, size_t); -extern unsigned long __must_check vm_mmap(struct file *, unsigned long, - unsigned long, unsigned long, - unsigned long, unsigned long); -extern unsigned long __must_check vm_mmap_shadow_stack(unsigned long addr, +int __must_check vm_brk_flags(unsigned long addr, unsigned long request, bool is_exec); +int vm_munmap(unsigned long start, size_t len); +unsigned long __must_check vm_mmap(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long offset); +unsigned long __must_check vm_mmap_shadow_stack(unsigned long addr, unsigned long len, unsigned long flags); struct vm_unmapped_area_info { -- cgit v1.2.3 From 3ee584538259c356c66146ac46f2e4fd2ba28bee Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:27 +0000 Subject: mm/vma: introduce vma_flags_same[_mask/_pair]() Add helpers to determine if two sets of VMA flags are precisely the same, that is - that every flag set one is set in another, and neither contain any flags not set in the other. We also introduce vma_flags_same_pair() for cases where we want to compare two sets of VMA flags which are both non-const values. Also update the VMA tests to reflect the change, we already implicitly test that this functions correctly having used it for testing purposes previously. Link: https://lkml.kernel.org/r/4f764bf619e77205837c7c819b62139ef6337ca3.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5b85ffc2760c..1f3e9100164d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1202,6 +1202,34 @@ static __always_inline vma_flags_t vma_flags_diff_pair(const vma_flags_t *flags, return dst; } +/* Determine if flags and flags_other have precisely the same flags set. */ +static __always_inline bool vma_flags_same_pair(const vma_flags_t *flags, + const vma_flags_t *flags_other) +{ + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_other = flags_other->__vma_flags; + + return bitmap_equal(bitmap, bitmap_other, NUM_VMA_FLAG_BITS); +} + +/* Determine if flags and flags_other have precisely the same flags set. */ +static __always_inline bool vma_flags_same_mask(const vma_flags_t *flags, + vma_flags_t flags_other) +{ + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_other = flags_other.__vma_flags; + + return bitmap_equal(bitmap, bitmap_other, NUM_VMA_FLAG_BITS); +} + +/* + * Helper macro to determine if only the specific flags are set, e.g.: + * + * if (vma_flags_same(&flags, VMA_WRITE_BIT) { ... } + */ +#define vma_flags_same(flags, ...) \ + vma_flags_same_mask(flags, mk_vma_flags(__VA_ARGS__)) + /* * Helper to test that ALL specified flags are set in a VMA. * -- cgit v1.2.3 From c8555bc95d6222aa729b3a1195e07e566707ec02 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:28 +0000 Subject: mm/vma: introduce [vma_flags,legacy]_to_[legacy,vma_flags]() helpers While we are still converting VMA flags from vma_flags_t to vm_flags_t, introduce helpers to convert between the two to allow for iterative development without having to 'change the world' in a single commit'. Also update VMA flags tests to reflect the change. Finally, refresh vma_flags_overwrite_word(), vma_flag_overwrite_word_once(), vma_flags_set_word() and vma_flags_clear_word() in the VMA tests to reflect current kernel implementations - this should make no functional difference, but keeps the logic consistent between the two. Link: https://lkml.kernel.org/r/d3569470dbb3ae79134ca7c3eb3fc4df7086e874.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8ef84849953f..1da8fb04133f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1069,6 +1069,18 @@ static __always_inline void vma_flags_clear_all(vma_flags_t *flags) bitmap_zero(flags->__vma_flags, NUM_VMA_FLAG_BITS); } +/* + * Helper function which converts a vma_flags_t value to a legacy vm_flags_t + * value. This is only valid if the input flags value can be expressed in a + * system word. + * + * Will be removed once the conversion to VMA flags is complete. + */ +static __always_inline vm_flags_t vma_flags_to_legacy(vma_flags_t flags) +{ + return (vm_flags_t)flags.__vma_flags[0]; +} + /* * Copy value to the first system word of VMA flags, non-atomically. * @@ -1082,6 +1094,20 @@ static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long va bitmap[0] = value; } +/* + * Helper function which converts a legacy vm_flags_t value to a vma_flags_t + * value. + * + * Will be removed once the conversion to VMA flags is complete. + */ +static __always_inline vma_flags_t legacy_to_vma_flags(vm_flags_t flags) +{ + vma_flags_t ret = EMPTY_VMA_FLAGS; + + vma_flags_overwrite_word(&ret, flags); + return ret; +} + /* * Copy value to the first system word of VMA flags ONCE, non-atomically. * -- cgit v1.2.3 From fb67bba5d9b8561f433695c8916c097910193561 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:30 +0000 Subject: mm/vma: introduce vma_test[_any[_mask]](), and make inlining consistent Introduce helper functions and macros to make it convenient to test flags and flag masks for VMAs, specifically: * vma_test() - determine if a single VMA flag is set in a VMA. * vma_test_any_mask() - determine if any flags in a vma_flags_t value are set in a VMA. * vma_test_any() - Helper macro to test if any of specific flags are set. Also, there are a mix of 'inline's and '__always_inline's in VMA helper function declarations, update to consistently use __always_inline. Finally, update the VMA tests to reflect the changes. Link: https://lkml.kernel.org/r/be1d71f08307d747a82232cbd8664a88c0f41419.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 49 +++++++++++++++++++++++++++++++++++++++--------- include/linux/mm_types.h | 12 ++++++++---- 2 files changed, 48 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1f3e9100164d..f704d7cf2871 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -994,7 +994,8 @@ static inline void vm_flags_mod(struct vm_area_struct *vma, __vm_flags_mod(vma, set, clear); } -static inline bool __vma_atomic_valid_flag(struct vm_area_struct *vma, vma_flag_t bit) +static __always_inline bool __vma_atomic_valid_flag(struct vm_area_struct *vma, + vma_flag_t bit) { const vm_flags_t mask = BIT((__force int)bit); @@ -1009,7 +1010,8 @@ static inline bool __vma_atomic_valid_flag(struct vm_area_struct *vma, vma_flag_ * Set VMA flag atomically. Requires only VMA/mmap read lock. Only specific * valid flags are allowed to do this. */ -static inline void vma_set_atomic_flag(struct vm_area_struct *vma, vma_flag_t bit) +static __always_inline void vma_set_atomic_flag(struct vm_area_struct *vma, + vma_flag_t bit) { unsigned long *bitmap = vma->flags.__vma_flags; @@ -1025,7 +1027,8 @@ static inline void vma_set_atomic_flag(struct vm_area_struct *vma, vma_flag_t bi * This is necessarily racey, so callers must ensure that serialisation is * achieved through some other means, or that races are permissible. */ -static inline bool vma_test_atomic_flag(struct vm_area_struct *vma, vma_flag_t bit) +static __always_inline bool vma_test_atomic_flag(struct vm_area_struct *vma, + vma_flag_t bit) { if (__vma_atomic_valid_flag(vma, bit)) return test_bit((__force int)bit, &vma->vm_flags); @@ -1230,13 +1233,41 @@ static __always_inline bool vma_flags_same_mask(const vma_flags_t *flags, #define vma_flags_same(flags, ...) \ vma_flags_same_mask(flags, mk_vma_flags(__VA_ARGS__)) +/* + * Test whether a specific flag in the VMA is set, e.g.: + * + * if (vma_test(vma, VMA_READ_BIT)) { ... } + */ +static __always_inline bool vma_test(const struct vm_area_struct *vma, + vma_flag_t bit) +{ + return vma_flags_test(&vma->flags, bit); +} + +/* Helper to test any VMA flags in a VMA . */ +static __always_inline bool vma_test_any_mask(const struct vm_area_struct *vma, + vma_flags_t flags) +{ + return vma_flags_test_any_mask(&vma->flags, flags); +} + +/* + * Helper macro for testing whether any VMA flags are set in a VMA, + * e.g.: + * + * if (vma_test_any(vma, VMA_IO_BIT, VMA_PFNMAP_BIT, + * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT)) { ... } + */ +#define vma_test_any(vma, ...) \ + vma_test_any_mask(vma, mk_vma_flags(__VA_ARGS__)) + /* * Helper to test that ALL specified flags are set in a VMA. * * Note: appropriate locks must be held, this function does not acquire them for * you. */ -static inline bool vma_test_all_mask(const struct vm_area_struct *vma, +static __always_inline bool vma_test_all_mask(const struct vm_area_struct *vma, vma_flags_t flags) { return vma_flags_test_all_mask(&vma->flags, flags); @@ -1256,7 +1287,7 @@ static inline bool vma_test_all_mask(const struct vm_area_struct *vma, * Note: appropriate locks must be held, this function does not acquire them for * you. */ -static inline void vma_set_flags_mask(struct vm_area_struct *vma, +static __always_inline void vma_set_flags_mask(struct vm_area_struct *vma, vma_flags_t flags) { vma_flags_set_mask(&vma->flags, flags); @@ -1286,7 +1317,7 @@ static __always_inline bool vma_desc_test(const struct vm_area_desc *desc, } /* Helper to test any VMA flags in a VMA descriptor. */ -static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, +static __always_inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, vma_flags_t flags) { return vma_flags_test_any_mask(&desc->vma_flags, flags); @@ -1303,7 +1334,7 @@ static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, vma_desc_test_any_mask(desc, mk_vma_flags(__VA_ARGS__)) /* Helper to test all VMA flags in a VMA descriptor. */ -static inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc, +static __always_inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc, vma_flags_t flags) { return vma_flags_test_all_mask(&desc->vma_flags, flags); @@ -1319,7 +1350,7 @@ static inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc, vma_desc_test_all_mask(desc, mk_vma_flags(__VA_ARGS__)) /* Helper to set all VMA flags in a VMA descriptor. */ -static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, +static __always_inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, vma_flags_t flags) { vma_flags_set_mask(&desc->vma_flags, flags); @@ -1336,7 +1367,7 @@ static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, vma_desc_set_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) /* Helper to clear all VMA flags in a VMA descriptor. */ -static inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, +static __always_inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, vma_flags_t flags) { vma_flags_clear_mask(&desc->vma_flags, flags); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1da8fb04133f..38fe6b915024 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1087,7 +1087,8 @@ static __always_inline vm_flags_t vma_flags_to_legacy(vma_flags_t flags) * IMPORTANT: This does not overwrite bytes past the first system word. The * caller must account for this. */ -static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value) +static __always_inline void vma_flags_overwrite_word(vma_flags_t *flags, + unsigned long value) { unsigned long *bitmap = flags->__vma_flags; @@ -1114,7 +1115,8 @@ static __always_inline vma_flags_t legacy_to_vma_flags(vm_flags_t flags) * IMPORTANT: This does not overwrite bytes past the first system word. The * caller must account for this. */ -static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value) +static __always_inline void vma_flags_overwrite_word_once(vma_flags_t *flags, + unsigned long value) { unsigned long *bitmap = flags->__vma_flags; @@ -1122,7 +1124,8 @@ static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned lo } /* Update the first system word of VMA flags setting bits, non-atomically. */ -static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) +static __always_inline void vma_flags_set_word(vma_flags_t *flags, + unsigned long value) { unsigned long *bitmap = flags->__vma_flags; @@ -1130,7 +1133,8 @@ static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) } /* Update the first system word of VMA flags clearing bits, non-atomically. */ -static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value) +static __always_inline void vma_flags_clear_word(vma_flags_t *flags, + unsigned long value) { unsigned long *bitmap = flags->__vma_flags; -- cgit v1.2.3 From e79d1c500f52506b9eab39e81017e30b76f2864d Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:32 +0000 Subject: mm: introduce vma_flags_count() and vma[_flags]_test_single_mask() vma_flags_count() determines how many bits are set in VMA flags, using bitmap_weight(). vma_flags_test_single_mask() determines if a vma_flags_t set of flags contains a single flag specified as another vma_flags_t value, or if the sought flag mask is empty, it is defined to return false. This is useful when we want to declare a VMA flag as optionally a single flag in a mask or empty depending on kernel configuration. This allows us to have VM_NONE-like semantics when checking whether the flag is set. In a subsequent patch, we introduce the use of VMA_DROPPABLE of type vma_flags_t using precisely these semantics. It would be actively confusing to use vma_flags_test_any_single_mask() for this (and vma_flags_test_all_mask() is not correct to use here, as it trivially returns true when tested against an empty vma flags mask). We introduce vma_flags_count() to be able to assert that the compared flag mask is singular or empty, checked when CONFIG_DEBUG_VM is enabled. Also update the VMA tests as part of this change. Link: https://lkml.kernel.org/r/cd778dd02b9f2a01eb54d25a49dea8ec2ddf7753.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f704d7cf2871..de72382efac2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1078,6 +1078,14 @@ static __always_inline vma_flags_t __mk_vma_flags(vma_flags_t flags, #define append_vma_flags(flags, ...) __mk_vma_flags(flags, \ COUNT_ARGS(__VA_ARGS__), (const vma_flag_t []){__VA_ARGS__}) +/* Calculates the number of set bits in the specified VMA flags. */ +static __always_inline int vma_flags_count(const vma_flags_t *flags) +{ + const unsigned long *bitmap = flags->__vma_flags; + + return bitmap_weight(bitmap, NUM_VMA_FLAG_BITS); +} + /* * Test whether a specific VMA flag is set, e.g.: * @@ -1153,6 +1161,26 @@ static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags, #define vma_flags_test_all(flags, ...) \ vma_flags_test_all_mask(flags, mk_vma_flags(__VA_ARGS__)) +/* + * Helper to test that a flag mask of type vma_flags_t has a SINGLE flag set + * (returning false if flagmask has no flags set). + * + * This is defined to make the semantics clearer when testing an optionally + * defined VMA flags mask, e.g.: + * + * if (vma_flags_test_single_mask(&flags, VMA_DROPPABLE)) { ... } + * + * When VMA_DROPPABLE is defined if available, or set to EMPTY_VMA_FLAGS + * otherwise. + */ +static __always_inline bool vma_flags_test_single_mask(const vma_flags_t *flags, + vma_flags_t flagmask) +{ + VM_WARN_ON_ONCE(vma_flags_count(&flagmask) > 1); + + return vma_flags_test_any_mask(flags, flagmask); +} + /* Set each of the to_set flags in flags, non-atomically. */ static __always_inline void vma_flags_set_mask(vma_flags_t *flags, vma_flags_t to_set) @@ -1281,6 +1309,24 @@ static __always_inline bool vma_test_all_mask(const struct vm_area_struct *vma, #define vma_test_all(vma, ...) \ vma_test_all_mask(vma, mk_vma_flags(__VA_ARGS__)) +/* + * Helper to test that a flag mask of type vma_flags_t has a SINGLE flag set + * (returning false if flagmask has no flags set). + * + * This is useful when a flag needs to be either defined or not depending upon + * kernel configuration, e.g.: + * + * if (vma_test_single_mask(vma, VMA_DROPPABLE)) { ... } + * + * When VMA_DROPPABLE is defined if available, or set to EMPTY_VMA_FLAGS + * otherwise. + */ +static __always_inline bool +vma_test_single_mask(const struct vm_area_struct *vma, vma_flags_t flagmask) +{ + return vma_flags_test_single_mask(&vma->flags, flagmask); +} + /* * Helper to set all VMA flags in a VMA. * -- cgit v1.2.3 From 3a6455d56bd7c4cfb1ea35ddae052943065e338e Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:34 +0000 Subject: mm: convert do_brk_flags() to use vma_flags_t In order to be able to do this, we need to change VM_DATA_DEFAULT_FLAGS and friends and update the architecture-specific definitions also. We then have to update some KSM logic to handle VMA flags, and introduce VMA_STACK_FLAGS to define the vma_flags_t equivalent of VM_STACK_FLAGS. We also introduce two helper functions for use during the time we are converting legacy flags to vma_flags_t values - vma_flags_to_legacy() and legacy_to_vma_flags(). This enables us to iteratively make changes to break these changes up into separate parts. We use these explicitly here to keep VM_STACK_FLAGS around for certain users which need to maintain the legacy vm_flags_t values for the time being. We are no longer able to rely on the simple VM_xxx being set to zero if the feature is not enabled, so in the case of VM_DROPPABLE we introduce VMA_DROPPABLE as the vma_flags_t equivalent, which is set to EMPTY_VMA_FLAGS if the droppable flag is not available. While we're here, we make the description of do_brk_flags() into a kdoc comment, as it almost was already. We use vma_flags_to_legacy() to not need to update the vm_get_page_prot() logic as this time. Note that in create_init_stack_vma() we have to replace the BUILD_BUG_ON() with a VM_WARN_ON_ONCE() as the tested values are no longer build time available. We also update mprotect_fixup() to use VMA flags where possible, though we have to live with a little duplication between vm_flags_t and vma_flags_t values for the time being until further conversions are made. While we're here, update VM_SPECIAL to be defined in terms of VMA_SPECIAL_FLAGS now we have vma_flags_to_legacy(). Finally, we update the VMA tests to reflect these changes. Link: https://lkml.kernel.org/r/d02e3e45d9a33d7904b149f5604904089fd640ae.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Paul Moore [SELinux] Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/ksm.h | 10 +++++----- include/linux/mm.h | 49 ++++++++++++++++++++++++++++++------------------- 2 files changed, 35 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index c982694c987b..d39d0d5483a2 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -17,8 +17,8 @@ #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, vm_flags_t *vm_flags); -vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, - vm_flags_t vm_flags); +vma_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, + vma_flags_t vma_flags); int ksm_enable_merge_any(struct mm_struct *mm); int ksm_disable_merge_any(struct mm_struct *mm); int ksm_disable(struct mm_struct *mm); @@ -103,10 +103,10 @@ bool ksm_process_mergeable(struct mm_struct *mm); #else /* !CONFIG_KSM */ -static inline vm_flags_t ksm_vma_flags(struct mm_struct *mm, - const struct file *file, vm_flags_t vm_flags) +static inline vma_flags_t ksm_vma_flags(struct mm_struct *mm, + const struct file *file, vma_flags_t vma_flags) { - return vm_flags; + return vma_flags; } static inline int ksm_disable(struct mm_struct *mm) diff --git a/include/linux/mm.h b/include/linux/mm.h index de72382efac2..4042a584671e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -346,9 +346,9 @@ enum { * if KVM does not lock down the memory type. */ DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39), -#ifdef CONFIG_PPC32 +#if defined(CONFIG_PPC32) DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1), -#else +#elif defined(CONFIG_64BIT) DECLARE_VMA_BIT(DROPPABLE, 40), #endif DECLARE_VMA_BIT(UFFD_MINOR, 41), @@ -503,31 +503,42 @@ enum { #endif #if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) #define VM_DROPPABLE INIT_VM_FLAG(DROPPABLE) +#define VMA_DROPPABLE mk_vma_flags(VMA_DROPPABLE_BIT) #else #define VM_DROPPABLE VM_NONE +#define VMA_DROPPABLE EMPTY_VMA_FLAGS #endif /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) -#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) +#define TASK_EXEC_BIT ((current->personality & READ_IMPLIES_EXEC) ? \ + VMA_EXEC_BIT : VMA_READ_BIT) /* Common data flag combinations */ -#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -#define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \ - VM_MAYWRITE | VM_MAYEXEC) -#define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) - -#ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */ -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC +#define VMA_DATA_FLAGS_TSK_EXEC mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, \ + TASK_EXEC_BIT, VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT, \ + VMA_MAYEXEC_BIT) +#define VMA_DATA_FLAGS_NON_EXEC mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, \ + VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT, VMA_MAYEXEC_BIT) +#define VMA_DATA_FLAGS_EXEC mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, \ + VMA_EXEC_BIT, VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT, \ + VMA_MAYEXEC_BIT) + +#ifndef VMA_DATA_DEFAULT_FLAGS /* arch can override this */ +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_EXEC #endif -#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ -#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS +#ifndef VMA_STACK_DEFAULT_FLAGS /* arch can override this */ +#define VMA_STACK_DEFAULT_FLAGS VMA_DATA_DEFAULT_FLAGS #endif +#define VMA_STACK_FLAGS append_vma_flags(VMA_STACK_DEFAULT_FLAGS, \ + VMA_STACK_BIT, VMA_ACCOUNT_BIT) + +/* Temporary until VMA flags conversion complete. */ +#define VM_STACK_FLAGS vma_flags_to_legacy(VMA_STACK_FLAGS) + #define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) #ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS @@ -536,8 +547,6 @@ enum { #define VM_SEALED_SYSMAP VM_NONE #endif -#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) - /* VMA basic access permission flags */ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) #define VMA_ACCESS_FLAGS mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT) @@ -545,7 +554,10 @@ enum { /* * Special vmas that are non-mergable, non-mlock()able. */ -#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) + +#define VMA_SPECIAL_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_DONTEXPAND_BIT, \ + VMA_PFNMAP_BIT, VMA_MIXEDMAP_BIT) +#define VM_SPECIAL vma_flags_to_legacy(VMA_SPECIAL_FLAGS) /* * Physically remapped pages are special. Tell the @@ -1407,7 +1419,7 @@ static __always_inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, * vm_area_desc object describing a proposed VMA, e.g.: * * vma_desc_set_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT, - * VMA_DONTDUMP_BIT); + * VMA_DONTDUMP_BIT); */ #define vma_desc_set_flags(desc, ...) \ vma_desc_set_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) @@ -4045,7 +4057,6 @@ extern int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); extern struct file *get_task_exe_file(struct task_struct *task); -extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages); extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages); extern bool vma_is_special_mapping(const struct vm_area_struct *vma, -- cgit v1.2.3 From d720b81d01b137dfc23e07461b05b76f822af6ab Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:36 +0000 Subject: mm/vma: introduce vma_clear_flags[_mask]() Introduce a helper function and helper macro to easily clear a VMA's flags using the new vma_flags_t vma->flags field: * vma_clear_flags_mask() - Clears all of the flags in a specified mask in the VMA's flags field. * vma_clear_flags() - Clears all of the specified individual VMA flag bits in a VMA's flags field. Also update the VMA tests to reflect the change. Link: https://lkml.kernel.org/r/9bd15da35c2c90e7441265adf01b5c2d3b5c6d41.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4042a584671e..6b614f8af045 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1363,6 +1363,22 @@ static __always_inline void vma_set_flags_mask(struct vm_area_struct *vma, #define vma_set_flags(vma, ...) \ vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) +/* Helper to clear all VMA flags in a VMA. */ +static __always_inline void vma_clear_flags_mask(struct vm_area_struct *vma, + vma_flags_t flags) +{ + vma_flags_clear_mask(&vma->flags, flags); +} + +/* + * Helper macro for clearing VMA flags, e.g.: + * + * vma_clear_flags(vma, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT, + * VMA_DONTDUMP_BIT); + */ +#define vma_clear_flags(vma, ...) \ + vma_clear_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) + /* * Test whether a specific VMA flag is set in a VMA descriptor, e.g.: * -- cgit v1.2.3 From 769669bd9ca4cbae2562d57fe753efdcf17a196d Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:38 +0000 Subject: mm/vma: convert as much as we can in mm/vma.c to vma_flags_t Now we have established a good foundation for vm_flags_t to vma_flags_t changes, update mm/vma.c to utilise vma_flags_t wherever possible. We are able to convert VM_STARTGAP_FLAGS entirely as this is only used in mm/vma.c, and to account for the fact we can't use VM_NONE to make life easier, place the definition of this within existing #ifdef's to be cleaner. Generally the remaining changes are mechanical. Also update the VMA tests to reflect the changes. Link: https://lkml.kernel.org/r/5fdeaf8af9a12c2a5d68497495f52fa627d05a5b.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6b614f8af045..c6b40dc88918 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -463,8 +463,10 @@ enum { #if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS) || \ defined(CONFIG_RISCV_USER_CFI) #define VM_SHADOW_STACK INIT_VM_FLAG(SHADOW_STACK) +#define VMA_STARTGAP_FLAGS mk_vma_flags(VMA_GROWSDOWN_BIT, VMA_SHADOW_STACK_BIT) #else #define VM_SHADOW_STACK VM_NONE +#define VMA_STARTGAP_FLAGS mk_vma_flags(VMA_GROWSDOWN_BIT) #endif #if defined(CONFIG_PPC64) #define VM_SAO INIT_VM_FLAG(SAO) @@ -539,8 +541,6 @@ enum { /* Temporary until VMA flags conversion complete. */ #define VM_STACK_FLAGS vma_flags_to_legacy(VMA_STACK_FLAGS) -#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) - #ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS #define VM_SEALED_SYSMAP VM_SEALED #else @@ -584,6 +584,8 @@ enum { /* This mask represents all the VMA flag bits used by mlock */ #define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) +#define VMA_LOCKED_MASK mk_vma_flags(VMA_LOCKED_BIT, VMA_LOCKONFAULT_BIT) + /* These flags can be updated atomically via VMA/mmap read lock. */ #define VM_ATOMIC_SET_ALLOWED VM_MAYBE_GUARD -- cgit v1.2.3 From a06eb2f8279e0b2b42799d42041f144377f5a086 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:40 +0000 Subject: mm/vma: convert vma_modify_flags[_uffd]() to use vma_flags_t Update the vma_modify_flags() and vma_modify_flags_uffd() functions to accept a vma_flags_t parameter rather than a vm_flags_t one, and propagate the changes as needed to implement this change. Also add vma_flags_reset_once() in replacement of vm_flags_reset_once(). We still need to be careful here because we need to avoid tearing, so maintain the assumption that the first system word set of flags are the only ones that require protection from tearing, and retain this functionality. We can copy the remainder of VMA flags above 64 bits normally. But hopefully by the time that happens, we will have replaced the logic that requires these WRITE_ONCE()'s with something else. We also replace instances of vm_flags_reset() with a simple write of VMA flags. We are no longer perform a number of checks, most notable of all the VMA flags asserts becase: 1. We might be operating on a VMA that is not yet added to the tree. 2. We might be operating on a VMA that is now detached. 3. Really in all but core code, you should be using vma_desc_xxx(). 4. Other VMA fields are manipulated with no such checks. 5. It'd be egregious to have to add variants of flag functions just to account for cases such as the above, especially when we don't do so for other VMA fields. Drivers are the problematic cases and why it was especially important (and also for debug as VMA locks were introduced), the mmap_prepare work is solving this generally. Additionally, we can fairly safely assume by this point the soft dirty flags are being set correctly, so it's reasonable to drop this also. Finally, update the VMA tests to reflect this. Link: https://lkml.kernel.org/r/51afbb2b8c3681003cc7926647e37335d793836e.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 22 ++++++++++------------ include/linux/userfaultfd_k.h | 3 +++ 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index c6b40dc88918..72bc5016094b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -954,22 +954,20 @@ static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_init(vma, flags); } -static inline void vm_flags_reset_once(struct vm_area_struct *vma, - vm_flags_t flags) +static inline void vma_flags_reset_once(struct vm_area_struct *vma, + vma_flags_t *flags) { - vma_assert_write_locked(vma); - /* - * If VMA flags exist beyond the first system word, also clear these. It - * is assumed the write once behaviour is required only for the first - * system word. - */ + const unsigned long word = flags->__vma_flags[0]; + + /* It is assumed only the first system word must be written once. */ + vma_flags_overwrite_word_once(&vma->flags, word); + /* The remainder can be copied normally. */ if (NUM_VMA_FLAG_BITS > BITS_PER_LONG) { - unsigned long *bitmap = vma->flags.__vma_flags; + unsigned long *dst = &vma->flags.__vma_flags[1]; + const unsigned long *src = &flags->__vma_flags[1]; - bitmap_zero(&bitmap[1], NUM_VMA_FLAG_BITS - BITS_PER_LONG); + bitmap_copy(dst, src, NUM_VMA_FLAG_BITS - BITS_PER_LONG); } - - vma_flags_overwrite_word_once(&vma->flags, flags); } static inline void vm_flags_set(struct vm_area_struct *vma, diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index fd5f42765497..d83e349900a3 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -23,6 +23,9 @@ /* The set of all possible UFFD-related VM flags. */ #define __VM_UFFD_FLAGS (VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR) +#define __VMA_UFFD_FLAGS mk_vma_flags(VMA_UFFD_MISSING_BIT, VMA_UFFD_WP_BIT, \ + VMA_UFFD_MINOR_BIT) + /* * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining * new flags, since they might collide with O_* ones. We want -- cgit v1.2.3 From 90cb921c4d7bf92854344d3e76561f48784c613e Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:41 +0000 Subject: mm/vma: convert __mmap_region() to use vma_flags_t Update the mmap() implementation logic implemented in __mmap_region() and functions invoked by it. The mmap_region() function converts its input vm_flags_t parameter to a vma_flags_t value which it then passes to __mmap_region() which uses the vma_flags_t value consistently from then on. As part of the change, we convert map_deny_write_exec() to using vma_flags_t (it was incorrectly using unsigned long before), and place it in vma.h, as it is only used internal to mm. With this change, we eliminate the legacy is_shared_maywrite_vm_flags() helper function which is now no longer required. We are also able to update the MMAP_STATE() and VMG_MMAP_STATE() macros to use the vma_flags_t value. Finally, we update the VMA tests to reflect the change. Link: https://lkml.kernel.org/r/1fc33a404c962f02da778da100387cc19bd62153.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 18 ++++++++++++------ include/linux/mman.h | 49 ------------------------------------------------- 2 files changed, 12 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 72bc5016094b..9472b3c9a22b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1522,12 +1522,6 @@ static inline bool vma_is_accessible(const struct vm_area_struct *vma) return vma->vm_flags & VM_ACCESS_FLAGS; } -static inline bool is_shared_maywrite_vm_flags(vm_flags_t vm_flags) -{ - return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == - (VM_SHARED | VM_MAYWRITE); -} - static inline bool is_shared_maywrite(const vma_flags_t *flags) { return vma_flags_test_all(flags, VMA_SHARED_BIT, VMA_MAYWRITE_BIT); @@ -4335,12 +4329,24 @@ static inline bool range_in_vma(const struct vm_area_struct *vma, #ifdef CONFIG_MMU pgprot_t vm_get_page_prot(vm_flags_t vm_flags); + +static inline pgprot_t vma_get_page_prot(vma_flags_t vma_flags) +{ + const vm_flags_t vm_flags = vma_flags_to_legacy(vma_flags); + + return vm_get_page_prot(vm_flags); +} + void vma_set_page_prot(struct vm_area_struct *vma); #else static inline pgprot_t vm_get_page_prot(vm_flags_t vm_flags) { return __pgprot(0); } +static inline pgprot_t vma_get_page_prot(vma_flags_t vma_flags) +{ + return __pgprot(0); +} static inline void vma_set_page_prot(struct vm_area_struct *vma) { vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); diff --git a/include/linux/mman.h b/include/linux/mman.h index 0ba8a7e8b90a..389521594c69 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -170,53 +170,4 @@ static inline bool arch_memory_deny_write_exec_supported(void) } #define arch_memory_deny_write_exec_supported arch_memory_deny_write_exec_supported #endif - -/* - * Denies creating a writable executable mapping or gaining executable permissions. - * - * This denies the following: - * - * a) mmap(PROT_WRITE | PROT_EXEC) - * - * b) mmap(PROT_WRITE) - * mprotect(PROT_EXEC) - * - * c) mmap(PROT_WRITE) - * mprotect(PROT_READ) - * mprotect(PROT_EXEC) - * - * But allows the following: - * - * d) mmap(PROT_READ | PROT_EXEC) - * mmap(PROT_READ | PROT_EXEC | PROT_BTI) - * - * This is only applicable if the user has set the Memory-Deny-Write-Execute - * (MDWE) protection mask for the current process. - * - * @old specifies the VMA flags the VMA originally possessed, and @new the ones - * we propose to set. - * - * Return: false if proposed change is OK, true if not ok and should be denied. - */ -static inline bool map_deny_write_exec(unsigned long old, unsigned long new) -{ - /* If MDWE is disabled, we have nothing to deny. */ - if (!mm_flags_test(MMF_HAS_MDWE, current->mm)) - return false; - - /* If the new VMA is not executable, we have nothing to deny. */ - if (!(new & VM_EXEC)) - return false; - - /* Under MDWE we do not accept newly writably executable VMAs... */ - if (new & VM_WRITE) - return true; - - /* ...nor previously non-executable VMAs becoming executable. */ - if (!(old & VM_EXEC)) - return true; - - return false; -} - #endif /* _LINUX_MMAN_H */ -- cgit v1.2.3 From 3e4bb2706817710d9461394da8b75be79981586b Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:27 +0000 Subject: mm: various small mmap_prepare cleanups Patch series "mm: expand mmap_prepare functionality and usage", v4. This series expands the mmap_prepare functionality, which is intended to replace the deprecated f_op->mmap hook which has been the source of bugs and security issues for some time. This series starts with some cleanup of existing mmap_prepare logic, then adds documentation for the mmap_prepare call to make it easier for filesystem and driver writers to understand how it works. It then importantly adds a vm_ops->mapped hook, a key feature that was missing from mmap_prepare previously - this is invoked when a driver which specifies mmap_prepare has successfully been mapped but not merged with another VMA. mmap_prepare is invoked prior to a merge being attempted, so you cannot manipulate state such as reference counts as if it were a new mapping. The vm_ops->mapped hook allows a driver to perform tasks required at this stage, and provides symmetry against subsequent vm_ops->open,close calls. The series uses this to correct the afs implementation which wrongly manipulated reference count at mmap_prepare time. It then adds an mmap_prepare equivalent of vm_iomap_memory() - mmap_action_simple_ioremap(), then uses this to update a number of drivers. It then splits out the mmap_prepare compatibility layer (which allows for invocation of mmap_prepare hooks in an mmap() hook) in such a way as to allow for more incremental implementation of mmap_prepare hooks. It then uses this to extend mmap_prepare usage in drivers. Finally it adds an mmap_prepare equivalent of vm_map_pages(), which lays the foundation for future work which will extend mmap_prepare to DMA coherent mappings. This patch (of 21): Rather than passing arbitrary fields, pass a vm_area_desc pointer to mmap prepare functions to mmap prepare, and an action and vma pointer to mmap complete in order to put all the action-specific logic in the function actually doing the work. Additionally, allow mmap prepare functions to return an error so we can error out as soon as possible if there is something logically incorrect in the input. Update remap_pfn_range_prepare() to properly check the input range for the CoW case. Also remove io_remap_pfn_range_complete(), as we can simply set up the fields correctly in io_remap_pfn_range_prepare() and use remap_pfn_range_complete() for this. While we're here, make remap_pfn_range_prepare_vma() a little neater, and pass mmap_action directly to call_action_complete(). Then, update compat_vma_mmap() to perform its logic directly, as __compat_vma_map() is not used by anything so we don't need to export it. Also update compat_vma_mmap() to use vfs_mmap_prepare() rather than calling the mmap_prepare op directly. Finally, update the VMA userland tests to reflect the changes. Link: https://lkml.kernel.org/r/cover.1774045440.git.ljs@kernel.org Link: https://lkml.kernel.org/r/99f408e4694f44ab12bdc55fe0bd9685d3bd1117.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- include/linux/fs.h | 2 -- include/linux/mm.h | 7 +++---- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 8b3dd145b25e..a2628a12bd2b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2058,8 +2058,6 @@ static inline bool can_mmap_file(struct file *file) return true; } -int __compat_vma_mmap(const struct file_operations *f_op, - struct file *file, struct vm_area_struct *vma); int compat_vma_mmap(struct file *file, struct vm_area_struct *vma); static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/include/linux/mm.h b/include/linux/mm.h index 9472b3c9a22b..6ca2fc5ae83f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4304,10 +4304,9 @@ static inline void mmap_action_ioremap_full(struct vm_area_desc *desc, mmap_action_ioremap(desc, desc->start, start_pfn, vma_desc_size(desc)); } -void mmap_action_prepare(struct mmap_action *action, - struct vm_area_desc *desc); -int mmap_action_complete(struct mmap_action *action, - struct vm_area_struct *vma); +int mmap_action_prepare(struct vm_area_desc *desc); +int mmap_action_complete(struct vm_area_struct *vma, + struct mmap_action *action); /* Look up the first VMA which exactly match the interval vm_start ... vm_end */ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, -- cgit v1.2.3 From 827e97cf4bf59e9a72bcec37944bcebb3139a457 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:29 +0000 Subject: mm: document vm_operations_struct->open the same as close() Describe when the operation is invoked and the context in which it is invoked, matching the description already added for vm_op->close(). While we're here, update all outdated references to an 'area' field for VMAs to the more consistent 'vma'. Link: https://lkml.kernel.org/r/7d0ca833c12014320f0fa00f816f95e6e10076f2.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- include/linux/mm.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6ca2fc5ae83f..21a2eef5f8fe 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -764,15 +764,20 @@ struct vm_fault { * to the functions called when a no-page or a wp-page exception occurs. */ struct vm_operations_struct { - void (*open)(struct vm_area_struct * area); + /** + * @open: Called when a VMA is remapped, split or forked. Not called + * upon first mapping a VMA. + * Context: User context. May sleep. Caller holds mmap_lock. + */ + void (*open)(struct vm_area_struct *vma); /** * @close: Called when the VMA is being removed from the MM. * Context: User context. May sleep. Caller holds mmap_lock. */ - void (*close)(struct vm_area_struct * area); + void (*close)(struct vm_area_struct *vma); /* Called any time before splitting to check if it's allowed */ - int (*may_split)(struct vm_area_struct *area, unsigned long addr); - int (*mremap)(struct vm_area_struct *area); + int (*may_split)(struct vm_area_struct *vma, unsigned long addr); + int (*mremap)(struct vm_area_struct *vma); /* * Called by mprotect() to make driver-specific permission * checks before mprotect() is finalised. The VMA must not @@ -784,7 +789,7 @@ struct vm_operations_struct { vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order); vm_fault_t (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); - unsigned long (*pagesize)(struct vm_area_struct * area); + unsigned long (*pagesize)(struct vm_area_struct *vma); /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ -- cgit v1.2.3 From c50ca15dd4962bdf834945c2fa29b904042f366a Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:34 +0000 Subject: mm: add vm_ops->mapped hook Previously, when a driver needed to do something like establish a reference count, it could do so in the mmap hook in the knowledge that the mapping would succeed. With the introduction of f_op->mmap_prepare this is no longer the case, as it is invoked prior to actually establishing the mapping. mmap_prepare is not appropriate for this kind of thing as it is called before any merge might take place, and after which an error might occur meaning resources could be leaked. To take this into account, introduce a new vm_ops->mapped callback which is invoked when the VMA is first mapped (though notably - not when it is merged - which is correct and mirrors existing mmap/open/close behaviour). We do better that vm_ops->open() here, as this callback can return an error, at which point the VMA will be unmapped. Note that vm_ops->mapped() is invoked after any mmap action is complete (such as I/O remapping). We intentionally do not expose the VMA at this point, exposing only the fields that could be used, and an output parameter in case the operation needs to update the vma->vm_private_data field. In order to deal with stacked filesystems which invoke inner filesystem's mmap() invocations, add __compat_vma_mapped() and invoke it on vfs_mmap() (via compat_vma_mmap()) to ensure that the mapped callback is handled when an mmap() caller invokes a nested filesystem's mmap_prepare() callback. Update the mmap_prepare documentation to describe the mapped hook and make it clear what its intended use is. The vm_ops->mapped() call is handled by the mmap complete logic to ensure the same code paths are handled by both the compatibility and VMA layers. Additionally, update VMA userland test headers to reflect the change. Link: https://lkml.kernel.org/r/4c5e98297eb0aae9565c564e1c296a112702f144.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- include/linux/fs.h | 9 ++++++++- include/linux/mm.h | 17 +++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index a2628a12bd2b..c390f5c667e3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2059,13 +2059,20 @@ static inline bool can_mmap_file(struct file *file) } int compat_vma_mmap(struct file *file, struct vm_area_struct *vma); +int __vma_check_mmap_hook(struct vm_area_struct *vma); static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) { + int err; + if (file->f_op->mmap_prepare) return compat_vma_mmap(file, vma); - return file->f_op->mmap(file, vma); + err = file->f_op->mmap(file, vma); + if (err) + return err; + + return __vma_check_mmap_hook(vma); } static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc) diff --git a/include/linux/mm.h b/include/linux/mm.h index 21a2eef5f8fe..81fbcfed44dd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -775,6 +775,23 @@ struct vm_operations_struct { * Context: User context. May sleep. Caller holds mmap_lock. */ void (*close)(struct vm_area_struct *vma); + /** + * @mapped: Called when the VMA is first mapped in the MM. Not called if + * the new VMA is merged with an adjacent VMA. + * + * The @vm_private_data field is an output field allowing the user to + * modify vma->vm_private_data as necessary. + * + * ONLY valid if set from f_op->mmap_prepare. Will result in an error if + * set from f_op->mmap. + * + * Returns %0 on success, or an error otherwise. On error, the VMA will + * be unmapped. + * + * Context: User context. May sleep. Caller holds mmap_lock. + */ + int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff, + const struct file *file, void **vm_private_data); /* Called any time before splitting to check if it's allowed */ int (*may_split)(struct vm_area_struct *vma, unsigned long addr); int (*mremap)(struct vm_area_struct *vma); -- cgit v1.2.3 From a1b7fb40cb71a33c68a609fcee0946425d698415 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:37 +0000 Subject: mm: add mmap_action_simple_ioremap() Currently drivers use vm_iomap_memory() as a simple helper function for I/O remapping memory over a range starting at a specified physical address over a specified length. In order to utilise this from mmap_prepare, separate out the core logic into __simple_ioremap_prep(), update vm_iomap_memory() to use it, and add simple_ioremap_prepare() to do the same with a VMA descriptor object. We also add MMAP_SIMPLE_IO_REMAP and relevant fields to the struct mmap_action type to permit this operation also. We use mmap_action_ioremap() to set up the actual I/O remap operation once we have checked and figured out the parameters, which makes simple_ioremap_prepare() easy to implement. We then add mmap_action_simple_ioremap() to allow drivers to make use of this mode. We update the mmap_prepare documentation to describe this mode. Finally, we update the VMA tests to reflect this change. Link: https://lkml.kernel.org/r/a08ef1c4542202684da63bb37f459d5dbbeddd91.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Suren Baghdasaryan Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- include/linux/mm.h | 24 +++++++++++++++++++++++- include/linux/mm_types.h | 6 +++++- 2 files changed, 28 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 81fbcfed44dd..53b21de40f87 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4321,11 +4321,33 @@ static inline void mmap_action_ioremap(struct vm_area_desc *desc, * @start_pfn: The first PFN in the range to remap. */ static inline void mmap_action_ioremap_full(struct vm_area_desc *desc, - unsigned long start_pfn) + unsigned long start_pfn) { mmap_action_ioremap(desc, desc->start, start_pfn, vma_desc_size(desc)); } +/** + * mmap_action_simple_ioremap - helper for mmap_prepare hook to specify that the + * physical range in [start_phys_addr, start_phys_addr + size) should be I/O + * remapped. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start_phys_addr: Start of the physical memory to be mapped. + * @size: Size of the area to map. + * + * NOTE: Some drivers might want to tweak desc->page_prot for purposes of + * write-combine or similar. + */ +static inline void mmap_action_simple_ioremap(struct vm_area_desc *desc, + phys_addr_t start_phys_addr, + unsigned long size) +{ + struct mmap_action *action = &desc->action; + + action->simple_ioremap.start_phys_addr = start_phys_addr; + action->simple_ioremap.size = size; + action->type = MMAP_SIMPLE_IO_REMAP; +} + int mmap_action_prepare(struct vm_area_desc *desc); int mmap_action_complete(struct vm_area_struct *vma, struct mmap_action *action); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 38fe6b915024..91a3db174d78 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -814,6 +814,7 @@ enum mmap_action_type { MMAP_NOTHING, /* Mapping is complete, no further action. */ MMAP_REMAP_PFN, /* Remap PFN range. */ MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ + MMAP_SIMPLE_IO_REMAP, /* I/O remap with guardrails. */ }; /* @@ -822,13 +823,16 @@ enum mmap_action_type { */ struct mmap_action { union { - /* Remap range. */ struct { unsigned long start; unsigned long start_pfn; unsigned long size; pgprot_t pgprot; } remap; + struct { + phys_addr_t start_phys_addr; + unsigned long size; + } simple_ioremap; }; enum mmap_action_type type; -- cgit v1.2.3 From 668937b7b2256f4b2a982e8f69b07d9ee8f81d36 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:43 +0000 Subject: mm: allow handling of stacked mmap_prepare hooks in more drivers While the conversion of mmap hooks to mmap_prepare is underway, we will encounter situations where mmap hooks need to invoke nested mmap_prepare hooks. The nesting of mmap hooks is termed 'stacking'. In order to flexibly facilitate the conversion of custom mmap hooks in drivers which stack, we must split up the existing __compat_vma_mmap() function into two separate functions: * compat_set_desc_from_vma() - This allows the setting of a vm_area_desc object's fields to the relevant fields of a VMA. * __compat_vma_mmap() - Once an mmap_prepare hook has been executed upon a vm_area_desc object, this function performs any mmap actions specified by the mmap_prepare hook and then invokes its vm_ops->mapped() hook if any were specified. In ordinary cases, where a file's f_op->mmap_prepare() hook simply needs to be invoked in a stacked mmap() hook, compat_vma_mmap() can be used. However some drivers define their own nested hooks, which are invoked in turn by another hook. A concrete example is vmbus_channel->mmap_ring_buffer(), which is invoked in turn by bin_attribute->mmap(): vmbus_channel->mmap_ring_buffer() has a signature of: int (*mmap_ring_buffer)(struct vmbus_channel *channel, struct vm_area_struct *vma); And bin_attribute->mmap() has a signature of: int (*mmap)(struct file *, struct kobject *, const struct bin_attribute *attr, struct vm_area_struct *vma); And so compat_vma_mmap() cannot be used here for incremental conversion of hooks from mmap() to mmap_prepare(). There are many such instances like this, where conversion to mmap_prepare would otherwise cascade to a huge change set due to nesting of this kind. The changes in this patch mean we could now instead convert vmbus_channel->mmap_ring_buffer() to vmbus_channel->mmap_prepare_ring_buffer(), and implement something like: struct vm_area_desc desc; int err; compat_set_desc_from_vma(&desc, file, vma); err = channel->mmap_prepare_ring_buffer(channel, &desc); if (err) return err; return __compat_vma_mmap(&desc, vma); Allowing us to incrementally update this logic, and other logic like it. Unfortunately, as part of this change, we need to be able to flexibly assign to the VMA descriptor, so have to remove some of the const declarations within the structure. Also update the VMA tests to reflect the changes. Link: https://lkml.kernel.org/r/24aac3019dd34740e788d169fccbe3c62781e648.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- include/linux/fs.h | 3 +++ include/linux/mm_types.h | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index c390f5c667e3..0bdccfa70b44 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2058,6 +2058,9 @@ static inline bool can_mmap_file(struct file *file) return true; } +void compat_set_desc_from_vma(struct vm_area_desc *desc, const struct file *file, + const struct vm_area_struct *vma); +int __compat_vma_mmap(struct vm_area_desc *desc, struct vm_area_struct *vma); int compat_vma_mmap(struct file *file, struct vm_area_struct *vma); int __vma_check_mmap_hook(struct vm_area_struct *vma); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 91a3db174d78..b702c63bf0e0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -891,8 +891,8 @@ static __always_inline bool vma_flags_empty(const vma_flags_t *flags) */ struct vm_area_desc { /* Immutable state. */ - const struct mm_struct *const mm; - struct file *const file; /* May vary from vm_file in stacked callers. */ + struct mm_struct *mm; + struct file *file; /* May vary from vm_file in stacked callers. */ unsigned long start; unsigned long end; -- cgit v1.2.3 From f98cb7ca4aa44645347771c2c2a9724bc210c49e Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:44 +0000 Subject: drivers: hv: vmbus: replace deprecated mmap hook with mmap_prepare The f_op->mmap interface is deprecated, so update the vmbus driver to use its successor, mmap_prepare. This updates all callbacks which referenced the function pointer hv_mmap_ring_buffer to instead reference hv_mmap_prepare_ring_buffer, utilising the newly introduced compat_set_desc_from_vma() and __compat_vma_mmap() to be able to implement this change. The UIO HV generic driver is the only user of hv_create_ring_sysfs(), which is the only function which references vmbus_channel->mmap_prepare_ring_buffer which, in turn, is the only external interface to hv_mmap_prepare_ring_buffer. This patch therefore updates this caller to use mmap_prepare instead, which also previously used vm_iomap_memory(), so this change replaces it with its mmap_prepare equivalent, mmap_action_simple_ioremap(). [akpm@linux-foundation.org: restore struct vmbus_channel comment, per Michael Kelley] Link: https://lkml.kernel.org/r/05467cb62267d750e5c770147517d4df0246cda6.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Michael Kelley Tested-by: Michael Kelley Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- include/linux/hyperv.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index dfc516c1c719..a26fb8e7cedf 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1015,8 +1015,8 @@ struct vmbus_channel { /* The max size of a packet on this channel */ u32 max_pkt_size; - /* function to mmap ring buffer memory to the channel's sysfs ring attribute */ - int (*mmap_ring_buffer)(struct vmbus_channel *channel, struct vm_area_struct *vma); + /* function to mmap ring buffer memory to the channel's sysfs ring attribute */ + int (*mmap_prepare_ring_buffer)(struct vmbus_channel *channel, struct vm_area_desc *desc); /* boolean to control visibility of sysfs for ring buffer */ bool ring_sysfs_visible; -- cgit v1.2.3 From 933f05f58ac6014eaac387d22a76ace8606891d1 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:45 +0000 Subject: uio: replace deprecated mmap hook with mmap_prepare in uio_info The f_op->mmap interface is deprecated, so update uio_info to use its successor, mmap_prepare. Therefore, replace the uio_info->mmap hook with a new uio_info->mmap_prepare hook, and update its one user, target_core_user, to both specify this new mmap_prepare hook and also to use the new vm_ops->mapped() hook to continue to maintain a correct udev->kref refcount. Then update uio_mmap() to utilise the mmap_prepare compatibility layer to invoke this callback from the uio mmap invocation. Link: https://lkml.kernel.org/r/157583e4477705b496896c7acd4ac88a937b8fa6.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- include/linux/uio_driver.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uio_driver.h b/include/linux/uio_driver.h index 334641e20fb1..02eaac47ac44 100644 --- a/include/linux/uio_driver.h +++ b/include/linux/uio_driver.h @@ -97,7 +97,7 @@ struct uio_device { * @irq_flags: flags for request_irq() * @priv: optional private data * @handler: the device's irq handler - * @mmap: mmap operation for this uio device + * @mmap_prepare: mmap_prepare operation for this uio device * @open: open operation for this uio device * @release: release operation for this uio device * @irqcontrol: disable/enable irqs when 0/1 is written to /dev/uioX @@ -112,7 +112,7 @@ struct uio_info { unsigned long irq_flags; void *priv; irqreturn_t (*handler)(int irq, struct uio_info *dev_info); - int (*mmap)(struct uio_info *info, struct vm_area_struct *vma); + int (*mmap_prepare)(struct uio_info *info, struct vm_area_desc *desc); int (*open)(struct uio_info *info, struct inode *inode); int (*release)(struct uio_info *info, struct inode *inode); int (*irqcontrol)(struct uio_info *info, s32 irq_on); -- cgit v1.2.3 From 62c65fd740e979a3967db08971b93aefcec510d4 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:46 +0000 Subject: mm: add mmap_action_map_kernel_pages[_full]() A user can invoke mmap_action_map_kernel_pages() to specify that the mapping should map kernel pages starting from desc->start of a specified number of pages specified in an array. In order to implement this, adjust mmap_action_prepare() to be able to return an error code, as it makes sense to assert that the specified parameters are valid as quickly as possible as well as updating the VMA flags to include VMA_MIXEDMAP_BIT as necessary. This provides an mmap_prepare equivalent of vm_insert_pages(). We additionally update the existing vm_insert_pages() code to use range_in_vma() and add a new range_in_vma_desc() helper function for the mmap_prepare case, sharing the code between the two in range_is_subset(). We add both mmap_action_map_kernel_pages() and mmap_action_map_kernel_pages_full() to allow for both partial and full VMA mappings. We update the documentation to reflect the new features. Finally, we update the VMA tests accordingly to reflect the changes. Link: https://lkml.kernel.org/r/926ac961690d856e67ec847bee2370ab3c6b9046.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Suren Baghdasaryan Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- include/linux/mm.h | 95 +++++++++++++++++++++++++++++++++++++++++++++++- include/linux/mm_types.h | 7 ++++ 2 files changed, 100 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 53b21de40f87..61dff7f03554 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2905,7 +2905,7 @@ static inline bool folio_maybe_mapped_shared(struct folio *folio) * The caller must add any reference (e.g., from folio_try_get()) it might be * holding itself to the result. * - * Returns the expected folio refcount. + * Returns: the expected folio refcount. */ static inline int folio_expected_ref_count(const struct folio *folio) { @@ -4348,6 +4348,45 @@ static inline void mmap_action_simple_ioremap(struct vm_area_desc *desc, action->type = MMAP_SIMPLE_IO_REMAP; } +/** + * mmap_action_map_kernel_pages - helper for mmap_prepare hook to specify that + * @num kernel pages contained in the @pages array should be mapped to userland + * starting at virtual address @start. + * @desc: The VMA descriptor for the VMA requiring kernel pags to be mapped. + * @start: The virtual address from which to map them. + * @pages: An array of struct page pointers describing the memory to map. + * @nr_pages: The number of entries in the @pages aray. + */ +static inline void mmap_action_map_kernel_pages(struct vm_area_desc *desc, + unsigned long start, struct page **pages, + unsigned long nr_pages) +{ + struct mmap_action *action = &desc->action; + + action->type = MMAP_MAP_KERNEL_PAGES; + action->map_kernel.start = start; + action->map_kernel.pages = pages; + action->map_kernel.nr_pages = nr_pages; + action->map_kernel.pgoff = desc->pgoff; +} + +/** + * mmap_action_map_kernel_pages_full - helper for mmap_prepare hook to specify that + * kernel pages contained in the @pages array should be mapped to userland + * from @desc->start to @desc->end. + * @desc: The VMA descriptor for the VMA requiring kernel pags to be mapped. + * @pages: An array of struct page pointers describing the memory to map. + * + * The caller must ensure that @pages contains sufficient entries to cover the + * entire range described by @desc. + */ +static inline void mmap_action_map_kernel_pages_full(struct vm_area_desc *desc, + struct page **pages) +{ + mmap_action_map_kernel_pages(desc, desc->start, pages, + vma_desc_pages(desc)); +} + int mmap_action_prepare(struct vm_area_desc *desc); int mmap_action_complete(struct vm_area_struct *vma, struct mmap_action *action); @@ -4364,10 +4403,59 @@ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, return vma; } +/** + * range_is_subset - Is the specified inner range a subset of the outer range? + * @outer_start: The start of the outer range. + * @outer_end: The exclusive end of the outer range. + * @inner_start: The start of the inner range. + * @inner_end: The exclusive end of the inner range. + * + * Returns: %true if [inner_start, inner_end) is a subset of [outer_start, + * outer_end), otherwise %false. + */ +static inline bool range_is_subset(unsigned long outer_start, + unsigned long outer_end, + unsigned long inner_start, + unsigned long inner_end) +{ + return outer_start <= inner_start && inner_end <= outer_end; +} + +/** + * range_in_vma - is the specified [@start, @end) range a subset of the VMA? + * @vma: The VMA against which we want to check [@start, @end). + * @start: The start of the range we wish to check. + * @end: The exclusive end of the range we wish to check. + * + * Returns: %true if [@start, @end) is a subset of [@vma->vm_start, + * @vma->vm_end), %false otherwise. + */ static inline bool range_in_vma(const struct vm_area_struct *vma, unsigned long start, unsigned long end) { - return (vma && vma->vm_start <= start && end <= vma->vm_end); + if (!vma) + return false; + + return range_is_subset(vma->vm_start, vma->vm_end, start, end); +} + +/** + * range_in_vma_desc - is the specified [@start, @end) range a subset of the VMA + * described by @desc, a VMA descriptor? + * @desc: The VMA descriptor against which we want to check [@start, @end). + * @start: The start of the range we wish to check. + * @end: The exclusive end of the range we wish to check. + * + * Returns: %true if [@start, @end) is a subset of [@desc->start, @desc->end), + * %false otherwise. + */ +static inline bool range_in_vma_desc(const struct vm_area_desc *desc, + unsigned long start, unsigned long end) +{ + if (!desc) + return false; + + return range_is_subset(desc->start, desc->end, start, end); } #ifdef CONFIG_MMU @@ -4411,6 +4499,9 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num); +int map_kernel_pages_prepare(struct vm_area_desc *desc); +int map_kernel_pages_complete(struct vm_area_struct *vma, + struct mmap_action *action); int vm_map_pages(struct vm_area_struct *vma, struct page **pages, unsigned long num); int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index b702c63bf0e0..a308e2c23b82 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -815,6 +815,7 @@ enum mmap_action_type { MMAP_REMAP_PFN, /* Remap PFN range. */ MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ MMAP_SIMPLE_IO_REMAP, /* I/O remap with guardrails. */ + MMAP_MAP_KERNEL_PAGES, /* Map kernel page range from array. */ }; /* @@ -833,6 +834,12 @@ struct mmap_action { phys_addr_t start_phys_addr; unsigned long size; } simple_ioremap; + struct { + unsigned long start; + struct page **pages; + unsigned long nr_pages; + pgoff_t pgoff; + } map_kernel; }; enum mmap_action_type type; -- cgit v1.2.3 From c0ea52c18c78c33c68c350eb9d3dcdf8c513254d Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 18:07:18 +0000 Subject: mm/huge_memory: simplify vma_is_specal_huge() Patch series "mm/huge_memory: refactor zap_huge_pmd()", v3. zap_huge_pmd() is overly complicated, clean it up and also add an assert in the case that we encounter a buggy PMD entry that doesn't match expectations. This is motivated by a bug discovered [0] where the PMD entry was none of: * A non-DAX, PFN or mixed map. * The huge zero folio * A present PMD entry * A softleaf entry In zap_huge_pmd(), but due to the bug we manged to reach this code. It is useful to explicitly call this out rather than have an arbitrary NULL pointer dereference happen, which also improves understanding of what's going on. The series goes further to make use of vm_normal_folio_pmd() rather than implementing custom logic for retrieving the folio, and extends softleaf functionality to provide and use an equivalent softleaf function. This patch (of 13): This function is confused - it overloads the term 'special' yet again, checks for DAX but in many cases the code explicitly excludes DAX before invoking the predicate. It also unnecessarily checks for vma->vm_file - this has to be present for a driver to have set VMA_MIXEDMAP_BIT or VMA_PFNMAP_BIT. In fact, a far simpler form of this is to reverse the DAX predicate and return false if DAX is set. This makes sense from the point of view of 'special' as in vm_normal_page(), as DAX actually does potentially have retrievable folios. Also there's no need to have this in mm.h so move it to huge_memory.c. No functional change intended. Link: https://lkml.kernel.org/r/cover.1774029655.git.ljs@kernel.org Link: https://lkml.kernel.org/r/d2b65883dc4895f197c4b4a69fbf27a063463412.1774029655.git.ljs@kernel.org Link: https://lore.kernel.org/all/6b3d7ad7-49e1-407a-903d-3103704160d8@lucifer.local/ [0] Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Suren Baghdasaryan Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Zi Yan Cc: Qi Zheng Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 4 ++-- include/linux/mm.h | 16 ---------------- 2 files changed, 2 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index bd7f0e1d8094..61fda1672b29 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -83,7 +83,7 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; * file is never split and the MAX_PAGECACHE_ORDER limit does not apply to * it. Same to PFNMAPs where there's neither page* nor pagecache. */ -#define THP_ORDERS_ALL_SPECIAL \ +#define THP_ORDERS_ALL_SPECIAL_DAX \ (BIT(PMD_ORDER) | BIT(PUD_ORDER)) #define THP_ORDERS_ALL_FILE_DEFAULT \ ((BIT(MAX_PAGECACHE_ORDER + 1) - 1) & ~BIT(0)) @@ -92,7 +92,7 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; * Mask of all large folio orders supported for THP. */ #define THP_ORDERS_ALL \ - (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_SPECIAL | THP_ORDERS_ALL_FILE_DEFAULT) + (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_SPECIAL_DAX | THP_ORDERS_ALL_FILE_DEFAULT) enum tva_type { TVA_SMAPS, /* Exposing "THPeligible:" in smaps. */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 61dff7f03554..8260e28205e9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -5068,22 +5068,6 @@ long copy_folio_from_user(struct folio *dst_folio, const void __user *usr_src, bool allow_pagefault); -/** - * vma_is_special_huge - Are transhuge page-table entries considered special? - * @vma: Pointer to the struct vm_area_struct to consider - * - * Whether transhuge page-table entries are considered "special" following - * the definition in vm_normal_page(). - * - * Return: true if transhuge page-table entries should be considered special, - * false otherwise. - */ -static inline bool vma_is_special_huge(const struct vm_area_struct *vma) -{ - return vma_is_dax(vma) || (vma->vm_file && - (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))); -} - #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ #if MAX_NUMNODES > 1 -- cgit v1.2.3 From b92b9d4f699ce1f0ae746ebc69bca329adc07293 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 18:07:20 +0000 Subject: mm/huge_memory: have zap_huge_pmd return a boolean, add kdoc There's no need to use the ancient approach of returning an integer here, just return a boolean. Also update flush_needed to be a boolean, similarly. Also add a kdoc comment describing the function. No functional change intended. Link: https://lkml.kernel.org/r/132274566cd49d2960a2294c36dd2450593dfc55.1774029655.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Baolin Wang Acked-by: Qi Zheng Reviewed-by: Suren Baghdasaryan Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 61fda1672b29..2949e5acff35 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -27,8 +27,8 @@ static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf); bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long next); -int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr); +bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr); int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr); bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, -- cgit v1.2.3 From 64b7d889d03ce94940d6dd9440c4e74c1108ac78 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 18:07:28 +0000 Subject: mm: add softleaf_is_valid_pmd_entry(), pmd_to_softleaf_folio() Separate pmd_is_valid_softleaf() into separate components, then use the pmd_is_valid_softleaf() predicate to implement pmd_to_softleaf_folio(). This returns the folio associated with a softleaf entry at PMD level. It expects this to be valid for a PMD entry. If CONFIG_DEBUG_VM is set, then assert on this being an invalid entry, and either way return NULL in this case. This lays the ground for further refactorings. Link: https://lkml.kernel.org/r/b677592596274fa3fd701890497948e4b0e07cec.1774029655.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Suren Baghdasaryan Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Qi Zheng Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/leafops.h | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/leafops.h b/include/linux/leafops.h index 05673d3529e7..992cd8bd8ed0 100644 --- a/include/linux/leafops.h +++ b/include/linux/leafops.h @@ -607,7 +607,20 @@ static inline bool pmd_is_migration_entry(pmd_t pmd) } /** - * pmd_is_valid_softleaf() - Is this PMD entry a valid leaf entry? + * softleaf_is_valid_pmd_entry() - Is the specified softleaf entry obtained from + * a PMD one that we support at PMD level? + * @entry: Entry to check. + * Returns: true if the softleaf entry is valid at PMD, otherwise false. + */ +static inline bool softleaf_is_valid_pmd_entry(softleaf_t entry) +{ + /* Only device private, migration entries valid for PMD. */ + return softleaf_is_device_private(entry) || + softleaf_is_migration(entry); +} + +/** + * pmd_is_valid_softleaf() - Is this PMD entry a valid softleaf entry? * @pmd: PMD entry. * * PMD leaf entries are valid only if they are device private or migration @@ -620,9 +633,27 @@ static inline bool pmd_is_valid_softleaf(pmd_t pmd) { const softleaf_t entry = softleaf_from_pmd(pmd); - /* Only device private, migration entries valid for PMD. */ - return softleaf_is_device_private(entry) || - softleaf_is_migration(entry); + return softleaf_is_valid_pmd_entry(entry); +} + +/** + * pmd_to_softleaf_folio() - Convert the PMD entry to a folio. + * @pmd: PMD entry. + * + * The PMD entry is expected to be a valid PMD softleaf entry. + * + * Returns: the folio the softleaf entry references if this is a valid softleaf + * entry, otherwise NULL. + */ +static inline struct folio *pmd_to_softleaf_folio(pmd_t pmd) +{ + const softleaf_t entry = softleaf_from_pmd(pmd); + + if (!softleaf_is_valid_pmd_entry(entry)) { + VM_WARN_ON_ONCE(true); + return NULL; + } + return softleaf_to_folio(entry); } #endif /* CONFIG_MMU */ -- cgit v1.2.3 From 3031b76d65e14a946cfb5000d79b642f58ffac5c Mon Sep 17 00:00:00 2001 From: Alexander Usyskin Date: Sun, 5 Apr 2026 14:23:25 +0300 Subject: mei: bus: add mei_cldev_uuid Add mei_cldev_uuid API on mei bus to allow client to query what UUID it bound to. Reviewed-by: Andy Shevchenko Reviewed-by: Badal Nilawar Signed-off-by: Alexander Usyskin Link: https://patch.msgid.link/20260405112326.1535208-2-alexander.usyskin@intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/mei_cl_bus.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mei_cl_bus.h b/include/linux/mei_cl_bus.h index a82755e1fc40..5bdbd9e1d460 100644 --- a/include/linux/mei_cl_bus.h +++ b/include/linux/mei_cl_bus.h @@ -112,6 +112,7 @@ int mei_cldev_register_rx_cb(struct mei_cl_device *cldev, mei_cldev_cb_t rx_cb); int mei_cldev_register_notif_cb(struct mei_cl_device *cldev, mei_cldev_cb_t notif_cb); +const uuid_le *mei_cldev_uuid(const struct mei_cl_device *cldev); u8 mei_cldev_ver(const struct mei_cl_device *cldev); size_t mei_cldev_mtu(const struct mei_cl_device *cldev); -- cgit v1.2.3 From 4251dab9d176212afdf4ced263b59bc0d5292c7f Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 17 Mar 2026 13:36:50 +0100 Subject: remoteproc: mtk_scp_ipi: Constify buffer passed to scp_ipi_send() scp_ipi_send() should only send the passed buffer, without modifying its contents, so mark pointer 'buf' as pointer to const. Acked-by: Mathieu Poirier Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20260317-rpmsg-send-const-v3-1-4d7fd27f037f@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- include/linux/remoteproc/mtk_scp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/remoteproc/mtk_scp.h b/include/linux/remoteproc/mtk_scp.h index 344ff41c22c7..4070537d6542 100644 --- a/include/linux/remoteproc/mtk_scp.h +++ b/include/linux/remoteproc/mtk_scp.h @@ -58,7 +58,7 @@ int scp_ipi_register(struct mtk_scp *scp, u32 id, scp_ipi_handler_t handler, void *priv); void scp_ipi_unregister(struct mtk_scp *scp, u32 id); -int scp_ipi_send(struct mtk_scp *scp, u32 id, void *buf, unsigned int len, +int scp_ipi_send(struct mtk_scp *scp, u32 id, const void *buf, unsigned int len, unsigned int wait); unsigned int scp_get_vdec_hw_capa(struct mtk_scp *scp); -- cgit v1.2.3 From 90dacbf4bf13410c727ffaca8fe3ce3276ae58c2 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 17 Mar 2026 13:36:51 +0100 Subject: remoteproc: mtk_scp: Constify buffer passed to scp_send_ipi() scp_send_ipi() should only send the passed buffer, without modifying its contents, so mark pointer 'buf' as pointer to const. Acked-by: Mathieu Poirier Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20260317-rpmsg-send-const-v3-2-4d7fd27f037f@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- include/linux/rpmsg/mtk_rpmsg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rpmsg/mtk_rpmsg.h b/include/linux/rpmsg/mtk_rpmsg.h index 363b60178040..badcbc89917f 100644 --- a/include/linux/rpmsg/mtk_rpmsg.h +++ b/include/linux/rpmsg/mtk_rpmsg.h @@ -25,7 +25,7 @@ struct mtk_rpmsg_info { ipi_handler_t handler, void *priv); void (*unregister_ipi)(struct platform_device *pdev, u32 id); int (*send_ipi)(struct platform_device *pdev, u32 id, - void *buf, unsigned int len, unsigned int wait); + const void *buf, unsigned int len, unsigned int wait); int ns_ipi_id; }; -- cgit v1.2.3 From b8077b4da2e89917ec4c632b66e60d49089bbda3 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 17 Mar 2026 13:36:52 +0100 Subject: rpmsg: Constify buffer passed to send API The rpmsg_send(), rpmsg_sendto() and other variants of sending interfaces should only send the passed data, without modifying its contents, so mark pointer 'data' as pointer to const. All users of this interface already follow this approach, so only the function declarations have to be updated. Acked-by: Mathieu Poirier Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20260317-rpmsg-send-const-v3-3-4d7fd27f037f@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- include/linux/rpmsg.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rpmsg.h b/include/linux/rpmsg.h index fb7ab9165645..83266ce14642 100644 --- a/include/linux/rpmsg.h +++ b/include/linux/rpmsg.h @@ -182,11 +182,11 @@ struct rpmsg_endpoint *rpmsg_create_ept(struct rpmsg_device *, rpmsg_rx_cb_t cb, void *priv, struct rpmsg_channel_info chinfo); -int rpmsg_send(struct rpmsg_endpoint *ept, void *data, int len); -int rpmsg_sendto(struct rpmsg_endpoint *ept, void *data, int len, u32 dst); +int rpmsg_send(struct rpmsg_endpoint *ept, const void *data, int len); +int rpmsg_sendto(struct rpmsg_endpoint *ept, const void *data, int len, u32 dst); -int rpmsg_trysend(struct rpmsg_endpoint *ept, void *data, int len); -int rpmsg_trysendto(struct rpmsg_endpoint *ept, void *data, int len, u32 dst); +int rpmsg_trysend(struct rpmsg_endpoint *ept, const void *data, int len); +int rpmsg_trysendto(struct rpmsg_endpoint *ept, const void *data, int len, u32 dst); __poll_t rpmsg_poll(struct rpmsg_endpoint *ept, struct file *filp, poll_table *wait); @@ -249,7 +249,7 @@ static inline struct rpmsg_endpoint *rpmsg_create_ept(struct rpmsg_device *rpdev return NULL; } -static inline int rpmsg_send(struct rpmsg_endpoint *ept, void *data, int len) +static inline int rpmsg_send(struct rpmsg_endpoint *ept, const void *data, int len) { /* This shouldn't be possible */ WARN_ON(1); @@ -257,7 +257,7 @@ static inline int rpmsg_send(struct rpmsg_endpoint *ept, void *data, int len) return -ENXIO; } -static inline int rpmsg_sendto(struct rpmsg_endpoint *ept, void *data, int len, +static inline int rpmsg_sendto(struct rpmsg_endpoint *ept, const void *data, int len, u32 dst) { /* This shouldn't be possible */ @@ -267,7 +267,8 @@ static inline int rpmsg_sendto(struct rpmsg_endpoint *ept, void *data, int len, } -static inline int rpmsg_trysend(struct rpmsg_endpoint *ept, void *data, int len) +static inline int rpmsg_trysend(struct rpmsg_endpoint *ept, const void *data, + int len) { /* This shouldn't be possible */ WARN_ON(1); @@ -275,7 +276,7 @@ static inline int rpmsg_trysend(struct rpmsg_endpoint *ept, void *data, int len) return -ENXIO; } -static inline int rpmsg_trysendto(struct rpmsg_endpoint *ept, void *data, +static inline int rpmsg_trysendto(struct rpmsg_endpoint *ept, const void *data, int len, u32 dst) { /* This shouldn't be possible */ -- cgit v1.2.3 From 66ec83627902d2585e14911692b317496731767a Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 17 Mar 2026 13:36:53 +0100 Subject: ASoC: qcom: Constify GPR packet being send over GPR interface gpr_send_pkt() and pkt_router_send_svc_pkt() only send the GPR packet they receive, without any need to actually modify it, so mark the pointer to GPR packet as pointer to const for code safety and code self-documentation. Several users of this interface can follow up and also operate on pointer to const. Acked-by: Mathieu Poirier Acked-by: Mark Brown Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20260317-rpmsg-send-const-v3-4-4d7fd27f037f@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- include/linux/soc/qcom/apr.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/apr.h b/include/linux/soc/qcom/apr.h index 6e1b1202e818..58fa1df96347 100644 --- a/include/linux/soc/qcom/apr.h +++ b/include/linux/soc/qcom/apr.h @@ -191,7 +191,7 @@ int apr_send_pkt(struct apr_device *adev, struct apr_pkt *pkt); gpr_port_t *gpr_alloc_port(gpr_device_t *gdev, struct device *dev, gpr_port_cb cb, void *priv); void gpr_free_port(gpr_port_t *port); -int gpr_send_port_pkt(gpr_port_t *port, struct gpr_pkt *pkt); -int gpr_send_pkt(gpr_device_t *gdev, struct gpr_pkt *pkt); +int gpr_send_port_pkt(gpr_port_t *port, const struct gpr_pkt *pkt); +int gpr_send_pkt(gpr_device_t *gdev, const struct gpr_pkt *pkt); #endif /* __QCOM_APR_H_ */ -- cgit v1.2.3 From ad5fd5aeb65a4426635cf55ef06c96e60a66e648 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 1 Apr 2026 09:11:40 +0200 Subject: hwspinlock: remove now unused pdata from header file The last user turned out to be obsolete and was removed. Remove the unused struct now, too. Signed-off-by: Wolfram Sang Reviewed-by: Linus Walleij Acked-by: Andy Shevchenko Link: https://lore.kernel.org/r/20260401071141.4718-3-wsa+renesas@sang-engineering.com Signed-off-by: Bjorn Andersson --- include/linux/hwspinlock.h | 28 ---------------------------- 1 file changed, 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hwspinlock.h b/include/linux/hwspinlock.h index f35b42e8c5de..74b91244fe0e 100644 --- a/include/linux/hwspinlock.h +++ b/include/linux/hwspinlock.h @@ -25,34 +25,6 @@ struct hwspinlock; struct hwspinlock_device; struct hwspinlock_ops; -/** - * struct hwspinlock_pdata - platform data for hwspinlock drivers - * @base_id: base id for this hwspinlock device - * - * hwspinlock devices provide system-wide hardware locks that are used - * by remote processors that have no other way to achieve synchronization. - * - * To achieve that, each physical lock must have a system-wide id number - * that is agreed upon, otherwise remote processors can't possibly assume - * they're using the same hardware lock. - * - * Usually boards have a single hwspinlock device, which provides several - * hwspinlocks, and in this case, they can be trivially numbered 0 to - * (num-of-locks - 1). - * - * In case boards have several hwspinlocks devices, a different base id - * should be used for each hwspinlock device (they can't all use 0 as - * a starting id!). - * - * This platform data structure should be used to provide the base id - * for each device (which is trivially 0 when only a single hwspinlock - * device exists). It can be shared between different platforms, hence - * its location. - */ -struct hwspinlock_pdata { - int base_id; -}; - #ifdef CONFIG_HWSPINLOCK int hwspin_lock_register(struct hwspinlock_device *bank, struct device *dev, -- cgit v1.2.3 From f652d0a4e13c5f5416da15ba791b99c5d1ac9b18 Mon Sep 17 00:00:00 2001 From: Chengwen Feng Date: Wed, 1 Apr 2026 16:16:37 +0800 Subject: ACPI: Centralize acpi_get_cpu_uid() declaration in include/linux/acpi.h Centralize acpi_get_cpu_uid() in include/linux/acpi.h (global scope) and remove arch-specific declarations from arm64/loongarch/riscv/x86 asm/acpi.h. This unifies the interface across architectures and simplifies maintenance by eliminating duplicate prototypes. Signed-off-by: Chengwen Feng Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260401081640.26875-6-fengchengwen@huawei.com Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 4d2f0bed7a06..74a73f0e5944 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -324,6 +324,17 @@ int acpi_unmap_cpu(int cpu); acpi_handle acpi_get_processor_handle(int cpu); +/** + * acpi_get_cpu_uid() - Get ACPI Processor UID of from MADT table + * @cpu: Logical CPU number (0-based) + * @uid: Pointer to store ACPI Processor UID + * + * Return: 0 on success (ACPI Processor ID stored in *uid); + * -EINVAL if CPU number is invalid or out of range; + * -ENODEV if ACPI Processor UID for the CPU is not found. + */ +int acpi_get_cpu_uid(unsigned int cpu, u32 *uid); + #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC int acpi_get_ioapic_id(acpi_handle handle, u32 gsi_base, u64 *phys_addr); #endif -- cgit v1.2.3 From abdd2a86535b59c76d14da2547160bc83e059c03 Mon Sep 17 00:00:00 2001 From: Chengwen Feng Date: Wed, 1 Apr 2026 16:16:40 +0800 Subject: PCI/TPH: Pass ACPI Processor UID to Cache Locality _DSM pcie_tph_get_cpu_st() uses the Query Cache Locality Features _DSM [1] to retrieve the TPH Steering Tag for memory associated with the CPU identified by its "cpu_uid" parameter, a Linux logical CPU ID. The _DSM requires an ACPI Processor UID, which pcie_tph_get_cpu_st() previously assumed was the same as the Linux logical CPU ID. This is true on x86 but not on arm64, so pcie_tph_get_cpu_st() returned the wrong Steering Tag, resulting in incorrect TPH functionality on arm64. Convert the Linux logical CPU ID to the ACPI Processor UID with acpi_get_cpu_uid() before passing it to the _DSM. Additionally, rename the pcie_tph_get_cpu_st() parameter from "cpu_uid" to "cpu" to reflect that it represents a logical CPU ID (not an ACPI Processor UID). [1] According to ECN_TPH-ST_Revision_20200924 (https://members.pcisig.com/wg/PCI-SIG/document/15470), the input is defined as: "If the target is a processor, then this field represents the ACPI Processor UID of the processor as specified in the MADT. If the target is a processor container, then this field represents the ACPI Processor UID of the processor container as specified in the PPTT." Fixes: d2e8a34876ce ("PCI/TPH: Add Steering Tag support") Signed-off-by: Chengwen Feng Reviewed-by: Jonathan Cameron Reviewed-by: Bjorn Helgaas Link: https://patch.msgid.link/20260401081640.26875-9-fengchengwen@huawei.com Signed-off-by: Rafael J. Wysocki --- include/linux/pci-tph.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci-tph.h b/include/linux/pci-tph.h index ba28140ce670..be68cd17f2f8 100644 --- a/include/linux/pci-tph.h +++ b/include/linux/pci-tph.h @@ -25,7 +25,7 @@ int pcie_tph_set_st_entry(struct pci_dev *pdev, unsigned int index, u16 tag); int pcie_tph_get_cpu_st(struct pci_dev *dev, enum tph_mem_type mem_type, - unsigned int cpu_uid, u16 *tag); + unsigned int cpu, u16 *tag); void pcie_disable_tph(struct pci_dev *pdev); int pcie_enable_tph(struct pci_dev *pdev, int mode); u16 pcie_tph_get_st_table_size(struct pci_dev *pdev); @@ -36,7 +36,7 @@ static inline int pcie_tph_set_st_entry(struct pci_dev *pdev, { return -EINVAL; } static inline int pcie_tph_get_cpu_st(struct pci_dev *dev, enum tph_mem_type mem_type, - unsigned int cpu_uid, u16 *tag) + unsigned int cpu, u16 *tag) { return -EINVAL; } static inline void pcie_disable_tph(struct pci_dev *pdev) { } static inline int pcie_enable_tph(struct pci_dev *pdev, int mode) -- cgit v1.2.3 From 18474aed5d0d382f8057ceed7811a735134d28b9 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 30 Mar 2026 16:38:18 -0600 Subject: bpf: Avoid -Wflex-array-members-not-at-end warnings Apparently, struct bpf_empty_prog_array exists entirely to populate a single element of "items" in a global variable. "null_prog" is only used during the initializer. None of this is needed; globals will be correctly sized with an array initializer of a flexible-array member. So, remove struct bpf_empty_prog_array and adjust the rest of the code, accordingly. With these changes, fix the following warnings: ./include/linux/bpf.h:2369:31: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] Signed-off-by: Gustavo A. R. Silva Acked-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/acr7Whmn0br3xeBP@kspp Signed-off-by: Alexei Starovoitov --- include/linux/bpf-cgroup.h | 2 +- include/linux/bpf.h | 7 +------ 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 2f535331f926..b2e79c2b41d5 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -184,7 +184,7 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, struct bpf_prog_array *array; array = rcu_access_pointer(cgrp->bpf.effective[type]); - return array != &bpf_empty_prog_array.hdr; + return array != &bpf_empty_prog_array; } /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 35b1e25bd104..30d35d5fe40b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2369,18 +2369,13 @@ struct bpf_prog_array { struct bpf_prog_array_item items[]; }; -struct bpf_empty_prog_array { - struct bpf_prog_array hdr; - struct bpf_prog *null_prog; -}; - /* to avoid allocating empty bpf_prog_array for cgroups that * don't have bpf program attached use one global 'bpf_empty_prog_array' * It will not be modified the caller of bpf_prog_array_alloc() * (since caller requested prog_cnt == 0) * that pointer should be 'freed' by bpf_prog_array_free() */ -extern struct bpf_empty_prog_array bpf_empty_prog_array; +extern struct bpf_prog_array bpf_empty_prog_array; struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); void bpf_prog_array_free(struct bpf_prog_array *progs); -- cgit v1.2.3 From a9c4b1d37622ed01b75f94a4f68cf55f33153a31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=B6hmwalder?= Date: Fri, 3 Apr 2026 15:29:53 +0200 Subject: drbd: remove DRBD_GENLA_F_MANDATORY flag handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DRBD used a custom mechanism to mark netlink attributes as "mandatory": bit 14 of nla_type was repurposed as DRBD_GENLA_F_MANDATORY. Attributes sent from userspace that had this bit present and that were unknown to the kernel would lead to an error. Since commit ef6243acb478 ("genetlink: optionally validate strictly/dumps"), the generic netlink layer rejects unknown top-level attributes when strict validation is enabled. DRBD never opted out of strict validation, so unknown top-level attributes are already rejected by the netlink core. The mandatory flag mechanism was required for nested attributes, because these are parsed liberally, silently dropping attributes unknown to the kernel. This prepares for the move to a new YNL-based family, which will use the now-default strict parsing. The current family is not expected to gain any new attributes, which makes this change safe. Old userspace that still sets bit 14 is unaffected: nla_type() strips it before __nla_validate_parse() performs attribute validation, so the bit never reaches DRBD. Remove all references to the mandatory flag in DRBD. Cc: Johannes Berg Cc: Jakub Kicinski Signed-off-by: Christoph Böhmwalder Link: https://patch.msgid.link/20260403132953.2248751-1-christoph.boehmwalder@linbit.com Signed-off-by: Jens Axboe --- include/linux/drbd_genl.h | 208 +++++++++++++++++++------------------- include/linux/genl_magic_func.h | 3 +- include/linux/genl_magic_struct.h | 15 +-- 3 files changed, 108 insertions(+), 118 deletions(-) (limited to 'include/linux') diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h index 53f44b8cd75f..f53c534aba0c 100644 --- a/include/linux/drbd_genl.h +++ b/include/linux/drbd_genl.h @@ -87,7 +87,7 @@ */ GENL_struct(DRBD_NLA_CFG_REPLY, 1, drbd_cfg_reply, /* "arbitrary" size strings, nla_policy.len = 0 */ - __str_field(1, DRBD_GENLA_F_MANDATORY, info_text, 0) + __str_field(1, 0, info_text, 0) ) /* Configuration requests typically need a context to operate on. @@ -96,10 +96,10 @@ GENL_struct(DRBD_NLA_CFG_REPLY, 1, drbd_cfg_reply, * and/or the replication group (aka resource) name, * and the volume id within the resource. */ GENL_struct(DRBD_NLA_CFG_CONTEXT, 2, drbd_cfg_context, - __u32_field(1, DRBD_GENLA_F_MANDATORY, ctx_volume) - __str_field(2, DRBD_GENLA_F_MANDATORY, ctx_resource_name, 128) - __bin_field(3, DRBD_GENLA_F_MANDATORY, ctx_my_addr, 128) - __bin_field(4, DRBD_GENLA_F_MANDATORY, ctx_peer_addr, 128) + __u32_field(1, 0, ctx_volume) + __str_field(2, 0, ctx_resource_name, 128) + __bin_field(3, 0, ctx_my_addr, 128) + __bin_field(4, 0, ctx_peer_addr, 128) ) GENL_struct(DRBD_NLA_DISK_CONF, 3, disk_conf, @@ -108,86 +108,86 @@ GENL_struct(DRBD_NLA_DISK_CONF, 3, disk_conf, __s32_field(3, DRBD_F_REQUIRED | DRBD_F_INVARIANT, meta_dev_idx) /* use the resize command to try and change the disk_size */ - __u64_field(4, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, disk_size) + __u64_field(4, DRBD_F_INVARIANT, disk_size) /* we could change the max_bio_bvecs, * but it won't propagate through the stack */ - __u32_field(5, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, max_bio_bvecs) - - __u32_field_def(6, DRBD_GENLA_F_MANDATORY, on_io_error, DRBD_ON_IO_ERROR_DEF) - __u32_field_def(7, DRBD_GENLA_F_MANDATORY, fencing, DRBD_FENCING_DEF) - - __u32_field_def(8, DRBD_GENLA_F_MANDATORY, resync_rate, DRBD_RESYNC_RATE_DEF) - __s32_field_def(9, DRBD_GENLA_F_MANDATORY, resync_after, DRBD_MINOR_NUMBER_DEF) - __u32_field_def(10, DRBD_GENLA_F_MANDATORY, al_extents, DRBD_AL_EXTENTS_DEF) - __u32_field_def(11, DRBD_GENLA_F_MANDATORY, c_plan_ahead, DRBD_C_PLAN_AHEAD_DEF) - __u32_field_def(12, DRBD_GENLA_F_MANDATORY, c_delay_target, DRBD_C_DELAY_TARGET_DEF) - __u32_field_def(13, DRBD_GENLA_F_MANDATORY, c_fill_target, DRBD_C_FILL_TARGET_DEF) - __u32_field_def(14, DRBD_GENLA_F_MANDATORY, c_max_rate, DRBD_C_MAX_RATE_DEF) - __u32_field_def(15, DRBD_GENLA_F_MANDATORY, c_min_rate, DRBD_C_MIN_RATE_DEF) - __u32_field_def(20, DRBD_GENLA_F_MANDATORY, disk_timeout, DRBD_DISK_TIMEOUT_DEF) + __u32_field(5, DRBD_F_INVARIANT, max_bio_bvecs) + + __u32_field_def(6, 0, on_io_error, DRBD_ON_IO_ERROR_DEF) + __u32_field_def(7, 0, fencing, DRBD_FENCING_DEF) + + __u32_field_def(8, 0, resync_rate, DRBD_RESYNC_RATE_DEF) + __s32_field_def(9, 0, resync_after, DRBD_MINOR_NUMBER_DEF) + __u32_field_def(10, 0, al_extents, DRBD_AL_EXTENTS_DEF) + __u32_field_def(11, 0, c_plan_ahead, DRBD_C_PLAN_AHEAD_DEF) + __u32_field_def(12, 0, c_delay_target, DRBD_C_DELAY_TARGET_DEF) + __u32_field_def(13, 0, c_fill_target, DRBD_C_FILL_TARGET_DEF) + __u32_field_def(14, 0, c_max_rate, DRBD_C_MAX_RATE_DEF) + __u32_field_def(15, 0, c_min_rate, DRBD_C_MIN_RATE_DEF) + __u32_field_def(20, 0, disk_timeout, DRBD_DISK_TIMEOUT_DEF) __u32_field_def(21, 0 /* OPTIONAL */, read_balancing, DRBD_READ_BALANCING_DEF) __u32_field_def(25, 0 /* OPTIONAL */, rs_discard_granularity, DRBD_RS_DISCARD_GRANULARITY_DEF) - __flg_field_def(16, DRBD_GENLA_F_MANDATORY, disk_barrier, DRBD_DISK_BARRIER_DEF) - __flg_field_def(17, DRBD_GENLA_F_MANDATORY, disk_flushes, DRBD_DISK_FLUSHES_DEF) - __flg_field_def(18, DRBD_GENLA_F_MANDATORY, disk_drain, DRBD_DISK_DRAIN_DEF) - __flg_field_def(19, DRBD_GENLA_F_MANDATORY, md_flushes, DRBD_MD_FLUSHES_DEF) + __flg_field_def(16, 0, disk_barrier, DRBD_DISK_BARRIER_DEF) + __flg_field_def(17, 0, disk_flushes, DRBD_DISK_FLUSHES_DEF) + __flg_field_def(18, 0, disk_drain, DRBD_DISK_DRAIN_DEF) + __flg_field_def(19, 0, md_flushes, DRBD_MD_FLUSHES_DEF) __flg_field_def(23, 0 /* OPTIONAL */, al_updates, DRBD_AL_UPDATES_DEF) __flg_field_def(24, 0 /* OPTIONAL */, discard_zeroes_if_aligned, DRBD_DISCARD_ZEROES_IF_ALIGNED_DEF) __flg_field_def(26, 0 /* OPTIONAL */, disable_write_same, DRBD_DISABLE_WRITE_SAME_DEF) ) GENL_struct(DRBD_NLA_RESOURCE_OPTS, 4, res_opts, - __str_field_def(1, DRBD_GENLA_F_MANDATORY, cpu_mask, DRBD_CPU_MASK_SIZE) - __u32_field_def(2, DRBD_GENLA_F_MANDATORY, on_no_data, DRBD_ON_NO_DATA_DEF) + __str_field_def(1, 0, cpu_mask, DRBD_CPU_MASK_SIZE) + __u32_field_def(2, 0, on_no_data, DRBD_ON_NO_DATA_DEF) ) GENL_struct(DRBD_NLA_NET_CONF, 5, net_conf, - __str_field_def(1, DRBD_GENLA_F_MANDATORY | DRBD_F_SENSITIVE, + __str_field_def(1, DRBD_F_SENSITIVE, shared_secret, SHARED_SECRET_MAX) - __str_field_def(2, DRBD_GENLA_F_MANDATORY, cram_hmac_alg, SHARED_SECRET_MAX) - __str_field_def(3, DRBD_GENLA_F_MANDATORY, integrity_alg, SHARED_SECRET_MAX) - __str_field_def(4, DRBD_GENLA_F_MANDATORY, verify_alg, SHARED_SECRET_MAX) - __str_field_def(5, DRBD_GENLA_F_MANDATORY, csums_alg, SHARED_SECRET_MAX) - __u32_field_def(6, DRBD_GENLA_F_MANDATORY, wire_protocol, DRBD_PROTOCOL_DEF) - __u32_field_def(7, DRBD_GENLA_F_MANDATORY, connect_int, DRBD_CONNECT_INT_DEF) - __u32_field_def(8, DRBD_GENLA_F_MANDATORY, timeout, DRBD_TIMEOUT_DEF) - __u32_field_def(9, DRBD_GENLA_F_MANDATORY, ping_int, DRBD_PING_INT_DEF) - __u32_field_def(10, DRBD_GENLA_F_MANDATORY, ping_timeo, DRBD_PING_TIMEO_DEF) - __u32_field_def(11, DRBD_GENLA_F_MANDATORY, sndbuf_size, DRBD_SNDBUF_SIZE_DEF) - __u32_field_def(12, DRBD_GENLA_F_MANDATORY, rcvbuf_size, DRBD_RCVBUF_SIZE_DEF) - __u32_field_def(13, DRBD_GENLA_F_MANDATORY, ko_count, DRBD_KO_COUNT_DEF) - __u32_field_def(14, DRBD_GENLA_F_MANDATORY, max_buffers, DRBD_MAX_BUFFERS_DEF) - __u32_field_def(15, DRBD_GENLA_F_MANDATORY, max_epoch_size, DRBD_MAX_EPOCH_SIZE_DEF) - __u32_field_def(16, DRBD_GENLA_F_MANDATORY, unplug_watermark, DRBD_UNPLUG_WATERMARK_DEF) - __u32_field_def(17, DRBD_GENLA_F_MANDATORY, after_sb_0p, DRBD_AFTER_SB_0P_DEF) - __u32_field_def(18, DRBD_GENLA_F_MANDATORY, after_sb_1p, DRBD_AFTER_SB_1P_DEF) - __u32_field_def(19, DRBD_GENLA_F_MANDATORY, after_sb_2p, DRBD_AFTER_SB_2P_DEF) - __u32_field_def(20, DRBD_GENLA_F_MANDATORY, rr_conflict, DRBD_RR_CONFLICT_DEF) - __u32_field_def(21, DRBD_GENLA_F_MANDATORY, on_congestion, DRBD_ON_CONGESTION_DEF) - __u32_field_def(22, DRBD_GENLA_F_MANDATORY, cong_fill, DRBD_CONG_FILL_DEF) - __u32_field_def(23, DRBD_GENLA_F_MANDATORY, cong_extents, DRBD_CONG_EXTENTS_DEF) - __flg_field_def(24, DRBD_GENLA_F_MANDATORY, two_primaries, DRBD_ALLOW_TWO_PRIMARIES_DEF) - __flg_field(25, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, discard_my_data) - __flg_field_def(26, DRBD_GENLA_F_MANDATORY, tcp_cork, DRBD_TCP_CORK_DEF) - __flg_field_def(27, DRBD_GENLA_F_MANDATORY, always_asbp, DRBD_ALWAYS_ASBP_DEF) - __flg_field(28, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, tentative) - __flg_field_def(29, DRBD_GENLA_F_MANDATORY, use_rle, DRBD_USE_RLE_DEF) - /* 9: __u32_field_def(30, DRBD_GENLA_F_MANDATORY, fencing_policy, DRBD_FENCING_DEF) */ - /* 9: __str_field_def(31, DRBD_GENLA_F_MANDATORY, name, SHARED_SECRET_MAX) */ + __str_field_def(2, 0, cram_hmac_alg, SHARED_SECRET_MAX) + __str_field_def(3, 0, integrity_alg, SHARED_SECRET_MAX) + __str_field_def(4, 0, verify_alg, SHARED_SECRET_MAX) + __str_field_def(5, 0, csums_alg, SHARED_SECRET_MAX) + __u32_field_def(6, 0, wire_protocol, DRBD_PROTOCOL_DEF) + __u32_field_def(7, 0, connect_int, DRBD_CONNECT_INT_DEF) + __u32_field_def(8, 0, timeout, DRBD_TIMEOUT_DEF) + __u32_field_def(9, 0, ping_int, DRBD_PING_INT_DEF) + __u32_field_def(10, 0, ping_timeo, DRBD_PING_TIMEO_DEF) + __u32_field_def(11, 0, sndbuf_size, DRBD_SNDBUF_SIZE_DEF) + __u32_field_def(12, 0, rcvbuf_size, DRBD_RCVBUF_SIZE_DEF) + __u32_field_def(13, 0, ko_count, DRBD_KO_COUNT_DEF) + __u32_field_def(14, 0, max_buffers, DRBD_MAX_BUFFERS_DEF) + __u32_field_def(15, 0, max_epoch_size, DRBD_MAX_EPOCH_SIZE_DEF) + __u32_field_def(16, 0, unplug_watermark, DRBD_UNPLUG_WATERMARK_DEF) + __u32_field_def(17, 0, after_sb_0p, DRBD_AFTER_SB_0P_DEF) + __u32_field_def(18, 0, after_sb_1p, DRBD_AFTER_SB_1P_DEF) + __u32_field_def(19, 0, after_sb_2p, DRBD_AFTER_SB_2P_DEF) + __u32_field_def(20, 0, rr_conflict, DRBD_RR_CONFLICT_DEF) + __u32_field_def(21, 0, on_congestion, DRBD_ON_CONGESTION_DEF) + __u32_field_def(22, 0, cong_fill, DRBD_CONG_FILL_DEF) + __u32_field_def(23, 0, cong_extents, DRBD_CONG_EXTENTS_DEF) + __flg_field_def(24, 0, two_primaries, DRBD_ALLOW_TWO_PRIMARIES_DEF) + __flg_field(25, DRBD_F_INVARIANT, discard_my_data) + __flg_field_def(26, 0, tcp_cork, DRBD_TCP_CORK_DEF) + __flg_field_def(27, 0, always_asbp, DRBD_ALWAYS_ASBP_DEF) + __flg_field(28, DRBD_F_INVARIANT, tentative) + __flg_field_def(29, 0, use_rle, DRBD_USE_RLE_DEF) + /* 9: __u32_field_def(30, 0, fencing_policy, DRBD_FENCING_DEF) */ + /* 9: __str_field_def(31, 0, name, SHARED_SECRET_MAX) */ /* 9: __u32_field(32, DRBD_F_REQUIRED | DRBD_F_INVARIANT, peer_node_id) */ __flg_field_def(33, 0 /* OPTIONAL */, csums_after_crash_only, DRBD_CSUMS_AFTER_CRASH_ONLY_DEF) __u32_field_def(34, 0 /* OPTIONAL */, sock_check_timeo, DRBD_SOCKET_CHECK_TIMEO_DEF) ) GENL_struct(DRBD_NLA_SET_ROLE_PARMS, 6, set_role_parms, - __flg_field(1, DRBD_GENLA_F_MANDATORY, assume_uptodate) + __flg_field(1, 0, assume_uptodate) ) GENL_struct(DRBD_NLA_RESIZE_PARMS, 7, resize_parms, - __u64_field(1, DRBD_GENLA_F_MANDATORY, resize_size) - __flg_field(2, DRBD_GENLA_F_MANDATORY, resize_force) - __flg_field(3, DRBD_GENLA_F_MANDATORY, no_resync) + __u64_field(1, 0, resize_size) + __flg_field(2, 0, resize_force) + __flg_field(3, 0, no_resync) __u32_field_def(4, 0 /* OPTIONAL */, al_stripes, DRBD_AL_STRIPES_DEF) __u32_field_def(5, 0 /* OPTIONAL */, al_stripe_size, DRBD_AL_STRIPE_SIZE_DEF) ) @@ -195,31 +195,31 @@ GENL_struct(DRBD_NLA_RESIZE_PARMS, 7, resize_parms, GENL_struct(DRBD_NLA_STATE_INFO, 8, state_info, /* the reason of the broadcast, * if this is an event triggered broadcast. */ - __u32_field(1, DRBD_GENLA_F_MANDATORY, sib_reason) + __u32_field(1, 0, sib_reason) __u32_field(2, DRBD_F_REQUIRED, current_state) - __u64_field(3, DRBD_GENLA_F_MANDATORY, capacity) - __u64_field(4, DRBD_GENLA_F_MANDATORY, ed_uuid) + __u64_field(3, 0, capacity) + __u64_field(4, 0, ed_uuid) /* These are for broadcast from after state change work. * prev_state and new_state are from the moment the state change took * place, new_state is not neccessarily the same as current_state, * there may have been more state changes since. Which will be * broadcasted soon, in their respective after state change work. */ - __u32_field(5, DRBD_GENLA_F_MANDATORY, prev_state) - __u32_field(6, DRBD_GENLA_F_MANDATORY, new_state) + __u32_field(5, 0, prev_state) + __u32_field(6, 0, new_state) /* if we have a local disk: */ - __bin_field(7, DRBD_GENLA_F_MANDATORY, uuids, (UI_SIZE*sizeof(__u64))) - __u32_field(8, DRBD_GENLA_F_MANDATORY, disk_flags) - __u64_field(9, DRBD_GENLA_F_MANDATORY, bits_total) - __u64_field(10, DRBD_GENLA_F_MANDATORY, bits_oos) + __bin_field(7, 0, uuids, (UI_SIZE*sizeof(__u64))) + __u32_field(8, 0, disk_flags) + __u64_field(9, 0, bits_total) + __u64_field(10, 0, bits_oos) /* and in case resync or online verify is active */ - __u64_field(11, DRBD_GENLA_F_MANDATORY, bits_rs_total) - __u64_field(12, DRBD_GENLA_F_MANDATORY, bits_rs_failed) + __u64_field(11, 0, bits_rs_total) + __u64_field(12, 0, bits_rs_failed) /* for pre and post notifications of helper execution */ - __str_field(13, DRBD_GENLA_F_MANDATORY, helper, 32) - __u32_field(14, DRBD_GENLA_F_MANDATORY, helper_exit_code) + __str_field(13, 0, helper, 32) + __u32_field(14, 0, helper_exit_code) __u64_field(15, 0, send_cnt) __u64_field(16, 0, recv_cnt) @@ -233,12 +233,12 @@ GENL_struct(DRBD_NLA_STATE_INFO, 8, state_info, ) GENL_struct(DRBD_NLA_START_OV_PARMS, 9, start_ov_parms, - __u64_field(1, DRBD_GENLA_F_MANDATORY, ov_start_sector) - __u64_field(2, DRBD_GENLA_F_MANDATORY, ov_stop_sector) + __u64_field(1, 0, ov_start_sector) + __u64_field(2, 0, ov_stop_sector) ) GENL_struct(DRBD_NLA_NEW_C_UUID_PARMS, 10, new_c_uuid_parms, - __flg_field(1, DRBD_GENLA_F_MANDATORY, clear_bm) + __flg_field(1, 0, clear_bm) ) GENL_struct(DRBD_NLA_TIMEOUT_PARMS, 11, timeout_parms, @@ -246,11 +246,11 @@ GENL_struct(DRBD_NLA_TIMEOUT_PARMS, 11, timeout_parms, ) GENL_struct(DRBD_NLA_DISCONNECT_PARMS, 12, disconnect_parms, - __flg_field(1, DRBD_GENLA_F_MANDATORY, force_disconnect) + __flg_field(1, 0, force_disconnect) ) GENL_struct(DRBD_NLA_DETACH_PARMS, 13, detach_parms, - __flg_field(1, DRBD_GENLA_F_MANDATORY, force_detach) + __flg_field(1, 0, force_detach) ) GENL_struct(DRBD_NLA_RESOURCE_INFO, 15, resource_info, @@ -315,12 +315,12 @@ GENL_struct(DRBD_NLA_PEER_DEVICE_STATISTICS, 22, peer_device_statistics, ) GENL_struct(DRBD_NLA_NOTIFICATION_HEADER, 23, drbd_notification_header, - __u32_field(1, DRBD_GENLA_F_MANDATORY, nh_type) + __u32_field(1, 0, nh_type) ) GENL_struct(DRBD_NLA_HELPER, 24, drbd_helper_info, - __str_field(1, DRBD_GENLA_F_MANDATORY, helper_name, 32) - __u32_field(2, DRBD_GENLA_F_MANDATORY, helper_status) + __str_field(1, 0, helper_name, 32) + __u32_field(2, 0, helper_status) ) /* @@ -333,9 +333,9 @@ GENL_notification( DRBD_EVENT, 1, events, GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) GENL_tla_expected(DRBD_NLA_STATE_INFO, DRBD_F_REQUIRED) - GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_DISK_CONF, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_SYNCER_CONF, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_NET_CONF, 0) + GENL_tla_expected(DRBD_NLA_DISK_CONF, 0) + GENL_tla_expected(DRBD_NLA_SYNCER_CONF, 0) ) /* query kernel for specific or all info */ @@ -349,7 +349,7 @@ GENL_op( ), /* To select the object .doit. * Or a subset of objects in .dumpit. */ - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, 0) ) /* add DRBD minor devices as volumes to resources */ @@ -367,7 +367,7 @@ GENL_op(DRBD_ADM_DEL_RESOURCE, 8, GENL_doit(drbd_adm_del_resource), GENL_op(DRBD_ADM_RESOURCE_OPTS, 9, GENL_doit(drbd_adm_resource_opts), GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) - GENL_tla_expected(DRBD_NLA_RESOURCE_OPTS, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_RESOURCE_OPTS, 0) ) GENL_op( @@ -403,7 +403,7 @@ GENL_op( DRBD_ADM_RESIZE, 13, GENL_doit(drbd_adm_resize), GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) - GENL_tla_expected(DRBD_NLA_RESIZE_PARMS, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_RESIZE_PARMS, 0) ) GENL_op( @@ -424,18 +424,18 @@ GENL_op( DRBD_ADM_NEW_C_UUID, 16, GENL_doit(drbd_adm_new_c_uuid), GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) - GENL_tla_expected(DRBD_NLA_NEW_C_UUID_PARMS, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_NEW_C_UUID_PARMS, 0) ) GENL_op( DRBD_ADM_START_OV, 17, GENL_doit(drbd_adm_start_ov), - GENL_tla_expected(DRBD_NLA_START_OV_PARMS, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_START_OV_PARMS, 0) ) GENL_op(DRBD_ADM_DETACH, 18, GENL_doit(drbd_adm_detach), GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) - GENL_tla_expected(DRBD_NLA_DETACH_PARMS, DRBD_GENLA_F_MANDATORY)) + GENL_tla_expected(DRBD_NLA_DETACH_PARMS, 0)) GENL_op(DRBD_ADM_INVALIDATE, 19, GENL_doit(drbd_adm_invalidate), GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) @@ -460,36 +460,36 @@ GENL_op(DRBD_ADM_GET_RESOURCES, 30, GENL_op_init( .dumpit = drbd_adm_dump_resources, ), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_RESOURCE_INFO, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_RESOURCE_STATISTICS, DRBD_GENLA_F_MANDATORY)) + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, 0) + GENL_tla_expected(DRBD_NLA_RESOURCE_INFO, 0) + GENL_tla_expected(DRBD_NLA_RESOURCE_STATISTICS, 0)) GENL_op(DRBD_ADM_GET_DEVICES, 31, GENL_op_init( .dumpit = drbd_adm_dump_devices, .done = drbd_adm_dump_devices_done, ), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_DEVICE_INFO, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_DEVICE_STATISTICS, DRBD_GENLA_F_MANDATORY)) + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, 0) + GENL_tla_expected(DRBD_NLA_DEVICE_INFO, 0) + GENL_tla_expected(DRBD_NLA_DEVICE_STATISTICS, 0)) GENL_op(DRBD_ADM_GET_CONNECTIONS, 32, GENL_op_init( .dumpit = drbd_adm_dump_connections, .done = drbd_adm_dump_connections_done, ), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_CONNECTION_INFO, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_CONNECTION_STATISTICS, DRBD_GENLA_F_MANDATORY)) + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, 0) + GENL_tla_expected(DRBD_NLA_CONNECTION_INFO, 0) + GENL_tla_expected(DRBD_NLA_CONNECTION_STATISTICS, 0)) GENL_op(DRBD_ADM_GET_PEER_DEVICES, 33, GENL_op_init( .dumpit = drbd_adm_dump_peer_devices, .done = drbd_adm_dump_peer_devices_done, ), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_PEER_DEVICE_INFO, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_PEER_DEVICE_STATISTICS, DRBD_GENLA_F_MANDATORY)) + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, 0) + GENL_tla_expected(DRBD_NLA_PEER_DEVICE_INFO, 0) + GENL_tla_expected(DRBD_NLA_PEER_DEVICE_STATISTICS, 0)) GENL_notification( DRBD_RESOURCE_STATE, 34, events, @@ -524,7 +524,7 @@ GENL_op( GENL_op_init( .dumpit = drbd_adm_get_initial_state, ), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)) + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, 0)) GENL_notification( DRBD_HELPER, 40, events, diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h index 6edcac85155e..a7d36c9ea924 100644 --- a/include/linux/genl_magic_func.h +++ b/include/linux/genl_magic_func.h @@ -149,7 +149,8 @@ static int __ ## s_name ## _from_attrs(struct s_name *s, \ if (!tla) \ return -ENOMSG; \ DPRINT_TLA(#s_name, "<=-", #tag_name); \ - err = drbd_nla_parse_nested(ntb, maxtype, tla, s_name ## _nl_policy); \ + err = nla_parse_nested_deprecated(ntb, maxtype, tla, \ + s_name ## _nl_policy, NULL); \ if (err) \ return err; \ \ diff --git a/include/linux/genl_magic_struct.h b/include/linux/genl_magic_struct.h index 621b87a87d74..2200cedd160a 100644 --- a/include/linux/genl_magic_struct.h +++ b/include/linux/genl_magic_struct.h @@ -25,16 +25,6 @@ extern void CONCATENATE(GENL_MAGIC_FAMILY, _genl_unregister)(void); * Extension of genl attribute validation policies {{{2 */ -/* - * @DRBD_GENLA_F_MANDATORY: By default, netlink ignores attributes it does not - * know about. This flag can be set in nlattr->nla_type to indicate that this - * attribute must not be ignored. - * - * We check and remove this flag in drbd_nla_check_mandatory() before - * validating the attribute types and lengths via nla_parse_nested(). - */ -#define DRBD_GENLA_F_MANDATORY (1 << 14) - /* * Flags specific to drbd and not visible at the netlink layer, used in * _from_attrs and _to_skb: @@ -52,7 +42,6 @@ extern void CONCATENATE(GENL_MAGIC_FAMILY, _genl_unregister)(void); #define DRBD_F_SENSITIVE (1 << 1) #define DRBD_F_INVARIANT (1 << 2) -#define __nla_type(x) ((__u16)((x) & NLA_TYPE_MASK & ~DRBD_GENLA_F_MANDATORY)) /* }}}1 * MAGIC @@ -158,12 +147,12 @@ enum { \ #undef __field #define __field(attr_nr, attr_flag, name, nla_type, type, \ __get, __put, __is_signed) \ - T_ ## name = (__u16)(attr_nr | ((attr_flag) & DRBD_GENLA_F_MANDATORY)), + T_ ## name = (__u16)(attr_nr), #undef __array #define __array(attr_nr, attr_flag, name, nla_type, type, \ maxlen, __get, __put, __is_signed) \ - T_ ## name = (__u16)(attr_nr | ((attr_flag) & DRBD_GENLA_F_MANDATORY)), + T_ ## name = (__u16)(attr_nr), #include GENL_MAGIC_INCLUDE_FILE -- cgit v1.2.3 From a4c6c18e93a1d24df9ab95794cef471c89daefe4 Mon Sep 17 00:00:00 2001 From: Huisong Li Date: Tue, 7 Apr 2026 16:11:40 +0800 Subject: cpuidle: Extract and export no-lock variants of cpuidle_unregister_device() The cpuidle_unregister_device() function always acquires the internal cpuidle_lock (or pause/resume idle) during their execution. However, in some power notification scenarios (e.g., when old idle states may become unavailable), it is necessary to efficiently disable cpuidle first, then remove and re-create all cpuidle devices for all CPUs. To avoid frequent lock overhead and ensure atomicity across the entire batch operation, the caller needs to hold the cpuidle_lock once outside the loop. To address this, extract the core logic into the new function cpuidle_unregister_device_no_lock() and export it. Signed-off-by: Huisong Li [ rjw: Added missing "inline", subject and changelog tweaks ] Link: https://patch.msgid.link/20260407081141.2493581-2-lihuisong@huawei.com Signed-off-by: Rafael J. Wysocki --- include/linux/cpuidle.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 4073690504a7..a2485348def3 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -188,6 +188,7 @@ extern void cpuidle_driver_state_disabled(struct cpuidle_driver *drv, int idx, extern void cpuidle_unregister_driver(struct cpuidle_driver *drv); extern int cpuidle_register_device(struct cpuidle_device *dev); extern void cpuidle_unregister_device(struct cpuidle_device *dev); +extern void cpuidle_unregister_device_no_lock(struct cpuidle_device *dev); extern int cpuidle_register(struct cpuidle_driver *drv, const struct cpumask *const coupled_cpus); extern void cpuidle_unregister(struct cpuidle_driver *drv); @@ -226,6 +227,7 @@ static inline void cpuidle_unregister_driver(struct cpuidle_driver *drv) { } static inline int cpuidle_register_device(struct cpuidle_device *dev) {return -ENODEV; } static inline void cpuidle_unregister_device(struct cpuidle_device *dev) { } +static inline void cpuidle_unregister_device_no_lock(struct cpuidle_device *dev) {} static inline int cpuidle_register(struct cpuidle_driver *drv, const struct cpumask *const coupled_cpus) {return -ENODEV; } -- cgit v1.2.3 From 8ea6b92faebe4bad0e271cb9a8d819b8955ed476 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Thu, 26 Mar 2026 12:14:32 +0200 Subject: wifi: ieee80211: add more NAN definitions These will be needed to implement NAN synchronization in mac80211_hwsim. Signed-off-by: Benjamin Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20260326121156.ebb52db4c1eb.Ie8142cf92fc8c97c744a7c8b0a94ce3da6ff75ec@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211-nan.h | 37 +++++++++++++++++++++++++++++++++++++ include/linux/ieee80211.h | 1 + 2 files changed, 38 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211-nan.h b/include/linux/ieee80211-nan.h index ebf28ea651f9..455033955e54 100644 --- a/include/linux/ieee80211-nan.h +++ b/include/linux/ieee80211-nan.h @@ -37,4 +37,41 @@ #define NAN_DEV_CAPA_NDPE_SUPPORTED 0x08 #define NAN_DEV_CAPA_S3_SUPPORTED 0x10 +/* NAN attributes, as defined in Wi-Fi Aware (TM) specification 4.0 Table 42 */ +#define NAN_ATTR_MASTER_INDICATION 0x00 +#define NAN_ATTR_CLUSTER_INFO 0x01 + +struct ieee80211_nan_attr { + u8 attr; + __le16 length; + u8 data[]; +} __packed; + +struct ieee80211_nan_master_indication { + u8 master_pref; + u8 random_factor; +} __packed; + +struct ieee80211_nan_anchor_master_info { + union { + __le64 master_rank; + struct { + u8 master_addr[ETH_ALEN]; + u8 random_factor; + u8 master_pref; + } __packed; + } __packed; + u8 hop_count; + __le32 ambtt; +} __packed; + +#define for_each_nan_attr(_attr, _data, _datalen) \ + for (_attr = (const struct ieee80211_nan_attr *)(_data); \ + (const u8 *)(_data) + (_datalen) - (const u8 *)_attr >= \ + (int)sizeof(*_attr) && \ + (const u8 *)(_data) + (_datalen) - (const u8 *)_attr >= \ + (int)sizeof(*_attr) + le16_to_cpu(_attr->length); \ + _attr = (const struct ieee80211_nan_attr *) \ + (_attr->data + le16_to_cpu(_attr->length))) + #endif /* LINUX_IEEE80211_NAN_H */ diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index b5d649db123f..ffa8f9f77efe 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2240,6 +2240,7 @@ struct ieee80211_multiple_bssid_configuration { #define WLAN_OUI_WFA 0x506f9a #define WLAN_OUI_TYPE_WFA_P2P 9 +#define WLAN_OUI_TYPE_WFA_NAN 0x13 #define WLAN_OUI_TYPE_WFA_DPP 0x1A #define WLAN_OUI_MICROSOFT 0x0050f2 #define WLAN_OUI_TYPE_MICROSOFT_WPA 1 -- cgit v1.2.3 From 6fa747550e35f0a74e649b19d97055988a25b2e4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Apr 2026 16:05:26 +0200 Subject: block: factor out a bio_await helper Add a new helper to wait for a bio and anything chained off it to complete synchronously after submitting it. This factors common code out of submit_bio_wait and bio_await_chain and will also be useful for file system code and thus is exported. Note that this will now set REQ_SYNC also for the bio_await case for consistency. Nothing should look at the flag in the end_io handler, but if something does having the flag set makes more sense. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Link: https://patch.msgid.link/20260407140538.633364-4-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 984844d2870b..97d747320b35 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -432,6 +432,8 @@ extern void bio_uninit(struct bio *); void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf); void bio_reuse(struct bio *bio, blk_opf_t opf); void bio_chain(struct bio *, struct bio *); +void bio_await(struct bio *bio, void *priv, + void (*submit)(struct bio *bio, void *priv)); int __must_check bio_add_page(struct bio *bio, struct page *page, unsigned len, unsigned off); -- cgit v1.2.3 From c713b96427ce5c4a74b8babe14137451ac3ffe54 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 7 Apr 2026 11:23:13 +0200 Subject: vt: Implement helpers for struct vc_font in source file Move the helpers vc_font_pitch() and vc_font_size() from the VT header file into source file. They are not called very often, so there's no benefit in keeping them in the headers. Also avoids including from the header. v2: - fix typo in commit description Signed-off-by: Thomas Zimmermann Acked-by: Greg Kroah-Hartman Signed-off-by: Helge Deller --- include/linux/console_struct.h | 30 ++---------------------------- 1 file changed, 2 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h index 771cba16cb54..fe4733c1ae4c 100644 --- a/include/linux/console_struct.h +++ b/include/linux/console_struct.h @@ -13,7 +13,6 @@ #ifndef _LINUX_CONSOLE_STRUCT_H #define _LINUX_CONSOLE_STRUCT_H -#include #include #include #include @@ -83,33 +82,8 @@ struct vc_font { const unsigned char *data; }; -/** - * vc_font_pitch - Calculates the number of bytes between two adjacent scanlines - * @font: The VC font - * - * Returns: - * The number of bytes between two adjacent scanlines in the font data - */ -static inline unsigned int vc_font_pitch(const struct vc_font *font) -{ - return DIV_ROUND_UP(font->width, 8); -} - -/** - * vc_font_size - Calculates the size of the font data in bytes - * @font: The VC font - * - * vc_font_size() calculates the number of bytes of font data in the - * font specified by @font. The function calculates the size from the - * font parameters. - * - * Returns: - * The size of the font data in bytes. - */ -static inline unsigned int vc_font_size(const struct vc_font *font) -{ - return font->height * vc_font_pitch(font) * font->charcount; -} +unsigned int vc_font_pitch(const struct vc_font *font); +unsigned int vc_font_size(const struct vc_font *font); /* * Example: vc_data of a console that was scrolled 3 lines down. -- cgit v1.2.3 From 97df8960240afc47c2349d008b0993e7727bbda5 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 7 Apr 2026 11:23:14 +0200 Subject: lib/fonts: Provide helpers for calculating glyph pitch and size Implement pitch and size calculation for a single font glyph in the new helpers font_glyph_pitch() and font_glyph_size(). Replace the instances where the calculations are open-coded. Note that in the case of fbcon console rotation, the parameters for a glyph's width and height might be reversed. This is intentional. v2: - fix typos in commit message Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- include/linux/font.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include/linux') diff --git a/include/linux/font.h b/include/linux/font.h index 5401f07dd6ce..3bd49d914b22 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -11,10 +11,50 @@ #ifndef _VIDEO_FONT_H #define _VIDEO_FONT_H +#include #include struct console_font; +/* + * Glyphs + */ + +/** + * font_glyph_pitch - Calculates the number of bytes per scanline + * @width: The glyph width in bits per scanline + * + * A glyph's pitch is the number of bytes in a single scanline, rounded + * up to the next full byte. The parameter @width receives the number + * of visible bits per scanline. For example, if width is 14 bytes per + * scanline, the pitch is 2 bytes per scanline. If width is 8 bits per + * scanline, the pitch is 1 byte per scanline. + * + * Returns: + * The number of bytes in a single scanline of the glyph + */ +static inline unsigned int font_glyph_pitch(unsigned int width) +{ + return DIV_ROUND_UP(width, 8); +} + +/** + * font_glyph_size - Calculates the number of bytes per glyph + * @width: The glyph width in bits per scanline + * @vpitch: The number of scanlines in the glyph + * + * The number of bytes in a glyph depends on the pitch and the number + * of scanlines. font_glyph_size automatically calculates the pitch + * from the given width. The parameter @vpitch gives the number of + * scanlines, which is usually the glyph's height in scanlines. Fonts + * coming from user space can sometimes have a different vertical pitch + * with empty scanlines between two adjacent glyphs. + */ +static inline unsigned int font_glyph_size(unsigned int width, unsigned int vpitch) +{ + return font_glyph_pitch(width) * vpitch; +} + /* * font_data_t and helpers */ -- cgit v1.2.3 From bdfd943231347ce57133d1ba2d93ed87f1050e81 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 7 Apr 2026 11:23:16 +0200 Subject: lib/fonts: Implement glyph rotation Move the glyph rotation helpers from fbcon to the font library. Wrap them behind clean interfaces. Also clear the output memory to zero. Previously, the implementation relied on the caller to do that. Go through the fbcon code and callers of the glyph-rotation helpers. In addition to the font rotation, there's also the cursor code, which uses the rotation helpers. The font-rotation relied on a single memset to zero for the whole font. This is now multiple memsets on each glyph. This will be sorted out when the font library also implements font rotation. Building glyph rotation in the font library still depends on CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y. If we get more users of the code, we can still add a dedicated Kconfig symbol to the font library. No changes have been made to the actual implementation of the rotate_*() and pattern_*() functions. These will be refactored as separate changes. v2: - fix typos Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- include/linux/font.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/font.h b/include/linux/font.h index 3bd49d914b22..0a240dd70422 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -104,6 +104,14 @@ unsigned int font_data_size(font_data_t *fd); bool font_data_is_equal(font_data_t *lhs, font_data_t *rhs); int font_data_export(font_data_t *fd, struct console_font *font, unsigned int vpitch); +/* font_rotate.c */ +void font_glyph_rotate_90(const unsigned char *glyph, unsigned int width, unsigned int height, + unsigned char *out); +void font_glyph_rotate_180(const unsigned char *glyph, unsigned int width, unsigned int height, + unsigned char *out); +void font_glyph_rotate_270(const unsigned char *glyph, unsigned int width, unsigned int height, + unsigned char *out); + /* * Font description */ -- cgit v1.2.3 From cfa72955a029cd79433694cac6b5630788609cd4 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 7 Apr 2026 11:23:19 +0200 Subject: lib/fonts: Implement font rotation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the core of fbcon's font-rotation code to the font library as the new helper font_data_rotate(). The code can rotate in steps of 90°. For completeness, it also copies the glyph data for multiples of 360°. Bring back the memset optimization. A memset to 0 again clears the whole glyph output buffer. Then use the internal rotation helpers on the cleared output. Fbcon's original implementation worked like this, but lost it during refactoring. Replace fbcon's font-rotation code with the new implementations. All that's left to do for fbcon is to maintain its internal fbcon state. v2: - fix typos Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- include/linux/font.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/font.h b/include/linux/font.h index 0a240dd70422..6845f02d739a 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -111,6 +111,9 @@ void font_glyph_rotate_180(const unsigned char *glyph, unsigned int width, unsig unsigned char *out); void font_glyph_rotate_270(const unsigned char *glyph, unsigned int width, unsigned int height, unsigned char *out); +unsigned char *font_data_rotate(font_data_t *fd, unsigned int width, unsigned int height, + unsigned int charcount, unsigned int steps, + unsigned char *buf, size_t *bufsize); /* * Font description -- cgit v1.2.3 From eb25e202b3d60cdc239f14e0e5f6f7465fcc506c Mon Sep 17 00:00:00 2001 From: Justin Suess Date: Fri, 27 Mar 2026 17:48:26 +0100 Subject: lsm: Add LSM hook security_unix_find MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an LSM hook security_unix_find. This hook is called to check the path of a named UNIX socket before a connection is initiated. The peer socket may be inspected as well. Why existing hooks are unsuitable: Existing socket hooks, security_unix_stream_connect(), security_unix_may_send(), and security_socket_connect() don't provide TOCTOU-free / namespace independent access to the paths of sockets. (1) We cannot resolve the path from the struct sockaddr in existing hooks. This requires another path lookup. A change in the path between the two lookups will cause a TOCTOU bug. (2) We cannot use the struct path from the listening socket, because it may be bound to a path in a different namespace than the caller, resulting in a path that cannot be referenced at policy creation time. Consumers of the hook wishing to reference @other are responsible for acquiring the unix_state_lock and checking for the SOCK_DEAD flag therein, ensuring the socket hasn't died since lookup. Cc: Günther Noack Cc: Tingmao Wang Cc: Mickaël Salaün Cc: Paul Moore Signed-off-by: Justin Suess Signed-off-by: Günther Noack Reviewed-by: Georgia Garcia Acked-by: Paul Moore Link: https://lore.kernel.org/r/20260327164838.38231-2-gnoack3000@gmail.com Signed-off-by: Mickaël Salaün --- include/linux/lsm_hook_defs.h | 5 +++++ include/linux/security.h | 11 +++++++++++ 2 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 8c42b4bde09c..7a0fd3dbfa29 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -317,6 +317,11 @@ LSM_HOOK(int, 0, post_notification, const struct cred *w_cred, LSM_HOOK(int, 0, watch_key, struct key *key) #endif /* CONFIG_SECURITY && CONFIG_KEY_NOTIFICATIONS */ +#if defined(CONFIG_SECURITY_NETWORK) && defined(CONFIG_SECURITY_PATH) +LSM_HOOK(int, 0, unix_find, const struct path *path, struct sock *other, + int flags) +#endif /* CONFIG_SECURITY_NETWORK && CONFIG_SECURITY_PATH */ + #ifdef CONFIG_SECURITY_NETWORK LSM_HOOK(int, 0, unix_stream_connect, struct sock *sock, struct sock *other, struct sock *newsk) diff --git a/include/linux/security.h b/include/linux/security.h index ee88dd2d2d1f..c2d665cbfcfb 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1932,6 +1932,17 @@ static inline int security_mptcp_add_subflow(struct sock *sk, struct sock *ssk) } #endif /* CONFIG_SECURITY_NETWORK */ +#if defined(CONFIG_SECURITY_NETWORK) && defined(CONFIG_SECURITY_PATH) + +int security_unix_find(const struct path *path, struct sock *other, int flags); + +#else /* CONFIG_SECURITY_NETWORK && CONFIG_SECURITY_PATH */ +static inline int security_unix_find(const struct path *path, struct sock *other, int flags) +{ + return 0; +} +#endif /* CONFIG_SECURITY_NETWORK && CONFIG_SECURITY_PATH */ + #ifdef CONFIG_SECURITY_INFINIBAND int security_ib_pkey_access(void *sec, u64 subnet_prefix, u16 pkey); int security_ib_endport_manage_subnet(void *sec, const char *name, u8 port_num); -- cgit v1.2.3 From 57b23c0f612dcfa1aae99c9422d6d36ced1670d4 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 7 Apr 2026 18:22:33 +0200 Subject: bpf: Retire rcu_trace_implies_rcu_gp() RCU Tasks Trace grace period implies RCU grace period, and this guarantee is expected to remain in the future. Only BPF is the user of this predicate, hence retire the API and clean up all in-tree users. RCU Tasks Trace is now implemented on SRCU-fast and its grace period mechanism always has at least one call to synchronize_rcu() as it is required for SRCU-fast's correctness (it replaces the smp_mb() that SRCU-fast readers skip). So, RCU-tt GP will always imply RCU GP. Reviewed-by: Puranjay Mohan Reviewed-by: Paul E. McKenney Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260407162234.785270-1-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/rcupdate.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 04f3f86a4145..bfa765132de8 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -205,18 +205,6 @@ static inline void exit_tasks_rcu_start(void) { } static inline void exit_tasks_rcu_finish(void) { } #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ -/** - * rcu_trace_implies_rcu_gp - does an RCU Tasks Trace grace period imply an RCU grace period? - * - * As an accident of implementation, an RCU Tasks Trace grace period also - * acts as an RCU grace period. However, this could change at any time. - * Code relying on this accident must call this function to verify that - * this accident is still happening. - * - * You have been warned! - */ -static inline bool rcu_trace_implies_rcu_gp(void) { return true; } - /** * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU * -- cgit v1.2.3 From d79dc408deb6c192adbad7893ee0c22d50826511 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 3 Apr 2026 00:18:15 +0200 Subject: PCI: Remove no_pci_devices() After having removed the last usage of no_pci_devices(), this function can be removed. Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/b0ce592d-c34c-4e0b-b389-4e346b3a0c44@gmail.com --- include/linux/pci.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 1c270f1d5123..482dd8460dd9 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1193,8 +1193,6 @@ extern const struct bus_type pci_bus_type; /* Do NOT directly access these two variables, unless you are arch-specific PCI * code, or PCI core code. */ extern struct list_head pci_root_buses; /* List of all known PCI buses */ -/* Some device drivers need know if PCI is initiated */ -int no_pci_devices(void); void pcibios_resource_survey_bus(struct pci_bus *bus); void pcibios_bus_add_device(struct pci_dev *pdev); @@ -2132,7 +2130,6 @@ static inline struct pci_dev *pci_get_base_class(unsigned int class, static inline int pci_dev_present(const struct pci_device_id *ids) { return 0; } -#define no_pci_devices() (1) #define pci_dev_put(dev) do { } while (0) static inline void pci_set_master(struct pci_dev *dev) { } -- cgit v1.2.3 From 2232ba9c7931d5c1061f7f4e897b944ea39c3aa9 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Tue, 24 Feb 2026 12:06:18 +1000 Subject: mm: add gpu active/reclaim per-node stat counters (v2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While discussing memcg intergration with gpu memory allocations, it was pointed out that there was no numa/system counters for GPU memory allocations. With more integrated memory GPU server systems turning up, and more requirements for memory tracking it seems we should start closing the gap. Add two counters to track GPU per-node system memory allocations. The first is currently allocated to GPU objects, and the second is for memory that is stored in GPU page pools that can be reclaimed, by the shrinker. Cc: Christian Koenig Cc: Matthew Brost Cc: Johannes Weiner Cc: linux-mm@kvack.org Cc: Andrew Morton Acked-by: Zi Yan Acked-by: Shakeel Butt Acked-by: Andrew Morton Acked-by: Christian König Signed-off-by: Dave Airlie --- include/linux/mmzone.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3e51190a55e4..841b40031833 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -260,6 +260,8 @@ enum node_stat_item { #endif NR_BALLOON_PAGES, NR_KERNEL_FILE_PAGES, + NR_GPU_ACTIVE, /* Pages assigned to GPU objects */ + NR_GPU_RECLAIM, /* Pages in shrinkable GPU pools */ NR_VM_NODE_STAT_ITEMS }; -- cgit v1.2.3 From 6e6f2b9b3375cc0e6b8567d31ae7d3b2d910582f Mon Sep 17 00:00:00 2001 From: Sun Jian Date: Tue, 3 Mar 2026 18:15:25 +0800 Subject: netfilter: use function typedefs for __rcu NAT helper hook pointers After commit 07919126ecfc ("netfilter: annotate NAT helper hook pointers with __rcu"), sparse can warn about type/address-space mismatches when RCU-dereferencing NAT helper hook function pointers. The hooks are __rcu-annotated and accessed via rcu_dereference(), but the combination of complex function pointer declarators and the WRITE_ONCE() machinery used by RCU_INIT_POINTER()/rcu_assign_pointer() can confuse sparse and trigger false positives. Introduce typedefs for the NAT helper function types, so __rcu applies to a simple "fn_t __rcu *" pointer form. Also replace local typeof(hook) variables with "fn_t *" to avoid propagating __rcu address space into temporaries. No functional change intended. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202603022359.3dGE9fwI-lkp@intel.com/ Signed-off-by: Sun Jian Signed-off-by: Florian Westphal --- include/linux/netfilter/nf_conntrack_amanda.h | 15 +++++++++------ include/linux/netfilter/nf_conntrack_ftp.h | 17 ++++++++++------- include/linux/netfilter/nf_conntrack_irc.h | 15 +++++++++------ include/linux/netfilter/nf_conntrack_snmp.h | 11 +++++++---- include/linux/netfilter/nf_conntrack_tftp.h | 9 ++++++--- 5 files changed, 41 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_conntrack_amanda.h b/include/linux/netfilter/nf_conntrack_amanda.h index dfe89f38d1f7..1719987e8fd8 100644 --- a/include/linux/netfilter/nf_conntrack_amanda.h +++ b/include/linux/netfilter/nf_conntrack_amanda.h @@ -7,10 +7,13 @@ #include #include -extern unsigned int (__rcu *nf_nat_amanda_hook)(struct sk_buff *skb, - enum ip_conntrack_info ctinfo, - unsigned int protoff, - unsigned int matchoff, - unsigned int matchlen, - struct nf_conntrack_expect *exp); +typedef unsigned int +nf_nat_amanda_hook_fn(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + unsigned int protoff, + unsigned int matchoff, + unsigned int matchlen, + struct nf_conntrack_expect *exp); + +extern nf_nat_amanda_hook_fn __rcu *nf_nat_amanda_hook; #endif /* _NF_CONNTRACK_AMANDA_H */ diff --git a/include/linux/netfilter/nf_conntrack_ftp.h b/include/linux/netfilter/nf_conntrack_ftp.h index f31292642035..7b62446ccec4 100644 --- a/include/linux/netfilter/nf_conntrack_ftp.h +++ b/include/linux/netfilter/nf_conntrack_ftp.h @@ -26,11 +26,14 @@ struct nf_ct_ftp_master { /* For NAT to hook in when we find a packet which describes what other * connection we should expect. */ -extern unsigned int (__rcu *nf_nat_ftp_hook)(struct sk_buff *skb, - enum ip_conntrack_info ctinfo, - enum nf_ct_ftp_type type, - unsigned int protoff, - unsigned int matchoff, - unsigned int matchlen, - struct nf_conntrack_expect *exp); +typedef unsigned int +nf_nat_ftp_hook_fn(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + enum nf_ct_ftp_type type, + unsigned int protoff, + unsigned int matchoff, + unsigned int matchlen, + struct nf_conntrack_expect *exp); + +extern nf_nat_ftp_hook_fn __rcu *nf_nat_ftp_hook; #endif /* _NF_CONNTRACK_FTP_H */ diff --git a/include/linux/netfilter/nf_conntrack_irc.h b/include/linux/netfilter/nf_conntrack_irc.h index 4f3ca5621998..ce07250afb4e 100644 --- a/include/linux/netfilter/nf_conntrack_irc.h +++ b/include/linux/netfilter/nf_conntrack_irc.h @@ -8,11 +8,14 @@ #define IRC_PORT 6667 -extern unsigned int (__rcu *nf_nat_irc_hook)(struct sk_buff *skb, - enum ip_conntrack_info ctinfo, - unsigned int protoff, - unsigned int matchoff, - unsigned int matchlen, - struct nf_conntrack_expect *exp); +typedef unsigned int +nf_nat_irc_hook_fn(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + unsigned int protoff, + unsigned int matchoff, + unsigned int matchlen, + struct nf_conntrack_expect *exp); + +extern nf_nat_irc_hook_fn __rcu *nf_nat_irc_hook; #endif /* _NF_CONNTRACK_IRC_H */ diff --git a/include/linux/netfilter/nf_conntrack_snmp.h b/include/linux/netfilter/nf_conntrack_snmp.h index 99107e4f5234..bb39f04a9977 100644 --- a/include/linux/netfilter/nf_conntrack_snmp.h +++ b/include/linux/netfilter/nf_conntrack_snmp.h @@ -5,9 +5,12 @@ #include #include -extern int (__rcu *nf_nat_snmp_hook)(struct sk_buff *skb, - unsigned int protoff, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo); +typedef int +nf_nat_snmp_hook_fn(struct sk_buff *skb, + unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo); + +extern nf_nat_snmp_hook_fn __rcu *nf_nat_snmp_hook; #endif /* _NF_CONNTRACK_SNMP_H */ diff --git a/include/linux/netfilter/nf_conntrack_tftp.h b/include/linux/netfilter/nf_conntrack_tftp.h index 1490b68dd7d1..90b334bbce3c 100644 --- a/include/linux/netfilter/nf_conntrack_tftp.h +++ b/include/linux/netfilter/nf_conntrack_tftp.h @@ -19,8 +19,11 @@ struct tftphdr { #define TFTP_OPCODE_ACK 4 #define TFTP_OPCODE_ERROR 5 -extern unsigned int (__rcu *nf_nat_tftp_hook)(struct sk_buff *skb, - enum ip_conntrack_info ctinfo, - struct nf_conntrack_expect *exp); +typedef unsigned int +nf_nat_tftp_hook_fn(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + struct nf_conntrack_expect *exp); + +extern nf_nat_tftp_hook_fn __rcu *nf_nat_tftp_hook; #endif /* _NF_CONNTRACK_TFTP_H */ -- cgit v1.2.3 From 613c83766884503f0f6bfdc45964c84b5286091c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 7 Apr 2026 20:06:47 -0700 Subject: wifi: mac80211, cfg80211: Export michael_mic() and move it to cfg80211 Export michael_mic() so that the ath11k and ath12k drivers can call it. In addition, move it from mac80211 to cfg80211 so that the ipw2x00 drivers, which depend on cfg80211 but not mac80211, can also call it. Currently these drivers have their own local implementations of michael_mic() based on crypto_shash, which is redundant and inefficient. By consolidating all the Michael MIC code into cfg80211, we'll be able to remove the duplicate Michael MIC code in the crypto/ directory. Signed-off-by: Eric Biggers Link: https://patch.msgid.link/20260408030651.80336-3-ebiggers@kernel.org Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index ffa8f9f77efe..23f9df9be837 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1921,6 +1921,11 @@ enum ieee80211_radio_measurement_actioncode { #define PMK_MAX_LEN 64 #define SAE_PASSWORD_MAX_LEN 128 +#define MICHAEL_MIC_LEN 8 + +void michael_mic(const u8 *key, struct ieee80211_hdr *hdr, + const u8 *data, size_t data_len, u8 *mic); + /* Public action codes (IEEE Std 802.11-2016, 9.6.8.1, Table 9-307) */ enum ieee80211_pub_actioncode { WLAN_PUB_ACTION_20_40_BSS_COEX = 0, -- cgit v1.2.3 From 1f0d117cd6ca8e74e70e415e89b059fce37674c6 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 7 Apr 2026 14:16:41 +0100 Subject: entry: Fix stale comment for irqentry_enter() The kerneldoc comment for irqentry_enter() refers to idtentry_exit(), which is an accidental holdover from the x86 entry code that the generic irqentry code was based on. Correct this to refer to irqentry_exit(). Signed-off-by: Mark Rutland Signed-off-by: Thomas Gleixner Reviewed-by: Jinjie Ruan Acked-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260407131650.3813777-2-mark.rutland@arm.com --- include/linux/irq-entry-common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index d26d1b1bcbfb..3cf4d21168ba 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -394,7 +394,7 @@ typedef struct irqentry_state { * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit * would not be possible. * - * Returns: An opaque object that must be passed to idtentry_exit() + * Returns: An opaque object that must be passed to irqentry_exit() */ irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs); -- cgit v1.2.3 From 22f66e7ef4ce9414b4bd18abe50ead4a1284b01a Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 7 Apr 2026 14:16:42 +0100 Subject: entry: Remove local_irq_{enable,disable}_exit_to_user() local_irq_enable_exit_to_user() and local_irq_disable_exit_to_user() are never overridden by architecture code, and are always equivalent to local_irq_enable() and local_irq_disable(). These functions were added on the assumption that arm64 would override them to manage 'DAIF' exception masking, as described by Thomas Gleixner in these threads: https://lore.kernel.org/all/20190919150809.340471236@linutronix.de/ https://lore.kernel.org/all/alpine.DEB.2.21.1910240119090.1852@nanos.tec.linutronix.de/ In practice arm64 did not need to override either. Prior to moving to the generic irqentry code, arm64's management of DAIF was reworked in commit: 97d935faacde ("arm64: Unmask Debug + SError in do_notify_resume()") Since that commit, arm64 only masks interrupts during the 'prepare' step when returning to user mode, and masks other DAIF exceptions later. Within arm64_exit_to_user_mode(), the arm64 entry code is as follows: local_irq_disable(); exit_to_user_mode_prepare_legacy(regs); local_daif_mask(); mte_check_tfsr_exit(); exit_to_user_mode(); Remove the unnecessary local_irq_enable_exit_to_user() and local_irq_disable_exit_to_user() functions. Signed-off-by: Mark Rutland Signed-off-by: Thomas Gleixner Reviewed-by: Jinjie Ruan Acked-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260407131650.3813777-3-mark.rutland@arm.com --- include/linux/entry-common.h | 2 +- include/linux/irq-entry-common.h | 31 ------------------------------- 2 files changed, 1 insertion(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index f83ca0abf2cd..dbaa153100f4 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -321,7 +321,7 @@ static __always_inline void syscall_exit_to_user_mode(struct pt_regs *regs) { instrumentation_begin(); syscall_exit_to_user_mode_work(regs); - local_irq_disable_exit_to_user(); + local_irq_disable(); syscall_exit_to_user_mode_prepare(regs); instrumentation_end(); exit_to_user_mode(); diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index 3cf4d21168ba..93b4b551f7ae 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -100,37 +100,6 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs) instrumentation_end(); } -/** - * local_irq_enable_exit_to_user - Exit to user variant of local_irq_enable() - * @ti_work: Cached TIF flags gathered with interrupts disabled - * - * Defaults to local_irq_enable(). Can be supplied by architecture specific - * code. - */ -static inline void local_irq_enable_exit_to_user(unsigned long ti_work); - -#ifndef local_irq_enable_exit_to_user -static __always_inline void local_irq_enable_exit_to_user(unsigned long ti_work) -{ - local_irq_enable(); -} -#endif - -/** - * local_irq_disable_exit_to_user - Exit to user variant of local_irq_disable() - * - * Defaults to local_irq_disable(). Can be supplied by architecture specific - * code. - */ -static inline void local_irq_disable_exit_to_user(void); - -#ifndef local_irq_disable_exit_to_user -static __always_inline void local_irq_disable_exit_to_user(void) -{ - local_irq_disable(); -} -#endif - /** * arch_exit_to_user_mode_work - Architecture specific TIF work for exit * to user mode. -- cgit v1.2.3 From eb1b51afde506a8e38976190e518990d69ef5382 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 7 Apr 2026 14:16:43 +0100 Subject: entry: Move irqentry_enter() prototype later Subsequent patches will rework the irqentry_*() functions. The end result (and the intermediate diffs) will be much clearer if the prototype for the irqentry_enter() function is moved later, immediately before the prototype of the irqentry_exit() function. Move the prototype later. This is purely a move; there should be no functional change as a result of this change. Signed-off-by: Mark Rutland Signed-off-by: Thomas Gleixner Reviewed-by: Jinjie Ruan Acked-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260407131650.3813777-4-mark.rutland@arm.com --- include/linux/irq-entry-common.h | 44 ++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index 93b4b551f7ae..d1e8591a5919 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -334,6 +334,28 @@ typedef struct irqentry_state { } irqentry_state_t; #endif +/** + * irqentry_exit_cond_resched - Conditionally reschedule on return from interrupt + * + * Conditional reschedule with additional sanity checks. + */ +void raw_irqentry_exit_cond_resched(void); + +#ifdef CONFIG_PREEMPT_DYNAMIC +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +#define irqentry_exit_cond_resched_dynamic_enabled raw_irqentry_exit_cond_resched +#define irqentry_exit_cond_resched_dynamic_disabled NULL +DECLARE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched); +#define irqentry_exit_cond_resched() static_call(irqentry_exit_cond_resched)() +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +DECLARE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); +void dynamic_irqentry_exit_cond_resched(void); +#define irqentry_exit_cond_resched() dynamic_irqentry_exit_cond_resched() +#endif +#else /* CONFIG_PREEMPT_DYNAMIC */ +#define irqentry_exit_cond_resched() raw_irqentry_exit_cond_resched() +#endif /* CONFIG_PREEMPT_DYNAMIC */ + /** * irqentry_enter - Handle state tracking on ordinary interrupt entries * @regs: Pointer to pt_regs of interrupted context @@ -367,28 +389,6 @@ typedef struct irqentry_state { */ irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs); -/** - * irqentry_exit_cond_resched - Conditionally reschedule on return from interrupt - * - * Conditional reschedule with additional sanity checks. - */ -void raw_irqentry_exit_cond_resched(void); - -#ifdef CONFIG_PREEMPT_DYNAMIC -#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -#define irqentry_exit_cond_resched_dynamic_enabled raw_irqentry_exit_cond_resched -#define irqentry_exit_cond_resched_dynamic_disabled NULL -DECLARE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched); -#define irqentry_exit_cond_resched() static_call(irqentry_exit_cond_resched)() -#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) -DECLARE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); -void dynamic_irqentry_exit_cond_resched(void); -#define irqentry_exit_cond_resched() dynamic_irqentry_exit_cond_resched() -#endif -#else /* CONFIG_PREEMPT_DYNAMIC */ -#define irqentry_exit_cond_resched() raw_irqentry_exit_cond_resched() -#endif /* CONFIG_PREEMPT_DYNAMIC */ - /** * irqentry_exit - Handle return from exception that used irqentry_enter() * @regs: Pointer to pt_regs (exception entry regs) -- cgit v1.2.3 From c5538d0141b383808f440186fcd0bc2799af2853 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 7 Apr 2026 14:16:44 +0100 Subject: entry: Split kernel mode logic from irqentry_{enter,exit}() The generic irqentry code has entry/exit functions specifically for exceptions taken from user mode, but doesn't have entry/exit functions specifically for exceptions taken from kernel mode. It would be helpful to have separate entry/exit functions specifically for exceptions taken from kernel mode. This would make the structure of the entry code more consistent, and would make it easier for architectures to manage logic specific to exceptions taken from kernel mode. Move the logic specific to kernel mode out of irqentry_enter() and irqentry_exit() into new irqentry_enter_from_kernel_mode() and irqentry_exit_to_kernel_mode() functions. These are marked __always_inline and placed in irq-entry-common.h, as with irqentry_enter_from_user_mode() and irqentry_exit_to_user_mode(), so that they can be inlined into architecture-specific wrappers. The existing out-of-line irqentry_enter() and irqentry_exit() functions retained as callers of the new functions. The lockdep assertion from irqentry_exit() is moved into irqentry_exit_to_user_mode() and irqentry_exit_to_kernel_mode(). This was previously missing from irqentry_exit_to_user_mode() when called directly, and any new lockdep assertion failure relating from this change is a latent bug. Aside from the lockdep change noted above, there should be no functional change as a result of this change. [ tglx: Updated kernel doc ] Signed-off-by: Mark Rutland Signed-off-by: Thomas Gleixner Reviewed-by: Jinjie Ruan Acked-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260407131650.3813777-5-mark.rutland@arm.com --- include/linux/irq-entry-common.h | 134 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index d1e8591a5919..66bc168bc6b5 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -304,6 +304,8 @@ static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs) */ static __always_inline void irqentry_exit_to_user_mode(struct pt_regs *regs) { + lockdep_assert_irqs_disabled(); + instrumentation_begin(); irqentry_exit_to_user_mode_prepare(regs); instrumentation_end(); @@ -356,6 +358,138 @@ void dynamic_irqentry_exit_cond_resched(void); #define irqentry_exit_cond_resched() raw_irqentry_exit_cond_resched() #endif /* CONFIG_PREEMPT_DYNAMIC */ +/** + * irqentry_enter_from_kernel_mode - Establish state before invoking the irq handler + * @regs: Pointer to currents pt_regs + * + * Invoked from architecture specific entry code with interrupts disabled. + * Can only be called when the interrupt entry came from kernel mode. The + * calling code must be non-instrumentable. When the function returns all + * state is correct and the subsequent functions can be instrumented. + * + * The function establishes state (lockdep, RCU (context tracking), tracing) and + * is provided for architectures which require a strict split between entry from + * kernel and user mode and therefore cannot use irqentry_enter() which handles + * both entry modes. + * + * Returns: An opaque object that must be passed to irqentry_exit_to_kernel_mode(). + */ +static __always_inline irqentry_state_t irqentry_enter_from_kernel_mode(struct pt_regs *regs) +{ + irqentry_state_t ret = { + .exit_rcu = false, + }; + + /* + * If this entry hit the idle task invoke ct_irq_enter() whether + * RCU is watching or not. + * + * Interrupts can nest when the first interrupt invokes softirq + * processing on return which enables interrupts. + * + * Scheduler ticks in the idle task can mark quiescent state and + * terminate a grace period, if and only if the timer interrupt is + * not nested into another interrupt. + * + * Checking for rcu_is_watching() here would prevent the nesting + * interrupt to invoke ct_irq_enter(). If that nested interrupt is + * the tick then rcu_flavor_sched_clock_irq() would wrongfully + * assume that it is the first interrupt and eventually claim + * quiescent state and end grace periods prematurely. + * + * Unconditionally invoke ct_irq_enter() so RCU state stays + * consistent. + * + * TINY_RCU does not support EQS, so let the compiler eliminate + * this part when enabled. + */ + if (!IS_ENABLED(CONFIG_TINY_RCU) && + (is_idle_task(current) || arch_in_rcu_eqs())) { + /* + * If RCU is not watching then the same careful + * sequence vs. lockdep and tracing is required + * as in irqentry_enter_from_user_mode(). + */ + lockdep_hardirqs_off(CALLER_ADDR0); + ct_irq_enter(); + instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); + trace_hardirqs_off_finish(); + instrumentation_end(); + + ret.exit_rcu = true; + return ret; + } + + /* + * If RCU is watching then RCU only wants to check whether it needs + * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick() + * already contains a warning when RCU is not watching, so no point + * in having another one here. + */ + lockdep_hardirqs_off(CALLER_ADDR0); + instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); + rcu_irq_enter_check_tick(); + trace_hardirqs_off_finish(); + instrumentation_end(); + + return ret; +} + +/** + * irqentry_exit_to_kernel_mode - Run preempt checks and establish state after + * invoking the interrupt handler + * @regs: Pointer to current's pt_regs + * @state: Return value from matching call to irqentry_enter_from_kernel_mode() + * + * This is the counterpart of irqentry_enter_from_kernel_mode() and runs the + * necessary preemption check if possible and required. It returns to the caller + * with interrupts disabled and the correct state vs. tracing, lockdep and RCU + * required to return to the interrupted context. + * + * It is the last action before returning to the low level ASM code which just + * needs to return. + */ +static __always_inline void irqentry_exit_to_kernel_mode(struct pt_regs *regs, + irqentry_state_t state) +{ + lockdep_assert_irqs_disabled(); + + if (!regs_irqs_disabled(regs)) { + /* + * If RCU was not watching on entry this needs to be done + * carefully and needs the same ordering of lockdep/tracing + * and RCU as the return to user mode path. + */ + if (state.exit_rcu) { + instrumentation_begin(); + /* Tell the tracer that IRET will enable interrupts */ + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(); + instrumentation_end(); + ct_irq_exit(); + lockdep_hardirqs_on(CALLER_ADDR0); + return; + } + + instrumentation_begin(); + if (IS_ENABLED(CONFIG_PREEMPTION)) + irqentry_exit_cond_resched(); + + /* Covers both tracing and lockdep */ + trace_hardirqs_on(); + instrumentation_end(); + } else { + /* + * IRQ flags state is correct already. Just tell RCU if it + * was not watching on entry. + */ + if (state.exit_rcu) + ct_irq_exit(); + } +} + /** * irqentry_enter - Handle state tracking on ordinary interrupt entries * @regs: Pointer to pt_regs of interrupted context -- cgit v1.2.3 From 041aa7a85390c99b1de86dc28eddcff0890d8186 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 7 Apr 2026 14:16:45 +0100 Subject: entry: Split preemption from irqentry_exit_to_kernel_mode() Some architecture-specific work needs to be performed between the state management for exception entry/exit and the "real" work to handle the exception. For example, arm64 needs to manipulate a number of exception masking bits, with different exceptions requiring different masking. Generally this can all be hidden in the architecture code, but for arm64 the current structure of irqentry_exit_to_kernel_mode() makes this particularly difficult to handle in a way that is correct, maintainable, and efficient. The gory details are described in the thread surrounding: https://lore.kernel.org/lkml/acPAzdtjK5w-rNqC@J2N7QTR9R3/ The summary is: * Currently, irqentry_exit_to_kernel_mode() handles both involuntary preemption AND state management necessary for exception return. * When scheduling (including involuntary preemption), arm64 needs to have all arm64-specific exceptions unmasked, though regular interrupts must be masked. * Prior to the state management for exception return, arm64 needs to mask a number of arm64-specific exceptions, and perform some work with these exceptions masked (with RCU watching, etc). While in theory it is possible to handle this with a new arch_*() hook called somewhere under irqentry_exit_to_kernel_mode(), this is fragile and complicated, and doesn't match the flow used for exception return to user mode, which has a separate 'prepare' step (where preemption can occur) prior to the state management. To solve this, refactor irqentry_exit_to_kernel_mode() to match the style of {irqentry,syscall}_exit_to_user_mode(), moving preemption logic into a new irqentry_exit_to_kernel_mode_preempt() function, and moving state management in a new irqentry_exit_to_kernel_mode_after_preempt() function. The existing irqentry_exit_to_kernel_mode() is left as a caller of both of these, avoiding the need to modify existing callers. There should be no functional change as a result of this change. [ tglx: Updated kernel doc ] Signed-off-by: Mark Rutland Signed-off-by: Thomas Gleixner Reviewed-by: Jinjie Ruan Acked-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260407131650.3813777-6-mark.rutland@arm.com --- include/linux/irq-entry-common.h | 73 ++++++++++++++++++++++++++++++++-------- 1 file changed, 59 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index 66bc168bc6b5..384520217bfe 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -438,24 +438,46 @@ static __always_inline irqentry_state_t irqentry_enter_from_kernel_mode(struct p } /** - * irqentry_exit_to_kernel_mode - Run preempt checks and establish state after - * invoking the interrupt handler + * irqentry_exit_to_kernel_mode_preempt - Run preempt checks on return to kernel mode * @regs: Pointer to current's pt_regs * @state: Return value from matching call to irqentry_enter_from_kernel_mode() * - * This is the counterpart of irqentry_enter_from_kernel_mode() and runs the - * necessary preemption check if possible and required. It returns to the caller - * with interrupts disabled and the correct state vs. tracing, lockdep and RCU - * required to return to the interrupted context. + * This is to be invoked before irqentry_exit_to_kernel_mode_after_preempt() to + * allow kernel preemption on return from interrupt. + * + * Must be invoked with interrupts disabled and CPU state which allows kernel + * preemption. * - * It is the last action before returning to the low level ASM code which just - * needs to return. + * After returning from this function, the caller can modify CPU state before + * invoking irqentry_exit_to_kernel_mode_after_preempt(), which is required to + * re-establish the tracing, lockdep and RCU state for returning to the + * interrupted context. */ -static __always_inline void irqentry_exit_to_kernel_mode(struct pt_regs *regs, - irqentry_state_t state) +static inline void irqentry_exit_to_kernel_mode_preempt(struct pt_regs *regs, + irqentry_state_t state) { - lockdep_assert_irqs_disabled(); + if (regs_irqs_disabled(regs) || state.exit_rcu) + return; + + if (IS_ENABLED(CONFIG_PREEMPTION)) + irqentry_exit_cond_resched(); +} +/** + * irqentry_exit_to_kernel_mode_after_preempt - Establish trace, lockdep and RCU state + * @regs: Pointer to current's pt_regs + * @state: Return value from matching call to irqentry_enter_from_kernel_mode() + * + * This is to be invoked after irqentry_exit_to_kernel_mode_preempt() and before + * actually returning to the interrupted context. + * + * There are no requirements for the CPU state other than being able to complete + * the tracing, lockdep and RCU state transitions. After this function returns + * the caller must return directly to the interrupted context. + */ +static __always_inline void +irqentry_exit_to_kernel_mode_after_preempt(struct pt_regs *regs, irqentry_state_t state) +{ if (!regs_irqs_disabled(regs)) { /* * If RCU was not watching on entry this needs to be done @@ -474,9 +496,6 @@ static __always_inline void irqentry_exit_to_kernel_mode(struct pt_regs *regs, } instrumentation_begin(); - if (IS_ENABLED(CONFIG_PREEMPTION)) - irqentry_exit_cond_resched(); - /* Covers both tracing and lockdep */ trace_hardirqs_on(); instrumentation_end(); @@ -490,6 +509,32 @@ static __always_inline void irqentry_exit_to_kernel_mode(struct pt_regs *regs, } } +/** + * irqentry_exit_to_kernel_mode - Run preempt checks and establish state after + * invoking the interrupt handler + * @regs: Pointer to current's pt_regs + * @state: Return value from matching call to irqentry_enter_from_kernel_mode() + * + * This is the counterpart of irqentry_enter_from_kernel_mode() and combines + * the calls to irqentry_exit_to_kernel_mode_preempt() and + * irqentry_exit_to_kernel_mode_after_preempt(). + * + * The requirement for the CPU state is that it can schedule. After the function + * returns the tracing, lockdep and RCU state transitions are completed and the + * caller must return directly to the interrupted context. + */ +static __always_inline void irqentry_exit_to_kernel_mode(struct pt_regs *regs, + irqentry_state_t state) +{ + lockdep_assert_irqs_disabled(); + + instrumentation_begin(); + irqentry_exit_to_kernel_mode_preempt(regs, state); + instrumentation_end(); + + irqentry_exit_to_kernel_mode_after_preempt(regs, state); +} + /** * irqentry_enter - Handle state tracking on ordinary interrupt entries * @regs: Pointer to pt_regs of interrupted context -- cgit v1.2.3 From d33e89d12295087afd108a42f5e240ff53820461 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 7 Apr 2026 16:09:21 +0200 Subject: thermal: core: Suspend thermal zones later and resume them earlier To avoid some undesirable interactions between thermal zone suspend and resume with user space that is running when those operations are carried out, move them closer to the suspend and resume of devices, respectively, by updating dpm_prepare() to carry out thermal zone suspend and dpm_complete() to start thermal zone resume (that will continue asynchronously). This also makes the code easier to follow by removing one, arguably redundant, level of indirection represented by the thermal PM notifier. Signed-off-by: Rafael J. Wysocki Reviewed-by: Armin Wolf Link: https://patch.msgid.link/2036875.PYKUYFuaPT@rafael.j.wysocki --- include/linux/thermal.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 0b5ed6821080..0ddc77aeeca2 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -273,6 +273,9 @@ bool thermal_trip_is_bound_to_cdev(struct thermal_zone_device *tz, int thermal_zone_device_enable(struct thermal_zone_device *tz); int thermal_zone_device_disable(struct thermal_zone_device *tz); void thermal_zone_device_critical(struct thermal_zone_device *tz); + +void thermal_pm_prepare(void); +void thermal_pm_complete(void); #else static inline struct thermal_zone_device *thermal_zone_device_register_with_trips( const char *type, @@ -350,6 +353,9 @@ static inline int thermal_zone_device_enable(struct thermal_zone_device *tz) static inline int thermal_zone_device_disable(struct thermal_zone_device *tz) { return -ENODEV; } + +static inline void thermal_pm_prepare(void) {} +static inline void thermal_pm_complete(void) {} #endif /* CONFIG_THERMAL */ #endif /* __THERMAL_H__ */ -- cgit v1.2.3 From 408df6213f56f467675dc0ecf156a8bd1984555e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 6 Apr 2026 21:36:48 -0700 Subject: dma-fence: correct kernel-doc function parameter @flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 'make htmldocs' complains that dma_fence_unlock_irqrestore() is missing a description of its @flags parameter. The description is there but it is missing a ':' sign. Add that and correct the possessive form of "its". WARNING: ../include/linux/dma-fence.h:414 function parameter 'flags' not described in 'dma_fence_unlock_irqrestore' Fixes: 3e5067931b5d ("dma-buf: abstract fence locking v2") Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20260407043649.2015894-1-rdunlap@infradead.org Reviewed-by: Christian König Signed-off-by: Christian König --- include/linux/dma-fence.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index 3dc93f068bf6..b52ab692b22e 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -408,9 +408,9 @@ static inline spinlock_t *dma_fence_spinlock(struct dma_fence *fence) /** * dma_fence_unlock_irqrestore - unlock the fence and irqrestore * @fence: the fence to unlock - * @flags the CPU flags to restore + * @flags: the CPU flags to restore * - * Unlock the fence, allowing it to change it's state to signaled again. + * Unlock the fence, allowing it to change its state to signaled again. */ #define dma_fence_unlock_irqrestore(fence, flags) \ spin_unlock_irqrestore(dma_fence_spinlock(fence), flags) -- cgit v1.2.3 From 52f657e34d7b21b47434d9d8b26fa7f6778b63a0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 8 Apr 2026 13:18:57 -0700 Subject: x86: shadow stacks: proper error handling for mmap lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 김영민 reports that shstk_pop_sigframe() doesn't check for errors from mmap_read_lock_killable(), which is a silly oversight, and also shows that we haven't marked those functions with "__must_check", which would have immediately caught it. So let's fix both issues. Reported-by: 김영민 Acked-by: Oleg Nesterov Acked-by: Dave Hansen Acked-by: Rick Edgecombe Signed-off-by: Linus Torvalds --- include/linux/mmap_lock.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 93eca48bc443..04b8f61ece5d 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -546,7 +546,7 @@ static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) __mmap_lock_trace_acquire_returned(mm, true, true); } -static inline int mmap_write_lock_killable(struct mm_struct *mm) +static inline int __must_check mmap_write_lock_killable(struct mm_struct *mm) { int ret; @@ -593,7 +593,7 @@ static inline void mmap_read_lock(struct mm_struct *mm) __mmap_lock_trace_acquire_returned(mm, false, true); } -static inline int mmap_read_lock_killable(struct mm_struct *mm) +static inline int __must_check mmap_read_lock_killable(struct mm_struct *mm) { int ret; @@ -603,7 +603,7 @@ static inline int mmap_read_lock_killable(struct mm_struct *mm) return ret; } -static inline bool mmap_read_trylock(struct mm_struct *mm) +static inline bool __must_check mmap_read_trylock(struct mm_struct *mm) { bool ret; -- cgit v1.2.3 From 732af3aa6337fd56025c0548a9e54d6231052144 Mon Sep 17 00:00:00 2001 From: Viacheslav Dubeyko Date: Fri, 3 Apr 2026 16:05:55 -0700 Subject: hfsplus: rework logic of map nodes creation in xattr b-tree In hfsplus_init_header_node() when node_count > 63488 (header bitmap capacity), the code calculates map_nodes, subtracts them from free_nodes, and marks their positions used in the bitmap. However, it doesn't write the actual map node structure (type, record offsets, bitmap) for those physical positions, only node 0 is written. This patch reworks hfsplus_create_attributes_file() logic by introducing a specialized method of hfsplus_init_map_node() and writing the allocated map b-tree's nodes by means of hfsplus_write_attributes_file_node() method. cc: John Paul Adrian Glaubitz cc: Yangtao Li cc: linux-fsdevel@vger.kernel.org Signed-off-by: Viacheslav Dubeyko Link: https://lore.kernel.org/r/20260403230556.614171-5-slava@dubeyko.com Signed-off-by: Viacheslav Dubeyko --- include/linux/hfs_common.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hfs_common.h b/include/linux/hfs_common.h index 9e71b9a03b60..07dfc39630ab 100644 --- a/include/linux/hfs_common.h +++ b/include/linux/hfs_common.h @@ -518,6 +518,8 @@ struct hfs_btree_header_rec { #define HFSPLUS_BTREE_HDR_MAP_REC_INDEX 2 /* Map (bitmap) record in Header node */ #define HFSPLUS_BTREE_MAP_NODE_REC_INDEX 0 /* Map record in Map Node */ #define HFSPLUS_BTREE_HDR_USER_BYTES 128 +#define HFSPLUS_BTREE_MAP_NODE_RECS_COUNT 2 +#define HFSPLUS_BTREE_MAP_NODE_RESERVED_BYTES 2 /* btree key type */ #define HFSPLUS_KEY_CASEFOLDING 0xCF /* case-insensitive */ -- cgit v1.2.3 From fbb98834a9221de850a3b1afd78a25473685f9b5 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 8 Apr 2026 04:13:53 +0200 Subject: bpf: Extract bpf_get_linfo_file_line Extract bpf_get_linfo_file_line as its own function so that the logic to obtain the file, line, and line number for a given program can be shared in subsequent patches. Reviewed-by: Puranjay Mohan Acked-by: Mykyta Yatsenko Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260408021359.3786905-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 30d35d5fe40b..d8fb9d61f5ce 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3945,6 +3945,8 @@ static inline bool bpf_is_subprog(const struct bpf_prog *prog) return prog->aux->func_idx != 0; } +void bpf_get_linfo_file_line(struct btf *btf, const struct bpf_line_info *linfo, + const char **filep, const char **linep, int *nump); int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep, const char **linep, int *nump); struct bpf_prog *bpf_prog_find_from_stack(void); -- cgit v1.2.3 From 4f64d5b66418b7f5967b7f7614d6107bb1fba705 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 8 Apr 2026 04:13:54 +0200 Subject: bpf: Make find_linfo widely available Move find_linfo() as bpf_find_linfo() into core.c to allow for its use in the verifier in subsequent patches. Signed-off-by: Kumar Kartikeya Dwivedi Acked-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260408021359.3786905-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d8fb9d61f5ce..0136a108d083 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3945,6 +3945,7 @@ static inline bool bpf_is_subprog(const struct bpf_prog *prog) return prog->aux->func_idx != 0; } +const struct bpf_line_info *bpf_find_linfo(const struct bpf_prog *prog, u32 insn_off); void bpf_get_linfo_file_line(struct btf *btf, const struct bpf_line_info *linfo, const char **filep, const char **linep, int *nump); int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep, -- cgit v1.2.3 From b773b9935239e9bec86b96ce91b6ba2252c20b44 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 7 Apr 2026 00:21:55 +0300 Subject: net: dsa: remove struct platform_data This is not used anywhere in the kernel. Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20260406212158.721806-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- include/linux/platform_data/dsa.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/dsa.h b/include/linux/platform_data/dsa.h index d4d9bf2060a6..fec1ae5bddb9 100644 --- a/include/linux/platform_data/dsa.h +++ b/include/linux/platform_data/dsa.h @@ -48,21 +48,4 @@ struct dsa_chip_data { s8 rtable[DSA_MAX_SWITCHES]; }; -struct dsa_platform_data { - /* - * Reference to a Linux network interface that connects - * to the root switch chip of the tree. - */ - struct device *netdev; - struct net_device *of_netdev; - - /* - * Info structs describing each of the switch chips - * connected via this network interface. - */ - int nr_chips; - struct dsa_chip_data *chip; -}; - - #endif /* __DSA_PDATA_H */ -- cgit v1.2.3 From dc915f375e545cf72421d66ede983d88e298228f Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 7 Apr 2026 00:21:56 +0300 Subject: net: dsa: clean up struct dsa_chip_data This has accumulated some fields which are no longer parsed by the core or set by any driver. Remove them. Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20260406212158.721806-3-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- include/linux/platform_data/dsa.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/dsa.h b/include/linux/platform_data/dsa.h index fec1ae5bddb9..031f4cf83ae2 100644 --- a/include/linux/platform_data/dsa.h +++ b/include/linux/platform_data/dsa.h @@ -10,12 +10,6 @@ struct net_device; #define DSA_RTABLE_NONE -1 struct dsa_chip_data { - /* - * How to access the switch configuration registers. - */ - struct device *host_dev; - int sw_addr; - /* * Reference to network devices */ @@ -24,12 +18,6 @@ struct dsa_chip_data { /* set to size of eeprom if supported by the switch */ int eeprom_len; - /* Device tree node pointer for this specific switch chip - * used during switch setup in case additional properties - * and resources needs to be used - */ - struct device_node *of_node; - /* * The names of the switch's ports. Use "cpu" to * designate the switch port that the cpu is connected to, @@ -38,14 +26,6 @@ struct dsa_chip_data { * or any other string to indicate this is a physical port. */ char *port_names[DSA_MAX_PORTS]; - struct device_node *port_dn[DSA_MAX_PORTS]; - - /* - * An array of which element [a] indicates which port on this - * switch should be used to send packets to that are destined - * for switch a. Can be NULL if there is only one switch chip. - */ - s8 rtable[DSA_MAX_SWITCHES]; }; #endif /* __DSA_PDATA_H */ -- cgit v1.2.3 From c3b09190e658d3f1c3cd595df3a931962662f8f0 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 7 Apr 2026 00:21:57 +0300 Subject: net: dsa: remove unused platform_data definitions Pretty self-explanatory, nobody needs these. Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20260406212158.721806-4-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- include/linux/platform_data/dsa.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/dsa.h b/include/linux/platform_data/dsa.h index 031f4cf83ae2..77424bb24723 100644 --- a/include/linux/platform_data/dsa.h +++ b/include/linux/platform_data/dsa.h @@ -3,11 +3,8 @@ #define __DSA_PDATA_H struct device; -struct net_device; -#define DSA_MAX_SWITCHES 4 #define DSA_MAX_PORTS 12 -#define DSA_RTABLE_NONE -1 struct dsa_chip_data { /* -- cgit v1.2.3 From da9008674d9658de1e9f45d386ff6627313f39f7 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 7 Apr 2026 00:21:58 +0300 Subject: net: dsa: eliminate There is no reason at all to export these data types to the global include directory. Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20260406212158.721806-5-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- include/linux/dsa/loop.h | 42 ------------------------------------------ 1 file changed, 42 deletions(-) delete mode 100644 include/linux/dsa/loop.h (limited to 'include/linux') diff --git a/include/linux/dsa/loop.h b/include/linux/dsa/loop.h deleted file mode 100644 index b8fef35591aa..000000000000 --- a/include/linux/dsa/loop.h +++ /dev/null @@ -1,42 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef DSA_LOOP_H -#define DSA_LOOP_H - -#include -#include -#include -#include - -struct dsa_loop_vlan { - u16 members; - u16 untagged; -}; - -struct dsa_loop_mib_entry { - char name[ETH_GSTRING_LEN]; - unsigned long val; -}; - -enum dsa_loop_mib_counters { - DSA_LOOP_PHY_READ_OK, - DSA_LOOP_PHY_READ_ERR, - DSA_LOOP_PHY_WRITE_OK, - DSA_LOOP_PHY_WRITE_ERR, - __DSA_LOOP_CNT_MAX, -}; - -struct dsa_loop_port { - struct dsa_loop_mib_entry mib[__DSA_LOOP_CNT_MAX]; - u16 pvid; - int mtu; -}; - -struct dsa_loop_priv { - struct mii_bus *bus; - unsigned int port_base; - struct dsa_loop_vlan vlans[VLAN_N_VID]; - struct net_device *netdev; - struct dsa_loop_port ports[DSA_MAX_PORTS]; -}; - -#endif /* DSA_LOOP_H */ -- cgit v1.2.3 From f9e3bd43d55f24331e5ea65f667dbb33716e7d6b Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Fri, 3 Apr 2026 12:00:27 +0300 Subject: net/mlx5: Rename MLX5_PF page counter type to MLX5_SELF The MLX5_PF enum value in mlx5_func_type is used to track firmware page allocations for the page manager function itself, which is either the ECPF on SmartNIC systems or the host PF when there is no ECPF. Rename it to MLX5_SELF to accurately reflect that this counter tracks pages allocated by the manager for its own use, regardless of whether it is a PF or ECPF. Signed-off-by: Moshe Shemesh Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260403090028.137783-2-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index b8b5af78284d..10bc913591d5 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -550,7 +550,7 @@ struct mlx5_debugfs_entries { }; enum mlx5_func_type { - MLX5_PF, + MLX5_SELF, MLX5_VF, MLX5_SF, MLX5_HOST_PF, -- cgit v1.2.3 From a1bac8b70ede332a05487081c7512d2947f3a912 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Fri, 3 Apr 2026 12:00:28 +0300 Subject: net/mlx5: Add icm_mng_function_id_mode cap bit Introduce the capability bit icm_mng_function_id_mode to indicate that the device firmware uses vhca_id instead of function_id as the effective identifier for the firmware commands MANAGE_PAGES, QUERY_PAGES, and page request event. Signed-off-by: Moshe Shemesh Reviewed-by: Akiva Goldberger Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260403090028.137783-3-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 2400b4c38c77..007f5138db2b 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1654,6 +1654,11 @@ enum { MLX5_STEERING_FORMAT_CONNECTX_8 = 3, }; +enum { + MLX5_ID_MODE_FUNCTION_INDEX = 0, + MLX5_ID_MODE_FUNCTION_VHCA_ID = 1, +}; + struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_0[0x6]; u8 page_request_disable[0x1]; @@ -1916,7 +1921,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_280[0x10]; u8 max_wqe_sz_sq[0x10]; - u8 reserved_at_2a0[0x7]; + u8 icm_mng_function_id_mode[0x1]; + u8 reserved_at_2a1[0x6]; u8 mkey_pcie_tph[0x1]; u8 reserved_at_2a8[0x1]; u8 tis_tir_td_order[0x1]; -- cgit v1.2.3 From e3b2cf6e5dba416a03152f299d99982dfe1e861d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 1 Apr 2026 12:15:58 +0200 Subject: kernfs: pass struct ns_common instead of const void * for namespace tags kernfs has historically used const void * to pass around namespace tags used for directory-level namespace filtering. The only current user of this is sysfs network namespace tagging where struct net pointers are cast to void *. Replace all const void * namespace parameters with const struct ns_common * throughout the kernfs, sysfs, and kobject namespace layers. This includes the kobj_ns_type_operations callbacks, kobject_namespace(), and all sysfs/kernfs APIs that accept or return namespace tags. Passing struct ns_common is needed because various codepaths require access to the underlying namespace. A struct ns_common can always be converted back to the concrete namespace type (e.g., struct net) via container_of() or to_ns_common() in the reverse direction. This is a preparatory change for switching to ns_id-based directory iteration to prevent a KASLR pointer leak through the current use of raw namespace pointers as hash seeds and comparison keys. Signed-off-by: Christian Brauner --- include/linux/device/class.h | 6 +++--- include/linux/kernfs.h | 40 ++++++++++++++++++++++++---------------- include/linux/kobject.h | 4 ++-- include/linux/kobject_ns.h | 13 +++++++------ include/linux/netdevice.h | 4 ++-- include/linux/sysfs.h | 24 ++++++++++++------------ 6 files changed, 50 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device/class.h b/include/linux/device/class.h index 65880e60c720..021da0d61796 100644 --- a/include/linux/device/class.h +++ b/include/linux/device/class.h @@ -62,7 +62,7 @@ struct class { int (*shutdown_pre)(struct device *dev); const struct kobj_ns_type_operations *ns_type; - const void *(*namespace)(const struct device *dev); + const struct ns_common *(*namespace)(const struct device *dev); void (*get_ownership)(const struct device *dev, kuid_t *uid, kgid_t *gid); @@ -180,9 +180,9 @@ struct class_attribute { struct class_attribute class_attr_##_name = __ATTR_WO(_name) int __must_check class_create_file_ns(const struct class *class, const struct class_attribute *attr, - const void *ns); + const struct ns_common *ns); void class_remove_file_ns(const struct class *class, const struct class_attribute *attr, - const void *ns); + const struct ns_common *ns); static inline int __must_check class_create_file(const struct class *class, const struct class_attribute *attr) diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index b5a5f32fdfd1..4f0ab88a1b31 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -23,6 +23,7 @@ struct file; struct dentry; struct iattr; +struct ns_common; struct seq_file; struct vm_area_struct; struct vm_operations_struct; @@ -209,7 +210,7 @@ struct kernfs_node { struct rb_node rb; - const void *ns; /* namespace tag */ + const struct ns_common *ns; /* namespace tag */ unsigned int hash; /* ns + name hash */ unsigned short flags; umode_t mode; @@ -331,7 +332,7 @@ struct kernfs_ops { */ struct kernfs_fs_context { struct kernfs_root *root; /* Root of the hierarchy being mounted */ - void *ns_tag; /* Namespace tag of the mount (or NULL) */ + struct ns_common *ns_tag; /* Namespace tag of the mount (or NULL) */ unsigned long magic; /* File system specific magic number */ /* The following are set/used by kernfs_mount() */ @@ -406,9 +407,11 @@ void pr_cont_kernfs_name(struct kernfs_node *kn); void pr_cont_kernfs_path(struct kernfs_node *kn); struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn); struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, - const char *name, const void *ns); + const char *name, + const struct ns_common *ns); struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, - const char *path, const void *ns); + const char *path, + const struct ns_common *ns); void kernfs_get(struct kernfs_node *kn); void kernfs_put(struct kernfs_node *kn); @@ -426,7 +429,8 @@ unsigned int kernfs_root_flags(struct kernfs_node *kn); struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, - void *priv, const void *ns); + void *priv, + const struct ns_common *ns); struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, const char *name); struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, @@ -434,7 +438,8 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, kuid_t uid, kgid_t gid, loff_t size, const struct kernfs_ops *ops, - void *priv, const void *ns, + void *priv, + const struct ns_common *ns, struct lock_class_key *key); struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, const char *name, @@ -446,9 +451,9 @@ void kernfs_break_active_protection(struct kernfs_node *kn); void kernfs_unbreak_active_protection(struct kernfs_node *kn); bool kernfs_remove_self(struct kernfs_node *kn); int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, - const void *ns); + const struct ns_common *ns); int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, - const char *new_name, const void *new_ns); + const char *new_name, const struct ns_common *new_ns); int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); __poll_t kernfs_generic_poll(struct kernfs_open_file *of, struct poll_table_struct *pt); @@ -459,7 +464,7 @@ int kernfs_xattr_get(struct kernfs_node *kn, const char *name, int kernfs_xattr_set(struct kernfs_node *kn, const char *name, const void *value, size_t size, int flags); -const void *kernfs_super_ns(struct super_block *sb); +const struct ns_common *kernfs_super_ns(struct super_block *sb); int kernfs_get_tree(struct fs_context *fc); void kernfs_free_fs_context(struct fs_context *fc); void kernfs_kill_sb(struct super_block *sb); @@ -494,11 +499,11 @@ static inline struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) static inline struct kernfs_node * kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, - const void *ns) + const struct ns_common *ns) { return NULL; } static inline struct kernfs_node * kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, - const void *ns) + const struct ns_common *ns) { return NULL; } static inline void kernfs_get(struct kernfs_node *kn) { } @@ -526,14 +531,15 @@ static inline unsigned int kernfs_root_flags(struct kernfs_node *kn) static inline struct kernfs_node * kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, - void *priv, const void *ns) + void *priv, const struct ns_common *ns) { return ERR_PTR(-ENOSYS); } static inline struct kernfs_node * __kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, loff_t size, const struct kernfs_ops *ops, - void *priv, const void *ns, struct lock_class_key *key) + void *priv, const struct ns_common *ns, + struct lock_class_key *key) { return ERR_PTR(-ENOSYS); } static inline struct kernfs_node * @@ -549,12 +555,14 @@ static inline bool kernfs_remove_self(struct kernfs_node *kn) { return false; } static inline int kernfs_remove_by_name_ns(struct kernfs_node *kn, - const char *name, const void *ns) + const char *name, + const struct ns_common *ns) { return -ENOSYS; } static inline int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, - const char *new_name, const void *new_ns) + const char *new_name, + const struct ns_common *new_ns) { return -ENOSYS; } static inline int kernfs_setattr(struct kernfs_node *kn, @@ -575,7 +583,7 @@ static inline int kernfs_xattr_set(struct kernfs_node *kn, const char *name, const void *value, size_t size, int flags) { return -ENOSYS; } -static inline const void *kernfs_super_ns(struct super_block *sb) +static inline const struct ns_common *kernfs_super_ns(struct super_block *sb) { return NULL; } static inline int kernfs_get_tree(struct fs_context *fc) diff --git a/include/linux/kobject.h b/include/linux/kobject.h index c8219505a79f..bcb5d4e32001 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -109,7 +109,7 @@ struct kobject *kobject_get(struct kobject *kobj); struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj); void kobject_put(struct kobject *kobj); -const void *kobject_namespace(const struct kobject *kobj); +const struct ns_common *kobject_namespace(const struct kobject *kobj); void kobject_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid); char *kobject_get_path(const struct kobject *kobj, gfp_t flag); @@ -118,7 +118,7 @@ struct kobj_type { const struct sysfs_ops *sysfs_ops; const struct attribute_group **default_groups; const struct kobj_ns_type_operations *(*child_ns_type)(const struct kobject *kobj); - const void *(*namespace)(const struct kobject *kobj); + const struct ns_common *(*namespace)(const struct kobject *kobj); void (*get_ownership)(const struct kobject *kobj, kuid_t *uid, kgid_t *gid); }; diff --git a/include/linux/kobject_ns.h b/include/linux/kobject_ns.h index 150fe2ae1b6b..4f0990e09b93 100644 --- a/include/linux/kobject_ns.h +++ b/include/linux/kobject_ns.h @@ -16,6 +16,7 @@ #ifndef _LINUX_KOBJECT_NS_H #define _LINUX_KOBJECT_NS_H +struct ns_common; struct sock; struct kobject; @@ -39,10 +40,10 @@ enum kobj_ns_type { struct kobj_ns_type_operations { enum kobj_ns_type type; bool (*current_may_mount)(void); - void *(*grab_current_ns)(void); - const void *(*netlink_ns)(struct sock *sk); - const void *(*initial_ns)(void); - void (*drop_ns)(void *); + struct ns_common *(*grab_current_ns)(void); + const struct ns_common *(*netlink_ns)(struct sock *sk); + const struct ns_common *(*initial_ns)(void); + void (*drop_ns)(struct ns_common *); }; int kobj_ns_type_register(const struct kobj_ns_type_operations *ops); @@ -51,7 +52,7 @@ const struct kobj_ns_type_operations *kobj_child_ns_ops(const struct kobject *pa const struct kobj_ns_type_operations *kobj_ns_ops(const struct kobject *kobj); bool kobj_ns_current_may_mount(enum kobj_ns_type type); -void *kobj_ns_grab_current(enum kobj_ns_type type); -void kobj_ns_drop(enum kobj_ns_type type, void *ns); +struct ns_common *kobj_ns_grab_current(enum kobj_ns_type type); +void kobj_ns_drop(enum kobj_ns_type type, struct ns_common *ns); #endif /* _LINUX_KOBJECT_NS_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7ca01eb3f7d2..85c20bdd36fb 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -5339,9 +5339,9 @@ static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_devi } int netdev_class_create_file_ns(const struct class_attribute *class_attr, - const void *ns); + const struct ns_common *ns); void netdev_class_remove_file_ns(const struct class_attribute *class_attr, - const void *ns); + const struct ns_common *ns); extern const struct kobj_ns_type_operations net_ns_type_operations; diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 99b775f3ff46..468259fb6049 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -396,13 +396,13 @@ struct sysfs_ops { #ifdef CONFIG_SYSFS -int __must_check sysfs_create_dir_ns(struct kobject *kobj, const void *ns); +int __must_check sysfs_create_dir_ns(struct kobject *kobj, const struct ns_common *ns); void sysfs_remove_dir(struct kobject *kobj); int __must_check sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, - const void *new_ns); + const struct ns_common *new_ns); int __must_check sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, - const void *new_ns); + const struct ns_common *new_ns); int __must_check sysfs_create_mount_point(struct kobject *parent_kobj, const char *name); void sysfs_remove_mount_point(struct kobject *parent_kobj, @@ -410,7 +410,7 @@ void sysfs_remove_mount_point(struct kobject *parent_kobj, int __must_check sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, - const void *ns); + const struct ns_common *ns); int __must_check sysfs_create_files(struct kobject *kobj, const struct attribute * const *attr); int __must_check sysfs_chmod_file(struct kobject *kobj, @@ -419,7 +419,7 @@ struct kernfs_node *sysfs_break_active_protection(struct kobject *kobj, const struct attribute *attr); void sysfs_unbreak_active_protection(struct kernfs_node *kn); void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, - const void *ns); + const struct ns_common *ns); bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr); void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *attr); @@ -437,7 +437,7 @@ void sysfs_remove_link(struct kobject *kobj, const char *name); int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *target, const char *old_name, const char *new_name, - const void *new_ns); + const struct ns_common *new_ns); void sysfs_delete_link(struct kobject *dir, struct kobject *targ, const char *name); @@ -502,7 +502,7 @@ ssize_t sysfs_bin_attr_simple_read(struct file *file, struct kobject *kobj, #else /* CONFIG_SYSFS */ -static inline int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) +static inline int sysfs_create_dir_ns(struct kobject *kobj, const struct ns_common *ns) { return 0; } @@ -512,14 +512,14 @@ static inline void sysfs_remove_dir(struct kobject *kobj) } static inline int sysfs_rename_dir_ns(struct kobject *kobj, - const char *new_name, const void *new_ns) + const char *new_name, const struct ns_common *new_ns) { return 0; } static inline int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, - const void *new_ns) + const struct ns_common *new_ns) { return 0; } @@ -537,7 +537,7 @@ static inline void sysfs_remove_mount_point(struct kobject *parent_kobj, static inline int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, - const void *ns) + const struct ns_common *ns) { return 0; } @@ -567,7 +567,7 @@ static inline void sysfs_unbreak_active_protection(struct kernfs_node *kn) static inline void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, - const void *ns) + const struct ns_common *ns) { } @@ -612,7 +612,7 @@ static inline void sysfs_remove_link(struct kobject *kobj, const char *name) static inline int sysfs_rename_link_ns(struct kobject *k, struct kobject *t, const char *old_name, - const char *new_name, const void *ns) + const char *new_name, const struct ns_common *ns) { return 0; } -- cgit v1.2.3 From 5267f6ef49cb5fba426f2d286817b1355fde31da Mon Sep 17 00:00:00 2001 From: Li Chen Date: Fri, 6 Mar 2026 16:56:39 +0800 Subject: jbd2: add jinode dirty range accessors Provide a helper to fetch jinode dirty ranges in bytes. This lets filesystem callbacks avoid depending on the internal representation, preparing for a later conversion to page units. Suggested-by: Andreas Dilger Reviewed-by: Jan Kara Signed-off-by: Li Chen Link: https://patch.msgid.link/20260306085643.465275-2-me@linux.beauty Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index a53a00d36228..64392baf5f4b 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -445,6 +445,20 @@ struct jbd2_inode { loff_t i_dirty_end; }; +static inline bool jbd2_jinode_get_dirty_range(const struct jbd2_inode *jinode, + loff_t *start, loff_t *end) +{ + loff_t start_byte = jinode->i_dirty_start; + loff_t end_byte = jinode->i_dirty_end; + + if (!end_byte) + return false; + + *start = start_byte; + *end = end_byte; + return true; +} + struct jbd2_revoke_table_s; /** -- cgit v1.2.3 From 4edafa81a1d6020272d0c6eb68faeb810dd083c1 Mon Sep 17 00:00:00 2001 From: Li Chen Date: Fri, 6 Mar 2026 16:56:42 +0800 Subject: jbd2: store jinode dirty range in PAGE_SIZE units jbd2_inode fields are updated under journal->j_list_lock, but some paths read them without holding the lock (e.g. fast commit helpers and ordered truncate helpers). READ_ONCE() alone is not sufficient for the dirty range fields when they are stored as loff_t because 32-bit platforms can observe torn loads. Store the dirty range in PAGE_SIZE units as pgoff_t instead. Represent the dirty range end as an exclusive end page. This avoids a special sentinel value and keeps MAX_LFS_FILESIZE on 32-bit representable. Publish a new dirty range by updating end_page before start_page, and treat start_page >= end_page as empty in the accessor for robustness. Use READ_ONCE() on the read side and WRITE_ONCE() on the write side for the dirty range and i_flags to match the existing lockless access pattern. Suggested-by: Jan Kara Reviewed-by: Jan Kara Signed-off-by: Li Chen Link: https://patch.msgid.link/20260306085643.465275-5-me@linux.beauty Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 64392baf5f4b..7e785aa6d35d 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -429,33 +429,43 @@ struct jbd2_inode { unsigned long i_flags; /** - * @i_dirty_start: + * @i_dirty_start_page: + * + * Dirty range start in PAGE_SIZE units. + * + * The dirty range is empty if @i_dirty_start_page is greater than or + * equal to @i_dirty_end_page. * - * Offset in bytes where the dirty range for this inode starts. * [j_list_lock] */ - loff_t i_dirty_start; + pgoff_t i_dirty_start_page; /** - * @i_dirty_end: + * @i_dirty_end_page: + * + * Dirty range end in PAGE_SIZE units (exclusive). * - * Inclusive offset in bytes where the dirty range for this inode - * ends. [j_list_lock] + * [j_list_lock] */ - loff_t i_dirty_end; + pgoff_t i_dirty_end_page; }; +/* + * Lockless readers treat start_page >= end_page as an empty range. + * Writers publish a new non-empty range by storing i_dirty_end_page before + * i_dirty_start_page. + */ static inline bool jbd2_jinode_get_dirty_range(const struct jbd2_inode *jinode, loff_t *start, loff_t *end) { - loff_t start_byte = jinode->i_dirty_start; - loff_t end_byte = jinode->i_dirty_end; + pgoff_t start_page = READ_ONCE(jinode->i_dirty_start_page); + pgoff_t end_page = READ_ONCE(jinode->i_dirty_end_page); - if (!end_byte) + if (start_page >= end_page) return false; - *start = start_byte; - *end = end_byte; + *start = (loff_t)start_page << PAGE_SHIFT; + *end = ((loff_t)end_page << PAGE_SHIFT) - 1; return true; } -- cgit v1.2.3 From a142d0ae9f2ceb0fc7417e19ecfafc8179282e35 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 25 Feb 2026 13:39:48 +0100 Subject: memblock: Permit existing reserved regions to be marked RSRV_KERN Permit existing memblock reservations to be marked as RSRV_KERN. This will be used by the EFI code on x86 to distinguish between reservations of boot services data regions that have actual significance to the kernel and regions that are reserved temporarily to work around buggy firmware. Acked-by: Mike Rapoport (Microsoft) Signed-off-by: Ard Biesheuvel --- include/linux/memblock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 6ec5e9ac0699..9eac4f268359 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -155,6 +155,7 @@ int memblock_mark_mirror(phys_addr_t base, phys_addr_t size); int memblock_mark_nomap(phys_addr_t base, phys_addr_t size); int memblock_clear_nomap(phys_addr_t base, phys_addr_t size); int memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t size); +int memblock_reserved_mark_kern(phys_addr_t base, phys_addr_t size); int memblock_mark_kho_scratch(phys_addr_t base, phys_addr_t size); int memblock_clear_kho_scratch(phys_addr_t base, phys_addr_t size); -- cgit v1.2.3 From 592a22338e5acfcd10983699cae8ea02ecd42935 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 7 Apr 2026 17:14:31 +0200 Subject: bitops: Update kernel-doc for sign_extendXX() The sign_extendXX() lack of Return section and have other style issues. Address that by updating kernel-doc accordingly. Signed-off-by: Andy Shevchenko Signed-off-by: Yury Norov --- include/linux/bitops.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 1fe46703792f..657eab2725ce 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -179,9 +179,11 @@ static inline __u8 ror8(__u8 word, unsigned int shift) /** * sign_extend32 - sign extend a 32-bit value using specified bit as sign-bit * @value: value to sign extend - * @index: 0 based bit index (0<=index<32) to sign bit + * @index: 0 based bit index (0 <= index < 32) to sign bit * * This is safe to use for 16- and 8-bit types as well. + * + * Return: 32-bit sign extended value */ static __always_inline __s32 sign_extend32(__u32 value, int index) { @@ -192,7 +194,11 @@ static __always_inline __s32 sign_extend32(__u32 value, int index) /** * sign_extend64 - sign extend a 64-bit value using specified bit as sign-bit * @value: value to sign extend - * @index: 0 based bit index (0<=index<64) to sign bit + * @index: 0 based bit index (0 <= index < 64) to sign bit + * + * This is safe to use for 32-, 16- and 8-bit types as well. + * + * Return: 64-bit sign extended value */ static __always_inline __s64 sign_extend64(__u64 value, int index) { -- cgit v1.2.3 From d04686d9bc86432ea3008d5f358373d8466d1943 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 3 Apr 2026 01:10:19 +0200 Subject: net: Implement netdev_nl_queue_create_doit Implement netdev_nl_queue_create_doit which creates a new rx queue in a virtual netdev and then leases it to a rx queue in a physical netdev. Example with ynl client: # ynl --family netdev --output-json --do queue-create \ --json '{"ifindex": 8, "type": "rx", "lease": {"ifindex": 4, "queue": {"type": "rx", "id": 15}}}' {'id': 1} Note that the netdevice locking order is always from the virtual to the physical device. Signed-off-by: Daniel Borkmann Co-developed-by: David Wei Signed-off-by: David Wei Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20260402231031.447597-3-daniel@iogearbox.net Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e15367373f7c..e8aa9cc4075d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2561,7 +2561,14 @@ struct net_device { * Also protects some fields in: * struct napi_struct, struct netdev_queue, struct netdev_rx_queue * - * Ordering: take after rtnl_lock. + * Ordering: + * + * - take after rtnl_lock + * + * - for the case of netdev queue leasing, the netdev-scope lock is + * taken for both the virtual and the physical device; to prevent + * deadlocks, the virtual device's lock must always be acquired + * before the physical device's (see netdev_nl_queue_create_doit) */ struct mutex lock; -- cgit v1.2.3 From 25444470570b44da61366e307b3e54be653bf595 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 3 Apr 2026 01:10:29 +0200 Subject: netkit: Add netkit notifier to check for unregistering devices Add a netdevice notifier in netkit to watch for NETDEV_UNREGISTER events. If the target device is indeed NETREG_UNREGISTERING and previously leased a queue to a netkit device, then collect the related netkit devices and batch-unregister_netdevice_many() them. If this were not done, then the netkit device would hold a reference on the physical device preventing it from going away. However, in case of both io_uring zero-copy as well as AF_XDP this situation is handled gracefully and the allocated resources are torn down. In the case where mentioned infra is used through netkit, the applications have a reference on netkit, and netkit in turn holds a reference on the physical device. In order to have netkit release the reference on the physical device, we need such watcher to then unregister the netkit ones. This is generally quite similar to the dependency handling in case of tunnels (e.g. vxlan bound to a underlying netdev) where the tunnel device gets removed along with the physical device. # ip a [...] 4: enp10s0f0np0: mtu 1500 qdisc mq state DOWN group default qlen 1000 link/ether e8:eb:d3:a3:43:f6 brd ff:ff:ff:ff:ff:ff inet 10.0.0.2/24 scope global enp10s0f0np0 valid_lft forever preferred_lft forever [...] 8: nk@NONE: mtu 1500 qdisc noop state DOWN group default qlen 1000 link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff [...] # rmmod mlx5_ib # rmmod mlx5_core [...] [ 309.261822] mlx5_core 0000:0a:00.0 mlx5_0: Port: 1 Link DOWN [ 344.235236] mlx5_core 0000:0a:00.1: E-Switch: Unload vfs: mode(LEGACY), nvfs(0), necvfs(0), active vports(0) [ 344.246948] mlx5_core 0000:0a:00.1: E-Switch: Disable: mode(LEGACY), nvfs(0), necvfs(0), active vports(0) [ 344.463754] mlx5_core 0000:0a:00.1: E-Switch: Disable: mode(LEGACY), nvfs(0), necvfs(0), active vports(0) [ 344.770155] mlx5_core 0000:0a:00.1: E-Switch: cleanup [...] # ip a [...] [ both enp10s0f0np0 and nk gone ] [...] Signed-off-by: Daniel Borkmann Co-developed-by: David Wei Signed-off-by: David Wei Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20260402231031.447597-13-daniel@iogearbox.net Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e8aa9cc4075d..47417b2d48a4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3420,6 +3420,8 @@ static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) int register_netdevice(struct net_device *dev); void unregister_netdevice_queue(struct net_device *dev, struct list_head *head); void unregister_netdevice_many(struct list_head *head); +bool unregister_netdevice_queued(const struct net_device *dev); + static inline void unregister_netdevice(struct net_device *dev) { unregister_netdevice_queue(dev, NULL); -- cgit v1.2.3 From 8ad7f3b265a87cd4e5052677545f90f14c855b10 Mon Sep 17 00:00:00 2001 From: Pei Xiao Date: Fri, 10 Apr 2026 10:29:24 +0800 Subject: regmap: i3c: Add non-devm regmap_init_i3c() helper Add __regmap_init_i3c() and the corresponding regmap_init_i3c() macro to allow creating a regmap for I3C devices without using the device-managed version. This mirrors the pattern already established for other buses such as I2C, SPI and so on, giving drivers more flexibility when the regmap lifetime is not directly tied to the device. Signed-off-by: Pei Xiao Link: https://patch.msgid.link/a81256a8866b163979a20406abf01df7d7440104.1775788105.git.xiaopei01@kylinos.cn Signed-off-by: Mark Brown --- include/linux/regmap.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index f1c5cb63c171..df44cb30f53b 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -693,6 +693,10 @@ struct regmap *__regmap_init_sdw_mbq(struct device *dev, struct sdw_slave *sdw, const struct regmap_sdw_mbq_cfg *mbq_config, struct lock_class_key *lock_key, const char *lock_name); +struct regmap *__regmap_init_i3c(struct i3c_device *i3c, + const struct regmap_config *config, + struct lock_class_key *lock_key, + const char *lock_name); struct regmap *__regmap_init_spi_avmm(struct spi_device *spi, const struct regmap_config *config, struct lock_class_key *lock_key, @@ -999,6 +1003,19 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg); __regmap_lockdep_wrapper(__regmap_init_sdw_mbq, #config, \ dev, sdw, config, mbq_config) +/** + * regmap_init_i3c() - Initialise register map + * + * @i3c: Device that will be interacted with + * @config: Configuration for register map + * + * The return value will be an ERR_PTR() on error or a valid pointer to + * a struct regmap. + */ +#define regmap_init_i3c(i3c, config) \ + __regmap_lockdep_wrapper(__regmap_init_i3c, #config, \ + i3c, config) + /** * regmap_init_spi_avmm() - Initialize register map for Intel SPI Slave * to AVMM Bus Bridge -- cgit v1.2.3 From 39237e3208209d1bb35d939d6fee1f36b642f562 Mon Sep 17 00:00:00 2001 From: Marco Nenciarini Date: Wed, 1 Apr 2026 22:36:36 +0200 Subject: platform/x86: int3472: Rename pled to led in LED registration code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename the privacy LED type, struct member, and functions from "pled" to "led" in preparation for supporting additional LED types beyond just the privacy LED. No functional change. Reviewed-by: Andy Shevchenko Reviewed-by: Hans de Goede Signed-off-by: Marco Nenciarini Link: https://patch.msgid.link/20260401203638.1601661-3-mnencia@kcore.it Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/x86/int3472.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/int3472.h b/include/linux/platform_data/x86/int3472.h index dbe745dc88d5..39a1938d77e1 100644 --- a/include/linux/platform_data/x86/int3472.h +++ b/include/linux/platform_data/x86/int3472.h @@ -122,12 +122,12 @@ struct int3472_discrete_device { u8 imgclk_index; } clock; - struct int3472_pled { + struct int3472_led { struct led_classdev classdev; struct led_lookup_data lookup; char name[INT3472_LED_MAX_NAME_LEN]; struct gpio_desc *gpio; - } pled; + } led; struct int3472_discrete_quirks quirks; @@ -161,7 +161,7 @@ int skl_int3472_register_regulator(struct int3472_discrete_device *int3472, const char *second_sensor); void skl_int3472_unregister_regulator(struct int3472_discrete_device *int3472); -int skl_int3472_register_pled(struct int3472_discrete_device *int3472, struct gpio_desc *gpio); -void skl_int3472_unregister_pled(struct int3472_discrete_device *int3472); +int skl_int3472_register_led(struct int3472_discrete_device *int3472, struct gpio_desc *gpio); +void skl_int3472_unregister_led(struct int3472_discrete_device *int3472); #endif -- cgit v1.2.3 From 218d3c44f5f0a3cc1647bc61a4e4eac663b37aa5 Mon Sep 17 00:00:00 2001 From: Marco Nenciarini Date: Wed, 1 Apr 2026 22:36:37 +0200 Subject: platform/x86: int3472: Parameterize LED con_id in registration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a con_id parameter to skl_int3472_register_led() to allow callers to specify both the LED name suffix and lookup con_id instead of hardcoding "privacy". This prepares for registering additional LED types with different names. While at it, rename the privacy LED's GPIO con_id from "privacy-led" to "privacy" in int3472_get_con_id_and_polarity() and pass it directly to skl_int3472_register_led(), reducing churn when adding new LED types. No functional change. Reviewed-by: Andy Shevchenko Reviewed-by: Hans de Goede Signed-off-by: Marco Nenciarini Link: https://patch.msgid.link/20260401203638.1601661-4-mnencia@kcore.it Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/x86/int3472.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/int3472.h b/include/linux/platform_data/x86/int3472.h index 39a1938d77e1..ebf4d0637624 100644 --- a/include/linux/platform_data/x86/int3472.h +++ b/include/linux/platform_data/x86/int3472.h @@ -161,7 +161,8 @@ int skl_int3472_register_regulator(struct int3472_discrete_device *int3472, const char *second_sensor); void skl_int3472_unregister_regulator(struct int3472_discrete_device *int3472); -int skl_int3472_register_led(struct int3472_discrete_device *int3472, struct gpio_desc *gpio); +int skl_int3472_register_led(struct int3472_discrete_device *int3472, struct gpio_desc *gpio, + const char *con_id); void skl_int3472_unregister_led(struct int3472_discrete_device *int3472); #endif -- cgit v1.2.3 From cde32a92d4562b686f730fc08d4d558ecc99d516 Mon Sep 17 00:00:00 2001 From: Sean Wang Date: Tue, 24 Feb 2026 00:13:18 -0600 Subject: mmc: sdio: add MediaTek MT7902 SDIO device ID Add SDIO device ID (0x790a) for MediaTek MT7902 to sdio_ids.h. Acked-by: Ulf Hansson Signed-off-by: Sean Wang Signed-off-by: Luiz Augusto von Dentz --- include/linux/mmc/sdio_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index 673cbdf43453..dce89c110691 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -111,6 +111,7 @@ #define SDIO_VENDOR_ID_MEDIATEK 0x037a #define SDIO_DEVICE_ID_MEDIATEK_MT7663 0x7663 #define SDIO_DEVICE_ID_MEDIATEK_MT7668 0x7668 +#define SDIO_DEVICE_ID_MEDIATEK_MT7902 0x790a #define SDIO_DEVICE_ID_MEDIATEK_MT7961 0x7961 #define SDIO_VENDOR_ID_MICROCHIP_WILC 0x0296 -- cgit v1.2.3 From 30a59dddd688bbd75f54e96b174a7aac914774d2 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 7 Apr 2026 16:58:09 -0300 Subject: vfs: introduce d_mark_tmpfile_name() CIFS requires O_TMPFILE dentries to have names of newly created delete-on-close files in the server so it can build full pathnames from the root of the share when performing operations on them. Suggested-by: Al Viro Signed-off-by: Paulo Alcantara (Red Hat) Cc: Christian Brauner Cc: Jan Kara Cc: David Howells Cc: Matthew Wilcox Cc: linux-fsdevel@vger.kernel.org Cc: linux-cifs@vger.kernel.org Signed-off-by: Steve French --- include/linux/dcache.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 898c60d21c92..f60819dcfebd 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -264,6 +264,7 @@ extern void d_invalidate(struct dentry *); extern struct dentry * d_make_root(struct inode *); extern void d_mark_tmpfile(struct file *, struct inode *); +void d_mark_tmpfile_name(struct file *file, const struct qstr *name); extern void d_tmpfile(struct file *, struct inode *); extern struct dentry *d_find_alias(struct inode *); -- cgit v1.2.3 From 7cd9a5d7d4b75802b97aa89f6f53375a6d84d1d5 Mon Sep 17 00:00:00 2001 From: Cheng-Yang Chou Date: Fri, 10 Apr 2026 07:54:06 -1000 Subject: sched_ext: Remove runtime kfunc mask enforcement Now that scx_kfunc_context_filter enforces context-sensitive kfunc restrictions at BPF load time, the per-task runtime enforcement via scx_kf_mask is redundant. Remove it entirely: - Delete enum scx_kf_mask, the kf_mask field on sched_ext_entity, and the scx_kf_allow()/scx_kf_disallow()/scx_kf_allowed() helpers along with the higher_bits()/highest_bit() helpers they used. - Strip the @mask parameter (and the BUILD_BUG_ON checks) from the SCX_CALL_OP[_RET]/SCX_CALL_OP_TASK[_RET]/SCX_CALL_OP_2TASKS_RET macros and update every call site. Reflow call sites that were wrapped only to fit the old 5-arg form and now collapse onto a single line under ~100 cols. - Remove the in-kfunc scx_kf_allowed() runtime checks from scx_dsq_insert_preamble(), scx_dsq_move(), scx_bpf_dispatch_nr_slots(), scx_bpf_dispatch_cancel(), scx_bpf_dsq_move_to_local___v2(), scx_bpf_sub_dispatch(), scx_bpf_reenqueue_local(), and the per-call guard inside select_cpu_from_kfunc(). scx_bpf_task_cgroup() and scx_kf_allowed_on_arg_tasks() were already cleaned up in the "drop redundant rq-locked check" patch. scx_kf_allowed_if_unlocked() was rewritten in the preceding "decouple" patch. No further changes to those helpers here. Co-developed-by: Juntong Deng Signed-off-by: Juntong Deng Signed-off-by: Cheng-Yang Chou Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 28 ---------------------------- 1 file changed, 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 602dc83cab36..1a3af2ea2a79 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -147,33 +147,6 @@ enum scx_ent_dsq_flags { SCX_TASK_DSQ_ON_PRIQ = 1 << 0, /* task is queued on the priority queue of a dsq */ }; -/* - * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from - * everywhere and the following bits track which kfunc sets are currently - * allowed for %current. This simple per-task tracking works because SCX ops - * nest in a limited way. BPF will likely implement a way to allow and disallow - * kfuncs depending on the calling context which will replace this manual - * mechanism. See scx_kf_allow(). - */ -enum scx_kf_mask { - SCX_KF_UNLOCKED = 0, /* sleepable and not rq locked */ - /* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */ - SCX_KF_CPU_RELEASE = 1 << 0, /* ops.cpu_release() */ - /* - * ops.dispatch() may release rq lock temporarily and thus ENQUEUE and - * SELECT_CPU may be nested inside. ops.dequeue (in REST) may also be - * nested inside DISPATCH. - */ - SCX_KF_DISPATCH = 1 << 1, /* ops.dispatch() */ - SCX_KF_ENQUEUE = 1 << 2, /* ops.enqueue() and ops.select_cpu() */ - SCX_KF_SELECT_CPU = 1 << 3, /* ops.select_cpu() */ - SCX_KF_REST = 1 << 4, /* other rq-locked operations */ - - __SCX_KF_RQ_LOCKED = SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH | - SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, - __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, -}; - enum scx_dsq_lnode_flags { SCX_DSQ_LNODE_ITER_CURSOR = 1 << 0, @@ -221,7 +194,6 @@ struct sched_ext_entity { s32 sticky_cpu; s32 holding_cpu; s32 selected_cpu; - u32 kf_mask; /* see scx_kf_mask above */ struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */ struct list_head runnable_node; /* rq->scx.runnable_list */ -- cgit v1.2.3 From d6e152d905bdb1f32f9d99775e2f453350399a6a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Apr 2026 10:54:17 +0200 Subject: clockevents: Prevent timer interrupt starvation Calvin reported an odd NMI watchdog lockup which claims that the CPU locked up in user space. He provided a reproducer, which sets up a timerfd based timer and then rearms it in a loop with an absolute expiry time of 1ns. As the expiry time is in the past, the timer ends up as the first expiring timer in the per CPU hrtimer base and the clockevent device is programmed with the minimum delta value. If the machine is fast enough, this ends up in a endless loop of programming the delta value to the minimum value defined by the clock event device, before the timer interrupt can fire, which starves the interrupt and consequently triggers the lockup detector because the hrtimer callback of the lockup mechanism is never invoked. As a first step to prevent this, avoid reprogramming the clock event device when: - a forced minimum delta event is pending - the new expiry delta is less then or equal to the minimum delta Thanks to Calvin for providing the reproducer and to Borislav for testing and providing data from his Zen5 machine. The problem is not limited to Zen5, but depending on the underlying clock event device (e.g. TSC deadline timer on Intel) and the CPU speed not necessarily observable. This change serves only as the last resort and further changes will be made to prevent this scenario earlier in the call chain as far as possible. [ tglx: Updated to restore the old behaviour vs. !force and delta <= 0 and fixed up the tick-broadcast handlers as pointed out by Borislav ] Fixes: d316c57ff6bf ("[PATCH] clockevents: add core functionality") Reported-by: Calvin Owens Signed-off-by: Thomas Gleixner Tested-by: Calvin Owens Tested-by: Borislav Petkov Link: https://lore.kernel.org/lkml/acMe-QZUel-bBYUh@mozart.vkv.me/ Link: https://patch.msgid.link/20260407083247.562657657@kernel.org --- include/linux/clockchips.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index b0df28ddd394..50cdc9da8d32 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -80,6 +80,7 @@ enum clock_event_state { * @shift: nanoseconds to cycles divisor (power of two) * @state_use_accessors:current state of the device, assigned by the core code * @features: features + * @next_event_forced: True if the last programming was a forced event * @retries: number of forced programming retries * @set_state_periodic: switch state to periodic * @set_state_oneshot: switch state to oneshot @@ -108,6 +109,7 @@ struct clock_event_device { u32 shift; enum clock_event_state state_use_accessors; unsigned int features; + unsigned int next_event_forced; unsigned long retries; int (*set_state_periodic)(struct clock_event_device *); -- cgit v1.2.3 From 33dfc521c20d02375c8696dcace04037d2a865e6 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 10 Apr 2026 13:55:52 -0700 Subject: bpf: share several utility functions as internal API Namely: - bpf_subprog_is_global - bpf_vlog_alignment Acked-by: Mykyta Yatsenko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260410-patch-set-v4-1-5d4eecb343db@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 36bfd96d4563..15f7f9f35be9 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -1121,12 +1121,14 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie u32 frameno, bool print_all); void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, u32 frameno); +u32 bpf_vlog_alignment(u32 pos); struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *env, int off); int bpf_jmp_offset(struct bpf_insn *insn); struct bpf_iarray *bpf_insn_successors(struct bpf_verifier_env *env, u32 idx); void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask); bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx); +bool bpf_subprog_is_global(const struct bpf_verifier_env *env, int subprog); int bpf_find_subprog(struct bpf_verifier_env *env, int off); int bpf_compute_const_regs(struct bpf_verifier_env *env); -- cgit v1.2.3 From cf3ee1ecf3466ddb978a58df9d5b638e7dff673d Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 10 Apr 2026 13:55:53 -0700 Subject: bpf: save subprogram name in bpf_subprog_info Subprogram name can be computed from function info and BTF, but it is convenient to have the name readily available for logging purposes. Update comment saying that bpf_subprog_info->start has to be the first field, this is no longer true, relevant sites access .start field by it's name. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260410-patch-set-v4-2-5d4eecb343db@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 15f7f9f35be9..cec6054d6333 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -664,7 +664,7 @@ enum priv_stack_mode { }; struct bpf_subprog_info { - /* 'start' has to be the first field otherwise find_subprog() won't work */ + const char *name; /* name extracted from BTF */ u32 start; /* insn idx of function entry point */ u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u32 postorder_start; /* The idx to the env->cfg.insn_postorder */ -- cgit v1.2.3 From 2ad45b414b8779ba5c50f746fd767926cccde729 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 10 Apr 2026 13:55:54 -0700 Subject: bpf: Add spis_*() helpers for 4-byte stack slot bitmasks Add helper functions for manipulating u64[2] bitmasks that represent 4-byte stack slot liveness. The 512-byte BPF stack is divided into 128 4-byte slots, requiring 128 bits (two u64s) to track. These will be used by the static stack liveness analysis in the next commit. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260410-patch-set-v4-3-5d4eecb343db@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 67 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index cec6054d6333..f34e7a074a20 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -224,6 +224,73 @@ enum bpf_stack_slot_type { #define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ +/* 4-byte stack slot granularity for liveness analysis */ +#define BPF_HALF_REG_SIZE 4 +#define STACK_SLOTS (MAX_BPF_STACK / BPF_HALF_REG_SIZE) /* 128 */ + +typedef struct { + u64 v[2]; +} spis_t; + +#define SPIS_ZERO ((spis_t){}) +#define SPIS_ALL ((spis_t){{ U64_MAX, U64_MAX }}) + +static inline bool spis_is_zero(spis_t s) +{ + return s.v[0] == 0 && s.v[1] == 0; +} + +static inline bool spis_equal(spis_t a, spis_t b) +{ + return a.v[0] == b.v[0] && a.v[1] == b.v[1]; +} + +static inline spis_t spis_or(spis_t a, spis_t b) +{ + return (spis_t){{ a.v[0] | b.v[0], a.v[1] | b.v[1] }}; +} + +static inline spis_t spis_and(spis_t a, spis_t b) +{ + return (spis_t){{ a.v[0] & b.v[0], a.v[1] & b.v[1] }}; +} + +static inline spis_t spis_xor(spis_t a, spis_t b) +{ + return (spis_t){{ a.v[0] ^ b.v[0], a.v[1] ^ b.v[1] }}; +} + +static inline spis_t spis_not(spis_t s) +{ + return (spis_t){{ ~s.v[0], ~s.v[1] }}; +} + +static inline bool spis_test_bit(spis_t s, u32 slot) +{ + return s.v[slot / 64] & BIT_ULL(slot % 64); +} + +static inline void spis_or_range(spis_t *mask, u32 lo, u32 hi) +{ + u32 w; + + for (w = lo; w <= hi && w < STACK_SLOTS; w++) + mask->v[w / 64] |= BIT_ULL(w % 64); +} + +static inline spis_t spis_one_bit(u32 slot) +{ + if (slot < 64) + return (spis_t){{ BIT_ULL(slot), 0 }}; + else + return (spis_t){{ 0, BIT_ULL(slot - 64) }}; +} + +static inline spis_t spis_single_slot(u32 spi) +{ + return spis_or(spis_one_bit(spi * 2), spis_one_bit(spi * 2 + 1)); +} + #define BPF_REGMASK_ARGS ((1 << BPF_REG_1) | (1 << BPF_REG_2) | \ (1 << BPF_REG_3) | (1 << BPF_REG_4) | \ (1 << BPF_REG_5)) -- cgit v1.2.3 From 7ca5f68cda073a6c4aa6135e98a27c7b2a731cdd Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 10 Apr 2026 13:55:55 -0700 Subject: bpf: make liveness.c track stack with 4-byte granularity Convert liveness bitmask type from u64 to spis_t, doubling the number of trackable stack slots from 64 to 128 to support 4-byte granularity. Each 8-byte SPI now maps to two consecutive 4-byte sub-slots in the bitmask: spi*2 half and spi*2+1 half. In verifier.c, check_stack_write_fixed_off() now reports 4-byte aligned writes of 4-byte writes as half-slot marks and 8-byte aligned 8-byte writes as two slots. Similar logic applied in check_stack_read_fixed_off(). Queries (is_live_before) are not yet migrated to half-slot granularity. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260410-patch-set-v4-4-5d4eecb343db@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index f34e7a074a20..7b31d8024c61 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -1217,8 +1217,8 @@ s64 bpf_kfunc_stack_access_bytes(struct bpf_verifier_env *env, int bpf_stack_liveness_init(struct bpf_verifier_env *env); void bpf_stack_liveness_free(struct bpf_verifier_env *env); int bpf_update_live_stack(struct bpf_verifier_env *env); -int bpf_mark_stack_read(struct bpf_verifier_env *env, u32 frameno, u32 insn_idx, u64 mask); -void bpf_mark_stack_write(struct bpf_verifier_env *env, u32 frameno, u64 mask); +int bpf_mark_stack_read(struct bpf_verifier_env *env, u32 frameno, u32 insn_idx, spis_t mask); +void bpf_mark_stack_write(struct bpf_verifier_env *env, u32 frameno, spis_t mask); int bpf_reset_stack_write_marks(struct bpf_verifier_env *env, u32 insn_idx); int bpf_commit_stack_write_marks(struct bpf_verifier_env *env); int bpf_live_stack_query_init(struct bpf_verifier_env *env, struct bpf_verifier_state *st); -- cgit v1.2.3 From bf0c571f7feb6fa05a512e2a5e50702501849d61 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 10 Apr 2026 13:55:58 -0700 Subject: bpf: introduce forward arg-tracking dataflow analysis MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The analysis is a basis for static liveness tracking mechanism introduced by the next two commits. A forward fixed-point analysis that tracks which frame's FP each register value is derived from, and at what byte offset. This is needed because a callee can receive a pointer to its caller's stack frame (e.g. r1 = fp-16 at the call site), then do *(u64 *)(r1 + 0) inside the callee — a cross-frame stack access that the callee's local liveness must attribute to the caller's stack. Each register holds an arg_track value from a three-level lattice: - Precise {frame=N, off=[o1,o2,...]} — known frame index and up to 4 concrete byte offsets - Offset-imprecise {frame=N, off_cnt=0} — known frame, unknown offset - Fully-imprecise {frame=ARG_IMPRECISE, mask=bitmask} — unknown frame, mask says which frames might be involved At CFG merge points the lattice moves toward imprecision (same frame+offset stays precise, same frame different offsets merges offset sets or becomes offset-imprecise, different frames become fully-imprecise with OR'd bitmask). The analysis also tracks spills/fills to the callee's own stack (at_stack_in/out), so FP derived values spilled and reloaded. This pass is run recursively per call site: when subprog A calls B with specific FP-derived arguments, B is re-analyzed with those entry args. The recursion follows analyze_subprog -> compute_subprog_args -> (for each call insn) -> analyze_subprog. Subprogs that receive no FP-derived args are skipped during recursion and analyzed independently at depth 0. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260410-patch-set-v4-7-5d4eecb343db@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7b31d8024c61..49b19118c326 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -226,6 +226,7 @@ enum bpf_stack_slot_type { /* 4-byte stack slot granularity for liveness analysis */ #define BPF_HALF_REG_SIZE 4 +#define STACK_SLOT_SZ 4 #define STACK_SLOTS (MAX_BPF_STACK / BPF_HALF_REG_SIZE) /* 128 */ typedef struct { @@ -886,6 +887,8 @@ struct bpf_verifier_env { } cfg; struct backtrack_state bt; struct bpf_jmp_history_entry *cur_hist_ent; + /* Per-callsite copy of parent's converged at_stack_in for cross-frame fills. */ + struct arg_track **callsite_at_stack; u32 pass_cnt; /* number of times do_check() was called */ u32 subprog_cnt; /* number of instructions analyzed by the verifier */ @@ -1213,6 +1216,7 @@ s64 bpf_helper_stack_access_bytes(struct bpf_verifier_env *env, s64 bpf_kfunc_stack_access_bytes(struct bpf_verifier_env *env, struct bpf_insn *insn, int arg, int insn_idx); +int bpf_compute_subprog_arg_access(struct bpf_verifier_env *env); int bpf_stack_liveness_init(struct bpf_verifier_env *env); void bpf_stack_liveness_free(struct bpf_verifier_env *env); -- cgit v1.2.3 From fed53dbcdb61b0fbb1cf1d5bbd68d10f97aec974 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 10 Apr 2026 13:55:59 -0700 Subject: bpf: record arg tracking results in bpf_liveness masks After arg tracking reaches a fixed point, perform a single linear scan over the converged at_in[] state and translate each memory access into liveness read/write masks on the func_instance: - Load/store instructions: FP-derived pointer's frame and offset(s) are converted to half-slot masks targeting per_frame_masks->{may_read,must_write} - Helper/kfunc calls: record_call_access() queries bpf_helper_stack_access_bytes() / bpf_kfunc_stack_access_bytes() for each FP-derived argument to determine access size and direction. Unknown access size (S64_MIN) conservatively marks all slots from fp_off to fp+0 as read. - Imprecise pointers (frame == ARG_IMPRECISE): conservatively mark all slots in every frame covered by the pointer's frame bitmask as fully read. - Static subprog calls with unresolved arguments: conservatively mark all frames as fully read. Instead of a call to clean_live_states(), start cleaning the current state continuously as registers and stack become dead since the static analysis provides complete liveness information. This makes clean_live_states() and bpf_verifier_state->cleaned unnecessary. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260410-patch-set-v4-8-5d4eecb343db@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 49b19118c326..8e83a5e66fd8 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -492,7 +492,6 @@ struct bpf_verifier_state { bool speculative; bool in_sleepable; - bool cleaned; /* first and last insn idx of this verifier state */ u32 first_insn_idx; -- cgit v1.2.3 From 6762e3a0bce5fce94bca3c34ff13cde6a07b87f3 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 10 Apr 2026 13:56:00 -0700 Subject: bpf: simplify liveness to use (callsite, depth) keyed func_instances Rework func_instance identification and remove the dynamic liveness API, completing the transition to fully static stack liveness analysis. Replace callchain-based func_instance keys with (callsite, depth) pairs. The full callchain (all ancestor callsites) is no longer part of the hash key; only the immediate callsite and the call depth matter. This does not lose precision in practice and simplifies the data structure significantly: struct callchain is removed entirely, func_instance stores just callsite, depth. Drop must_write_acc propagation. Previously, must_write marks were accumulated across successors and propagated to the caller via propagate_to_outer_instance(). Instead, callee entry liveness (live_before at subprog start) is pulled directly back to the caller's callsite in analyze_subprog() after each callee returns. Since (callsite, depth) instances are shared across different call chains that invoke the same subprog at the same depth, must_write marks from one call may be stale for another. To handle this, analyze_subprog() records into a fresh_instance() when the instance was already visited (must_write_initialized), then merge_instances() combines the results: may_read is unioned, must_write is intersected. This ensures only slots written on ALL paths through all call sites are marked as guaranteed writes. This replaces commit_stack_write_marks() logic. Skip recursive descent into callees that receive no FP-derived arguments (has_fp_args() check). This is needed because global subprogram calls can push depth beyond MAX_CALL_FRAMES (max depth is 64 for global calls but only 8 frames are accommodated for FP passing). It also handles the case where a callback subprog cannot be determined by argument tracking: such callbacks will be processed by analyze_subprog() at depth 0 independently. Update lookup_instance() (used by is_live_before queries) to search for the func_instance with maximal depth at the corresponding callsite, walking depth downward from frameno to 0. This accounts for the fact that instance depth no longer corresponds 1:1 to bpf_verifier_state->curframe, since skipped non-FP calls create gaps. Remove the dynamic public liveness API from verifier.c: - bpf_mark_stack_{read,write}(), bpf_reset/commit_stack_write_marks() - bpf_update_live_stack(), bpf_reset_live_stack_callchain() - All call sites in check_stack_{read,write}_fixed_off(), check_stack_range_initialized(), mark_stack_slot_obj_read(), mark/unmark_stack_slots_{dynptr,iter,irq_flag}() - The per-instruction write mark accumulation in do_check() - The bpf_update_live_stack() call in prepare_func_exit() mark_stack_read() and mark_stack_write() become static functions in liveness.c, called only from the static analysis pass. The func_instance->updated and must_write_dropped flags are removed. Remove spis_single_slot(), spis_one_bit() helpers from bpf_verifier.h as they are no longer used. Signed-off-by: Eduard Zingerman Tested-by: Paul Chaignon Link: https://lore.kernel.org/r/20260410-patch-set-v4-9-5d4eecb343db@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 8e83a5e66fd8..d7fbc0e1559b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -279,19 +279,6 @@ static inline void spis_or_range(spis_t *mask, u32 lo, u32 hi) mask->v[w / 64] |= BIT_ULL(w % 64); } -static inline spis_t spis_one_bit(u32 slot) -{ - if (slot < 64) - return (spis_t){{ BIT_ULL(slot), 0 }}; - else - return (spis_t){{ 0, BIT_ULL(slot - 64) }}; -} - -static inline spis_t spis_single_slot(u32 spi) -{ - return spis_or(spis_one_bit(spi * 2), spis_one_bit(spi * 2 + 1)); -} - #define BPF_REGMASK_ARGS ((1 << BPF_REG_1) | (1 << BPF_REG_2) | \ (1 << BPF_REG_3) | (1 << BPF_REG_4) | \ (1 << BPF_REG_5)) @@ -1219,13 +1206,7 @@ int bpf_compute_subprog_arg_access(struct bpf_verifier_env *env); int bpf_stack_liveness_init(struct bpf_verifier_env *env); void bpf_stack_liveness_free(struct bpf_verifier_env *env); -int bpf_update_live_stack(struct bpf_verifier_env *env); -int bpf_mark_stack_read(struct bpf_verifier_env *env, u32 frameno, u32 insn_idx, spis_t mask); -void bpf_mark_stack_write(struct bpf_verifier_env *env, u32 frameno, spis_t mask); -int bpf_reset_stack_write_marks(struct bpf_verifier_env *env, u32 insn_idx); -int bpf_commit_stack_write_marks(struct bpf_verifier_env *env); int bpf_live_stack_query_init(struct bpf_verifier_env *env, struct bpf_verifier_state *st); bool bpf_stack_slot_alive(struct bpf_verifier_env *env, u32 frameno, u32 spi); -void bpf_reset_live_stack_callchain(struct bpf_verifier_env *env); #endif /* _LINUX_BPF_VERIFIER_H */ -- cgit v1.2.3 From 2c167d91775b0928eba1d2b9b5483ede63ca7b2e Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 10 Apr 2026 13:56:01 -0700 Subject: bpf: change logging scheme for live stack analysis Instead of breadcrumbs like: (d2,cs15) frame 0 insn 18 +live -16 (d2,cs15) frame 0 insn 17 +live -16 Print final accumulated stack use/def data per-func_instance per-instruction. printed func_instance's are ordered by callsite and depth. For example: stack use/def subprog#0 shared_instance_must_write_overwrite (d0,cs0): 0: (b7) r1 = 1 1: (7b) *(u64 *)(r10 -8) = r1 ; def: fp0-8 2: (7b) *(u64 *)(r10 -16) = r1 ; def: fp0-16 3: (bf) r1 = r10 4: (07) r1 += -8 5: (bf) r2 = r10 6: (07) r2 += -16 7: (85) call pc+7 ; use: fp0-8 fp0-16 8: (bf) r1 = r10 9: (07) r1 += -16 10: (bf) r2 = r10 11: (07) r2 += -8 12: (85) call pc+2 ; use: fp0-8 fp0-16 13: (b7) r0 = 0 14: (95) exit stack use/def subprog#1 forwarding_rw (d1,cs7): 15: (85) call pc+1 ; use: fp0-8 fp0-16 16: (95) exit stack use/def subprog#1 forwarding_rw (d1,cs12): 15: (85) call pc+1 ; use: fp0-8 fp0-16 16: (95) exit stack use/def subprog#2 write_first_read_second (d2,cs15): 17: (7a) *(u64 *)(r1 +0) = 42 18: (79) r0 = *(u64 *)(r2 +0) ; use: fp0-8 fp0-16 19: (95) exit For groups of three or more consecutive stack slots, abbreviate as follows: 25: (85) call bpf_loop#181 ; use: fp2-8..-512 fp1-8..-512 fp0-8..-512 Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260410-patch-set-v4-10-5d4eecb343db@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d7fbc0e1559b..d3dc46aae2e7 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -256,11 +256,6 @@ static inline spis_t spis_and(spis_t a, spis_t b) return (spis_t){{ a.v[0] & b.v[0], a.v[1] & b.v[1] }}; } -static inline spis_t spis_xor(spis_t a, spis_t b) -{ - return (spis_t){{ a.v[0] ^ b.v[0], a.v[1] ^ b.v[1] }}; -} - static inline spis_t spis_not(spis_t s) { return (spis_t){{ ~s.v[0], ~s.v[1] }}; -- cgit v1.2.3 From 2cb27158adb38f1a78729e99f7469199d71c714a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 10 Apr 2026 13:56:05 -0700 Subject: bpf: poison dead stack slots As a sanity check poison stack slots that stack liveness determined to be dead, so that any read from such slots will cause program rejection. If stack liveness logic is incorrect the poison can cause valid program to be rejected, but it also will prevent unsafe program to be accepted. Allow global subprogs "read" poisoned stack slots. The static stack liveness determined that subprog doesn't read certain stack slots, but sizeof(arg_type) based global subprog validation isn't accurate enough to know which slots will actually be read by the callee, so it needs to check full sizeof(arg_type) at the caller. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260410-patch-set-v4-14-5d4eecb343db@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d3dc46aae2e7..05b9fe98b8f8 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -220,6 +220,7 @@ enum bpf_stack_slot_type { STACK_DYNPTR, STACK_ITER, STACK_IRQ_FLAG, + STACK_POISON, }; #define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ -- cgit v1.2.3 From 70cf146a674c447753ceeb34246ad0afdd0064bb Mon Sep 17 00:00:00 2001 From: Jacob Moroni Date: Thu, 9 Apr 2026 15:01:23 +0000 Subject: PCI/P2PDMA: Add Google SoCs to the P2P DMA host bridge list All Google SoCs support peer-to-peer DMA between Root Ports, so add a wildcard rule to the host bridge list. Signed-off-by: Jacob Moroni Signed-off-by: Bjorn Helgaas Tested-by: David Hu Reviewed-by: Logan Gunthorpe Link: https://patch.msgid.link/20260409150123.3538444-2-jmoroni@google.com --- include/linux/pci_ids.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 406abf629be2..24cb42f66e4b 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2586,6 +2586,8 @@ #define PCI_VENDOR_ID_AZWAVE 0x1a3b +#define PCI_VENDOR_ID_GOOGLE 0x1ae0 + #define PCI_VENDOR_ID_REDHAT_QUMRANET 0x1af4 #define PCI_SUBVENDOR_ID_REDHAT_QUMRANET 0x1af4 #define PCI_SUBDEVICE_ID_QEMU 0x1100 -- cgit v1.2.3 From 5063e775889948c0475ccdf21c74a6191b7b6482 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Fri, 10 Apr 2026 18:54:17 -0700 Subject: bpf: Use kmalloc_nolock() universally in local storage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch to kmalloc_nolock() universally in local storage. Socket local storage didn't move to kmalloc_nolock() when BPF memory allocator was replaced by it for performance reasons. Now that kfree_rcu() supports freeing memory allocated by kmalloc_nolock(), we can move the remaining local storages to use kmalloc_nolock() and cleanup the cluttered free paths. Use kfree() instead of kfree_nolock() in bpf_selem_free_trace_rcu() and bpf_local_storage_free_trace_rcu(). Both callbacks run in process context where spinning is allowed, so kfree_nolock() is unnecessary. Benchmark: ./bench -p 1 local-storage-create --storage-type socket \ --batch-size {16,32,64} The benchmark is a microbenchmark stress-testing how fast local storage can be created. There is no measurable throughput change for socket local storage after switching from kzalloc() to kmalloc_nolock(). Socket local storage batch creation speed diff --------------- ---- ------------------ ---- Baseline 16 433.9 ± 0.6 k/s 32 434.3 ± 1.4 k/s 64 434.2 ± 0.7 k/s After 16 439.0 ± 1.9 k/s +1.2% 32 437.3 ± 2.0 k/s +0.7% 64 435.8 ± 2.5k/s +0.4% Also worth noting that the baseline got a 5% throughput boost when sheaf replaces percpu partial slab recently [0]. [0] https://lore.kernel.org/bpf/20260123-sheaves-for-all-v4-0-041323d506f7@suse.cz/ Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260411015419.114016-3-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 8157e8da61d4..dced54e9265f 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -54,7 +54,6 @@ struct bpf_local_storage_map { u32 bucket_log; u16 elem_size; u16 cache_idx; - bool use_kmalloc_nolock; }; struct bpf_local_storage_data { @@ -86,8 +85,7 @@ struct bpf_local_storage_elem { */ }; atomic_t state; - bool use_kmalloc_nolock; - /* 3 bytes hole */ + /* 4 bytes hole */ /* The data is stored in another cacheline to minimize * the number of cachelines access during a cache hit. */ @@ -104,7 +102,6 @@ struct bpf_local_storage { rqspinlock_t lock; /* Protect adding/removing from the "list" */ u64 mem_charge; /* Copy of mem charged to owner. Protected by "lock" */ refcount_t owner_refcnt;/* Used to pin owner when map_free is uncharging */ - bool use_kmalloc_nolock; }; /* U16_MAX is much more than enough for sk local storage @@ -137,8 +134,7 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr); struct bpf_map * bpf_local_storage_map_alloc(union bpf_attr *attr, - struct bpf_local_storage_cache *cache, - bool use_kmalloc_nolock); + struct bpf_local_storage_cache *cache); void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, -- cgit v1.2.3 From 136deea435dc83d7fe2304303bb9bccb54f69bb0 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Fri, 10 Apr 2026 18:54:18 -0700 Subject: bpf: Remove gfp_flags plumbing from bpf_local_storage_update() Remove the check that rejects sleepable BPF programs from doing BPF_ANY/BPF_EXIST updates on local storage. This restriction was added in commit b00fa38a9c1c ("bpf: Enable non-atomic allocations in local storage") because kzalloc(GFP_KERNEL) could sleep inside local_storage->lock. This is no longer a concern: all local storage allocations now use kmalloc_nolock() which never sleeps. In addition, since kmalloc_nolock() only accepts __GFP_ACCOUNT, __GFP_ZERO and __GFP_NO_OBJ_EXT, the gfp_flags parameter plumbing from bpf_*_storage_get() to bpf_local_storage_update() becomes dead code. Remove gfp_flags from bpf_selem_alloc(), bpf_local_storage_alloc() and bpf_local_storage_update(). Drop the hidden 5th argument from bpf_*_storage_get helpers, and remove the verifier patching that injected GFP_KERNEL/GFP_ATOMIC into the fifth argument. Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260411015419.114016-4-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index dced54e9265f..9e4f5c45c974 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -188,7 +188,7 @@ int bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, - bool swap_uptrs, gfp_t gfp_flags); + bool swap_uptrs); void bpf_selem_free(struct bpf_local_storage_elem *selem, bool reuse_now); @@ -196,12 +196,11 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, int bpf_local_storage_alloc(void *owner, struct bpf_local_storage_map *smap, - struct bpf_local_storage_elem *first_selem, - gfp_t gfp_flags); + struct bpf_local_storage_elem *first_selem); struct bpf_local_storage_data * bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, - void *value, u64 map_flags, bool swap_uptrs, gfp_t gfp_flags); + void *value, u64 map_flags, bool swap_uptrs); u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map); -- cgit v1.2.3 From f79ee9e4b23244e77b28d176ce99a2d84d813ac5 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 10 Apr 2026 19:41:02 +0200 Subject: spi: spi-mem: Add a packed command operation Instead of repeating the command opcode twice, some flash devices try to pack command and address bits. In this case, the second opcode byte being sent (LSB) is free to be used. The input data must be ANDed to only provide the relevant bits. Signed-off-by: Miquel Raynal Link: https://patch.msgid.link/20260410-winbond-6-19-rc1-oddr-v1-2-2ac4827a3868@bootlin.com Signed-off-by: Mark Brown --- include/linux/spi/spi-mem.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h index 37f709784350..c8e207522223 100644 --- a/include/linux/spi/spi-mem.h +++ b/include/linux/spi/spi-mem.h @@ -28,6 +28,14 @@ .dtr = true, \ } +#define SPI_MEM_DTR_OP_PACKED_CMD(__opcode, __addr, __buswidth) \ + { \ + .nbytes = 2, \ + .opcode = __opcode << 8 | __addr, \ + .buswidth = __buswidth, \ + .dtr = true, \ + } + #define SPI_MEM_OP_ADDR(__nbytes, __val, __buswidth) \ { \ .nbytes = __nbytes, \ -- cgit v1.2.3 From a2225b6e834a838ae3c93709760edc0a169eb2f2 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Mon, 6 Apr 2026 16:22:54 -0700 Subject: driver core: Don't let a device probe until it's ready The moment we link a "struct device" into the list of devices for the bus, it's possible probe can happen. This is because another thread can load the driver at any time and that can cause the device to probe. This has been seen in practice with a stack crawl that looks like this [1]: really_probe() __driver_probe_device() driver_probe_device() __driver_attach() bus_for_each_dev() driver_attach() bus_add_driver() driver_register() __platform_driver_register() init_module() [some module] do_one_initcall() do_init_module() load_module() __arm64_sys_finit_module() invoke_syscall() As a result of the above, it was seen that device_links_driver_bound() could be called for the device before "dev->fwnode->dev" was assigned. This prevented __fw_devlink_pickup_dangling_consumers() from being called which meant that other devices waiting on our driver's sub-nodes were stuck deferring forever. It's believed that this problem is showing up suddenly for two reasons: 1. Android has recently (last ~1 year) implemented an optimization to the order it loads modules [2]. When devices opt-in to this faster loading, modules are loaded one-after-the-other very quickly. This is unlike how other distributions do it. The reproduction of this problem has only been seen on devices that opt-in to Android's "parallel module loading". 2. Android devices typically opt-in to fw_devlink, and the most noticeable issue is the NULL "dev->fwnode->dev" in device_links_driver_bound(). fw_devlink is somewhat new code and also not in use by all Linux devices. Even though the specific symptom where "dev->fwnode->dev" wasn't assigned could be fixed by moving that assignment higher in device_add(), other parts of device_add() (like the call to device_pm_add()) are also important to run before probe. Only moving the "dev->fwnode->dev" assignment would likely fix the current symptoms but lead to difficult-to-debug problems in the future. Fix the problem by preventing probe until device_add() has run far enough that the device is ready to probe. If somehow we end up trying to probe before we're allowed, __driver_probe_device() will return -EPROBE_DEFER which will make certain the device is noticed. In the race condition that was seen with Android's faster module loading, we will temporarily add the device to the deferred list and then take it off immediately when device_add() probes the device. Instead of adding another flag to the bitfields already in "struct device", instead add a new "flags" field and use that. This allows us to freely change the bit from different thread without worrying about corrupting nearby bits (and means threads changing other bit won't corrupt us). [1] Captured on a machine running a downstream 6.6 kernel [2] https://cs.android.com/android/platform/superproject/main/+/main:system/core/libmodprobe/libmodprobe.cpp?q=LoadModulesParallel Cc: stable@vger.kernel.org Fixes: 2023c610dc54 ("Driver core: add new device to bus's list before probing") Reviewed-by: Alan Stern Reviewed-by: Rafael J. Wysocki (Intel) Reviewed-by: Danilo Krummrich Acked-by: Greg Kroah-Hartman Acked-by: Marek Szyprowski Signed-off-by: Douglas Anderson Link: https://patch.msgid.link/20260406162231.v5.1.Id750b0fbcc94f23ed04b7aecabcead688d0d8c17@changeid Signed-off-by: Danilo Krummrich --- include/linux/device.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index e65d564f01cd..f27ed6eb87a9 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -458,6 +458,21 @@ struct device_physical_location { bool lid; }; +/** + * enum struct_device_flags - Flags in struct device + * + * Each flag should have a set of accessor functions created via + * __create_dev_flag_accessors() for each access. + * + * @DEV_FLAG_READY_TO_PROBE: If set then device_add() has finished enough + * initialization that probe could be called. + */ +enum struct_device_flags { + DEV_FLAG_READY_TO_PROBE = 0, + + DEV_FLAG_COUNT +}; + /** * struct device - The basic device structure * @parent: The device's "parent" device, the device to which it is attached. @@ -553,6 +568,7 @@ struct device_physical_location { * @dma_skip_sync: DMA sync operations can be skipped for coherent buffers. * @dma_iommu: Device is using default IOMMU implementation for DMA and * doesn't rely on dma_ops structure. + * @flags: DEV_FLAG_XXX flags. Use atomic bitfield operations to modify. * * At the lowest level, every device in a Linux system is represented by an * instance of struct device. The device structure contains the information @@ -675,8 +691,36 @@ struct device { #ifdef CONFIG_IOMMU_DMA bool dma_iommu:1; #endif + + DECLARE_BITMAP(flags, DEV_FLAG_COUNT); }; +#define __create_dev_flag_accessors(accessor_name, flag_name) \ +static inline bool dev_##accessor_name(const struct device *dev) \ +{ \ + return test_bit(flag_name, dev->flags); \ +} \ +static inline void dev_set_##accessor_name(struct device *dev) \ +{ \ + set_bit(flag_name, dev->flags); \ +} \ +static inline void dev_clear_##accessor_name(struct device *dev) \ +{ \ + clear_bit(flag_name, dev->flags); \ +} \ +static inline void dev_assign_##accessor_name(struct device *dev, bool value) \ +{ \ + assign_bit(flag_name, dev->flags, value); \ +} \ +static inline bool dev_test_and_set_##accessor_name(struct device *dev) \ +{ \ + return test_and_set_bit(flag_name, dev->flags); \ +} + +__create_dev_flag_accessors(ready_to_probe, DEV_FLAG_READY_TO_PROBE); + +#undef __create_dev_flag_accessors + /** * struct device_link - Device link representation. * @supplier: The device on the supplier end of the link. -- cgit v1.2.3 From 06c42142cf8aaeba3fa3c4336717b87ca4eebf8a Mon Sep 17 00:00:00 2001 From: Chenghai Huang Date: Mon, 30 Mar 2026 14:25:31 +0800 Subject: crypto: hisilicon - remove unused and non-public APIs for qm and sec - sec_register_to_crypto() and sec_unregister_from_crypto() have been removed, the function declarations have not been removed. Remove them. - hisi_qm_start_qp and hisi_qm_stop_qp are called internally by the QM. Therefore, the EXPORT_SYMBOL_GPL declaration of these non-public interfaces is deleted. Signed-off-by: Chenghai Huang Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 8a581b5bbbcd..a6268dc4f7cb 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -558,8 +558,6 @@ int hisi_qm_init(struct hisi_qm *qm); void hisi_qm_uninit(struct hisi_qm *qm); int hisi_qm_start(struct hisi_qm *qm); int hisi_qm_stop(struct hisi_qm *qm, enum qm_stop_reason r); -int hisi_qm_start_qp(struct hisi_qp *qp, unsigned long arg); -void hisi_qm_stop_qp(struct hisi_qp *qp); int hisi_qp_send(struct hisi_qp *qp, const void *msg); void hisi_qm_debug_init(struct hisi_qm *qm); void hisi_qm_debug_regs_clear(struct hisi_qm *qm); -- cgit v1.2.3 From 82db77f6fb16d23ea60d0f96dcf2b502a322a28f Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 8 Apr 2026 16:05:50 -0700 Subject: net: tso: Introduce tso_dma_map and helpers Add struct tso_dma_map to tso.h for tracking DMA addresses of mapped GSO payload data and tso_dma_map_completion_state. The tso_dma_map combines DMA mapping storage with iterator state, allowing drivers to walk pre-mapped DMA regions linearly. Includes fields for the DMA IOVA path (iova_state, iova_offset, total_len) and a fallback per-region path (linear_dma, frags[], frag_idx, offset). The tso_dma_map_completion_state makes the IOVA completion state opaque for drivers. Drivers are expected to allocate this and use the added helpers to update the completion state. Adds skb_frag_phys() to skbuff.h, returning the physical address of a paged fragment's data, which is used by the tso_dma_map helpers introduced in this commit described below. The added TSO DMA map helpers are: tso_dma_map_init(): DMA-maps the linear payload region and all frags upfront. Prefers the DMA IOVA API for a single contiguous mapping with one IOTLB sync; falls back to per-region dma_map_phys() otherwise. Returns 0 on success, cleans up partial mappings on failure. tso_dma_map_cleanup(): Handles both IOVA and fallback teardown paths. tso_dma_map_count(): counts how many descriptors the next N bytes of payload will need. Returns 1 if IOVA is used since the mapping is contiguous. tso_dma_map_next(): yields the next (dma_addr, chunk_len) pair. On the IOVA path, each segment is a single contiguous chunk. On the fallback path, indicates when a chunk starts a new DMA mapping so the driver can set dma_unmap_len on that descriptor for completion-time unmapping. tso_dma_map_completion_save(): updates the completion state. Drivers will call this at xmit time. tso_dma_map_complete(): tears down the mapping at completion time and returns true if the IOVA path was used. If it was not used, this is a no-op and returns false. Suggested-by: Jakub Kicinski Signed-off-by: Joe Damato Link: https://patch.msgid.link/20260408230607.2019402-2-joe@dama.to Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 26fe18bcfad8..2bcf78a4de7b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3763,6 +3763,17 @@ static inline void *skb_frag_address_safe(const skb_frag_t *frag) return ptr + skb_frag_off(frag); } +/** + * skb_frag_phys - gets the physical address of the data in a paged fragment + * @frag: the paged fragment buffer + * + * Returns: the physical address of the data within @frag. + */ +static inline phys_addr_t skb_frag_phys(const skb_frag_t *frag) +{ + return page_to_phys(skb_frag_page(frag)) + skb_frag_off(frag); +} + /** * skb_frag_page_copy() - sets the page in a fragment from another fragment * @fragto: skb fragment where page is set -- cgit v1.2.3 From 7ef629b458018ed01dcab6cbdc644ef26b0d0d83 Mon Sep 17 00:00:00 2001 From: Nicolai Buchwitz Date: Mon, 6 Apr 2026 09:13:07 +0200 Subject: net: phy: add support for disabling PHY-autonomous EEE Some PHYs (e.g. Broadcom BCM54xx, Realtek RTL8211F) implement autonomous EEE where the PHY manages LPI signaling without forwarding it to the MAC. This conflicts with MAC drivers that implement their own LPI control. Add a .disable_autonomous_eee callback to struct phy_driver and call it from phy_support_eee(). When a MAC driver indicates it supports EEE via phy_support_eee(), the PHY's autonomous EEE is automatically disabled so the MAC can manage LPI entry/exit. Signed-off-by: Nicolai Buchwitz Link: https://patch.msgid.link/20260406-devel-autonomous-eee-v1-1-b335e7143711@tipi-net.de Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 5de4b172cd0b..199a7aaa341b 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -612,6 +612,8 @@ struct phy_oatc14_sqi_capability { * @advertising_eee: Currently advertised EEE linkmodes * @enable_tx_lpi: When True, MAC should transmit LPI to PHY * @eee_active: phylib private state, indicating that EEE has been negotiated + * @autonomous_eee_disabled: Set when autonomous EEE has been disabled, + * used to re-apply after PHY soft reset * @eee_cfg: User configuration of EEE * @lp_advertising: Current link partner advertised linkmodes * @host_interfaces: PHY interface modes supported by host @@ -739,6 +741,7 @@ struct phy_device { __ETHTOOL_DECLARE_LINK_MODE_MASK(eee_disabled_modes); bool enable_tx_lpi; bool eee_active; + bool autonomous_eee_disabled; struct eee_config eee_cfg; /* Host supported PHY interface types. Should be ignored if empty. */ @@ -1359,6 +1362,17 @@ struct phy_driver { void (*get_stats)(struct phy_device *dev, struct ethtool_stats *stats, u64 *data); + /** + * @disable_autonomous_eee: Disable PHY-autonomous EEE + * + * Some PHYs manage EEE autonomously, preventing the MAC from + * controlling LPI signaling. This callback disables autonomous + * EEE at the PHY. + * + * Return: 0 on success, negative errno on failure. + */ + int (*disable_autonomous_eee)(struct phy_device *dev); + /* Get and Set PHY tunables */ /** @get_tunable: Return the value of a tunable */ int (*get_tunable)(struct phy_device *dev, -- cgit v1.2.3 From bcb3e89fc0ecbe7a2b7ce614b72deda39083ac74 Mon Sep 17 00:00:00 2001 From: Nicolai Buchwitz Date: Mon, 6 Apr 2026 09:13:08 +0200 Subject: net: phy: broadcom: implement .disable_autonomous_eee for BCM54xx Implement the .disable_autonomous_eee callback for the BCM54210E. In AutogrEEEn mode the PHY manages EEE autonomously. Clearing the AutogrEEEn enable bit in MII_BUF_CNTL_0 switches the PHY to Native EEE mode. Signed-off-by: Nicolai Buchwitz Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20260406-devel-autonomous-eee-v1-2-b335e7143711@tipi-net.de Signed-off-by: Jakub Kicinski --- include/linux/brcmphy.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 115a964f3006..174687c4c80a 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -266,6 +266,9 @@ #define BCM54XX_TOP_MISC_IDDQ_SD (1 << 2) #define BCM54XX_TOP_MISC_IDDQ_SR (1 << 3) +#define BCM54XX_TOP_MISC_MII_BUF_CNTL0 (MII_BCM54XX_EXP_SEL_TOP + 0x00) +#define BCM54XX_MII_BUF_CNTL0_AUTOGREEEN_EN BIT(0) + #define BCM54XX_TOP_MISC_LED_CTL (MII_BCM54XX_EXP_SEL_TOP + 0x0C) #define BCM54XX_LED4_SEL_INTR BIT(1) -- cgit v1.2.3 From 449f08fa59dda5da40317b6976604b877c4ecd63 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sun, 12 Apr 2026 08:29:30 -0700 Subject: bpf: Move fixup/post-processing logic from verifier.c into fixups.c verifier.c is huge. Split fixup/post-processing logic that runs after the verifier accepted the program into fixups.c. Mechanical move. No functional changes. Acked-by: Kumar Kartikeya Dwivedi Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260412152936.54262-2-alexei.starovoitov@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 78 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 05b9fe98b8f8..4380ecad485b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -1205,4 +1205,82 @@ void bpf_stack_liveness_free(struct bpf_verifier_env *env); int bpf_live_stack_query_init(struct bpf_verifier_env *env, struct bpf_verifier_state *st); bool bpf_stack_slot_alive(struct bpf_verifier_env *env, u32 frameno, u32 spi); +#define BPF_MAP_KEY_POISON (1ULL << 63) +#define BPF_MAP_KEY_SEEN (1ULL << 62) + +static inline bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) +{ + return aux->map_ptr_state.poison; +} + +static inline bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux) +{ + return aux->map_ptr_state.unpriv; +} + +static inline bool bpf_map_key_poisoned(const struct bpf_insn_aux_data *aux) +{ + return aux->map_key_state & BPF_MAP_KEY_POISON; +} + +static inline bool bpf_map_key_unseen(const struct bpf_insn_aux_data *aux) +{ + return !(aux->map_key_state & BPF_MAP_KEY_SEEN); +} + +static inline u64 bpf_map_key_immediate(const struct bpf_insn_aux_data *aux) +{ + return aux->map_key_state & ~(BPF_MAP_KEY_SEEN | BPF_MAP_KEY_POISON); +} + +#define MAX_PACKET_OFF 0xffff + +enum bpf_reg_arg_type { + SRC_OP, /* register is used as source operand */ + DST_OP, /* register is used as destination operand */ + DST_OP_NO_MARK /* same as above, check only, don't mark */ +}; + +#define MAX_KFUNC_DESCS 256 + +struct bpf_kfunc_desc { + struct btf_func_model func_model; + u32 func_id; + s32 imm; + u16 offset; + unsigned long addr; +}; + +struct bpf_kfunc_desc_tab { + /* Sorted by func_id (BTF ID) and offset (fd_array offset) during + * verification. JITs do lookups by bpf_insn, where func_id may not be + * available, therefore at the end of verification do_misc_fixups() + * sorts this by imm and offset. + */ + struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS]; + u32 nr_descs; +}; + +/* Functions exported from verifier.c, used by fixups.c */ +bool bpf_is_reg64(struct bpf_insn *insn, u32 regno, struct bpf_reg_state *reg, enum bpf_reg_arg_type t); +void bpf_clear_insn_aux_data(struct bpf_verifier_env *env, int start, int len); +void bpf_mark_subprog_exc_cb(struct bpf_verifier_env *env, int subprog); +bool bpf_allow_tail_call_in_subprogs(struct bpf_verifier_env *env); +bool bpf_verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm); +int bpf_add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, u16 offset); +int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + struct bpf_insn *insn_buf, int insn_idx, int *cnt); + +/* Functions in fixups.c, called from bpf_check() */ +int bpf_remove_fastcall_spills_fills(struct bpf_verifier_env *env); +int bpf_optimize_bpf_loop(struct bpf_verifier_env *env); +void bpf_opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env); +int bpf_opt_remove_dead_code(struct bpf_verifier_env *env); +int bpf_opt_remove_nops(struct bpf_verifier_env *env); +int bpf_opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, const union bpf_attr *attr); +int bpf_convert_ctx_accesses(struct bpf_verifier_env *env); +int bpf_jit_subprogs(struct bpf_verifier_env *env); +int bpf_fixup_call_args(struct bpf_verifier_env *env); +int bpf_do_misc_fixups(struct bpf_verifier_env *env); + #endif /* _LINUX_BPF_VERIFIER_H */ -- cgit v1.2.3 From fc150cddeea77561fbc94ac8f02cc75b016b09dd Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sun, 12 Apr 2026 08:29:31 -0700 Subject: bpf: Move compute_insn_live_regs() into liveness.c verifier.c is huge. Move compute_insn_live_regs() into liveness.c. Mechanical move. No functional changes. Acked-by: Kumar Kartikeya Dwivedi Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260412152936.54262-3-alexei.starovoitov@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 4380ecad485b..e3f18667e030 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -1204,6 +1204,7 @@ int bpf_stack_liveness_init(struct bpf_verifier_env *env); void bpf_stack_liveness_free(struct bpf_verifier_env *env); int bpf_live_stack_query_init(struct bpf_verifier_env *env, struct bpf_verifier_state *st); bool bpf_stack_slot_alive(struct bpf_verifier_env *env, u32 frameno, u32 spi); +int bpf_compute_live_registers(struct bpf_verifier_env *env); #define BPF_MAP_KEY_POISON (1ULL << 63) #define BPF_MAP_KEY_SEEN (1ULL << 62) @@ -1234,6 +1235,7 @@ static inline u64 bpf_map_key_immediate(const struct bpf_insn_aux_data *aux) } #define MAX_PACKET_OFF 0xffff +#define CALLER_SAVED_REGS 6 enum bpf_reg_arg_type { SRC_OP, /* register is used as source operand */ -- cgit v1.2.3 From f8a8faceab9953ed074cd4125b31cc6a562237d8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sun, 12 Apr 2026 08:29:32 -0700 Subject: bpf: Move check_cfg() into cfg.c verifier.c is huge. Move check_cfg(), compute_postorder(), compute_scc() into cfg.c Mechanical move. No functional changes. Acked-by: Kumar Kartikeya Dwivedi Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260412152936.54262-4-alexei.starovoitov@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 115 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 114 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e3f18667e030..aa92a597bc5c 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -983,6 +983,41 @@ __printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env, bpf_log(&env->log, "verifier bug: " fmt "\n", ##args); \ }) +static inline void mark_prune_point(struct bpf_verifier_env *env, int idx) +{ + env->insn_aux_data[idx].prune_point = true; +} + +static inline bool bpf_is_prune_point(struct bpf_verifier_env *env, int insn_idx) +{ + return env->insn_aux_data[insn_idx].prune_point; +} + +static inline void mark_force_checkpoint(struct bpf_verifier_env *env, int idx) +{ + env->insn_aux_data[idx].force_checkpoint = true; +} + +static inline bool bpf_is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx) +{ + return env->insn_aux_data[insn_idx].force_checkpoint; +} + +static inline void mark_calls_callback(struct bpf_verifier_env *env, int idx) +{ + env->insn_aux_data[idx].calls_callback = true; +} + +static inline bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx) +{ + return env->insn_aux_data[insn_idx].calls_callback; +} + +static inline void mark_jmp_point(struct bpf_verifier_env *env, int idx) +{ + env->insn_aux_data[idx].jmp_point = true; +} + static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { struct bpf_verifier_state *cur = env->cur_state; @@ -1179,13 +1214,91 @@ struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *en int bpf_jmp_offset(struct bpf_insn *insn); struct bpf_iarray *bpf_insn_successors(struct bpf_verifier_env *env, u32 idx); void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask); -bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx); bool bpf_subprog_is_global(const struct bpf_verifier_env *env, int subprog); int bpf_find_subprog(struct bpf_verifier_env *env, int off); int bpf_compute_const_regs(struct bpf_verifier_env *env); int bpf_prune_dead_branches(struct bpf_verifier_env *env); +int bpf_check_cfg(struct bpf_verifier_env *env); int bpf_compute_postorder(struct bpf_verifier_env *env); +int bpf_compute_scc(struct bpf_verifier_env *env); + +struct bpf_map_desc { + struct bpf_map *ptr; + int uid; +}; + +struct bpf_kfunc_call_arg_meta { + /* In parameters */ + struct btf *btf; + u32 func_id; + u32 kfunc_flags; + const struct btf_type *func_proto; + const char *func_name; + /* Out parameters */ + u32 ref_obj_id; + u8 release_regno; + bool r0_rdonly; + u32 ret_btf_id; + u64 r0_size; + u32 subprogno; + struct { + u64 value; + bool found; + } arg_constant; + + /* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling, + * generally to pass info about user-defined local kptr types to later + * verification logic + * bpf_obj_drop/bpf_percpu_obj_drop + * Record the local kptr type to be drop'd + * bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type) + * Record the local kptr type to be refcount_incr'd and use + * arg_owning_ref to determine whether refcount_acquire should be + * fallible + */ + struct btf *arg_btf; + u32 arg_btf_id; + bool arg_owning_ref; + bool arg_prog; + + struct { + struct btf_field *field; + } arg_list_head; + struct { + struct btf_field *field; + } arg_rbtree_root; + struct { + enum bpf_dynptr_type type; + u32 id; + u32 ref_obj_id; + } initialized_dynptr; + struct { + u8 spi; + u8 frameno; + } iter; + struct bpf_map_desc map; + u64 mem_size; +}; + +int bpf_get_helper_proto(struct bpf_verifier_env *env, int func_id, + const struct bpf_func_proto **ptr); +int bpf_fetch_kfunc_arg_meta(struct bpf_verifier_env *env, s32 func_id, + s16 offset, struct bpf_kfunc_call_arg_meta *meta); +bool bpf_is_async_callback_calling_insn(struct bpf_insn *insn); +bool bpf_is_sync_callback_calling_insn(struct bpf_insn *insn); +static inline bool bpf_is_iter_next_kfunc(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_ITER_NEXT; +} + +static inline bool bpf_is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_SLEEPABLE; +} +bool bpf_is_kfunc_pkt_changing(struct bpf_kfunc_call_arg_meta *meta); +struct bpf_iarray *bpf_iarray_realloc(struct bpf_iarray *old, size_t n_elem); +int bpf_copy_insn_array_uniq(struct bpf_map *map, u32 start, u32 end, u32 *off); bool bpf_insn_is_cond_jump(u8 code); bool bpf_is_may_goto_insn(struct bpf_insn *insn); -- cgit v1.2.3 From c82834a5a11f743f2926107d8f8150e80742b814 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sun, 12 Apr 2026 08:29:33 -0700 Subject: bpf: Move state equivalence logic to states.c verifier.c is huge. Move is_state_visited() to states.c, so that all state equivalence logic is in one file. Mechanical move. No functional changes. Acked-by: Kumar Kartikeya Dwivedi Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260412152936.54262-5-alexei.starovoitov@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 67 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index aa92a597bc5c..669453386282 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -1068,6 +1068,73 @@ void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab); int mark_chain_precision(struct bpf_verifier_env *env, int regno); +int bpf_is_state_visited(struct bpf_verifier_env *env, int insn_idx); +int bpf_update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st); + +void bpf_clear_jmp_history(struct bpf_verifier_state *state); +int bpf_copy_verifier_state(struct bpf_verifier_state *dst_state, + const struct bpf_verifier_state *src); +struct list_head *bpf_explored_state(struct bpf_verifier_env *env, int idx); +void bpf_free_verifier_state(struct bpf_verifier_state *state, bool free_self); +void bpf_free_backedges(struct bpf_scc_visit *visit); +int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, + int insn_flags, u64 linked_regs); +void bpf_mark_reg_not_init(const struct bpf_verifier_env *env, + struct bpf_reg_state *reg); +void bpf_mark_reg_unknown_imprecise(struct bpf_reg_state *reg); +void bpf_mark_all_scalars_precise(struct bpf_verifier_env *env, + struct bpf_verifier_state *st); +void bpf_clear_singular_ids(struct bpf_verifier_env *env, struct bpf_verifier_state *st); +int bpf_mark_chain_precision(struct bpf_verifier_env *env, + struct bpf_verifier_state *starting_state, + int regno, bool *changed); + +static inline int bpf_get_spi(s32 off) +{ + return (-off - 1) / BPF_REG_SIZE; +} + +static inline struct bpf_func_state *bpf_func(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg) +{ + struct bpf_verifier_state *cur = env->cur_state; + + return cur->frame[reg->frameno]; +} + +/* Return IP for a given frame in a call stack */ +static inline u32 bpf_frame_insn_idx(struct bpf_verifier_state *st, u32 frame) +{ + return frame == st->curframe + ? st->insn_idx + : st->frame[frame + 1]->callsite; +} + +static inline bool bpf_is_jmp_point(struct bpf_verifier_env *env, int insn_idx) +{ + return env->insn_aux_data[insn_idx].jmp_point; +} + +static inline bool bpf_is_spilled_reg(const struct bpf_stack_state *stack) +{ + return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL; +} + +static inline bool bpf_register_is_null(struct bpf_reg_state *reg) +{ + return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); +} + +static inline void bpf_bt_set_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg) +{ + bt->reg_masks[frame] |= 1 << reg; +} + +static inline void bpf_bt_set_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot) +{ + bt->stack_masks[frame] |= 1ull << slot; +} + bool bpf_map_is_rdonly(const struct bpf_map *map); int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val, bool is_ldsx); -- cgit v1.2.3 From ed0b9710bd2efbe663d89728cd9c680c31c6a4e3 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sun, 12 Apr 2026 08:29:34 -0700 Subject: bpf: Move backtracking logic to backtrack.c Move precision propagation and backtracking logic to backtrack.c to reduce verifier.c size. No functional changes. Acked-by: Kumar Kartikeya Dwivedi Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260412152936.54262-6-alexei.starovoitov@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 669453386282..5389af612696 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -279,6 +279,8 @@ static inline void spis_or_range(spis_t *mask, u32 lo, u32 hi) (1 << BPF_REG_3) | (1 << BPF_REG_4) | \ (1 << BPF_REG_5)) +#define BPF_MAIN_FUNC (-1) + #define BPF_DYNPTR_SIZE sizeof(struct bpf_dynptr_kern) #define BPF_DYNPTR_NR_SLOTS (BPF_DYNPTR_SIZE / BPF_REG_SIZE) @@ -1079,6 +1081,7 @@ void bpf_free_verifier_state(struct bpf_verifier_state *state, bool free_self); void bpf_free_backedges(struct bpf_scc_visit *visit); int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, int insn_flags, u64 linked_regs); +void bpf_bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist); void bpf_mark_reg_not_init(const struct bpf_verifier_env *env, struct bpf_reg_state *reg); void bpf_mark_reg_unknown_imprecise(struct bpf_reg_state *reg); @@ -1120,6 +1123,11 @@ static inline bool bpf_is_spilled_reg(const struct bpf_stack_state *stack) return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL; } +static inline bool bpf_is_spilled_scalar_reg(const struct bpf_stack_state *stack) +{ + return bpf_is_spilled_reg(stack) && stack->spilled_ptr.type == SCALAR_VALUE; +} + static inline bool bpf_register_is_null(struct bpf_reg_state *reg) { return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); @@ -1135,6 +1143,16 @@ static inline void bpf_bt_set_frame_slot(struct backtrack_state *bt, u32 frame, bt->stack_masks[frame] |= 1ull << slot; } +static inline bool bt_is_frame_reg_set(struct backtrack_state *bt, u32 frame, u32 reg) +{ + return bt->reg_masks[frame] & (1 << reg); +} + +static inline bool bt_is_frame_slot_set(struct backtrack_state *bt, u32 frame, u32 slot) +{ + return bt->stack_masks[frame] & (1ull << slot); +} + bool bpf_map_is_rdonly(const struct bpf_map *map); int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val, bool is_ldsx); -- cgit v1.2.3 From 99a832a2b5b8a8ecc1a2bdd64017892caf4aa096 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sun, 12 Apr 2026 08:29:35 -0700 Subject: bpf: Move BTF checking logic into check_btf.c BTF validation logic is independent from the main verifier. Move it into check_btf.c Acked-by: Kumar Kartikeya Dwivedi Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260412152936.54262-7-alexei.starovoitov@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 5389af612696..53e8664cb566 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -1061,6 +1061,11 @@ static inline void bpf_trampoline_unpack_key(u64 key, u32 *obj_id, u32 *btf_id) *btf_id = key & 0x7FFFFFFF; } +int bpf_check_btf_info_early(struct bpf_verifier_env *env, + const union bpf_attr *attr, bpfptr_t uattr); +int bpf_check_btf_info(struct bpf_verifier_env *env, + const union bpf_attr *attr, bpfptr_t uattr); + int bpf_check_attach_target(struct bpf_verifier_log *log, const struct bpf_prog *prog, const struct bpf_prog *tgt_prog, -- cgit v1.2.3 From c78bcbd51976f123909e5c2baf8cebb699453c2f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Apr 2026 14:56:22 +0000 Subject: net: change sk_filter_reason() to return the reason by value sk_filter_trim_cap will soon return the reason by value, do the same for sk_filter_reason(). $ scripts/bloat-o-meter -t vmlinux.old vmlinux.new add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-21 (-21) Function old new delta sock_queue_rcv_skb_reason 128 126 -2 tun_net_xmit 1146 1127 -19 Total: Before=29722661, After=29722640, chg -0.00% Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20260409145625.2306224-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/filter.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 44d7ae95ddbc..59931e5810b4 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1102,10 +1102,13 @@ static inline int sk_filter(struct sock *sk, struct sk_buff *skb) return sk_filter_trim_cap(sk, skb, 1, &ignore_reason); } -static inline int sk_filter_reason(struct sock *sk, struct sk_buff *skb, - enum skb_drop_reason *reason) +static inline enum skb_drop_reason +sk_filter_reason(struct sock *sk, struct sk_buff *skb) { - return sk_filter_trim_cap(sk, skb, 1, reason); + enum skb_drop_reason drop_reason; + + sk_filter_trim_cap(sk, skb, 1, &drop_reason); + return drop_reason; } struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err); -- cgit v1.2.3 From fb37aea2a00e67ef5264ea39371d350a1d19b24f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Apr 2026 14:56:24 +0000 Subject: net: change sk_filter_trim_cap() to return a drop_reason by value Current return value can be replaced with the drop_reason, reducing kernel bloat: $ scripts/bloat-o-meter -t vmlinux.old vmlinux.new add/remove: 0/2 grow/shrink: 1/11 up/down: 32/-603 (-571) Function old new delta tcp_v6_rcv 3135 3167 +32 unix_dgram_sendmsg 1731 1726 -5 netlink_unicast 957 945 -12 netlink_dump 1372 1359 -13 sk_filter_trim_cap 882 858 -24 tcp_v4_rcv 3143 3111 -32 __pfx_tcp_filter 32 - -32 netlink_broadcast_filtered 1633 1595 -38 sock_queue_rcv_skb_reason 126 76 -50 tun_net_xmit 1127 1074 -53 __sk_receive_skb 690 632 -58 udpv6_queue_rcv_one_skb 935 869 -66 udp_queue_rcv_one_skb 919 853 -66 tcp_filter 154 - -154 Total: Before=29722783, After=29722212, chg -0.00% Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20260409145625.2306224-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/filter.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 59931e5810b4..5ac08aa70123 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1092,23 +1092,21 @@ bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) return set_memory_rox((unsigned long)hdr, hdr->size >> PAGE_SHIFT); } -int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap, - enum skb_drop_reason *reason); +enum skb_drop_reason +sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); static inline int sk_filter(struct sock *sk, struct sk_buff *skb) { - enum skb_drop_reason ignore_reason; + enum skb_drop_reason drop_reason; - return sk_filter_trim_cap(sk, skb, 1, &ignore_reason); + drop_reason = sk_filter_trim_cap(sk, skb, 1); + return drop_reason ? -EPERM : 0; } static inline enum skb_drop_reason sk_filter_reason(struct sock *sk, struct sk_buff *skb) { - enum skb_drop_reason drop_reason; - - sk_filter_trim_cap(sk, skb, 1, &drop_reason); - return drop_reason; + return sk_filter_trim_cap(sk, skb, 1); } struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err); -- cgit v1.2.3 From 105369d627b946f6a05f25e9c399167b1674d4bc Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Fri, 10 Apr 2026 13:49:49 +0800 Subject: pppox: remove sk_pppox() helper The sk member can be directly accessed from struct pppox_sock without relying on type casting. Remove the sk_pppox() helper and update all call sites to use po->sk directly. Signed-off-by: Qingfang Deng Link: https://patch.msgid.link/20260410054954.114031-1-qingfang.deng@linux.dev Signed-off-by: Jakub Kicinski --- include/linux/if_pppox.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h index 8bbf676c2a85..636772693f9a 100644 --- a/include/linux/if_pppox.h +++ b/include/linux/if_pppox.h @@ -57,11 +57,6 @@ static inline struct pppox_sock *pppox_sk(struct sock *sk) return (struct pppox_sock *)sk; } -static inline struct sock *sk_pppox(struct pppox_sock *po) -{ - return (struct sock *)po; -} - struct module; struct pppox_proto { -- cgit v1.2.3 From 6bc78039a77a46d89df987813fbafe333cd81367 Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Fri, 10 Apr 2026 13:49:50 +0800 Subject: pppox: convert pppox_sk() to use container_of() Use container_of() macro instead of direct pointer casting to get the pppox_sock from a sock pointer. Signed-off-by: Qingfang Deng Link: https://patch.msgid.link/20260410054954.114031-2-qingfang.deng@linux.dev Signed-off-by: Jakub Kicinski --- include/linux/if_pppox.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h index 636772693f9a..594d6dc3f4c9 100644 --- a/include/linux/if_pppox.h +++ b/include/linux/if_pppox.h @@ -54,7 +54,7 @@ struct pppox_sock { static inline struct pppox_sock *pppox_sk(struct sock *sk) { - return (struct pppox_sock *)sk; + return container_of(sk, struct pppox_sock, sk); } struct module; -- cgit v1.2.3 From 0bd75b7abafb3ed199df830c539c57ef9b62c2a2 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 10 Apr 2026 14:49:12 +0200 Subject: mailbox: prefix new constants with MBOX_ Commit 89e5d7d61600 ("mailbox: remove superfluous internal header") moved some constants to a public header but forgot to add a mailbox specific prefix. Add this now to prevent future collisions on a too generic naming. Link: https://sashiko.dev/#/patchset/20260327151112.5202-2-wsa%2Brenesas%40sang-engineering.com Signed-off-by: Wolfram Sang Reviewed-by: Sudeep Holla Signed-off-by: Jassi Brar --- include/linux/mailbox_controller.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mailbox_controller.h b/include/linux/mailbox_controller.h index e3896b08f22e..a49ee687d4cf 100644 --- a/include/linux/mailbox_controller.h +++ b/include/linux/mailbox_controller.h @@ -15,9 +15,9 @@ struct mbox_chan; /* Sentinel value distinguishing "no active request" from "NULL message data" */ #define MBOX_NO_MSG ((void *)-1) -#define TXDONE_BY_IRQ BIT(0) /* controller has remote RTR irq */ -#define TXDONE_BY_POLL BIT(1) /* controller can read status of last TX */ -#define TXDONE_BY_ACK BIT(2) /* S/W ACK received by Client ticks the TX */ +#define MBOX_TXDONE_BY_IRQ BIT(0) /* controller has remote RTR irq */ +#define MBOX_TXDONE_BY_POLL BIT(1) /* controller can read status of last TX */ +#define MBOX_TXDONE_BY_ACK BIT(2) /* S/W ACK received by Client ticks the TX */ /** * struct mbox_chan_ops - methods to control mailbox channels -- cgit v1.2.3 From 0ec7f158dc01e354ba83d808e46346dba826e353 Mon Sep 17 00:00:00 2001 From: Marco Nenciarini Date: Wed, 1 Apr 2026 22:36:38 +0200 Subject: platform/x86: int3472: Add support for GPIO type 0x02 (IR flood LED) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for GPIO type 0x02, which controls an IR flood LED used for face authentication on some laptops (e.g. Dell Pro Max 16 Premium). Without this patch, the kernel logs "GPIO type 0x02 unknown; the sensor may not work" and IR sensors paired with a flood LED cannot function. The flood LED is registered through the LED subsystem like the existing privacy LED, including a lookup entry to allow future consumer drivers to find and control it via led_get(). To support multiple LEDs per INT3472 device, convert the single led struct member to an array with a counter. Signed-off-by: Marco Nenciarini Reviewed-by: Andy Shevchenko Reviewed-by: Hans de Goede Link: https://patch.msgid.link/20260401203638.1601661-5-mnencia@kcore.it Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/x86/int3472.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/int3472.h b/include/linux/platform_data/x86/int3472.h index ebf4d0637624..93f1e1fe09b4 100644 --- a/include/linux/platform_data/x86/int3472.h +++ b/include/linux/platform_data/x86/int3472.h @@ -23,6 +23,7 @@ /* PMIC GPIO Types */ #define INT3472_GPIO_TYPE_RESET 0x00 #define INT3472_GPIO_TYPE_POWERDOWN 0x01 +#define INT3472_GPIO_TYPE_STROBE 0x02 #define INT3472_GPIO_TYPE_POWER_ENABLE 0x0b #define INT3472_GPIO_TYPE_CLK_ENABLE 0x0c #define INT3472_GPIO_TYPE_PRIVACY_LED 0x0d @@ -32,6 +33,7 @@ #define INT3472_PDEV_MAX_NAME_LEN 23 #define INT3472_MAX_SENSOR_GPIOS 3 +#define INT3472_MAX_LEDS 2 #define INT3472_MAX_REGULATORS 3 /* E.g. "dovdd\0" */ @@ -127,11 +129,12 @@ struct int3472_discrete_device { struct led_lookup_data lookup; char name[INT3472_LED_MAX_NAME_LEN]; struct gpio_desc *gpio; - } led; + } leds[INT3472_MAX_LEDS]; struct int3472_discrete_quirks quirks; unsigned int ngpios; /* how many GPIOs have we seen */ + unsigned int n_leds; /* how many LEDs have we registered */ unsigned int n_sensor_gpios; /* how many have we mapped to sensor */ unsigned int n_regulator_gpios; /* how many have we mapped to a regulator */ struct gpiod_lookup_table gpios; @@ -163,6 +166,6 @@ void skl_int3472_unregister_regulator(struct int3472_discrete_device *int3472); int skl_int3472_register_led(struct int3472_discrete_device *int3472, struct gpio_desc *gpio, const char *con_id); -void skl_int3472_unregister_led(struct int3472_discrete_device *int3472); +void skl_int3472_unregister_leds(struct int3472_discrete_device *int3472); #endif -- cgit v1.2.3 From 7e2d964f417ec13763eecfecc5d2813f63cb8da0 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Mon, 6 Apr 2026 22:32:32 +0200 Subject: platform/wmi: Add wmidev_invoke_procedure() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some WMI methods return no values, so the whole postprocessing of the result data is not needed for them. Add a special function for calling such WMI methods to prepare for future changes of the main wmidev_invoke_method() function. Signed-off-by: Armin Wolf Link: https://patch.msgid.link/20260406203237.2970-2-W_Armin@gmx.de Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/wmi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wmi.h b/include/linux/wmi.h index 75cb0c7cfe57..b00950dc1231 100644 --- a/include/linux/wmi.h +++ b/include/linux/wmi.h @@ -70,6 +70,9 @@ ssize_t wmi_string_from_utf8s(struct wmi_string *str, size_t max_chars, const u8 int wmidev_invoke_method(struct wmi_device *wdev, u8 instance, u32 method_id, const struct wmi_buffer *in, struct wmi_buffer *out); +int wmidev_invoke_procedure(struct wmi_device *wdev, u8 instance, u32 method_id, + const struct wmi_buffer *in); + int wmidev_query_block(struct wmi_device *wdev, u8 instance, struct wmi_buffer *out); int wmidev_set_block(struct wmi_device *wdev, u8 instance, const struct wmi_buffer *in); -- cgit v1.2.3 From 96b1b053e10d89f666a37b52be25ed4294e342be Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Mon, 6 Apr 2026 22:32:35 +0200 Subject: platform/wmi: Extend wmidev_invoke_method() to reject undersized data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit WMI drivers using the buffer-based WMI API are expected to reject undersized method return values. Extend wmidev_invoke_method() to enable the WMI driver core to perform this size check internally. Signed-off-by: Armin Wolf Link: https://patch.msgid.link/20260406203237.2970-5-W_Armin@gmx.de Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/wmi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/wmi.h b/include/linux/wmi.h index b00950dc1231..858398beb01a 100644 --- a/include/linux/wmi.h +++ b/include/linux/wmi.h @@ -68,7 +68,7 @@ ssize_t wmi_string_from_utf8s(struct wmi_string *str, size_t max_chars, const u8 size_t src_length); int wmidev_invoke_method(struct wmi_device *wdev, u8 instance, u32 method_id, - const struct wmi_buffer *in, struct wmi_buffer *out); + const struct wmi_buffer *in, struct wmi_buffer *out, size_t min_size); int wmidev_invoke_procedure(struct wmi_device *wdev, u8 instance, u32 method_id, const struct wmi_buffer *in); -- cgit v1.2.3 From 1aeded2f55f04fafb07b01e12142fd20c2a3d288 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Mon, 6 Apr 2026 22:32:36 +0200 Subject: platform/wmi: Extend wmidev_query_block() to reject undersized data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit WMI drivers using the buffer-based WMI API are expected to reject undersized query results. Extend wmidev_query_block() to enable the WMI driver core to perform this size check internally. Signed-off-by: Armin Wolf Link: https://patch.msgid.link/20260406203237.2970-6-W_Armin@gmx.de Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/wmi.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/wmi.h b/include/linux/wmi.h index 858398beb01a..da94580572a9 100644 --- a/include/linux/wmi.h +++ b/include/linux/wmi.h @@ -73,7 +73,8 @@ int wmidev_invoke_method(struct wmi_device *wdev, u8 instance, u32 method_id, int wmidev_invoke_procedure(struct wmi_device *wdev, u8 instance, u32 method_id, const struct wmi_buffer *in); -int wmidev_query_block(struct wmi_device *wdev, u8 instance, struct wmi_buffer *out); +int wmidev_query_block(struct wmi_device *wdev, u8 instance, struct wmi_buffer *out, + size_t min_size); int wmidev_set_block(struct wmi_device *wdev, u8 instance, const struct wmi_buffer *in); -- cgit v1.2.3 From 2e2a39149fe37327e0af225f09cad19526a90d48 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Mon, 6 Apr 2026 22:32:37 +0200 Subject: platform/wmi: Replace .no_notify_data with .min_event_size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit WMI drivers using the buffer-based WMI API are expected to reject undersized event payloads. Extend the WMI driver core to allow such drivers to specify their minimum supported event payload size. Also remove the now redundant .no_notify_data field. Signed-off-by: Armin Wolf Link: https://patch.msgid.link/20260406203237.2970-7-W_Armin@gmx.de Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/wmi.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wmi.h b/include/linux/wmi.h index da94580572a9..2d242575a8b3 100644 --- a/include/linux/wmi.h +++ b/include/linux/wmi.h @@ -91,7 +91,7 @@ u8 wmidev_instance_count(struct wmi_device *wdev); * struct wmi_driver - WMI driver structure * @driver: Driver model structure * @id_table: List of WMI GUIDs supported by this driver - * @no_notify_data: Driver supports WMI events which provide no event data + * @min_event_size: Minimum event payload size supported by this driver * @no_singleton: Driver can be instantiated multiple times * @probe: Callback for device binding * @remove: Callback for device unbinding @@ -101,11 +101,14 @@ u8 wmidev_instance_count(struct wmi_device *wdev); * * This represents WMI drivers which handle WMI devices. The data inside the buffer * passed to the @notify_new callback is guaranteed to be aligned on a 8-byte boundary. + * The minimum supported size for said buffer can be specified using @min_event_size. + * WMI drivers that still use the deprecated @notify callback can still set @min_event_size + * to 0 in order to signal that they support WMI events which provide no event data. */ struct wmi_driver { struct device_driver driver; const struct wmi_device_id *id_table; - bool no_notify_data; + size_t min_event_size; bool no_singleton; int (*probe)(struct wmi_device *wdev, const void *context); -- cgit v1.2.3 From 3faf0ce6e499dfd32e596bcb5bca2c44d64f4cc1 Mon Sep 17 00:00:00 2001 From: Marc Harvey Date: Thu, 9 Apr 2026 02:59:23 +0000 Subject: net: team: Annotate reads and writes for mixed lock accessed values The team_port's "index" and the team's "en_port_count" are read in the hot transmit path, but are only written to when holding the rtnl lock. Use READ_ONCE() for all lockless reads of these values, and use WRITE_ONCE() for all writes. Reviewed-by: Jiri Pirko Signed-off-by: Marc Harvey Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260409-teaming-driver-internal-v7-1-f47e7589685d@google.com Signed-off-by: Paolo Abeni --- include/linux/if_team.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_team.h b/include/linux/if_team.h index ccb5327de26d..06f4d7400c1e 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -77,7 +77,7 @@ static inline struct team_port *team_port_get_rcu(const struct net_device *dev) static inline bool team_port_enabled(struct team_port *port) { - return port->index != -1; + return READ_ONCE(port->index) != -1; } static inline bool team_port_txable(struct team_port *port) @@ -272,7 +272,7 @@ static inline struct team_port *team_get_port_by_index_rcu(struct team *team, struct hlist_head *head = team_port_index_hash(team, port_index); hlist_for_each_entry_rcu(port, head, hlist) - if (port->index == port_index) + if (READ_ONCE(port->index) == port_index) return port; return NULL; } -- cgit v1.2.3 From 014f249121d73909528df320818fba7693d0ec92 Mon Sep 17 00:00:00 2001 From: Marc Harvey Date: Thu, 9 Apr 2026 02:59:24 +0000 Subject: net: team: Remove unused team_mode_op, port_enabled This team_mode_op wasn't used by any of the team modes, so remove it. Reviewed-by: Jiri Pirko Signed-off-by: Marc Harvey Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260409-teaming-driver-internal-v7-2-f47e7589685d@google.com Signed-off-by: Paolo Abeni --- include/linux/if_team.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/if_team.h b/include/linux/if_team.h index 06f4d7400c1e..a761f5282bcf 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -121,7 +121,6 @@ struct team_mode_ops { int (*port_enter)(struct team *team, struct team_port *port); void (*port_leave)(struct team *team, struct team_port *port); void (*port_change_dev_addr)(struct team *team, struct team_port *port); - void (*port_enabled)(struct team *team, struct team_port *port); void (*port_disabled)(struct team *team, struct team_port *port); }; -- cgit v1.2.3 From cfa477df2cc62ba53cb936669886361152b594a7 Mon Sep 17 00:00:00 2001 From: Marc Harvey Date: Thu, 9 Apr 2026 02:59:25 +0000 Subject: net: team: Rename port_disabled team mode op to port_tx_disabled This team mode op is only used by the load balance mode, and it only uses it in the tx path. Reviewed-by: Jiri Pirko Signed-off-by: Marc Harvey Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260409-teaming-driver-internal-v7-3-f47e7589685d@google.com Signed-off-by: Paolo Abeni --- include/linux/if_team.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/if_team.h b/include/linux/if_team.h index a761f5282bcf..740cb3100dfc 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -121,7 +121,7 @@ struct team_mode_ops { int (*port_enter)(struct team *team, struct team_port *port); void (*port_leave)(struct team *team, struct team_port *port); void (*port_change_dev_addr)(struct team *team, struct team_port *port); - void (*port_disabled)(struct team *team, struct team_port *port); + void (*port_tx_disabled)(struct team *team, struct team_port *port); }; extern int team_modeop_port_enter(struct team *team, struct team_port *port); -- cgit v1.2.3 From fa6ed31dd913b0f68c75ec80c3f4a324572071fc Mon Sep 17 00:00:00 2001 From: Marc Harvey Date: Thu, 9 Apr 2026 02:59:28 +0000 Subject: net: team: Rename enablement functions and struct members to tx Add no functional changes, but rename enablement functions, variables etc. that are used in teaming driver transmit decisions. Since rx and tx enablement are still coupled, some of the variables renamed in this patch are still used for the rx path, but that will change in a follow-up patch. Reviewed-by: Jiri Pirko Signed-off-by: Marc Harvey Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260409-teaming-driver-internal-v7-6-f47e7589685d@google.com Signed-off-by: Paolo Abeni --- include/linux/if_team.h | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_team.h b/include/linux/if_team.h index 740cb3100dfc..c777170ef552 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -27,10 +27,10 @@ struct team; struct team_port { struct net_device *dev; - struct hlist_node hlist; /* node in enabled ports hash list */ + struct hlist_node tx_hlist; /* node in tx-enabled ports hash list */ struct list_head list; /* node in ordinary list */ struct team *team; - int index; /* index of enabled port. If disabled, it's set to -1 */ + int tx_index; /* index of tx enabled port. If disabled, -1 */ bool linkup; /* either state.linkup or user.linkup */ @@ -77,7 +77,7 @@ static inline struct team_port *team_port_get_rcu(const struct net_device *dev) static inline bool team_port_enabled(struct team_port *port) { - return READ_ONCE(port->index) != -1; + return READ_ONCE(port->tx_index) != -1; } static inline bool team_port_txable(struct team_port *port) @@ -190,10 +190,10 @@ struct team { const struct header_ops *header_ops_cache; /* - * List of enabled ports and their count + * List of tx-enabled ports and counts of rx and tx-enabled ports. */ - int en_port_count; - struct hlist_head en_port_hlist[TEAM_PORT_HASHENTRIES]; + int tx_en_port_count; + struct hlist_head tx_en_port_hlist[TEAM_PORT_HASHENTRIES]; struct list_head port_list; /* list of all ports */ @@ -237,41 +237,43 @@ static inline int team_dev_queue_xmit(struct team *team, struct team_port *port, return dev_queue_xmit(skb); } -static inline struct hlist_head *team_port_index_hash(struct team *team, - int port_index) +static inline struct hlist_head *team_tx_port_index_hash(struct team *team, + int tx_port_index) { - return &team->en_port_hlist[port_index & (TEAM_PORT_HASHENTRIES - 1)]; + unsigned int list_entry = tx_port_index & (TEAM_PORT_HASHENTRIES - 1); + + return &team->tx_en_port_hlist[list_entry]; } -static inline struct team_port *team_get_port_by_index(struct team *team, - int port_index) +static inline struct team_port *team_get_port_by_tx_index(struct team *team, + int tx_port_index) { + struct hlist_head *head = team_tx_port_index_hash(team, tx_port_index); struct team_port *port; - struct hlist_head *head = team_port_index_hash(team, port_index); - hlist_for_each_entry(port, head, hlist) - if (port->index == port_index) + hlist_for_each_entry(port, head, tx_hlist) + if (port->tx_index == tx_port_index) return port; return NULL; } static inline int team_num_to_port_index(struct team *team, unsigned int num) { - int en_port_count = READ_ONCE(team->en_port_count); + int tx_en_port_count = READ_ONCE(team->tx_en_port_count); - if (unlikely(!en_port_count)) + if (unlikely(!tx_en_port_count)) return 0; - return num % en_port_count; + return num % tx_en_port_count; } -static inline struct team_port *team_get_port_by_index_rcu(struct team *team, - int port_index) +static inline struct team_port *team_get_port_by_tx_index_rcu(struct team *team, + int tx_port_index) { + struct hlist_head *head = team_tx_port_index_hash(team, tx_port_index); struct team_port *port; - struct hlist_head *head = team_port_index_hash(team, port_index); - hlist_for_each_entry_rcu(port, head, hlist) - if (READ_ONCE(port->index) == port_index) + hlist_for_each_entry_rcu(port, head, tx_hlist) + if (READ_ONCE(port->tx_index) == tx_port_index) return port; return NULL; } -- cgit v1.2.3 From 68f0833f279ac209ec865da76568c843dd38c508 Mon Sep 17 00:00:00 2001 From: Marc Harvey Date: Thu, 9 Apr 2026 02:59:29 +0000 Subject: net: team: Track rx enablement separately from tx enablement Separate the rx and tx enablement/disablement into different functions so that it is easier to interact with them independently later. Although this patch changes receive and transmit paths, the actual behavior of the teaming driver should remain unchanged, since there is no option introduced yet to change rx or tx enablement independently. Those options will be added in follow-up patches. Reviewed-by: Jiri Pirko Signed-off-by: Marc Harvey Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260409-teaming-driver-internal-v7-7-f47e7589685d@google.com Signed-off-by: Paolo Abeni --- include/linux/if_team.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_team.h b/include/linux/if_team.h index c777170ef552..3d21e06fda67 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -31,6 +31,7 @@ struct team_port { struct list_head list; /* node in ordinary list */ struct team *team; int tx_index; /* index of tx enabled port. If disabled, -1 */ + bool rx_enabled; bool linkup; /* either state.linkup or user.linkup */ @@ -75,14 +76,24 @@ static inline struct team_port *team_port_get_rcu(const struct net_device *dev) return rcu_dereference(dev->rx_handler_data); } -static inline bool team_port_enabled(struct team_port *port) +static inline bool team_port_rx_enabled(struct team_port *port) +{ + return READ_ONCE(port->rx_enabled); +} + +static inline bool team_port_tx_enabled(struct team_port *port) { return READ_ONCE(port->tx_index) != -1; } +static inline bool team_port_enabled(struct team_port *port) +{ + return team_port_rx_enabled(port) && team_port_tx_enabled(port); +} + static inline bool team_port_txable(struct team_port *port) { - return port->linkup && team_port_enabled(port); + return port->linkup && team_port_tx_enabled(port); } static inline bool team_port_dev_txable(const struct net_device *port_dev) @@ -193,6 +204,7 @@ struct team { * List of tx-enabled ports and counts of rx and tx-enabled ports. */ int tx_en_port_count; + int rx_en_port_count; struct hlist_head tx_en_port_hlist[TEAM_PORT_HASHENTRIES]; struct list_head port_list; /* list of all ports */ -- cgit v1.2.3 From b025461303d87923abfaae6cc07ba8a83ddfd844 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 7 Apr 2026 17:14:38 -0700 Subject: tcp: update window_clamp when SO_RCVBUF is set Commit under Fixes moved recomputing the window clamp to tcp_measure_rcv_mss() (when scaling_ratio changes). I suspect it missed the fact that we don't recompute the clamp when rcvbuf is set. Until scaling_ratio changes we are stuck with the old window clamp which may be based on the small initial buffer. scaling_ratio may never change. Inspired by Eric's recent commit d1361840f8c5 ("tcp: fix SO_RCVLOWAT and RCVBUF autotuning") plumb the user action thru to TCP and have it update the clamp. A smaller fix would be to just have tcp_rcvbuf_grow() adjust the clamp even if SOCK_RCVBUF_LOCK is set. But IIUC this is what we were trying to get away from in the first place. Fixes: a2cbb1603943 ("tcp: Update window clamping condition") Signed-off-by: Jakub Kicinski Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260408001438.129165-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- include/linux/net.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index a8e818de95b3..ca6a7bc5c9ae 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -223,6 +223,7 @@ struct proto_ops { int (*sendmsg_locked)(struct sock *sk, struct msghdr *msg, size_t size); int (*set_rcvlowat)(struct sock *sk, int val); + void (*set_rcvbuf)(struct sock *sk, int val); }; #define DECLARE_SOCKADDR(type, dst, src) \ -- cgit v1.2.3 From 9c332d7f63401c3ff1765c9998531b3784f3f9a4 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 24 Mar 2026 13:32:12 -0400 Subject: nfs: update inode ctime after removexattr operation xfstest generic/728 fails with delegated timestamps. The client does a removexattr and then a stat to test the ctime, which doesn't change. The stat() doesn't trigger a GETATTR because of the delegated timestamps, so it relies on the cached ctime, which is wrong. The setxattr compound has a trailing GETATTR, which ensures that its ctime gets updated. Follow the same strategy with removexattr. Fixes: 3e1f02123fba ("NFSv4.2: add client side XDR handling for extended attributes") Reported-by: Olga Kornievskaia Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- include/linux/nfs_xdr.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index ff1f12aa73d2..fcbd21b5685f 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1611,12 +1611,15 @@ struct nfs42_listxattrsres { struct nfs42_removexattrargs { struct nfs4_sequence_args seq_args; struct nfs_fh *fh; + const u32 *bitmask; const char *xattr_name; }; struct nfs42_removexattrres { struct nfs4_sequence_res seq_res; struct nfs4_change_info cinfo; + struct nfs_fattr *fattr; + const struct nfs_server *server; }; #endif /* CONFIG_NFS_V4_2 */ -- cgit v1.2.3 From 765bde47fe7f197dabeb12da76831f40d0b20377 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 6 Mar 2026 16:56:24 -0500 Subject: xprtrdma: Close lost-wakeup race in xprt_rdma_alloc_slot xprt_rdma_alloc_slot() and xprt_rdma_free_slot() lack serialization between the buffer pool and the backlog queue. A buffer freed after rpcrdma_buffer_get() finds the pool empty but before rpc_sleep_on() places the task on the backlog is returned to the pool with no waiter to wake, leaving the task stuck on the backlog indefinitely. After joining the backlog, re-check the pool and route any recovered buffer through xprt_wake_up_backlog(), whose queue lock serializes with concurrent wakeups and avoids double-assignment of slots. Because xprt_rdma_free_slot() does not hold reserve_lock, the XPRT_CONGESTED double-check in xprt_throttle_congested() is ineffective: a task can join the backlog through that path after free_slot has already found it empty and cleared the bit. Avoid this by using xprt_add_backlog_noncongested(), which queues the task without setting XPRT_CONGESTED, so every allocation reaches xprt_rdma_alloc_slot() and its post-sleep re-check. Fixes: edb41e61a54e ("xprtrdma: Make rpc_rqst part of rpcrdma_req") Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index f46d1fb8f71a..a82045804d34 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -404,6 +404,8 @@ struct rpc_xprt * xprt_alloc(struct net *net, size_t size, unsigned int max_req); void xprt_free(struct rpc_xprt *); void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task); +void xprt_add_backlog_noncongested(struct rpc_xprt *xprt, + struct rpc_task *task); bool xprt_wake_up_backlog(struct rpc_xprt *xprt, struct rpc_rqst *req); void xprt_cleanup_ids(void); -- cgit v1.2.3 From 67fab22a7adcec0279b9b057eb3dc669e32834f0 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 8 Apr 2026 03:30:29 -0700 Subject: net: add getsockopt_iter callback to proto_ops Add a new getsockopt_iter callback to struct proto_ops that uses sockopt_t, a type-safe wrapper around iov_iter. This provides a clean interface for socket option operations that works with both user and kernel buffers. The sockopt_t type encapsulates an iov_iter and an optlen field. The optlen field, although not suggested by Linus, serves as both input (buffer size) and output (returned data size), allowing callbacks to return random values independent of the bytes written via copy_to_iter(), so, keep it separated from iov_iter.count. This is preparatory work for removing the SOL_SOCKET level restriction from io_uring getsockopt operations. Keep in mind that both iter_out and iter_in always point to the same data at all times, and we just have two of them to make the callback implementation sane. Suggested-by: Linus Torvalds Signed-off-by: Breno Leitao Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20260408-getsockopt-v3-1-061bb9cb355d@debian.org Signed-off-by: Jakub Kicinski --- include/linux/net.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index ca6a7bc5c9ae..f268f395ce47 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -23,9 +23,30 @@ #include #include #include +#include #include +/** + * struct sockopt - socket option value container + * @iter_in: iov_iter for reading optval with the content from the caller. + * Use copy_from_iter() given this iov direction is ITER_SOURCE + * @iter_out: iov_iter for protocols to update optval data to userspace + * Use _copy_to_iter() given iov direction is ITER_DEST + * @optlen: serves as both input (buffer size) and output (returned data size). + * + * Type-safe wrapper for socket option data that works with both + * user and kernel buffers. + * + * The optlen field allows callbacks to return a specific length value + * independent of the bytes written via copy_to_iter(). + */ +typedef struct sockopt { + struct iov_iter iter_in; + struct iov_iter iter_out; + int optlen; +} sockopt_t; + struct poll_table_struct; struct pipe_inode_info; struct inode; @@ -192,6 +213,8 @@ struct proto_ops { unsigned int optlen); int (*getsockopt)(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen); + int (*getsockopt_iter)(struct socket *sock, int level, + int optname, sockopt_t *opt); void (*show_fdinfo)(struct seq_file *m, struct socket *sock); int (*sendmsg) (struct socket *sock, struct msghdr *m, size_t total_len); -- cgit v1.2.3 From 0d5acba6331c326f394a677daf49a67f44a0416a Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 9 Apr 2026 14:52:32 -0700 Subject: Drivers: hv: vmbus: Export hv_vmbus_exists() and use it in pci-hyperv With commit f84b21da3624 ("PCI: hv: Don't load the driver for baremetal root partition"), the bare metal Linux root partition won't use the pci-hyperv driver, but when a Linux VM runs on the Linux root partition, pci-hyperv's module_init function init_hv_pci_drv() can still run, e.g. in the case of CONFIG_PCI_HYPERV=y, even if the VMBus driver is not used in such a VM (i.e. the hv_vmbus driver's init function returns -ENODEV due to vmbus_root_device being NULL). In such a Linux VM, init_hv_pci_drv() runs with a side effect: the 3 hvpci_block_ops callbacks are set to functions that depend on hv_vmbus. Later, when the MLX driver in such a VM invokes the callbacks, e.g. in drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c: mlx5_hv_register_invalidate(), hvpci_block_ops.reg_blk_invalidate() is hv_register_block_invalidate() rather than a NULL function pointer, and hv_register_block_invalidate() assumes that it can find a struct hv_pcibus_device from pdev->bus->sysdata, which is false in such a VM. Consequently, hv_register_block_invalidate() -> get_pcichild_wslot() -> spin_lock_irqsave() may hang since it can be accessing an invalid spinlock pointer. Fix the issue by exporting hv_vmbus_exists() and using it in pci-hyperv: hv_root_partition() is true and hv_nested is false ==> hv_vmbus_exists() is false. hv_root_partition() is true and hv_nested is true ==> hv_vmbus_exists() is true. hv_root_partition() is false ==> hv_vmbus_exists() is true. While at it, rename vmbus_exists() to hv_vmbus_exists() to follow the convention that all public functions have the hv_ prefix; also change the return value's type from int to bool to make the code more readable; also move the two pr_info() calls. Reported-by: Mukesh Rathor Signed-off-by: Dexuan Cui Signed-off-by: Wei Liu --- include/linux/hyperv.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index dfc516c1c719..5459e776ec17 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1304,6 +1304,8 @@ static inline void *hv_get_drvdata(struct hv_device *dev) struct device *hv_get_vmbus_root_device(void); +bool hv_vmbus_exists(void); + struct hv_ring_buffer_debug_info { u32 current_interrupt_mask; u32 current_read_index; -- cgit v1.2.3 From 5b484311507b5d403c1f7a45f6aa3778549e268b Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Mon, 13 Apr 2026 19:59:11 -0700 Subject: driver core: Add kernel-doc for DEV_FLAG_COUNT enum value Even though nobody should use this value (except when declaring the "flags" bitmap), kernel-doc still gets upset that it's not documented. It reports: WARNING: ../include/linux/device.h:519 Enum value 'DEV_FLAG_COUNT' not described in enum 'struct_device_flags' Add the description of DEV_FLAG_COUNT. Fixes: a2225b6e834a ("driver core: Don't let a device probe until it's ready") Reported-by: Randy Dunlap Closes: https://lore.kernel.org/f318cd43-81fd-48b9-abf7-92af85f12f91@infradead.org Signed-off-by: Douglas Anderson Tested-by: Randy Dunlap Reviewed-by: Randy Dunlap Link: https://patch.msgid.link/20260413195910.1.I23aca74fe2d3636a47df196a80920fecb2643220@changeid Signed-off-by: Danilo Krummrich --- include/linux/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index f27ed6eb87a9..ac972e7bead4 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -466,6 +466,7 @@ struct device_physical_location { * * @DEV_FLAG_READY_TO_PROBE: If set then device_add() has finished enough * initialization that probe could be called. + * @DEV_FLAG_COUNT: Number of defined struct_device_flags. */ enum struct_device_flags { DEV_FLAG_READY_TO_PROBE = 0, -- cgit v1.2.3 From 15e9e00a5aa4f56ca1cff7749c166e072d7cb6ac Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 14 Apr 2026 11:37:21 -0300 Subject: vfs: get rid of BUG_ON() in d_mark_tmpfile_name() Do proper error handling in d_mark_tmpfile_name() by returning errors rather than using BUG_ON()'s. Adjust caller to check for errors from d_mark_tmpfile_name() as well as clean it up for using return value from scnprintf() in QSTR_LEN() to make it more obvious where the tmpfile name's length is coming from. Link: https://lore.kernel.org/r/CAHk-=wgerpUKCDhdzKH0FEdLyfhj3doc9t+kO9Yb6rSsTp7hdQ@mail.gmail.com Suggested-by: Linus Torvalds Reviewed-by: David Howells Signed-off-by: Paulo Alcantara (Red Hat) Cc: Christian Brauner Cc: Jan Kara CC: linux-fsdevel@vger.kernel.org Cc: linux-cifs@vger.kernel.org Signed-off-by: Steve French --- include/linux/dcache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index f60819dcfebd..c5bd5a74baba 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -264,7 +264,7 @@ extern void d_invalidate(struct dentry *); extern struct dentry * d_make_root(struct inode *); extern void d_mark_tmpfile(struct file *, struct inode *); -void d_mark_tmpfile_name(struct file *file, const struct qstr *name); +int d_mark_tmpfile_name(struct file *file, const struct qstr *name); extern void d_tmpfile(struct file *, struct inode *); extern struct dentry *d_find_alias(struct inode *); -- cgit v1.2.3 From 1f5ffc672165ff851063a5fd044b727ab2517ae3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 14 Apr 2026 23:03:02 -0700 Subject: Fix mismerge of the arm64 / timer-core interrupt handling changes Commit c43267e6794a ("Merge tag 'arm64-upstream' of git://...") had a conflict in the irq entry/exit code due to commit c5538d0141b3 ("entry: Split kernel mode logic from irqentry_{enter,exit}()") having moved the core code in irqentry_enter/exit() from kernel/entry/common.c into helper inline functions in include/linux/irq-entry-common.h. On the other side of the merge, the timer-core code had introduced deferred hrtimer rearming infrastructure in commit 0e98eb14814e ("entry: Prepare for deferred hrtimer rearming"), adding two calls to hrtimer_rearm_deferred() in irqentry_enter(). When merging the two, moving the two calls to the new location wasn't a problem, but afterwards I had made the mistake of looking what had happened in linux-next. And linux-next had a very different merge resolution in commit 04f02dc3ea74 ("Merge tag 'entry-for-arm64-26-04-08' into sched/hrtick"), which had unified the two calls into one single call-site in irqentry_exit_to_kernel_mode_preempt(). And that merge resolution looked cleverer than the straightforward one I had done, so I re-did my merge the way it had been done in linux-next. But it turns out nobody apparently tests linux-next, and the merge in linux-next was just wrong. The difference is that hrtimer_rearm_deferred() doesn't get called at all for the case when state.exit_rcu is true, and the boot will typically fail due to timers not triggering correctly. So this undoes the "clever" merge, and does the straightforward one instead. Fixes: c43267e6794a ("Merge tag 'arm64-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux" Reported-and-tested-by: Alexei Starovoitov Link: https://lore.kernel.org/all/CAADnVQJ=MoiX4=guPWhL9vtnAELkpNx=GNm8RA1-aV424UFz2A@mail.gmail.com/ Link: https://lore.kernel.org/all/CAHk-=wg8+BER4VyFKG3rnPi2gXxbf-jbHS=EU+xhFqGVQfbutw@mail.gmail.com/ Signed-off-by: Linus Torvalds --- include/linux/irq-entry-common.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index 7ab41eec549f..167fba7dbf04 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -474,8 +474,6 @@ static inline void irqentry_exit_to_kernel_mode_preempt(struct pt_regs *regs, if (IS_ENABLED(CONFIG_PREEMPTION)) irqentry_exit_cond_resched(); - - hrtimer_rearm_deferred(); } /** @@ -501,6 +499,7 @@ irqentry_exit_to_kernel_mode_after_preempt(struct pt_regs *regs, irqentry_state_ */ if (state.exit_rcu) { instrumentation_begin(); + hrtimer_rearm_deferred(); /* Tell the tracer that IRET will enable interrupts */ trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(); @@ -511,6 +510,7 @@ irqentry_exit_to_kernel_mode_after_preempt(struct pt_regs *regs, irqentry_state_ } instrumentation_begin(); + hrtimer_rearm_deferred(); /* Covers both tracing and lockdep */ trace_hardirqs_on(); instrumentation_end(); -- cgit v1.2.3 From fbd5d52ebf49595975e24e14e57632d580738091 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 13 Apr 2026 09:01:26 +0200 Subject: ACPI: add acpi_get_cpu_uid() stub helper When ACPI is disabled, x86 Xen support fails to build: arch/x86/xen/enlighten_hvm.c: In function 'xen_cpu_up_prepare_hvm': arch/x86/xen/enlighten_hvm.c:165:13: error: implicit declaration of function 'acpi_get_cpu_uid' [-Wimplicit-function-declaration] 165 | if (acpi_get_cpu_uid(cpu, &cpu_uid) == 0) | ^~~~~~~~~~~~~~~~ Add a trivial stub that can be used in place of the real function. Fixes: f652d0a4e13c ("ACPI: Centralize acpi_get_cpu_uid() declaration in include/linux/acpi.h") Signed-off-by: Arnd Bergmann Acked-by: Chengwen Feng Link: https://patch.msgid.link/20260413070132.3828606-1-arnd@kernel.org Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index bfacb9475aac..67effb91fa98 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -959,6 +959,12 @@ static inline int acpi_table_parse(char *id, return -ENODEV; } +static inline int acpi_get_cpu_uid(unsigned int cpu, u32 *uid) +{ + *uid = cpu; + return 0; +} + static inline int acpi_nvs_register(__u64 start, __u64 size) { return 0; -- cgit v1.2.3 From 7746e3bd4cc19b5092e00d32d676e329bfcb6900 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 10 Apr 2026 16:49:47 +0200 Subject: fanotify: fix false positive on permission events fsnotify_get_mark_safe() may return false for a mark on an unrelated group, which results in bypassing the permission check. Fix by skipping over detached marks that are not in the current group. CC: stable@vger.kernel.org Fixes: abc77577a669 ("fsnotify: Provide framework for dropping SRCU lock in ->handle_event") Signed-off-by: Miklos Szeredi Link: https://patch.msgid.link/20260410144950.156160-1-mszeredi@redhat.com Signed-off-by: Jan Kara --- include/linux/fsnotify_backend.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 95985400d3d8..e5cde39d6e85 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -915,6 +915,7 @@ extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group, unsigned int obj_type); extern void fsnotify_get_mark(struct fsnotify_mark *mark); extern void fsnotify_put_mark(struct fsnotify_mark *mark); +struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark); extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info); extern bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info); -- cgit v1.2.3 From d3e945223e0158c85dbde23de4f89493a2a817f6 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Thu, 16 Apr 2026 06:43:37 +0000 Subject: bpf: Move constants blinding out of arch-specific JITs During the JIT stage, constants blinding rewrites instructions but only rewrites the private instruction copy of the JITed subprog, leaving the global env->prog->insnsi and env->insn_aux_data untouched. This causes a mismatch between subprog instructions and the global state, making it difficult to use the global data in the JIT. To avoid this mismatch, and given that all arch-specific JITs already support constants blinding, move it to the generic verifier code, and switch to rewrite the global env->prog->insnsi with the global states adjusted, as other rewrites in the verifier do. This removes the constants blinding calls in each JIT, which are largely duplicated code across architectures. Since constants blinding is only required for JIT, and there are two JIT entry functions, jit_subprogs() for BPF programs with multiple subprogs and bpf_prog_select_runtime() for programs with no subprogs, move the constants blinding invocation into these two functions. In the verifier path, bpf_patch_insn_data() is used to keep global verifier auxiliary data in sync with patched instructions. A key question is whether this global auxiliary data should be restored on the failure path. Besides instructions, bpf_patch_insn_data() adjusts: - prog->aux->poke_tab - env->insn_array_maps - env->subprog_info - env->insn_aux_data For prog->aux->poke_tab, it is only used by JIT or only meaningful after JIT succeeds, so it does not need to be restored on the failure path. For env->insn_array_maps, when JIT fails, programs using insn arrays are rejected by bpf_insn_array_ready() due to missing JIT addresses. Hence, env->insn_array_maps is only meaningful for JIT and does not need to be restored. For subprog_info, if jit_subprogs fails and CONFIG_BPF_JIT_ALWAYS_ON is not enabled, kernel falls back to interpreter. In this case, env->subprog_info is used to determine subprogram stack depth. So it must be restored on failure. For env->insn_aux_data, it is freed by clear_insn_aux_data() at the end of bpf_check(). Before freeing, clear_insn_aux_data() loops over env->insn_aux_data to release jump targets recorded in it. The loop uses env->prog->len as the array length, but this length no longer matches the actual size of the adjusted env->insn_aux_data array after constants blinding. To address it, a simple approach is to keep insn_aux_data as adjusted after failure, since it will be freed shortly, and record its actual size for the loop in clear_insn_aux_data(). But since clear_insn_aux_data() uses the same index to loop over both env->prog->insnsi and env->insn_aux_data, this approach results in incorrect index for the insnsi array. So an alternative approach is adopted: clone the original env->insn_aux_data before blinding and restore it after failure, similar to env->prog. For classic BPF programs, constants blinding works as before since it is still invoked from bpf_prog_select_runtime(). Reviewed-by: Anton Protopopov # v8 Reviewed-by: Hari Bathini # powerpc jit Reviewed-by: Pu Lehui # riscv jit Acked-by: Hengqi Chen # loongarch jit Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20260416064341.151802-2-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index f552170eacf4..9fa4d4090093 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1184,6 +1184,18 @@ static inline bool bpf_dump_raw_ok(const struct cred *cred) struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); + +#ifdef CONFIG_BPF_SYSCALL +struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, + const struct bpf_insn *patch, u32 len); +#else +static inline struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, + const struct bpf_insn *patch, u32 len) +{ + return ERR_PTR(-ENOTSUPP); +} +#endif /* CONFIG_BPF_SYSCALL */ + int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt); static inline bool xdp_return_frame_no_direct(void) @@ -1310,9 +1322,14 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog, const char *bpf_jit_get_prog_name(struct bpf_prog *prog); -struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *fp); +struct bpf_prog *bpf_jit_blind_constants(struct bpf_verifier_env *env, struct bpf_prog *prog); void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other); +static inline bool bpf_prog_need_blind(const struct bpf_prog *prog) +{ + return prog->blinding_requested && !prog->blinded; +} + static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, u32 pass, void *image) { @@ -1451,6 +1468,20 @@ static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp) { } +static inline bool bpf_prog_need_blind(const struct bpf_prog *prog) +{ + return false; +} + +static inline +struct bpf_prog *bpf_jit_blind_constants(struct bpf_verifier_env *env, struct bpf_prog *prog) +{ + return prog; +} + +static inline void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other) +{ +} #endif /* CONFIG_BPF_JIT */ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp); -- cgit v1.2.3 From d9ef13f72711f2dad64cd4445472ded98fb6c954 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Thu, 16 Apr 2026 06:43:38 +0000 Subject: bpf: Pass bpf_verifier_env to JIT Pass bpf_verifier_env to bpf_int_jit_compile(). The follow-up patch will use env->insn_aux_data in the JIT stage to detect indirect jump targets. Since bpf_prog_select_runtime() can be called by cbpf and lib/test_bpf.c code without verifier, introduce helper __bpf_prog_select_runtime() to accept the env parameter. Remove the call to bpf_prog_select_runtime() in bpf_prog_load(), and switch to call __bpf_prog_select_runtime() in the verifier, with env variable passed. The original bpf_prog_select_runtime() is preserved for cbpf and lib/test_bpf.c, where env is NULL. Now all constants blinding calls are moved into the verifier, except the cbpf and lib/test_bpf.c cases. The instructions arrays are adjusted by bpf_patch_insn_data() function for normal cases, so there is no need to call adjust_insn_arrays() in bpf_jit_blind_constants(). Remove it. Reviewed-by: Anton Protopopov # v8 Reviewed-by: Emil Tsalapatis # v12 Acked-by: Hengqi Chen # v14 Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20260416064341.151802-3-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 9fa4d4090093..1ec6d5ba64cc 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1108,6 +1108,8 @@ sk_filter_reason(struct sock *sk, struct sk_buff *skb) return sk_filter_trim_cap(sk, skb, 1); } +struct bpf_prog *__bpf_prog_select_runtime(struct bpf_verifier_env *env, struct bpf_prog *fp, + int *err); struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err); void bpf_prog_free(struct bpf_prog *fp); @@ -1153,7 +1155,7 @@ u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); ((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \ (void *)__bpf_call_base) -struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); +struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); bool bpf_jit_needs_zext(void); bool bpf_jit_inlines_helper_call(s32 imm); @@ -1188,12 +1190,25 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, #ifdef CONFIG_BPF_SYSCALL struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, const struct bpf_insn *patch, u32 len); +struct bpf_insn_aux_data *bpf_dup_insn_aux_data(struct bpf_verifier_env *env); +void bpf_restore_insn_aux_data(struct bpf_verifier_env *env, + struct bpf_insn_aux_data *orig_insn_aux); #else static inline struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, const struct bpf_insn *patch, u32 len) { return ERR_PTR(-ENOTSUPP); } + +static inline struct bpf_insn_aux_data *bpf_dup_insn_aux_data(struct bpf_verifier_env *env) +{ + return NULL; +} + +static inline void bpf_restore_insn_aux_data(struct bpf_verifier_env *env, + struct bpf_insn_aux_data *orig_insn_aux) +{ +} #endif /* CONFIG_BPF_SYSCALL */ int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt); -- cgit v1.2.3 From 07ae6c130b46cf5e3e1a7dc5c1889fefe9adc2d3 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Thu, 16 Apr 2026 06:43:39 +0000 Subject: bpf: Add helper to detect indirect jump targets Introduce helper bpf_insn_is_indirect_target to check whether a BPF instruction is an indirect jump target. Since the verifier knows which instructions are indirect jump targets, add a new flag indirect_target to struct bpf_insn_aux_data to mark them. The verifier sets this flag when verifying an indirect jump target instruction, and the helper checks the flag to determine whether an instruction is an indirect jump target. Reviewed-by: Anton Protopopov #v8 Reviewed-by: Emil Tsalapatis #v12 Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20260416064341.151802-4-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ include/linux/bpf_verifier.h | 9 +++++---- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0136a108d083..b4b703c90ca9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1541,6 +1541,8 @@ bool bpf_has_frame_pointer(unsigned long ip); int bpf_jit_charge_modmem(u32 size); void bpf_jit_uncharge_modmem(u32 size); bool bpf_prog_has_trampoline(const struct bpf_prog *prog); +bool bpf_insn_is_indirect_target(const struct bpf_verifier_env *env, const struct bpf_prog *prog, + int insn_idx); #else static inline int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 53e8664cb566..b148f816f25b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -630,16 +630,17 @@ struct bpf_insn_aux_data { /* below fields are initialized once */ unsigned int orig_idx; /* original instruction index */ - bool jmp_point; - bool prune_point; + u32 jmp_point:1; + u32 prune_point:1; /* ensure we check state equivalence and save state checkpoint and * this instruction, regardless of any heuristics */ - bool force_checkpoint; + u32 force_checkpoint:1; /* true if instruction is a call to a helper function that * accepts callback function as a parameter. */ - bool calls_callback; + u32 calls_callback:1; + u32 indirect_target:1; /* if it is an indirect jump target */ /* * CFG strongly connected component this instruction belongs to, * zero if it is a singleton SCC. -- cgit v1.2.3 From 7b41ff29c8d386257bae62ad557fd6bad8cc6787 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 12 Apr 2026 20:07:21 +0200 Subject: entry: Kill ARCH_SYSCALL_WORK_{ENTER,EXIT} Nowadays nothing redefines these flags. Signed-off-by: Oleg Nesterov Signed-off-by: Thomas Gleixner Reviewed-by: Jinjie Ruan Link: https://patch.msgid.link/advfWWKgOQkFkwp9@redhat.com --- include/linux/entry-common.h | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index e04d67e999a1..416a3352261f 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -20,31 +20,21 @@ /* * SYSCALL_WORK flags handled in syscall_enter_from_user_mode() */ -#ifndef ARCH_SYSCALL_WORK_ENTER -# define ARCH_SYSCALL_WORK_ENTER (0) -#endif - -/* - * SYSCALL_WORK flags handled in syscall_exit_to_user_mode() - */ -#ifndef ARCH_SYSCALL_WORK_EXIT -# define ARCH_SYSCALL_WORK_EXIT (0) -#endif - #define SYSCALL_WORK_ENTER (SYSCALL_WORK_SECCOMP | \ SYSCALL_WORK_SYSCALL_TRACEPOINT | \ SYSCALL_WORK_SYSCALL_TRACE | \ SYSCALL_WORK_SYSCALL_EMU | \ SYSCALL_WORK_SYSCALL_AUDIT | \ SYSCALL_WORK_SYSCALL_USER_DISPATCH | \ - SYSCALL_WORK_SYSCALL_RSEQ_SLICE | \ - ARCH_SYSCALL_WORK_ENTER) + SYSCALL_WORK_SYSCALL_RSEQ_SLICE) +/* + * SYSCALL_WORK flags handled in syscall_exit_to_user_mode() + */ #define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ SYSCALL_WORK_SYSCALL_TRACE | \ SYSCALL_WORK_SYSCALL_AUDIT | \ SYSCALL_WORK_SYSCALL_USER_DISPATCH | \ - SYSCALL_WORK_SYSCALL_EXIT_TRAP | \ - ARCH_SYSCALL_WORK_EXIT) + SYSCALL_WORK_SYSCALL_EXIT_TRAP) /** * arch_ptrace_report_syscall_entry - Architecture specific ptrace_report_syscall_entry() wrapper -- cgit v1.2.3 From 3cade698881eb238f88cbbfec82acc2110440a3f Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Wed, 15 Apr 2026 14:08:33 +0800 Subject: net: enetc: fix NTMP DMA use-after-free issue The AI-generated review reported a potential DMA use-after-free issue [1]. If netc_xmit_ntmp_cmd() times out and returns an error, the pending command is not explicitly aborted, while ntmp_free_data_mem() unconditionally frees the DMA buffer. If the buffer has already been reallocated elsewhere, this may lead to silent memory corruption. Because the hardware eventually processes the pending command and perform a DMA write of the response to the physical address of the freed buffer. To resolve this issue, this patch does the following modifications: 1. Convert cbdr->ring_lock from a spinlock to a mutex The lock was originally a spinlock in case NTMP operations might be invoked from atomic context. After downstream support for all NTMP tables, no such usage has materialized. A mutex lock is now required because the driver now needs to reclaim used BDs and release associated DMA memory within the lock's context, while dma_free_coherent() might sleep. 2. Introduce software command BD (struct netc_swcbd) The hardware write-back overwrites the addr and len fields of the BD, so the driver cannot rely on the hardware BD to free the associated DMA memory. The driver now maintains a software shadow BD storing the DMA buffer pointer, DMA address, and size. And netc_xmit_ntmp_cmd() only reclaims older BDs when the number of used BDs reaches NETC_CBDR_CLEAN_WORK (16). The software BD enables correct DMA memory release. With this, struct ntmp_dma_buf and ntmp_free_data_mem() are no longer needed and are removed. 3. Require callers to hold ring_lock across netc_xmit_ntmp_cmd() netc_xmit_ntmp_cmd() releases the ring_lock before the caller finishes consuming the response. At this point, if a concurrent thread submits a new command, it may trigger ntmp_clean_cbdr() and free the DMA buffer while it is still in use. Move ring_lock ownership to the caller to ensure the response buffer cannot be reclaimed prematurely. So the helpers ntmp_select_and_lock_cbdr() and ntmp_unlock_cbdr() are added. These changes eliminate the DMA use-after-free condition and ensure safe and consistent BD reclamation and DMA buffer lifecycle management. Fixes: 4701073c3deb ("net: enetc: add initial netc-lib driver to support NTMP") Link: https://lore.kernel.org/netdev/20260403011729.1795413-1-kuba@kernel.org/ # [1] Signed-off-by: Wei Fang Link: https://patch.msgid.link/20260415060833.2303846-3-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- include/linux/fsl/ntmp.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fsl/ntmp.h b/include/linux/fsl/ntmp.h index 916dc4fe7de3..83a449b4d6ec 100644 --- a/include/linux/fsl/ntmp.h +++ b/include/linux/fsl/ntmp.h @@ -31,6 +31,12 @@ struct netc_tbl_vers { u8 rsst_ver; }; +struct netc_swcbd { + void *buf; + dma_addr_t dma; + size_t size; +}; + struct netc_cbdr { struct device *dev; struct netc_cbdr_regs regs; @@ -44,9 +50,10 @@ struct netc_cbdr { void *addr_base_align; dma_addr_t dma_base; dma_addr_t dma_base_align; + struct netc_swcbd *swcbd; /* Serialize the order of command BD ring */ - spinlock_t ring_lock; + struct mutex ring_lock; }; struct ntmp_user { -- cgit v1.2.3 From 2f5015461984caa8ebf265a60b22f38c94d9c70a Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 15 Apr 2026 15:08:47 -0600 Subject: t10-pi: reduce ref tag code duplication t10_pi_ref_tag() and ext_pi_ref_tag() are identical except for the final truncation of the ref tag to 32 or 48 bits. Factor out a helper full_pi_ref_tag() to return the untruncated ref tag and use it in t10_pi_ref_tag() and ext_pi_ref_tag(). Signed-off-by: Caleb Sander Mateos Reviewed-by: Anuj Gupta Reviewed-by: Christoph Hellwig Link: https://patch.msgid.link/20260415210847.1730016-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/linux/t10-pi.h | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index 2c59fe3efcd4..b6c2496866ea 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h @@ -4,6 +4,7 @@ #include #include +#include /* * A T10 PI-capable target device can be formatted with different @@ -25,6 +26,16 @@ enum t10_dif_type { T10_PI_TYPE3_PROTECTION = 0x3, }; +static inline u64 full_pi_ref_tag(const struct request *rq) +{ + unsigned int shift = ilog2(queue_logical_block_size(rq->q)); + + if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && + rq->q->limits.integrity.interval_exp) + shift = rq->q->limits.integrity.interval_exp; + return blk_rq_pos(rq) >> (shift - SECTOR_SHIFT); +} + /* * T10 Protection Information tuple. */ @@ -39,12 +50,7 @@ struct t10_pi_tuple { static inline u32 t10_pi_ref_tag(struct request *rq) { - unsigned int shift = ilog2(queue_logical_block_size(rq->q)); - - if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && - rq->q->limits.integrity.interval_exp) - shift = rq->q->limits.integrity.interval_exp; - return blk_rq_pos(rq) >> (shift - SECTOR_SHIFT) & 0xffffffff; + return lower_32_bits(full_pi_ref_tag(rq)); } struct crc64_pi_tuple { @@ -64,12 +70,7 @@ static inline u64 lower_48_bits(u64 n) static inline u64 ext_pi_ref_tag(struct request *rq) { - unsigned int shift = ilog2(queue_logical_block_size(rq->q)); - - if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && - rq->q->limits.integrity.interval_exp) - shift = rq->q->limits.integrity.interval_exp; - return lower_48_bits(blk_rq_pos(rq) >> (shift - SECTOR_SHIFT)); + return lower_48_bits(full_pi_ref_tag(rq)); } #endif -- cgit v1.2.3 From 3d3544a6c996e88bb793bb6b2665c3e3f674f5eb Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 13 Apr 2026 11:57:13 +0100 Subject: mm/vma: remove __vma_check_mmap_hook() Commit c50ca15dd496 ("mm: add vm_ops->mapped hook") introduced __vma_check_mmap_hook() in order to assert that a driver doesn't incorrectly implement both an f_op->mmap() and a vm_ops->mapped hook, the latter of which would not ultimately get invoked. However, this did not correctly account for stacked drivers (or drivers that otherwise use the compatibility layer) which might recursively call an mmap_prepare hook via the compatibility layer. Thus the nested mmap_prepare() invocation might result in a VMA which has vm_ops->mapped set with an overlaying mmap() hook, causing the __vma_check_mmap_hook() to fail in vfs_mmap(), wrongly failing the operation. This patch resolves this by simply removing the check, as we can't be certain that an mmap() hook doesn't at some point invoke the compatibility layer, and it's not worth trying to track it. Link: https://lore.kernel.org/20260413105713.92625-1-ljs@kernel.org Fixes: c50ca15dd496 ("mm: add vm_ops->mapped hook") Reported-by: Shinichiro Kawasaki Closes: https://lore.kernel.org/all/adx2ws5z0NMIe5Yj@shinmob/ Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka (SUSE) Tested-by: Shinichiro Kawasaki Cc: Al Viro Cc: Christian Brauner Cc: David Hildenbrand Cc: Jan Kara Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/fs.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 0bdccfa70b44..f3ca9b841892 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2062,20 +2062,13 @@ void compat_set_desc_from_vma(struct vm_area_desc *desc, const struct file *file const struct vm_area_struct *vma); int __compat_vma_mmap(struct vm_area_desc *desc, struct vm_area_struct *vma); int compat_vma_mmap(struct file *file, struct vm_area_struct *vma); -int __vma_check_mmap_hook(struct vm_area_struct *vma); static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) { - int err; - if (file->f_op->mmap_prepare) return compat_vma_mmap(file, vma); - err = file->f_op->mmap(file, vma); - if (err) - return err; - - return __vma_check_mmap_hook(vma); + return file->f_op->mmap(file, vma); } static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc) -- cgit v1.2.3 From db128b2c6b7d0c9b514327a0873425bbf18e739b Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 5 Mar 2026 19:52:21 +0800 Subject: mm: rename unlock_page_lruvec_irq and its variants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is inappropriate to use folio_lruvec_lock() variants in conjunction with unlock_page_lruvec() variants, as this involves the inconsistent operation of locking a folio while unlocking a page. To rectify this, the functions unlock_page_lruvec{_irq, _irqrestore} are renamed to lruvec_unlock{_irq,_irqrestore}. Link: https://lore.kernel.org/4e5e05271a250df4d1812e1832be65636a78c957.1772711148.git.zhengqi.arch@bytedance.com Signed-off-by: Muchun Song Signed-off-by: Qi Zheng Acked-by: Roman Gushchin Acked-by: Johannes Weiner Reviewed-by: Harry Yoo Reviewed-by: Chen Ridong Acked-by: David Hildenbrand (Red Hat) Acked-by: Shakeel Butt Cc: Allen Pais Cc: Axel Rasmussen Cc: Baoquan He Cc: Chengming Zhou Cc: Hamza Mahfooz Cc: Hugh Dickins Cc: Imran Khan Cc: Kamalesh Babulal Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Michal Koutný Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Suren Baghdasaryan Cc: Usama Arif Cc: Vlastimil Babka Cc: Wei Xu Cc: Yosry Ahmed Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5173a9f16721..6e88288e90d8 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1479,17 +1479,17 @@ static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec)); } -static inline void unlock_page_lruvec(struct lruvec *lruvec) +static inline void lruvec_unlock(struct lruvec *lruvec) { spin_unlock(&lruvec->lru_lock); } -static inline void unlock_page_lruvec_irq(struct lruvec *lruvec) +static inline void lruvec_unlock_irq(struct lruvec *lruvec) { spin_unlock_irq(&lruvec->lru_lock); } -static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec, +static inline void lruvec_unlock_irqrestore(struct lruvec *lruvec, unsigned long flags) { spin_unlock_irqrestore(&lruvec->lru_lock, flags); @@ -1511,7 +1511,7 @@ static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio, if (folio_matches_lruvec(folio, locked_lruvec)) return locked_lruvec; - unlock_page_lruvec_irq(locked_lruvec); + lruvec_unlock_irq(locked_lruvec); } return folio_lruvec_lock_irq(folio); @@ -1525,7 +1525,7 @@ static inline void folio_lruvec_relock_irqsave(struct folio *folio, if (folio_matches_lruvec(folio, *lruvecp)) return; - unlock_page_lruvec_irqrestore(*lruvecp, *flags); + lruvec_unlock_irqrestore(*lruvecp, *flags); } *lruvecp = folio_lruvec_lock_irqsave(folio, flags); -- cgit v1.2.3 From d5aa8c1d136e7de89defb06f42f8108992967a70 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 5 Mar 2026 19:52:25 +0800 Subject: mm: memcontrol: return root object cgroup for root memory cgroup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Memory cgroup functions such as get_mem_cgroup_from_folio() and get_mem_cgroup_from_mm() return a valid memory cgroup pointer, even for the root memory cgroup. In contrast, the situation for object cgroups has been different. Previously, the root object cgroup couldn't be returned because it didn't exist. Now that a valid root object cgroup exists, for the sake of consistency, it's necessary to align the behavior of object-cgroup-related operations with that of memory cgroup APIs. Link: https://lore.kernel.org/e9c3f40ba7681d9753372d4ee2ac7a0216848b95.1772711148.git.zhengqi.arch@bytedance.com Signed-off-by: Muchun Song Signed-off-by: Qi Zheng Acked-by: Johannes Weiner Acked-by: Shakeel Butt Reviewed-by: Harry Yoo Cc: Allen Pais Cc: Axel Rasmussen Cc: Baoquan He Cc: Chengming Zhou Cc: Chen Ridong Cc: David Hildenbrand Cc: Hamza Mahfooz Cc: Hugh Dickins Cc: Imran Khan Cc: Kamalesh Babulal Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Michal Koutný Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Suren Baghdasaryan Cc: Usama Arif Cc: Vlastimil Babka Cc: Wei Xu Cc: Yosry Ahmed Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6e88288e90d8..9a015258a2ff 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -332,6 +332,7 @@ struct mem_cgroup { #define MEMCG_CHARGE_BATCH 64U extern struct mem_cgroup *root_mem_cgroup; +extern struct obj_cgroup *root_obj_cgroup; enum page_memcg_data_flags { /* page->memcg_data is a pointer to an slabobj_ext vector */ @@ -548,6 +549,11 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) return (memcg == root_mem_cgroup); } +static inline bool obj_cgroup_is_root(const struct obj_cgroup *objcg) +{ + return objcg == root_obj_cgroup; +} + static inline bool mem_cgroup_disabled(void) { return !cgroup_subsys_enabled(memory_cgrp_subsys); @@ -774,23 +780,26 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ static inline bool obj_cgroup_tryget(struct obj_cgroup *objcg) { + if (obj_cgroup_is_root(objcg)) + return true; return percpu_ref_tryget(&objcg->refcnt); } -static inline void obj_cgroup_get(struct obj_cgroup *objcg) +static inline void obj_cgroup_get_many(struct obj_cgroup *objcg, + unsigned long nr) { - percpu_ref_get(&objcg->refcnt); + if (!obj_cgroup_is_root(objcg)) + percpu_ref_get_many(&objcg->refcnt, nr); } -static inline void obj_cgroup_get_many(struct obj_cgroup *objcg, - unsigned long nr) +static inline void obj_cgroup_get(struct obj_cgroup *objcg) { - percpu_ref_get_many(&objcg->refcnt, nr); + obj_cgroup_get_many(objcg, 1); } static inline void obj_cgroup_put(struct obj_cgroup *objcg) { - if (objcg) + if (objcg && !obj_cgroup_is_root(objcg)) percpu_ref_put(&objcg->refcnt); } @@ -1087,6 +1096,11 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) return true; } +static inline bool obj_cgroup_is_root(const struct obj_cgroup *objcg) +{ + return true; +} + static inline bool mem_cgroup_disabled(void) { return true; -- cgit v1.2.3 From 49717c7bd6b8e14329c2d04b1e8ec691175b6f4e Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 5 Mar 2026 19:52:28 +0800 Subject: writeback: prevent memory cgroup release in writeback module MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the near future, a folio will no longer pin its corresponding memory cgroup. To ensure safety, it will only be appropriate to hold the rcu read lock or acquire a reference to the memory cgroup returned by folio_memcg(), thereby preventing it from being released. In the current patch, the function get_mem_cgroup_css_from_folio() and the rcu read lock are employed to safeguard against the release of the memory cgroup. This serves as a preparatory measure for the reparenting of the LRU pages. Link: https://lore.kernel.org/645f99bc344575417f67def3744f975596df2793.1772711148.git.zhengqi.arch@bytedance.com Signed-off-by: Muchun Song Signed-off-by: Qi Zheng Reviewed-by: Harry Yoo Acked-by: Johannes Weiner Acked-by: Shakeel Butt Cc: Allen Pais Cc: Axel Rasmussen Cc: Baoquan He Cc: Chengming Zhou Cc: Chen Ridong Cc: David Hildenbrand Cc: Hamza Mahfooz Cc: Hugh Dickins Cc: Imran Khan Cc: Kamalesh Babulal Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Michal Koutný Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Suren Baghdasaryan Cc: Usama Arif Cc: Vlastimil Babka Cc: Wei Xu Cc: Yosry Ahmed Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9a015258a2ff..4454f03a4acf 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -894,7 +894,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm, return match; } -struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio); +struct cgroup_subsys_state *get_mem_cgroup_css_from_folio(struct folio *folio); ino_t page_cgroup_ino(struct page *page); static inline bool mem_cgroup_online(struct mem_cgroup *memcg) @@ -1563,9 +1563,14 @@ static inline void mem_cgroup_track_foreign_dirty(struct folio *folio, if (mem_cgroup_disabled()) return; + if (!folio_memcg_charged(folio)) + return; + + rcu_read_lock(); memcg = folio_memcg(folio); - if (unlikely(memcg && &memcg->css != wb->memcg_css)) + if (unlikely(&memcg->css != wb->memcg_css)) mem_cgroup_track_foreign_dirty_slowpath(folio, wb); + rcu_read_unlock(); } void mem_cgroup_flush_foreign(struct bdi_writeback *wb); -- cgit v1.2.3 From f995da5341c1854e59415c2c2c6f0b6406b498f2 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 5 Mar 2026 19:52:29 +0800 Subject: mm: memcontrol: prevent memory cgroup release in count_memcg_folio_events() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the near future, a folio will no longer pin its corresponding memory cgroup. To ensure safety, it will only be appropriate to hold the rcu read lock or acquire a reference to the memory cgroup returned by folio_memcg(), thereby preventing it from being released. In the current patch, the rcu read lock is employed to safeguard against the release of the memory cgroup in count_memcg_folio_events(). This serves as a preparatory measure for the reparenting of the LRU pages. Link: https://lore.kernel.org/dea6aa0389367f7fd6b715c8837a2cf7506bd889.1772711148.git.zhengqi.arch@bytedance.com Signed-off-by: Muchun Song Signed-off-by: Qi Zheng Reviewed-by: Harry Yoo Acked-by: Johannes Weiner Acked-by: Shakeel Butt Cc: Allen Pais Cc: Axel Rasmussen Cc: Baoquan He Cc: Chengming Zhou Cc: Chen Ridong Cc: David Hildenbrand Cc: Hamza Mahfooz Cc: Hugh Dickins Cc: Imran Khan Cc: Kamalesh Babulal Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Michal Koutný Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Suren Baghdasaryan Cc: Usama Arif Cc: Vlastimil Babka Cc: Wei Xu Cc: Yosry Ahmed Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 4454f03a4acf..ef26ba087844 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -975,10 +975,15 @@ void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, static inline void count_memcg_folio_events(struct folio *folio, enum vm_event_item idx, unsigned long nr) { - struct mem_cgroup *memcg = folio_memcg(folio); + struct mem_cgroup *memcg; - if (memcg) - count_memcg_events(memcg, idx, nr); + if (!folio_memcg_charged(folio)) + return; + + rcu_read_lock(); + memcg = folio_memcg(folio); + count_memcg_events(memcg, idx, nr); + rcu_read_unlock(); } static inline void count_memcg_events_mm(struct mm_struct *mm, -- cgit v1.2.3 From d14f87858178c64cc94ecd05bb41bba474c1c654 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 5 Mar 2026 19:52:41 +0800 Subject: mm: do not open-code lruvec lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now we have lruvec_unlock(), lruvec_unlock_irq() and lruvec_unlock_irqrestore(), but no the paired lruvec_lock(), lruvec_lock_irq() and lruvec_lock_irqsave(). There is currently no use case for lruvec_lock_irqsave(), so only introduce lruvec_lock_irq(), and change all open-code places to use this helper function. This looks cleaner and prepares for reparenting LRU pages, preventing user from missing RCU lock calls due to open-code lruvec lock. Link: https://lore.kernel.org/2d0bafe7564e17ece46dfd58197af22ce57017dc.1772711148.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: Muchun Song Acked-by: Shakeel Butt Reviewed-by: Harry Yoo Cc: Allen Pais Cc: Axel Rasmussen Cc: Baoquan He Cc: Chengming Zhou Cc: Chen Ridong Cc: David Hildenbrand Cc: Hamza Mahfooz Cc: Hugh Dickins Cc: Imran Khan Cc: Johannes Weiner Cc: Kamalesh Babulal Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Michal Koutný Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Suren Baghdasaryan Cc: Usama Arif Cc: Vlastimil Babka Cc: Wei Xu Cc: Yosry Ahmed Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ef26ba087844..38f94c7271c1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1498,6 +1498,11 @@ static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec)); } +static inline void lruvec_lock_irq(struct lruvec *lruvec) +{ + spin_lock_irq(&lruvec->lru_lock); +} + static inline void lruvec_unlock(struct lruvec *lruvec) { spin_unlock(&lruvec->lru_lock); -- cgit v1.2.3 From 31b54a5e8916fdd4819880e3aed93f65ecbb47e3 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 5 Mar 2026 19:52:42 +0800 Subject: mm: memcontrol: prepare for reparenting LRU pages for lruvec lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following diagram illustrates how to ensure the safety of the folio lruvec lock when LRU folios undergo reparenting. In the folio_lruvec_lock(folio) function: rcu_read_lock(); retry: lruvec = folio_lruvec(folio); /* There is a possibility of folio reparenting at this point. */ spin_lock(&lruvec->lru_lock); if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) { /* * The wrong lruvec lock was acquired, and a retry is required. * This is because the folio resides on the parent memcg lruvec * list. */ spin_unlock(&lruvec->lru_lock); goto retry; } /* Reaching here indicates that folio_memcg() is stable. */ In the memcg_reparent_objcgs(memcg) function: spin_lock(&lruvec->lru_lock); spin_lock(&lruvec_parent->lru_lock); /* Transfer folios from the lruvec list to the parent's. */ spin_unlock(&lruvec_parent->lru_lock); spin_unlock(&lruvec->lru_lock); After acquiring the lruvec lock, it is necessary to verify whether the folio has been reparented. If reparenting has occurred, the new lruvec lock must be reacquired. During the LRU folio reparenting process, the lruvec lock will also be acquired (this will be implemented in a subsequent patch). Therefore, folio_memcg() remains unchanged while the lruvec lock is held. Given that lruvec_memcg(lruvec) is always equal to folio_memcg(folio) after the lruvec lock is acquired, the lruvec_memcg_debug() check is redundant. Hence, it is removed. This patch serves as a preparation for the reparenting of LRU folios. Link: https://lore.kernel.org/23f22cbb1419f277a3483018b32158ae2b86c666.1772711148.git.zhengqi.arch@bytedance.com Signed-off-by: Muchun Song Signed-off-by: Qi Zheng Acked-by: Johannes Weiner Acked-by: Shakeel Butt Cc: Allen Pais Cc: Axel Rasmussen Cc: Baoquan He Cc: Chengming Zhou Cc: Chen Ridong Cc: David Hildenbrand Cc: Hamza Mahfooz Cc: Harry Yoo Cc: Hugh Dickins Cc: Imran Khan Cc: Kamalesh Babulal Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Michal Koutný Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Suren Baghdasaryan Cc: Usama Arif Cc: Vlastimil Babka Cc: Wei Xu Cc: Yosry Ahmed Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 34 +++++++++++++++++----------------- include/linux/swap.h | 3 +-- 2 files changed, 18 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 38f94c7271c1..12982875073e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -741,7 +741,15 @@ out: * folio_lruvec - return lruvec for isolating/putting an LRU folio * @folio: Pointer to the folio. * - * This function relies on folio->mem_cgroup being stable. + * Call with rcu_read_lock() held to ensure the lifetime of the returned lruvec. + * Note that this alone will NOT guarantee the stability of the folio->lruvec + * association; the folio can be reparented to an ancestor if this races with + * cgroup deletion. + * + * Use folio_lruvec_lock() to ensure both lifetime and stability of the binding. + * Once a lruvec is locked, folio_lruvec() can be called on other folios, and + * their binding is stable if the returned lruvec matches the one the caller has + * locked. Useful for lock batching. */ static inline struct lruvec *folio_lruvec(struct folio *folio) { @@ -764,15 +772,6 @@ struct lruvec *folio_lruvec_lock_irq(struct folio *folio); struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flags); -#ifdef CONFIG_DEBUG_VM -void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio); -#else -static inline -void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) -{ -} -#endif - static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ return css ? container_of(css, struct mem_cgroup, css) : NULL; @@ -1198,11 +1197,6 @@ static inline struct lruvec *folio_lruvec(struct folio *folio) return &pgdat->__lruvec; } -static inline -void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) -{ -} - static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { return NULL; @@ -1261,6 +1255,7 @@ static inline struct lruvec *folio_lruvec_lock(struct folio *folio) { struct pglist_data *pgdat = folio_pgdat(folio); + rcu_read_lock(); spin_lock(&pgdat->__lruvec.lru_lock); return &pgdat->__lruvec; } @@ -1269,6 +1264,7 @@ static inline struct lruvec *folio_lruvec_lock_irq(struct folio *folio) { struct pglist_data *pgdat = folio_pgdat(folio); + rcu_read_lock(); spin_lock_irq(&pgdat->__lruvec.lru_lock); return &pgdat->__lruvec; } @@ -1278,6 +1274,7 @@ static inline struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, { struct pglist_data *pgdat = folio_pgdat(folio); + rcu_read_lock(); spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp); return &pgdat->__lruvec; } @@ -1500,23 +1497,26 @@ static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) static inline void lruvec_lock_irq(struct lruvec *lruvec) { + rcu_read_lock(); spin_lock_irq(&lruvec->lru_lock); } static inline void lruvec_unlock(struct lruvec *lruvec) { spin_unlock(&lruvec->lru_lock); + rcu_read_unlock(); } static inline void lruvec_unlock_irq(struct lruvec *lruvec) { spin_unlock_irq(&lruvec->lru_lock); + rcu_read_unlock(); } -static inline void lruvec_unlock_irqrestore(struct lruvec *lruvec, - unsigned long flags) +static inline void lruvec_unlock_irqrestore(struct lruvec *lruvec, unsigned long flags) { spin_unlock_irqrestore(&lruvec->lru_lock, flags); + rcu_read_unlock(); } /* Test requires a stable folio->memcg binding, see folio_memcg() */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 4b1f13b5bbad..ea08e2afa2b4 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -310,8 +310,7 @@ extern unsigned long totalreserve_pages; /* linux/mm/swap.c */ void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, - unsigned int nr_io, unsigned int nr_rotated) - __releases(lruvec->lru_lock); + unsigned int nr_io, unsigned int nr_rotated); void lru_note_cost_refault(struct folio *); void folio_add_lru(struct folio *); void folio_add_lru_vma(struct folio *, struct vm_area_struct *); -- cgit v1.2.3 From 07a6e9a2c199fed361f528781284d56771d0016f Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 5 Mar 2026 19:52:43 +0800 Subject: mm: vmscan: prepare for reparenting traditional LRU folios MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To resolve the dying memcg issue, we need to reparent LRU folios of child memcg to its parent memcg. For traditional LRU list, each lruvec of every memcg comprises four LRU lists. Due to the symmetry of the LRU lists, it is feasible to transfer the LRU lists from a memcg to its parent memcg during the reparenting process. This commit implements the specific function, which will be used during the reparenting process. Link: https://lore.kernel.org/a92d217a9fc82bd0c401210204a095caaf615b1c.1772711148.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Harry Yoo Acked-by: Johannes Weiner Acked-by: Muchun Song Acked-by: Shakeel Butt Cc: Allen Pais Cc: Axel Rasmussen Cc: Baoquan He Cc: Chengming Zhou Cc: Chen Ridong Cc: David Hildenbrand Cc: Hamza Mahfooz Cc: Hugh Dickins Cc: Imran Khan Cc: Kamalesh Babulal Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Michal Koutný Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Suren Baghdasaryan Cc: Usama Arif Cc: Vlastimil Babka Cc: Wei Xu Cc: Yosry Ahmed Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/swap.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index ea08e2afa2b4..d653fe050b8f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -546,6 +546,8 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) return READ_ONCE(memcg->swappiness); } + +void lru_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid); #else static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) { @@ -610,5 +612,24 @@ static inline bool mem_cgroup_swap_full(struct folio *folio) } #endif +/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to + * and including the specified highidx + * @zone: The current zone in the iterator + * @pgdat: The pgdat which node_zones are being iterated + * @idx: The index variable + * @highidx: The index of the highest zone to return + * + * This macro iterates through all managed zones up to and including the specified highidx. + * The zone iterator enters an invalid state after macro call and must be reinitialized + * before it can be used again. + */ +#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx) \ + for ((idx) = 0, (zone) = (pgdat)->node_zones; \ + (idx) <= (highidx); \ + (idx)++, (zone)++) \ + if (!managed_zone(zone)) \ + continue; \ + else + #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ -- cgit v1.2.3 From f304652609eae3814b0e9d11c75c0e0cb62da31f Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 5 Mar 2026 19:52:44 +0800 Subject: mm: vmscan: prepare for reparenting MGLRU folios MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Similar to traditional LRU folios, in order to solve the dying memcg problem, we also need to reparenting MGLRU folios to the parent memcg when memcg offline. However, there are the following challenges: 1. Each lruvec has between MIN_NR_GENS and MAX_NR_GENS generations, the number of generations of the parent and child memcg may be different, so we cannot simply transfer MGLRU folios in the child memcg to the parent memcg as we did for traditional LRU folios. 2. The generation information is stored in folio->flags, but we cannot traverse these folios while holding the lru lock, otherwise it may cause softlockup. 3. In walk_update_folio(), the gen of folio and corresponding lru size may be updated, but the folio is not immediately moved to the corresponding lru list. Therefore, there may be folios of different generations on an LRU list. 4. In lru_gen_del_folio(), the generation to which the folio belongs is found based on the generation information in folio->flags, and the corresponding LRU size will be updated. Therefore, we need to update the lru size correctly during reparenting, otherwise the lru size may be updated incorrectly in lru_gen_del_folio(). Finally, this patch chose a compromise method, which is to splice the lru list in the child memcg to the lru list of the same generation in the parent memcg during reparenting. And in order to ensure that the parent memcg has the same generation, we need to increase the generations in the parent memcg to the MAX_NR_GENS before reparenting. Of course, the same generation has different meanings in the parent and child memcg, this will cause confusion in the hot and cold information of folios. But other than that, this method is simple enough, the lru size is correct, and there is no need to consider some concurrency issues (such as lru_gen_del_folio()). To prepare for the above work, this commit implements the specific functions, which will be used during reparenting. [zhengqi.arch@bytedance.com: use list_splice_tail_init() to reparent child folios] Link: https://lore.kernel.org/20260324114937.28569-1-qi.zheng@linux.dev Link: https://lore.kernel.org/e75050354cdbc42221a04f7cf133292b61105548.1772711148.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Suggested-by: Harry Yoo Suggested-by: Imran Khan Acked-by: Harry Yoo Cc: Allen Pais Cc: Axel Rasmussen Cc: Baoquan He Cc: Chengming Zhou Cc: Chen Ridong Cc: David Hildenbrand Cc: Hamza Mahfooz Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kamalesh Babulal Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Michal Koutný Cc: Mike Rapoport Cc: Muchun Song Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Usama Arif Cc: Vlastimil Babka Cc: Wei Xu Cc: Yosry Ahmed Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4a20df132258..20f920dede65 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -692,6 +692,9 @@ void lru_gen_online_memcg(struct mem_cgroup *memcg); void lru_gen_offline_memcg(struct mem_cgroup *memcg); void lru_gen_release_memcg(struct mem_cgroup *memcg); void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid); +void max_lru_gen_memcg(struct mem_cgroup *memcg, int nid); +bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg, int nid); +void lru_gen_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid); #else /* !CONFIG_LRU_GEN */ @@ -733,6 +736,20 @@ static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) { } +static inline void max_lru_gen_memcg(struct mem_cgroup *memcg, int nid) +{ +} + +static inline bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg, int nid) +{ + return true; +} + +static inline +void lru_gen_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid) +{ +} + #endif /* CONFIG_LRU_GEN */ struct lruvec { -- cgit v1.2.3 From 7404bd37cfbeb2aa06249418c1788ca94bae2875 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 5 Mar 2026 19:52:46 +0800 Subject: mm: workingset: use lruvec_lru_size() to get the number of lru pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For cgroup v2, count_shadow_nodes() is the only place to read non-hierarchical stats (lruvec_stats->state_local). To avoid the need to consider cgroup v2 during subsequent non-hierarchical stats reparenting, use lruvec_lru_size() instead of lruvec_page_state_local() to get the number of lru pages. For NR_SLAB_RECLAIMABLE_B and NR_SLAB_UNRECLAIMABLE_B cases, it appears that the statistics here have already been problematic for a while since slab pages have been reparented. So just ignore it for now. Link: https://lore.kernel.org/b1d448c667a8fb377c3390d9aba43bdb7e4d5739.1772711148.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: Shakeel Butt Acked-by: Muchun Song Cc: Allen Pais Cc: Axel Rasmussen Cc: Baoquan He Cc: Chengming Zhou Cc: Chen Ridong Cc: David Hildenbrand Cc: Hamza Mahfooz Cc: Harry Yoo Cc: Hugh Dickins Cc: Imran Khan Cc: Johannes Weiner Cc: Kamalesh Babulal Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Michal Koutný Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Suren Baghdasaryan Cc: Usama Arif Cc: Vlastimil Babka Cc: Wei Xu Cc: Yosry Ahmed Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/swap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index d653fe050b8f..7a09df6977a5 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -352,6 +352,7 @@ extern void swap_setup(void); extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); +unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx); #define MEMCG_RECLAIM_MAY_SWAP (1 << 1) #define MEMCG_RECLAIM_PROACTIVE (1 << 2) -- cgit v1.2.3 From 01b9da291c4969354807b52956f4aae1f41b4924 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 5 Mar 2026 19:52:49 +0800 Subject: mm: memcontrol: convert objcg to be per-memcg per-node type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert objcg to be per-memcg per-node type, so that when reparent LRU folios later, we can hold the lru lock at the node level, thus avoiding holding too many lru locks at once. [zhengqi.arch@bytedance.com: reset pn->orig_objcg to NULL] Link: https://lore.kernel.org/20260309112939.31937-1-qi.zheng@linux.dev [akpm@linux-foundation.org: fix comment typo, per Usama. Reflow comment to 80 cols] [devnexen@gmail.com: fix obj_cgroup leak in mem_cgroup_css_online() error path] Link: https://lore.kernel.org/20260322193631.45457-1-devnexen@gmail.com [devnexen@gmail.com: add newline, per Qi Zheng] Link: https://lore.kernel.org/20260323063007.7783-1-devnexen@gmail.com Link: https://lore.kernel.org/56c04b1c5d54f75ccdc12896df6c1ca35403ecc3.1772711148.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Signed-off-by: David Carlier Acked-by: Shakeel Butt Cc: Allen Pais Cc: Axel Rasmussen Cc: Baoquan He Cc: Chengming Zhou Cc: Chen Ridong Cc: David Hildenbrand Cc: Hamza Mahfooz Cc: Harry Yoo Cc: Hugh Dickins Cc: Imran Khan Cc: Johannes Weiner Cc: Kamalesh Babulal Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Michal Koutný Cc: Mike Rapoport Cc: Muchun Song Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Suren Baghdasaryan Cc: Usama Arif Cc: Vlastimil Babka Cc: Wei Xu Cc: Yosry Ahmed Cc: Yuanchu Xie Cc: Zi Yan Cc: Usama Arif Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 23 ++++++++++++----------- include/linux/sched.h | 2 +- 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 12982875073e..3e836b56bfcb 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -115,6 +115,16 @@ struct mem_cgroup_per_node { unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter iter; + /* + * objcg is wiped out as a part of the objcg repaprenting process. + * orig_objcg preserves a pointer (and a reference) to the original + * objcg until the end of live of memcg. + */ + struct obj_cgroup __rcu *objcg; + struct obj_cgroup *orig_objcg; + /* list of inherited objcgs, protected by objcg_lock */ + struct list_head objcg_list; + #ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC /* slab stats for nmi context */ atomic_t slab_reclaimable; @@ -179,6 +189,7 @@ struct obj_cgroup { struct list_head list; /* protected by objcg_lock */ struct rcu_head rcu; }; + bool is_root; }; /* @@ -257,15 +268,6 @@ struct mem_cgroup { seqlock_t socket_pressure_seqlock; #endif int kmemcg_id; - /* - * memcg->objcg is wiped out as a part of the objcg repaprenting - * process. memcg->orig_objcg preserves a pointer (and a reference) - * to the original objcg until the end of live of memcg. - */ - struct obj_cgroup __rcu *objcg; - struct obj_cgroup *orig_objcg; - /* list of inherited objcgs, protected by objcg_lock */ - struct list_head objcg_list; struct memcg_vmstats_percpu __percpu *vmstats_percpu; @@ -332,7 +334,6 @@ struct mem_cgroup { #define MEMCG_CHARGE_BATCH 64U extern struct mem_cgroup *root_mem_cgroup; -extern struct obj_cgroup *root_obj_cgroup; enum page_memcg_data_flags { /* page->memcg_data is a pointer to an slabobj_ext vector */ @@ -551,7 +552,7 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) static inline bool obj_cgroup_is_root(const struct obj_cgroup *objcg) { - return objcg == root_obj_cgroup; + return objcg->is_root; } static inline bool mem_cgroup_disabled(void) diff --git a/include/linux/sched.h b/include/linux/sched.h index 5a5d3dbc9cdf..0d27775546f8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1533,7 +1533,7 @@ struct task_struct { /* Used by memcontrol for targeted memcg charge: */ struct mem_cgroup *active_memcg; - /* Cache for current->cgroups->memcg->objcg lookups: */ + /* Cache for current->cgroups->memcg->nodeinfo[nid]->objcg lookups: */ struct obj_cgroup *objcg; #endif -- cgit v1.2.3 From f1cf8d2f36dc369688bbe61ce064fbd829dbc9e1 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 5 Mar 2026 19:52:50 +0800 Subject: mm: memcontrol: eliminate the problem of dying memory cgroup for LRU folios MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that everything is set up, switch folio->memcg_data pointers to objcgs, update the accessors, and execute reparenting on cgroup death. Finally, folio->memcg_data of LRU folios and kmem folios will always point to an object cgroup pointer. The folio->memcg_data of slab folios will point to an vector of object cgroups. Link: https://lore.kernel.org/80cb7af198dc6f2173fe616d1207a4c315ece141.1772711148.git.zhengqi.arch@bytedance.com Signed-off-by: Muchun Song Signed-off-by: Qi Zheng Acked-by: Shakeel Butt Cc: Allen Pais Cc: Axel Rasmussen Cc: Baoquan He Cc: Chengming Zhou Cc: Chen Ridong Cc: David Hildenbrand Cc: Hamza Mahfooz Cc: Harry Yoo Cc: Hugh Dickins Cc: Imran Khan Cc: Johannes Weiner Cc: Kamalesh Babulal Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Michal Koutný Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Suren Baghdasaryan Cc: Usama Arif Cc: Vlastimil Babka Cc: Wei Xu Cc: Yosry Ahmed Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 77 +++++++++++++++------------------------------- 1 file changed, 25 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3e836b56bfcb..086158969529 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -369,9 +369,6 @@ enum objext_flags { #define OBJEXTS_FLAGS_MASK (__NR_OBJEXTS_FLAGS - 1) #ifdef CONFIG_MEMCG - -static inline bool folio_memcg_kmem(struct folio *folio); - /* * After the initialization objcg->memcg is always pointing at * a valid memcg, but can be atomically swapped to the parent memcg. @@ -385,43 +382,19 @@ static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg) } /* - * __folio_memcg - Get the memory cgroup associated with a non-kmem folio - * @folio: Pointer to the folio. - * - * Returns a pointer to the memory cgroup associated with the folio, - * or NULL. This function assumes that the folio is known to have a - * proper memory cgroup pointer. It's not safe to call this function - * against some type of folios, e.g. slab folios or ex-slab folios or - * kmem folios. - */ -static inline struct mem_cgroup *__folio_memcg(struct folio *folio) -{ - unsigned long memcg_data = folio->memcg_data; - - VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); - VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio); - VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio); - - return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); -} - -/* - * __folio_objcg - get the object cgroup associated with a kmem folio. + * folio_objcg - get the object cgroup associated with a folio. * @folio: Pointer to the folio. * * Returns a pointer to the object cgroup associated with the folio, * or NULL. This function assumes that the folio is known to have a - * proper object cgroup pointer. It's not safe to call this function - * against some type of folios, e.g. slab folios or ex-slab folios or - * LRU folios. + * proper object cgroup pointer. */ -static inline struct obj_cgroup *__folio_objcg(struct folio *folio) +static inline struct obj_cgroup *folio_objcg(struct folio *folio) { unsigned long memcg_data = folio->memcg_data; VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio); - VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio); return (struct obj_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); } @@ -435,21 +408,30 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio) * proper memory cgroup pointer. It's not safe to call this function * against some type of folios, e.g. slab folios or ex-slab folios. * - * For a non-kmem folio any of the following ensures folio and memcg binding - * stability: + * For a folio any of the following ensures folio and objcg binding stability: * * - the folio lock * - LRU isolation * - exclusive reference * - * For a kmem folio a caller should hold an rcu read lock to protect memcg - * associated with a kmem folio from being released. + * Based on the stable binding of folio and objcg, for a folio any of the + * following ensures folio and memcg binding stability: + * + * - cgroup_mutex + * - the lruvec lock + * + * If the caller only want to ensure that the page counters of memcg are + * updated correctly, ensure that the binding stability of folio and objcg + * is sufficient. + * + * Note: The caller should hold an rcu read lock or cgroup_mutex to protect + * memcg associated with a folio from being released. */ static inline struct mem_cgroup *folio_memcg(struct folio *folio) { - if (folio_memcg_kmem(folio)) - return obj_cgroup_memcg(__folio_objcg(folio)); - return __folio_memcg(folio); + struct obj_cgroup *objcg = folio_objcg(folio); + + return objcg ? obj_cgroup_memcg(objcg) : NULL; } /* @@ -473,15 +455,10 @@ static inline bool folio_memcg_charged(struct folio *folio) * has an associated memory cgroup pointer or an object cgroups vector or * an object cgroup. * - * For a non-kmem folio any of the following ensures folio and memcg binding - * stability: + * The page and objcg or memcg binding rules can refer to folio_memcg(). * - * - the folio lock - * - LRU isolation - * - exclusive reference - * - * For a kmem folio a caller should hold an rcu read lock to protect memcg - * associated with a kmem folio from being released. + * A caller should hold an rcu read lock to protect memcg associated with a + * page from being released. */ static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) { @@ -490,18 +467,14 @@ static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) * for slabs, READ_ONCE() should be used here. */ unsigned long memcg_data = READ_ONCE(folio->memcg_data); + struct obj_cgroup *objcg; if (memcg_data & MEMCG_DATA_OBJEXTS) return NULL; - if (memcg_data & MEMCG_DATA_KMEM) { - struct obj_cgroup *objcg; - - objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK); - return obj_cgroup_memcg(objcg); - } + objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK); - return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); + return objcg ? obj_cgroup_memcg(objcg) : NULL; } static inline struct mem_cgroup *page_memcg_check(struct page *page) -- cgit v1.2.3 From 0a98e13963424d7f1f50211c692f46a3b1e8d03f Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 5 Mar 2026 19:52:51 +0800 Subject: mm: lru: add VM_WARN_ON_ONCE_FOLIO to lru maintenance helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We must ensure the folio is deleted from or added to the correct lruvec list. So, add VM_WARN_ON_ONCE_FOLIO() to catch invalid users. The VM_BUG_ON_PAGE() in move_pages_to_lru() can be removed as add_page_to_lru_list() will perform the necessary check. Link: https://lore.kernel.org/2c90fc006d9d730331a3caeef96f7e5dabe2036d.1772711148.git.zhengqi.arch@bytedance.com Signed-off-by: Muchun Song Signed-off-by: Qi Zheng Acked-by: Roman Gushchin Acked-by: Johannes Weiner Acked-by: Shakeel Butt Cc: Allen Pais Cc: Axel Rasmussen Cc: Baoquan He Cc: Chengming Zhou Cc: Chen Ridong Cc: David Hildenbrand Cc: Hamza Mahfooz Cc: Harry Yoo Cc: Hugh Dickins Cc: Imran Khan Cc: Kamalesh Babulal Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Michal Koutný Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Suren Baghdasaryan Cc: Usama Arif Cc: Vlastimil Babka Cc: Wei Xu Cc: Yosry Ahmed Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 7fc2ced00f8f..a171070e15f0 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -348,6 +348,8 @@ void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); + VM_WARN_ON_ONCE_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); + if (lru_gen_add_folio(lruvec, folio, false)) return; @@ -362,6 +364,8 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); + VM_WARN_ON_ONCE_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); + if (lru_gen_add_folio(lruvec, folio, true)) return; @@ -376,6 +380,8 @@ void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); + VM_WARN_ON_ONCE_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); + if (lru_gen_del_folio(lruvec, folio, false)) return; -- cgit v1.2.3 From 1c514a2c6e4c3bf2016a1dbbddc36d19fdf52ce5 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 27 Mar 2026 18:16:30 +0800 Subject: mm: memcontrol: correct the nr_pages parameter type of mem_cgroup_update_lru_size() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The nr_pages parameter of mem_cgroup_update_lru_size() represents a page count. During the reparenting of LRU folios, the value passed to it can potentially exceed the maximum value of a 32-bit integer. It should be declared as long instead of int to match the types used in lruvec size accounting and to prevent possible overflow. Update the parameter type to long to ensure correctness. Link: https://lore.kernel.org/fd4140de44fa0a3978e4e2426731187fe8625f0b.1774604356.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Harry Yoo (Oracle) Cc: Allen Pais Cc: Axel Rasmussen Cc: Baoquan He Cc: David Hildenbrand Cc: Hamza Mahfooz Cc: Hugh Dickins Cc: Imran Khan Cc: Johannes Weiner Cc: Kamalesh Babulal Cc: Lance Yang Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Usama Arif Cc: Wei Xu Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 086158969529..dc3fa687759b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -878,7 +878,7 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg) } void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, - int zid, int nr_pages); + int zid, long nr_pages); static inline unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, -- cgit v1.2.3 From d9e4142e7635f6f7173854667c0695ce5b836bbc Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 16 Mar 2026 04:54:31 -0700 Subject: kho: add size parameter to kho_add_subtree() Patch series "kho: history: track previous kernel version and kexec boot count", v9. Use Kexec Handover (KHO) to pass the previous kernel's version string and the number of kexec reboots since the last cold boot to the next kernel, and print it at boot time. Example ======= [ 0.000000] Linux version 6.19.0-rc3-upstream-00047-ge5d992347849 ... [ 0.000000] KHO: exec from: 6.19.0-rc4-next-20260107upstream-00004-g3071b0dc4498 (count 1) Motivation ========== Bugs that only reproduce when kexecing from specific kernel versions are difficult to diagnose. These issues occur when a buggy kernel kexecs into a new kernel, with the bug manifesting only in the second kernel. Recent examples include: * eb2266312507 ("x86/boot: Fix page table access in 5-level to 4-level paging transition") * 77d48d39e991 ("efistub/tpm: Use ACPI reclaim memory for event log to avoid corruption") * 64b45dd46e15 ("x86/efi: skip memattr table on kexec boot") As kexec-based reboots become more common, these version-dependent bugs are appearing more frequently. At scale, correlating crashes to the previous kernel version is challenging, especially when issues only occur in specific transition scenarios. Some bugs manifest only after multiple consecutive kexec reboots. Tracking the kexec count helps identify these cases (this metric is already used by live update sub-system). KHO provides a reliable mechanism to pass information between kernels. By carrying the previous kernel's release string and kexec count forward, we can print this context at boot time to aid debugging. The goal of this feature is to have this information being printed in early boot, so, users can trace back kernel releases in kexec. Systemd is not helpful because we cannot assume that the previous kernel has systemd or even write access to the disk (common when using Linux as bootloaders) This patch (of 6): kho_add_subtree() assumes the fdt argument is always an FDT and calls fdt_totalsize() on it in the debugfs code path. This assumption will break if a caller passes arbitrary data instead of an FDT. When CONFIG_KEXEC_HANDOVER_DEBUGFS is enabled, kho_debugfs_fdt_add() calls __kho_debugfs_fdt_add(), which executes: f->wrapper.size = fdt_totalsize(fdt); Fix this by adding an explicit size parameter to kho_add_subtree() so callers specify the blob size. This allows subtrees to contain arbitrary data formats, not just FDTs. Update all callers: - memblock.c: use fdt_totalsize(fdt) - luo_core.c: use fdt_totalsize(fdt_out) - test_kho.c: use fdt_totalsize() - kexec_handover.c (root fdt): use fdt_totalsize(kho_out.fdt) Also update __kho_debugfs_fdt_add() to receive the size explicitly instead of computing it internally via fdt_totalsize(). In kho_in_debugfs_init(), pass fdt_totalsize() for the root FDT and sub-blobs since all current users are FDTs. A subsequent patch will persist the size in the KHO FDT so the incoming side can handle non-FDT blobs correctly. Link: https://lore.kernel.org/20260323110747.193569-1-duanchenghao@kylinos.cn Link: https://lore.kernel.org/20260316-kho-v9-1-ed6dcd951988@debian.org Signed-off-by: Breno Leitao Suggested-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: David Hildenbrand Cc: Jonathan Corbet Cc: "Liam R. Howlett" Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Pasha Tatashin Cc: SeongJae Park Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kexec_handover.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index ac4129d1d741..abb1d324f42d 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -32,7 +32,7 @@ void kho_restore_free(void *mem); struct folio *kho_restore_folio(phys_addr_t phys); struct page *kho_restore_pages(phys_addr_t phys, unsigned long nr_pages); void *kho_restore_vmalloc(const struct kho_vmalloc *preservation); -int kho_add_subtree(const char *name, void *fdt); +int kho_add_subtree(const char *name, void *fdt, size_t size); void kho_remove_subtree(void *fdt); int kho_retrieve_subtree(const char *name, phys_addr_t *phys); @@ -97,7 +97,7 @@ static inline void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) return NULL; } -static inline int kho_add_subtree(const char *name, void *fdt) +static inline int kho_add_subtree(const char *name, void *fdt, size_t size) { return -EOPNOTSUPP; } -- cgit v1.2.3 From 4916ae386760ad666eafa8afc075957bf479afbc Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 16 Mar 2026 04:54:32 -0700 Subject: kho: rename fdt parameter to blob in kho_add/remove_subtree() Since kho_add_subtree() now accepts arbitrary data blobs (not just FDTs), rename the parameter from 'fdt' to 'blob' to better reflect its purpose. Apply the same rename to kho_remove_subtree() for consistency. Also rename kho_debugfs_fdt_add() and kho_debugfs_fdt_remove() to kho_debugfs_blob_add() and kho_debugfs_blob_remove() respectively, with the same parameter rename from 'fdt' to 'blob'. Link: https://lore.kernel.org/20260316-kho-v9-2-ed6dcd951988@debian.org Signed-off-by: Breno Leitao Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: David Hildenbrand Cc: Jonathan Corbet Cc: "Liam R. Howlett" Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Pasha Tatashin Cc: SeongJae Park Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kexec_handover.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index abb1d324f42d..0666cf298c7f 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -32,8 +32,8 @@ void kho_restore_free(void *mem); struct folio *kho_restore_folio(phys_addr_t phys); struct page *kho_restore_pages(phys_addr_t phys, unsigned long nr_pages); void *kho_restore_vmalloc(const struct kho_vmalloc *preservation); -int kho_add_subtree(const char *name, void *fdt, size_t size); -void kho_remove_subtree(void *fdt); +int kho_add_subtree(const char *name, void *blob, size_t size); +void kho_remove_subtree(void *blob); int kho_retrieve_subtree(const char *name, phys_addr_t *phys); void kho_memory_init(void); @@ -97,12 +97,12 @@ static inline void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) return NULL; } -static inline int kho_add_subtree(const char *name, void *fdt, size_t size) +static inline int kho_add_subtree(const char *name, void *blob, size_t size) { return -EOPNOTSUPP; } -static inline void kho_remove_subtree(void *fdt) { } +static inline void kho_remove_subtree(void *blob) { } static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys) { -- cgit v1.2.3 From 85e41392820fcf0f7a3f9784cea907905f921358 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 16 Mar 2026 04:54:33 -0700 Subject: kho: persist blob size in KHO FDT kho_add_subtree() accepts a size parameter but only forwards it to debugfs. The size is not persisted in the KHO FDT, so it is lost across kexec. This makes it impossible for the incoming kernel to determine the blob size without understanding the blob format. Store the blob size as a "blob-size" property in the KHO FDT alongside the "preserved-data" physical address. This allows the receiving kernel to recover the size for any blob regardless of format. Also extend kho_retrieve_subtree() with an optional size output parameter so callers can learn the blob size without needing to understand the blob format. Update all callers to pass NULL for the new parameter. Link: https://lore.kernel.org/20260316-kho-v9-3-ed6dcd951988@debian.org Signed-off-by: Breno Leitao Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: David Hildenbrand Cc: Jonathan Corbet Cc: "Liam R. Howlett" Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Pasha Tatashin Cc: SeongJae Park Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kexec_handover.h | 5 +++-- include/linux/kho/abi/kexec_handover.h | 20 ++++++++++++++++---- 2 files changed, 19 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index 0666cf298c7f..8968c56d2d73 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -34,7 +34,7 @@ struct page *kho_restore_pages(phys_addr_t phys, unsigned long nr_pages); void *kho_restore_vmalloc(const struct kho_vmalloc *preservation); int kho_add_subtree(const char *name, void *blob, size_t size); void kho_remove_subtree(void *blob); -int kho_retrieve_subtree(const char *name, phys_addr_t *phys); +int kho_retrieve_subtree(const char *name, phys_addr_t *phys, size_t *size); void kho_memory_init(void); @@ -104,7 +104,8 @@ static inline int kho_add_subtree(const char *name, void *blob, size_t size) static inline void kho_remove_subtree(void *blob) { } -static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys) +static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys, + size_t *size) { return -EOPNOTSUPP; } diff --git a/include/linux/kho/abi/kexec_handover.h b/include/linux/kho/abi/kexec_handover.h index 6b7d8ef550f9..7e847a2339b0 100644 --- a/include/linux/kho/abi/kexec_handover.h +++ b/include/linux/kho/abi/kexec_handover.h @@ -41,25 +41,28 @@ * restore the preserved data.:: * * / { - * compatible = "kho-v2"; + * compatible = "kho-v3"; * * preserved-memory-map = <0x...>; * * { * preserved-data = <0x...>; + * blob-size = <0x...>; * }; * * { * preserved-data = <0x...>; + * blob-size = <0x...>; * }; * ... ... * { * preserved-data = <0x...>; + * blob-size = <0x...>; * }; * }; * * Root KHO Node (/): - * - compatible: "kho-v2" + * - compatible: "kho-v3" * * Indentifies the overall KHO ABI version. * @@ -78,16 +81,25 @@ * * Physical address pointing to a subnode data blob that is also * being preserved. + * + * - blob-size: u64 + * + * Size in bytes of the preserved data blob. This is needed because + * blobs may use arbitrary formats (not just FDT), so the size + * cannot be determined from the blob content alone. */ /* The compatible string for the KHO FDT root node. */ -#define KHO_FDT_COMPATIBLE "kho-v2" +#define KHO_FDT_COMPATIBLE "kho-v3" /* The FDT property for the preserved memory map. */ #define KHO_FDT_MEMORY_MAP_PROP_NAME "preserved-memory-map" /* The FDT property for preserved data blobs. */ -#define KHO_FDT_SUB_TREE_PROP_NAME "preserved-data" +#define KHO_SUB_TREE_PROP_NAME "preserved-data" + +/* The FDT property for the size of preserved data blobs. */ +#define KHO_SUB_TREE_SIZE_PROP_NAME "blob-size" /** * DOC: Kexec Handover ABI for vmalloc Preservation -- cgit v1.2.3 From 76aa46b9e4049247858309c6e3527d477da2b2fe Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 16 Mar 2026 04:54:35 -0700 Subject: kho: kexec-metadata: track previous kernel chain Use Kexec Handover (KHO) to pass the previous kernel's version string and the number of kexec reboots since the last cold boot to the next kernel, and print it at boot time. Example output: [ 0.000000] KHO: exec from: 6.19.0-rc4-next-20260107 (count 1) Motivation ========== Bugs that only reproduce when kexecing from specific kernel versions are difficult to diagnose. These issues occur when a buggy kernel kexecs into a new kernel, with the bug manifesting only in the second kernel. Recent examples include the following commits: * commit eb2266312507 ("x86/boot: Fix page table access in 5-level to 4-level paging transition") * commit 77d48d39e991 ("efistub/tpm: Use ACPI reclaim memory for event log to avoid corruption") * commit 64b45dd46e15 ("x86/efi: skip memattr table on kexec boot") As kexec-based reboots become more common, these version-dependent bugs are appearing more frequently. At scale, correlating crashes to the previous kernel version is challenging, especially when issues only occur in specific transition scenarios. Implementation ============== The kexec metadata is stored as a plain C struct (struct kho_kexec_metadata) rather than FDT format, for simplicity and direct field access. It is registered via kho_add_subtree() as a separate subtree, keeping it independent from the core KHO ABI. This design choice: - Keeps the core KHO ABI minimal and stable - Allows the metadata format to evolve independently - Avoids requiring version bumps for all KHO consumers (LUO, etc.) when the metadata format changes The struct kho_kexec_metadata contains two fields: - previous_release: The kernel version that initiated the kexec - kexec_count: Number of kexec boots since last cold boot On cold boot, kexec_count starts at 0 and increments with each kexec. The count helps identify issues that only manifest after multiple consecutive kexec reboots. [leitao@debian.org: call kho_kexec_metadata_init() for both boot paths] Link: https://lore.kernel.org/all/20260309-kho-v8-5-c3abcf4ac750@debian.org/ [1] Link: https://lore.kernel.org/20260409-kho_fix_merge_issue-v1-1-710c84ceaa85@debian.org Link: https://lore.kernel.org/20260316-kho-v9-5-ed6dcd951988@debian.org Signed-off-by: Breno Leitao Acked-by: SeongJae Park Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: David Hildenbrand Cc: Jonathan Corbet Cc: "Liam R. Howlett" Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Pasha Tatashin Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kho/abi/kexec_metadata.h | 46 ++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 include/linux/kho/abi/kexec_metadata.h (limited to 'include/linux') diff --git a/include/linux/kho/abi/kexec_metadata.h b/include/linux/kho/abi/kexec_metadata.h new file mode 100644 index 000000000000..e9e3f7e38a7c --- /dev/null +++ b/include/linux/kho/abi/kexec_metadata.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +/** + * DOC: Kexec Metadata ABI + * + * The "kexec-metadata" subtree stores optional metadata about the kexec chain. + * It is registered via kho_add_subtree(), keeping it independent from the core + * KHO ABI. This allows the metadata format to evolve without affecting other + * KHO consumers. + * + * The metadata is stored as a plain C struct rather than FDT format for + * simplicity and direct field access. + * + * Copyright (c) 2026 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2026 Breno Leitao + */ + +#ifndef _LINUX_KHO_ABI_KEXEC_METADATA_H +#define _LINUX_KHO_ABI_KEXEC_METADATA_H + +#include +#include + +#define KHO_KEXEC_METADATA_VERSION 1 + +/** + * struct kho_kexec_metadata - Kexec metadata passed between kernels + * @version: ABI version of this struct (must be first field) + * @previous_release: Kernel version string that initiated the kexec + * @kexec_count: Number of kexec boots since last cold boot + * + * This structure is preserved across kexec and allows the new kernel to + * identify which kernel it was booted from and how many kexec reboots + * have occurred. + * + * __NEW_UTS_LEN is part of uABI, so it safe to use it in here. + */ +struct kho_kexec_metadata { + u32 version; + char previous_release[__NEW_UTS_LEN + 1]; + u32 kexec_count; +} __packed; + +#define KHO_METADATA_NODE_NAME "kexec-metadata" + +#endif /* _LINUX_KHO_ABI_KEXEC_METADATA_H */ -- cgit v1.2.3 From 00d0b372374f2528394aabf7b1f53f8dafe294de Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 26 Mar 2026 16:39:41 +0000 Subject: liveupdate: prevent double management of files Patch series "liveupdate: prevent double preservation", v4. Currently, LUO does not prevent the same file from being managed twice across different active sessions. Because LUO preserves files of absolutely different types: memfd, and upcoming vfiofd [1], iommufd [2], guestmefd (and possible kvmfd/cpufd). There is no common private data or guarantee on how to prevent that the same file is not preserved twice beside using inode or some slower and expensive method like hashtables. This patch (of 4) Currently, LUO does not prevent the same file from being managed twice across different active sessions. Use a global xarray luo_preserved_files to keep track of file identifiers being preserved by LUO. Update luo_preserve_file() to check and insert the file identifier into this xarray when it is preserved, and erase it in luo_file_unpreserve_files() when it is released. To allow handlers to define what constitutes a "unique" file (e.g., different struct file objects pointing to the same hardware resource), add a get_id() callback to struct liveupdate_file_ops. If not provided, the default identifier is the struct file pointer itself. This ensures that the same file (or resource) cannot be managed by multiple sessions. If another session attempts to preserve an already managed file, it will now fail with -EBUSY. Link: https://lore.kernel.org/20260326163943.574070-1-pasha.tatashin@soleen.com Link: https://lore.kernel.org/20260326163943.574070-2-pasha.tatashin@soleen.com Link: https://lore.kernel.org/all/20260129212510.967611-1-dmatlack@google.com [1] Link: https://lore.kernel.org/all/20260203220948.2176157-1-skhawaja@google.com [2] Signed-off-by: Pasha Tatashin Reviewed-by: Samiullah Khawaja Reviewed-by: Mike Rapoport (Microsoft) Cc: David Matlack Cc: Pratyush Yadav Cc: Shuah Khan Cc: Christian Brauner Signed-off-by: Andrew Morton --- include/linux/liveupdate.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h index dd11fdc76a5f..61325ad26526 100644 --- a/include/linux/liveupdate.h +++ b/include/linux/liveupdate.h @@ -63,6 +63,7 @@ struct liveupdate_file_op_args { * finish, in order to do successful finish calls for all * resources in the session. * @finish: Required. Final cleanup in the new kernel. + * @get_id: Optional. Returns a unique identifier for the file. * @owner: Module reference * * All operations (except can_preserve) receive a pointer to a @@ -78,6 +79,7 @@ struct liveupdate_file_ops { int (*retrieve)(struct liveupdate_file_op_args *args); bool (*can_finish)(struct liveupdate_file_op_args *args); void (*finish)(struct liveupdate_file_op_args *args); + unsigned long (*get_id)(struct file *file); struct module *owner; }; -- cgit v1.2.3 From 6b2b22f7c8cf1596490beaac96a989cbafdfea57 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 27 Mar 2026 03:33:28 +0000 Subject: liveupdate: protect FLB lists with luo_register_rwlock Because liveupdate FLB objects will soon drop their persistent module references when registered, list traversals must be protected against concurrent module unloading. To provide this protection, utilize the global luo_register_rwlock. It protects the global registry of FLBs and the handler's specific list of FLB dependencies. Read locks are used during concurrent list traversals (e.g., during preservation and serialization). Write locks are taken during registration and unregistration. Link: https://lore.kernel.org/20260327033335.696621-5-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Pratyush Yadav (Google) Cc: David Matlack Cc: Mike Rapoport Cc: Samiullah Khawaja Signed-off-by: Andrew Morton --- include/linux/liveupdate.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h index 61325ad26526..9c761d9bacf8 100644 --- a/include/linux/liveupdate.h +++ b/include/linux/liveupdate.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From 2ab7207e7ec6cd5af1912d9be5174f114633286b Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 27 Mar 2026 03:33:33 +0000 Subject: liveupdate: make unregister functions return void Change liveupdate_unregister_file_handler and liveupdate_unregister_flb to return void instead of an error code. This follows the design principle that unregistration during module unload should not fail, as the unload cannot be stopped at that point. Link: https://lore.kernel.org/20260327033335.696621-10-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Pratyush Yadav (Google) Cc: David Matlack Cc: Mike Rapoport Cc: Samiullah Khawaja Signed-off-by: Andrew Morton --- include/linux/liveupdate.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h index 9c761d9bacf8..30c5a39ff9e9 100644 --- a/include/linux/liveupdate.h +++ b/include/linux/liveupdate.h @@ -231,12 +231,12 @@ bool liveupdate_enabled(void); int liveupdate_reboot(void); int liveupdate_register_file_handler(struct liveupdate_file_handler *fh); -int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh); +void liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh); int liveupdate_register_flb(struct liveupdate_file_handler *fh, struct liveupdate_flb *flb); -int liveupdate_unregister_flb(struct liveupdate_file_handler *fh, - struct liveupdate_flb *flb); +void liveupdate_unregister_flb(struct liveupdate_file_handler *fh, + struct liveupdate_flb *flb); int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, void **objp); int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb, void **objp); @@ -258,9 +258,8 @@ static inline int liveupdate_register_file_handler(struct liveupdate_file_handle return -EOPNOTSUPP; } -static inline int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh) +static inline void liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh) { - return -EOPNOTSUPP; } static inline int liveupdate_register_flb(struct liveupdate_file_handler *fh, @@ -269,10 +268,9 @@ static inline int liveupdate_register_flb(struct liveupdate_file_handler *fh, return -EOPNOTSUPP; } -static inline int liveupdate_unregister_flb(struct liveupdate_file_handler *fh, - struct liveupdate_flb *flb) +static inline void liveupdate_unregister_flb(struct liveupdate_file_handler *fh, + struct liveupdate_flb *flb) { - return -EOPNOTSUPP; } static inline int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, -- cgit v1.2.3 From 6b1842775a460245e97d36d3a67d0cfba7c4ff79 Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Tue, 31 Mar 2026 16:13:12 +0800 Subject: mm/alloc_tag: clear codetag for pages allocated before page_ext initialization Due to initialization ordering, page_ext is allocated and initialized relatively late during boot. Some pages have already been allocated and freed before page_ext becomes available, leaving their codetag uninitialized. A clear example is in init_section_page_ext(): alloc_page_ext() calls kmemleak_alloc(). If the slab cache has no free objects, it falls back to the buddy allocator to allocate memory. However, at this point page_ext is not yet fully initialized, so these newly allocated pages have no codetag set. These pages may later be reclaimed by KASAN, which causes the warning to trigger when they are freed because their codetag ref is still empty. Use a global array to track pages allocated before page_ext is fully initialized. The array size is fixed at 8192 entries, and will emit a warning if this limit is exceeded. When page_ext initialization completes, set their codetag to empty to avoid warnings when they are freed later. This warning is only observed with CONFIG_MEM_ALLOC_PROFILING_DEBUG=Y and mem_profiling_compressed disabled: [ 9.582133] ------------[ cut here ]------------ [ 9.582137] alloc_tag was not set [ 9.582139] WARNING: ./include/linux/alloc_tag.h:164 at __pgalloc_tag_sub+0x40f/0x550, CPU#5: systemd/1 [ 9.582190] CPU: 5 UID: 0 PID: 1 Comm: systemd Not tainted 7.0.0-rc4 #1 PREEMPT(lazy) [ 9.582192] Hardware name: Red Hat KVM, BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 [ 9.582194] RIP: 0010:__pgalloc_tag_sub+0x40f/0x550 [ 9.582196] Code: 00 00 4c 29 e5 48 8b 05 1f 88 56 05 48 8d 4c ad 00 48 8d 2c c8 e9 87 fd ff ff 0f 0b 0f 0b e9 f3 fe ff ff 48 8d 3d 61 2f ed 03 <67> 48 0f b9 3a e9 b3 fd ff ff 0f 0b eb e4 e8 5e cd 14 02 4c 89 c7 [ 9.582197] RSP: 0018:ffffc9000001f940 EFLAGS: 00010246 [ 9.582200] RAX: dffffc0000000000 RBX: 1ffff92000003f2b RCX: 1ffff110200d806c [ 9.582201] RDX: ffff8881006c0360 RSI: 0000000000000004 RDI: ffffffff9bc7b460 [ 9.582202] RBP: 0000000000000000 R08: 0000000000000000 R09: fffffbfff3a62324 [ 9.582203] R10: ffffffff9d311923 R11: 0000000000000000 R12: ffffea0004001b00 [ 9.582204] R13: 0000000000002000 R14: ffffea0000000000 R15: ffff8881006c0360 [ 9.582206] FS: 00007ffbbcf2d940(0000) GS:ffff888450479000(0000) knlGS:0000000000000000 [ 9.582208] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 9.582210] CR2: 000055ee3aa260d0 CR3: 0000000148b67005 CR4: 0000000000770ef0 [ 9.582211] PKRU: 55555554 [ 9.582212] Call Trace: [ 9.582213] [ 9.582214] ? __pfx___pgalloc_tag_sub+0x10/0x10 [ 9.582216] ? check_bytes_and_report+0x68/0x140 [ 9.582219] __free_frozen_pages+0x2e4/0x1150 [ 9.582221] ? __free_slab+0xc2/0x2b0 [ 9.582224] qlist_free_all+0x4c/0xf0 [ 9.582227] kasan_quarantine_reduce+0x15d/0x180 [ 9.582229] __kasan_slab_alloc+0x69/0x90 [ 9.582232] kmem_cache_alloc_noprof+0x14a/0x500 [ 9.582234] do_getname+0x96/0x310 [ 9.582237] do_readlinkat+0x91/0x2f0 [ 9.582239] ? __pfx_do_readlinkat+0x10/0x10 [ 9.582240] ? get_random_bytes_user+0x1df/0x2c0 [ 9.582244] __x64_sys_readlinkat+0x96/0x100 [ 9.582246] do_syscall_64+0xce/0x650 [ 9.582250] ? __x64_sys_getrandom+0x13a/0x1e0 [ 9.582252] ? __pfx___x64_sys_getrandom+0x10/0x10 [ 9.582254] ? do_syscall_64+0x114/0x650 [ 9.582255] ? ksys_read+0xfc/0x1d0 [ 9.582258] ? __pfx_ksys_read+0x10/0x10 [ 9.582260] ? do_syscall_64+0x114/0x650 [ 9.582262] ? do_syscall_64+0x114/0x650 [ 9.582264] ? __pfx_fput_close_sync+0x10/0x10 [ 9.582266] ? file_close_fd_locked+0x178/0x2a0 [ 9.582268] ? __x64_sys_faccessat2+0x96/0x100 [ 9.582269] ? __x64_sys_close+0x7d/0xd0 [ 9.582271] ? do_syscall_64+0x114/0x650 [ 9.582273] ? do_syscall_64+0x114/0x650 [ 9.582275] ? clear_bhb_loop+0x50/0xa0 [ 9.582277] ? clear_bhb_loop+0x50/0xa0 [ 9.582279] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 9.582280] RIP: 0033:0x7ffbbda345ee [ 9.582282] Code: 0f 1f 40 00 48 8b 15 29 38 0d 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff c3 0f 1f 40 00 f3 0f 1e fa 49 89 ca b8 0b 01 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d fa 37 0d 00 f7 d8 64 89 01 48 [ 9.582284] RSP: 002b:00007ffe2ad8de58 EFLAGS: 00000202 ORIG_RAX: 000000000000010b [ 9.582286] RAX: ffffffffffffffda RBX: 000055ee3aa25570 RCX: 00007ffbbda345ee [ 9.582287] RDX: 000055ee3aa25570 RSI: 00007ffe2ad8dee0 RDI: 00000000ffffff9c [ 9.582288] RBP: 0000000000001000 R08: 0000000000000003 R09: 0000000000001001 [ 9.582289] R10: 0000000000001000 R11: 0000000000000202 R12: 0000000000000033 [ 9.582290] R13: 00007ffe2ad8dee0 R14: 00000000ffffff9c R15: 00007ffe2ad8deb0 [ 9.582292] [ 9.582293] ---[ end trace 0000000000000000 ]--- Link: https://lore.kernel.org/20260331081312.123719-1-hao.ge@linux.dev Fixes: dcfe378c81f72 ("lib: introduce support for page allocation tagging") Signed-off-by: Hao Ge Suggested-by: Suren Baghdasaryan Acked-by: Suren Baghdasaryan Cc: Kent Overstreet Cc: Signed-off-by: Andrew Morton --- include/linux/alloc_tag.h | 2 ++ include/linux/pgalloc_tag.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index d40ac39bfbe8..02de2ede560f 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -163,9 +163,11 @@ static inline void alloc_tag_sub_check(union codetag_ref *ref) { WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n"); } +void alloc_tag_add_early_pfn(unsigned long pfn); #else static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag) {} static inline void alloc_tag_sub_check(union codetag_ref *ref) {} +static inline void alloc_tag_add_early_pfn(unsigned long pfn) {} #endif /* Caller should verify both ref and tag to be valid */ diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 38a82d65e58e..951d33362268 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -181,7 +181,7 @@ static inline struct alloc_tag *__pgalloc_tag_get(struct page *page) if (get_page_tag_ref(page, &ref, &handle)) { alloc_tag_sub_check(&ref); - if (ref.ct) + if (ref.ct && !is_codetag_empty(&ref)) tag = ct_to_alloc_tag(ref.ct); put_page_tag_ref(handle); } -- cgit v1.2.3 From 55da81663b9642dd046b26dd6f1baddbcf337c1e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 27 Mar 2026 16:33:14 -0700 Subject: mm/damon/core: fix damon_call() vs kdamond_fn() exit race Patch series "mm/damon/core: fix damon_call()/damos_walk() vs kdmond exit race". damon_call() and damos_walk() can leak memory and/or deadlock when they race with kdamond terminations. Fix those. This patch (of 2); When kdamond_fn() main loop is finished, the function cancels all remaining damon_call() requests and unset the damon_ctx->kdamond so that API callers and API functions themselves can know the context is terminated. damon_call() adds the caller's request to the queue first. After that, it shows if the kdamond of the damon_ctx is still running (damon_ctx->kdamond is set). Only if the kdamond is running, damon_call() starts waiting for the kdamond's handling of the newly added request. The damon_call() requests registration and damon_ctx->kdamond unset are protected by different mutexes, though. Hence, damon_call() could race with damon_ctx->kdamond unset, and result in deadlocks. For example, let's suppose kdamond successfully finished the damon_call() requests cancelling. Right after that, damon_call() is called for the context. It registers the new request, and shows the context is still running, because damon_ctx->kdamond unset is not yet done. Hence the damon_call() caller starts waiting for the handling of the request. However, the kdamond is already on the termination steps, so it never handles the new request. As a result, the damon_call() caller threads infinitely waits. Fix this by introducing another damon_ctx field, namely call_controls_obsolete. It is protected by the damon_ctx->call_controls_lock, which protects damon_call() requests registration. Initialize (unset) it in kdamond_fn() before letting damon_start() returns and set it just before the cancelling of remaining damon_call() requests is executed. damon_call() reads the obsolete field under the lock and avoids adding a new request. After this change, only requests that are guaranteed to be handled or cancelled are registered. Hence the after-registration DAMON context termination check is no longer needed. Remove it together. Note that the deadlock will not happen when damon_call() is called for repeat mode request. In tis case, damon_call() returns instead of waiting for the handling when the request registration succeeds and it shows the kdamond is running. However, if the request also has dealloc_on_cancel, the request memory would be leaked. The issue is found by sashiko [1]. Link: https://lore.kernel.org/20260327233319.3528-1-sj@kernel.org Link: https://lore.kernel.org/20260327233319.3528-2-sj@kernel.org Link: https://lore.kernel.org/20260325141956.87144-1-sj@kernel.org [1] Fixes: 42b7491af14c ("mm/damon/core: introduce damon_call()") Signed-off-by: SeongJae Park Cc: # 6.14.x Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index d9a3babbafc1..5129de70e7b7 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -818,6 +818,7 @@ struct damon_ctx { /* lists of &struct damon_call_control */ struct list_head call_controls; + bool call_controls_obsolete; struct mutex call_controls_lock; struct damos_walk_control *walk_control; -- cgit v1.2.3 From 33c3f6c2b48cd84b441dba1ee3e62290e53930f4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 27 Mar 2026 16:33:15 -0700 Subject: mm/damon/core: fix damos_walk() vs kdamond_fn() exit race When kdamond_fn() main loop is finished, the function cancels remaining damos_walk() request and unset the damon_ctx->kdamond so that API callers and API functions themselves can show the context is terminated. damos_walk() adds the caller's request to the queue first. After that, it shows if the kdamond of the damon_ctx is still running (damon_ctx->kdamond is set). Only if the kdamond is running, damos_walk() starts waiting for the kdamond's handling of the newly added request. The damos_walk() requests registration and damon_ctx->kdamond unset are protected by different mutexes, though. Hence, damos_walk() could race with damon_ctx->kdamond unset, and result in deadlocks. For example, let's suppose kdamond successfully finished the damow_walk() request cancelling. Right after that, damos_walk() is called for the context. It registers the new request, and shows the context is still running, because damon_ctx->kdamond unset is not yet done. Hence the damos_walk() caller starts waiting for the handling of the request. However, the kdamond is already on the termination steps, so it never handles the new request. As a result, the damos_walk() caller thread infinitely waits. Fix this by introducing another damon_ctx field, namely walk_control_obsolete. It is protected by the damon_ctx->walk_control_lock, which protects damos_walk() request registration. Initialize (unset) it in kdamond_fn() before letting damon_start() returns and set it just before the cancelling of the remaining damos_walk() request is executed. damos_walk() reads the obsolete field under the lock and avoids adding a new request. After this change, only requests that are guaranteed to be handled or cancelled are registered. Hence the after-registration DAMON context termination check is no longer needed. Remove it together. The issue is found by sashiko [1]. Link: https://lore.kernel.org/20260327233319.3528-3-sj@kernel.org Link: https://lore.kernel.org/20260325141956.87144-1-sj@kernel.org [1] Fixes: bf0eaba0ff9c ("mm/damon/core: implement damos_walk()") Signed-off-by: SeongJae Park Cc: # 6.14.x Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 5129de70e7b7..f2cdb7c3f5e6 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -822,6 +822,7 @@ struct damon_ctx { struct mutex call_controls_lock; struct damos_walk_control *walk_control; + bool walk_control_obsolete; struct mutex walk_control_lock; /* -- cgit v1.2.3 From a5bb8669872b6b8463b8777a7a259a8305060016 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 2 Apr 2026 07:11:47 +0300 Subject: userfaultfd: move vma_can_userfault out of line vma_can_userfault() has grown pretty big and it's not called on performance critical path. Move it out of line. No functional changes. Link: https://lore.kernel.org/20260402041156.1377214-7-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: David Hildenbrand (Red Hat) Reviewed-by: Liam R. Howlett Cc: Andrea Arcangeli Cc: Andrei Vagin Cc: Axel Rasmussen Cc: Baolin Wang Cc: Harry Yoo Cc: Harry Yoo (Oracle) Cc: Hugh Dickins Cc: James Houghton Cc: Lorenzo Stoakes (Oracle) Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Nikita Kalyazin Cc: Oscar Salvador Cc: Paolo Bonzini Cc: Peter Xu Cc: Sean Christopherson Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: David Carlier Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 35 ++--------------------------------- 1 file changed, 2 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index d83e349900a3..ce0201c3dd82 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -211,39 +211,8 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) return vma->vm_flags & __VM_UFFD_FLAGS; } -static inline bool vma_can_userfault(struct vm_area_struct *vma, - vm_flags_t vm_flags, - bool wp_async) -{ - vm_flags &= __VM_UFFD_FLAGS; - - if (vma->vm_flags & VM_DROPPABLE) - return false; - - if ((vm_flags & VM_UFFD_MINOR) && - (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))) - return false; - - /* - * If wp async enabled, and WP is the only mode enabled, allow any - * memory type. - */ - if (wp_async && (vm_flags == VM_UFFD_WP)) - return true; - - /* - * If user requested uffd-wp but not enabled pte markers for - * uffd-wp, then shmem & hugetlbfs are not supported but only - * anonymous. - */ - if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP) && - !vma_is_anonymous(vma)) - return false; - - /* By default, allow any of anon|shmem|hugetlb */ - return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || - vma_is_shmem(vma); -} +bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags, + bool wp_async); static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma) { -- cgit v1.2.3 From 0f48947c4232c934885711dde0b49066f9d8ee87 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 2 Apr 2026 07:11:48 +0300 Subject: userfaultfd: introduce vm_uffd_ops Current userfaultfd implementation works only with memory managed by core MM: anonymous, shmem and hugetlb. First, there is no fundamental reason to limit userfaultfd support only to the core memory types and userfaults can be handled similarly to regular page faults provided a VMA owner implements appropriate callbacks. Second, historically various code paths were conditioned on vma_is_anonymous(), vma_is_shmem() and is_vm_hugetlb_page() and some of these conditions can be expressed as operations implemented by a particular memory type. Introduce vm_uffd_ops extension to vm_operations_struct that will delegate memory type specific operations to a VMA owner. Operations for anonymous memory are handled internally in userfaultfd using anon_uffd_ops that implicitly assigned to anonymous VMAs. Start with a single operation, ->can_userfault() that will verify that a VMA meets requirements for userfaultfd support at registration time. Implement that method for anonymous, shmem and hugetlb and move relevant parts of vma_can_userfault() into the new callbacks. [rppt@kernel.org: relocate VM_DROPPABLE test, per Tal] Link: https://lore.kernel.org/adffgfM5ANxtPIEF@kernel.org Link: https://lore.kernel.org/20260402041156.1377214-8-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Andrea Arcangeli Cc: Andrei Vagin Cc: Axel Rasmussen Cc: Baolin Wang Cc: David Hildenbrand (Arm) Cc: Harry Yoo Cc: Harry Yoo (Oracle) Cc: Hugh Dickins Cc: James Houghton Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Nikita Kalyazin Cc: Oscar Salvador Cc: Paolo Bonzini Cc: Peter Xu Cc: Sean Christopherson Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: David Carlier Cc: Tal Zussman Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +++++ include/linux/userfaultfd_k.h | 6 ++++++ 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8260e28205e9..633bbf9a184a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -758,6 +758,8 @@ struct vm_fault { */ }; +struct vm_uffd_ops; + /* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer @@ -865,6 +867,9 @@ struct vm_operations_struct { struct page *(*find_normal_page)(struct vm_area_struct *vma, unsigned long addr); #endif /* CONFIG_FIND_NORMAL_PAGE */ +#ifdef CONFIG_USERFAULTFD + const struct vm_uffd_ops *uffd_ops; +#endif }; #ifdef CONFIG_NUMA_BALANCING diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index ce0201c3dd82..6d445dbfe8ff 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -83,6 +83,12 @@ struct userfaultfd_ctx { extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); +/* VMA userfaultfd operations */ +struct vm_uffd_ops { + /* Checks if a VMA can support userfaultfd */ + bool (*can_userfault)(struct vm_area_struct *vma, vm_flags_t vm_flags); +}; + /* A combined operation mode + behavior flags. */ typedef unsigned int __bitwise uffd_flags_t; -- cgit v1.2.3 From dfc4d771820a171bd701d06252fcf920d0ede25c Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 2 Apr 2026 07:11:49 +0300 Subject: shmem, userfaultfd: use a VMA callback to handle UFFDIO_CONTINUE When userspace resolves a page fault in a shmem VMA with UFFDIO_CONTINUE it needs to get a folio that already exists in the pagecache backing that VMA. Instead of using shmem_get_folio() for that, add a get_folio_noalloc() method to 'struct vm_uffd_ops' that will return a folio if it exists in the VMA's pagecache at given pgoff. Implement get_folio_noalloc() method for shmem and slightly refactor userfaultfd's mfill_get_vma() and mfill_atomic_pte_continue() to support this new API. Link: https://lore.kernel.org/20260402041156.1377214-9-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: James Houghton Cc: Andrea Arcangeli Cc: Andrei Vagin Cc: Axel Rasmussen Cc: Baolin Wang Cc: David Hildenbrand (Arm) Cc: Harry Yoo Cc: Harry Yoo (Oracle) Cc: Hugh Dickins Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Nikita Kalyazin Cc: Oscar Salvador Cc: Paolo Bonzini Cc: Peter Xu Cc: Sean Christopherson Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: David Carlier Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 6d445dbfe8ff..4bda632dae88 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -87,6 +87,13 @@ extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); struct vm_uffd_ops { /* Checks if a VMA can support userfaultfd */ bool (*can_userfault)(struct vm_area_struct *vma, vm_flags_t vm_flags); + /* + * Called to resolve UFFDIO_CONTINUE request. + * Should return the folio found at pgoff in the VMA's pagecache if it + * exists or ERR_PTR otherwise. + * The returned folio is locked and with reference held. + */ + struct folio *(*get_folio_noalloc)(struct inode *inode, pgoff_t pgoff); }; /* A combined operation mode + behavior flags. */ -- cgit v1.2.3 From ad9ac3081332e955bc4b513018a1e0e86683bfb5 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 2 Apr 2026 07:11:50 +0300 Subject: userfaultfd: introduce vm_uffd_ops->alloc_folio() and use it to refactor mfill_atomic_pte_zeroed_folio() and mfill_atomic_pte_copy(). mfill_atomic_pte_zeroed_folio() and mfill_atomic_pte_copy() perform almost identical actions: * allocate a folio * update folio contents (either copy from userspace of fill with zeros) * update page tables with the new folio Split a __mfill_atomic_pte() helper that handles both cases and uses newly introduced vm_uffd_ops->alloc_folio() to allocate the folio. Pass the ops structure from the callers to __mfill_atomic_pte() to later allow using anon_uffd_ops for MAP_PRIVATE mappings of file-backed VMAs. Note, that the new ops method is called alloc_folio() rather than folio_alloc() to avoid clash with alloc_tag macro folio_alloc(). Link: https://lore.kernel.org/20260402041156.1377214-10-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: James Houghton Cc: Andrea Arcangeli Cc: Andrei Vagin Cc: Axel Rasmussen Cc: Baolin Wang Cc: David Hildenbrand (Arm) Cc: Harry Yoo Cc: Harry Yoo (Oracle) Cc: Hugh Dickins Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Nikita Kalyazin Cc: Oscar Salvador Cc: Paolo Bonzini Cc: Peter Xu Cc: Sean Christopherson Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: David Carlier Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 4bda632dae88..0f508c752741 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -94,6 +94,12 @@ struct vm_uffd_ops { * The returned folio is locked and with reference held. */ struct folio *(*get_folio_noalloc)(struct inode *inode, pgoff_t pgoff); + /* + * Called during resolution of UFFDIO_COPY request. + * Should allocate and return a folio or NULL if allocation fails. + */ + struct folio *(*alloc_folio)(struct vm_area_struct *vma, + unsigned long addr); }; /* A combined operation mode + behavior flags. */ -- cgit v1.2.3 From f74991b4e3836dd38f3adb41b146994b283942a1 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 2 Apr 2026 07:11:51 +0300 Subject: shmem, userfaultfd: implement shmem uffd operations using vm_uffd_ops Add filemap_add() and filemap_remove() methods to vm_uffd_ops and use them in __mfill_atomic_pte() to add shmem folios to page cache and remove them in case of error. Implement these methods in shmem along with vm_uffd_ops->alloc_folio() and drop shmem_mfill_atomic_pte(). Since userfaultfd now does not reference any functions from shmem, drop include if linux/shmem_fs.h from mm/userfaultfd.c mfill_atomic_install_pte() is not used anywhere outside of mm/userfaultfd, make it static. Link: https://lore.kernel.org/20260402041156.1377214-11-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: James Houghton Cc: Andrea Arcangeli Cc: Andrei Vagin Cc: Axel Rasmussen Cc: Baolin Wang Cc: David Hildenbrand (Arm) Cc: Harry Yoo Cc: Harry Yoo (Oracle) Cc: Hugh Dickins Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Nikita Kalyazin Cc: Oscar Salvador Cc: Paolo Bonzini Cc: Peter Xu Cc: Sean Christopherson Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: David Carlier Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 14 -------------- include/linux/userfaultfd_k.h | 19 ++++++++++++++----- 2 files changed, 14 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index a8273b32e041..1a345142af7d 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -221,20 +221,6 @@ static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof) extern bool shmem_charge(struct inode *inode, long pages); -#ifdef CONFIG_USERFAULTFD -#ifdef CONFIG_SHMEM -extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - uffd_flags_t flags, - struct folio **foliop); -#else /* !CONFIG_SHMEM */ -#define shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, \ - src_addr, flags, foliop) ({ BUG(); 0; }) -#endif /* CONFIG_SHMEM */ -#endif /* CONFIG_USERFAULTFD */ - /* * Used space is stored as unsigned 64-bit value in bytes but * quota core supports only signed 64-bit values so use that diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 0f508c752741..d2920f98ab86 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -100,6 +100,20 @@ struct vm_uffd_ops { */ struct folio *(*alloc_folio)(struct vm_area_struct *vma, unsigned long addr); + /* + * Called during resolution of UFFDIO_COPY request. + * Should only be called with a folio returned by alloc_folio() above. + * The folio will be set to locked. + * Returns 0 on success, error code on failure. + */ + int (*filemap_add)(struct folio *folio, struct vm_area_struct *vma, + unsigned long addr); + /* + * Called during resolution of UFFDIO_COPY request on the error + * handling path. + * Should revert the operation of ->filemap_add(). + */ + void (*filemap_remove)(struct folio *folio, struct vm_area_struct *vma); }; /* A combined operation mode + behavior flags. */ @@ -133,11 +147,6 @@ static inline uffd_flags_t uffd_flags_set_mode(uffd_flags_t flags, enum mfill_at /* Flags controlling behavior. These behavior changes are mode-independent. */ #define MFILL_ATOMIC_WP MFILL_ATOMIC_FLAG(0) -extern int mfill_atomic_install_pte(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, struct page *page, - bool newly_allocated, uffd_flags_t flags); - extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, uffd_flags_t flags); -- cgit v1.2.3 From 77c368f057e17b59b23899a1907ee9d4f4d7a532 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 2 Apr 2026 18:23:20 +0800 Subject: mm/sparse: fix comment for section map alignment The comment in mmzone.h currently details exhaustive per-architecture bit-width lists and explains alignment using min(PAGE_SHIFT, PFN_SECTION_SHIFT). Such details risk falling out of date over time and may inadvertently be left un-updated. We always expect a single section to cover full pages. Therefore, we can safely assume that PFN_SECTION_SHIFT is large enough to accommodate SECTION_MAP_LAST_BIT. We use BUILD_BUG_ON() to ensure this. Update the comment to accurately reflect this consensus, making it clear that we rely on a single section covering full pages. Link: https://lore.kernel.org/20260402102320.3617578-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: David Hildenbrand (Arm) Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Petr Tesarik Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 20f920dede65..07f501a62d67 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -2068,21 +2068,16 @@ static inline struct mem_section *__nr_to_section(unsigned long nr) extern size_t mem_section_usage_size(void); /* - * We use the lower bits of the mem_map pointer to store - * a little bit of information. The pointer is calculated - * as mem_map - section_nr_to_pfn(pnum). The result is - * aligned to the minimum alignment of the two values: - * 1. All mem_map arrays are page-aligned. - * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT - * lowest bits. PFN_SECTION_SHIFT is arch-specific - * (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the - * worst combination is powerpc with 256k pages, - * which results in PFN_SECTION_SHIFT equal 6. - * To sum it up, at least 6 bits are available on all architectures. - * However, we can exceed 6 bits on some other architectures except - * powerpc (e.g. 15 bits are available on x86_64, 13 bits are available - * with the worst case of 64K pages on arm64) if we make sure the - * exceeded bit is not applicable to powerpc. + * We use the lower bits of the mem_map pointer to store a little bit of + * information. The pointer is calculated as mem_map - section_nr_to_pfn(). + * The result is aligned to the minimum alignment of the two values: + * + * 1. All mem_map arrays are page-aligned. + * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT lowest bits. + * + * We always expect a single section to cover full pages. Therefore, + * we can safely assume that PFN_SECTION_SHIFT is large enough to + * accommodate SECTION_MAP_LAST_BIT. We use BUILD_BUG_ON() to ensure this. */ enum { SECTION_MARKED_PRESENT_BIT, -- cgit v1.2.3 From a068c4d42c035c63b26ff91c394e6dc2cb7dc5d0 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 13 Apr 2026 12:42:39 +0200 Subject: mailbox: update kdoc for struct mbox_controller Add field for missing lock around the hrtimer. Add 'Required' where the core checks for valid entries. Signed-off-by: Wolfram Sang Reviewed-by: Geert Uytterhoeven Signed-off-by: Jassi Brar --- include/linux/mailbox_controller.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mailbox_controller.h b/include/linux/mailbox_controller.h index a49ee687d4cf..dc93287a2a01 100644 --- a/include/linux/mailbox_controller.h +++ b/include/linux/mailbox_controller.h @@ -62,10 +62,10 @@ struct mbox_chan_ops { /** * struct mbox_controller - Controller of a class of communication channels - * @dev: Device backing this controller - * @ops: Operators that work on each communication chan - * @chans: Array of channels - * @num_chans: Number of channels in the 'chans' array. + * @dev: Device backing this controller. Required. + * @ops: Operators that work on each communication chan. Required. + * @chans: Array of channels. Required. + * @num_chans: Number of channels in the 'chans' array. Required. * @txdone_irq: Indicates if the controller can report to API when * the last transmitted data was read by the remote. * Eg, if it has some TX ACK irq. @@ -78,6 +78,7 @@ struct mbox_chan_ops { * @of_xlate: Controller driver specific mapping of channel via DT * @poll_hrt: API private. hrtimer used to poll for TXDONE on all * channels. + * @poll_hrt_lock: API private. Lock protecting access to poll_hrt. * @node: API private. To hook into list of controllers. */ struct mbox_controller { -- cgit v1.2.3 From 73bd1227787bfe73eea3d04c63a89cb55db9c23e Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 18 Apr 2026 09:41:21 +0800 Subject: rhashtable: Restore insecure_elasticity toggle Some users of rhashtable cannot handle insertion failures, and are happy to accept the consequences of a hash table that having very long chains. Restore the insecure_elasticity toggle for these users. In addition to disabling the chain length checks, this also removes the emergency resize that would otherwise occur when the hash table occupancy hits 100% (an async resize is still scheduled at 75%). Signed-off-by: Herbert Xu Signed-off-by: Tejun Heo --- include/linux/rhashtable-types.h | 2 ++ include/linux/rhashtable.h | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h index 015c8298bebc..72082428d6c6 100644 --- a/include/linux/rhashtable-types.h +++ b/include/linux/rhashtable-types.h @@ -49,6 +49,7 @@ typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg, * @head_offset: Offset of rhash_head in struct to be hashed * @max_size: Maximum size while expanding * @min_size: Minimum size while shrinking + * @insecure_elasticity: Set to true to disable chain length checks * @automatic_shrinking: Enable automatic shrinking of tables * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash) * @obj_hashfn: Function to hash object @@ -61,6 +62,7 @@ struct rhashtable_params { u16 head_offset; unsigned int max_size; u16 min_size; + bool insecure_elasticity; bool automatic_shrinking; rht_hashfn_t hashfn; rht_obj_hashfn_t obj_hashfn; diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 0480509a6339..7def3f0f556b 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -821,14 +821,15 @@ slow_path: goto out; } - if (elasticity <= 0) + if (elasticity <= 0 && !params.insecure_elasticity) goto slow_path; data = ERR_PTR(-E2BIG); if (unlikely(rht_grow_above_max(ht, tbl))) goto out_unlock; - if (unlikely(rht_grow_above_100(ht, tbl))) + if (unlikely(rht_grow_above_100(ht, tbl)) && + !params.insecure_elasticity) goto slow_path; /* Inserting at head of list makes unlocking free. */ -- cgit v1.2.3 From cc1ff87bce1ccd38410ab10960f576dcd17db679 Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Wed, 15 Apr 2026 10:24:51 +0800 Subject: pppoe: drop PFC frames RFC 2516 Section 7 states that Protocol Field Compression (PFC) is NOT RECOMMENDED for PPPoE. In practice, pppd does not support negotiating PFC for PPPoE sessions, and the current PPPoE driver assumes an uncompressed (2-byte) protocol field. However, the generic PPP layer function ppp_input() is not aware of the negotiation result, and still accepts PFC frames. If a peer with a broken implementation or an attacker sends a frame with a compressed (1-byte) protocol field, the subsequent PPP payload is shifted by one byte. This causes the network header to be 4-byte misaligned, which may trigger unaligned access exceptions on some architectures. To reduce the attack surface, drop PPPoE PFC frames. Introduce ppp_skb_is_compressed_proto() helper function to be used in both ppp_generic.c and pppoe.c to avoid open-coding. Fixes: 7fb1b8ca8fa1 ("ppp: Move PFC decompression to PPP generic layer") Signed-off-by: Qingfang Deng Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260415022456.141758-2-qingfang.deng@linux.dev Signed-off-by: Jakub Kicinski --- include/linux/ppp_defs.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ppp_defs.h b/include/linux/ppp_defs.h index b7e57fdbd413..b1d1f46d7d3b 100644 --- a/include/linux/ppp_defs.h +++ b/include/linux/ppp_defs.h @@ -8,6 +8,7 @@ #define _PPP_DEFS_H_ #include +#include #include #define PPP_FCS(fcs, c) crc_ccitt_byte(fcs, c) @@ -25,4 +26,19 @@ static inline bool ppp_proto_is_valid(u16 proto) return !!((proto & 0x0101) == 0x0001); } +/** + * ppp_skb_is_compressed_proto - checks if PPP protocol in a skb is compressed + * @skb: skb to check + * + * Check if the PPP protocol field is compressed (the least significant + * bit of the most significant octet is 1). skb->data must point to the PPP + * protocol header. + * + * Return: Whether the PPP protocol field is compressed. + */ +static inline bool ppp_skb_is_compressed_proto(const struct sk_buff *skb) +{ + return unlikely(skb->data[0] & 0x01); +} + #endif /* _PPP_DEFS_H_ */ -- cgit v1.2.3 From 4fe985292709eeb6a4653c71660f893e26c2f2dd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 20 Apr 2026 20:03:26 -1000 Subject: rhashtable: Bounce deferred worker kick through irq_work Inserts past 75% load call schedule_work(&ht->run_work) to kick an async resize. If a caller holds a raw spinlock (e.g. an insecure_elasticity user), schedule_work() under that lock records caller_lock -> pool->lock -> pi_lock -> rq->__lock A cycle forms if any of these locks is acquired in the reverse direction elsewhere. sched_ext, the only current insecure_elasticity user, hits this: it holds scx_sched_lock across rhashtable inserts of sub-schedulers, while scx_bypass() takes rq->__lock -> scx_sched_lock. Exercising the resize path produces: Chain exists of: &pool->lock --> &rq->__lock --> scx_sched_lock Bounce the kick from the insert paths through irq_work so schedule_work() runs from hard IRQ context with the caller's lock no longer held. rht_deferred_worker()'s self-rearm on error stays on schedule_work(&ht->run_work) - the worker runs in process context with no caller lock held, and keeping the self-requeue on @run_work lets cancel_work_sync() in rhashtable_free_and_destroy() drain it. v3: Keep rht_deferred_worker()'s self-rearm on schedule_work(&run_work). Routing it through irq_work in v2 broke cancel_work_sync()'s self-requeue handling - an irq_work queued after irq_work_sync() returned but while cancel_work_sync() was still waiting could fire post-teardown. v2: Bounce unconditionally instead of gating on insecure_elasticity, as suggested by Herbert. Signed-off-by: Tejun Heo Acked-by: Herbert Xu --- include/linux/rhashtable-types.h | 3 +++ include/linux/rhashtable.h | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h index 72082428d6c6..fc2f596a6df1 100644 --- a/include/linux/rhashtable-types.h +++ b/include/linux/rhashtable-types.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -77,6 +78,7 @@ struct rhashtable_params { * @p: Configuration parameters * @rhlist: True if this is an rhltable * @run_work: Deferred worker to expand/shrink asynchronously + * @run_irq_work: Bounces the @run_work kick through hard IRQ context. * @mutex: Mutex to protect current/future table swapping * @lock: Spin lock to protect walker list * @nelems: Number of elements in table @@ -88,6 +90,7 @@ struct rhashtable { struct rhashtable_params p; bool rhlist; struct work_struct run_work; + struct irq_work run_irq_work; struct mutex mutex; spinlock_t lock; atomic_t nelems; diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 7def3f0f556b..ef5230cece36 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -847,7 +848,7 @@ slow_path: rht_assign_unlock(tbl, bkt, obj, flags); if (rht_grow_above_75(ht, tbl)) - schedule_work(&ht->run_work); + irq_work_queue(&ht->run_irq_work); data = NULL; out: -- cgit v1.2.3 From f902877b635551513729bdf9a8d1422c4aab7741 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 15 Apr 2026 17:56:02 +0200 Subject: rculist: add list_splice_rcu() for private lists This patch adds a helper function, list_splice_rcu(), to safely splice a private (non-RCU-protected) list into an RCU-protected list. The function ensures that only the pointer visible to RCU readers (prev->next) is updated using rcu_assign_pointer(), while the rest of the list manipulations are performed with regular assignments, as the source list is private and not visible to concurrent RCU readers. This is useful for moving elements from a private list into a global RCU-protected list, ensuring safe publication for RCU readers. Subsystems with some sort of batching mechanism from userspace can benefit from this new function. The function __list_splice_rcu() has been added for clarity and to follow the same pattern as in the existing list_splice*() interfaces, where there is a check to ensure that the list to splice is not empty. Note that __list_splice_rcu() has no documentation for this reason. Reviewed-by: Paul E. McKenney Signed-off-by: Pablo Neira Ayuso --- include/linux/rculist.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 2abba7552605..e3bc44225692 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -261,6 +261,35 @@ static inline void list_replace_rcu(struct list_head *old, old->prev = LIST_POISON2; } +static inline void __list_splice_rcu(struct list_head *list, + struct list_head *prev, + struct list_head *next) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + + last->next = next; + first->prev = prev; + next->prev = last; + rcu_assign_pointer(list_next_rcu(prev), first); +} + +/** + * list_splice_rcu - splice a non-RCU list into an RCU-protected list, + * designed for stacks. + * @list: the non RCU-protected list to splice + * @head: the place in the existing RCU-protected list to splice + * + * The list pointed to by @head can be RCU-read traversed concurrently with + * this function. + */ +static inline void list_splice_rcu(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) + __list_splice_rcu(list, head, head->next); +} + /** * __list_splice_init_rcu - join an RCU-protected list into an existing list. * @list: the RCU-protected list to splice -- cgit v1.2.3 From db9e726525e45dbd713c07897a4d20bc18333ccc Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 16 Apr 2026 11:56:58 -0700 Subject: net: add address list snapshot and reconciliation infrastructure Introduce __hw_addr_list_snapshot() and __hw_addr_list_reconcile() for use by the upcoming ndo_set_rx_mode_async callback. The async rx_mode path needs to snapshot the device's unicast and multicast address lists under the addr_lock, hand those snapshots to the driver (which may sleep), and then propagate any sync_cnt changes back to the real lists. Two identical snapshots are taken: a work copy for the driver to pass to __hw_addr_sync_dev() and a reference copy to compute deltas against. __hw_addr_list_reconcile() walks the reference snapshot comparing each entry against the work snapshot to determine what the driver synced or unsynced. It then applies those deltas to the real list, handling concurrent modifications: - If the real entry was concurrently removed but the driver synced it to hardware (delta > 0), re-insert a stale entry so the next work run properly unsyncs it from hardware. - If the entry still exists, apply the delta normally. An entry whose refcount drops to zero is removed. # dev_addr_test_snapshot_benchmark: 1024 addrs x 1000 snapshots: 89872802 ns total, 89872 ns/iter # dev_addr_test_snapshot_benchmark.speed: slow Reviewed-by: Aleksandr Loktionov Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20260416185712.2155425-2-sdf@fomichev.me Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7969fcdd5ac4..a84c55488b8c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -5004,6 +5004,13 @@ void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list, int (*unsync)(struct net_device *, const unsigned char *)); void __hw_addr_init(struct netdev_hw_addr_list *list); +void __hw_addr_flush(struct netdev_hw_addr_list *list); +int __hw_addr_list_snapshot(struct netdev_hw_addr_list *snap, + const struct netdev_hw_addr_list *list, + int addr_len); +void __hw_addr_list_reconcile(struct netdev_hw_addr_list *real_list, + struct netdev_hw_addr_list *work, + struct netdev_hw_addr_list *ref, int addr_len); /* Functions used for device addresses handling */ void dev_addr_mod(struct net_device *dev, unsigned int offset, -- cgit v1.2.3 From 3554b4345d855089ab7af5e3557f5dc3262d14c9 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 16 Apr 2026 11:56:59 -0700 Subject: net: introduce ndo_set_rx_mode_async and netdev_rx_mode_work Add ndo_set_rx_mode_async callback that drivers can implement instead of the legacy ndo_set_rx_mode. The legacy callback runs under the netif_addr_lock spinlock with BHs disabled, preventing drivers from sleeping. The async variant runs from a work queue with rtnl_lock and netdev_lock_ops held, in fully sleepable context. When __dev_set_rx_mode() sees ndo_set_rx_mode_async, it schedules netdev_rx_mode_work instead of calling the driver inline. The work function takes two snapshots of each address list (uc/mc) under the addr_lock, then drops the lock and calls the driver with the work copies. After the driver returns, it reconciles the snapshots back to the real lists under the lock. Add netif_rx_mode_sync() to opportunistically execute the pending workqueue update inline, so that rx mode changes are committed before returning to userspace: - dev_change_flags (SIOCSIFFLAGS / RTM_NEWLINK) - dev_set_promiscuity - dev_set_allmulti - dev_ifsioc SIOCADDMULTI / SIOCDELMULTI - do_setlink (RTM_SETLINK) Note that some deep hierarchies still do skip the lower updates via: - dev_uc_sync - dev_mc_sync If we do end up hitting user-visible issues, we can add more calls to netif_rx_mode_sync in specific places. But hopefully we should not, the actual user-visible lists are still synced, it's that just HW state that might be lagging. Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20260416185712.2155425-3-sdf@fomichev.me Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a84c55488b8c..6ed97f4c3bc6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1119,6 +1119,16 @@ struct netdev_net_notifier { * This function is called device changes address list filtering. * If driver handles unicast address filtering, it should set * IFF_UNICAST_FLT in its priv_flags. + * Cannot sleep, called with netif_addr_lock_bh held. + * Deprecated in favor of ndo_set_rx_mode_async. + * + * void (*ndo_set_rx_mode_async)(struct net_device *dev, + * struct netdev_hw_addr_list *uc, + * struct netdev_hw_addr_list *mc); + * Async version of ndo_set_rx_mode which runs in process context + * with rtnl_lock and netdev_lock_ops(dev) held. The uc/mc parameters + * are snapshots of the address lists - iterate with + * netdev_hw_addr_list_for_each(ha, uc). * * int (*ndo_set_mac_address)(struct net_device *dev, void *addr); * This function is called when the Media Access Control address @@ -1439,6 +1449,10 @@ struct net_device_ops { void (*ndo_change_rx_flags)(struct net_device *dev, int flags); void (*ndo_set_rx_mode)(struct net_device *dev); + void (*ndo_set_rx_mode_async)( + struct net_device *dev, + struct netdev_hw_addr_list *uc, + struct netdev_hw_addr_list *mc); int (*ndo_set_mac_address)(struct net_device *dev, void *addr); int (*ndo_validate_addr)(struct net_device *dev); @@ -1903,6 +1917,8 @@ enum netdev_reg_state { * has been enabled due to the need to listen to * additional unicast addresses in a device that * does not implement ndo_set_rx_mode() + * @rx_mode_node: List entry for rx_mode work processing + * @rx_mode_tracker: Refcount tracker for rx_mode work * @uc: unicast mac addresses * @mc: multicast mac addresses * @dev_addrs: list of device hw addresses @@ -2294,6 +2310,8 @@ struct net_device { unsigned int promiscuity; unsigned int allmulti; bool uc_promisc; + struct list_head rx_mode_node; + netdevice_tracker rx_mode_tracker; #ifdef CONFIG_LOCKDEP unsigned char nested_level; #endif -- cgit v1.2.3 From a4c833278144917982510ca43a3438155756122a Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 16 Apr 2026 11:57:00 -0700 Subject: net: cache snapshot entries for ndo_set_rx_mode_async Add a per-device netdev_hw_addr_list cache (rx_mode_addr_cache) that allows __hw_addr_list_snapshot() and __hw_addr_list_reconcile() to reuse previously allocated entries instead of hitting GFP_ATOMIC on every snapshot cycle. snapshot pops entries from the cache when available, falling back to __hw_addr_create(). reconcile splices both snapshot lists back into the cache via __hw_addr_splice(). The cache is flushed in free_netdev(). Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20260416185712.2155425-4-sdf@fomichev.me Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6ed97f4c3bc6..97b435da5771 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1919,6 +1919,7 @@ enum netdev_reg_state { * does not implement ndo_set_rx_mode() * @rx_mode_node: List entry for rx_mode work processing * @rx_mode_tracker: Refcount tracker for rx_mode work + * @rx_mode_addr_cache: Recycled snapshot entries for rx_mode work * @uc: unicast mac addresses * @mc: multicast mac addresses * @dev_addrs: list of device hw addresses @@ -2312,6 +2313,7 @@ struct net_device { bool uc_promisc; struct list_head rx_mode_node; netdevice_tracker rx_mode_tracker; + struct netdev_hw_addr_list rx_mode_addr_cache; #ifdef CONFIG_LOCKDEP unsigned char nested_level; #endif @@ -5025,10 +5027,11 @@ void __hw_addr_init(struct netdev_hw_addr_list *list); void __hw_addr_flush(struct netdev_hw_addr_list *list); int __hw_addr_list_snapshot(struct netdev_hw_addr_list *snap, const struct netdev_hw_addr_list *list, - int addr_len); + int addr_len, struct netdev_hw_addr_list *cache); void __hw_addr_list_reconcile(struct netdev_hw_addr_list *real_list, struct netdev_hw_addr_list *work, - struct netdev_hw_addr_list *ref, int addr_len); + struct netdev_hw_addr_list *ref, int addr_len, + struct netdev_hw_addr_list *cache); /* Functions used for device addresses handling */ void dev_addr_mod(struct net_device *dev, unsigned int offset, -- cgit v1.2.3 From 922f8c28811f266fe5fc52a6d2852871e40ce098 Mon Sep 17 00:00:00 2001 From: Dewei Meng Date: Tue, 21 Apr 2026 10:58:08 +0800 Subject: spi: Fix the error description in the `ptp_sts_word_post` comment Based on the comment information, the description within the `ptp_sts_word_post` section should be changed to "See @ptp_sts_word_pre". Signed-off-by: Dewei Meng Link: https://patch.msgid.link/20260421025808.6572-1-mengdewei@cqsoftware.com.cn Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 7587b1c5d7ec..82682dd9961d 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -1019,7 +1019,7 @@ struct spi_res { * this value may have changed compared to what was requested, depending * on the available snapshotting resolution (DMA transfer, * @ptp_sts_supported is false, etc). - * @ptp_sts_word_post: See @ptp_sts_word_post. The two can be equal (meaning + * @ptp_sts_word_post: See @ptp_sts_word_pre. The two can be equal (meaning * that a single byte should be snapshotted). * If the core takes care of the timestamp (if @ptp_sts_supported is false * for this controller), it will set @ptp_sts_word_pre to 0, and -- cgit v1.2.3 From 256e5254efff48d6de97e314dc17d55504c55164 Mon Sep 17 00:00:00 2001 From: Kexin Sun Date: Tue, 24 Mar 2026 11:23:44 +0800 Subject: kgdb: update outdated references to kgdb_wait() The function kgdb_wait() was folded into the static function kgdb_cpu_enter() by commit 62fae312197a ("kgdb: eliminate kgdb_wait(), all cpus enter the same way"). Update the four stale references accordingly: - include/linux/kgdb.h and arch/x86/kernel/kgdb.c: the kgdb_roundup_cpus() kdoc describes what other CPUs are rounded up to call. Because kgdb_cpu_enter() is static, the correct public entry point is kgdb_handle_exception(); also fix a pre-existing grammar error ("get them be" -> "get them into") and reflow the text. - kernel/debug/debug_core.c: replace with the generic description "the debug trap handler", since the actual entry path is architecture-specific. - kernel/debug/gdbstub.c: kgdb_cpu_enter() is correct here (it describes internal state, not a call target); add the missing parentheses. Suggested-by: Daniel Thompson Assisted-by: unnamed:deepseek-v3.2 coccinelle Signed-off-by: Kexin Sun --- include/linux/kgdb.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 22b3f3839f30..6c46591a2eac 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -202,9 +202,10 @@ extern void kgdb_call_nmi_hook(void *ignored); * * On SMP systems, we need to get the attention of the other CPUs * and get them into a known state. This should do what is needed - * to get the other CPUs to call kgdb_wait(). Note that on some arches, - * the NMI approach is not used for rounding up all the CPUs. Normally - * those architectures can just not implement this and get the default. + * to get the other CPUs to call kgdb_handle_exception(). Note that + * on some arches, the NMI approach is not used for rounding up all + * the CPUs. Normally those architectures can just not implement + * this and get the default. * * On non-SMP systems, this is not called. */ -- cgit v1.2.3 From 6f1d4d2ecfcd1b577dc87350ea965fe81f272e83 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 22 Mar 2024 14:22:48 +0100 Subject: tpm: avoid -Wunused-but-set-variable Outside of the EFI tpm code, the TPM_MEMREMAP()/TPM_MEMUNMAP functions are defined as trivial macros, leading to the mapping_size variable ending up unused: In file included from drivers/char/tpm/tpm-sysfs.c:16: In file included from drivers/char/tpm/tpm.h:28: include/linux/tpm_eventlog.h:167:6: error: variable 'mapping_size' set but not used [-Werror,-Wunused-but-set-variable] 167 | int mapping_size; Turn the stubs into inline functions to avoid this warning. Cc: stable@vger.kernel.org # v5.3+ Fixes: c46f3405692d ("tpm: Reserve the TPM final events table") Signed-off-by: Arnd Bergmann Reviewed-by: Thorsten Blum Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- include/linux/tpm_eventlog.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h index 891368e82558..aff8ea2fa98e 100644 --- a/include/linux/tpm_eventlog.h +++ b/include/linux/tpm_eventlog.h @@ -131,11 +131,16 @@ struct tcg_algorithm_info { }; #ifndef TPM_MEMREMAP -#define TPM_MEMREMAP(start, size) NULL +static inline void *TPM_MEMREMAP(unsigned long start, size_t size) +{ + return NULL; +} #endif #ifndef TPM_MEMUNMAP -#define TPM_MEMUNMAP(start, size) do{} while(0) +static inline void TPM_MEMUNMAP(void *mapping, size_t size) +{ +} #endif /** -- cgit v1.2.3 From 5d3869a41f3608101c00ff9c9c7c2364c555fa65 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 13 Apr 2026 18:24:23 -0400 Subject: NFS: fix writeback in presence of errors After running xfstest generic/751, in certain conditions, can have a writeback IO stuck while experiencing one of the two patterns. Pattern#1: writeback IO experiences ENOSPC on an offset smaller than the filesize. Example, write offset=0 len=4096 how=unstable OK write offset=8192 len=4096 how=unstable OK write offset=12288 len=4096 how=unstable ENOSPC write offset=4096 len=4096 how=unstable ENOSPC client sends a commit and receives a verifier which is different from the last successful write. It marks pages dirty and writeback retries. But it again send writes unstable and gets into the same pattern, running into the ENOSPC error and sending a commit because writes were sent at unstable. Pattern#2: an unstable write followed by a short write and ENOSPC. write offset=0 len=4096 how=unstable OK write offset=4096 len=4096 how=unstable returns OK but count=100 write offset=4197 len=3996 how=stable returns ENOSPC client send a commit and receives a verifier different from the last unstable write. The same behaviour is retried in a loop. Instead, this patch proposes to identify those conditions and mark requests to be done synchronously instead. Previous solution tried to mark it in the nfs_page, however that's not persistent thus instead mark it in the nfs_open_context. Furthermore, the same problem occurs during localio code path so recognize that IO needs to be done sync in that case as well. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 8dd79a3f3d66..4623262da3c0 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -109,6 +109,7 @@ struct nfs_open_context { #define NFS_CONTEXT_BAD (2) #define NFS_CONTEXT_UNLOCK (3) #define NFS_CONTEXT_FILE_OPEN (4) +#define NFS_CONTEXT_WRITE_SYNC (5) struct nfs4_threshold *mdsthreshold; struct list_head list; -- cgit v1.2.3 From a6e23843e949081b417b6078f02074074a190499 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 10 Apr 2026 17:49:07 +0200 Subject: spi: fix controller cleanup() documentation The controller cleanup() callback is no longer called when releasing a device, but rather when deregistering it (and on registration failures). Fixes: c7299fea6769 ("spi: Fix spi device unregister flow") Cc: Saravana Kannan Signed-off-by: Johan Hovold Link: https://patch.msgid.link/20260410154907.129248-3-johan@kernel.org Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 7587b1c5d7ec..bbb5b870baeb 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -701,7 +701,7 @@ struct spi_controller { int (*transfer)(struct spi_device *spi, struct spi_message *mesg); - /* Called on release() to free memory provided by spi_controller */ + /* Called on deregistration to free memory provided by spi_controller */ void (*cleanup)(struct spi_device *spi); /* -- cgit v1.2.3 From 54377fcab51f6f1f8807827d3751be42279e1a6a Mon Sep 17 00:00:00 2001 From: KaFai Wan Date: Tue, 21 Apr 2026 23:58:02 +0800 Subject: bpf: Reject TCP_NODELAY in bpf-tcp-cc A BPF TCP congestion control program can call bpf_setsockopt() from its callbacks. In current kernels, if it calls bpf_setsockopt(TCP_NODELAY) from cwnd_event_tx_start(), the call can re-enter the TCP transmit path before the outer tcp_transmit_skb() has completed and advanced the send head. This can re-trigger CA_EVENT_TX_START and lead to unbounded recursion: tcp_transmit_skb() -> tcp_event_data_sent() -> tcp_ca_event(sk, CA_EVENT_TX_START) -> cwnd_event_tx_start() -> bpf_setsockopt(TCP_NODELAY) -> tcp_push_pending_frames() -> tcp_write_xmit() -> tcp_transmit_skb() This leads to unbounded recursion and can overflow the kernel stack. Reject TCP_NODELAY with -EOPNOTSUPP for bpf-tcp-cc by introducing a dedicated setsockopt proto for BPF_PROG_TYPE_STRUCT_OPS TCP congestion control programs. To keep it simple, all tcp-cc ops is rejected for TCP_NODELAY. Fixes: 7e41df5dbba2 ("bpf: Add a few optnames to bpf_setsockopt") Suggested-by: Martin KaFai Lau Signed-off-by: KaFai Wan Signed-off-by: Martin KaFai Lau Reviewed-by: Jiayuan Chen Link: https://patch.msgid.link/20260421155804.135786-3-kafai.wan@linux.dev --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b4b703c90ca9..01e203964892 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3725,6 +3725,7 @@ extern const struct bpf_func_proto bpf_for_each_map_elem_proto; extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; extern const struct bpf_func_proto bpf_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_sk_getsockopt_proto; +extern const struct bpf_func_proto bpf_sk_setsockopt_nodelay_proto; extern const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto; extern const struct bpf_func_proto bpf_find_vma_proto; -- cgit v1.2.3 From bd7b7ce96db4487bb77692a85ee4489fd2c395df Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Wed, 22 Apr 2026 12:06:36 -0700 Subject: nvme-auth: Hash DH shared secret to create session key The NVMe Base Specification 8.3.5.5.9 states that the session key Ks shall be computed from the ephemeral DH key by applying the hash function selected by the HashID parameter. The current implementation stores the raw DH shared secret as the session key without hashing it. This causes redundant hash operations: 1. Augmented challenge computation (section 8.3.5.5.4) requires Ca = HMAC(H(g^xy mod p), C). The code compensates by hashing the unhashed session key in nvme_auth_augmented_challenge() to produce the correct result. 2. PSK generation (section 8.3.5.5.9) requires PSK = HMAC(Ks, C1 || C2) where Ks should already be H(g^xy mod p). As the DH shared secret is always larger than the HMAC block size, HMAC internally hashes it before use, accidentally producing the correct result. When using secure channel concatenation with bidirectional authentication, this results in hashing the DH value three times: twice for augmented challenge calculations and once during PSK generation. Fix this by: - Modifying nvme_auth_gen_shared_secret() to hash the DH shared secret once after computation: Ks = H(g^xy mod p) - Removing the hash operation from nvme_auth_augmented_challenge() as the session key is now already hashed - Updating session key buffer size from DH key size to hash output size - Adding specification references in comments This avoid storing the raw DH shared secret and reduces the number of hash operations from three to one when using secure channel concatenation. Reviewed-by: Hannes Reinecke Reviewed-by: Eric Biggers Signed-off-by: Chris Leech Signed-off-by: Keith Busch --- include/linux/nvme-auth.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h index 184a1f9510fa..89902ae8b929 100644 --- a/include/linux/nvme-auth.h +++ b/include/linux/nvme-auth.h @@ -49,9 +49,9 @@ int nvme_auth_augmented_challenge(u8 hmac_id, const u8 *skey, size_t skey_len, int nvme_auth_gen_privkey(struct crypto_kpp *dh_tfm, u8 dh_gid); int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm, u8 *host_key, size_t host_key_len); -int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm, - const u8 *ctrl_key, size_t ctrl_key_len, - u8 *sess_key, size_t sess_key_len); +int nvme_auth_gen_session_key(struct crypto_kpp *dh_tfm, + const u8 *public_key, size_t public_key_len, + u8 *sess_key, size_t sess_key_len, u8 hash_id); int nvme_auth_generate_psk(u8 hmac_id, const u8 *skey, size_t skey_len, const u8 *c1, const u8 *c2, size_t hash_len, u8 **ret_psk, size_t *ret_len); -- cgit v1.2.3 From 0b13173d27fa15679463b62a10cfa8b3d6c3a71c Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 15 Apr 2026 13:41:01 +0800 Subject: dma-buf: fix stale @lock references in struct dma_buf documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kernel-doc comments for vmapping_counter and vmap_ptr in struct dma_buf reference "@lock" as the protecting lock, but struct dma_buf no longer has a "lock" member. The mutex was removed in favor of using the dma_resv lock exclusively. The implementation correctly uses dma_resv_assert_held(dmabuf->resv) in dma_buf_vmap() and dma_buf_vunmap(), so update the documentation to reference @resv instead. Signed-off-by: gaoxiang17 Reviewed-by: Christian König Signed-off-by: Christian König Link: https://lore.kernel.org/r/20260415054101.535520-1-gxxa03070307@gmail.com --- include/linux/dma-buf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 133b9e637b55..ef6d93fd7a2c 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -322,13 +322,13 @@ struct dma_buf { * @vmapping_counter: * * Used internally to refcnt the vmaps returned by dma_buf_vmap(). - * Protected by @lock. + * Protected by @resv. */ unsigned vmapping_counter; /** * @vmap_ptr: - * The current vmap ptr if @vmapping_counter > 0. Protected by @lock. + * The current vmap ptr if @vmapping_counter > 0. Protected by @resv. */ struct iosys_map vmap_ptr; -- cgit v1.2.3 From fc69decc811b155a0ed8eef17ee940f28c4f6dbc Mon Sep 17 00:00:00 2001 From: Longxuan Yu Date: Mon, 20 Apr 2026 11:18:45 +0800 Subject: 8021q: use RCU for egress QoS mappings The TX fast path and reporting paths walk egress QoS mappings without RTNL. Convert the mapping lists to RCU-protected pointers, use RCU reader annotations in readers, and defer freeing mapping nodes with an embedded rcu_head. This prepares the egress QoS mapping code for safe removal of mapping nodes in a follow-up change while preserving the current behavior. Co-developed-by: Yuan Tan Signed-off-by: Yuan Tan Signed-off-by: Longxuan Yu Signed-off-by: Ren Wei Link: https://patch.msgid.link/9136768189f8c6d3f824f476c62d2fa1111688e8.1776647968.git.yuantan098@gmail.com Signed-off-by: Paolo Abeni --- include/linux/if_vlan.h | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index e6272f9c5e42..20cc16ea4e5a 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -147,11 +147,13 @@ extern __be16 vlan_dev_vlan_proto(const struct net_device *dev); * @priority: skb priority * @vlan_qos: vlan priority: (skb->priority << 13) & 0xE000 * @next: pointer to next struct + * @rcu: used for deferred freeing of mapping nodes */ struct vlan_priority_tci_mapping { u32 priority; u16 vlan_qos; - struct vlan_priority_tci_mapping *next; + struct vlan_priority_tci_mapping __rcu *next; + struct rcu_head rcu; }; struct proc_dir_entry; @@ -177,7 +179,7 @@ struct vlan_dev_priv { unsigned int nr_ingress_mappings; u32 ingress_priority_map[8]; unsigned int nr_egress_mappings; - struct vlan_priority_tci_mapping *egress_priority_map[16]; + struct vlan_priority_tci_mapping __rcu *egress_priority_map[16]; __be16 vlan_proto; u16 vlan_id; @@ -209,19 +211,24 @@ static inline u16 vlan_dev_get_egress_qos_mask(struct net_device *dev, u32 skprio) { struct vlan_priority_tci_mapping *mp; + u16 vlan_qos = 0; - smp_rmb(); /* coupled with smp_wmb() in vlan_dev_set_egress_priority() */ + rcu_read_lock(); - mp = vlan_dev_priv(dev)->egress_priority_map[(skprio & 0xF)]; + mp = rcu_dereference(vlan_dev_priv(dev)->egress_priority_map[skprio & 0xF]); while (mp) { if (mp->priority == skprio) { - return mp->vlan_qos; /* This should already be shifted - * to mask correctly with the - * VLAN's TCI */ + vlan_qos = READ_ONCE(mp->vlan_qos); + break; } - mp = mp->next; + mp = rcu_dereference(mp->next); } - return 0; + rcu_read_unlock(); + + /* This should already be shifted to mask correctly with + * the VLAN's TCI. + */ + return vlan_qos; } extern bool vlan_do_receive(struct sk_buff **skb); -- cgit v1.2.3 From 6d5431555de032f5ad9e08a7fb372f37bf493903 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 16 Apr 2026 11:28:28 -0700 Subject: caif: remove CAIF NETWORK LAYER Remove CAIF (Communication CPU to Application CPU Interface), the ST-Ericsson modem protocol. The subsystem has been orphaned since 2013. The last meaningful changes from the maintainers were in March 2013: a8c7687bf216 ("caif_virtio: Check that vringh_config is not null") b2273be8d2df ("caif_virtio: Use vringh_notify_enable correctly") 0d2e1a2926b1 ("caif_virtio: Introduce caif over virtio") Not-so-coincidentally, according to "the Internet" ST-Ericsson officially shut down its modem joint venture in Aug 2013. If anyone is using this code please yell! In the 13 years since, the code has accumulated 200 non-merge commits, of which 71 were cross-tree API changes, 21 carried Fixes: tags, and the remaining ~110 were cleanups, doc conversions, treewide refactors, and one partial removal (caif_hsi, ca75bcf0a83b). We are still getting fixes to this code, in the last 10 days there were 3 reports on security@ about CAIF that I have been CCed on. UAPI constants (AF_CAIF, ARPHRD_CAIF, N_CAIF, VIRTIO_ID_CAIF) and the SELinux classmap entry are intentionally kept for ABI stability. Acked-by: Michael S. Tsirkin Acked-by: Greg Kroah-Hartman Reviewed-by: Linus Walleij Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260416182829.1440262-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/virtio_caif.h | 24 ------------------------ 1 file changed, 24 deletions(-) delete mode 100644 include/linux/virtio_caif.h (limited to 'include/linux') diff --git a/include/linux/virtio_caif.h b/include/linux/virtio_caif.h deleted file mode 100644 index ea722479510c..000000000000 --- a/include/linux/virtio_caif.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (C) ST-Ericsson AB 2012 - * Author: Sjur Brændeland - * - * This header is BSD licensed so - * anyone can use the definitions to implement compatible remote processors - */ - -#ifndef VIRTIO_CAIF_H -#define VIRTIO_CAIF_H - -#include -struct virtio_caif_transf_config { - __virtio16 headroom; - __virtio16 tailroom; - __virtio32 mtu; - u8 reserved[4]; -}; - -struct virtio_caif_config { - struct virtio_caif_transf_config uplink, downlink; - u8 reserved[8]; -}; -#endif -- cgit v1.2.3 From 4f10f1dfb235a28bd86cf0b00d86a59696ddbe5b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Apr 2026 19:21:07 -0700 Subject: net: remove ISDN subsystem and Bluetooth CMTP Remove the ISDN (mISDN, CAPI) subsystem and Bluetooth CMTP protocol from the kernel tree. ISDN is a pretty old technology and it's unclear whether anyone still uses it. I went over the last few years of git history and all the commits are either tree-wide conversions or syzbot/static analyzer fixes. When we discussed removal in the past IIRC there were some concerns about ISDN still being used in parts of Germany. Unfortunately, the code base is quite old, none of the current maintainers are familiar with it and AI tools will have a field day finding bugs here. Delete this code and preserve it in an out-of-tree repository for any remaining users: https://github.com/linux-netdev/mod-orphan UAPI constants AF_ISDN/PF_ISDN and the SELinux isdn_socket class are preserved for ABI stability, but the rest of uAPI is removed. Signed-off-by: Jakub Kicinski Acked-by: Greg Kroah-Hartman Acked-by: Stephen Hemminger Acked-by: Luiz Augusto von Dentz Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260421022108.1299678-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- include/linux/isdn/capilli.h | 95 ------- include/linux/isdn/capiutil.h | 60 ----- include/linux/kernelcapi.h | 45 ---- include/linux/mISDNdsp.h | 40 --- include/linux/mISDNhw.h | 192 -------------- include/linux/mISDNif.h | 603 ------------------------------------------ 6 files changed, 1035 deletions(-) delete mode 100644 include/linux/isdn/capilli.h delete mode 100644 include/linux/isdn/capiutil.h delete mode 100644 include/linux/kernelcapi.h delete mode 100644 include/linux/mISDNdsp.h delete mode 100644 include/linux/mISDNhw.h delete mode 100644 include/linux/mISDNif.h (limited to 'include/linux') diff --git a/include/linux/isdn/capilli.h b/include/linux/isdn/capilli.h deleted file mode 100644 index 12be09b6883b..000000000000 --- a/include/linux/isdn/capilli.h +++ /dev/null @@ -1,95 +0,0 @@ -/* $Id: capilli.h,v 1.1.2.2 2004/01/16 21:09:27 keil Exp $ - * - * Kernel CAPI 2.0 Driver Interface for Linux - * - * Copyright 1999 by Carsten Paeth - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. - * - */ - -#ifndef __CAPILLI_H__ -#define __CAPILLI_H__ - -#include -#include -#include -#include - -typedef struct capiloaddatapart { - int user; /* data in userspace ? */ - int len; - unsigned char *data; -} capiloaddatapart; - -typedef struct capiloaddata { - capiloaddatapart firmware; - capiloaddatapart configuration; -} capiloaddata; - -typedef struct capicardparams { - unsigned int port; - unsigned irq; - int cardtype; - int cardnr; - unsigned int membase; -} capicardparams; - -struct capi_ctr { - /* filled in before calling attach_capi_ctr */ - struct module *owner; - void *driverdata; /* driver specific */ - char name[32]; /* name of controller */ - char *driver_name; /* name of driver */ - int (*load_firmware)(struct capi_ctr *, capiloaddata *); - void (*reset_ctr)(struct capi_ctr *); - void (*register_appl)(struct capi_ctr *, u16 appl, - capi_register_params *); - void (*release_appl)(struct capi_ctr *, u16 appl); - u16 (*send_message)(struct capi_ctr *, struct sk_buff *skb); - - char *(*procinfo)(struct capi_ctr *); - int (*proc_show)(struct seq_file *, void *); - - /* filled in before calling ready callback */ - u8 manu[CAPI_MANUFACTURER_LEN]; /* CAPI_GET_MANUFACTURER */ - capi_version version; /* CAPI_GET_VERSION */ - capi_profile profile; /* CAPI_GET_PROFILE */ - u8 serial[CAPI_SERIAL_LEN]; /* CAPI_GET_SERIAL */ - - /* management information for kcapi */ - - unsigned long nrecvctlpkt; - unsigned long nrecvdatapkt; - unsigned long nsentctlpkt; - unsigned long nsentdatapkt; - - int cnr; /* controller number */ - unsigned short state; /* controller state */ - int blocked; /* output blocked */ - int traceflag; /* capi trace */ - - struct proc_dir_entry *procent; - char procfn[128]; -}; - -int attach_capi_ctr(struct capi_ctr *); -int detach_capi_ctr(struct capi_ctr *); - -void capi_ctr_ready(struct capi_ctr * card); -void capi_ctr_down(struct capi_ctr * card); -void capi_ctr_handle_message(struct capi_ctr * card, u16 appl, struct sk_buff *skb); - -// --------------------------------------------------------------------------- -// needed for AVM capi drivers - -struct capi_driver { - char name[32]; /* driver name */ - char revision[32]; - - /* management information for kcapi */ - struct list_head list; -}; - -#endif /* __CAPILLI_H__ */ diff --git a/include/linux/isdn/capiutil.h b/include/linux/isdn/capiutil.h deleted file mode 100644 index 953fd500dff7..000000000000 --- a/include/linux/isdn/capiutil.h +++ /dev/null @@ -1,60 +0,0 @@ -/* $Id: capiutil.h,v 1.5.6.2 2001/09/23 22:24:33 kai Exp $ - * - * CAPI 2.0 defines & types - * - * From CAPI 2.0 Development Kit AVM 1995 (msg.c) - * Rewritten for Linux 1996 by Carsten Paeth - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. - * - */ - -#ifndef __CAPIUTIL_H__ -#define __CAPIUTIL_H__ - -#include - -#define CAPIMSG_BASELEN 8 -#define CAPIMSG_U8(m, off) (m[off]) -#define CAPIMSG_U16(m, off) (m[off]|(m[(off)+1]<<8)) -#define CAPIMSG_U32(m, off) (m[off]|(m[(off)+1]<<8)|(m[(off)+2]<<16)|(m[(off)+3]<<24)) -#define CAPIMSG_LEN(m) CAPIMSG_U16(m,0) -#define CAPIMSG_APPID(m) CAPIMSG_U16(m,2) -#define CAPIMSG_COMMAND(m) CAPIMSG_U8(m,4) -#define CAPIMSG_SUBCOMMAND(m) CAPIMSG_U8(m,5) -#define CAPIMSG_CMD(m) (((m[4])<<8)|(m[5])) -#define CAPIMSG_MSGID(m) CAPIMSG_U16(m,6) -#define CAPIMSG_CONTROLLER(m) (m[8] & 0x7f) -#define CAPIMSG_CONTROL(m) CAPIMSG_U32(m, 8) -#define CAPIMSG_NCCI(m) CAPIMSG_CONTROL(m) -#define CAPIMSG_DATALEN(m) CAPIMSG_U16(m,16) /* DATA_B3_REQ */ - -static inline void capimsg_setu8(void *m, int off, __u8 val) -{ - ((__u8 *)m)[off] = val; -} - -static inline void capimsg_setu16(void *m, int off, __u16 val) -{ - ((__u8 *)m)[off] = val & 0xff; - ((__u8 *)m)[off+1] = (val >> 8) & 0xff; -} - -static inline void capimsg_setu32(void *m, int off, __u32 val) -{ - ((__u8 *)m)[off] = val & 0xff; - ((__u8 *)m)[off+1] = (val >> 8) & 0xff; - ((__u8 *)m)[off+2] = (val >> 16) & 0xff; - ((__u8 *)m)[off+3] = (val >> 24) & 0xff; -} - -#define CAPIMSG_SETLEN(m, len) capimsg_setu16(m, 0, len) -#define CAPIMSG_SETAPPID(m, applid) capimsg_setu16(m, 2, applid) -#define CAPIMSG_SETCOMMAND(m,cmd) capimsg_setu8(m, 4, cmd) -#define CAPIMSG_SETSUBCOMMAND(m, cmd) capimsg_setu8(m, 5, cmd) -#define CAPIMSG_SETMSGID(m, msgid) capimsg_setu16(m, 6, msgid) -#define CAPIMSG_SETCONTROL(m, contr) capimsg_setu32(m, 8, contr) -#define CAPIMSG_SETDATALEN(m, len) capimsg_setu16(m, 16, len) - -#endif /* __CAPIUTIL_H__ */ diff --git a/include/linux/kernelcapi.h b/include/linux/kernelcapi.h deleted file mode 100644 index 94ba42bf9da1..000000000000 --- a/include/linux/kernelcapi.h +++ /dev/null @@ -1,45 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * $Id: kernelcapi.h,v 1.8.6.2 2001/02/07 11:31:31 kai Exp $ - * - * Kernel CAPI 2.0 Interface for Linux - * - * (c) Copyright 1997 by Carsten Paeth (calle@calle.in-berlin.de) - * - */ -#ifndef __KERNELCAPI_H__ -#define __KERNELCAPI_H__ - -#include -#include -#include -#include -#include - -#define CAPI_NOERROR 0x0000 - -#define CAPI_TOOMANYAPPLS 0x1001 -#define CAPI_LOGBLKSIZETOSMALL 0x1002 -#define CAPI_BUFFEXECEEDS64K 0x1003 -#define CAPI_MSGBUFSIZETOOSMALL 0x1004 -#define CAPI_ANZLOGCONNNOTSUPPORTED 0x1005 -#define CAPI_REGRESERVED 0x1006 -#define CAPI_REGBUSY 0x1007 -#define CAPI_REGOSRESOURCEERR 0x1008 -#define CAPI_REGNOTINSTALLED 0x1009 -#define CAPI_REGCTRLERNOTSUPPORTEXTEQUIP 0x100a -#define CAPI_REGCTRLERONLYSUPPORTEXTEQUIP 0x100b - -#define CAPI_ILLAPPNR 0x1101 -#define CAPI_ILLCMDORSUBCMDORMSGTOSMALL 0x1102 -#define CAPI_SENDQUEUEFULL 0x1103 -#define CAPI_RECEIVEQUEUEEMPTY 0x1104 -#define CAPI_RECEIVEOVERFLOW 0x1105 -#define CAPI_UNKNOWNNOTPAR 0x1106 -#define CAPI_MSGBUSY 0x1107 -#define CAPI_MSGOSRESOURCEERR 0x1108 -#define CAPI_MSGNOTINSTALLED 0x1109 -#define CAPI_MSGCTRLERNOTSUPPORTEXTEQUIP 0x110a -#define CAPI_MSGCTRLERONLYSUPPORTEXTEQUIP 0x110b - -#endif /* __KERNELCAPI_H__ */ diff --git a/include/linux/mISDNdsp.h b/include/linux/mISDNdsp.h deleted file mode 100644 index 00758f45fddc..000000000000 --- a/include/linux/mISDNdsp.h +++ /dev/null @@ -1,40 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __mISDNdsp_H__ -#define __mISDNdsp_H__ - -struct mISDN_dsp_element_arg { - char *name; - char *def; - char *desc; -}; - -struct mISDN_dsp_element { - char *name; - void *(*new)(const char *arg); - void (*free)(void *p); - void (*process_tx)(void *p, unsigned char *data, int len); - void (*process_rx)(void *p, unsigned char *data, int len, - unsigned int txlen); - int num_args; - struct mISDN_dsp_element_arg - *args; -}; - -extern int mISDN_dsp_element_register(struct mISDN_dsp_element *elem); -extern void mISDN_dsp_element_unregister(struct mISDN_dsp_element *elem); - -struct dsp_features { - int hfc_id; /* unique id to identify the chip (or -1) */ - int hfc_dtmf; /* set if HFCmulti card supports dtmf */ - int hfc_conf; /* set if HFCmulti card supports conferences */ - int hfc_loops; /* set if card supports tone loops */ - int hfc_echocanhw; /* set if card supports echocancelation*/ - int pcm_id; /* unique id to identify the pcm bus (or -1) */ - int pcm_slots; /* number of slots on the pcm bus */ - int pcm_banks; /* number of IO banks of pcm bus */ - int unclocked; /* data is not clocked (has jitter/loss) */ - int unordered; /* data is unordered (packets have index) */ -}; - -#endif - diff --git a/include/linux/mISDNhw.h b/include/linux/mISDNhw.h deleted file mode 100644 index ef4f8eb02eac..000000000000 --- a/include/linux/mISDNhw.h +++ /dev/null @@ -1,192 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * - * Author Karsten Keil - * - * Basic declarations for the mISDN HW channels - * - * Copyright 2008 by Karsten Keil - */ - -#ifndef MISDNHW_H -#define MISDNHW_H -#include -#include - -/* - * HW DEBUG 0xHHHHGGGG - * H - hardware driver specific bits - * G - for all drivers - */ - -#define DEBUG_HW 0x00000001 -#define DEBUG_HW_OPEN 0x00000002 -#define DEBUG_HW_DCHANNEL 0x00000100 -#define DEBUG_HW_DFIFO 0x00000200 -#define DEBUG_HW_BCHANNEL 0x00001000 -#define DEBUG_HW_BFIFO 0x00002000 - -#define MAX_DFRAME_LEN_L1 300 -#define MAX_MON_FRAME 32 -#define MAX_LOG_SPACE 2048 -#define MISDN_COPY_SIZE 32 - -/* channel->Flags bit field */ -#define FLG_TX_BUSY 0 /* tx_buf in use */ -#define FLG_TX_NEXT 1 /* next_skb in use */ -#define FLG_L1_BUSY 2 /* L1 is permanent busy */ -#define FLG_L2_ACTIVATED 3 /* activated from L2 */ -#define FLG_OPEN 5 /* channel is in use */ -#define FLG_ACTIVE 6 /* channel is activated */ -#define FLG_BUSY_TIMER 7 -/* channel type */ -#define FLG_DCHANNEL 8 /* channel is D-channel */ -#define FLG_BCHANNEL 9 /* channel is B-channel */ -#define FLG_ECHANNEL 10 /* channel is E-channel */ -#define FLG_TRANSPARENT 12 /* channel use transparent data */ -#define FLG_HDLC 13 /* channel use hdlc data */ -#define FLG_L2DATA 14 /* channel use L2 DATA primitivs */ -#define FLG_ORIGIN 15 /* channel is on origin site */ -/* channel specific stuff */ -#define FLG_FILLEMPTY 16 /* fill fifo on first frame (empty) */ -/* arcofi specific */ -#define FLG_ARCOFI_TIMER 17 -#define FLG_ARCOFI_ERROR 18 -/* isar specific */ -#define FLG_INITIALIZED 17 -#define FLG_DLEETX 18 -#define FLG_LASTDLE 19 -#define FLG_FIRST 20 -#define FLG_LASTDATA 21 -#define FLG_NMD_DATA 22 -#define FLG_FTI_RUN 23 -#define FLG_LL_OK 24 -#define FLG_LL_CONN 25 -#define FLG_DTMFSEND 26 -#define FLG_TX_EMPTY 27 -/* stop sending received data upstream */ -#define FLG_RX_OFF 28 -/* workq events */ -#define FLG_RECVQUEUE 30 -#define FLG_PHCHANGE 31 - -#define schedule_event(s, ev) do { \ - test_and_set_bit(ev, &((s)->Flags)); \ - schedule_work(&((s)->workq)); \ - } while (0) - -struct dchannel { - struct mISDNdevice dev; - u_long Flags; - struct work_struct workq; - void (*phfunc) (struct dchannel *); - u_int state; - void *l1; - void *hw; - int slot; /* multiport card channel slot */ - struct timer_list timer; - /* receive data */ - struct sk_buff *rx_skb; - int maxlen; - /* send data */ - struct sk_buff_head squeue; - struct sk_buff_head rqueue; - struct sk_buff *tx_skb; - int tx_idx; - int debug; - /* statistics */ - int err_crc; - int err_tx; - int err_rx; -}; - -typedef int (dchannel_l1callback)(struct dchannel *, u_int); -extern int create_l1(struct dchannel *, dchannel_l1callback *); - -/* private L1 commands */ -#define INFO0 0x8002 -#define INFO1 0x8102 -#define INFO2 0x8202 -#define INFO3_P8 0x8302 -#define INFO3_P10 0x8402 -#define INFO4_P8 0x8502 -#define INFO4_P10 0x8602 -#define LOSTFRAMING 0x8702 -#define ANYSIGNAL 0x8802 -#define HW_POWERDOWN 0x8902 -#define HW_RESET_REQ 0x8a02 -#define HW_POWERUP_REQ 0x8b02 -#define HW_DEACT_REQ 0x8c02 -#define HW_ACTIVATE_REQ 0x8e02 -#define HW_D_NOBLOCKED 0x8f02 -#define HW_RESET_IND 0x9002 -#define HW_POWERUP_IND 0x9102 -#define HW_DEACT_IND 0x9202 -#define HW_ACTIVATE_IND 0x9302 -#define HW_DEACT_CNF 0x9402 -#define HW_TESTLOOP 0x9502 -#define HW_TESTRX_RAW 0x9602 -#define HW_TESTRX_HDLC 0x9702 -#define HW_TESTRX_OFF 0x9802 -#define HW_TIMER3_IND 0x9902 -#define HW_TIMER3_VALUE 0x9a00 -#define HW_TIMER3_VMASK 0x00FF - -struct layer1; -extern int l1_event(struct layer1 *, u_int); - -#define MISDN_BCH_FILL_SIZE 4 - -struct bchannel { - struct mISDNchannel ch; - int nr; - u_long Flags; - struct work_struct workq; - u_int state; - void *hw; - int slot; /* multiport card channel slot */ - struct timer_list timer; - /* receive data */ - u8 fill[MISDN_BCH_FILL_SIZE]; - struct sk_buff *rx_skb; - unsigned short maxlen; - unsigned short init_maxlen; /* initial value */ - unsigned short next_maxlen; /* pending value */ - unsigned short minlen; /* for transparent data */ - unsigned short init_minlen; /* initial value */ - unsigned short next_minlen; /* pending value */ - /* send data */ - struct sk_buff *next_skb; - struct sk_buff *tx_skb; - struct sk_buff_head rqueue; - int rcount; - int tx_idx; - int debug; - /* statistics */ - int err_crc; - int err_tx; - int err_rx; - int dropcnt; -}; - -extern int mISDN_initdchannel(struct dchannel *, int, void *); -extern int mISDN_initbchannel(struct bchannel *, unsigned short, - unsigned short); -extern int mISDN_freedchannel(struct dchannel *); -extern void mISDN_clear_bchannel(struct bchannel *); -extern void mISDN_freebchannel(struct bchannel *); -extern int mISDN_ctrl_bchannel(struct bchannel *, struct mISDN_ctrl_req *); -extern void queue_ch_frame(struct mISDNchannel *, u_int, - int, struct sk_buff *); -extern int dchannel_senddata(struct dchannel *, struct sk_buff *); -extern int bchannel_senddata(struct bchannel *, struct sk_buff *); -extern int bchannel_get_rxbuf(struct bchannel *, int); -extern void recv_Dchannel(struct dchannel *); -extern void recv_Echannel(struct dchannel *, struct dchannel *); -extern void recv_Bchannel(struct bchannel *, unsigned int, bool); -extern void recv_Dchannel_skb(struct dchannel *, struct sk_buff *); -extern void recv_Bchannel_skb(struct bchannel *, struct sk_buff *); -extern int get_next_bframe(struct bchannel *); -extern int get_next_dframe(struct dchannel *); - -#endif diff --git a/include/linux/mISDNif.h b/include/linux/mISDNif.h deleted file mode 100644 index 7aab4a769736..000000000000 --- a/include/linux/mISDNif.h +++ /dev/null @@ -1,603 +0,0 @@ -/* - * - * Author Karsten Keil - * - * Copyright 2008 by Karsten Keil - * - * This code is free software; you can redistribute it and/or modify - * it under the terms of the GNU LESSER GENERAL PUBLIC LICENSE - * version 2.1 as published by the Free Software Foundation. - * - * This code is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU LESSER GENERAL PUBLIC LICENSE for more details. - * - */ - -#ifndef mISDNIF_H -#define mISDNIF_H - -#include -#include -#include - -/* - * ABI Version 32 bit - * - * <8 bit> Major version - * - changed if any interface become backwards incompatible - * - * <8 bit> Minor version - * - changed if any interface is extended but backwards compatible - * - * <16 bit> Release number - * - should be incremented on every checkin - */ -#define MISDN_MAJOR_VERSION 1 -#define MISDN_MINOR_VERSION 1 -#define MISDN_RELEASE 29 - -/* primitives for information exchange - * generell format - * <16 bit 0 > - * <8 bit command> - * BIT 8 = 1 LAYER private - * BIT 7 = 1 answer - * BIT 6 = 1 DATA - * <8 bit target layer mask> - * - * Layer = 00 is reserved for general commands - Layer = 01 L2 -> HW - Layer = 02 HW -> L2 - Layer = 04 L3 -> L2 - Layer = 08 L2 -> L3 - * Layer = FF is reserved for broadcast commands - */ - -#define MISDN_CMDMASK 0xff00 -#define MISDN_LAYERMASK 0x00ff - -/* generell commands */ -#define OPEN_CHANNEL 0x0100 -#define CLOSE_CHANNEL 0x0200 -#define CONTROL_CHANNEL 0x0300 -#define CHECK_DATA 0x0400 - -/* layer 2 -> layer 1 */ -#define PH_ACTIVATE_REQ 0x0101 -#define PH_DEACTIVATE_REQ 0x0201 -#define PH_DATA_REQ 0x2001 -#define MPH_ACTIVATE_REQ 0x0501 -#define MPH_DEACTIVATE_REQ 0x0601 -#define MPH_INFORMATION_REQ 0x0701 -#define PH_CONTROL_REQ 0x0801 - -/* layer 1 -> layer 2 */ -#define PH_ACTIVATE_IND 0x0102 -#define PH_ACTIVATE_CNF 0x4102 -#define PH_DEACTIVATE_IND 0x0202 -#define PH_DEACTIVATE_CNF 0x4202 -#define PH_DATA_IND 0x2002 -#define PH_DATA_E_IND 0x3002 -#define MPH_ACTIVATE_IND 0x0502 -#define MPH_DEACTIVATE_IND 0x0602 -#define MPH_INFORMATION_IND 0x0702 -#define PH_DATA_CNF 0x6002 -#define PH_CONTROL_IND 0x0802 -#define PH_CONTROL_CNF 0x4802 - -/* layer 3 -> layer 2 */ -#define DL_ESTABLISH_REQ 0x1004 -#define DL_RELEASE_REQ 0x1104 -#define DL_DATA_REQ 0x3004 -#define DL_UNITDATA_REQ 0x3104 -#define DL_INFORMATION_REQ 0x0004 - -/* layer 2 -> layer 3 */ -#define DL_ESTABLISH_IND 0x1008 -#define DL_ESTABLISH_CNF 0x5008 -#define DL_RELEASE_IND 0x1108 -#define DL_RELEASE_CNF 0x5108 -#define DL_DATA_IND 0x3008 -#define DL_UNITDATA_IND 0x3108 -#define DL_INFORMATION_IND 0x0008 - -/* intern layer 2 management */ -#define MDL_ASSIGN_REQ 0x1804 -#define MDL_ASSIGN_IND 0x1904 -#define MDL_REMOVE_REQ 0x1A04 -#define MDL_REMOVE_IND 0x1B04 -#define MDL_STATUS_UP_IND 0x1C04 -#define MDL_STATUS_DOWN_IND 0x1D04 -#define MDL_STATUS_UI_IND 0x1E04 -#define MDL_ERROR_IND 0x1F04 -#define MDL_ERROR_RSP 0x5F04 - -/* intern layer 2 */ -#define DL_TIMER200_IND 0x7004 -#define DL_TIMER203_IND 0x7304 -#define DL_INTERN_MSG 0x7804 - -/* DL_INFORMATION_IND types */ -#define DL_INFO_L2_CONNECT 0x0001 -#define DL_INFO_L2_REMOVED 0x0002 - -/* PH_CONTROL types */ -/* TOUCH TONE IS 0x20XX XX "0"..."9", "A","B","C","D","*","#" */ -#define DTMF_TONE_VAL 0x2000 -#define DTMF_TONE_MASK 0x007F -#define DTMF_TONE_START 0x2100 -#define DTMF_TONE_STOP 0x2200 -#define DTMF_HFC_COEF 0x4000 -#define DSP_CONF_JOIN 0x2403 -#define DSP_CONF_SPLIT 0x2404 -#define DSP_RECEIVE_OFF 0x2405 -#define DSP_RECEIVE_ON 0x2406 -#define DSP_ECHO_ON 0x2407 -#define DSP_ECHO_OFF 0x2408 -#define DSP_MIX_ON 0x2409 -#define DSP_MIX_OFF 0x240a -#define DSP_DELAY 0x240b -#define DSP_JITTER 0x240c -#define DSP_TXDATA_ON 0x240d -#define DSP_TXDATA_OFF 0x240e -#define DSP_TX_DEJITTER 0x240f -#define DSP_TX_DEJ_OFF 0x2410 -#define DSP_TONE_PATT_ON 0x2411 -#define DSP_TONE_PATT_OFF 0x2412 -#define DSP_VOL_CHANGE_TX 0x2413 -#define DSP_VOL_CHANGE_RX 0x2414 -#define DSP_BF_ENABLE_KEY 0x2415 -#define DSP_BF_DISABLE 0x2416 -#define DSP_BF_ACCEPT 0x2416 -#define DSP_BF_REJECT 0x2417 -#define DSP_PIPELINE_CFG 0x2418 -#define HFC_VOL_CHANGE_TX 0x2601 -#define HFC_VOL_CHANGE_RX 0x2602 -#define HFC_SPL_LOOP_ON 0x2603 -#define HFC_SPL_LOOP_OFF 0x2604 -/* for T30 FAX and analog modem */ -#define HW_MOD_FRM 0x4000 -#define HW_MOD_FRH 0x4001 -#define HW_MOD_FTM 0x4002 -#define HW_MOD_FTH 0x4003 -#define HW_MOD_FTS 0x4004 -#define HW_MOD_CONNECT 0x4010 -#define HW_MOD_OK 0x4011 -#define HW_MOD_NOCARR 0x4012 -#define HW_MOD_FCERROR 0x4013 -#define HW_MOD_READY 0x4014 -#define HW_MOD_LASTDATA 0x4015 - -/* DSP_TONE_PATT_ON parameter */ -#define TONE_OFF 0x0000 -#define TONE_GERMAN_DIALTONE 0x0001 -#define TONE_GERMAN_OLDDIALTONE 0x0002 -#define TONE_AMERICAN_DIALTONE 0x0003 -#define TONE_GERMAN_DIALPBX 0x0004 -#define TONE_GERMAN_OLDDIALPBX 0x0005 -#define TONE_AMERICAN_DIALPBX 0x0006 -#define TONE_GERMAN_RINGING 0x0007 -#define TONE_GERMAN_OLDRINGING 0x0008 -#define TONE_AMERICAN_RINGPBX 0x000b -#define TONE_GERMAN_RINGPBX 0x000c -#define TONE_GERMAN_OLDRINGPBX 0x000d -#define TONE_AMERICAN_RINGING 0x000e -#define TONE_GERMAN_BUSY 0x000f -#define TONE_GERMAN_OLDBUSY 0x0010 -#define TONE_AMERICAN_BUSY 0x0011 -#define TONE_GERMAN_HANGUP 0x0012 -#define TONE_GERMAN_OLDHANGUP 0x0013 -#define TONE_AMERICAN_HANGUP 0x0014 -#define TONE_SPECIAL_INFO 0x0015 -#define TONE_GERMAN_GASSENBESETZT 0x0016 -#define TONE_GERMAN_AUFSCHALTTON 0x0016 - -/* MPH_INFORMATION_IND */ -#define L1_SIGNAL_LOS_OFF 0x0010 -#define L1_SIGNAL_LOS_ON 0x0011 -#define L1_SIGNAL_AIS_OFF 0x0012 -#define L1_SIGNAL_AIS_ON 0x0013 -#define L1_SIGNAL_RDI_OFF 0x0014 -#define L1_SIGNAL_RDI_ON 0x0015 -#define L1_SIGNAL_SLIP_RX 0x0020 -#define L1_SIGNAL_SLIP_TX 0x0021 - -/* - * protocol ids - * D channel 1-31 - * B channel 33 - 63 - */ - -#define ISDN_P_NONE 0 -#define ISDN_P_BASE 0 -#define ISDN_P_TE_S0 0x01 -#define ISDN_P_NT_S0 0x02 -#define ISDN_P_TE_E1 0x03 -#define ISDN_P_NT_E1 0x04 -#define ISDN_P_TE_UP0 0x05 -#define ISDN_P_NT_UP0 0x06 - -#define IS_ISDN_P_TE(p) ((p == ISDN_P_TE_S0) || (p == ISDN_P_TE_E1) || \ - (p == ISDN_P_TE_UP0) || (p == ISDN_P_LAPD_TE)) -#define IS_ISDN_P_NT(p) ((p == ISDN_P_NT_S0) || (p == ISDN_P_NT_E1) || \ - (p == ISDN_P_NT_UP0) || (p == ISDN_P_LAPD_NT)) -#define IS_ISDN_P_S0(p) ((p == ISDN_P_TE_S0) || (p == ISDN_P_NT_S0)) -#define IS_ISDN_P_E1(p) ((p == ISDN_P_TE_E1) || (p == ISDN_P_NT_E1)) -#define IS_ISDN_P_UP0(p) ((p == ISDN_P_TE_UP0) || (p == ISDN_P_NT_UP0)) - - -#define ISDN_P_LAPD_TE 0x10 -#define ISDN_P_LAPD_NT 0x11 - -#define ISDN_P_B_MASK 0x1f -#define ISDN_P_B_START 0x20 - -#define ISDN_P_B_RAW 0x21 -#define ISDN_P_B_HDLC 0x22 -#define ISDN_P_B_X75SLP 0x23 -#define ISDN_P_B_L2DTMF 0x24 -#define ISDN_P_B_L2DSP 0x25 -#define ISDN_P_B_L2DSPHDLC 0x26 -#define ISDN_P_B_T30_FAX 0x27 -#define ISDN_P_B_MODEM_ASYNC 0x28 - -#define OPTION_L2_PMX 1 -#define OPTION_L2_PTP 2 -#define OPTION_L2_FIXEDTEI 3 -#define OPTION_L2_CLEANUP 4 -#define OPTION_L1_HOLD 5 - -/* should be in sync with linux/kobject.h:KOBJ_NAME_LEN */ -#define MISDN_MAX_IDLEN 20 - -struct mISDNhead { - unsigned int prim; - unsigned int id; -} __packed; - -#define MISDN_HEADER_LEN sizeof(struct mISDNhead) -#define MAX_DATA_SIZE 2048 -#define MAX_DATA_MEM (MAX_DATA_SIZE + MISDN_HEADER_LEN) -#define MAX_DFRAME_LEN 260 - -#define MISDN_ID_ADDR_MASK 0xFFFF -#define MISDN_ID_TEI_MASK 0xFF00 -#define MISDN_ID_SAPI_MASK 0x00FF -#define MISDN_ID_TEI_ANY 0x7F00 - -#define MISDN_ID_ANY 0xFFFF -#define MISDN_ID_NONE 0xFFFE - -#define GROUP_TEI 127 -#define TEI_SAPI 63 -#define CTRL_SAPI 0 - -#define MISDN_MAX_CHANNEL 127 -#define MISDN_CHMAP_SIZE ((MISDN_MAX_CHANNEL + 1) >> 3) - -#define SOL_MISDN 0 - -struct sockaddr_mISDN { - sa_family_t family; - unsigned char dev; - unsigned char channel; - unsigned char sapi; - unsigned char tei; -}; - -struct mISDNversion { - unsigned char major; - unsigned char minor; - unsigned short release; -}; - -struct mISDN_devinfo { - u_int id; - u_int Dprotocols; - u_int Bprotocols; - u_int protocol; - u_char channelmap[MISDN_CHMAP_SIZE]; - u_int nrbchan; - char name[MISDN_MAX_IDLEN]; -}; - -struct mISDN_devrename { - u_int id; - char name[MISDN_MAX_IDLEN]; /* new name */ -}; - -/* MPH_INFORMATION_REQ payload */ -struct ph_info_ch { - __u32 protocol; - __u64 Flags; -}; - -struct ph_info_dch { - struct ph_info_ch ch; - __u16 state; - __u16 num_bch; -}; - -struct ph_info { - struct ph_info_dch dch; - struct ph_info_ch bch[]; -}; - -/* timer device ioctl */ -#define IMADDTIMER _IOR('I', 64, int) -#define IMDELTIMER _IOR('I', 65, int) - -/* socket ioctls */ -#define IMGETVERSION _IOR('I', 66, int) -#define IMGETCOUNT _IOR('I', 67, int) -#define IMGETDEVINFO _IOR('I', 68, int) -#define IMCTRLREQ _IOR('I', 69, int) -#define IMCLEAR_L2 _IOR('I', 70, int) -#define IMSETDEVNAME _IOR('I', 71, struct mISDN_devrename) -#define IMHOLD_L1 _IOR('I', 72, int) - -static inline int -test_channelmap(u_int nr, u_char *map) -{ - if (nr <= MISDN_MAX_CHANNEL) - return map[nr >> 3] & (1 << (nr & 7)); - else - return 0; -} - -static inline void -set_channelmap(u_int nr, u_char *map) -{ - map[nr >> 3] |= (1 << (nr & 7)); -} - -static inline void -clear_channelmap(u_int nr, u_char *map) -{ - map[nr >> 3] &= ~(1 << (nr & 7)); -} - -/* CONTROL_CHANNEL parameters */ -#define MISDN_CTRL_GETOP 0x0000 -#define MISDN_CTRL_LOOP 0x0001 -#define MISDN_CTRL_CONNECT 0x0002 -#define MISDN_CTRL_DISCONNECT 0x0004 -#define MISDN_CTRL_RX_BUFFER 0x0008 -#define MISDN_CTRL_PCMCONNECT 0x0010 -#define MISDN_CTRL_PCMDISCONNECT 0x0020 -#define MISDN_CTRL_SETPEER 0x0040 -#define MISDN_CTRL_UNSETPEER 0x0080 -#define MISDN_CTRL_RX_OFF 0x0100 -#define MISDN_CTRL_FILL_EMPTY 0x0200 -#define MISDN_CTRL_GETPEER 0x0400 -#define MISDN_CTRL_L1_TIMER3 0x0800 -#define MISDN_CTRL_HW_FEATURES_OP 0x2000 -#define MISDN_CTRL_HW_FEATURES 0x2001 -#define MISDN_CTRL_HFC_OP 0x4000 -#define MISDN_CTRL_HFC_PCM_CONN 0x4001 -#define MISDN_CTRL_HFC_PCM_DISC 0x4002 -#define MISDN_CTRL_HFC_CONF_JOIN 0x4003 -#define MISDN_CTRL_HFC_CONF_SPLIT 0x4004 -#define MISDN_CTRL_HFC_RECEIVE_OFF 0x4005 -#define MISDN_CTRL_HFC_RECEIVE_ON 0x4006 -#define MISDN_CTRL_HFC_ECHOCAN_ON 0x4007 -#define MISDN_CTRL_HFC_ECHOCAN_OFF 0x4008 -#define MISDN_CTRL_HFC_WD_INIT 0x4009 -#define MISDN_CTRL_HFC_WD_RESET 0x400A - -/* special RX buffer value for MISDN_CTRL_RX_BUFFER request.p1 is the minimum - * buffer size request.p2 the maximum. Using MISDN_CTRL_RX_SIZE_IGNORE will - * not change the value, but still read back the actual stetting. - */ -#define MISDN_CTRL_RX_SIZE_IGNORE -1 - -/* socket options */ -#define MISDN_TIME_STAMP 0x0001 - -struct mISDN_ctrl_req { - int op; - int channel; - int p1; - int p2; -}; - -/* muxer options */ -#define MISDN_OPT_ALL 1 -#define MISDN_OPT_TEIMGR 2 - -#ifdef __KERNEL__ -#include -#include -#include -#include -#include - -#define DEBUG_CORE 0x000000ff -#define DEBUG_CORE_FUNC 0x00000002 -#define DEBUG_SOCKET 0x00000004 -#define DEBUG_MANAGER 0x00000008 -#define DEBUG_SEND_ERR 0x00000010 -#define DEBUG_MSG_THREAD 0x00000020 -#define DEBUG_QUEUE_FUNC 0x00000040 -#define DEBUG_L1 0x0000ff00 -#define DEBUG_L1_FSM 0x00000200 -#define DEBUG_L2 0x00ff0000 -#define DEBUG_L2_FSM 0x00020000 -#define DEBUG_L2_CTRL 0x00040000 -#define DEBUG_L2_RECV 0x00080000 -#define DEBUG_L2_TEI 0x00100000 -#define DEBUG_L2_TEIFSM 0x00200000 -#define DEBUG_TIMER 0x01000000 -#define DEBUG_CLOCK 0x02000000 - -#define mISDN_HEAD_P(s) ((struct mISDNhead *)&s->cb[0]) -#define mISDN_HEAD_PRIM(s) (((struct mISDNhead *)&s->cb[0])->prim) -#define mISDN_HEAD_ID(s) (((struct mISDNhead *)&s->cb[0])->id) - -/* socket states */ -#define MISDN_OPEN 1 -#define MISDN_BOUND 2 -#define MISDN_CLOSED 3 - -struct mISDNchannel; -struct mISDNdevice; -struct mISDNstack; -struct mISDNclock; - -struct channel_req { - u_int protocol; - struct sockaddr_mISDN adr; - struct mISDNchannel *ch; -}; - -typedef int (ctrl_func_t)(struct mISDNchannel *, u_int, void *); -typedef int (send_func_t)(struct mISDNchannel *, struct sk_buff *); -typedef int (create_func_t)(struct channel_req *); - -struct Bprotocol { - struct list_head list; - char *name; - u_int Bprotocols; - create_func_t *create; -}; - -struct mISDNchannel { - struct list_head list; - u_int protocol; - u_int nr; - u_long opt; - u_int addr; - struct mISDNstack *st; - struct mISDNchannel *peer; - send_func_t *send; - send_func_t *recv; - ctrl_func_t *ctrl; -}; - -struct mISDN_sock_list { - struct hlist_head head; - rwlock_t lock; -}; - -struct mISDN_sock { - struct sock sk; - struct mISDNchannel ch; - u_int cmask; - struct mISDNdevice *dev; -}; - - - -struct mISDNdevice { - struct mISDNchannel D; - u_int id; - u_int Dprotocols; - u_int Bprotocols; - u_int nrbchan; - u_char channelmap[MISDN_CHMAP_SIZE]; - struct list_head bchannels; - struct mISDNchannel *teimgr; - struct device dev; -}; - -struct mISDNstack { - u_long status; - struct mISDNdevice *dev; - struct task_struct *thread; - struct completion *notify; - wait_queue_head_t workq; - struct sk_buff_head msgq; - struct list_head layer2; - struct mISDNchannel *layer1; - struct mISDNchannel own; - struct mutex lmutex; /* protect lists */ - struct mISDN_sock_list l1sock; -#ifdef MISDN_MSG_STATS - u_int msg_cnt; - u_int sleep_cnt; - u_int stopped_cnt; -#endif -}; - -typedef int (clockctl_func_t)(void *, int); - -struct mISDNclock { - struct list_head list; - char name[64]; - int pri; - clockctl_func_t *ctl; - void *priv; -}; - -/* global alloc/queue functions */ - -static inline struct sk_buff * -mI_alloc_skb(unsigned int len, gfp_t gfp_mask) -{ - struct sk_buff *skb; - - skb = alloc_skb(len + MISDN_HEADER_LEN, gfp_mask); - if (likely(skb)) - skb_reserve(skb, MISDN_HEADER_LEN); - return skb; -} - -static inline struct sk_buff * -_alloc_mISDN_skb(u_int prim, u_int id, u_int len, void *dp, gfp_t gfp_mask) -{ - struct sk_buff *skb = mI_alloc_skb(len, gfp_mask); - struct mISDNhead *hh; - - if (!skb) - return NULL; - if (len) - skb_put_data(skb, dp, len); - hh = mISDN_HEAD_P(skb); - hh->prim = prim; - hh->id = id; - return skb; -} - -static inline void -_queue_data(struct mISDNchannel *ch, u_int prim, - u_int id, u_int len, void *dp, gfp_t gfp_mask) -{ - struct sk_buff *skb; - - if (!ch->peer) - return; - skb = _alloc_mISDN_skb(prim, id, len, dp, gfp_mask); - if (!skb) - return; - if (ch->recv(ch->peer, skb)) - dev_kfree_skb(skb); -} - -/* global register/unregister functions */ - -extern int mISDN_register_device(struct mISDNdevice *, - struct device *parent, char *name); -extern void mISDN_unregister_device(struct mISDNdevice *); -extern int mISDN_register_Bprotocol(struct Bprotocol *); -extern void mISDN_unregister_Bprotocol(struct Bprotocol *); -extern struct mISDNclock *mISDN_register_clock(char *, int, clockctl_func_t *, - void *); -extern void mISDN_unregister_clock(struct mISDNclock *); - -static inline struct mISDNdevice *dev_to_mISDN(const struct device *dev) -{ - if (dev) - return dev_get_drvdata(dev); - else - return NULL; -} - -extern void set_channel_address(struct mISDNchannel *, u_int, u_int); -extern void mISDN_clock_update(struct mISDNclock *, int, ktime_t *); -extern unsigned short mISDN_clock_get(void); -extern const char *mISDNDevName4ch(struct mISDNchannel *); - -#endif /* __KERNEL__ */ -#endif /* mISDNIF_H */ -- cgit v1.2.3 From dd8d4bc28ad7252610d8e79c1313a2d1e3499a51 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Apr 2026 19:18:23 -0700 Subject: net: remove ax25 and amateur radio (hamradio) subsystem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the amateur radio (AX.25, NET/ROM, ROSE) protocol implementation and all associated hamradio device drivers from the kernel tree. This set of protocols has long been a huge bug/syzbot magnet, and since nobody stepped up to help us deal with the influx of the AI-generated bug reports we need to move it out of tree to protect our sanity. The code is moved to an out-of-tree repo: https://github.com/linux-netdev/mod-orphan if it's cleaned up and reworked there we can accept it back. Minimal stub headers are kept for include/net/ax25.h (AX25_P_IP, AX25_ADDR_LEN, ax25_address) and include/net/rose.h (ROSE_ADDR_LEN) so that the conditional integration code in arp.c and tun.c continues to compile and work when the out-of-tree modules are loaded. Signed-off-by: Jakub Kicinski Acked-by: Greg Kroah-Hartman Acked-by: Stephen Hemminger Acked-by: Carlos Bilbao Reviewed-by: Simon Horman Reviewed-by: Kuniyuki Iwashima Acked-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20260421021824.1293976-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- include/linux/hdlcdrv.h | 276 ---------------------------------------------- include/linux/netdevice.h | 5 +- include/linux/scc.h | 86 --------------- include/linux/yam.h | 67 ----------- 4 files changed, 1 insertion(+), 433 deletions(-) delete mode 100644 include/linux/hdlcdrv.h delete mode 100644 include/linux/scc.h delete mode 100644 include/linux/yam.h (limited to 'include/linux') diff --git a/include/linux/hdlcdrv.h b/include/linux/hdlcdrv.h deleted file mode 100644 index 5d70c3f98f5b..000000000000 --- a/include/linux/hdlcdrv.h +++ /dev/null @@ -1,276 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * hdlcdrv.h -- HDLC packet radio network driver. - * The Linux soundcard driver for 1200 baud and 9600 baud packet radio - * (C) 1996-1998 by Thomas Sailer, HB9JNX/AE4WA - */ -#ifndef _HDLCDRV_H -#define _HDLCDRV_H - - -#include -#include -#include -#include - -#define HDLCDRV_MAGIC 0x5ac6e778 -#define HDLCDRV_HDLCBUFFER 32 /* should be a power of 2 for speed reasons */ -#define HDLCDRV_BITBUFFER 256 /* should be a power of 2 for speed reasons */ -#undef HDLCDRV_LOOPBACK /* define for HDLC debugging purposes */ -#define HDLCDRV_DEBUG - -/* maximum packet length, excluding CRC */ -#define HDLCDRV_MAXFLEN 400 - - -struct hdlcdrv_hdlcbuffer { - spinlock_t lock; - unsigned rd, wr; - unsigned short buf[HDLCDRV_HDLCBUFFER]; -}; - -#ifdef HDLCDRV_DEBUG -struct hdlcdrv_bitbuffer { - unsigned int rd; - unsigned int wr; - unsigned int shreg; - unsigned char buffer[HDLCDRV_BITBUFFER]; -}; - -static inline void hdlcdrv_add_bitbuffer(struct hdlcdrv_bitbuffer *buf, - unsigned int bit) -{ - unsigned char new; - - new = buf->shreg & 1; - buf->shreg >>= 1; - buf->shreg |= (!!bit) << 7; - if (new) { - buf->buffer[buf->wr] = buf->shreg; - buf->wr = (buf->wr+1) % sizeof(buf->buffer); - buf->shreg = 0x80; - } -} - -static inline void hdlcdrv_add_bitbuffer_word(struct hdlcdrv_bitbuffer *buf, - unsigned int bits) -{ - buf->buffer[buf->wr] = bits & 0xff; - buf->wr = (buf->wr+1) % sizeof(buf->buffer); - buf->buffer[buf->wr] = (bits >> 8) & 0xff; - buf->wr = (buf->wr+1) % sizeof(buf->buffer); - -} -#endif /* HDLCDRV_DEBUG */ - -/* -------------------------------------------------------------------- */ -/* - * Information that need to be kept for each driver. - */ - -struct hdlcdrv_ops { - /* - * first some informations needed by the hdlcdrv routines - */ - const char *drvname; - const char *drvinfo; - /* - * the routines called by the hdlcdrv routines - */ - int (*open)(struct net_device *); - int (*close)(struct net_device *); - int (*ioctl)(struct net_device *, void __user *, - struct hdlcdrv_ioctl *, int); -}; - -struct hdlcdrv_state { - int magic; - int opened; - - const struct hdlcdrv_ops *ops; - - struct { - int bitrate; - } par; - - struct hdlcdrv_pttoutput { - int dma2; - int seriobase; - int pariobase; - int midiiobase; - unsigned int flags; - } ptt_out; - - struct hdlcdrv_channel_params ch_params; - - struct hdlcdrv_hdlcrx { - struct hdlcdrv_hdlcbuffer hbuf; - unsigned long in_hdlc_rx; - /* 0 = sync hunt, != 0 receiving */ - int rx_state; - unsigned int bitstream; - unsigned int bitbuf; - int numbits; - unsigned char dcd; - - int len; - unsigned char *bp; - unsigned char buffer[HDLCDRV_MAXFLEN+2]; - } hdlcrx; - - struct hdlcdrv_hdlctx { - struct hdlcdrv_hdlcbuffer hbuf; - unsigned long in_hdlc_tx; - /* - * 0 = send flags - * 1 = send txtail (flags) - * 2 = send packet - */ - int tx_state; - int numflags; - unsigned int bitstream; - unsigned char ptt; - int calibrate; - int slotcnt; - - unsigned int bitbuf; - int numbits; - - int len; - unsigned char *bp; - unsigned char buffer[HDLCDRV_MAXFLEN+2]; - } hdlctx; - -#ifdef HDLCDRV_DEBUG - struct hdlcdrv_bitbuffer bitbuf_channel; - struct hdlcdrv_bitbuffer bitbuf_hdlc; -#endif /* HDLCDRV_DEBUG */ - - int ptt_keyed; - - /* queued skb for transmission */ - struct sk_buff *skb; -}; - - -/* -------------------------------------------------------------------- */ - -static inline int hdlcdrv_hbuf_full(struct hdlcdrv_hdlcbuffer *hb) -{ - unsigned long flags; - int ret; - - spin_lock_irqsave(&hb->lock, flags); - ret = !((HDLCDRV_HDLCBUFFER - 1 + hb->rd - hb->wr) % HDLCDRV_HDLCBUFFER); - spin_unlock_irqrestore(&hb->lock, flags); - return ret; -} - -/* -------------------------------------------------------------------- */ - -static inline int hdlcdrv_hbuf_empty(struct hdlcdrv_hdlcbuffer *hb) -{ - unsigned long flags; - int ret; - - spin_lock_irqsave(&hb->lock, flags); - ret = (hb->rd == hb->wr); - spin_unlock_irqrestore(&hb->lock, flags); - return ret; -} - -/* -------------------------------------------------------------------- */ - -static inline unsigned short hdlcdrv_hbuf_get(struct hdlcdrv_hdlcbuffer *hb) -{ - unsigned long flags; - unsigned short val; - unsigned newr; - - spin_lock_irqsave(&hb->lock, flags); - if (hb->rd == hb->wr) - val = 0; - else { - newr = (hb->rd+1) % HDLCDRV_HDLCBUFFER; - val = hb->buf[hb->rd]; - hb->rd = newr; - } - spin_unlock_irqrestore(&hb->lock, flags); - return val; -} - -/* -------------------------------------------------------------------- */ - -static inline void hdlcdrv_hbuf_put(struct hdlcdrv_hdlcbuffer *hb, - unsigned short val) -{ - unsigned newp; - unsigned long flags; - - spin_lock_irqsave(&hb->lock, flags); - newp = (hb->wr+1) % HDLCDRV_HDLCBUFFER; - if (newp != hb->rd) { - hb->buf[hb->wr] = val & 0xffff; - hb->wr = newp; - } - spin_unlock_irqrestore(&hb->lock, flags); -} - -/* -------------------------------------------------------------------- */ - -static inline void hdlcdrv_putbits(struct hdlcdrv_state *s, unsigned int bits) -{ - hdlcdrv_hbuf_put(&s->hdlcrx.hbuf, bits); -} - -static inline unsigned int hdlcdrv_getbits(struct hdlcdrv_state *s) -{ - unsigned int ret; - - if (hdlcdrv_hbuf_empty(&s->hdlctx.hbuf)) { - if (s->hdlctx.calibrate > 0) - s->hdlctx.calibrate--; - else - s->hdlctx.ptt = 0; - ret = 0; - } else - ret = hdlcdrv_hbuf_get(&s->hdlctx.hbuf); -#ifdef HDLCDRV_LOOPBACK - hdlcdrv_hbuf_put(&s->hdlcrx.hbuf, ret); -#endif /* HDLCDRV_LOOPBACK */ - return ret; -} - -static inline void hdlcdrv_channelbit(struct hdlcdrv_state *s, unsigned int bit) -{ -#ifdef HDLCDRV_DEBUG - hdlcdrv_add_bitbuffer(&s->bitbuf_channel, bit); -#endif /* HDLCDRV_DEBUG */ -} - -static inline void hdlcdrv_setdcd(struct hdlcdrv_state *s, int dcd) -{ - s->hdlcrx.dcd = !!dcd; -} - -static inline int hdlcdrv_ptt(struct hdlcdrv_state *s) -{ - return s->hdlctx.ptt || (s->hdlctx.calibrate > 0); -} - -/* -------------------------------------------------------------------- */ - -void hdlcdrv_receiver(struct net_device *, struct hdlcdrv_state *); -void hdlcdrv_transmitter(struct net_device *, struct hdlcdrv_state *); -void hdlcdrv_arbitrate(struct net_device *, struct hdlcdrv_state *); -struct net_device *hdlcdrv_register(const struct hdlcdrv_ops *ops, - unsigned int privsize, const char *ifname, - unsigned int baseaddr, unsigned int irq, - unsigned int dma); -void hdlcdrv_unregister(struct net_device *dev); - -/* -------------------------------------------------------------------- */ - - - -#endif /* _HDLCDRV_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7969fcdd5ac4..e9e2ec8d4c19 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -162,7 +162,7 @@ static inline bool dev_xmit_complete(int rc) #if defined(CONFIG_HYPERV_NET) # define LL_MAX_HEADER 128 -#elif defined(CONFIG_WLAN) || IS_ENABLED(CONFIG_AX25) +#elif defined(CONFIG_WLAN) # if defined(CONFIG_MAC80211_MESH) # define LL_MAX_HEADER 128 # else @@ -2316,9 +2316,6 @@ struct net_device { #if IS_ENABLED(CONFIG_ATALK) void *atalk_ptr; #endif -#if IS_ENABLED(CONFIG_AX25) - struct ax25_dev __rcu *ax25_ptr; -#endif #if IS_ENABLED(CONFIG_CFG80211) struct wireless_dev *ieee80211_ptr; #endif diff --git a/include/linux/scc.h b/include/linux/scc.h deleted file mode 100644 index 745eabd17c10..000000000000 --- a/include/linux/scc.h +++ /dev/null @@ -1,86 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* $Id: scc.h,v 1.29 1997/04/02 14:56:45 jreuter Exp jreuter $ */ -#ifndef _SCC_H -#define _SCC_H - -#include - - -enum {TX_OFF, TX_ON}; /* command for scc_key_trx() */ - -/* Vector masks in RR2B */ - -#define VECTOR_MASK 0x06 -#define TXINT 0x00 -#define EXINT 0x02 -#define RXINT 0x04 -#define SPINT 0x06 - -#ifdef CONFIG_SCC_DELAY -#define Inb(port) inb_p(port) -#define Outb(port, val) outb_p(val, port) -#else -#define Inb(port) inb(port) -#define Outb(port, val) outb(val, port) -#endif - -/* SCC channel control structure for KISS */ - -struct scc_kiss { - unsigned char txdelay; /* Transmit Delay 10 ms/cnt */ - unsigned char persist; /* Persistence (0-255) as a % */ - unsigned char slottime; /* Delay to wait on persistence hit */ - unsigned char tailtime; /* Delay after last byte written */ - unsigned char fulldup; /* Full Duplex mode 0=CSMA 1=DUP 2=ALWAYS KEYED */ - unsigned char waittime; /* Waittime before any transmit attempt */ - unsigned int maxkeyup; /* Maximum time to transmit (seconds) */ - unsigned int mintime; /* Minimal offtime after MAXKEYUP timeout (seconds) */ - unsigned int idletime; /* Maximum idle time in ALWAYS KEYED mode (seconds) */ - unsigned int maxdefer; /* Timer for CSMA channel busy limit */ - unsigned char tx_inhibit; /* Transmit is not allowed when set */ - unsigned char group; /* Group ID for AX.25 TX interlocking */ - unsigned char mode; /* 'normal' or 'hwctrl' mode (unused) */ - unsigned char softdcd; /* Use DPLL instead of DCD pin for carrier detect */ -}; - - -/* SCC channel structure */ - -struct scc_channel { - int init; /* channel exists? */ - - struct net_device *dev; /* link to device control structure */ - struct net_device_stats dev_stat;/* device statistics */ - - char brand; /* manufacturer of the board */ - long clock; /* used clock */ - - io_port ctrl; /* I/O address of CONTROL register */ - io_port data; /* I/O address of DATA register */ - io_port special; /* I/O address of special function port */ - int irq; /* Number of Interrupt */ - - char option; - char enhanced; /* Enhanced SCC support */ - - unsigned char wreg[16]; /* Copy of last written value in WRx */ - unsigned char status; /* Copy of R0 at last external interrupt */ - unsigned char dcd; /* DCD status */ - - struct scc_kiss kiss; /* control structure for KISS params */ - struct scc_stat stat; /* statistical information */ - struct scc_modem modem; /* modem information */ - - struct sk_buff_head tx_queue; /* next tx buffer */ - struct sk_buff *rx_buff; /* pointer to frame currently received */ - struct sk_buff *tx_buff; /* pointer to frame currently transmitted */ - - /* Timer */ - struct timer_list tx_t; /* tx timer for this channel */ - struct timer_list tx_wdog; /* tx watchdogs */ - - /* Channel lock */ - spinlock_t lock; /* Channel guard lock */ -}; - -#endif /* defined(_SCC_H) */ diff --git a/include/linux/yam.h b/include/linux/yam.h deleted file mode 100644 index a29b04fa1e66..000000000000 --- a/include/linux/yam.h +++ /dev/null @@ -1,67 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/*****************************************************************************/ - -/* - * yam.h -- YAM radio modem driver. - * - * Copyright (C) 1998 Frederic Rible F1OAT (frible@teaser.fr) - * Adapted from baycom.c driver written by Thomas Sailer (sailer@ife.ee.ethz.ch) - * - * Please note that the GPL allows you to use the driver, NOT the radio. - * In order to use the radio, you need a license from the communications - * authority of your country. - */ - -/*****************************************************************************/ - -#define SIOCYAMRESERVED (0) -#define SIOCYAMSCFG (1) /* Set configuration */ -#define SIOCYAMGCFG (2) /* Get configuration */ -#define SIOCYAMSMCS (3) /* Set mcs data */ - -#define YAM_IOBASE (1 << 0) -#define YAM_IRQ (1 << 1) -#define YAM_BITRATE (1 << 2) /* Bit rate of radio port ->57600 */ -#define YAM_MODE (1 << 3) /* 0=simplex 1=duplex 2=duplex+tempo */ -#define YAM_HOLDDLY (1 << 4) /* duplex tempo (sec) */ -#define YAM_TXDELAY (1 << 5) /* Tx Delay (ms) */ -#define YAM_TXTAIL (1 << 6) /* Tx Tail (ms) */ -#define YAM_PERSIST (1 << 7) /* Persist (ms) */ -#define YAM_SLOTTIME (1 << 8) /* Slottime (ms) */ -#define YAM_BAUDRATE (1 << 9) /* Baud rate of rs232 port ->115200 */ - -#define YAM_MAXBITRATE 57600 -#define YAM_MAXBAUDRATE 115200 -#define YAM_MAXMODE 2 -#define YAM_MAXHOLDDLY 99 -#define YAM_MAXTXDELAY 999 -#define YAM_MAXTXTAIL 999 -#define YAM_MAXPERSIST 255 -#define YAM_MAXSLOTTIME 999 - -#define YAM_FPGA_SIZE 5302 - -struct yamcfg { - unsigned int mask; /* Mask of commands */ - unsigned int iobase; /* IO Base of COM port */ - unsigned int irq; /* IRQ of COM port */ - unsigned int bitrate; /* Bit rate of radio port */ - unsigned int baudrate; /* Baud rate of the RS232 port */ - unsigned int txdelay; /* TxDelay */ - unsigned int txtail; /* TxTail */ - unsigned int persist; /* Persistence */ - unsigned int slottime; /* Slottime */ - unsigned int mode; /* mode 0 (simp), 1(Dupl), 2(Dupl+delay) */ - unsigned int holddly; /* PTT delay in FullDuplex 2 mode */ -}; - -struct yamdrv_ioctl_cfg { - int cmd; - struct yamcfg cfg; -}; - -struct yamdrv_ioctl_mcs { - int cmd; - unsigned int bitrate; - unsigned char bits[YAM_FPGA_SIZE]; -}; -- cgit v1.2.3 From 43eb354ecb471426e97b0ce6a0c922ec20f82027 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 16 Apr 2026 14:54:29 -0700 Subject: nstree: fix func. parameter kernel-doc warnings Use the correct parameter name ("__ns") for function parameter kernel-doc to avoid 3 warnings: Warning: include/linux/nstree.h:68 function parameter '__ns' not described in 'ns_tree_add_raw' Warning: include/linux/nstree.h:77 function parameter '__ns' not described in 'ns_tree_add' Warning: include/linux/nstree.h:88 function parameter '__ns' not described in 'ns_tree_remove' Fixes: 885fc8ac0a4d ("nstree: make iterator generic") Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260416215429.948898-1-rdunlap@infradead.org Signed-off-by: Christian Brauner --- include/linux/nstree.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nstree.h b/include/linux/nstree.h index 175e4625bfa6..5b64d4572881 100644 --- a/include/linux/nstree.h +++ b/include/linux/nstree.h @@ -61,7 +61,7 @@ static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree_root *ns_t /** * ns_tree_add_raw - Add a namespace to a namespace - * @ns: Namespace to add + * @__ns: Namespace to add * * This function adds a namespace to the appropriate namespace tree * without assigning a id. @@ -70,7 +70,7 @@ static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree_root *ns_t /** * ns_tree_add - Add a namespace to a namespace tree - * @ns: Namespace to add + * @__ns: Namespace to add * * This function assigns a new id to the namespace and adds it to the * appropriate namespace tree and list. @@ -81,7 +81,7 @@ static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree_root *ns_t /** * ns_tree_remove - Remove a namespace from a namespace tree - * @ns: Namespace to remove + * @__ns: Namespace to remove * * This function removes a namespace from the appropriate namespace * tree and list. -- cgit v1.2.3 From 33e92e9ecf48c08cb4807e9a36f9eb01619c1a1e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 23 Apr 2026 11:56:11 +0200 Subject: eventpoll: refresh eventpoll_release() fast-path comment The old comment justified the lockless READ_ONCE(file->f_ep) check with "False positives simply cannot happen because the file is on the way to be removed and nobody ( but eventpoll ) has still a reference to this file." That reasoning was the root of the UAF fixed in "eventpoll: fix ep_remove struct eventpoll / struct file UAF": __ep_remove() could clear f_ep while another close raced past the fast path and freed the watched eventpoll / recycled the struct file slot. With ep_remove() now pinning @file via epi_fget() across the f_ep clear and hlist_del_rcu(), the invariant is re-established for the right reason: anyone who might clear f_ep holds @file alive for the duration, so a NULL observation really does mean no concurrent eventpoll path has work left on this file. Refresh the comment accordingly so the next reader doesn't inherit the broken model. Link: https://patch.msgid.link/20260423-work-epoll-uaf-v1-8-2470f9eec0f5@kernel.org Signed-off-by: Christian Brauner (Amutable) --- include/linux/eventpoll.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index ea9ca0e4172a..728fb5dee5ed 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -39,12 +39,16 @@ static inline void eventpoll_release(struct file *file) { /* - * Fast check to avoid the get/release of the semaphore. Since - * we're doing this outside the semaphore lock, it might return - * false negatives, but we don't care. It'll help in 99.99% of cases - * to avoid the semaphore lock. False positives simply cannot happen - * because the file in on the way to be removed and nobody ( but - * eventpoll ) has still a reference to this file. + * Fast check to skip the slow path in the common case where the + * file was never attached to an epoll. Safe without file->f_lock + * because every f_ep writer excludes a concurrent __fput() on + * @file: + * - ep_insert() requires the file alive (refcount > 0); + * - ep_remove() holds @file pinned via epi_fget() across the + * write; + * - eventpoll_release_file() runs from __fput() itself. + * We are in __fput() here, so none of those can race us: a NULL + * observation truly means no epoll path has work left on @file. */ if (likely(!READ_ONCE(file->f_ep))) return; -- cgit v1.2.3 From 619eab23e1ce7c97e54bfc5a417306d94b3f6f13 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 21 Apr 2026 11:21:50 +0100 Subject: mm/vma: do not try to unmap a VMA if mmap_prepare() invoked from mmap() The mmap_prepare hook functionality includes the ability to invoke mmap_prepare() from the mmap() hook of existing 'stacked' drivers, that is ones which are capable of calling the mmap hooks of other drivers/file systems (e.g. overlayfs, shm). As part of the mmap_prepare action functionality, we deal with errors by unmapping the VMA should one arise. This works in the usual mmap_prepare case, as we invoke this action at the last moment, when the VMA is established in the maple tree. However, the mmap() hook passes a not-fully-established VMA pointer to the caller (which is the motivation behind the mmap_prepare() work), which is detached. So attempting to unmap a VMA in this state will be problematic, with the most obvious symptom being a warning in vma_mark_detached(), because the VMA is already detached. It's also unncessary - the mmap() handler will clean up the VMA on error. So to fix this issue, this patch propagates whether or not an mmap action is being completed via the compatibility layer or directly. If the former, then we do not attempt VMA cleanup, if the latter, then we do. This patch also updates the userland VMA tests to reflect the change. Link: https://lore.kernel.org/20260421102150.189982-1-ljs@kernel.org Fixes: ac0a3fc9c07d ("mm: add ability to take further action in vm_area_desc") Signed-off-by: Lorenzo Stoakes Reported-by: syzbot+db390288d141a1dccf96@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/69e69734.050a0220.24bfd3.0027.GAE@google.com/ Cc: David Hildenbrand Cc: Jann Horn Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Pedro Falcato Cc: Suren Baghdasaryan Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0b776907152e..af23453e9dbd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4391,7 +4391,7 @@ static inline void mmap_action_map_kernel_pages_full(struct vm_area_desc *desc, int mmap_action_prepare(struct vm_area_desc *desc); int mmap_action_complete(struct vm_area_struct *vma, - struct mmap_action *action); + struct mmap_action *action, bool is_compat); /* Look up the first VMA which exactly match the interval vm_start ... vm_end */ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, -- cgit v1.2.3 From 77a50e9652ac3c669c6690088bce97d960f5fd17 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 22 Apr 2026 14:43:10 -0400 Subject: MAINTAINERS: update Liam's email address Switching to private email address. Update all contact information Add an entry to mailmap at the same time. Link: https://lore.kernel.org/20260422184310.2682901-1-liam@infradead.org Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 0c464eade1d6..4a5631906aff 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -4,7 +4,7 @@ /* * Maple Tree - An RCU-safe adaptive tree for storing ranges * Copyright (c) 2018-2022 Oracle - * Authors: Liam R. Howlett + * Authors: Liam R. Howlett * Matthew Wilcox */ -- cgit v1.2.3 From 5e25407b68f460142539536e31fa20338db6146f Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 10 Apr 2026 19:41:03 +0200 Subject: mtd: spinand: Add support for packed read data ODTR commands Some devices stuff address bits in the double byte opcode (in place of the repeated byte) in order to be able to increase the size of the devices, without adding extra address bytes. Create a flag to identify those devices. When the flag is set, use the "packed" variant for the read data operation. Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 58abd306ebe3..782984ba3a20 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -290,6 +290,12 @@ SPI_MEM_OP_NO_DUMMY, \ SPI_MEM_OP_NO_DATA) +#define SPINAND_PAGE_READ_PACKED_8D_8D_0_OP(addr) \ + SPI_MEM_OP(SPI_MEM_DTR_OP_PACKED_CMD(0x13, addr >> 16, 8), \ + SPI_MEM_DTR_OP_ADDR(2, addr & 0xffff, 8), \ + SPI_MEM_OP_NO_DUMMY, \ + SPI_MEM_OP_NO_DATA) + #define SPINAND_PAGE_READ_FROM_CACHE_8D_8D_8D_OP(addr, ndummy, buf, len, freq) \ SPI_MEM_OP(SPI_MEM_DTR_OP_RPT_CMD(0x9d, 8), \ SPI_MEM_DTR_OP_ADDR(2, addr, 8), \ @@ -483,6 +489,7 @@ struct spinand_ecc_info { #define SPINAND_HAS_PROG_PLANE_SELECT_BIT BIT(2) #define SPINAND_HAS_READ_PLANE_SELECT_BIT BIT(3) #define SPINAND_NO_RAW_ACCESS BIT(4) +#define SPINAND_ODTR_PACKED_PAGE_READ BIT(5) /** * struct spinand_ondie_ecc_conf - private SPI-NAND on-die ECC engine structure -- cgit v1.2.3 From 0898a817621a2f0cddca8122d9b974003fe5036d Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Mon, 27 Apr 2026 22:01:39 +0100 Subject: cdrom, scsi: sr: propagate read-only status to block layer via set_disk_ro() The cdrom core never calls set_disk_ro() for a registered device, so BLKROGET on a CD-ROM device always returns 0 (writable), even when the drive has no write capabilities and writes will inevitably fail. This causes problems for userspace that relies on BLKROGET to determine whether a block device is read-only. For example, systemd's loop device setup uses BLKROGET to decide whether to create a loop device with LO_FLAGS_READ_ONLY. Without the read-only flag, writes pass through the loop device to the CD-ROM and fail with I/O errors. systemd-fsck similarly checks BLKROGET to decide whether to run fsck in no-repair mode (-n). The write-capability bits in cdi->mask come from two different sources: CDC_DVD_RAM and CDC_CD_RW are populated by the driver from the MODE SENSE capabilities page (page 0x2A) before register_cdrom() is called, while CDC_MRW_W and CDC_RAM require the MMC GET CONFIGURATION command and were only probed by cdrom_open_write() at device open time. This meant that any attempt to compute the writable state from the full mask at probe time was incorrect, because the GET CONFIGURATION bits were still unset (and cdi->mask is initialized such that capabilities are assumed present). Fix this by factoring the GET CONFIGURATION probing out of cdrom_open_write() into a new exported helper, cdrom_probe_write_features(), and having sr call it from sr_probe() right after get_capabilities() has populated the MODE SENSE bits. register_cdrom() then calls set_disk_ro() based on the full write-capability mask (CDC_DVD_RAM | CDC_MRW_W | CDC_RAM | CDC_CD_RW) so the block layer reflects the drive's actual write support. The feature queries used (CDF_MRW and CDF_RWRT via GET CONFIGURATION with RT=00) report drive-level capabilities that are persistent across media, so a single probe before register_cdrom() is sufficient and the redundant probe at open time is dropped. With set_disk_ro() now accurate, the long-vestigial cd->writeable flag in sr can go: get_capabilities() used to set cd->writeable based on the same four mask bits, but because CDC_MRW_W and CDC_RAM default to "capability present" in cdi->mask and aren't touched by MODE SENSE, the condition that gated cd->writeable was always true, making it unconditionally 1. Replace the corresponding gate in sr_init_command() with get_disk_ro(cd->disk), which turns a previously no-op check into a real one and also catches kernel-internal bio writers that bypass blkdev_write_iter()'s bdev_read_only() check. The sd driver (SCSI disks) does not have this problem because it checks the MODE SENSE Write Protect bit and calls set_disk_ro() accordingly. The sr driver cannot use the same approach because the MMC specification does not define the WP bit in the MODE SENSE device-specific parameter byte for CD-ROM devices. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Daan De Meyer Reviewed-by: Phillip Potter Reviewed-by: Martin K. Petersen Signed-off-by: Phillip Potter Link: https://patch.msgid.link/20260427210139.1400-2-phil@philpotter.co.uk Signed-off-by: Jens Axboe --- include/linux/cdrom.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index b907e6c2307d..260d7968cf72 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -108,6 +108,7 @@ int cdrom_ioctl(struct cdrom_device_info *cdi, struct block_device *bdev, extern unsigned int cdrom_check_events(struct cdrom_device_info *cdi, unsigned int clearing); +extern void cdrom_probe_write_features(struct cdrom_device_info *cdi); extern int register_cdrom(struct gendisk *disk, struct cdrom_device_info *cdi); extern void unregister_cdrom(struct cdrom_device_info *cdi); -- cgit v1.2.3 From b3b6babf47517fde6b6de2493dea28e8831b9347 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 23 Apr 2026 05:34:54 +0000 Subject: ipmr: Free mr_table after RCU grace period. With CONFIG_IP_MROUTE_MULTIPLE_TABLES=n, ipmr_fib_lookup() does not check if net->ipv4.mrt is NULL. Since default_device_exit_batch() is called after ->exit_rtnl(), a device could receive IGMP packets and access net->ipv4.mrt during/after ipmr_rules_exit_rtnl(). If ipmr_rules_exit_rtnl() had already cleared it and freed the memory, the access would trigger null-ptr-deref or use-after-free. Let's fix it by using RCU helper and free mrt after RCU grace period. In addition, check_net(net) is added to mroute_clean_tables() and ipmr_cache_unresolved() to synchronise via mfc_unres_lock. This prevents ipmr_cache_unresolved() from putting skb into c->_c.mfc_un.unres.unresolved after mroute_clean_tables() purges it. For the same reason, timer_shutdown_sync() is moved after mroute_clean_tables(). Since rhltable_destroy() holds mutex internally, rcu_work is used, and it is placed as the first member because rcu_head must be placed within <4K offset. mr_table is alraedy 3864 bytes without rcu_work. Note that IP6MR is not yet converted to ->exit_rtnl(), so this change is not needed for now but will be. Fixes: b22b01867406 ("ipmr: Convert ipmr_net_exit_batch() to ->exit_rtnl().") Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260423053456.4097409-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/mroute_base.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index cf3374580f74..5d75cc5b057e 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -226,6 +226,7 @@ struct mr_table_ops { /** * struct mr_table - a multicast routing table + * @work: used for table destruction * @list: entry within a list of multicast routing tables * @net: net where this table belongs * @ops: protocol specific operations @@ -243,6 +244,7 @@ struct mr_table_ops { * @mroute_reg_vif_num: PIM-device vif index */ struct mr_table { + struct rcu_work work; struct list_head list; possible_net_t net; struct mr_table_ops ops; @@ -274,6 +276,7 @@ void vif_device_init(struct vif_device *v, unsigned short flags, unsigned short get_iflink_mask); +void mr_table_free(struct mr_table *mrt); struct mr_table * mr_table_alloc(struct net *net, u32 id, struct mr_table_ops *ops, -- cgit v1.2.3 From 0de4cb473aed57ee4ba7e0551ad27bddc19fc519 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 28 Apr 2026 08:10:43 -0700 Subject: workqueue: fix devm_alloc_workqueue() va_list misuse devm_alloc_workqueue() built a va_list and passed it as a single positional argument to the variadic alloc_workqueue() macro: va_start(args, max_active); wq = alloc_workqueue(fmt, flags, max_active, args); va_end(args); C does not allow forwarding a va_list through a ... parameter. alloc_workqueue() expands to alloc_workqueue_noprof(), which runs its own va_start() over its ... params, so the inner vsnprintf(wq->name, sizeof(wq->name), fmt, args) in __alloc_workqueue() received the outer va_list object as the first variadic slot rather than the caller's actual format arguments. Add a new static helper alloc_workqueue_va() that wraps __alloc_workqueue() and runs wq_init_lockdep() on success, and fold both alloc_workqueue_noprof() and devm_alloc_workqueue_noprof() onto it as suggested by Tejun. The wq_init_lockdep() step is required on the devm path too, otherwise __flush_workqueue()'s on-stack COMPLETION_INITIALIZER_ONSTACK_MAP would NULL-deref wq->lockdep_map. No caller changes are required. devm_alloc_ordered_workqueue() is a macro forwarding to devm_alloc_workqueue() and inherits the fix. Two in-tree callers actively trigger the broken path on every probe: drivers/power/supply/mt6370-charger.c:889 drivers/power/supply/max77705_charger.c:649 both of which use devm_alloc_ordered_workqueue(dev, "%s", 0, dev_name(dev)). A standalone reproducer module is available at[1]. Link: https://github.com/leitao/debug/blob/main/workqueue/valist/wq_va_test.c [1] Fixes: 1dfc9d60a69e ("workqueue: devres: Add device-managed allocate workqueue") Signed-off-by: Breno Leitao Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index ab6cb70ca1a5..6177624539b3 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -534,8 +534,10 @@ alloc_workqueue_noprof(const char *fmt, unsigned int flags, int max_active, ...) * Pointer to the allocated workqueue on success, %NULL on failure. */ __printf(2, 5) struct workqueue_struct * -devm_alloc_workqueue(struct device *dev, const char *fmt, unsigned int flags, - int max_active, ...); +devm_alloc_workqueue_noprof(struct device *dev, const char *fmt, + unsigned int flags, int max_active, ...); +#define devm_alloc_workqueue(...) \ + alloc_hooks(devm_alloc_workqueue_noprof(__VA_ARGS__)) #ifdef CONFIG_LOCKDEP /** -- cgit v1.2.3 From 5ec07d5204b4544271f32f6261ee097fe53cb081 Mon Sep 17 00:00:00 2001 From: Sheng Che Peng Date: Wed, 22 Apr 2026 10:18:19 +0800 Subject: tracepoint: Fix typo in tracepoint.h comment Change "my" to "may" in the description of subsystem configurations. Link: https://patch.msgid.link/20260422021819.1788091-1-synte4028@gmail.com Signed-off-by: Sheng Che Peng Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 578e520b6ee6..763eea4d80d8 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -202,7 +202,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define TP_CONDITION(args...) args /* - * Individual subsystem my have a separate configuration to + * Individual subsystem may have a separate configuration to * enable their tracepoints. By default, this file will create * the tracepoints if CONFIG_TRACEPOINTS is defined. If a subsystem * wants to be able to disable its tracepoints from being created -- cgit v1.2.3 From 6813985ca456d1f5677ad9554f55805cbf27e16f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 28 Apr 2026 17:35:18 +0200 Subject: netfilter: x_tables: add .check_hooks to matches and targets Add a new .check_hooks interface for checking if the match/target is used from the validate hook according to its configuration. Move existing conditional hook check based on the match/target configuration from .checkentry to .check_hooks for the following matches/targets: - addrtype - devgroup - physdev - policy - set - TCPMSS - SET This is a preparation patch to fix nft_compat, not functional changes are intended. Based on patch from Florian Westphal. Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 77c778d84d4c..a81b46af5118 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -146,6 +146,9 @@ struct xt_match { /* Called when user tries to insert an entry of this type. */ int (*checkentry)(const struct xt_mtchk_param *); + /* Called to validate hooks based on the match configuration. */ + int (*check_hooks)(const struct xt_mtchk_param *); + /* Called when entry of this type deleted. */ void (*destroy)(const struct xt_mtdtor_param *); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT @@ -187,6 +190,9 @@ struct xt_target { /* Should return 0 on success or an error code otherwise (-Exxxx). */ int (*checkentry)(const struct xt_tgchk_param *); + /* Called to validate hooks based on the target configuration. */ + int (*check_hooks)(const struct xt_tgchk_param *); + /* Called when entry of this type deleted. */ void (*destroy)(const struct xt_tgdtor_param *); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT @@ -279,8 +285,10 @@ bool xt_find_jump_offset(const unsigned int *offsets, int xt_check_proc_name(const char *name, unsigned int size); +int xt_check_hooks_match(struct xt_mtchk_param *par); int xt_check_match(struct xt_mtchk_param *, unsigned int size, u16 proto, bool inv_proto); +int xt_check_hooks_target(struct xt_tgchk_param *par); int xt_check_target(struct xt_tgchk_param *, unsigned int size, u16 proto, bool inv_proto); -- cgit v1.2.3 From 620055cb1036a6125fd912e7a14b47a6572b809b Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Mon, 27 Apr 2026 22:22:21 -0700 Subject: dpll: export __dpll_pin_change_ntf() for use under dpll_lock Export __dpll_pin_change_ntf() so that drivers can send pin change notifications from within pin callbacks, which are already called under dpll_lock. Using dpll_pin_change_ntf() in that context would deadlock. Add lockdep_assert_held() to catch misuse without the lock held. Acked-by: Vadim Fedorenko Signed-off-by: Ivan Vecera Signed-off-by: Petr Oros Tested-by: Alexander Nowlin Reviewed-by: Arkadiusz Kubalewski Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260427-jk-iwl-net-petr-oros-fixes-v1-9-cdcb48303fd8@intel.com Signed-off-by: Paolo Abeni --- include/linux/dpll.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/dpll.h b/include/linux/dpll.h index b7277a8b484d..f8037f1ab20b 100644 --- a/include/linux/dpll.h +++ b/include/linux/dpll.h @@ -286,6 +286,7 @@ int dpll_pin_ref_sync_pair_add(struct dpll_pin *pin, int dpll_device_change_ntf(struct dpll_device *dpll); +int __dpll_pin_change_ntf(struct dpll_pin *pin); int dpll_pin_change_ntf(struct dpll_pin *pin); int register_dpll_notifier(struct notifier_block *nb); -- cgit v1.2.3 From e9766e6f7d330dce7530918d8c6e3ec96d6c6e24 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Apr 2026 10:14:41 +0200 Subject: rseq: Protect rseq_reset() against interrupts rseq_reset() uses memset() to clear the tasks rseq data. That's racy against membarrier() and preemption. Guard it with irqsave to cure this. Fixes: faba9d250eae ("rseq: Introduce struct rseq_data") Reported-by: Dmitry Vyukov Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Link: https://patch.msgid.link/20260428224427.353887714%40kernel.org Cc: stable@vger.kernel.org --- include/linux/rseq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rseq.h b/include/linux/rseq.h index b9d62fc2140d..f446909551df 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -119,6 +119,8 @@ static inline void rseq_virt_userspace_exit(void) static inline void rseq_reset(struct task_struct *t) { + /* Protect against preemption and membarrier IPI */ + guard(irqsave)(); memset(&t->rseq, 0, sizeof(t->rseq)); t->rseq.ids.cpu_id = RSEQ_CPU_ID_UNINITIALIZED; } -- cgit v1.2.3 From e768103cfbac30a49860aca08a7710d39dbdd470 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 29 Apr 2026 15:43:36 +0200 Subject: smb: smbdirect: introduce and use include/linux/smbdirect.h This makes it easier to rebuild cifs.ko and ksmbd.ko against a running kernel. Suggested-by: Christoph Hellwig Link: https://lore.kernel.org/linux-cifs/aehrPuY60VMcYGU8@infradead.org/ Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: Namjae Jeon Cc: Christoph Hellwig Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- include/linux/smbdirect.h | 186 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 186 insertions(+) create mode 100644 include/linux/smbdirect.h (limited to 'include/linux') diff --git a/include/linux/smbdirect.h b/include/linux/smbdirect.h new file mode 100644 index 000000000000..97f5ba730fa7 --- /dev/null +++ b/include/linux/smbdirect.h @@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2025, Stefan Metzmacher + */ + +#ifndef __LINUX_SMBDIRECT_H__ +#define __LINUX_SMBDIRECT_H__ + +#include + +/* SMB-DIRECT buffer descriptor V1 structure [MS-SMBD] 2.2.3.1 */ +struct smbdirect_buffer_descriptor_v1 { + __le64 offset; + __le32 token; + __le32 length; +} __packed; + +/* + * Connection parameters mostly from [MS-SMBD] 3.1.1.1 + * + * These are setup and negotiated at the beginning of a + * connection and remain constant unless explicitly changed. + * + * Some values are important for the upper layer. + */ +struct smbdirect_socket_parameters { + __u64 flags; +#define SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB ((__u64)0x1) +#define SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW ((__u64)0x2) + __u32 resolve_addr_timeout_msec; + __u32 resolve_route_timeout_msec; + __u32 rdma_connect_timeout_msec; + __u32 negotiate_timeout_msec; + __u16 initiator_depth; /* limited to U8_MAX */ + __u16 responder_resources; /* limited to U8_MAX */ + __u16 recv_credit_max; + __u16 send_credit_target; + __u32 max_send_size; + __u32 max_fragmented_send_size; + __u32 max_recv_size; + __u32 max_fragmented_recv_size; + __u32 max_read_write_size; + __u32 max_frmr_depth; + __u32 keepalive_interval_msec; + __u32 keepalive_timeout_msec; +} __packed; + +#define SMBDIRECT_FLAG_PORT_RANGE_MASK ( \ + SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB | \ + SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW) + +struct smbdirect_socket; +struct smbdirect_send_batch; +struct smbdirect_mr_io; + +#include + +u8 smbdirect_netdev_rdma_capable_node_type(struct net_device *netdev); + +bool smbdirect_frwr_is_supported(const struct ib_device_attr *attrs); + +int smbdirect_socket_create_kern(struct net *net, struct smbdirect_socket **_sc); + +int smbdirect_socket_create_accepting(struct rdma_cm_id *id, struct smbdirect_socket **_sc); + +int smbdirect_socket_set_initial_parameters(struct smbdirect_socket *sc, + const struct smbdirect_socket_parameters *sp); + +const struct smbdirect_socket_parameters * +smbdirect_socket_get_current_parameters(struct smbdirect_socket *sc); + +int smbdirect_socket_set_kernel_settings(struct smbdirect_socket *sc, + enum ib_poll_context poll_ctx, + gfp_t gfp_mask); + +#define SMBDIRECT_LOG_ERR 0x0 +#define SMBDIRECT_LOG_INFO 0x1 + +#define SMBDIRECT_LOG_OUTGOING 0x1 +#define SMBDIRECT_LOG_INCOMING 0x2 +#define SMBDIRECT_LOG_READ 0x4 +#define SMBDIRECT_LOG_WRITE 0x8 +#define SMBDIRECT_LOG_RDMA_SEND 0x10 +#define SMBDIRECT_LOG_RDMA_RECV 0x20 +#define SMBDIRECT_LOG_KEEP_ALIVE 0x40 +#define SMBDIRECT_LOG_RDMA_EVENT 0x80 +#define SMBDIRECT_LOG_RDMA_MR 0x100 +#define SMBDIRECT_LOG_RDMA_RW 0x200 +#define SMBDIRECT_LOG_NEGOTIATE 0x400 +void smbdirect_socket_set_logging(struct smbdirect_socket *sc, + void *private_ptr, + bool (*needed)(struct smbdirect_socket *sc, + void *private_ptr, + unsigned int lvl, + unsigned int cls), + void (*vaprintf)(struct smbdirect_socket *sc, + const char *func, + unsigned int line, + void *private_ptr, + unsigned int lvl, + unsigned int cls, + struct va_format *vaf)); + +bool smbdirect_connection_is_connected(struct smbdirect_socket *sc); + +int smbdirect_connection_wait_for_connected(struct smbdirect_socket *sc); + +int smbdirect_socket_bind(struct smbdirect_socket *sc, struct sockaddr *addr); + +void smbdirect_socket_shutdown(struct smbdirect_socket *sc); + +void smbdirect_socket_release(struct smbdirect_socket *sc); + +int smbdirect_connection_send_batch_flush(struct smbdirect_socket *sc, + struct smbdirect_send_batch *batch, + bool is_last); + +/* + * This is only temporary and only needed + * as long as the client still requires + * to use smbdirect_connection_send_single_iter() + */ +struct smbdirect_send_batch_storage { + union { + struct list_head __msg_list; + __aligned_u64 __space[5]; + }; +}; + +struct smbdirect_send_batch * +smbdirect_init_send_batch_storage(struct smbdirect_send_batch_storage *storage, + bool need_invalidate_rkey, + unsigned int remote_key); + +int smbdirect_connection_send_single_iter(struct smbdirect_socket *sc, + struct smbdirect_send_batch *batch, + struct iov_iter *iter, + unsigned int flags, + u32 remaining_data_length); + +int smbdirect_connection_send_wait_zero_pending(struct smbdirect_socket *sc); + +int smbdirect_connection_send_iter(struct smbdirect_socket *sc, + struct iov_iter *iter, + unsigned int flags, + bool need_invalidate, + unsigned int remote_key); + +int smbdirect_connection_recvmsg(struct smbdirect_socket *sc, + struct msghdr *msg, + unsigned int flags); + +int smbdirect_connect(struct smbdirect_socket *sc, + const struct sockaddr *dst); + +int smbdirect_connect_sync(struct smbdirect_socket *sc, + const struct sockaddr *dst); + +int smbdirect_socket_listen(struct smbdirect_socket *sc, int backlog); + +struct smbdirect_socket *smbdirect_socket_accept(struct smbdirect_socket *lsc, + long timeo, + struct proto_accept_arg *arg); + +int smbdirect_connection_rdma_xmit(struct smbdirect_socket *sc, + void *buf, size_t buf_len, + struct smbdirect_buffer_descriptor_v1 *desc, + size_t desc_len, + bool is_read); + +struct smbdirect_mr_io * +smbdirect_connection_register_mr_io(struct smbdirect_socket *sc, + struct iov_iter *iter, + bool writing, + bool need_invalidate); + +void smbdirect_mr_io_fill_buffer_descriptor(struct smbdirect_mr_io *mr, + struct smbdirect_buffer_descriptor_v1 *v1); + +void smbdirect_connection_deregister_mr_io(struct smbdirect_mr_io *mr); + +void smbdirect_connection_legacy_debug_proc_show(struct smbdirect_socket *sc, + unsigned int rdma_readwrite_threshold, + struct seq_file *m); + +#endif /* __LINUX_SMBDIRECT_H__ */ -- cgit v1.2.3 From 93618edf753838a727dbff63c7c291dee22d656b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 1 May 2026 08:31:22 -1000 Subject: cgroup: Defer css percpu_ref kill on rmdir until cgroup is depopulated A chain of commits going back to v7.0 reworked rmdir to satisfy the controller invariant that a subsystem's ->css_offline() must not run while tasks are still doing kernel-side work in the cgroup. [1] d245698d727a ("cgroup: Defer task cgroup unlink until after the task is done switching out") [2] a72f73c4dd9b ("cgroup: Don't expose dead tasks in cgroup") [3] 1b164b876c36 ("cgroup: Wait for dying tasks to leave on rmdir") [4] 4c56a8ac6869 ("cgroup: Fix cgroup_drain_dying() testing the wrong condition") [5] 13e786b64bd3 ("cgroup: Increment nr_dying_subsys_* from rmdir context") [1] moved task cset unlink from do_exit() to finish_task_switch() so a task's cset link drops only after the task has fully stopped scheduling. That made tasks past exit_signals() linger on cset->tasks until their final context switch, which led to a series of problems as what userspace expected to see after rmdir diverged from what the kernel needs to wait for. [2]-[5] tried to bridge that divergence: [2] filtered the exiting tasks from cgroup.procs; [3] had rmdir(2) sleep in TASK_UNINTERRUPTIBLE for them; [4] fixed the wait's condition; [5] made nr_dying_subsys_* visible synchronously. The cgroup_drain_dying() wait in [3] turned out to be a dead end. When the rmdir caller is also the reaper of a zombie that pins a pidns teardown (e.g. host PID 1 systemd reaping orphan pids that were re-parented to it during the same teardown), rmdir blocks in TASK_UNINTERRUPTIBLE waiting for those pids to free, the pids can't free because PID 1 is the reaper and it's stuck in rmdir, and the system A-A deadlocks. No internal lock ordering breaks this; the wait itself is the bug. The css killing side that drove the original reorder, however, can be made cleanly asynchronous: ->css_offline() is already async, run from css_killed_work_fn() driven by percpu_ref_kill_and_confirm(). The fix is to make that chain start only after all tasks have left the cgroup. rmdir's user-visible side then returns as soon as cgroup.procs and friends are empty, while ->css_offline() still runs only after the cgroup is fully drained. Verified by the original reproducer (pidns teardown + zombie reaper, runs under vng) which hangs vanilla and succeeds here, and by per-commit deterministic repros for [2], [3], [4], [5] with a boot parameter that widens the post-exit_signals() window so each state is reliably reachable. Some stress tests on top of that. cgroup_apply_control_disable() has the same shape of pre-existing race: when a controller is disabled via subtree_control, kill_css() ran synchronously while tasks past exit_signals() could still be linked to the cgroup's csets, and ->css_offline() could fire before they drained. This patch preserves the existing synchronous behavior at that call site (kill_css_sync() + kill_css_finish() back-to-back) and a follow-up patch will defer kill_css_finish() there using a per-css trigger. This seems like the right approach and I don't see problems with it. The changes are somewhat invasive but not excessively so, so backporting to -stable should be okay. If something does turn out to be wrong, the fallback is to revert the entire chain ([1]-[5]) and rework in the development branch instead. v2: Pin cgrp across the deferred destroy work with explicit cgroup_get()/cgroup_put() around queue_work() and the work_fn. v1 wasn't actually broken (ordered cgroup_offline_wq + queue_work order in cgroup_task_dead() saved it) but the explicit ref removes the dependency on those non-obvious invariants. Also note the pre-existing cgroup_apply_control_disable() race in the description; a follow-up will defer kill_css_finish() there. Fixes: 1b164b876c36 ("cgroup: Wait for dying tasks to leave on rmdir") Cc: stable@vger.kernel.org # v7.0+ Reported-and-tested-by: Martin Pitt Link: https://lore.kernel.org/all/afHNg2VX2jy9bW7y@piware.de/ Link: https://lore.kernel.org/all/35e0670adb4abeab13da2c321582af9f@kernel.org/ Signed-off-by: Tejun Heo Acked-by: Sebastian Andrzej Siewior --- include/linux/cgroup-defs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index f42563739d2e..50a784da7a81 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -611,8 +611,8 @@ struct cgroup { /* used to wait for offlining of csses */ wait_queue_head_t offline_waitq; - /* used by cgroup_rmdir() to wait for dying tasks to leave */ - wait_queue_head_t dying_populated_waitq; + /* defers killing csses after removal until cgroup is depopulated */ + struct work_struct finish_destroy_work; /* used to schedule release agent */ struct work_struct release_agent_work; -- cgit v1.2.3 From 60f21a2649308bbd84919ba6656d5ccd660953cf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 27 Apr 2026 14:16:34 -1000 Subject: cgroup, sched_ext: Include exiting tasks in cgroup iter a72f73c4dd9b ("cgroup: Don't expose dead tasks in cgroup") made css_task_iter_advance() skip exiting tasks so cgroup.procs stays consistent with waitpid() visibility. Unfortunately, this broke scx_task_iter. scx_task_iter walks either scx_tasks (global) or a cgroup subtree via css_task_iter() and the two modes are expected to cover the same set of tasks. After the above change the cgroup-scoped mode silently skips tasks past exit_signals() that are still on scx_tasks. scx_sub_enable_workfn()'s abort path is one of the symptoms: an exiting SCX_TASK_SUB_INIT task can race past the cgroup iter leaking __scx_init_task() state. Other iterations share the same gap. Add CSS_TASK_ITER_WITH_DEAD to opt out of the skip and use it from scx_task_iter(). Fixes: b0e4c2f8a0f0 ("sched_ext: Implement cgroup subtree iteration for scx_task_iter") Reported-by: Cheng-Yang Chou Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index e52160e85af4..f6d037a30fd8 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -53,6 +53,7 @@ struct kernel_clone_args; enum css_task_iter_flags { CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */ CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */ + CSS_TASK_ITER_WITH_DEAD = (1U << 2), /* include exiting tasks */ CSS_TASK_ITER_SKIPPED = (1U << 16), /* internal flags */ }; -- cgit v1.2.3 From ff9eda4ea906b1f02fc260ddc42d2d9bd736a49c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 27 Apr 2026 14:16:35 -1000 Subject: sched_ext: Skip past-sched_ext_dead() tasks in scx_task_iter_next_locked() scx_task_iter's cgroup-scoped mode can return tasks whose sched_ext_dead() has already completed: cgroup_task_dead() removes from cset->tasks after sched_ext_dead() in finish_task_switch() and is irq-work deferred on PREEMPT_RT. The global mode is fine - sched_ext_dead() removes from scx_tasks via list_del_init() first. Callers (sub-sched enable prep/abort/apply, scx_sub_disable(), scx_fail_parent()) assume returned tasks are still on @sch and trip WARN_ON_ONCE() or operate on torn-down state otherwise. Set %SCX_TASK_OFF_TASKS in sched_ext_dead() under @p's rq lock and have scx_task_iter_next_locked() skip flagged tasks under the same lock. Setter and reader serialize on the per-task rq lock - no race. Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 1a3af2ea2a79..adb9a4de068a 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -101,6 +101,7 @@ enum scx_ent_flags { SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ SCX_TASK_SUB_INIT = 1 << 4, /* task being initialized for a sub sched */ SCX_TASK_IMMED = 1 << 5, /* task is on local DSQ with %SCX_ENQ_IMMED */ + SCX_TASK_OFF_TASKS = 1 << 6, /* removed from scx_tasks by sched_ext_dead() */ /* * Bits 8 and 9 are used to carry task state: -- cgit v1.2.3 From 8f78b749f3da0f43990490b4c1193b5ede3eec0a Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 30 Apr 2026 10:44:20 +0300 Subject: sched/isolation: Make HK_TYPE_KTHREAD an alias of HK_TYPE_DOMAIN Since commit 041ee6f3727a ("kthread: Rely on HK_TYPE_DOMAIN for preferred affinity management"), kthreads default to use the HK_TYPE_DOMAIN cpumask. IOW, it is no longer affected by the setting of the nohz_full boot kernel parameter. That means HK_TYPE_KTHREAD should now be an alias of HK_TYPE_DOMAIN instead of HK_TYPE_KERNEL_NOISE to correctly reflect the current kthread behavior. Make the change as HK_TYPE_KTHREAD is still being used in some networking code. Fixes: 041ee6f3727a ("kthread: Rely on HK_TYPE_DOMAIN for preferred affinity management") Signed-off-by: Waiman Long Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- include/linux/sched/isolation.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index dc3975ff1b2e..cf0fd03dd7a2 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -20,6 +20,11 @@ enum hk_type { HK_TYPE_KERNEL_NOISE, HK_TYPE_MAX, + /* + * HK_TYPE_KTHREAD is now an alias of HK_TYPE_DOMAIN + */ + HK_TYPE_KTHREAD = HK_TYPE_DOMAIN, + /* * The following housekeeping types are only set by the nohz_full * boot commandline option. So they can share the same value. @@ -29,7 +34,6 @@ enum hk_type { HK_TYPE_RCU = HK_TYPE_KERNEL_NOISE, HK_TYPE_MISC = HK_TYPE_KERNEL_NOISE, HK_TYPE_WQ = HK_TYPE_KERNEL_NOISE, - HK_TYPE_KTHREAD = HK_TYPE_KERNEL_NOISE }; #ifdef CONFIG_CPU_ISOLATION -- cgit v1.2.3 From dad0d91cc2c3e6b6fb285ccfe7ddf71525797198 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 28 Apr 2026 18:14:18 +0200 Subject: mm/slab: Add kvfree_atomic() helper kvmalloc() now supports non-sleeping GFP flags, including the vmalloc fallback path. This means it may return vmalloc memory even for GFP_ATOMIC and GFP_NOWAIT allocations. Freeing such memory with kvfree() may then end up calling vfree(), which is not safe for non-sleeping contexts. Introduce kvfree_atomic() helper for such cases. It mirrors kvfree(), but uses vfree_atomic() for vmalloced memory. Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Vlastimil Babka (SUSE) Acked-by: Harry Yoo (Oracle) Signed-off-by: Herbert Xu --- include/linux/slab.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 15a60b501b95..2b5ab488e96b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -1234,6 +1234,9 @@ void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long alig extern void kvfree(const void *addr); DEFINE_FREE(kvfree, void *, if (!IS_ERR_OR_NULL(_T)) kvfree(_T)) +extern void kvfree_atomic(const void *addr); +DEFINE_FREE(kvfree_atomic, void *, if (!IS_ERR_OR_NULL(_T)) kvfree_atomic(_T)) + extern void kvfree_sensitive(const void *addr, size_t len); unsigned int kmem_cache_size(struct kmem_cache *s); -- cgit v1.2.3 From b9eac6a9d93c952c4b7775a24d5c7a1bbf4c3c00 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 25 Apr 2026 00:47:54 +0200 Subject: rseq: Revert to historical performance killing behaviour The recent RSEQ optimization work broke the TCMalloc abuse of the RSEQ ABI as it not longer unconditionally updates the CPU, node, mm_cid fields, which are documented as read only for user space. Due to the observed behavior of the kernel it was possible for TCMalloc to overwrite the cpu_id_start field for their own purposes and rely on the kernel to update it unconditionally after each context switch and before signal delivery. The RSEQ ABI only guarantees that these fields are updated when the data changes, i.e. the task is migrated or the MMCID of the task changes due to switching from or to per CPU ownership mode. The optimization work eliminated the unconditional updates and reduced them to the documented ABI guarantees, which results in a massive performance win for syscall, scheduling heavy work loads, which in turn breaks the TCMalloc expectations. There have been several options discussed to restore the TCMalloc functionality while preserving the optimization benefits. They all end up in a series of hard to maintain workarounds, which in the worst case introduce overhead for everyone, e.g. in the scheduler. The requirements of TCMalloc and the optimization work are diametral and the required work arounds are a maintainence burden. They end up as fragile constructs, which are blocking further optimization work and are pretty much guaranteed to cause more subtle issues down the road. The optimization work heavily depends on the generic entry code, which is not used by all architectures yet. So the rework preserved the original mechanism moslty unmodified to keep the support for architectures, which handle rseq in their own exit to user space loop. That code is currently optimized out by the compiler on architectures which use the generic entry code. This allows to revert back to the original behaviour by replacing the compile time constant conditions with a runtime condition where required, which disables the optimization and the dependend time slice extension feature until the run-time condition can be enabled in the RSEQ registration code on a per task basis again. The following changes are required to restore the original behavior, which makes TCMalloc work again: 1) Replace the compile time constant conditionals with runtime conditionals where appropriate to prevent the compiler from optimizing the legacy mode out 2) Enforce unconditional update of IDs on context switch for the non-optimized v1 mode 3) Enforce update of IDs in the pre signal delivery path for the non-optimized v1 mode 4) Enforce update of IDs in the membarrier(RSEQ) IPI for the non-optimized v1 mode 5) Make time slice and future extensions depend on optimized v2 mode This brings back the full performance problems, but preserves the v2 optimization code and for generic entry code using architectures also the TIF_RSEQ optimization which avoids a full evaluation of the exit to user mode loop in many cases. Fixes: 566d8015f7ee ("rseq: Avoid CPU/MM CID updates when no event pending") Reported-by: Mathias Stearn Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Closes: https://lore.kernel.org/CAHnCjA25b+nO2n5CeifknSKHssJpPrjnf+dtr7UgzRw4Zgu=oA@mail.gmail.com Link: https://patch.msgid.link/20260428224427.517051752%40kernel.org Cc: stable@vger.kernel.org --- include/linux/rseq.h | 35 ++++++++++++++++++++++++----------- include/linux/rseq_entry.h | 39 +++++++++++++++++++++++++++++---------- include/linux/rseq_types.h | 9 ++++++++- 3 files changed, 61 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rseq.h b/include/linux/rseq.h index f446909551df..7ef79b25e714 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -9,6 +9,11 @@ void __rseq_handle_slowpath(struct pt_regs *regs); +static __always_inline bool rseq_v2(struct task_struct *t) +{ + return IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY) && likely(t->rseq.event.has_rseq > 1); +} + /* Invoked from resume_user_mode_work() */ static inline void rseq_handle_slowpath(struct pt_regs *regs) { @@ -16,8 +21,7 @@ static inline void rseq_handle_slowpath(struct pt_regs *regs) if (current->rseq.event.slowpath) __rseq_handle_slowpath(regs); } else { - /* '&' is intentional to spare one conditional branch */ - if (current->rseq.event.sched_switch & current->rseq.event.has_rseq) + if (current->rseq.event.sched_switch && current->rseq.event.has_rseq) __rseq_handle_slowpath(regs); } } @@ -30,9 +34,9 @@ void __rseq_signal_deliver(int sig, struct pt_regs *regs); */ static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { - if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { - /* '&' is intentional to spare one conditional branch */ - if (current->rseq.event.has_rseq & current->rseq.event.user_irq) + if (rseq_v2(current)) { + /* has_rseq is implied in rseq_v2() */ + if (current->rseq.event.user_irq) __rseq_signal_deliver(ksig->sig, regs); } else { if (current->rseq.event.has_rseq) @@ -50,15 +54,22 @@ static __always_inline void rseq_sched_switch_event(struct task_struct *t) { struct rseq_event *ev = &t->rseq.event; - if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { + /* + * Only apply the user_irq optimization for RSEQ ABI V2 registrations. + * Legacy users like TCMalloc rely on the original ABI V1 behaviour + * which updates IDs on every context swtich. + */ + if (rseq_v2(t)) { /* - * Avoid a boat load of conditionals by using simple logic - * to determine whether NOTIFY_RESUME needs to be raised. + * Avoid a boat load of conditionals by using simple logic to + * determine whether TIF_NOTIFY_RESUME or TIF_RSEQ needs to be + * raised. * - * It's required when the CPU or MM CID has changed or - * the entry was from user space. + * It's required when the CPU or MM CID has changed or the entry + * was via interrupt from user space. ev->has_rseq does not have + * to be evaluated here because rseq_v2() implies has_rseq. */ - bool raise = (ev->user_irq | ev->ids_changed) & ev->has_rseq; + bool raise = ev->user_irq | ev->ids_changed; if (raise) { ev->sched_switch = true; @@ -66,6 +77,7 @@ static __always_inline void rseq_sched_switch_event(struct task_struct *t) } } else { if (ev->has_rseq) { + t->rseq.event.ids_changed = true; t->rseq.event.sched_switch = true; rseq_raise_notify_resume(t); } @@ -161,6 +173,7 @@ static inline unsigned int rseq_alloc_align(void) } #else /* CONFIG_RSEQ */ +static inline bool rseq_v2(struct task_struct *t) { return false; } static inline void rseq_handle_slowpath(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_sched_switch_event(struct task_struct *t) { } diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index f11ebd34f8b9..934db41ec782 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -111,6 +111,20 @@ static __always_inline void rseq_slice_clear_grant(struct task_struct *t) t->rseq.slice.state.granted = false; } +/* + * Open coded, so it can be invoked within a user access region. + * + * This clears the user space state of the time slice extensions field only when + * the task has registered the optimized RSEQ_ABI V2. Some legacy registrations, + * e.g. TCMalloc, have conflicting non-ABI fields in struct RSEQ, which would be + * overwritten by an unconditional write. + */ +#define rseq_slice_clear_user(rseq, efault) \ +do { \ + if (rseq_slice_extension_enabled()) \ + unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); \ +} while (0) + static __always_inline bool __rseq_grant_slice_extension(bool work_pending) { struct task_struct *curr = current; @@ -230,6 +244,7 @@ static __always_inline bool rseq_slice_extension_enabled(void) { return false; } static __always_inline bool rseq_arm_slice_extension_timer(void) { return false; } static __always_inline void rseq_slice_clear_grant(struct task_struct *t) { } static __always_inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; } +#define rseq_slice_clear_user(rseq, efault) do { } while (0) #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */ bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); @@ -517,11 +532,9 @@ bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, if (csaddr) unsafe_get_user(*csaddr, &rseq->rseq_cs, efault); - /* Open coded, so it's in the same user access region */ - if (rseq_slice_extension_enabled()) { - /* Unconditionally clear it, no point in conditionals */ - unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); - } + /* RSEQ ABI V2 only operations */ + if (rseq_v2(t)) + rseq_slice_clear_user(rseq, efault); } rseq_slice_clear_grant(t); @@ -612,6 +625,14 @@ static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct t * interrupts disabled */ guard(pagefault)(); + /* + * This optimization is only valid when the task registered for the + * optimized RSEQ_ABI_V2 variant. Some legacy users rely on the original + * RSEQ implementation behaviour which unconditionally updated the IDs. + * rseq_sched_switch_event() ensures that legacy registrations always + * have both sched_switch and ids_changed set, which is compatible with + * the historical TIF_NOTIFY_RESUME behaviour. + */ if (likely(!t->rseq.event.ids_changed)) { struct rseq __user *rseq = t->rseq.usrptr; /* @@ -623,11 +644,9 @@ static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct t scoped_user_rw_access(rseq, efault) { unsafe_get_user(csaddr, &rseq->rseq_cs, efault); - /* Open coded, so it's in the same user access region */ - if (rseq_slice_extension_enabled()) { - /* Unconditionally clear it, no point in conditionals */ - unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); - } + /* RSEQ ABI V2 only operations */ + if (rseq_v2(t)) + rseq_slice_clear_user(rseq, efault); } rseq_slice_clear_grant(t); diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index 0b42045988db..a469c1870849 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -9,6 +9,12 @@ #ifdef CONFIG_RSEQ struct rseq; +/* + * rseq_event::has_rseq contains the ABI version number so preserving it + * in AND operations requires a mask. + */ +#define RSEQ_HAS_RSEQ_VERSION_MASK 0xff + /** * struct rseq_event - Storage for rseq related event management * @all: Compound to initialize and clear the data efficiently @@ -17,7 +23,8 @@ struct rseq; * exit to user * @ids_changed: Indicator that IDs need to be updated * @user_irq: True on interrupt entry from user mode - * @has_rseq: True if the task has a rseq pointer installed + * @has_rseq: Greater than 0 if the task has a rseq pointer installed. + * Contains the RSEQ version number * @error: Compound error code for the slow path to analyze * @fatal: User space data corrupted or invalid * @slowpath: Indicator that slow path processing via TIF_NOTIFY_RESUME -- cgit v1.2.3 From 82f572449cfe75f12ea985986da60e11f308f77d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 26 Apr 2026 16:21:02 +0200 Subject: rseq: Implement read only ABI enforcement for optimized RSEQ V2 mode The optimized RSEQ V2 mode requires that user space adheres to the ABI specification and does not modify the read-only fields cpu_id_start, cpu_id, node_id and mm_cid behind the kernel's back. While the kernel does not rely on these fields, the adherence to this is a fundamental prerequisite to allow multiple entities, e.g. libraries, in an application to utilize the full potential of RSEQ without stepping on each other toes. Validate this adherence on every update of these fields. If the kernel detects that user space modified the fields, the application is force terminated. Fixes: d6200245c75e ("rseq: Allow registering RSEQ with slice extension") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Link: https://patch.msgid.link/20260428224427.845230956%40kernel.org Cc: stable@vger.kernel.org --- include/linux/rseq_entry.h | 83 +++++++++++++++++----------------------------- include/linux/rseq_types.h | 4 ++- 2 files changed, 33 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index 934db41ec782..2d0295df5107 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -248,7 +248,6 @@ static __always_inline bool rseq_grant_slice_extension(unsigned long ti_work, un #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */ bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); -bool rseq_debug_validate_ids(struct task_struct *t); static __always_inline void rseq_note_user_irq_entry(void) { @@ -368,43 +367,6 @@ efault: return false; } -/* - * On debug kernels validate that user space did not mess with it if the - * debug branch is enabled. - */ -bool rseq_debug_validate_ids(struct task_struct *t) -{ - struct rseq __user *rseq = t->rseq.usrptr; - u32 cpu_id, uval, node_id; - - /* - * On the first exit after registering the rseq region CPU ID is - * RSEQ_CPU_ID_UNINITIALIZED and node_id in user space is 0! - */ - node_id = t->rseq.ids.cpu_id != RSEQ_CPU_ID_UNINITIALIZED ? - cpu_to_node(t->rseq.ids.cpu_id) : 0; - - scoped_user_read_access(rseq, efault) { - unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault); - if (cpu_id != t->rseq.ids.cpu_id) - goto die; - unsafe_get_user(uval, &rseq->cpu_id, efault); - if (uval != cpu_id) - goto die; - unsafe_get_user(uval, &rseq->node_id, efault); - if (uval != node_id) - goto die; - unsafe_get_user(uval, &rseq->mm_cid, efault); - if (uval != t->rseq.ids.mm_cid) - goto die; - } - return true; -die: - t->rseq.event.fatal = true; -efault: - return false; -} - #endif /* RSEQ_BUILD_SLOW_PATH */ /* @@ -514,20 +476,32 @@ efault: * faults in task context are fatal too. */ static rseq_inline -bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, - u32 node_id, u64 *csaddr) +bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, u64 *csaddr) { struct rseq __user *rseq = t->rseq.usrptr; - if (static_branch_unlikely(&rseq_debug_enabled)) { - if (!rseq_debug_validate_ids(t)) - return false; - } - scoped_user_rw_access(rseq, efault) { + /* Validate the R/O fields for debug and optimized mode */ + if (static_branch_unlikely(&rseq_debug_enabled) || rseq_v2(t)) { + u32 cpu_id, uval; + + unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault); + if (cpu_id != t->rseq.ids.cpu_id) + goto die; + unsafe_get_user(uval, &rseq->cpu_id, efault); + if (uval != cpu_id) + goto die; + unsafe_get_user(uval, &rseq->node_id, efault); + if (uval != t->rseq.ids.node_id) + goto die; + unsafe_get_user(uval, &rseq->mm_cid, efault); + if (uval != t->rseq.ids.mm_cid) + goto die; + } + unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault); unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault); - unsafe_put_user(node_id, &rseq->node_id, efault); + unsafe_put_user(ids->node_id, &rseq->node_id, efault); unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault); if (csaddr) unsafe_get_user(*csaddr, &rseq->rseq_cs, efault); @@ -539,10 +513,13 @@ bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, rseq_slice_clear_grant(t); /* Cache the new values */ - t->rseq.ids.cpu_cid = ids->cpu_cid; + t->rseq.ids = *ids; rseq_stat_inc(rseq_stats.ids); rseq_trace_update(t, ids); return true; + +die: + t->rseq.event.fatal = true; efault: return false; } @@ -552,11 +529,11 @@ efault: * is in a critical section. */ static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs, - struct rseq_ids *ids, u32 node_id) + struct rseq_ids *ids) { u64 csaddr; - if (!rseq_set_ids_get_csaddr(t, ids, node_id, &csaddr)) + if (!rseq_set_ids_get_csaddr(t, ids, &csaddr)) return false; /* @@ -659,12 +636,12 @@ static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct t } struct rseq_ids ids = { - .cpu_id = task_cpu(t), - .mm_cid = task_mm_cid(t), + .cpu_id = task_cpu(t), + .mm_cid = task_mm_cid(t), + .node_id = cpu_to_node(ids.cpu_id), }; - u32 node_id = cpu_to_node(ids.cpu_id); - return rseq_update_usr(t, regs, &ids, node_id); + return rseq_update_usr(t, regs, &ids); efault: return false; } diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index a469c1870849..85739a63e85e 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -66,8 +66,9 @@ struct rseq_event { * compiler emit a single compare on 64-bit * @cpu_id: The CPU ID which was written last to user space * @mm_cid: The MM CID which was written last to user space + * @node_id: The node ID which was written last to user space * - * @cpu_id and @mm_cid are updated when the data is written to user space. + * @cpu_id, @mm_cid and @node_id are updated when the data is written to user space. */ struct rseq_ids { union { @@ -77,6 +78,7 @@ struct rseq_ids { u32 mm_cid; }; }; + u32 node_id; }; /** -- cgit v1.2.3 From 1f7305d87aa23db2579df222eba504a333c2c978 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 5 May 2026 17:52:03 +0100 Subject: KVM: arm64: Work around C1-Pro erratum 4193714 for protected guests C1-Pro cores with SME have an erratum where TLBI+DSB does not complete all outstanding SME accesses. Instead a DSB needs to be executed on the affected CPUs. The implication is that pages cannot be unmapped from the host Stage 2 and then provided to a protected guest or to the hypervisor. Host SME accesses may still complete after this point. This erratum breaks pKVM's guarantees, and the workaround is hard to implement as EL2 and EL1 share a security state meaning EL1 can mask IPIs sent by EL2, leading to interrupt blackouts. Instead, do this in EL3. This has the advantage of a separate security state, meaning lower EL cannot mask the IPI. It is also simpler for EL3 to know about CPUs that are off or in PSCI's CPU_SUSPEND. Add the needed hook to host_stage2_set_owner_metadata_locked(). This covers the cases where the host loses access to a page: __pkvm_host_donate_guest() __pkvm_guest_unshare_host() host_stage2_set_owner_locked() when owner_id == PKVM_ID_HYP Since pKVM relies on the firmware call for correctness, check for the firmware counterpart during protected KVM initialisation and fail the pKVM initialisation if it is missing. Signed-off-by: James Morse Co-developed-by: Catalin Marinas Signed-off-by: Catalin Marinas Cc: Mark Rutland Cc: Marc Zyngier Cc: Oliver Upton Cc: Will Deacon Cc: Vincent Donnefort Cc: Lorenzo Pieralisi Cc: Sudeep Holla Link: https://patch.msgid.link/20260505165205.2690919-1-catalin.marinas@arm.com Signed-off-by: Marc Zyngier --- include/linux/arm-smccc.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 50b47eba7d01..e7195750d21b 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -105,6 +105,12 @@ ARM_SMCCC_SMC_32, \ 0, 0x3fff) +/* C1-Pro erratum 4193714: SME DVMSync early acknowledgement */ +#define ARM_SMCCC_CPU_WORKAROUND_4193714 \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_CPU, 0x10) + #define ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID \ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ ARM_SMCCC_SMC_32, \ -- cgit v1.2.3 From 57c347a2e2473bfb5c1f1132a3209c55efbe640b Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 30 Apr 2026 08:11:02 -0700 Subject: platform/x86: intel: Add notifiers support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In some cases a driver using services of vsec_tpmi driver requires some processing before vsec_tpmi exits. For example a children using debugfs can't use debugfs as this will be deleted by the vsec_tpmi driver. This is the case when unbind using PCI driver interface. In this case the remove callback of vsec_tpmi driver is called first, then remove callback of its children. Add support of blocking chain notifiers support. Notify on successful probe and before clean up in the remove callback. Fixes: 811f67c51636 ("platform/x86/intel/tpmi: Add new auxiliary driver for performance limits") Signed-off-by: Srinivas Pandruvada Cc: Stable@vger.kernel.org Link: https://patch.msgid.link/20260430151103.1549733-3-srinivas.pandruvada@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/intel_tpmi.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/intel_tpmi.h b/include/linux/intel_tpmi.h index 94c06bf214fb..15f02422e9ca 100644 --- a/include/linux/intel_tpmi.h +++ b/include/linux/intel_tpmi.h @@ -28,6 +28,12 @@ enum intel_tpmi_id { TPMI_INFO_ID = 0x81, /* Special ID for PCI BDF and Package ID information */ }; +#define TPMI_CORE_INIT 0 +#define TPMI_CORE_EXIT 1 + +int tpmi_register_notifier(struct notifier_block *nb); +int tpmi_unregister_notifier(struct notifier_block *nb); + struct oobmsm_plat_info *tpmi_get_platform_data(struct auxiliary_device *auxdev); struct resource *tpmi_get_resource_at_index(struct auxiliary_device *auxdev, int index); int tpmi_get_resource_count(struct auxiliary_device *auxdev); -- cgit v1.2.3 From b62eb8dcf2c47d4d676a434efbd57c4f776f7829 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 May 2026 12:07:14 +0200 Subject: netfilter: x_tables: allocate hook ops while under mutex arp/ip(6)t_register_table() add the table to the per-netns list via xt_register_table() before allocating the per-netns hook ops copy via kmemdup_array(). This leaves a window where the table is visible in the list with ops=NULL. If the pernet exit happens runs concurrently the pre_exit callback finds the table via xt_find_table() and passes the NULL ops pointer to nf_unregister_net_hooks(), causing a NULL dereference: general protection fault in nf_unregister_net_hooks+0xbc/0x150 RIP: nf_unregister_net_hooks (net/netfilter/core.c:613) Call Trace: ipt_unregister_table_pre_exit iptable_mangle_net_pre_exit ops_pre_exit_list cleanup_net Fix by moving the ops allocation into the xtables core so the table is never in the list without valid ops. Also ensure the table is no longer processing packets before its torn down on error unwind. nf_register_net_hooks might have published at least one hook; call synchronize_rcu() if there was an error. audit log register message gets deferred until all operations have passed, this avoids need to emit another ureg message in case of error unwinding. Based on earlier patch by Tristan Madani. Fixes: f9006acc8dfe5 ("netfilter: arp_tables: pass table pointer via nf_hook_ops") Fixes: ee177a54413a ("netfilter: ip6_tables: pass table pointer via nf_hook_ops") Fixes: ae689334225f ("netfilter: ip_tables: pass table pointer via nf_hook_ops") Link: https://lore.kernel.org/netfilter-devel/20260429175613.1459342-1-tristmd@gmail.com/ Signed-off-by: Tristan Madani Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index a81b46af5118..cb4b694dd9e4 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -305,6 +305,7 @@ struct xt_counters *xt_counters_alloc(unsigned int counters); struct xt_table *xt_register_table(struct net *net, const struct xt_table *table, + const struct nf_hook_ops *template_ops, struct xt_table_info *bootstrap, struct xt_table_info *newinfo); void *xt_unregister_table(struct xt_table *table); -- cgit v1.2.3 From 527d6931473b75d90e38942aae6537d1a527f1fd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 May 2026 12:07:15 +0200 Subject: netfilter: x_tables: add and use xt_unregister_table_pre_exit Remove the copypasted variants of _pre_exit and add one single function in the xtables core. ebtables is not compatible with x_tables and therefore unchanged. This is a preparation patch to reduce noise in the followup bug fixes. Reviewed-by: Tristan Madani Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 1 + include/linux/netfilter_arp/arp_tables.h | 1 - include/linux/netfilter_ipv4/ip_tables.h | 1 - include/linux/netfilter_ipv6/ip6_tables.h | 1 - 4 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index cb4b694dd9e4..74486714ae20 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -309,6 +309,7 @@ struct xt_table *xt_register_table(struct net *net, struct xt_table_info *bootstrap, struct xt_table_info *newinfo); void *xt_unregister_table(struct xt_table *table); +void xt_unregister_table_pre_exit(struct net *net, u8 af, const char *name); struct xt_table_info *xt_replace_table(struct xt_table *table, unsigned int num_counters, diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h index a40aaf645fa4..05631a25e622 100644 --- a/include/linux/netfilter_arp/arp_tables.h +++ b/include/linux/netfilter_arp/arp_tables.h @@ -53,7 +53,6 @@ int arpt_register_table(struct net *net, const struct xt_table *table, const struct arpt_replace *repl, const struct nf_hook_ops *ops); void arpt_unregister_table(struct net *net, const char *name); -void arpt_unregister_table_pre_exit(struct net *net, const char *name); extern unsigned int arpt_do_table(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h index 132b0e4a6d4d..13593391d605 100644 --- a/include/linux/netfilter_ipv4/ip_tables.h +++ b/include/linux/netfilter_ipv4/ip_tables.h @@ -26,7 +26,6 @@ int ipt_register_table(struct net *net, const struct xt_table *table, const struct ipt_replace *repl, const struct nf_hook_ops *ops); -void ipt_unregister_table_pre_exit(struct net *net, const char *name); void ipt_unregister_table_exit(struct net *net, const char *name); /* Standard entry. */ diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h index 8b8885a73c76..c6d5b927830d 100644 --- a/include/linux/netfilter_ipv6/ip6_tables.h +++ b/include/linux/netfilter_ipv6/ip6_tables.h @@ -27,7 +27,6 @@ extern void *ip6t_alloc_initial_table(const struct xt_table *); int ip6t_register_table(struct net *net, const struct xt_table *table, const struct ip6t_replace *repl, const struct nf_hook_ops *ops); -void ip6t_unregister_table_pre_exit(struct net *net, const char *name); void ip6t_unregister_table_exit(struct net *net, const char *name); extern unsigned int ip6t_do_table(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); -- cgit v1.2.3 From b4597d5fd7d2f8cebfffd40dffb5e003cc78964c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 May 2026 12:07:17 +0200 Subject: netfilter: x_tables: add and use xtables_unregister_table_exit Previous change added xtables_unregister_table_pre_exit to detach the table from the packetpath and to unlink it from the active table list. In case of rmmod, userspace that is doing set/getsockopt for this table will not be able to re-instantiate the table: 1. The larval table has been removed already 2. existing instantiated table is no longer on the xt pernet table list. This adds the second stage helper: unlink the table from the dying list, free the hook ops (if any) and do the audit notification. It replaces xt_unregister_table(). Fixes: fdacd57c79b7 ("netfilter: x_tables: never register tables by default") Reported-by: Tristan Madani Reviewed-by: Tristan Madani Closes: https://lore.kernel.org/netfilter-devel/20260429175613.1459342-1-tristmd@gmail.com/ Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 74486714ae20..5a1c5c336fa4 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -308,8 +308,8 @@ struct xt_table *xt_register_table(struct net *net, const struct nf_hook_ops *template_ops, struct xt_table_info *bootstrap, struct xt_table_info *newinfo); -void *xt_unregister_table(struct xt_table *table); void xt_unregister_table_pre_exit(struct net *net, u8 af, const char *name); +struct xt_table *xt_unregister_table_exit(struct net *net, u8 af, const char *name); struct xt_table_info *xt_replace_table(struct xt_table *table, unsigned int num_counters, -- cgit v1.2.3 From 411c1cf430392c905e39f12bc305dd994da0b426 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 8 May 2026 15:20:23 +0100 Subject: arm64/entry: Fix arm64-specific rseq brokenness Mathias Stearn reports that since v6.19, there are two big issues affecting rseq: (1) On arm64 specifically, rseq critical sections aren't aborted when they should be. (2) The 'cpu_id_start' field is no longer written by the kernel in all cases it used to be, including some cases where TCMalloc depends on the kernel clobbering the field. This patch fixes issue #1. This patch DOES NOT fix issue #2, which will need to be addressed by other patches. The arm64-specific brokenness is a result of commits: 2fc0e4b4126c ("rseq: Record interrupt from user space") 39a167560a61 ("rseq: Optimize event setting") The first commit failed to add a call to rseq_note_user_irq_entry() on arm64. Thus arm64 never sets rseq_event::user_irq to record that it may be necessary to abort an active rseq critical section upon return to userspace. On its own, this commit had no functional impact as the value of rseq_event::user_irq was not consumed. The second commit relied upon rseq_event::user_irq to determine whether or not to bother to perform rseq work when returning to userspace. As rseq_event::user_irq wasn't set on arm64, this work would be skipped, and consequently an active rseq critical section would not be aborted. Fix this by giving arm64 syscall-specific entry/exit paths, and performing the relevant logic in syscall and non-syscall paths, including calling rseq_note_user_irq_entry() for non-syscall entry. Currently arm64 cannot use syscall_enter_from_user_mode(), syscall_exit_to_user_mode(), and irqentry_exit_to_user_mode(), due to ordering constraints with exception masking, and risk of ABI breakage for syscall tracing/audit/etc. For the moment the entry/exit logic is left as arm64-specific, directly using enter_from_user_mode() and exit_to_user_mode(), but mirroring the generic code. I intend to follow up with refactoring/cleanup, as we did for kernel mode entry paths in commit: 041aa7a85390 ("entry: Split preemption from irqentry_exit_to_kernel_mode()") ... which will allow arm64 to use the GENERIC_IRQ_ENTRY functions directly. Fixes: 39a167560a61 ("rseq: Optimize event setting") Reported-by: Mathias Stearn Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Acked-by: Catalin Marinas Link: https://lore.kernel.org/regressions/CAHnCjA25b+nO2n5CeifknSKHssJpPrjnf+dtr7UgzRw4Zgu=oA@mail.gmail.com/ Link: https://patch.msgid.link/20260508142023.3268622-1-mark.rutland@arm.com --- include/linux/irq-entry-common.h | 8 -------- include/linux/rseq_entry.h | 19 ------------------- 2 files changed, 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index 167fba7dbf04..1fabf0f5ea8e 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -218,14 +218,6 @@ static __always_inline void __exit_to_user_mode_validate(void) lockdep_sys_exit(); } -/* Temporary workaround to keep ARM64 alive */ -static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *regs) -{ - __exit_to_user_mode_prepare(regs, EXIT_TO_USER_MODE_WORK); - rseq_exit_to_user_mode_legacy(); - __exit_to_user_mode_validate(); -} - /** * syscall_exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required * @regs: Pointer to pt_regs on entry stack diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index 2d0295df5107..63bc72086e75 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -749,24 +749,6 @@ static __always_inline void rseq_irqentry_exit_to_user_mode(void) ev->events = 0; } -/* Required to keep ARM64 working */ -static __always_inline void rseq_exit_to_user_mode_legacy(void) -{ - struct rseq_event *ev = ¤t->rseq.event; - - rseq_stat_inc(rseq_stats.exit); - - if (static_branch_unlikely(&rseq_debug_enabled)) - WARN_ON_ONCE(ev->sched_switch); - - /* - * Ensure that event (especially user_irq) is cleared when the - * interrupt did not result in a schedule and therefore the - * rseq processing did not clear it. - */ - ev->events = 0; -} - void __rseq_debug_syscall_return(struct pt_regs *regs); static __always_inline void rseq_debug_syscall_return(struct pt_regs *regs) @@ -782,7 +764,6 @@ static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned } static inline void rseq_syscall_exit_to_user_mode(void) { } static inline void rseq_irqentry_exit_to_user_mode(void) { } -static inline void rseq_exit_to_user_mode_legacy(void) { } static inline void rseq_debug_syscall_return(struct pt_regs *regs) { } static inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; } #endif /* !CONFIG_RSEQ */ -- cgit v1.2.3 From cceb8fa9cb2cf98e31d81ecf6353b6ba5ac57744 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 10 May 2026 10:08:16 -1000 Subject: sched_ext: Replace SCX_TASK_OFF_TASKS flag with SCX_TASK_DEAD state SCX_TASK_OFF_TASKS marked tasks already through sched_ext_dead() so cgroup task iteration would skip them. This can be expressed better with a task state. Replace the flag with SCX_TASK_DEAD. scx_disable_and_exit_task() resets state to NONE on its way out, so sched_ext_dead() now sets DEAD after the wrapper returns. The validation matrix grows NONE -> DEAD, warns on DEAD -> NONE, and tightens READY's predecessor to INIT or ENABLED so the new DEAD value cannot silently transition to READY. Prepares for the following enable vs dead race fix. Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index adb9a4de068a..9f1a326ad03e 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -101,24 +101,25 @@ enum scx_ent_flags { SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ SCX_TASK_SUB_INIT = 1 << 4, /* task being initialized for a sub sched */ SCX_TASK_IMMED = 1 << 5, /* task is on local DSQ with %SCX_ENQ_IMMED */ - SCX_TASK_OFF_TASKS = 1 << 6, /* removed from scx_tasks by sched_ext_dead() */ /* - * Bits 8 and 9 are used to carry task state: + * Bits 8 to 10 are used to carry task state: * * NONE ops.init_task() not called yet * INIT ops.init_task() succeeded, but task can be cancelled * READY fully initialized, but not in sched_ext * ENABLED fully initialized and in sched_ext + * DEAD terminal state set by sched_ext_dead() */ - SCX_TASK_STATE_SHIFT = 8, /* bits 8 and 9 are used to carry task state */ - SCX_TASK_STATE_BITS = 2, + SCX_TASK_STATE_SHIFT = 8, + SCX_TASK_STATE_BITS = 3, SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT, SCX_TASK_NONE = 0 << SCX_TASK_STATE_SHIFT, SCX_TASK_INIT = 1 << SCX_TASK_STATE_SHIFT, SCX_TASK_READY = 2 << SCX_TASK_STATE_SHIFT, SCX_TASK_ENABLED = 3 << SCX_TASK_STATE_SHIFT, + SCX_TASK_DEAD = 4 << SCX_TASK_STATE_SHIFT, /* * Bits 12 and 13 are used to carry reenqueue reason. In addition to -- cgit v1.2.3 From c941d7391f258d5d06e0f7e962a52f99a547a83e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 10 May 2026 10:08:16 -1000 Subject: sched_ext: Close root-enable vs sched_ext_dead() race with SCX_TASK_INIT_BEGIN scx_root_enable_workfn() drops the iter rq lock for ops.init_task() and a TASK_DEAD @p can fall through sched_ext_dead() in that window. The race hits when sched_ext_dead() observes SCX_TASK_INIT (the intermediate state before @p->scx.sched is published) and dereferences NULL via SCX_HAS_OP(NULL, exit_task), or observes SCX_TASK_NONE during the unlocked init window and skips cleanup so exit_task() never runs. Add SCX_TASK_INIT_BEGIN. The enable path writes NONE -> INIT_BEGIN under the iter rq lock, then takes the rq lock again after init to walk INIT_BEGIN -> INIT -> READY. sched_ext_dead() that wins the rq-lock race observes INIT_BEGIN and sets DEAD without calling into ops; the post-init recheck unwinds via scx_sub_init_cancel_task(). scx_fork() runs single-threaded against sched_ext_dead() (the task is not on scx_tasks until scx_post_fork() adds it) so its INIT_BEGIN -> INIT walk needs no rq-lock pairing; it rolls back to NONE on ops.init_task() failure. The validation matrix grows the INIT_BEGIN row and the INIT_BEGIN -> DEAD edge; INIT now requires INIT_BEGIN as the predecessor. scx_sub_disable()'s migration writes INIT_BEGIN as a synthetic predecessor to satisfy the tightened verification. The sub-sched paths still race with sched_ext_dead() during the unlocked init window. This will be fixed by the next patch. Reported-by: zhidao su Link: https://lore.kernel.org/all/20260429133155.3825247-1-suzhidao@xiaomi.com/ Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 9f1a326ad03e..2129e18ada58 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -106,6 +106,7 @@ enum scx_ent_flags { * Bits 8 to 10 are used to carry task state: * * NONE ops.init_task() not called yet + * INIT_BEGIN ops.init_task() in flight; see sched_ext_dead() * INIT ops.init_task() succeeded, but task can be cancelled * READY fully initialized, but not in sched_ext * ENABLED fully initialized and in sched_ext @@ -116,10 +117,11 @@ enum scx_ent_flags { SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT, SCX_TASK_NONE = 0 << SCX_TASK_STATE_SHIFT, - SCX_TASK_INIT = 1 << SCX_TASK_STATE_SHIFT, - SCX_TASK_READY = 2 << SCX_TASK_STATE_SHIFT, - SCX_TASK_ENABLED = 3 << SCX_TASK_STATE_SHIFT, - SCX_TASK_DEAD = 4 << SCX_TASK_STATE_SHIFT, + SCX_TASK_INIT_BEGIN = 1 << SCX_TASK_STATE_SHIFT, + SCX_TASK_INIT = 2 << SCX_TASK_STATE_SHIFT, + SCX_TASK_READY = 3 << SCX_TASK_STATE_SHIFT, + SCX_TASK_ENABLED = 4 << SCX_TASK_STATE_SHIFT, + SCX_TASK_DEAD = 5 << SCX_TASK_STATE_SHIFT, /* * Bits 12 and 13 are used to carry reenqueue reason. In addition to -- cgit v1.2.3 From 657b594b2084b39a4bc6d8493aa2140cb00cea49 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 7 May 2026 16:46:29 +0900 Subject: fprobe: Fix unregister_fprobe() to wait for RCU grace period Commit 4346ba1604093 ("fprobe: Rewrite fprobe on function-graph tracer") changed fprobe to register struct fprobe to an rcu-hlist, but it forgot to wait for RCU GP. Thus there can be use-after-free if the fprobe is released right after unregistering. This can be happened on fprobe event and sample module code. To fix this issue, add synchronize_rcu() in unregister_fprobe(). Note that BPF is OK because fprobe is used as a part of bpf_kprobe_multi_link. This unregisters its fprobe in bpf_kprobe_multi_link_release() and it is deallocated via bpf_kprobe_multi_link_dealloc(), which is invoked from bpf_link_defer_dealloc_rcu_gp() RCU callback. For BPF, this also introduced unregister_fprobe_async() which does NOT wait for RCU grace priod. Link: https://lore.kernel.org/all/177813998919.256460.2809243930741138224.stgit@mhiramat.tok.corp.google.com/ Fixes: 4346ba1604093 ("fprobe: Rewrite fprobe on function-graph tracer") Signed-off-by: Masami Hiramatsu (Google) --- include/linux/fprobe.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h index 0a3bcd1718f3..be1b38c981d4 100644 --- a/include/linux/fprobe.h +++ b/include/linux/fprobe.h @@ -94,6 +94,7 @@ int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num); int register_fprobe_syms(struct fprobe *fp, const char **syms, int num); int unregister_fprobe(struct fprobe *fp); +int unregister_fprobe_async(struct fprobe *fp); bool fprobe_is_registered(struct fprobe *fp); int fprobe_count_ips_from_filter(const char *filter, const char *notfilter); #else @@ -113,6 +114,10 @@ static inline int unregister_fprobe(struct fprobe *fp) { return -EOPNOTSUPP; } +static inline int unregister_fprobe_async(struct fprobe *fp) +{ + return -EOPNOTSUPP; +} static inline bool fprobe_is_registered(struct fprobe *fp) { return false; -- cgit v1.2.3 From dec85d2fbd20de3711a71e65397dfdb40c3fa953 Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Wed, 6 May 2026 09:37:02 +0000 Subject: irqchip/gic-v5: Move LPI allocation into the LPI domain The IPI and ITS MSI domains currently allocate and release LPIs directly, then pass the selected LPI ID to the parent LPI domain. This leaks the LPI domain's allocation policy into its child domains and forces each child to duplicate part of the parent domain's teardown. Make the LPI domain allocate LPIs in its .alloc() callback and release them in a matching .free() callback. Child domains can then request a parent interrupt without passing an implementation-specific LPI ID, and the LPI lifetime is tied to the domain that owns the LPI namespace. Remove the gicv5_alloc_lpi() and gicv5_free_lpi() wrappers now that no external caller needs to manage LPIs directly. This is a preparatory change for an actual leakage problem in the allocation code and therefore tagged with the same Fixes tag. Fixes: 0f0101325876 ("irqchip/gic-v5: Add GICv5 LPI/IPI support") Signed-off-by: Sascha Bischoff Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Reviewed-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260506093634.382062-2-sascha.bischoff@arm.com --- include/linux/irqchip/arm-gic-v5.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h index 40d2fce68294..f78787e654f4 100644 --- a/include/linux/irqchip/arm-gic-v5.h +++ b/include/linux/irqchip/arm-gic-v5.h @@ -425,9 +425,6 @@ struct gicv5_its_itt_cfg { void gicv5_init_lpis(u32 max); void gicv5_deinit_lpis(void); -int gicv5_alloc_lpi(void); -void gicv5_free_lpi(u32 lpi); - void __init gicv5_its_of_probe(struct device_node *parent); void __init gicv5_its_acpi_probe(void); #endif -- cgit v1.2.3 From 5dd74441cbf42c22e874450eb6a6bbb19390a216 Mon Sep 17 00:00:00 2001 From: Guopeng Zhang Date: Sat, 9 May 2026 18:20:31 +0800 Subject: cgroup/cpuset: Reserve DL bandwidth only for root-domain moves cpuset_can_attach() currently adds the bandwidth of all migrating SCHED_DEADLINE tasks to sum_migrate_dl_bw. If the source and destination cpuset effective CPU masks do not overlap, the whole sum is then reserved in the destination root domain. set_cpus_allowed_dl(), however, subtracts bandwidth from the source root domain only when the affinity change really moves the task between root domains. A DL task can move between cpusets that are still in the same root domain, so including that task in sum_migrate_dl_bw can reserve destination bandwidth without a matching source-side subtraction. Share the root-domain move test with set_cpus_allowed_dl(). Keep nr_migrate_dl_tasks counting all migrating deadline tasks for cpuset DL task accounting, but add to sum_migrate_dl_bw only for tasks that need a root-domain bandwidth move. Keep using the destination cpuset effective CPU mask and leave the broader can_attach()/attach() transaction model unchanged. Fixes: 2ef269ef1ac0 ("cgroup/cpuset: Free DL BW in case can_attach() fails") Cc: stable@vger.kernel.org # v6.10+ Signed-off-by: Guopeng Zhang Reviewed-by: Waiman Long Acked-by: Juri Lelli Tested-by: Juri Lelli Signed-off-by: Tejun Heo --- include/linux/sched/deadline.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index 1198138cb839..273538200a44 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -33,6 +33,15 @@ struct root_domain; extern void dl_add_task_root_domain(struct task_struct *p); extern void dl_clear_root_domain(struct root_domain *rd); extern void dl_clear_root_domain_cpu(int cpu); +/* + * Return whether moving DL task @p to @new_mask requires moving DL + * bandwidth accounting between root domains. This helper is specific to + * DL bandwidth move accounting semantics and is shared by + * cpuset_can_attach() and set_cpus_allowed_dl() so both paths use the + * same source root-domain test. + */ +extern bool dl_task_needs_bw_move(struct task_struct *p, + const struct cpumask *new_mask); extern u64 dl_cookie; extern bool dl_bw_visited(int cpu, u64 cookie); -- cgit v1.2.3 From 2c85c61d1332e1e16f020d76951baf167dcb6f7a Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Mon, 4 May 2026 10:47:22 +0200 Subject: HID: pass the buffer size to hid_report_raw_event commit 0a3fe972a7cb ("HID: core: Mitigate potential OOB by removing bogus memset()") enforced the provided data to be at least the size of the declared buffer in the report descriptor to prevent a buffer overflow. However, we can try to be smarter by providing both the buffer size and the data size, meaning that hid_report_raw_event() can make better decision whether we should plaining reject the buffer (buffer overflow attempt) or if we can safely memset it to 0 and pass it to the rest of the stack. Fixes: 0a3fe972a7cb ("HID: core: Mitigate potential OOB by removing bogus memset()") Cc: stable@vger.kernel.org Signed-off-by: Benjamin Tissoires Acked-by: Johan Hovold Reviewed-by: Greg Kroah-Hartman Signed-off-by: Jiri Kosina --- include/linux/hid.h | 4 ++-- include/linux/hid_bpf.h | 14 +++++++++----- 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index 442a80d79e89..ac432a2ef415 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1298,8 +1298,8 @@ static inline u32 hid_report_len(struct hid_report *report) return DIV_ROUND_UP(report->size, 8) + (report->id > 0); } -int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, - int interrupt); +int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 *data, + size_t bufsize, u32 size, int interrupt); /* HID quirks API */ unsigned long hid_lookup_quirk(const struct hid_device *hdev); diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h index a2e47dbcf82c..19fffa4574a4 100644 --- a/include/linux/hid_bpf.h +++ b/include/linux/hid_bpf.h @@ -72,8 +72,8 @@ struct hid_ops { int (*hid_hw_output_report)(struct hid_device *hdev, __u8 *buf, size_t len, u64 source, bool from_bpf); int (*hid_input_report)(struct hid_device *hid, enum hid_report_type type, - u8 *data, u32 size, int interrupt, u64 source, bool from_bpf, - bool lock_already_taken); + u8 *data, size_t bufsize, u32 size, int interrupt, u64 source, + bool from_bpf, bool lock_already_taken); struct module *owner; const struct bus_type *bus_type; }; @@ -200,7 +200,8 @@ struct hid_bpf { #ifdef CONFIG_HID_BPF u8 *dispatch_hid_bpf_device_event(struct hid_device *hid, enum hid_report_type type, u8 *data, - u32 *size, int interrupt, u64 source, bool from_bpf); + size_t *buf_size, u32 *size, int interrupt, u64 source, + bool from_bpf); int dispatch_hid_bpf_raw_requests(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, u32 size, enum hid_report_type rtype, @@ -215,8 +216,11 @@ int hid_bpf_device_init(struct hid_device *hid); const u8 *call_hid_bpf_rdesc_fixup(struct hid_device *hdev, const u8 *rdesc, unsigned int *size); #else /* CONFIG_HID_BPF */ static inline u8 *dispatch_hid_bpf_device_event(struct hid_device *hid, enum hid_report_type type, - u8 *data, u32 *size, int interrupt, - u64 source, bool from_bpf) { return data; } + u8 *data, size_t *buf_size, u32 *size, + int interrupt, u64 source, bool from_bpf) +{ + return data; +} static inline int dispatch_hid_bpf_raw_requests(struct hid_device *hdev, unsigned char reportnum, u8 *buf, u32 size, enum hid_report_type rtype, -- cgit v1.2.3 From 206342541fc887ae919774a43942dc883161fece Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Mon, 4 May 2026 10:47:23 +0200 Subject: HID: core: introduce hid_safe_input_report() hid_input_report() is used in too many places to have a commit that doesn't cross subsystem borders. Instead of changing the API, introduce a new one when things matters in the transport layers: - usbhid - i2chid This effectively revert to the old behavior for those two transport layers. Fixes: 0a3fe972a7cb ("HID: core: Mitigate potential OOB by removing bogus memset()") Cc: stable@vger.kernel.org Signed-off-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- include/linux/hid.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index ac432a2ef415..bfb9859f391e 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1030,6 +1030,8 @@ struct hid_field *hid_find_field(struct hid_device *hdev, unsigned int report_ty int hid_set_field(struct hid_field *, unsigned, __s32); int hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, int interrupt); +int hid_safe_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, + size_t bufsize, u32 size, int interrupt); struct hid_field *hidinput_get_led_field(struct hid_device *hid); unsigned int hidinput_count_leds(struct hid_device *hid); __s32 hidinput_calc_abs_res(const struct hid_field *field, __u16 code); -- cgit v1.2.3 From 32d5019ed3b6ff4439cb075fb275f655c8a2059c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 May 2026 07:01:47 +0200 Subject: block: pass a minsize argument to bio_iov_iter_bounce When bouncing for block size > PAGE_SIZE file systems that require file system block size alignment (e.g. zoned XFS), the bio needs to be big enough to fit an entire block. Fixes: 8dd5e7c75d7b ("block: add helpers to bounce buffer an iov_iter into bios") Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20260507050153.1298375-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 97d747320b35..dc17780d6c1e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -475,7 +475,8 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); -int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen); +int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen, + size_t minsize); void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty); extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, -- cgit v1.2.3 From df733ddc263dbe5f471e7c80c8b669532f56bf76 Mon Sep 17 00:00:00 2001 From: Matt Evans Date: Mon, 11 May 2026 07:46:42 -0700 Subject: vfio/pci: Make VFIO_PCI_OFFSET_TO_INDEX() return unsigned VFIO_PCI_OFFSET_TO_INDEX() is used in several places with a signed parameter (e.g. loff_t). Because it makes no sense for a BAR/resource index to be negative, enforce this in the macro. This fixes at least one current issue, where vfio_pci_ioeventfd() uses this macro with an unvalidated signed loff_t returned into a signed type, leading to a possible negative array access. This instance does test against an out-of-bounds positive value, so treating the index as unsigned fixes this issue. Fixes: 89e1f7d4c66d8 ("vfio: Add PCI device driver") Signed-off-by: Matt Evans Link: https://lore.kernel.org/r/20260511144642.2926799-1-mattev@meta.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 2ebba746c18f..89165b769e5c 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -21,7 +21,7 @@ #define VFIO_PCI_CORE_H #define VFIO_PCI_OFFSET_SHIFT 40 -#define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) +#define VFIO_PCI_OFFSET_TO_INDEX(off) ((u64)(off) >> VFIO_PCI_OFFSET_SHIFT) #define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) #define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) -- cgit v1.2.3 From 31e62c2ebbfdc3fe3dbdf5e02c92a9dc67087a3a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 13 May 2026 11:37:18 -0700 Subject: ptrace: slightly saner 'get_dumpable()' logic The 'dumpability' of a task is fundamentally about the memory image of the task - the concept comes from whether it can core dump or not - and makes no sense when you don't have an associated mm. And almost all users do in fact use it only for the case where the task has a mm pointer. But we have one odd special case: ptrace_may_access() uses 'dumpable' to check various other things entirely independently of the MM (typically explicitly using flags like PTRACE_MODE_READ_FSCREDS). Including for threads that no longer have a VM (and maybe never did, like most kernel threads). It's not what this flag was designed for, but it is what it is. The ptrace code does check that the uid/gid matches, so you do have to be uid-0 to see kernel thread details, but this means that the traditional "drop capabilities" model doesn't make any difference for this all. Make it all make a *bit* more sense by saying that if you don't have a MM pointer, we'll use a cached "last dumpability" flag if the thread ever had a MM (it will be zero for kernel threads since it is never set), and require a proper CAP_SYS_PTRACE capability to override. Reported-by: Qualys Security Advisory Cc: Oleg Nesterov Cc: Kees Cook Signed-off-by: Linus Torvalds --- include/linux/sched.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 368c7b4d7cb5..ee06cba5c6f5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1002,6 +1002,9 @@ struct task_struct { unsigned sched_rt_mutex:1; #endif + /* Save user-dumpable when mm goes away */ + unsigned user_dumpable:1; + /* Bit to tell TOMOYO we're in execve(): */ unsigned in_execve:1; unsigned in_iowait:1; -- cgit v1.2.3