From da142f3d373a6ddaca0119615a8db2175ddc4121 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 15:26:55 -0800 Subject: KVM: Remove subtle "struct kvm_stats_desc" pseudo-overlay Remove KVM's internal pseudo-overlay of kvm_stats_desc, which subtly aliases the flexible name[] in the uAPI definition with a fixed-size array of the same name. The unusual embedded structure results in compiler warnings due to -Wflex-array-member-not-at-end, and also necessitates an extra level of dereferencing in KVM. To avoid the "overlay", define the uAPI structure to have a fixed-size name when building for the kernel. Opportunistically clean up the indentation for the stats macros, and replace spaces with tabs. No functional change intended. Reported-by: Gustavo A. R. Silva Closes: https://lore.kernel.org/all/aPfNKRpLfhmhYqfP@kspp Acked-by: Marc Zyngier Acked-by: Christian Borntraeger [..] Acked-by: Anup Patel Reviewed-by: Bibo Mao Acked-by: Gustavo A. R. Silva Link: https://patch.msgid.link/20251205232655.445294-1-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 83 ++++++++++++++++++++---------------------------- 1 file changed, 35 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d93f75b05ae2..7428d9949382 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1927,56 +1927,43 @@ enum kvm_stat_kind { struct kvm_stat_data { struct kvm *kvm; - const struct _kvm_stats_desc *desc; + const struct kvm_stats_desc *desc; enum kvm_stat_kind kind; }; -struct _kvm_stats_desc { - struct kvm_stats_desc desc; - char name[KVM_STATS_NAME_SIZE]; -}; - -#define STATS_DESC_COMMON(type, unit, base, exp, sz, bsz) \ - .flags = type | unit | base | \ - BUILD_BUG_ON_ZERO(type & ~KVM_STATS_TYPE_MASK) | \ - BUILD_BUG_ON_ZERO(unit & ~KVM_STATS_UNIT_MASK) | \ - BUILD_BUG_ON_ZERO(base & ~KVM_STATS_BASE_MASK), \ - .exponent = exp, \ - .size = sz, \ +#define STATS_DESC_COMMON(type, unit, base, exp, sz, bsz) \ + .flags = type | unit | base | \ + BUILD_BUG_ON_ZERO(type & ~KVM_STATS_TYPE_MASK) | \ + BUILD_BUG_ON_ZERO(unit & ~KVM_STATS_UNIT_MASK) | \ + BUILD_BUG_ON_ZERO(base & ~KVM_STATS_BASE_MASK), \ + .exponent = exp, \ + .size = sz, \ .bucket_size = bsz -#define VM_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ - { \ - { \ - STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ - .offset = offsetof(struct kvm_vm_stat, generic.stat) \ - }, \ - .name = #stat, \ - } -#define VCPU_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ - { \ - { \ - STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ - .offset = offsetof(struct kvm_vcpu_stat, generic.stat) \ - }, \ - .name = #stat, \ - } -#define VM_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ - { \ - { \ - STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ - .offset = offsetof(struct kvm_vm_stat, stat) \ - }, \ - .name = #stat, \ - } -#define VCPU_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ - { \ - { \ - STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ - .offset = offsetof(struct kvm_vcpu_stat, stat) \ - }, \ - .name = #stat, \ - } +#define VM_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ +{ \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ + .offset = offsetof(struct kvm_vm_stat, generic.stat), \ + .name = #stat, \ +} +#define VCPU_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ +{ \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ + .offset = offsetof(struct kvm_vcpu_stat, generic.stat), \ + .name = #stat, \ +} +#define VM_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ +{ \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ + .offset = offsetof(struct kvm_vm_stat, stat), \ + .name = #stat, \ +} +#define VCPU_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ +{ \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ + .offset = offsetof(struct kvm_vcpu_stat, stat), \ + .name = #stat, \ +} /* SCOPE: VM, VM_GENERIC, VCPU, VCPU_GENERIC */ #define STATS_DESC(SCOPE, stat, type, unit, base, exp, sz, bsz) \ SCOPE##_STATS_DESC(stat, type, unit, base, exp, sz, bsz) @@ -2053,7 +2040,7 @@ struct _kvm_stats_desc { STATS_DESC_IBOOLEAN(VCPU_GENERIC, blocking) ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header, - const struct _kvm_stats_desc *desc, + const struct kvm_stats_desc *desc, void *stats, size_t size_stats, char __user *user_buffer, size_t size, loff_t *offset); @@ -2098,9 +2085,9 @@ static inline void kvm_stats_log_hist_update(u64 *data, size_t size, u64 value) extern const struct kvm_stats_header kvm_vm_stats_header; -extern const struct _kvm_stats_desc kvm_vm_stats_desc[]; +extern const struct kvm_stats_desc kvm_vm_stats_desc[]; extern const struct kvm_stats_header kvm_vcpu_stats_header; -extern const struct _kvm_stats_desc kvm_vcpu_stats_desc[]; +extern const struct kvm_stats_desc kvm_vcpu_stats_desc[]; #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER static inline int mmu_invalidate_retry(struct kvm *kvm, unsigned long mmu_seq) -- cgit v1.2.3 From b777b5e09eabeefc6ba80f4296366a4742701103 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 10 Feb 2026 17:02:25 +0000 Subject: time/jiffies: Inline jiffies_to_msecs() and jiffies_to_usecs() For common cases (HZ=100, 250 or 1000), these helpers are at most one multiply, so there is no point calling a tiny function. Keep them out of line for HZ=300 and others. This saves cycles in TCP fast path, among other things. $ scripts/bloat-o-meter -t vmlinux.old vmlinux.new add/remove: 0/8 grow/shrink: 25/89 up/down: 530/-3474 (-2944) ... nla_put_msecs 193 - -193 message_stats_print 2131 920 -1211 Total: Before=25365208, After=25362264, chg -0.01% Signed-off-by: Eric Dumazet Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260210170226.57209-1-edumazet@google.com --- include/linux/jiffies.h | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index fdef2c155c27..d1c3d4941854 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -434,8 +434,44 @@ extern unsigned long preset_lpj; /* * Convert various time units to each other: */ -extern unsigned int jiffies_to_msecs(const unsigned long j); -extern unsigned int jiffies_to_usecs(const unsigned long j); + +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) +/** + * jiffies_to_msecs - Convert jiffies to milliseconds + * @j: jiffies value + * + * This inline version takes care of HZ in {100,250,1000}. + * + * Return: milliseconds value + */ +static inline unsigned int jiffies_to_msecs(const unsigned long j) +{ + return (MSEC_PER_SEC / HZ) * j; +} +#else +unsigned int jiffies_to_msecs(const unsigned long j); +#endif + +#if !(USEC_PER_SEC % HZ) +/** + * jiffies_to_usecs - Convert jiffies to microseconds + * @j: jiffies value + * + * Return: microseconds value + */ +static inline unsigned int jiffies_to_usecs(const unsigned long j) +{ + /* + * Hz usually doesn't go much further MSEC_PER_SEC. + * jiffies_to_usecs() and usecs_to_jiffies() depend on that. + */ + BUILD_BUG_ON(HZ > USEC_PER_SEC); + + return (USEC_PER_SEC / HZ) * j; +} +#else +unsigned int jiffies_to_usecs(const unsigned long j); +#endif /** * jiffies_to_nsecs - Convert jiffies to nanoseconds -- cgit v1.2.3 From ce9e40a9a5e5cff0b1b0d2fa582b3d71a8ce68e8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 6 Feb 2026 15:48:16 +0000 Subject: irqchip/gic-v3-its: Limit number of per-device MSIs to the range the ITS supports The ITS driver blindly assumes that EventIDs are in abundant supply, to the point where it never checks how many the hardware actually supports. It turns out that some pretty esoteric integrations make it so that only a few bits are available, all the way down to a single bit. Enforce the advertised limitation at the point of allocating the device structure, and hope that the endpoint driver can deal with such limitation. Fixes: 84a6a2e7fc18d ("irqchip: GICv3: ITS: device allocation and configuration") Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Reviewed-by: Robin Murphy Reviewed-by: Zenghui Yu Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260206154816.3582887-1-maz@kernel.org --- include/linux/irqchip/arm-gic-v3.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 70c0948f978e..0225121f3013 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -394,6 +394,7 @@ #define GITS_TYPER_VLPIS (1UL << 1) #define GITS_TYPER_ITT_ENTRY_SIZE_SHIFT 4 #define GITS_TYPER_ITT_ENTRY_SIZE GENMASK_ULL(7, 4) +#define GITS_TYPER_IDBITS GENMASK_ULL(12, 8) #define GITS_TYPER_IDBITS_SHIFT 8 #define GITS_TYPER_DEVBITS_SHIFT 13 #define GITS_TYPER_DEVBITS GENMASK_ULL(17, 13) -- cgit v1.2.3 From 249013e673fce3506c61063c7cbedd75b4c668d8 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 18 Feb 2026 22:09:21 -0800 Subject: fsnotify: drop unused helper Remove this helper now that all users have been converted to fserror_report_metadata as of 7.0-rc1. Cc: jack@suse.cz Cc: amir73il@gmail.com Signed-off-by: Darrick J. Wong Link: https://patch.msgid.link/177148129543.716249.980530449513340111.stgit@frogsfrogsfrogs Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- include/linux/fsnotify.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 28a9cb13fbfa..079c18bcdbde 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -495,19 +495,6 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid) fsnotify_dentry(dentry, mask); } -static inline int fsnotify_sb_error(struct super_block *sb, struct inode *inode, - int error) -{ - struct fs_error_report report = { - .error = error, - .inode = inode, - .sb = sb, - }; - - return fsnotify(FS_ERROR, &report, FSNOTIFY_EVENT_ERROR, - NULL, NULL, NULL, 0); -} - static inline void fsnotify_mnt_attach(struct mnt_namespace *ns, struct vfsmount *mnt) { fsnotify_mnt(FS_MNT_ATTACH, ns, mnt); -- cgit v1.2.3 From 6b3e458806e34f1142592f786d3eb0ebac209cc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Noack?= Date: Thu, 19 Feb 2026 16:43:35 +0100 Subject: HID: Document memory allocation properties of report_fixup() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The memory pointer returned by the report_fixup() hook does not get freed by the caller. Instead, report_fixup() must return (in return value and *rsize) a memory buffer with at least the same lifetime as the input buffer (defined by rdesc and original *rsize). This is usually achieved using one of the following techniques: * Returning a pointer and size to a sub-portion of the input buffer * Returning a pointer to a static buffer * Allocating a buffer with a devm_*() function, which will automatically get freed when the device is removed. Signed-off-by: Günther Noack Signed-off-by: Benjamin Tissoires --- include/linux/hid.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index dce862cafbbd..2990b9f94cb5 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -836,6 +836,12 @@ struct hid_usage_id { * raw_event and event should return negative on error, any other value will * pass the event on to .event() typically return 0 for success. * + * report_fixup must return a report descriptor pointer whose lifetime is at + * least that of the input rdesc. This is usually done by mutating the input + * rdesc and returning it or a sub-portion of it. In case a new buffer is + * allocated and returned, the implementation of report_fixup is responsible for + * freeing it later. + * * input_mapping shall return a negative value to completely ignore this usage * (e.g. doubled or invalid usage), zero to continue with parsing of this * usage by generic code (no special handling needed) or positive to skip -- cgit v1.2.3 From 6e3c0a4e1ad1e0455b7880fad02b3ee179f56c09 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 Apr 2025 12:16:28 +0200 Subject: sched/fair: Fix lag clamp Vincent reported that he was seeing undue lag clamping in a mixed slice workload. Implement the max_slice tracking as per the todo comment. Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy") Reported-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Tested-by: Vincent Guittot Tested-by: K Prateek Nayak Tested-by: Shubhang Kaushik Link: https://patch.msgid.link/20250422101628.GA33555@noisy.programming.kicks-ass.net --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 074ad4ef3d81..a7b4a980eb2f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -579,6 +579,7 @@ struct sched_entity { u64 deadline; u64 min_vruntime; u64 min_slice; + u64 max_slice; struct list_head group_node; unsigned char on_rq; -- cgit v1.2.3 From 4c652a47722f69c6f2685f05b17490ea97f643a8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 6 Feb 2026 08:41:13 +0100 Subject: rseq: Mark rseq_arm_slice_extension_timer() __always_inline objtool warns about this function being called inside of a uaccess section: kernel/entry/common.o: warning: objtool: irqentry_exit+0x1dc: call to rseq_arm_slice_extension_timer() with UACCESS enabled Interestingly, this happens with CONFIG_RSEQ_SLICE_EXTENSION disabled, so this is an empty function, as the normal implementation is already marked __always_inline. I could reproduce this multiple times with gcc-11 but not with gcc-15, so the compiler probably got better at identifying the trivial function. Mark all the empty helpers for !RSEQ_SLICE_EXTENSION as __always_inline for consistency, avoiding this warning. Fixes: 0ac3b5c3dc45 ("rseq: Implement time slice extension enforcement timer") Signed-off-by: Arnd Bergmann Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260206074122.709580-1-arnd@kernel.org --- include/linux/rseq_entry.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index cbc4a791618b..c6831c93cd6e 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -216,10 +216,10 @@ efault: } #else /* CONFIG_RSEQ_SLICE_EXTENSION */ -static inline bool rseq_slice_extension_enabled(void) { return false; } -static inline bool rseq_arm_slice_extension_timer(void) { return false; } -static inline void rseq_slice_clear_grant(struct task_struct *t) { } -static inline bool rseq_grant_slice_extension(bool work_pending) { return false; } +static __always_inline bool rseq_slice_extension_enabled(void) { return false; } +static __always_inline bool rseq_arm_slice_extension_timer(void) { return false; } +static __always_inline void rseq_slice_clear_grant(struct task_struct *t) { } +static __always_inline bool rseq_grant_slice_extension(bool work_pending) { return false; } #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */ bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); -- cgit v1.2.3 From 3b68df978133ac3d46d570af065a73debbb68248 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 20 Feb 2026 15:06:41 -0500 Subject: rseq: slice ext: Ensure rseq feature size differs from original rseq size Before rseq became extensible, its original size was 32 bytes even though the active rseq area was only 20 bytes. This had the following impact in terms of userspace ecosystem evolution: * The GNU libc between 2.35 and 2.39 expose a __rseq_size symbol set to 32, even though the size of the active rseq area is really 20. * The GNU libc 2.40 changes this __rseq_size to 20, thus making it express the active rseq area. * Starting from glibc 2.41, __rseq_size corresponds to the AT_RSEQ_FEATURE_SIZE from getauxval(3). This means that users of __rseq_size can always expect it to correspond to the active rseq area, except for the value 32, for which the active rseq area is 20 bytes. Exposing a 32 bytes feature size would make life needlessly painful for userspace. Therefore, add a reserved field at the end of the rseq area to bump the feature size to 33 bytes. This reserved field is expected to be replaced with whatever field will come next, expecting that this field will be larger than 1 byte. The effect of this change is to increase the size from 32 to 64 bytes before we actually have fields using that memory. Clarify the allocation size and alignment requirements in the struct rseq uapi comment. Change the value returned by getauxval(AT_RSEQ_ALIGN) to return the value of the active rseq area size rounded up to next power of 2, which guarantees that the rseq structure will always be aligned on the nearest power of two large enough to contain it, even as it grows. Change the alignment check in the rseq registration accordingly. This will minimize the amount of ABI corner-cases we need to document and require userspace to play games with. The rule stays simple when __rseq_size != 32: #define rseq_field_available(field) (__rseq_size >= offsetofend(struct rseq_abi, field)) Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260220200642.1317826-3-mathieu.desnoyers@efficios.com --- include/linux/rseq.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 7a01a0760405..b9d62fc2140d 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -146,6 +146,18 @@ static inline void rseq_fork(struct task_struct *t, u64 clone_flags) t->rseq = current->rseq; } +/* + * Value returned by getauxval(AT_RSEQ_ALIGN) and expected by rseq + * registration. This is the active rseq area size rounded up to next + * power of 2, which guarantees that the rseq structure will always be + * aligned on the nearest power of two large enough to contain it, even + * as it grows. + */ +static inline unsigned int rseq_alloc_align(void) +{ + return 1U << get_count_order(offsetof(struct rseq, end)); +} + #else /* CONFIG_RSEQ */ static inline void rseq_handle_slowpath(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } -- cgit v1.2.3 From 901084c51a0a8fb42a3f37d2e9c62083c495f824 Mon Sep 17 00:00:00 2001 From: Penghe Geng Date: Thu, 19 Feb 2026 15:29:54 -0500 Subject: mmc: core: Avoid bitfield RMW for claim/retune flags Move claimed and retune control flags out of the bitfield word to avoid unrelated RMW side effects in asynchronous contexts. The host->claimed bit shared a word with retune flags. Writes to claimed in __mmc_claim_host() or retune_now in mmc_mq_queue_rq() can overwrite other bits when concurrent updates happen in other contexts, triggering spurious WARN_ON(!host->claimed). Convert claimed, can_retune, retune_now and retune_paused to bool to remove shared-word coupling. Fixes: 6c0cedd1ef952 ("mmc: core: Introduce host claiming by context") Fixes: 1e8e55b67030c ("mmc: block: Add CQE support") Cc: stable@vger.kernel.org Suggested-by: Adrian Hunter Signed-off-by: Penghe Geng Acked-by: Adrian Hunter Signed-off-by: Ulf Hansson --- include/linux/mmc/host.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index e0e2c265e5d1..ba84f02c2a10 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -486,14 +486,12 @@ struct mmc_host { struct mmc_ios ios; /* current io bus settings */ + bool claimed; /* host exclusively claimed */ + /* group bitfields together to minimize padding */ unsigned int use_spi_crc:1; - unsigned int claimed:1; /* host exclusively claimed */ unsigned int doing_init_tune:1; /* initial tuning in progress */ - unsigned int can_retune:1; /* re-tuning can be used */ unsigned int doing_retune:1; /* re-tuning in progress */ - unsigned int retune_now:1; /* do re-tuning at next req */ - unsigned int retune_paused:1; /* re-tuning is temporarily disabled */ unsigned int retune_crc_disable:1; /* don't trigger retune upon crc */ unsigned int can_dma_map_merge:1; /* merging can be used */ unsigned int vqmmc_enabled:1; /* vqmmc regulator is enabled */ @@ -508,6 +506,9 @@ struct mmc_host { int rescan_disable; /* disable card detection */ int rescan_entered; /* used with nonremovable devices */ + bool can_retune; /* re-tuning can be used */ + bool retune_now; /* do re-tuning at next req */ + bool retune_paused; /* re-tuning is temporarily disabled */ int need_retune; /* re-tuning is needed */ int hold_retune; /* hold off re-tuning */ unsigned int retune_period; /* re-tuning period in secs */ -- cgit v1.2.3 From 3afd8df024339c7da1a5a0302f3987866dd16e40 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 22 Dec 2025 21:36:25 +0100 Subject: PM: runtime: Change pm_runtime_put() return type to void The primary role of pm_runtime_put() is to decrement the runtime PM usage counter of the given device. It always does that regardless of the value returned by it later. In addition, if the runtime PM usage counter after decrementation turns out to be zero, a work item is queued up to check whether or not the device can be suspended. This is not guaranteed to succeed though and even if it is successful, the device may still not be suspended going forward. There are multiple valid reasons why pm_runtime_put() may not decide to queue up the work item mentioned above, including, but not limited to, the case when user space has written "on" to the device's runtime PM "control" file in sysfs. In all of those cases, pm_runtime_put() returns a negative error code (even though the device's runtime PM usage counter has been successfully decremented by it) which is very confusing. In fact, its return value should only be used for debug purposes and care should be taken when doing it even in that case. Accordingly, to avoid the confusion mentioned above, change the return type of pm_runtime_put() to void. Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson Reviewed-by: Brian Norris Link: https://patch.msgid.link/14387202.RDIVbhacDa@rafael.j.wysocki --- include/linux/pm_runtime.h | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 41037c513f06..64921b10ac74 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -545,22 +545,10 @@ static inline int pm_runtime_resume_and_get(struct device *dev) * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, queue up a work item for @dev like in pm_request_idle(). - * - * Return: - * * 1: Success. Usage counter dropped to zero, but device was already suspended. - * * 0: Success. - * * -EINVAL: Runtime PM error. - * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status - * change ongoing. - * * -EBUSY: Runtime PM child_count non-zero. - * * -EPERM: Device PM QoS resume latency 0. - * * -EINPROGRESS: Suspend already in progress. - * * -ENOSYS: CONFIG_PM not enabled. */ -static inline int pm_runtime_put(struct device *dev) +static inline void pm_runtime_put(struct device *dev) { - return __pm_runtime_idle(dev, RPM_GET_PUT | RPM_ASYNC); + __pm_runtime_idle(dev, RPM_GET_PUT | RPM_ASYNC); } /** -- cgit v1.2.3 From 2a7b7652b1bb3fadc3bd47d622bfb127a93ab6b0 Mon Sep 17 00:00:00 2001 From: Leif Skunberg Date: Tue, 10 Feb 2026 14:21:29 +0100 Subject: platform/x86: int3472: Handle GPIO type 0x10 (DOVDD) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Lenovo ThinkPad X1 Fold 16 Gen 1 has an OV5675 sensor (ACPI HID OVTI5675) behind an INT3472 discrete PMIC controller. The INT3472 _DSM returns GPIO type 0x10 for one of the pins, which controls the DOVDD (digital I/O power) regulator enable. Type 0x10 is not currently handled by the driver, causing the GPIO to be ignored with a warning. Add INT3472_GPIO_TYPE_DOVDD (0x10) and handle it as a regulator with con_id "dovdd" to match the supply name used by sensor drivers (e.g. ov5675). Also increase GPIO_SUPPLY_NAME_LENGTH from 5 to 6 to accommodate the "dovdd" name (5 chars + null terminator). Signed-off-by: Leif Skunberg Reviewed-by: Hans de Goede Link: https://patch.msgid.link/20260210132129.17943-1-diamondback@cohunt.app Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/x86/int3472.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/int3472.h b/include/linux/platform_data/x86/int3472.h index b1b837583d54..dbe745dc88d5 100644 --- a/include/linux/platform_data/x86/int3472.h +++ b/include/linux/platform_data/x86/int3472.h @@ -26,6 +26,7 @@ #define INT3472_GPIO_TYPE_POWER_ENABLE 0x0b #define INT3472_GPIO_TYPE_CLK_ENABLE 0x0c #define INT3472_GPIO_TYPE_PRIVACY_LED 0x0d +#define INT3472_GPIO_TYPE_DOVDD 0x10 #define INT3472_GPIO_TYPE_HANDSHAKE 0x12 #define INT3472_GPIO_TYPE_HOTPLUG_DETECT 0x13 @@ -33,8 +34,8 @@ #define INT3472_MAX_SENSOR_GPIOS 3 #define INT3472_MAX_REGULATORS 3 -/* E.g. "avdd\0" */ -#define GPIO_SUPPLY_NAME_LENGTH 5 +/* E.g. "dovdd\0" */ +#define GPIO_SUPPLY_NAME_LENGTH 6 /* 12 chars for acpi_dev_name() + "-", e.g. "ABCD1234:00-" */ #define GPIO_REGULATOR_NAME_LENGTH (12 + GPIO_SUPPLY_NAME_LENGTH) /* lower- and upper-case mapping */ -- cgit v1.2.3 From 551d44200152cb26f75d2ef990aeb6185b7e37fd Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 23 Feb 2026 09:33:08 -0800 Subject: default_gfp(): avoid using the "newfangled" __VA_OPT__ trick The default_gfp() helper that I added is not wrong, but it turns out that it causes unnecessary headaches for 'sparse' which doesn't support the use of __VA_OPT__ (introduced in C++20 and C23, and supported by gcc and clang for a long time). We do already use __VA_OPT__ in some other cases in the kernel (drm/xe and btrfs), but it has been fairly limited. Now it triggers for pretty much everything, and sparse ends up not working at all. We can use the traditional gcc ',##__VA_ARGS__' syntax instead: it may not be the "C standard" way and is slightly less natural in this context, but it is the traditional model for this and avoids the sparse problem. Reported-and-tested-by: Ricardo Ribalda Reported-and-tested-by: Richard Fitzgerald Reported-by: Ben Dooks Fixes: e19e1b480ac7 ("add default_gfp() helper macro and use it in the new *alloc_obj() helpers") Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 2b30a0529d48..90536b2bc42e 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -14,8 +14,8 @@ struct vm_area_struct; struct mempolicy; /* Helper macro to avoid gfp flags if they are the default one */ -#define __default_gfp(a,...) a -#define default_gfp(...) __default_gfp(__VA_ARGS__ __VA_OPT__(,) GFP_KERNEL) +#define __default_gfp(a,b,...) b +#define default_gfp(...) __default_gfp(,##__VA_ARGS__,GFP_KERNEL) /* Convert GFP flags to their corresponding migrate type */ #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) -- cgit v1.2.3 From eb9549346f7578eda3755683ac2cfb4d94c0675f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 16 Feb 2026 13:17:44 +0100 Subject: mm: change vma_alloc_folio_noprof() macro to inline function In a few rare configurations with extra warnings eanbled, the new drm_pagemap_migrate_populate_ram_pfn() calls vma_alloc_folio_noprof() but that does not use all the arguments, leading to a harmless warning: drivers/gpu/drm/drm_pagemap.c: In function 'drm_pagemap_migrate_populate_ram_pfn': drivers/gpu/drm/drm_pagemap.c:701:63: error: parameter 'addr' set but not used [-Werror=unused-but-set-parameter=] 701 | unsigned long addr) | ~~~~~~~~~~~~~~^~~~ Replace the macro with an inline function so the compiler can see how the argument would be used, but is still able to optimize out the assignments. Link: https://lkml.kernel.org/r/20260216121751.2378374-1-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Lorenzo Stoakes Acked-by: Zi Yan Reviewed-by: Suren Baghdasaryan Cc: Alexei Starovoitov Cc: Brendan Jackman Cc: David Hildenbrand Cc: Johannes Weiner Cc: Joshua Hahn Cc: Kefeng Wang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Shakeel Butt Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/gfp.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 2b30a0529d48..f82d74a77cad 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -339,8 +339,11 @@ static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int orde { return folio_alloc_noprof(gfp, order); } -#define vma_alloc_folio_noprof(gfp, order, vma, addr) \ - folio_alloc_noprof(gfp, order) +static inline struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, + struct vm_area_struct *vma, unsigned long addr) +{ + return folio_alloc_noprof(gfp, order); +} #endif #define alloc_pages(...) alloc_hooks(alloc_pages_noprof(__VA_ARGS__)) -- cgit v1.2.3 From f85b1c6af5bc3872f994df0a5688c1162de07a62 Mon Sep 17 00:00:00 2001 From: "Pratyush Yadav (Google)" Date: Mon, 16 Feb 2026 14:22:19 +0100 Subject: liveupdate: luo_file: remember retrieve() status LUO keeps track of successful retrieve attempts on a LUO file. It does so to avoid multiple retrievals of the same file. Multiple retrievals cause problems because once the file is retrieved, the serialized data structures are likely freed and the file is likely in a very different state from what the code expects. The retrieve boolean in struct luo_file keeps track of this, and is passed to the finish callback so it knows what work was already done and what it has left to do. All this works well when retrieve succeeds. When it fails, luo_retrieve_file() returns the error immediately, without ever storing anywhere that a retrieve was attempted or what its error code was. This results in an errored LIVEUPDATE_SESSION_RETRIEVE_FD ioctl to userspace, but nothing prevents it from trying this again. The retry is problematic for much of the same reasons listed above. The file is likely in a very different state than what the retrieve logic normally expects, and it might even have freed some serialization data structures. Attempting to access them or free them again is going to break things. For example, if memfd managed to restore 8 of its 10 folios, but fails on the 9th, a subsequent retrieve attempt will try to call kho_restore_folio() on the first folio again, and that will fail with a warning since it is an invalid operation. Apart from the retry, finish() also breaks. Since on failure the retrieved bool in luo_file is never touched, the finish() call on session close will tell the file handler that retrieve was never attempted, and it will try to access or free the data structures that might not exist, much in the same way as the retry attempt. There is no sane way of attempting the retrieve again. Remember the error retrieve returned and directly return it on a retry. Also pass this status code to finish() so it can make the right decision on the work it needs to do. This is done by changing the bool to an integer. A value of 0 means retrieve was never attempted, a positive value means it succeeded, and a negative value means it failed and the error code is the value. Link: https://lkml.kernel.org/r/20260216132221.987987-1-pratyush@kernel.org Fixes: 7c722a7f44e0 ("liveupdate: luo_file: implement file systems callbacks") Signed-off-by: Pratyush Yadav (Google) Reviewed-by: Mike Rapoport (Microsoft) Cc: Pasha Tatashin Cc: Signed-off-by: Andrew Morton --- include/linux/liveupdate.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h index fe82a6c3005f..dd11fdc76a5f 100644 --- a/include/linux/liveupdate.h +++ b/include/linux/liveupdate.h @@ -23,8 +23,11 @@ struct file; /** * struct liveupdate_file_op_args - Arguments for file operation callbacks. * @handler: The file handler being called. - * @retrieved: The retrieve status for the 'can_finish / finish' - * operation. + * @retrieve_status: The retrieve status for the 'can_finish / finish' + * operation. A value of 0 means the retrieve has not been + * attempted, a positive value means the retrieve was + * successful, and a negative value means the retrieve failed, + * and the value is the error code of the call. * @file: The file object. For retrieve: [OUT] The callback sets * this to the new file. For other ops: [IN] The caller sets * this to the file being operated on. @@ -40,7 +43,7 @@ struct file; */ struct liveupdate_file_op_args { struct liveupdate_file_handler *handler; - bool retrieved; + int retrieve_status; struct file *file; u64 serialized_data; void *private_data; -- cgit v1.2.3 From 4b44cbb264d0ed3f2f2bc2659db6ce45882f4670 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 24 Feb 2026 15:24:52 -0800 Subject: overflow: Make sure size helpers are always inlined With kmalloc_obj() performing implicit size calculations, the embedded size_mul() calls, while marked inline, were not always being inlined. I noticed a couple places where allocations were making a call out for things that would otherwise be compile-time calculated. Force the compilers to always inline these calculations. Reviewed-by: Gustavo A. R. Silva Link: https://patch.msgid.link/20260224232451.work.614-kees@kernel.org Signed-off-by: Kees Cook --- include/linux/overflow.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/overflow.h b/include/linux/overflow.h index eddd987a8513..a8cb6319b4fb 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -42,7 +42,7 @@ * both the type-agnostic benefits of the macros while also being able to * enforce that the return value is, in fact, checked. */ -static inline bool __must_check __must_check_overflow(bool overflow) +static __always_inline bool __must_check __must_check_overflow(bool overflow) { return unlikely(overflow); } @@ -327,7 +327,7 @@ static inline bool __must_check __must_check_overflow(bool overflow) * with any overflow causing the return value to be SIZE_MAX. The * lvalue must be size_t to avoid implicit type conversion. */ -static inline size_t __must_check size_mul(size_t factor1, size_t factor2) +static __always_inline size_t __must_check size_mul(size_t factor1, size_t factor2) { size_t bytes; @@ -346,7 +346,7 @@ static inline size_t __must_check size_mul(size_t factor1, size_t factor2) * with any overflow causing the return value to be SIZE_MAX. The * lvalue must be size_t to avoid implicit type conversion. */ -static inline size_t __must_check size_add(size_t addend1, size_t addend2) +static __always_inline size_t __must_check size_add(size_t addend1, size_t addend2) { size_t bytes; @@ -367,7 +367,7 @@ static inline size_t __must_check size_add(size_t addend1, size_t addend2) * argument may be SIZE_MAX (or the result with be forced to SIZE_MAX). * The lvalue must be size_t to avoid implicit type conversion. */ -static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) +static __always_inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) { size_t bytes; -- cgit v1.2.3 From 28aaa9c39945b7925a1cc1d513c8f21ed38f5e4f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 26 Feb 2026 10:43:55 +0100 Subject: kthread: consolidate kthread exit paths to prevent use-after-free Guillaume reported crashes via corrupted RCU callback function pointers during KUnit testing. The crash was traced back to the pidfs rhashtable conversion which replaced the 24-byte rb_node with an 8-byte rhash_head in struct pid, shrinking it from 160 to 144 bytes. struct kthread (without CONFIG_BLK_CGROUP) is also 144 bytes. With CONFIG_SLAB_MERGE_DEFAULT and SLAB_HWCACHE_ALIGN both round up to 192 bytes and share the same slab cache. struct pid.rcu.func and struct kthread.affinity_node both sit at offset 0x78. When a kthread exits via make_task_dead() it bypasses kthread_exit() and misses the affinity_node cleanup. free_kthread_struct() frees the memory while the node is still linked into the global kthread_affinity_list. A subsequent list_del() by another kthread writes through dangling list pointers into the freed and reused memory, corrupting the pid's rcu.func pointer. Instead of patching free_kthread_struct() to handle the missed cleanup, consolidate all kthread exit paths. Turn kthread_exit() into a macro that calls do_exit() and add kthread_do_exit() which is called from do_exit() for any task with PF_KTHREAD set. This guarantees that kthread-specific cleanup always happens regardless of the exit path - make_task_dead(), direct do_exit(), or kthread_exit(). Replace __to_kthread() with a new tsk_is_kthread() accessor in the public header. Export do_exit() since module code using the kthread_exit() macro now needs it directly. Reported-by: Guillaume Tucker Tested-by: Guillaume Tucker Tested-by: Mark Brown Tested-by: David Gow Cc: Link: https://lore.kernel.org/all/20260224-mittlerweile-besessen-2738831ae7f6@brauner Co-developed-by: Linus Torvalds Fixes: 4d13f4304fa4 ("kthread: Implement preferred affinity") Signed-off-by: Linus Torvalds Signed-off-by: Christian Brauner --- include/linux/kthread.h | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index c92c1149ee6e..a01a474719a7 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -7,6 +7,24 @@ struct mm_struct; +/* opaque kthread data */ +struct kthread; + +/* + * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will + * always remain a kthread. For kthreads p->worker_private always + * points to a struct kthread. For tasks that are not kthreads + * p->worker_private is used to point to other things. + * + * Return NULL for any task that is not a kthread. + */ +static inline struct kthread *tsk_is_kthread(struct task_struct *p) +{ + if (p->flags & PF_KTHREAD) + return p->worker_private; + return NULL; +} + __printf(4, 5) struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, @@ -98,9 +116,10 @@ void *kthread_probe_data(struct task_struct *k); int kthread_park(struct task_struct *k); void kthread_unpark(struct task_struct *k); void kthread_parkme(void); -void kthread_exit(long result) __noreturn; +#define kthread_exit(result) do_exit(result) void kthread_complete_and_exit(struct completion *, long) __noreturn; int kthreads_update_housekeeping(void); +void kthread_do_exit(struct kthread *, long); int kthreadd(void *unused); extern struct task_struct *kthreadd_task; -- cgit v1.2.3 From f3ec502b6755a3bfb12c1c47025ef989ff9efc72 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 25 Feb 2026 08:34:07 -0800 Subject: mm/slab: mark alloc tags empty for sheaves allocated with __GFP_NO_OBJ_EXT alloc_empty_sheaf() allocates sheaves from SLAB_KMALLOC caches using __GFP_NO_OBJ_EXT to avoid recursion, however it does not mark their allocation tags empty before freeing, which results in a warning when CONFIG_MEM_ALLOC_PROFILING_DEBUG is set. Fix this by marking allocation tags for such sheaves as empty. The problem was technically introduced in commit 4c0a17e28340 but only becomes possible to hit with commit 913ffd3a1bf5. Fixes: 4c0a17e28340 ("slab: prevent recursive kmalloc() in alloc_empty_sheaf()") Fixes: 913ffd3a1bf5 ("slab: handle kmalloc sheaves bootstrap") Reported-by: David Wang <00107082@163.com> Closes: https://lore.kernel.org/all/20260223155128.3849-1-00107082@163.com/ Analyzed-by: Harry Yoo Signed-off-by: Suren Baghdasaryan Reviewed-by: Harry Yoo Tested-by: Harry Yoo Tested-by: David Wang <00107082@163.com> Link: https://patch.msgid.link/20260225163407.2218712-1-surenb@google.com Signed-off-by: Vlastimil Babka (SUSE) --- include/linux/gfp_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 814bb2892f99..6c75df30a281 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -139,6 +139,8 @@ enum { * %__GFP_ACCOUNT causes the allocation to be accounted to kmemcg. * * %__GFP_NO_OBJ_EXT causes slab allocation to have no object extension. + * mark_obj_codetag_empty() should be called upon freeing for objects allocated + * with this flag to indicate that their NULL tags are expected and normal. */ #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) -- cgit v1.2.3 From 2b351ea42820a7ecc2e8305724536512984f4419 Mon Sep 17 00:00:00 2001 From: Sanjay Chitroda Date: Thu, 26 Feb 2026 11:17:12 +0530 Subject: mm/slub: drop duplicate kernel-doc for ksize() The implementation of ksize() was updated with kernel-doc by commit fab0694646d7 ("mm/slab: move [__]ksize and slab_ksize() to mm/slub.c") However, the public header still contains a kernel-doc comment attached to the ksize() prototype. Having documentation both in the header and next to the implementation causes Sphinx to treat the function as being documented twice, resulting in the warning: WARNING: Duplicate C declaration, also defined at core-api/mm-api:521 Declaration is '.. c:function:: size_t ksize(const void *objp)' Kernel-doc guidelines recommend keeping the documentation with the function implementation. Therefore remove the redundant kernel-doc block from include/linux/slab.h so that the implementation in slub.c remains the canonical source for documentation. No functional change. Fixes: fab0694646d7 ("mm/slab: move [__]ksize and slab_ksize() to mm/slub.c") Signed-off-by: Sanjay Chitroda Link: https://patch.msgid.link/20260226054712.3610744-1-sanjayembedded@gmail.com Signed-off-by: Vlastimil Babka (SUSE) --- include/linux/slab.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index a5a5e4108ae5..15a60b501b95 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -517,18 +517,6 @@ void kfree_sensitive(const void *objp); DEFINE_FREE(kfree, void *, if (!IS_ERR_OR_NULL(_T)) kfree(_T)) DEFINE_FREE(kfree_sensitive, void *, if (_T) kfree_sensitive(_T)) -/** - * ksize - Report actual allocation size of associated object - * - * @objp: Pointer returned from a prior kmalloc()-family allocation. - * - * This should not be used for writing beyond the originally requested - * allocation size. Either use krealloc() or round up the allocation size - * with kmalloc_size_roundup() prior to allocation. If this is used to - * access beyond the originally requested allocation size, UBSAN_BOUNDS - * and/or FORTIFY_SOURCE may trip, since they only know about the - * originally allocated size via the __alloc_size attribute. - */ size_t ksize(const void *objp); #ifdef CONFIG_PRINTK -- cgit v1.2.3 From 3350c2b3f2b8a3b985a020a4ef4f2f050a4b6a1d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 25 Feb 2026 21:12:29 -0800 Subject: platform_data/mlxreg: mlxreg.h: fix all kernel-doc warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the correct kernel-doc format & notation to eliminate kernel-doc warnings: Warning: include/linux/platform_data/mlxreg.h:24 Enum value 'MLX_WDT_TYPE1' not described in enum 'mlxreg_wdt_type' Warning: include/linux/platform_data/mlxreg.h:24 Enum value 'MLX_WDT_TYPE2' not described in enum 'mlxreg_wdt_type' Warning: include/linux/platform_data/mlxreg.h:24 Enum value 'MLX_WDT_TYPE3' not described in enum 'mlxreg_wdt_type' Warning: include/linux/platform_data/mlxreg.h:37 bad line: PHYs ready / unready state; Warning: include/linux/platform_data/mlxreg.h:153 struct member 'np' not described in 'mlxreg_core_data' Warning: include/linux/platform_data/mlxreg.h:153 struct member 'hpdev' not described in 'mlxreg_core_data' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260226051232.549537-1-rdunlap@infradead.org Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/mlxreg.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/mlxreg.h b/include/linux/platform_data/mlxreg.h index f6cca7a035c7..50b6be57da66 100644 --- a/include/linux/platform_data/mlxreg.h +++ b/include/linux/platform_data/mlxreg.h @@ -13,10 +13,10 @@ /** * enum mlxreg_wdt_type - type of HW watchdog * - * TYPE1 HW watchdog implementation exist in old systems. - * All new systems have TYPE2 HW watchdog. - * TYPE3 HW watchdog can exist on all systems with new CPLD. - * TYPE3 is selected by WD capability bit. + * @MLX_WDT_TYPE1: HW watchdog implementation in old systems. + * @MLX_WDT_TYPE2: All new systems have TYPE2 HW watchdog. + * @MLX_WDT_TYPE3: HW watchdog that can exist on all systems with new CPLD. + * TYPE3 is selected by WD capability bit. */ enum mlxreg_wdt_type { MLX_WDT_TYPE1, @@ -35,7 +35,7 @@ enum mlxreg_wdt_type { * @MLXREG_HOTPLUG_LC_SYNCED: entry for line card synchronization events, coming * after hardware-firmware synchronization handshake; * @MLXREG_HOTPLUG_LC_READY: entry for line card ready events, indicating line card - PHYs ready / unready state; + * PHYs ready / unready state; * @MLXREG_HOTPLUG_LC_ACTIVE: entry for line card active events, indicating firmware * availability / unavailability for the ports on line card; * @MLXREG_HOTPLUG_LC_THERMAL: entry for line card thermal shutdown events, positive @@ -123,8 +123,8 @@ struct mlxreg_hotplug_device { * @reg_pwr: attribute power register; * @reg_ena: attribute enable register; * @mode: access mode; - * @np - pointer to node platform associated with attribute; - * @hpdev - hotplug device data; + * @np: pointer to node platform associated with attribute; + * @hpdev: hotplug device data; * @notifier: pointer to event notifier block; * @health_cntr: dynamic device health indication counter; * @attached: true if device has been attached after good health indication; -- cgit v1.2.3 From e6b899f08066e744f89df16ceb782e06868bd148 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 26 Feb 2026 14:50:09 +0100 Subject: nsfs: tighten permission checks for ns iteration ioctls Even privileged services should not necessarily be able to see other privileged service's namespaces so they can't leak information to each other. Use may_see_all_namespaces() helper that centralizes this policy until the nstree adapts. Link: https://patch.msgid.link/20260226-work-visibility-fixes-v1-1-d2c2853313bd@kernel.org Fixes: a1d220d9dafa ("nsfs: iterate through mount namespaces") Reviewed-by: Jeff Layton Cc: stable@kernel.org # v6.12+ Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 825f5865bfc5..c8e227a3f9e2 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -55,6 +55,8 @@ static __always_inline bool is_ns_init_id(const struct ns_common *ns) #define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) +bool may_see_all_namespaces(void); + static __always_inline __must_check int __ns_ref_active_read(const struct ns_common *ns) { return atomic_read(&ns->__ns_ref_active); -- cgit v1.2.3 From 1df97a7453eec80c1912c2d0360290a3970a7671 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 27 Feb 2026 14:48:01 -0800 Subject: bpf: Register dtor for freeing special fields There is a race window where BPF hash map elements can leak special fields if the program with access to the map value recreates these special fields between the check_and_free_fields done on the map value and its eventual return to the memory allocator. Several ways were explored prior to this patch, most notably [0] tried to use a poison value to reject attempts to recreate special fields for map values that have been logically deleted but still accessible to BPF programs (either while sitting in the free list or when reused). While this approach works well for task work, timers, wq, etc., it is harder to apply the idea to kptrs, which have a similar race and failure mode. Instead, we change bpf_mem_alloc to allow registering destructor for allocated elements, such that when they are returned to the allocator, any special fields created while they were accessible to programs in the mean time will be freed. If these values get reused, we do not free the fields again before handing the element back. The special fields thus may remain initialized while the map value sits in a free list. When bpf_mem_alloc is retired in the future, a similar concept can be introduced to kmalloc_nolock-backed kmem_cache, paired with the existing idea of a constructor. Note that the destructor registration happens in map_check_btf, after the BTF record is populated and (at that point) avaiable for inspection and duplication. Duplication is necessary since the freeing of embedded bpf_mem_alloc can be decoupled from actual map lifetime due to logic introduced to reduce the cost of rcu_barrier()s in mem alloc free path in 9f2c6e96c65e ("bpf: Optimize rcu_barrier usage between hash map and bpf_mem_alloc."). As such, once all callbacks are done, we must also free the duplicated record. To remove dependency on the bpf_map itself, also stash the key size of the map to obtain value from htab_elem long after the map is gone. [0]: https://lore.kernel.org/bpf/20260216131341.1285427-1-mykyta.yatsenko5@gmail.com Fixes: 14a324f6a67e ("bpf: Wire up freeing of referenced kptr") Fixes: 1bfbc267ec91 ("bpf: Enable bpf_timer and bpf_wq in any context") Reported-by: Alexei Starovoitov Tested-by: syzbot@syzkaller.appspotmail.com Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260227224806.646888-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_mem_alloc.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index e45162ef59bb..4ce0d27f8ea2 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -14,6 +14,8 @@ struct bpf_mem_alloc { struct obj_cgroup *objcg; bool percpu; struct work_struct work; + void (*dtor_ctx_free)(void *ctx); + void *dtor_ctx; }; /* 'size != 0' is for bpf_mem_alloc which manages fixed-size objects. @@ -32,6 +34,10 @@ int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg /* The percpu allocation with a specific unit size. */ int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size); void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); +void bpf_mem_alloc_set_dtor(struct bpf_mem_alloc *ma, + void (*dtor)(void *obj, void *ctx), + void (*dtor_ctx_free)(void *ctx), + void *ctx); /* Check the allocation size for kmalloc equivalent allocator */ int bpf_mem_alloc_check_size(bool percpu, size_t size); -- cgit v1.2.3 From ae51772b1e94ba1d76db19085957dbccac189c1c Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 27 Feb 2026 14:48:02 -0800 Subject: bpf: Lose const-ness of map in map_check_btf() BPF hash map may now use the map_check_btf() callback to decide whether to set a dtor on its bpf_mem_alloc or not. Unlike C++ where members can opt out of const-ness using mutable, we must lose the const qualifier on the callback such that we can avoid the ugly cast. Make the change and adjust all existing users, and lose the comment in hashtab.c. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260227224806.646888-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++-- include/linux/bpf_local_storage.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b78b53198a2e..05b34a6355b0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -124,7 +124,7 @@ struct bpf_map_ops { u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, struct seq_file *m); - int (*map_check_btf)(const struct bpf_map *map, + int (*map_check_btf)(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); @@ -656,7 +656,7 @@ static inline bool bpf_map_support_seq_show(const struct bpf_map *map) map->ops->map_seq_show_elem; } -int map_check_no_btf(const struct bpf_map *map, +int map_check_no_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 85efa9772530..8157e8da61d4 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -176,7 +176,7 @@ u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage); void bpf_local_storage_map_free(struct bpf_map *map, struct bpf_local_storage_cache *cache); -int bpf_local_storage_map_check_btf(const struct bpf_map *map, +int bpf_local_storage_map_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); -- cgit v1.2.3 From 76e954155b45294c502e3d3a9e15757c858ca55e Mon Sep 17 00:00:00 2001 From: Harishankar Vishwanathan Date: Fri, 27 Feb 2026 22:32:21 +0100 Subject: bpf: Introduce tnum_step to step through tnum's members This commit introduces tnum_step(), a function that, when given t, and a number z returns the smallest member of t larger than z. The number z must be greater or equal to the smallest member of t and less than the largest member of t. The first step is to compute j, a number that keeps all of t's known bits, and matches all unknown bits to z's bits. Since j is a member of the t, it is already a candidate for result. However, we want our result to be (minimally) greater than z. There are only two possible cases: (1) Case j <= z. In this case, we want to increase the value of j and make it > z. (2) Case j > z. In this case, we want to decrease the value of j while keeping it > z. (Case 1) j <= z t = xx11x0x0 z = 10111101 (189) j = 10111000 (184) ^ k (Case 1.1) Let's first consider the case where j < z. We will address j == z later. Since z > j, there had to be a bit position that was 1 in z and a 0 in j, beyond which all positions of higher significance are equal in j and z. Further, this position could not have been unknown in a, because the unknown positions of a match z. This position had to be a 1 in z and known 0 in t. Let k be position of the most significant 1-to-0 flip. In our example, k = 3 (starting the count at 1 at the least significant bit). Setting (to 1) the unknown bits of t in positions of significance smaller than k will not produce a result > z. Hence, we must set/unset the unknown bits at positions of significance higher than k. Specifically, we look for the next larger combination of 1s and 0s to place in those positions, relative to the combination that exists in z. We can achieve this by concatenating bits at unknown positions of t into an integer, adding 1, and writing the bits of that result back into the corresponding bit positions previously extracted from z. >From our example, considering only positions of significance greater than k: t = xx..x z = 10..1 + 1 ----- 11..0 This is the exact combination 1s and 0s we need at the unknown bits of t in positions of significance greater than k. Further, our result must only increase the value minimally above z. Hence, unknown bits in positions of significance smaller than k should remain 0. We finally have, result = 11110000 (240) (Case 1.2) Now consider the case when j = z, for example t = 1x1x0xxx z = 10110100 (180) j = 10110100 (180) Matching the unknown bits of the t to the bits of z yielded exactly z. To produce a number greater than z, we must set/unset the unknown bits in t, and *all* the unknown bits of t candidates for being set/unset. We can do this similar to Case 1.1, by adding 1 to the bits extracted from the masked bit positions of z. Essentially, this case is equivalent to Case 1.1, with k = 0. t = 1x1x0xxx z = .0.1.100 + 1 --------- .0.1.101 This is the exact combination of bits needed in the unknown positions of t. After recalling the known positions of t, we get result = 10110101 (181) (Case 2) j > z t = x00010x1 z = 10000010 (130) j = 10001011 (139) ^ k Since j > z, there had to be a bit position which was 0 in z, and a 1 in j, beyond which all positions of higher significance are equal in j and z. This position had to be a 0 in z and known 1 in t. Let k be the position of the most significant 0-to-1 flip. In our example, k = 4. Because of the 0-to-1 flip at position k, a member of t can become greater than z if the bits in positions greater than k are themselves >= to z. To make that member *minimally* greater than z, the bits in positions greater than k must be exactly = z. Hence, we simply match all of t's unknown bits in positions more significant than k to z's bits. In positions less significant than k, we set all t's unknown bits to 0 to retain minimality. In our example, in positions of greater significance than k (=4), t=x000. These positions are matched with z (1000) to produce 1000. In positions of lower significance than k, t=10x1. All unknown bits are set to 0 to produce 1001. The final result is: result = 10001001 (137) This concludes the computation for a result > z that is a member of t. The procedure for tnum_step() in this commit implements the idea described above. As a proof of correctness, we verified the algorithm against a logical specification of tnum_step. The specification asserts the following about the inputs t, z and output res that: 1. res is a member of t, and 2. res is strictly greater than z, and 3. there does not exist another value res2 such that 3a. res2 is also a member of t, and 3b. res2 is greater than z 3c. res2 is smaller than res We checked the implementation against this logical specification using an SMT solver. The verification formula in SMTLIB format is available at [1]. The verification returned an "unsat": indicating that no input assignment exists for which the implementation and the specification produce different outputs. In addition, we also automatically generated the logical encoding of the C implementation using Agni [2] and verified it against the same specification. This verification also returned an "unsat", confirming that the implementation is equivalent to the specification. The formula for this check is also available at [3]. Link: https://pastebin.com/raw/2eRWbiit [1] Link: https://github.com/bpfverif/agni [2] Link: https://pastebin.com/raw/EztVbBJ2 [3] Co-developed-by: Srinivas Narayana Signed-off-by: Srinivas Narayana Co-developed-by: Santosh Nagarakatte Signed-off-by: Santosh Nagarakatte Signed-off-by: Harishankar Vishwanathan Link: https://lore.kernel.org/r/93fdf71910411c0f19e282ba6d03b4c65f9c5d73.1772225741.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/tnum.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tnum.h b/include/linux/tnum.h index fa4654ffb621..ca2cfec8de08 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -131,4 +131,7 @@ static inline bool tnum_subreg_is_const(struct tnum a) return !(tnum_subreg(a)).mask; } +/* Returns the smallest member of t larger than z */ +u64 tnum_step(struct tnum t, u64 z); + #endif /* _LINUX_TNUM_H */ -- cgit v1.2.3 From 15fba71533bcdfaa8eeba69a5a5a2927afdf664a Mon Sep 17 00:00:00 2001 From: Valentin Spreckels Date: Thu, 26 Feb 2026 20:54:09 +0100 Subject: net: usb: r8152: add TRENDnet TUC-ET2G The TRENDnet TUC-ET2G is a RTL8156 based usb ethernet adapter. Add its vendor and product IDs. Signed-off-by: Valentin Spreckels Link: https://patch.msgid.link/20260226195409.7891-2-valentin@spreckels.dev Signed-off-by: Jakub Kicinski --- include/linux/usb/r8152.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/usb/r8152.h b/include/linux/usb/r8152.h index 2ca60828f28b..1502b2a355f9 100644 --- a/include/linux/usb/r8152.h +++ b/include/linux/usb/r8152.h @@ -32,6 +32,7 @@ #define VENDOR_ID_DLINK 0x2001 #define VENDOR_ID_DELL 0x413c #define VENDOR_ID_ASUS 0x0b05 +#define VENDOR_ID_TRENDNET 0x20f4 #if IS_REACHABLE(CONFIG_USB_RTL8152) extern u8 rtl8152_get_version(struct usb_interface *intf); -- cgit v1.2.3 From 407fd8b8d8cce03856aa67329715de48b254b529 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 11 Feb 2026 19:03:03 +0100 Subject: KVM: remove CONFIG_KVM_GENERIC_MMU_NOTIFIER All architectures now use MMU notifier for KVM page table management. Remove the Kconfig symbol and the code that is used when it is disabled. Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index dde605cb894e..34759a262b28 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -253,7 +253,6 @@ bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif -#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER union kvm_mmu_notifier_arg { unsigned long attributes; }; @@ -275,7 +274,6 @@ struct kvm_gfn_range { bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); -#endif enum { OUTSIDE_GUEST_MODE, @@ -849,13 +847,12 @@ struct kvm { struct hlist_head irq_ack_notifier_list; #endif -#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER struct mmu_notifier mmu_notifier; unsigned long mmu_invalidate_seq; long mmu_invalidate_in_progress; gfn_t mmu_invalidate_range_start; gfn_t mmu_invalidate_range_end; -#endif + struct list_head devices; u64 manual_dirty_log_protect; struct dentry *debugfs_dentry; @@ -2118,7 +2115,6 @@ extern const struct _kvm_stats_desc kvm_vm_stats_desc[]; extern const struct kvm_stats_header kvm_vcpu_stats_header; extern const struct _kvm_stats_desc kvm_vcpu_stats_desc[]; -#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER static inline int mmu_invalidate_retry(struct kvm *kvm, unsigned long mmu_seq) { if (unlikely(kvm->mmu_invalidate_in_progress)) @@ -2196,7 +2192,6 @@ static inline bool mmu_invalidate_retry_gfn_unsafe(struct kvm *kvm, return READ_ONCE(kvm->mmu_invalidate_seq) != mmu_seq; } -#endif #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING -- cgit v1.2.3 From b570f37a2ce480be26c665345c5514686a8a0274 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Tue, 10 Feb 2026 12:56:53 +0100 Subject: mm: Fix a hmm_range_fault() livelock / starvation problem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If hmm_range_fault() fails a folio_trylock() in do_swap_page, trying to acquire the lock of a device-private folio for migration, to ram, the function will spin until it succeeds grabbing the lock. However, if the process holding the lock is depending on a work item to be completed, which is scheduled on the same CPU as the spinning hmm_range_fault(), that work item might be starved and we end up in a livelock / starvation situation which is never resolved. This can happen, for example if the process holding the device-private folio lock is stuck in migrate_device_unmap()->lru_add_drain_all() sinc lru_add_drain_all() requires a short work-item to be run on all online cpus to complete. A prerequisite for this to happen is: a) Both zone device and system memory folios are considered in migrate_device_unmap(), so that there is a reason to call lru_add_drain_all() for a system memory folio while a folio lock is held on a zone device folio. b) The zone device folio has an initial mapcount > 1 which causes at least one migration PTE entry insertion to be deferred to try_to_migrate(), which can happen after the call to lru_add_drain_all(). c) No or voluntary only preemption. This all seems pretty unlikely to happen, but indeed is hit by the "xe_exec_system_allocator" igt test. Resolve this by waiting for the folio to be unlocked if the folio_trylock() fails in do_swap_page(). Rename migration_entry_wait_on_locked() to softleaf_entry_wait_unlock() and update its documentation to indicate the new use-case. Future code improvements might consider moving the lru_add_drain_all() call in migrate_device_unmap() to be called *after* all pages have migration entries inserted. That would eliminate also b) above. v2: - Instead of a cond_resched() in hmm_range_fault(), eliminate the problem by waiting for the folio to be unlocked in do_swap_page() (Alistair Popple, Andrew Morton) v3: - Add a stub migration_entry_wait_on_locked() for the !CONFIG_MIGRATION case. (Kernel Test Robot) v4: - Rename migrate_entry_wait_on_locked() to softleaf_entry_wait_on_locked() and update docs (Alistair Popple) v5: - Add a WARN_ON_ONCE() for the !CONFIG_MIGRATION version of softleaf_entry_wait_on_locked(). - Modify wording around function names in the commit message (Andrew Morton) Suggested-by: Alistair Popple Fixes: 1afaeb8293c9 ("mm/migrate: Trylock device page in do_swap_page") Cc: Ralph Campbell Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Leon Romanovsky Cc: Andrew Morton Cc: Matthew Brost Cc: John Hubbard Cc: Alistair Popple Cc: linux-mm@kvack.org Cc: Signed-off-by: Thomas Hellström Cc: # v6.15+ Reviewed-by: John Hubbard #v3 Reviewed-by: Alistair Popple Link: https://patch.msgid.link/20260210115653.92413-1-thomas.hellstrom@linux.intel.com (cherry picked from commit a69d1ab971a624c6f112cea61536569d579c3215) Signed-off-by: Rodrigo Vivi --- include/linux/migrate.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 26ca00c325d9..d5af2b7f577b 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -65,7 +65,7 @@ bool isolate_folio_to_list(struct folio *folio, struct list_head *list); int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src); -void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) +void softleaf_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) __releases(ptl); void folio_migrate_flags(struct folio *newfolio, struct folio *folio); int folio_migrate_mapping(struct address_space *mapping, @@ -97,6 +97,14 @@ static inline int set_movable_ops(const struct movable_operations *ops, enum pag return -ENOSYS; } +static inline void softleaf_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) + __releases(ptl) +{ + WARN_ON_ONCE(1); + + spin_unlock(ptl); +} + #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_NUMA_BALANCING -- cgit v1.2.3 From af4e9ef3d78420feb8fe58cd9a1ab80c501b3c08 Mon Sep 17 00:00:00 2001 From: David Laight Date: Mon, 2 Mar 2026 13:27:51 +0000 Subject: uaccess: Fix scoped_user_read_access() for 'pointer to const' If a 'const struct foo __user *ptr' is used for the address passed to scoped_user_read_access() then you get a warning/error uaccess.h:691:1: error: initialization discards 'const' qualifier from pointer target type [-Werror=discarded-qualifiers] for the void __user *_tmpptr = __scoped_user_access_begin(mode, uptr, size, elbl) assignment. Fix by using 'auto' for both _tmpptr and the redeclaration of uptr. Replace the CLASS() with explicit __cleanup() functions on uptr. Fixes: e497310b4ffb ("uaccess: Provide scoped user access regions") Signed-off-by: David Laight Reviewed-and-tested-by: Christophe Leroy (CS GROUP) Signed-off-by: Linus Torvalds --- include/linux/uaccess.h | 54 ++++++++++++++++++------------------------------- 1 file changed, 20 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 1f3804245c06..809e4f7dfdbd 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -647,36 +647,22 @@ static inline void user_access_restore(unsigned long flags) { } /* Define RW variant so the below _mode macro expansion works */ #define masked_user_rw_access_begin(u) masked_user_access_begin(u) #define user_rw_access_begin(u, s) user_access_begin(u, s) -#define user_rw_access_end() user_access_end() /* Scoped user access */ -#define USER_ACCESS_GUARD(_mode) \ -static __always_inline void __user * \ -class_user_##_mode##_begin(void __user *ptr) \ -{ \ - return ptr; \ -} \ - \ -static __always_inline void \ -class_user_##_mode##_end(void __user *ptr) \ -{ \ - user_##_mode##_access_end(); \ -} \ - \ -DEFINE_CLASS(user_ ##_mode## _access, void __user *, \ - class_user_##_mode##_end(_T), \ - class_user_##_mode##_begin(ptr), void __user *ptr) \ - \ -static __always_inline class_user_##_mode##_access_t \ -class_user_##_mode##_access_ptr(void __user *scope) \ -{ \ - return scope; \ -} -USER_ACCESS_GUARD(read) -USER_ACCESS_GUARD(write) -USER_ACCESS_GUARD(rw) -#undef USER_ACCESS_GUARD +/* Cleanup wrapper functions */ +static __always_inline void __scoped_user_read_access_end(const void *p) +{ + user_read_access_end(); +}; +static __always_inline void __scoped_user_write_access_end(const void *p) +{ + user_write_access_end(); +}; +static __always_inline void __scoped_user_rw_access_end(const void *p) +{ + user_access_end(); +}; /** * __scoped_user_access_begin - Start a scoped user access @@ -750,13 +736,13 @@ USER_ACCESS_GUARD(rw) * * Don't use directly. Use scoped_masked_user_$MODE_access() instead. */ -#define __scoped_user_access(mode, uptr, size, elbl) \ -for (bool done = false; !done; done = true) \ - for (void __user *_tmpptr = __scoped_user_access_begin(mode, uptr, size, elbl); \ - !done; done = true) \ - for (CLASS(user_##mode##_access, scope)(_tmpptr); !done; done = true) \ - /* Force modified pointer usage within the scope */ \ - for (const typeof(uptr) uptr = _tmpptr; !done; done = true) +#define __scoped_user_access(mode, uptr, size, elbl) \ +for (bool done = false; !done; done = true) \ + for (auto _tmpptr = __scoped_user_access_begin(mode, uptr, size, elbl); \ + !done; done = true) \ + /* Force modified pointer usage within the scope */ \ + for (const auto uptr __cleanup(__scoped_user_##mode##_access_end) = \ + _tmpptr; !done; done = true) /** * scoped_user_read_access_size - Start a scoped user read access with given size -- cgit v1.2.3 From 710f5c76580306cdb9ec51fac8fcf6a8faff7821 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Feb 2026 17:26:03 +0000 Subject: indirect_call_wrapper: do not reevaluate function pointer We have an increasing number of READ_ONCE(xxx->function) combined with INDIRECT_CALL_[1234]() helpers. Unfortunately this forces INDIRECT_CALL_[1234]() to read xxx->function many times, which is not what we wanted. Fix these macros so that xxx->function value is not reloaded. $ scripts/bloat-o-meter -t vmlinux.0 vmlinux add/remove: 0/0 grow/shrink: 1/65 up/down: 122/-1084 (-962) Function old new delta ip_push_pending_frames 59 181 +122 ip6_finish_output 687 681 -6 __udp_enqueue_schedule_skb 1078 1072 -6 ioam6_output 2319 2312 -7 xfrm4_rcv_encap_finish2 64 56 -8 xfrm4_output 297 289 -8 vrf_ip_local_out 278 270 -8 vrf_ip6_local_out 278 270 -8 seg6_input_finish 64 56 -8 rpl_output 700 692 -8 ipmr_forward_finish 124 116 -8 ip_forward_finish 143 135 -8 ip6mr_forward2_finish 100 92 -8 ip6_forward_finish 73 65 -8 input_action_end_bpf 1091 1083 -8 dst_input 52 44 -8 __xfrm6_output 801 793 -8 __xfrm4_output 83 75 -8 bpf_input 500 491 -9 __tcp_check_space 530 521 -9 input_action_end_dt6 291 280 -11 vti6_tnl_xmit 1634 1622 -12 bpf_xmit 1203 1191 -12 rpl_input 497 483 -14 rawv6_send_hdrinc 1355 1341 -14 ndisc_send_skb 1030 1016 -14 ipv6_srh_rcv 1377 1363 -14 ip_send_unicast_reply 1253 1239 -14 ip_rcv_finish 226 212 -14 ip6_rcv_finish 300 286 -14 input_action_end_x_core 205 191 -14 input_action_end_x 355 341 -14 input_action_end_t 205 191 -14 input_action_end_dx6_finish 127 113 -14 input_action_end_dx4_finish 373 359 -14 input_action_end_dt4 426 412 -14 input_action_end_core 186 172 -14 input_action_end_b6_encap 292 278 -14 input_action_end_b6 198 184 -14 igmp6_send 1332 1318 -14 ip_sublist_rcv 864 848 -16 ip6_sublist_rcv 1091 1075 -16 ipv6_rpl_srh_rcv 1937 1920 -17 xfrm_policy_queue_process 1246 1228 -18 seg6_output_core 903 885 -18 mld_sendpack 856 836 -20 NF_HOOK 756 736 -20 vti_tunnel_xmit 1447 1426 -21 input_action_end_dx6 664 642 -22 input_action_end 1502 1480 -22 sock_sendmsg_nosec 134 111 -23 ip6mr_forward2 388 364 -24 sock_recvmsg_nosec 134 109 -25 seg6_input_core 836 810 -26 ip_send_skb 172 146 -26 ip_local_out 140 114 -26 ip6_local_out 140 114 -26 __sock_sendmsg 162 136 -26 __ip_queue_xmit 1196 1170 -26 __ip_finish_output 405 379 -26 ipmr_queue_fwd_xmit 373 346 -27 sock_recvmsg 173 145 -28 ip6_xmit 1635 1607 -28 xfrm_output_resume 1418 1389 -29 ip_build_and_send_pkt 625 591 -34 dst_output 504 432 -72 Total: Before=25217686, After=25216724, chg -0.00% Fixes: 283c16a2dfd3 ("indirect call wrappers: helpers to speed-up indirect calls of builtin") Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260227172603.1700433-1-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/indirect_call_wrapper.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/indirect_call_wrapper.h b/include/linux/indirect_call_wrapper.h index 35227d47cfc9..dc272b514a01 100644 --- a/include/linux/indirect_call_wrapper.h +++ b/include/linux/indirect_call_wrapper.h @@ -16,22 +16,26 @@ */ #define INDIRECT_CALL_1(f, f1, ...) \ ({ \ - likely(f == f1) ? f1(__VA_ARGS__) : f(__VA_ARGS__); \ + typeof(f) __f1 = (f); \ + likely(__f1 == f1) ? f1(__VA_ARGS__) : __f1(__VA_ARGS__); \ }) #define INDIRECT_CALL_2(f, f2, f1, ...) \ ({ \ - likely(f == f2) ? f2(__VA_ARGS__) : \ - INDIRECT_CALL_1(f, f1, __VA_ARGS__); \ + typeof(f) __f2 = (f); \ + likely(__f2 == f2) ? f2(__VA_ARGS__) : \ + INDIRECT_CALL_1(__f2, f1, __VA_ARGS__); \ }) #define INDIRECT_CALL_3(f, f3, f2, f1, ...) \ ({ \ - likely(f == f3) ? f3(__VA_ARGS__) : \ - INDIRECT_CALL_2(f, f2, f1, __VA_ARGS__); \ + typeof(f) __f3 = (f); \ + likely(__f3 == f3) ? f3(__VA_ARGS__) : \ + INDIRECT_CALL_2(__f3, f2, f1, __VA_ARGS__); \ }) #define INDIRECT_CALL_4(f, f4, f3, f2, f1, ...) \ ({ \ - likely(f == f4) ? f4(__VA_ARGS__) : \ - INDIRECT_CALL_3(f, f3, f2, f1, __VA_ARGS__); \ + typeof(f) __f4 = (f); \ + likely(__f4 == f4) ? f4(__VA_ARGS__) : \ + INDIRECT_CALL_3(__f4, f3, f2, f1, __VA_ARGS__); \ }) #define INDIRECT_CALLABLE_DECLARE(f) f -- cgit v1.2.3 From 9de68394a61528d40f575c3e6719cc75c56f62c3 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Mon, 2 Mar 2026 01:25:44 +0100 Subject: Revert "driver core: enforce device_lock for driver_match_device()" This reverts commit dc23806a7c47 ("driver core: enforce device_lock for driver_match_device()") and commit 289b14592cef ("driver core: fix inverted "locked" suffix of driver_match_device()"). While technically correct, there is a major downside to this approach: When a device is already present in the system and a driver is registered on the same bus, we iterate over all devices registered on this bus to see if one of them matches. If we come across an already bound one where the corresponding driver crashed while holding the device lock (e.g. in probe()) we can't make any progress anymore. However, drivers are typically the least tested code in the kernel and hence it is a case that is likely to happen regularly. Besides hurting developer ergonomics, it potentially decreases chances of shutting things down cleanly and obtaining logs in production environments as well [1]. This came up in the context of a firewire bug, which only in combination with the reverted commit, caused the machine to hang [2]. Additionally, it was observed in [3]. Thus, revert commit dc23806a7c47 ("driver core: enforce device_lock for driver_match_device()") and add a brief note clarifying that an implementer of struct bus_type must not expect match() to be called with the device lock held. Link: https://lore.kernel.org/driver-core/DGRGTIRHA62X.3RY09D9SOK77P@kernel.org/ [1] Link: https://lore.kernel.org/all/67f655bb-4d81-4609-b008-68d200255dd2@davidgow.net/ [2] Link: https://lore.kernel.org/lkml/CALbr=LZ4v7N=tO1vgOsyj9AS+XuNbn6kG-QcF+PacdMjSo0iyw@mail.gmail.com/ [3] Reported-by: Linus Torvalds Closes: https://lore.kernel.org/driver-core/CAHk-=wgJ_L1C=HjcYJotg_zrZEmiLFJaoic+PWthjuQrutrfJw@mail.gmail.com/ Reviewed-by: Gui-Dong Han Acked-by: Greg Kroah-Hartman Link: https://patch.msgid.link/20260302002545.19389-1-dakr@kernel.org [ Add additional Link: reference. - Danilo ] Signed-off-by: Danilo Krummrich --- include/linux/device/bus.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h index 99c3c83ea520..63de5f053c33 100644 --- a/include/linux/device/bus.h +++ b/include/linux/device/bus.h @@ -35,6 +35,8 @@ struct fwnode_handle; * otherwise. It may also return error code if determining that * the driver supports the device is not possible. In case of * -EPROBE_DEFER it will queue the device for deferred probing. + * Note: This callback may be invoked with or without the device + * lock held. * @uevent: Called when a device is added, removed, or a few other things * that generate uevents to add the environment variables. * @probe: Called when a new device or driver add to this bus, and callback -- cgit v1.2.3 From e39bb9e02b68942f8e9359d2a3efe7d37ae6be0e Mon Sep 17 00:00:00 2001 From: Qing Wang Date: Fri, 27 Feb 2026 10:58:42 +0800 Subject: tracing: Fix WARN_ON in tracing_buffers_mmap_close When a process forks, the child process copies the parent's VMAs but the user_mapped reference count is not incremented. As a result, when both the parent and child processes exit, tracing_buffers_mmap_close() is called twice. On the second call, user_mapped is already 0, causing the function to return -ENODEV and triggering a WARN_ON. Normally, this isn't an issue as the memory is mapped with VM_DONTCOPY set. But this is only a hint, and the application can call madvise(MADVISE_DOFORK) which resets the VM_DONTCOPY flag. When the application does that, it can trigger this issue on fork. Fix it by incrementing the user_mapped reference count without re-mapping the pages in the VMA's open callback. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Vincent Donnefort Cc: Lorenzo Stoakes Link: https://patch.msgid.link/20260227025842.1085206-1-wangqing7171@gmail.com Fixes: cf9f0f7c4c5bb ("tracing: Allow user-space mapping of the ring-buffer") Reported-by: syzbot+3b5dd2030fe08afdf65d@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=3b5dd2030fe08afdf65d Tested-by: syzbot+3b5dd2030fe08afdf65d@syzkaller.appspotmail.com Signed-off-by: Qing Wang Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 876358cfe1b1..d862fa610270 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -248,6 +248,7 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node); int ring_buffer_map(struct trace_buffer *buffer, int cpu, struct vm_area_struct *vma); +void ring_buffer_map_dup(struct trace_buffer *buffer, int cpu); int ring_buffer_unmap(struct trace_buffer *buffer, int cpu); int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu); #endif /* _LINUX_RING_BUFFER_H */ -- cgit v1.2.3 From 2d28ed588f8d7d0d41b0a4fad7f0d05e4bbf1797 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 24 Feb 2026 16:24:34 -0800 Subject: Revert "ptdesc: remove references to folios from __pagetable_ctor() and pagetable_dtor()" This change swapped out mod_node_page_state for lruvec_stat_add_folio. But, these two APIs are not interchangeable: the lruvec version also increments memcg stats, in addition to "global" pgdat stats. So after this change, the "pagetables" memcg stat in memory.stat always yields "0", which is a userspace visible regression. I tried to look for a refactor where we add a variant of lruvec_stat_mod_folio which takes a pgdat and a memcg instead of a folio, to try to adhere to the spirit of the original patch. But at the end of the day this just means we have to call folio_memcg(ptdesc_folio(ptdesc)) anyway, which doesn't really accomplish much. This regression is visible in master as well as 6.18 stable, so CC stable too. Link: https://lkml.kernel.org/r/20260225002434.2953895-1-axelrasmussen@google.com Fixes: f0c92726e89f ("ptdesc: remove references to folios from __pagetable_ctor() and pagetable_dtor()") Signed-off-by: Axel Rasmussen Acked-by: Shakeel Butt Acked-by: Johannes Weiner Reviewed-by: Vishal Moola (Oracle) Cc: David Hildenbrand Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Roman Gushchin Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5be3d8a8f806..abb4963c1f06 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3514,26 +3514,21 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} #endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ -static inline unsigned long ptdesc_nr_pages(const struct ptdesc *ptdesc) -{ - return compound_nr(ptdesc_page(ptdesc)); -} - static inline void __pagetable_ctor(struct ptdesc *ptdesc) { - pg_data_t *pgdat = NODE_DATA(memdesc_nid(ptdesc->pt_flags)); + struct folio *folio = ptdesc_folio(ptdesc); - __SetPageTable(ptdesc_page(ptdesc)); - mod_node_page_state(pgdat, NR_PAGETABLE, ptdesc_nr_pages(ptdesc)); + __folio_set_pgtable(folio); + lruvec_stat_add_folio(folio, NR_PAGETABLE); } static inline void pagetable_dtor(struct ptdesc *ptdesc) { - pg_data_t *pgdat = NODE_DATA(memdesc_nid(ptdesc->pt_flags)); + struct folio *folio = ptdesc_folio(ptdesc); ptlock_free(ptdesc); - __ClearPageTable(ptdesc_page(ptdesc)); - mod_node_page_state(pgdat, NR_PAGETABLE, -ptdesc_nr_pages(ptdesc)); + __folio_clear_pgtable(folio); + lruvec_stat_sub_folio(folio, NR_PAGETABLE); } static inline void pagetable_dtor_free(struct ptdesc *ptdesc) -- cgit v1.2.3 From 7392f8e4ea632622b2cd2086675ba022db238b3a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 1 Mar 2026 16:52:29 -0800 Subject: uaccess: correct kernel-doc parameter format Use the correct kernel-doc function parameter format to avoid kernel-doc warnings: Warning: include/linux/uaccess.h:814 function parameter 'uptr' not described in 'scoped_user_rw_access_size' Warning: include/linux/uaccess.h:826 function parameter 'uptr' not described in 'scoped_user_rw_access' Link: https://lkml.kernel.org/r/20260302005229.3471955-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton --- include/linux/uaccess.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 1f3804245c06..001cfef21b61 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -806,7 +806,7 @@ for (bool done = false; !done; done = true) \ /** * scoped_user_rw_access_size - Start a scoped user read/write access with given size - * @uptr Pointer to the user space address to read from and write to + * @uptr: Pointer to the user space address to read from and write to * @size: Size of the access starting from @uptr * @elbl: Error label to goto when the access region is rejected * @@ -817,7 +817,7 @@ for (bool done = false; !done; done = true) \ /** * scoped_user_rw_access - Start a scoped user read/write access - * @uptr Pointer to the user space address to read from and write to + * @uptr: Pointer to the user space address to read from and write to * @elbl: Error label to goto when the access region is rejected * * The size of the access starting from @uptr is determined via sizeof(*@uptr)). -- cgit v1.2.3 From 599b4e290c8766b19378d85d4310c6ec8f90ade4 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 1 Mar 2026 16:52:22 -0800 Subject: mm/mmu_notifier: clean up mmu_notifier.h kernel-doc Eliminate kernel-doc warnings in mmu_notifier.h: - add a missing struct short description - use the correct format for function parameters - add missing function return comment sections Warning: include/linux/mmu_notifier.h:236 missing initial short description on line: * struct mmu_interval_notifier_ops Warning: include/linux/mmu_notifier.h:325 function parameter 'interval_sub' not described in 'mmu_interval_set_seq' Warning: include/linux/mmu_notifier.h:325 function parameter 'cur_seq' not described in 'mmu_interval_set_seq' Warning: include/linux/mmu_notifier.h:346 function parameter 'interval_sub' not described in 'mmu_interval_read_retry' Warning: include/linux/mmu_notifier.h:346 function parameter 'seq' not described in 'mmu_interval_read_retry' Warning: include/linux/mmu_notifier.h:346 No description found for return value of 'mmu_interval_read_retry' Warning: include/linux/mmu_notifier.h:370 function parameter 'interval_sub' not described in 'mmu_interval_check_retry' Warning: include/linux/mmu_notifier.h:370 function parameter 'seq' not described in 'mmu_interval_check_retry' Warning: include/linux/mmu_notifier.h:370 No description found for return value of 'mmu_interval_check_retry' Link: https://lkml.kernel.org/r/20260302005222.3470783-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Reviewed-by: Jason Gunthorpe Cc: David Hildenbrand Cc: "Liam R. Howlett" Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Randy Dunlap Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mmu_notifier.h | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 07a2bbaf86e9..8450e18a87c2 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -234,7 +234,7 @@ struct mmu_notifier { }; /** - * struct mmu_interval_notifier_ops + * struct mmu_interval_notifier_ops - callback for range notification * @invalidate: Upon return the caller must stop using any SPTEs within this * range. This function can sleep. Return false only if sleeping * was required but mmu_notifier_range_blockable(range) is false. @@ -309,8 +309,8 @@ void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub); /** * mmu_interval_set_seq - Save the invalidation sequence - * @interval_sub - The subscription passed to invalidate - * @cur_seq - The cur_seq passed to the invalidate() callback + * @interval_sub: The subscription passed to invalidate + * @cur_seq: The cur_seq passed to the invalidate() callback * * This must be called unconditionally from the invalidate callback of a * struct mmu_interval_notifier_ops under the same lock that is used to call @@ -329,8 +329,8 @@ mmu_interval_set_seq(struct mmu_interval_notifier *interval_sub, /** * mmu_interval_read_retry - End a read side critical section against a VA range - * interval_sub: The subscription - * seq: The return of the paired mmu_interval_read_begin() + * @interval_sub: The subscription + * @seq: The return of the paired mmu_interval_read_begin() * * This MUST be called under a user provided lock that is also held * unconditionally by op->invalidate() when it calls mmu_interval_set_seq(). @@ -338,7 +338,7 @@ mmu_interval_set_seq(struct mmu_interval_notifier *interval_sub, * Each call should be paired with a single mmu_interval_read_begin() and * should be used to conclude the read side. * - * Returns true if an invalidation collided with this critical section, and + * Returns: true if an invalidation collided with this critical section, and * the caller should retry. */ static inline bool @@ -350,20 +350,21 @@ mmu_interval_read_retry(struct mmu_interval_notifier *interval_sub, /** * mmu_interval_check_retry - Test if a collision has occurred - * interval_sub: The subscription - * seq: The return of the matching mmu_interval_read_begin() + * @interval_sub: The subscription + * @seq: The return of the matching mmu_interval_read_begin() * * This can be used in the critical section between mmu_interval_read_begin() - * and mmu_interval_read_retry(). A return of true indicates an invalidation - * has collided with this critical region and a future - * mmu_interval_read_retry() will return true. - * - * False is not reliable and only suggests a collision may not have - * occurred. It can be called many times and does not have to hold the user - * provided lock. + * and mmu_interval_read_retry(). * * This call can be used as part of loops and other expensive operations to * expedite a retry. + * It can be called many times and does not have to hold the user + * provided lock. + * + * Returns: true indicates an invalidation has collided with this critical + * region and a future mmu_interval_read_retry() will return true. + * False is not reliable and only suggests a collision may not have + * occurred. */ static inline bool mmu_interval_check_retry(struct mmu_interval_notifier *interval_sub, -- cgit v1.2.3 From b824c3e16c1904bf80df489e293d1e3cbf98896d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 2 Mar 2026 17:26:31 +0100 Subject: net: Provide a PREEMPT_RT specific check for netdev_queue::_xmit_lock After acquiring netdev_queue::_xmit_lock the number of the CPU owning the lock is recorded in netdev_queue::xmit_lock_owner. This works as long as the BH context is not preemptible. On PREEMPT_RT the softirq context is preemptible and without the softirq-lock it is possible to have multiple user in __dev_queue_xmit() submitting a skb on the same CPU. This is fine in general but this means also that the current CPU is recorded as netdev_queue::xmit_lock_owner. This in turn leads to the recursion alert and the skb is dropped. Instead checking the for CPU number, that owns the lock, PREEMPT_RT can check if the lockowner matches the current task. Add netif_tx_owned() which returns true if the current context owns the lock by comparing the provided CPU number with the recorded number. This resembles the current check by negating the condition (the current check returns true if the lock is not owned). On PREEMPT_RT use rt_mutex_owner() to return the lock owner and compare the current task against it. Use the new helper in __dev_queue_xmit() and netif_local_xmit_active() which provides a similar check. Update comments regarding pairing READ_ONCE(). Reported-by: Bert Karwatzki Closes: https://lore.kernel.org/all/20260216134333.412332-1-spasswolf@web.de Fixes: 3253cb49cbad4 ("softirq: Allow to drop the softirq-BKL lock on PREEMPT_RT") Signed-off-by: Sebastian Andrzej Siewior Reported-by: Bert Karwatzki Signed-off-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20260302162631.uGUyIqDT@linutronix.de Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d4e6e00bb90a..67e25f6d15a4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4711,7 +4711,7 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits) static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu) { spin_lock(&txq->_xmit_lock); - /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + /* Pairs with READ_ONCE() in netif_tx_owned() */ WRITE_ONCE(txq->xmit_lock_owner, cpu); } @@ -4729,7 +4729,7 @@ static inline void __netif_tx_release(struct netdev_queue *txq) static inline void __netif_tx_lock_bh(struct netdev_queue *txq) { spin_lock_bh(&txq->_xmit_lock); - /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + /* Pairs with READ_ONCE() in netif_tx_owned() */ WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id()); } @@ -4738,7 +4738,7 @@ static inline bool __netif_tx_trylock(struct netdev_queue *txq) bool ok = spin_trylock(&txq->_xmit_lock); if (likely(ok)) { - /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + /* Pairs with READ_ONCE() in netif_tx_owned() */ WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id()); } return ok; @@ -4746,14 +4746,14 @@ static inline bool __netif_tx_trylock(struct netdev_queue *txq) static inline void __netif_tx_unlock(struct netdev_queue *txq) { - /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + /* Pairs with READ_ONCE() in netif_tx_owned() */ WRITE_ONCE(txq->xmit_lock_owner, -1); spin_unlock(&txq->_xmit_lock); } static inline void __netif_tx_unlock_bh(struct netdev_queue *txq) { - /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + /* Pairs with READ_ONCE() in netif_tx_owned() */ WRITE_ONCE(txq->xmit_lock_owner, -1); spin_unlock_bh(&txq->_xmit_lock); } @@ -4846,6 +4846,23 @@ static inline void netif_tx_disable(struct net_device *dev) local_bh_enable(); } +#ifndef CONFIG_PREEMPT_RT +static inline bool netif_tx_owned(struct netdev_queue *txq, unsigned int cpu) +{ + /* Other cpus might concurrently change txq->xmit_lock_owner + * to -1 or to their cpu id, but not to our id. + */ + return READ_ONCE(txq->xmit_lock_owner) == cpu; +} + +#else +static inline bool netif_tx_owned(struct netdev_queue *txq, unsigned int cpu) +{ + return rt_mutex_owner(&txq->_xmit_lock.lock) == current; +} + +#endif + static inline void netif_addr_lock(struct net_device *dev) { unsigned char nest_level = 0; -- cgit v1.2.3 From 55f854dd5bdd8e19b936a00ef1f8d776ac32c7b0 Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Wed, 4 Mar 2026 14:43:38 +0100 Subject: qmi_wwan: allow max_mtu above hard_mtu to control rx_urb_size Commit c7159e960f14 ("usbnet: limit max_mtu based on device's hard_mtu") capped net->max_mtu to the device's hard_mtu in usbnet_probe(). While this correctly prevents oversized packets on standard USB network devices, it breaks the qmi_wwan driver. qmi_wwan relies on userspace (e.g. ModemManager) setting a large MTU on the wwan0 interface to configure rx_urb_size via usbnet_change_mtu(). QMI modems negotiate USB transfer sizes of 16,383 or 32,767 bytes, and the USB receive buffers must be sized accordingly. With max_mtu capped to hard_mtu (~1500 bytes), userspace can no longer raise the MTU, the receive buffers remain small, and download speeds drop from >300 Mbps to ~0.8 Mbps. Introduce a FLAG_NOMAXMTU driver flag that allows individual usbnet drivers to opt out of the max_mtu cap. Set this flag in qmi_wwan's driver_info structures to restore the previous behavior for QMI devices, while keeping the safety fix in place for all other usbnet drivers. Fixes: c7159e960f14 ("usbnet: limit max_mtu based on device's hard_mtu") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/lkml/CAPh3n803k8JcBPV5qEzUB-oKzWkAs-D5CU7z=Vd_nLRCr5ZqQg@mail.gmail.com/ Reported-by: Koen Vandeputte Tested-by: Daniele Palmas Signed-off-by: Laurent Vivier Link: https://patch.msgid.link/20260304134338.1785002-1-lvivier@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/usb/usbnet.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index b0e84896e6ac..bbf799ccf3b3 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -132,6 +132,7 @@ struct driver_info { #define FLAG_MULTI_PACKET 0x2000 #define FLAG_RX_ASSEMBLE 0x4000 /* rx packets may span >1 frames */ #define FLAG_NOARP 0x8000 /* device can't do ARP */ +#define FLAG_NOMAXMTU 0x10000 /* allow max_mtu above hard_mtu */ /* init device ... can sleep, or cause probe() failure */ int (*bind)(struct usbnet *, struct usb_interface *); -- cgit v1.2.3 From 1954c4f012206147c34acda8da04f827aa7d3ee3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Mar 2026 20:07:15 +0000 Subject: eventpoll: Convert epoll_put_uevent() to scoped user access Saves two function calls, and one stac/clac pair. stac/clac is rather expensive on older cpus like Zen 2. A synthetic network stress test gives a ~1.5% increase of pps on AMD Zen 2. Signed-off-by: Eric Dumazet Cc: Christophe Leroy Cc: Dave Hansen Cc: Kuniyuki Iwashima Signed-off-by: Linus Torvalds --- include/linux/eventpoll.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index ccb478eb174b..ea9ca0e4172a 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -82,11 +82,14 @@ static inline struct epoll_event __user * epoll_put_uevent(__poll_t revents, __u64 data, struct epoll_event __user *uevent) { - if (__put_user(revents, &uevent->events) || - __put_user(data, &uevent->data)) - return NULL; - + scoped_user_write_access_size(uevent, sizeof(*uevent), efault) { + unsafe_put_user(revents, &uevent->events, efault); + unsafe_put_user(data, &uevent->data, efault); + } return uevent+1; + +efault: + return NULL; } #endif -- cgit v1.2.3 From 6f1a9140ecda3baba3d945b9a6155af4268aafc4 Mon Sep 17 00:00:00 2001 From: Weiming Shi Date: Sat, 7 Mar 2026 00:01:34 +0800 Subject: net: add xmit recursion limit to tunnel xmit functions Tunnel xmit functions (iptunnel_xmit, ip6tunnel_xmit) lack their own recursion limit. When a bond device in broadcast mode has GRE tap interfaces as slaves, and those GRE tunnels route back through the bond, multicast/broadcast traffic triggers infinite recursion between bond_xmit_broadcast() and ip_tunnel_xmit()/ip6_tnl_xmit(), causing kernel stack overflow. The existing XMIT_RECURSION_LIMIT (8) in the no-qdisc path is not sufficient because tunnel recursion involves route lookups and full IP output, consuming much more stack per level. Use a lower limit of 4 (IP_TUNNEL_RECURSION_LIMIT) to prevent overflow. Add recursion detection using dev_xmit_recursion helpers directly in iptunnel_xmit() and ip6tunnel_xmit() to cover all IPv4/IPv6 tunnel paths including UDP encapsulated tunnels (VXLAN, Geneve, etc.). Move dev_xmit_recursion helpers from net/core/dev.h to public header include/linux/netdevice.h so they can be used by tunnel code. BUG: KASAN: stack-out-of-bounds in blake2s.constprop.0+0xe7/0x160 Write of size 32 at addr ffff88810033fed0 by task kworker/0:1/11 Workqueue: mld mld_ifc_work Call Trace: __build_flow_key.constprop.0 (net/ipv4/route.c:515) ip_rt_update_pmtu (net/ipv4/route.c:1073) iptunnel_xmit (net/ipv4/ip_tunnel_core.c:84) ip_tunnel_xmit (net/ipv4/ip_tunnel.c:847) gre_tap_xmit (net/ipv4/ip_gre.c:779) dev_hard_start_xmit (net/core/dev.c:3887) sch_direct_xmit (net/sched/sch_generic.c:347) __dev_queue_xmit (net/core/dev.c:4802) bond_dev_queue_xmit (drivers/net/bonding/bond_main.c:312) bond_xmit_broadcast (drivers/net/bonding/bond_main.c:5279) bond_start_xmit (drivers/net/bonding/bond_main.c:5530) dev_hard_start_xmit (net/core/dev.c:3887) __dev_queue_xmit (net/core/dev.c:4841) ip_finish_output2 (net/ipv4/ip_output.c:237) ip_output (net/ipv4/ip_output.c:438) iptunnel_xmit (net/ipv4/ip_tunnel_core.c:86) gre_tap_xmit (net/ipv4/ip_gre.c:779) dev_hard_start_xmit (net/core/dev.c:3887) sch_direct_xmit (net/sched/sch_generic.c:347) __dev_queue_xmit (net/core/dev.c:4802) bond_dev_queue_xmit (drivers/net/bonding/bond_main.c:312) bond_xmit_broadcast (drivers/net/bonding/bond_main.c:5279) bond_start_xmit (drivers/net/bonding/bond_main.c:5530) dev_hard_start_xmit (net/core/dev.c:3887) __dev_queue_xmit (net/core/dev.c:4841) ip_finish_output2 (net/ipv4/ip_output.c:237) ip_output (net/ipv4/ip_output.c:438) iptunnel_xmit (net/ipv4/ip_tunnel_core.c:86) ip_tunnel_xmit (net/ipv4/ip_tunnel.c:847) gre_tap_xmit (net/ipv4/ip_gre.c:779) dev_hard_start_xmit (net/core/dev.c:3887) sch_direct_xmit (net/sched/sch_generic.c:347) __dev_queue_xmit (net/core/dev.c:4802) bond_dev_queue_xmit (drivers/net/bonding/bond_main.c:312) bond_xmit_broadcast (drivers/net/bonding/bond_main.c:5279) bond_start_xmit (drivers/net/bonding/bond_main.c:5530) dev_hard_start_xmit (net/core/dev.c:3887) __dev_queue_xmit (net/core/dev.c:4841) mld_sendpack mld_ifc_work process_one_work worker_thread Fixes: 745e20f1b626 ("net: add a recursion limit in xmit path") Reported-by: Xiang Mei Signed-off-by: Weiming Shi Link: https://patch.msgid.link/20260306160133.3852900-2-bestswngs@gmail.com Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 67e25f6d15a4..ae269a2e7f4d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3576,17 +3576,49 @@ struct page_pool_bh { }; DECLARE_PER_CPU(struct page_pool_bh, system_page_pool); +#define XMIT_RECURSION_LIMIT 8 + #ifndef CONFIG_PREEMPT_RT static inline int dev_recursion_level(void) { return this_cpu_read(softnet_data.xmit.recursion); } + +static inline bool dev_xmit_recursion(void) +{ + return unlikely(__this_cpu_read(softnet_data.xmit.recursion) > + XMIT_RECURSION_LIMIT); +} + +static inline void dev_xmit_recursion_inc(void) +{ + __this_cpu_inc(softnet_data.xmit.recursion); +} + +static inline void dev_xmit_recursion_dec(void) +{ + __this_cpu_dec(softnet_data.xmit.recursion); +} #else static inline int dev_recursion_level(void) { return current->net_xmit.recursion; } +static inline bool dev_xmit_recursion(void) +{ + return unlikely(current->net_xmit.recursion > XMIT_RECURSION_LIMIT); +} + +static inline void dev_xmit_recursion_inc(void) +{ + current->net_xmit.recursion++; +} + +static inline void dev_xmit_recursion_dec(void) +{ + current->net_xmit.recursion--; +} #endif void __netif_schedule(struct Qdisc *q); -- cgit v1.2.3 From fa655a9ca73f7df32b8ca4d14ce11742f9578288 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 3 Mar 2026 22:31:01 +0100 Subject: nvme: Annotate struct nvme_dhchap_key with __counted_by Add the __counted_by() compiler attribute to the flexible array member 'key' to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and CONFIG_FORTIFY_SOURCE. Reviewed-by: Christoph Hellwig Signed-off-by: Thorsten Blum Signed-off-by: Keith Busch --- include/linux/nvme-auth.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h index 60e069a6757f..e75c29c51464 100644 --- a/include/linux/nvme-auth.h +++ b/include/linux/nvme-auth.h @@ -11,7 +11,7 @@ struct nvme_dhchap_key { size_t len; u8 hash; - u8 key[]; + u8 key[] __counted_by(len); }; u32 nvme_auth_get_seqnum(void); -- cgit v1.2.3 From 22fd7f7fed2ae3702f90d1985c326354e86b9c75 Mon Sep 17 00:00:00 2001 From: Muhammad Amirul Asyraf Mohamad Jamian Date: Thu, 5 Mar 2026 01:31:51 -0800 Subject: firmware: stratix10-svc: Add Multi SVC clients support In the current implementation, SVC client drivers such as socfpga-hwmon, intel_fcs, stratix10-soc, stratix10-rsu each send an SMC command that triggers a single thread in the stratix10-svc driver. Upon receiving a callback, the initiating client driver sends a stratix10-svc-done signal, terminating the thread without waiting for other pending SMC commands to complete. This leads to a timeout issue in the firmware SVC mailbox service when multiple client drivers send SMC commands concurrently. To resolve this issue, a dedicated thread is now created per channel. The stratix10-svc driver will support up to the number of channels defined by SVC_NUM_CHANNEL. Thread synchronization is handled using a mutex to prevent simultaneous issuance of SMC commands by multiple threads. SVC_NUM_DATA_IN_FIFO is reduced from 32 to 8, since each channel now has its own dedicated FIFO and the SDM processes commands one at a time. 8 entries per channel is sufficient while keeping the total aggregate capacity the same (4 channels x 8 = 32 entries). Additionally, a thread task is now validated before invoking kthread_stop when the user aborts, ensuring safe termination. Timeout values have also been adjusted to accommodate the increased load from concurrent client driver activity. Fixes: 7ca5ce896524 ("firmware: add Intel Stratix10 service layer driver") Cc: stable@vger.kernel.org Signed-off-by: Ang Tien Sung Signed-off-by: Fong, Yan Kei Signed-off-by: Muhammad Amirul Asyraf Mohamad Jamian Link: https://lore.kernel.org/all/20260305093151.2678-1-muhammad.amirul.asyraf.mohamad.jamian@altera.com Signed-off-by: Dinh Nguyen --- include/linux/firmware/intel/stratix10-svc-client.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firmware/intel/stratix10-svc-client.h b/include/linux/firmware/intel/stratix10-svc-client.h index d290060f4c73..91013161e9db 100644 --- a/include/linux/firmware/intel/stratix10-svc-client.h +++ b/include/linux/firmware/intel/stratix10-svc-client.h @@ -68,12 +68,12 @@ * timeout value used in Stratix10 FPGA manager driver. * timeout value used in RSU driver */ -#define SVC_RECONFIG_REQUEST_TIMEOUT_MS 300 -#define SVC_RECONFIG_BUFFER_TIMEOUT_MS 720 -#define SVC_RSU_REQUEST_TIMEOUT_MS 300 +#define SVC_RECONFIG_REQUEST_TIMEOUT_MS 5000 +#define SVC_RECONFIG_BUFFER_TIMEOUT_MS 5000 +#define SVC_RSU_REQUEST_TIMEOUT_MS 2000 #define SVC_FCS_REQUEST_TIMEOUT_MS 2000 #define SVC_COMPLETED_TIMEOUT_MS 30000 -#define SVC_HWMON_REQUEST_TIMEOUT_MS 300 +#define SVC_HWMON_REQUEST_TIMEOUT_MS 2000 struct stratix10_svc_chan; -- cgit v1.2.3 From b2e48c429ec54715d16fefa719dd2fbded2e65be Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Mar 2026 21:28:53 +0100 Subject: sched/mmcid: Prevent CID stalls due to concurrent forks A newly forked task is accounted as MMCID user before the task is visible in the process' thread list and the global task list. This creates the following problem: CPU1 CPU2 fork() sched_mm_cid_fork(tnew1) tnew1->mm.mm_cid_users++; tnew1->mm_cid.cid = getcid() -> preemption fork() sched_mm_cid_fork(tnew2) tnew2->mm.mm_cid_users++; // Reaches the per CPU threshold mm_cid_fixup_tasks_to_cpus() for_each_other(current, p) .... As tnew1 is not visible yet, this fails to fix up the already allocated CID of tnew1. As a consequence a subsequent schedule in might fail to acquire a (transitional) CID and the machine stalls. Move the invocation of sched_mm_cid_fork() after the new task becomes visible in the thread and the task list to prevent this. This also makes it symmetrical vs. exit() where the task is removed as CID user before the task is removed from the thread and task lists. Fixes: fbd0e71dc370 ("sched/mmcid: Provide CID ownership mode fixup functions") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Tested-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260310202525.969061974@kernel.org --- include/linux/sched.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index a7b4a980eb2f..5a5d3dbc9cdf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2354,7 +2354,6 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo #ifdef CONFIG_SCHED_MM_CID void sched_mm_cid_before_execve(struct task_struct *t); void sched_mm_cid_after_execve(struct task_struct *t); -void sched_mm_cid_fork(struct task_struct *t); void sched_mm_cid_exit(struct task_struct *t); static __always_inline int task_mm_cid(struct task_struct *t) { @@ -2363,7 +2362,6 @@ static __always_inline int task_mm_cid(struct task_struct *t) #else static inline void sched_mm_cid_before_execve(struct task_struct *t) { } static inline void sched_mm_cid_after_execve(struct task_struct *t) { } -static inline void sched_mm_cid_fork(struct task_struct *t) { } static inline void sched_mm_cid_exit(struct task_struct *t) { } static __always_inline int task_mm_cid(struct task_struct *t) { -- cgit v1.2.3 From 192d852129b1b7c4f0ddbab95d0de1efd5ee1405 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Mar 2026 21:29:09 +0100 Subject: sched/mmcid: Avoid full tasklist walks Chasing vfork()'ed tasks on a CID ownership mode switch requires a full task list walk, which is obviously expensive on large systems. Avoid that by keeping a list of tasks using a mm MMCID entity in mm::mm_cid and walk this list instead. This removes the proven to be flaky counting logic and avoids a full task list walk in the case of vfork()'ed tasks. Fixes: fbd0e71dc370 ("sched/mmcid: Provide CID ownership mode fixup functions") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Tested-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260310202526.183824481@kernel.org --- include/linux/rseq_types.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index da5fa6f40294..0b42045988db 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -133,10 +133,12 @@ struct rseq_data { }; * @active: MM CID is active for the task * @cid: The CID associated to the task either permanently or * borrowed from the CPU + * @node: Queued in the per MM MMCID list */ struct sched_mm_cid { unsigned int active; unsigned int cid; + struct hlist_node node; }; /** @@ -157,6 +159,7 @@ struct mm_cid_pcpu { * @work: Regular work to handle the affinity mode change case * @lock: Spinlock to protect against affinity setting which can't take @mutex * @mutex: Mutex to serialize forks and exits related to this mm + * @user_list: List of the MM CID users of a MM * @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map * is growth only. * @users: The number of tasks sharing this MM. Separate from mm::mm_users @@ -177,13 +180,14 @@ struct mm_mm_cid { raw_spinlock_t lock; struct mutex mutex; + struct hlist_head user_list; /* Low frequency modified */ unsigned int nr_cpus_allowed; unsigned int users; unsigned int pcpu_thrs; unsigned int update_deferred; -}____cacheline_aligned_in_smp; +} ____cacheline_aligned; #else /* CONFIG_SCHED_MM_CID */ struct mm_mm_cid { }; struct sched_mm_cid { }; -- cgit v1.2.3 From 416909962e7cdf29fd01ac523c953f37708df93d Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 17 Feb 2026 22:07:47 -0500 Subject: USB: usbcore: Introduce usb_bulk_msg_killable() The synchronous message API in usbcore (usb_control_msg(), usb_bulk_msg(), and so on) uses uninterruptible waits. However, drivers may call these routines in the context of a user thread, which means it ought to be possible to at least kill them. For this reason, introduce a new usb_bulk_msg_killable() function which behaves the same as usb_bulk_msg() except for using wait_for_completion_killable_timeout() instead of wait_for_completion_timeout(). The same can be done later for usb_control_msg() later on, if it turns out to be needed. Signed-off-by: Alan Stern Suggested-by: Oliver Neukum Link: https://lore.kernel.org/linux-usb/3acfe838-6334-4f6d-be7c-4bb01704b33d@rowland.harvard.edu/ Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") CC: stable@vger.kernel.org Link: https://patch.msgid.link/248628b4-cc83-4e81-a620-3ce4e0376d41@rowland.harvard.edu Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index fbfcc70b07fb..57ceeb02a7cb 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1868,8 +1868,9 @@ extern int usb_control_msg(struct usb_device *dev, unsigned int pipe, extern int usb_interrupt_msg(struct usb_device *usb_dev, unsigned int pipe, void *data, int len, int *actual_length, int timeout); extern int usb_bulk_msg(struct usb_device *usb_dev, unsigned int pipe, - void *data, int len, int *actual_length, - int timeout); + void *data, int len, int *actual_length, int timeout); +extern int usb_bulk_msg_killable(struct usb_device *usb_dev, unsigned int pipe, + void *data, int len, int *actual_length, int timeout); /* wrappers around usb_control_msg() for the most common standard requests */ int usb_control_msg_send(struct usb_device *dev, __u8 endpoint, __u8 request, -- cgit v1.2.3 From 1015c27a5e1a63efae2b18a9901494474b4d1dc3 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 17 Feb 2026 22:10:32 -0500 Subject: USB: core: Limit the length of unkillable synchronous timeouts The usb_control_msg(), usb_bulk_msg(), and usb_interrupt_msg() APIs in usbcore allow unlimited timeout durations. And since they use uninterruptible waits, this leaves open the possibility of hanging a task for an indefinitely long time, with no way to kill it short of unplugging the target device. To prevent this sort of problem, enforce a maximum limit on the length of these unkillable timeouts. The limit chosen here, somewhat arbitrarily, is 60 seconds. On many systems (although not all) this is short enough to avoid triggering the kernel's hung-task detector. In addition, clear up the ambiguity of negative timeout values by treating them the same as 0, i.e., using the maximum allowed timeout. Signed-off-by: Alan Stern Link: https://lore.kernel.org/linux-usb/3acfe838-6334-4f6d-be7c-4bb01704b33d@rowland.harvard.edu/ Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") CC: stable@vger.kernel.org Link: https://patch.msgid.link/15fc9773-a007-47b0-a703-df89a8cf83dd@rowland.harvard.edu Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 57ceeb02a7cb..04277af4bb9d 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1862,6 +1862,9 @@ void usb_free_noncoherent(struct usb_device *dev, size_t size, * SYNCHRONOUS CALL SUPPORT * *-------------------------------------------------------------------*/ +/* Maximum value allowed for timeout in synchronous routines below */ +#define USB_MAX_SYNCHRONOUS_TIMEOUT 60000 /* ms */ + extern int usb_control_msg(struct usb_device *dev, unsigned int pipe, __u8 request, __u8 requesttype, __u16 value, __u16 index, void *data, __u16 size, int timeout); -- cgit v1.2.3 From 9f6a983cfa22ac662c86e60816d3a357d4b551e9 Mon Sep 17 00:00:00 2001 From: Jie Deng Date: Fri, 27 Feb 2026 16:49:31 +0800 Subject: usb: core: new quirk to handle devices with zero configurations Some USB devices incorrectly report bNumConfigurations as 0 in their device descriptor, which causes the USB core to reject them during enumeration. logs: usb 1-2: device descriptor read/64, error -71 usb 1-2: no configurations usb 1-2: can't read configurations, error -22 However, these devices actually work correctly when treated as having a single configuration. Add a new quirk USB_QUIRK_FORCE_ONE_CONFIG to handle such devices. When this quirk is set, assume the device has 1 configuration instead of failing with -EINVAL. This quirk is applied to the device with VID:PID 5131:2007 which exhibits this behavior. Signed-off-by: Jie Deng Link: https://patch.msgid.link/20260227084931.1527461-1-dengjie03@kylinos.cn Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/quirks.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/quirks.h b/include/linux/usb/quirks.h index 2f7bd2fdc616..b3cc7beab4a3 100644 --- a/include/linux/usb/quirks.h +++ b/include/linux/usb/quirks.h @@ -78,4 +78,7 @@ /* skip BOS descriptor request */ #define USB_QUIRK_NO_BOS BIT(17) +/* Device claims zero configurations, forcing to 1 */ +#define USB_QUIRK_FORCE_ONE_CONFIG BIT(18) + #endif /* __LINUX_USB_QUIRKS_H */ -- cgit v1.2.3 From 96189080265e6bb5dde3a4afbaf947af493e3f82 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 9 Mar 2026 14:21:37 -0600 Subject: io_uring: ensure ctx->rings is stable for task work flags manipulation If DEFER_TASKRUN | SETUP_TASKRUN is used and task work is added while the ring is being resized, it's possible for the OR'ing of IORING_SQ_TASKRUN to happen in the small window of swapping into the new rings and the old rings being freed. Prevent this by adding a 2nd ->rings pointer, ->rings_rcu, which is protected by RCU. The task work flags manipulation is inside RCU already, and if the resize ring freeing is done post an RCU synchronize, then there's no need to add locking to the fast path of task work additions. Note: this is only done for DEFER_TASKRUN, as that's the only setup mode that supports ring resizing. If this ever changes, then they too need to use the io_ctx_mark_taskrun() helper. Link: https://lore.kernel.org/io-uring/20260309062759.482210-1-naup96721@gmail.com/ Cc: stable@vger.kernel.org Fixes: 79cfe9e59c2a ("io_uring/register: add IORING_REGISTER_RESIZE_RINGS") Reported-by: Hao-Yu Yang Suggested-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 3e4a82a6f817..dd1420bfcb73 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -388,6 +388,7 @@ struct io_ring_ctx { * regularly bounce b/w CPUs. */ struct { + struct io_rings __rcu *rings_rcu; struct llist_head work_llist; struct llist_head retry_llist; unsigned long check_cq; -- cgit v1.2.3 From 94a4b1f959989de9c54d43c3a102fb1ee92e1414 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 7 Mar 2026 17:50:53 -0300 Subject: ipv6: move the disable_ipv6_mod knob to core code From: Jakub Kicinski Make sure disable_ipv6_mod itself is not part of the IPv6 module, in case core code wants to refer to it. We will remove support for IPv6=m soon, this change helps make fixes we commit before that less messy. Link: https://patch.msgid.link/20260307-net-nd_tbl_fixes-v4-1-e2677e85628c@suse.com Signed-off-by: Jakub Kicinski --- include/linux/ipv6.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 443053a76dcf..a7421382a916 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -333,7 +333,12 @@ struct tcp6_timewait_sock { }; #if IS_ENABLED(CONFIG_IPV6) -bool ipv6_mod_enabled(void); +extern int disable_ipv6_mod; + +static inline bool ipv6_mod_enabled(void) +{ + return disable_ipv6_mod == 0; +} static inline struct ipv6_pinfo *inet6_sk(const struct sock *__sk) { -- cgit v1.2.3