From da142f3d373a6ddaca0119615a8db2175ddc4121 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 15:26:55 -0800 Subject: KVM: Remove subtle "struct kvm_stats_desc" pseudo-overlay Remove KVM's internal pseudo-overlay of kvm_stats_desc, which subtly aliases the flexible name[] in the uAPI definition with a fixed-size array of the same name. The unusual embedded structure results in compiler warnings due to -Wflex-array-member-not-at-end, and also necessitates an extra level of dereferencing in KVM. To avoid the "overlay", define the uAPI structure to have a fixed-size name when building for the kernel. Opportunistically clean up the indentation for the stats macros, and replace spaces with tabs. No functional change intended. Reported-by: Gustavo A. R. Silva Closes: https://lore.kernel.org/all/aPfNKRpLfhmhYqfP@kspp Acked-by: Marc Zyngier Acked-by: Christian Borntraeger [..] Acked-by: Anup Patel Reviewed-by: Bibo Mao Acked-by: Gustavo A. R. Silva Link: https://patch.msgid.link/20251205232655.445294-1-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 83 ++++++++++++++++++++---------------------------- 1 file changed, 35 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d93f75b05ae2..7428d9949382 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1927,56 +1927,43 @@ enum kvm_stat_kind { struct kvm_stat_data { struct kvm *kvm; - const struct _kvm_stats_desc *desc; + const struct kvm_stats_desc *desc; enum kvm_stat_kind kind; }; -struct _kvm_stats_desc { - struct kvm_stats_desc desc; - char name[KVM_STATS_NAME_SIZE]; -}; - -#define STATS_DESC_COMMON(type, unit, base, exp, sz, bsz) \ - .flags = type | unit | base | \ - BUILD_BUG_ON_ZERO(type & ~KVM_STATS_TYPE_MASK) | \ - BUILD_BUG_ON_ZERO(unit & ~KVM_STATS_UNIT_MASK) | \ - BUILD_BUG_ON_ZERO(base & ~KVM_STATS_BASE_MASK), \ - .exponent = exp, \ - .size = sz, \ +#define STATS_DESC_COMMON(type, unit, base, exp, sz, bsz) \ + .flags = type | unit | base | \ + BUILD_BUG_ON_ZERO(type & ~KVM_STATS_TYPE_MASK) | \ + BUILD_BUG_ON_ZERO(unit & ~KVM_STATS_UNIT_MASK) | \ + BUILD_BUG_ON_ZERO(base & ~KVM_STATS_BASE_MASK), \ + .exponent = exp, \ + .size = sz, \ .bucket_size = bsz -#define VM_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ - { \ - { \ - STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ - .offset = offsetof(struct kvm_vm_stat, generic.stat) \ - }, \ - .name = #stat, \ - } -#define VCPU_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ - { \ - { \ - STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ - .offset = offsetof(struct kvm_vcpu_stat, generic.stat) \ - }, \ - .name = #stat, \ - } -#define VM_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ - { \ - { \ - STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ - .offset = offsetof(struct kvm_vm_stat, stat) \ - }, \ - .name = #stat, \ - } -#define VCPU_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ - { \ - { \ - STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ - .offset = offsetof(struct kvm_vcpu_stat, stat) \ - }, \ - .name = #stat, \ - } +#define VM_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ +{ \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ + .offset = offsetof(struct kvm_vm_stat, generic.stat), \ + .name = #stat, \ +} +#define VCPU_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ +{ \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ + .offset = offsetof(struct kvm_vcpu_stat, generic.stat), \ + .name = #stat, \ +} +#define VM_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ +{ \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ + .offset = offsetof(struct kvm_vm_stat, stat), \ + .name = #stat, \ +} +#define VCPU_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ +{ \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ + .offset = offsetof(struct kvm_vcpu_stat, stat), \ + .name = #stat, \ +} /* SCOPE: VM, VM_GENERIC, VCPU, VCPU_GENERIC */ #define STATS_DESC(SCOPE, stat, type, unit, base, exp, sz, bsz) \ SCOPE##_STATS_DESC(stat, type, unit, base, exp, sz, bsz) @@ -2053,7 +2040,7 @@ struct _kvm_stats_desc { STATS_DESC_IBOOLEAN(VCPU_GENERIC, blocking) ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header, - const struct _kvm_stats_desc *desc, + const struct kvm_stats_desc *desc, void *stats, size_t size_stats, char __user *user_buffer, size_t size, loff_t *offset); @@ -2098,9 +2085,9 @@ static inline void kvm_stats_log_hist_update(u64 *data, size_t size, u64 value) extern const struct kvm_stats_header kvm_vm_stats_header; -extern const struct _kvm_stats_desc kvm_vm_stats_desc[]; +extern const struct kvm_stats_desc kvm_vm_stats_desc[]; extern const struct kvm_stats_header kvm_vcpu_stats_header; -extern const struct _kvm_stats_desc kvm_vcpu_stats_desc[]; +extern const struct kvm_stats_desc kvm_vcpu_stats_desc[]; #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER static inline int mmu_invalidate_retry(struct kvm *kvm, unsigned long mmu_seq) -- cgit v1.2.3 From 6b3e458806e34f1142592f786d3eb0ebac209cc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Noack?= Date: Thu, 19 Feb 2026 16:43:35 +0100 Subject: HID: Document memory allocation properties of report_fixup() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The memory pointer returned by the report_fixup() hook does not get freed by the caller. Instead, report_fixup() must return (in return value and *rsize) a memory buffer with at least the same lifetime as the input buffer (defined by rdesc and original *rsize). This is usually achieved using one of the following techniques: * Returning a pointer and size to a sub-portion of the input buffer * Returning a pointer to a static buffer * Allocating a buffer with a devm_*() function, which will automatically get freed when the device is removed. Signed-off-by: Günther Noack Signed-off-by: Benjamin Tissoires --- include/linux/hid.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index dce862cafbbd..2990b9f94cb5 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -836,6 +836,12 @@ struct hid_usage_id { * raw_event and event should return negative on error, any other value will * pass the event on to .event() typically return 0 for success. * + * report_fixup must return a report descriptor pointer whose lifetime is at + * least that of the input rdesc. This is usually done by mutating the input + * rdesc and returning it or a sub-portion of it. In case a new buffer is + * allocated and returned, the implementation of report_fixup is responsible for + * freeing it later. + * * input_mapping shall return a negative value to completely ignore this usage * (e.g. doubled or invalid usage), zero to continue with parsing of this * usage by generic code (no special handling needed) or positive to skip -- cgit v1.2.3 From 2a7b7652b1bb3fadc3bd47d622bfb127a93ab6b0 Mon Sep 17 00:00:00 2001 From: Leif Skunberg Date: Tue, 10 Feb 2026 14:21:29 +0100 Subject: platform/x86: int3472: Handle GPIO type 0x10 (DOVDD) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Lenovo ThinkPad X1 Fold 16 Gen 1 has an OV5675 sensor (ACPI HID OVTI5675) behind an INT3472 discrete PMIC controller. The INT3472 _DSM returns GPIO type 0x10 for one of the pins, which controls the DOVDD (digital I/O power) regulator enable. Type 0x10 is not currently handled by the driver, causing the GPIO to be ignored with a warning. Add INT3472_GPIO_TYPE_DOVDD (0x10) and handle it as a regulator with con_id "dovdd" to match the supply name used by sensor drivers (e.g. ov5675). Also increase GPIO_SUPPLY_NAME_LENGTH from 5 to 6 to accommodate the "dovdd" name (5 chars + null terminator). Signed-off-by: Leif Skunberg Reviewed-by: Hans de Goede Link: https://patch.msgid.link/20260210132129.17943-1-diamondback@cohunt.app Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/x86/int3472.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/int3472.h b/include/linux/platform_data/x86/int3472.h index b1b837583d54..dbe745dc88d5 100644 --- a/include/linux/platform_data/x86/int3472.h +++ b/include/linux/platform_data/x86/int3472.h @@ -26,6 +26,7 @@ #define INT3472_GPIO_TYPE_POWER_ENABLE 0x0b #define INT3472_GPIO_TYPE_CLK_ENABLE 0x0c #define INT3472_GPIO_TYPE_PRIVACY_LED 0x0d +#define INT3472_GPIO_TYPE_DOVDD 0x10 #define INT3472_GPIO_TYPE_HANDSHAKE 0x12 #define INT3472_GPIO_TYPE_HOTPLUG_DETECT 0x13 @@ -33,8 +34,8 @@ #define INT3472_MAX_SENSOR_GPIOS 3 #define INT3472_MAX_REGULATORS 3 -/* E.g. "avdd\0" */ -#define GPIO_SUPPLY_NAME_LENGTH 5 +/* E.g. "dovdd\0" */ +#define GPIO_SUPPLY_NAME_LENGTH 6 /* 12 chars for acpi_dev_name() + "-", e.g. "ABCD1234:00-" */ #define GPIO_REGULATOR_NAME_LENGTH (12 + GPIO_SUPPLY_NAME_LENGTH) /* lower- and upper-case mapping */ -- cgit v1.2.3 From 28aaa9c39945b7925a1cc1d513c8f21ed38f5e4f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 26 Feb 2026 10:43:55 +0100 Subject: kthread: consolidate kthread exit paths to prevent use-after-free Guillaume reported crashes via corrupted RCU callback function pointers during KUnit testing. The crash was traced back to the pidfs rhashtable conversion which replaced the 24-byte rb_node with an 8-byte rhash_head in struct pid, shrinking it from 160 to 144 bytes. struct kthread (without CONFIG_BLK_CGROUP) is also 144 bytes. With CONFIG_SLAB_MERGE_DEFAULT and SLAB_HWCACHE_ALIGN both round up to 192 bytes and share the same slab cache. struct pid.rcu.func and struct kthread.affinity_node both sit at offset 0x78. When a kthread exits via make_task_dead() it bypasses kthread_exit() and misses the affinity_node cleanup. free_kthread_struct() frees the memory while the node is still linked into the global kthread_affinity_list. A subsequent list_del() by another kthread writes through dangling list pointers into the freed and reused memory, corrupting the pid's rcu.func pointer. Instead of patching free_kthread_struct() to handle the missed cleanup, consolidate all kthread exit paths. Turn kthread_exit() into a macro that calls do_exit() and add kthread_do_exit() which is called from do_exit() for any task with PF_KTHREAD set. This guarantees that kthread-specific cleanup always happens regardless of the exit path - make_task_dead(), direct do_exit(), or kthread_exit(). Replace __to_kthread() with a new tsk_is_kthread() accessor in the public header. Export do_exit() since module code using the kthread_exit() macro now needs it directly. Reported-by: Guillaume Tucker Tested-by: Guillaume Tucker Tested-by: Mark Brown Tested-by: David Gow Cc: Link: https://lore.kernel.org/all/20260224-mittlerweile-besessen-2738831ae7f6@brauner Co-developed-by: Linus Torvalds Fixes: 4d13f4304fa4 ("kthread: Implement preferred affinity") Signed-off-by: Linus Torvalds Signed-off-by: Christian Brauner --- include/linux/kthread.h | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index c92c1149ee6e..a01a474719a7 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -7,6 +7,24 @@ struct mm_struct; +/* opaque kthread data */ +struct kthread; + +/* + * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will + * always remain a kthread. For kthreads p->worker_private always + * points to a struct kthread. For tasks that are not kthreads + * p->worker_private is used to point to other things. + * + * Return NULL for any task that is not a kthread. + */ +static inline struct kthread *tsk_is_kthread(struct task_struct *p) +{ + if (p->flags & PF_KTHREAD) + return p->worker_private; + return NULL; +} + __printf(4, 5) struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, @@ -98,9 +116,10 @@ void *kthread_probe_data(struct task_struct *k); int kthread_park(struct task_struct *k); void kthread_unpark(struct task_struct *k); void kthread_parkme(void); -void kthread_exit(long result) __noreturn; +#define kthread_exit(result) do_exit(result) void kthread_complete_and_exit(struct completion *, long) __noreturn; int kthreads_update_housekeeping(void); +void kthread_do_exit(struct kthread *, long); int kthreadd(void *unused); extern struct task_struct *kthreadd_task; -- cgit v1.2.3 From 3350c2b3f2b8a3b985a020a4ef4f2f050a4b6a1d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 25 Feb 2026 21:12:29 -0800 Subject: platform_data/mlxreg: mlxreg.h: fix all kernel-doc warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the correct kernel-doc format & notation to eliminate kernel-doc warnings: Warning: include/linux/platform_data/mlxreg.h:24 Enum value 'MLX_WDT_TYPE1' not described in enum 'mlxreg_wdt_type' Warning: include/linux/platform_data/mlxreg.h:24 Enum value 'MLX_WDT_TYPE2' not described in enum 'mlxreg_wdt_type' Warning: include/linux/platform_data/mlxreg.h:24 Enum value 'MLX_WDT_TYPE3' not described in enum 'mlxreg_wdt_type' Warning: include/linux/platform_data/mlxreg.h:37 bad line: PHYs ready / unready state; Warning: include/linux/platform_data/mlxreg.h:153 struct member 'np' not described in 'mlxreg_core_data' Warning: include/linux/platform_data/mlxreg.h:153 struct member 'hpdev' not described in 'mlxreg_core_data' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260226051232.549537-1-rdunlap@infradead.org Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/mlxreg.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/mlxreg.h b/include/linux/platform_data/mlxreg.h index f6cca7a035c7..50b6be57da66 100644 --- a/include/linux/platform_data/mlxreg.h +++ b/include/linux/platform_data/mlxreg.h @@ -13,10 +13,10 @@ /** * enum mlxreg_wdt_type - type of HW watchdog * - * TYPE1 HW watchdog implementation exist in old systems. - * All new systems have TYPE2 HW watchdog. - * TYPE3 HW watchdog can exist on all systems with new CPLD. - * TYPE3 is selected by WD capability bit. + * @MLX_WDT_TYPE1: HW watchdog implementation in old systems. + * @MLX_WDT_TYPE2: All new systems have TYPE2 HW watchdog. + * @MLX_WDT_TYPE3: HW watchdog that can exist on all systems with new CPLD. + * TYPE3 is selected by WD capability bit. */ enum mlxreg_wdt_type { MLX_WDT_TYPE1, @@ -35,7 +35,7 @@ enum mlxreg_wdt_type { * @MLXREG_HOTPLUG_LC_SYNCED: entry for line card synchronization events, coming * after hardware-firmware synchronization handshake; * @MLXREG_HOTPLUG_LC_READY: entry for line card ready events, indicating line card - PHYs ready / unready state; + * PHYs ready / unready state; * @MLXREG_HOTPLUG_LC_ACTIVE: entry for line card active events, indicating firmware * availability / unavailability for the ports on line card; * @MLXREG_HOTPLUG_LC_THERMAL: entry for line card thermal shutdown events, positive @@ -123,8 +123,8 @@ struct mlxreg_hotplug_device { * @reg_pwr: attribute power register; * @reg_ena: attribute enable register; * @mode: access mode; - * @np - pointer to node platform associated with attribute; - * @hpdev - hotplug device data; + * @np: pointer to node platform associated with attribute; + * @hpdev: hotplug device data; * @notifier: pointer to event notifier block; * @health_cntr: dynamic device health indication counter; * @attached: true if device has been attached after good health indication; -- cgit v1.2.3 From e6b899f08066e744f89df16ceb782e06868bd148 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 26 Feb 2026 14:50:09 +0100 Subject: nsfs: tighten permission checks for ns iteration ioctls Even privileged services should not necessarily be able to see other privileged service's namespaces so they can't leak information to each other. Use may_see_all_namespaces() helper that centralizes this policy until the nstree adapts. Link: https://patch.msgid.link/20260226-work-visibility-fixes-v1-1-d2c2853313bd@kernel.org Fixes: a1d220d9dafa ("nsfs: iterate through mount namespaces") Reviewed-by: Jeff Layton Cc: stable@kernel.org # v6.12+ Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 825f5865bfc5..c8e227a3f9e2 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -55,6 +55,8 @@ static __always_inline bool is_ns_init_id(const struct ns_common *ns) #define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) +bool may_see_all_namespaces(void); + static __always_inline __must_check int __ns_ref_active_read(const struct ns_common *ns) { return atomic_read(&ns->__ns_ref_active); -- cgit v1.2.3 From 15fba71533bcdfaa8eeba69a5a5a2927afdf664a Mon Sep 17 00:00:00 2001 From: Valentin Spreckels Date: Thu, 26 Feb 2026 20:54:09 +0100 Subject: net: usb: r8152: add TRENDnet TUC-ET2G The TRENDnet TUC-ET2G is a RTL8156 based usb ethernet adapter. Add its vendor and product IDs. Signed-off-by: Valentin Spreckels Link: https://patch.msgid.link/20260226195409.7891-2-valentin@spreckels.dev Signed-off-by: Jakub Kicinski --- include/linux/usb/r8152.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/usb/r8152.h b/include/linux/usb/r8152.h index 2ca60828f28b..1502b2a355f9 100644 --- a/include/linux/usb/r8152.h +++ b/include/linux/usb/r8152.h @@ -32,6 +32,7 @@ #define VENDOR_ID_DLINK 0x2001 #define VENDOR_ID_DELL 0x413c #define VENDOR_ID_ASUS 0x0b05 +#define VENDOR_ID_TRENDNET 0x20f4 #if IS_REACHABLE(CONFIG_USB_RTL8152) extern u8 rtl8152_get_version(struct usb_interface *intf); -- cgit v1.2.3 From b570f37a2ce480be26c665345c5514686a8a0274 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Tue, 10 Feb 2026 12:56:53 +0100 Subject: mm: Fix a hmm_range_fault() livelock / starvation problem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If hmm_range_fault() fails a folio_trylock() in do_swap_page, trying to acquire the lock of a device-private folio for migration, to ram, the function will spin until it succeeds grabbing the lock. However, if the process holding the lock is depending on a work item to be completed, which is scheduled on the same CPU as the spinning hmm_range_fault(), that work item might be starved and we end up in a livelock / starvation situation which is never resolved. This can happen, for example if the process holding the device-private folio lock is stuck in migrate_device_unmap()->lru_add_drain_all() sinc lru_add_drain_all() requires a short work-item to be run on all online cpus to complete. A prerequisite for this to happen is: a) Both zone device and system memory folios are considered in migrate_device_unmap(), so that there is a reason to call lru_add_drain_all() for a system memory folio while a folio lock is held on a zone device folio. b) The zone device folio has an initial mapcount > 1 which causes at least one migration PTE entry insertion to be deferred to try_to_migrate(), which can happen after the call to lru_add_drain_all(). c) No or voluntary only preemption. This all seems pretty unlikely to happen, but indeed is hit by the "xe_exec_system_allocator" igt test. Resolve this by waiting for the folio to be unlocked if the folio_trylock() fails in do_swap_page(). Rename migration_entry_wait_on_locked() to softleaf_entry_wait_unlock() and update its documentation to indicate the new use-case. Future code improvements might consider moving the lru_add_drain_all() call in migrate_device_unmap() to be called *after* all pages have migration entries inserted. That would eliminate also b) above. v2: - Instead of a cond_resched() in hmm_range_fault(), eliminate the problem by waiting for the folio to be unlocked in do_swap_page() (Alistair Popple, Andrew Morton) v3: - Add a stub migration_entry_wait_on_locked() for the !CONFIG_MIGRATION case. (Kernel Test Robot) v4: - Rename migrate_entry_wait_on_locked() to softleaf_entry_wait_on_locked() and update docs (Alistair Popple) v5: - Add a WARN_ON_ONCE() for the !CONFIG_MIGRATION version of softleaf_entry_wait_on_locked(). - Modify wording around function names in the commit message (Andrew Morton) Suggested-by: Alistair Popple Fixes: 1afaeb8293c9 ("mm/migrate: Trylock device page in do_swap_page") Cc: Ralph Campbell Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Leon Romanovsky Cc: Andrew Morton Cc: Matthew Brost Cc: John Hubbard Cc: Alistair Popple Cc: linux-mm@kvack.org Cc: Signed-off-by: Thomas Hellström Cc: # v6.15+ Reviewed-by: John Hubbard #v3 Reviewed-by: Alistair Popple Link: https://patch.msgid.link/20260210115653.92413-1-thomas.hellstrom@linux.intel.com (cherry picked from commit a69d1ab971a624c6f112cea61536569d579c3215) Signed-off-by: Rodrigo Vivi --- include/linux/migrate.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 26ca00c325d9..d5af2b7f577b 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -65,7 +65,7 @@ bool isolate_folio_to_list(struct folio *folio, struct list_head *list); int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src); -void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) +void softleaf_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) __releases(ptl); void folio_migrate_flags(struct folio *newfolio, struct folio *folio); int folio_migrate_mapping(struct address_space *mapping, @@ -97,6 +97,14 @@ static inline int set_movable_ops(const struct movable_operations *ops, enum pag return -ENOSYS; } +static inline void softleaf_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) + __releases(ptl) +{ + WARN_ON_ONCE(1); + + spin_unlock(ptl); +} + #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_NUMA_BALANCING -- cgit v1.2.3 From af4e9ef3d78420feb8fe58cd9a1ab80c501b3c08 Mon Sep 17 00:00:00 2001 From: David Laight Date: Mon, 2 Mar 2026 13:27:51 +0000 Subject: uaccess: Fix scoped_user_read_access() for 'pointer to const' If a 'const struct foo __user *ptr' is used for the address passed to scoped_user_read_access() then you get a warning/error uaccess.h:691:1: error: initialization discards 'const' qualifier from pointer target type [-Werror=discarded-qualifiers] for the void __user *_tmpptr = __scoped_user_access_begin(mode, uptr, size, elbl) assignment. Fix by using 'auto' for both _tmpptr and the redeclaration of uptr. Replace the CLASS() with explicit __cleanup() functions on uptr. Fixes: e497310b4ffb ("uaccess: Provide scoped user access regions") Signed-off-by: David Laight Reviewed-and-tested-by: Christophe Leroy (CS GROUP) Signed-off-by: Linus Torvalds --- include/linux/uaccess.h | 54 ++++++++++++++++++------------------------------- 1 file changed, 20 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 1f3804245c06..809e4f7dfdbd 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -647,36 +647,22 @@ static inline void user_access_restore(unsigned long flags) { } /* Define RW variant so the below _mode macro expansion works */ #define masked_user_rw_access_begin(u) masked_user_access_begin(u) #define user_rw_access_begin(u, s) user_access_begin(u, s) -#define user_rw_access_end() user_access_end() /* Scoped user access */ -#define USER_ACCESS_GUARD(_mode) \ -static __always_inline void __user * \ -class_user_##_mode##_begin(void __user *ptr) \ -{ \ - return ptr; \ -} \ - \ -static __always_inline void \ -class_user_##_mode##_end(void __user *ptr) \ -{ \ - user_##_mode##_access_end(); \ -} \ - \ -DEFINE_CLASS(user_ ##_mode## _access, void __user *, \ - class_user_##_mode##_end(_T), \ - class_user_##_mode##_begin(ptr), void __user *ptr) \ - \ -static __always_inline class_user_##_mode##_access_t \ -class_user_##_mode##_access_ptr(void __user *scope) \ -{ \ - return scope; \ -} -USER_ACCESS_GUARD(read) -USER_ACCESS_GUARD(write) -USER_ACCESS_GUARD(rw) -#undef USER_ACCESS_GUARD +/* Cleanup wrapper functions */ +static __always_inline void __scoped_user_read_access_end(const void *p) +{ + user_read_access_end(); +}; +static __always_inline void __scoped_user_write_access_end(const void *p) +{ + user_write_access_end(); +}; +static __always_inline void __scoped_user_rw_access_end(const void *p) +{ + user_access_end(); +}; /** * __scoped_user_access_begin - Start a scoped user access @@ -750,13 +736,13 @@ USER_ACCESS_GUARD(rw) * * Don't use directly. Use scoped_masked_user_$MODE_access() instead. */ -#define __scoped_user_access(mode, uptr, size, elbl) \ -for (bool done = false; !done; done = true) \ - for (void __user *_tmpptr = __scoped_user_access_begin(mode, uptr, size, elbl); \ - !done; done = true) \ - for (CLASS(user_##mode##_access, scope)(_tmpptr); !done; done = true) \ - /* Force modified pointer usage within the scope */ \ - for (const typeof(uptr) uptr = _tmpptr; !done; done = true) +#define __scoped_user_access(mode, uptr, size, elbl) \ +for (bool done = false; !done; done = true) \ + for (auto _tmpptr = __scoped_user_access_begin(mode, uptr, size, elbl); \ + !done; done = true) \ + /* Force modified pointer usage within the scope */ \ + for (const auto uptr __cleanup(__scoped_user_##mode##_access_end) = \ + _tmpptr; !done; done = true) /** * scoped_user_read_access_size - Start a scoped user read access with given size -- cgit v1.2.3 From 710f5c76580306cdb9ec51fac8fcf6a8faff7821 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Feb 2026 17:26:03 +0000 Subject: indirect_call_wrapper: do not reevaluate function pointer We have an increasing number of READ_ONCE(xxx->function) combined with INDIRECT_CALL_[1234]() helpers. Unfortunately this forces INDIRECT_CALL_[1234]() to read xxx->function many times, which is not what we wanted. Fix these macros so that xxx->function value is not reloaded. $ scripts/bloat-o-meter -t vmlinux.0 vmlinux add/remove: 0/0 grow/shrink: 1/65 up/down: 122/-1084 (-962) Function old new delta ip_push_pending_frames 59 181 +122 ip6_finish_output 687 681 -6 __udp_enqueue_schedule_skb 1078 1072 -6 ioam6_output 2319 2312 -7 xfrm4_rcv_encap_finish2 64 56 -8 xfrm4_output 297 289 -8 vrf_ip_local_out 278 270 -8 vrf_ip6_local_out 278 270 -8 seg6_input_finish 64 56 -8 rpl_output 700 692 -8 ipmr_forward_finish 124 116 -8 ip_forward_finish 143 135 -8 ip6mr_forward2_finish 100 92 -8 ip6_forward_finish 73 65 -8 input_action_end_bpf 1091 1083 -8 dst_input 52 44 -8 __xfrm6_output 801 793 -8 __xfrm4_output 83 75 -8 bpf_input 500 491 -9 __tcp_check_space 530 521 -9 input_action_end_dt6 291 280 -11 vti6_tnl_xmit 1634 1622 -12 bpf_xmit 1203 1191 -12 rpl_input 497 483 -14 rawv6_send_hdrinc 1355 1341 -14 ndisc_send_skb 1030 1016 -14 ipv6_srh_rcv 1377 1363 -14 ip_send_unicast_reply 1253 1239 -14 ip_rcv_finish 226 212 -14 ip6_rcv_finish 300 286 -14 input_action_end_x_core 205 191 -14 input_action_end_x 355 341 -14 input_action_end_t 205 191 -14 input_action_end_dx6_finish 127 113 -14 input_action_end_dx4_finish 373 359 -14 input_action_end_dt4 426 412 -14 input_action_end_core 186 172 -14 input_action_end_b6_encap 292 278 -14 input_action_end_b6 198 184 -14 igmp6_send 1332 1318 -14 ip_sublist_rcv 864 848 -16 ip6_sublist_rcv 1091 1075 -16 ipv6_rpl_srh_rcv 1937 1920 -17 xfrm_policy_queue_process 1246 1228 -18 seg6_output_core 903 885 -18 mld_sendpack 856 836 -20 NF_HOOK 756 736 -20 vti_tunnel_xmit 1447 1426 -21 input_action_end_dx6 664 642 -22 input_action_end 1502 1480 -22 sock_sendmsg_nosec 134 111 -23 ip6mr_forward2 388 364 -24 sock_recvmsg_nosec 134 109 -25 seg6_input_core 836 810 -26 ip_send_skb 172 146 -26 ip_local_out 140 114 -26 ip6_local_out 140 114 -26 __sock_sendmsg 162 136 -26 __ip_queue_xmit 1196 1170 -26 __ip_finish_output 405 379 -26 ipmr_queue_fwd_xmit 373 346 -27 sock_recvmsg 173 145 -28 ip6_xmit 1635 1607 -28 xfrm_output_resume 1418 1389 -29 ip_build_and_send_pkt 625 591 -34 dst_output 504 432 -72 Total: Before=25217686, After=25216724, chg -0.00% Fixes: 283c16a2dfd3 ("indirect call wrappers: helpers to speed-up indirect calls of builtin") Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260227172603.1700433-1-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/indirect_call_wrapper.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/indirect_call_wrapper.h b/include/linux/indirect_call_wrapper.h index 35227d47cfc9..dc272b514a01 100644 --- a/include/linux/indirect_call_wrapper.h +++ b/include/linux/indirect_call_wrapper.h @@ -16,22 +16,26 @@ */ #define INDIRECT_CALL_1(f, f1, ...) \ ({ \ - likely(f == f1) ? f1(__VA_ARGS__) : f(__VA_ARGS__); \ + typeof(f) __f1 = (f); \ + likely(__f1 == f1) ? f1(__VA_ARGS__) : __f1(__VA_ARGS__); \ }) #define INDIRECT_CALL_2(f, f2, f1, ...) \ ({ \ - likely(f == f2) ? f2(__VA_ARGS__) : \ - INDIRECT_CALL_1(f, f1, __VA_ARGS__); \ + typeof(f) __f2 = (f); \ + likely(__f2 == f2) ? f2(__VA_ARGS__) : \ + INDIRECT_CALL_1(__f2, f1, __VA_ARGS__); \ }) #define INDIRECT_CALL_3(f, f3, f2, f1, ...) \ ({ \ - likely(f == f3) ? f3(__VA_ARGS__) : \ - INDIRECT_CALL_2(f, f2, f1, __VA_ARGS__); \ + typeof(f) __f3 = (f); \ + likely(__f3 == f3) ? f3(__VA_ARGS__) : \ + INDIRECT_CALL_2(__f3, f2, f1, __VA_ARGS__); \ }) #define INDIRECT_CALL_4(f, f4, f3, f2, f1, ...) \ ({ \ - likely(f == f4) ? f4(__VA_ARGS__) : \ - INDIRECT_CALL_3(f, f3, f2, f1, __VA_ARGS__); \ + typeof(f) __f4 = (f); \ + likely(__f4 == f4) ? f4(__VA_ARGS__) : \ + INDIRECT_CALL_3(__f4, f3, f2, f1, __VA_ARGS__); \ }) #define INDIRECT_CALLABLE_DECLARE(f) f -- cgit v1.2.3 From 9de68394a61528d40f575c3e6719cc75c56f62c3 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Mon, 2 Mar 2026 01:25:44 +0100 Subject: Revert "driver core: enforce device_lock for driver_match_device()" This reverts commit dc23806a7c47 ("driver core: enforce device_lock for driver_match_device()") and commit 289b14592cef ("driver core: fix inverted "locked" suffix of driver_match_device()"). While technically correct, there is a major downside to this approach: When a device is already present in the system and a driver is registered on the same bus, we iterate over all devices registered on this bus to see if one of them matches. If we come across an already bound one where the corresponding driver crashed while holding the device lock (e.g. in probe()) we can't make any progress anymore. However, drivers are typically the least tested code in the kernel and hence it is a case that is likely to happen regularly. Besides hurting developer ergonomics, it potentially decreases chances of shutting things down cleanly and obtaining logs in production environments as well [1]. This came up in the context of a firewire bug, which only in combination with the reverted commit, caused the machine to hang [2]. Additionally, it was observed in [3]. Thus, revert commit dc23806a7c47 ("driver core: enforce device_lock for driver_match_device()") and add a brief note clarifying that an implementer of struct bus_type must not expect match() to be called with the device lock held. Link: https://lore.kernel.org/driver-core/DGRGTIRHA62X.3RY09D9SOK77P@kernel.org/ [1] Link: https://lore.kernel.org/all/67f655bb-4d81-4609-b008-68d200255dd2@davidgow.net/ [2] Link: https://lore.kernel.org/lkml/CALbr=LZ4v7N=tO1vgOsyj9AS+XuNbn6kG-QcF+PacdMjSo0iyw@mail.gmail.com/ [3] Reported-by: Linus Torvalds Closes: https://lore.kernel.org/driver-core/CAHk-=wgJ_L1C=HjcYJotg_zrZEmiLFJaoic+PWthjuQrutrfJw@mail.gmail.com/ Reviewed-by: Gui-Dong Han Acked-by: Greg Kroah-Hartman Link: https://patch.msgid.link/20260302002545.19389-1-dakr@kernel.org [ Add additional Link: reference. - Danilo ] Signed-off-by: Danilo Krummrich --- include/linux/device/bus.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h index 99c3c83ea520..63de5f053c33 100644 --- a/include/linux/device/bus.h +++ b/include/linux/device/bus.h @@ -35,6 +35,8 @@ struct fwnode_handle; * otherwise. It may also return error code if determining that * the driver supports the device is not possible. In case of * -EPROBE_DEFER it will queue the device for deferred probing. + * Note: This callback may be invoked with or without the device + * lock held. * @uevent: Called when a device is added, removed, or a few other things * that generate uevents to add the environment variables. * @probe: Called when a new device or driver add to this bus, and callback -- cgit v1.2.3 From e39bb9e02b68942f8e9359d2a3efe7d37ae6be0e Mon Sep 17 00:00:00 2001 From: Qing Wang Date: Fri, 27 Feb 2026 10:58:42 +0800 Subject: tracing: Fix WARN_ON in tracing_buffers_mmap_close When a process forks, the child process copies the parent's VMAs but the user_mapped reference count is not incremented. As a result, when both the parent and child processes exit, tracing_buffers_mmap_close() is called twice. On the second call, user_mapped is already 0, causing the function to return -ENODEV and triggering a WARN_ON. Normally, this isn't an issue as the memory is mapped with VM_DONTCOPY set. But this is only a hint, and the application can call madvise(MADVISE_DOFORK) which resets the VM_DONTCOPY flag. When the application does that, it can trigger this issue on fork. Fix it by incrementing the user_mapped reference count without re-mapping the pages in the VMA's open callback. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Vincent Donnefort Cc: Lorenzo Stoakes Link: https://patch.msgid.link/20260227025842.1085206-1-wangqing7171@gmail.com Fixes: cf9f0f7c4c5bb ("tracing: Allow user-space mapping of the ring-buffer") Reported-by: syzbot+3b5dd2030fe08afdf65d@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=3b5dd2030fe08afdf65d Tested-by: syzbot+3b5dd2030fe08afdf65d@syzkaller.appspotmail.com Signed-off-by: Qing Wang Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 876358cfe1b1..d862fa610270 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -248,6 +248,7 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node); int ring_buffer_map(struct trace_buffer *buffer, int cpu, struct vm_area_struct *vma); +void ring_buffer_map_dup(struct trace_buffer *buffer, int cpu); int ring_buffer_unmap(struct trace_buffer *buffer, int cpu); int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu); #endif /* _LINUX_RING_BUFFER_H */ -- cgit v1.2.3 From 2d28ed588f8d7d0d41b0a4fad7f0d05e4bbf1797 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 24 Feb 2026 16:24:34 -0800 Subject: Revert "ptdesc: remove references to folios from __pagetable_ctor() and pagetable_dtor()" This change swapped out mod_node_page_state for lruvec_stat_add_folio. But, these two APIs are not interchangeable: the lruvec version also increments memcg stats, in addition to "global" pgdat stats. So after this change, the "pagetables" memcg stat in memory.stat always yields "0", which is a userspace visible regression. I tried to look for a refactor where we add a variant of lruvec_stat_mod_folio which takes a pgdat and a memcg instead of a folio, to try to adhere to the spirit of the original patch. But at the end of the day this just means we have to call folio_memcg(ptdesc_folio(ptdesc)) anyway, which doesn't really accomplish much. This regression is visible in master as well as 6.18 stable, so CC stable too. Link: https://lkml.kernel.org/r/20260225002434.2953895-1-axelrasmussen@google.com Fixes: f0c92726e89f ("ptdesc: remove references to folios from __pagetable_ctor() and pagetable_dtor()") Signed-off-by: Axel Rasmussen Acked-by: Shakeel Butt Acked-by: Johannes Weiner Reviewed-by: Vishal Moola (Oracle) Cc: David Hildenbrand Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Roman Gushchin Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5be3d8a8f806..abb4963c1f06 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3514,26 +3514,21 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} #endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ -static inline unsigned long ptdesc_nr_pages(const struct ptdesc *ptdesc) -{ - return compound_nr(ptdesc_page(ptdesc)); -} - static inline void __pagetable_ctor(struct ptdesc *ptdesc) { - pg_data_t *pgdat = NODE_DATA(memdesc_nid(ptdesc->pt_flags)); + struct folio *folio = ptdesc_folio(ptdesc); - __SetPageTable(ptdesc_page(ptdesc)); - mod_node_page_state(pgdat, NR_PAGETABLE, ptdesc_nr_pages(ptdesc)); + __folio_set_pgtable(folio); + lruvec_stat_add_folio(folio, NR_PAGETABLE); } static inline void pagetable_dtor(struct ptdesc *ptdesc) { - pg_data_t *pgdat = NODE_DATA(memdesc_nid(ptdesc->pt_flags)); + struct folio *folio = ptdesc_folio(ptdesc); ptlock_free(ptdesc); - __ClearPageTable(ptdesc_page(ptdesc)); - mod_node_page_state(pgdat, NR_PAGETABLE, -ptdesc_nr_pages(ptdesc)); + __folio_clear_pgtable(folio); + lruvec_stat_sub_folio(folio, NR_PAGETABLE); } static inline void pagetable_dtor_free(struct ptdesc *ptdesc) -- cgit v1.2.3 From 7392f8e4ea632622b2cd2086675ba022db238b3a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 1 Mar 2026 16:52:29 -0800 Subject: uaccess: correct kernel-doc parameter format Use the correct kernel-doc function parameter format to avoid kernel-doc warnings: Warning: include/linux/uaccess.h:814 function parameter 'uptr' not described in 'scoped_user_rw_access_size' Warning: include/linux/uaccess.h:826 function parameter 'uptr' not described in 'scoped_user_rw_access' Link: https://lkml.kernel.org/r/20260302005229.3471955-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton --- include/linux/uaccess.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 1f3804245c06..001cfef21b61 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -806,7 +806,7 @@ for (bool done = false; !done; done = true) \ /** * scoped_user_rw_access_size - Start a scoped user read/write access with given size - * @uptr Pointer to the user space address to read from and write to + * @uptr: Pointer to the user space address to read from and write to * @size: Size of the access starting from @uptr * @elbl: Error label to goto when the access region is rejected * @@ -817,7 +817,7 @@ for (bool done = false; !done; done = true) \ /** * scoped_user_rw_access - Start a scoped user read/write access - * @uptr Pointer to the user space address to read from and write to + * @uptr: Pointer to the user space address to read from and write to * @elbl: Error label to goto when the access region is rejected * * The size of the access starting from @uptr is determined via sizeof(*@uptr)). -- cgit v1.2.3 From 599b4e290c8766b19378d85d4310c6ec8f90ade4 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 1 Mar 2026 16:52:22 -0800 Subject: mm/mmu_notifier: clean up mmu_notifier.h kernel-doc Eliminate kernel-doc warnings in mmu_notifier.h: - add a missing struct short description - use the correct format for function parameters - add missing function return comment sections Warning: include/linux/mmu_notifier.h:236 missing initial short description on line: * struct mmu_interval_notifier_ops Warning: include/linux/mmu_notifier.h:325 function parameter 'interval_sub' not described in 'mmu_interval_set_seq' Warning: include/linux/mmu_notifier.h:325 function parameter 'cur_seq' not described in 'mmu_interval_set_seq' Warning: include/linux/mmu_notifier.h:346 function parameter 'interval_sub' not described in 'mmu_interval_read_retry' Warning: include/linux/mmu_notifier.h:346 function parameter 'seq' not described in 'mmu_interval_read_retry' Warning: include/linux/mmu_notifier.h:346 No description found for return value of 'mmu_interval_read_retry' Warning: include/linux/mmu_notifier.h:370 function parameter 'interval_sub' not described in 'mmu_interval_check_retry' Warning: include/linux/mmu_notifier.h:370 function parameter 'seq' not described in 'mmu_interval_check_retry' Warning: include/linux/mmu_notifier.h:370 No description found for return value of 'mmu_interval_check_retry' Link: https://lkml.kernel.org/r/20260302005222.3470783-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Reviewed-by: Jason Gunthorpe Cc: David Hildenbrand Cc: "Liam R. Howlett" Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Randy Dunlap Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mmu_notifier.h | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 07a2bbaf86e9..8450e18a87c2 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -234,7 +234,7 @@ struct mmu_notifier { }; /** - * struct mmu_interval_notifier_ops + * struct mmu_interval_notifier_ops - callback for range notification * @invalidate: Upon return the caller must stop using any SPTEs within this * range. This function can sleep. Return false only if sleeping * was required but mmu_notifier_range_blockable(range) is false. @@ -309,8 +309,8 @@ void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub); /** * mmu_interval_set_seq - Save the invalidation sequence - * @interval_sub - The subscription passed to invalidate - * @cur_seq - The cur_seq passed to the invalidate() callback + * @interval_sub: The subscription passed to invalidate + * @cur_seq: The cur_seq passed to the invalidate() callback * * This must be called unconditionally from the invalidate callback of a * struct mmu_interval_notifier_ops under the same lock that is used to call @@ -329,8 +329,8 @@ mmu_interval_set_seq(struct mmu_interval_notifier *interval_sub, /** * mmu_interval_read_retry - End a read side critical section against a VA range - * interval_sub: The subscription - * seq: The return of the paired mmu_interval_read_begin() + * @interval_sub: The subscription + * @seq: The return of the paired mmu_interval_read_begin() * * This MUST be called under a user provided lock that is also held * unconditionally by op->invalidate() when it calls mmu_interval_set_seq(). @@ -338,7 +338,7 @@ mmu_interval_set_seq(struct mmu_interval_notifier *interval_sub, * Each call should be paired with a single mmu_interval_read_begin() and * should be used to conclude the read side. * - * Returns true if an invalidation collided with this critical section, and + * Returns: true if an invalidation collided with this critical section, and * the caller should retry. */ static inline bool @@ -350,20 +350,21 @@ mmu_interval_read_retry(struct mmu_interval_notifier *interval_sub, /** * mmu_interval_check_retry - Test if a collision has occurred - * interval_sub: The subscription - * seq: The return of the matching mmu_interval_read_begin() + * @interval_sub: The subscription + * @seq: The return of the matching mmu_interval_read_begin() * * This can be used in the critical section between mmu_interval_read_begin() - * and mmu_interval_read_retry(). A return of true indicates an invalidation - * has collided with this critical region and a future - * mmu_interval_read_retry() will return true. - * - * False is not reliable and only suggests a collision may not have - * occurred. It can be called many times and does not have to hold the user - * provided lock. + * and mmu_interval_read_retry(). * * This call can be used as part of loops and other expensive operations to * expedite a retry. + * It can be called many times and does not have to hold the user + * provided lock. + * + * Returns: true indicates an invalidation has collided with this critical + * region and a future mmu_interval_read_retry() will return true. + * False is not reliable and only suggests a collision may not have + * occurred. */ static inline bool mmu_interval_check_retry(struct mmu_interval_notifier *interval_sub, -- cgit v1.2.3 From b824c3e16c1904bf80df489e293d1e3cbf98896d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 2 Mar 2026 17:26:31 +0100 Subject: net: Provide a PREEMPT_RT specific check for netdev_queue::_xmit_lock After acquiring netdev_queue::_xmit_lock the number of the CPU owning the lock is recorded in netdev_queue::xmit_lock_owner. This works as long as the BH context is not preemptible. On PREEMPT_RT the softirq context is preemptible and without the softirq-lock it is possible to have multiple user in __dev_queue_xmit() submitting a skb on the same CPU. This is fine in general but this means also that the current CPU is recorded as netdev_queue::xmit_lock_owner. This in turn leads to the recursion alert and the skb is dropped. Instead checking the for CPU number, that owns the lock, PREEMPT_RT can check if the lockowner matches the current task. Add netif_tx_owned() which returns true if the current context owns the lock by comparing the provided CPU number with the recorded number. This resembles the current check by negating the condition (the current check returns true if the lock is not owned). On PREEMPT_RT use rt_mutex_owner() to return the lock owner and compare the current task against it. Use the new helper in __dev_queue_xmit() and netif_local_xmit_active() which provides a similar check. Update comments regarding pairing READ_ONCE(). Reported-by: Bert Karwatzki Closes: https://lore.kernel.org/all/20260216134333.412332-1-spasswolf@web.de Fixes: 3253cb49cbad4 ("softirq: Allow to drop the softirq-BKL lock on PREEMPT_RT") Signed-off-by: Sebastian Andrzej Siewior Reported-by: Bert Karwatzki Signed-off-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20260302162631.uGUyIqDT@linutronix.de Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d4e6e00bb90a..67e25f6d15a4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4711,7 +4711,7 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits) static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu) { spin_lock(&txq->_xmit_lock); - /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + /* Pairs with READ_ONCE() in netif_tx_owned() */ WRITE_ONCE(txq->xmit_lock_owner, cpu); } @@ -4729,7 +4729,7 @@ static inline void __netif_tx_release(struct netdev_queue *txq) static inline void __netif_tx_lock_bh(struct netdev_queue *txq) { spin_lock_bh(&txq->_xmit_lock); - /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + /* Pairs with READ_ONCE() in netif_tx_owned() */ WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id()); } @@ -4738,7 +4738,7 @@ static inline bool __netif_tx_trylock(struct netdev_queue *txq) bool ok = spin_trylock(&txq->_xmit_lock); if (likely(ok)) { - /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + /* Pairs with READ_ONCE() in netif_tx_owned() */ WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id()); } return ok; @@ -4746,14 +4746,14 @@ static inline bool __netif_tx_trylock(struct netdev_queue *txq) static inline void __netif_tx_unlock(struct netdev_queue *txq) { - /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + /* Pairs with READ_ONCE() in netif_tx_owned() */ WRITE_ONCE(txq->xmit_lock_owner, -1); spin_unlock(&txq->_xmit_lock); } static inline void __netif_tx_unlock_bh(struct netdev_queue *txq) { - /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + /* Pairs with READ_ONCE() in netif_tx_owned() */ WRITE_ONCE(txq->xmit_lock_owner, -1); spin_unlock_bh(&txq->_xmit_lock); } @@ -4846,6 +4846,23 @@ static inline void netif_tx_disable(struct net_device *dev) local_bh_enable(); } +#ifndef CONFIG_PREEMPT_RT +static inline bool netif_tx_owned(struct netdev_queue *txq, unsigned int cpu) +{ + /* Other cpus might concurrently change txq->xmit_lock_owner + * to -1 or to their cpu id, but not to our id. + */ + return READ_ONCE(txq->xmit_lock_owner) == cpu; +} + +#else +static inline bool netif_tx_owned(struct netdev_queue *txq, unsigned int cpu) +{ + return rt_mutex_owner(&txq->_xmit_lock.lock) == current; +} + +#endif + static inline void netif_addr_lock(struct net_device *dev) { unsigned char nest_level = 0; -- cgit v1.2.3 From 55f854dd5bdd8e19b936a00ef1f8d776ac32c7b0 Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Wed, 4 Mar 2026 14:43:38 +0100 Subject: qmi_wwan: allow max_mtu above hard_mtu to control rx_urb_size Commit c7159e960f14 ("usbnet: limit max_mtu based on device's hard_mtu") capped net->max_mtu to the device's hard_mtu in usbnet_probe(). While this correctly prevents oversized packets on standard USB network devices, it breaks the qmi_wwan driver. qmi_wwan relies on userspace (e.g. ModemManager) setting a large MTU on the wwan0 interface to configure rx_urb_size via usbnet_change_mtu(). QMI modems negotiate USB transfer sizes of 16,383 or 32,767 bytes, and the USB receive buffers must be sized accordingly. With max_mtu capped to hard_mtu (~1500 bytes), userspace can no longer raise the MTU, the receive buffers remain small, and download speeds drop from >300 Mbps to ~0.8 Mbps. Introduce a FLAG_NOMAXMTU driver flag that allows individual usbnet drivers to opt out of the max_mtu cap. Set this flag in qmi_wwan's driver_info structures to restore the previous behavior for QMI devices, while keeping the safety fix in place for all other usbnet drivers. Fixes: c7159e960f14 ("usbnet: limit max_mtu based on device's hard_mtu") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/lkml/CAPh3n803k8JcBPV5qEzUB-oKzWkAs-D5CU7z=Vd_nLRCr5ZqQg@mail.gmail.com/ Reported-by: Koen Vandeputte Tested-by: Daniele Palmas Signed-off-by: Laurent Vivier Link: https://patch.msgid.link/20260304134338.1785002-1-lvivier@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/usb/usbnet.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index b0e84896e6ac..bbf799ccf3b3 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -132,6 +132,7 @@ struct driver_info { #define FLAG_MULTI_PACKET 0x2000 #define FLAG_RX_ASSEMBLE 0x4000 /* rx packets may span >1 frames */ #define FLAG_NOARP 0x8000 /* device can't do ARP */ +#define FLAG_NOMAXMTU 0x10000 /* allow max_mtu above hard_mtu */ /* init device ... can sleep, or cause probe() failure */ int (*bind)(struct usbnet *, struct usb_interface *); -- cgit v1.2.3 From 1954c4f012206147c34acda8da04f827aa7d3ee3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Mar 2026 20:07:15 +0000 Subject: eventpoll: Convert epoll_put_uevent() to scoped user access Saves two function calls, and one stac/clac pair. stac/clac is rather expensive on older cpus like Zen 2. A synthetic network stress test gives a ~1.5% increase of pps on AMD Zen 2. Signed-off-by: Eric Dumazet Cc: Christophe Leroy Cc: Dave Hansen Cc: Kuniyuki Iwashima Signed-off-by: Linus Torvalds --- include/linux/eventpoll.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index ccb478eb174b..ea9ca0e4172a 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -82,11 +82,14 @@ static inline struct epoll_event __user * epoll_put_uevent(__poll_t revents, __u64 data, struct epoll_event __user *uevent) { - if (__put_user(revents, &uevent->events) || - __put_user(data, &uevent->data)) - return NULL; - + scoped_user_write_access_size(uevent, sizeof(*uevent), efault) { + unsafe_put_user(revents, &uevent->events, efault); + unsafe_put_user(data, &uevent->data, efault); + } return uevent+1; + +efault: + return NULL; } #endif -- cgit v1.2.3 From 6f1a9140ecda3baba3d945b9a6155af4268aafc4 Mon Sep 17 00:00:00 2001 From: Weiming Shi Date: Sat, 7 Mar 2026 00:01:34 +0800 Subject: net: add xmit recursion limit to tunnel xmit functions Tunnel xmit functions (iptunnel_xmit, ip6tunnel_xmit) lack their own recursion limit. When a bond device in broadcast mode has GRE tap interfaces as slaves, and those GRE tunnels route back through the bond, multicast/broadcast traffic triggers infinite recursion between bond_xmit_broadcast() and ip_tunnel_xmit()/ip6_tnl_xmit(), causing kernel stack overflow. The existing XMIT_RECURSION_LIMIT (8) in the no-qdisc path is not sufficient because tunnel recursion involves route lookups and full IP output, consuming much more stack per level. Use a lower limit of 4 (IP_TUNNEL_RECURSION_LIMIT) to prevent overflow. Add recursion detection using dev_xmit_recursion helpers directly in iptunnel_xmit() and ip6tunnel_xmit() to cover all IPv4/IPv6 tunnel paths including UDP encapsulated tunnels (VXLAN, Geneve, etc.). Move dev_xmit_recursion helpers from net/core/dev.h to public header include/linux/netdevice.h so they can be used by tunnel code. BUG: KASAN: stack-out-of-bounds in blake2s.constprop.0+0xe7/0x160 Write of size 32 at addr ffff88810033fed0 by task kworker/0:1/11 Workqueue: mld mld_ifc_work Call Trace: __build_flow_key.constprop.0 (net/ipv4/route.c:515) ip_rt_update_pmtu (net/ipv4/route.c:1073) iptunnel_xmit (net/ipv4/ip_tunnel_core.c:84) ip_tunnel_xmit (net/ipv4/ip_tunnel.c:847) gre_tap_xmit (net/ipv4/ip_gre.c:779) dev_hard_start_xmit (net/core/dev.c:3887) sch_direct_xmit (net/sched/sch_generic.c:347) __dev_queue_xmit (net/core/dev.c:4802) bond_dev_queue_xmit (drivers/net/bonding/bond_main.c:312) bond_xmit_broadcast (drivers/net/bonding/bond_main.c:5279) bond_start_xmit (drivers/net/bonding/bond_main.c:5530) dev_hard_start_xmit (net/core/dev.c:3887) __dev_queue_xmit (net/core/dev.c:4841) ip_finish_output2 (net/ipv4/ip_output.c:237) ip_output (net/ipv4/ip_output.c:438) iptunnel_xmit (net/ipv4/ip_tunnel_core.c:86) gre_tap_xmit (net/ipv4/ip_gre.c:779) dev_hard_start_xmit (net/core/dev.c:3887) sch_direct_xmit (net/sched/sch_generic.c:347) __dev_queue_xmit (net/core/dev.c:4802) bond_dev_queue_xmit (drivers/net/bonding/bond_main.c:312) bond_xmit_broadcast (drivers/net/bonding/bond_main.c:5279) bond_start_xmit (drivers/net/bonding/bond_main.c:5530) dev_hard_start_xmit (net/core/dev.c:3887) __dev_queue_xmit (net/core/dev.c:4841) ip_finish_output2 (net/ipv4/ip_output.c:237) ip_output (net/ipv4/ip_output.c:438) iptunnel_xmit (net/ipv4/ip_tunnel_core.c:86) ip_tunnel_xmit (net/ipv4/ip_tunnel.c:847) gre_tap_xmit (net/ipv4/ip_gre.c:779) dev_hard_start_xmit (net/core/dev.c:3887) sch_direct_xmit (net/sched/sch_generic.c:347) __dev_queue_xmit (net/core/dev.c:4802) bond_dev_queue_xmit (drivers/net/bonding/bond_main.c:312) bond_xmit_broadcast (drivers/net/bonding/bond_main.c:5279) bond_start_xmit (drivers/net/bonding/bond_main.c:5530) dev_hard_start_xmit (net/core/dev.c:3887) __dev_queue_xmit (net/core/dev.c:4841) mld_sendpack mld_ifc_work process_one_work worker_thread Fixes: 745e20f1b626 ("net: add a recursion limit in xmit path") Reported-by: Xiang Mei Signed-off-by: Weiming Shi Link: https://patch.msgid.link/20260306160133.3852900-2-bestswngs@gmail.com Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 67e25f6d15a4..ae269a2e7f4d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3576,17 +3576,49 @@ struct page_pool_bh { }; DECLARE_PER_CPU(struct page_pool_bh, system_page_pool); +#define XMIT_RECURSION_LIMIT 8 + #ifndef CONFIG_PREEMPT_RT static inline int dev_recursion_level(void) { return this_cpu_read(softnet_data.xmit.recursion); } + +static inline bool dev_xmit_recursion(void) +{ + return unlikely(__this_cpu_read(softnet_data.xmit.recursion) > + XMIT_RECURSION_LIMIT); +} + +static inline void dev_xmit_recursion_inc(void) +{ + __this_cpu_inc(softnet_data.xmit.recursion); +} + +static inline void dev_xmit_recursion_dec(void) +{ + __this_cpu_dec(softnet_data.xmit.recursion); +} #else static inline int dev_recursion_level(void) { return current->net_xmit.recursion; } +static inline bool dev_xmit_recursion(void) +{ + return unlikely(current->net_xmit.recursion > XMIT_RECURSION_LIMIT); +} + +static inline void dev_xmit_recursion_inc(void) +{ + current->net_xmit.recursion++; +} + +static inline void dev_xmit_recursion_dec(void) +{ + current->net_xmit.recursion--; +} #endif void __netif_schedule(struct Qdisc *q); -- cgit v1.2.3 From fa655a9ca73f7df32b8ca4d14ce11742f9578288 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 3 Mar 2026 22:31:01 +0100 Subject: nvme: Annotate struct nvme_dhchap_key with __counted_by Add the __counted_by() compiler attribute to the flexible array member 'key' to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and CONFIG_FORTIFY_SOURCE. Reviewed-by: Christoph Hellwig Signed-off-by: Thorsten Blum Signed-off-by: Keith Busch --- include/linux/nvme-auth.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h index 60e069a6757f..e75c29c51464 100644 --- a/include/linux/nvme-auth.h +++ b/include/linux/nvme-auth.h @@ -11,7 +11,7 @@ struct nvme_dhchap_key { size_t len; u8 hash; - u8 key[]; + u8 key[] __counted_by(len); }; u32 nvme_auth_get_seqnum(void); -- cgit v1.2.3 From 22fd7f7fed2ae3702f90d1985c326354e86b9c75 Mon Sep 17 00:00:00 2001 From: Muhammad Amirul Asyraf Mohamad Jamian Date: Thu, 5 Mar 2026 01:31:51 -0800 Subject: firmware: stratix10-svc: Add Multi SVC clients support In the current implementation, SVC client drivers such as socfpga-hwmon, intel_fcs, stratix10-soc, stratix10-rsu each send an SMC command that triggers a single thread in the stratix10-svc driver. Upon receiving a callback, the initiating client driver sends a stratix10-svc-done signal, terminating the thread without waiting for other pending SMC commands to complete. This leads to a timeout issue in the firmware SVC mailbox service when multiple client drivers send SMC commands concurrently. To resolve this issue, a dedicated thread is now created per channel. The stratix10-svc driver will support up to the number of channels defined by SVC_NUM_CHANNEL. Thread synchronization is handled using a mutex to prevent simultaneous issuance of SMC commands by multiple threads. SVC_NUM_DATA_IN_FIFO is reduced from 32 to 8, since each channel now has its own dedicated FIFO and the SDM processes commands one at a time. 8 entries per channel is sufficient while keeping the total aggregate capacity the same (4 channels x 8 = 32 entries). Additionally, a thread task is now validated before invoking kthread_stop when the user aborts, ensuring safe termination. Timeout values have also been adjusted to accommodate the increased load from concurrent client driver activity. Fixes: 7ca5ce896524 ("firmware: add Intel Stratix10 service layer driver") Cc: stable@vger.kernel.org Signed-off-by: Ang Tien Sung Signed-off-by: Fong, Yan Kei Signed-off-by: Muhammad Amirul Asyraf Mohamad Jamian Link: https://lore.kernel.org/all/20260305093151.2678-1-muhammad.amirul.asyraf.mohamad.jamian@altera.com Signed-off-by: Dinh Nguyen --- include/linux/firmware/intel/stratix10-svc-client.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firmware/intel/stratix10-svc-client.h b/include/linux/firmware/intel/stratix10-svc-client.h index d290060f4c73..91013161e9db 100644 --- a/include/linux/firmware/intel/stratix10-svc-client.h +++ b/include/linux/firmware/intel/stratix10-svc-client.h @@ -68,12 +68,12 @@ * timeout value used in Stratix10 FPGA manager driver. * timeout value used in RSU driver */ -#define SVC_RECONFIG_REQUEST_TIMEOUT_MS 300 -#define SVC_RECONFIG_BUFFER_TIMEOUT_MS 720 -#define SVC_RSU_REQUEST_TIMEOUT_MS 300 +#define SVC_RECONFIG_REQUEST_TIMEOUT_MS 5000 +#define SVC_RECONFIG_BUFFER_TIMEOUT_MS 5000 +#define SVC_RSU_REQUEST_TIMEOUT_MS 2000 #define SVC_FCS_REQUEST_TIMEOUT_MS 2000 #define SVC_COMPLETED_TIMEOUT_MS 30000 -#define SVC_HWMON_REQUEST_TIMEOUT_MS 300 +#define SVC_HWMON_REQUEST_TIMEOUT_MS 2000 struct stratix10_svc_chan; -- cgit v1.2.3 From b2e48c429ec54715d16fefa719dd2fbded2e65be Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Mar 2026 21:28:53 +0100 Subject: sched/mmcid: Prevent CID stalls due to concurrent forks A newly forked task is accounted as MMCID user before the task is visible in the process' thread list and the global task list. This creates the following problem: CPU1 CPU2 fork() sched_mm_cid_fork(tnew1) tnew1->mm.mm_cid_users++; tnew1->mm_cid.cid = getcid() -> preemption fork() sched_mm_cid_fork(tnew2) tnew2->mm.mm_cid_users++; // Reaches the per CPU threshold mm_cid_fixup_tasks_to_cpus() for_each_other(current, p) .... As tnew1 is not visible yet, this fails to fix up the already allocated CID of tnew1. As a consequence a subsequent schedule in might fail to acquire a (transitional) CID and the machine stalls. Move the invocation of sched_mm_cid_fork() after the new task becomes visible in the thread and the task list to prevent this. This also makes it symmetrical vs. exit() where the task is removed as CID user before the task is removed from the thread and task lists. Fixes: fbd0e71dc370 ("sched/mmcid: Provide CID ownership mode fixup functions") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Tested-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260310202525.969061974@kernel.org --- include/linux/sched.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index a7b4a980eb2f..5a5d3dbc9cdf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2354,7 +2354,6 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo #ifdef CONFIG_SCHED_MM_CID void sched_mm_cid_before_execve(struct task_struct *t); void sched_mm_cid_after_execve(struct task_struct *t); -void sched_mm_cid_fork(struct task_struct *t); void sched_mm_cid_exit(struct task_struct *t); static __always_inline int task_mm_cid(struct task_struct *t) { @@ -2363,7 +2362,6 @@ static __always_inline int task_mm_cid(struct task_struct *t) #else static inline void sched_mm_cid_before_execve(struct task_struct *t) { } static inline void sched_mm_cid_after_execve(struct task_struct *t) { } -static inline void sched_mm_cid_fork(struct task_struct *t) { } static inline void sched_mm_cid_exit(struct task_struct *t) { } static __always_inline int task_mm_cid(struct task_struct *t) { -- cgit v1.2.3 From 192d852129b1b7c4f0ddbab95d0de1efd5ee1405 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Mar 2026 21:29:09 +0100 Subject: sched/mmcid: Avoid full tasklist walks Chasing vfork()'ed tasks on a CID ownership mode switch requires a full task list walk, which is obviously expensive on large systems. Avoid that by keeping a list of tasks using a mm MMCID entity in mm::mm_cid and walk this list instead. This removes the proven to be flaky counting logic and avoids a full task list walk in the case of vfork()'ed tasks. Fixes: fbd0e71dc370 ("sched/mmcid: Provide CID ownership mode fixup functions") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Tested-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260310202526.183824481@kernel.org --- include/linux/rseq_types.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index da5fa6f40294..0b42045988db 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -133,10 +133,12 @@ struct rseq_data { }; * @active: MM CID is active for the task * @cid: The CID associated to the task either permanently or * borrowed from the CPU + * @node: Queued in the per MM MMCID list */ struct sched_mm_cid { unsigned int active; unsigned int cid; + struct hlist_node node; }; /** @@ -157,6 +159,7 @@ struct mm_cid_pcpu { * @work: Regular work to handle the affinity mode change case * @lock: Spinlock to protect against affinity setting which can't take @mutex * @mutex: Mutex to serialize forks and exits related to this mm + * @user_list: List of the MM CID users of a MM * @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map * is growth only. * @users: The number of tasks sharing this MM. Separate from mm::mm_users @@ -177,13 +180,14 @@ struct mm_mm_cid { raw_spinlock_t lock; struct mutex mutex; + struct hlist_head user_list; /* Low frequency modified */ unsigned int nr_cpus_allowed; unsigned int users; unsigned int pcpu_thrs; unsigned int update_deferred; -}____cacheline_aligned_in_smp; +} ____cacheline_aligned; #else /* CONFIG_SCHED_MM_CID */ struct mm_mm_cid { }; struct sched_mm_cid { }; -- cgit v1.2.3 From 416909962e7cdf29fd01ac523c953f37708df93d Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 17 Feb 2026 22:07:47 -0500 Subject: USB: usbcore: Introduce usb_bulk_msg_killable() The synchronous message API in usbcore (usb_control_msg(), usb_bulk_msg(), and so on) uses uninterruptible waits. However, drivers may call these routines in the context of a user thread, which means it ought to be possible to at least kill them. For this reason, introduce a new usb_bulk_msg_killable() function which behaves the same as usb_bulk_msg() except for using wait_for_completion_killable_timeout() instead of wait_for_completion_timeout(). The same can be done later for usb_control_msg() later on, if it turns out to be needed. Signed-off-by: Alan Stern Suggested-by: Oliver Neukum Link: https://lore.kernel.org/linux-usb/3acfe838-6334-4f6d-be7c-4bb01704b33d@rowland.harvard.edu/ Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") CC: stable@vger.kernel.org Link: https://patch.msgid.link/248628b4-cc83-4e81-a620-3ce4e0376d41@rowland.harvard.edu Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index fbfcc70b07fb..57ceeb02a7cb 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1868,8 +1868,9 @@ extern int usb_control_msg(struct usb_device *dev, unsigned int pipe, extern int usb_interrupt_msg(struct usb_device *usb_dev, unsigned int pipe, void *data, int len, int *actual_length, int timeout); extern int usb_bulk_msg(struct usb_device *usb_dev, unsigned int pipe, - void *data, int len, int *actual_length, - int timeout); + void *data, int len, int *actual_length, int timeout); +extern int usb_bulk_msg_killable(struct usb_device *usb_dev, unsigned int pipe, + void *data, int len, int *actual_length, int timeout); /* wrappers around usb_control_msg() for the most common standard requests */ int usb_control_msg_send(struct usb_device *dev, __u8 endpoint, __u8 request, -- cgit v1.2.3 From 1015c27a5e1a63efae2b18a9901494474b4d1dc3 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 17 Feb 2026 22:10:32 -0500 Subject: USB: core: Limit the length of unkillable synchronous timeouts The usb_control_msg(), usb_bulk_msg(), and usb_interrupt_msg() APIs in usbcore allow unlimited timeout durations. And since they use uninterruptible waits, this leaves open the possibility of hanging a task for an indefinitely long time, with no way to kill it short of unplugging the target device. To prevent this sort of problem, enforce a maximum limit on the length of these unkillable timeouts. The limit chosen here, somewhat arbitrarily, is 60 seconds. On many systems (although not all) this is short enough to avoid triggering the kernel's hung-task detector. In addition, clear up the ambiguity of negative timeout values by treating them the same as 0, i.e., using the maximum allowed timeout. Signed-off-by: Alan Stern Link: https://lore.kernel.org/linux-usb/3acfe838-6334-4f6d-be7c-4bb01704b33d@rowland.harvard.edu/ Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") CC: stable@vger.kernel.org Link: https://patch.msgid.link/15fc9773-a007-47b0-a703-df89a8cf83dd@rowland.harvard.edu Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 57ceeb02a7cb..04277af4bb9d 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1862,6 +1862,9 @@ void usb_free_noncoherent(struct usb_device *dev, size_t size, * SYNCHRONOUS CALL SUPPORT * *-------------------------------------------------------------------*/ +/* Maximum value allowed for timeout in synchronous routines below */ +#define USB_MAX_SYNCHRONOUS_TIMEOUT 60000 /* ms */ + extern int usb_control_msg(struct usb_device *dev, unsigned int pipe, __u8 request, __u8 requesttype, __u16 value, __u16 index, void *data, __u16 size, int timeout); -- cgit v1.2.3 From 9f6a983cfa22ac662c86e60816d3a357d4b551e9 Mon Sep 17 00:00:00 2001 From: Jie Deng Date: Fri, 27 Feb 2026 16:49:31 +0800 Subject: usb: core: new quirk to handle devices with zero configurations Some USB devices incorrectly report bNumConfigurations as 0 in their device descriptor, which causes the USB core to reject them during enumeration. logs: usb 1-2: device descriptor read/64, error -71 usb 1-2: no configurations usb 1-2: can't read configurations, error -22 However, these devices actually work correctly when treated as having a single configuration. Add a new quirk USB_QUIRK_FORCE_ONE_CONFIG to handle such devices. When this quirk is set, assume the device has 1 configuration instead of failing with -EINVAL. This quirk is applied to the device with VID:PID 5131:2007 which exhibits this behavior. Signed-off-by: Jie Deng Link: https://patch.msgid.link/20260227084931.1527461-1-dengjie03@kylinos.cn Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/quirks.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/quirks.h b/include/linux/usb/quirks.h index 2f7bd2fdc616..b3cc7beab4a3 100644 --- a/include/linux/usb/quirks.h +++ b/include/linux/usb/quirks.h @@ -78,4 +78,7 @@ /* skip BOS descriptor request */ #define USB_QUIRK_NO_BOS BIT(17) +/* Device claims zero configurations, forcing to 1 */ +#define USB_QUIRK_FORCE_ONE_CONFIG BIT(18) + #endif /* __LINUX_USB_QUIRKS_H */ -- cgit v1.2.3 From 96189080265e6bb5dde3a4afbaf947af493e3f82 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 9 Mar 2026 14:21:37 -0600 Subject: io_uring: ensure ctx->rings is stable for task work flags manipulation If DEFER_TASKRUN | SETUP_TASKRUN is used and task work is added while the ring is being resized, it's possible for the OR'ing of IORING_SQ_TASKRUN to happen in the small window of swapping into the new rings and the old rings being freed. Prevent this by adding a 2nd ->rings pointer, ->rings_rcu, which is protected by RCU. The task work flags manipulation is inside RCU already, and if the resize ring freeing is done post an RCU synchronize, then there's no need to add locking to the fast path of task work additions. Note: this is only done for DEFER_TASKRUN, as that's the only setup mode that supports ring resizing. If this ever changes, then they too need to use the io_ctx_mark_taskrun() helper. Link: https://lore.kernel.org/io-uring/20260309062759.482210-1-naup96721@gmail.com/ Cc: stable@vger.kernel.org Fixes: 79cfe9e59c2a ("io_uring/register: add IORING_REGISTER_RESIZE_RINGS") Reported-by: Hao-Yu Yang Suggested-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 3e4a82a6f817..dd1420bfcb73 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -388,6 +388,7 @@ struct io_ring_ctx { * regularly bounce b/w CPUs. */ struct { + struct io_rings __rcu *rings_rcu; struct llist_head work_llist; struct llist_head retry_llist; unsigned long check_cq; -- cgit v1.2.3 From 94a4b1f959989de9c54d43c3a102fb1ee92e1414 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 7 Mar 2026 17:50:53 -0300 Subject: ipv6: move the disable_ipv6_mod knob to core code From: Jakub Kicinski Make sure disable_ipv6_mod itself is not part of the IPv6 module, in case core code wants to refer to it. We will remove support for IPv6=m soon, this change helps make fixes we commit before that less messy. Link: https://patch.msgid.link/20260307-net-nd_tbl_fixes-v4-1-e2677e85628c@suse.com Signed-off-by: Jakub Kicinski --- include/linux/ipv6.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 443053a76dcf..a7421382a916 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -333,7 +333,12 @@ struct tcp6_timewait_sock { }; #if IS_ENABLED(CONFIG_IPV6) -bool ipv6_mod_enabled(void); +extern int disable_ipv6_mod; + +static inline bool ipv6_mod_enabled(void) +{ + return disable_ipv6_mod == 0; +} static inline struct ipv6_pinfo *inet6_sk(const struct sock *__sk) { -- cgit v1.2.3