From 88692f0c33a788072abfa1888b28bc6d7d7d1165 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Mon, 27 Apr 2026 13:43:15 +0200 Subject: bpf: arena: use page_ref_count() instead of page_mapped() in arena_free_pages() Pages that BPF arena code maps are allocated through bpf_map_alloc_pages(), which does not allocate folios but pages. In the future, pages will not have a mapcount, only folios will. Converting the code to use folios and rely on folio_mapped() sounds like the wrong approach. Should BPF arena code allocate folios and use folio_mapped() here? But likely we would not want to use folios here longterm, as we don't really need folio information. Hard to tell. But in the meantime, we can simply use the page refcount instead, as a heuristic whether the page might be mapped to user space and we would want to try zapping it, so we can get rid of page_mapped(). Page allocation will give us a page with a refcount of 1. Any user space mapping adds a page reference. While there can be references from other subsystems (e.g., GUP), in the common case for this test here relying on the page count is good enough. Link: https://lore.kernel.org/20260427-page_mapped-v1-2-e89c3592c74c@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Matthew Wilcox (Oracle) Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Eduard Zingerman Cc: Harry Yoo Cc: Jann Horn Cc: Jiri Olsa Cc: John Paul Adrian Glaubitz Cc: Kumar Kartikeya Dwivedi Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Martin KaFai Lau Cc: Michal Hocko Cc: Mike Rapoport Cc: Rich Felker Cc: Rik van Riel Cc: Song Liu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yonghong Song Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- kernel/bpf/arena.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 49a8f7b1beef..a497c5913bd4 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -729,7 +729,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, llist_for_each_safe(pos, t, __llist_del_all(&free_pages)) { page = llist_entry(pos, struct page, pcp_llist); - if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */ + if (page_cnt == 1 && page_ref_count(page) > 1) /* maybe mapped by user space */ /* Optimization for the common case of page_cnt==1: * If page wasn't mapped into some user vma there * is no need to call zap_pages which is slow. When -- cgit v1.2.3 From 6ae51adb084a9d87a8b9501d2231e20271dece87 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Wed, 29 Apr 2026 15:57:03 +0530 Subject: kasan: skip HW tagging for all kernel thread stacks HW-tag KASAN never checks kernel stacks because stack pointers carry the match-all tag, so setting/poisoning tags is pure overhead. - Add __GFP_SKIP_KASAN to THREADINFO_GFP so every stack allocator that uses it skips tagging (fork path plus arch users) - Add __GFP_SKIP_KASAN to GFP_VMAP_STACK for the fork-specific vmap stacks. - When reusing cached vmap stacks, skip kasan_unpoison_range() if HW tags are enabled. Software KASAN is unchanged; this only affects tag-based KASAN. Link: https://lore.kernel.org/20260429102704.680174-3-dev.jain@arm.com Signed-off-by: Muhammad Usama Anjum Signed-off-by: Dev Jain Reviewed-by: Catalin Marinas Cc: Arnd Bergmann Cc: Ben Segall Cc: David Hildenbrand (Arm) Cc: Dietmar Eggemann Cc: Ingo Molnar Cc: Juri Lelli Cc: Kees Cook Cc: K Prateek Nayak Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mathieu Desnoyers Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: "Uladzislau Rezki (Sony)" Cc: Valentin Schneider Cc: Vincent Guittot Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- kernel/fork.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8ac38beae360..ec6a120291e5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -204,7 +204,7 @@ static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); * accounting is performed by the code assigning/releasing stacks to tasks. * We need a zeroed memory without __GFP_ACCOUNT. */ -#define GFP_VMAP_STACK (GFP_KERNEL | __GFP_ZERO) +#define GFP_VMAP_STACK (GFP_KERNEL | __GFP_ZERO | __GFP_SKIP_KASAN) struct vm_stack { struct rcu_head rcu; @@ -342,7 +342,8 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) } /* Reset stack metadata. */ - kasan_unpoison_range(vm_area->addr, THREAD_SIZE); + if (!kasan_hw_tags_enabled()) + kasan_unpoison_range(vm_area->addr, THREAD_SIZE); stack = kasan_reset_tag(vm_area->addr); -- cgit v1.2.3 From 9c860d1d5d69f9cb19eb7c36573ee14065a9c85a Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Wed, 13 May 2026 12:35:13 +0000 Subject: mm: introduce for_each_free_list() Patch series "mm: misc cleanups from __GFP_UNMAPPED series". In v2 of the __GFP_UNMAPPED series [0], we realised that some of the patches could potentially be merged as independent cleanups. These are all independent of one another, if you think some are useful cleanups and others are pointless churn, it should be fine to just pick whatever subset you prefer. No functional change intended. This patch (of 4): There are a couple of places that iterate over the freelists with awareness of the data structures' layout. It seems ideally, code outside of mm should not be aware of the page allocator's freelists at all. But, this patch just doesn't hide them completely, it's just a meek incremental step in that direction: provide a macro to iterate over it without needing to be aware of the actual struct fields. Link: https://lore.kernel.org/20260513-page_alloc-unmapped-prep-v1-0-dacdf5402be8@google.com Link: https://lore.kernel.org/20260513-page_alloc-unmapped-prep-v1-1-dacdf5402be8@google.com Link: https://lore.kernel.org/all/20260320-page_alloc-unmapped-v2-0-28bf1bd54f41@google.com/ [0] Signed-off-by: Brendan Jackman Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka (SUSE) Cc: Axel Rasmussen Cc: Barry Song Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kairui Song Cc: Len Brown Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: "Rafael J. Wysocki" Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Wei Xu Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- kernel/power/snapshot.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index a564650734dc..d933b5b2c05d 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1244,8 +1244,9 @@ unsigned int snapshot_additional_pages(struct zone *zone) static void mark_free_pages(struct zone *zone) { unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT; + struct list_head *free_list; unsigned long flags; - unsigned int order, t; + unsigned int order; struct page *page; if (zone_is_empty(zone)) @@ -1269,9 +1270,8 @@ static void mark_free_pages(struct zone *zone) swsusp_unset_page_free(page); } - for_each_migratetype_order(order, t) { - list_for_each_entry(page, - &zone->free_area[order].free_list[t], buddy_list) { + for_each_free_list(free_list, zone, order) { + list_for_each_entry(page, free_list, buddy_list) { unsigned long i; pfn = page_to_pfn(page); -- cgit v1.2.3 From c13a0316aef5f4b73e8b4bf6943737f836d65e1d Mon Sep 17 00:00:00 2001 From: Youngjun Park Date: Tue, 24 Mar 2026 01:08:21 +0900 Subject: mm/swap, PM: hibernate: fix swapoff race in uswsusp by pinning swap device Patch series "mm/swap, PM: hibernate: fix swapoff race in uswsusp by pinning swap device", v8. Currently, in the uswsusp path, only the swap type value is retrieved at lookup time without holding a reference. If swapoff races after the type is acquired, subsequent slot allocations operate on a stale swap device. Additionally, grabbing and releasing the swap device reference on every slot allocation is inefficient across the entire hibernation swap path. This patch series addresses these issues: - Patch 1: Fixes the swapoff race in uswsusp by pinning the swap device from the point it is looked up until the session completes. - Patch 2: Removes the overhead of per-slot reference counting in alloc/free paths and cleans up the redundant SWP_WRITEOK check. This patch (of 2): Hibernation via uswsusp (/dev/snapshot ioctls) has a race window: after selecting the resume swap area but before user space is frozen, swapoff may run and invalidate the selected swap device. Fix this by pinning the swap device with SWP_HIBERNATION while it is in use. The pin is exclusive, which is sufficient since hibernate_acquire() already prevents concurrent hibernation sessions. The kernel swsusp path (sysfs-based hibernate/resume) uses find_hibernation_swap_type() which is not affected by the pin. It freezes user space before touching swap, so swapoff cannot race. Introduce dedicated helpers: - pin_hibernation_swap_type(): Look up and pin the swap device. Used by the uswsusp path. - find_hibernation_swap_type(): Lookup without pinning. Used by the kernel swsusp path. - unpin_hibernation_swap_type(): Clear the hibernation pin. While a swap device is pinned, swapoff is prevented from proceeding. Link: https://lore.kernel.org/20260323160822.1409904-1-youngjun.park@lge.com Link: https://lore.kernel.org/20260323160822.1409904-2-youngjun.park@lge.com Signed-off-by: Youngjun Park Reviewed-by: Kairui Song Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Kemeng Shi Cc: Nhat Pham Cc: "Rafael J . Wysocki" Signed-off-by: Andrew Morton --- kernel/power/swap.c | 2 +- kernel/power/user.c | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 2e64869bb5a0..cc4764149e8f 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -341,7 +341,7 @@ static int swsusp_swap_check(void) * This is called before saving the image. */ if (swsusp_resume_device) - res = swap_type_of(swsusp_resume_device, swsusp_resume_block); + res = find_hibernation_swap_type(swsusp_resume_device, swsusp_resume_block); else res = find_first_swap(&swsusp_resume_device); if (res < 0) diff --git a/kernel/power/user.c b/kernel/power/user.c index be77f3556bd7..d0fcfba7ac23 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -71,7 +71,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) memset(&data->handle, 0, sizeof(struct snapshot_handle)); if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { /* Hibernating. The image device should be accessible. */ - data->swap = swap_type_of(swsusp_resume_device, 0); + data->swap = pin_hibernation_swap_type(swsusp_resume_device, 0); data->mode = O_RDONLY; data->free_bitmaps = false; error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION); @@ -90,8 +90,10 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->free_bitmaps = !error; } } - if (error) + if (error) { + unpin_hibernation_swap_type(data->swap); hibernate_release(); + } data->frozen = false; data->ready = false; @@ -115,6 +117,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) data = filp->private_data; data->dev = 0; free_all_swap_pages(data->swap); + unpin_hibernation_swap_type(data->swap); if (data->frozen) { pm_restore_gfp_mask(); free_basic_memory_bitmaps(); @@ -235,11 +238,17 @@ static int snapshot_set_swap_area(struct snapshot_data *data, offset = swap_area.offset; } + /* + * Unpin the swap device if a swap area was already + * set by SNAPSHOT_SET_SWAP_AREA. + */ + unpin_hibernation_swap_type(data->swap); + /* * User space encodes device types as two-byte values, * so we need to recode them */ - data->swap = swap_type_of(swdev, offset); + data->swap = pin_hibernation_swap_type(swdev, offset); if (data->swap < 0) return swdev ? -ENODEV : -EINVAL; data->dev = swdev; -- cgit v1.2.3