From 8613803cf5d532316aa886f17066c5e5968ea21e Mon Sep 17 00:00:00 2001 From: Chengkaitao Date: Thu, 23 Apr 2026 18:14:41 +0800 Subject: mm: convert vmemmap_p?d_populate() to static functions Since the vmemmap_p?d_populate functions are unused outside the mm subsystem, we can remove their external declarations and convert them to static functions. Link: https://lore.kernel.org/20260423101441.7089-1-kaitao.cheng@linux.dev Signed-off-by: Chengkaitao Acked-by: David Hildenbrand (arm) Acked-by: Mike Rapoport (Microsoft) Acked-by: Oscar Salvador Cc: David Hildenbrand Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 06bbe9eba636..e3b6112a8d79 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4860,13 +4860,6 @@ unsigned long section_map_size(void); struct page * __populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, struct dev_pagemap *pgmap); -pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); -p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); -pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); -pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); -pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, - struct vmem_altmap *altmap, unsigned long ptpfn, - unsigned long flags); void *vmemmap_alloc_block(unsigned long size, int node); struct vmem_altmap; void *vmemmap_alloc_block_buf(unsigned long size, int node, -- cgit v1.2.3 From 4221aadd720bef7df1268391d6eb1ea1f0476b38 Mon Sep 17 00:00:00 2001 From: Bunyod Suvonov Date: Thu, 23 Apr 2026 18:37:53 +0800 Subject: mm/vmscan: add balance_pgdat begin/end tracepoints Vmscan has six main reclaim entry points: try_to_free_pages() for direct reclaim, try_to_free_mem_cgroup_pages() for memcg reclaim, mem_cgroup_shrink_node() for memcg soft limit reclaim, node_reclaim() for node reclaim, shrink_all_memory() for hibernation reclaim, and balance_pgdat() for kswapd reclaim. All of them, except for shrink_all_memory() and balance_pgdat(), already have begin/end tracepoints. This makes it harder to trace which reclaim path is responsible for memory reclaim activity, because kswapd reclaim cannot be identified as cleanly as other reclaim entry points, even though it is the main background reclaim path under memory pressure. There may be no need to trace shrink_all_memory() as it is primarily used during hibernation. So this patch adds the missing tracepoint pair for balance_pgdat(). The begin tracepoint records the node id, requested reclaim order, and the requested classzone bound (highest_zoneidx). The end tracepoint records the node id, the reclaim order that balance_pgdat() finished with, the requested classzone bound, and nr_reclaimed. Together, they show the requested reclaim order and classzone bound, whether reclaim fell back to a lower order, and how much reclaim work was done. The end tracepoint also records highest_zoneidx even though it does not change within a balance_pgdat() invocation. This keeps the end event self-contained, so users can analyze reclaim results directly from end events without depending on begin/end correlation, which is less convenient when tracing is filtered or records are dropped. It also makes it straightforward to relate nr_reclaimed and the final reclaim order to the requested classzone bound. Link: https://lore.kernel.org/20260424031418.174597-1-b.suvonov@sjtu.edu.cn Link: https://lore.kernel.org/20260423103753.546582-1-b.suvonov@sjtu.edu.cn Signed-off-by: Bunyod Suvonov Acked-by: Shakeel Butt Cc: Axel Rasmussen Cc: Barry Song Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kairui Song Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Qi Zheng Cc: Steven Rostedt Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/trace/events/vmscan.h | 52 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'include') diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 4445a8d9218d..b4bf7b8def1f 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -96,6 +96,58 @@ TRACE_EVENT(mm_vmscan_kswapd_wake, __entry->order) ); +TRACE_EVENT(mm_vmscan_balance_pgdat_begin, + + TP_PROTO(int nid, int order, int highest_zoneidx), + + TP_ARGS(nid, order, highest_zoneidx), + + TP_STRUCT__entry( + __field(int, nid) + __field(int, order) + __field(int, highest_zoneidx) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->order = order; + __entry->highest_zoneidx = highest_zoneidx; + ), + + TP_printk("nid=%d order=%d highest_zoneidx=%-8s", + __entry->nid, + __entry->order, + __print_symbolic(__entry->highest_zoneidx, ZONE_TYPE)) +); + +TRACE_EVENT(mm_vmscan_balance_pgdat_end, + + TP_PROTO(int nid, int order, int highest_zoneidx, + unsigned long nr_reclaimed), + + TP_ARGS(nid, order, highest_zoneidx, nr_reclaimed), + + TP_STRUCT__entry( + __field(int, nid) + __field(int, order) + __field(int, highest_zoneidx) + __field(unsigned long, nr_reclaimed) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->order = order; + __entry->highest_zoneidx = highest_zoneidx; + __entry->nr_reclaimed = nr_reclaimed; + ), + + TP_printk("nid=%d order=%d highest_zoneidx=%-8s nr_reclaimed=%lu", + __entry->nid, + __entry->order, + __print_symbolic(__entry->highest_zoneidx, ZONE_TYPE), + __entry->nr_reclaimed) +); + TRACE_EVENT(mm_vmscan_wakeup_kswapd, TP_PROTO(int nid, int zid, int order, gfp_t gfp_flags), -- cgit v1.2.3 From 4aa4abf1f14bd6d0748b7d35a803cc2376a8e20b Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 1 Apr 2026 11:16:19 +0100 Subject: mm/page_alloc: optimize free_contig_range() Patch series "mm: Free contiguous order-0 pages efficiently", v6. A recent change to vmalloc caused some performance benchmark regressions (see [1]). I'm attempting to fix that (and at the same time significantly improve beyond the baseline) by freeing a contiguous set of order-0 pages as a batch. At the same time I observed that free_contig_range() was essentially doing the same thing as vfree() so I've fixed it there too. While at it, optimize the __free_contig_frozen_range() as well. Check that the contiguous range falls in the same section. If they aren't enabled, the if conditions get optimized out by the compiler as memdesc_section() returns 0. See num_pages_contiguous() for more details about it. This patch (of 3): Decompose the range of order-0 pages to be freed into the set of largest possible power-of-2 size and aligned chunks and free them to the pcp or buddy. This improves on the previous approach which freed each order-0 page individually in a loop. Testing shows performance to be improved by more than 10x in some cases. Since each page is order-0, we must decrement each page's reference count individually and only consider the page for freeing as part of a high order chunk if the reference count goes to zero. Additionally free_pages_prepare() must be called for each individual order-0 page too, so that the struct page state and global accounting state can be appropriately managed. But once this is done, the resulting high order chunks can be freed as a unit to the pcp or buddy. This significantly speeds up the free operation but also has the side benefit that high order blocks are added to the pcp instead of each page ending up on the pcp order-0 list; memory remains more readily available in high orders. vmalloc will shortly become a user of this new optimized free_contig_range() since it aggressively allocates high order non-compound pages, but then calls split_page() to end up with contiguous order-0 pages. These can now be freed much more efficiently. The execution time of the following function was measured in a server class arm64 machine: static int page_alloc_high_order_test(void) { unsigned int order = HPAGE_PMD_ORDER; struct page *page; int i; for (i = 0; i < 100000; i++) { page = alloc_pages(GFP_KERNEL, order); if (!page) return -1; split_page(page, order); free_contig_range(page_to_pfn(page), 1UL << order); } return 0; } Execution time before: 4097358 usec Execution time after: 729831 usec Perf trace before: 99.63% 0.00% kthreadd [kernel.kallsyms] [.] kthread | ---kthread 0xffffb33c12a26af8 | |--98.13%--0xffffb33c12a26060 | | | |--97.37%--free_contig_range | | | | | |--94.93%--___free_pages | | | | | | | |--55.42%--__free_frozen_pages | | | | | | | | | --43.20%--free_frozen_page_commit | | | | | | | | | --35.37%--_raw_spin_unlock_irqrestore | | | | | | | |--11.53%--_raw_spin_trylock | | | | | | | |--8.19%--__preempt_count_dec_and_test | | | | | | | |--5.64%--_raw_spin_unlock | | | | | | | |--2.37%--__get_pfnblock_flags_mask.isra.0 | | | | | | | --1.07%--free_frozen_page_commit | | | | | --1.54%--__free_frozen_pages | | | --0.77%--___free_pages | --0.98%--0xffffb33c12a26078 alloc_pages_noprof Perf trace after: 8.42% 2.90% kthreadd [kernel.kallsyms] [k] __free_contig_range | |--5.52%--__free_contig_range | | | |--5.00%--free_prepared_contig_range | | | | | |--1.43%--__free_frozen_pages | | | | | | | --0.51%--free_frozen_page_commit | | | | | |--1.08%--_raw_spin_trylock | | | | | --0.89%--_raw_spin_unlock | | | --0.52%--free_pages_prepare | --2.90%--ret_from_fork kthread 0xffffae1c12abeaf8 0xffffae1c12abe7a0 | --2.69%--vfree __free_contig_range Link: https://lore.kernel.org/20260401101634.2868165-1-usama.anjum@arm.com Link: https://lore.kernel.org/20260401101634.2868165-2-usama.anjum@arm.com Link: https://lore.kernel.org/all/66919a28-bc81-49c9-b68f-dd7c73395a0d@arm.com [1] Signed-off-by: Ryan Roberts Co-developed-by: Muhammad Usama Anjum Signed-off-by: Muhammad Usama Anjum Acked-by: David Hildenbrand (Arm) Acked-by: Vlastimil Babka (SUSE) Reviewed-by: Zi Yan Cc: Brendan Jackman Cc: David Sterba Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Nick Terrell Cc: Suren Baghdasaryan Cc: "Uladzislau Rezki (Sony)" Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/gfp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 51ef13ed756e..87259e309dee 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -467,6 +467,8 @@ void free_contig_frozen_range(unsigned long pfn, unsigned long nr_pages); void free_contig_range(unsigned long pfn, unsigned long nr_pages); #endif +void __free_contig_range(unsigned long pfn, unsigned long nr_pages); + DEFINE_FREE(free_page, void *, free_page((unsigned long)_T)) #endif /* __LINUX_GFP_H */ -- cgit v1.2.3 From 60ced5818f64ac356620d1ad3e0d473c457dbf5b Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 1 Apr 2026 11:16:20 +0100 Subject: vmalloc: optimize vfree with free_pages_bulk() Whenever vmalloc allocates high order pages (e.g. for a huge mapping) it must immediately split_page() to order-0 so that it remains compatible with users that want to access the underlying struct page. Commit a06157804399 ("mm/vmalloc: request large order pages from buddy allocator") recently made it much more likely for vmalloc to allocate high order pages which are subsequently split to order-0. Unfortunately this had the side effect of causing performance regressions for tight vmalloc/vfree loops (e.g. test_vmalloc.ko benchmarks). See Closes: tag. This happens because the high order pages must be gotten from the buddy but then because they are split to order-0, when they are freed they are freed to the order-0 pcp. Previously allocation was for order-0 pages so they were recycled from the pcp. It would be preferable if when vmalloc allocates an (e.g.) order-3 page that it also frees that order-3 page to the order-3 pcp, then the regression could be removed. So let's do exactly that; update stats separately first as coalescing is hard to do correctly without complexity. Use free_pages_bulk() which uses the new __free_contig_range() API to batch-free contiguous ranges of pfns. This not only removes the regression, but significantly improves performance of vfree beyond the baseline. A selection of test_vmalloc benchmarks running on arm64 server class system. mm-new is the baseline. Commit a06157804399 ("mm/vmalloc: request large order pages from buddy allocator") was added in v6.19-rc1 where we see regressions. Then with this change performance is much better. (>0 is faster, <0 is slower, (R)/(I) = statistically significant Regression/Improvement): +-----------------+----------------------------------------------------------+-------------------+--------------------+ | Benchmark | Result Class | mm-new | this series | +=================+==========================================================+===================+====================+ | micromm/vmalloc | fix_align_alloc_test: p:1, h:0, l:500000 (usec) | 1331843.33 | (I) 67.17% | | | fix_size_alloc_test: p:1, h:0, l:500000 (usec) | 415907.33 | -5.14% | | | fix_size_alloc_test: p:4, h:0, l:500000 (usec) | 755448.00 | (I) 53.55% | | | fix_size_alloc_test: p:16, h:0, l:500000 (usec) | 1591331.33 | (I) 57.26% | | | fix_size_alloc_test: p:16, h:1, l:500000 (usec) | 1594345.67 | (I) 68.46% | | | fix_size_alloc_test: p:64, h:0, l:100000 (usec) | 1071826.00 | (I) 79.27% | | | fix_size_alloc_test: p:64, h:1, l:100000 (usec) | 1018385.00 | (I) 84.17% | | | fix_size_alloc_test: p:256, h:0, l:100000 (usec) | 3970899.67 | (I) 77.01% | | | fix_size_alloc_test: p:256, h:1, l:100000 (usec) | 3821788.67 | (I) 89.44% | | | fix_size_alloc_test: p:512, h:0, l:100000 (usec) | 7795968.00 | (I) 82.67% | | | fix_size_alloc_test: p:512, h:1, l:100000 (usec) | 6530169.67 | (I) 118.09% | | | full_fit_alloc_test: p:1, h:0, l:500000 (usec) | 626808.33 | -0.98% | | | kvfree_rcu_1_arg_vmalloc_test: p:1, h:0, l:500000 (usec) | 532145.67 | -1.68% | | | kvfree_rcu_2_arg_vmalloc_test: p:1, h:0, l:500000 (usec) | 537032.67 | -0.96% | | | long_busy_list_alloc_test: p:1, h:0, l:500000 (usec) | 8805069.00 | (I) 74.58% | | | pcpu_alloc_test: p:1, h:0, l:500000 (usec) | 500824.67 | 4.35% | | | random_size_align_alloc_test: p:1, h:0, l:500000 (usec) | 1637554.67 | (I) 76.99% | | | random_size_alloc_test: p:1, h:0, l:500000 (usec) | 4556288.67 | (I) 72.23% | | | vm_map_ram_test: p:1, h:0, l:500000 (usec) | 107371.00 | -0.70% | +-----------------+----------------------------------------------------------+-------------------+--------------------+ Link: https://lore.kernel.org/20260401101634.2868165-3-usama.anjum@arm.com Fixes: a06157804399 ("mm/vmalloc: request large order pages from buddy allocator") Closes: https://lore.kernel.org/all/66919a28-bc81-49c9-b68f-dd7c73395a0d@arm.com/ Signed-off-by: Ryan Roberts Co-developed-by: Muhammad Usama Anjum Signed-off-by: Muhammad Usama Anjum Acked-by: Vlastimil Babka (SUSE) Acked-by: Zi Yan Acked-by: David Hildenbrand (Arm) Reviewed-by: Uladzislau Rezki (Sony) Cc: Brendan Jackman Cc: David Sterba Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Nick Terrell Cc: Suren Baghdasaryan Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/gfp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 87259e309dee..cdf95a9f0b87 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -239,6 +239,8 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, struct page **page_array); #define __alloc_pages_bulk(...) alloc_hooks(alloc_pages_bulk_noprof(__VA_ARGS__)) +void free_pages_bulk(struct page **page_array, unsigned long nr_pages); + unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp, unsigned long nr_pages, struct page **page_array); -- cgit v1.2.3 From 9138e27a3bc380cd88475546688f23d5eda1ad23 Mon Sep 17 00:00:00 2001 From: Ravi Jonnalagadda Date: Mon, 27 Apr 2026 20:05:20 -0700 Subject: mm/damon: add node_eligible_mem_bp goal metric Background and Motivation ========================= In heterogeneous memory systems, controlling memory distribution across NUMA nodes is essential for performance optimization. This patch enables system-wide page distribution with target-state goals such as "maintain 60% of scheme-eligible memory on DRAM" using PA-mode DAMON schemes. Rather than using absolute thresholds, this metric tracks the ratio of memory that matches each scheme's access pattern filters on a target node, enabling the quota system to automatically adjust migration aggressiveness to maintain the desired distribution. What This Metric Measures ========================= node_eligible_mem_bp: scheme_eligible_bytes_on_node / total_scheme_eligible_bytes * 10000 Two-Scheme Setup for Hot Page Distribution ========================================== For maintaining 60% of hot memory on DRAM (node 0) and 40% on CXL (node 1): PULL scheme: migrate_hot to node 0 goal: node_eligible_mem_bp, nid=0, target=6000 addr filter: node 1 address range (only migrate FROM CXL) "Move hot pages to DRAM if less than 60% of hot data is in DRAM" PUSH scheme: migrate_hot to node 1 goal: node_eligible_mem_bp, nid=1, target=4000 addr filter: node 0 address range (only migrate FROM DRAM) "Move hot pages to CXL if less than 40% of hot data is in CXL" Each scheme independently measures its own eligible memory and adjusts its quota to achieve its target ratio. The schemes work in concert through DAMON's unified monitoring context, with the quota autotuner balancing their relative aggressiveness. Implementation Details ====================== The implementation adds a new quota goal metric type DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP to the existing DAMOS quota goal framework. When this metric is configured for a scheme: 1. During each quota adjustment cycle, damos_get_node_eligible_mem_bp() is called to calculate the current memory distribution. 2. The function iterates through all regions that match the scheme's access pattern (via __damos_valid_target()) and calculates: - Total eligible bytes across all nodes - Eligible bytes specifically on the target node (goal->nid) 3. For each eligible region, damos_calc_eligible_bytes() walks through the physical address range, using damon_get_folio() to look up each folio and determine its NUMA node via folio_nid(). 4. Large folios are handled by calculating the exact overlap between the region boundaries and folio boundaries, ensuring accurate byte counts even when regions partially span folios. 5. The ratio (node_eligible / total_eligible * 10000) is returned as basis points, which the quota autotuner uses to adjust the scheme's effective quota size (esz). The implementation requires CONFIG_DAMON_PADDR since damon_get_folio() is only available for physical address space monitoring. Testing Results =============== Functionally tested on a two-node heterogeneous memory system with DRAM (node 0) and CXL memory (node 1). A PUSH+PULL scheme configuration using migrate_hot actions was used to reach a target hot memory ratio between the two tiers. With the TEMPORAL tuner, the system converges quickly to the target distribution. The tuner drives esz to maximum when under goal and to zero once the goal is met, forming a simple on/off feedback loop that stabilizes at the desired ratio. With the CONSIST tuner, the scheme still converges but more slowly, as it migrates and then throttles itself based on quota feedback. The time to reach the goal varies depending on workload intensity. Note: This metric works with both TEMPORAL and CONSIST goal tuners. Link: https://lore.kernel.org/20260428030520.701-1-ravis.opensrc@gmail.com Signed-off-by: Ravi Jonnalagadda Suggested-by: SeongJae Park Reviewed-by: SeongJae Park Cc: Honggyu Kim Cc: Jonathan Corbet Cc: Yunjeong Mun Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index f2cdb7c3f5e6..986b8c902585 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -159,6 +159,8 @@ enum damos_action { * @DAMOS_QUOTA_NODE_MEMCG_FREE_BP: MemFree ratio of a node for a cgroup. * @DAMOS_QUOTA_ACTIVE_MEM_BP: Active to total LRU memory ratio. * @DAMOS_QUOTA_INACTIVE_MEM_BP: Inactive to total LRU memory ratio. + * @DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP: Scheme-eligible memory ratio of a + * node in basis points (0-10000). * @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics. * * Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported. @@ -172,6 +174,7 @@ enum damos_quota_goal_metric { DAMOS_QUOTA_NODE_MEMCG_FREE_BP, DAMOS_QUOTA_ACTIVE_MEM_BP, DAMOS_QUOTA_INACTIVE_MEM_BP, + DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP, NR_DAMOS_QUOTA_GOAL_METRICS, }; -- cgit v1.2.3 From 4ee4fb3214a8aadf5e8d253f8a34b76baff7f37d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 27 Apr 2026 18:33:52 -0700 Subject: mm/damon/core: introduce failed region quota charge ratio DAMOS quota is charged to all DAMOS action application attempted memory, regardless of how much of the memory the action was successful and failed. This makes understanding quota behavior without DAMOS stat but only with end level metrics (e.g., increased amount of free memory for DAMOS_PAGEOUT action) difficult. Also, charging action-failed memory same as action-successful memory is somewhat unfair, as successful action application will induce more overhead in most cases. Introduce DAMON core API for setting the charge ratio for such action-failed memory. It allows API callers to specify the ratio in a flexible way, by setting the numerator and the denominator. Link: https://lore.kernel.org/20260428013402.115171-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 986b8c902585..2bb43910e22e 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -236,6 +236,8 @@ enum damos_quota_goal_tuner { * @goals: Head of quota tuning goals (&damos_quota_goal) list. * @goal_tuner: Goal-based @esz tuning algorithm to use. * @esz: Effective size quota in bytes. + * @fail_charge_num: Failed regions charge rate numerator. + * @fail_charge_denom: Failed regions charge rate denominator. * * @weight_sz: Weight of the region's size for prioritization. * @weight_nr_accesses: Weight of the region's nr_accesses for prioritization. @@ -265,6 +267,10 @@ enum damos_quota_goal_tuner { * * The resulting effective size quota in bytes is set to @esz. * + * For DAMOS action applying failed amount of regions, charging those same to + * those that the action has successfully applied may be unfair. For the + * reason, 'the size * @fail_charge_num / @fail_charge_denom' is charged. + * * For selecting regions within the quota, DAMON prioritizes current scheme's * target memory regions using the &struct damon_operations->get_scheme_score. * You could customize the prioritization logic by setting &weight_sz, @@ -279,6 +285,9 @@ struct damos_quota { enum damos_quota_goal_tuner goal_tuner; unsigned long esz; + unsigned int fail_charge_num; + unsigned int fail_charge_denom; + unsigned int weight_sz; unsigned int weight_nr_accesses; unsigned int weight_age; -- cgit v1.2.3 From ffe55393137c01aa01940b528afcea8c5a108ed7 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 10 Apr 2026 17:24:19 +0800 Subject: mm/sparse: remove sparse buffer pre-allocation mechanism Commit 9bdac9142407 ("sparsemem: Put mem map for one node together.") introduced a mechanism to pre-allocate a large memory block to hold all memmaps for a NUMA node upfront. However, the original commit message did not clearly state the actual benefits or the necessity of explicitly pre-allocating a single chunk for all memmap areas of a given node. One of the concerns about removing this pre-allocation is that the subsequent per-section memmap allocations could become scattered around, and might turn too many memory blocks/sections into an "un-offlinable" state. However, tests show that even without the explicit node-wide pre-allocation, memblock still allocates memory closely and back-to-back. When tracing vmemmap_set_pmd allocations, the physical chunks allocated by memblock are strictly adjacent to each other in a single contiguous physical range (mapped top-down). Because they are packed tightly together naturally, they will at most consume or pollute the exact same number of memory blocks as the explicit pre-allocation did. Another concern is the boot performance impact of calling memmap_alloc() multiple times compared to one large node-wide allocation. Tests on a 256GB VM showed that memmap allocation time increased from 199,555 ns to 741,292 ns. Even though it is 3.7x slower, on a 1TB machine, the entire memory allocation time would only take a few milliseconds. This boot performance difference is completely negligible. Since no negative impact on memory offlining behavior or noticeable boot performance regression was found, this patch proposes removing the explicit node-wide memmap pre-allocation mechanism to reduce the maintenance burden. Link: https://lore.kernel.org/20260410092419.2446420-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Mike Rapoport (Microsoft) Acked-by: David Hildenbrand (Arm) Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index e3b6112a8d79..8a0078a4dc78 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4855,7 +4855,6 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) } #endif -void *sparse_buffer_alloc(unsigned long size); unsigned long section_map_size(void); struct page * __populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, -- cgit v1.2.3 From f2a950170f7a78761c2b2e5e535716fb0f8c0813 Mon Sep 17 00:00:00 2001 From: "JP Kobryn (Meta)" Date: Mon, 6 Apr 2026 12:50:14 -0700 Subject: mm/vmpressure: skip socket pressure for costly order reclaim When reclaim is triggered by high order allocations on a fragmented system, vmpressure() can report poor reclaim efficiency even though the system has plenty of free memory. This is because many pages are scanned, but few are found to actually reclaim - the pages are actively in use and don't need to be freed. The resulting scan:reclaim ratio causes vmpressure() to assert socket pressure, throttling TCP throughput unnecessarily. Costly order allocations (above PAGE_ALLOC_COSTLY_ORDER) rely heavily on compaction to succeed, so poor reclaim efficiency at these orders does not necessarily indicate memory pressure. The kernel already treats this order as the boundary where reclaim is no longer expected to succeed and compaction may take over. Make vmpressure() order-aware through an additional parameter sourced from scan_control at existing call sites. Socket pressure is now only asserted when order <= PAGE_ALLOC_COSTLY_ORDER. Memcg reclaim is unaffected since try_to_free_mem_cgroup_pages() always uses order 0, which passes the filter unconditionally. Similarly, vmpressure_prio() now passes order 0 internally when calling vmpressure(), ensuring critical pressure from low reclaim priority is not suppressed by the order filter. The patch was motivated by a case of impacted net throughput in production. On one affected host, the memory state at the time showed ~15GB available, zero cgroup pressure, and the following buddyinfo state: Order FreePages 0: 133,970 1: 29,230 2: 17,351 3: 18,984 7+: 0 Using bpf, it was found that 94% of vmpressure calls on this host were from order-7 kswapd reclaim. TCP minimum recv window is rcv_ssthresh:19712. Before patch: 723 out of 3,843 (19%) TCP connections stuck at minimum recv window After live-patching and ~30min elapsed: 0 out of 3,470 TCP connections stuck at minimum recv window Link: https://lore.kernel.org/20260406195014.112521-1-jp.kobryn@linux.dev Signed-off-by: JP Kobryn (Meta) Reviewed-by: Rik van Riel Acked-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Jakub Kicinski Reviewed-by: Barry Song Acked-by: Vlastimil Babka (SUSE) Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Eric Dumazet Cc: Kairui Song Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Qi Zheng Cc: Suren Baghdasaryan Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/vmpressure.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index 6a2f51ebbfd3..faecd5522401 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h @@ -30,8 +30,8 @@ struct vmpressure { struct mem_cgroup; #ifdef CONFIG_MEMCG -extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, - unsigned long scanned, unsigned long reclaimed); +void vmpressure(gfp_t gfp, int order, struct mem_cgroup *memcg, bool tree, + unsigned long scanned, unsigned long reclaimed); extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio); extern void vmpressure_init(struct vmpressure *vmpr); @@ -44,8 +44,9 @@ extern int vmpressure_register_event(struct mem_cgroup *memcg, extern void vmpressure_unregister_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd); #else -static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, - unsigned long scanned, unsigned long reclaimed) {} +static inline void vmpressure(gfp_t gfp, int order, struct mem_cgroup *memcg, + bool tree, unsigned long scanned, + unsigned long reclaimed) {} static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) {} #endif /* CONFIG_MEMCG */ -- cgit v1.2.3 From 3bbc54dd1b62f1a4b218c70aafbeceeba7c90c5d Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 28 Apr 2026 16:18:52 +0800 Subject: mm/sparse-vmemmap: pass @pgmap argument to memory deactivation paths Currently, the memory hot-remove call chain -- arch_remove_memory(), __remove_pages(), sparse_remove_section() and section_deactivate() -- does not carry the struct dev_pagemap pointer. This prevents the lower levels from knowing whether the section was originally populated with vmemmap optimizations (e.g., DAX with vmemmap optimization enabled). Without this information, we cannot call vmemmap_can_optimize() to determine if the vmemmap pages were optimized. As a result, the vmemmap page accounting during teardown will mistakenly assume a non-optimized allocation, leading to incorrect memmap statistics. To lay the groundwork for fixing the vmemmap page accounting, we need to pass the @pgmap pointer down to the deactivation location. Plumb the @pgmap argument through the APIs of arch_remove_memory(), __remove_pages() and sparse_remove_section(), mirroring the corresponding *_activate() paths. Link: https://lore.kernel.org/20260428081855.1249045-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Oscar Salvador Acked-by: David Hildenbrand (Arm) Acked-by: Liam R. Howlett Cc: "Aneesh Kumar K.V" Cc: Joao Martins Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Nicholas Piggin Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 815e908c4135..7c9d66729c60 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -135,9 +135,10 @@ static inline bool movable_node_is_enabled(void) return movable_node_enabled; } -extern void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap); +extern void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap); extern void __remove_pages(unsigned long start_pfn, unsigned long nr_pages, - struct vmem_altmap *altmap); + struct vmem_altmap *altmap, struct dev_pagemap *pgmap); /* reasonably generic interface to expand the physical pages */ extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, @@ -307,7 +308,8 @@ extern int sparse_add_section(int nid, unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap, struct dev_pagemap *pgmap); extern void sparse_remove_section(unsigned long pfn, unsigned long nr_pages, - struct vmem_altmap *altmap); + struct vmem_altmap *altmap, + struct dev_pagemap *pgmap); extern struct zone *zone_for_pfn_range(enum mmop online_type, int nid, struct memory_group *group, unsigned long start_pfn, unsigned long nr_pages); -- cgit v1.2.3 From 58996503b631adc6a268a42f4624a34513c16199 Mon Sep 17 00:00:00 2001 From: Asier Gutierrez Date: Sun, 26 Apr 2026 16:16:17 -0700 Subject: mm/damon: support MADV_COLLAPSE via DAMOS_COLLAPSE scheme action This patch set introces a new action: DAMOS_COLLAPSE. For DAMOS_HUGEPAGE and DAMOS_NOHUGEPAGE to work, khugepaged should be working, since it relies on hugepage_madvise to add a new slot. This slot should be picked up by khugepaged and eventually collapse (or not, if we are using DAMOS_NOHUGEPAGE) the pages. If THP is not enabled, khugepaged will not be working, and therefore no collapse will happen. DAMOS_COLLAPSE eventually calls madvise_collapse, which will collapse the address range synchronously. In cases where there is a large VMA (databases, for example), DAMOS_COLLAPSE allows us to collapse only the hot region, and not the entire VMA. This new action may be required to support autotuning with hugepage as a goal[1]. ========= Benchmarks: ========= MySQL ===== Tests were performed in an ARM physical server with MariaDB 10.5 and sysbench. Read only benchmark was perform with gaussian row hitting, which follows a normal distribution. T n, D h: THP set to never, DAMON action set to hugepage T m, D h: THP set to madvise, DAMON action set to hugepage T n, D c: THP set to never, DAMON action set to collapse Memory consumption. Lower is better. +------------------+----------+----------+----------+ | | T n, D h | T m, D h | T n, D c | +------------------+----------+----------+----------+ | Total memory use | 2.13 | 2.20 | 2.20 | | Huge pages | 0 | 1.3 | 1.27 | +------------------+----------+----------+----------+ Performance in TPS (Transactions Per Second). Higher is better. T n, D h: 18225.58 T m, D h 18252.93 T n, D c: 18270.21 Performance counter I got the number of L1 D/I TLB accesses and the number a D/I TLB accesses that triggered a page walk. I divided the second by the first to get the percentage of page walkes per TLB access. The lower the better. +---------------+--------------+--------------+--------------+ | | T n, D h | T m, D h | T n, D c | +---------------+--------------+--------------+--------------+ | L1 DTLB | 127248242753 | 125431020479 | 125327001821 | | L1 ITLB | 80332558619 | 79346759071 | 79298139590 | | DTLB walk | 75011087 | 52800418 | 55895794 | | ITLB walk | 71577076 | 71505137 | 67262140 | | DTLB % misses | 0.058948623 | 0.042095183 | 0.044599961 | | ITLB % misses | 0.089100954 | 0.090117275 | 0.084821839 | +---------------+--------------+--------------+--------------+ Masim ===== I used masim with the "demo" configuration, but changing the times to 100 seconds for the initial phase and 50 seconds for the rest of the phases. Memory consumption: +------------------+----------+----------+----------+ | | T n, D h | T m, D h | T n, D c | +------------------+----------+----------+----------+ | Total memory use | 2.38 GB | 2.36 GB | 2.37 GB | | Huge pages | 0 | 190 MB | 188 MB | +------------------+----------+----------+----------+ Performance: THP never, DAMOS_HUGEPAGE initial phase: 40,491 accesses/msec, 100001 msecs run low phase 0: 39,658 accesses/msec, 50002 msecs run high phase 0: 41,678 accesses/msec, 50000 msecs run low phase 1: 39,625 accesses/msec, 50003 msecs run high phase 1: 41,658 accesses/msec, 50002 msecs run low phase 2: 39,642 accesses/msec, 50002 msecs run high phase 2: 41,640 accesses/msec, 50001 msecs run THP madvise, DAMOS_HUGEPAGE initial phase: 51,977 accesses/msec, 100000 msecs run low phase 0: 86,953 accesses/msec, 50000 msecs run high phase 0: 94,812 accesses/msec, 50000 msecs run low phase 1: 101,017 accesses/msec, 50000 msecs run high phase 1: 94,841 accesses/msec, 50000 msecs run low phase 2: 100,993 accesses/msec, 50000 msecs run high phase 2: 94,791 accesses/msec, 50001 msecs run THP never, DAMOS_COLLAPSE initial phase: 93,678 accesses/msec, 100001 msecs run low phase 0: 101,475 accesses/msec, 50000 msecs run high phase 0: 98,589 accesses/msec, 50000 msecs run low phase 1: 101,531 accesses/msec, 50001 msecs run high phase 1: 98,506 accesses/msec, 50001 msecs run low phase 2: 101,458 accesses/msec, 50001 msecs run high phase 2: 98,555 accesses/msec, 50000 msecs run Memory consumption dynamic (how quickly collapses occur): It shows in seconds how many huge pages are allocated. +----+----------+----------+ | | T m, D h | T n, D c | +----+----------+----------+ | 5 | 32 | 188 | | 10 | 48 | 188 | | 15 | 64 | 188 | | 20 | 96 | 188 | | 30 | 112 | 188 | | 35 | 144 | 188 | | 40 | 160 | 188 | | 45 | 190 | 188 | | 50 | 190 | 188 | | 55 | 190 | 188 | | 60 | 190 | 188 | +----+----------+----------+ ========= - We can see that DAMOS "hugepage" action works only when THP is set to madvise. "collapse" action works even when THP is set to never. - Performance for "collapse" action is slightly lower than "hugepage" action and THP madvise. This is due to the fact that collapases occur synchronously. With "hugepage" they may occur during page faults. - Memory consumption is slighly lower for "collapse" than "hugepage" with THP madvise. This is due to the khugepage collapses all VMAs, while "collapse" action only collapses the VMAs in the hot region. - There is an improvement in TLB utilization when collapse through "hugepage" or "collapse" actions are triggered. The amount of TLB misses is lower. - "collapse" action is performance synchronously, which means that page collapses happen earlier and more rapidly. This can be useful or not, depending on the scenario. - "hugepage" action may trigger a VMA split in some scenarios, since it needs to change the flag of the VMA to THP enabled. This may lead to additional overhead. Collapse action just adds a new option to chose the correct system balance. Link: https://lore.kernel.org/20260426231619.107231-5-sj@kernel.org Link: https://lore.kernel.org/damon/20260313000816.79933-1-sj@kernel.org/ [1] Signed-off-by: Asier Gutierrez Signed-off-by: SeongJae Park Reviewed-by: SeongJae Park Cc: Cheng-Han Wu Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Liew Rui Yan Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 2bb43910e22e..d3a231275c23 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -121,6 +121,7 @@ struct damon_target { * @DAMOS_PAGEOUT: Reclaim the region. * @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE. * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. + * @DAMOS_COLLAPSE: Call ``madvise()`` for the region with MADV_COLLAPSE. * @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists. * @DAMOS_LRU_DEPRIO: Deprioritize the region on its LRU lists. * @DAMOS_MIGRATE_HOT: Migrate the regions prioritizing warmer regions. @@ -140,6 +141,7 @@ enum damos_action { DAMOS_PAGEOUT, DAMOS_HUGEPAGE, DAMOS_NOHUGEPAGE, + DAMOS_COLLAPSE, DAMOS_LRU_PRIO, DAMOS_LRU_DEPRIO, DAMOS_MIGRATE_HOT, -- cgit v1.2.3 From 90f01f5d6ba57d93363289b3247314b7fd5e8d49 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Mon, 27 Apr 2026 13:43:16 +0200 Subject: mm: remove page_mapped() Let's replace the last user of page_mapped() by folio_mapped() so we can get rid of page_mapped(). Replace the remaining occurrences of page_mapped() in rmap documentation by folio_mapped(). Link: https://lore.kernel.org/20260427-page_mapped-v1-3-e89c3592c74c@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Matthew Wilcox (Oracle) Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Eduard Zingerman Cc: Harry Yoo Cc: Jann Horn Cc: Jiri Olsa Cc: John Paul Adrian Glaubitz Cc: Kumar Kartikeya Dwivedi Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Martin KaFai Lau Cc: Michal Hocko Cc: Mike Rapoport Cc: Rich Felker Cc: Rik van Riel Cc: Song Liu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yonghong Song Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8a0078a4dc78..9cedc5e75aa9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1888,16 +1888,6 @@ static inline bool folio_mapped(const struct folio *folio) return folio_mapcount(folio) >= 1; } -/* - * Return true if this page is mapped into pagetables. - * For compound page it returns true if any sub-page of compound page is mapped, - * even if this particular sub-page is not itself mapped by any PTE or PMD. - */ -static inline bool page_mapped(const struct page *page) -{ - return folio_mapped(page_folio(page)); -} - static inline struct page *virt_to_head_page(const void *x) { struct page *page = virt_to_page(x); -- cgit v1.2.3 From 7b32f64bc512b40b268776c5ac4d354b325b3197 Mon Sep 17 00:00:00 2001 From: Frederick Mayle Date: Sun, 26 Apr 2026 20:01:47 -0700 Subject: mm: limit filemap_fault readahead to VMA boundaries When a file mapping covers a strict subset of a file, an access to the mapping can trigger readahead of file pages outside the mapped region. Readahead is meant to prefetch pages likely to be accessed soon, but these pages aren't accessible via the same means, so it fair to say we don't have a good indicator they'll be accessed soon. Take an ELF file for example: an access to the end of a program's read-only segment isn't a sign that nearby file contents will be accessed next (they are likely to be mapped discontiguously, or not at all). The pressure from loading these pages into the cache can evict more useful pages. To improve the behavior, make three changes: * Introduce a new readahead_control field, max_index, as a hard limit on the readahead. The existing file_ra_state->size can't be used as a limit, it is more of a hint and can be increased by various heuristics. * Set readahead_control->max_index to the end of the VMA in all of the readahead paths that can be triggered from a fault on a file mapping (both "sync" and "async" readahead). * Limit the read-around range start to the VMA's start. Note that these changes only affect readahead triggered in the context of a fault, they do not affect readahead triggered by read syscalls. If a user mixes the two types of accesses, the behavior is expected to be the following: if a fault causes readahead and places a PG_readahead marker and then a read(2) syscall hits the PG_readahead marker, the resulting async readahead *will not* be limited to the VMA end. Conversely, if a read(2) syscall places a PG_readahead marker and then a fault hits the marker, the async readahead *will* be limited to the VMA end. There is an edge case that the above motivation glosses over: A single file mapping might be backed by multiple VMAs. For example, a whole file could be mapped RW, then part of the mapping made RO using mprotect. This patch would hurt performance of a sequential faulted read of such a mapping, the degree depending on how fragmented the VMAs are. A usage pattern like that is likely rare and already suffering from sub-optimal performance because, e.g., the fragmented VMAs limit the fault-around, so each VMA boundary in a sequential faulted read would cause a minor fault. Still, this patch would make it worse. See a previous discussion of this topic at [1]. Tested by mapping and reading a small subset of a large file, then using the cachestat syscall to verify the number of cached pages didn't exceed the mapping size. In practical scenarios, the effect depends on the specific file and usage. Sometimes there is no effect at all, but, for some ELF files in Android, we see ~20% fewer pages pulled into the cache. A comprehensive performance evaluation hasn't been done, but, in addition to the anecdontal memory savings mentioned above, a benchmark was run with fio 3.38, showing neutral looking results: /data/local/tmp/fio --version fio --name=mmap_test --ioengine=mmap --rw=read --bs=4k \ --offset=1G --size=1G --filesize=3G --numjobs=1 \ --filename=testfile.bin Before: 4366.6 MiB/s (avg of 3459, 4592, 4613, 4697, 4472) After: 4444.0 MiB/s (avg of 4633, 4655, 4511, 4571, 3850) +1.7% Same, with --ioengine=mmap --rw=randread Before: 445.6 MiB/s (avg of 446, 447, 442, 452, 441) After: 447.0 MiB/s (avg of 447, 446, 446, 451, 445) +0.3% Same, with --ioengine=psync --rw=read Before: 3086.6 MiB/s (avg of 3122, 3094, 3066, 3094, 3057) After: 3084.6 MiB/s (avg of 3039, 3103, 3103, 3084, 3094) -0.06% Same, with --ioengine=psync --rw=randread Before: 2226.4 MiB/s (avg of 2256, 2183, 2207, 2265, 2221) After: 2231.4 MiB/s (avg of 2236, 2241, 2236, 2193, 2251) +0.2% Link: https://lore.kernel.org/20260427030148.653228-1-fmayle@google.com Link: https://lore.kernel.org/all/ivnv2crd3et76p2nx7oszuqhzzah756oecn5yuykzqfkqzoygw@yvnlkhjjssoz/ [1] Signed-off-by: Frederick Mayle Reviewed-by: Jan Kara Reviewed-by: Kalesh Singh Cc: David Hildenbrand Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 31a848485ad9..1f50991b43e3 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1350,6 +1350,7 @@ struct readahead_control { struct file_ra_state *ra; /* private: use the readahead_* accessors instead */ pgoff_t _index; + pgoff_t _max_index; /* limit readahead to _max_index, inclusive */ unsigned int _nr_pages; unsigned int _batch_count; bool dropbehind; @@ -1363,6 +1364,7 @@ struct readahead_control { .mapping = m, \ .ra = r, \ ._index = i, \ + ._max_index = ULONG_MAX, \ } #define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) -- cgit v1.2.3 From 3b9e3cc0405b422db884054ea2417b7b85220c56 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 27 Apr 2026 08:12:20 -0700 Subject: mm/damon/core: introduce damon_ctx->paused Patch series "mm/damon: let DAMON be paused and resumed", v2. DAMON utilizes a few mechanisms that enhance itself over time. Adaptive regions adjustment, goal-based DAMOS quota auto-tuning and monitoring intervals auto-tuning like self-training mechanisms are such examples. It also adds access frequency stability information (age) to the monitoring results, which makes it enhanced over time. Sometimes users have to stop DAMON. In this case, DAMON internal state that enhanced over the time of the last execution simply goes away. Restarted DAMON have to train itself and enhance its output from the scratch. This makes DAMON less useful in such cases. Introducing three such use cases below. Investigation of DAMON. It is best to do the investigation online, especially when it is a production environment. DAMON therefore provides features for such online investigations, including DAMOS stats, monitoring result snapshot exposure, and multiple tracepoints. When those are insufficient, and there are additional clues that could be interfered by DAMON, users have to temporarily stop DAMON to collect the additional clues. It is not very useful since many of DAMON internal clues are gone when DAMON is stopped. The loss of the monitoring results that improved over time is also problematic, especially in production environments. Monitoring of workloads that have different user-known phases. For example, in Android, applications are known to have very different access patterns and behaviors when they are running on the foreground and the background. It can therefore be useful to separate monitoring of apps based on whether they are running on the foreground and on the background. Having two DAMON threads per application that paused and resumed for the apps foreground/background switches can be useful for the purpose. But such pause/resume of the execution is not supported. Tests of DAMON. A few DAMON selftests are using drgn to dump the internal DAMON status. The tests show if the dumped status is the same as what the test code expected. Because DAMON keeps running and modifying its internal status, there are chances of data races that can cause false test results. Stopping DAMON can avoid the race. But, since the internal state of DAMON is dropped, the test coverage will be limited. Let DAMON execution be paused and resumed without loss of the internal state, to overhaul the limitations. For this, introduce a new DAMON context parameter, namely 'pause'. API callers can update it while the context is running, using the online parameters update functions (damon_commit_ctx() and damon_call()). Once it is set, kdamond_fn() main loop will do only limited works excluding the monitoring and DAMOS works, while sleeping sampling intervals per the work. The limited works include handling of the online parameters update. Hence users can unset the 'pause' parameter again. Once it is unset, kdamond_fn() main loop will do all the work again (resumed). Under the paused state, it also does stop condition checks and handling of it, so that paused DAMON can also be stopped if needed. Expose the feature to the user space via DAMON sysfs interface. Also, update existing drgn-based tests to test and use the feature. Tests ===== I confirmed the feature functionality using real time tracing ('perf trace' or 'trace-cmd stream') of damon:damon_aggregated DAMON tracepoint. By pausing and resuming the DAMON execution, I was able to see the trace stops and continued as expected. Note that the pause feature support is added to DAMON user-space tool (damo) after v3.1.9. Users can use '--pause_ctx' command line option of damo for that, and I actually used it for my test. The extended drgn-based selftests are also testing a part of the functionality. Patches Sequence ================ Patch 1 introduces the new core API for the pause feature. Patch 2 extend DAMON sysfs interface for the new parameter. Patches 3-5 update design, usage and ABI documents for the new sysfs file, respectively. The following five patches are for tests. Patch 6 implements a new kunit test for the pause parameter online commitment. Patches 7 and 8 extend DAMON selftest helpers to support the new feature. Patch 9 extends selftest to test the commitment of the feature. Finally, patch 10 updates existing selftest to be safe from the race condition using the pause/resume feature. This patch (of 10): DAMON supports only start and stop of the execution. When it is stopped, its internal data that it self-trained goes away. It will be useful if the execution can be paused and resumed with the previous self-trained data. Introduce per-context API parameter, 'paused', for the purpose. The parameter can be set and unset while DAMON is running and paused, using the online parameters commit helper functions (damon_commit_ctx() and damon_call()). Once 'paused' is set, the kdamond_fn() main loop does only limited works with sampling interval sleep during the works. The limited works include the handling of the online parameters update, so that users can unset the 'pause' and resume the execution when they want. It also keep checking DAMON stop conditions and handling of it, so that DAMON can be stopped while paused if needed. Link: https://lore.kernel.org/20260427151231.113429-1-sj@kernel.org Link: https://lore.kernel.org/20260427151231.113429-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index d3a231275c23..f2370a3a4a9a 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -801,6 +801,7 @@ struct damon_attrs { * @ops: Set of monitoring operations for given use cases. * @addr_unit: Scale factor for core to ops address conversion. * @min_region_sz: Minimum region size. + * @pause: Pause kdamond main loop. * @adaptive_targets: Head of monitoring targets (&damon_target) list. * @schemes: Head of schemes (&damos) list. */ @@ -854,6 +855,7 @@ struct damon_ctx { struct damon_operations ops; unsigned long addr_unit; unsigned long min_region_sz; + bool pause; struct list_head adaptive_targets; struct list_head schemes; -- cgit v1.2.3 From b56ca146a2b2750172f91f6db960a37a1a546efd Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Wed, 29 Apr 2026 15:57:02 +0530 Subject: vmalloc: add __GFP_SKIP_KASAN support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "kasan: hw_tags: Disable tagging for stack and page-tables", v4. Stacks and page tables are always accessed with the match-all tag, so assigning a new random tag every time at allocation and setting invalid tag at deallocation time, just adds overhead without improving the detection. With __GFP_SKIP_KASAN the page keeps its poison tag and KASAN_TAG_KERNEL (match-all tag) is stored in the page flags while keeping the poison tag in the hardware. The benefit of it is that 256 tag setting instruction per 4 kB page aren't needed at allocation and deallocation time. Thus match-all pointers still work, while non-match tags (other than poison tag) still fault. __GFP_SKIP_KASAN only skips for KASAN_HW_TAGS mode, so coverage is unchanged. Benchmark: The benchmark has two modes. In thread mode, the child process forks and creates N threads. In pgtable mode, the parent maps and faults a specified memory size and then forks repeatedly with children exiting immediately. Thread benchmark: 2000 iterations, 2000 threads: 2.575 s → 2.229 s (~13.4% faster) The pgtable samples: - 2048 MB, 2000 iters 19.08 s → 17.62 s (~7.6% faster) This patch (of 3): For allocations that will be accessed only with match-all pointers (e.g., kernel stacks), setting tags is wasted work. If the caller already set __GFP_SKIP_KASAN, skip tag setting of vmalloc pages. Before this patch, __GFP_SKIP_KASAN wasn't being used with vmalloc APIs. So it wasn't being checked. Now its being checked and acted upon. Other KASAN modes are unchanged because __GFP_SKIP_KASAN is ignored for them in the page allocator, and in vmalloc too we ignore this flag for them. This is a preparatory patch for optimizing kernel stack allocations. Link: https://lore.kernel.org/20260429102704.680174-1-dev.jain@arm.com Link: https://lore.kernel.org/20260429102704.680174-2-dev.jain@arm.com Signed-off-by: Muhammad Usama Anjum Co-developed-by: Ryan Roberts Signed-off-by: Ryan Roberts Co-developed-by: Dev Jain Signed-off-by: Dev Jain Reviewed-by: Catalin Marinas Cc: Arnd Bergmann Cc: Ben Segall Cc: David Hildenbrand Cc: Dietmar Eggemann Cc: Ingo Molnar Cc: Juri Lelli Cc: Kees Cook Cc: K Prateek Nayak Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mathieu Desnoyers Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: "Uladzislau Rezki (Sony)" Cc: Valentin Schneider Cc: Vincent Guittot Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/gfp_types.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index cd4972a7c97c..54ca0c88bab6 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -281,9 +281,9 @@ enum { * * %__GFP_SKIP_KASAN makes KASAN skip unpoisoning on page allocation. * Used for userspace and vmalloc pages; the latter are unpoisoned by - * kasan_unpoison_vmalloc instead. For userspace pages, results in - * poisoning being skipped as well, see should_skip_kasan_poison for - * details. Only effective in HW_TAGS mode. + * kasan_unpoison_vmalloc instead. If passed to vmalloc, kasan_unpoison_vmalloc + * is skipped too. For userspace pages, results in poisoning being skipped as + * well, see should_skip_kasan_poison for details. Only effective in HW_TAGS mode. */ #define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) #define __GFP_COMP ((__force gfp_t)___GFP_COMP) -- cgit v1.2.3 From 6ae51adb084a9d87a8b9501d2231e20271dece87 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Wed, 29 Apr 2026 15:57:03 +0530 Subject: kasan: skip HW tagging for all kernel thread stacks HW-tag KASAN never checks kernel stacks because stack pointers carry the match-all tag, so setting/poisoning tags is pure overhead. - Add __GFP_SKIP_KASAN to THREADINFO_GFP so every stack allocator that uses it skips tagging (fork path plus arch users) - Add __GFP_SKIP_KASAN to GFP_VMAP_STACK for the fork-specific vmap stacks. - When reusing cached vmap stacks, skip kasan_unpoison_range() if HW tags are enabled. Software KASAN is unchanged; this only affects tag-based KASAN. Link: https://lore.kernel.org/20260429102704.680174-3-dev.jain@arm.com Signed-off-by: Muhammad Usama Anjum Signed-off-by: Dev Jain Reviewed-by: Catalin Marinas Cc: Arnd Bergmann Cc: Ben Segall Cc: David Hildenbrand (Arm) Cc: Dietmar Eggemann Cc: Ingo Molnar Cc: Juri Lelli Cc: Kees Cook Cc: K Prateek Nayak Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mathieu Desnoyers Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: "Uladzislau Rezki (Sony)" Cc: Valentin Schneider Cc: Vincent Guittot Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/thread_info.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 051e42902690..307b8390fc67 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -92,7 +92,7 @@ static inline long set_restart_fn(struct restart_block *restart, #define THREAD_ALIGN THREAD_SIZE #endif -#define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) +#define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_SKIP_KASAN) /* * flag set/clear/test wrappers -- cgit v1.2.3 From d46644af7636c4cb876110c8ff7f1efbbb815bfe Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Wed, 29 Apr 2026 15:57:04 +0530 Subject: mm: skip KASAN tagging for page-allocated page tables Page tables are always accessed via the linear mapping with a match-all tag, so HW-tag KASAN never checks them. For page-allocated tables (PTEs and PGDs etc), avoid the tag setup and poisoning overhead by using __GFP_SKIP_KASAN. SLUB-backed page tables are unchanged for now. (They aren't widely used and require more SLUB related skip logic. Leave it later.) Link: https://lore.kernel.org/20260429102704.680174-4-dev.jain@arm.com Signed-off-by: Muhammad Usama Anjum Signed-off-by: Dev Jain Reviewed-by: Ryan Roberts Reviewed-by: Catalin Marinas Acked-by: David Hildenbrand (Arm) Cc: Arnd Bergmann Cc: Ben Segall Cc: Dietmar Eggemann Cc: Ingo Molnar Cc: Juri Lelli Cc: Kees Cook Cc: K Prateek Nayak Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mathieu Desnoyers Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: "Uladzislau Rezki (Sony)" Cc: Valentin Schneider Cc: Vincent Guittot Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/asm-generic/pgalloc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 57137d3ac159..051aa1331051 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -4,7 +4,7 @@ #ifdef CONFIG_MMU -#define GFP_PGTABLE_KERNEL (GFP_KERNEL | __GFP_ZERO) +#define GFP_PGTABLE_KERNEL (GFP_KERNEL | __GFP_ZERO | __GFP_SKIP_KASAN) #define GFP_PGTABLE_USER (GFP_PGTABLE_KERNEL | __GFP_ACCOUNT) /** -- cgit v1.2.3 From 70d8797c15d640982365e96e34e93a3aa38e82da Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 28 Apr 2026 21:12:23 -0700 Subject: mm/damon: introduce damon_set_region_system_rams_default() Patch series "mm/damon/reclaim,lru_sort: monitor all system rams by default". DAMON_RECLAIM and DAMON_LRU_SORT set the biggest 'System RAM' resource of the system as the default monitoring target address range. The main intention behind the design is to minimize the overhead coming from monitoring of non-System RAM areas. This could result in an odd setup when there are multiple discrete System RAMs of considerable sizes. For example, there are System RAMs each having 500 GiB size. In this case, only the first 500 GiB will be set as the monitoring region by default. This is particularly common on NUMA systems. Hence the modules allow users to set the monitoring target address range using the module parameters if the default setup doesn't work for them. In other words, the current design trades ease of setup for lower overhead. However, because DAMON utilizes the sampling based access check and the adaptive regions adjustment mechanisms, the overhead from the monitoring of non-System RAM areas should be negligible in most setups. Meanwhile, the setup complexity is causing real headaches for users who need to run those modules on various types of systems. That is, the current tradeoff is not a good deal. Set the physical address range that can cover all System RAM areas of the system as the default monitoring regions for DAMON_RECLAIM and DAMON_LRU_SORT. Technically speaking, this is changing documented behavior. However, it makes no sense to believe there is a real use case that really depends on the old weird default behavior. If the old default behavior was working for them in the reasonable way, this change will only add a negligible amount of monitoring overhead. If it didn't work, the users may already be using manual monitoring regions setup, and they will not be affected by this change. Patches Sequence ================ Patch 1 introduces a new core function that will be used for the new default monitoring target region setup. Patch 2 and 3 update DAMON_RECLAIM and DAMON_LRU_SORT to use the new function instead of the old one, respectively. Patch 4 removes the old core function that was replaced by the new one, as there is no more user of it. Patch 5 updates DAMON_STAT to use the new one instead of its in-house nearly-duplicate self implementation of the functionality. Finally patches 6 and 7 update the DAMON_RECLAIM and DAMON_LRU_SORT user documentation for the new behaviors, respectively. This patch (of 7): damon_set_region_biggest_system_ram_default() sets the monitoring target region as the caller requested. If the caller didn't specify the region, it finds the biggest System RAM of the system and sets it as the target region. When there are more than one considerable size of System RAM resources in the system, the default target setup makes no sense. Introduce a variant, namely damon_set_region_system_rams_default(). It sets a physical address range that covers all System RAM resources as the default target region. Link: https://lore.kernel.org/20260429041232.90257-1-sj@kernel.org Link: https://lore.kernel.org/20260429041232.90257-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index f2370a3a4a9a..f656908b2d38 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -1010,6 +1010,11 @@ int damon_kdamond_pid(struct damon_ctx *ctx); int damon_call(struct damon_ctx *ctx, struct damon_call_control *control); int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control); +int damon_set_region_system_rams_default(struct damon_target *t, + unsigned long *start, unsigned long *end, + unsigned long addr_unit, + unsigned long min_region_sz); + int damon_set_region_biggest_system_ram_default(struct damon_target *t, unsigned long *start, unsigned long *end, unsigned long addr_unit, -- cgit v1.2.3 From 3a870b43776c0c9740a087eb0d831cd6cb8016f7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 28 Apr 2026 21:12:26 -0700 Subject: mm/damon/core: remove damon_set_region_biggest_system_ram_default() Now nobody is using damon_set_region_biggest_system_ram_default(). Remove it. Link: https://lore.kernel.org/20260429041232.90257-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index f656908b2d38..c7a31572689b 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -1015,11 +1015,6 @@ int damon_set_region_system_rams_default(struct damon_target *t, unsigned long addr_unit, unsigned long min_region_sz); -int damon_set_region_biggest_system_ram_default(struct damon_target *t, - unsigned long *start, unsigned long *end, - unsigned long addr_unit, - unsigned long min_region_sz); - #endif /* CONFIG_DAMON */ #endif /* _DAMON_H */ -- cgit v1.2.3 From 9d5685286aa8c5ef70d8e1a34cef5daf518ae237 Mon Sep 17 00:00:00 2001 From: Zhouyi Zhou Date: Tue, 5 May 2026 02:11:25 +0000 Subject: highmem-internal.h: fix typo in the comment for kunmap_atomic() Replace `PREEMP_RT` with `PREEMPT_RT` in the header comment to match the correct kernel configuration name. Link: https://lore.kernel.org/20260505021125.1941691-1-zhouzhouyi@gmail.com Signed-off-by: Zhouyi Zhou Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- include/linux/highmem-internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index 0574c21ca45d..bb71e7dba4f7 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -262,7 +262,7 @@ static inline bool is_kmap_addr(const void *x) * @__addr: Virtual address to be unmapped * * Unmaps an address previously mapped by kmap_atomic() and re-enables - * pagefaults. Depending on PREEMP_RT configuration, re-enables also + * pagefaults. Depending on PREEMPT_RT configuration, re-enables also * migration and preemption. Users should not count on these side effects. * * Mappings should be unmapped in the reverse order that they were mapped. -- cgit v1.2.3 From 9012c4e647df9a3c5450dcccd766877a3efebc46 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Fri, 8 May 2026 18:18:15 -0700 Subject: mm/damon: replace damon_rand() with a per-ctx lockless PRNG damon_rand() on the sampling_addr hot path called get_random_u32_below(), which takes a local_lock_irqsave() around a per-CPU batched entropy pool and periodically refills it with ChaCha20. At elevated nr_regions counts (20k+), the lock_acquire / local_lock pair plus __get_random_u32_below() dominate kdamond perf profiles. Replace the helper with a lockless lfsr113 generator (struct rnd_state) held per damon_ctx and seeded from get_random_u64() in damon_new_ctx(). kdamond is the single consumer of a given ctx, so no synchronization is required. Range mapping uses traditional reciprocal multiplication, similar as get_random_u32_below(); for spans larger than U32_MAX (only reachable on 64-bit) the slow path combines two u32 outputs and uses mul_u64_u64_shr() at 64-bit width. On 32-bit the slow path is dead code and gets eliminated by the compiler. The new helper takes a ctx parameter; damon_split_regions_of() and the kunit tests that call it directly are updated accordingly. lfsr113 is a linear PRNG and MUST NOT be used for anything security-sensitive. DAMON's sampling_addr is not exposed to userspace and is only consumed as a probe point for PTE accessed-bit sampling, so a non-cryptographic PRNG is appropriate here. Tested with paddr monitoring and max_nr_regions=20000: kdamond CPU usage reduced from ~72% to ~50% of one core. Link: https://lore.kernel.org/20260505145212.108644-1-jiayuan.chen@linux.dev Link: https://lore.kernel.org/damon/20260426173346.86238-1-sj@kernel.org/T/#m4f1fd74112728f83a41511e394e8c3fef703039c Link: https://lore.kernel.org/20260509011816.85145-1-sj@kernel.org Signed-off-by: Jiayuan Chen Signed-off-by: SeongJae Park Reviewed-by: SeongJae Park Cc: Shu Anzai Cc: Quanmin Yan Signed-off-by: Andrew Morton --- include/linux/damon.h | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index c7a31572689b..4d4f031bcb45 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -8,23 +8,18 @@ #ifndef _DAMON_H_ #define _DAMON_H_ +#include #include #include +#include #include #include -#include /* Minimal region size. Every damon_region is aligned by this. */ #define DAMON_MIN_REGION_SZ PAGE_SIZE /* Max priority score for DAMON-based operation schemes */ #define DAMOS_MAX_SCORE (99) -/* Get a random number in [l, r) */ -static inline unsigned long damon_rand(unsigned long l, unsigned long r) -{ - return l + get_random_u32_below(r - l); -} - /** * struct damon_addr_range - Represents an address region of [@start, @end). * @start: Start address of the region (inclusive). @@ -859,8 +854,27 @@ struct damon_ctx { struct list_head adaptive_targets; struct list_head schemes; + + /* Per-ctx PRNG state for damon_rand(); kdamond is the sole consumer. */ + struct rnd_state rnd_state; }; +/* Get a random number in [@l, @r) using @ctx's lockless PRNG. */ +static inline unsigned long damon_rand(struct damon_ctx *ctx, + unsigned long l, unsigned long r) +{ + unsigned long span = r - l; + u64 rnd; + + if (span <= U32_MAX) { + rnd = prandom_u32_state(&ctx->rnd_state); + return l + (unsigned long)((rnd * span) >> 32); + } + rnd = ((u64)prandom_u32_state(&ctx->rnd_state) << 32) | + prandom_u32_state(&ctx->rnd_state); + return l + mul_u64_u64_shr(rnd, span, 64); +} + static inline struct damon_region *damon_next_region(struct damon_region *r) { return container_of(r->list.next, struct damon_region, list); -- cgit v1.2.3 From cf49b4ebd2ae13554c780eb482e7900447f29ce9 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Mon, 11 May 2026 16:05:32 +0200 Subject: mm/bootmem_info: remove call to kmemleak_free_part_phys() The call to kmemleak_free_part_phys() was added in 2022 in commit dd0ff4d12dd2 ("bootmem: remove the vmemmap pages from kmemleak in put_page_bootmem"). In 2025, commit b2aad24b5333 ("mm/memmap: prevent double scanning of memmap by kmemleak") started to use MEMBLOCK_ALLOC_NOLEAKTRACE when allocating the memmap to skip the kmemleak_alloc_phys() in the buddy. So remove the call to kmemleak_free_part_phys(). If this would still be required for other purposes, either free_reserved_page() should take care of it, or selected users. Link: https://lore.kernel.org/20260511-bootmem_info_prep-v1-4-3fb0be6fc688@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Oscar Salvador Reviewed-by: Mike Rapoport (Microsoft) Tested-by: Lance Yang Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Christian Borntraeger Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Nicholas Piggin Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Ritesh Harjani (IBM) Signed-off-by: Andrew Morton --- include/linux/bootmem_info.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/bootmem_info.h b/include/linux/bootmem_info.h index 492ceeb1cdf8..f724340755e5 100644 --- a/include/linux/bootmem_info.h +++ b/include/linux/bootmem_info.h @@ -82,7 +82,6 @@ static inline void get_page_bootmem(unsigned long info, struct page *page, static inline void free_bootmem_page(struct page *page) { - kmemleak_free_part_phys(PFN_PHYS(page_to_pfn(page)), PAGE_SIZE); free_reserved_page(page); } #endif -- cgit v1.2.3 From 9b1b295e9fd354b2263aee80a1ef3605d1eee32e Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 12 May 2026 15:26:35 +0800 Subject: drivers/base/memory: make memory block get/put explicit Rename the memory block lookup helper to make the acquired reference explicit, add memory_block_put() to wrap put_device(), remove find_memory_block(), and use memory_block_get() as the single block-id based lookup interface. This makes it clearer to callers that a successful lookup holds a reference that must be dropped, reducing the chance of forgetting the matching put and leaking the memory block device reference. Link: https://lore.kernel.org/linux-mm/7887915D-E598-42B3-9AFE-BFFBACE8DE2D@linux.dev/#t Link: https://lore.kernel.org/20260512072635.3969576-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Oscar Salvador Acked-by: David Hildenbrand (Arm) Acked-by: Michal Hocko Tested-by: Donet Tom Reviewed-by: Lorenzo Stoakes Tested-by: Sumanth Korikkar #s390 Cc: Richard Cheng Acked-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Danilo Krummrich Cc: Doug Anderson Cc: Greg Kroah-Hartman Cc: Heiko Carstens Cc: Kees Cook Cc: Liam R. Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Nicholas Piggin Cc: "Rafael J. Wysocki" Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/memory.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/memory.h b/include/linux/memory.h index 5bb5599c6b2b..463dc02f6cff 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -158,7 +158,11 @@ int create_memory_block_devices(unsigned long start, unsigned long size, void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); extern int memory_notify(enum memory_block_state state, void *v); -extern struct memory_block *find_memory_block(unsigned long section_nr); +struct memory_block *memory_block_get(unsigned long block_id); +static inline void memory_block_put(struct memory_block *mem) +{ + put_device(&mem->dev); +} typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *); extern int walk_memory_blocks(unsigned long start, unsigned long size, void *arg, walk_memory_blocks_func_t func); @@ -171,7 +175,6 @@ struct memory_group *memory_group_find_by_id(int mgid); typedef int (*walk_memory_groups_func_t)(struct memory_group *, void *); int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, struct memory_group *excluded, void *arg); -struct memory_block *find_memory_block_by_id(unsigned long block_id); #define hotplug_memory_notifier(fn, pri) ({ \ static __meminitdata struct notifier_block fn##_mem_nb =\ { .notifier_call = fn, .priority = pri };\ -- cgit v1.2.3 From 80eacd489a50ab2a560bc233b26b94ad9df68410 Mon Sep 17 00:00:00 2001 From: Takahiro Itazuri Date: Wed, 13 May 2026 09:35:46 -0700 Subject: mm/mmu_notifier: fix a begin vs. start typo in the invalidate range comment Fix a goof in the block comment for invalidate_range_{start,end}() where start() is incorrectly referred to as begin(). No functional change intended. [seanjc@google.com: split to separate patch, write changelog] Link: https://lore.kernel.org/20260513163546.1176742-1-seanjc@google.com Signed-off-by: Takahiro Itazuri Signed-off-by: Sean Christopherson Reviewed-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes Reviewed-by: Mike Rapoport (Microsoft) Cc: Liam R. Howlett Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mmu_notifier.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 69c304b467df..a11a44eef521 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -134,8 +134,8 @@ struct mmu_notifier_ops { * Invalidation of multiple concurrent ranges may be * optionally permitted by the driver. Either way the * establishment of sptes is forbidden in the range passed to - * invalidate_range_begin/end for the whole duration of the - * invalidate_range_begin/end critical section. + * invalidate_range_start/end for the whole duration of the + * invalidate_range_start/end critical section. * * invalidate_range_start() is called when all pages in the * range are still mapped and have at least a refcount of one. -- cgit v1.2.3 From 9c860d1d5d69f9cb19eb7c36573ee14065a9c85a Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Wed, 13 May 2026 12:35:13 +0000 Subject: mm: introduce for_each_free_list() Patch series "mm: misc cleanups from __GFP_UNMAPPED series". In v2 of the __GFP_UNMAPPED series [0], we realised that some of the patches could potentially be merged as independent cleanups. These are all independent of one another, if you think some are useful cleanups and others are pointless churn, it should be fine to just pick whatever subset you prefer. No functional change intended. This patch (of 4): There are a couple of places that iterate over the freelists with awareness of the data structures' layout. It seems ideally, code outside of mm should not be aware of the page allocator's freelists at all. But, this patch just doesn't hide them completely, it's just a meek incremental step in that direction: provide a macro to iterate over it without needing to be aware of the actual struct fields. Link: https://lore.kernel.org/20260513-page_alloc-unmapped-prep-v1-0-dacdf5402be8@google.com Link: https://lore.kernel.org/20260513-page_alloc-unmapped-prep-v1-1-dacdf5402be8@google.com Link: https://lore.kernel.org/all/20260320-page_alloc-unmapped-v2-0-28bf1bd54f41@google.com/ [0] Signed-off-by: Brendan Jackman Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka (SUSE) Cc: Axel Rasmussen Cc: Barry Song Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kairui Song Cc: Len Brown Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: "Rafael J. Wysocki" Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Wei Xu Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9adb2ad21da5..1331a7b93f33 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -177,9 +177,12 @@ static inline bool migratetype_is_mergeable(int mt) return mt < MIGRATE_PCPTYPES; } -#define for_each_migratetype_order(order, type) \ - for (order = 0; order < NR_PAGE_ORDERS; order++) \ - for (type = 0; type < MIGRATE_TYPES; type++) +#define for_each_free_list(list, zone, order) \ + for (order = 0; order < NR_PAGE_ORDERS; order++) \ + for (unsigned int __type = 0; \ + __type < MIGRATE_TYPES && \ + (list = &(zone)->free_area[order].free_list[__type], 1); \ + __type++) extern int page_group_by_mobility_disabled; -- cgit v1.2.3 From 3687c0fd67249cb971990b382a47f02f19ed9f67 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Wed, 13 May 2026 12:35:15 +0000 Subject: mm: rejig pageblock mask definitions - Add a PAGEBLOCK_ prefix to the names to avoid polluting the "global namespace" too much. - This new prefix makes MIGRATETYPE_AND_ISO_MASK look pretty long. Well, that global mask only exists for quite a specific purpose, and is quite a weird thing to have a name for anyway. So drop it and take advantage of the newly-defined PAGEBLOCK_ISO_MASK. Link: https://lore.kernel.org/20260513-page_alloc-unmapped-prep-v1-3-dacdf5402be8@google.com Signed-off-by: Brendan Jackman Reviewed-by: Vlastimil Babka (SUSE) Cc: Axel Rasmussen Cc: Barry Song Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kairui Song Cc: Len Brown Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: "Rafael J. Wysocki" Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Wei Xu Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pageblock-flags.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index e046278a01fa..9a6c3ea17684 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -36,12 +36,12 @@ enum pageblock_bits { #define NR_PAGEBLOCK_BITS (roundup_pow_of_two(__NR_PAGEBLOCK_BITS)) -#define MIGRATETYPE_MASK (BIT(PB_migrate_0)|BIT(PB_migrate_1)|BIT(PB_migrate_2)) +#define PAGEBLOCK_MIGRATETYPE_MASK (BIT(PB_migrate_0)|BIT(PB_migrate_1)|BIT(PB_migrate_2)) #ifdef CONFIG_MEMORY_ISOLATION -#define MIGRATETYPE_AND_ISO_MASK (MIGRATETYPE_MASK | BIT(PB_migrate_isolate)) +#define PAGEBLOCK_ISO_MASK BIT(PB_migrate_isolate) #else -#define MIGRATETYPE_AND_ISO_MASK MIGRATETYPE_MASK +#define PAGEBLOCK_ISO_MASK 0 #endif #if defined(CONFIG_HUGETLB_PAGE) -- cgit v1.2.3 From 1dfbe92e702675964da45847ffe022a41bf4045e Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sun, 17 May 2026 23:39:42 +0800 Subject: mm/huge_memory: move THP gfp limit helper into header Shmem has some special requirements for THP GFP and has to limit it in certain zones or provide a more lenient fallback. We'll use this helper for generic swap THP allocation, which needs to support shmem. For a typical GFP_HIGHUSER_MOVABLE swap-in, this helper is basically a no-op. But it's necessary for certain shmem users, mostly drivers. No feature change. Link: https://lore.kernel.org/20260517-swap-table-p4-v5-3-88ae43e064c7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Reviewed-by: Zi Yan Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chengming Zhou Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Shakeel Butt Cc: Youngjun Park Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 2949e5acff35..58382e97a66d 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -237,6 +237,31 @@ static inline bool thp_vma_suitable_order(struct vm_area_struct *vma, return true; } +/* + * Make sure huge_gfp is always more limited than limit_gfp. + * Some shmem users want THP allocation to be done less aggressively + * and only in certain zone. + */ +static inline gfp_t thp_shmem_limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) +{ + gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM; + gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY; + gfp_t zoneflags = limit_gfp & GFP_ZONEMASK; + gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK); + + /* Allow allocations only from the originally specified zones. */ + result |= zoneflags; + + /* + * Minimize the result gfp by taking the union with the deny flags, + * and the intersection of the allow flags. + */ + result |= (limit_gfp & denyflags); + result |= (huge_gfp & limit_gfp) & allowflags; + + return result; +} + /* * Filter the bitfield of input orders to the ones suitable for use in the vma. * See thp_vma_suitable_order(). @@ -581,6 +606,11 @@ static inline bool thp_vma_suitable_order(struct vm_area_struct *vma, return false; } +static inline gfp_t thp_shmem_limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) +{ + return huge_gfp; +} + static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, unsigned long addr, unsigned long orders) { -- cgit v1.2.3 From 945578fee2ec17bebdec067371214d3cbed48822 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sun, 17 May 2026 23:39:45 +0800 Subject: mm/memcg, swap: tidy up cgroup v1 memsw swap helpers The cgroup v1 swap helpers always operate on swap cache folios whose swap entry is stable: the folio is locked and in the swap cache. There is no need to pass the swap entry or page count as separate parameters when they can be derived from the folio itself. Simplify the redundant parameters and add sanity checks to document the required preconditions. Also rename memcg1_swapout to __memcg1_swapout to indicate it requires special calling context: the folio must be isolated and dying, and the call must be made with interrupts disabled. No functional change. Link: https://lore.kernel.org/20260517-swap-table-p4-v5-6-88ae43e064c7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chengming Zhou Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Shakeel Butt Cc: Youngjun Park Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 8 ++++---- include/linux/swap.h | 10 ++++------ 2 files changed, 8 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index dc3fa687759b..7d08128de1fd 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1899,8 +1899,8 @@ static inline void mem_cgroup_exit_user_fault(void) current->in_user_fault = 0; } -void memcg1_swapout(struct folio *folio, swp_entry_t entry); -void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages); +void __memcg1_swapout(struct folio *folio); +void memcg1_swapin(struct folio *folio); #else /* CONFIG_MEMCG_V1 */ static inline @@ -1929,11 +1929,11 @@ static inline void mem_cgroup_exit_user_fault(void) { } -static inline void memcg1_swapout(struct folio *folio, swp_entry_t entry) +static inline void __memcg1_swapout(struct folio *folio) { } -static inline void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages) +static inline void memcg1_swapin(struct folio *folio) { } diff --git a/include/linux/swap.h b/include/linux/swap.h index 7a09df6977a5..f907d3df52d0 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -571,13 +571,12 @@ static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) #endif #if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) -int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry); -static inline int mem_cgroup_try_charge_swap(struct folio *folio, - swp_entry_t entry) +int __mem_cgroup_try_charge_swap(struct folio *folio); +static inline int mem_cgroup_try_charge_swap(struct folio *folio) { if (mem_cgroup_disabled()) return 0; - return __mem_cgroup_try_charge_swap(folio, entry); + return __mem_cgroup_try_charge_swap(folio); } extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); @@ -591,8 +590,7 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_p extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); extern bool mem_cgroup_swap_full(struct folio *folio); #else -static inline int mem_cgroup_try_charge_swap(struct folio *folio, - swp_entry_t entry) +static inline int mem_cgroup_try_charge_swap(struct folio *folio) { return 0; } -- cgit v1.2.3 From bc34e87a51d9e51d398ef6d8c2c35cf1a4ff38b9 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sun, 17 May 2026 23:39:47 +0800 Subject: mm, swap: delay and unify memcg lookup and charging for swapin Instead of checking the cgroup private ID during page table walk in swap_pte_batch(), move the memcg lookup into __swap_cache_add_check() under the cluster lock. The first pre-alloc check is speculative and skips the memcg check since the post-alloc stable check ensures all slots covered by the folio belong to the same memcg. It is very rare for contiguous and aligned entries across a contiguous region of a page table of the same process or shmem mapping to belong to different memcgs. This also prepares for recording the memcg info in the cluster's table. Also make the order check and fallback more compact. There should be no user-observable behavior change. Link: https://lore.kernel.org/20260517-swap-table-p4-v5-8-88ae43e064c7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chengming Zhou Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Shakeel Butt Cc: Youngjun Park Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7d08128de1fd..a013f37f24aa 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -646,8 +646,8 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp); -int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, - gfp_t gfp, swp_entry_t entry); +int mem_cgroup_swapin_charge_folio(struct folio *folio, unsigned short id, + struct mm_struct *mm, gfp_t gfp); void __mem_cgroup_uncharge(struct folio *folio); @@ -1137,7 +1137,7 @@ static inline int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp) } static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, - struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) + unsigned short id, struct mm_struct *mm, gfp_t gfp) { return 0; } -- cgit v1.2.3 From b197d41462c2076bc88c79fead7f400e48881c19 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sun, 17 May 2026 23:39:49 +0800 Subject: mm/memcg, swap: store cgroup id in cluster table directly Drop the usage of the swap_cgroup_ctrl, and use the dynamic cluster table instead. The per-cluster memcg table is 1024 / 512 bytes on most archs, and does not need RCU protection: the cgroup data is only read and written under the cluster lock. That keeps things simple, lets the allocation use plain kmalloc with immediate kfree (no deferred free), and keeps fragmentation acceptable. [akpm@linux-foundation.org: memcgv1: don't compile swap functions when CONFIG_SWAP=n] Link: https://lore.kernel.org/202605281711.bSeZlErK-lkp@intel.com [akpm@linux-foundation.org: fix CONFIG_SWAP=n build] Link: https://lore.kernel.org/20260517-swap-table-p4-v5-10-88ae43e064c7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chengming Zhou Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Shakeel Butt Cc: Youngjun Park Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 19 +++++++++++++------ include/linux/swap.h | 8 ++++---- 2 files changed, 17 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a013f37f24aa..8f2662db166b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -29,6 +29,7 @@ struct obj_cgroup; struct page; struct mm_struct; struct kmem_cache; +struct swap_cluster_info; /* Cgroup-specific page state, on top of universal node page state */ enum memcg_stat_item { @@ -1899,9 +1900,6 @@ static inline void mem_cgroup_exit_user_fault(void) current->in_user_fault = 0; } -void __memcg1_swapout(struct folio *folio); -void memcg1_swapin(struct folio *folio); - #else /* CONFIG_MEMCG_V1 */ static inline unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, @@ -1929,14 +1927,23 @@ static inline void mem_cgroup_exit_user_fault(void) { } -static inline void __memcg1_swapout(struct folio *folio) +#endif /* CONFIG_MEMCG_V1 */ + +#if defined(CONFIG_MEMCG_V1) && defined(CONFIG_SWAP) + +void __memcg1_swapout(struct folio *folio, struct swap_cluster_info *ci); +void memcg1_swapin(struct folio *folio); + +#else + +static inline void __memcg1_swapout(struct folio *folio, + struct swap_cluster_info *ci) { } static inline void memcg1_swapin(struct folio *folio) { } - -#endif /* CONFIG_MEMCG_V1 */ +#endif #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/swap.h b/include/linux/swap.h index f907d3df52d0..200e7c345f26 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -579,12 +579,12 @@ static inline int mem_cgroup_try_charge_swap(struct folio *folio) return __mem_cgroup_try_charge_swap(folio); } -extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); -static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) +extern void __mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages); +static inline void mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages) { if (mem_cgroup_disabled()) return; - __mem_cgroup_uncharge_swap(entry, nr_pages); + __mem_cgroup_uncharge_swap(id, nr_pages); } extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); @@ -595,7 +595,7 @@ static inline int mem_cgroup_try_charge_swap(struct folio *folio) return 0; } -static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, +static inline void mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages) { } -- cgit v1.2.3 From 4e8e1c498de9f97628207d1ef84506058b06bb51 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sun, 17 May 2026 23:39:50 +0800 Subject: mm/memcg: remove no longer used swap cgroup array Now all swap cgroup records are stored in the swap cluster directly, the static array is no longer needed. Link: https://lore.kernel.org/20260517-swap-table-p4-v5-11-88ae43e064c7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chengming Zhou Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Shakeel Butt Cc: Youngjun Park Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/swap_cgroup.h | 47 --------------------------------------------- 1 file changed, 47 deletions(-) delete mode 100644 include/linux/swap_cgroup.h (limited to 'include') diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h deleted file mode 100644 index 91cdf12190a0..000000000000 --- a/include/linux/swap_cgroup.h +++ /dev/null @@ -1,47 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __LINUX_SWAP_CGROUP_H -#define __LINUX_SWAP_CGROUP_H - -#include - -#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) - -extern void swap_cgroup_record(struct folio *folio, unsigned short id, swp_entry_t ent); -extern unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents); -extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent); -extern int swap_cgroup_swapon(int type, unsigned long max_pages); -extern void swap_cgroup_swapoff(int type); - -#else - -static inline -void swap_cgroup_record(struct folio *folio, unsigned short id, swp_entry_t ent) -{ -} - -static inline -unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents) -{ - return 0; -} - -static inline -unsigned short lookup_swap_cgroup_id(swp_entry_t ent) -{ - return 0; -} - -static inline int -swap_cgroup_swapon(int type, unsigned long max_pages) -{ - return 0; -} - -static inline void swap_cgroup_swapoff(int type) -{ - return; -} - -#endif - -#endif /* __LINUX_SWAP_CGROUP_H */ -- cgit v1.2.3 From d9ceded101a142cd56f1e88fc7e893560ee59f4d Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sun, 17 May 2026 23:39:51 +0800 Subject: mm, swap: merge zeromap into swap table By allocating one additional bit in the swap table entry's flags field alongside the count, we can store the zeromap inline For 64 bit systems, zeromap will store in the swap table, avoiding zeromap allocation. It reduces the allocated memory. That is the happy path. For certain 32-bit archs, there might not be enough bits in the swap table to contain both PFN and flags. Therefore, conditionally let each cluster have a zeromap field at build time, and use that instead. If the swapfile cluster is not fully used, it will still save memory for zeromap. The empty cluster does not allocate a zeromap. In the worst case, all cluster are fully populated. We will use memory similar to the previous zeromap implementation. A few macros were moved to different headers for build time struct definition. [akpm@linux-foundation.org: swap_cluster_alloc_table(): remove unused local `ret] [akpm@linux-foundation.org: fix unused label `err_free'] Link: https://lore.kernel.org/20260517-swap-table-p4-v5-12-88ae43e064c7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Reviewed-by: Youngjun Park Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chengming Zhou Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Shakeel Butt Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/swap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 200e7c345f26..8c43bc3055c9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -252,7 +252,6 @@ struct swap_info_struct { struct plist_node list; /* entry in swap_active_head */ signed char type; /* strange name for an index */ unsigned int max; /* size of this swap device */ - unsigned long *zeromap; /* kvmalloc'ed bitmap to track zero pages */ struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ struct list_head free_clusters; /* free clusters list */ struct list_head full_clusters; /* full clusters list */ -- cgit v1.2.3 From 45c49d9fd6089e344663176b8488c97d905ca3ac Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:40:49 -0700 Subject: mm/damon/core: introduce struct damon_probe Patch series "mm/damon: introduce data attributes monitoring". TL; DR ====== Extend DAMON for monitoring general data attributes other than accesses. The short term motivation is lightweight page type (e.g., belonging cgroup) aware monitoring. In long term, this will help extending DAMON for multiple access events capture primitives (e.g., page faults and PMU) and eventually pivotting DAMON to a "Data Attributes Monitoring and Operations eNgine" in long term. Background: High Cost of Page Level Properties Monitoring ========================================================= DAMON is initially introduced as a Data Access MONitor. It has been extended for not only access monitoring but also data access-aware system operations (DAMOS). But still the monitoring part is only for data accesses. Data access patterns is good information, but some users need more holistic views. Particularly, users want to show the access pattern information together with the types of the memory. For example, users who work for making huge pages efficiently want to know how much of DAMON-found hot/cold regions are backed by huge pages. Users who run multiple workloads with different cgroups want to know how much of DAMON-found hot/cold regions belong to specific cgroups. For the user demand, we developed a DAMOS extension for page level properties based monitoring [1], which has landed on 6.14. Using the feature, users can inform the page level data properties that they are interested in, in a flexible format that uses DAMOS filters. Then, DAMON applies the filters to each folio of the entire DAMON region and lets users know how many bytes of memory in each DAMON region passed the given filters. This gives page level detailed and deterministic information to users. But, because the operation is done at page level, the overhead is proportional to the memory size. It was useful for test or debugging purposes on a small number of machines. But it was obviously too heavy to be enabled always on all machines running the real user workloads. For real world workloads, it was recommended to use the feature with user-space controlled sampling approaches. For example, users could do the page level monitoring only once per hour, on randomly selected one percent of machines of their fleet. If the runtime and the size of the fleet is long and big enough, it should provide statistically meaningful data. But users are too busy to implement such controls on their own. Data Attributes Monitoring ========================== Extend DAMON to monitor not only data accesses, but also general data attributes. Do the extension while keeping the main promise of DAMON, the bounded and best-effort minimum overhead. Allow users to specify what data attributes in addition to the data access they want to monitor. Users can install one 'data probe' per data attribute of their interest for this purpose. The 'data probe' should be able to be applied to any memory, and determine if the given memory has the appropriate data attribute. E.g., if memory of physical address 42 belongs to cgroup A. Each 'data probe' is configured with filters that are very similar to the DAMOS filters. When DAMON checks if each sampling address memory of each region is accessed since the last check, it applies data probes if registered. Same to the number of access check-positive samples accounting (nr_accesses), it accounts the number of each data probe-positive samples in another per-region counters array, namely 'probe_hits'. When DAMON resets nr_accesses every aggregation interval, it resets 'probe_hits' together. Users can read 'probe_hits' just before the values are reset. In this way, users can know how many hot/cold memory regions have data attributes of their interest. E.g., 30 percent of this system's hot memory is belonging to cgroup A, and 80 percent of the cgroup A-belonging hot memory is backed by huge pages. Patches Sequence ================ First eight patches implement the core feature, interface and the working support. Patch 1 introduces data probe data structure, namely damon_probe. Patch 2 extends damon_ctx for installing data probes. Patch 3 introduces another data structure for filters of each data probe, namely damon_filter. Patch 4 updates damon_ctx commit function to handle the probes. Patch 5 extends damon_region for the per-region per-probe positive samples counter, namely probe_hits. Patch 6 extends damon_operations for applying probes on the underlying DAMON operations implementation. Patch 7 updates kdamond_fn() to invoke the probes applying callback. Patch 8 finally implements the probes support on paddr ops. Ten changes for user interface (patches 9-18) come next. Patches 9-13 implements sysfs directories and files for setting data probes, namely probes directory, probe directory, filters directory, filter directory and filter directory internal files, respectively. Patch 14 connects the user inputs that are made via the sysfs files to DAMON core. Following three patches (patches 15-17) implement sysfs directories and files for showing the probe_hits to users, namely probes directory, probe directory and hits files, respectively. Patch 18 introduces a new tracepoint for showing the probe_hits via tracefs. Patch 19 adds a selftest for the sysfs files. Patches 20 and 21 documents the design and usage of the new feature, respectively. Seven additional patches (patches 22-28) for monitoring belonging memory cgroup follow. Depending on the feedback, this part might be separated to another series in future. Patch 22 defines the DAMON filter type for the new attribute, namely DAMON_FILTER_TYPE_MEMCG. Patch 23 add the support on paddr ops. Patch 24 updates the sysfs interface for setup of the target memcg. Patch 25 move code for easy reuse of the filter target memcg setup. Patch 26 connects the user input to the core layer. Finally, patches 27 and 28 update the design and usage documents for the memcg attribute monitoring support. Discussion ========== This allows the page properties monitoring with overhead that is low enough to be enabled always on real world workloads. Because the sampling time for access check is reused for data attributes check, the upper-bounded and best-effort minimum overhead of DAMON is kept. Because the sampling memory for access check is reused for data attributes check, additional overhead is minimum. Still DAMOS-based page level properties monitoring should be useful, because it provides a deterministic page level information. When in doubt of the sampling based information, running DAMOS-based one together and comparing the results would be useful, for debugging and tuning. Future Works: Mid Term ======================== This version of implementation is limiting the maximum number of data probes to four. I will try to find a way to remove the limit in future. I personally think it should be enough for common use cases, though, and therefore not giving high priority at the moment. Future Works: Long Term ======================= There are user requests for extending DAMON with detailed access information, for example, per-CPUs/threads/read/writes monitoring. For that, I was working [2] on extending DAMON to use page fault events as another access check primitives, and making the infrastructure flexible for future use of yet another access check primitive. Actually there is another ongoing work [3] for extending DAMON with PMU events. The motivation of the work is reducing the overhead, though. In my work [2], I was introducing a new interface for access sampling primitives control. Now I think this data probe interface can be used for that, too. That is, data access becomes just one type of data attribute. Also, pg_idle-confirmed access, page fault-confirmed access, and PMU event-confirmed access will be different types of data attributes. The regions adjustment mechanism is currently working based on the access information. That's because DAMON is designed for data access monitoring. That is, data access information is the primary interest, and therefore DAMON adjusts regions in a way that can best-present the information. Once data access becomes just one of data attributes, there is no reason to think data access that special. There might be some users not interested in access at all but want to know the location of memory of specific type. Data probes interface will allow doing that. Further, we could extend the interface to let users set any data attribute as the 'primary' attribute. Then, DAMON will split and merge regions in a way that can best-present the 'primary' attributes. DAMOS will also be extended, to specify targets based on not only the data access pattern, but all user-registered data attributes. From this stage, we may be able to call DAMON as a "Data Attributes Monitoring and Operations eNgine". This patch (of 28): Introduce a data structure for data attribute probe. It is just a linked list header at this step. It will be extended in a way that it can determine if a given memory has a specific data attribute. Link: https://lore.kernel.org/20260518234119.97569-1-sj@kernel.org Link: https://lore.kernel.org/20260518234119.97569-2-sj@kernel.org Link: https://lore.kernel.org/20250106193401.109161-1-sj@kernel.org [1] Link: https://lore.kernel.org/20251208062943.68824-1-sj@kernel.org/ [2] Link: https://lore.kernel.org/20260423004211.7037-1-akinobu.mita@gmail.com [3] Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 4d4f031bcb45..4794931fa2ea 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -730,6 +730,15 @@ struct damon_intervals_goal { unsigned long max_sample_us; }; +/** + * struct damon_probe - Data region attribute probe. + * + * @list: Siblings list. + */ +struct damon_probe { + struct list_head list; +}; + /** * struct damon_attrs - Monitoring attributes for accuracy/overhead control. * -- cgit v1.2.3 From 18c777859f28d5e9b65d94c4fdc64f240250df3a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:40:50 -0700 Subject: mm/damon/core: embed damon_probe objects in damon_ctx Let damon_probe objects be able to be installed on a given damon_ctx, by adding a linked list header for storing the objects. Add initialization and cleanup of the new field with helper functions, too. Link: https://lore.kernel.org/20260518234119.97569-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 4794931fa2ea..43d71eb24ccb 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -857,6 +857,7 @@ struct damon_ctx { /* public: */ struct damon_operations ops; + struct list_head probes; unsigned long addr_unit; unsigned long min_region_sz; bool pause; @@ -909,6 +910,11 @@ static inline unsigned long damon_sz_region(struct damon_region *r) return r->ar.end - r->ar.start; } +#define damon_for_each_probe(p, ctx) \ + list_for_each_entry(p, &(ctx)->probes, list) + +#define damon_for_each_probe_safe(p, next, ctx) \ + list_for_each_entry_safe(p, next, &(ctx)->probes, list) #define damon_for_each_region(r, t) \ list_for_each_entry(r, &t->regions_list, list) @@ -951,6 +957,9 @@ static inline unsigned long damon_sz_region(struct damon_region *r) #ifdef CONFIG_DAMON +struct damon_probe *damon_new_probe(void); +void damon_add_probe(struct damon_ctx *ctx, struct damon_probe *probe); + struct damon_region *damon_new_region(unsigned long start, unsigned long end); /* -- cgit v1.2.3 From f557693dd8ac9cd87d2a1ae1025ee9f568e916e6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:40:51 -0700 Subject: mm/damon/core: introduce damon_filter Define a data structure for constructing damon_probe's attributes check, namely damon_filter. It is very similar to damos_filter but works only for monitoring purposes. Also embed that into damon_probe, implement essential handling of the link, with fundamental helpers. Link: https://lore.kernel.org/20260518234119.97569-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 43d71eb24ccb..f8b679dd944d 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -730,12 +730,38 @@ struct damon_intervals_goal { unsigned long max_sample_us; }; +/** + * enum damon_filter_type - Type of &struct damon_filter + * + * @DAMON_FILTER_TYPE_ANON: Anonymous pages. + */ +enum damon_filter_type { + DAMON_FILTER_TYPE_ANON, +}; + +/** + * struct damon_filter - DAMON region filter for &struct damon_probe. + * + * @type: Type of the region. + * @matching: Whether this filter is for the type-matching ones. + * @allow: Whether the @type-@matching ones should pass this filter. + * @list: Siblings list. + */ +struct damon_filter { + enum damon_filter_type type; + bool matching; + bool allow; + struct list_head list; +}; + /** * struct damon_probe - Data region attribute probe. * + * @filters: Filters for assessing if a given region is for this probe. * @list: Siblings list. */ struct damon_probe { + struct list_head filters; struct list_head list; }; @@ -910,6 +936,12 @@ static inline unsigned long damon_sz_region(struct damon_region *r) return r->ar.end - r->ar.start; } +#define damon_for_each_filter(f, p) \ + list_for_each_entry(f, &(p)->filters, list) + +#define damon_for_each_filter_safe(f, next, p) \ + list_for_each_entry_safe(f, next, &(p)->filters, list) + #define damon_for_each_probe(p, ctx) \ list_for_each_entry(p, &(ctx)->probes, list) @@ -957,6 +989,10 @@ static inline unsigned long damon_sz_region(struct damon_region *r) #ifdef CONFIG_DAMON +struct damon_filter *damon_new_filter(enum damon_filter_type type, + bool matching, bool allow); +void damon_add_filter(struct damon_probe *probe, struct damon_filter *f); + struct damon_probe *damon_new_probe(void); void damon_add_probe(struct damon_ctx *ctx, struct damon_probe *probe); -- cgit v1.2.3 From 57c6332f2548d94f137f51bd18111e4316fd1ba4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:40:53 -0700 Subject: mm/damon/core: introduce damon_region->probe_hits Add an array for the per-region per-probe positive samples count. For simple and efficient implementation, add a limit to the number of data probes and set the array to support only the limited number of counters. Link: https://lore.kernel.org/20260518234119.97569-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index f8b679dd944d..3a30af119ac6 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -17,6 +17,8 @@ /* Minimal region size. Every damon_region is aligned by this. */ #define DAMON_MIN_REGION_SZ PAGE_SIZE +/* Maximum number of monitoring probes. */ +#define DAMON_MAX_PROBES (4) /* Max priority score for DAMON-based operation schemes */ #define DAMOS_MAX_SCORE (99) @@ -47,6 +49,7 @@ struct damon_size_range { * @nr_accesses: Access frequency of this region. * @nr_accesses_bp: @nr_accesses in basis point (0.01%) that updated for * each sampling interval. + * @probe_hits: Number of probe-positive region samples. * @list: List head for siblings. * @age: Age of this region. * @@ -75,6 +78,7 @@ struct damon_region { unsigned long sampling_addr; unsigned int nr_accesses; unsigned int nr_accesses_bp; + unsigned char probe_hits[DAMON_MAX_PROBES]; struct list_head list; unsigned int age; -- cgit v1.2.3 From 1a9e847589180359be4198c7d2a3d2ea15b2ddd0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:40:54 -0700 Subject: mm/damon/core: introduce damon_ops->apply_probes Extend damon_operations struct with a new callback, namely apply_probes. The callback will be invoked for data attributes monitoring. More specifically, the callback will apply damon_probe objects to each region and update the per-region per-probe counters for the number of encountered probe-positive samples. Link: https://lore.kernel.org/20260518234119.97569-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 3a30af119ac6..1fb271a35e98 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -630,6 +630,7 @@ enum damon_ops_id { * @update: Update operations-related data structures. * @prepare_access_checks: Prepare next access check of target regions. * @check_accesses: Check the accesses to target regions. + * @apply_probes: Apply probes for each region. * @get_scheme_score: Get the score of a region for a scheme. * @apply_scheme: Apply a DAMON-based operation scheme. * @target_valid: Determine if the target is valid. @@ -656,6 +657,8 @@ enum damon_ops_id { * last preparation and update the number of observed accesses of each region. * It should also return max number of observed accesses that made as a result * of its update. The value will be used for regions adjustment threshold. + * @apply_probes should apply the data attribute probes to each region and + * accordingly update the probe hits counter of the region. * @get_scheme_score should return the priority score of a region for a scheme * as an integer in [0, &DAMOS_MAX_SCORE]. * @apply_scheme is called from @kdamond when a region for user provided @@ -673,6 +676,7 @@ struct damon_operations { void (*update)(struct damon_ctx *context); void (*prepare_access_checks)(struct damon_ctx *context); unsigned int (*check_accesses)(struct damon_ctx *context); + void (*apply_probes)(struct damon_ctx *context); int (*get_scheme_score)(struct damon_ctx *context, struct damon_region *r, struct damos *scheme); unsigned long (*apply_scheme)(struct damon_ctx *context, -- cgit v1.2.3 From b9b7bad279de29294c4d3314fe90fca345c38ea6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:41:06 -0700 Subject: mm/damon: trace probe_hits Introduce a new tracepoint for exposing the per-region per-probe positive sample count via tracefs. Link: https://lore.kernel.org/20260518234119.97569-19-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/trace/events/damon.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'include') diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h index 7e25f4469b81..78388538acf4 100644 --- a/include/trace/events/damon.h +++ b/include/trace/events/damon.h @@ -130,6 +130,44 @@ TRACE_EVENT(damon_monitor_intervals_tune, TP_printk("sample_us=%lu", __entry->sample_us) ); +TRACE_EVENT_CONDITION(damon_region_aggregated, + + TP_PROTO(unsigned int target_id, struct damon_region *r, + unsigned int nr_regions, unsigned int nr_probes), + + TP_ARGS(target_id, r, nr_regions, nr_probes), + + TP_CONDITION(nr_probes > 0), + + TP_STRUCT__entry( + __field(unsigned long, target_id) + __field(unsigned long, start) + __field(unsigned long, end) + __field(unsigned int, nr_regions) + __field(unsigned int, nr_accesses) + __field(unsigned int, age) + __dynamic_array(unsigned char, probe_hits, nr_probes) + ), + + TP_fast_assign( + __entry->target_id = target_id; + __entry->start = r->ar.start; + __entry->end = r->ar.end; + __entry->nr_regions = nr_regions; + __entry->nr_accesses = r->nr_accesses; + __entry->age = r->age; + memcpy(__get_dynamic_array(probe_hits), r->probe_hits, + sizeof(*r->probe_hits) * nr_probes); + ), + + TP_printk("target_id=%lu nr_regions=%u %lu-%lu: %u %u probe_hits=%s", + __entry->target_id, __entry->nr_regions, + __entry->start, __entry->end, + __entry->nr_accesses, __entry->age, + __print_hex(__get_dynamic_array(probe_hits), + __get_dynamic_array_len(probe_hits))) +); + TRACE_EVENT(damon_aggregated, TP_PROTO(unsigned int target_id, struct damon_region *r, -- cgit v1.2.3 From d9f23f2f822a59771fdc3cab648785d4f651e1b2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:41:10 -0700 Subject: mm/damon/core: introduce DAMON_FILTER_TYPE_MEMCG Belonging memory cgoup is another data attribute that can be useful to monitor. Introduce a new DAMON filter type, namely DAMON_FILTER_TYPE_MEMCG, for monitoring of this attribute. Link: https://lore.kernel.org/20260518234119.97569-23-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 1fb271a35e98..6a54c601889b 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -742,9 +742,11 @@ struct damon_intervals_goal { * enum damon_filter_type - Type of &struct damon_filter * * @DAMON_FILTER_TYPE_ANON: Anonymous pages. + * @DAMON_FILTER_TYPE_MEMCG: Specific memcg's pages. */ enum damon_filter_type { DAMON_FILTER_TYPE_ANON, + DAMON_FILTER_TYPE_MEMCG, }; /** @@ -753,12 +755,16 @@ enum damon_filter_type { * @type: Type of the region. * @matching: Whether this filter is for the type-matching ones. * @allow: Whether the @type-@matching ones should pass this filter. + * @memcg_id: Memcg id of the question if @type is DAMON_FILTER_MEMCG. * @list: Siblings list. */ struct damon_filter { enum damon_filter_type type; bool matching; bool allow; + union { + u64 memcg_id; + }; struct list_head list; }; -- cgit v1.2.3 From 543ab01db7ace5bb28972ac70f321d55cc4f0214 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:41:14 -0700 Subject: mm/damon/sysfs: setup damon_filter->memcg_id from path Find and set the memcg_id for damon_filter from the user-passed memory cgroup path when updating the DAMON input parameters. Link: https://lore.kernel.org/20260518234119.97569-27-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 6a54c601889b..4014fd0d463c 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -1006,6 +1006,7 @@ static inline unsigned long damon_sz_region(struct damon_region *r) struct damon_filter *damon_new_filter(enum damon_filter_type type, bool matching, bool allow); void damon_add_filter(struct damon_probe *probe, struct damon_filter *f); +void damon_destroy_filter(struct damon_filter *f); struct damon_probe *damon_new_probe(void); void damon_add_probe(struct damon_ctx *ctx, struct damon_probe *probe); -- cgit v1.2.3 From 17986198a7b99485d7b2bc4eb8d700fbf8c8629e Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 2 Jun 2026 12:06:25 +0100 Subject: drivers/char/mem: eliminate unnecessary use of success_hook Patch series "remove mmap_action success, error hooks", v3. The mmap_action->success_hook was a strange beast added to enable code which appeared to absolutely require access to a VMA pointer to work correctly. Primarily this was for hugetlb, however a different approach will be taken there, as clearly more work is required to figure out a sensible way of converting hugetlb to use mmap_prepare. The other user was the memory char driver, specifically /dev/zero which has the unusual property of explicitly setting file-backed VMAs anonymous. Providing the success hook was always foolish, as it allowed drivers a way to workaround the restriction that they should not access a pointer to a not-yet-correctly-initialised VMA - which defeats the purpose of the mmap_prepare work. We can achieve the same thing in memory char driver without needing the success hook, so this series removes that, then removes the success hook altogether. The error hook is also unnecessary - the motivation for this was for functions which need to override the error code when performing an mmap action in order to avoid breaking userspace. We can achieve this by just providing a field for the error code. Doing this means we don't have to worry about the hook doing anything odd. We also add a check to ensure the error code is in fact valid. Again the memory char driver is the only current user of this, so this series updates it to use that. After this change mmap_action has no custom hooks at all, which seems rather more cromulent than before. This patch (of 3): /dev/zero, uniquely, marks memory mapped there as anonymous. This is currently achieved using the mmap_action->success_hook. However this hook circumvents the abstraction of VMA initialisation so it's preferable to do things a different way. To achieve this, this patch firstly defaults the VMA descriptor's vm_ops field to the dummy VMA operations, which is what file-backed VMAs default this field to. That way, we can detect whether a driver sets this field to NULL in order to mark it anonymous. We then introduce vma_desc_set_anonymous() to do this explicitly, and invoke it in mmap_zero_prepare(). This way, any driver which does not explicitly set desc->vm_ops, retains the dummy vm_ops as they would previously. We also update set_vma_user_defined_fields() to make clear that we are either setting vma->vm_ops to what is provided by the driver (or defaulting to dummy_vm_ops if not set), or setting the VMA anonymous. This lays the groundwork for removing the success hook. Link: https://lore.kernel.org/cover.1780397980.git.ljs@kernel.org Link: https://lore.kernel.org/010579cca6787cf7bb057ab1f7228978b10601c8.1780397980.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand (Arm) Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Jann Horn Cc: Liam R. Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Pedro Falcato Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 11f440e9d7cd..0f2612a70fb1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1489,6 +1489,11 @@ static inline void vma_set_anonymous(struct vm_area_struct *vma) vma->vm_ops = NULL; } +static inline void vma_desc_set_anonymous(struct vm_area_desc *desc) +{ + desc->vm_ops = NULL; +} + static inline bool vma_is_anonymous(struct vm_area_struct *vma) { return !vma->vm_ops; -- cgit v1.2.3 From 8876dc0780f23eb499b42cc84df2dd795aada6be Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 2 Jun 2026 12:06:26 +0100 Subject: mm/vma: remove mmap_action->success_hook This hook was introduced to work around code that seemed to absolutely require access to a VMA pointer upon mmap(). However, providing this hook leaves a backdoor to drivers getting access to the very thing mmap_prepare eliminates - a pointer to the VMA. Let's solve this contradiction by removing it. The key intended user was hugetlb, however it seems that the best course now is to avoid allowing all drivers the ability to work around mmap_prepare, and find a different solution there. Link: https://lore.kernel.org/f79434e6d30af6d92999be6b76e197f1847105fa.1780397980.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand (Arm) Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Jann Horn Cc: Liam R. Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Pedro Falcato Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index a308e2c23b82..945c0a5386d6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -843,16 +843,6 @@ struct mmap_action { }; enum mmap_action_type type; - /* - * If specified, this hook is invoked after the selected action has been - * successfully completed. Note that the VMA write lock still held. - * - * The absolute minimum ought to be done here. - * - * Returns 0 on success, or an error code. - */ - int (*success_hook)(const struct vm_area_struct *vma); - /* * If specified, this hook is invoked when an error occurred when * attempting the selected action. -- cgit v1.2.3 From 4f5b8759262e5e65373638346307836de1290b22 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 2 Jun 2026 12:06:27 +0100 Subject: mm/vma: eliminate mmap_action->error_hook, introduce error_override Rather than providing a hook, simplify things by providing the ability to override mmap action errors. This allows us to more carefully validate the value provided and thus ensure only a valid error code is specified, and simplifies the interface. This way, we eliminate all hooks but mmap_prepare and allow only mmap actions to be specified (which core mm controls). This significantly improves robustness and eliminates any unnecessary code duplication in driver mmap hooks. We also update the /dev/mem logic (the only user) to use mmap_action->error_override instead. Link: https://lore.kernel.org/55d13f7d016b827c459946d46a56105635be111c.1780397980.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand (Arm) Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Jann Horn Cc: Liam R. Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Pedro Falcato Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 945c0a5386d6..5ef78617ce93 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -844,13 +844,10 @@ struct mmap_action { enum mmap_action_type type; /* - * If specified, this hook is invoked when an error occurred when - * attempting the selected action. - * - * The hook can return an error code in order to filter the error, but - * it is not valid to clear the error here. + * If non-zero, replace errors that arise from mmap actions with this + * value instead. Only valid error codes may be specified. */ - int (*error_hook)(int err); + int error_override; /* * This should be set in rare instances where the operation required -- cgit v1.2.3 From 9cf7ef2d6665dff35b3b522c84509c2d256bf3aa Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 22 May 2026 08:40:16 -0700 Subject: mm/damon/core: hide damon_add_region() damon_add_region() is being used by only DAMON core, but exposed to DAMON API callers. Exposing something that is not really being used by others will only increase the maintenance cost. Hide it. Link: https://lore.kernel.org/20260522154026.80546-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 4014fd0d463c..b9370c1779cb 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -1024,7 +1024,6 @@ static inline void damon_insert_region(struct damon_region *r, t->nr_regions++; } -void damon_add_region(struct damon_region *r, struct damon_target *t); void damon_destroy_region(struct damon_region *r, struct damon_target *t); int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, unsigned int nr_ranges, unsigned long min_region_sz); -- cgit v1.2.3 From 26d6f6960ff91ebb267cd80efe7772c6427b4cc1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 22 May 2026 08:40:17 -0700 Subject: mm/damon/core: hide damon_insert_region() damon_insert_region() is being used by only DAMON core, but exposed to DAMON API callers. Exposing something that is not really being used by others will only increase the maintenance cost. Hide it. Link: https://lore.kernel.org/20260522154026.80546-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/damon.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index b9370c1779cb..3acca7deb169 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -1013,17 +1013,6 @@ void damon_add_probe(struct damon_ctx *ctx, struct damon_probe *probe); struct damon_region *damon_new_region(unsigned long start, unsigned long end); -/* - * Add a region between two other regions - */ -static inline void damon_insert_region(struct damon_region *r, - struct damon_region *prev, struct damon_region *next, - struct damon_target *t) -{ - __list_add(&r->list, &prev->list, &next->list); - t->nr_regions++; -} - void damon_destroy_region(struct damon_region *r, struct damon_target *t); int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, unsigned int nr_ranges, unsigned long min_region_sz); -- cgit v1.2.3 From 50d2dec8af1a09056b6b29b54a30e32281b30e2c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 22 May 2026 08:40:18 -0700 Subject: mm/damon/core: hide damon_destroy_region() damon_destroy_region() is being used by only DAMON core, but exposed to DAMON API callers. Exposing something that is not really being used by others will only increase the maintenance cost. Hide it. Link: https://lore.kernel.org/20260522154026.80546-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 3acca7deb169..638ee65f88dc 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -1013,7 +1013,6 @@ void damon_add_probe(struct damon_ctx *ctx, struct damon_probe *probe); struct damon_region *damon_new_region(unsigned long start, unsigned long end); -void damon_destroy_region(struct damon_region *r, struct damon_target *t); int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, unsigned int nr_ranges, unsigned long min_region_sz); void damon_update_region_access_rate(struct damon_region *r, bool accessed, -- cgit v1.2.3 From d63e9d829e42b29201ebbcf0a070acea8ed40b2c Mon Sep 17 00:00:00 2001 From: Maksym Shcherba Date: Thu, 21 May 2026 23:20:19 +0300 Subject: mm/damon: fix missing parens in macro arguments Patch series "mm/damon: fix macro arguments and clarify quota goals doc", v2. This patch (of 2): The DAMON iterator macros do not wrap their pointer arguments with parentheses. This can cause build failures when the argument is a complex expression due to operator precedence issues. Add missing parentheses around the arguments in the following macros to prevent potential build failures: - damon_for_each_region() - damon_for_each_region_from() - damon_for_each_region_safe() - damos_for_each_quota_goal() Link: https://lore.kernel.org/20260521202020.126500-1-maksym.shcherba@lnu.edu.ua Link: https://lore.kernel.org/20260521202020.126500-2-maksym.shcherba@lnu.edu.ua Signed-off-by: Maksym Shcherba Reviewed-by: SeongJae Park Assisted-by: Antigravity:Gemini-3.1-Pro Signed-off-by: Andrew Morton --- include/linux/damon.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 638ee65f88dc..6f7edb3590ef 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -963,13 +963,13 @@ static inline unsigned long damon_sz_region(struct damon_region *r) list_for_each_entry_safe(p, next, &(ctx)->probes, list) #define damon_for_each_region(r, t) \ - list_for_each_entry(r, &t->regions_list, list) + list_for_each_entry(r, &(t)->regions_list, list) #define damon_for_each_region_from(r, t) \ - list_for_each_entry_from(r, &t->regions_list, list) + list_for_each_entry_from(r, &(t)->regions_list, list) #define damon_for_each_region_safe(r, next, t) \ - list_for_each_entry_safe(r, next, &t->regions_list, list) + list_for_each_entry_safe(r, next, &(t)->regions_list, list) #define damon_for_each_target(t, ctx) \ list_for_each_entry(t, &(ctx)->adaptive_targets, list) @@ -984,7 +984,7 @@ static inline unsigned long damon_sz_region(struct damon_region *r) list_for_each_entry_safe(s, next, &(ctx)->schemes, list) #define damos_for_each_quota_goal(goal, quota) \ - list_for_each_entry(goal, "a->goals, list) + list_for_each_entry(goal, &(quota)->goals, list) #define damos_for_each_quota_goal_safe(goal, next, quota) \ list_for_each_entry_safe(goal, next, &(quota)->goals, list) -- cgit v1.2.3 From 6cbdd9726fb50d749b06ab45a8ef81dff02e69b8 Mon Sep 17 00:00:00 2001 From: "Barry Song (Xiaomi)" Date: Tue, 26 May 2026 21:09:38 +0800 Subject: mm/mglru: use folio_mark_accessed to replace folio_set_active MGLRU gives high priority to folios mapped in page tables. As a result, folio_set_active() is invoked for all folios read during page faults. In practice, however, readahead can bring in many folios that are never accessed via page tables. A previous attempt by Lei Liu proposed introducing a separate LRU for readahead[1] to make readahead pages easier to reclaim, but that approach is likely over-engineered. Before commit 4d5d14a01e2c ("mm/mglru: rework workingset protection"), folios with PG_active were always placed in the youngest generation, leading to over-protection and increased refaults. After that commit, PG_active folios are placed in the second youngest generation, which is still too optimistic given the presence of readahead. In contrast, the classic active/inactive scheme is more conservative. This patch switches to using folio_mark_accessed() and begins prefaulted file folios from the second oldest generation instead of active generations. We should also adjust the following accordingly: - WORKINGSET_ACTIVATE: aligned with setting active for refaulted workingset folios; - lru_gen_folio_seq(): place (pre)faulted file folios into the second oldest generation; - promote second-scanned folios to workingset in folio_check_references(): we now have to depend on folio_lru_refs() > 1, since we previously relied on PG_referenced being set during the first scan, but PG_referenced is now set earlier. On x86, running a kernel build inside a memcg with a 1GB memory limit using 20 threads. w/o patch: real 1m50.764s user 25m32.305s sys 4m0.012s pswpin: 1333245 pswpout: 4366443 pgpgin: 6962592 pgpgout: 17780712 swpout_zero: 1019603 swpin_zero: 14764 refault_file: 287794 refault_anon: 1347963 w/ patch: real 1m48.879s user 25m29.224s sys 3m37.421s pswpin: 568480 pswpout: 2322657 pgpgin: 4073416 pgpgout: 9613408 swpout_zero: 593275 swpin_zero: 9118 refault_file: 262505 refault_anon: 577550 active/inactive LRU: real 1m49.928s user 25m28.196s sys 3m40.740s pswpin: 463452 pswpout: 2309119 pgpgin: 4438856 pgpgout: 9568628 swpout_zero: 743704 swpin_zero: 7244 refault_file: 562555 refault_anon: 470694 Lance and Xueyuan made a huge contribution to this patch through testing. Link: https://lore.kernel.org/20260526130938.66253-1-baohua@kernel.org Link: https://lore.kernel.org/linux-mm/20250916072226.220426-1-liulei.rjpt@vivo.com/ [1] Signed-off-by: Barry Song (Xiaomi) Tested-by: Lance Yang Tested-by: Xueyuan Chen Cc: Pedro Falcato Cc: Kairui Song Cc: Qi Zheng Cc: Shakeel Butt Cc: wangzicheng Cc: Suren Baghdasaryan Cc: Lei Liu Cc: Matthew Wilcox (Oracle) Cc: Axel Rasmussen Cc: Yuanchu Xie Cc: Wei Xu Cc: Will Deacon Cc: Kalesh Singh Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index a171070e15f0..a8430a7ae054 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -247,7 +247,7 @@ static inline unsigned long lru_gen_folio_seq(const struct lruvec *lruvec, (folio_test_dirty(folio) || folio_test_writeback(folio)))) gen = MIN_NR_GENS; else - gen = MAX_NR_GENS - folio_test_workingset(folio); + gen = MAX_NR_GENS - (folio_test_workingset(folio) || folio_test_referenced(folio)); return max(READ_ONCE(lrugen->max_seq) - gen + 1, READ_ONCE(lrugen->min_seq[type])); } -- cgit v1.2.3 From b182633f8ce52172a40097ccc0c60047e58c2320 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sat, 23 May 2026 20:37:59 +0300 Subject: userfaultfd: make functions that are not used outside uffd static After merging fs/userfaultfd.c into mm/userfaultfd.c, several functions that were previously shared between the two files are now only used within mm/userfaultfd.c. Make them static and remove their declarations from include/linux/userfaultfd_k.h. Link: https://lore.kernel.org/20260523173759.3964908-3-rppt@kernel.org Assisted-by: Copilot:claude-opus-4-6 Signed-off-by: Mike Rapoport (Microsoft) Cc: Al Viro Cc: Christian Brauner Cc: David Hildenbrand Cc: Jan Kara Cc: "Kirill A. Shutemov" Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 36 ------------------------------------ 1 file changed, 36 deletions(-) (limited to 'include') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index d2920f98ab86..3ec8e1071673 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -147,26 +147,12 @@ static inline uffd_flags_t uffd_flags_set_mode(uffd_flags_t flags, enum mfill_at /* Flags controlling behavior. These behavior changes are mode-independent. */ #define MFILL_ATOMIC_WP MFILL_ATOMIC_FLAG(0) -extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start, - unsigned long src_start, unsigned long len, - uffd_flags_t flags); -extern ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx, - unsigned long dst_start, - unsigned long len); -extern ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long dst_start, - unsigned long len, uffd_flags_t flags); -extern ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start, - unsigned long len, uffd_flags_t flags); -extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start, - unsigned long len, bool enable_wp); extern long uffd_wp_range(struct vm_area_struct *vma, unsigned long start, unsigned long len, bool enable_wp); /* move_pages */ void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2); void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2); -ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, - unsigned long src_start, unsigned long len, __u64 flags); int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, @@ -239,9 +225,6 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) return vma->vm_flags & __VM_UFFD_FLAGS; } -bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags, - bool wp_async); - static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma) { struct userfaultfd_ctx *uffd_ctx = vma->vm_userfaultfd_ctx.ctx; @@ -271,25 +254,6 @@ extern void userfaultfd_unmap_complete(struct mm_struct *mm, extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma); extern bool userfaultfd_wp_async(struct vm_area_struct *vma); -void userfaultfd_reset_ctx(struct vm_area_struct *vma); - -struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, - unsigned long end); - -int userfaultfd_register_range(struct userfaultfd_ctx *ctx, - struct vm_area_struct *vma, - vm_flags_t vm_flags, - unsigned long start, unsigned long end, - bool wp_async); - -void userfaultfd_release_new(struct userfaultfd_ctx *ctx); - -void userfaultfd_release_all(struct mm_struct *mm, - struct userfaultfd_ctx *ctx); - static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) { /* Only wr-protect mode uses pte markers */ -- cgit v1.2.3 From 9c87962f85106a4d330a91b26b054376245f47c0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 26 May 2026 21:00:30 +0100 Subject: mm: document the folio refcount a little better Expand the documentation of folio_ref_count() to talk about expected, temporary and spurious refcounts as well as the concept of freezing. Link: https://lore.kernel.org/20260526200032.353868-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/page_ref.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 94d3f0e71c06..9f5c75d06f76 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -71,6 +71,12 @@ static inline int page_ref_count(const struct page *page) * folio_ref_count - The reference count on this folio. * @folio: The folio. * + * Folios contain a reference count. When that reference count reaches + * zero, the folio is referred to as frozen. At this point, it will + * usually be returned to the memory allocator, but some parts of the + * kernel freeze folios in order to perform unusual operations on them + * such as splitting or migration. + * * The refcount is usually incremented by calls to folio_get() and * decremented by calls to folio_put(). Some typical users of the * folio refcount: @@ -82,6 +88,18 @@ static inline int page_ref_count(const struct page *page) * - Pipes * - Direct IO which references this page in the process address space * + * The reference count has three components: expected, temporary and + * spurious. The expected reference count of a folio is that which + * we would logically expect it to be from just reading the code. + * Temporary refcounts are gained by threads which need a temporary + * reference to make sure the folio isn't reallocated while they use it. + * Spurious refcounts are gained by threads which, thanks to RCU walks + * of the page tables or file cache, find a stale pointer to a folio. + * These threads will drop the refcount after discoveering the pointer + * is stale, but it can surprise other users to see the spurious refcount + * on a freshly allocated folio (eg they may see a refcount of 2 instead + * of 1). + * * Return: The number of references to this folio. */ static inline int folio_ref_count(const struct folio *folio) -- cgit v1.2.3 From f5cf8c92a2b9fd176d90b6e217ed50fbb5d1f48d Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Fri, 29 May 2026 13:27:54 -0700 Subject: mm/nodemask: correctly describe nodemask operation return types Commit 0dfe54071d7c8 ("nodemask: Fix return values to be unsigned") changed a number of nodemask operations that used to return int to returning a bool instead. However, it did not update the comment block that described these functions, leaving the documentation incorrect. Fix the comment block to accurately describe the functions. Also fix a typo (unsigend --> unsigned), and fix a callsite in mempolicy.c that did not get updated during the conversion. No functional changes intended; changes are purely cosmetic. Link: https://lore.kernel.org/20260529202755.1846800-1-joshua.hahnjy@gmail.com Signed-off-by: Joshua Hahn Reviewed-by: SeongJae Park Cc: Alistair Popple Cc: Byungchul Park Cc: David Hildenbrand Cc: Gregory Price Cc: Matthew Brost Cc: Rakie Kim Cc: Rasmus Villemoes Cc: Ying Huang Cc: Yury Norov (NVIDIA) Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/nodemask.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 204c92462f3c..b842aa525546 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -24,23 +24,23 @@ * void nodes_setall(mask) set all bits * void nodes_clear(mask) clear all bits * int node_isset(node, mask) true iff bit 'node' set in mask - * int node_test_and_set(node, mask) test and set bit 'node' in mask + * bool node_test_and_set(node, mask) test and set bit 'node' in mask * - * void nodes_and(dst, src1, src2) dst = src1 & src2 [intersection] + * bool nodes_and(dst, src1, src2) dst = src1 & src2 [intersection] * void nodes_or(dst, src1, src2) dst = src1 | src2 [union] * void nodes_xor(dst, src1, src2) dst = src1 ^ src2 - * void nodes_andnot(dst, src1, src2) dst = src1 & ~src2 + * bool nodes_andnot(dst, src1, src2) dst = src1 & ~src2 * void nodes_complement(dst, src) dst = ~src * - * int nodes_equal(mask1, mask2) Does mask1 == mask2? - * int nodes_intersects(mask1, mask2) Do mask1 and mask2 intersect? - * int nodes_subset(mask1, mask2) Is mask1 a subset of mask2? - * int nodes_empty(mask) Is mask empty (no bits sets)? - * int nodes_full(mask) Is mask full (all bits sets)? + * bool nodes_equal(mask1, mask2) Does mask1 == mask2? + * bool nodes_intersects(mask1, mask2) Do mask1 and mask2 intersect? + * bool nodes_subset(mask1, mask2) Is mask1 a subset of mask2? + * bool nodes_empty(mask) Is mask empty (no bits sets)? + * bool nodes_full(mask) Is mask full (all bits sets)? * int nodes_weight(mask) Hamming weight - number of set bits * * unsigned int first_node(mask) Number lowest set bit, or MAX_NUMNODES - * unsigend int next_node(node, mask) Next node past 'node', or MAX_NUMNODES + * unsigned int next_node(node, mask) Next node past 'node', or MAX_NUMNODES * unsigned int next_node_in(node, mask) Next node past 'node', or wrap to first, * or MAX_NUMNODES * unsigned int first_unset_node(mask) First node not set in mask, or -- cgit v1.2.3 From 1479b44c7203b2cad3393533c64aa16d42056310 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 27 May 2026 16:45:13 -0400 Subject: mm: list_lru: introduce caller locking for additions and deletions Locking is currently internal to the list_lru API. However, a caller might want to keep auxiliary state synchronized with the LRU state. For example, the THP shrinker uses the lock of its custom LRU to keep PG_partially_mapped and vmstats consistent. To allow the THP shrinker to switch to list_lru, provide normal and irqsafe locking primitives as well as caller-locked variants of the addition and deletion functions. Link: https://lore.kernel.org/20260527204757.2544958-7-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: David Hildenbrand (Arm) Acked-by: Shakeel Butt Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Liam R. Howlett (Oracle) Cc: Baolin Wang Cc: Barry Song Cc: Dave Chinner Cc: Dev Jain Cc: Kairui Song Cc: Lance Yang Cc: Michal Hocko Cc: Mikhail Zaslonko Cc: Muchun Song Cc: Nico Pache Cc: Roman Gushchin Cc: Ryan Roberts Cc: Usama Arif Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/list_lru.h | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'include') diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index fe739d35a864..134cb3e5652a 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -83,6 +83,46 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, gfp_t gfp); void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent); +/** + * list_lru_lock: lock the sublist for the given node and memcg + * @lru: the lru pointer + * @nid: the node id of the sublist to lock. + * @memcg: pointer to the cgroup of the sublist to lock. On return, + * updated to the cgroup whose sublist was actually locked, + * which may be an ancestor if the original memcg was dying. + * + * Returns the locked list_lru_one sublist. The caller must call + * list_lru_unlock() when done. + * + * You must ensure that the memcg is not freed during this call (e.g., with + * rcu or by taking a css refcnt). + * + * Return: the locked list_lru_one, or NULL on failure + */ +struct list_lru_one *list_lru_lock(struct list_lru *lru, int nid, + struct mem_cgroup **memcg); + +/** + * list_lru_unlock: unlock a sublist locked by list_lru_lock() + * @l: the list_lru_one to unlock + */ +void list_lru_unlock(struct list_lru_one *l); + +struct list_lru_one *list_lru_lock_irq(struct list_lru *lru, int nid, + struct mem_cgroup **memcg); +void list_lru_unlock_irq(struct list_lru_one *l); + +struct list_lru_one *list_lru_lock_irqsave(struct list_lru *lru, int nid, + struct mem_cgroup **memcg, unsigned long *irq_flags); +void list_lru_unlock_irqrestore(struct list_lru_one *l, + unsigned long *irq_flags); + +/* Caller-locked variants, see list_lru_add() etc for documentation */ +bool __list_lru_add(struct list_lru *lru, struct list_lru_one *l, + struct list_head *item, int nid, struct mem_cgroup *memcg); +bool __list_lru_del(struct list_lru *lru, struct list_lru_one *l, + struct list_head *item, int nid); + /** * list_lru_add: add an element to the lru list's tail * @lru: the lru pointer @@ -115,6 +155,9 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, struct mem_cgroup *memcg); +bool list_lru_add_irq(struct list_lru *lru, struct list_head *item, int nid, + struct mem_cgroup *memcg); + /** * list_lru_add_obj: add an element to the lru list's tail * @lru: the lru pointer -- cgit v1.2.3 From ae64f07a6a4018c73111b3cd4e1c5598ce5cfa84 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 27 May 2026 16:45:14 -0400 Subject: mm: list_lru: introduce folio_memcg_list_lru_alloc() memcg_list_lru_alloc() is called every time an object that may end up on the list_lru is created. It needs to quickly check if the list_lru heads for the memcg already exist, and allocate them when they don't. Doing this with folio objects is tricky: folio_memcg() is not stable and requires either RCU protection or pinning the cgroup. But it's desirable to make the existence check lightweight under RCU, and only pin the memcg when we need to allocate list_lru heads and may block. In preparation for switching the THP shrinker to list_lru, add a helper function for allocating list_lru heads coming from a folio. Link: https://lore.kernel.org/20260527204757.2544958-8-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: David Hildenbrand (Arm) Acked-by: Shakeel Butt Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Baolin Wang Cc: Barry Song Cc: Dave Chinner Cc: Dev Jain Cc: Kairui Song Cc: Lance Yang Cc: Liam R. Howlett Cc: Michal Hocko Cc: Mikhail Zaslonko Cc: Muchun Song Cc: Nico Pache Cc: Roman Gushchin Cc: Ryan Roberts Cc: Usama Arif Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/list_lru.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include') diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 134cb3e5652a..a450fffe1550 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -81,6 +81,33 @@ static inline int list_lru_init_memcg_key(struct list_lru *lru, struct shrinker int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, gfp_t gfp); + +#ifdef CONFIG_MEMCG +/** + * folio_memcg_list_lru_alloc - allocate list_lru heads for shrinkable folio + * @folio: the newly allocated & charged folio + * @lru: the list_lru this might be queued on + * @gfp: gfp mask + * + * Allocate list_lru heads (per-memcg, per-node) needed to queue this + * particular folio down the line. + * + * This does memcg_list_lru_alloc(), but on the memcg that @folio is + * associated with. Handles folio_memcg() access rules in the fast + * path (list_lru heads allocated) and the allocation slowpath. + * + * Returns 0 on success, a negative error value otherwise. + */ +int folio_memcg_list_lru_alloc(struct folio *folio, struct list_lru *lru, + gfp_t gfp); +#else +static inline int folio_memcg_list_lru_alloc(struct folio *folio, + struct list_lru *lru, gfp_t gfp) +{ + return 0; +} +#endif + void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent); /** -- cgit v1.2.3 From fafaeceb89a5e2e856ff04c2cacb6cae4a2ecb67 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 27 May 2026 16:45:16 -0400 Subject: mm: switch deferred split shrinker to list_lru The deferred split queue handles cgroups in a suboptimal fashion. The queue is per-NUMA node or per-cgroup, not the intersection. That means on a cgrouped system, a node-restricted allocation entering reclaim can end up splitting large pages on other nodes: alloc/unmap deferred_split_folio() list_add_tail(memcg->split_queue) set_shrinker_bit(memcg, node, deferred_shrinker_id) for_each_zone_zonelist_nodemask(restricted_nodes) mem_cgroup_iter() shrink_slab(node, memcg) shrink_slab_memcg(node, memcg) if test_shrinker_bit(memcg, node, deferred_shrinker_id) deferred_split_scan() walks memcg->split_queue The shrinker bit adds an imperfect guard rail. As soon as the cgroup has a single large page on the node of interest, all large pages owned by that memcg, including those on other nodes, will be split. list_lru properly sets up per-node, per-cgroup lists. As a bonus, it streamlines a lot of the list operations and reclaim walks. It's used widely by other major shrinkers already. Convert the deferred split queue as well. The list_lru per-memcg heads are instantiated on demand when the first object of interest is allocated for a cgroup, by calling folio_memcg_alloc_deferred(). Add calls to where splittable pages are created: anon faults, swapin faults, khugepaged collapse. These calls create all possible node heads for the cgroup at once, so the migration code (between nodes) doesn't need any special care. [akpm@linux-foundation.org: fix build with CONFIG_TRANSPARENT_HUGEPAGE=n] Link: https://lore.kernel.org/202605281620.lc3rtkBm-lkp@intel.com [hannes@cmpxchg.org: fix cgroup.memory=nokmem handling] Link: https://lore.kernel.org/ah9PGv12mqai84ES@cmpxchg.org Link: https://lore.kernel.org/20260527204757.2544958-10-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: Mikhail Zaslonko Tested-by: Mikhail Zaslonko Acked-by: Shakeel Butt Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: Usama Arif Reviewed-by: Kairui Song Cc: Baolin Wang Cc: Barry Song Cc: Dave Chinner Cc: David Hildenbrand (Arm) Cc: Dev Jain Cc: Lance Yang Cc: Liam R. Howlett Cc: Michal Hocko Cc: Muchun Song Cc: Nico Pache Cc: Roman Gushchin Cc: Ryan Roberts Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Zi Yan Cc: kernel test robot Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 17 ++++++++++++----- include/linux/memcontrol.h | 4 ---- include/linux/mmzone.h | 12 ------------ 3 files changed, 12 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 58382e97a66d..c0d223d0c556 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -439,10 +439,10 @@ static inline int split_huge_page(struct page *page) { return split_huge_page_to_list_to_order(page, NULL, 0); } + +int folio_memcg_alloc_deferred(struct folio *folio); + void deferred_split_folio(struct folio *folio, bool partially_mapped); -#ifdef CONFIG_MEMCG -void reparent_deferred_split_queue(struct mem_cgroup *memcg); -#endif void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze); @@ -679,8 +679,15 @@ static inline int try_folio_split_to_order(struct folio *folio, return -EINVAL; } -static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {} -static inline void reparent_deferred_split_queue(struct mem_cgroup *memcg) {} +static inline int folio_memcg_alloc_deferred(struct folio *folio) +{ + return 0; +} + +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) +{ +} + #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 8f2662db166b..e1f46a0016fc 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -278,10 +278,6 @@ struct mem_cgroup { struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT]; #endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - struct deferred_split deferred_split_queue; -#endif - #ifdef CONFIG_LRU_GEN_WALKS_MMU /* per-memcg mm_struct list */ struct lru_gen_mm_list mm_list; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1331a7b93f33..8e449f524f26 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1431,14 +1431,6 @@ struct zonelist { */ extern struct page *mem_map; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -struct deferred_split { - spinlock_t split_queue_lock; - struct list_head split_queue; - unsigned long split_queue_len; -}; -#endif - #ifdef CONFIG_MEMORY_FAILURE /* * Per NUMA node memory failure handling statistics. @@ -1564,10 +1556,6 @@ typedef struct pglist_data { unsigned long first_deferred_pfn; #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - struct deferred_split deferred_split_queue; -#endif - #ifdef CONFIG_NUMA_BALANCING /* start time in ms of current promote rate limit period */ unsigned int nbp_rl_start; -- cgit v1.2.3 From ed384eb3a3e121c1d6d5c5d36950fbd286b92026 Mon Sep 17 00:00:00 2001 From: fujunjie Date: Tue, 26 May 2026 12:22:41 +0000 Subject: mm/compaction: respect cpusets when checking retry suitability should_compact_retry() handles COMPACT_SKIPPED by asking compaction_zonelist_suitable() whether reclaim can make a later compaction attempt worthwhile. That answer is used for the current allocation, so it should follow the same zone eligibility rules as the allocation itself. When cpusets are enabled, allocator slowpath decisions are marked with ALLOC_CPUSET. The allocation path, direct compaction and reclaim retry all skip zones rejected by __cpuset_zone_allowed(). compaction_zonelist_suitable() does not apply that filter. It only walks ac->zonelist/ac->nodemask, so it can return true because a zone that is not usable for the current allocation would pass __compaction_suitable(). That does not let the allocation use the disallowed zone. Later allocation and direct compaction paths still apply cpuset filtering. However, it can make should_compact_retry() retry based on memory that this allocation cannot use. Pass gfp_mask down and apply the same ALLOC_CPUSET check in compaction_zonelist_suitable(). This keeps the retry decision aligned with the zones that the allocation is allowed to use. A temporary debugfs probe was also used to call the old and new compaction_zonelist_suitable() predicates in the same two-node NUMA guest. The task was restricted to mems=0 while ac->nodemask covered nodes 0-1. After putting pressure on node0, node0 failed __compaction_suitable() for order-10 and node1 passed it, but node1 was rejected by __cpuset_zone_allowed(). In that state the old predicate returned true and the patched predicate returned false. Link: https://lore.kernel.org/tencent_F59F2BA2CC5779308E10DF54593C736D3E0A@qq.com Fixes: 435b3894e742 ("mm:page_alloc: fix the NULL ac->nodemask in __alloc_pages_slowpath()") Signed-off-by: fujunjie Reviewed-by: Vlastimil Babka (SUSE) Cc: Brendan Jackman Cc: Johannes Weiner Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/compaction.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 173d9c07a895..c829c48d1c71 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -101,7 +101,7 @@ extern void compaction_defer_reset(struct zone *zone, int order, bool alloc_success); bool compaction_zonelist_suitable(struct alloc_context *ac, int order, - int alloc_flags); + int alloc_flags, gfp_t gfp_mask); extern void __meminit kcompactd_run(int nid); extern void __meminit kcompactd_stop(int nid); -- cgit v1.2.3 From cdea4acce026f4dc6a1689cb991a2bf3a4333ecd Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Mon, 1 Jun 2026 11:40:09 +0000 Subject: mm: delete stale comment about cachelines These comments have been wrong since commit a211c6550efc ("mm: page_alloc: defrag_mode kswapd/kcompactd watermarks") added NR_FREE_PAGES_BLOCKS. Since nobody has complained about it in the last year, it seems unlikely these comments were particularly useful anyway, so delete them. Link: https://lore.kernel.org/20260601-zone_stat_item-comment-v1-1-f452dd91d5eb@google.com Signed-off-by: Brendan Jackman Acked-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes Acked-by: Vlastimil Babka (SUSE) Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8e449f524f26..ca2712187147 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -214,7 +214,6 @@ enum numa_stat_item { #endif enum zone_stat_item { - /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, NR_FREE_PAGES_BLOCKS, NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */ @@ -225,7 +224,6 @@ enum zone_stat_item { NR_ZONE_UNEVICTABLE, NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ - /* Second 128 byte cacheline */ #if IS_ENABLED(CONFIG_ZSMALLOC) NR_ZSPAGES, /* allocated in zsmalloc */ #endif -- cgit v1.2.3 From cc7a9f6e57c4f71e8e1fee3274b1ae8770f2a743 Mon Sep 17 00:00:00 2001 From: "Kiryl Shutsemau (Meta)" Date: Fri, 29 May 2026 18:23:30 +0100 Subject: userfaultfd: build __VMA_UFFD_FLAGS from config-gated masks The VMA flags bitmap is a single word today: NUM_VMA_FLAG_BITS is BITS_PER_LONG, so on 32-bit vma_flags_t holds only 32 bits. (The bitmap type exists so this can grow past BITS_PER_LONG later; until it does, anything declared above the first word is out of range on 32-bit.) The bit enum nevertheless declares some bits unconditionally above BITS_PER_LONG -- VMA_UFFD_MINOR_BIT is 41, with VM_UFFD_MINOR == VM_NONE on 32-bit so no VMA actually carries the bit. __VMA_UFFD_FLAGS feeds VMA_UFFD_MINOR_BIT to mk_vma_flags() unconditionally. On 32-bit that becomes __set_bit(41, &one_long), a write one word past the end of the single-word bitmap. The compiler folds the out-of-bounds store with wraparound (1UL << (41 % 32) == bit 9) into the first word; bit 9 is already in __VMA_UFFD_FLAGS so the mask happens to come out right today, but it is an out-of-bounds write all the same, and any high-numbered bit whose mod-BITS_PER_LONG position is otherwise unused would silently OR an extra bit into the mask. Rather than feed bit numbers that may not exist on the current build to mk_vma_flags(), build the mask from whole per-mode masks that collapse to EMPTY_VMA_FLAGS when their feature is unavailable. Add mk_vma_flags_from_masks() for that, and define VMA_UFFD_MISSING / _WP / _MINOR alongside the VM_UFFD_* flags, gating VMA_UFFD_MINOR on the same config as VM_UFFD_MINOR (which implies 64BIT, where bit 41 fits). An out-of-range bit is then never materialised, on any arch, and the in-range fast path stays a compile-time constant. Link: https://lore.kernel.org/20260529172331.356655-7-kas@kernel.org Fixes: 9ea35a25d51b ("mm: introduce VMA flags bitmap type") Signed-off-by: Kiryl Shutsemau Reported-by: Sashiko AI review Suggested-by: Lorenzo Stoakes Reviewed-by: Lorenzo Stoakes Assisted-by: Claude:claude-opus-4-8 Cc: David Hildenbrand Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Balbir Singh Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 39 +++++++++++++++++++++++++++++++++++++++ include/linux/userfaultfd_k.h | 4 ++-- 2 files changed, 41 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0f2612a70fb1..485df9c2dbdd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -496,6 +496,21 @@ enum { #else #define VM_UFFD_MINOR VM_NONE #endif + +/* + * vma_flags_t masks for the userfaultfd VMA flags. VMA_UFFD_MINOR is gated on + * the same config as VM_UFFD_MINOR -- which implies 64BIT, where the bit fits + * -- so an out-of-range bit is never fed to mk_vma_flags() on a build whose + * bitmap cannot hold it. + */ +#define VMA_UFFD_MISSING mk_vma_flags(VMA_UFFD_MISSING_BIT) +#define VMA_UFFD_WP mk_vma_flags(VMA_UFFD_WP_BIT) +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR +#define VMA_UFFD_MINOR mk_vma_flags(VMA_UFFD_MINOR_BIT) +#else +#define VMA_UFFD_MINOR EMPTY_VMA_FLAGS +#endif + #ifdef CONFIG_64BIT #define VM_ALLOW_ANY_UNCACHED INIT_VM_FLAG(ALLOW_ANY_UNCACHED) #define VM_SEALED INIT_VM_FLAG(SEALED) @@ -1238,6 +1253,30 @@ static __always_inline void vma_flags_set_mask(vma_flags_t *flags, #define vma_flags_set(flags, ...) \ vma_flags_set_mask(flags, mk_vma_flags(__VA_ARGS__)) +static __always_inline vma_flags_t __mk_vma_flags_from_masks(size_t count, + const vma_flags_t *masks) +{ + vma_flags_t flags = EMPTY_VMA_FLAGS; + size_t i; + + for (i = 0; i < count; i++) + vma_flags_set_mask(&flags, masks[i]); + return flags; +} + +/* + * Combine pre-computed vma_flags_t masks into one value, e.g.: + * + * vma_flags_t flags = mk_vma_flags_from_masks(VMA_UFFD_WP, VMA_UFFD_MINOR); + * + * Unlike mk_vma_flags(), which takes bit numbers, this takes whole masks -- + * each of which may be EMPTY_VMA_FLAGS when its feature is unavailable -- so a + * bit that does not exist on the current build is never materialised. + */ +#define mk_vma_flags_from_masks(...) \ + __mk_vma_flags_from_masks(COUNT_ARGS(__VA_ARGS__), \ + (const vma_flags_t []){__VA_ARGS__}) + /* Clear all of the to-clear flags in flags, non-atomically. */ static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, vma_flags_t to_clear) diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 3ec8e1071673..68edac4dcd78 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -23,8 +23,8 @@ /* The set of all possible UFFD-related VM flags. */ #define __VM_UFFD_FLAGS (VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR) -#define __VMA_UFFD_FLAGS mk_vma_flags(VMA_UFFD_MISSING_BIT, VMA_UFFD_WP_BIT, \ - VMA_UFFD_MINOR_BIT) +#define __VMA_UFFD_FLAGS mk_vma_flags_from_masks(VMA_UFFD_MISSING, VMA_UFFD_WP, \ + VMA_UFFD_MINOR) /* * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining -- cgit v1.2.3 From c55dd3b46c1208d6d2ea737a8aefef4aa4c70cb8 Mon Sep 17 00:00:00 2001 From: Hui Zhu Date: Fri, 29 May 2026 09:41:30 +0800 Subject: vmalloc: fix NULL pointer dereference in is_vm_area_hugepages() find_vm_area() can return NULL if the given address is not a valid vmalloc area. Check the return value before dereferencing it to avoid a kernel crash. Link: https://lore.kernel.org/20260529014130.671291-1-hui.zhu@linux.dev Fixes: 121e6f3258fe ("mm/vmalloc: hugepage vmalloc mappings") Signed-off-by: Hui Zhu Reviewed-by: Dev Jain Reviewed-by: Uladzislau Rezki (Sony) Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- include/linux/vmalloc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 3b02c0c6b371..d87dc7f77f4e 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -265,7 +265,9 @@ static inline bool is_vm_area_hugepages(const void *addr) * allocated in the vmalloc layer. */ #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC - return find_vm_area(addr)->page_order > 0; + struct vm_struct *area = find_vm_area(addr); + + return area && area->page_order > 0; #else return false; #endif -- cgit v1.2.3 From c13a0316aef5f4b73e8b4bf6943737f836d65e1d Mon Sep 17 00:00:00 2001 From: Youngjun Park Date: Tue, 24 Mar 2026 01:08:21 +0900 Subject: mm/swap, PM: hibernate: fix swapoff race in uswsusp by pinning swap device Patch series "mm/swap, PM: hibernate: fix swapoff race in uswsusp by pinning swap device", v8. Currently, in the uswsusp path, only the swap type value is retrieved at lookup time without holding a reference. If swapoff races after the type is acquired, subsequent slot allocations operate on a stale swap device. Additionally, grabbing and releasing the swap device reference on every slot allocation is inefficient across the entire hibernation swap path. This patch series addresses these issues: - Patch 1: Fixes the swapoff race in uswsusp by pinning the swap device from the point it is looked up until the session completes. - Patch 2: Removes the overhead of per-slot reference counting in alloc/free paths and cleans up the redundant SWP_WRITEOK check. This patch (of 2): Hibernation via uswsusp (/dev/snapshot ioctls) has a race window: after selecting the resume swap area but before user space is frozen, swapoff may run and invalidate the selected swap device. Fix this by pinning the swap device with SWP_HIBERNATION while it is in use. The pin is exclusive, which is sufficient since hibernate_acquire() already prevents concurrent hibernation sessions. The kernel swsusp path (sysfs-based hibernate/resume) uses find_hibernation_swap_type() which is not affected by the pin. It freezes user space before touching swap, so swapoff cannot race. Introduce dedicated helpers: - pin_hibernation_swap_type(): Look up and pin the swap device. Used by the uswsusp path. - find_hibernation_swap_type(): Lookup without pinning. Used by the kernel swsusp path. - unpin_hibernation_swap_type(): Clear the hibernation pin. While a swap device is pinned, swapoff is prevented from proceeding. Link: https://lore.kernel.org/20260323160822.1409904-1-youngjun.park@lge.com Link: https://lore.kernel.org/20260323160822.1409904-2-youngjun.park@lge.com Signed-off-by: Youngjun Park Reviewed-by: Kairui Song Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Kemeng Shi Cc: Nhat Pham Cc: "Rafael J . Wysocki" Signed-off-by: Andrew Morton --- include/linux/swap.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 8c43bc3055c9..8f0f68e245ba 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -213,6 +213,7 @@ enum { SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */ SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ + SWP_HIBERNATION = (1 << 13), /* pinned for hibernation */ /* add others here before... */ }; @@ -432,7 +433,9 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -int swap_type_of(dev_t device, sector_t offset); +extern int pin_hibernation_swap_type(dev_t device, sector_t offset); +extern void unpin_hibernation_swap_type(int type); +extern int find_hibernation_swap_type(dev_t device, sector_t offset); int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); extern sector_t swapdev_block(int, pgoff_t); -- cgit v1.2.3 From 32a2b73ec232b284b029d34bcfaa9a7f424151d2 Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Wed, 3 Jun 2026 23:17:25 -0700 Subject: mm/compaction: cap compact_gap() at COMPACT_CLUSTER_MAX compact_gap() returns 2 << order, which is used as watermark headroom in __compaction_suitable() and as a threshold in kswapd reclaim decisions. The computed value scales exponentially by order. For order-9 THP allocations this evaluates to 1024 pages, but the compaction free scanner's working set is bounded by COMPACT_CLUSTER_MAX (32 pages). The scanner stops isolating free pages once it matches the migration batch. The current gap over-reserves by 32x. On fragmented production hosts, kswapd will try to reclaim up to the gap, but it only reaches that threshold in 18% of attempts. As a result, reclaim continues in the majority of cases despite many lower-order free pages being available. The over-sized gap also causes 46% of order-9 compaction suitability checks to fail unnecessarily: the zone has sufficient free pages for the scanner to operate, but not enough to clear the inflated threshold. Cap compact_gap() at COMPACT_CLUSTER_MAX so the watermark headroom reflects the scanner's actual capacity. This function is used by two key heuristics. The first is when kswapd can stop high-order reclaim and downgrade to order-0 balancing, allowing kcompactd to be woken for the original higher allocation order. The second is zone suitability checking, where the smaller gap allows compaction to start sooner. Note that orders 0-4 are unaffected since their gap is already less than or equal to COMPACT_CLUSTER_MAX. A/B test on v6.13-based instagram production hosts (64GB, 60s measurement): Unpatched (43 hosts) pgscan_kswapd (mean/host): ~1.6M reclaim efficiency (steal/scan): 83.8% per-compaction success (success/stall): 2.1% THP success (alloc/alloc+fallback): 4.9% forced lru_add_drain (mean/host): ~107K Patched (59 hosts) pgscan_kswapd (mean/host): ~449K reclaim efficiency (steal/scan): 91.0% per-compaction success (success/stall): 28.3% THP success (alloc/alloc+fallback): 17.2% forced lru_add_drain (mean/host): ~64K Additional tests were also performed using a workload of similar shape and based on mm-new at the time of testing. Across three 60s runs, the patch showed improvements consistent with the previous test: reduced kswapd reclaim and fewer THP fault fallbacks. Unpatched kswapd_shrink_node downgrade to order-0 (mean): 0 thp_fault_fallback (mean): 1217 pgscan_kswapd (mean): 6328 pgsteal_kswapd (mean): 5657 Patched kswapd_shrink_node downgrade to order-0 (mean): 28 thp_fault_fallback (mean): 738 pgscan_kswapd (mean): 3773 pgsteal_kswapd (mean): 3243 Link: https://lore.kernel.org/20260604061725.13800-1-jp.kobryn@linux.dev Signed-off-by: JP Kobryn (Meta) Reviewed-by: Vlastimil Babka (SUSE) Cc: Brendan Jackman Cc: Johannes Weiner Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/compaction.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index c829c48d1c71..f29ef0653546 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -2,6 +2,8 @@ #ifndef _LINUX_COMPACTION_H #define _LINUX_COMPACTION_H +#include + /* * Determines how hard direct compaction should try to succeed. * Lower value means higher priority, analogically to reclaim priority. @@ -73,11 +75,9 @@ static inline unsigned long compact_gap(unsigned int order) * effectively limited by COMPACT_CLUSTER_MAX, as that's the maximum * that the migrate scanner can have isolated on migrate list, and free * scanner is only invoked when the number of isolated free pages is - * lower than that. But it's not worth to complicate the formula here - * as a bigger gap for higher orders than strictly necessary can also - * improve chances of compaction success. + * lower than that. */ - return 2UL << order; + return min(2UL << order, COMPACT_CLUSTER_MAX); } static inline int current_is_kcompactd(void) -- cgit v1.2.3