summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
Diffstat (limited to 'include')
-rw-r--r--include/asm-generic/pgalloc.h2
-rw-r--r--include/linux/bootmem_info.h1
-rw-r--r--include/linux/compaction.h10
-rw-r--r--include/linux/damon.h136
-rw-r--r--include/linux/gfp.h4
-rw-r--r--include/linux/gfp_types.h6
-rw-r--r--include/linux/highmem-internal.h2
-rw-r--r--include/linux/huge_mm.h47
-rw-r--r--include/linux/list_lru.h70
-rw-r--r--include/linux/memcontrol.h31
-rw-r--r--include/linux/memory.h7
-rw-r--r--include/linux/memory_hotplug.h8
-rw-r--r--include/linux/mm.h62
-rw-r--r--include/linux/mm_inline.h2
-rw-r--r--include/linux/mm_types.h19
-rw-r--r--include/linux/mmu_notifier.h4
-rw-r--r--include/linux/mmzone.h23
-rw-r--r--include/linux/nodemask.h18
-rw-r--r--include/linux/page_ref.h18
-rw-r--r--include/linux/pageblock-flags.h6
-rw-r--r--include/linux/pagemap.h2
-rw-r--r--include/linux/swap.h24
-rw-r--r--include/linux/swap_cgroup.h47
-rw-r--r--include/linux/thread_info.h2
-rw-r--r--include/linux/userfaultfd_k.h40
-rw-r--r--include/linux/vmalloc.h4
-rw-r--r--include/linux/vmpressure.h9
-rw-r--r--include/trace/events/damon.h38
-rw-r--r--include/trace/events/vmscan.h52
29 files changed, 465 insertions, 229 deletions
diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index 57137d3ac159..051aa1331051 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -4,7 +4,7 @@
#ifdef CONFIG_MMU
-#define GFP_PGTABLE_KERNEL (GFP_KERNEL | __GFP_ZERO)
+#define GFP_PGTABLE_KERNEL (GFP_KERNEL | __GFP_ZERO | __GFP_SKIP_KASAN)
#define GFP_PGTABLE_USER (GFP_PGTABLE_KERNEL | __GFP_ACCOUNT)
/**
diff --git a/include/linux/bootmem_info.h b/include/linux/bootmem_info.h
index 492ceeb1cdf8..f724340755e5 100644
--- a/include/linux/bootmem_info.h
+++ b/include/linux/bootmem_info.h
@@ -82,7 +82,6 @@ static inline void get_page_bootmem(unsigned long info, struct page *page,
static inline void free_bootmem_page(struct page *page)
{
- kmemleak_free_part_phys(PFN_PHYS(page_to_pfn(page)), PAGE_SIZE);
free_reserved_page(page);
}
#endif
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 173d9c07a895..f29ef0653546 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -2,6 +2,8 @@
#ifndef _LINUX_COMPACTION_H
#define _LINUX_COMPACTION_H
+#include <linux/swap.h>
+
/*
* Determines how hard direct compaction should try to succeed.
* Lower value means higher priority, analogically to reclaim priority.
@@ -73,11 +75,9 @@ static inline unsigned long compact_gap(unsigned int order)
* effectively limited by COMPACT_CLUSTER_MAX, as that's the maximum
* that the migrate scanner can have isolated on migrate list, and free
* scanner is only invoked when the number of isolated free pages is
- * lower than that. But it's not worth to complicate the formula here
- * as a bigger gap for higher orders than strictly necessary can also
- * improve chances of compaction success.
+ * lower than that.
*/
- return 2UL << order;
+ return min(2UL << order, COMPACT_CLUSTER_MAX);
}
static inline int current_is_kcompactd(void)
@@ -101,7 +101,7 @@ extern void compaction_defer_reset(struct zone *zone, int order,
bool alloc_success);
bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
- int alloc_flags);
+ int alloc_flags, gfp_t gfp_mask);
extern void __meminit kcompactd_run(int nid);
extern void __meminit kcompactd_stop(int nid);
diff --git a/include/linux/damon.h b/include/linux/damon.h
index f2cdb7c3f5e6..6f7edb3590ef 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -8,23 +8,20 @@
#ifndef _DAMON_H_
#define _DAMON_H_
+#include <linux/math64.h>
#include <linux/memcontrol.h>
#include <linux/mutex.h>
+#include <linux/prandom.h>
#include <linux/time64.h>
#include <linux/types.h>
-#include <linux/random.h>
/* Minimal region size. Every damon_region is aligned by this. */
#define DAMON_MIN_REGION_SZ PAGE_SIZE
+/* Maximum number of monitoring probes. */
+#define DAMON_MAX_PROBES (4)
/* Max priority score for DAMON-based operation schemes */
#define DAMOS_MAX_SCORE (99)
-/* Get a random number in [l, r) */
-static inline unsigned long damon_rand(unsigned long l, unsigned long r)
-{
- return l + get_random_u32_below(r - l);
-}
-
/**
* struct damon_addr_range - Represents an address region of [@start, @end).
* @start: Start address of the region (inclusive).
@@ -52,6 +49,7 @@ struct damon_size_range {
* @nr_accesses: Access frequency of this region.
* @nr_accesses_bp: @nr_accesses in basis point (0.01%) that updated for
* each sampling interval.
+ * @probe_hits: Number of probe-positive region samples.
* @list: List head for siblings.
* @age: Age of this region.
*
@@ -80,6 +78,7 @@ struct damon_region {
unsigned long sampling_addr;
unsigned int nr_accesses;
unsigned int nr_accesses_bp;
+ unsigned char probe_hits[DAMON_MAX_PROBES];
struct list_head list;
unsigned int age;
@@ -121,6 +120,7 @@ struct damon_target {
* @DAMOS_PAGEOUT: Reclaim the region.
* @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE.
* @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE.
+ * @DAMOS_COLLAPSE: Call ``madvise()`` for the region with MADV_COLLAPSE.
* @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists.
* @DAMOS_LRU_DEPRIO: Deprioritize the region on its LRU lists.
* @DAMOS_MIGRATE_HOT: Migrate the regions prioritizing warmer regions.
@@ -140,6 +140,7 @@ enum damos_action {
DAMOS_PAGEOUT,
DAMOS_HUGEPAGE,
DAMOS_NOHUGEPAGE,
+ DAMOS_COLLAPSE,
DAMOS_LRU_PRIO,
DAMOS_LRU_DEPRIO,
DAMOS_MIGRATE_HOT,
@@ -159,6 +160,8 @@ enum damos_action {
* @DAMOS_QUOTA_NODE_MEMCG_FREE_BP: MemFree ratio of a node for a cgroup.
* @DAMOS_QUOTA_ACTIVE_MEM_BP: Active to total LRU memory ratio.
* @DAMOS_QUOTA_INACTIVE_MEM_BP: Inactive to total LRU memory ratio.
+ * @DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP: Scheme-eligible memory ratio of a
+ * node in basis points (0-10000).
* @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics.
*
* Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported.
@@ -172,6 +175,7 @@ enum damos_quota_goal_metric {
DAMOS_QUOTA_NODE_MEMCG_FREE_BP,
DAMOS_QUOTA_ACTIVE_MEM_BP,
DAMOS_QUOTA_INACTIVE_MEM_BP,
+ DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP,
NR_DAMOS_QUOTA_GOAL_METRICS,
};
@@ -233,6 +237,8 @@ enum damos_quota_goal_tuner {
* @goals: Head of quota tuning goals (&damos_quota_goal) list.
* @goal_tuner: Goal-based @esz tuning algorithm to use.
* @esz: Effective size quota in bytes.
+ * @fail_charge_num: Failed regions charge rate numerator.
+ * @fail_charge_denom: Failed regions charge rate denominator.
*
* @weight_sz: Weight of the region's size for prioritization.
* @weight_nr_accesses: Weight of the region's nr_accesses for prioritization.
@@ -262,6 +268,10 @@ enum damos_quota_goal_tuner {
*
* The resulting effective size quota in bytes is set to @esz.
*
+ * Charging regions for which the DAMOS action failed at the same rate as
+ * those for which it succeeded may be unfair. For that reason, failed regions
+ * are charged 'the size * @fail_charge_num / @fail_charge_denom' instead.
+ *
* For selecting regions within the quota, DAMON prioritizes current scheme's
* target memory regions using the &struct damon_operations->get_scheme_score.
* You could customize the prioritization logic by setting &weight_sz,
@@ -276,6 +286,9 @@ struct damos_quota {
enum damos_quota_goal_tuner goal_tuner;
unsigned long esz;
+ unsigned int fail_charge_num;
+ unsigned int fail_charge_denom;
+
unsigned int weight_sz;
unsigned int weight_nr_accesses;
unsigned int weight_age;
@@ -617,6 +630,7 @@ enum damon_ops_id {
* @update: Update operations-related data structures.
* @prepare_access_checks: Prepare next access check of target regions.
* @check_accesses: Check the accesses to target regions.
+ * @apply_probes: Apply probes for each region.
* @get_scheme_score: Get the score of a region for a scheme.
* @apply_scheme: Apply a DAMON-based operation scheme.
* @target_valid: Determine if the target is valid.
@@ -643,6 +657,8 @@ enum damon_ops_id {
* last preparation and update the number of observed accesses of each region.
* It should also return max number of observed accesses that made as a result
* of its update. The value will be used for regions adjustment threshold.
+ * @apply_probes should apply the data attribute probes to each region and
+ * accordingly update the probe hits counter of the region.
* @get_scheme_score should return the priority score of a region for a scheme
* as an integer in [0, &DAMOS_MAX_SCORE].
* @apply_scheme is called from @kdamond when a region for user provided
@@ -660,6 +676,7 @@ struct damon_operations {
void (*update)(struct damon_ctx *context);
void (*prepare_access_checks)(struct damon_ctx *context);
unsigned int (*check_accesses)(struct damon_ctx *context);
+ void (*apply_probes)(struct damon_ctx *context);
int (*get_scheme_score)(struct damon_ctx *context,
struct damon_region *r, struct damos *scheme);
unsigned long (*apply_scheme)(struct damon_ctx *context,
@@ -722,6 +739,47 @@ struct damon_intervals_goal {
};
/**
+ * enum damon_filter_type - Type of &struct damon_filter
+ *
+ * @DAMON_FILTER_TYPE_ANON: Anonymous pages.
+ * @DAMON_FILTER_TYPE_MEMCG: Specific memcg's pages.
+ */
+enum damon_filter_type {
+ DAMON_FILTER_TYPE_ANON,
+ DAMON_FILTER_TYPE_MEMCG,
+};
+
+/**
+ * struct damon_filter - DAMON region filter for &struct damon_probe.
+ *
+ * @type: Type of the region.
+ * @matching: Whether this filter is for the type-matching ones.
+ * @allow: Whether the @type-@matching ones should pass this filter.
+ * @memcg_id: Memcg id in question if @type is DAMON_FILTER_TYPE_MEMCG.
+ * @list: Siblings list.
+ */
+struct damon_filter {
+ enum damon_filter_type type;
+ bool matching;
+ bool allow;
+ union {
+ u64 memcg_id;
+ };
+ struct list_head list;
+};
+
+/**
+ * struct damon_probe - Data region attribute probe.
+ *
+ * @filters: Filters for assessing if a given region is for this probe.
+ * @list: Siblings list.
+ */
+struct damon_probe {
+ struct list_head filters;
+ struct list_head list;
+};
+
+/**
* struct damon_attrs - Monitoring attributes for accuracy/overhead control.
*
* @sample_interval: The time between access samplings.
@@ -787,6 +845,7 @@ struct damon_attrs {
* @ops: Set of monitoring operations for given use cases.
* @addr_unit: Scale factor for core to ops address conversion.
* @min_region_sz: Minimum region size.
+ * @pause: Pause kdamond main loop.
* @adaptive_targets: Head of monitoring targets (&damon_target) list.
* @schemes: Head of schemes (&damos) list.
*/
@@ -838,13 +897,34 @@ struct damon_ctx {
/* public: */
struct damon_operations ops;
+ struct list_head probes;
unsigned long addr_unit;
unsigned long min_region_sz;
+ bool pause;
struct list_head adaptive_targets;
struct list_head schemes;
+
+ /* Per-ctx PRNG state for damon_rand(); kdamond is the sole consumer. */
+ struct rnd_state rnd_state;
};
+/* Get a random number in [@l, @r) using @ctx's lockless PRNG. */
+static inline unsigned long damon_rand(struct damon_ctx *ctx,
+ unsigned long l, unsigned long r)
+{
+ unsigned long span = r - l;
+ u64 rnd;
+
+ if (span <= U32_MAX) {
+ rnd = prandom_u32_state(&ctx->rnd_state);
+ return l + (unsigned long)((rnd * span) >> 32);
+ }
+ rnd = ((u64)prandom_u32_state(&ctx->rnd_state) << 32) |
+ prandom_u32_state(&ctx->rnd_state);
+ return l + mul_u64_u64_shr(rnd, span, 64);
+}
+
static inline struct damon_region *damon_next_region(struct damon_region *r)
{
return container_of(r->list.next, struct damon_region, list);
@@ -870,15 +950,26 @@ static inline unsigned long damon_sz_region(struct damon_region *r)
return r->ar.end - r->ar.start;
}
+#define damon_for_each_filter(f, p) \
+ list_for_each_entry(f, &(p)->filters, list)
+
+#define damon_for_each_filter_safe(f, next, p) \
+ list_for_each_entry_safe(f, next, &(p)->filters, list)
+
+#define damon_for_each_probe(p, ctx) \
+ list_for_each_entry(p, &(ctx)->probes, list)
+
+#define damon_for_each_probe_safe(p, next, ctx) \
+ list_for_each_entry_safe(p, next, &(ctx)->probes, list)
#define damon_for_each_region(r, t) \
- list_for_each_entry(r, &t->regions_list, list)
+ list_for_each_entry(r, &(t)->regions_list, list)
#define damon_for_each_region_from(r, t) \
- list_for_each_entry_from(r, &t->regions_list, list)
+ list_for_each_entry_from(r, &(t)->regions_list, list)
#define damon_for_each_region_safe(r, next, t) \
- list_for_each_entry_safe(r, next, &t->regions_list, list)
+ list_for_each_entry_safe(r, next, &(t)->regions_list, list)
#define damon_for_each_target(t, ctx) \
list_for_each_entry(t, &(ctx)->adaptive_targets, list)
@@ -893,7 +984,7 @@ static inline unsigned long damon_sz_region(struct damon_region *r)
list_for_each_entry_safe(s, next, &(ctx)->schemes, list)
#define damos_for_each_quota_goal(goal, quota) \
- list_for_each_entry(goal, &quota->goals, list)
+ list_for_each_entry(goal, &(quota)->goals, list)
#define damos_for_each_quota_goal_safe(goal, next, quota) \
list_for_each_entry_safe(goal, next, &(quota)->goals, list)
@@ -912,21 +1003,16 @@ static inline unsigned long damon_sz_region(struct damon_region *r)
#ifdef CONFIG_DAMON
-struct damon_region *damon_new_region(unsigned long start, unsigned long end);
+struct damon_filter *damon_new_filter(enum damon_filter_type type,
+ bool matching, bool allow);
+void damon_add_filter(struct damon_probe *probe, struct damon_filter *f);
+void damon_destroy_filter(struct damon_filter *f);
-/*
- * Add a region between two other regions
- */
-static inline void damon_insert_region(struct damon_region *r,
- struct damon_region *prev, struct damon_region *next,
- struct damon_target *t)
-{
- __list_add(&r->list, &prev->list, &next->list);
- t->nr_regions++;
-}
+struct damon_probe *damon_new_probe(void);
+void damon_add_probe(struct damon_ctx *ctx, struct damon_probe *probe);
+
+struct damon_region *damon_new_region(unsigned long start, unsigned long end);
-void damon_add_region(struct damon_region *r, struct damon_target *t);
-void damon_destroy_region(struct damon_region *r, struct damon_target *t);
int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
unsigned int nr_ranges, unsigned long min_region_sz);
void damon_update_region_access_rate(struct damon_region *r, bool accessed,
@@ -994,7 +1080,7 @@ int damon_kdamond_pid(struct damon_ctx *ctx);
int damon_call(struct damon_ctx *ctx, struct damon_call_control *control);
int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control);
-int damon_set_region_biggest_system_ram_default(struct damon_target *t,
+int damon_set_region_system_rams_default(struct damon_target *t,
unsigned long *start, unsigned long *end,
unsigned long addr_unit,
unsigned long min_region_sz);
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 51ef13ed756e..cdf95a9f0b87 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -239,6 +239,8 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
struct page **page_array);
#define __alloc_pages_bulk(...) alloc_hooks(alloc_pages_bulk_noprof(__VA_ARGS__))
+void free_pages_bulk(struct page **page_array, unsigned long nr_pages);
+
unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,
unsigned long nr_pages,
struct page **page_array);
@@ -467,6 +469,8 @@ void free_contig_frozen_range(unsigned long pfn, unsigned long nr_pages);
void free_contig_range(unsigned long pfn, unsigned long nr_pages);
#endif
+void __free_contig_range(unsigned long pfn, unsigned long nr_pages);
+
DEFINE_FREE(free_page, void *, free_page((unsigned long)_T))
#endif /* __LINUX_GFP_H */
diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
index cd4972a7c97c..54ca0c88bab6 100644
--- a/include/linux/gfp_types.h
+++ b/include/linux/gfp_types.h
@@ -281,9 +281,9 @@ enum {
*
* %__GFP_SKIP_KASAN makes KASAN skip unpoisoning on page allocation.
* Used for userspace and vmalloc pages; the latter are unpoisoned by
- * kasan_unpoison_vmalloc instead. For userspace pages, results in
- * poisoning being skipped as well, see should_skip_kasan_poison for
- * details. Only effective in HW_TAGS mode.
+ * kasan_unpoison_vmalloc instead. If passed to vmalloc, kasan_unpoison_vmalloc
+ * is skipped too. For userspace pages, results in poisoning being skipped as
+ * well, see should_skip_kasan_poison for details. Only effective in HW_TAGS mode.
*/
#define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN)
#define __GFP_COMP ((__force gfp_t)___GFP_COMP)
diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h
index 0574c21ca45d..bb71e7dba4f7 100644
--- a/include/linux/highmem-internal.h
+++ b/include/linux/highmem-internal.h
@@ -262,7 +262,7 @@ static inline bool is_kmap_addr(const void *x)
* @__addr: Virtual address to be unmapped
*
* Unmaps an address previously mapped by kmap_atomic() and re-enables
- * pagefaults. Depending on PREEMP_RT configuration, re-enables also
+ * pagefaults. Depending on PREEMPT_RT configuration, re-enables also
* migration and preemption. Users should not count on these side effects.
*
* Mappings should be unmapped in the reverse order that they were mapped.
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 2949e5acff35..c0d223d0c556 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -238,6 +238,31 @@ static inline bool thp_vma_suitable_order(struct vm_area_struct *vma,
}
/*
+ * Make sure huge_gfp is always more limited than limit_gfp.
+ * Some shmem users want THP allocation to be done less aggressively
+ * and only in certain zones.
+ */
+static inline gfp_t thp_shmem_limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
+{
+ gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
+ gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
+ gfp_t zoneflags = limit_gfp & GFP_ZONEMASK;
+ gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK);
+
+ /* Allow allocations only from the originally specified zones. */
+ result |= zoneflags;
+
+ /*
+ * Minimize the result gfp by taking the union with the deny flags,
+ * and the intersection of the allow flags.
+ */
+ result |= (limit_gfp & denyflags);
+ result |= (huge_gfp & limit_gfp) & allowflags;
+
+ return result;
+}
+
+/*
* Filter the bitfield of input orders to the ones suitable for use in the vma.
* See thp_vma_suitable_order().
* All orders that pass the checks are returned as a bitfield.
@@ -414,10 +439,10 @@ static inline int split_huge_page(struct page *page)
{
return split_huge_page_to_list_to_order(page, NULL, 0);
}
+
+int folio_memcg_alloc_deferred(struct folio *folio);
+
void deferred_split_folio(struct folio *folio, bool partially_mapped);
-#ifdef CONFIG_MEMCG
-void reparent_deferred_split_queue(struct mem_cgroup *memcg);
-#endif
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long address, bool freeze);
@@ -581,6 +606,11 @@ static inline bool thp_vma_suitable_order(struct vm_area_struct *vma,
return false;
}
+static inline gfp_t thp_shmem_limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
+{
+ return huge_gfp;
+}
+
static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
unsigned long addr, unsigned long orders)
{
@@ -649,8 +679,15 @@ static inline int try_folio_split_to_order(struct folio *folio,
return -EINVAL;
}
-static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
-static inline void reparent_deferred_split_queue(struct mem_cgroup *memcg) {}
+static inline int folio_memcg_alloc_deferred(struct folio *folio)
+{
+ return 0;
+}
+
+static inline void deferred_split_folio(struct folio *folio, bool partially_mapped)
+{
+}
+
#define split_huge_pmd(__vma, __pmd, __address) \
do { } while (0)
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index fe739d35a864..a450fffe1550 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -81,9 +81,76 @@ static inline int list_lru_init_memcg_key(struct list_lru *lru, struct shrinker
int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
gfp_t gfp);
+
+#ifdef CONFIG_MEMCG
+/**
+ * folio_memcg_list_lru_alloc - allocate list_lru heads for shrinkable folio
+ * @folio: the newly allocated & charged folio
+ * @lru: the list_lru this might be queued on
+ * @gfp: gfp mask
+ *
+ * Allocate list_lru heads (per-memcg, per-node) needed to queue this
+ * particular folio down the line.
+ *
+ * This does memcg_list_lru_alloc(), but on the memcg that @folio is
+ * associated with. Handles folio_memcg() access rules in the fast
+ * path (list_lru heads allocated) and the allocation slowpath.
+ *
+ * Returns 0 on success, a negative error value otherwise.
+ */
+int folio_memcg_list_lru_alloc(struct folio *folio, struct list_lru *lru,
+ gfp_t gfp);
+#else
+static inline int folio_memcg_list_lru_alloc(struct folio *folio,
+ struct list_lru *lru, gfp_t gfp)
+{
+ return 0;
+}
+#endif
+
void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent);
/**
+ * list_lru_lock: lock the sublist for the given node and memcg
+ * @lru: the lru pointer
+ * @nid: the node id of the sublist to lock.
+ * @memcg: pointer to the cgroup of the sublist to lock. On return,
+ * updated to the cgroup whose sublist was actually locked,
+ * which may be an ancestor if the original memcg was dying.
+ *
+ * Returns the locked list_lru_one sublist. The caller must call
+ * list_lru_unlock() when done.
+ *
+ * You must ensure that the memcg is not freed during this call (e.g., with
+ * rcu or by taking a css refcnt).
+ *
+ * Return: the locked list_lru_one, or NULL on failure
+ */
+struct list_lru_one *list_lru_lock(struct list_lru *lru, int nid,
+ struct mem_cgroup **memcg);
+
+/**
+ * list_lru_unlock: unlock a sublist locked by list_lru_lock()
+ * @l: the list_lru_one to unlock
+ */
+void list_lru_unlock(struct list_lru_one *l);
+
+struct list_lru_one *list_lru_lock_irq(struct list_lru *lru, int nid,
+ struct mem_cgroup **memcg);
+void list_lru_unlock_irq(struct list_lru_one *l);
+
+struct list_lru_one *list_lru_lock_irqsave(struct list_lru *lru, int nid,
+ struct mem_cgroup **memcg, unsigned long *irq_flags);
+void list_lru_unlock_irqrestore(struct list_lru_one *l,
+ unsigned long *irq_flags);
+
+/* Caller-locked variants, see list_lru_add() etc for documentation */
+bool __list_lru_add(struct list_lru *lru, struct list_lru_one *l,
+ struct list_head *item, int nid, struct mem_cgroup *memcg);
+bool __list_lru_del(struct list_lru *lru, struct list_lru_one *l,
+ struct list_head *item, int nid);
+
+/**
* list_lru_add: add an element to the lru list's tail
* @lru: the lru pointer
* @item: the item to be added.
@@ -115,6 +182,9 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren
bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid,
struct mem_cgroup *memcg);
+bool list_lru_add_irq(struct list_lru *lru, struct list_head *item, int nid,
+ struct mem_cgroup *memcg);
+
/**
* list_lru_add_obj: add an element to the lru list's tail
* @lru: the lru pointer
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index dc3fa687759b..e1f46a0016fc 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -29,6 +29,7 @@ struct obj_cgroup;
struct page;
struct mm_struct;
struct kmem_cache;
+struct swap_cluster_info;
/* Cgroup-specific page state, on top of universal node page state */
enum memcg_stat_item {
@@ -277,10 +278,6 @@ struct mem_cgroup {
struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- struct deferred_split deferred_split_queue;
-#endif
-
#ifdef CONFIG_LRU_GEN_WALKS_MMU
/* per-memcg mm_struct list */
struct lru_gen_mm_list mm_list;
@@ -646,8 +643,8 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp);
-int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
- gfp_t gfp, swp_entry_t entry);
+int mem_cgroup_swapin_charge_folio(struct folio *folio, unsigned short id,
+ struct mm_struct *mm, gfp_t gfp);
void __mem_cgroup_uncharge(struct folio *folio);
@@ -1137,7 +1134,7 @@ static inline int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp)
}
static inline int mem_cgroup_swapin_charge_folio(struct folio *folio,
- struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
+ unsigned short id, struct mm_struct *mm, gfp_t gfp)
{
return 0;
}
@@ -1899,9 +1896,6 @@ static inline void mem_cgroup_exit_user_fault(void)
current->in_user_fault = 0;
}
-void memcg1_swapout(struct folio *folio, swp_entry_t entry);
-void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages);
-
#else /* CONFIG_MEMCG_V1 */
static inline
unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
@@ -1929,14 +1923,23 @@ static inline void mem_cgroup_exit_user_fault(void)
{
}
-static inline void memcg1_swapout(struct folio *folio, swp_entry_t entry)
+#endif /* CONFIG_MEMCG_V1 */
+
+#if defined(CONFIG_MEMCG_V1) && defined(CONFIG_SWAP)
+
+void __memcg1_swapout(struct folio *folio, struct swap_cluster_info *ci);
+void memcg1_swapin(struct folio *folio);
+
+#else
+
+static inline void __memcg1_swapout(struct folio *folio,
+ struct swap_cluster_info *ci)
{
}
-static inline void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages)
+static inline void memcg1_swapin(struct folio *folio)
{
}
-
-#endif /* CONFIG_MEMCG_V1 */
+#endif
#endif /* _LINUX_MEMCONTROL_H */
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 5bb5599c6b2b..463dc02f6cff 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -158,7 +158,11 @@ int create_memory_block_devices(unsigned long start, unsigned long size,
void remove_memory_block_devices(unsigned long start, unsigned long size);
extern void memory_dev_init(void);
extern int memory_notify(enum memory_block_state state, void *v);
-extern struct memory_block *find_memory_block(unsigned long section_nr);
+struct memory_block *memory_block_get(unsigned long block_id);
+static inline void memory_block_put(struct memory_block *mem)
+{
+ put_device(&mem->dev);
+}
typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *);
extern int walk_memory_blocks(unsigned long start, unsigned long size,
void *arg, walk_memory_blocks_func_t func);
@@ -171,7 +175,6 @@ struct memory_group *memory_group_find_by_id(int mgid);
typedef int (*walk_memory_groups_func_t)(struct memory_group *, void *);
int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
struct memory_group *excluded, void *arg);
-struct memory_block *find_memory_block_by_id(unsigned long block_id);
#define hotplug_memory_notifier(fn, pri) ({ \
static __meminitdata struct notifier_block fn##_mem_nb =\
{ .notifier_call = fn, .priority = pri };\
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 815e908c4135..7c9d66729c60 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -135,9 +135,10 @@ static inline bool movable_node_is_enabled(void)
return movable_node_enabled;
}
-extern void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap);
+extern void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap);
extern void __remove_pages(unsigned long start_pfn, unsigned long nr_pages,
- struct vmem_altmap *altmap);
+ struct vmem_altmap *altmap, struct dev_pagemap *pgmap);
/* reasonably generic interface to expand the physical pages */
extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
@@ -307,7 +308,8 @@ extern int sparse_add_section(int nid, unsigned long pfn,
unsigned long nr_pages, struct vmem_altmap *altmap,
struct dev_pagemap *pgmap);
extern void sparse_remove_section(unsigned long pfn, unsigned long nr_pages,
- struct vmem_altmap *altmap);
+ struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap);
extern struct zone *zone_for_pfn_range(enum mmop online_type,
int nid, struct memory_group *group, unsigned long start_pfn,
unsigned long nr_pages);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index fc2acedf0b76..485df9c2dbdd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -496,6 +496,21 @@ enum {
#else
#define VM_UFFD_MINOR VM_NONE
#endif
+
+/*
+ * vma_flags_t masks for the userfaultfd VMA flags. VMA_UFFD_MINOR is gated on
+ * the same config as VM_UFFD_MINOR -- which implies 64BIT, where the bit fits
+ * -- so an out-of-range bit is never fed to mk_vma_flags() on a build whose
+ * bitmap cannot hold it.
+ */
+#define VMA_UFFD_MISSING mk_vma_flags(VMA_UFFD_MISSING_BIT)
+#define VMA_UFFD_WP mk_vma_flags(VMA_UFFD_WP_BIT)
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+#define VMA_UFFD_MINOR mk_vma_flags(VMA_UFFD_MINOR_BIT)
+#else
+#define VMA_UFFD_MINOR EMPTY_VMA_FLAGS
+#endif
+
#ifdef CONFIG_64BIT
#define VM_ALLOW_ANY_UNCACHED INIT_VM_FLAG(ALLOW_ANY_UNCACHED)
#define VM_SEALED INIT_VM_FLAG(SEALED)
@@ -1238,6 +1253,30 @@ static __always_inline void vma_flags_set_mask(vma_flags_t *flags,
#define vma_flags_set(flags, ...) \
vma_flags_set_mask(flags, mk_vma_flags(__VA_ARGS__))
+static __always_inline vma_flags_t __mk_vma_flags_from_masks(size_t count,
+ const vma_flags_t *masks)
+{
+ vma_flags_t flags = EMPTY_VMA_FLAGS;
+ size_t i;
+
+ for (i = 0; i < count; i++)
+ vma_flags_set_mask(&flags, masks[i]);
+ return flags;
+}
+
+/*
+ * Combine pre-computed vma_flags_t masks into one value, e.g.:
+ *
+ * vma_flags_t flags = mk_vma_flags_from_masks(VMA_UFFD_WP, VMA_UFFD_MINOR);
+ *
+ * Unlike mk_vma_flags(), which takes bit numbers, this takes whole masks --
+ * each of which may be EMPTY_VMA_FLAGS when its feature is unavailable -- so a
+ * bit that does not exist on the current build is never materialised.
+ */
+#define mk_vma_flags_from_masks(...) \
+ __mk_vma_flags_from_masks(COUNT_ARGS(__VA_ARGS__), \
+ (const vma_flags_t []){__VA_ARGS__})
+
/* Clear all of the to-clear flags in flags, non-atomically. */
static __always_inline void vma_flags_clear_mask(vma_flags_t *flags,
vma_flags_t to_clear)
@@ -1489,6 +1528,11 @@ static inline void vma_set_anonymous(struct vm_area_struct *vma)
vma->vm_ops = NULL;
}
+static inline void vma_desc_set_anonymous(struct vm_area_desc *desc)
+{
+ desc->vm_ops = NULL;
+}
+
static inline bool vma_is_anonymous(struct vm_area_struct *vma)
{
return !vma->vm_ops;
@@ -1888,16 +1932,6 @@ static inline bool folio_mapped(const struct folio *folio)
return folio_mapcount(folio) >= 1;
}
-/*
- * Return true if this page is mapped into pagetables.
- * For compound page it returns true if any sub-page of compound page is mapped,
- * even if this particular sub-page is not itself mapped by any PTE or PMD.
- */
-static inline bool page_mapped(const struct page *page)
-{
- return folio_mapped(page_folio(page));
-}
-
static inline struct page *virt_to_head_page(const void *x)
{
struct page *page = virt_to_page(x);
@@ -4855,18 +4889,10 @@ static inline void print_vma_addr(char *prefix, unsigned long rip)
}
#endif
-void *sparse_buffer_alloc(unsigned long size);
unsigned long section_map_size(void);
struct page * __populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
struct dev_pagemap *pgmap);
-pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
-p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
-pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
-pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
-pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
- struct vmem_altmap *altmap, unsigned long ptpfn,
- unsigned long flags);
void *vmemmap_alloc_block(unsigned long size, int node);
struct vmem_altmap;
void *vmemmap_alloc_block_buf(unsigned long size, int node,
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index a171070e15f0..a8430a7ae054 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -247,7 +247,7 @@ static inline unsigned long lru_gen_folio_seq(const struct lruvec *lruvec,
(folio_test_dirty(folio) || folio_test_writeback(folio))))
gen = MIN_NR_GENS;
else
- gen = MAX_NR_GENS - folio_test_workingset(folio);
+ gen = MAX_NR_GENS - (folio_test_workingset(folio) || folio_test_referenced(folio));
return max(READ_ONCE(lrugen->max_seq) - gen + 1, READ_ONCE(lrugen->min_seq[type]));
}
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5cadb00d9352..b18c2b2e7d2c 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -845,23 +845,10 @@ struct mmap_action {
enum mmap_action_type type;
/*
- * If specified, this hook is invoked after the selected action has been
- * successfully completed. Note that the VMA write lock still held.
- *
- * The absolute minimum ought to be done here.
- *
- * Returns 0 on success, or an error code.
- */
- int (*success_hook)(const struct vm_area_struct *vma);
-
- /*
- * If specified, this hook is invoked when an error occurred when
- * attempting the selected action.
- *
- * The hook can return an error code in order to filter the error, but
- * it is not valid to clear the error here.
+ * If non-zero, replace errors that arise from mmap actions with this
+ * value instead. Only valid error codes may be specified.
*/
- int (*error_hook)(int err);
+ int error_override;
/*
* This should be set in rare instances where the operation required
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 69c304b467df..a11a44eef521 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -134,8 +134,8 @@ struct mmu_notifier_ops {
* Invalidation of multiple concurrent ranges may be
* optionally permitted by the driver. Either way the
* establishment of sptes is forbidden in the range passed to
- * invalidate_range_begin/end for the whole duration of the
- * invalidate_range_begin/end critical section.
+ * invalidate_range_start/end for the whole duration of the
+ * invalidate_range_start/end critical section.
*
* invalidate_range_start() is called when all pages in the
* range are still mapped and have at least a refcount of one.
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9adb2ad21da5..ca2712187147 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -177,9 +177,12 @@ static inline bool migratetype_is_mergeable(int mt)
return mt < MIGRATE_PCPTYPES;
}
-#define for_each_migratetype_order(order, type) \
- for (order = 0; order < NR_PAGE_ORDERS; order++) \
- for (type = 0; type < MIGRATE_TYPES; type++)
+#define for_each_free_list(list, zone, order) \
+ for (order = 0; order < NR_PAGE_ORDERS; order++) \
+ for (unsigned int __type = 0; \
+ __type < MIGRATE_TYPES && \
+ (list = &(zone)->free_area[order].free_list[__type], 1); \
+ __type++)
extern int page_group_by_mobility_disabled;
@@ -211,7 +214,6 @@ enum numa_stat_item {
#endif
enum zone_stat_item {
- /* First 128 byte cacheline (assuming 64 bit words) */
NR_FREE_PAGES,
NR_FREE_PAGES_BLOCKS,
NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */
@@ -222,7 +224,6 @@ enum zone_stat_item {
NR_ZONE_UNEVICTABLE,
NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */
NR_MLOCK, /* mlock()ed pages found and moved off LRU */
- /* Second 128 byte cacheline */
#if IS_ENABLED(CONFIG_ZSMALLOC)
NR_ZSPAGES, /* allocated in zsmalloc */
#endif
@@ -1428,14 +1429,6 @@ struct zonelist {
*/
extern struct page *mem_map;
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-struct deferred_split {
- spinlock_t split_queue_lock;
- struct list_head split_queue;
- unsigned long split_queue_len;
-};
-#endif
-
#ifdef CONFIG_MEMORY_FAILURE
/*
* Per NUMA node memory failure handling statistics.
@@ -1561,10 +1554,6 @@ typedef struct pglist_data {
unsigned long first_deferred_pfn;
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- struct deferred_split deferred_split_queue;
-#endif
-
#ifdef CONFIG_NUMA_BALANCING
/* start time in ms of current promote rate limit period */
unsigned int nbp_rl_start;
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 204c92462f3c..b842aa525546 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -24,23 +24,23 @@
* void nodes_setall(mask) set all bits
* void nodes_clear(mask) clear all bits
* int node_isset(node, mask) true iff bit 'node' set in mask
- * int node_test_and_set(node, mask) test and set bit 'node' in mask
+ * bool node_test_and_set(node, mask) test and set bit 'node' in mask
*
- * void nodes_and(dst, src1, src2) dst = src1 & src2 [intersection]
+ * bool nodes_and(dst, src1, src2) dst = src1 & src2 [intersection]
* void nodes_or(dst, src1, src2) dst = src1 | src2 [union]
* void nodes_xor(dst, src1, src2) dst = src1 ^ src2
- * void nodes_andnot(dst, src1, src2) dst = src1 & ~src2
+ * bool nodes_andnot(dst, src1, src2) dst = src1 & ~src2
* void nodes_complement(dst, src) dst = ~src
*
- * int nodes_equal(mask1, mask2) Does mask1 == mask2?
- * int nodes_intersects(mask1, mask2) Do mask1 and mask2 intersect?
- * int nodes_subset(mask1, mask2) Is mask1 a subset of mask2?
- * int nodes_empty(mask) Is mask empty (no bits sets)?
- * int nodes_full(mask) Is mask full (all bits sets)?
+ * bool nodes_equal(mask1, mask2) Does mask1 == mask2?
+ * bool nodes_intersects(mask1, mask2) Do mask1 and mask2 intersect?
+ * bool nodes_subset(mask1, mask2) Is mask1 a subset of mask2?
+ * bool nodes_empty(mask) Is mask empty (no bits set)?
+ * bool nodes_full(mask) Is mask full (all bits set)?
* int nodes_weight(mask) Hamming weight - number of set bits
*
* unsigned int first_node(mask) Number lowest set bit, or MAX_NUMNODES
- * unsigend int next_node(node, mask) Next node past 'node', or MAX_NUMNODES
+ * unsigned int next_node(node, mask) Next node past 'node', or MAX_NUMNODES
* unsigned int next_node_in(node, mask) Next node past 'node', or wrap to first,
* or MAX_NUMNODES
* unsigned int first_unset_node(mask) First node not set in mask, or
diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h
index 94d3f0e71c06..9f5c75d06f76 100644
--- a/include/linux/page_ref.h
+++ b/include/linux/page_ref.h
@@ -71,6 +71,12 @@ static inline int page_ref_count(const struct page *page)
* folio_ref_count - The reference count on this folio.
* @folio: The folio.
*
+ * Folios contain a reference count. When that reference count reaches
+ * zero, the folio is referred to as frozen. At this point, it will
+ * usually be returned to the memory allocator, but some parts of the
+ * kernel freeze folios in order to perform unusual operations on them
+ * such as splitting or migration.
+ *
* The refcount is usually incremented by calls to folio_get() and
* decremented by calls to folio_put(). Some typical users of the
* folio refcount:
@@ -82,6 +88,18 @@ static inline int page_ref_count(const struct page *page)
* - Pipes
* - Direct IO which references this page in the process address space
*
+ * The reference count has three components: expected, temporary and
+ * spurious. The expected reference count of a folio is that which
+ * we would logically expect it to be from just reading the code.
+ * Temporary refcounts are gained by threads which need a temporary
+ * reference to make sure the folio isn't reallocated while they use it.
+ * Spurious refcounts are gained by threads which, thanks to RCU walks
+ * of the page tables or file cache, find a stale pointer to a folio.
+ * These threads will drop the refcount after discovering the pointer
+ * is stale, but it can surprise other users to see the spurious refcount
+ * on a freshly allocated folio (eg they may see a refcount of 2 instead
+ * of 1).
+ *
* Return: The number of references to this folio.
*/
static inline int folio_ref_count(const struct folio *folio)
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index e046278a01fa..9a6c3ea17684 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -36,12 +36,12 @@ enum pageblock_bits {
#define NR_PAGEBLOCK_BITS (roundup_pow_of_two(__NR_PAGEBLOCK_BITS))
-#define MIGRATETYPE_MASK (BIT(PB_migrate_0)|BIT(PB_migrate_1)|BIT(PB_migrate_2))
+#define PAGEBLOCK_MIGRATETYPE_MASK (BIT(PB_migrate_0)|BIT(PB_migrate_1)|BIT(PB_migrate_2))
#ifdef CONFIG_MEMORY_ISOLATION
-#define MIGRATETYPE_AND_ISO_MASK (MIGRATETYPE_MASK | BIT(PB_migrate_isolate))
+#define PAGEBLOCK_ISO_MASK BIT(PB_migrate_isolate)
#else
-#define MIGRATETYPE_AND_ISO_MASK MIGRATETYPE_MASK
+#define PAGEBLOCK_ISO_MASK 0
#endif
#if defined(CONFIG_HUGETLB_PAGE)
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 31a848485ad9..1f50991b43e3 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1350,6 +1350,7 @@ struct readahead_control {
struct file_ra_state *ra;
/* private: use the readahead_* accessors instead */
pgoff_t _index;
+ pgoff_t _max_index; /* limit readahead to _max_index, inclusive */
unsigned int _nr_pages;
unsigned int _batch_count;
bool dropbehind;
@@ -1363,6 +1364,7 @@ struct readahead_control {
.mapping = m, \
.ra = r, \
._index = i, \
+ ._max_index = ULONG_MAX, \
}
#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7a09df6977a5..8f0f68e245ba 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -213,6 +213,7 @@ enum {
SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */
SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */
SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */
+ SWP_HIBERNATION = (1 << 13), /* pinned for hibernation */
/* add others here before... */
};
@@ -252,7 +253,6 @@ struct swap_info_struct {
struct plist_node list; /* entry in swap_active_head */
signed char type; /* strange name for an index */
unsigned int max; /* size of this swap device */
- unsigned long *zeromap; /* kvmalloc'ed bitmap to track zero pages */
struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
struct list_head free_clusters; /* free clusters list */
struct list_head full_clusters; /* full clusters list */
@@ -433,7 +433,9 @@ static inline long get_nr_swap_pages(void)
}
extern void si_swapinfo(struct sysinfo *);
-int swap_type_of(dev_t device, sector_t offset);
+extern int pin_hibernation_swap_type(dev_t device, sector_t offset);
+extern void unpin_hibernation_swap_type(int type);
+extern int find_hibernation_swap_type(dev_t device, sector_t offset);
int find_first_swap(dev_t *device);
extern unsigned int count_swap_pages(int, int);
extern sector_t swapdev_block(int, pgoff_t);
@@ -571,33 +573,31 @@ static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
#endif
#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP)
-int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry);
-static inline int mem_cgroup_try_charge_swap(struct folio *folio,
- swp_entry_t entry)
+int __mem_cgroup_try_charge_swap(struct folio *folio);
+static inline int mem_cgroup_try_charge_swap(struct folio *folio)
{
if (mem_cgroup_disabled())
return 0;
- return __mem_cgroup_try_charge_swap(folio, entry);
+ return __mem_cgroup_try_charge_swap(folio);
}
-extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages);
-static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
+extern void __mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages);
+static inline void mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages)
{
if (mem_cgroup_disabled())
return;
- __mem_cgroup_uncharge_swap(entry, nr_pages);
+ __mem_cgroup_uncharge_swap(id, nr_pages);
}
extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg);
extern bool mem_cgroup_swap_full(struct folio *folio);
#else
-static inline int mem_cgroup_try_charge_swap(struct folio *folio,
- swp_entry_t entry)
+static inline int mem_cgroup_try_charge_swap(struct folio *folio)
{
return 0;
}
-static inline void mem_cgroup_uncharge_swap(swp_entry_t entry,
+static inline void mem_cgroup_uncharge_swap(unsigned short id,
unsigned int nr_pages)
{
}
diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h
deleted file mode 100644
index 91cdf12190a0..000000000000
--- a/include/linux/swap_cgroup.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __LINUX_SWAP_CGROUP_H
-#define __LINUX_SWAP_CGROUP_H
-
-#include <linux/swap.h>
-
-#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP)
-
-extern void swap_cgroup_record(struct folio *folio, unsigned short id, swp_entry_t ent);
-extern unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents);
-extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent);
-extern int swap_cgroup_swapon(int type, unsigned long max_pages);
-extern void swap_cgroup_swapoff(int type);
-
-#else
-
-static inline
-void swap_cgroup_record(struct folio *folio, unsigned short id, swp_entry_t ent)
-{
-}
-
-static inline
-unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents)
-{
- return 0;
-}
-
-static inline
-unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
-{
- return 0;
-}
-
-static inline int
-swap_cgroup_swapon(int type, unsigned long max_pages)
-{
- return 0;
-}
-
-static inline void swap_cgroup_swapoff(int type)
-{
- return;
-}
-
-#endif
-
-#endif /* __LINUX_SWAP_CGROUP_H */
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 051e42902690..307b8390fc67 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -92,7 +92,7 @@ static inline long set_restart_fn(struct restart_block *restart,
#define THREAD_ALIGN THREAD_SIZE
#endif
-#define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO)
+#define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_SKIP_KASAN)
/*
* flag set/clear/test wrappers
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index d2920f98ab86..68edac4dcd78 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -23,8 +23,8 @@
/* The set of all possible UFFD-related VM flags. */
#define __VM_UFFD_FLAGS (VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR)
-#define __VMA_UFFD_FLAGS mk_vma_flags(VMA_UFFD_MISSING_BIT, VMA_UFFD_WP_BIT, \
- VMA_UFFD_MINOR_BIT)
+#define __VMA_UFFD_FLAGS mk_vma_flags_from_masks(VMA_UFFD_MISSING, VMA_UFFD_WP, \
+ VMA_UFFD_MINOR)
/*
* CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
@@ -147,26 +147,12 @@ static inline uffd_flags_t uffd_flags_set_mode(uffd_flags_t flags, enum mfill_at
/* Flags controlling behavior. These behavior changes are mode-independent. */
#define MFILL_ATOMIC_WP MFILL_ATOMIC_FLAG(0)
-extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
- unsigned long src_start, unsigned long len,
- uffd_flags_t flags);
-extern ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
- unsigned long dst_start,
- unsigned long len);
-extern ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long dst_start,
- unsigned long len, uffd_flags_t flags);
-extern ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
- unsigned long len, uffd_flags_t flags);
-extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
- unsigned long len, bool enable_wp);
extern long uffd_wp_range(struct vm_area_struct *vma,
unsigned long start, unsigned long len, bool enable_wp);
/* move_pages */
void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2);
-ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
- unsigned long src_start, unsigned long len, __u64 flags);
int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
struct vm_area_struct *dst_vma,
struct vm_area_struct *src_vma,
@@ -239,9 +225,6 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
return vma->vm_flags & __VM_UFFD_FLAGS;
}
-bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags,
- bool wp_async);
-
static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma)
{
struct userfaultfd_ctx *uffd_ctx = vma->vm_userfaultfd_ctx.ctx;
@@ -271,25 +254,6 @@ extern void userfaultfd_unmap_complete(struct mm_struct *mm,
extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma);
extern bool userfaultfd_wp_async(struct vm_area_struct *vma);
-void userfaultfd_reset_ctx(struct vm_area_struct *vma);
-
-struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
- struct vm_area_struct *prev,
- struct vm_area_struct *vma,
- unsigned long start,
- unsigned long end);
-
-int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
- struct vm_area_struct *vma,
- vm_flags_t vm_flags,
- unsigned long start, unsigned long end,
- bool wp_async);
-
-void userfaultfd_release_new(struct userfaultfd_ctx *ctx);
-
-void userfaultfd_release_all(struct mm_struct *mm,
- struct userfaultfd_ctx *ctx);
-
static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma)
{
/* Only wr-protect mode uses pte markers */
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 3b02c0c6b371..d87dc7f77f4e 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -265,7 +265,9 @@ static inline bool is_vm_area_hugepages(const void *addr)
* allocated in the vmalloc layer.
*/
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
- return find_vm_area(addr)->page_order > 0;
+ struct vm_struct *area = find_vm_area(addr);
+
+ return area && area->page_order > 0;
#else
return false;
#endif
diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
index 6a2f51ebbfd3..faecd5522401 100644
--- a/include/linux/vmpressure.h
+++ b/include/linux/vmpressure.h
@@ -30,8 +30,8 @@ struct vmpressure {
struct mem_cgroup;
#ifdef CONFIG_MEMCG
-extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
- unsigned long scanned, unsigned long reclaimed);
+void vmpressure(gfp_t gfp, int order, struct mem_cgroup *memcg, bool tree,
+ unsigned long scanned, unsigned long reclaimed);
extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
extern void vmpressure_init(struct vmpressure *vmpr);
@@ -44,8 +44,9 @@ extern int vmpressure_register_event(struct mem_cgroup *memcg,
extern void vmpressure_unregister_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd);
#else
-static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
- unsigned long scanned, unsigned long reclaimed) {}
+static inline void vmpressure(gfp_t gfp, int order, struct mem_cgroup *memcg,
+ bool tree, unsigned long scanned,
+ unsigned long reclaimed) {}
static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg,
int prio) {}
#endif /* CONFIG_MEMCG */
diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h
index 7e25f4469b81..78388538acf4 100644
--- a/include/trace/events/damon.h
+++ b/include/trace/events/damon.h
@@ -130,6 +130,44 @@ TRACE_EVENT(damon_monitor_intervals_tune,
TP_printk("sample_us=%lu", __entry->sample_us)
);
+TRACE_EVENT_CONDITION(damon_region_aggregated,
+
+ TP_PROTO(unsigned int target_id, struct damon_region *r,
+ unsigned int nr_regions, unsigned int nr_probes),
+
+ TP_ARGS(target_id, r, nr_regions, nr_probes),
+
+ TP_CONDITION(nr_probes > 0),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, target_id)
+ __field(unsigned long, start)
+ __field(unsigned long, end)
+ __field(unsigned int, nr_regions)
+ __field(unsigned int, nr_accesses)
+ __field(unsigned int, age)
+ __dynamic_array(unsigned char, probe_hits, nr_probes)
+ ),
+
+ TP_fast_assign(
+ __entry->target_id = target_id;
+ __entry->start = r->ar.start;
+ __entry->end = r->ar.end;
+ __entry->nr_regions = nr_regions;
+ __entry->nr_accesses = r->nr_accesses;
+ __entry->age = r->age;
+ memcpy(__get_dynamic_array(probe_hits), r->probe_hits,
+ sizeof(*r->probe_hits) * nr_probes);
+ ),
+
+ TP_printk("target_id=%lu nr_regions=%u %lu-%lu: %u %u probe_hits=%s",
+ __entry->target_id, __entry->nr_regions,
+ __entry->start, __entry->end,
+ __entry->nr_accesses, __entry->age,
+ __print_hex(__get_dynamic_array(probe_hits),
+ __get_dynamic_array_len(probe_hits)))
+);
+
TRACE_EVENT(damon_aggregated,
TP_PROTO(unsigned int target_id, struct damon_region *r,
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 4445a8d9218d..b4bf7b8def1f 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -96,6 +96,58 @@ TRACE_EVENT(mm_vmscan_kswapd_wake,
__entry->order)
);
+TRACE_EVENT(mm_vmscan_balance_pgdat_begin,
+
+ TP_PROTO(int nid, int order, int highest_zoneidx),
+
+ TP_ARGS(nid, order, highest_zoneidx),
+
+ TP_STRUCT__entry(
+ __field(int, nid)
+ __field(int, order)
+ __field(int, highest_zoneidx)
+ ),
+
+ TP_fast_assign(
+ __entry->nid = nid;
+ __entry->order = order;
+ __entry->highest_zoneidx = highest_zoneidx;
+ ),
+
+ TP_printk("nid=%d order=%d highest_zoneidx=%-8s",
+ __entry->nid,
+ __entry->order,
+ __print_symbolic(__entry->highest_zoneidx, ZONE_TYPE))
+);
+
+TRACE_EVENT(mm_vmscan_balance_pgdat_end,
+
+ TP_PROTO(int nid, int order, int highest_zoneidx,
+ unsigned long nr_reclaimed),
+
+ TP_ARGS(nid, order, highest_zoneidx, nr_reclaimed),
+
+ TP_STRUCT__entry(
+ __field(int, nid)
+ __field(int, order)
+ __field(int, highest_zoneidx)
+ __field(unsigned long, nr_reclaimed)
+ ),
+
+ TP_fast_assign(
+ __entry->nid = nid;
+ __entry->order = order;
+ __entry->highest_zoneidx = highest_zoneidx;
+ __entry->nr_reclaimed = nr_reclaimed;
+ ),
+
+ TP_printk("nid=%d order=%d highest_zoneidx=%-8s nr_reclaimed=%lu",
+ __entry->nid,
+ __entry->order,
+ __print_symbolic(__entry->highest_zoneidx, ZONE_TYPE),
+ __entry->nr_reclaimed)
+);
+
TRACE_EVENT(mm_vmscan_wakeup_kswapd,
TP_PROTO(int nid, int zid, int order, gfp_t gfp_flags),