| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-01-15 10:47:14 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-01-15 10:47:14 -0800 |
| commit | 13b2d15d991b3f0f4ebfffbed081dbff27ac1c9d (patch) | |
| tree | a6168d85390cd48e7081891821327e88e3d435fc | |
| parent | 9e995c573b63453a904f3157813dc8cde4a6aba4 (diff) | |
| parent | 3e8e590fd65d0572584ab7bba89a35e6d19931f1 (diff) | |
Merge tag 'mm-hotfixes-stable-2026-01-15-08-03' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull misc fixes from Andrew Morton:
- kerneldoc fixes from Bagas Sanjaya
- DAMON fixes from SeongJae
- mremap VMA-related fixes from Lorenzo
- various singletons - please see the changelogs for details
* tag 'mm-hotfixes-stable-2026-01-15-08-03' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (30 commits)
drivers/dax: add some missing kerneldoc comment fields for struct dev_dax
mm: numa,memblock: include <asm/numa.h> for 'numa_nodes_parsed'
mailmap: add entry for Daniel Thompson
tools/testing/selftests: fix gup_longterm for unknown fs
mm/page_alloc: prevent pcp corruption with SMP=n
iommu/sva: include mmu_notifier.h header
mm: kmsan: fix poisoning of high-order non-compound pages
tools/testing/selftests: add forked (un)/faulted VMA merge tests
mm/vma: enforce VMA fork limit on unfaulted,faulted mremap merge too
tools/testing/selftests: add tests for !tgt, src mremap() merges
mm/vma: fix anon_vma UAF on mremap() faulted, unfaulted merge
mm/zswap: fix error pointer free in zswap_cpu_comp_prepare()
mm/damon/sysfs-scheme: cleanup access_pattern subdirs on scheme dir setup failure
mm/damon/sysfs-scheme: cleanup quotas subdirs on scheme dir setup failure
mm/damon/sysfs: cleanup attrs subdirs on context dir setup failure
mm/damon/sysfs: cleanup intervals subdirs on attrs dir setup failure
mm/damon/core: remove call_control in inactive contexts
powerpc/watchdog: add support for hardlockup_sys_info sysctl
mips: fix HIGHMEM initialization
mm/hugetlb: ignore hugepage kernel args if hugepages are unsupported
...
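(Usage note on the kernel-parameters.txt additions in this series: the kfence.* options are boot-time parameters, so enabling KFENCE with a non-default sample interval and a couple of burst allocations could look like the line below. The values are purely illustrative, based on the documented formats rather than any recommendation.)

    kfence.sample_interval=50 kfence.burst=2 kfence.check_on_panic=1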
| -rw-r--r-- | .mailmap | 2 |
| -rw-r--r-- | Documentation/admin-guide/kernel-parameters.txt | 35 |
| -rw-r--r-- | arch/mips/mm/init.c | 23 |
| -rw-r--r-- | arch/powerpc/kernel/watchdog.c | 15 |
| -rw-r--r-- | drivers/dax/dax-private.h | 10 |
| -rw-r--r-- | drivers/iommu/iommu-sva.c | 1 |
| -rw-r--r-- | include/linux/kfence.h | 1 |
| -rw-r--r-- | include/linux/nmi.h | 1 |
| -rw-r--r-- | include/linux/sched/mm.h | 1 |
| -rw-r--r-- | include/linux/textsearch.h | 1 |
| -rw-r--r-- | kernel/liveupdate/kexec_handover.c | 37 |
| -rw-r--r-- | kernel/watchdog.c | 2 |
| -rw-r--r-- | lib/buildid.c | 32 |
| -rw-r--r-- | mm/damon/core.c | 41 |
| -rw-r--r-- | mm/damon/sysfs-schemes.c | 10 |
| -rw-r--r-- | mm/damon/sysfs.c | 9 |
| -rw-r--r-- | mm/hugetlb.c | 16 |
| -rw-r--r-- | mm/kmsan/shadow.c | 2 |
| -rw-r--r-- | mm/numa_memblks.c | 2 |
| -rw-r--r-- | mm/page_alloc.c | 57 |
| -rw-r--r-- | mm/vma.c | 111 |
| -rw-r--r-- | mm/vma.h | 3 |
| -rw-r--r-- | mm/vmalloc.c | 2 |
| -rw-r--r-- | mm/zswap.c | 2 |
| -rw-r--r-- | tools/testing/selftests/mm/gup_longterm.c | 2 |
| -rw-r--r-- | tools/testing/selftests/mm/merge.c | 384 |
26 files changed, 674 insertions, 128 deletions
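(A note on the idiom behind the zswap fix below: crypto_alloc_acomp() reports failure via ERR_PTR() rather than NULL, so a shared cleanup label that only tests "if (acomp)" can pass an error pointer to crypto_free_acomp(). A minimal C sketch of the corrected pattern follows; it is a simplified illustration with a hypothetical helper and an arbitrary "lzo" algorithm name, not the actual zswap_cpu_comp_prepare() code.)

    #include <linux/err.h>
    #include <linux/slab.h>
    #include <crypto/acompress.h>

    static int example_comp_prepare(void)
    {
            struct crypto_acomp *acomp = NULL;
            void *buffer;
            int ret = 0;

            buffer = kmalloc(4096, GFP_KERNEL);
            if (!buffer) {
                    ret = -ENOMEM;
                    goto fail;                      /* acomp is still NULL here */
            }

            acomp = crypto_alloc_acomp("lzo", 0, 0);
            if (IS_ERR(acomp)) {
                    ret = PTR_ERR(acomp);           /* acomp is an ERR_PTR(), not NULL */
                    goto fail;
            }

            /* ... further setup that may also jump to fail ... */
            return 0;

    fail:
            /* A bare "if (acomp)" would hand the ERR_PTR() to crypto_free_acomp(). */
            if (!IS_ERR_OR_NULL(acomp))
                    crypto_free_acomp(acomp);
            kfree(buffer);                          /* kfree(NULL) is a no-op */
            return ret;
    }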
@@ -207,6 +207,7 @@ Daniel Borkmann <daniel@iogearbox.net> <daniel.borkmann@tik.ee.ethz.ch> Daniel Borkmann <daniel@iogearbox.net> <dborkmann@redhat.com> Daniel Borkmann <daniel@iogearbox.net> <dborkman@redhat.com> Daniel Borkmann <daniel@iogearbox.net> <dxchgb@gmail.com> +Daniel Thompson <danielt@kernel.org> <daniel.thompson@linaro.org> Danilo Krummrich <dakr@kernel.org> <dakr@redhat.com> David Brownell <david-b@pacbell.net> David Collins <quic_collinsd@quicinc.com> <collinsd@codeaurora.org> @@ -794,6 +795,7 @@ Sven Eckelmann <sven@narfation.org> <sven.eckelmann@open-mesh.com> Sven Eckelmann <sven@narfation.org> <sven.eckelmann@openmesh.com> Sven Eckelmann <sven@narfation.org> <sven@open-mesh.com> Sven Peter <sven@kernel.org> <sven@svenpeter.dev> +Szymon Wilczek <swilczek.lx@gmail.com> <szymonwilczek@gmx.com> Takashi YOSHII <takashi.yoshii.zj@renesas.com> Tamizh Chelvam Raja <quic_tamizhr@quicinc.com> <tamizhr@codeaurora.org> Taniya Das <quic_tdas@quicinc.com> <tdas@codeaurora.org> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a8d0afde7f85..1058f2a6d6a8 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2917,6 +2917,41 @@ Kernel parameters for Movable pages. "nn[KMGTPE]", "nn%", and "mirror" are exclusive, so you cannot specify multiple forms. + kfence.burst= [MM,KFENCE] The number of additional successive + allocations to be attempted through KFENCE for each + sample interval. + Format: <unsigned integer> + Default: 0 + + kfence.check_on_panic= + [MM,KFENCE] Whether to check all KFENCE-managed objects' + canaries on panic. + Format: <bool> + Default: false + + kfence.deferrable= + [MM,KFENCE] Whether to use a deferrable timer to trigger + allocations. This avoids forcing CPU wake-ups if the + system is idle, at the risk of a less predictable + sample interval. + Format: <bool> + Default: CONFIG_KFENCE_DEFERRABLE + + kfence.sample_interval= + [MM,KFENCE] KFENCE's sample interval in milliseconds. + Format: <unsigned integer> + 0 - Disable KFENCE. + >0 - Enabled KFENCE with given sample interval. + Default: CONFIG_KFENCE_SAMPLE_INTERVAL + + kfence.skip_covered_thresh= + [MM,KFENCE] If pool utilization reaches this threshold + (pool usage%), KFENCE limits currently covered + allocations of the same source from further filling + up the pool. + Format: <unsigned integer> + Default: 75 + kgdbdbgp= [KGDB,HW,EARLY] kgdb over EHCI usb debug port. Format: <Controller#>[,poll interval] The controller # is the number of the ehci usb debug diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index a673d3d68254..8986048f9b11 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -425,6 +425,28 @@ void __init paging_init(void) static struct kcore_list kcore_kseg0; #endif +static inline void __init highmem_init(void) +{ +#ifdef CONFIG_HIGHMEM + unsigned long tmp; + + /* + * If CPU cannot support HIGHMEM discard the memory above highstart_pfn + */ + if (cpu_has_dc_aliases) { + memblock_remove(PFN_PHYS(highstart_pfn), -1); + return; + } + + for (tmp = highstart_pfn; tmp < highend_pfn; tmp++) { + struct page *page = pfn_to_page(tmp); + + if (!memblock_is_memory(PFN_PHYS(tmp))) + SetPageReserved(page); + } +#endif +} + void __init arch_mm_preinit(void) { /* @@ -435,6 +457,7 @@ void __init arch_mm_preinit(void) maar_init(); setup_zero_pages(); /* Setup zeroed pages. 
*/ + highmem_init(); #ifdef CONFIG_64BIT if ((unsigned long) &_text > (unsigned long) CKSEG0) diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 2429cb1c7baa..764001deb060 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -26,6 +26,7 @@ #include <linux/delay.h> #include <linux/processor.h> #include <linux/smp.h> +#include <linux/sys_info.h> #include <asm/interrupt.h> #include <asm/paca.h> @@ -235,7 +236,11 @@ static void watchdog_smp_panic(int cpu) pr_emerg("CPU %d TB:%lld, last SMP heartbeat TB:%lld (%lldms ago)\n", cpu, tb, last_reset, tb_to_ns(tb - last_reset) / 1000000); - if (!sysctl_hardlockup_all_cpu_backtrace) { + if (sysctl_hardlockup_all_cpu_backtrace || + (hardlockup_si_mask & SYS_INFO_ALL_BT)) { + trigger_allbutcpu_cpu_backtrace(cpu); + cpumask_clear(&wd_smp_cpus_ipi); + } else { /* * Try to trigger the stuck CPUs, unless we are going to * get a backtrace on all of them anyway. @@ -244,11 +249,9 @@ static void watchdog_smp_panic(int cpu) smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000); __cpumask_clear_cpu(c, &wd_smp_cpus_ipi); } - } else { - trigger_allbutcpu_cpu_backtrace(cpu); - cpumask_clear(&wd_smp_cpus_ipi); } + sys_info(hardlockup_si_mask & ~SYS_INFO_ALL_BT); if (hardlockup_panic) nmi_panic(NULL, "Hard LOCKUP"); @@ -415,9 +418,11 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt) xchg(&__wd_nmi_output, 1); // see wd_lockup_ipi - if (sysctl_hardlockup_all_cpu_backtrace) + if (sysctl_hardlockup_all_cpu_backtrace || + (hardlockup_si_mask & SYS_INFO_ALL_BT)) trigger_allbutcpu_cpu_backtrace(cpu); + sys_info(hardlockup_si_mask & ~SYS_INFO_ALL_BT); if (hardlockup_panic) nmi_panic(regs, "Hard LOCKUP"); diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h index 0867115aeef2..c6ae27c982f4 100644 --- a/drivers/dax/dax-private.h +++ b/drivers/dax/dax-private.h @@ -67,14 +67,16 @@ struct dev_dax_range { /** * struct dev_dax - instance data for a subdivision of a dax region, and * data while the device is activated in the driver. 
- * @region - parent region - * @dax_dev - core dax functionality + * @region: parent region + * @dax_dev: core dax functionality + * @align: alignment of this instance * @target_node: effective numa node if dev_dax memory range is onlined * @dyn_id: is this a dynamic or statically created instance * @id: ida allocated id when the dax_region is not static * @ida: mapping id allocator - * @dev - device core - * @pgmap - pgmap for memmap setup / lifetime (driver owned) + * @dev: device core + * @pgmap: pgmap for memmap setup / lifetime (driver owned) + * @memmap_on_memory: allow kmem to put the memmap in the memory * @nr_range: size of @ranges * @ranges: range tuples of memory used */ diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index d236aef80a8d..e1e63c2be82b 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -3,6 +3,7 @@ * Helpers for IOMMU drivers implementing SVA */ #include <linux/mmu_context.h> +#include <linux/mmu_notifier.h> #include <linux/mutex.h> #include <linux/sched/mm.h> #include <linux/iommu.h> diff --git a/include/linux/kfence.h b/include/linux/kfence.h index 0ad1ddbb8b99..e5822f6e7f27 100644 --- a/include/linux/kfence.h +++ b/include/linux/kfence.h @@ -211,6 +211,7 @@ struct kmem_obj_info; * __kfence_obj_info() - fill kmem_obj_info struct * @kpp: kmem_obj_info to be filled * @object: the object + * @slab: the slab * * Return: * * false - not a KFENCE object diff --git a/include/linux/nmi.h b/include/linux/nmi.h index cf3c6ab408aa..207156f2143c 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -83,6 +83,7 @@ static inline void reset_hung_task_detector(void) { } #if defined(CONFIG_HARDLOCKUP_DETECTOR) extern void hardlockup_detector_disable(void); extern unsigned int hardlockup_panic; +extern unsigned long hardlockup_si_mask; #else static inline void hardlockup_detector_disable(void) {} #endif diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 0e1d73955fa5..95d0040df584 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -325,6 +325,7 @@ static inline void might_alloc(gfp_t gfp_mask) /** * memalloc_flags_save - Add a PF_* flag to current->flags, save old value + * @flags: Flags to add. * * This allows PF_* flags to be conveniently added, irrespective of current * value, and then the old version restored with memalloc_flags_restore(). 
diff --git a/include/linux/textsearch.h b/include/linux/textsearch.h index 6673e4d4ac2e..4933777404d6 100644 --- a/include/linux/textsearch.h +++ b/include/linux/textsearch.h @@ -35,6 +35,7 @@ struct ts_state * @get_pattern: return head of pattern * @get_pattern_len: return length of pattern * @owner: module reference to algorithm + * @list: list to search */ struct ts_ops { diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 9dc51fab604f..d4482b6e3cae 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -460,27 +460,23 @@ static void __init deserialize_bitmap(unsigned int order, } } -/* Return true if memory was deserizlied */ -static bool __init kho_mem_deserialize(const void *fdt) +/* Returns physical address of the preserved memory map from FDT */ +static phys_addr_t __init kho_get_mem_map_phys(const void *fdt) { - struct khoser_mem_chunk *chunk; const void *mem_ptr; - u64 mem; int len; mem_ptr = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len); if (!mem_ptr || len != sizeof(u64)) { pr_err("failed to get preserved memory bitmaps\n"); - return false; + return 0; } - mem = get_unaligned((const u64 *)mem_ptr); - chunk = mem ? phys_to_virt(mem) : NULL; - - /* No preserved physical pages were passed, no deserialization */ - if (!chunk) - return false; + return get_unaligned((const u64 *)mem_ptr); +} +static void __init kho_mem_deserialize(struct khoser_mem_chunk *chunk) +{ while (chunk) { unsigned int i; @@ -489,8 +485,6 @@ static bool __init kho_mem_deserialize(const void *fdt) &chunk->bitmaps[i]); chunk = KHOSER_LOAD_PTR(chunk->hdr.next); } - - return true; } /* @@ -1253,6 +1247,7 @@ bool kho_finalized(void) struct kho_in { phys_addr_t fdt_phys; phys_addr_t scratch_phys; + phys_addr_t mem_map_phys; struct kho_debugfs dbg; }; @@ -1434,12 +1429,10 @@ static void __init kho_release_scratch(void) void __init kho_memory_init(void) { - if (kho_in.scratch_phys) { + if (kho_in.mem_map_phys) { kho_scratch = phys_to_virt(kho_in.scratch_phys); kho_release_scratch(); - - if (!kho_mem_deserialize(kho_get_fdt())) - kho_in.fdt_phys = 0; + kho_mem_deserialize(phys_to_virt(kho_in.mem_map_phys)); } else { kho_reserve_scratch(); } @@ -1448,8 +1441,9 @@ void __init kho_memory_init(void) void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len, phys_addr_t scratch_phys, u64 scratch_len) { - void *fdt = NULL; struct kho_scratch *scratch = NULL; + phys_addr_t mem_map_phys; + void *fdt = NULL; int err = 0; unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch); @@ -1475,6 +1469,12 @@ void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len, goto out; } + mem_map_phys = kho_get_mem_map_phys(fdt); + if (!mem_map_phys) { + err = -ENOENT; + goto out; + } + scratch = early_memremap(scratch_phys, scratch_len); if (!scratch) { pr_warn("setup: failed to memremap scratch (phys=0x%llx, len=%lld)\n", @@ -1515,6 +1515,7 @@ void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len, kho_in.fdt_phys = fdt_phys; kho_in.scratch_phys = scratch_phys; + kho_in.mem_map_phys = mem_map_phys; kho_scratch_cnt = scratch_cnt; pr_info("found kexec handover data.\n"); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 0685e3a8aa0a..366122f4a0f8 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -71,7 +71,7 @@ unsigned int __read_mostly hardlockup_panic = * hard lockup is detected, it could be task, memory, lock etc. * Refer include/linux/sys_info.h for detailed bit definition. 
*/ -static unsigned long hardlockup_si_mask; +unsigned long hardlockup_si_mask; #ifdef CONFIG_SYSFS diff --git a/lib/buildid.c b/lib/buildid.c index aaf61dfc0919..818331051afe 100644 --- a/lib/buildid.c +++ b/lib/buildid.c @@ -5,6 +5,7 @@ #include <linux/elf.h> #include <linux/kernel.h> #include <linux/pagemap.h> +#include <linux/fs.h> #include <linux/secretmem.h> #define BUILD_ID 3 @@ -46,20 +47,9 @@ static int freader_get_folio(struct freader *r, loff_t file_off) freader_put_folio(r); - /* reject secretmem folios created with memfd_secret() */ - if (secretmem_mapping(r->file->f_mapping)) - return -EFAULT; - + /* only use page cache lookup - fail if not already cached */ r->folio = filemap_get_folio(r->file->f_mapping, file_off >> PAGE_SHIFT); - /* if sleeping is allowed, wait for the page, if necessary */ - if (r->may_fault && (IS_ERR(r->folio) || !folio_test_uptodate(r->folio))) { - filemap_invalidate_lock_shared(r->file->f_mapping); - r->folio = read_cache_folio(r->file->f_mapping, file_off >> PAGE_SHIFT, - NULL, r->file); - filemap_invalidate_unlock_shared(r->file->f_mapping); - } - if (IS_ERR(r->folio) || !folio_test_uptodate(r->folio)) { if (!IS_ERR(r->folio)) folio_put(r->folio); @@ -97,6 +87,24 @@ const void *freader_fetch(struct freader *r, loff_t file_off, size_t sz) return r->data + file_off; } + /* reject secretmem folios created with memfd_secret() */ + if (secretmem_mapping(r->file->f_mapping)) { + r->err = -EFAULT; + return NULL; + } + + /* use __kernel_read() for sleepable context */ + if (r->may_fault) { + ssize_t ret; + + ret = __kernel_read(r->file, r->buf, sz, &file_off); + if (ret != sz) { + r->err = (ret < 0) ? ret : -EIO; + return NULL; + } + return r->buf; + } + /* fetch or reuse folio for given file offset */ r->err = freader_get_folio(r, file_off); if (r->err) diff --git a/mm/damon/core.c b/mm/damon/core.c index f9fc0375890a..84f80a20f233 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1431,6 +1431,35 @@ bool damon_is_running(struct damon_ctx *ctx) return running; } +/* + * damon_call_handle_inactive_ctx() - handle DAMON call request that added to + * an inactive context. + * @ctx: The inactive DAMON context. + * @control: Control variable of the call request. + * + * This function is called in a case that @control is added to @ctx but @ctx is + * not running (inactive). See if @ctx handled @control or not, and cleanup + * @control if it was not handled. + * + * Returns 0 if @control was handled by @ctx, negative error code otherwise. + */ +static int damon_call_handle_inactive_ctx( + struct damon_ctx *ctx, struct damon_call_control *control) +{ + struct damon_call_control *c; + + mutex_lock(&ctx->call_controls_lock); + list_for_each_entry(c, &ctx->call_controls, list) { + if (c == control) { + list_del(&control->list); + mutex_unlock(&ctx->call_controls_lock); + return -EINVAL; + } + } + mutex_unlock(&ctx->call_controls_lock); + return 0; +} + /** * damon_call() - Invoke a given function on DAMON worker thread (kdamond). * @ctx: DAMON context to call the function for. 
@@ -1461,7 +1490,7 @@ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control) list_add_tail(&control->list, &ctx->call_controls); mutex_unlock(&ctx->call_controls_lock); if (!damon_is_running(ctx)) - return -EINVAL; + return damon_call_handle_inactive_ctx(ctx, control); if (control->repeat) return 0; wait_for_completion(&control->completion); @@ -2051,13 +2080,15 @@ static unsigned long damos_get_node_memcg_used_bp( rcu_read_lock(); memcg = mem_cgroup_from_id(goal->memcg_id); - rcu_read_unlock(); - if (!memcg) { + if (!memcg || !mem_cgroup_tryget(memcg)) { + rcu_read_unlock(); if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP) return 0; else /* DAMOS_QUOTA_NODE_MEMCG_FREE_BP */ return 10000; } + rcu_read_unlock(); + mem_cgroup_flush_stats(memcg); lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(goal->nid)); used_pages = lruvec_page_state(lruvec, NR_ACTIVE_ANON); @@ -2065,6 +2096,8 @@ static unsigned long damos_get_node_memcg_used_bp( used_pages += lruvec_page_state(lruvec, NR_ACTIVE_FILE); used_pages += lruvec_page_state(lruvec, NR_INACTIVE_FILE); + mem_cgroup_put(memcg); + si_meminfo_node(&i, goal->nid); if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP) numerator = used_pages; @@ -2751,13 +2784,13 @@ done: if (ctx->ops.cleanup) ctx->ops.cleanup(ctx); kfree(ctx->regions_score_histogram); + kdamond_call(ctx, true); pr_debug("kdamond (%d) finishes\n", current->pid); mutex_lock(&ctx->kdamond_lock); ctx->kdamond = NULL; mutex_unlock(&ctx->kdamond_lock); - kdamond_call(ctx, true); damos_walk_cancel(ctx); mutex_lock(&damon_lock); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 30d20f5b3192..3a699dcd5a7f 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -2152,13 +2152,13 @@ static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) return err; err = damos_sysfs_set_dests(scheme); if (err) - goto put_access_pattern_out; + goto rmdir_put_access_pattern_out; err = damon_sysfs_scheme_set_quotas(scheme); if (err) goto put_dests_out; err = damon_sysfs_scheme_set_watermarks(scheme); if (err) - goto put_quotas_access_pattern_out; + goto rmdir_put_quotas_access_pattern_out; err = damos_sysfs_set_filter_dirs(scheme); if (err) goto put_watermarks_quotas_access_pattern_out; @@ -2183,13 +2183,15 @@ put_filters_watermarks_quotas_access_pattern_out: put_watermarks_quotas_access_pattern_out: kobject_put(&scheme->watermarks->kobj); scheme->watermarks = NULL; -put_quotas_access_pattern_out: +rmdir_put_quotas_access_pattern_out: + damon_sysfs_quotas_rm_dirs(scheme->quotas); kobject_put(&scheme->quotas->kobj); scheme->quotas = NULL; put_dests_out: kobject_put(&scheme->dests->kobj); scheme->dests = NULL; -put_access_pattern_out: +rmdir_put_access_pattern_out: + damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern); kobject_put(&scheme->access_pattern->kobj); scheme->access_pattern = NULL; return err; diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index e2bd2d7becdd..95fd9375a7d8 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -792,7 +792,7 @@ static int damon_sysfs_attrs_add_dirs(struct damon_sysfs_attrs *attrs) nr_regions_range = damon_sysfs_ul_range_alloc(10, 1000); if (!nr_regions_range) { err = -ENOMEM; - goto put_intervals_out; + goto rmdir_put_intervals_out; } err = kobject_init_and_add(&nr_regions_range->kobj, @@ -806,6 +806,8 @@ static int damon_sysfs_attrs_add_dirs(struct damon_sysfs_attrs *attrs) put_nr_regions_intervals_out: kobject_put(&nr_regions_range->kobj); attrs->nr_regions_range = NULL; 
+rmdir_put_intervals_out: + damon_sysfs_intervals_rm_dirs(intervals); put_intervals_out: kobject_put(&intervals->kobj); attrs->intervals = NULL; @@ -948,7 +950,7 @@ static int damon_sysfs_context_add_dirs(struct damon_sysfs_context *context) err = damon_sysfs_context_set_targets(context); if (err) - goto put_attrs_out; + goto rmdir_put_attrs_out; err = damon_sysfs_context_set_schemes(context); if (err) @@ -958,7 +960,8 @@ static int damon_sysfs_context_add_dirs(struct damon_sysfs_context *context) put_targets_attrs_out: kobject_put(&context->targets->kobj); context->targets = NULL; -put_attrs_out: +rmdir_put_attrs_out: + damon_sysfs_attrs_rm_dirs(context->attrs); kobject_put(&context->attrs->kobj); context->attrs = NULL; return err; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 51273baec9e5..e0ab14020513 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4286,6 +4286,11 @@ static int __init hugepages_setup(char *s) unsigned long tmp; char *p = s; + if (!hugepages_supported()) { + pr_warn("HugeTLB: hugepages unsupported, ignoring hugepages=%s cmdline\n", s); + return 0; + } + if (!parsed_valid_hugepagesz) { pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s); parsed_valid_hugepagesz = true; @@ -4366,6 +4371,11 @@ static int __init hugepagesz_setup(char *s) unsigned long size; struct hstate *h; + if (!hugepages_supported()) { + pr_warn("HugeTLB: hugepages unsupported, ignoring hugepagesz=%s cmdline\n", s); + return 0; + } + parsed_valid_hugepagesz = false; size = (unsigned long)memparse(s, NULL); @@ -4414,6 +4424,12 @@ static int __init default_hugepagesz_setup(char *s) unsigned long size; int i; + if (!hugepages_supported()) { + pr_warn("HugeTLB: hugepages unsupported, ignoring default_hugepagesz=%s cmdline\n", + s); + return 0; + } + parsed_valid_hugepagesz = false; if (parsed_default_hugepagesz) { pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s); diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index e7f554a31bb4..9e1c5f2b7a41 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -207,7 +207,7 @@ void kmsan_free_page(struct page *page, unsigned int order) if (!kmsan_enabled || kmsan_in_runtime()) return; kmsan_enter_runtime(); - kmsan_internal_poison_memory(page_address(page), page_size(page), + kmsan_internal_poison_memory(page_address(page), PAGE_SIZE << order, GFP_KERNEL & ~(__GFP_RECLAIM), KMSAN_POISON_CHECK | KMSAN_POISON_FREE); kmsan_leave_runtime(); diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c index 5b009a9cd8b4..8f5735fda0a2 100644 --- a/mm/numa_memblks.c +++ b/mm/numa_memblks.c @@ -7,6 +7,8 @@ #include <linux/numa.h> #include <linux/numa_memblks.h> +#include <asm/numa.h> + int numa_distance_cnt; static u8 *numa_distance; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c380f063e8b7..f65c4edf199d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -167,6 +167,33 @@ static inline void __pcp_trylock_noop(unsigned long *flags) { } pcp_trylock_finish(UP_flags); \ }) +/* + * With the UP spinlock implementation, when we spin_lock(&pcp->lock) (for i.e. + * a potentially remote cpu drain) and get interrupted by an operation that + * attempts pcp_spin_trylock(), we can't rely on the trylock failure due to UP + * spinlock assumptions making the trylock a no-op. So we have to turn that + * spin_lock() to a spin_lock_irqsave(). This works because on UP there are no + * remote cpu's so we can only be locking the only existing local one. 
+ */ +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) +static inline void __flags_noop(unsigned long *flags) { } +#define pcp_spin_lock_maybe_irqsave(ptr, flags) \ +({ \ + __flags_noop(&(flags)); \ + spin_lock(&(ptr)->lock); \ +}) +#define pcp_spin_unlock_maybe_irqrestore(ptr, flags) \ +({ \ + spin_unlock(&(ptr)->lock); \ + __flags_noop(&(flags)); \ +}) +#else +#define pcp_spin_lock_maybe_irqsave(ptr, flags) \ + spin_lock_irqsave(&(ptr)->lock, flags) +#define pcp_spin_unlock_maybe_irqrestore(ptr, flags) \ + spin_unlock_irqrestore(&(ptr)->lock, flags) +#endif + #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); @@ -2556,6 +2583,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) { int high_min, to_drain, to_drain_batched, batch; + unsigned long UP_flags; bool todo = false; high_min = READ_ONCE(pcp->high_min); @@ -2575,9 +2603,9 @@ bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) to_drain = pcp->count - pcp->high; while (to_drain > 0) { to_drain_batched = min(to_drain, batch); - spin_lock(&pcp->lock); + pcp_spin_lock_maybe_irqsave(pcp, UP_flags); free_pcppages_bulk(zone, to_drain_batched, pcp, 0); - spin_unlock(&pcp->lock); + pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags); todo = true; to_drain -= to_drain_batched; @@ -2594,14 +2622,15 @@ bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) */ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { + unsigned long UP_flags; int to_drain, batch; batch = READ_ONCE(pcp->batch); to_drain = min(pcp->count, batch); if (to_drain > 0) { - spin_lock(&pcp->lock); + pcp_spin_lock_maybe_irqsave(pcp, UP_flags); free_pcppages_bulk(zone, to_drain, pcp, 0); - spin_unlock(&pcp->lock); + pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags); } } #endif @@ -2612,10 +2641,11 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) static void drain_pages_zone(unsigned int cpu, struct zone *zone) { struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); + unsigned long UP_flags; int count; do { - spin_lock(&pcp->lock); + pcp_spin_lock_maybe_irqsave(pcp, UP_flags); count = pcp->count; if (count) { int to_drain = min(count, @@ -2624,7 +2654,7 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone) free_pcppages_bulk(zone, to_drain, pcp, 0); count -= to_drain; } - spin_unlock(&pcp->lock); + pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags); } while (count); } @@ -6109,6 +6139,7 @@ static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu) { struct per_cpu_pages *pcp; struct cpu_cacheinfo *cci; + unsigned long UP_flags; pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); cci = get_cpu_cacheinfo(cpu); @@ -6119,12 +6150,12 @@ static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu) * This can reduce zone lock contention without hurting * cache-hot pages sharing. 
*/ - spin_lock(&pcp->lock); + pcp_spin_lock_maybe_irqsave(pcp, UP_flags); if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch) pcp->flags |= PCPF_FREE_HIGH_BATCH; else pcp->flags &= ~PCPF_FREE_HIGH_BATCH; - spin_unlock(&pcp->lock); + pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags); } void setup_pcp_cacheinfo(unsigned int cpu) @@ -6667,11 +6698,19 @@ static int percpu_pagelist_high_fraction_sysctl_handler(const struct ctl_table * int old_percpu_pagelist_high_fraction; int ret; + /* + * Avoid using pcp_batch_high_lock for reads as the value is read + * atomically and a race with offlining is harmless. + */ + + if (!write) + return proc_dointvec_minmax(table, write, buffer, length, ppos); + mutex_lock(&pcp_batch_high_lock); old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction; ret = proc_dointvec_minmax(table, write, buffer, length, ppos); - if (!write || ret < 0) + if (ret < 0) goto out; /* Sanity checking to avoid pcp imbalance */ @@ -67,18 +67,13 @@ struct mmap_state { .state = VMA_MERGE_START, \ } -/* - * If, at any point, the VMA had unCoW'd mappings from parents, it will maintain - * more than one anon_vma_chain connecting it to more than one anon_vma. A merge - * would mean a wider range of folios sharing the root anon_vma lock, and thus - * potential lock contention, we do not wish to encourage merging such that this - * scales to a problem. - */ -static bool vma_had_uncowed_parents(struct vm_area_struct *vma) +/* Was this VMA ever forked from a parent, i.e. maybe contains CoW mappings? */ +static bool vma_is_fork_child(struct vm_area_struct *vma) { /* * The list_is_singular() test is to avoid merging VMA cloned from - * parents. This can improve scalability caused by anon_vma lock. + * parents. This can improve scalability caused by the anon_vma root + * lock. */ return vma && vma->anon_vma && !list_is_singular(&vma->anon_vma_chain); } @@ -115,11 +110,19 @@ static bool is_mergeable_anon_vma(struct vma_merge_struct *vmg, bool merge_next) VM_WARN_ON(src && src_anon != src->anon_vma); /* Case 1 - we will dup_anon_vma() from src into tgt. */ - if (!tgt_anon && src_anon) - return !vma_had_uncowed_parents(src); + if (!tgt_anon && src_anon) { + struct vm_area_struct *copied_from = vmg->copied_from; + + if (vma_is_fork_child(src)) + return false; + if (vma_is_fork_child(copied_from)) + return false; + + return true; + } /* Case 2 - we will simply use tgt's anon_vma. */ if (tgt_anon && !src_anon) - return !vma_had_uncowed_parents(tgt); + return !vma_is_fork_child(tgt); /* Case 3 - the anon_vma's are already shared. */ return src_anon == tgt_anon; } @@ -829,6 +832,8 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( VM_WARN_ON_VMG(middle && !(vma_iter_addr(vmg->vmi) >= middle->vm_start && vma_iter_addr(vmg->vmi) < middle->vm_end), vmg); + /* An existing merge can never be used by the mremap() logic. */ + VM_WARN_ON_VMG(vmg->copied_from, vmg); vmg->state = VMA_MERGE_NOMERGE; @@ -1099,6 +1104,33 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) } /* + * vma_merge_copied_range - Attempt to merge a VMA that is being copied by + * mremap() + * + * @vmg: Describes the VMA we are adding, in the copied-to range @vmg->start to + * @vmg->end (exclusive), which we try to merge with any adjacent VMAs if + * possible. + * + * vmg->prev, next, start, end, pgoff should all be relative to the COPIED TO + * range, i.e. the target range for the VMA. + * + * Returns: In instances where no merge was possible, NULL. 
Otherwise, a pointer + * to the VMA we expanded. + * + * ASSUMPTIONS: Same as vma_merge_new_range(), except vmg->middle must contain + * the copied-from VMA. + */ +static struct vm_area_struct *vma_merge_copied_range(struct vma_merge_struct *vmg) +{ + /* We must have a copied-from VMA. */ + VM_WARN_ON_VMG(!vmg->middle, vmg); + + vmg->copied_from = vmg->middle; + vmg->middle = NULL; + return vma_merge_new_range(vmg); +} + +/* * vma_expand - Expand an existing VMA * * @vmg: Describes a VMA expansion operation. @@ -1117,46 +1149,52 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) int vma_expand(struct vma_merge_struct *vmg) { struct vm_area_struct *anon_dup = NULL; - bool remove_next = false; struct vm_area_struct *target = vmg->target; struct vm_area_struct *next = vmg->next; + bool remove_next = false; vm_flags_t sticky_flags; - - sticky_flags = vmg->vm_flags & VM_STICKY; - sticky_flags |= target->vm_flags & VM_STICKY; - - VM_WARN_ON_VMG(!target, vmg); + int ret = 0; mmap_assert_write_locked(vmg->mm); - vma_start_write(target); - if (next && (target != next) && (vmg->end == next->vm_end)) { - int ret; - sticky_flags |= next->vm_flags & VM_STICKY; + if (next && target != next && vmg->end == next->vm_end) remove_next = true; - /* This should already have been checked by this point. */ - VM_WARN_ON_VMG(!can_merge_remove_vma(next), vmg); - vma_start_write(next); - /* - * In this case we don't report OOM, so vmg->give_up_on_mm is - * safe. - */ - ret = dup_anon_vma(target, next, &anon_dup); - if (ret) - return ret; - } + /* We must have a target. */ + VM_WARN_ON_VMG(!target, vmg); + /* This should have already been checked by this point. */ + VM_WARN_ON_VMG(remove_next && !can_merge_remove_vma(next), vmg); /* Not merging but overwriting any part of next is not handled. */ VM_WARN_ON_VMG(next && !remove_next && next != target && vmg->end > next->vm_start, vmg); - /* Only handles expanding */ + /* Only handles expanding. */ VM_WARN_ON_VMG(target->vm_start < vmg->start || target->vm_end > vmg->end, vmg); + sticky_flags = vmg->vm_flags & VM_STICKY; + sticky_flags |= target->vm_flags & VM_STICKY; if (remove_next) - vmg->__remove_next = true; + sticky_flags |= next->vm_flags & VM_STICKY; + + /* + * If we are removing the next VMA or copying from a VMA + * (e.g. mremap()'ing), we must propagate anon_vma state. + * + * Note that, by convention, callers ignore OOM for this case, so + * we don't need to account for vmg->give_up_on_mm here. + */ + if (remove_next) + ret = dup_anon_vma(target, next, &anon_dup); + if (!ret && vmg->copied_from) + ret = dup_anon_vma(target, vmg->copied_from, &anon_dup); + if (ret) + return ret; + if (remove_next) { + vma_start_write(next); + vmg->__remove_next = true; + } if (commit_merge(vmg)) goto nomem; @@ -1828,10 +1866,9 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (new_vma && new_vma->vm_start < addr + len) return NULL; /* should never get here */ - vmg.middle = NULL; /* New VMA range. */ vmg.pgoff = pgoff; vmg.next = vma_iter_next_rewind(&vmi, NULL); - new_vma = vma_merge_new_range(&vmg); + new_vma = vma_merge_copied_range(&vmg); if (new_vma) { /* @@ -106,6 +106,9 @@ struct vma_merge_struct { struct anon_vma_name *anon_name; enum vma_merge_state state; + /* If copied from (i.e. mremap()'d) the VMA from which we are copying. 
*/ + struct vm_area_struct *copied_from; + /* Flags which callers can use to modify merge behaviour: */ /* diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 41dd01e8430c..628f96e83b11 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4248,7 +4248,7 @@ void *vzalloc_node_noprof(unsigned long size, int node) EXPORT_SYMBOL(vzalloc_node_noprof); /** - * vrealloc_node_align_noprof - reallocate virtually contiguous memory; contents + * vrealloc_node_align - reallocate virtually contiguous memory; contents * remain unchanged * @p: object to reallocate memory for * @size: the size to reallocate diff --git a/mm/zswap.c b/mm/zswap.c index 5d0f8b13a958..ac9b7a60736b 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -787,7 +787,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) return 0; fail: - if (acomp) + if (!IS_ERR_OR_NULL(acomp)) crypto_free_acomp(acomp); kfree(buffer); return ret; diff --git a/tools/testing/selftests/mm/gup_longterm.c b/tools/testing/selftests/mm/gup_longterm.c index 6279893a0adc..f61150d28eb2 100644 --- a/tools/testing/selftests/mm/gup_longterm.c +++ b/tools/testing/selftests/mm/gup_longterm.c @@ -179,7 +179,7 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared) if (rw && shared && fs_is_unknown(fs_type)) { ksft_print_msg("Unknown filesystem\n"); result = KSFT_SKIP; - return; + break; } /* * R/O pinning or pinning in a private mapping is always diff --git a/tools/testing/selftests/mm/merge.c b/tools/testing/selftests/mm/merge.c index 363c1033cc7d..10b686102b79 100644 --- a/tools/testing/selftests/mm/merge.c +++ b/tools/testing/selftests/mm/merge.c @@ -22,12 +22,37 @@ FIXTURE(merge) struct procmap_fd procmap; }; +static char *map_carveout(unsigned int page_size) +{ + return mmap(NULL, 30 * page_size, PROT_NONE, + MAP_ANON | MAP_PRIVATE, -1, 0); +} + +static pid_t do_fork(struct procmap_fd *procmap) +{ + pid_t pid = fork(); + + if (pid == -1) + return -1; + if (pid != 0) { + wait(NULL); + return pid; + } + + /* Reopen for child. */ + if (close_procmap(procmap)) + return -1; + if (open_self_procmap(procmap)) + return -1; + + return 0; +} + FIXTURE_SETUP(merge) { self->page_size = psize(); /* Carve out PROT_NONE region to map over. */ - self->carveout = mmap(NULL, 30 * self->page_size, PROT_NONE, - MAP_ANON | MAP_PRIVATE, -1, 0); + self->carveout = map_carveout(self->page_size); ASSERT_NE(self->carveout, MAP_FAILED); /* Setup PROCMAP_QUERY interface. */ ASSERT_EQ(open_self_procmap(&self->procmap), 0); @@ -36,7 +61,8 @@ FIXTURE_SETUP(merge) FIXTURE_TEARDOWN(merge) { ASSERT_EQ(munmap(self->carveout, 30 * self->page_size), 0); - ASSERT_EQ(close_procmap(&self->procmap), 0); + /* May fail for parent of forked process. */ + close_procmap(&self->procmap); /* * Clear unconditionally, as some tests set this. It is no issue if this * fails (KSM may be disabled for instance). 
@@ -44,6 +70,44 @@ FIXTURE_TEARDOWN(merge) prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0); } +FIXTURE(merge_with_fork) +{ + unsigned int page_size; + char *carveout; + struct procmap_fd procmap; +}; + +FIXTURE_VARIANT(merge_with_fork) +{ + bool forked; +}; + +FIXTURE_VARIANT_ADD(merge_with_fork, forked) +{ + .forked = true, +}; + +FIXTURE_VARIANT_ADD(merge_with_fork, unforked) +{ + .forked = false, +}; + +FIXTURE_SETUP(merge_with_fork) +{ + self->page_size = psize(); + self->carveout = map_carveout(self->page_size); + ASSERT_NE(self->carveout, MAP_FAILED); + ASSERT_EQ(open_self_procmap(&self->procmap), 0); +} + +FIXTURE_TEARDOWN(merge_with_fork) +{ + ASSERT_EQ(munmap(self->carveout, 30 * self->page_size), 0); + ASSERT_EQ(close_procmap(&self->procmap), 0); + /* See above. */ + prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0); +} + TEST_F(merge, mprotect_unfaulted_left) { unsigned int page_size = self->page_size; @@ -322,8 +386,8 @@ TEST_F(merge, forked_target_vma) unsigned int page_size = self->page_size; char *carveout = self->carveout; struct procmap_fd *procmap = &self->procmap; - pid_t pid; char *ptr, *ptr2; + pid_t pid; int i; /* @@ -344,19 +408,10 @@ TEST_F(merge, forked_target_vma) */ ptr[0] = 'x'; - pid = fork(); + pid = do_fork(&self->procmap); ASSERT_NE(pid, -1); - - if (pid != 0) { - wait(NULL); + if (pid != 0) return; - } - - /* Child process below: */ - - /* Reopen for child. */ - ASSERT_EQ(close_procmap(&self->procmap), 0); - ASSERT_EQ(open_self_procmap(&self->procmap), 0); /* unCOWing everything does not cause the AVC to go away. */ for (i = 0; i < 5 * page_size; i += page_size) @@ -386,8 +441,8 @@ TEST_F(merge, forked_source_vma) unsigned int page_size = self->page_size; char *carveout = self->carveout; struct procmap_fd *procmap = &self->procmap; - pid_t pid; char *ptr, *ptr2; + pid_t pid; int i; /* @@ -408,19 +463,10 @@ TEST_F(merge, forked_source_vma) */ ptr[0] = 'x'; - pid = fork(); + pid = do_fork(&self->procmap); ASSERT_NE(pid, -1); - - if (pid != 0) { - wait(NULL); + if (pid != 0) return; - } - - /* Child process below: */ - - /* Reopen for child. */ - ASSERT_EQ(close_procmap(&self->procmap), 0); - ASSERT_EQ(open_self_procmap(&self->procmap), 0); /* unCOWing everything does not cause the AVC to go away. */ for (i = 0; i < 5 * page_size; i += page_size) @@ -1171,4 +1217,288 @@ TEST_F(merge, mremap_correct_placed_faulted) ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 15 * page_size); } +TEST_F(merge_with_fork, mremap_faulted_to_unfaulted_prev) +{ + struct procmap_fd *procmap = &self->procmap; + unsigned int page_size = self->page_size; + unsigned long offset; + char *ptr_a, *ptr_b; + + /* + * mremap() such that A and B merge: + * + * |------------| + * | \ | + * |-----------| | / |---------| + * | unfaulted | v \ | faulted | + * |-----------| / |---------| + * B \ A + */ + + /* Map VMA A into place. */ + ptr_a = mmap(&self->carveout[page_size + 3 * page_size], + 3 * page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + ASSERT_NE(ptr_a, MAP_FAILED); + /* Fault it in. */ + ptr_a[0] = 'x'; + + if (variant->forked) { + pid_t pid = do_fork(&self->procmap); + + ASSERT_NE(pid, -1); + if (pid != 0) + return; + } + + /* + * Now move it out of the way so we can place VMA B in position, + * unfaulted. + */ + ptr_a = mremap(ptr_a, 3 * page_size, 3 * page_size, + MREMAP_FIXED | MREMAP_MAYMOVE, &self->carveout[20 * page_size]); + ASSERT_NE(ptr_a, MAP_FAILED); + + /* Map VMA B into place. 
*/ + ptr_b = mmap(&self->carveout[page_size], 3 * page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + ASSERT_NE(ptr_b, MAP_FAILED); + + /* + * Now move VMA A into position with MREMAP_DONTUNMAP to catch incorrect + * anon_vma propagation. + */ + ptr_a = mremap(ptr_a, 3 * page_size, 3 * page_size, + MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP, + &self->carveout[page_size + 3 * page_size]); + ASSERT_NE(ptr_a, MAP_FAILED); + + /* The VMAs should have merged, if not forked. */ + ASSERT_TRUE(find_vma_procmap(procmap, ptr_b)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_b); + + offset = variant->forked ? 3 * page_size : 6 * page_size; + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_b + offset); +} + +TEST_F(merge_with_fork, mremap_faulted_to_unfaulted_next) +{ + struct procmap_fd *procmap = &self->procmap; + unsigned int page_size = self->page_size; + unsigned long offset; + char *ptr_a, *ptr_b; + + /* + * mremap() such that A and B merge: + * + * |---------------------------| + * | \ | + * | |-----------| / |---------| + * v | unfaulted | \ | faulted | + * |-----------| / |---------| + * B \ A + * + * Then unmap VMA A to trigger the bug. + */ + + /* Map VMA A into place. */ + ptr_a = mmap(&self->carveout[page_size], 3 * page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + ASSERT_NE(ptr_a, MAP_FAILED); + /* Fault it in. */ + ptr_a[0] = 'x'; + + if (variant->forked) { + pid_t pid = do_fork(&self->procmap); + + ASSERT_NE(pid, -1); + if (pid != 0) + return; + } + + /* + * Now move it out of the way so we can place VMA B in position, + * unfaulted. + */ + ptr_a = mremap(ptr_a, 3 * page_size, 3 * page_size, + MREMAP_FIXED | MREMAP_MAYMOVE, &self->carveout[20 * page_size]); + ASSERT_NE(ptr_a, MAP_FAILED); + + /* Map VMA B into place. */ + ptr_b = mmap(&self->carveout[page_size + 3 * page_size], 3 * page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + ASSERT_NE(ptr_b, MAP_FAILED); + + /* + * Now move VMA A into position with MREMAP_DONTUNMAP to catch incorrect + * anon_vma propagation. + */ + ptr_a = mremap(ptr_a, 3 * page_size, 3 * page_size, + MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP, + &self->carveout[page_size]); + ASSERT_NE(ptr_a, MAP_FAILED); + + /* The VMAs should have merged, if not forked. */ + ASSERT_TRUE(find_vma_procmap(procmap, ptr_a)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_a); + offset = variant->forked ? 3 * page_size : 6 * page_size; + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_a + offset); +} + +TEST_F(merge_with_fork, mremap_faulted_to_unfaulted_prev_unfaulted_next) +{ + struct procmap_fd *procmap = &self->procmap; + unsigned int page_size = self->page_size; + unsigned long offset; + char *ptr_a, *ptr_b, *ptr_c; + + /* + * mremap() with MREMAP_DONTUNMAP such that A, B and C merge: + * + * |---------------------------| + * | \ | + * |-----------| | |-----------| / |---------| + * | unfaulted | v | unfaulted | \ | faulted | + * |-----------| |-----------| / |---------| + * A C \ B + */ + + /* Map VMA B into place. */ + ptr_b = mmap(&self->carveout[page_size + 3 * page_size], 3 * page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + ASSERT_NE(ptr_b, MAP_FAILED); + /* Fault it in. 
*/ + ptr_b[0] = 'x'; + + if (variant->forked) { + pid_t pid = do_fork(&self->procmap); + + ASSERT_NE(pid, -1); + if (pid != 0) + return; + } + + /* + * Now move it out of the way so we can place VMAs A, C in position, + * unfaulted. + */ + ptr_b = mremap(ptr_b, 3 * page_size, 3 * page_size, + MREMAP_FIXED | MREMAP_MAYMOVE, &self->carveout[20 * page_size]); + ASSERT_NE(ptr_b, MAP_FAILED); + + /* Map VMA A into place. */ + + ptr_a = mmap(&self->carveout[page_size], 3 * page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + ASSERT_NE(ptr_a, MAP_FAILED); + + /* Map VMA C into place. */ + ptr_c = mmap(&self->carveout[page_size + 3 * page_size + 3 * page_size], + 3 * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + ASSERT_NE(ptr_c, MAP_FAILED); + + /* + * Now move VMA B into position with MREMAP_DONTUNMAP to catch incorrect + * anon_vma propagation. + */ + ptr_b = mremap(ptr_b, 3 * page_size, 3 * page_size, + MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP, + &self->carveout[page_size + 3 * page_size]); + ASSERT_NE(ptr_b, MAP_FAILED); + + /* The VMAs should have merged, if not forked. */ + ASSERT_TRUE(find_vma_procmap(procmap, ptr_a)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_a); + offset = variant->forked ? 3 * page_size : 9 * page_size; + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_a + offset); + + /* If forked, B and C should also not have merged. */ + if (variant->forked) { + ASSERT_TRUE(find_vma_procmap(procmap, ptr_b)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_b); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_b + 3 * page_size); + } +} + +TEST_F(merge_with_fork, mremap_faulted_to_unfaulted_prev_faulted_next) +{ + struct procmap_fd *procmap = &self->procmap; + unsigned int page_size = self->page_size; + char *ptr_a, *ptr_b, *ptr_bc; + + /* + * mremap() with MREMAP_DONTUNMAP such that A, B and C merge: + * + * |---------------------------| + * | \ | + * |-----------| | |-----------| / |---------| + * | unfaulted | v | faulted | \ | faulted | + * |-----------| |-----------| / |---------| + * A C \ B + */ + + /* + * Map VMA B and C into place. We have to map them together so their + * anon_vma is the same and the vma->vm_pgoff's are correctly aligned. + */ + ptr_bc = mmap(&self->carveout[page_size + 3 * page_size], + 3 * page_size + 3 * page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + ASSERT_NE(ptr_bc, MAP_FAILED); + + /* Fault it in. */ + ptr_bc[0] = 'x'; + + if (variant->forked) { + pid_t pid = do_fork(&self->procmap); + + ASSERT_NE(pid, -1); + if (pid != 0) + return; + } + + /* + * Now move VMA B out the way (splitting VMA BC) so we can place VMA A + * in position, unfaulted, and leave the remainder of the VMA we just + * moved in place, faulted, as VMA C. + */ + ptr_b = mremap(ptr_bc, 3 * page_size, 3 * page_size, + MREMAP_FIXED | MREMAP_MAYMOVE, &self->carveout[20 * page_size]); + ASSERT_NE(ptr_b, MAP_FAILED); + + /* Map VMA A into place. */ + ptr_a = mmap(&self->carveout[page_size], 3 * page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + ASSERT_NE(ptr_a, MAP_FAILED); + + /* + * Now move VMA B into position with MREMAP_DONTUNMAP to catch incorrect + * anon_vma propagation. + */ + ptr_b = mremap(ptr_b, 3 * page_size, 3 * page_size, + MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP, + &self->carveout[page_size + 3 * page_size]); + ASSERT_NE(ptr_b, MAP_FAILED); + + /* The VMAs should have merged. 
A,B,C if unforked, B, C if forked. */ + if (variant->forked) { + ASSERT_TRUE(find_vma_procmap(procmap, ptr_b)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_b); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_b + 6 * page_size); + } else { + ASSERT_TRUE(find_vma_procmap(procmap, ptr_a)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_a); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_a + 9 * page_size); + } +} + TEST_HARNESS_MAIN |
