From e930d999715073a70d306fb59a394ea8b84d0b45 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 22 Mar 2022 14:46:51 -0700 Subject: mm, memory_hotplug: make arch_alloc_nodedata independent on CONFIG_MEMORY_HOTPLUG Patch series "mm, memory_hotplug: handle unitialized numa node gracefully". The core of the fix is patch 2 which also links existing bug reports. The high level goal is to have all possible numa nodes have their pgdat allocated and initialized so for_each_possible_node(nid) NODE_DATA(nid) will never return garbage. This has proven to be problem in several places when an offline numa node is used for an allocation just to realize that node_data and therefore allocation fallback zonelists are not initialized and such an allocation request blows up. There were attempts to address that by checking node_online in several places including the page allocator. This patchset approaches the problem from a different perspective and instead of special casing, which just adds a runtime overhead, it allocates pglist_data for each possible node. This can add some memory overhead for platforms with high number of possible nodes if they do not contain any memory. This should be a rather rare configuration though. How to test this? David has provided and excellent howto: http://lkml.kernel.org/r/6e5ebc19-890c-b6dd-1924-9f25c441010d@redhat.com Patches 1 and 3-6 are mostly cleanups. The patchset has been reviewed by Rafael (thanks!) and the core fix tested by Rafael and Alexey (thanks to both). David has tested as per instructions above and hasn't found any fallouts in the memory hotplug scenarios. This patch (of 6): This is a preparatory patch and it doesn't introduce any functional change. It merely pulls out arch_alloc_nodedata (and co) outside of CONFIG_MEMORY_HOTPLUG because the following patch will need to call this from the generic MM code. Link: https://lkml.kernel.org/r/20220127085305.20890-1-mhocko@kernel.org Link: https://lkml.kernel.org/r/20220127085305.20890-2-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Rafael Aquini Acked-by: David Hildenbrand Acked-by: Mike Rapoport Reviewed-by: Oscar Salvador Reviewed-by: Wei Yang Cc: Alexey Makhalov Cc: Christoph Lameter Cc: Dennis Zhou Cc: Eric Dumazet Cc: Nico Pache Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 119 ++++++++++++++++++++--------------------- 1 file changed, 59 insertions(+), 60 deletions(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index be48e003a518..4355983b364d 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -16,6 +16,65 @@ struct memory_group; struct resource; struct vmem_altmap; +#ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION +/* + * For supporting node-hotadd, we have to allocate a new pgdat. + * + * If an arch has generic style NODE_DATA(), + * node_data[nid] = kzalloc() works well. But it depends on the architecture. + * + * In general, generic_alloc_nodedata() is used. + * Now, arch_free_nodedata() is just defined for error path of node_hot_add. + * + */ +extern pg_data_t *arch_alloc_nodedata(int nid); +extern void arch_free_nodedata(pg_data_t *pgdat); +extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat); + +#else /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ + +#define arch_alloc_nodedata(nid) generic_alloc_nodedata(nid) +#define arch_free_nodedata(pgdat) generic_free_nodedata(pgdat) + +#ifdef CONFIG_NUMA +/* + * XXX: node aware allocation can't work well to get new node's memory at this time. + * Because, pgdat for the new node is not allocated/initialized yet itself. + * To use new node's memory, more consideration will be necessary. + */ +#define generic_alloc_nodedata(nid) \ +({ \ + kzalloc(sizeof(pg_data_t), GFP_KERNEL); \ +}) +/* + * This definition is just for error path in node hotadd. + * For node hotremove, we have to replace this. + */ +#define generic_free_nodedata(pgdat) kfree(pgdat) + +extern pg_data_t *node_data[]; +static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) +{ + node_data[nid] = pgdat; +} + +#else /* !CONFIG_NUMA */ + +/* never called */ +static inline pg_data_t *generic_alloc_nodedata(int nid) +{ + BUG(); + return NULL; +} +static inline void generic_free_nodedata(pg_data_t *pgdat) +{ +} +static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) +{ +} +#endif /* CONFIG_NUMA */ +#endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ + #ifdef CONFIG_MEMORY_HOTPLUG struct page *pfn_to_online_page(unsigned long pfn); @@ -154,66 +213,6 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, struct mhp_params *params); #endif /* ARCH_HAS_ADD_PAGES */ -#ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION -/* - * For supporting node-hotadd, we have to allocate a new pgdat. - * - * If an arch has generic style NODE_DATA(), - * node_data[nid] = kzalloc() works well. But it depends on the architecture. - * - * In general, generic_alloc_nodedata() is used. - * Now, arch_free_nodedata() is just defined for error path of node_hot_add. - * - */ -extern pg_data_t *arch_alloc_nodedata(int nid); -extern void arch_free_nodedata(pg_data_t *pgdat); -extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat); - -#else /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ - -#define arch_alloc_nodedata(nid) generic_alloc_nodedata(nid) -#define arch_free_nodedata(pgdat) generic_free_nodedata(pgdat) - -#ifdef CONFIG_NUMA -/* - * If ARCH_HAS_NODEDATA_EXTENSION=n, this func is used to allocate pgdat. - * XXX: kmalloc_node() can't work well to get new node's memory at this time. - * Because, pgdat for the new node is not allocated/initialized yet itself. - * To use new node's memory, more consideration will be necessary. - */ -#define generic_alloc_nodedata(nid) \ -({ \ - kzalloc(sizeof(pg_data_t), GFP_KERNEL); \ -}) -/* - * This definition is just for error path in node hotadd. - * For node hotremove, we have to replace this. - */ -#define generic_free_nodedata(pgdat) kfree(pgdat) - -extern pg_data_t *node_data[]; -static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) -{ - node_data[nid] = pgdat; -} - -#else /* !CONFIG_NUMA */ - -/* never called */ -static inline pg_data_t *generic_alloc_nodedata(int nid) -{ - BUG(); - return NULL; -} -static inline void generic_free_nodedata(pg_data_t *pgdat) -{ -} -static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) -{ -} -#endif /* CONFIG_NUMA */ -#endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ - void get_online_mems(void); void put_online_mems(void); -- cgit v1.2.3 From 09f49dca570a917a8c6bccd7e8c61f5141534e3a Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 22 Mar 2022 14:46:54 -0700 Subject: mm: handle uninitialized numa nodes gracefully We have had several reports [1][2][3] that page allocator blows up when an allocation from a possible node is requested. The underlying reason is that NODE_DATA for the specific node is not allocated. NUMA specific initialization is arch specific and it can vary a lot. E.g. x86 tries to initialize all nodes that have some cpu affinity (see init_cpu_to_node) but this can be insufficient because the node might be cpuless for example. One way to address this problem would be to check for !node_online nodes when trying to get a zonelist and silently fall back to another node. That is unfortunately adding a branch into allocator hot path and it doesn't handle any other potential NODE_DATA users. This patch takes a different approach (following a lead of [3]) and it pre allocates pgdat for all possible nodes in an arch indipendent code - free_area_init. All uninitialized nodes are treated as memoryless nodes. node_state of the node is not changed because that would lead to other side effects - e.g. sysfs representation of such a node and from past discussions [4] it is known that some tools might have problems digesting that. Newly allocated pgdat only gets a minimal initialization and the rest of the work is expected to be done by the memory hotplug - hotadd_new_pgdat (renamed to hotadd_init_pgdat). generic_alloc_nodedata is changed to use the memblock allocator because neither page nor slab allocators are available at the stage when all pgdats are allocated. Hotplug doesn't allocate pgdat anymore so we can use the early boot allocator. The only arch specific implementation is ia64 and that is changed to use the early allocator as well. [1] http://lkml.kernel.org/r/20211101201312.11589-1-amakhalov@vmware.com [2] http://lkml.kernel.org/r/20211207224013.880775-1-npache@redhat.com [3] http://lkml.kernel.org/r/20190114082416.30939-1-mhocko@kernel.org [4] http://lkml.kernel.org/r/20200428093836.27190-1-srikar@linux.vnet.ibm.com [akpm@linux-foundation.org: replace comment, per Mike] Link: https://lkml.kernel.org/r/Yfe7RBeLCijnWBON@dhcp22.suse.cz Reported-by: Alexey Makhalov Tested-by: Alexey Makhalov Reported-by: Nico Pache Acked-by: Rafael Aquini Tested-by: Rafael Aquini Acked-by: David Hildenbrand Reviewed-by: Oscar Salvador Acked-by: Mike Rapoport Signed-off-by: Michal Hocko Cc: Christoph Lameter Cc: Dennis Zhou Cc: Eric Dumazet Cc: Tejun Heo Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 4355983b364d..cdd66bfdf855 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -44,7 +44,7 @@ extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat); */ #define generic_alloc_nodedata(nid) \ ({ \ - kzalloc(sizeof(pg_data_t), GFP_KERNEL); \ + memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); \ }) /* * This definition is just for error path in node hotadd. -- cgit v1.2.3 From 390511e1476eb1cc41d420a7661b33f4d8584c3f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 22 Mar 2022 14:46:57 -0700 Subject: mm, memory_hotplug: drop arch_free_nodedata Prior to "mm: handle uninitialized numa nodes gracefully" memory hotplug used to allocate pgdat when memory has been added to a node (hotadd_init_pgdat) arch_free_nodedata has been only used in the failure path because once the pgdat is exported (to be visible by NODA_DATA(nid)) it cannot really be freed because there is no synchronization available for that. pgdat is allocated for each possible nodes now so the memory hotplug doesn't need to do the ever use arch_free_nodedata so drop it. This patch doesn't introduce any functional change. Link: https://lkml.kernel.org/r/20220127085305.20890-4-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Rafael Aquini Acked-by: David Hildenbrand Acked-by: Mike Rapoport Reviewed-by: Oscar Salvador Cc: Alexey Makhalov Cc: Christoph Lameter Cc: Dennis Zhou Cc: Eric Dumazet Cc: Nico Pache Cc: Tejun Heo Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index cdd66bfdf855..60f09d3ebb3d 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -24,17 +24,14 @@ struct vmem_altmap; * node_data[nid] = kzalloc() works well. But it depends on the architecture. * * In general, generic_alloc_nodedata() is used. - * Now, arch_free_nodedata() is just defined for error path of node_hot_add. * */ extern pg_data_t *arch_alloc_nodedata(int nid); -extern void arch_free_nodedata(pg_data_t *pgdat); extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat); #else /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ #define arch_alloc_nodedata(nid) generic_alloc_nodedata(nid) -#define arch_free_nodedata(pgdat) generic_free_nodedata(pgdat) #ifdef CONFIG_NUMA /* -- cgit v1.2.3 From 70b5b46a754245d383811b4d2f2c76c34bb7e145 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 22 Mar 2022 14:47:00 -0700 Subject: mm, memory_hotplug: reorganize new pgdat initialization When a !node_online node is brought up it needs a hotplug specific initialization because the node could be either uninitialized yet or it could have been recycled after previous hotremove. hotadd_init_pgdat is responsible for that. Internal pgdat state is initialized at two places currently - hotadd_init_pgdat - free_area_init_core_hotplug There is no real clear cut what should go where but this patch's chosen to move the whole internal state initialization into free_area_init_core_hotplug. hotadd_init_pgdat is still responsible to pull all the parts together - most notably to initialize zonelists because those depend on the overall topology. This patch doesn't introduce any functional change. Link: https://lkml.kernel.org/r/20220127085305.20890-5-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Rafael Aquini Acked-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Alexey Makhalov Cc: Christoph Lameter Cc: Dennis Zhou Cc: Eric Dumazet Cc: Mike Rapoport Cc: Nico Pache Cc: Tejun Heo Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 60f09d3ebb3d..76bf2de86def 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -319,7 +319,7 @@ extern void set_zone_contiguous(struct zone *zone); extern void clear_zone_contiguous(struct zone *zone); #ifdef CONFIG_MEMORY_HOTPLUG -extern void __ref free_area_init_core_hotplug(int nid); +extern void __ref free_area_init_core_hotplug(struct pglist_data *pgdat); extern int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); extern int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); extern int add_memory_resource(int nid, struct resource *resource, -- cgit v1.2.3 From 395f6081bad49f9c54abafebab49ee23aa985bbd Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 22 Mar 2022 14:47:31 -0700 Subject: drivers/base/memory: determine and store zone for single-zone memory blocks test_pages_in_a_zone() is just another nasty PFN walker that can easily stumble over ZONE_DEVICE memory ranges falling into the same memory block as ordinary system RAM: the memmap of parts of these ranges might possibly be uninitialized. In fact, we observed (on an older kernel) with UBSAN: UBSAN: Undefined behaviour in ./include/linux/mm.h:1133:50 index 7 is out of range for type 'zone [5]' CPU: 121 PID: 35603 Comm: read_all Kdump: loaded Tainted: [...] Hardware name: Dell Inc. PowerEdge R7425/08V001, BIOS 1.12.2 11/15/2019 Call Trace: dump_stack+0x9a/0xf0 ubsan_epilogue+0x9/0x7a __ubsan_handle_out_of_bounds+0x13a/0x181 test_pages_in_a_zone+0x3c4/0x500 show_valid_zones+0x1fa/0x380 dev_attr_show+0x43/0xb0 sysfs_kf_seq_show+0x1c5/0x440 seq_read+0x49d/0x1190 vfs_read+0xff/0x300 ksys_read+0xb8/0x170 do_syscall_64+0xa5/0x4b0 entry_SYSCALL_64_after_hwframe+0x6a/0xdf RIP: 0033:0x7f01f4439b52 We seem to stumble over a memmap that contains a garbage zone id. While we could try inserting pfn_to_online_page() calls, it will just make memory offlining slower, because we use test_pages_in_a_zone() to make sure we're offlining pages that all belong to the same zone. Let's just get rid of this PFN walker and determine the single zone of a memory block -- if any -- for early memory blocks during boot. For memory onlining, we know the single zone already. Let's avoid any additional memmap scanning and just rely on the zone information available during boot. For memory hot(un)plug, we only really care about memory blocks that: * span a single zone (and, thereby, a single node) * are completely System RAM (IOW, no holes, no ZONE_DEVICE) If one of these conditions is not met, we reject memory offlining. Hotplugged memory blocks (starting out offline), always meet both conditions. There are three scenarios to handle: (1) Memory hot(un)plug A memory block with zone == NULL cannot be offlined, corresponding to our previous test_pages_in_a_zone() check. After successful memory onlining/offlining, we simply set the zone accordingly. * Memory onlining: set the zone we just used for onlining * Memory offlining: set zone = NULL So a hotplugged memory block starts with zone = NULL. Once memory onlining is done, we set the proper zone. (2) Boot memory with !CONFIG_NUMA We know that there is just a single pgdat, so we simply scan all zones of that pgdat for an intersection with our memory block PFN range when adding the memory block. If more than one zone intersects (e.g., DMA and DMA32 on x86 for the first memory block) we set zone = NULL and consequently mimic what test_pages_in_a_zone() used to do. (3) Boot memory with CONFIG_NUMA At the point in time we create the memory block devices during boot, we don't know yet which nodes *actually* span a memory block. While we could scan all zones of all nodes for intersections, overlapping nodes complicate the situation and scanning all nodes is possibly expensive. But that problem has already been solved by the code that sets the node of a memory block and creates the link in the sysfs -- do_register_memory_block_under_node(). So, we hook into the code that sets the node id for a memory block. If we already have a different node id set for the memory block, we know that multiple nodes *actually* have PFNs falling into our memory block: we set zone = NULL and consequently mimic what test_pages_in_a_zone() used to do. If there is no node id set, we do the same as (2) for the given node. Note that the call order in driver_init() is: -> memory_dev_init(): create memory block devices -> node_dev_init(): link memory block devices to the node and set the node id So in summary, we detect if there is a single zone responsible for this memory block and we consequently store the zone in that case in the memory block, updating it during memory onlining/offlining. Link: https://lkml.kernel.org/r/20220210184359.235565-3-david@redhat.com Signed-off-by: David Hildenbrand Reported-by: Rafael Parra Reviewed-by: Oscar Salvador Cc: "Rafael J. Wysocki" Cc: Greg Kroah-Hartman Cc: Michal Hocko Cc: Rafael Parra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 76bf2de86def..1ce6f8044f1e 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -163,8 +163,6 @@ extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages); extern int online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group); -extern struct zone *test_pages_in_a_zone(unsigned long start_pfn, - unsigned long end_pfn); extern void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn); @@ -293,7 +291,7 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {} extern void try_offline_node(int nid); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages, - struct memory_group *group); + struct zone *zone, struct memory_group *group); extern int remove_memory(u64 start, u64 size); extern void __remove_memory(u64 start, u64 size); extern int offline_and_remove_memory(u64 start, u64 size); @@ -302,7 +300,7 @@ extern int offline_and_remove_memory(u64 start, u64 size); static inline void try_offline_node(int nid) {} static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages, - struct memory_group *group) + struct zone *zone, struct memory_group *group) { return -EINVAL; } -- cgit v1.2.3 From e3246d8f52173a798710314a42fea83223036fc8 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 28 Apr 2022 23:16:15 -0700 Subject: mm/sparse-vmemmap: add a pgmap argument to section activation Patch series "sparse-vmemmap: memory savings for compound devmaps (device-dax)", v9. This series minimizes 'struct page' overhead by pursuing a similar approach as Muchun Song series "Free some vmemmap pages of hugetlb page" (now merged since v5.14), but applied to devmap with @vmemmap_shift (device-dax). The vmemmap dedpulication original idea (already used in HugeTLB) is to reuse/deduplicate tail page vmemmap areas, particular the area which only describes tail pages. So a vmemmap page describes 64 struct pages, and the first page for a given ZONE_DEVICE vmemmap would contain the head page and 63 tail pages. The second vmemmap page would contain only tail pages, and that's what gets reused across the rest of the subsection/section. The bigger the page size, the bigger the savings (2M hpage -> save 6 vmemmap pages; 1G hpage -> save 4094 vmemmap pages). This is done for PMEM /specifically only/ on device-dax configured namespaces, not fsdax. In other words, a devmap with a @vmemmap_shift. In terms of savings, per 1Tb of memory, the struct page cost would go down with compound devmap: * with 2M pages we lose 4G instead of 16G (0.39% instead of 1.5% of total memory) * with 1G pages we lose 40MB instead of 16G (0.0014% instead of 1.5% of total memory) The series is mostly summed up by patch 4, and to summarize what the series does: Patches 1 - 3: Minor cleanups in preparation for patch 4. Move the very nice docs of hugetlb_vmemmap.c into a Documentation/vm/ entry. Patch 4: Patch 4 is the one that takes care of the struct page savings (also referred to here as tail-page/vmemmap deduplication). Much like Muchun series, we reuse the second PTE tail page vmemmap areas across a given @vmemmap_shift On important difference though, is that contrary to the hugetlbfs series, there's no vmemmap for the area because we are late-populating it as opposed to remapping a system-ram range. IOW no freeing of pages of already initialized vmemmap like the case for hugetlbfs, which greatly simplifies the logic (besides not being arch-specific). altmap case unchanged and still goes via the vmemmap_populate(). Also adjust the newly added docs to the device-dax case. [Note that device-dax is still a little behind HugeTLB in terms of savings. I have an additional simple patch that reuses the head vmemmap page too, as a follow-up. That will double the savings and namespaces initialization.] Patch 5: Initialize fewer struct pages depending on the page size with DRAM backed struct pages -- because fewer pages are unique and most tail pages (with bigger vmemmap_shift). NVDIMM namespace bootstrap improves from ~268-358 ms to ~80-110/<1ms on 128G NVDIMMs with 2M and 1G respectivally. And struct page needed capacity will be 3.8x / 1071x smaller for 2M and 1G respectivelly. Tested on x86 with 1.5Tb of pmem (including pinning, and RDMA registration/deregistration scalability with 2M MRs) This patch (of 5): In support of using compound pages for devmap mappings, plumb the pgmap down to the vmemmap_populate implementation. Note that while altmap is retrievable from pgmap the memory hotplug code passes altmap without pgmap[*], so both need to be independently plumbed. So in addition to @altmap, pass @pgmap to sparse section populate functions namely: sparse_add_section section_activate populate_section_memmap __populate_section_memmap Passing @pgmap allows __populate_section_memmap() to both fetch the vmemmap_shift in which memmap metadata is created for and also to let sparse-vmemmap fetch pgmap ranges to co-relate to a given section and pick whether to just reuse tail pages from past onlined sections. While at it, fix the kdoc for @altmap for sparse_add_section(). [*] https://lore.kernel.org/linux-mm/20210319092635.6214-1-osalvador@suse.de/ Link: https://lkml.kernel.org/r/20220420155310.9712-1-joao.m.martins@oracle.com Link: https://lkml.kernel.org/r/20220420155310.9712-2-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Dan Williams Reviewed-by: Muchun Song Cc: Vishal Verma Cc: Matthew Wilcox Cc: Jason Gunthorpe Cc: Jane Chu Cc: Mike Kravetz Cc: Jonathan Corbet Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 1ce6f8044f1e..e0b2209ab71c 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -15,6 +15,7 @@ struct memory_block; struct memory_group; struct resource; struct vmem_altmap; +struct dev_pagemap; #ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION /* @@ -122,6 +123,7 @@ typedef int __bitwise mhp_t; struct mhp_params { struct vmem_altmap *altmap; pgprot_t pgprot; + struct dev_pagemap *pgmap; }; bool mhp_range_allowed(u64 start, u64 size, bool need_mapping); @@ -333,7 +335,8 @@ extern void remove_pfn_range_from_zone(struct zone *zone, unsigned long nr_pages); extern bool is_memblock_offlined(struct memory_block *mem); extern int sparse_add_section(int nid, unsigned long pfn, - unsigned long nr_pages, struct vmem_altmap *altmap); + unsigned long nr_pages, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap); extern void sparse_remove_section(struct mem_section *ms, unsigned long pfn, unsigned long nr_pages, unsigned long map_offset, struct vmem_altmap *altmap); -- cgit v1.2.3 From 78f39084b41d287aedb2ea55f2c1895cfa11d61a Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 13 May 2022 16:48:56 -0700 Subject: mm: hugetlb_vmemmap: add hugetlb_optimize_vmemmap sysctl We must add hugetlb_free_vmemmap=on (or "off") to the boot cmdline and reboot the server to enable or disable the feature of optimizing vmemmap pages associated with HugeTLB pages. However, rebooting usually takes a long time. So add a sysctl to enable or disable the feature at runtime without rebooting. Why we need this? There are 3 use cases. 1) The feature of minimizing overhead of struct page associated with each HugeTLB is disabled by default without passing "hugetlb_free_vmemmap=on" to the boot cmdline. When we (ByteDance) deliver the servers to the users who want to enable this feature, they have to configure the grub (change boot cmdline) and reboot the servers, whereas rebooting usually takes a long time (we have thousands of servers). It's a very bad experience for the users. So we need a approach to enable this feature after rebooting. This is a use case in our practical environment. 2) Some use cases are that HugeTLB pages are allocated 'on the fly' instead of being pulled from the HugeTLB pool, those workloads would be affected with this feature enabled. Those workloads could be identified by the characteristics of they never explicitly allocating huge pages with 'nr_hugepages' but only set 'nr_overcommit_hugepages' and then let the pages be allocated from the buddy allocator at fault time. We can confirm it is a real use case from the commit 099730d67417. For those workloads, the page fault time could be ~2x slower than before. We suspect those users want to disable this feature if the system has enabled this before and they don't think the memory savings benefit is enough to make up for the performance drop. 3) If the workload which wants vmemmap pages to be optimized and the workload which wants to set 'nr_overcommit_hugepages' and does not want the extera overhead at fault time when the overcommitted pages be allocated from the buddy allocator are deployed in the same server. The user could enable this feature and set 'nr_hugepages' and 'nr_overcommit_hugepages', then disable the feature. In this case, the overcommited HugeTLB pages will not encounter the extra overhead at fault time. Link: https://lkml.kernel.org/r/20220512041142.39501-5-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Cc: Jonathan Corbet Cc: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: Oscar Salvador Cc: David Hildenbrand Cc: Masahiro Yamada Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index e0b2209ab71c..20d7edf62a6a 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -351,4 +351,13 @@ void arch_remove_linear_mapping(u64 start, u64 size); extern bool mhp_supports_memmap_on_memory(unsigned long size); #endif /* CONFIG_MEMORY_HOTPLUG */ +#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY +bool mhp_memmap_on_memory(void); +#else +static inline bool mhp_memmap_on_memory(void) +{ + return false; +} +#endif + #endif /* __LINUX_MEMORY_HOTPLUG_H */ -- cgit v1.2.3