// SPDX-License-Identifier: GPL-2.0
/*
 * HugeTLB Vmemmap Optimization (HVO)
 *
 * Copyright (c) 2020, ByteDance. All rights reserved.
 *
 *     Author: Muchun Song <songmuchun@bytedance.com>
 *
 * See Documentation/mm/vmemmap_dedup.rst
 */
#define pr_fmt(fmt)	"HugeTLB: " fmt

#include <linux/pgtable.h>
#include <linux/moduleparam.h>
#include <linux/bootmem_info.h>
#include <linux/mmdebug.h>
#include <linux/pagewalk.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"
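
/*
 * Rough arithmetic behind HVO, assuming 4 KiB base pages and a 64-byte
 * struct page (as on x86_64): a 2 MiB HugeTLB page spans 512 base pages, so
 * its vmemmap is 512 * 64 B = 32 KiB, i.e. 8 vmemmap pages.  HVO keeps the
 * first of those (HUGETLB_VMEMMAP_RESERVE_SIZE) and remaps the remaining 7
 * read-only to it, freeing 7 pages per 2 MiB HugeTLB page.  A 1 GiB HugeTLB
 * page has 16 MiB of vmemmap (4096 pages), of which 4095 are freed.
 */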

/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte:		called for each lowest-level entry (PTE).
 * @nr_walked:		the number of walked pte.
 * @reuse_page:		the page which is reused for the tail vmemmap pages.
 * @reuse_addr:		the virtual address of the @reuse_page page.
 * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
 *			or is mapped from.
 * @flags:		used to modify behavior in vmemmap page table walking
 *			operations.
 */
struct vmemmap_remap_walk {
	void (*remap_pte)(pte_t *pte, unsigned long addr,
			  struct vmemmap_remap_walk *walk);
	unsigned long nr_walked;
	struct page *reuse_page;
	unsigned long reuse_addr;
	struct list_head *vmemmap_pages;

/* Skip the TLB flush when we split the PMD */
#define VMEMMAP_SPLIT_NO_TLB_FLUSH	BIT(0)
/* Skip the TLB flush when we remap the PTE */
#define VMEMMAP_REMAP_NO_TLB_FLUSH	BIT(1)
	unsigned long flags;
};
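
/*
 * Usage summary: the walk below runs in up to two passes.  A "split" pass
 * leaves @remap_pte NULL and only breaks PMD-mapped vmemmap down into PTE
 * mappings (vmemmap_remap_split()), optionally deferring the TLB flush via
 * VMEMMAP_SPLIT_NO_TLB_FLUSH.  A "remap" pass then sets @remap_pte
 * (vmemmap_remap_pte() when freeing, vmemmap_restore_pte() when restoring)
 * and rewrites every PTE; VMEMMAP_REMAP_NO_TLB_FLUSH lets batch callers such
 * as hugetlb_vmemmap_optimize_folios() issue a single flush_tlb_all() at the
 * end instead of flushing per folio.
 */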

static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start,
			     struct vmemmap_remap_walk *walk)
{
	pmd_t __pmd;
	int i;
	unsigned long addr = start;
	pte_t *pgtable;

	pgtable = pte_alloc_one_kernel(&init_mm);
	if (!pgtable)
		return -ENOMEM;

	pmd_populate_kernel(&init_mm, &__pmd, pgtable);

	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
		pte_t entry, *pte;
		pgprot_t pgprot = PAGE_KERNEL;

		entry = mk_pte(head + i, pgprot);
		pte = pte_offset_kernel(&__pmd, addr);
		set_pte_at(&init_mm, addr, pte, entry);
	}

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_leaf(*pmd))) {
		/*
		 * Higher order allocations from buddy allocator must be able to
		 * be treated as independent small pages (as they can be freed
		 * individually).
		 */
		if (!PageReserved(head))
			split_page(head, get_order(PMD_SIZE));

		/* Make pte visible before pmd. See comment in pmd_install(). */
		smp_wmb();
		pmd_populate_kernel(&init_mm, pmd, pgtable);
		if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH))
			flush_tlb_kernel_range(start, start + PMD_SIZE);
	} else {
		pte_free_kernel(&init_mm, pgtable);
	}
	spin_unlock(&init_mm.page_table_lock);

	return 0;
}

static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	int ret = 0;
	struct page *head;
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/* Only splitting, not remapping the vmemmap pages. */
	if (!vmemmap_walk->remap_pte)
		walk->action = ACTION_CONTINUE;

	spin_lock(&init_mm.page_table_lock);
	head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
	/*
	 * Due to HugeTLB alignment requirements and the vmemmap pages being
	 * at the start of the hotplugged memory region in the
	 * memory_hotplug.memmap_on_memory case, checking whether the vmemmap
	 * page associated with the first vmemmap page is self-hosted is
	 * sufficient.
	 *
	 * [                hotplugged memory                ]
	 * [       section       ][...][      section       ]
	 * [ vmemmap ][           usable memory             ]
	 *   ^   |                 ^                        |
	 *   +---+                 |                        |
	 *                         +------------------------+
	 */
	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) {
		struct page *page = head ? head + pte_index(addr) :
				    pte_page(ptep_get(pte_offset_kernel(pmd, addr)));

		if (PageVmemmapSelfHosted(page))
			ret = -ENOTSUPP;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (!head || ret)
		return ret;

	return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk);
}

static int vmemmap_pte_entry(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/*
	 * The reuse_page is found 'first' in page table walking before
	 * starting remapping.
	 */
	if (!vmemmap_walk->reuse_page)
		vmemmap_walk->reuse_page = pte_page(ptep_get(pte));
	else
		vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
	vmemmap_walk->nr_walked++;

	return 0;
}

static const struct mm_walk_ops vmemmap_remap_ops = {
	.pmd_entry	= vmemmap_pmd_entry,
	.pte_entry	= vmemmap_pte_entry,
};

static int vmemmap_remap_range(unsigned long start, unsigned long end,
			       struct vmemmap_remap_walk *walk)
{
	int ret;

	VM_BUG_ON(!PAGE_ALIGNED(start | end));

	mmap_read_lock(&init_mm);
	ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops,
				    NULL, walk);
	mmap_read_unlock(&init_mm);
	if (ret)
		return ret;

	if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
		flush_tlb_kernel_range(start, end);

	return 0;
}

/*
 * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 * allocator or the buddy allocator. If the PG_reserved flag is set, it means
 * that it was allocated from the memblock allocator; free it via
 * free_bootmem_page(). Otherwise, use __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
	if (PageReserved(page)) {
		free_bootmem_page(page);
		mod_node_page_state(page_pgdat(page), NR_MEMMAP_BOOT, -1);
	} else {
		__free_page(page);
		mod_node_page_state(page_pgdat(page), NR_MEMMAP, -1);
	}
}

/* Free a list of the vmemmap pages */
static void free_vmemmap_page_list(struct list_head *list)
{
	struct page *page, *next;

	list_for_each_entry_safe(page, next, list, lru)
		free_vmemmap_page(page);
}

static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
			      struct vmemmap_remap_walk *walk)
{
	/*
	 * Remap the tail pages as read-only to catch illegal write operations
	 * to the tail pages.
	 */
	pgprot_t pgprot = PAGE_KERNEL_RO;
	struct page *page = pte_page(ptep_get(pte));
	pte_t entry;

	/* Remapping the head page requires r/w */
	if (unlikely(addr == walk->reuse_addr)) {
		pgprot = PAGE_KERNEL;
		list_del(&walk->reuse_page->lru);

		/*
		 * Makes sure that preceding stores to the page contents from
		 * vmemmap_remap_free() become visible before the set_pte_at()
		 * write.
		 */
		smp_wmb();
	}

	entry = mk_pte(walk->reuse_page, pgprot);
	list_add(&page->lru, walk->vmemmap_pages);
	set_pte_at(&init_mm, addr, pte, entry);
}
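
/*
 * Illustrative end state for one 2 MiB HugeTLB page (8 vmemmap PTEs,
 * assuming 4 KiB base pages and a 64-byte struct page) after a successful
 * vmemmap_remap_free(): the PTE at @reuse_addr maps the freshly allocated
 * head copy read-write, the remaining 7 PTEs all map that same page
 * read-only, and the 8 pages the range used to be mapped to sit on
 * @vmemmap_pages awaiting free_vmemmap_page_list().
 */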

/*
 * How many struct page structs need to be reset. When we reuse the head
 * struct page, the special metadata (e.g. page->flags or page->mapping)
 * cannot be copied to the tail struct page structs. The invalid values will
 * be caught by the check in free_tail_page_prepare(). In order to avoid the
 * "corrupted mapping in tail page" message, we need to reset at least 3
 * struct page structs (one head struct page and two tail struct pages).
 */
#define NR_RESET_STRUCT_PAGE	3

static inline void reset_struct_pages(struct page *start)
{
	struct page *from = start + NR_RESET_STRUCT_PAGE;

	BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
	memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
}
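
/*
 * Why copying from start + NR_RESET_STRUCT_PAGE works: the entries at and
 * beyond index NR_RESET_STRUCT_PAGE in a restored vmemmap page are plain
 * tail struct pages that never carried head-page metadata, so memcpy()ing
 * three of them over indices 0..2 yields clean templates.  The BUILD_BUG_ON
 * above guarantees that a single vmemmap page holds at least
 * 2 * NR_RESET_STRUCT_PAGE struct pages (64 of them with 4 KiB pages and a
 * 64-byte struct page), so the source range never runs past the page.
 */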

static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
				struct vmemmap_remap_walk *walk)
{
	pgprot_t pgprot = PAGE_KERNEL;
	struct page *page;
	void *to;

	BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);

	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
	list_del(&page->lru);
	to = page_to_virt(page);
	copy_page(to, (void *)walk->reuse_addr);
	reset_struct_pages(to);

	/*
	 * Makes sure that preceding stores to the page contents become visible
	 * before the set_pte_at() write.
	 */
	smp_wmb();
	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}

/**
 * vmemmap_remap_split - split the PMDs backing the vmemmap virtual address
 *                       range [@start, @end) into PTE-level mappings
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_split(unsigned long start, unsigned long end,
			       unsigned long reuse)
{
	struct vmemmap_remap_walk walk = {
		.remap_pte	= NULL,
		.flags		= VMEMMAP_SPLIT_NO_TLB_FLUSH,
	};

	/* See the comment in the vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	return vmemmap_remap_range(reuse, end, &walk);
}

/**
 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 *			to the page which @reuse is mapped to, then free the
 *			vmemmap pages which the range was mapped to.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @vmemmap_pages:	list to deposit vmemmap pages to be freed.  It is the
 *		caller's responsibility to free the pages.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_free(unsigned long start, unsigned long end,
			      unsigned long reuse,
			      struct list_head *vmemmap_pages,
			      unsigned long flags)
{
	int ret;
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_remap_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= vmemmap_pages,
		.flags		= flags,
	};
	int nid = page_to_nid((struct page *)reuse);
	gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;

	/*
	 * Allocate a new head vmemmap page to avoid breaking a contiguous
	 * block of struct page memory when freeing it back to the page
	 * allocator in free_vmemmap_page_list(). This keeps the likely
	 * contiguous struct page backing memory contiguous, allowing for
	 * more allocations of hugepages. Fall back to the currently mapped
	 * head page should the allocation fail.
	 */
	walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
	if (walk.reuse_page) {
		copy_page(page_to_virt(walk.reuse_page),
			  (void *)walk.reuse_addr);
		list_add(&walk.reuse_page->lru, vmemmap_pages);
		mod_node_page_state(NODE_DATA(nid), NR_MEMMAP, 1);
	}

	/*
	 * To make the remapping routine most efficient for the huge pages,
	 * the vmemmap page table walking obeys the following rules
	 * (see vmemmap_pte_entry() for more details):
	 *
	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
	 *   should be contiguous.
	 * - The @reuse address is part of the range [@reuse, @end) that we are
	 *   walking which is passed to vmemmap_remap_range().
	 * - The @reuse address is the first in the complete range.
	 *
	 * So we need to make sure that @start and @reuse meet the above rules.
	 */
	BUG_ON(start - reuse != PAGE_SIZE);

	ret = vmemmap_remap_range(reuse, end, &walk);
	if (ret && walk.nr_walked) {
		end = reuse + walk.nr_walked * PAGE_SIZE;
		/*
		 * vmemmap_pages contains pages from the previous
		 * vmemmap_remap_range call which failed.  These
		 * are pages which were removed from the vmemmap.
		 * They will be restored in the following call.
		 */
		walk = (struct vmemmap_remap_walk) {
			.remap_pte	= vmemmap_restore_pte,
			.reuse_addr	= reuse,
			.vmemmap_pages	= vmemmap_pages,
			.flags		= 0,
		};

		vmemmap_remap_range(reuse, end, &walk);
	}

	return ret;
}

static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
				   struct list_head *list)
{
	gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
	int nid = page_to_nid((struct page *)start);
	struct page *page, *next;
	int i;

	for (i = 0; i < nr_pages; i++) {
		page = alloc_pages_node(nid, gfp_mask, 0);
		if (!page) {
			mod_node_page_state(NODE_DATA(nid), NR_MEMMAP, i);
			goto out;
		}
		list_add(&page->lru, list);
	}

	mod_node_page_state(NODE_DATA(nid), NR_MEMMAP, nr_pages);

	return 0;
out:
	list_for_each_entry_safe(page, next, list, lru)
		__free_page(page);
	return -ENOMEM;
}

/**
 * vmemmap_remap_alloc - remap each page in the vmemmap virtual address range
 *			 [@start, @end) to a newly allocated page.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
			       unsigned long reuse, unsigned long flags)
{
	LIST_HEAD(vmemmap_pages);
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_restore_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= &vmemmap_pages,
		.flags		= flags,
	};

	/* See the comment in the vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
		return -ENOMEM;

	return vmemmap_remap_range(reuse, end, &walk);
}

DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);

static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);
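
/*
 * The default comes from CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON and
 * can be overridden on the kernel command line with hugetlb_free_vmemmap=on
 * (or =off); core_param() registers the parameter without a module prefix.
 */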

static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
					   struct folio *folio, unsigned long flags)
{
	int ret;
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);

	if (!folio_test_hugetlb_vmemmap_optimized(folio))
		return 0;

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * The pages which the vmemmap virtual address range [@vmemmap_start,
	 * @vmemmap_end) are mapped to are freed to the buddy allocator, and
	 * the range is mapped to the page which @vmemmap_reuse is mapped to.
	 * When a HugeTLB page is freed to the buddy allocator, previously
	 * discarded vmemmap pages must be allocated and remapped.
	 */
	ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags);
	if (!ret) {
		folio_clear_hugetlb_vmemmap_optimized(folio);
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
	}

	return ret;
}

/**
 * hugetlb_vmemmap_restore_folio - restore previously optimized (by
 *				   hugetlb_vmemmap_optimize_folio()) vmemmap pages which
 *				   will be reallocated and remapped.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be restored.
 *
 * Return: %0 if @folio's vmemmap pages have been reallocated and remapped,
 * negative error code otherwise.
 */
int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
{
	/* avoid writes from page_ref_add_unless() while unfolding vmemmap */
	synchronize_rcu();

	return __hugetlb_vmemmap_restore_folio(h, folio, 0);
}

/**
 * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
 * @h:			hstate.
 * @folio_list:		list of folios.
 * @non_hvo_folios:	Output list of folios for which vmemmap exists.
 *
 * Return: number of folios for which vmemmap was restored, or an error code
 *		if an error was encountered restoring vmemmap for a folio.
 *		Folios that have vmemmap are moved to the non_hvo_folios
 *		list.  Processing of entries stops when the first error is
 *		encountered.  The folio that experienced the error and all
 *		non-processed folios will remain on folio_list.
 */
long hugetlb_vmemmap_restore_folios(const struct hstate *h,
				    struct list_head *folio_list,
				    struct list_head *non_hvo_folios)
{
	struct folio *folio, *t_folio;
	long restored = 0;
	long ret = 0;

	/* avoid writes from page_ref_add_unless() while unfolding vmemmap */
	synchronize_rcu();

	list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
			ret = __hugetlb_vmemmap_restore_folio(h, folio,
							      VMEMMAP_REMAP_NO_TLB_FLUSH);
			if (ret)
				break;
			restored++;
		}

		/* Add non-optimized folios to output list */
		list_move(&folio->lru, non_hvo_folios);
	}

	if (restored)
		flush_tlb_all();
	if (!ret)
		ret = restored;
	return ret;
}

/* Return true if the vmemmap of a HugeTLB folio both should and can be optimized. */
static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio)
{
	if (folio_test_hugetlb_vmemmap_optimized(folio))
		return false;

	if (!READ_ONCE(vmemmap_optimize_enabled))
		return false;

	if (!hugetlb_vmemmap_optimizable(h))
		return false;

	return true;
}

static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
					    struct folio *folio,
					    struct list_head *vmemmap_pages,
					    unsigned long flags)
{
	int ret = 0;
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);

	if (!vmemmap_should_optimize_folio(h, folio))
		return ret;

	static_branch_inc(&hugetlb_optimize_vmemmap_key);

	/*
	 * Very Subtle
	 * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
	 * immediately after remapping. As a result, subsequent accesses
	 * and modifications to struct pages associated with the hugetlb
	 * page could be to the OLD struct pages. Set the vmemmap optimized
	 * flag here so that it is copied to the new head page. This keeps
	 * the old and new struct pages in sync.
	 * If there is an error during optimization, we will immediately FLUSH
	 * the TLB and clear the flag below.
	 */
	folio_set_hugetlb_vmemmap_optimized(folio);

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
	 * to the page which @vmemmap_reuse is mapped to.  Add pages previously
	 * mapping the range to vmemmap_pages list so that they can be freed by
	 * the caller.
	 */
	ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
				 vmemmap_pages, flags);
	if (ret) {
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
		folio_clear_hugetlb_vmemmap_optimized(folio);
	}

	return ret;
}

/**
 * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be optimized.
 *
 * This function only tries to optimize @folio's vmemmap pages and does not
 * guarantee that the optimization will succeed after it returns. The caller
 * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's
 * vmemmap pages have been optimized.
 */
void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
{
	LIST_HEAD(vmemmap_pages);

	/* avoid writes from page_ref_add_unless() while folding vmemmap */
	synchronize_rcu();

	__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0);
	free_vmemmap_page_list(&vmemmap_pages);
}
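
/*
 * Minimal usage sketch (illustrative only; the real call sites live in
 * mm/hugetlb.c): optimization is attempted on a freshly prepared HugeTLB
 * folio with a zero reference count, and the vmemmap has to be restored
 * before such a folio can be released back to the buddy allocator:
 *
 *	hugetlb_vmemmap_optimize_folio(h, folio);
 *	...
 *	ret = hugetlb_vmemmap_restore_folio(h, folio);
 *	// if ret != 0 (e.g. -ENOMEM), the folio cannot be freed to the buddy
 *	// allocator and has to remain in the HugeTLB pool
 */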

static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio)
{
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	if (!vmemmap_should_optimize_folio(h, folio))
		return 0;

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
	 * @vmemmap_end).
	 */
	return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
}

void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
{
	struct folio *folio;
	LIST_HEAD(vmemmap_pages);

	list_for_each_entry(folio, folio_list, lru) {
		int ret = hugetlb_vmemmap_split_folio(h, folio);

		/*
		 * Splitting the PMD requires allocating a page, so let's fail
		 * early once we encounter the first OOM. There is no point in
		 * retrying, as the split can be done dynamically on remap with
		 * the memory we get back from the vmemmap deduplication.
		 */
		if (ret == -ENOMEM)
			break;
	}

	flush_tlb_all();

	/* avoid writes from page_ref_add_unless() while folding vmemmap */
	synchronize_rcu();

	list_for_each_entry(folio, folio_list, lru) {
		int ret;

		ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages,
						       VMEMMAP_REMAP_NO_TLB_FLUSH);

		/*
		 * Pages to be freed may have been accumulated. If we
		 * encounter an ENOMEM, free what we have and try again.
		 * This can occur when the earlier splitting failed halfway
		 * and the head page allocation also fails. In this case
		 * __hugetlb_vmemmap_optimize_folio() would free memory,
		 * allowing more vmemmap remaps to occur.
		 */
		if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
			flush_tlb_all();
			free_vmemmap_page_list(&vmemmap_pages);
			INIT_LIST_HEAD(&vmemmap_pages);
			__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages,
							 VMEMMAP_REMAP_NO_TLB_FLUSH);
		}
	}

	flush_tlb_all();
	free_vmemmap_page_list(&vmemmap_pages);
}

static struct ctl_table hugetlb_vmemmap_sysctls[] = {
	{
		.procname	= "hugetlb_optimize_vmemmap",
		.data		= &vmemmap_optimize_enabled,
		.maxlen		= sizeof(vmemmap_optimize_enabled),
		.mode		= 0644,
		.proc_handler	= proc_dobool,
	},
};
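
/*
 * Runtime control (the table is registered below only if at least one
 * hstate is optimizable), e.g.:
 *
 *	echo 0 > /proc/sys/vm/hugetlb_optimize_vmemmap
 *	echo 1 > /proc/sys/vm/hugetlb_optimize_vmemmap
 *
 * The knob is only consulted in vmemmap_should_optimize_folio(), so flipping
 * it affects HugeTLB pages allocated afterwards and does not retroactively
 * optimize or restore existing ones.
 */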

static int __init hugetlb_vmemmap_init(void)
{
	const struct hstate *h;

	/* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
	BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES);

	for_each_hstate(h) {
		if (hugetlb_vmemmap_optimizable(h)) {
			register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
			break;
		}
	}
	return 0;
}
late_initcall(hugetlb_vmemmap_init);