mm/hugetlb_vmemmap.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * HugeTLB Vmemmap Optimization (HVO)
   4  *
   5  * Copyright (c) 2020, ByteDance. All rights reserved.
   6  *
   7  *     Author: Muchun Song <songmuchun@bytedance.com>
   8  *
   9  * See Documentation/mm/vmemmap_dedup.rst
  10  */
  11 #define pr_fmt(fmt)     "HugeTLB: " fmt
  12
  13 #include <linux/pgtable.h>
  14 #include <linux/moduleparam.h>
  15 #include <linux/bootmem_info.h>
  16 #include <linux/mmdebug.h>
  17 #include <linux/pagewalk.h>
  18 #include <asm/pgalloc.h>
  19 #include <asm/tlbflush.h>
  20 #include "hugetlb_vmemmap.h"
  21
/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte:          called for each lowest-level entry (PTE). Left NULL
 *                      when the walk only splits PMDs and does not remap.
 * @nr_walked:          the number of walked pte; incremented once for every
 *                      PTE visited, including the one that maps @reuse_page.
 * @reuse_page:         the page which is reused for the tail vmemmap pages.
 *                      If it is NULL when the walk starts, it is taken from
 *                      the first PTE that is visited.
 * @reuse_addr:         the virtual address of the @reuse_page page.
 * @vmemmap_pages:      the list head of the vmemmap pages. When remapping,
 *                      pages that are no longer mapped are added to it so
 *                      they can be freed; when restoring, it supplies the
 *                      pages that are mapped back in.
 * @flags:              used to modify behavior in vmemmap page table walking
 *                      operations (a mask of the VMEMMAP_* bits below).
 */
struct vmemmap_remap_walk {
        void                    (*remap_pte)(pte_t *pte, unsigned long addr,
                                             struct vmemmap_remap_walk *walk);
        unsigned long           nr_walked;
        struct page             *reuse_page;
        unsigned long           reuse_addr;
        struct list_head        *vmemmap_pages;

/* Skip the TLB flush when we split the PMD */
#define VMEMMAP_SPLIT_NO_TLB_FLUSH      BIT(0)
/* Skip the TLB flush when we remap the PTE */
#define VMEMMAP_REMAP_NO_TLB_FLUSH      BIT(1)
/* synchronize_rcu() to avoid writes from page_ref_add_unless() */
#define VMEMMAP_SYNCHRONIZE_RCU         BIT(2)
        unsigned long           flags;
};
  50
  51 static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start,
  52                              struct vmemmap_remap_walk *walk)
  53 {
  54         pmd_t __pmd;
  55         int i;
  56         unsigned long addr = start;
  57         pte_t *pgtable;
  58
  59         pgtable = pte_alloc_one_kernel(&init_mm);
  60         if (!pgtable)
  61                 return -ENOMEM;
  62
  63         pmd_populate_kernel(&init_mm, &__pmd, pgtable);
  64
  65         for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
  66                 pte_t entry, *pte;
  67                 pgprot_t pgprot = PAGE_KERNEL;
  68
  69                 entry = mk_pte(head + i, pgprot);
  70                 pte = pte_offset_kernel(&__pmd, addr);
  71                 set_pte_at(&init_mm, addr, pte, entry);
  72         }
  73
  74         spin_lock(&init_mm.page_table_lock);
  75         if (likely(pmd_leaf(*pmd))) {
  76                 /*
  77                  * Higher order allocations from buddy allocator must be able to
  78                  * be treated as indepdenent small pages (as they can be freed
  79                  * individually).
  80                  */
  81                 if (!PageReserved(head))
  82                         split_page(head, get_order(PMD_SIZE));
  83
  84                 /* Make pte visible before pmd. See comment in pmd_install(). */
  85                 smp_wmb();
  86                 pmd_populate_kernel(&init_mm, pmd, pgtable);
  87                 if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH))
  88                         flush_tlb_kernel_range(start, start + PMD_SIZE);
  89         } else {
  90                 pte_free_kernel(&init_mm, pgtable);
  91         }
  92         spin_unlock(&init_mm.page_table_lock);
  93
  94         return 0;
  95 }
  96
  97 static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr,
  98                              unsigned long next, struct mm_walk *walk)
  99 {
 100         int ret = 0;
 101         struct page *head;
 102         struct vmemmap_remap_walk *vmemmap_walk = walk->private;
 103
 104         /* Only splitting, not remapping the vmemmap pages. */
 105         if (!vmemmap_walk->remap_pte)
 106                 walk->action = ACTION_CONTINUE;
 107
 108         spin_lock(&init_mm.page_table_lock);
 109         head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
 110         /*
 111          * Due to HugeTLB alignment requirements and the vmemmap
 112          * pages being at the start of the hotplugged memory
 113          * region in memory_hotplug.memmap_on_memory case. Checking
 114          * the vmemmap page associated with the first vmemmap page
 115          * if it is self-hosted is sufficient.
 116          *
 117          * [                  hotplugged memory                  ]
 118          * [        section        ][...][        section        ]
 119          * [ vmemmap ][              usable memory               ]
 120          *   ^  | ^                        |
 121          *   +--+ |                        |
 122          *        +------------------------+
 123          */
 124         if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) {
 125                 struct page *page = head ? head + pte_index(addr) :
 126                                     pte_page(ptep_get(pte_offset_kernel(pmd, addr)));
 127
 128                 if (PageVmemmapSelfHosted(page))
 129                         ret = -ENOTSUPP;
 130         }
 131         spin_unlock(&init_mm.page_table_lock);
 132         if (!head || ret)
 133                 return ret;
 134
 135         return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk);
 136 }
 137
 138 static int vmemmap_pte_entry(pte_t *pte, unsigned long addr,
 139                              unsigned long next, struct mm_walk *walk)
 140 {
 141         struct vmemmap_remap_walk *vmemmap_walk = walk->private;
 142
 143         /*
 144          * The reuse_page is found 'first' in page table walking before
 145          * starting remapping.
 146          */
 147         if (!vmemmap_walk->reuse_page)
 148                 vmemmap_walk->reuse_page = pte_page(ptep_get(pte));
 149         else
 150                 vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
 151         vmemmap_walk->nr_walked++;
 152
 153         return 0;
 154 }
 155
/*
 * Page table walker callbacks used by vmemmap_remap_range(): the PMD hook
 * splits leaf PMDs, the PTE hook finds the reuse page and remaps entries.
 */
static const struct mm_walk_ops vmemmap_remap_ops = {
        .pmd_entry      = vmemmap_pmd_entry,
        .pte_entry      = vmemmap_pte_entry,
};
 160
 161 static int vmemmap_remap_range(unsigned long start, unsigned long end,
 162                                struct vmemmap_remap_walk *walk)
 163 {
 164         int ret;
 165
 166         VM_BUG_ON(!PAGE_ALIGNED(start | end));
 167
 168         mmap_read_lock(&init_mm);
 169         ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops,
 170                                     NULL, walk);
 171         mmap_read_unlock(&init_mm);
 172         if (ret)
 173                 return ret;
 174
 175         if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
 176                 flush_tlb_kernel_range(start, end);
 177
 178         return 0;
 179 }
 180
 181 /*
 182  * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 183  * allocator or buddy allocator. If the PG_reserved flag is set, it means
 184  * that it allocated from the memblock allocator, just free it via the
 185  * free_bootmem_page(). Otherwise, use __free_page().
 186  */
 187 static inline void free_vmemmap_page(struct page *page)
 188 {
 189         if (PageReserved(page)) {
 190                 memmap_boot_pages_add(-1);
 191                 free_bootmem_page(page);
 192         } else {
 193                 memmap_pages_add(-1);
 194                 __free_page(page);
 195         }
 196 }
 197
 198 /* Free a list of the vmemmap pages */
 199 static void free_vmemmap_page_list(struct list_head *list)
 200 {
 201         struct page *page, *next;
 202
 203         list_for_each_entry_safe(page, next, list, lru)
 204                 free_vmemmap_page(page);
 205 }
 206
 207 static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
 208                               struct vmemmap_remap_walk *walk)
 209 {
 210         /*
 211          * Remap the tail pages as read-only to catch illegal write operation
 212          * to the tail pages.
 213          */
 214         pgprot_t pgprot = PAGE_KERNEL_RO;
 215         struct page *page = pte_page(ptep_get(pte));
 216         pte_t entry;
 217
 218         /* Remapping the head page requires r/w */
 219         if (unlikely(addr == walk->reuse_addr)) {
 220                 pgprot = PAGE_KERNEL;
 221                 list_del(&walk->reuse_page->lru);
 222
 223                 /*
 224                  * Makes sure that preceding stores to the page contents from
 225                  * vmemmap_remap_free() become visible before the set_pte_at()
 226                  * write.
 227                  */
 228                 smp_wmb();
 229         }
 230
 231         entry = mk_pte(walk->reuse_page, pgprot);
 232         list_add(&page->lru, walk->vmemmap_pages);
 233         set_pte_at(&init_mm, addr, pte, entry);
 234 }
 235
/*
 * The number of struct pages that need to be reset. When the head vmemmap
 * page is reused, the special metadata of its struct pages (e.g. page->flags
 * or page->mapping) must not be carried over into the tail struct pages.
 * Such invalid values are checked in free_tail_page_prepare(); to avoid the
 * "corrupted mapping in tail page" message, at least 3 struct pages (one
 * head struct page and two tail struct pages) have to be reset.
 */
#define NR_RESET_STRUCT_PAGE            3
 246
 247 static inline void reset_struct_pages(struct page *start)
 248 {
 249         struct page *from = start + NR_RESET_STRUCT_PAGE;
 250
 251         BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
 252         memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
 253 }
 254
 255 static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
 256                                 struct vmemmap_remap_walk *walk)
 257 {
 258         pgprot_t pgprot = PAGE_KERNEL;
 259         struct page *page;
 260         void *to;
 261
 262         BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);
 263
 264         page = list_first_entry(walk->vmemmap_pages, struct page, lru);
 265         list_del(&page->lru);
 266         to = page_to_virt(page);
 267         copy_page(to, (void *)walk->reuse_addr);
 268         reset_struct_pages(to);
 269
 270         /*
 271          * Makes sure that preceding stores to the page contents become visible
 272          * before the set_pte_at() write.
 273          */
 274         smp_wmb();
 275         set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
 276 }
 277
 278 /**
 279  * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end)
 280  *                      backing PMDs of the directmap into PTEs
 281  * @start:     start address of the vmemmap virtual address range that we want
 282  *             to remap.
 283  * @end:       end address of the vmemmap virtual address range that we want to
 284  *             remap.
 285  * @reuse:     reuse address.
 286  *
 287  * Return: %0 on success, negative error code otherwise.
 288  */
 289 static int vmemmap_remap_split(unsigned long start, unsigned long end,
 290                                unsigned long reuse)
 291 {
 292         struct vmemmap_remap_walk walk = {
 293                 .remap_pte      = NULL,
 294                 .flags          = VMEMMAP_SPLIT_NO_TLB_FLUSH,
 295         };
 296
 297         /* See the comment in the vmemmap_remap_free(). */
 298         BUG_ON(start - reuse != PAGE_SIZE);
 299
 300         return vmemmap_remap_range(reuse, end, &walk);
 301 }
 302
 303 /**
 304  * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 305  *                      to the page which @reuse is mapped to, then free vmemmap
 306  *                      which the range are mapped to.
 307  * @start:      start address of the vmemmap virtual address range that we want
 308  *              to remap.
 309  * @end:        end address of the vmemmap virtual address range that we want to
 310  *              remap.
 311  * @reuse:      reuse address.
 312  * @vmemmap_pages: list to deposit vmemmap pages to be freed.  It is callers
 313  *              responsibility to free pages.
 314  * @flags:      modifications to vmemmap_remap_walk flags
 315  *
 316  * Return: %0 on success, negative error code otherwise.
 317  */
 318 static int vmemmap_remap_free(unsigned long start, unsigned long end,
 319                               unsigned long reuse,
 320                               struct list_head *vmemmap_pages,
 321                               unsigned long flags)
 322 {
 323         int ret;
 324         struct vmemmap_remap_walk walk = {
 325                 .remap_pte      = vmemmap_remap_pte,
 326                 .reuse_addr     = reuse,
 327                 .vmemmap_pages  = vmemmap_pages,
 328                 .flags          = flags,
 329         };
 330         int nid = page_to_nid((struct page *)reuse);
 331         gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
 332
 333         /*
 334          * Allocate a new head vmemmap page to avoid breaking a contiguous
 335          * block of struct page memory when freeing it back to page allocator
 336          * in free_vmemmap_page_list(). This will allow the likely contiguous
 337          * struct page backing memory to be kept contiguous and allowing for
 338          * more allocations of hugepages. Fallback to the currently
 339          * mapped head page in case should it fail to allocate.
 340          */
 341         walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
 342         if (walk.reuse_page) {
 343                 copy_page(page_to_virt(walk.reuse_page),
 344                           (void *)walk.reuse_addr);
 345                 list_add(&walk.reuse_page->lru, vmemmap_pages);
 346                 memmap_pages_add(1);
 347         }
 348
 349         /*
 350          * In order to make remapping routine most efficient for the huge pages,
 351          * the routine of vmemmap page table walking has the following rules
 352          * (see more details from the vmemmap_pte_range()):
 353          *
 354          * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
 355          *   should be continuous.
 356          * - The @reuse address is part of the range [@reuse, @end) that we are
 357          *   walking which is passed to vmemmap_remap_range().
 358          * - The @reuse address is the first in the complete range.
 359          *
 360          * So we need to make sure that @start and @reuse meet the above rules.
 361          */
 362         BUG_ON(start - reuse != PAGE_SIZE);
 363
 364         ret = vmemmap_remap_range(reuse, end, &walk);
 365         if (ret && walk.nr_walked) {
 366                 end = reuse + walk.nr_walked * PAGE_SIZE;
 367                 /*
 368                  * vmemmap_pages contains pages from the previous
 369                  * vmemmap_remap_range call which failed.  These
 370                  * are pages which were removed from the vmemmap.
 371                  * They will be restored in the following call.
 372                  */
 373                 walk = (struct vmemmap_remap_walk) {
 374                         .remap_pte      = vmemmap_restore_pte,
 375                         .reuse_addr     = reuse,
 376                         .vmemmap_pages  = vmemmap_pages,
 377                         .flags          = 0,
 378                 };
 379
 380                 vmemmap_remap_range(reuse, end, &walk);
 381         }
 382
 383         return ret;
 384 }
 385
 386 static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
 387                                    struct list_head *list)
 388 {
 389         gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
 390         unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
 391         int nid = page_to_nid((struct page *)start);
 392         struct page *page, *next;
 393         int i;
 394
 395         for (i = 0; i < nr_pages; i++) {
 396                 page = alloc_pages_node(nid, gfp_mask, 0);
 397                 if (!page)
 398                         goto out;
 399                 list_add(&page->lru, list);
 400         }
 401         memmap_pages_add(nr_pages);
 402
 403         return 0;
 404 out:
 405         list_for_each_entry_safe(page, next, list, lru)
 406                 __free_page(page);
 407         return -ENOMEM;
 408 }
 409
 410 /**
 411  * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end)
 412  *                       to the page which is from the @vmemmap_pages
 413  *                       respectively.
 414  * @start:      start address of the vmemmap virtual address range that we want
 415  *              to remap.
 416  * @end:        end address of the vmemmap virtual address range that we want to
 417  *              remap.
 418  * @reuse:      reuse address.
 419  * @flags:      modifications to vmemmap_remap_walk flags
 420  *
 421  * Return: %0 on success, negative error code otherwise.
 422  */
 423 static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
 424                                unsigned long reuse, unsigned long flags)
 425 {
 426         LIST_HEAD(vmemmap_pages);
 427         struct vmemmap_remap_walk walk = {
 428                 .remap_pte      = vmemmap_restore_pte,
 429                 .reuse_addr     = reuse,
 430                 .vmemmap_pages  = &vmemmap_pages,
 431                 .flags          = flags,
 432         };
 433
 434         /* See the comment in the vmemmap_remap_free(). */
 435         BUG_ON(start - reuse != PAGE_SIZE);
 436
 437         if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
 438                 return -ENOMEM;
 439
 440         return vmemmap_remap_range(reuse, end, &walk);
 441 }
 442
/*
 * Static key that starts out false. In this file it is incremented for each
 * folio being optimized and decremented again when a folio is restored or
 * when optimizing it fails.
 */
DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);

/*
 * Whether new optimizations are allowed. The default follows
 * CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON; the value is also exposed
 * as the "hugetlb_free_vmemmap" core parameter.
 */
static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);
 448
 449 static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
 450                                            struct folio *folio, unsigned long flags)
 451 {
 452         int ret;
 453         unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
 454         unsigned long vmemmap_reuse;
 455
 456         VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
 457         VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);
 458
 459         if (!folio_test_hugetlb_vmemmap_optimized(folio))
 460                 return 0;
 461
 462         if (flags & VMEMMAP_SYNCHRONIZE_RCU)
 463                 synchronize_rcu();
 464
 465         vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
 466         vmemmap_reuse   = vmemmap_start;
 467         vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;
 468
 469         /*
 470          * The pages which the vmemmap virtual address range [@vmemmap_start,
 471          * @vmemmap_end) are mapped to are freed to the buddy allocator, and
 472          * the range is mapped to the page which @vmemmap_reuse is mapped to.
 473          * When a HugeTLB page is freed to the buddy allocator, previously
 474          * discarded vmemmap pages must be allocated and remapping.
 475          */
 476         ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags);
 477         if (!ret) {
 478                 folio_clear_hugetlb_vmemmap_optimized(folio);
 479                 static_branch_dec(&hugetlb_optimize_vmemmap_key);
 480         }
 481
 482         return ret;
 483 }
 484
 485 /**
 486  * hugetlb_vmemmap_restore_folio - restore previously optimized (by
 487  *                              hugetlb_vmemmap_optimize_folio()) vmemmap pages which
 488  *                              will be reallocated and remapped.
 489  * @h:          struct hstate.
 490  * @folio:     the folio whose vmemmap pages will be restored.
 491  *
 492  * Return: %0 if @folio's vmemmap pages have been reallocated and remapped,
 493  * negative error code otherwise.
 494  */
 495 int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
 496 {
 497         return __hugetlb_vmemmap_restore_folio(h, folio, VMEMMAP_SYNCHRONIZE_RCU);
 498 }
 499
 500 /**
 501  * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
 502  * @h:                  hstate.
 503  * @folio_list:         list of folios.
 504  * @non_hvo_folios:     Output list of folios for which vmemmap exists.
 505  *
 506  * Return: number of folios for which vmemmap was restored, or an error code
 507  *              if an error was encountered restoring vmemmap for a folio.
 508  *              Folios that have vmemmap are moved to the non_hvo_folios
 509  *              list.  Processing of entries stops when the first error is
 510  *              encountered. The folio that experienced the error and all
 511  *              non-processed folios will remain on folio_list.
 512  */
 513 long hugetlb_vmemmap_restore_folios(const struct hstate *h,
 514                                         struct list_head *folio_list,
 515                                         struct list_head *non_hvo_folios)
 516 {
 517         struct folio *folio, *t_folio;
 518         long restored = 0;
 519         long ret = 0;
 520         unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;
 521
 522         list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
 523                 if (folio_test_hugetlb_vmemmap_optimized(folio)) {
 524                         ret = __hugetlb_vmemmap_restore_folio(h, folio, flags);
 525                         /* only need to synchronize_rcu() once for each batch */
 526                         flags &= ~VMEMMAP_SYNCHRONIZE_RCU;
 527
 528                         if (ret)
 529                                 break;
 530                         restored++;
 531                 }
 532
 533                 /* Add non-optimized folios to output list */
 534                 list_move(&folio->lru, non_hvo_folios);
 535         }
 536
 537         if (restored)
 538                 flush_tlb_all();
 539         if (!ret)
 540                 ret = restored;
 541         return ret;
 542 }
 543
 544 /* Return true iff a HugeTLB whose vmemmap should and can be optimized. */
 545 static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio)
 546 {
 547         if (folio_test_hugetlb_vmemmap_optimized(folio))
 548                 return false;
 549
 550         if (!READ_ONCE(vmemmap_optimize_enabled))
 551                 return false;
 552
 553         if (!hugetlb_vmemmap_optimizable(h))
 554                 return false;
 555
 556         return true;
 557 }
 558
 559 static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
 560                                             struct folio *folio,
 561                                             struct list_head *vmemmap_pages,
 562                                             unsigned long flags)
 563 {
 564         int ret = 0;
 565         unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
 566         unsigned long vmemmap_reuse;
 567
 568         VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
 569         VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);
 570
 571         if (!vmemmap_should_optimize_folio(h, folio))
 572                 return ret;
 573
 574         static_branch_inc(&hugetlb_optimize_vmemmap_key);
 575
 576         if (flags & VMEMMAP_SYNCHRONIZE_RCU)
 577                 synchronize_rcu();
 578         /*
 579          * Very Subtle
 580          * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
 581          * immediately after remapping.  As a result, subsequent accesses
 582          * and modifications to struct pages associated with the hugetlb
 583          * page could be to the OLD struct pages.  Set the vmemmap optimized
 584          * flag here so that it is copied to the new head page.  This keeps
 585          * the old and new struct pages in sync.
 586          * If there is an error during optimization, we will immediately FLUSH
 587          * the TLB and clear the flag below.
 588          */
 589         folio_set_hugetlb_vmemmap_optimized(folio);
 590
 591         vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
 592         vmemmap_reuse   = vmemmap_start;
 593         vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;
 594
 595         /*
 596          * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
 597          * to the page which @vmemmap_reuse is mapped to.  Add pages previously
 598          * mapping the range to vmemmap_pages list so that they can be freed by
 599          * the caller.
 600          */
 601         ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
 602                                  vmemmap_pages, flags);
 603         if (ret) {
 604                 static_branch_dec(&hugetlb_optimize_vmemmap_key);
 605                 folio_clear_hugetlb_vmemmap_optimized(folio);
 606         }
 607
 608         return ret;
 609 }
 610
 611 /**
 612  * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages.
 613  * @h:          struct hstate.
 614  * @folio:     the folio whose vmemmap pages will be optimized.
 615  *
 616  * This function only tries to optimize @folio's vmemmap pages and does not
 617  * guarantee that the optimization will succeed after it returns. The caller
 618  * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's
 619  * vmemmap pages have been optimized.
 620  */
 621 void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
 622 {
 623         LIST_HEAD(vmemmap_pages);
 624
 625         __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_SYNCHRONIZE_RCU);
 626         free_vmemmap_page_list(&vmemmap_pages);
 627 }
 628
 629 static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio)
 630 {
 631         unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
 632         unsigned long vmemmap_reuse;
 633
 634         if (!vmemmap_should_optimize_folio(h, folio))
 635                 return 0;
 636
 637         vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
 638         vmemmap_reuse   = vmemmap_start;
 639         vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;
 640
 641         /*
 642          * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
 643          * @vmemmap_end]
 644          */
 645         return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
 646 }
 647
 648 void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
 649 {
 650         struct folio *folio;
 651         LIST_HEAD(vmemmap_pages);
 652         unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;
 653
 654         list_for_each_entry(folio, folio_list, lru) {
 655                 int ret = hugetlb_vmemmap_split_folio(h, folio);
 656
 657                 /*
 658                  * Spliting the PMD requires allocating a page, thus lets fail
 659                  * early once we encounter the first OOM. No point in retrying
 660                  * as it can be dynamically done on remap with the memory
 661                  * we get back from the vmemmap deduplication.
 662                  */
 663                 if (ret == -ENOMEM)
 664                         break;
 665         }
 666
 667         flush_tlb_all();
 668
 669         list_for_each_entry(folio, folio_list, lru) {
 670                 int ret;
 671
 672                 ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags);
 673                 /* only need to synchronize_rcu() once for each batch */
 674                 flags &= ~VMEMMAP_SYNCHRONIZE_RCU;
 675
 676                 /*
 677                  * Pages to be freed may have been accumulated.  If we
 678                  * encounter an ENOMEM,  free what we have and try again.
 679                  * This can occur in the case that both spliting fails
 680                  * halfway and head page allocation also failed. In this
 681                  * case __hugetlb_vmemmap_optimize_folio() would free memory
 682                  * allowing more vmemmap remaps to occur.
 683                  */
 684                 if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
 685                         flush_tlb_all();
 686                         free_vmemmap_page_list(&vmemmap_pages);
 687                         INIT_LIST_HEAD(&vmemmap_pages);
 688                         __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags);
 689                 }
 690         }
 691
 692         flush_tlb_all();
 693         free_vmemmap_page_list(&vmemmap_pages);
 694 }
 695
/*
 * Sysctl table with a single boolean entry, "hugetlb_optimize_vmemmap",
 * backed by vmemmap_optimize_enabled. It is registered under "vm" by
 * hugetlb_vmemmap_init().
 */
static struct ctl_table hugetlb_vmemmap_sysctls[] = {
        {
                .procname       = "hugetlb_optimize_vmemmap",
                .data           = &vmemmap_optimize_enabled,
                .maxlen         = sizeof(vmemmap_optimize_enabled),
                .mode           = 0644,
                .proc_handler   = proc_dobool,
        },
};
 705
 706 static int __init hugetlb_vmemmap_init(void)
 707 {
 708         const struct hstate *h;
 709
 710         /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
 711         BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES);
 712
 713         for_each_hstate(h) {
 714                 if (hugetlb_vmemmap_optimizable(h)) {
 715                         register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
 716                         break;
 717                 }
 718         }
 719         return 0;
 720 }
 721 late_initcall(hugetlb_vmemmap_init);