// SPDX-License-Identifier: GPL-2.0
/*
 * HugeTLB Vmemmap Optimization (HVO)
 *
 * Copyright (c) 2020, ByteDance. All rights reserved.
 *
 *     Author: Muchun Song <songmuchun@bytedance.com>
 *
 * See Documentation/mm/vmemmap_dedup.rst
 */
#define pr_fmt(fmt)	"HugeTLB: " fmt

#include <linux/pgtable.h>
#include <linux/moduleparam.h>
#include <linux/bootmem_info.h>
#include <linux/mmdebug.h>
#include <linux/pagewalk.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"
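
/*
 * Rough arithmetic behind HVO, assuming 4 KiB base pages and a 64-byte
 * struct page (as on x86_64): a 2 MiB HugeTLB page spans 512 base pages, so
 * its vmemmap is 512 * 64 B = 32 KiB, i.e. 8 vmemmap pages.  HVO keeps the
 * first of those (HUGETLB_VMEMMAP_RESERVE_SIZE) and remaps the remaining 7
 * read-only to it, freeing 7 pages per 2 MiB HugeTLB page.  A 1 GiB HugeTLB
 * page has 16 MiB of vmemmap (4096 pages), of which 4095 are freed.
 */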

/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte:		called for each lowest-level entry (PTE).
 * @nr_walked:		the number of walked pte.
 * @reuse_page:		the page which is reused for the tail vmemmap pages.
 * @reuse_addr:		the virtual address of the @reuse_page page.
 * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
 *			or is mapped from.
 * @flags:		used to modify behavior in vmemmap page table walking
 *			operations.
 */
struct vmemmap_remap_walk {
	void (*remap_pte)(pte_t *pte, unsigned long addr,
			  struct vmemmap_remap_walk *walk);
	unsigned long nr_walked;
	struct page *reuse_page;
	unsigned long reuse_addr;
	struct list_head *vmemmap_pages;

/* Skip the TLB flush when we split the PMD */
#define VMEMMAP_SPLIT_NO_TLB_FLUSH	BIT(0)
/* Skip the TLB flush when we remap the PTE */
#define VMEMMAP_REMAP_NO_TLB_FLUSH	BIT(1)
	unsigned long flags;
};
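
/*
 * Usage summary: the walk below runs in up to two passes.  A "split" pass
 * leaves @remap_pte NULL and only breaks PMD-mapped vmemmap down into PTE
 * mappings (vmemmap_remap_split()), optionally deferring the TLB flush via
 * VMEMMAP_SPLIT_NO_TLB_FLUSH.  A "remap" pass then sets @remap_pte
 * (vmemmap_remap_pte() when freeing, vmemmap_restore_pte() when restoring)
 * and rewrites every PTE; VMEMMAP_REMAP_NO_TLB_FLUSH lets batch callers such
 * as hugetlb_vmemmap_optimize_folios() issue a single flush_tlb_all() at the
 * end instead of flushing per folio.
 */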

static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start,
			     struct vmemmap_remap_walk *walk)
{
	pmd_t __pmd;
	int i;
	unsigned long addr = start;
	pte_t *pgtable;

	pgtable = pte_alloc_one_kernel(&init_mm);
	if (!pgtable)
		return -ENOMEM;

	pmd_populate_kernel(&init_mm, &__pmd, pgtable);

	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
		pte_t entry, *pte;
		pgprot_t pgprot = PAGE_KERNEL;

		entry = mk_pte(head + i, pgprot);
		pte = pte_offset_kernel(&__pmd, addr);
		set_pte_at(&init_mm, addr, pte, entry);
	}

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_leaf(*pmd))) {
		/*
		 * Higher order allocations from buddy allocator must be able to
		 * be treated as independent small pages (as they can be freed
		 * individually).
		 */
		if (!PageReserved(head))
			split_page(head, get_order(PMD_SIZE));

		/* Make pte visible before pmd. See comment in pmd_install(). */
		smp_wmb();
		pmd_populate_kernel(&init_mm, pmd, pgtable);
		if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH))
			flush_tlb_kernel_range(start, start + PMD_SIZE);
	} else {
		pte_free_kernel(&init_mm, pgtable);
	}
	spin_unlock(&init_mm.page_table_lock);

	return 0;
}

static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	int ret = 0;
	struct page *head;
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/* Only splitting, not remapping the vmemmap pages. */
	if (!vmemmap_walk->remap_pte)
		walk->action = ACTION_CONTINUE;

	spin_lock(&init_mm.page_table_lock);
	head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
	/*
	 * Due to HugeTLB alignment requirements and the vmemmap pages being
	 * at the start of the hotplugged memory region in the
	 * memory_hotplug.memmap_on_memory case, checking whether the vmemmap
	 * page associated with the first vmemmap page is self-hosted is
	 * sufficient.
	 *
	 * [                hotplugged memory                ]
	 * [       section       ][...][      section       ]
	 * [ vmemmap ][           usable memory             ]
	 *   ^   |                 ^                        |
	 *   +---+                 |                        |
	 *                         +------------------------+
	 */
	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) {
		struct page *page = head ? head + pte_index(addr) :
				    pte_page(ptep_get(pte_offset_kernel(pmd, addr)));

		if (PageVmemmapSelfHosted(page))
			ret = -ENOTSUPP;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (!head || ret)
		return ret;

	return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk);
}

static int vmemmap_pte_entry(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/*
	 * The reuse_page is found 'first' in page table walking before
	 * starting remapping.
	 */
	if (!vmemmap_walk->reuse_page)
		vmemmap_walk->reuse_page = pte_page(ptep_get(pte));
	else
		vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
	vmemmap_walk->nr_walked++;

	return 0;
}

static const struct mm_walk_ops vmemmap_remap_ops = {
	.pmd_entry	= vmemmap_pmd_entry,
	.pte_entry	= vmemmap_pte_entry,
};

static int vmemmap_remap_range(unsigned long start, unsigned long end,
			       struct vmemmap_remap_walk *walk)
{
	int ret;

	VM_BUG_ON(!PAGE_ALIGNED(start | end));

	mmap_read_lock(&init_mm);
	ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops,
				    NULL, walk);
	mmap_read_unlock(&init_mm);
	if (ret)
		return ret;

	if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
		flush_tlb_kernel_range(start, end);

	return 0;
}

/*
 * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 * allocator or the buddy allocator. If the PG_reserved flag is set, it means
 * that it was allocated from the memblock allocator; free it via
 * free_bootmem_page(). Otherwise, use __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
	if (PageReserved(page)) {
		free_bootmem_page(page);
		mod_node_page_state(page_pgdat(page), NR_MEMMAP_BOOT, -1);
	} else {
		__free_page(page);
		mod_node_page_state(page_pgdat(page), NR_MEMMAP, -1);
	}
}

/* Free a list of the vmemmap pages */
static void free_vmemmap_page_list(struct list_head *list)
{
	struct page *page, *next;

	list_for_each_entry_safe(page, next, list, lru)
		free_vmemmap_page(page);
}

static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
			      struct vmemmap_remap_walk *walk)
{
	/*
	 * Remap the tail pages as read-only to catch illegal write operations
	 * to the tail pages.
	 */
	pgprot_t pgprot = PAGE_KERNEL_RO;
	struct page *page = pte_page(ptep_get(pte));
	pte_t entry;

	/* Remapping the head page requires r/w */
	if (unlikely(addr == walk->reuse_addr)) {
		pgprot = PAGE_KERNEL;
		list_del(&walk->reuse_page->lru);

		/*
		 * Makes sure that preceding stores to the page contents from
		 * vmemmap_remap_free() become visible before the set_pte_at()
		 * write.
		 */
		smp_wmb();
	}

	entry = mk_pte(walk->reuse_page, pgprot);
	list_add(&page->lru, walk->vmemmap_pages);
	set_pte_at(&init_mm, addr, pte, entry);
}
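
/*
 * Illustrative end state for one 2 MiB HugeTLB page (8 vmemmap PTEs,
 * assuming 4 KiB base pages and a 64-byte struct page) after a successful
 * vmemmap_remap_free(): the PTE at @reuse_addr maps the freshly allocated
 * head copy read-write, the remaining 7 PTEs all map that same page
 * read-only, and the 8 pages the range used to be mapped to sit on
 * @vmemmap_pages awaiting free_vmemmap_page_list().
 */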

/*
 * How many struct page structs need to be reset. When we reuse the head
 * struct page, the special metadata (e.g. page->flags or page->mapping)
 * cannot be copied to the tail struct page structs. The invalid values will
 * be caught by the check in free_tail_page_prepare(). In order to avoid the
 * "corrupted mapping in tail page" message, we need to reset at least 3
 * struct page structs (one head struct page and two tail struct pages).
 */
#define NR_RESET_STRUCT_PAGE	3

static inline void reset_struct_pages(struct page *start)
{
	struct page *from = start + NR_RESET_STRUCT_PAGE;

	BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
	memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
}
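
/*
 * Why copying from start + NR_RESET_STRUCT_PAGE works: the entries at and
 * beyond index NR_RESET_STRUCT_PAGE in a restored vmemmap page are plain
 * tail struct pages that never carried head-page metadata, so memcpy()ing
 * three of them over indices 0..2 yields clean templates.  The BUILD_BUG_ON
 * above guarantees that a single vmemmap page holds at least
 * 2 * NR_RESET_STRUCT_PAGE struct pages (64 of them with 4 KiB pages and a
 * 64-byte struct page), so the source range never runs past the page.
 */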

static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
				struct vmemmap_remap_walk *walk)
{
	pgprot_t pgprot = PAGE_KERNEL;
	struct page *page;
	void *to;

	BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);

	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
	list_del(&page->lru);
	to = page_to_virt(page);
	copy_page(to, (void *)walk->reuse_addr);
	reset_struct_pages(to);

	/*
	 * Makes sure that preceding stores to the page contents become visible
	 * before the set_pte_at() write.
	 */
	smp_wmb();
	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}

/**
 * vmemmap_remap_split - split the PMDs backing the vmemmap virtual address
 *                       range [@start, @end) into PTE-level mappings
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_split(unsigned long start, unsigned long end,
			       unsigned long reuse)
{
	struct vmemmap_remap_walk walk = {
		.remap_pte	= NULL,
		.flags		= VMEMMAP_SPLIT_NO_TLB_FLUSH,
	};

	/* See the comment in the vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	return vmemmap_remap_range(reuse, end, &walk);
}

/**
 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 *			to the page which @reuse is mapped to, then free the
 *			vmemmap pages which the range was mapped to.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @vmemmap_pages:	list to deposit vmemmap pages to be freed.  It is the
 *		caller's responsibility to free the pages.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_free(unsigned long start, unsigned long end,
			      unsigned long reuse,
			      struct list_head *vmemmap_pages,
			      unsigned long flags)
{
	int ret;
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_remap_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= vmemmap_pages,
		.flags		= flags,
	};
	int nid = page_to_nid((struct page *)reuse);
	gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;

	/*
	 * Allocate a new head vmemmap page to avoid breaking a contiguous
	 * block of struct page memory when freeing it back to the page
	 * allocator in free_vmemmap_page_list(). This keeps the likely
	 * contiguous struct page backing memory contiguous, allowing for
	 * more allocations of hugepages. Fall back to the currently mapped
	 * head page should the allocation fail.
	 */
	walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
	if (walk.reuse_page) {
		copy_page(page_to_virt(walk.reuse_page),
			  (void *)walk.reuse_addr);
		list_add(&walk.reuse_page->lru, vmemmap_pages);
		mod_node_page_state(NODE_DATA(nid), NR_MEMMAP, 1);
	}

	/*
	 * To make the remapping routine most efficient for the huge pages,
	 * the vmemmap page table walking obeys the following rules
	 * (see vmemmap_pte_entry() for more details):
	 *
	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
	 *   should be contiguous.
	 * - The @reuse address is part of the range [@reuse, @end) that we are
	 *   walking which is passed to vmemmap_remap_range().
	 * - The @reuse address is the first in the complete range.
	 *
	 * So we need to make sure that @start and @reuse meet the above rules.
	 */
	BUG_ON(start - reuse != PAGE_SIZE);

	ret = vmemmap_remap_range(reuse, end, &walk);
	if (ret && walk.nr_walked) {
		end = reuse + walk.nr_walked * PAGE_SIZE;
		/*
		 * vmemmap_pages contains pages from the previous
		 * vmemmap_remap_range call which failed.  These
		 * are pages which were removed from the vmemmap.
		 * They will be restored in the following call.
		 */
		walk = (struct vmemmap_remap_walk) {
			.remap_pte	= vmemmap_restore_pte,
			.reuse_addr	= reuse,
			.vmemmap_pages	= vmemmap_pages,
			.flags		= 0,
		};

		vmemmap_remap_range(reuse, end, &walk);
	}

	return ret;
}

static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
				   struct list_head *list)
{
	gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
	int nid = page_to_nid((struct page *)start);
	struct page *page, *next;
	int i;

	for (i = 0; i < nr_pages; i++) {
		page = alloc_pages_node(nid, gfp_mask, 0);
		if (!page) {
			mod_node_page_state(NODE_DATA(nid), NR_MEMMAP, i);
			goto out;
		}
		list_add(&page->lru, list);
	}

	mod_node_page_state(NODE_DATA(nid), NR_MEMMAP, nr_pages);

	return 0;
out:
	list_for_each_entry_safe(page, next, list, lru)
		__free_page(page);
	return -ENOMEM;
}

/**
 * vmemmap_remap_alloc - remap each page in the vmemmap virtual address range
 *			 [@start, @end) to a newly allocated page.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
			       unsigned long reuse, unsigned long flags)
{
	LIST_HEAD(vmemmap_pages);
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_restore_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= &vmemmap_pages,
		.flags		= flags,
	};

	/* See the comment in the vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
		return -ENOMEM;

	return vmemmap_remap_range(reuse, end, &walk);
}

DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);

static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);
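
/*
 * The default comes from CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON and
 * can be overridden on the kernel command line with hugetlb_free_vmemmap=on
 * (or =off); core_param() registers the parameter without a module prefix.
 */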

static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
					   struct folio *folio, unsigned long flags)
{
	int ret;
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);

	if (!folio_test_hugetlb_vmemmap_optimized(folio))
		return 0;

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * The pages which the vmemmap virtual address range [@vmemmap_start,
	 * @vmemmap_end) are mapped to are freed to the buddy allocator, and
	 * the range is mapped to the page which @vmemmap_reuse is mapped to.
	 * When a HugeTLB page is freed to the buddy allocator, previously
	 * discarded vmemmap pages must be allocated and remapped.
	 */
	ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags);
	if (!ret) {
		folio_clear_hugetlb_vmemmap_optimized(folio);
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
	}

	return ret;
}

/**
 * hugetlb_vmemmap_restore_folio - restore previously optimized (by
 *				   hugetlb_vmemmap_optimize_folio()) vmemmap pages which
 *				   will be reallocated and remapped.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be restored.
 *
 * Return: %0 if @folio's vmemmap pages have been reallocated and remapped,
 * negative error code otherwise.
 */
int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
{
	/* avoid writes from page_ref_add_unless() while unfolding vmemmap */
	synchronize_rcu();

	return __hugetlb_vmemmap_restore_folio(h, folio, 0);
}

/**
 * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
 * @h:			hstate.
 * @folio_list:		list of folios.
 * @non_hvo_folios:	Output list of folios for which vmemmap exists.
 *
 * Return: number of folios for which vmemmap was restored, or an error code
 *		if an error was encountered restoring vmemmap for a folio.
 *		Folios that have vmemmap are moved to the non_hvo_folios
 *		list.  Processing of entries stops when the first error is
 *		encountered.  The folio that experienced the error and all
 *		non-processed folios will remain on folio_list.
 */
long hugetlb_vmemmap_restore_folios(const struct hstate *h,
				    struct list_head *folio_list,
				    struct list_head *non_hvo_folios)
{
	struct folio *folio, *t_folio;
	long restored = 0;
	long ret = 0;

	/* avoid writes from page_ref_add_unless() while unfolding vmemmap */
	synchronize_rcu();

	list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
			ret = __hugetlb_vmemmap_restore_folio(h, folio,
							      VMEMMAP_REMAP_NO_TLB_FLUSH);
			if (ret)
				break;
			restored++;
		}

		/* Add non-optimized folios to output list */
		list_move(&folio->lru, non_hvo_folios);
	}

	if (restored)
		flush_tlb_all();
	if (!ret)
		ret = restored;
	return ret;
}

/* Return true if the vmemmap of a HugeTLB folio both should and can be optimized. */
static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio)
{
	if (folio_test_hugetlb_vmemmap_optimized(folio))
		return false;

	if (!READ_ONCE(vmemmap_optimize_enabled))
		return false;

	if (!hugetlb_vmemmap_optimizable(h))
		return false;

	return true;
}

static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
					    struct folio *folio,
					    struct list_head *vmemmap_pages,
					    unsigned long flags)
{
	int ret = 0;
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);

	if (!vmemmap_should_optimize_folio(h, folio))
		return ret;

	static_branch_inc(&hugetlb_optimize_vmemmap_key);

	/*
	 * Very Subtle
	 * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
	 * immediately after remapping. As a result, subsequent accesses
	 * and modifications to struct pages associated with the hugetlb
	 * page could be to the OLD struct pages. Set the vmemmap optimized
	 * flag here so that it is copied to the new head page. This keeps
	 * the old and new struct pages in sync.
	 * If there is an error during optimization, we will immediately FLUSH
	 * the TLB and clear the flag below.
	 */
	folio_set_hugetlb_vmemmap_optimized(folio);

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
	 * to the page which @vmemmap_reuse is mapped to.  Add pages previously
	 * mapping the range to vmemmap_pages list so that they can be freed by
	 * the caller.
	 */
	ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
				 vmemmap_pages, flags);
	if (ret) {
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
		folio_clear_hugetlb_vmemmap_optimized(folio);
	}

	return ret;
}

/**
 * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be optimized.
 *
 * This function only tries to optimize @folio's vmemmap pages and does not
 * guarantee that the optimization will succeed after it returns. The caller
 * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's
 * vmemmap pages have been optimized.
 */
void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
{
	LIST_HEAD(vmemmap_pages);

	/* avoid writes from page_ref_add_unless() while folding vmemmap */
	synchronize_rcu();

	__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0);
	free_vmemmap_page_list(&vmemmap_pages);
}
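
/*
 * Minimal usage sketch (illustrative only; the real call sites live in
 * mm/hugetlb.c): optimization is attempted on a freshly prepared HugeTLB
 * folio with a zero reference count, and the vmemmap has to be restored
 * before such a folio can be released back to the buddy allocator:
 *
 *	hugetlb_vmemmap_optimize_folio(h, folio);
 *	...
 *	ret = hugetlb_vmemmap_restore_folio(h, folio);
 *	// if ret != 0 (e.g. -ENOMEM), the folio cannot be freed to the buddy
 *	// allocator and has to remain in the HugeTLB pool
 */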

static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio)
{
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	if (!vmemmap_should_optimize_folio(h, folio))
		return 0;

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
	 * @vmemmap_end).
	 */
	return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
}

void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
{
	struct folio *folio;
	LIST_HEAD(vmemmap_pages);

	list_for_each_entry(folio, folio_list, lru) {
		int ret = hugetlb_vmemmap_split_folio(h, folio);

		/*
		 * Splitting the PMD requires allocating a page, so let's fail
		 * early once we encounter the first OOM. There is no point in
		 * retrying, as the split can be done dynamically on remap with
		 * the memory we get back from the vmemmap deduplication.
		 */
		if (ret == -ENOMEM)
			break;
	}

	flush_tlb_all();

	/* avoid writes from page_ref_add_unless() while folding vmemmap */
	synchronize_rcu();

	list_for_each_entry(folio, folio_list, lru) {
		int ret;

		ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages,
						       VMEMMAP_REMAP_NO_TLB_FLUSH);

		/*
		 * Pages to be freed may have been accumulated. If we
		 * encounter an ENOMEM, free what we have and try again.
		 * This can occur when the earlier splitting failed halfway
		 * and the head page allocation also fails. In this case
		 * __hugetlb_vmemmap_optimize_folio() would free memory,
		 * allowing more vmemmap remaps to occur.
		 */
		if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
			flush_tlb_all();
			free_vmemmap_page_list(&vmemmap_pages);
			INIT_LIST_HEAD(&vmemmap_pages);
			__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages,
							 VMEMMAP_REMAP_NO_TLB_FLUSH);
		}
	}

	flush_tlb_all();
	free_vmemmap_page_list(&vmemmap_pages);
}

static struct ctl_table hugetlb_vmemmap_sysctls[] = {
	{
		.procname	= "hugetlb_optimize_vmemmap",
		.data		= &vmemmap_optimize_enabled,
		.maxlen		= sizeof(vmemmap_optimize_enabled),
		.mode		= 0644,
		.proc_handler	= proc_dobool,
	},
};
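
/*
 * Runtime control (the table is registered below only if at least one
 * hstate is optimizable), e.g.:
 *
 *	echo 0 > /proc/sys/vm/hugetlb_optimize_vmemmap
 *	echo 1 > /proc/sys/vm/hugetlb_optimize_vmemmap
 *
 * The knob is only consulted in vmemmap_should_optimize_folio(), so flipping
 * it affects HugeTLB pages allocated afterwards and does not retroactively
 * optimize or restore existing ones.
 */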

static int __init hugetlb_vmemmap_init(void)
{
	const struct hstate *h;

	/* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
	BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES);

	for_each_hstate(h) {
		if (hugetlb_vmemmap_optimizable(h)) {
			register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
			break;
		}
	}
	return 0;
}
late_initcall(hugetlb_vmemmap_init);