mm/mempolicy.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Simple NUMA memory policy for the Linux kernel.
   4  *
   5  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   6  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support six policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * weighted interleave
  23  *                Allocate memory interleaved over a set of nodes based on
  24  *                a set of weights (per-node), with normal fallback if it
  25  *                fails.  Otherwise operates the same as interleave.
  26  *                Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
  27  *                on node 0 for every 1 page allocated on node 1.
  28  *
  29  * bind           Only allocate memory on a specific set of nodes,
  30  *                no fallback.
  31  *                FIXME: memory is allocated starting with the first node
  32  *                to the last. It would be better if bind would truly restrict
  33  *                the allocation to memory nodes instead
  34  *
  35  * preferred      Try a specific node first before normal fallback.
  36  *                As a special case NUMA_NO_NODE here means do the allocation
  37  *                on the local CPU. This is normally identical to default,
  38  *                but useful to set in a VMA when you have a non default
  39  *                process policy.
  40  *
  41  * preferred many Try a set of nodes first before normal fallback. This is
  42  *                similar to preferred without the special case.
  43  *
  44  * default        Allocate on the local node first, or when on a VMA
  45  *                use the process policy. This is what Linux always did
  46  *                in a NUMA aware kernel and still does by, ahem, default.
  47  *
  48  * The process policy is applied for most non interrupt memory allocations
  49  * in that process' context. Interrupts ignore the policies and always
  50  * try to allocate on the local CPU. The VMA policy is only applied for memory
  51  * allocations for a VMA in the VM.
  52  *
  53  * Currently there are a few corner cases in swapping where the policy
  54  * is not applied, but the majority should be handled. When process policy
  55  * is used it is not remembered over swap outs/swap ins.
  56  *
  57  * Only the highest zone in the zone hierarchy gets policied. Allocations
  58  * requesting a lower zone just use default policy. This implies that
  59  * on systems with highmem, kernel lowmem allocations don't get policied.
  60  * Same with GFP_DMA allocations.
  61  *
  62  * For shmem/tmpfs shared memory the policy is shared between
  63  * all users and remembered even when nobody has memory mapped.
  64  */
  65
  66 /* Notebook:
  67    fix mmap readahead to honour policy and enable policy for any page cache
  68    object
  69    statistics for bigpages
  70    global policy for page cache? currently it uses process policy. Requires
  71    first item above.
  72    handle mremap for shared memory (currently ignored for the policy)
  73    grows down?
  74    make bind policy root only? It can trigger oom much faster and the
  75    kernel is not always grateful with that.
  76 */
  77
  78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  79
  80 #include <linux/mempolicy.h>
  81 #include <linux/pagewalk.h>
  82 #include <linux/highmem.h>
  83 #include <linux/hugetlb.h>
  84 #include <linux/kernel.h>
  85 #include <linux/sched.h>
  86 #include <linux/sched/mm.h>
  87 #include <linux/sched/numa_balancing.h>
  88 #include <linux/sched/task.h>
  89 #include <linux/nodemask.h>
  90 #include <linux/cpuset.h>
  91 #include <linux/slab.h>
  92 #include <linux/string.h>
  93 #include <linux/export.h>
  94 #include <linux/nsproxy.h>
  95 #include <linux/interrupt.h>
  96 #include <linux/init.h>
  97 #include <linux/compat.h>
  98 #include <linux/ptrace.h>
  99 #include <linux/swap.h>
 100 #include <linux/seq_file.h>
 101 #include <linux/proc_fs.h>
 102 #include <linux/migrate.h>
 103 #include <linux/ksm.h>
 104 #include <linux/rmap.h>
 105 #include <linux/security.h>
 106 #include <linux/syscalls.h>
 107 #include <linux/ctype.h>
 108 #include <linux/mm_inline.h>
 109 #include <linux/mmu_notifier.h>
 110 #include <linux/printk.h>
 111 #include <linux/swapops.h>
 112
 113 #include <asm/tlbflush.h>
 114 #include <asm/tlb.h>
 115 #include <linux/uaccess.h>
 116
 117 #include "internal.h"
 118
 119 /* Internal flags */
 120 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
 121 #define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)    /* Invert check for nodemask */
 122 #define MPOL_MF_WRLOCK       (MPOL_MF_INTERNAL << 2)    /* Write-lock walked vmas */
 123
     /* Slab cache that mpol_new() allocates policies from and __mpol_put() frees to */
 124 static struct kmem_cache *policy_cache;
     /* NOTE(review): no user of sn_cache is visible in this part of the file */
 125 static struct kmem_cache *sn_cache;
 126
 127 /* Highest zone. A specific allocation for a zone below that is not
 128    policied. */
 129 enum zone_type policy_zone = 0;
 130
 131 /*
 132  * run-time system-wide default policy => local allocation
 133  */
 134 static struct mempolicy default_policy = {
 135         .refcnt = ATOMIC_INIT(1), /* never free it */
 136         .mode = MPOL_LOCAL,
 137 };
 138
     /*
      * Per-node fallback policies handed out by get_task_policy() for tasks
      * without a policy of their own.  An entry whose ->mode is still zero is
      * treated as not yet initialised and default_policy is used instead.
      */
 139 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 140
 141 /*
 142  * iw_table is the sysfs-set interleave weight table, a value of 0 denotes
 143  * system-default value should be used. A NULL iw_table also denotes that
 144  * system-default values should be used. Until the system-default table
 145  * is implemented, the system-default is always 1.
 146  *
 147  * iw_table is RCU protected
 148  */
 149 static u8 __rcu *iw_table;
     /* NOTE(review): presumably serialises updaters of iw_table - no user is visible here */
 150 static DEFINE_MUTEX(iw_table_lock);
 151
 152 static u8 get_il_weight(int node)
 153 {
 154         u8 *table;
 155         u8 weight;
 156
 157         rcu_read_lock();
 158         table = rcu_dereference(iw_table);
 159         /* if no iw_table, use system default */
 160         weight = table ? table[node] : 1;
 161         /* if value in iw_table is 0, use system default */
 162         weight = weight ? weight : 1;
 163         rcu_read_unlock();
 164         return weight;
 165 }
 166
 167 /**
 168  * numa_nearest_node - Find nearest node by state
 169  * @node: Node id to start the search
 170  * @state: State to filter the search
 171  *
 172  * Lookup the closest node by distance if @nid is not in state.
 173  *
 174  * Return: this @node if it is in state, otherwise the closest node by distance
 175  */
 176 int numa_nearest_node(int node, unsigned int state)
 177 {
 178         int min_dist = INT_MAX, dist, n, min_node;
 179
 180         if (state >= NR_NODE_STATES)
 181                 return -EINVAL;
 182
 183         if (node == NUMA_NO_NODE || node_state(node, state))
 184                 return node;
 185
 186         min_node = node;
 187         for_each_node_state(n, state) {
 188                 dist = node_distance(node, n);
 189                 if (dist < min_dist) {
 190                         min_dist = dist;
 191                         min_node = n;
 192                 }
 193         }
 194
 195         return min_node;
 196 }
 197 EXPORT_SYMBOL_GPL(numa_nearest_node);
 198
 199 struct mempolicy *get_task_policy(struct task_struct *p)
 200 {
 201         struct mempolicy *pol = p->mempolicy;
 202         int node;
 203
 204         if (pol)
 205                 return pol;
 206
 207         node = numa_node_id();
 208         if (node != NUMA_NO_NODE) {
 209                 pol = &preferred_node_policy[node];
 210                 /* preferred_node_policy is not initialised early in boot */
 211                 if (pol->mode)
 212                         return pol;
 213         }
 214
 215         return &default_policy;
 216 }
 217
     /*
      * Per-mode policy operations, looked up as mpol_ops[pol->mode].
      *
      * @create: called by mpol_set_nodemask() with the effective nodemask to
      *          initialise @pol; returns 0 or a negative errno.
      * @rebind: called by mpol_rebind_policy() to adapt @pol to a new set of
      *          allowed nodes.
      *
      * This is a tentative declaration so that functions above the initialised
      * table can reference mpol_ops.
      */
 218 static const struct mempolicy_operations {
 219         int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 220         void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
 221 } mpol_ops[MPOL_MAX];
 222
 223 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 224 {
 225         return pol->flags & MPOL_MODE_FLAGS;
 226 }
 227
 228 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 229                                    const nodemask_t *rel)
 230 {
 231         nodemask_t tmp;
 232         nodes_fold(tmp, *orig, nodes_weight(*rel));
 233         nodes_onto(*ret, tmp, *rel);
 234 }
 235
 236 static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 237 {
 238         if (nodes_empty(*nodes))
 239                 return -EINVAL;
 240         pol->nodes = *nodes;
 241         return 0;
 242 }
 243
 244 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 245 {
 246         if (nodes_empty(*nodes))
 247                 return -EINVAL;
 248
 249         nodes_clear(pol->nodes);
 250         node_set(first_node(*nodes), pol->nodes);
 251         return 0;
 252 }
 253
 254 /*
 255  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 256  * any, for the new policy.  mpol_new() has already validated the nodes
 257  * parameter with respect to the policy mode and flags.
 258  *
 259  * Must be called holding task's alloc_lock to protect task's mems_allowed
 260  * and mempolicy.  May also be called holding the mmap_lock for write.
 261  */
 262 static int mpol_set_nodemask(struct mempolicy *pol,
 263                      const nodemask_t *nodes, struct nodemask_scratch *nsc)
 264 {
 265         int ret;
 266
 267         /*
 268          * Default (pol==NULL) resp. local memory policies are not a
 269          * subject of any remapping. They also do not need any special
 270          * constructor.
 271          */
 272         if (!pol || pol->mode == MPOL_LOCAL)
 273                 return 0;
 274
 275         /* Check N_MEMORY */
 276         nodes_and(nsc->mask1,
 277                   cpuset_current_mems_allowed, node_states[N_MEMORY]);
 278
 279         VM_BUG_ON(!nodes);
 280
 281         if (pol->flags & MPOL_F_RELATIVE_NODES)
 282                 mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
 283         else
 284                 nodes_and(nsc->mask2, *nodes, nsc->mask1);
 285
 286         if (mpol_store_user_nodemask(pol))
 287                 pol->w.user_nodemask = *nodes;
 288         else
 289                 pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
 290
 291         ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 292         return ret;
 293 }
 294
 295 /*
 296  * This function just creates a new policy, does some check and simple
 297  * initialization. You must invoke mpol_set_nodemask() to set nodes.
 298  */
 299 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 300                                   nodemask_t *nodes)
 301 {
 302         struct mempolicy *policy;
 303
 304         if (mode == MPOL_DEFAULT) {
 305                 if (nodes && !nodes_empty(*nodes))
 306                         return ERR_PTR(-EINVAL);
 307                 return NULL;
 308         }
 309         VM_BUG_ON(!nodes);
 310
 311         /*
 312          * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 313          * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 314          * All other modes require a valid pointer to a non-empty nodemask.
 315          */
 316         if (mode == MPOL_PREFERRED) {
 317                 if (nodes_empty(*nodes)) {
 318                         if (((flags & MPOL_F_STATIC_NODES) ||
 319                              (flags & MPOL_F_RELATIVE_NODES)))
 320                                 return ERR_PTR(-EINVAL);
 321
 322                         mode = MPOL_LOCAL;
 323                 }
 324         } else if (mode == MPOL_LOCAL) {
 325                 if (!nodes_empty(*nodes) ||
 326                     (flags & MPOL_F_STATIC_NODES) ||
 327                     (flags & MPOL_F_RELATIVE_NODES))
 328                         return ERR_PTR(-EINVAL);
 329         } else if (nodes_empty(*nodes))
 330                 return ERR_PTR(-EINVAL);
 331
 332         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 333         if (!policy)
 334                 return ERR_PTR(-ENOMEM);
 335         atomic_set(&policy->refcnt, 1);
 336         policy->mode = mode;
 337         policy->flags = flags;
 338         policy->home_node = NUMA_NO_NODE;
 339
 340         return policy;
 341 }
 342
 343 /* Slow path of a mpol destructor. */
 344 void __mpol_put(struct mempolicy *pol)
 345 {
 346         if (!atomic_dec_and_test(&pol->refcnt))
 347                 return;
 348         kmem_cache_free(policy_cache, pol);
 349 }
 350
     /*
      * ->rebind handler used by mpol_ops for MPOL_DEFAULT and MPOL_LOCAL:
      * intentionally does nothing with @pol or @nodes.
      */
 351 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 352 {
 353 }
 354
 355 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 356 {
 357         nodemask_t tmp;
 358
 359         if (pol->flags & MPOL_F_STATIC_NODES)
 360                 nodes_and(tmp, pol->w.user_nodemask, *nodes);
 361         else if (pol->flags & MPOL_F_RELATIVE_NODES)
 362                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 363         else {
 364                 nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
 365                                                                 *nodes);
 366                 pol->w.cpuset_mems_allowed = *nodes;
 367         }
 368
 369         if (nodes_empty(tmp))
 370                 tmp = *nodes;
 371
 372         pol->nodes = tmp;
 373 }
 374
     /*
      * ->rebind handler used by mpol_ops for MPOL_PREFERRED and
      * MPOL_PREFERRED_MANY: only records @nodes as the policy's
      * cpuset_mems_allowed; pol->nodes is not modified here.
      */
 375 static void mpol_rebind_preferred(struct mempolicy *pol,
 376                                                 const nodemask_t *nodes)
 377 {
 378         pol->w.cpuset_mems_allowed = *nodes;
 379 }
 380
 381 /*
 382  * mpol_rebind_policy - Migrate a policy to a different set of nodes
 383  *
 384  * Per-vma policies are protected by mmap_lock. Allocations using per-task
 385  * policies are protected by task->mems_allowed_seq to prevent a premature
 386  * OOM/allocation failure due to parallel nodemask modification.
 387  */
 388 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 389 {
 390         if (!pol || pol->mode == MPOL_LOCAL)
 391                 return;
 392         if (!mpol_store_user_nodemask(pol) &&
 393             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 394                 return;
 395
 396         mpol_ops[pol->mode].rebind(pol, newmask);
 397 }
 398
 399 /*
 400  * Wrapper for mpol_rebind_policy() that just requires task
 401  * pointer, and updates task mempolicy.
 402  *
 403  * Called with task's alloc_lock held.
 404  */
 405 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 406 {
 407         mpol_rebind_policy(tsk->mempolicy, new);
 408 }
 409
 410 /*
 411  * Rebind each vma in mm to new nodemask.
 412  *
 413  * Call holding a reference to mm.  Takes mm->mmap_lock during call.
 414  */
 415 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 416 {
 417         struct vm_area_struct *vma;
 418         VMA_ITERATOR(vmi, mm, 0);
 419
 420         mmap_write_lock(mm);
 421         for_each_vma(vmi, vma) {
 422                 vma_start_write(vma);
 423                 mpol_rebind_policy(vma->vm_policy, new);
 424         }
 425         mmap_write_unlock(mm);
 426 }
 427
     /*
      * Create/rebind handlers for each policy mode, indexed by mode.
      *
      * MPOL_DEFAULT and MPOL_LOCAL have no ->create handler: mpol_new() returns
      * NULL for MPOL_DEFAULT and mpol_set_nodemask() returns early for NULL and
      * MPOL_LOCAL policies, so ->create is not reached for them from there.
      */
 428 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 429         [MPOL_DEFAULT] = {
 430                 .rebind = mpol_rebind_default,
 431         },
 432         [MPOL_INTERLEAVE] = {
 433                 .create = mpol_new_nodemask,
 434                 .rebind = mpol_rebind_nodemask,
 435         },
 436         [MPOL_PREFERRED] = {
 437                 .create = mpol_new_preferred,
 438                 .rebind = mpol_rebind_preferred,
 439         },
 440         [MPOL_BIND] = {
 441                 .create = mpol_new_nodemask,
 442                 .rebind = mpol_rebind_nodemask,
 443         },
 444         [MPOL_LOCAL] = {
 445                 .rebind = mpol_rebind_default,
 446         },
 447         [MPOL_PREFERRED_MANY] = {
 448                 .create = mpol_new_nodemask,
 449                 .rebind = mpol_rebind_preferred,
 450         },
 451         [MPOL_WEIGHTED_INTERLEAVE] = {
 452                 .create = mpol_new_nodemask,
 453                 .rebind = mpol_rebind_nodemask,
 454         },
 455 };
 456
     /*
      * Forward declarations; the definitions are not in this part of the file.
      * migrate_folio_add() is used by the queue_folios_*() page-walk callbacks
      * below, where a false return is counted as a failure to queue the folio.
      */
 457 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
 458                                 unsigned long flags);
 459 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
 460                                 pgoff_t ilx, int *nid);
 461
 462 static bool strictly_unmovable(unsigned long flags)
 463 {
 464         /*
 465          * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
 466          * if any misplaced page is found.
 467          */
 468         return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ==
 469                          MPOL_MF_STRICT;
 470 }
 471
 472 struct migration_mpol {         /* for alloc_migration_target_by_mpol() */
 473         struct mempolicy *pol;  /* policy to allocate the migration target with */
 474         pgoff_t ilx;            /* NOTE(review): presumably the interleave index passed on to policy_nodemask() - confirm */
 475 };
 476
     /*
      * State shared by the queue_pages_range() page-table walk callbacks,
      * passed to them through walk->private.
      */
 477 struct queue_pages {
 478         struct list_head *pagelist;     /* folios isolated for migration are added here */
 479         unsigned long flags;            /* MPOL_MF_* flags, including the internal ones */
 480         nodemask_t *nmask;              /* nodes tested by queue_folio_required() */
 481         unsigned long start;            /* start of the range, for hole detection */
 482         unsigned long end;              /* end of the range, for hole detection */
 483         struct vm_area_struct *first;   /* first vma seen by the walk, NULL if none yet */
 484         struct folio *large;            /* note last large folio encountered */
 485         long nr_failed;                 /* could not be isolated at this time */
 486 };
 487
 488 /*
 489  * Check if the folio's nid is in qp->nmask.
 490  *
 491  * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
 492  * in the invert of qp->nmask.
 493  */
 494 static inline bool queue_folio_required(struct folio *folio,
 495                                         struct queue_pages *qp)
 496 {
 497         int nid = folio_nid(folio);
 498         unsigned long flags = qp->flags;
 499
 500         return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
 501 }
 502
 503 static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
 504 {
 505         struct folio *folio;
 506         struct queue_pages *qp = walk->private;
 507
 508         if (unlikely(is_pmd_migration_entry(*pmd))) {
 509                 qp->nr_failed++;
 510                 return;
 511         }
 512         folio = pmd_folio(*pmd);
 513         if (is_huge_zero_folio(folio)) {
 514                 walk->action = ACTION_CONTINUE;
 515                 return;
 516         }
 517         if (!queue_folio_required(folio, qp))
 518                 return;
 519         if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
 520             !vma_migratable(walk->vma) ||
 521             !migrate_folio_add(folio, qp->pagelist, qp->flags))
 522                 qp->nr_failed++;
 523 }
 524
 525 /*
 526  * Scan through folios, checking if they satisfy the required conditions,
 527  * moving them from LRU to local pagelist for migration if they do (or not).
 528  *
 529  * queue_folios_pte_range() has two possible return values:
 530  * 0 - continue walking to scan for more, even if an existing folio on the
 531  *     wrong node could not be isolated and queued for migration.
 532  * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
 533  *        and an existing folio was on a node that does not follow the policy.
      *
      * This is the ->pmd_entry callback of the queue_pages walk ops; walk->private
      * points to the struct queue_pages carrying the flags, the nodemask, the
      * target pagelist and the nr_failed counter updated here.
 534  */
 535 static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
 536                         unsigned long end, struct mm_walk *walk)
 537 {
 538         struct vm_area_struct *vma = walk->vma;
 539         struct folio *folio;
 540         struct queue_pages *qp = walk->private;
 541         unsigned long flags = qp->flags;
 542         pte_t *pte, *mapped_pte;
 543         pte_t ptent;
 544         spinlock_t *ptl;
 545
             /* A non-NULL lock means the pmd is handled as a whole by queue_folios_pmd() */
 546         ptl = pmd_trans_huge_lock(pmd, vma);
 547         if (ptl) {
 548                 queue_folios_pmd(pmd, walk);
 549                 spin_unlock(ptl);
 550                 goto out;
 551         }
 552
             /* mapped_pte keeps the start of the mapping for pte_unmap_unlock() */
 553         mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
 554         if (!pte) {
                     /* could not map the page table: ask the walker to try this pmd again */
 555                 walk->action = ACTION_AGAIN;
 556                 return 0;
 557         }
 558         for (; addr != end; pte++, addr += PAGE_SIZE) {
 559                 ptent = ptep_get(pte);
 560                 if (pte_none(ptent))
 561                         continue;
 562                 if (!pte_present(ptent)) {
                             /* a folio already under migration counts as a failure */
 563                         if (is_migration_entry(pte_to_swp_entry(ptent)))
 564                                 qp->nr_failed++;
 565                         continue;
 566                 }
 567                 folio = vm_normal_folio(vma, addr, ptent);
 568                 if (!folio || folio_is_zone_device(folio))
 569                         continue;
 570                 /*
 571                  * vm_normal_folio() filters out zero pages, but there might
 572                  * still be reserved folios to skip, perhaps in a VDSO.
 573                  */
 574                 if (folio_test_reserved(folio))
 575                         continue;
 576                 if (!queue_folio_required(folio, qp))
 577                         continue;
 578                 if (folio_test_large(folio)) {
 579                         /*
 580                          * A large folio can only be isolated from LRU once,
 581                          * but may be mapped by many PTEs (and Copy-On-Write may
 582                          * intersperse PTEs of other, order 0, folios).  This is
 583                          * a common case, so don't mistake it for failure (but
 584                          * there can be other cases of multi-mapped pages which
 585                          * this quick check does not help to filter out - and a
 586                          * search of the pagelist might grow to be prohibitive).
 587                          *
 588                          * migrate_pages(&pagelist) returns nr_failed folios, so
 589                          * check "large" now so that queue_pages_range() returns
 590                          * a comparable nr_failed folios.  This does imply that
 591                          * if folio could not be isolated for some racy reason
 592                          * at its first PTE, later PTEs will not give it another
 593                          * chance of isolation; but keeps the accounting simple.
 594                          */
 595                         if (folio == qp->large)
 596                                 continue;
 597                         qp->large = folio;
 598                 }
                     /*
                      * The folio is misplaced: count a failure unless moving was
                      * requested, the vma is migratable and the folio was queued.
                      */
 599                 if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
 600                     !vma_migratable(vma) ||
 601                     !migrate_folio_add(folio, qp->pagelist, flags)) {
 602                         qp->nr_failed++;
                             /* strict-only mode fails on the first misplaced folio */
 603                         if (strictly_unmovable(flags))
 604                                 break;
 605                 }
 606         }
 607         pte_unmap_unlock(mapped_pte, ptl);
 608         cond_resched();
 609 out:
 610         if (qp->nr_failed && strictly_unmovable(flags))
 611                 return -EIO;
 612         return 0;
 613 }
 614
     /*
      * ->hugetlb_entry callback of the queue_pages walk ops: examine the single
      * hugetlb folio mapped at @pte and, if it is misplaced, either isolate it
      * onto qp->pagelist or count it in qp->nr_failed.
      *
      * Returns -EIO when a failure has been recorded and the flags are
      * strict-only (see strictly_unmovable()), otherwise 0.  Without
      * CONFIG_HUGETLB_PAGE the function does nothing and returns 0.
      */
 615 static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
 616                                unsigned long addr, unsigned long end,
 617                                struct mm_walk *walk)
 618 {
 619 #ifdef CONFIG_HUGETLB_PAGE
 620         struct queue_pages *qp = walk->private;
 621         unsigned long flags = qp->flags;
 622         struct folio *folio;
 623         spinlock_t *ptl;
 624         pte_t entry;
 625
 626         ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
 627         entry = huge_ptep_get(walk->mm, addr, pte);
 628         if (!pte_present(entry)) {
                     /* a hugetlb folio already under migration counts as a failure */
 629                 if (unlikely(is_hugetlb_entry_migration(entry)))
 630                         qp->nr_failed++;
 631                 goto unlock;
 632         }
 633         folio = pfn_folio(pte_pfn(entry));
 634         if (!queue_folio_required(folio, qp))
 635                 goto unlock;
             /* misplaced, but moving was not requested or the vma cannot be migrated */
 636         if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
 637             !vma_migratable(walk->vma)) {
 638                 qp->nr_failed++;
 639                 goto unlock;
 640         }
 641         /*
 642          * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
 643          * Choosing not to migrate a shared folio is not counted as a failure.
 644          *
 645          * See folio_likely_mapped_shared() on possible imprecision when we
 646          * cannot easily detect if a folio is shared.
 647          */
 648         if ((flags & MPOL_MF_MOVE_ALL) ||
 649             (!folio_likely_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
 650                 if (!isolate_hugetlb(folio, qp->pagelist))
 651                         qp->nr_failed++;
 652 unlock:
 653         spin_unlock(ptl);
 654         if (qp->nr_failed && strictly_unmovable(flags))
 655                 return -EIO;
 656 #endif
 657         return 0;
 658 }
 659
 660 #ifdef CONFIG_NUMA_BALANCING
 661 /*
 662  * This is used to mark a range of virtual addresses to be inaccessible.
 663  * These are later cleared by a NUMA hinting fault. Depending on these
 664  * faults, pages may be migrated for better NUMA placement.
 665  *
 666  * This is assuming that NUMA faults are handled using PROT_NONE. If
 667  * an architecture makes a different choice, it will need further
 668  * changes to the core.
 669  */
 670 unsigned long change_prot_numa(struct vm_area_struct *vma,
 671                         unsigned long addr, unsigned long end)
 672 {
 673         struct mmu_gather tlb;
 674         long nr_updated;
 675
 676         tlb_gather_mmu(&tlb, vma->vm_mm);
 677
 678         nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
 679         if (nr_updated > 0) {
 680                 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 681                 count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated);
 682         }
 683
 684         tlb_finish_mmu(&tlb);
 685
 686         return nr_updated;
 687 }
 688 #endif /* CONFIG_NUMA_BALANCING */
 689
 690 static int queue_pages_test_walk(unsigned long start, unsigned long end,
 691                                 struct mm_walk *walk)
 692 {
 693         struct vm_area_struct *next, *vma = walk->vma;
 694         struct queue_pages *qp = walk->private;
 695         unsigned long flags = qp->flags;
 696
 697         /* range check first */
 698         VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
 699
 700         if (!qp->first) {
 701                 qp->first = vma;
 702                 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
 703                         (qp->start < vma->vm_start))
 704                         /* hole at head side of range */
 705                         return -EFAULT;
 706         }
 707         next = find_vma(vma->vm_mm, vma->vm_end);
 708         if (!(flags & MPOL_MF_DISCONTIG_OK) &&
 709                 ((vma->vm_end < qp->end) &&
 710                 (!next || vma->vm_end < next->vm_start)))
 711                 /* hole at middle or tail of range */
 712                 return -EFAULT;
 713
 714         /*
 715          * Need check MPOL_MF_STRICT to return -EIO if possible
 716          * regardless of vma_migratable
 717          */
 718         if (!vma_migratable(vma) &&
 719             !(flags & MPOL_MF_STRICT))
 720                 return 1;
 721
 722         /*
 723          * Check page nodes, and queue pages to move, in the current vma.
 724          * But if no moving, and no strict checking, the scan can be skipped.
 725          */
 726         if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 727                 return 0;
 728         return 1;
 729 }
 730
     /*
      * Page-walk callbacks used by queue_pages_range() when MPOL_MF_WRLOCK is
      * not set; the walk is done with PGWALK_RDLOCK.
      */
 731 static const struct mm_walk_ops queue_pages_walk_ops = {
 732         .hugetlb_entry          = queue_folios_hugetlb,
 733         .pmd_entry              = queue_folios_pte_range,
 734         .test_walk              = queue_pages_test_walk,
 735         .walk_lock              = PGWALK_RDLOCK,
 736 };
 737
     /*
      * Same callbacks as queue_pages_walk_ops, but walking with PGWALK_WRLOCK;
      * selected by queue_pages_range() when MPOL_MF_WRLOCK is set ("write-lock
      * walked vmas").
      */
 738 static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
 739         .hugetlb_entry          = queue_folios_hugetlb,
 740         .pmd_entry              = queue_folios_pte_range,
 741         .test_walk              = queue_pages_test_walk,
 742         .walk_lock              = PGWALK_WRLOCK,
 743 };
 744
 745 /*
 746  * Walk through page tables and collect pages to be migrated.
 747  *
 748  * If pages found in a given range are not on the required set of @nodes,
 749  * and migration is allowed, they are isolated and queued to @pagelist.
 750  *
 751  * queue_pages_range() may return:
 752  * 0 - all pages already on the right node, or successfully queued for moving
 753  *     (or neither strict checking nor moving requested: only range checking).
 754  * >0 - this number of misplaced folios could not be queued for moving
 755  *      (a hugetlbfs page or a transparent huge page being counted as 1).
 756  * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
 757  * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
 758  */
 759 static long
 760 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 761                 nodemask_t *nodes, unsigned long flags,
 762                 struct list_head *pagelist)
 763 {
 764         int err;
 765         struct queue_pages qp = {
 766                 .pagelist = pagelist,
 767                 .flags = flags,
 768                 .nmask = nodes,
 769                 .start = start,
 770                 .end = end,
 771                 .first = NULL,
 772         };
 773         const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ?
 774                         &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
 775
 776         err = walk_page_range(mm, start, end, ops, &qp);
 777
 778         if (!qp.first)
 779                 /* whole range in hole */
 780                 err = -EFAULT;
 781
 782         return err ? : qp.nr_failed;
 783 }
 784
 785 /*
 786  * Apply policy to a single VMA
 787  * This must be called with the mmap_lock held for writing.
 788  */
 789 static int vma_replace_policy(struct vm_area_struct *vma,
 790                                 struct mempolicy *pol)
 791 {
 792         int err;
 793         struct mempolicy *old;
 794         struct mempolicy *new;
 795
 796         vma_assert_write_locked(vma);
 797
 798         new = mpol_dup(pol);
 799         if (IS_ERR(new))
 800                 return PTR_ERR(new);
 801
 802         if (vma->vm_ops && vma->vm_ops->set_policy) {
 803                 err = vma->vm_ops->set_policy(vma, new);
 804                 if (err)
 805                         goto err_out;
 806         }
 807
 808         old = vma->vm_policy;
 809         vma->vm_policy = new; /* protected by mmap_lock */
 810         mpol_put(old);
 811
 812         return 0;
 813  err_out:
 814         mpol_put(new);
 815         return err;
 816 }
 817
 818 /* Split or merge the VMA (if required) and apply the new policy */
 819 static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
 820                 struct vm_area_struct **prev, unsigned long start,
 821                 unsigned long end, struct mempolicy *new_pol)
 822 {
 823         unsigned long vmstart, vmend;
 824
 825         vmend = min(end, vma->vm_end);
 826         if (start > vma->vm_start) {
 827                 *prev = vma;
 828                 vmstart = start;
 829         } else {
 830                 vmstart = vma->vm_start;
 831         }
 832
 833         if (mpol_equal(vma->vm_policy, new_pol)) {
 834                 *prev = vma;
 835                 return 0;
 836         }
 837
 838         vma =  vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
 839         if (IS_ERR(vma))
 840                 return PTR_ERR(vma);
 841
 842         *prev = vma;
 843         return vma_replace_policy(vma, new_pol);
 844 }
 845
 846 /* Set the process memory policy */
 847 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 848                              nodemask_t *nodes)
 849 {
 850         struct mempolicy *new, *old;
 851         NODEMASK_SCRATCH(scratch);
 852         int ret;
 853
 854         if (!scratch)
 855                 return -ENOMEM;
 856
 857         new = mpol_new(mode, flags, nodes);
 858         if (IS_ERR(new)) {
 859                 ret = PTR_ERR(new);
 860                 goto out;
 861         }
 862
 863         task_lock(current);
 864         ret = mpol_set_nodemask(new, nodes, scratch);
 865         if (ret) {
 866                 task_unlock(current);
 867                 mpol_put(new);
 868                 goto out;
 869         }
 870
 871         old = current->mempolicy;
 872         current->mempolicy = new;
 873         if (new && (new->mode == MPOL_INTERLEAVE ||
 874                     new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
 875                 current->il_prev = MAX_NUMNODES-1;
 876                 current->il_weight = 0;
 877         }
 878         task_unlock(current);
 879         mpol_put(old);
 880         ret = 0;
 881 out:
 882         NODEMASK_SCRATCH_FREE(scratch);
 883         return ret;
 884 }
 885
 886 /*
 887  * Return nodemask for policy for get_mempolicy() query
 888  *
 889  * Called with task's alloc_lock held
 890  */
 891 static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
 892 {
 893         nodes_clear(*nodes);
 894         if (pol == &default_policy)
 895                 return;
 896
 897         switch (pol->mode) {
 898         case MPOL_BIND:
 899         case MPOL_INTERLEAVE:
 900         case MPOL_PREFERRED:
 901         case MPOL_PREFERRED_MANY:
 902         case MPOL_WEIGHTED_INTERLEAVE:
 903                 *nodes = pol->nodes;
 904                 break;
 905         case MPOL_LOCAL:
 906                 /* return empty node mask for local allocation */
 907                 break;
 908         default:
 909                 BUG();
 910         }
 911 }
 912
 913 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 914 {
 915         struct page *p = NULL;
 916         int ret;
 917
 918         ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
 919         if (ret > 0) {
 920                 ret = page_to_nid(p);
 921                 put_page(p);
 922         }
 923         return ret;
 924 }
 925
/*
 * Retrieve NUMA policy
 *
 * @policy: out; receives the policy mode (ORed with the user-visible mode
 *          flags), or a node id when MPOL_F_NODE is set.
 * @nmask:  out; receives the policy's nodemask.  May be NULL except on the
 *          MPOL_F_MEMS_ALLOWED path, where it is written unconditionally.
 * @addr:   address to query; must be 0 unless MPOL_F_ADDR is set.
 * @flags:  any of MPOL_F_NODE, MPOL_F_ADDR, MPOL_F_MEMS_ALLOWED.
 *
 * Returns 0 on success, -EINVAL for an invalid flag combination (or a
 * non-zero @addr without MPOL_F_ADDR, or MPOL_F_NODE without MPOL_F_ADDR
 * when the task policy is not an interleaving one), -EFAULT when no VMA
 * contains @addr, or the negative result of lookup_node().
 */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	/* Non-NULL vma at "out" means the mmap_lock is still held for read. */
	struct vm_area_struct *vma = NULL;
	/* pol_refcount is set only when an extra reference was taken below. */
	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;

	/* Reject any flag bit outside the three supported ones. */
	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		/* MPOL_F_MEMS_ALLOWED is mutually exclusive with the others. */
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		task_lock(current);
		*nmask  = cpuset_current_mems_allowed;
		task_unlock(current);
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		pgoff_t ilx;		/* ignored here */
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL.  We
		 * want to return MPOL_DEFAULT in this case.
		 */
		mmap_read_lock(mm);
		vma = vma_lookup(mm, addr);
		if (!vma) {
			mmap_read_unlock(mm);
			return -EFAULT;
		}
		pol = __get_vma_policy(vma, addr, &ilx);
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			/*
			 * Take a refcount on the mpol, because we are about to
			 * drop the mmap_lock, after which only "pol" remains
			 * valid, "vma" is stale.
			 */
			pol_refcount = pol;
			vma = NULL;
			mpol_get(pol);
			mmap_read_unlock(mm);
			/* Report the node of the page backing addr. */
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			/* Report the node following il_prev in the policy's nodes. */
			*policy = next_node_in(current->il_prev, pol->nodes);
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
			/*
			 * A non-zero il_weight reports il_prev itself;
			 * otherwise the node following il_prev is reported.
			 */
			if (current->il_weight)
				*policy = current->il_prev;
			else
				*policy = next_node_in(current->il_prev,
						       pol->nodes);
		} else {
			/* MPOL_F_NODE alone is only valid for interleaving task policies. */
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	err = 0;
	if (nmask) {
		if (mpol_store_user_nodemask(pol)) {
			/* Hand back the nodemask stored in w.user_nodemask. */
			*nmask = pol->w.user_nodemask;
		} else {
			task_lock(current);
			get_policy_nodemask(pol, nmask);
			task_unlock(current);
		}
	}

 out:
	mpol_cond_put(pol);
	/* Still holding the mmap_lock from the MPOL_F_ADDR lookup. */
	if (vma)
		mmap_read_unlock(mm);
	/* Drop the extra reference taken before the lock was released. */
	if (pol_refcount)
		mpol_put(pol_refcount);
	return err;
}
1027
1028 #ifdef CONFIG_MIGRATION
1029 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1030                                 unsigned long flags)
1031 {
1032         /*
1033          * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
1034          * Choosing not to migrate a shared folio is not counted as a failure.
1035          *
1036          * See folio_likely_mapped_shared() on possible imprecision when we
1037          * cannot easily detect if a folio is shared.
1038          */
1039         if ((flags & MPOL_MF_MOVE_ALL) || !folio_likely_mapped_shared(folio)) {
1040                 if (folio_isolate_lru(folio)) {
1041                         list_add_tail(&folio->lru, foliolist);
1042                         node_stat_mod_folio(folio,
1043                                 NR_ISOLATED_ANON + folio_is_file_lru(folio),
1044                                 folio_nr_pages(folio));
1045                 } else {
1046                         /*
1047                          * Non-movable folio may reach here.  And, there may be
1048                          * temporary off LRU folios or non-LRU movable folios.
1049                          * Treat them as unmovable folios since they can't be
1050                          * isolated, so they can't be moved at the moment.
1051                          */
1052                         return false;
1053                 }
1054         }
1055         return true;
1056 }
1057
1058 /*
1059  * Migrate pages from one node to a target node.
1060  * Returns error or the number of pages not migrated.
1061  */
1062 static long migrate_to_node(struct mm_struct *mm, int source, int dest,
1063                             int flags)
1064 {
1065         nodemask_t nmask;
1066         struct vm_area_struct *vma;
1067         LIST_HEAD(pagelist);
1068         long nr_failed;
1069         long err = 0;
1070         struct migration_target_control mtc = {
1071                 .nid = dest,
1072                 .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1073                 .reason = MR_SYSCALL,
1074         };
1075
1076         nodes_clear(nmask);
1077         node_set(source, nmask);
1078
1079         VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1080
1081         mmap_read_lock(mm);
1082         vma = find_vma(mm, 0);
1083
1084         /*
1085          * This does not migrate the range, but isolates all pages that
1086          * need migration.  Between passing in the full user address
1087          * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
1088          * but passes back the count of pages which could not be isolated.
1089          */
1090         nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
1091                                       flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1092         mmap_read_unlock(mm);
1093
1094         if (!list_empty(&pagelist)) {
1095                 err = migrate_pages(&pagelist, alloc_migration_target, NULL,
1096                         (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
1097                 if (err)
1098                         putback_movable_pages(&pagelist);
1099         }
1100
1101         if (err >= 0)
1102                 err += nr_failed;
1103         return err;
1104 }
1105
1106 /*
1107  * Move pages between the two nodesets so as to preserve the physical
1108  * layout as much as possible.
1109  *
1110  * Returns the number of page that could not be moved.
1111  */
1112 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1113                      const nodemask_t *to, int flags)
1114 {
1115         long nr_failed = 0;
1116         long err = 0;
1117         nodemask_t tmp;
1118
1119         lru_cache_disable();
1120
1121         /*
1122          * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1123          * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1124          * bit in 'tmp', and return that <source, dest> pair for migration.
1125          * The pair of nodemasks 'to' and 'from' define the map.
1126          *
1127          * If no pair of bits is found that way, fallback to picking some
1128          * pair of 'source' and 'dest' bits that are not the same.  If the
1129          * 'source' and 'dest' bits are the same, this represents a node
1130          * that will be migrating to itself, so no pages need move.
1131          *
1132          * If no bits are left in 'tmp', or if all remaining bits left
1133          * in 'tmp' correspond to the same bit in 'to', return false
1134          * (nothing left to migrate).
1135          *
1136          * This lets us pick a pair of nodes to migrate between, such that
1137          * if possible the dest node is not already occupied by some other
1138          * source node, minimizing the risk of overloading the memory on a
1139          * node that would happen if we migrated incoming memory to a node
1140          * before migrating outgoing memory source that same node.
1141          *
1142          * A single scan of tmp is sufficient.  As we go, we remember the
1143          * most recent <s, d> pair that moved (s != d).  If we find a pair
1144          * that not only moved, but what's better, moved to an empty slot
1145          * (d is not set in tmp), then we break out then, with that pair.
1146          * Otherwise when we finish scanning from_tmp, we at least have the
1147          * most recent <s, d> pair that moved.  If we get all the way through
1148          * the scan of tmp without finding any node that moved, much less
1149          * moved to an empty node, then there is nothing left worth migrating.
1150          */
1151
1152         tmp = *from;
1153         while (!nodes_empty(tmp)) {
1154                 int s, d;
1155                 int source = NUMA_NO_NODE;
1156                 int dest = 0;
1157
1158                 for_each_node_mask(s, tmp) {
1159
1160                         /*
1161                          * do_migrate_pages() tries to maintain the relative
1162                          * node relationship of the pages established between
1163                          * threads and memory areas.
1164                          *
1165                          * However if the number of source nodes is not equal to
1166                          * the number of destination nodes we can not preserve
1167                          * this node relative relationship.  In that case, skip
1168                          * copying memory from a node that is in the destination
1169                          * mask.
1170                          *
1171                          * Example: [2,3,4] -> [3,4,5] moves everything.
1172                          *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1173                          */
1174
1175                         if ((nodes_weight(*from) != nodes_weight(*to)) &&
1176                                                 (node_isset(s, *to)))
1177                                 continue;
1178
1179                         d = node_remap(s, *from, *to);
1180                         if (s == d)
1181                                 continue;
1182
1183                         source = s;     /* Node moved. Memorize */
1184                         dest = d;
1185
1186                         /* dest not in remaining from nodes? */
1187                         if (!node_isset(dest, tmp))
1188                                 break;
1189                 }
1190                 if (source == NUMA_NO_NODE)
1191                         break;
1192
1193                 node_clear(source, tmp);
1194                 err = migrate_to_node(mm, source, dest, flags);
1195                 if (err > 0)
1196                         nr_failed += err;
1197                 if (err < 0)
1198                         break;
1199         }
1200
1201         lru_cache_enable();
1202         if (err < 0)
1203                 return err;
1204         return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
1205 }
1206
1207 /*
1208  * Allocate a new folio for page migration, according to NUMA mempolicy.
1209  */
1210 static struct folio *alloc_migration_target_by_mpol(struct folio *src,
1211                                                     unsigned long private)
1212 {
1213         struct migration_mpol *mmpol = (struct migration_mpol *)private;
1214         struct mempolicy *pol = mmpol->pol;
1215         pgoff_t ilx = mmpol->ilx;
1216         unsigned int order;
1217         int nid = numa_node_id();
1218         gfp_t gfp;
1219
1220         order = folio_order(src);
1221         ilx += src->index >> order;
1222
1223         if (folio_test_hugetlb(src)) {
1224                 nodemask_t *nodemask;
1225                 struct hstate *h;
1226
1227                 h = folio_hstate(src);
1228                 gfp = htlb_alloc_mask(h);
1229                 nodemask = policy_nodemask(gfp, pol, ilx, &nid);
1230                 return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp,
1231                                 htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND));
1232         }
1233
1234         if (folio_test_large(src))
1235                 gfp = GFP_TRANSHUGE;
1236         else
1237                 gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;
1238
1239         return folio_alloc_mpol(gfp, order, pol, ilx, nid);
1240 }
1241 #else
1242
/*
 * !CONFIG_MIGRATION stub: no folio is ever queued, so every call reports
 * false.
 */
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
				unsigned long flags)
{
	return false;
}
1248
/* !CONFIG_MIGRATION stub: migration is not available, always -ENOSYS. */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	return -ENOSYS;
}
1254
/* !CONFIG_MIGRATION stub: never allocates a target folio, always NULL. */
static struct folio *alloc_migration_target_by_mpol(struct folio *src,
						    unsigned long private)
{
	return NULL;
}
1260 #endif
1261
/*
 * Apply a memory policy to the address range [start, start + len) of the
 * current mm, optionally migrating pages that do not conform.
 *
 * @start:      page-aligned start address.
 * @len:        length in bytes; rounded up to a whole number of pages.
 * @mode:       policy mode, passed to mpol_new().
 * @mode_flags: policy mode flags, passed to mpol_new().
 * @nmask:      nodes for the policy.
 * @flags:      MPOL_MF_* flags; bits outside MPOL_MF_VALID are rejected.
 *
 * Returns 0 on success (including an empty range), -EINVAL for bad flags,
 * an unaligned @start or a wrapping range, -EPERM for MPOL_MF_MOVE_ALL
 * without CAP_SYS_NICE, -ENOMEM when no scratch nodemask is available,
 * -EIO when MPOL_MF_STRICT is set and some page failed, or an error from
 * mpol_new(), mpol_set_nodemask(), queue_pages_range() or mbind_range().
 */
static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	struct vma_iterator vmi;
	struct migration_mpol mmpol;
	struct mempolicy *new;
	unsigned long end;
	long err;
	long nr_failed;
	LIST_HEAD(pagelist);

	if (flags & ~(unsigned long)MPOL_MF_VALID)
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	/* MPOL_MF_STRICT is ignored when resetting to the default policy. */
	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = PAGE_ALIGN(len);
	end = start + len;

	/* end < start means start + len wrapped around. */
	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	new = mpol_new(mode, mode_flags, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	/* Balanced by lru_cache_enable() at mpol_out. */
	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		lru_cache_disable();
	{
		NODEMASK_SCRATCH(scratch);
		if (scratch) {
			/*
			 * On success the mmap_lock stays held for writing
			 * until the mmap_write_unlock() further below.
			 */
			mmap_write_lock(mm);
			err = mpol_set_nodemask(new, nmask, scratch);
			if (err)
				mmap_write_unlock(mm);
		} else
			err = -ENOMEM;
		NODEMASK_SCRATCH_FREE(scratch);
	}
	if (err)
		goto mpol_out;

	/*
	 * Lock the VMAs before scanning for pages to migrate,
	 * to ensure we don't miss a concurrently inserted page.
	 */
	nr_failed = queue_pages_range(mm, start, end, nmask,
			flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);

	if (nr_failed < 0) {
		/* A negative result is an errno, not a failure count. */
		err = nr_failed;
		nr_failed = 0;
	} else {
		/* Apply the policy to every VMA overlapping the range. */
		vma_iter_init(&vmi, mm, start);
		prev = vma_prev(&vmi);
		for_each_vma_range(vmi, vma, end) {
			err = mbind_range(&vmi, vma, &prev, start, end, new);
			if (err)
				break;
		}
	}

	if (!err && !list_empty(&pagelist)) {
		/* Convert MPOL_DEFAULT's NULL to task or default policy */
		if (!new) {
			new = get_task_policy(current);
			mpol_get(new);
		}
		mmpol.pol = new;
		mmpol.ilx = 0;

		/*
		 * In the interleaved case, attempt to allocate on exactly the
		 * targeted nodes, for the first VMA to be migrated; for later
		 * VMAs, the nodes will still be interleaved from the targeted
		 * nodemask, but one by one may be selected differently.
		 */
		if (new->mode == MPOL_INTERLEAVE ||
		    new->mode == MPOL_WEIGHTED_INTERLEAVE) {
			struct folio *folio;
			unsigned int order;
			unsigned long addr = -EFAULT;

			/* Find the first queued folio that is not a KSM folio. */
			list_for_each_entry(folio, &pagelist, lru) {
				if (!folio_test_ksm(folio))
					break;
			}
			/* Locate the first VMA in the range that maps that folio. */
			if (!list_entry_is_head(folio, &pagelist, lru)) {
				vma_iter_init(&vmi, mm, start);
				for_each_vma_range(vmi, vma, end) {
					addr = page_address_in_vma(
						folio_page(folio, 0), vma);
					if (addr != -EFAULT)
						break;
				}
			}
			if (addr != -EFAULT) {
				order = folio_order(folio);
				/* We already know the pol, but not the ilx */
				mpol_cond_put(get_vma_policy(vma, addr, order,
							     &mmpol.ilx));
				/* Set base from which to increment by index */
				mmpol.ilx -= folio->index >> order;
			}
		}
	}

	mmap_write_unlock(mm);

	/* Migration itself runs after the mmap_lock has been dropped. */
	if (!err && !list_empty(&pagelist)) {
		/* nr_failed is only tested for non-zero below, so OR suffices. */
		nr_failed |= migrate_pages(&pagelist,
				alloc_migration_target_by_mpol, NULL,
				(unsigned long)&mmpol, MIGRATE_SYNC,
				MR_MEMPOLICY_MBIND, NULL);
	}

	/* With MPOL_MF_STRICT any failed page turns the result into -EIO. */
	if (nr_failed && (flags & MPOL_MF_STRICT))
		err = -EIO;
	/* Return whatever is still isolated on the list. */
	if (!list_empty(&pagelist))
		putback_movable_pages(&pagelist);
mpol_out:
	mpol_put(new);
	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		lru_cache_enable();
	return err;
}
1406
1407 /*
1408  * User space interface with variable sized bitmaps for nodelists.
1409  */
1410 static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
1411                       unsigned long maxnode)
1412 {
1413         unsigned long nlongs = BITS_TO_LONGS(maxnode);
1414         int ret;
1415
1416         if (in_compat_syscall())
1417                 ret = compat_get_bitmap(mask,
1418                                         (const compat_ulong_t __user *)nmask,
1419                                         maxnode);
1420         else
1421                 ret = copy_from_user(mask, nmask,
1422                                      nlongs * sizeof(unsigned long));
1423
1424         if (ret)
1425                 return -EFAULT;
1426
1427         if (maxnode % BITS_PER_LONG)
1428                 mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
1429
1430         return 0;
1431 }
1432
/*
 * Copy a node mask from user space.
 *
 * @nodes:   out; cleared first, then filled from @nmask.
 * @nmask:   user bitmap; NULL yields an empty mask.
 * @maxnode: size argument from the caller; it is decremented before use,
 *           so maxnode - 1 bits are actually processed.
 *
 * Returns 0 on success, -EINVAL if the (decremented) size exceeds one page
 * worth of bits or if a bit beyond MAX_NUMNODES is set, -EFAULT if reading
 * user memory fails.
 */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;
	/* An incoming maxnode of 0 wraps around above and is rejected here. */
	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
		return -EINVAL;

	/*
	 * When the user specified more nodes than supported just check
	 * if the non supported part is all zero, one word at a time,
	 * starting at the end.
	 */
	while (maxnode > MAX_NUMNODES) {
		unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
		unsigned long t;

		/* Fetch the user word that holds bit maxnode - 1. */
		if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
			return -EFAULT;

		if (maxnode - bits >= MAX_NUMNODES) {
			maxnode -= bits;
		} else {
			/*
			 * This word straddles MAX_NUMNODES: keep only the
			 * bits above the supported range for the zero check.
			 */
			maxnode = MAX_NUMNODES;
			t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
		}
		if (t)
			return -EINVAL;
	}

	return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
}
1468
1469 /* Copy a kernel node mask to user space */
1470 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1471                               nodemask_t *nodes)
1472 {
1473         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1474         unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1475         bool compat = in_compat_syscall();
1476
1477         if (compat)
1478                 nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
1479
1480         if (copy > nbytes) {
1481                 if (copy > PAGE_SIZE)
1482                         return -EINVAL;
1483                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1484                         return -EFAULT;
1485                 copy = nbytes;
1486                 maxnode = nr_node_ids;
1487         }
1488
1489         if (compat)
1490                 return compat_put_bitmap((compat_ulong_t __user *)mask,
1491                                          nodes_addr(*nodes), maxnode);
1492
1493         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1494 }
1495
1496 /* Basic parameter sanity check used by both mbind() and set_mempolicy() */
1497 static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
1498 {
1499         *flags = *mode & MPOL_MODE_FLAGS;
1500         *mode &= ~MPOL_MODE_FLAGS;
1501
1502         if ((unsigned int)(*mode) >=  MPOL_MAX)
1503                 return -EINVAL;
1504         if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
1505                 return -EINVAL;
1506         if (*flags & MPOL_F_NUMA_BALANCING) {
1507                 if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY)
1508                         *flags |= (MPOL_F_MOF | MPOL_F_MORON);
1509                 else
1510                         return -EINVAL;
1511         }
1512         return 0;
1513 }
1514
1515 static long kernel_mbind(unsigned long start, unsigned long len,
1516                          unsigned long mode, const unsigned long __user *nmask,
1517                          unsigned long maxnode, unsigned int flags)
1518 {
1519         unsigned short mode_flags;
1520         nodemask_t nodes;
1521         int lmode = mode;
1522         int err;
1523
1524         start = untagged_addr(start);
1525         err = sanitize_mpol_flags(&lmode, &mode_flags);
1526         if (err)
1527                 return err;
1528
1529         err = get_nodes(&nodes, nmask, maxnode);
1530         if (err)
1531                 return err;
1532
1533         return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
1534 }
1535
1536 SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
1537                 unsigned long, home_node, unsigned long, flags)
1538 {
1539         struct mm_struct *mm = current->mm;
1540         struct vm_area_struct *vma, *prev;
1541         struct mempolicy *new, *old;
1542         unsigned long end;
1543         int err = -ENOENT;
1544         VMA_ITERATOR(vmi, mm, start);
1545
1546         start = untagged_addr(start);
1547         if (start & ~PAGE_MASK)
1548                 return -EINVAL;
1549         /*
1550          * flags is used for future extension if any.
1551          */
1552         if (flags != 0)
1553                 return -EINVAL;
1554
1555         /*
1556          * Check home_node is online to avoid accessing uninitialized
1557          * NODE_DATA.
1558          */
1559         if (home_node >= MAX_NUMNODES || !node_online(home_node))
1560                 return -EINVAL;
1561
1562         len = PAGE_ALIGN(len);
1563         end = start + len;
1564
1565         if (end < start)
1566                 return -EINVAL;
1567         if (end == start)
1568                 return 0;
1569         mmap_write_lock(mm);
1570         prev = vma_prev(&vmi);
1571         for_each_vma_range(vmi, vma, end) {
1572                 /*
1573                  * If any vma in the range got policy other than MPOL_BIND
1574                  * or MPOL_PREFERRED_MANY we return error. We don't reset
1575                  * the home node for vmas we already updated before.
1576                  */
1577                 old = vma_policy(vma);
1578                 if (!old) {
1579                         prev = vma;
1580                         continue;
1581                 }
1582                 if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
1583                         err = -EOPNOTSUPP;
1584                         break;
1585                 }
1586                 new = mpol_dup(old);
1587                 if (IS_ERR(new)) {
1588                         err = PTR_ERR(new);
1589                         break;
1590                 }
1591
1592                 vma_start_write(vma);
1593                 new->home_node = home_node;
1594                 err = mbind_range(&vmi, vma, &prev, start, end, new);
1595                 mpol_put(new);
1596                 if (err)
1597                         break;
1598         }
1599         mmap_write_unlock(mm);
1600         return err;
1601 }
1602
/*
 * mbind() system call entry point: passes all six arguments unchanged to
 * kernel_mbind() and returns its result.
 */
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
		unsigned long, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode, unsigned int, flags)
{
	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
}
1609
1610 /* Set the process memory policy */
1611 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1612                                  unsigned long maxnode)
1613 {
1614         unsigned short mode_flags;
1615         nodemask_t nodes;
1616         int lmode = mode;
1617         int err;
1618
1619         err = sanitize_mpol_flags(&lmode, &mode_flags);
1620         if (err)
1621                 return err;
1622
1623         err = get_nodes(&nodes, nmask, maxnode);
1624         if (err)
1625                 return err;
1626
1627         return do_set_mempolicy(lmode, mode_flags, &nodes);
1628 }
1629
/*
 * set_mempolicy() system call entry point.
 *
 * Forwards its three arguments unchanged to kernel_set_mempolicy() and
 * returns that function's result.
 */
SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
                unsigned long, maxnode)
{
        return kernel_set_mempolicy(mode, nmask, maxnode);
}
1635
/*
 * Backend of the migrate_pages() system call.
 *
 * @pid:       task to operate on, resolved with find_task_by_vpid();
 *             0 selects the calling task
 * @maxnode:   size argument forwarded to get_nodes() for both masks
 * @old_nodes: user pointer to the source nodemask
 * @new_nodes: user pointer to the destination nodemask
 *
 * Returns the result of do_migrate_pages() when it is reached, otherwise a
 * negative errno: -ENOMEM (no scratch nodemasks), -ESRCH (task not found),
 * -EPERM (ptrace or cpuset permission check failed), -EINVAL (no
 * destination node left after masking, or the task has no mm), or the
 * error reported by get_nodes() / security_task_movememory().
 */
static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
                                const unsigned long __user *old_nodes,
                                const unsigned long __user *new_nodes)
{
        struct mm_struct *mm = NULL;
        struct task_struct *task;
        nodemask_t task_nodes;
        int err;
        nodemask_t *old;
        nodemask_t *new;
        NODEMASK_SCRATCH(scratch);

        if (!scratch)
                return -ENOMEM;

        /* Both working nodemasks live in the scratch area. */
        old = &scratch->mask1;
        new = &scratch->mask2;

        err = get_nodes(old, old_nodes, maxnode);
        if (err)
                goto out;

        err = get_nodes(new, new_nodes, maxnode);
        if (err)
                goto out;

        /* Find the mm_struct */
        rcu_read_lock();
        task = pid ? find_task_by_vpid(pid) : current;
        if (!task) {
                rcu_read_unlock();
                err = -ESRCH;
                goto out;
        }
        /* Pin the task before the RCU read section is left. */
        get_task_struct(task);

        /* Default error for the "no destination node left" exit below. */
        err = -EINVAL;

        /*
         * Check if this process has the right to modify the specified process.
         * Use the regular "ptrace_may_access()" checks.
         */
        if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
                rcu_read_unlock();
                err = -EPERM;
                goto out_put;
        }
        rcu_read_unlock();

        task_nodes = cpuset_mems_allowed(task);
        /* Is the user allowed to access the target nodes? */
        if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
                err = -EPERM;
                goto out_put;
        }

        /* Restrict the destination to what the *caller's* cpuset allows. */
        task_nodes = cpuset_mems_allowed(current);
        nodes_and(*new, *new, task_nodes);
        if (nodes_empty(*new))
                goto out_put;

        err = security_task_movememory(task);
        if (err)
                goto out_put;

        /*
         * The task reference is dropped right after grabbing the mm, so
         * every later exit must use "out" rather than "out_put".
         */
        mm = get_task_mm(task);
        put_task_struct(task);

        if (!mm) {
                err = -EINVAL;
                goto out;
        }

        /* CAP_SYS_NICE callers get MPOL_MF_MOVE_ALL, others MPOL_MF_MOVE. */
        err = do_migrate_pages(mm, old, new,
                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);

        mmput(mm);
out:
        NODEMASK_SCRATCH_FREE(scratch);

        return err;

        /* Error exit while the task reference is still held. */
out_put:
        put_task_struct(task);
        goto out;
}
1722
/*
 * migrate_pages() system call entry point.
 *
 * Forwards its four arguments unchanged to kernel_migrate_pages() and
 * returns that function's result.
 */
SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
                const unsigned long __user *, old_nodes,
                const unsigned long __user *, new_nodes)
{
        return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
}
1729
1730 /* Retrieve NUMA policy */
1731 static int kernel_get_mempolicy(int __user *policy,
1732                                 unsigned long __user *nmask,
1733                                 unsigned long maxnode,
1734                                 unsigned long addr,
1735                                 unsigned long flags)
1736 {
1737         int err;
1738         int pval;
1739         nodemask_t nodes;
1740
1741         if (nmask != NULL && maxnode < nr_node_ids)
1742                 return -EINVAL;
1743
1744         addr = untagged_addr(addr);
1745
1746         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1747
1748         if (err)
1749                 return err;
1750
1751         if (policy && put_user(pval, policy))
1752                 return -EFAULT;
1753
1754         if (nmask)
1755                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1756
1757         return err;
1758 }
1759
/*
 * get_mempolicy() system call entry point.
 *
 * Forwards its five arguments unchanged to kernel_get_mempolicy() and
 * returns that function's result.
 */
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
                unsigned long __user *, nmask, unsigned long, maxnode,
                unsigned long, addr, unsigned long, flags)
{
        return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
}
1766
1767 bool vma_migratable(struct vm_area_struct *vma)
1768 {
1769         if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1770                 return false;
1771
1772         /*
1773          * DAX device mappings require predictable access latency, so avoid
1774          * incurring periodic faults.
1775          */
1776         if (vma_is_dax(vma))
1777                 return false;
1778
1779         if (is_vm_hugetlb_page(vma) &&
1780                 !hugepage_migration_supported(hstate_vma(vma)))
1781                 return false;
1782
1783         /*
1784          * Migration allocates pages in the highest zone. If we cannot
1785          * do so then migration (at least from node to node) is not
1786          * possible.
1787          */
1788         if (vma->vm_file &&
1789                 gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1790                         < policy_zone)
1791                 return false;
1792         return true;
1793 }
1794
1795 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1796                                    unsigned long addr, pgoff_t *ilx)
1797 {
1798         *ilx = 0;
1799         return (vma->vm_ops && vma->vm_ops->get_policy) ?
1800                 vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy;
1801 }
1802
1803 /*
1804  * get_vma_policy(@vma, @addr, @order, @ilx)
1805  * @vma: virtual memory area whose policy is sought
1806  * @addr: address in @vma for shared policy lookup
1807  * @order: 0, or appropriate huge_page_order for interleaving
1808  * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
1809  *       MPOL_WEIGHTED_INTERLEAVE
1810  *
1811  * Returns effective policy for a VMA at specified address.
1812  * Falls back to current->mempolicy or system default policy, as necessary.
1813  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1814  * count--added by the get_policy() vm_op, as appropriate--to protect against
1815  * freeing by another task.  It is the caller's responsibility to free the
1816  * extra reference for shared policies.
1817  */
1818 struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1819                                  unsigned long addr, int order, pgoff_t *ilx)
1820 {
1821         struct mempolicy *pol;
1822
1823         pol = __get_vma_policy(vma, addr, ilx);
1824         if (!pol)
1825                 pol = get_task_policy(current);
1826         if (pol->mode == MPOL_INTERLEAVE ||
1827             pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
1828                 *ilx += vma->vm_pgoff >> order;
1829                 *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
1830         }
1831         return pol;
1832 }
1833
1834 bool vma_policy_mof(struct vm_area_struct *vma)
1835 {
1836         struct mempolicy *pol;
1837
1838         if (vma->vm_ops && vma->vm_ops->get_policy) {
1839                 bool ret = false;
1840                 pgoff_t ilx;            /* ignored here */
1841
1842                 pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
1843                 if (pol && (pol->flags & MPOL_F_MOF))
1844                         ret = true;
1845                 mpol_cond_put(pol);
1846
1847                 return ret;
1848         }
1849
1850         pol = vma->vm_policy;
1851         if (!pol)
1852                 pol = get_task_policy(current);
1853
1854         return pol->flags & MPOL_F_MOF;
1855 }
1856
1857 bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1858 {
1859         enum zone_type dynamic_policy_zone = policy_zone;
1860
1861         BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1862
1863         /*
1864          * if policy->nodes has movable memory only,
1865          * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1866          *
1867          * policy->nodes is intersect with node_states[N_MEMORY].
1868          * so if the following test fails, it implies
1869          * policy->nodes has movable memory only.
1870          */
1871         if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
1872                 dynamic_policy_zone = ZONE_MOVABLE;
1873
1874         return zone >= dynamic_policy_zone;
1875 }
1876
/*
 * Pick the node for the current task's next weighted-interleave
 * allocation.
 *
 * Per-task state: current->il_prev is the node used last,
 * current->il_weight the number of allocations still owed to it.  While
 * il_weight is non-zero and il_prev is still set in policy->nodes the same
 * node is returned again.  Otherwise the node after il_prev (as returned
 * by next_node_in()) becomes the new il_prev and il_weight is reloaded
 * from get_il_weight().  In both cases il_weight is decremented by one
 * before returning.
 *
 * Returns the chosen node, or MAX_NUMNODES when next_node_in() reports
 * that value; in that case the per-task state is left untouched.
 */
static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
{
        unsigned int node;
        unsigned int cpuset_mems_cookie;

retry:
        /* to prevent miscount use tsk->mems_allowed_seq to detect rebind */
        cpuset_mems_cookie = read_mems_allowed_begin();
        node = current->il_prev;
        if (!current->il_weight || !node_isset(node, policy->nodes)) {
                node = next_node_in(node, policy->nodes);
                /* nodemask changed underneath us: redo the whole selection */
                if (read_mems_allowed_retry(cpuset_mems_cookie))
                        goto retry;
                if (node == MAX_NUMNODES)
                        return node;
                current->il_prev = node;
                current->il_weight = get_il_weight(node);
        }
        /* account for the allocation the caller is about to make */
        current->il_weight--;
        return node;
}
1898
1899 /* Do dynamic interleaving for a process */
1900 static unsigned int interleave_nodes(struct mempolicy *policy)
1901 {
1902         unsigned int nid;
1903         unsigned int cpuset_mems_cookie;
1904
1905         /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
1906         do {
1907                 cpuset_mems_cookie = read_mems_allowed_begin();
1908                 nid = next_node_in(current->il_prev, policy->nodes);
1909         } while (read_mems_allowed_retry(cpuset_mems_cookie));
1910
1911         if (nid < MAX_NUMNODES)
1912                 current->il_prev = nid;
1913         return nid;
1914 }
1915
1916 /*
1917  * Depending on the memory policy provide a node from which to allocate the
1918  * next slab entry.
1919  */
1920 unsigned int mempolicy_slab_node(void)
1921 {
1922         struct mempolicy *policy;
1923         int node = numa_mem_id();
1924
1925         if (!in_task())
1926                 return node;
1927
1928         policy = current->mempolicy;
1929         if (!policy)
1930                 return node;
1931
1932         switch (policy->mode) {
1933         case MPOL_PREFERRED:
1934                 return first_node(policy->nodes);
1935
1936         case MPOL_INTERLEAVE:
1937                 return interleave_nodes(policy);
1938
1939         case MPOL_WEIGHTED_INTERLEAVE:
1940                 return weighted_interleave_nodes(policy);
1941
1942         case MPOL_BIND:
1943         case MPOL_PREFERRED_MANY:
1944         {
1945                 struct zoneref *z;
1946
1947                 /*
1948                  * Follow bind policy behavior and start allocation at the
1949                  * first node.
1950                  */
1951                 struct zonelist *zonelist;
1952                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1953                 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1954                 z = first_zones_zonelist(zonelist, highest_zoneidx,
1955                                                         &policy->nodes);
1956                 return zonelist_zone(z) ? zonelist_node_idx(z) : node;
1957         }
1958         case MPOL_LOCAL:
1959                 return node;
1960
1961         default:
1962                 BUG();
1963         }
1964 }
1965
1966 static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
1967                                               nodemask_t *mask)
1968 {
1969         /*
1970          * barrier stabilizes the nodemask locally so that it can be iterated
1971          * over safely without concern for changes. Allocators validate node
1972          * selection does not violate mems_allowed, so this is safe.
1973          */
1974         barrier();
1975         memcpy(mask, &pol->nodes, sizeof(nodemask_t));
1976         barrier();
1977         return nodes_weight(*mask);
1978 }
1979
1980 static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
1981 {
1982         nodemask_t nodemask;
1983         unsigned int target, nr_nodes;
1984         u8 *table;
1985         unsigned int weight_total = 0;
1986         u8 weight;
1987         int nid;
1988
1989         nr_nodes = read_once_policy_nodemask(pol, &nodemask);
1990         if (!nr_nodes)
1991                 return numa_node_id();
1992
1993         rcu_read_lock();
1994         table = rcu_dereference(iw_table);
1995         /* calculate the total weight */
1996         for_each_node_mask(nid, nodemask) {
1997                 /* detect system default usage */
1998                 weight = table ? table[nid] : 1;
1999                 weight = weight ? weight : 1;
2000                 weight_total += weight;
2001         }
2002
2003         /* Calculate the node offset based on totals */
2004         target = ilx % weight_total;
2005         nid = first_node(nodemask);
2006         while (target) {
2007                 /* detect system default usage */
2008                 weight = table ? table[nid] : 1;
2009                 weight = weight ? weight : 1;
2010                 if (target < weight)
2011                         break;
2012                 target -= weight;
2013                 nid = next_node_in(nid, nodemask);
2014         }
2015         rcu_read_unlock();
2016         return nid;
2017 }
2018
2019 /*
2020  * Do static interleaving for interleave index @ilx.  Returns the ilx'th
2021  * node in pol->nodes (starting from ilx=0), wrapping around if ilx
2022  * exceeds the number of present nodes.
2023  */
2024 static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
2025 {
2026         nodemask_t nodemask;
2027         unsigned int target, nnodes;
2028         int i;
2029         int nid;
2030
2031         nnodes = read_once_policy_nodemask(pol, &nodemask);
2032         if (!nnodes)
2033                 return numa_node_id();
2034         target = ilx % nnodes;
2035         nid = first_node(nodemask);
2036         for (i = 0; i < target; i++)
2037                 nid = next_node(nid, nodemask);
2038         return nid;
2039 }
2040
2041 /*
2042  * Return a nodemask representing a mempolicy for filtering nodes for
2043  * page allocation, together with preferred node id (or the input node id).
2044  */
2045 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
2046                                    pgoff_t ilx, int *nid)
2047 {
2048         nodemask_t *nodemask = NULL;
2049
2050         switch (pol->mode) {
2051         case MPOL_PREFERRED:
2052                 /* Override input node id */
2053                 *nid = first_node(pol->nodes);
2054                 break;
2055         case MPOL_PREFERRED_MANY:
2056                 nodemask = &pol->nodes;
2057                 if (pol->home_node != NUMA_NO_NODE)
2058                         *nid = pol->home_node;
2059                 break;
2060         case MPOL_BIND:
2061                 /* Restrict to nodemask (but not on lower zones) */
2062                 if (apply_policy_zone(pol, gfp_zone(gfp)) &&
2063                     cpuset_nodemask_valid_mems_allowed(&pol->nodes))
2064                         nodemask = &pol->nodes;
2065                 if (pol->home_node != NUMA_NO_NODE)
2066                         *nid = pol->home_node;
2067                 /*
2068                  * __GFP_THISNODE shouldn't even be used with the bind policy
2069                  * because we might easily break the expectation to stay on the
2070                  * requested node and not break the policy.
2071                  */
2072                 WARN_ON_ONCE(gfp & __GFP_THISNODE);
2073                 break;
2074         case MPOL_INTERLEAVE:
2075                 /* Override input node id */
2076                 *nid = (ilx == NO_INTERLEAVE_INDEX) ?
2077                         interleave_nodes(pol) : interleave_nid(pol, ilx);
2078                 break;
2079         case MPOL_WEIGHTED_INTERLEAVE:
2080                 *nid = (ilx == NO_INTERLEAVE_INDEX) ?
2081                         weighted_interleave_nodes(pol) :
2082                         weighted_interleave_nid(pol, ilx);
2083                 break;
2084         }
2085
2086         return nodemask;
2087 }
2088
2089 #ifdef CONFIG_HUGETLBFS
2090 /*
2091  * huge_node(@vma, @addr, @gfp_flags, @mpol)
2092  * @vma: virtual memory area whose policy is sought
2093  * @addr: address in @vma for shared policy lookup and interleave policy
2094  * @gfp_flags: for requested zone
2095  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2096  * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
2097  *
2098  * Returns a nid suitable for a huge page allocation and a pointer
2099  * to the struct mempolicy for conditional unref after allocation.
2100  * If the effective policy is 'bind' or 'prefer-many', returns a pointer
2101  * to the mempolicy's @nodemask for filtering the zonelist.
2102  */
2103 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2104                 struct mempolicy **mpol, nodemask_t **nodemask)
2105 {
2106         pgoff_t ilx;
2107         int nid;
2108
2109         nid = numa_node_id();
2110         *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx);
2111         *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid);
2112         return nid;
2113 }
2114
2115 /*
2116  * init_nodemask_of_mempolicy
2117  *
2118  * If the current task's mempolicy is "default" [NULL], return 'false'
2119  * to indicate default policy.  Otherwise, extract the policy nodemask
2120  * for 'bind' or 'interleave' policy into the argument nodemask, or
2121  * initialize the argument nodemask to contain the single node for
2122  * 'preferred' or 'local' policy and return 'true' to indicate presence
2123  * of non-default mempolicy.
2124  *
2125  * We don't bother with reference counting the mempolicy [mpol_get/put]
2126  * because the current task is examining it's own mempolicy and a task's
2127  * mempolicy is only ever changed by the task itself.
2128  *
2129  * N.B., it is the caller's responsibility to free a returned nodemask.
2130  */
2131 bool init_nodemask_of_mempolicy(nodemask_t *mask)
2132 {
2133         struct mempolicy *mempolicy;
2134
2135         if (!(mask && current->mempolicy))
2136                 return false;
2137
2138         task_lock(current);
2139         mempolicy = current->mempolicy;
2140         switch (mempolicy->mode) {
2141         case MPOL_PREFERRED:
2142         case MPOL_PREFERRED_MANY:
2143         case MPOL_BIND:
2144         case MPOL_INTERLEAVE:
2145         case MPOL_WEIGHTED_INTERLEAVE:
2146                 *mask = mempolicy->nodes;
2147                 break;
2148
2149         case MPOL_LOCAL:
2150                 init_nodemask_of_node(mask, numa_node_id());
2151                 break;
2152
2153         default:
2154                 BUG();
2155         }
2156         task_unlock(current);
2157
2158         return true;
2159 }
2160 #endif
2161
2162 /*
2163  * mempolicy_in_oom_domain
2164  *
2165  * If tsk's mempolicy is "bind", check for intersection between mask and
2166  * the policy nodemask. Otherwise, return true for all other policies
2167  * including "interleave", as a tsk with "interleave" policy may have
2168  * memory allocated from all nodes in system.
2169  *
2170  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2171  */
2172 bool mempolicy_in_oom_domain(struct task_struct *tsk,
2173                                         const nodemask_t *mask)
2174 {
2175         struct mempolicy *mempolicy;
2176         bool ret = true;
2177
2178         if (!mask)
2179                 return ret;
2180
2181         task_lock(tsk);
2182         mempolicy = tsk->mempolicy;
2183         if (mempolicy && mempolicy->mode == MPOL_BIND)
2184                 ret = nodes_intersects(mempolicy->nodes, *mask);
2185         task_unlock(tsk);
2186
2187         return ret;
2188 }
2189
2190 static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
2191                                                 int nid, nodemask_t *nodemask)
2192 {
2193         struct page *page;
2194         gfp_t preferred_gfp;
2195
2196         /*
2197          * This is a two pass approach. The first pass will only try the
2198          * preferred nodes but skip the direct reclaim and allow the
2199          * allocation to fail, while the second pass will try all the
2200          * nodes in system.
2201          */
2202         preferred_gfp = gfp | __GFP_NOWARN;
2203         preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2204         page = __alloc_pages_noprof(preferred_gfp, order, nid, nodemask);
2205         if (!page)
2206                 page = __alloc_pages_noprof(gfp, order, nid, NULL);
2207
2208         return page;
2209 }
2210
/**
 * alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
 * @gfp: GFP flags.
 * @order: Order of the page allocation.
 * @pol: Pointer to the NUMA mempolicy.
 * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
 * @nid: Preferred node (usually numa_node_id() but @pol may override it).
 *
 * The policy is first translated into a preferred node and an optional
 * nodemask by policy_nodemask().  MPOL_PREFERRED_MANY is delegated to
 * alloc_pages_preferred_many(); PMD-order requests that do not come from
 * alloc_pages() get a node-local attempt first; everything else goes
 * straight to the page allocator.
 *
 * Return: The page on success or NULL if allocation fails.
 */
struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
                struct mempolicy *pol, pgoff_t ilx, int nid)
{
        nodemask_t *nodemask;
        struct page *page;

        /* May replace nid with the node the policy prefers. */
        nodemask = policy_nodemask(gfp, pol, ilx, &nid);

        if (pol->mode == MPOL_PREFERRED_MANY)
                return alloc_pages_preferred_many(gfp, order, nid, nodemask);

        if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
            /* filter "hugepage" allocation, unless from alloc_pages() */
            order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) {
                /*
                 * For hugepage allocation and non-interleave policy which
                 * allows the current node (or other explicitly preferred
                 * node) we only try to allocate from the current/preferred
                 * node and don't fall back to other nodes, as the cost of
                 * remote accesses would likely offset THP benefits.
                 *
                 * If the policy is interleave or does not allow the current
                 * node in its nodemask, we allocate the standard way.
                 */
                if (pol->mode != MPOL_INTERLEAVE &&
                    pol->mode != MPOL_WEIGHTED_INTERLEAVE &&
                    (!nodemask || node_isset(nid, *nodemask))) {
                        /*
                         * First, try to allocate THP only on local node, but
                         * don't reclaim unnecessarily, just compact.
                         */
                        page = __alloc_pages_node_noprof(nid,
                                gfp | __GFP_THISNODE | __GFP_NORETRY, order);
                        /*
                         * Done on success; also done on failure when the
                         * caller did not allow direct reclaim.
                         */
                        if (page || !(gfp & __GFP_DIRECT_RECLAIM))
                                return page;
                        /*
                         * If hugepage allocations are configured to always
                         * synchronous compact or the vma has been madvised
                         * to prefer hugepage backing, retry allowing remote
                         * memory with both reclaim and compact as well.
                         */
                }
        }

        page = __alloc_pages_noprof(gfp, order, nid, nodemask);

        /*
         * Interleave hit accounting: only for plain MPOL_INTERLEAVE, and
         * only when the page actually landed on the node that was chosen.
         */
        if (unlikely(pol->mode == MPOL_INTERLEAVE) && page) {
                /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
                if (static_branch_likely(&vm_numa_stat_key) &&
                    page_to_nid(page) == nid) {
                        preempt_disable();
                        __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
                        preempt_enable();
                }
        }

        return page;
}
2279
2280 struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
2281                 struct mempolicy *pol, pgoff_t ilx, int nid)
2282 {
2283         return page_rmappable_folio(alloc_pages_mpol_noprof(gfp | __GFP_COMP,
2284                                                         order, pol, ilx, nid));
2285 }
2286
2287 /**
2288  * vma_alloc_folio - Allocate a folio for a VMA.
2289  * @gfp: GFP flags.
2290  * @order: Order of the folio.
2291  * @vma: Pointer to VMA.
2292  * @addr: Virtual address of the allocation.  Must be inside @vma.
2293  * @hugepage: Unused (was: For hugepages try only preferred node if possible).
2294  *
2295  * Allocate a folio for a specific address in @vma, using the appropriate
2296  * NUMA policy.  The caller must hold the mmap_lock of the mm_struct of the
2297  * VMA to prevent it from going away.  Should be used for all allocations
2298  * for folios that will be mapped into user space, excepting hugetlbfs, and
2299  * excepting where direct use of alloc_pages_mpol() is more appropriate.
2300  *
2301  * Return: The folio on success or NULL if allocation fails.
2302  */
2303 struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
2304                 unsigned long addr, bool hugepage)
2305 {
2306         struct mempolicy *pol;
2307         pgoff_t ilx;
2308         struct folio *folio;
2309
2310         if (vma->vm_flags & VM_DROPPABLE)
2311                 gfp |= __GFP_NOWARN;
2312
2313         pol = get_vma_policy(vma, addr, order, &ilx);
2314         folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id());
2315         mpol_cond_put(pol);
2316         return folio;
2317 }
2318 EXPORT_SYMBOL(vma_alloc_folio_noprof);
2319
2320 /**
2321  * alloc_pages - Allocate pages.
2322  * @gfp: GFP flags.
2323  * @order: Power of two of number of pages to allocate.
2324  *
2325  * Allocate 1 << @order contiguous pages.  The physical address of the
2326  * first page is naturally aligned (eg an order-3 allocation will be aligned
2327  * to a multiple of 8 * PAGE_SIZE bytes).  The NUMA policy of the current
2328  * process is honoured when in process context.
2329  *
2330  * Context: Can be called from any context, providing the appropriate GFP
2331  * flags are used.
2332  * Return: The page on success or NULL if allocation fails.
2333  */
2334 struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order)
2335 {
2336         struct mempolicy *pol = &default_policy;
2337
2338         /*
2339          * No reference counting needed for current->mempolicy
2340          * nor system default_policy
2341          */
2342         if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2343                 pol = get_task_policy(current);
2344
2345         return alloc_pages_mpol_noprof(gfp, order, pol, NO_INTERLEAVE_INDEX,
2346                                        numa_node_id());
2347 }
2348 EXPORT_SYMBOL(alloc_pages_noprof);
2349
2350 struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
2351 {
2352         return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order));
2353 }
2354 EXPORT_SYMBOL(folio_alloc_noprof);
2355
2356 static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
2357                 struct mempolicy *pol, unsigned long nr_pages,
2358                 struct page **page_array)
2359 {
2360         int nodes;
2361         unsigned long nr_pages_per_node;
2362         int delta;
2363         int i;
2364         unsigned long nr_allocated;
2365         unsigned long total_allocated = 0;
2366
2367         nodes = nodes_weight(pol->nodes);
2368         nr_pages_per_node = nr_pages / nodes;
2369         delta = nr_pages - nodes * nr_pages_per_node;
2370
2371         for (i = 0; i < nodes; i++) {
2372                 if (delta) {
2373                         nr_allocated = alloc_pages_bulk_noprof(gfp,
2374                                         interleave_nodes(pol), NULL,
2375                                         nr_pages_per_node + 1, NULL,
2376                                         page_array);
2377                         delta--;
2378                 } else {
2379                         nr_allocated = alloc_pages_bulk_noprof(gfp,
2380                                         interleave_nodes(pol), NULL,
2381                                         nr_pages_per_node, NULL, page_array);
2382                 }
2383
2384                 page_array += nr_allocated;
2385                 total_allocated += nr_allocated;
2386         }
2387
2388         return total_allocated;
2389 }
2390
2391 static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
2392                 struct mempolicy *pol, unsigned long nr_pages,
2393                 struct page **page_array)
2394 {
2395         struct task_struct *me = current;
2396         unsigned int cpuset_mems_cookie;
2397         unsigned long total_allocated = 0;
2398         unsigned long nr_allocated = 0;
2399         unsigned long rounds;
2400         unsigned long node_pages, delta;
2401         u8 *table, *weights, weight;
2402         unsigned int weight_total = 0;
2403         unsigned long rem_pages = nr_pages;
2404         nodemask_t nodes;
2405         int nnodes, node;
2406         int resume_node = MAX_NUMNODES - 1;
2407         u8 resume_weight = 0;
2408         int prev_node;
2409         int i;
2410
2411         if (!nr_pages)
2412                 return 0;
2413
2414         /* read the nodes onto the stack, retry if done during rebind */
2415         do {
2416                 cpuset_mems_cookie = read_mems_allowed_begin();
2417                 nnodes = read_once_policy_nodemask(pol, &nodes);
2418         } while (read_mems_allowed_retry(cpuset_mems_cookie));
2419
2420         /* if the nodemask has become invalid, we cannot do anything */
2421         if (!nnodes)
2422                 return 0;
2423
2424         /* Continue allocating from most recent node and adjust the nr_pages */
2425         node = me->il_prev;
2426         weight = me->il_weight;
2427         if (weight && node_isset(node, nodes)) {
2428                 node_pages = min(rem_pages, weight);
2429                 nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
2430                                                   NULL, page_array);
2431                 page_array += nr_allocated;
2432                 total_allocated += nr_allocated;
2433                 /* if that's all the pages, no need to interleave */
2434                 if (rem_pages <= weight) {
2435                         me->il_weight -= rem_pages;
2436                         return total_allocated;
2437                 }
2438                 /* Otherwise we adjust remaining pages, continue from there */
2439                 rem_pages -= weight;
2440         }
2441         /* clear active weight in case of an allocation failure */
2442         me->il_weight = 0;
2443         prev_node = node;
2444
2445         /* create a local copy of node weights to operate on outside rcu */
2446         weights = kzalloc(nr_node_ids, GFP_KERNEL);
2447         if (!weights)
2448                 return total_allocated;
2449
2450         rcu_read_lock();
2451         table = rcu_dereference(iw_table);
2452         if (table)
2453                 memcpy(weights, table, nr_node_ids);
2454         rcu_read_unlock();
2455
2456         /* calculate total, detect system default usage */
2457         for_each_node_mask(node, nodes) {
2458                 if (!weights[node])
2459                         weights[node] = 1;
2460                 weight_total += weights[node];
2461         }
2462
2463         /*
2464          * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
2465          * Track which node weighted interleave should resume from.
2466          *
2467          * if (rounds > 0) and (delta == 0), resume_node will always be
2468          * the node following prev_node and its weight.
2469          */
2470         rounds = rem_pages / weight_total;
2471         delta = rem_pages % weight_total;
2472         resume_node = next_node_in(prev_node, nodes);
2473         resume_weight = weights[resume_node];
2474         for (i = 0; i < nnodes; i++) {
2475                 node = next_node_in(prev_node, nodes);
2476                 weight = weights[node];
2477                 node_pages = weight * rounds;
2478                 /* If a delta exists, add this node's portion of the delta */
2479                 if (delta > weight) {
2480                         node_pages += weight;
2481                         delta -= weight;
2482                 } else if (delta) {
2483                         /* when delta is depleted, resume from that node */
2484                         node_pages += delta;
2485                         resume_node = node;
2486                         resume_weight = weight - delta;
2487                         delta = 0;
2488                 }
2489                 /* node_pages can be 0 if an allocation fails and rounds == 0 */
2490                 if (!node_pages)
2491                         break;
2492                 nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
2493                                                   NULL, page_array);
2494                 page_array += nr_allocated;
2495                 total_allocated += nr_allocated;
2496                 if (total_allocated == nr_pages)
2497                         break;
2498                 prev_node = node;
2499         }
2500         me->il_prev = resume_node;
2501         me->il_weight = resume_weight;
2502         kfree(weights);
2503         return total_allocated;
2504 }
2505
2506 static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
2507                 struct mempolicy *pol, unsigned long nr_pages,
2508                 struct page **page_array)
2509 {
2510         gfp_t preferred_gfp;
2511         unsigned long nr_allocated = 0;
2512
2513         preferred_gfp = gfp | __GFP_NOWARN;
2514         preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2515
2516         nr_allocated  = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes,
2517                                            nr_pages, NULL, page_array);
2518
2519         if (nr_allocated < nr_pages)
2520                 nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL,
2521                                 nr_pages - nr_allocated, NULL,
2522                                 page_array + nr_allocated);
2523         return nr_allocated;
2524 }
2525
2526 /* alloc pages bulk and mempolicy should be considered at the
2527  * same time in some situation such as vmalloc.
2528  *
2529  * It can accelerate memory allocation especially interleaving
2530  * allocate memory.
2531  */
2532 unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp,
2533                 unsigned long nr_pages, struct page **page_array)
2534 {
2535         struct mempolicy *pol = &default_policy;
2536         nodemask_t *nodemask;
2537         int nid;
2538
2539         if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2540                 pol = get_task_policy(current);
2541
2542         if (pol->mode == MPOL_INTERLEAVE)
2543                 return alloc_pages_bulk_array_interleave(gfp, pol,
2544                                                          nr_pages, page_array);
2545
2546         if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
2547                 return alloc_pages_bulk_array_weighted_interleave(
2548                                   gfp, pol, nr_pages, page_array);
2549
2550         if (pol->mode == MPOL_PREFERRED_MANY)
2551                 return alloc_pages_bulk_array_preferred_many(gfp,
2552                                 numa_node_id(), pol, nr_pages, page_array);
2553
2554         nid = numa_node_id();
2555         nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
2556         return alloc_pages_bulk_noprof(gfp, nid, nodemask,
2557                                        nr_pages, NULL, page_array);
2558 }
2559
2560 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2561 {
2562         struct mempolicy *pol = mpol_dup(src->vm_policy);
2563
2564         if (IS_ERR(pol))
2565                 return PTR_ERR(pol);
2566         dst->vm_policy = pol;
2567         return 0;
2568 }
2569
/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset-relative after their cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 *
 * current's mempolicy may be rebound by another task (the task that changes
 * the cpuset's mems), so we needn't do rebind work for the current task.
 */
2580
2581 /* Slow path of a mempolicy duplicate */
2582 struct mempolicy *__mpol_dup(struct mempolicy *old)
2583 {
2584         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2585
2586         if (!new)
2587                 return ERR_PTR(-ENOMEM);
2588
2589         /* task's mempolicy is protected by alloc_lock */
2590         if (old == current->mempolicy) {
2591                 task_lock(current);
2592                 *new = *old;
2593                 task_unlock(current);
2594         } else
2595                 *new = *old;
2596
2597         if (current_cpuset_is_being_rebound()) {
2598                 nodemask_t mems = cpuset_mems_allowed(current);
2599                 mpol_rebind_policy(new, &mems);
2600         }
2601         atomic_set(&new->refcnt, 1);
2602         return new;
2603 }
2604
2605 /* Slow path of a mempolicy comparison */
2606 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2607 {
2608         if (!a || !b)
2609                 return false;
2610         if (a->mode != b->mode)
2611                 return false;
2612         if (a->flags != b->flags)
2613                 return false;
2614         if (a->home_node != b->home_node)
2615                 return false;
2616         if (mpol_store_user_nodemask(a))
2617                 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2618                         return false;
2619
2620         switch (a->mode) {
2621         case MPOL_BIND:
2622         case MPOL_INTERLEAVE:
2623         case MPOL_PREFERRED:
2624         case MPOL_PREFERRED_MANY:
2625         case MPOL_WEIGHTED_INTERLEAVE:
2626                 return !!nodes_equal(a->nodes, b->nodes);
2627         case MPOL_LOCAL:
2628                 return true;
2629         default:
2630                 BUG();
2631                 return false;
2632         }
2633 }
2634
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a red-black tree linked from the inode.
 * They are protected by the sp->lock rwlock, which should be held
 * for any accesses to the tree.
 */
2643
2644 /*
2645  * lookup first element intersecting start-end.  Caller holds sp->lock for
2646  * reading or for writing
2647  */
2648 static struct sp_node *sp_lookup(struct shared_policy *sp,
2649                                         pgoff_t start, pgoff_t end)
2650 {
2651         struct rb_node *n = sp->root.rb_node;
2652
2653         while (n) {
2654                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2655
2656                 if (start >= p->end)
2657                         n = n->rb_right;
2658                 else if (end <= p->start)
2659                         n = n->rb_left;
2660                 else
2661                         break;
2662         }
2663         if (!n)
2664                 return NULL;
2665         for (;;) {
2666                 struct sp_node *w = NULL;
2667                 struct rb_node *prev = rb_prev(n);
2668                 if (!prev)
2669                         break;
2670                 w = rb_entry(prev, struct sp_node, nd);
2671                 if (w->end <= start)
2672                         break;
2673                 n = prev;
2674         }
2675         return rb_entry(n, struct sp_node, nd);
2676 }
2677
2678 /*
2679  * Insert a new shared policy into the list.  Caller holds sp->lock for
2680  * writing.
2681  */
2682 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2683 {
2684         struct rb_node **p = &sp->root.rb_node;
2685         struct rb_node *parent = NULL;
2686         struct sp_node *nd;
2687
2688         while (*p) {
2689                 parent = *p;
2690                 nd = rb_entry(parent, struct sp_node, nd);
2691                 if (new->start < nd->start)
2692                         p = &(*p)->rb_left;
2693                 else if (new->end > nd->end)
2694                         p = &(*p)->rb_right;
2695                 else
2696                         BUG();
2697         }
2698         rb_link_node(&new->nd, parent, p);
2699         rb_insert_color(&new->nd, &sp->root);
2700 }
2701
2702 /* Find shared policy intersecting idx */
2703 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
2704                                                 pgoff_t idx)
2705 {
2706         struct mempolicy *pol = NULL;
2707         struct sp_node *sn;
2708
2709         if (!sp->root.rb_node)
2710                 return NULL;
2711         read_lock(&sp->lock);
2712         sn = sp_lookup(sp, idx, idx+1);
2713         if (sn) {
2714                 mpol_get(sn->policy);
2715                 pol = sn->policy;
2716         }
2717         read_unlock(&sp->lock);
2718         return pol;
2719 }
2720
2721 static void sp_free(struct sp_node *n)
2722 {
2723         mpol_put(n->policy);
2724         kmem_cache_free(sn_cache, n);
2725 }
2726
2727 /**
2728  * mpol_misplaced - check whether current folio node is valid in policy
2729  *
2730  * @folio: folio to be checked
2731  * @vmf: structure describing the fault
2732  * @addr: virtual address in @vma for shared policy lookup and interleave policy
2733  *
2734  * Lookup current policy node id for vma,addr and "compare to" folio's
2735  * node id.  Policy determination "mimics" alloc_page_vma().
2736  * Called from fault path where we know the vma and faulting address.
2737  *
2738  * Return: NUMA_NO_NODE if the page is in a node that is valid for this
2739  * policy, or a suitable node ID to allocate a replacement folio from.
2740  */
2741 int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
2742                    unsigned long addr)
2743 {
2744         struct mempolicy *pol;
2745         pgoff_t ilx;
2746         struct zoneref *z;
2747         int curnid = folio_nid(folio);
2748         struct vm_area_struct *vma = vmf->vma;
2749         int thiscpu = raw_smp_processor_id();
2750         int thisnid = numa_node_id();
2751         int polnid = NUMA_NO_NODE;
2752         int ret = NUMA_NO_NODE;
2753
2754         /*
2755          * Make sure ptl is held so that we don't preempt and we
2756          * have a stable smp processor id
2757          */
2758         lockdep_assert_held(vmf->ptl);
2759         pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
2760         if (!(pol->flags & MPOL_F_MOF))
2761                 goto out;
2762
2763         switch (pol->mode) {
2764         case MPOL_INTERLEAVE:
2765                 polnid = interleave_nid(pol, ilx);
2766                 break;
2767
2768         case MPOL_WEIGHTED_INTERLEAVE:
2769                 polnid = weighted_interleave_nid(pol, ilx);
2770                 break;
2771
2772         case MPOL_PREFERRED:
2773                 if (node_isset(curnid, pol->nodes))
2774                         goto out;
2775                 polnid = first_node(pol->nodes);
2776                 break;
2777
2778         case MPOL_LOCAL:
2779                 polnid = numa_node_id();
2780                 break;
2781
2782         case MPOL_BIND:
2783         case MPOL_PREFERRED_MANY:
2784                 /*
2785                  * Even though MPOL_PREFERRED_MANY can allocate pages outside
2786                  * policy nodemask we don't allow numa migration to nodes
2787                  * outside policy nodemask for now. This is done so that if we
2788                  * want demotion to slow memory to happen, before allocating
2789                  * from some DRAM node say 'x', we will end up using a
2790                  * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario
2791                  * we should not promote to node 'x' from slow memory node.
2792                  */
2793                 if (pol->flags & MPOL_F_MORON) {
2794                         /*
2795                          * Optimize placement among multiple nodes
2796                          * via NUMA balancing
2797                          */
2798                         if (node_isset(thisnid, pol->nodes))
2799                                 break;
2800                         goto out;
2801                 }
2802
2803                 /*
2804                  * use current page if in policy nodemask,
2805                  * else select nearest allowed node, if any.
2806                  * If no allowed nodes, use current [!misplaced].
2807                  */
2808                 if (node_isset(curnid, pol->nodes))
2809                         goto out;
2810                 z = first_zones_zonelist(
2811                                 node_zonelist(thisnid, GFP_HIGHUSER),
2812                                 gfp_zone(GFP_HIGHUSER),
2813                                 &pol->nodes);
2814                 polnid = zonelist_node_idx(z);
2815                 break;
2816
2817         default:
2818                 BUG();
2819         }
2820
2821         /* Migrate the folio towards the node whose CPU is referencing it */
2822         if (pol->flags & MPOL_F_MORON) {
2823                 polnid = thisnid;
2824
2825                 if (!should_numa_migrate_memory(current, folio, curnid,
2826                                                 thiscpu))
2827                         goto out;
2828         }
2829
2830         if (curnid != polnid)
2831                 ret = polnid;
2832 out:
2833         mpol_cond_put(pol);
2834
2835         return ret;
2836 }
2837
2838 /*
2839  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2840  * dropped after task->mempolicy is set to NULL so that any allocation done as
2841  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2842  * policy.
2843  */
2844 void mpol_put_task_policy(struct task_struct *task)
2845 {
2846         struct mempolicy *pol;
2847
2848         task_lock(task);
2849         pol = task->mempolicy;
2850         task->mempolicy = NULL;
2851         task_unlock(task);
2852         mpol_put(pol);
2853 }
2854
2855 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2856 {
2857         rb_erase(&n->nd, &sp->root);
2858         sp_free(n);
2859 }
2860
2861 static void sp_node_init(struct sp_node *node, unsigned long start,
2862                         unsigned long end, struct mempolicy *pol)
2863 {
2864         node->start = start;
2865         node->end = end;
2866         node->policy = pol;
2867 }
2868
2869 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2870                                 struct mempolicy *pol)
2871 {
2872         struct sp_node *n;
2873         struct mempolicy *newpol;
2874
2875         n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2876         if (!n)
2877                 return NULL;
2878
2879         newpol = mpol_dup(pol);
2880         if (IS_ERR(newpol)) {
2881                 kmem_cache_free(sn_cache, n);
2882                 return NULL;
2883         }
2884         newpol->flags |= MPOL_F_SHARED;
2885         sp_node_init(n, start, end, newpol);
2886
2887         return n;
2888 }
2889
2890 /* Replace a policy range. */
2891 static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
2892                                  pgoff_t end, struct sp_node *new)
2893 {
2894         struct sp_node *n;
2895         struct sp_node *n_new = NULL;
2896         struct mempolicy *mpol_new = NULL;
2897         int ret = 0;
2898
2899 restart:
2900         write_lock(&sp->lock);
2901         n = sp_lookup(sp, start, end);
2902         /* Take care of old policies in the same range. */
2903         while (n && n->start < end) {
2904                 struct rb_node *next = rb_next(&n->nd);
2905                 if (n->start >= start) {
2906                         if (n->end <= end)
2907                                 sp_delete(sp, n);
2908                         else
2909                                 n->start = end;
2910                 } else {
2911                         /* Old policy spanning whole new range. */
2912                         if (n->end > end) {
2913                                 if (!n_new)
2914                                         goto alloc_new;
2915
2916                                 *mpol_new = *n->policy;
2917                                 atomic_set(&mpol_new->refcnt, 1);
2918                                 sp_node_init(n_new, end, n->end, mpol_new);
2919                                 n->end = start;
2920                                 sp_insert(sp, n_new);
2921                                 n_new = NULL;
2922                                 mpol_new = NULL;
2923                                 break;
2924                         } else
2925                                 n->end = start;
2926                 }
2927                 if (!next)
2928                         break;
2929                 n = rb_entry(next, struct sp_node, nd);
2930         }
2931         if (new)
2932                 sp_insert(sp, new);
2933         write_unlock(&sp->lock);
2934         ret = 0;
2935
2936 err_out:
2937         if (mpol_new)
2938                 mpol_put(mpol_new);
2939         if (n_new)
2940                 kmem_cache_free(sn_cache, n_new);
2941
2942         return ret;
2943
2944 alloc_new:
2945         write_unlock(&sp->lock);
2946         ret = -ENOMEM;
2947         n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2948         if (!n_new)
2949                 goto err_out;
2950         mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2951         if (!mpol_new)
2952                 goto err_out;
2953         atomic_set(&mpol_new->refcnt, 1);
2954         goto restart;
2955 }
2956
2957 /**
2958  * mpol_shared_policy_init - initialize shared policy for inode
2959  * @sp: pointer to inode shared policy
2960  * @mpol:  struct mempolicy to install
2961  *
2962  * Install non-NULL @mpol in inode's shared policy rb-tree.
2963  * On entry, the current task has a reference on a non-NULL @mpol.
2964  * This must be released on exit.
2965  * This is called at get_inode() calls and we can use GFP_KERNEL.
2966  */
2967 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2968 {
2969         int ret;
2970
2971         sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2972         rwlock_init(&sp->lock);
2973
2974         if (mpol) {
2975                 struct sp_node *sn;
2976                 struct mempolicy *npol;
2977                 NODEMASK_SCRATCH(scratch);
2978
2979                 if (!scratch)
2980                         goto put_mpol;
2981
2982                 /* contextualize the tmpfs mount point mempolicy to this file */
2983                 npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2984                 if (IS_ERR(npol))
2985                         goto free_scratch; /* no valid nodemask intersection */
2986
2987                 task_lock(current);
2988                 ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch);
2989                 task_unlock(current);
2990                 if (ret)
2991                         goto put_npol;
2992
2993                 /* alloc node covering entire file; adds ref to file's npol */
2994                 sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol);
2995                 if (sn)
2996                         sp_insert(sp, sn);
2997 put_npol:
2998                 mpol_put(npol); /* drop initial ref on file's npol */
2999 free_scratch:
3000                 NODEMASK_SCRATCH_FREE(scratch);
3001 put_mpol:
3002                 mpol_put(mpol); /* drop our incoming ref on sb mpol */
3003         }
3004 }
3005
3006 int mpol_set_shared_policy(struct shared_policy *sp,
3007                         struct vm_area_struct *vma, struct mempolicy *pol)
3008 {
3009         int err;
3010         struct sp_node *new = NULL;
3011         unsigned long sz = vma_pages(vma);
3012
3013         if (pol) {
3014                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol);
3015                 if (!new)
3016                         return -ENOMEM;
3017         }
3018         err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new);
3019         if (err && new)
3020                 sp_free(new);
3021         return err;
3022 }
3023
3024 /* Free a backing policy store on inode delete. */
3025 void mpol_free_shared_policy(struct shared_policy *sp)
3026 {
3027         struct sp_node *n;
3028         struct rb_node *next;
3029
3030         if (!sp->root.rb_node)
3031                 return;
3032         write_lock(&sp->lock);
3033         next = rb_first(&sp->root);
3034         while (next) {
3035                 n = rb_entry(next, struct sp_node, nd);
3036                 next = rb_next(&n->nd);
3037                 sp_delete(sp, n);
3038         }
3039         write_unlock(&sp->lock);
3040 }
3041
3042 #ifdef CONFIG_NUMA_BALANCING
3043 static int __initdata numabalancing_override;
3044
3045 static void __init check_numabalancing_enable(void)
3046 {
3047         bool numabalancing_default = false;
3048
3049         if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
3050                 numabalancing_default = true;
3051
3052         /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
3053         if (numabalancing_override)
3054                 set_numabalancing_state(numabalancing_override == 1);
3055
3056         if (num_online_nodes() > 1 && !numabalancing_override) {
3057                 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
3058                         numabalancing_default ? "Enabling" : "Disabling");
3059                 set_numabalancing_state(numabalancing_default);
3060         }
3061 }
3062
3063 static int __init setup_numabalancing(char *str)
3064 {
3065         int ret = 0;
3066         if (!str)
3067                 goto out;
3068
3069         if (!strcmp(str, "enable")) {
3070                 numabalancing_override = 1;
3071                 ret = 1;
3072         } else if (!strcmp(str, "disable")) {
3073                 numabalancing_override = -1;
3074                 ret = 1;
3075         }
3076 out:
3077         if (!ret)
3078                 pr_warn("Unable to parse numa_balancing=\n");
3079
3080         return ret;
3081 }
3082 __setup("numa_balancing=", setup_numabalancing);
3083 #else
3084 static inline void __init check_numabalancing_enable(void)
3085 {
3086 }
3087 #endif /* CONFIG_NUMA_BALANCING */
3088
3089 void __init numa_policy_init(void)
3090 {
3091         nodemask_t interleave_nodes;
3092         unsigned long largest = 0;
3093         int nid, prefer = 0;
3094
3095         policy_cache = kmem_cache_create("numa_policy",
3096                                          sizeof(struct mempolicy),
3097                                          0, SLAB_PANIC, NULL);
3098
3099         sn_cache = kmem_cache_create("shared_policy_node",
3100                                      sizeof(struct sp_node),
3101                                      0, SLAB_PANIC, NULL);
3102
3103         for_each_node(nid) {
3104                 preferred_node_policy[nid] = (struct mempolicy) {
3105                         .refcnt = ATOMIC_INIT(1),
3106                         .mode = MPOL_PREFERRED,
3107                         .flags = MPOL_F_MOF | MPOL_F_MORON,
3108                         .nodes = nodemask_of_node(nid),
3109                 };
3110         }
3111
3112         /*
3113          * Set interleaving policy for system init. Interleaving is only
3114          * enabled across suitably sized nodes (default is >= 16MB), or
3115          * fall back to the largest node if they're all smaller.
3116          */
3117         nodes_clear(interleave_nodes);
3118         for_each_node_state(nid, N_MEMORY) {
3119                 unsigned long total_pages = node_present_pages(nid);
3120
3121                 /* Preserve the largest node */
3122                 if (largest < total_pages) {
3123                         largest = total_pages;
3124                         prefer = nid;
3125                 }
3126
3127                 /* Interleave this node? */
3128                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
3129                         node_set(nid, interleave_nodes);
3130         }
3131
3132         /* All too small, use the largest */
3133         if (unlikely(nodes_empty(interleave_nodes)))
3134                 node_set(prefer, interleave_nodes);
3135
3136         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
3137                 pr_err("%s: interleaving failed\n", __func__);
3138
3139         check_numabalancing_enable();
3140 }
3141
3142 /* Reset policy of current process to default */
3143 void numa_default_policy(void)
3144 {
3145         do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
3146 }
3147
3148 /*
3149  * Parse and format mempolicy from/to strings
3150  */
3151 static const char * const policy_modes[] =
3152 {
3153         [MPOL_DEFAULT]    = "default",
3154         [MPOL_PREFERRED]  = "prefer",
3155         [MPOL_BIND]       = "bind",
3156         [MPOL_INTERLEAVE] = "interleave",
3157         [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
3158         [MPOL_LOCAL]      = "local",
3159         [MPOL_PREFERRED_MANY]  = "prefer (many)",
3160 };
3161
3162 #ifdef CONFIG_TMPFS
3163 /**
3164  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
3165  * @str:  string containing mempolicy to parse
3166  * @mpol:  pointer to struct mempolicy pointer, returned on success.
3167  *
3168  * Format of input:
3169  *      <mode>[=<flags>][:<nodelist>]
3170  *
3171  * Return: %0 on success, else %1
3172  */
3173 int mpol_parse_str(char *str, struct mempolicy **mpol)
3174 {
3175         struct mempolicy *new = NULL;
3176         unsigned short mode_flags;
3177         nodemask_t nodes;
3178         char *nodelist = strchr(str, ':');
3179         char *flags = strchr(str, '=');
3180         int err = 1, mode;
3181
3182         if (flags)
3183                 *flags++ = '\0';        /* terminate mode string */
3184
3185         if (nodelist) {
3186                 /* NUL-terminate mode or flags string */
3187                 *nodelist++ = '\0';
3188                 if (nodelist_parse(nodelist, nodes))
3189                         goto out;
3190                 if (!nodes_subset(nodes, node_states[N_MEMORY]))
3191                         goto out;
3192         } else
3193                 nodes_clear(nodes);
3194
3195         mode = match_string(policy_modes, MPOL_MAX, str);
3196         if (mode < 0)
3197                 goto out;
3198
3199         switch (mode) {
3200         case MPOL_PREFERRED:
3201                 /*
3202                  * Insist on a nodelist of one node only, although later
3203                  * we use first_node(nodes) to grab a single node, so here
3204                  * nodelist (or nodes) cannot be empty.
3205                  */
3206                 if (nodelist) {
3207                         char *rest = nodelist;
3208                         while (isdigit(*rest))
3209                                 rest++;
3210                         if (*rest)
3211                                 goto out;
3212                         if (nodes_empty(nodes))
3213                                 goto out;
3214                 }
3215                 break;
3216         case MPOL_INTERLEAVE:
3217         case MPOL_WEIGHTED_INTERLEAVE:
3218                 /*
3219                  * Default to online nodes with memory if no nodelist
3220                  */
3221                 if (!nodelist)
3222                         nodes = node_states[N_MEMORY];
3223                 break;
3224         case MPOL_LOCAL:
3225                 /*
3226                  * Don't allow a nodelist;  mpol_new() checks flags
3227                  */
3228                 if (nodelist)
3229                         goto out;
3230                 break;
3231         case MPOL_DEFAULT:
3232                 /*
3233                  * Insist on a empty nodelist
3234                  */
3235                 if (!nodelist)
3236                         err = 0;
3237                 goto out;
3238         case MPOL_PREFERRED_MANY:
3239         case MPOL_BIND:
3240                 /*
3241                  * Insist on a nodelist
3242                  */
3243                 if (!nodelist)
3244                         goto out;
3245         }
3246
3247         mode_flags = 0;
3248         if (flags) {
3249                 /*
3250                  * Currently, we only support two mutually exclusive
3251                  * mode flags.
3252                  */
3253                 if (!strcmp(flags, "static"))
3254                         mode_flags |= MPOL_F_STATIC_NODES;
3255                 else if (!strcmp(flags, "relative"))
3256                         mode_flags |= MPOL_F_RELATIVE_NODES;
3257                 else
3258                         goto out;
3259         }
3260
3261         new = mpol_new(mode, mode_flags, &nodes);
3262         if (IS_ERR(new))
3263                 goto out;
3264
3265         /*
3266          * Save nodes for mpol_to_str() to show the tmpfs mount options
3267          * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
3268          */
3269         if (mode != MPOL_PREFERRED) {
3270                 new->nodes = nodes;
3271         } else if (nodelist) {
3272                 nodes_clear(new->nodes);
3273                 node_set(first_node(nodes), new->nodes);
3274         } else {
3275                 new->mode = MPOL_LOCAL;
3276         }
3277
3278         /*
3279          * Save nodes for contextualization: this will be used to "clone"
3280          * the mempolicy in a specific context [cpuset] at a later time.
3281          */
3282         new->w.user_nodemask = nodes;
3283
3284         err = 0;
3285
3286 out:
3287         /* Restore string for error message */
3288         if (nodelist)
3289                 *--nodelist = ':';
3290         if (flags)
3291                 *--flags = '=';
3292         if (!err)
3293                 *mpol = new;
3294         return err;
3295 }
3296 #endif /* CONFIG_TMPFS */
3297
3298 /**
3299  * mpol_to_str - format a mempolicy structure for printing
3300  * @buffer:  to contain formatted mempolicy string
3301  * @maxlen:  length of @buffer
3302  * @pol:  pointer to mempolicy to be formatted
3303  *
3304  * Convert @pol into a string.  If @buffer is too short, truncate the string.
3305  * Recommend a @maxlen of at least 51 for the longest mode, "weighted
3306  * interleave", plus the longest flag flags, "relative|balancing", and to
3307  * display at least a few node ids.
3308  */
3309 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
3310 {
3311         char *p = buffer;
3312         nodemask_t nodes = NODE_MASK_NONE;
3313         unsigned short mode = MPOL_DEFAULT;
3314         unsigned short flags = 0;
3315
3316         if (pol &&
3317             pol != &default_policy &&
3318             !(pol >= &preferred_node_policy[0] &&
3319               pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) {
3320                 mode = pol->mode;
3321                 flags = pol->flags;
3322         }
3323
3324         switch (mode) {
3325         case MPOL_DEFAULT:
3326         case MPOL_LOCAL:
3327                 break;
3328         case MPOL_PREFERRED:
3329         case MPOL_PREFERRED_MANY:
3330         case MPOL_BIND:
3331         case MPOL_INTERLEAVE:
3332         case MPOL_WEIGHTED_INTERLEAVE:
3333                 nodes = pol->nodes;
3334                 break;
3335         default:
3336                 WARN_ON_ONCE(1);
3337                 snprintf(p, maxlen, "unknown");
3338                 return;
3339         }
3340
3341         p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3342
3343         if (flags & MPOL_MODE_FLAGS) {
3344                 p += snprintf(p, buffer + maxlen - p, "=");
3345
3346                 /*
3347                  * Static and relative are mutually exclusive.
3348                  */
3349                 if (flags & MPOL_F_STATIC_NODES)
3350                         p += snprintf(p, buffer + maxlen - p, "static");
3351                 else if (flags & MPOL_F_RELATIVE_NODES)
3352                         p += snprintf(p, buffer + maxlen - p, "relative");
3353
3354                 if (flags & MPOL_F_NUMA_BALANCING) {
3355                         if (!is_power_of_2(flags & MPOL_MODE_FLAGS))
3356                                 p += snprintf(p, buffer + maxlen - p, "|");
3357                         p += snprintf(p, buffer + maxlen - p, "balancing");
3358                 }
3359         }
3360
3361         if (!nodes_empty(nodes))
3362                 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3363                                nodemask_pr_args(&nodes));
3364 }
3365
3366 #ifdef CONFIG_SYSFS
/*
 * Sysfs attribute for one node's weighted-interleave weight file.
 * Pairs the kobj_attribute registered with sysfs with the node id it
 * refers to; the show/store handlers recover this wrapper from the
 * embedded @kobj_attr with container_of().
 */
struct iw_node_attr {
	struct kobj_attribute kobj_attr;
	int nid;
};
3371
3372 static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
3373                          char *buf)
3374 {
3375         struct iw_node_attr *node_attr;
3376         u8 weight;
3377
3378         node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3379         weight = get_il_weight(node_attr->nid);
3380         return sysfs_emit(buf, "%d\n", weight);
3381 }
3382
3383 static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
3384                           const char *buf, size_t count)
3385 {
3386         struct iw_node_attr *node_attr;
3387         u8 *new;
3388         u8 *old;
3389         u8 weight = 0;
3390
3391         node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3392         if (count == 0 || sysfs_streq(buf, ""))
3393                 weight = 0;
3394         else if (kstrtou8(buf, 0, &weight))
3395                 return -EINVAL;
3396
3397         new = kzalloc(nr_node_ids, GFP_KERNEL);
3398         if (!new)
3399                 return -ENOMEM;
3400
3401         mutex_lock(&iw_table_lock);
3402         old = rcu_dereference_protected(iw_table,
3403                                         lockdep_is_held(&iw_table_lock));
3404         if (old)
3405                 memcpy(new, old, nr_node_ids);
3406         new[node_attr->nid] = weight;
3407         rcu_assign_pointer(iw_table, new);
3408         mutex_unlock(&iw_table_lock);
3409         synchronize_rcu();
3410         kfree(old);
3411         return count;
3412 }
3413
/*
 * Table of per-node sysfs attributes, indexed by node id.  Allocated with
 * nr_node_ids slots in mempolicy_sysfs_init(), filled in by
 * add_weight_node(), and freed in mempolicy_kobj_release().
 */
static struct iw_node_attr **node_attrs;
3415
3416 static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
3417                                   struct kobject *parent)
3418 {
3419         if (!node_attr)
3420                 return;
3421         sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
3422         kfree(node_attr->kobj_attr.attr.name);
3423         kfree(node_attr);
3424 }
3425
3426 static void sysfs_wi_release(struct kobject *wi_kobj)
3427 {
3428         int i;
3429
3430         for (i = 0; i < nr_node_ids; i++)
3431                 sysfs_wi_node_release(node_attrs[i], wi_kobj);
3432         kobject_put(wi_kobj);
3433 }
3434
/*
 * ktype of the weighted_interleave kobject: attribute access goes through
 * kobj_sysfs_ops, and sysfs_wi_release() is the release callback.
 */
static const struct kobj_type wi_ktype = {
	.sysfs_ops = &kobj_sysfs_ops,
	.release = sysfs_wi_release,
};
3439
3440 static int add_weight_node(int nid, struct kobject *wi_kobj)
3441 {
3442         struct iw_node_attr *node_attr;
3443         char *name;
3444
3445         node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL);
3446         if (!node_attr)
3447                 return -ENOMEM;
3448
3449         name = kasprintf(GFP_KERNEL, "node%d", nid);
3450         if (!name) {
3451                 kfree(node_attr);
3452                 return -ENOMEM;
3453         }
3454
3455         sysfs_attr_init(&node_attr->kobj_attr.attr);
3456         node_attr->kobj_attr.attr.name = name;
3457         node_attr->kobj_attr.attr.mode = 0644;
3458         node_attr->kobj_attr.show = node_show;
3459         node_attr->kobj_attr.store = node_store;
3460         node_attr->nid = nid;
3461
3462         if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) {
3463                 kfree(node_attr->kobj_attr.attr.name);
3464                 kfree(node_attr);
3465                 pr_err("failed to add attribute to weighted_interleave\n");
3466                 return -ENOMEM;
3467         }
3468
3469         node_attrs[nid] = node_attr;
3470         return 0;
3471 }
3472
3473 static int add_weighted_interleave_group(struct kobject *root_kobj)
3474 {
3475         struct kobject *wi_kobj;
3476         int nid, err;
3477
3478         wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
3479         if (!wi_kobj)
3480                 return -ENOMEM;
3481
3482         err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj,
3483                                    "weighted_interleave");
3484         if (err) {
3485                 kfree(wi_kobj);
3486                 return err;
3487         }
3488
3489         for_each_node_state(nid, N_POSSIBLE) {
3490                 err = add_weight_node(nid, wi_kobj);
3491                 if (err) {
3492                         pr_err("failed to add sysfs [node%d]\n", nid);
3493                         break;
3494                 }
3495         }
3496         if (err)
3497                 kobject_put(wi_kobj);
3498         return 0;
3499 }
3500
3501 static void mempolicy_kobj_release(struct kobject *kobj)
3502 {
3503         u8 *old;
3504
3505         mutex_lock(&iw_table_lock);
3506         old = rcu_dereference_protected(iw_table,
3507                                         lockdep_is_held(&iw_table_lock));
3508         rcu_assign_pointer(iw_table, NULL);
3509         mutex_unlock(&iw_table_lock);
3510         synchronize_rcu();
3511         kfree(old);
3512         kfree(node_attrs);
3513         kfree(kobj);
3514 }
3515
/*
 * ktype of the mempolicy kobject; only a release callback is provided.
 */
static const struct kobj_type mempolicy_ktype = {
	.release = mempolicy_kobj_release
};
3519
3520 static int __init mempolicy_sysfs_init(void)
3521 {
3522         int err;
3523         static struct kobject *mempolicy_kobj;
3524
3525         mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL);
3526         if (!mempolicy_kobj) {
3527                 err = -ENOMEM;
3528                 goto err_out;
3529         }
3530
3531         node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *),
3532                              GFP_KERNEL);
3533         if (!node_attrs) {
3534                 err = -ENOMEM;
3535                 goto mempol_out;
3536         }
3537
3538         err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj,
3539                                    "mempolicy");
3540         if (err)
3541                 goto node_out;
3542
3543         err = add_weighted_interleave_group(mempolicy_kobj);
3544         if (err) {
3545                 pr_err("mempolicy sysfs structure failed to initialize\n");
3546                 kobject_put(mempolicy_kobj);
3547                 return err;
3548         }
3549
3550         return err;
3551 node_out:
3552         kfree(node_attrs);
3553 mempol_out:
3554         kfree(mempolicy_kobj);
3555 err_out:
3556         pr_err("failed to add mempolicy kobject to the system\n");
3557         return err;
3558 }
3559
3560 late_initcall(mempolicy_sysfs_init);
3561 #endif /* CONFIG_SYSFS */