mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case NUMA_NO_NODE here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem, kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66 */
  67
  68 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  69
  70 #include <linux/mempolicy.h>
  71 #include <linux/mm.h>
  72 #include <linux/highmem.h>
  73 #include <linux/hugetlb.h>
  74 #include <linux/kernel.h>
  75 #include <linux/sched.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/slab.h>
  79 #include <linux/string.h>
  80 #include <linux/export.h>
  81 #include <linux/nsproxy.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/swap.h>
  86 #include <linux/seq_file.h>
  87 #include <linux/proc_fs.h>
  88 #include <linux/migrate.h>
  89 #include <linux/ksm.h>
  90 #include <linux/rmap.h>
  91 #include <linux/security.h>
  92 #include <linux/syscalls.h>
  93 #include <linux/ctype.h>
  94 #include <linux/mm_inline.h>
  95 #include <linux/mmu_notifier.h>
  96 #include <linux/printk.h>
  97
  98 #include <asm/tlbflush.h>
  99 #include <asm/uaccess.h>
 100 #include <linux/random.h>
 101
 102 #include "internal.h"
 103
 104 /* Internal flags */
 105 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for contiguous vmas */
 106 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 107
     /*
      * policy_cache backs struct mempolicy objects: mpol_new() allocates from
      * it and __mpol_put() frees back into it.
      * NOTE(review): sn_cache is not referenced in the visible part of this
      * file; its users live further down - confirm there.
      */
 108 static struct kmem_cache *policy_cache;
 109 static struct kmem_cache *sn_cache;
 110
 111 /* Highest zone. A specific allocation for a zone below that is not
 112    policied. */
 113 enum zone_type policy_zone = 0;
 114
 115 /*
 116  * run-time system-wide default policy => local allocation
 117  */
 118 static struct mempolicy default_policy = {
 119         .refcnt = ATOMIC_INIT(1), /* never free it */
 120         .mode = MPOL_PREFERRED,
 121         .flags = MPOL_F_LOCAL,
 122 };
 123
     /*
      * Per-node fallback policies, indexed by node id.  get_task_policy()
      * hands out the entry for the current node when a task has no policy of
      * its own; an entry whose ->mode is still zero is treated as not yet
      * initialised.
      */
 124 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 125
 126 static struct mempolicy *get_task_policy(struct task_struct *p)
 127 {
 128         struct mempolicy *pol = p->mempolicy;
 129
 130         if (!pol) {
 131                 int node = numa_node_id();
 132
 133                 if (node != NUMA_NO_NODE) {
 134                         pol = &preferred_node_policy[node];
 135                         /*
 136                          * preferred_node_policy is not initialised early in
 137                          * boot
 138                          */
 139                         if (!pol->mode)
 140                                 pol = NULL;
 141                 }
 142         }
 143
 144         return pol;
 145 }
 146
     /*
      * Per-mode operations, indexed by policy mode.  This is the forward
      * (tentative) declaration of the mpol_ops[] table; the initialiser
      * appears later in the file.
      */
 147 static const struct mempolicy_operations {
             /*
              * Install @nodes into a freshly allocated @pol.  Called from
              * mpol_set_nodemask(), which passes NULL for @nodes to request
              * explicit local allocation.  Returns 0 or a negative errno.
              */
 148         int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 149         /*
 150          * If the read-side task has no lock to protect task->mempolicy, the
 151          * write-side task rebinds task->mempolicy in two steps. The first
 152          * step sets all the new nodes, and the second step clears all the
 153          * disallowed nodes. This way a reader never finds the policy with no
 154          * node to allocate a page from.
 155          * If the read side does hold a lock protecting task->mempolicy, the
 156          * rebind is done directly.
 157          *
 158          * step:
 159          *      MPOL_REBIND_ONCE - do rebind work at once
 160          *      MPOL_REBIND_STEP1 - set all the new nodes
 161          *      MPOL_REBIND_STEP2 - clear all the disallowed nodes
 162          */
 163         void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
 164                         enum mpol_rebind_step step);
 165 } mpol_ops[MPOL_MAX];
 166
 167 /* Check that the nodemask contains at least one populated zone */
 168 static int is_valid_nodemask(const nodemask_t *nodemask)
 169 {
 170         return nodes_intersects(*nodemask, node_states[N_MEMORY]);
 171 }
 172
 173 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 174 {
 175         return pol->flags & MPOL_MODE_FLAGS;
 176 }
 177
 178 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 179                                    const nodemask_t *rel)
 180 {
 181         nodemask_t tmp;
 182         nodes_fold(tmp, *orig, nodes_weight(*rel));
 183         nodes_onto(*ret, tmp, *rel);
 184 }
 185
 186 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 187 {
 188         if (nodes_empty(*nodes))
 189                 return -EINVAL;
 190         pol->v.nodes = *nodes;
 191         return 0;
 192 }
 193
 194 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 195 {
 196         if (!nodes)
 197                 pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 198         else if (nodes_empty(*nodes))
 199                 return -EINVAL;                 /*  no allowed nodes */
 200         else
 201                 pol->v.preferred_node = first_node(*nodes);
 202         return 0;
 203 }
 204
 205 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 206 {
 207         if (!is_valid_nodemask(nodes))
 208                 return -EINVAL;
 209         pol->v.nodes = *nodes;
 210         return 0;
 211 }
 212
 213 /*
 214  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 215  * any, for the new policy.  mpol_new() has already validated the nodes
 216  * parameter with respect to the policy mode and flags.  But, we need to
 217  * handle an empty nodemask with MPOL_PREFERRED here.
 218  *
 219  * Must be called holding task's alloc_lock to protect task's mems_allowed
 220  * and mempolicy.  May also be called holding the mmap_semaphore for write.
 221  */
 222 static int mpol_set_nodemask(struct mempolicy *pol,
 223                      const nodemask_t *nodes, struct nodemask_scratch *nsc)
 224 {
 225         int ret;
 226
 227         /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 228         if (pol == NULL)
 229                 return 0;
 230         /* Check N_MEMORY */
 231         nodes_and(nsc->mask1,
 232                   cpuset_current_mems_allowed, node_states[N_MEMORY]);
 233
 234         VM_BUG_ON(!nodes);
 235         if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 236                 nodes = NULL;   /* explicit local allocation */
 237         else {
 238                 if (pol->flags & MPOL_F_RELATIVE_NODES)
 239                         mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
 240                 else
 241                         nodes_and(nsc->mask2, *nodes, nsc->mask1);
 242
 243                 if (mpol_store_user_nodemask(pol))
 244                         pol->w.user_nodemask = *nodes;
 245                 else
 246                         pol->w.cpuset_mems_allowed =
 247                                                 cpuset_current_mems_allowed;
 248         }
 249
 250         if (nodes)
 251                 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 252         else
 253                 ret = mpol_ops[pol->mode].create(pol, NULL);
 254         return ret;
 255 }
 256
 257 /*
 258  * This function just creates a new policy, does some check and simple
 259  * initialization. You must invoke mpol_set_nodemask() to set nodes.
 260  */
 261 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 262                                   nodemask_t *nodes)
 263 {
 264         struct mempolicy *policy;
 265
 266         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 267                  mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
 268
 269         if (mode == MPOL_DEFAULT) {
 270                 if (nodes && !nodes_empty(*nodes))
 271                         return ERR_PTR(-EINVAL);
 272                 return NULL;
 273         }
 274         VM_BUG_ON(!nodes);
 275
 276         /*
 277          * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 278          * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 279          * All other modes require a valid pointer to a non-empty nodemask.
 280          */
 281         if (mode == MPOL_PREFERRED) {
 282                 if (nodes_empty(*nodes)) {
 283                         if (((flags & MPOL_F_STATIC_NODES) ||
 284                              (flags & MPOL_F_RELATIVE_NODES)))
 285                                 return ERR_PTR(-EINVAL);
 286                 }
 287         } else if (mode == MPOL_LOCAL) {
 288                 if (!nodes_empty(*nodes))
 289                         return ERR_PTR(-EINVAL);
 290                 mode = MPOL_PREFERRED;
 291         } else if (nodes_empty(*nodes))
 292                 return ERR_PTR(-EINVAL);
 293         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 294         if (!policy)
 295                 return ERR_PTR(-ENOMEM);
 296         atomic_set(&policy->refcnt, 1);
 297         policy->mode = mode;
 298         policy->flags = flags;
 299
 300         return policy;
 301 }
 302
 303 /* Slow path of a mpol destructor. */
 304 void __mpol_put(struct mempolicy *p)
 305 {
 306         if (!atomic_dec_and_test(&p->refcnt))
 307                 return;
 308         kmem_cache_free(policy_cache, p);
 309 }
 310
     /*
      * ->rebind() hook installed for MPOL_DEFAULT in mpol_ops[]: deliberately
      * does nothing, all arguments are ignored.
      */
 311 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
 312                                 enum mpol_rebind_step step)
 313 {
 314 }
 315
 316 /*
      * ->rebind() hook for the modes that keep a nodemask in ->v.nodes
      * (installed for MPOL_INTERLEAVE and MPOL_BIND in mpol_ops[]).
      *
      * Computes the new mask from @nodes according to the policy's flags,
      * stores it in pol->v.nodes and then makes sure current->il_next lies
      * inside that mask.
      *
 317  * step:
 318  *      MPOL_REBIND_ONCE  - do rebind work at once
 319  *      MPOL_REBIND_STEP1 - set all the new nodes
 320  *      MPOL_REBIND_STEP2 - clear all the disallowed nodes
      *
      * Any other step value hits BUG().
 321  */
 322 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
 323                                  enum mpol_rebind_step step)
 324 {
 325         nodemask_t tmp;
 326
             /* static nodes: keep only the user's nodes that @nodes still allows */
 327         if (pol->flags & MPOL_F_STATIC_NODES)
 328                 nodes_and(tmp, pol->w.user_nodemask, *nodes);
             /* relative nodes: re-map the user's mask onto @nodes */
 329         else if (pol->flags & MPOL_F_RELATIVE_NODES)
 330                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 331         else {
 332                 /*
 333                  * if step == 1, we use ->w.cpuset_mems_allowed to cache the
 334                  * result
 335                  */
 336                 if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
 337                         nodes_remap(tmp, pol->v.nodes,
 338                                         pol->w.cpuset_mems_allowed, *nodes);
                             /*
                              * A non-zero step stores the remapped mask, a zero
                              * step stores *nodes.
                              * NOTE(review): this relies on MPOL_REBIND_ONCE
                              * being 0 - confirm against the enum definition.
                              */
 339                         pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
 340                 } else if (step == MPOL_REBIND_STEP2) {
                             /* pick up the mask cached by STEP1, then record *nodes */
 341                         tmp = pol->w.cpuset_mems_allowed;
 342                         pol->w.cpuset_mems_allowed = *nodes;
 343                 } else
 344                         BUG();
 345         }
 346
             /* never leave the policy with an empty mask: fall back to @nodes */
 347         if (nodes_empty(tmp))
 348                 tmp = *nodes;
 349
             /* STEP1 only adds nodes; ONCE and STEP2 install the final mask */
 350         if (step == MPOL_REBIND_STEP1)
 351                 nodes_or(pol->v.nodes, pol->v.nodes, tmp);
 352         else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
 353                 pol->v.nodes = tmp;
 354         else
 355                 BUG();
 356
             /*
              * Move the interleave cursor into the new mask: next node after the
              * cursor, else the first node of the mask, else the local node.
              * NOTE(review): this updates current->il_next, not a field of the
              * task owning @pol - confirm that is intended for every caller.
              */
 357         if (!node_isset(current->il_next, tmp)) {
 358                 current->il_next = next_node(current->il_next, tmp);
 359                 if (current->il_next >= MAX_NUMNODES)
 360                         current->il_next = first_node(tmp);
 361                 if (current->il_next >= MAX_NUMNODES)
 362                         current->il_next = numa_node_id();
 363         }
 364 }
 365
 366 static void mpol_rebind_preferred(struct mempolicy *pol,
 367                                   const nodemask_t *nodes,
 368                                   enum mpol_rebind_step step)
 369 {
 370         nodemask_t tmp;
 371
 372         if (pol->flags & MPOL_F_STATIC_NODES) {
 373                 int node = first_node(pol->w.user_nodemask);
 374
 375                 if (node_isset(node, *nodes)) {
 376                         pol->v.preferred_node = node;
 377                         pol->flags &= ~MPOL_F_LOCAL;
 378                 } else
 379                         pol->flags |= MPOL_F_LOCAL;
 380         } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 381                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 382                 pol->v.preferred_node = first_node(tmp);
 383         } else if (!(pol->flags & MPOL_F_LOCAL)) {
 384                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
 385                                                    pol->w.cpuset_mems_allowed,
 386                                                    *nodes);
 387                 pol->w.cpuset_mems_allowed = *nodes;
 388         }
 389 }
 390
 391 /*
 392  * mpol_rebind_policy - Migrate a policy to a different set of nodes
 393  *
 394  * If read-side task has no lock to protect task->mempolicy, write-side
 395  * task will rebind the task->mempolicy by two step. The first step is
 396  * setting all the newly nodes, and the second step is cleaning all the
 397  * disallowed nodes. In this way, we can avoid finding no node to alloc
 398  * page.
 399  * If we have a lock to protect task->mempolicy in read-side, we do
 400  * rebind directly.
 401  *
 402  * step:
 403  *      MPOL_REBIND_ONCE  - do rebind work at once
 404  *      MPOL_REBIND_STEP1 - set all the newly nodes
 405  *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 406  */
 407 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
 408                                 enum mpol_rebind_step step)
 409 {
 410         if (!pol)
 411                 return;
 412         if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
 413             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 414                 return;
 415
 416         if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
 417                 return;
 418
 419         if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
 420                 BUG();
 421
 422         if (step == MPOL_REBIND_STEP1)
 423                 pol->flags |= MPOL_F_REBINDING;
 424         else if (step == MPOL_REBIND_STEP2)
 425                 pol->flags &= ~MPOL_F_REBINDING;
 426         else if (step >= MPOL_REBIND_NSTEP)
 427                 BUG();
 428
 429         mpol_ops[pol->mode].rebind(pol, newmask, step);
 430 }
 431
 432 /*
 433  * Wrapper for mpol_rebind_policy() that just requires task
 434  * pointer, and updates task mempolicy.
 435  *
 436  * Called with task's alloc_lock held.
 437  */
 438
 439 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
 440                         enum mpol_rebind_step step)
 441 {
 442         mpol_rebind_policy(tsk->mempolicy, new, step);
 443 }
 444
 445 /*
 446  * Rebind each vma in mm to new nodemask.
 447  *
 448  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 449  */
 450
 451 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 452 {
 453         struct vm_area_struct *vma;
 454
 455         down_write(&mm->mmap_sem);
 456         for (vma = mm->mmap; vma; vma = vma->vm_next)
 457                 mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
 458         up_write(&mm->mmap_sem);
 459 }
 460
 461 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 462         [MPOL_DEFAULT] = {
 463                 .rebind = mpol_rebind_default,
 464         },
 465         [MPOL_INTERLEAVE] = {
 466                 .create = mpol_new_interleave,
 467                 .rebind = mpol_rebind_nodemask,
 468         },
 469         [MPOL_PREFERRED] = {
 470                 .create = mpol_new_preferred,
 471                 .rebind = mpol_rebind_preferred,
 472         },
 473         [MPOL_BIND] = {
 474                 .create = mpol_new_bind,
 475                 .rebind = mpol_rebind_nodemask,
 476         },
 477 };
 478
     /*
      * Forward declaration: queue_pages_pte_range() below calls this for every
      * page it selects, before the definition appears in the file.
      */
 479 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 480                                 unsigned long flags);
 481
 482 /*
 483  * Scan through pages checking if pages follow certain conditions,
 484  * and move them to the pagelist if they do.
      *
      * Walks the ptes covering [@addr, @end) under the pte lock.  A page is
      * selected when its node's membership in @nodes differs from the
      * MPOL_MF_INVERT bit of @flags.  Selected pages are handed to
      * migrate_page_add() with @private as the list when MPOL_MF_MOVE or
      * MPOL_MF_MOVE_ALL is set; without those flags the first selected page
      * stops the walk.
      *
      * Returns non-zero if the walk stopped before reaching @end, 0 otherwise.
 485  */
 486 static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 487                 unsigned long addr, unsigned long end,
 488                 const nodemask_t *nodes, unsigned long flags,
 489                 void *private)
 490 {
 491         pte_t *orig_pte;
 492         pte_t *pte;
 493         spinlock_t *ptl;
 494
             /* orig_pte remembers the mapping so it can be unmapped after the walk */
 495         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 496         do {
 497                 struct page *page;
 498                 int nid;
 499
                     /* each 'continue' jumps to the loop condition, which advances pte/addr */
 500                 if (!pte_present(*pte))
 501                         continue;
 502                 page = vm_normal_page(vma, addr, *pte);
 503                 if (!page)
 504                         continue;
 505                 /*
 506                  * vm_normal_page() filters out zero pages, but there might
 507                  * still be PageReserved pages to skip, perhaps in a VDSO.
 508                  */
 509                 if (PageReserved(page))
 510                         continue;
 511                 nid = page_to_nid(page);
                     /* skip pages whose nodemask membership equals the INVERT bit */
 512                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 513                         continue;
 514
 515                 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 516                         migrate_page_add(page, private, flags);
 517                 else
                             /* not moving: leave with addr != end so the caller sees it */
 518                         break;
 519         } while (pte++, addr += PAGE_SIZE, addr != end);
 520         pte_unmap_unlock(orig_pte, ptl);
 521         return addr != end;
 522 }
 523
 524 static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
 525                 pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
 526                                     void *private)
 527 {
 528 #ifdef CONFIG_HUGETLB_PAGE
 529         int nid;
 530         struct page *page;
 531         spinlock_t *ptl;
 532         pte_t entry;
 533
 534         ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
 535         entry = huge_ptep_get((pte_t *)pmd);
 536         if (!pte_present(entry))
 537                 goto unlock;
 538         page = pte_page(entry);
 539         nid = page_to_nid(page);
 540         if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 541                 goto unlock;
 542         /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
 543         if (flags & (MPOL_MF_MOVE_ALL) ||
 544             (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
 545                 isolate_huge_page(page, private);
 546 unlock:
 547         spin_unlock(ptl);
 548 #else
 549         BUG();
 550 #endif
 551 }
 552
 553 static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 554                 unsigned long addr, unsigned long end,
 555                 const nodemask_t *nodes, unsigned long flags,
 556                 void *private)
 557 {
 558         pmd_t *pmd;
 559         unsigned long next;
 560
 561         pmd = pmd_offset(pud, addr);
 562         do {
 563                 next = pmd_addr_end(addr, end);
 564                 if (!pmd_present(*pmd))
 565                         continue;
 566                 if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
 567                         queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
 568                                                 flags, private);
 569                         continue;
 570                 }
 571                 split_huge_page_pmd(vma, addr, pmd);
 572                 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 573                         continue;
 574                 if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
 575                                     flags, private))
 576                         return -EIO;
 577         } while (pmd++, addr = next, addr != end);
 578         return 0;
 579 }
 580
 581 static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 582                 unsigned long addr, unsigned long end,
 583                 const nodemask_t *nodes, unsigned long flags,
 584                 void *private)
 585 {
 586         pud_t *pud;
 587         unsigned long next;
 588
 589         pud = pud_offset(pgd, addr);
 590         do {
 591                 next = pud_addr_end(addr, end);
 592                 if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
 593                         continue;
 594                 if (pud_none_or_clear_bad(pud))
 595                         continue;
 596                 if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
 597                                     flags, private))
 598                         return -EIO;
 599         } while (pud++, addr = next, addr != end);
 600         return 0;
 601 }
 602
 603 static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
 604                 unsigned long addr, unsigned long end,
 605                 const nodemask_t *nodes, unsigned long flags,
 606                 void *private)
 607 {
 608         pgd_t *pgd;
 609         unsigned long next;
 610
 611         pgd = pgd_offset(vma->vm_mm, addr);
 612         do {
 613                 next = pgd_addr_end(addr, end);
 614                 if (pgd_none_or_clear_bad(pgd))
 615                         continue;
 616                 if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
 617                                     flags, private))
 618                         return -EIO;
 619         } while (pgd++, addr = next, addr != end);
 620         return 0;
 621 }
 622
 623 #ifdef CONFIG_NUMA_BALANCING
 624 /*
 625  * This is used to mark a range of virtual addresses to be inaccessible.
 626  * These are later cleared by a NUMA hinting fault. Depending on these
 627  * faults, pages may be migrated for better NUMA placement.
 628  *
 629  * This is assuming that NUMA faults are handled using PROT_NONE. If
 630  * an architecture makes a different choice, it will need further
 631  * changes to the core.
 632  */
 633 unsigned long change_prot_numa(struct vm_area_struct *vma,
 634                         unsigned long addr, unsigned long end)
 635 {
 636         int nr_updated;
 637
 638         nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
 639         if (nr_updated)
 640                 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 641
 642         return nr_updated;
 643 }
 644 #else
 645 static unsigned long change_prot_numa(struct vm_area_struct *vma,
 646                         unsigned long addr, unsigned long end)
 647 {
 648         return 0;
 649 }
 650 #endif /* CONFIG_NUMA_BALANCING */
 651
 652 /*
 653  * Walk through page tables and collect pages to be migrated.
 654  *
 655  * If pages found in a given range are on a set of nodes (determined by
 656  * @nodes and @flags,) it's isolated and queued to the pagelist which is
 657  * passed via @private.)
 658  */
 659 static struct vm_area_struct *
 660 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 661                 const nodemask_t *nodes, unsigned long flags, void *private)
 662 {
 663         int err;
 664         struct vm_area_struct *first, *vma, *prev;
 665
 666
 667         first = find_vma(mm, start);
 668         if (!first)
 669                 return ERR_PTR(-EFAULT);
 670         prev = NULL;
 671         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 672                 unsigned long endvma = vma->vm_end;
 673
 674                 if (endvma > end)
 675                         endvma = end;
 676                 if (vma->vm_start > start)
 677                         start = vma->vm_start;
 678
 679                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 680                         if (!vma->vm_next && vma->vm_end < end)
 681                                 return ERR_PTR(-EFAULT);
 682                         if (prev && prev->vm_end < vma->vm_start)
 683                                 return ERR_PTR(-EFAULT);
 684                 }
 685
 686                 if (flags & MPOL_MF_LAZY) {
 687                         change_prot_numa(vma, start, endvma);
 688                         goto next;
 689                 }
 690
 691                 if ((flags & MPOL_MF_STRICT) ||
 692                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 693                       vma_migratable(vma))) {
 694
 695                         err = queue_pages_pgd_range(vma, start, endvma, nodes,
 696                                                 flags, private);
 697                         if (err) {
 698                                 first = ERR_PTR(err);
 699                                 break;
 700                         }
 701                 }
 702 next:
 703                 prev = vma;
 704         }
 705         return first;
 706 }
 707
 708 /*
 709  * Apply policy to a single VMA
 710  * This must be called with the mmap_sem held for writing.
 711  */
 712 static int vma_replace_policy(struct vm_area_struct *vma,
 713                                                 struct mempolicy *pol)
 714 {
 715         int err;
 716         struct mempolicy *old;
 717         struct mempolicy *new;
 718
 719         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 720                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 721                  vma->vm_ops, vma->vm_file,
 722                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 723
 724         new = mpol_dup(pol);
 725         if (IS_ERR(new))
 726                 return PTR_ERR(new);
 727
 728         if (vma->vm_ops && vma->vm_ops->set_policy) {
 729                 err = vma->vm_ops->set_policy(vma, new);
 730                 if (err)
 731                         goto err_out;
 732         }
 733
 734         old = vma->vm_policy;
 735         vma->vm_policy = new; /* protected by mmap_sem */
 736         mpol_put(old);
 737
 738         return 0;
 739  err_out:
 740         mpol_put(new);
 741         return err;
 742 }
 743
 744 /* Step 2: apply policy to a range and do splits. */
     /*
      * Installs @new_pol on every vma of @mm overlapping [@start, @end).  Vmas
      * that only partly overlap the range are first merged with a neighbour
      * or split so that the policy change covers exactly the overlap.
      *
      * Returns 0 on success, -EFAULT if @start is not inside a vma, or the
      * error from split_vma() / vma_replace_policy().
      */
 745 static int mbind_range(struct mm_struct *mm, unsigned long start,
 746                        unsigned long end, struct mempolicy *new_pol)
 747 {
 748         struct vm_area_struct *next;
 749         struct vm_area_struct *prev;
 750         struct vm_area_struct *vma;
 751         int err = 0;
 752         pgoff_t pgoff;
 753         unsigned long vmstart;
 754         unsigned long vmend;
 755
 756         vma = find_vma(mm, start);
 757         if (!vma || vma->vm_start > start)
 758                 return -EFAULT;
 759
             /*
              * If the range starts inside the first vma, that vma itself is the
              * predecessor handed to vma_merge(); otherwise use its neighbour.
              */
 760         prev = vma->vm_prev;
 761         if (start > vma->vm_start)
 762                 prev = vma;
 763
 764         for (; vma && vma->vm_start < end; prev = vma, vma = next) {
                     /* fetch next now: vma may be replaced by a merge below */
 765                 next = vma->vm_next;
                     /* [vmstart, vmend) is the part of this vma inside the range */
 766                 vmstart = max(start, vma->vm_start);
 767                 vmend   = min(end, vma->vm_end);
 768
                     /* nothing to do if the vma already carries an equal policy */
 769                 if (mpol_equal(vma_policy(vma), new_pol))
 770                         continue;
 771
                     /* page offset that corresponds to vmstart */
 772                 pgoff = vma->vm_pgoff +
 773                         ((vmstart - vma->vm_start) >> PAGE_SHIFT);
 774                 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 775                                   vma->anon_vma, vma->vm_file, pgoff,
 776                                   new_pol);
 777                 if (prev) {
                             /* merged: carry on with the vma returned by vma_merge() */
 778                         vma = prev;
 779                         next = vma->vm_next;
 780                         if (mpol_equal(vma_policy(vma), new_pol))
 781                                 continue;
 782                         /* vma_merge() joined vma && vma->next, case 8 */
 783                         goto replace;
 784                 }
                     /*
                      * No merge possible: split off whatever lies outside
                      * [vmstart, vmend) so that vma covers exactly the overlap.
                      * NOTE(review): the last split_vma() argument presumably
                      * selects which half keeps the original vma - confirm.
                      */
 785                 if (vma->vm_start != vmstart) {
 786                         err = split_vma(vma->vm_mm, vma, vmstart, 1);
 787                         if (err)
 788                                 goto out;
 789                 }
 790                 if (vma->vm_end != vmend) {
 791                         err = split_vma(vma->vm_mm, vma, vmend, 0);
 792                         if (err)
 793                                 goto out;
 794                 }
 795  replace:
 796                 err = vma_replace_policy(vma, new_pol);
 797                 if (err)
 798                         goto out;
 799         }
 800
 801  out:
 802         return err;
 803 }
 804
 805 /* Set the process memory policy */
 806 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 807                              nodemask_t *nodes)
 808 {
 809         struct mempolicy *new, *old;
 810         struct mm_struct *mm = current->mm;
 811         NODEMASK_SCRATCH(scratch);
 812         int ret;
 813
 814         if (!scratch)
 815                 return -ENOMEM;
 816
 817         new = mpol_new(mode, flags, nodes);
 818         if (IS_ERR(new)) {
 819                 ret = PTR_ERR(new);
 820                 goto out;
 821         }
 822         /*
 823          * prevent changing our mempolicy while show_numa_maps()
 824          * is using it.
 825          * Note:  do_set_mempolicy() can be called at init time
 826          * with no 'mm'.
 827          */
 828         if (mm)
 829                 down_write(&mm->mmap_sem);
 830         task_lock(current);
 831         ret = mpol_set_nodemask(new, nodes, scratch);
 832         if (ret) {
 833                 task_unlock(current);
 834                 if (mm)
 835                         up_write(&mm->mmap_sem);
 836                 mpol_put(new);
 837                 goto out;
 838         }
 839         old = current->mempolicy;
 840         current->mempolicy = new;
 841         if (new && new->mode == MPOL_INTERLEAVE &&
 842             nodes_weight(new->v.nodes))
 843                 current->il_next = first_node(new->v.nodes);
 844         task_unlock(current);
 845         if (mm)
 846                 up_write(&mm->mmap_sem);
 847
 848         mpol_put(old);
 849         ret = 0;
 850 out:
 851         NODEMASK_SCRATCH_FREE(scratch);
 852         return ret;
 853 }
 854
 855 /*
 856  * Return nodemask for policy for get_mempolicy() query
 857  *
 858  * Called with task's alloc_lock held
 859  */
 860 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 861 {
 862         nodes_clear(*nodes);
 863         if (p == &default_policy)
 864                 return;
 865
 866         switch (p->mode) {
 867         case MPOL_BIND:
 868                 /* Fall through */
 869         case MPOL_INTERLEAVE:
 870                 *nodes = p->v.nodes;
 871                 break;
 872         case MPOL_PREFERRED:
 873                 if (!(p->flags & MPOL_F_LOCAL))
 874                         node_set(p->v.preferred_node, *nodes);
 875                 /* else return empty node mask for local allocation */
 876                 break;
 877         default:
 878                 BUG();
 879         }
 880 }
 881
 882 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 883 {
 884         struct page *p;
 885         int err;
 886
 887         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 888         if (err >= 0) {
 889                 err = page_to_nid(p);
 890                 put_page(p);
 891         }
 892         return err;
 893 }
 894
 895 /* Retrieve NUMA policy */
 896 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 897                              unsigned long addr, unsigned long flags)
 898 {
 899         int err;
 900         struct mm_struct *mm = current->mm;
 901         struct vm_area_struct *vma = NULL;
 902         struct mempolicy *pol = current->mempolicy;
 903
 904         if (flags &
 905                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 906                 return -EINVAL;
 907
 908         if (flags & MPOL_F_MEMS_ALLOWED) {
 909                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 910                         return -EINVAL;
 911                 *policy = 0;    /* just so it's initialized */
 912                 task_lock(current);
 913                 *nmask  = cpuset_current_mems_allowed;
 914                 task_unlock(current);
 915                 return 0;
 916         }
 917
 918         if (flags & MPOL_F_ADDR) {
 919                 /*
 920                  * Do NOT fall back to task policy if the
 921                  * vma/shared policy at addr is NULL.  We
 922                  * want to return MPOL_DEFAULT in this case.
 923                  */
 924                 down_read(&mm->mmap_sem);
 925                 vma = find_vma_intersection(mm, addr, addr+1);
 926                 if (!vma) {
 927                         up_read(&mm->mmap_sem);
 928                         return -EFAULT;
 929                 }
 930                 if (vma->vm_ops && vma->vm_ops->get_policy)
 931                         pol = vma->vm_ops->get_policy(vma, addr);
 932                 else
 933                         pol = vma->vm_policy;
 934         } else if (addr)
 935                 return -EINVAL;
 936
 937         if (!pol)
 938                 pol = &default_policy;  /* indicates default behavior */
 939
 940         if (flags & MPOL_F_NODE) {
 941                 if (flags & MPOL_F_ADDR) {
 942                         err = lookup_node(mm, addr);
 943                         if (err < 0)
 944                                 goto out;
 945                         *policy = err;
 946                 } else if (pol == current->mempolicy &&
 947                                 pol->mode == MPOL_INTERLEAVE) {
 948                         *policy = current->il_next;
 949                 } else {
 950                         err = -EINVAL;
 951                         goto out;
 952                 }
 953         } else {
 954                 *policy = pol == &default_policy ? MPOL_DEFAULT :
 955                                                 pol->mode;
 956                 /*
 957                  * Internal mempolicy flags must be masked off before exposing
 958                  * the policy to userspace.
 959                  */
 960                 *policy |= (pol->flags & MPOL_MODE_FLAGS);
 961         }
 962
 963         if (vma) {
 964                 up_read(&current->mm->mmap_sem);
 965                 vma = NULL;
 966         }
 967
 968         err = 0;
 969         if (nmask) {
 970                 if (mpol_store_user_nodemask(pol)) {
 971                         *nmask = pol->w.user_nodemask;
 972                 } else {
 973                         task_lock(current);
 974                         get_policy_nodemask(pol, nmask);
 975                         task_unlock(current);
 976                 }
 977         }
 978
 979  out:
 980         mpol_cond_put(pol);
 981         if (vma)
 982                 up_read(&current->mm->mmap_sem);
 983         return err;
 984 }
 985
 986 #ifdef CONFIG_MIGRATION
 987 /*
 988  * page migration
 989  */
 990 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 991                                 unsigned long flags)
 992 {
 993         /*
 994          * Avoid migrating a page that is shared with others.
 995          */
 996         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 997                 if (!isolate_lru_page(page)) {
 998                         list_add_tail(&page->lru, pagelist);
 999                         inc_zone_page_state(page, NR_ISOLATED_ANON +
1000                                             page_is_file_cache(page));
1001                 }
1002         }
1003 }
1004
1005 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
1006 {
1007         if (PageHuge(page))
1008                 return alloc_huge_page_node(page_hstate(compound_head(page)),
1009                                         node);
1010         else
1011                 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
1012 }
1013
1014 /*
1015  * Migrate pages from one node to a target node.
1016  * Returns error or the number of pages not migrated.
1017  */
1018 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1019                            int flags)
1020 {
1021         nodemask_t nmask;
1022         LIST_HEAD(pagelist);
1023         int err = 0;
1024
1025         nodes_clear(nmask);
1026         node_set(source, nmask);
1027
1028         /*
1029          * This does not "check" the range but isolates all pages that
1030          * need migration.  Between passing in the full user address
1031          * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1032          */
1033         VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1034         queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1035                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1036
1037         if (!list_empty(&pagelist)) {
1038                 err = migrate_pages(&pagelist, new_node_page, NULL, dest,
1039                                         MIGRATE_SYNC, MR_SYSCALL);
1040                 if (err)
1041                         putback_movable_pages(&pagelist);
1042         }
1043
1044         return err;
1045 }
1046
1047 /*
1048  * Move pages between the two nodesets so as to preserve the physical
1049  * layout as much as possible.
1050  *
1051  * Returns the number of pages that could not be moved, or a negative
 * error code.  A failing migrate_prep() is returned directly; a negative
 * result of migrate_vmas() or migrate_to_node() ends the operation.
 *
 * mm->mmap_sem is taken for read around the whole scan and released
 * before returning.
1052  */
1053 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1054                      const nodemask_t *to, int flags)
1055 {
1056         int busy = 0;
1057         int err;
1058         nodemask_t tmp;
1059
1060         err = migrate_prep();
1061         if (err)
1062                 return err;
1063
1064         down_read(&mm->mmap_sem);
1065
1066         err = migrate_vmas(mm, from, to, flags);
1067         if (err)
1068                 goto out;
1069
1070         /*
1071          * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1072          * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1073          * bit in 'tmp', and return that <source, dest> pair for migration.
1074          * The pair of nodemasks 'to' and 'from' define the map.
1075          *
1076          * If no pair of bits is found that way, fallback to picking some
1077          * pair of 'source' and 'dest' bits that are not the same.  If the
1078          * 'source' and 'dest' bits are the same, this represents a node
1079          * that will be migrating to itself, so no pages need move.
1080          *
1081          * If no bits are left in 'tmp', or if all remaining bits left
1082          * in 'tmp' correspond to the same bit in 'to', return false
1083          * (nothing left to migrate).
1084          *
1085          * This lets us pick a pair of nodes to migrate between, such that
1086          * if possible the dest node is not already occupied by some other
1087          * source node, minimizing the risk of overloading the memory on a
1088          * node that would happen if we migrated incoming memory to a node
1089          * before migrating outgoing memory source that same node.
1090          *
1091          * A single scan of tmp is sufficient.  As we go, we remember the
1092          * most recent <s, d> pair that moved (s != d).  If we find a pair
1093          * that not only moved, but what's better, moved to an empty slot
1094          * (d is not set in tmp), then we break out then, with that pair.
1095          * Otherwise when we finish scanning tmp, we at least have the
1096          * most recent <s, d> pair that moved.  If we get all the way through
1097          * the scan of tmp without finding any node that moved, much less
1098          * moved to an empty node, then there is nothing left worth migrating.
1099          */
1100
1101         tmp = *from;
1102         while (!nodes_empty(tmp)) {
1103                 int s,d;
1104                 int source = NUMA_NO_NODE;
1105                 int dest = 0;
1106
1107                 for_each_node_mask(s, tmp) {
1108
1109                         /*
1110                          * do_migrate_pages() tries to maintain the relative
1111                          * node relationship of the pages established between
1112                          * threads and memory areas.
1113                          *
1114                          * However if the number of source nodes is not equal to
1115                          * the number of destination nodes we can not preserve
1116                          * this node relative relationship.  In that case, skip
1117                          * copying memory from a node that is in the destination
1118                          * mask.
1119                          *
1120                          * Example: [2,3,4] -> [3,4,5] moves everything.
1121                          *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1122                          */
1123
1124                         if ((nodes_weight(*from) != nodes_weight(*to)) &&
1125                                                 (node_isset(s, *to)))
1126                                 continue;
1127
1128                         d = node_remap(s, *from, *to);
1129                         if (s == d)
1130                                 continue;
1131
1132                         source = s;     /* Node moved. Memorize */
1133                         dest = d;
1134
1135                         /* dest not in remaining from nodes? */
1136                         if (!node_isset(dest, tmp))
1137                                 break;
1138                 }
                /* the scan found no node that would actually move */
1139                 if (source == NUMA_NO_NODE)
1140                         break;
1141
                /* each source node is handed to migrate_to_node() at most once */
1142                 node_clear(source, tmp);
1143                 err = migrate_to_node(mm, source, dest, flags);
                /* a positive result is added to the running total in 'busy' */
1144                 if (err > 0)
1145                         busy += err;
1146                 if (err < 0)
1147                         break;
1148         }
1149 out:
1150         up_read(&mm->mmap_sem);
1151         if (err < 0)
1152                 return err;
1153         return busy;
1154
1155 }
1156
1157 /*
1158  * Allocate a new page for page migration based on vma policy.
1159  * Start assuming that page is mapped by vma pointed to by @private.
1160  * Search forward from there, if not.  N.B., this assumes that the
1161  * list of pages handed to migrate_pages()--which is how we get here--
1162  * is in virtual address order.
1163  */
1164 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1165 {
1166         struct vm_area_struct *vma = (struct vm_area_struct *)private;
1167         unsigned long uninitialized_var(address);
1168
1169         while (vma) {
1170                 address = page_address_in_vma(page, vma);
1171                 if (address != -EFAULT)
1172                         break;
1173                 vma = vma->vm_next;
1174         }
1175
1176         if (PageHuge(page)) {
1177                 BUG_ON(!vma);
1178                 return alloc_huge_page_noerr(vma, address, 1);
1179         }
1180         /*
1181          * if !vma, alloc_page_vma() will use task or system default policy
1182          */
1183         return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1184 }
1185 #else
1186
/* Stub for builds without CONFIG_MIGRATION: no page is ever queued. */
1187 static void migrate_page_add(struct page *page, struct list_head *pagelist,
1188                                 unsigned long flags)
1189 {
1190 }
1191
/* Stub for builds without CONFIG_MIGRATION: always fails with -ENOSYS. */
1192 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1193                      const nodemask_t *to, int flags)
1194 {
1195         return -ENOSYS;
1196 }
1197
/* Stub for builds without CONFIG_MIGRATION: never provides a page. */
1198 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1199 {
1200         return NULL;
1201 }
1202 #endif
1203
/*
 * Apply a memory policy to the range [start, start + len) of the current
 * task's address space.
 *
 * @start must be page aligned; @len is rounded up to a multiple of the page
 * size.  @mode, @mode_flags and @nmask describe the new policy, @flags holds
 * the MPOL_MF_* behaviour flags.
 *
 * Returns 0 on success (including an empty range), -EINVAL for invalid
 * flags, an unaligned @start or a wrapping range, -EPERM when
 * MPOL_MF_MOVE_ALL is requested without CAP_SYS_NICE, -ENOMEM when no
 * scratch nodemask is available, -EIO when MPOL_MF_STRICT is set and pages
 * could not be migrated, or an error passed up from the helpers called here.
 */
1204 static long do_mbind(unsigned long start, unsigned long len,
1205                      unsigned short mode, unsigned short mode_flags,
1206                      nodemask_t *nmask, unsigned long flags)
1207 {
1208         struct vm_area_struct *vma;
1209         struct mm_struct *mm = current->mm;
1210         struct mempolicy *new;
1211         unsigned long end;
1212         int err;
1213         LIST_HEAD(pagelist);
1214
1215         if (flags & ~(unsigned long)MPOL_MF_VALID)
1216                 return -EINVAL;
1217         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1218                 return -EPERM;
1219
1220         if (start & ~PAGE_MASK)
1221                 return -EINVAL;
1222
1223         if (mode == MPOL_DEFAULT)
1224                 flags &= ~MPOL_MF_STRICT;
1225
1226         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1227         end = start + len;
1228
1229         if (end < start)
1230                 return -EINVAL;
1231         if (end == start)
1232                 return 0;
1233
1234         new = mpol_new(mode, mode_flags, nmask);
1235         if (IS_ERR(new))
1236                 return PTR_ERR(new);
1237
        /*
         * NOTE(review): 'new' is dereferenced here although the check just
         * below treats a NULL 'new' as a valid outcome of mpol_new().
         * Presumably MPOL_MF_LAZY can not reach this point together with a
         * NULL policy -- confirm against MPOL_MF_VALID.
         */
1238         if (flags & MPOL_MF_LAZY)
1239                 new->flags |= MPOL_F_MOF;
1240
1241         /*
1242          * If we are using the default policy then operation
1243          * on discontinuous address spaces is okay after all
1244          */
1245         if (!new)
1246                 flags |= MPOL_MF_DISCONTIG_OK;
1247
1248         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1249                  start, start + len, mode, mode_flags,
1250                  nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1251
1252         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1253
1254                 err = migrate_prep();
1255                 if (err)
1256                         goto mpol_out;
1257         }
        /*
         * On success this scope is left with mmap_sem held for write; the
         * lock is released near the end of the function.  On failure the
         * lock is dropped right here (or was never taken) and the error
         * path skips the final up_write().
         */
1258         {
1259                 NODEMASK_SCRATCH(scratch);
1260                 if (scratch) {
1261                         down_write(&mm->mmap_sem);
1262                         task_lock(current);
1263                         err = mpol_set_nodemask(new, nmask, scratch);
1264                         task_unlock(current);
1265                         if (err)
1266                                 up_write(&mm->mmap_sem);
1267                 } else
1268                         err = -ENOMEM;
1269                 NODEMASK_SCRATCH_FREE(scratch);
1270         }
1271         if (err)
1272                 goto mpol_out;
1273
1274         vma = queue_pages_range(mm, start, end, nmask,
1275                           flags | MPOL_MF_INVERT, &pagelist);
1276
1277         err = PTR_ERR(vma);     /* maybe ... */
1278         if (!IS_ERR(vma))
1279                 err = mbind_range(mm, start, end, new);
1280
1281         if (!err) {
1282                 int nr_failed = 0;
1283
1284                 if (!list_empty(&pagelist)) {
1285                         WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1286                         nr_failed = migrate_pages(&pagelist, new_vma_page,
1287                                         NULL, (unsigned long)vma,
1288                                         MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1289                         if (nr_failed)
1290                                 putback_movable_pages(&pagelist);
1291                 }
1292
                /* pages left behind only count as an error in strict mode */
1293                 if (nr_failed && (flags & MPOL_MF_STRICT))
1294                         err = -EIO;
1295         } else
1296                 putback_movable_pages(&pagelist);
1297
1298         up_write(&mm->mmap_sem);
1299  mpol_out:
        /* drop this function's reference on the new policy */
1300         mpol_put(new);
1301         return err;
1302 }
1303
1304 /*
1305  * User space interface with variable sized bitmaps for nodelists.
1306  */
1307
1308 /* Copy a node mask from user space. */
1309 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1310                      unsigned long maxnode)
1311 {
1312         unsigned long k;
1313         unsigned long nlongs;
1314         unsigned long endmask;
1315
1316         --maxnode;
1317         nodes_clear(*nodes);
1318         if (maxnode == 0 || !nmask)
1319                 return 0;
1320         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1321                 return -EINVAL;
1322
1323         nlongs = BITS_TO_LONGS(maxnode);
1324         if ((maxnode % BITS_PER_LONG) == 0)
1325                 endmask = ~0UL;
1326         else
1327                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1328
1329         /* When the user specified more nodes than supported just check
1330            if the non supported part is all zero. */
1331         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1332                 if (nlongs > PAGE_SIZE/sizeof(long))
1333                         return -EINVAL;
1334                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1335                         unsigned long t;
1336                         if (get_user(t, nmask + k))
1337                                 return -EFAULT;
1338                         if (k == nlongs - 1) {
1339                                 if (t & endmask)
1340                                         return -EINVAL;
1341                         } else if (t)
1342                                 return -EINVAL;
1343                 }
1344                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1345                 endmask = ~0UL;
1346         }
1347
1348         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1349                 return -EFAULT;
1350         nodes_addr(*nodes)[nlongs-1] &= endmask;
1351         return 0;
1352 }
1353
1354 /* Copy a kernel node mask to user space */
1355 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1356                               nodemask_t *nodes)
1357 {
1358         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1359         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1360
1361         if (copy > nbytes) {
1362                 if (copy > PAGE_SIZE)
1363                         return -EINVAL;
1364                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1365                         return -EFAULT;
1366                 copy = nbytes;
1367         }
1368         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1369 }
1370
1371 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1372                 unsigned long, mode, const unsigned long __user *, nmask,
1373                 unsigned long, maxnode, unsigned, flags)
1374 {
1375         nodemask_t nodes;
1376         int err;
1377         unsigned short mode_flags;
1378
1379         mode_flags = mode & MPOL_MODE_FLAGS;
1380         mode &= ~MPOL_MODE_FLAGS;
1381         if (mode >= MPOL_MAX)
1382                 return -EINVAL;
1383         if ((mode_flags & MPOL_F_STATIC_NODES) &&
1384             (mode_flags & MPOL_F_RELATIVE_NODES))
1385                 return -EINVAL;
1386         err = get_nodes(&nodes, nmask, maxnode);
1387         if (err)
1388                 return err;
1389         return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1390 }
1391
1392 /* Set the process memory policy */
1393 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1394                 unsigned long, maxnode)
1395 {
1396         int err;
1397         nodemask_t nodes;
1398         unsigned short flags;
1399
1400         flags = mode & MPOL_MODE_FLAGS;
1401         mode &= ~MPOL_MODE_FLAGS;
1402         if ((unsigned int)mode >= MPOL_MAX)
1403                 return -EINVAL;
1404         if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1405                 return -EINVAL;
1406         err = get_nodes(&nodes, nmask, maxnode);
1407         if (err)
1408                 return err;
1409         return do_set_mempolicy(mode, flags, &nodes);
1410 }
1411
/*
 * migrate_pages() system call: move the pages of the task identified by
 * @pid (the calling task when @pid is 0) from the nodes in @old_nodes to
 * the nodes in @new_nodes.
 *
 * Returns the result of do_migrate_pages() or a negative error code:
 * -ENOMEM (no scratch nodemask), -ESRCH (no such task), -EPERM (caller may
 * not act on the task or use the target nodes), -EINVAL (target nodes
 * without memory, or the task has no mm), or an error from get_nodes() /
 * security_task_movememory().
 */
1412 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1413                 const unsigned long __user *, old_nodes,
1414                 const unsigned long __user *, new_nodes)
1415 {
1416         const struct cred *cred = current_cred(), *tcred;
1417         struct mm_struct *mm = NULL;
1418         struct task_struct *task;
1419         nodemask_t task_nodes;
1420         int err;
1421         nodemask_t *old;
1422         nodemask_t *new;
1423         NODEMASK_SCRATCH(scratch);
1424
1425         if (!scratch)
1426                 return -ENOMEM;
1427
        /* both user nodemasks are kept in the scratch area */
1428         old = &scratch->mask1;
1429         new = &scratch->mask2;
1430
1431         err = get_nodes(old, old_nodes, maxnode);
1432         if (err)
1433                 goto out;
1434
1435         err = get_nodes(new, new_nodes, maxnode);
1436         if (err)
1437                 goto out;
1438
1439         /* Find the mm_struct */
1440         rcu_read_lock();
1441         task = pid ? find_task_by_vpid(pid) : current;
1442         if (!task) {
1443                 rcu_read_unlock();
1444                 err = -ESRCH;
1445                 goto out;
1446         }
        /* from here on every exit has to drop this task reference */
1447         get_task_struct(task);
1448
1449         err = -EINVAL;
1450
1451         /*
1452          * Check if this process has the right to modify the specified
1453          * process. The right exists if the process has administrative
1454          * capabilities, superuser privileges or the same
1455          * userid as the target process.
1456          */
1457         tcred = __task_cred(task);
1458         if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1459             !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1460             !capable(CAP_SYS_NICE)) {
1461                 rcu_read_unlock();
1462                 err = -EPERM;
1463                 goto out_put;
1464         }
1465         rcu_read_unlock();
1466
1467         task_nodes = cpuset_mems_allowed(task);
1468         /* Is the user allowed to access the target nodes? */
1469         if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1470                 err = -EPERM;
1471                 goto out_put;
1472         }
1473
        /* every target node must be a node that has memory */
1474         if (!nodes_subset(*new, node_states[N_MEMORY])) {
1475                 err = -EINVAL;
1476                 goto out_put;
1477         }
1478
1479         err = security_task_movememory(task);
1480         if (err)
1481                 goto out_put;
1482
        /* the mm reference replaces the task reference from here on */
1483         mm = get_task_mm(task);
1484         put_task_struct(task);
1485
1486         if (!mm) {
1487                 err = -EINVAL;
1488                 goto out;
1489         }
1490
        /* without CAP_SYS_NICE only MPOL_MF_MOVE is requested */
1491         err = do_migrate_pages(mm, old, new,
1492                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1493
1494         mmput(mm);
1495 out:
1496         NODEMASK_SCRATCH_FREE(scratch);
1497
1498         return err;
1499
        /* error exit for paths that still hold the task reference */
1500 out_put:
1501         put_task_struct(task);
1502         goto out;
1503
1504 }
1505
1506
1507 /* Retrieve NUMA policy */
1508 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1509                 unsigned long __user *, nmask, unsigned long, maxnode,
1510                 unsigned long, addr, unsigned long, flags)
1511 {
1512         int err;
1513         int uninitialized_var(pval);
1514         nodemask_t nodes;
1515
1516         if (nmask != NULL && maxnode < MAX_NUMNODES)
1517                 return -EINVAL;
1518
1519         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1520
1521         if (err)
1522                 return err;
1523
1524         if (policy && put_user(pval, policy))
1525                 return -EFAULT;
1526
1527         if (nmask)
1528                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1529
1530         return err;
1531 }
1532
1533 #ifdef CONFIG_COMPAT
1534
1535 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1536                        compat_ulong_t __user *, nmask,
1537                        compat_ulong_t, maxnode,
1538                        compat_ulong_t, addr, compat_ulong_t, flags)
1539 {
1540         long err;
1541         unsigned long __user *nm = NULL;
1542         unsigned long nr_bits, alloc_size;
1543         DECLARE_BITMAP(bm, MAX_NUMNODES);
1544
1545         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1546         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1547
1548         if (nmask)
1549                 nm = compat_alloc_user_space(alloc_size);
1550
1551         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1552
1553         if (!err && nmask) {
1554                 unsigned long copy_size;
1555                 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1556                 err = copy_from_user(bm, nm, copy_size);
1557                 /* ensure entire bitmap is zeroed */
1558                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1559                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1560         }
1561
1562         return err;
1563 }
1564
1565 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1566                        compat_ulong_t, maxnode)
1567 {
1568         long err = 0;
1569         unsigned long __user *nm = NULL;
1570         unsigned long nr_bits, alloc_size;
1571         DECLARE_BITMAP(bm, MAX_NUMNODES);
1572
1573         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1574         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1575
1576         if (nmask) {
1577                 err = compat_get_bitmap(bm, nmask, nr_bits);
1578                 nm = compat_alloc_user_space(alloc_size);
1579                 err |= copy_to_user(nm, bm, alloc_size);
1580         }
1581
1582         if (err)
1583                 return -EFAULT;
1584
1585         return sys_set_mempolicy(mode, nm, nr_bits+1);
1586 }
1587
1588 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1589                        compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1590                        compat_ulong_t, maxnode, compat_ulong_t, flags)
1591 {
1592         long err = 0;
1593         unsigned long __user *nm = NULL;
1594         unsigned long nr_bits, alloc_size;
1595         nodemask_t bm;
1596
1597         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1598         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1599
1600         if (nmask) {
1601                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1602                 nm = compat_alloc_user_space(alloc_size);
1603                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1604         }
1605
1606         if (err)
1607                 return -EFAULT;
1608
1609         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1610 }
1611
1612 #endif
1613
1614 /*
1615  * get_vma_policy(@task, @vma, @addr)
1616  * @task: task for fallback if vma policy == default
1617  * @vma: virtual memory area whose policy is sought
1618  * @addr: address in @vma for shared policy lookup
1619  *
1620  * Returns effective policy for a VMA at specified address.
1621  * Falls back to @task or system default policy, as necessary.
1622  * Current or other task's task mempolicy and non-shared vma policies must be
1623  * protected by task_lock(task) by the caller.
1624  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1625  * count--added by the get_policy() vm_op, as appropriate--to protect against
1626  * freeing by another task.  It is the caller's responsibility to free the
1627  * extra reference for shared policies.
1628  */
1629 struct mempolicy *get_vma_policy(struct task_struct *task,
1630                 struct vm_area_struct *vma, unsigned long addr)
1631 {
1632         struct mempolicy *pol = get_task_policy(task);
1633
1634         if (vma) {
1635                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1636                         struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1637                                                                         addr);
1638                         if (vpol)
1639                                 pol = vpol;
1640                 } else if (vma->vm_policy) {
1641                         pol = vma->vm_policy;
1642
1643                         /*
1644                          * shmem_alloc_page() passes MPOL_F_SHARED policy with
1645                          * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1646                          * count on these policies which will be dropped by
1647                          * mpol_cond_put() later
1648                          */
1649                         if (mpol_needs_cond_ref(pol))
1650                                 mpol_get(pol);
1651                 }
1652         }
1653         if (!pol)
1654                 pol = &default_policy;
1655         return pol;
1656 }
1657
1658 bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
1659 {
1660         struct mempolicy *pol = get_task_policy(task);
1661         if (vma) {
1662                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1663                         bool ret = false;
1664
1665                         pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1666                         if (pol && (pol->flags & MPOL_F_MOF))
1667                                 ret = true;
1668                         mpol_cond_put(pol);
1669
1670                         return ret;
1671                 } else if (vma->vm_policy) {
1672                         pol = vma->vm_policy;
1673                 }
1674         }
1675
1676         if (!pol)
1677                 return default_policy.flags & MPOL_F_MOF;
1678
1679         return pol->flags & MPOL_F_MOF;
1680 }
1681
1682 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1683 {
1684         enum zone_type dynamic_policy_zone = policy_zone;
1685
1686         BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1687
1688         /*
1689          * if policy->v.nodes has movable memory only,
1690          * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1691          *
1692          * policy->v.nodes is intersect with node_states[N_MEMORY].
1693          * so if the following test faile, it implies
1694          * policy->v.nodes has movable memory only.
1695          */
1696         if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1697                 dynamic_policy_zone = ZONE_MOVABLE;
1698
1699         return zone >= dynamic_policy_zone;
1700 }
1701
1702 /*
1703  * Return a nodemask representing a mempolicy for filtering nodes for
1704  * page allocation
1705  */
1706 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1707 {
1708         /* Lower zones don't get a nodemask applied for MPOL_BIND */
1709         if (unlikely(policy->mode == MPOL_BIND) &&
1710                         apply_policy_zone(policy, gfp_zone(gfp)) &&
1711                         cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1712                 return &policy->v.nodes;
1713
1714         return NULL;
1715 }
1716
1717 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1718 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1719         int nd)
1720 {
1721         switch (policy->mode) {
1722         case MPOL_PREFERRED:
1723                 if (!(policy->flags & MPOL_F_LOCAL))
1724                         nd = policy->v.preferred_node;
1725                 break;
1726         case MPOL_BIND:
1727                 /*
1728                  * Normally, MPOL_BIND allocations are node-local within the
1729                  * allowed nodemask.  However, if __GFP_THISNODE is set and the
1730                  * current node isn't part of the mask, we use the zonelist for
1731                  * the first node in the mask instead.
1732                  */
1733                 if (unlikely(gfp & __GFP_THISNODE) &&
1734                                 unlikely(!node_isset(nd, policy->v.nodes)))
1735                         nd = first_node(policy->v.nodes);
1736                 break;
1737         default:
1738                 BUG();
1739         }
1740         return node_zonelist(nd, gfp);
1741 }
1742
1743 /* Do dynamic interleaving for a process */
1744 static unsigned interleave_nodes(struct mempolicy *policy)
1745 {
1746         unsigned nid, next;
1747         struct task_struct *me = current;
1748
1749         nid = me->il_next;
1750         next = next_node(nid, policy->v.nodes);
1751         if (next >= MAX_NUMNODES)
1752                 next = first_node(policy->v.nodes);
1753         if (next < MAX_NUMNODES)
1754                 me->il_next = next;
1755         return nid;
1756 }
1757
1758 /*
1759  * Depending on the memory policy provide a node from which to allocate the
1760  * next slab entry.
1761  */
1762 unsigned int mempolicy_slab_node(void)
1763 {
1764         struct mempolicy *policy;
1765         int node = numa_mem_id();
1766
1767         if (in_interrupt())
1768                 return node;
1769
1770         policy = current->mempolicy;
1771         if (!policy || policy->flags & MPOL_F_LOCAL)
1772                 return node;
1773
1774         switch (policy->mode) {
1775         case MPOL_PREFERRED:
1776                 /*
1777                  * handled MPOL_F_LOCAL above
1778                  */
1779                 return policy->v.preferred_node;
1780
1781         case MPOL_INTERLEAVE:
1782                 return interleave_nodes(policy);
1783
1784         case MPOL_BIND: {
1785                 /*
1786                  * Follow bind policy behavior and start allocation at the
1787                  * first node.
1788                  */
1789                 struct zonelist *zonelist;
1790                 struct zone *zone;
1791                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1792                 zonelist = &NODE_DATA(node)->node_zonelists[0];
1793                 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1794                                                         &policy->v.nodes,
1795                                                         &zone);
1796                 return zone ? zone->node : node;
1797         }
1798
1799         default:
1800                 BUG();
1801         }
1802 }
1803
1804 /* Do static interleaving for a VMA with known offset. */
1805 static unsigned offset_il_node(struct mempolicy *pol,
1806                 struct vm_area_struct *vma, unsigned long off)
1807 {
1808         unsigned nnodes = nodes_weight(pol->v.nodes);
1809         unsigned target;
1810         int c;
1811         int nid = NUMA_NO_NODE;
1812
1813         if (!nnodes)
1814                 return numa_node_id();
1815         target = (unsigned int)off % nnodes;
1816         c = 0;
1817         do {
1818                 nid = next_node(nid, pol->v.nodes);
1819                 c++;
1820         } while (c <= target);
1821         return nid;
1822 }
1823
1824 /* Determine a node number for interleave */
1825 static inline unsigned interleave_nid(struct mempolicy *pol,
1826                  struct vm_area_struct *vma, unsigned long addr, int shift)
1827 {
1828         if (vma) {
1829                 unsigned long off;
1830
1831                 /*
1832                  * for small pages, there is no difference between
1833                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1834                  * for huge pages, since vm_pgoff is in units of small
1835                  * pages, we need to shift off the always 0 bits to get
1836                  * a useful offset.
1837                  */
1838                 BUG_ON(shift < PAGE_SHIFT);
1839                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1840                 off += (addr - vma->vm_start) >> shift;
1841                 return offset_il_node(pol, vma, off);
1842         } else
1843                 return interleave_nodes(pol);
1844 }
1845
1846 /*
1847  * Return the bit number of a random bit set in the nodemask.
1848  * (returns NUMA_NO_NODE if nodemask is empty)
1849  */
1850 int node_random(const nodemask_t *maskp)
1851 {
1852         int w, bit = NUMA_NO_NODE;
1853
1854         w = nodes_weight(*maskp);
1855         if (w)
1856                 bit = bitmap_ord_to_pos(maskp->bits,
1857                         get_random_int() % w, MAX_NUMNODES);
1858         return bit;
1859 }
1860
1861 #ifdef CONFIG_HUGETLBFS
1862 /*
1863  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1864  * @vma: virtual memory area whose policy is sought
1865  * @addr: address in @vma for shared policy lookup and interleave policy
1866  * @gfp_flags: for requested zone
1867  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1868  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1869  *
1870  * Returns a zonelist suitable for a huge page allocation and a pointer
1871  * to the struct mempolicy for conditional unref after allocation.
1872  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1873  * @nodemask for filtering the zonelist.
1874  *
1875  * Must be protected by read_mems_allowed_begin()
1876  */
1877 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1878                                 gfp_t gfp_flags, struct mempolicy **mpol,
1879                                 nodemask_t **nodemask)
1880 {
1881         struct zonelist *zl;
1882
1883         *mpol = get_vma_policy(current, vma, addr);
1884         *nodemask = NULL;       /* assume !MPOL_BIND */
1885
1886         if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1887                 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1888                                 huge_page_shift(hstate_vma(vma))), gfp_flags);
1889         } else {
1890                 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1891                 if ((*mpol)->mode == MPOL_BIND)
1892                         *nodemask = &(*mpol)->v.nodes;
1893         }
1894         return zl;
1895 }
1896
1897 /*
1898  * init_nodemask_of_mempolicy
1899  *
1900  * If the current task's mempolicy is "default" [NULL], return 'false'
1901  * to indicate default policy.  Otherwise, extract the policy nodemask
1902  * for 'bind' or 'interleave' policy into the argument nodemask, or
1903  * initialize the argument nodemask to contain the single node for
1904  * 'preferred' or 'local' policy and return 'true' to indicate presence
1905  * of non-default mempolicy.
1906  *
1907  * We don't bother with reference counting the mempolicy [mpol_get/put]
1908  * because the current task is examining it's own mempolicy and a task's
1909  * mempolicy is only ever changed by the task itself.
1910  *
1911  * N.B., it is the caller's responsibility to free a returned nodemask.
1912  */
1913 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1914 {
1915         struct mempolicy *mempolicy;
1916         int nid;
1917
1918         if (!(mask && current->mempolicy))
1919                 return false;
1920
1921         task_lock(current);
1922         mempolicy = current->mempolicy;
1923         switch (mempolicy->mode) {
1924         case MPOL_PREFERRED:
1925                 if (mempolicy->flags & MPOL_F_LOCAL)
1926                         nid = numa_node_id();
1927                 else
1928                         nid = mempolicy->v.preferred_node;
1929                 init_nodemask_of_node(mask, nid);
1930                 break;
1931
1932         case MPOL_BIND:
1933                 /* Fall through */
1934         case MPOL_INTERLEAVE:
1935                 *mask =  mempolicy->v.nodes;
1936                 break;
1937
1938         default:
1939                 BUG();
1940         }
1941         task_unlock(current);
1942
1943         return true;
1944 }
1945 #endif
1946
1947 /*
1948  * mempolicy_nodemask_intersects
1949  *
1950  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1951  * policy.  Otherwise, check for intersection between mask and the policy
1952  * nodemask for 'bind' or 'interleave' policy.  For 'perferred' or 'local'
1953  * policy, always return true since it may allocate elsewhere on fallback.
1954  *
1955  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1956  */
1957 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1958                                         const nodemask_t *mask)
1959 {
1960         struct mempolicy *mempolicy;
1961         bool ret = true;
1962
1963         if (!mask)
1964                 return ret;
1965         task_lock(tsk);
1966         mempolicy = tsk->mempolicy;
1967         if (!mempolicy)
1968                 goto out;
1969
1970         switch (mempolicy->mode) {
1971         case MPOL_PREFERRED:
1972                 /*
1973                  * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1974                  * allocate from, they may fallback to other nodes when oom.
1975                  * Thus, it's possible for tsk to have allocated memory from
1976                  * nodes in mask.
1977                  */
1978                 break;
1979         case MPOL_BIND:
1980         case MPOL_INTERLEAVE:
1981                 ret = nodes_intersects(mempolicy->v.nodes, *mask);
1982                 break;
1983         default:
1984                 BUG();
1985         }
1986 out:
1987         task_unlock(tsk);
1988         return ret;
1989 }
1990
1991 /* Allocate a page in interleaved policy.
1992    Own path because it needs to do special accounting. */
1993 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1994                                         unsigned nid)
1995 {
1996         struct zonelist *zl;
1997         struct page *page;
1998
1999         zl = node_zonelist(nid, gfp);
2000         page = __alloc_pages(gfp, order, zl);
2001         if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
2002                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
2003         return page;
2004 }
2005
2006 /**
2007  *      alloc_pages_vma - Allocate a page for a VMA.
2008  *
2009  *      @gfp:
2010  *      %GFP_USER    user allocation.
2011  *      %GFP_KERNEL  kernel allocations,
2012  *      %GFP_HIGHMEM highmem/user allocations,
2013  *      %GFP_FS      allocation should not call back into a file system.
2014  *      %GFP_ATOMIC  don't sleep.
2015  *
2016  *      @order:Order of the GFP allocation.
2017  *      @vma:  Pointer to VMA or NULL if not available.
2018  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
2019  *
2020  *      This function allocates a page from the kernel page pool and applies
2021  *      a NUMA policy associated with the VMA or the current process.
2022  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
2023  *      mm_struct of the VMA to prevent it from going away. Should be used for
2024  *      all allocations for pages that will be mapped into
2025  *      user space. Returns NULL when no page can be allocated.
2026  *
2027  *      Should be called with the mm_sem of the vma hold.
2028  */
2029 struct page *
2030 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2031                 unsigned long addr, int node)
2032 {
2033         struct mempolicy *pol;
2034         struct page *page;
2035         unsigned int cpuset_mems_cookie;
2036
2037 retry_cpuset:
2038         pol = get_vma_policy(current, vma, addr);
2039         cpuset_mems_cookie = read_mems_allowed_begin();
2040
2041         if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
2042                 unsigned nid;
2043
2044                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2045                 mpol_cond_put(pol);
2046                 page = alloc_page_interleave(gfp, order, nid);
2047                 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2048                         goto retry_cpuset;
2049
2050                 return page;
2051         }
2052         page = __alloc_pages_nodemask(gfp, order,
2053                                       policy_zonelist(gfp, pol, node),
2054                                       policy_nodemask(gfp, pol));
2055         if (unlikely(mpol_needs_cond_ref(pol)))
2056                 __mpol_put(pol);
2057         if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2058                 goto retry_cpuset;
2059         return page;
2060 }
2061
2062 /**
2063  *      alloc_pages_current - Allocate pages.
2064  *
2065  *      @gfp:
2066  *              %GFP_USER   user allocation,
2067  *              %GFP_KERNEL kernel allocation,
2068  *              %GFP_HIGHMEM highmem allocation,
2069  *              %GFP_FS     don't call back into a file system.
2070  *              %GFP_ATOMIC don't sleep.
2071  *      @order: Power of two of allocation size in pages. 0 is a single page.
2072  *
2073  *      Allocate a page from the kernel page pool.  When not in
2074  *      interrupt context and apply the current process NUMA policy.
2075  *      Returns NULL when no page can be allocated.
2076  *
2077  *      Don't call cpuset_update_task_memory_state() unless
2078  *      1) it's ok to take cpuset_sem (can WAIT), and
2079  *      2) allocating for current task (not interrupt).
2080  */
2081 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2082 {
2083         struct mempolicy *pol = get_task_policy(current);
2084         struct page *page;
2085         unsigned int cpuset_mems_cookie;
2086
2087         if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
2088                 pol = &default_policy;
2089
2090 retry_cpuset:
2091         cpuset_mems_cookie = read_mems_allowed_begin();
2092
2093         /*
2094          * No reference counting needed for current->mempolicy
2095          * nor system default_policy
2096          */
2097         if (pol->mode == MPOL_INTERLEAVE)
2098                 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2099         else
2100                 page = __alloc_pages_nodemask(gfp, order,
2101                                 policy_zonelist(gfp, pol, numa_node_id()),
2102                                 policy_nodemask(gfp, pol));
2103
2104         if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2105                 goto retry_cpuset;
2106
2107         return page;
2108 }
2109 EXPORT_SYMBOL(alloc_pages_current);
2110
2111 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2112 {
2113         struct mempolicy *pol = mpol_dup(vma_policy(src));
2114
2115         if (IS_ERR(pol))
2116                 return PTR_ERR(pol);
2117         dst->vm_policy = pol;
2118         return 0;
2119 }
2120
2121 /*
2122  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2123  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
2124  * with the mems_allowed returned by cpuset_mems_allowed().  This
2125  * keeps mempolicies cpuset relative after its cpuset moves.  See
2126  * further kernel/cpuset.c update_nodemask().
2127  *
2128  * current's mempolicy may be rebinded by the other task(the task that changes
2129  * cpuset's mems), so we needn't do rebind work for current task.
2130  */
2131
2132 /* Slow path of a mempolicy duplicate */
2133 struct mempolicy *__mpol_dup(struct mempolicy *old)
2134 {
2135         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2136
2137         if (!new)
2138                 return ERR_PTR(-ENOMEM);
2139
2140         /* task's mempolicy is protected by alloc_lock */
2141         if (old == current->mempolicy) {
2142                 task_lock(current);
2143                 *new = *old;
2144                 task_unlock(current);
2145         } else
2146                 *new = *old;
2147
2148         rcu_read_lock();
2149         if (current_cpuset_is_being_rebound()) {
2150                 nodemask_t mems = cpuset_mems_allowed(current);
2151                 if (new->flags & MPOL_F_REBINDING)
2152                         mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2153                 else
2154                         mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2155         }
2156         rcu_read_unlock();
2157         atomic_set(&new->refcnt, 1);
2158         return new;
2159 }
2160
2161 /* Slow path of a mempolicy comparison */
2162 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2163 {
2164         if (!a || !b)
2165                 return false;
2166         if (a->mode != b->mode)
2167                 return false;
2168         if (a->flags != b->flags)
2169                 return false;
2170         if (mpol_store_user_nodemask(a))
2171                 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2172                         return false;
2173
2174         switch (a->mode) {
2175         case MPOL_BIND:
2176                 /* Fall through */
2177         case MPOL_INTERLEAVE:
2178                 return !!nodes_equal(a->v.nodes, b->v.nodes);
2179         case MPOL_PREFERRED:
2180                 return a->v.preferred_node == b->v.preferred_node;
2181         default:
2182                 BUG();
2183                 return false;
2184         }
2185 }
2186
2187 /*
2188  * Shared memory backing store policy support.
2189  *
2190  * Remember policies even when nobody has shared memory mapped.
2191  * The policies are kept in Red-Black tree linked from the inode.
2192  * They are protected by the sp->lock spinlock, which should be held
2193  * for any accesses to the tree.
2194  */
2195
2196 /* lookup first element intersecting start-end */
2197 /* Caller holds sp->lock */
2198 static struct sp_node *
2199 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2200 {
2201         struct rb_node *n = sp->root.rb_node;
2202
2203         while (n) {
2204                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2205
2206                 if (start >= p->end)
2207                         n = n->rb_right;
2208                 else if (end <= p->start)
2209                         n = n->rb_left;
2210                 else
2211                         break;
2212         }
2213         if (!n)
2214                 return NULL;
2215         for (;;) {
2216                 struct sp_node *w = NULL;
2217                 struct rb_node *prev = rb_prev(n);
2218                 if (!prev)
2219                         break;
2220                 w = rb_entry(prev, struct sp_node, nd);
2221                 if (w->end <= start)
2222                         break;
2223                 n = prev;
2224         }
2225         return rb_entry(n, struct sp_node, nd);
2226 }
2227
2228 /* Insert a new shared policy into the list. */
2229 /* Caller holds sp->lock */
2230 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2231 {
2232         struct rb_node **p = &sp->root.rb_node;
2233         struct rb_node *parent = NULL;
2234         struct sp_node *nd;
2235
2236         while (*p) {
2237                 parent = *p;
2238                 nd = rb_entry(parent, struct sp_node, nd);
2239                 if (new->start < nd->start)
2240                         p = &(*p)->rb_left;
2241                 else if (new->end > nd->end)
2242                         p = &(*p)->rb_right;
2243                 else
2244                         BUG();
2245         }
2246         rb_link_node(&new->nd, parent, p);
2247         rb_insert_color(&new->nd, &sp->root);
2248         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2249                  new->policy ? new->policy->mode : 0);
2250 }
2251
2252 /* Find shared policy intersecting idx */
2253 struct mempolicy *
2254 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2255 {
2256         struct mempolicy *pol = NULL;
2257         struct sp_node *sn;
2258
2259         if (!sp->root.rb_node)
2260                 return NULL;
2261         spin_lock(&sp->lock);
2262         sn = sp_lookup(sp, idx, idx+1);
2263         if (sn) {
2264                 mpol_get(sn->policy);
2265                 pol = sn->policy;
2266         }
2267         spin_unlock(&sp->lock);
2268         return pol;
2269 }
2270
2271 static void sp_free(struct sp_node *n)
2272 {
2273         mpol_put(n->policy);
2274         kmem_cache_free(sn_cache, n);
2275 }
2276
2277 /**
2278  * mpol_misplaced - check whether current page node is valid in policy
2279  *
2280  * @page: page to be checked
2281  * @vma: vm area where page mapped
2282  * @addr: virtual address where page mapped
2283  *
2284  * Lookup current policy node id for vma,addr and "compare to" page's
2285  * node id.
2286  *
2287  * Returns:
2288  *      -1      - not misplaced, page is in the right node
2289  *      node    - node id where the page should be
2290  *
2291  * Policy determination "mimics" alloc_page_vma().
2292  * Called from fault path where we know the vma and faulting address.
2293  */
2294 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2295 {
2296         struct mempolicy *pol;
2297         struct zone *zone;
2298         int curnid = page_to_nid(page);
2299         unsigned long pgoff;
2300         int thiscpu = raw_smp_processor_id();
2301         int thisnid = cpu_to_node(thiscpu);
2302         int polnid = -1;
2303         int ret = -1;
2304
2305         BUG_ON(!vma);
2306
2307         pol = get_vma_policy(current, vma, addr);
2308         if (!(pol->flags & MPOL_F_MOF))
2309                 goto out;
2310
2311         switch (pol->mode) {
2312         case MPOL_INTERLEAVE:
2313                 BUG_ON(addr >= vma->vm_end);
2314                 BUG_ON(addr < vma->vm_start);
2315
2316                 pgoff = vma->vm_pgoff;
2317                 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2318                 polnid = offset_il_node(pol, vma, pgoff);
2319                 break;
2320
2321         case MPOL_PREFERRED:
2322                 if (pol->flags & MPOL_F_LOCAL)
2323                         polnid = numa_node_id();
2324                 else
2325                         polnid = pol->v.preferred_node;
2326                 break;
2327
2328         case MPOL_BIND:
2329                 /*
2330                  * allows binding to multiple nodes.
2331                  * use current page if in policy nodemask,
2332                  * else select nearest allowed node, if any.
2333                  * If no allowed nodes, use current [!misplaced].
2334                  */
2335                 if (node_isset(curnid, pol->v.nodes))
2336                         goto out;
2337                 (void)first_zones_zonelist(
2338                                 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2339                                 gfp_zone(GFP_HIGHUSER),
2340                                 &pol->v.nodes, &zone);
2341                 polnid = zone->node;
2342                 break;
2343
2344         default:
2345                 BUG();
2346         }
2347
2348         /* Migrate the page towards the node whose CPU is referencing it */
2349         if (pol->flags & MPOL_F_MORON) {
2350                 polnid = thisnid;
2351
2352                 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2353                         goto out;
2354         }
2355
2356         if (curnid != polnid)
2357                 ret = polnid;
2358 out:
2359         mpol_cond_put(pol);
2360
2361         return ret;
2362 }
2363
2364 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2365 {
2366         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2367         rb_erase(&n->nd, &sp->root);
2368         sp_free(n);
2369 }
2370
2371 static void sp_node_init(struct sp_node *node, unsigned long start,
2372                         unsigned long end, struct mempolicy *pol)
2373 {
2374         node->start = start;
2375         node->end = end;
2376         node->policy = pol;
2377 }
2378
2379 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2380                                 struct mempolicy *pol)
2381 {
2382         struct sp_node *n;
2383         struct mempolicy *newpol;
2384
2385         n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2386         if (!n)
2387                 return NULL;
2388
2389         newpol = mpol_dup(pol);
2390         if (IS_ERR(newpol)) {
2391                 kmem_cache_free(sn_cache, n);
2392                 return NULL;
2393         }
2394         newpol->flags |= MPOL_F_SHARED;
2395         sp_node_init(n, start, end, newpol);
2396
2397         return n;
2398 }
2399
2400 /* Replace a policy range. */
2401 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2402                                  unsigned long end, struct sp_node *new)
2403 {
2404         struct sp_node *n;
2405         struct sp_node *n_new = NULL;
2406         struct mempolicy *mpol_new = NULL;
2407         int ret = 0;
2408
2409 restart:
2410         spin_lock(&sp->lock);
2411         n = sp_lookup(sp, start, end);
2412         /* Take care of old policies in the same range. */
2413         while (n && n->start < end) {
2414                 struct rb_node *next = rb_next(&n->nd);
2415                 if (n->start >= start) {
2416                         if (n->end <= end)
2417                                 sp_delete(sp, n);
2418                         else
2419                                 n->start = end;
2420                 } else {
2421                         /* Old policy spanning whole new range. */
2422                         if (n->end > end) {
2423                                 if (!n_new)
2424                                         goto alloc_new;
2425
2426                                 *mpol_new = *n->policy;
2427                                 atomic_set(&mpol_new->refcnt, 1);
2428                                 sp_node_init(n_new, end, n->end, mpol_new);
2429                                 n->end = start;
2430                                 sp_insert(sp, n_new);
2431                                 n_new = NULL;
2432                                 mpol_new = NULL;
2433                                 break;
2434                         } else
2435                                 n->end = start;
2436                 }
2437                 if (!next)
2438                         break;
2439                 n = rb_entry(next, struct sp_node, nd);
2440         }
2441         if (new)
2442                 sp_insert(sp, new);
2443         spin_unlock(&sp->lock);
2444         ret = 0;
2445
2446 err_out:
2447         if (mpol_new)
2448                 mpol_put(mpol_new);
2449         if (n_new)
2450                 kmem_cache_free(sn_cache, n_new);
2451
2452         return ret;
2453
2454 alloc_new:
2455         spin_unlock(&sp->lock);
2456         ret = -ENOMEM;
2457         n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2458         if (!n_new)
2459                 goto err_out;
2460         mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2461         if (!mpol_new)
2462                 goto err_out;
2463         goto restart;
2464 }
2465
2466 /**
2467  * mpol_shared_policy_init - initialize shared policy for inode
2468  * @sp: pointer to inode shared policy
2469  * @mpol:  struct mempolicy to install
2470  *
2471  * Install non-NULL @mpol in inode's shared policy rb-tree.
2472  * On entry, the current task has a reference on a non-NULL @mpol.
2473  * This must be released on exit.
2474  * This is called at get_inode() calls and we can use GFP_KERNEL.
2475  */
2476 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2477 {
2478         int ret;
2479
2480         sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2481         spin_lock_init(&sp->lock);
2482
2483         if (mpol) {
2484                 struct vm_area_struct pvma;
2485                 struct mempolicy *new;
2486                 NODEMASK_SCRATCH(scratch);
2487
2488                 if (!scratch)
2489                         goto put_mpol;
2490                 /* contextualize the tmpfs mount point mempolicy */
2491                 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2492                 if (IS_ERR(new))
2493                         goto free_scratch; /* no valid nodemask intersection */
2494
2495                 task_lock(current);
2496                 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2497                 task_unlock(current);
2498                 if (ret)
2499                         goto put_new;
2500
2501                 /* Create pseudo-vma that contains just the policy */
2502                 memset(&pvma, 0, sizeof(struct vm_area_struct));
2503                 pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2504                 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2505
2506 put_new:
2507                 mpol_put(new);                  /* drop initial ref */
2508 free_scratch:
2509                 NODEMASK_SCRATCH_FREE(scratch);
2510 put_mpol:
2511                 mpol_put(mpol); /* drop our incoming ref on sb mpol */
2512         }
2513 }
2514
2515 int mpol_set_shared_policy(struct shared_policy *info,
2516                         struct vm_area_struct *vma, struct mempolicy *npol)
2517 {
2518         int err;
2519         struct sp_node *new = NULL;
2520         unsigned long sz = vma_pages(vma);
2521
2522         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2523                  vma->vm_pgoff,
2524                  sz, npol ? npol->mode : -1,
2525                  npol ? npol->flags : -1,
2526                  npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2527
2528         if (npol) {
2529                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2530                 if (!new)
2531                         return -ENOMEM;
2532         }
2533         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2534         if (err && new)
2535                 sp_free(new);
2536         return err;
2537 }
2538
2539 /* Free a backing policy store on inode delete. */
2540 void mpol_free_shared_policy(struct shared_policy *p)
2541 {
2542         struct sp_node *n;
2543         struct rb_node *next;
2544
2545         if (!p->root.rb_node)
2546                 return;
2547         spin_lock(&p->lock);
2548         next = rb_first(&p->root);
2549         while (next) {
2550                 n = rb_entry(next, struct sp_node, nd);
2551                 next = rb_next(&n->nd);
2552                 sp_delete(p, n);
2553         }
2554         spin_unlock(&p->lock);
2555 }
2556
2557 #ifdef CONFIG_NUMA_BALANCING
2558 static int __initdata numabalancing_override;
2559
2560 static void __init check_numabalancing_enable(void)
2561 {
2562         bool numabalancing_default = false;
2563
2564         if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2565                 numabalancing_default = true;
2566
2567         /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2568         if (numabalancing_override)
2569                 set_numabalancing_state(numabalancing_override == 1);
2570
2571         if (nr_node_ids > 1 && !numabalancing_override) {
2572                 pr_info("%s automatic NUMA balancing. "
2573                         "Configure with numa_balancing= or the "
2574                         "kernel.numa_balancing sysctl",
2575                         numabalancing_default ? "Enabling" : "Disabling");
2576                 set_numabalancing_state(numabalancing_default);
2577         }
2578 }
2579
2580 static int __init setup_numabalancing(char *str)
2581 {
2582         int ret = 0;
2583         if (!str)
2584                 goto out;
2585
2586         if (!strcmp(str, "enable")) {
2587                 numabalancing_override = 1;
2588                 ret = 1;
2589         } else if (!strcmp(str, "disable")) {
2590                 numabalancing_override = -1;
2591                 ret = 1;
2592         }
2593 out:
2594         if (!ret)
2595                 pr_warn("Unable to parse numa_balancing=\n");
2596
2597         return ret;
2598 }
2599 __setup("numa_balancing=", setup_numabalancing);
2600 #else
2601 static inline void __init check_numabalancing_enable(void)
2602 {
2603 }
2604 #endif /* CONFIG_NUMA_BALANCING */
2605
2606 /* assumes fs == KERNEL_DS */
2607 void __init numa_policy_init(void)
2608 {
2609         nodemask_t interleave_nodes;
2610         unsigned long largest = 0;
2611         int nid, prefer = 0;
2612
2613         policy_cache = kmem_cache_create("numa_policy",
2614                                          sizeof(struct mempolicy),
2615                                          0, SLAB_PANIC, NULL);
2616
2617         sn_cache = kmem_cache_create("shared_policy_node",
2618                                      sizeof(struct sp_node),
2619                                      0, SLAB_PANIC, NULL);
2620
2621         for_each_node(nid) {
2622                 preferred_node_policy[nid] = (struct mempolicy) {
2623                         .refcnt = ATOMIC_INIT(1),
2624                         .mode = MPOL_PREFERRED,
2625                         .flags = MPOL_F_MOF | MPOL_F_MORON,
2626                         .v = { .preferred_node = nid, },
2627                 };
2628         }
2629
2630         /*
2631          * Set interleaving policy for system init. Interleaving is only
2632          * enabled across suitably sized nodes (default is >= 16MB), or
2633          * fall back to the largest node if they're all smaller.
2634          */
2635         nodes_clear(interleave_nodes);
2636         for_each_node_state(nid, N_MEMORY) {
2637                 unsigned long total_pages = node_present_pages(nid);
2638
2639                 /* Preserve the largest node */
2640                 if (largest < total_pages) {
2641                         largest = total_pages;
2642                         prefer = nid;
2643                 }
2644
2645                 /* Interleave this node? */
2646                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2647                         node_set(nid, interleave_nodes);
2648         }
2649
2650         /* All too small, use the largest */
2651         if (unlikely(nodes_empty(interleave_nodes)))
2652                 node_set(prefer, interleave_nodes);
2653
2654         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2655                 pr_err("%s: interleaving failed\n", __func__);
2656
2657         check_numabalancing_enable();
2658 }
2659
2660 /* Reset policy of current process to default */
2661 void numa_default_policy(void)
2662 {
2663         do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2664 }
2665
2666 /*
2667  * Parse and format mempolicy from/to strings
2668  */
2669
2670 /*
2671  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2672  */
2673 static const char * const policy_modes[] =
2674 {
2675         [MPOL_DEFAULT]    = "default",
2676         [MPOL_PREFERRED]  = "prefer",
2677         [MPOL_BIND]       = "bind",
2678         [MPOL_INTERLEAVE] = "interleave",
2679         [MPOL_LOCAL]      = "local",
2680 };
2681
2682
2683 #ifdef CONFIG_TMPFS
2684 /**
2685  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2686  * @str:  string containing mempolicy to parse
2687  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2688  *
2689  * Format of input:
2690  *      <mode>[=<flags>][:<nodelist>]
2691  *
2692  * On success, returns 0, else 1
2693  */
2694 int mpol_parse_str(char *str, struct mempolicy **mpol)
2695 {
2696         struct mempolicy *new = NULL;
2697         unsigned short mode;
2698         unsigned short mode_flags;
2699         nodemask_t nodes;
2700         char *nodelist = strchr(str, ':');
2701         char *flags = strchr(str, '=');
2702         int err = 1;
2703
2704         if (nodelist) {
2705                 /* NUL-terminate mode or flags string */
2706                 *nodelist++ = '\0';
2707                 if (nodelist_parse(nodelist, nodes))
2708                         goto out;
2709                 if (!nodes_subset(nodes, node_states[N_MEMORY]))
2710                         goto out;
2711         } else
2712                 nodes_clear(nodes);
2713
2714         if (flags)
2715                 *flags++ = '\0';        /* terminate mode string */
2716
2717         for (mode = 0; mode < MPOL_MAX; mode++) {
2718                 if (!strcmp(str, policy_modes[mode])) {
2719                         break;
2720                 }
2721         }
2722         if (mode >= MPOL_MAX)
2723                 goto out;
2724
2725         switch (mode) {
2726         case MPOL_PREFERRED:
2727                 /*
2728                  * Insist on a nodelist of one node only
2729                  */
2730                 if (nodelist) {
2731                         char *rest = nodelist;
2732                         while (isdigit(*rest))
2733                                 rest++;
2734                         if (*rest)
2735                                 goto out;
2736                 }
2737                 break;
2738         case MPOL_INTERLEAVE:
2739                 /*
2740                  * Default to online nodes with memory if no nodelist
2741                  */
2742                 if (!nodelist)
2743                         nodes = node_states[N_MEMORY];
2744                 break;
2745         case MPOL_LOCAL:
2746                 /*
2747                  * Don't allow a nodelist;  mpol_new() checks flags
2748                  */
2749                 if (nodelist)
2750                         goto out;
2751                 mode = MPOL_PREFERRED;
2752                 break;
2753         case MPOL_DEFAULT:
2754                 /*
2755                  * Insist on a empty nodelist
2756                  */
2757                 if (!nodelist)
2758                         err = 0;
2759                 goto out;
2760         case MPOL_BIND:
2761                 /*
2762                  * Insist on a nodelist
2763                  */
2764                 if (!nodelist)
2765                         goto out;
2766         }
2767
2768         mode_flags = 0;
2769         if (flags) {
2770                 /*
2771                  * Currently, we only support two mutually exclusive
2772                  * mode flags.
2773                  */
2774                 if (!strcmp(flags, "static"))
2775                         mode_flags |= MPOL_F_STATIC_NODES;
2776                 else if (!strcmp(flags, "relative"))
2777                         mode_flags |= MPOL_F_RELATIVE_NODES;
2778                 else
2779                         goto out;
2780         }
2781
2782         new = mpol_new(mode, mode_flags, &nodes);
2783         if (IS_ERR(new))
2784                 goto out;
2785
2786         /*
2787          * Save nodes for mpol_to_str() to show the tmpfs mount options
2788          * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2789          */
2790         if (mode != MPOL_PREFERRED)
2791                 new->v.nodes = nodes;
2792         else if (nodelist)
2793                 new->v.preferred_node = first_node(nodes);
2794         else
2795                 new->flags |= MPOL_F_LOCAL;
2796
2797         /*
2798          * Save nodes for contextualization: this will be used to "clone"
2799          * the mempolicy in a specific context [cpuset] at a later time.
2800          */
2801         new->w.user_nodemask = nodes;
2802
2803         err = 0;
2804
2805 out:
2806         /* Restore string for error message */
2807         if (nodelist)
2808                 *--nodelist = ':';
2809         if (flags)
2810                 *--flags = '=';
2811         if (!err)
2812                 *mpol = new;
2813         return err;
2814 }
2815 #endif /* CONFIG_TMPFS */
2816
2817 /**
2818  * mpol_to_str - format a mempolicy structure for printing
2819  * @buffer:  to contain formatted mempolicy string
2820  * @maxlen:  length of @buffer
2821  * @pol:  pointer to mempolicy to be formatted
2822  *
2823  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2824  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2825  * longest flag, "relative", and to display at least a few node ids.
2826  */
2827 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2828 {
2829         char *p = buffer;
2830         nodemask_t nodes = NODE_MASK_NONE;
2831         unsigned short mode = MPOL_DEFAULT;
2832         unsigned short flags = 0;
2833
2834         if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2835                 mode = pol->mode;
2836                 flags = pol->flags;
2837         }
2838
2839         switch (mode) {
2840         case MPOL_DEFAULT:
2841                 break;
2842         case MPOL_PREFERRED:
2843                 if (flags & MPOL_F_LOCAL)
2844                         mode = MPOL_LOCAL;
2845                 else
2846                         node_set(pol->v.preferred_node, nodes);
2847                 break;
2848         case MPOL_BIND:
2849         case MPOL_INTERLEAVE:
2850                 nodes = pol->v.nodes;
2851                 break;
2852         default:
2853                 WARN_ON_ONCE(1);
2854                 snprintf(p, maxlen, "unknown");
2855                 return;
2856         }
2857
2858         p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2859
2860         if (flags & MPOL_MODE_FLAGS) {
2861                 p += snprintf(p, buffer + maxlen - p, "=");
2862
2863                 /*
2864                  * Currently, the only defined flags are mutually exclusive
2865                  */
2866                 if (flags & MPOL_F_STATIC_NODES)
2867                         p += snprintf(p, buffer + maxlen - p, "static");
2868                 else if (flags & MPOL_F_RELATIVE_NODES)
2869                         p += snprintf(p, buffer + maxlen - p, "relative");
2870         }
2871
2872         if (!nodes_empty(nodes)) {
2873                 p += snprintf(p, buffer + maxlen - p, ":");
2874                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2875         }
2876 }