mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem, kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel does not always handle that gracefully.
  66 */
  67
  68 #include <linux/mempolicy.h>
  69 #include <linux/mm.h>
  70 #include <linux/highmem.h>
  71 #include <linux/hugetlb.h>
  72 #include <linux/kernel.h>
  73 #include <linux/sched.h>
  74 #include <linux/nodemask.h>
  75 #include <linux/cpuset.h>
  76 #include <linux/gfp.h>
  77 #include <linux/slab.h>
  78 #include <linux/string.h>
  79 #include <linux/module.h>
  80 #include <linux/nsproxy.h>
  81 #include <linux/interrupt.h>
  82 #include <linux/init.h>
  83 #include <linux/compat.h>
  84 #include <linux/swap.h>
  85 #include <linux/seq_file.h>
  86 #include <linux/proc_fs.h>
  87 #include <linux/migrate.h>
  88 #include <linux/ksm.h>
  89 #include <linux/rmap.h>
  90 #include <linux/security.h>
  91 #include <linux/syscalls.h>
  92 #include <linux/ctype.h>
  93 #include <linux/mm_inline.h>
  94
  95 #include <asm/tlbflush.h>
  96 #include <asm/uaccess.h>
  97
  98 #include "internal.h"
  99
 100 /* Internal flags */
 101 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
 102 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 103 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
 104
 105 static struct kmem_cache *policy_cache;
 106 static struct kmem_cache *sn_cache;
 107
 108 /* Highest zone. An specific allocation for a zone below that is not
 109    policied. */
 110 enum zone_type policy_zone = 0;
 111
 112 /*
 113  * run-time system-wide default policy => local allocation
 114  */
 115 struct mempolicy default_policy = {
 116         .refcnt = ATOMIC_INIT(1), /* never free it */
 117         .mode = MPOL_PREFERRED,
 118         .flags = MPOL_F_LOCAL,
 119 };
 120
 121 static const struct mempolicy_operations {
 122         int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 123         void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
 124 } mpol_ops[MPOL_MAX];
 125
 126 /* Check that the nodemask contains at least one populated zone */
 127 static int is_valid_nodemask(const nodemask_t *nodemask)
 128 {
 129         int nd, k;
 130
 131         /* Check that there is something useful in this mask */
 132         k = policy_zone;
 133
 134         for_each_node_mask(nd, *nodemask) {
 135                 struct zone *z;
 136
 137                 for (k = 0; k <= policy_zone; k++) {
 138                         z = &NODE_DATA(nd)->node_zones[k];
 139                         if (z->present_pages > 0)
 140                                 return 1;
 141                 }
 142         }
 143
 144         return 0;
 145 }
 146
 147 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 148 {
 149         return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
 150 }
 151
 152 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 153                                    const nodemask_t *rel)
 154 {
 155         nodemask_t tmp;
 156         nodes_fold(tmp, *orig, nodes_weight(*rel));
 157         nodes_onto(*ret, tmp, *rel);
 158 }
 159
 160 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 161 {
 162         if (nodes_empty(*nodes))
 163                 return -EINVAL;
 164         pol->v.nodes = *nodes;
 165         return 0;
 166 }
 167
 168 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 169 {
 170         if (!nodes)
 171                 pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 172         else if (nodes_empty(*nodes))
 173                 return -EINVAL;                 /*  no allowed nodes */
 174         else
 175                 pol->v.preferred_node = first_node(*nodes);
 176         return 0;
 177 }
 178
 179 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 180 {
 181         if (!is_valid_nodemask(nodes))
 182                 return -EINVAL;
 183         pol->v.nodes = *nodes;
 184         return 0;
 185 }
 186
 187 /*
 188  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 189  * any, for the new policy.  mpol_new() has already validated the nodes
 190  * parameter with respect to the policy mode and flags.  But, we need to
 191  * handle an empty nodemask with MPOL_PREFERRED here.
 192  *
 193  * Must be called holding task's alloc_lock to protect task's mems_allowed
 194  * and mempolicy.  May also be called holding the mmap_semaphore for write.
 195  */
 196 static int mpol_set_nodemask(struct mempolicy *pol,
 197                      const nodemask_t *nodes, struct nodemask_scratch *nsc)
 198 {
 199         int ret;
 200
 201         /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 202         if (pol == NULL)
 203                 return 0;
 204         /* Check N_HIGH_MEMORY */
 205         nodes_and(nsc->mask1,
 206                   cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
 207
 208         VM_BUG_ON(!nodes);
 209         if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 210                 nodes = NULL;   /* explicit local allocation */
 211         else {
 212                 if (pol->flags & MPOL_F_RELATIVE_NODES)
 213                         mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
 214                 else
 215                         nodes_and(nsc->mask2, *nodes, nsc->mask1);
 216
 217                 if (mpol_store_user_nodemask(pol))
 218                         pol->w.user_nodemask = *nodes;
 219                 else
 220                         pol->w.cpuset_mems_allowed =
 221                                                 cpuset_current_mems_allowed;
 222         }
 223
 224         if (nodes)
 225                 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 226         else
 227                 ret = mpol_ops[pol->mode].create(pol, NULL);
 228         return ret;
 229 }
 230
 231 /*
 232  * This function just creates a new policy, does some check and simple
 233  * initialization. You must invoke mpol_set_nodemask() to set nodes.
 234  */
 235 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 236                                   nodemask_t *nodes)
 237 {
 238         struct mempolicy *policy;
 239
 240         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 241                  mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
 242
 243         if (mode == MPOL_DEFAULT) {
 244                 if (nodes && !nodes_empty(*nodes))
 245                         return ERR_PTR(-EINVAL);
 246                 return NULL;    /* simply delete any existing policy */
 247         }
 248         VM_BUG_ON(!nodes);
 249
 250         /*
 251          * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 252          * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 253          * All other modes require a valid pointer to a non-empty nodemask.
 254          */
 255         if (mode == MPOL_PREFERRED) {
 256                 if (nodes_empty(*nodes)) {
 257                         if (((flags & MPOL_F_STATIC_NODES) ||
 258                              (flags & MPOL_F_RELATIVE_NODES)))
 259                                 return ERR_PTR(-EINVAL);
 260                 }
 261         } else if (nodes_empty(*nodes))
 262                 return ERR_PTR(-EINVAL);
 263         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 264         if (!policy)
 265                 return ERR_PTR(-ENOMEM);
 266         atomic_set(&policy->refcnt, 1);
 267         policy->mode = mode;
 268         policy->flags = flags;
 269
 270         return policy;
 271 }
 272
 273 /* Slow path of a mpol destructor. */
 274 void __mpol_put(struct mempolicy *p)
 275 {
 276         if (!atomic_dec_and_test(&p->refcnt))
 277                 return;
 278         kmem_cache_free(policy_cache, p);
 279 }
 280
 281 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 282 {
 283 }
 284
 285 static void mpol_rebind_nodemask(struct mempolicy *pol,
 286                                  const nodemask_t *nodes)
 287 {
 288         nodemask_t tmp;
 289
 290         if (pol->flags & MPOL_F_STATIC_NODES)
 291                 nodes_and(tmp, pol->w.user_nodemask, *nodes);
 292         else if (pol->flags & MPOL_F_RELATIVE_NODES)
 293                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 294         else {
 295                 nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
 296                             *nodes);
 297                 pol->w.cpuset_mems_allowed = *nodes;
 298         }
 299
 300         pol->v.nodes = tmp;
 301         if (!node_isset(current->il_next, tmp)) {
 302                 current->il_next = next_node(current->il_next, tmp);
 303                 if (current->il_next >= MAX_NUMNODES)
 304                         current->il_next = first_node(tmp);
 305                 if (current->il_next >= MAX_NUMNODES)
 306                         current->il_next = numa_node_id();
 307         }
 308 }
 309
 310 static void mpol_rebind_preferred(struct mempolicy *pol,
 311                                   const nodemask_t *nodes)
 312 {
 313         nodemask_t tmp;
 314
 315         if (pol->flags & MPOL_F_STATIC_NODES) {
 316                 int node = first_node(pol->w.user_nodemask);
 317
 318                 if (node_isset(node, *nodes)) {
 319                         pol->v.preferred_node = node;
 320                         pol->flags &= ~MPOL_F_LOCAL;
 321                 } else
 322                         pol->flags |= MPOL_F_LOCAL;
 323         } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 324                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 325                 pol->v.preferred_node = first_node(tmp);
 326         } else if (!(pol->flags & MPOL_F_LOCAL)) {
 327                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
 328                                                    pol->w.cpuset_mems_allowed,
 329                                                    *nodes);
 330                 pol->w.cpuset_mems_allowed = *nodes;
 331         }
 332 }
 333
 334 /* Migrate a policy to a different set of nodes */
 335 static void mpol_rebind_policy(struct mempolicy *pol,
 336                                const nodemask_t *newmask)
 337 {
 338         if (!pol)
 339                 return;
 340         if (!mpol_store_user_nodemask(pol) &&
 341             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 342                 return;
 343         mpol_ops[pol->mode].rebind(pol, newmask);
 344 }
 345
 346 /*
 347  * Wrapper for mpol_rebind_policy() that just requires task
 348  * pointer, and updates task mempolicy.
 349  *
 350  * Called with task's alloc_lock held.
 351  */
 352
 353 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 354 {
 355         mpol_rebind_policy(tsk->mempolicy, new);
 356 }
 357
 358 /*
 359  * Rebind each vma in mm to new nodemask.
 360  *
 361  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 362  */
 363
 364 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 365 {
 366         struct vm_area_struct *vma;
 367
 368         down_write(&mm->mmap_sem);
 369         for (vma = mm->mmap; vma; vma = vma->vm_next)
 370                 mpol_rebind_policy(vma->vm_policy, new);
 371         up_write(&mm->mmap_sem);
 372 }
 373
 374 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 375         [MPOL_DEFAULT] = {
 376                 .rebind = mpol_rebind_default,
 377         },
 378         [MPOL_INTERLEAVE] = {
 379                 .create = mpol_new_interleave,
 380                 .rebind = mpol_rebind_nodemask,
 381         },
 382         [MPOL_PREFERRED] = {
 383                 .create = mpol_new_preferred,
 384                 .rebind = mpol_rebind_preferred,
 385         },
 386         [MPOL_BIND] = {
 387                 .create = mpol_new_bind,
 388                 .rebind = mpol_rebind_nodemask,
 389         },
 390 };
 391
 392 static void gather_stats(struct page *, void *, int pte_dirty);
 393 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 394                                 unsigned long flags);
 395
 396 /* Scan through pages checking if pages follow certain conditions. */
 397 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 398                 unsigned long addr, unsigned long end,
 399                 const nodemask_t *nodes, unsigned long flags,
 400                 void *private)
 401 {
 402         pte_t *orig_pte;
 403         pte_t *pte;
 404         spinlock_t *ptl;
 405
 406         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 407         do {
 408                 struct page *page;
 409                 int nid;
 410
 411                 if (!pte_present(*pte))
 412                         continue;
 413                 page = vm_normal_page(vma, addr, *pte);
 414                 if (!page)
 415                         continue;
 416                 /*
 417                  * vm_normal_page() filters out zero pages, but there might
 418                  * still be PageReserved pages to skip, perhaps in a VDSO.
 419                  * And we cannot move PageKsm pages sensibly or safely yet.
 420                  */
 421                 if (PageReserved(page) || PageKsm(page))
 422                         continue;
 423                 nid = page_to_nid(page);
 424                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 425                         continue;
 426
 427                 if (flags & MPOL_MF_STATS)
 428                         gather_stats(page, private, pte_dirty(*pte));
 429                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 430                         migrate_page_add(page, private, flags);
 431                 else
 432                         break;
 433         } while (pte++, addr += PAGE_SIZE, addr != end);
 434         pte_unmap_unlock(orig_pte, ptl);
 435         return addr != end;
 436 }
 437
 438 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 439                 unsigned long addr, unsigned long end,
 440                 const nodemask_t *nodes, unsigned long flags,
 441                 void *private)
 442 {
 443         pmd_t *pmd;
 444         unsigned long next;
 445
 446         pmd = pmd_offset(pud, addr);
 447         do {
 448                 next = pmd_addr_end(addr, end);
 449                 if (pmd_none_or_clear_bad(pmd))
 450                         continue;
 451                 if (check_pte_range(vma, pmd, addr, next, nodes,
 452                                     flags, private))
 453                         return -EIO;
 454         } while (pmd++, addr = next, addr != end);
 455         return 0;
 456 }
 457
 458 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 459                 unsigned long addr, unsigned long end,
 460                 const nodemask_t *nodes, unsigned long flags,
 461                 void *private)
 462 {
 463         pud_t *pud;
 464         unsigned long next;
 465
 466         pud = pud_offset(pgd, addr);
 467         do {
 468                 next = pud_addr_end(addr, end);
 469                 if (pud_none_or_clear_bad(pud))
 470                         continue;
 471                 if (check_pmd_range(vma, pud, addr, next, nodes,
 472                                     flags, private))
 473                         return -EIO;
 474         } while (pud++, addr = next, addr != end);
 475         return 0;
 476 }
 477
 478 static inline int check_pgd_range(struct vm_area_struct *vma,
 479                 unsigned long addr, unsigned long end,
 480                 const nodemask_t *nodes, unsigned long flags,
 481                 void *private)
 482 {
 483         pgd_t *pgd;
 484         unsigned long next;
 485
 486         pgd = pgd_offset(vma->vm_mm, addr);
 487         do {
 488                 next = pgd_addr_end(addr, end);
 489                 if (pgd_none_or_clear_bad(pgd))
 490                         continue;
 491                 if (check_pud_range(vma, pgd, addr, next, nodes,
 492                                     flags, private))
 493                         return -EIO;
 494         } while (pgd++, addr = next, addr != end);
 495         return 0;
 496 }
 497
 498 /*
 499  * Check if all pages in a range are on a set of nodes.
 500  * If pagelist != NULL then isolate pages from the LRU and
 501  * put them on the pagelist.
 502  */
 503 static struct vm_area_struct *
 504 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 505                 const nodemask_t *nodes, unsigned long flags, void *private)
 506 {
 507         int err;
 508         struct vm_area_struct *first, *vma, *prev;
 509
 510
 511         first = find_vma(mm, start);
 512         if (!first)
 513                 return ERR_PTR(-EFAULT);
 514         prev = NULL;
 515         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 516                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 517                         if (!vma->vm_next && vma->vm_end < end)
 518                                 return ERR_PTR(-EFAULT);
 519                         if (prev && prev->vm_end < vma->vm_start)
 520                                 return ERR_PTR(-EFAULT);
 521                 }
 522                 if (!is_vm_hugetlb_page(vma) &&
 523                     ((flags & MPOL_MF_STRICT) ||
 524                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 525                                 vma_migratable(vma)))) {
 526                         unsigned long endvma = vma->vm_end;
 527
 528                         if (endvma > end)
 529                                 endvma = end;
 530                         if (vma->vm_start > start)
 531                                 start = vma->vm_start;
 532                         err = check_pgd_range(vma, start, endvma, nodes,
 533                                                 flags, private);
 534                         if (err) {
 535                                 first = ERR_PTR(err);
 536                                 break;
 537                         }
 538                 }
 539                 prev = vma;
 540         }
 541         return first;
 542 }
 543
 544 /* Apply policy to a single VMA */
 545 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 546 {
 547         int err = 0;
 548         struct mempolicy *old = vma->vm_policy;
 549
 550         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 551                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 552                  vma->vm_ops, vma->vm_file,
 553                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 554
 555         if (vma->vm_ops && vma->vm_ops->set_policy)
 556                 err = vma->vm_ops->set_policy(vma, new);
 557         if (!err) {
 558                 mpol_get(new);
 559                 vma->vm_policy = new;
 560                 mpol_put(old);
 561         }
 562         return err;
 563 }
 564
 565 /* Step 2: apply policy to a range and do splits. */
 566 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 567                        unsigned long end, struct mempolicy *new)
 568 {
 569         struct vm_area_struct *next;
 570         int err;
 571
 572         err = 0;
 573         for (; vma && vma->vm_start < end; vma = next) {
 574                 next = vma->vm_next;
 575                 if (vma->vm_start < start)
 576                         err = split_vma(vma->vm_mm, vma, start, 1);
 577                 if (!err && vma->vm_end > end)
 578                         err = split_vma(vma->vm_mm, vma, end, 0);
 579                 if (!err)
 580                         err = policy_vma(vma, new);
 581                 if (err)
 582                         break;
 583         }
 584         return err;
 585 }
 586
 587 /*
 588  * Update task->flags PF_MEMPOLICY bit: set iff non-default
 589  * mempolicy.  Allows more rapid checking of this (combined perhaps
 590  * with other PF_* flag bits) on memory allocation hot code paths.
 591  *
 592  * If called from outside this file, the task 'p' should -only- be
 593  * a newly forked child not yet visible on the task list, because
 594  * manipulating the task flags of a visible task is not safe.
 595  *
 596  * The above limitation is why this routine has the funny name
 597  * mpol_fix_fork_child_flag().
 598  *
 599  * It is also safe to call this with a task pointer of current,
 600  * which the static wrapper mpol_set_task_struct_flag() does,
 601  * for use within this file.
 602  */
 603
 604 void mpol_fix_fork_child_flag(struct task_struct *p)
 605 {
 606         if (p->mempolicy)
 607                 p->flags |= PF_MEMPOLICY;
 608         else
 609                 p->flags &= ~PF_MEMPOLICY;
 610 }
 611
 612 static void mpol_set_task_struct_flag(void)
 613 {
 614         mpol_fix_fork_child_flag(current);
 615 }
 616
 617 /* Set the process memory policy */
 618 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 619                              nodemask_t *nodes)
 620 {
 621         struct mempolicy *new, *old;
 622         struct mm_struct *mm = current->mm;
 623         NODEMASK_SCRATCH(scratch);
 624         int ret;
 625
 626         if (!scratch)
 627                 return -ENOMEM;
 628
 629         new = mpol_new(mode, flags, nodes);
 630         if (IS_ERR(new)) {
 631                 ret = PTR_ERR(new);
 632                 goto out;
 633         }
 634         /*
 635          * prevent changing our mempolicy while show_numa_maps()
 636          * is using it.
 637          * Note:  do_set_mempolicy() can be called at init time
 638          * with no 'mm'.
 639          */
 640         if (mm)
 641                 down_write(&mm->mmap_sem);
 642         task_lock(current);
 643         ret = mpol_set_nodemask(new, nodes, scratch);
 644         if (ret) {
 645                 task_unlock(current);
 646                 if (mm)
 647                         up_write(&mm->mmap_sem);
 648                 mpol_put(new);
 649                 goto out;
 650         }
 651         old = current->mempolicy;
 652         current->mempolicy = new;
 653         mpol_set_task_struct_flag();
 654         if (new && new->mode == MPOL_INTERLEAVE &&
 655             nodes_weight(new->v.nodes))
 656                 current->il_next = first_node(new->v.nodes);
 657         task_unlock(current);
 658         if (mm)
 659                 up_write(&mm->mmap_sem);
 660
 661         mpol_put(old);
 662         ret = 0;
 663 out:
 664         NODEMASK_SCRATCH_FREE(scratch);
 665         return ret;
 666 }
 667
 668 /*
 669  * Return nodemask for policy for get_mempolicy() query
 670  *
 671  * Called with task's alloc_lock held
 672  */
 673 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 674 {
 675         nodes_clear(*nodes);
 676         if (p == &default_policy)
 677                 return;
 678
 679         switch (p->mode) {
 680         case MPOL_BIND:
 681                 /* Fall through */
 682         case MPOL_INTERLEAVE:
 683                 *nodes = p->v.nodes;
 684                 break;
 685         case MPOL_PREFERRED:
 686                 if (!(p->flags & MPOL_F_LOCAL))
 687                         node_set(p->v.preferred_node, *nodes);
 688                 /* else return empty node mask for local allocation */
 689                 break;
 690         default:
 691                 BUG();
 692         }
 693 }
 694
 695 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 696 {
 697         struct page *p;
 698         int err;
 699
 700         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 701         if (err >= 0) {
 702                 err = page_to_nid(p);
 703                 put_page(p);
 704         }
 705         return err;
 706 }
 707
 708 /* Retrieve NUMA policy */
 709 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 710                              unsigned long addr, unsigned long flags)
 711 {
 712         int err;
 713         struct mm_struct *mm = current->mm;
 714         struct vm_area_struct *vma = NULL;
 715         struct mempolicy *pol = current->mempolicy;
 716
 717         if (flags &
 718                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 719                 return -EINVAL;
 720
 721         if (flags & MPOL_F_MEMS_ALLOWED) {
 722                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 723                         return -EINVAL;
 724                 *policy = 0;    /* just so it's initialized */
 725                 task_lock(current);
 726                 *nmask  = cpuset_current_mems_allowed;
 727                 task_unlock(current);
 728                 return 0;
 729         }
 730
 731         if (flags & MPOL_F_ADDR) {
 732                 /*
 733                  * Do NOT fall back to task policy if the
 734                  * vma/shared policy at addr is NULL.  We
 735                  * want to return MPOL_DEFAULT in this case.
 736                  */
 737                 down_read(&mm->mmap_sem);
 738                 vma = find_vma_intersection(mm, addr, addr+1);
 739                 if (!vma) {
 740                         up_read(&mm->mmap_sem);
 741                         return -EFAULT;
 742                 }
 743                 if (vma->vm_ops && vma->vm_ops->get_policy)
 744                         pol = vma->vm_ops->get_policy(vma, addr);
 745                 else
 746                         pol = vma->vm_policy;
 747         } else if (addr)
 748                 return -EINVAL;
 749
 750         if (!pol)
 751                 pol = &default_policy;  /* indicates default behavior */
 752
 753         if (flags & MPOL_F_NODE) {
 754                 if (flags & MPOL_F_ADDR) {
 755                         err = lookup_node(mm, addr);
 756                         if (err < 0)
 757                                 goto out;
 758                         *policy = err;
 759                 } else if (pol == current->mempolicy &&
 760                                 pol->mode == MPOL_INTERLEAVE) {
 761                         *policy = current->il_next;
 762                 } else {
 763                         err = -EINVAL;
 764                         goto out;
 765                 }
 766         } else {
 767                 *policy = pol == &default_policy ? MPOL_DEFAULT :
 768                                                 pol->mode;
 769                 /*
 770                  * Internal mempolicy flags must be masked off before exposing
 771                  * the policy to userspace.
 772                  */
 773                 *policy |= (pol->flags & MPOL_MODE_FLAGS);
 774         }
 775
 776         if (vma) {
 777                 up_read(&current->mm->mmap_sem);
 778                 vma = NULL;
 779         }
 780
 781         err = 0;
 782         if (nmask) {
 783                 task_lock(current);
 784                 get_policy_nodemask(pol, nmask);
 785                 task_unlock(current);
 786         }
 787
 788  out:
 789         mpol_cond_put(pol);
 790         if (vma)
 791                 up_read(&current->mm->mmap_sem);
 792         return err;
 793 }
 794
 795 #ifdef CONFIG_MIGRATION
 796 /*
 797  * page migration
 798  */
 799 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 800                                 unsigned long flags)
 801 {
 802         /*
 803          * Avoid migrating a page that is shared with others.
 804          */
 805         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 806                 if (!isolate_lru_page(page)) {
 807                         list_add_tail(&page->lru, pagelist);
 808                         inc_zone_page_state(page, NR_ISOLATED_ANON +
 809                                             page_is_file_cache(page));
 810                 }
 811         }
 812 }
 813
 814 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 815 {
 816         return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
 817 }
 818
 819 /*
 820  * Migrate pages from one node to a target node.
 821  * Returns error or the number of pages not migrated.
 822  */
 823 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 824                            int flags)
 825 {
 826         nodemask_t nmask;
 827         LIST_HEAD(pagelist);
 828         int err = 0;
 829
 830         nodes_clear(nmask);
 831         node_set(source, nmask);
 832
 833         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 834                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 835
 836         if (!list_empty(&pagelist))
 837                 err = migrate_pages(&pagelist, new_node_page, dest, 0);
 838
 839         return err;
 840 }
 841
 842 /*
 843  * Move pages between the two nodesets so as to preserve the physical
 844  * layout as much as possible.
 845  *
 846  * Returns the number of page that could not be moved.
 847  */
 848 int do_migrate_pages(struct mm_struct *mm,
 849         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 850 {
 851         int busy = 0;
 852         int err;
 853         nodemask_t tmp;
 854
 855         err = migrate_prep();
 856         if (err)
 857                 return err;
 858
 859         down_read(&mm->mmap_sem);
 860
 861         err = migrate_vmas(mm, from_nodes, to_nodes, flags);
 862         if (err)
 863                 goto out;
 864
 865 /*
 866  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 867  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 868  * bit in 'tmp', and return that <source, dest> pair for migration.
 869  * The pair of nodemasks 'to' and 'from' define the map.
 870  *
 871  * If no pair of bits is found that way, fallback to picking some
 872  * pair of 'source' and 'dest' bits that are not the same.  If the
 873  * 'source' and 'dest' bits are the same, this represents a node
 874  * that will be migrating to itself, so no pages need move.
 875  *
 876  * If no bits are left in 'tmp', or if all remaining bits left
 877  * in 'tmp' correspond to the same bit in 'to', return false
 878  * (nothing left to migrate).
 879  *
 880  * This lets us pick a pair of nodes to migrate between, such that
 881  * if possible the dest node is not already occupied by some other
 882  * source node, minimizing the risk of overloading the memory on a
 883  * node that would happen if we migrated incoming memory to a node
 884  * before migrating outgoing memory source that same node.
 885  *
 886  * A single scan of tmp is sufficient.  As we go, we remember the
 887  * most recent <s, d> pair that moved (s != d).  If we find a pair
 888  * that not only moved, but what's better, moved to an empty slot
 889  * (d is not set in tmp), then we break out then, with that pair.
 890  * Otherwise when we finish scannng from_tmp, we at least have the
 891  * most recent <s, d> pair that moved.  If we get all the way through
 892  * the scan of tmp without finding any node that moved, much less
 893  * moved to an empty node, then there is nothing left worth migrating.
 894  */
 895
 896         tmp = *from_nodes;
 897         while (!nodes_empty(tmp)) {
 898                 int s,d;
 899                 int source = -1;
 900                 int dest = 0;
 901
 902                 for_each_node_mask(s, tmp) {
 903                         d = node_remap(s, *from_nodes, *to_nodes);
 904                         if (s == d)
 905                                 continue;
 906
 907                         source = s;     /* Node moved. Memorize */
 908                         dest = d;
 909
 910                         /* dest not in remaining from nodes? */
 911                         if (!node_isset(dest, tmp))
 912                                 break;
 913                 }
 914                 if (source == -1)
 915                         break;
 916
 917                 node_clear(source, tmp);
 918                 err = migrate_to_node(mm, source, dest, flags);
 919                 if (err > 0)
 920                         busy += err;
 921                 if (err < 0)
 922                         break;
 923         }
 924 out:
 925         up_read(&mm->mmap_sem);
 926         if (err < 0)
 927                 return err;
 928         return busy;
 929
 930 }
 931
 932 /*
 933  * Allocate a new page for page migration based on vma policy.
 934  * Start assuming that page is mapped by vma pointed to by @private.
 935  * Search forward from there, if not.  N.B., this assumes that the
 936  * list of pages handed to migrate_pages()--which is how we get here--
 937  * is in virtual address order.
 938  */
 939 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 940 {
 941         struct vm_area_struct *vma = (struct vm_area_struct *)private;
 942         unsigned long uninitialized_var(address);
 943
 944         while (vma) {
 945                 address = page_address_in_vma(page, vma);
 946                 if (address != -EFAULT)
 947                         break;
 948                 vma = vma->vm_next;
 949         }
 950
 951         /*
 952          * if !vma, alloc_page_vma() will use task or system default policy
 953          */
 954         return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 955 }
 956 #else
 957
 /* !CONFIG_MIGRATION stub: pages are never queued for migration. */
 static void migrate_page_add(struct page *page, struct list_head *pagelist,
                                 unsigned long flags)
 {
         /* Intentionally empty. */
 }
 962
 963 int do_migrate_pages(struct mm_struct *mm,
 964         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 965 {
 966         return -ENOSYS;
 967 }
 968
 969 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 970 {
 971         return NULL;
 972 }
 973 #endif
 974
 975 static long do_mbind(unsigned long start, unsigned long len,
 976                      unsigned short mode, unsigned short mode_flags,
 977                      nodemask_t *nmask, unsigned long flags)
 978 {
 979         struct vm_area_struct *vma;
 980         struct mm_struct *mm = current->mm;
 981         struct mempolicy *new;
 982         unsigned long end;
 983         int err;
 984         LIST_HEAD(pagelist);
 985
 986         if (flags & ~(unsigned long)(MPOL_MF_STRICT |
 987                                      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 988                 return -EINVAL;
 989         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 990                 return -EPERM;
 991
 992         if (start & ~PAGE_MASK)
 993                 return -EINVAL;
 994
 995         if (mode == MPOL_DEFAULT)
 996                 flags &= ~MPOL_MF_STRICT;
 997
 998         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 999         end = start + len;
1000
1001         if (end < start)
1002                 return -EINVAL;
1003         if (end == start)
1004                 return 0;
1005
1006         new = mpol_new(mode, mode_flags, nmask);
1007         if (IS_ERR(new))
1008                 return PTR_ERR(new);
1009
1010         /*
1011          * If we are using the default policy then operation
1012          * on discontinuous address spaces is okay after all
1013          */
1014         if (!new)
1015                 flags |= MPOL_MF_DISCONTIG_OK;
1016
1017         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1018                  start, start + len, mode, mode_flags,
1019                  nmask ? nodes_addr(*nmask)[0] : -1);
1020
1021         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1022
1023                 err = migrate_prep();
1024                 if (err)
1025                         goto mpol_out;
1026         }
1027         {
1028                 NODEMASK_SCRATCH(scratch);
1029                 if (scratch) {
1030                         down_write(&mm->mmap_sem);
1031                         task_lock(current);
1032                         err = mpol_set_nodemask(new, nmask, scratch);
1033                         task_unlock(current);
1034                         if (err)
1035                                 up_write(&mm->mmap_sem);
1036                 } else
1037                         err = -ENOMEM;
1038                 NODEMASK_SCRATCH_FREE(scratch);
1039         }
1040         if (err)
1041                 goto mpol_out;
1042
1043         vma = check_range(mm, start, end, nmask,
1044                           flags | MPOL_MF_INVERT, &pagelist);
1045
1046         err = PTR_ERR(vma);
1047         if (!IS_ERR(vma)) {
1048                 int nr_failed = 0;
1049
1050                 err = mbind_range(vma, start, end, new);
1051
1052                 if (!list_empty(&pagelist))
1053                         nr_failed = migrate_pages(&pagelist, new_vma_page,
1054                                                 (unsigned long)vma, 0);
1055
1056                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1057                         err = -EIO;
1058         } else
1059                 putback_lru_pages(&pagelist);
1060
1061         up_write(&mm->mmap_sem);
1062  mpol_out:
1063         mpol_put(new);
1064         return err;
1065 }
1066
1067 /*
1068  * User space interface with variable sized bitmaps for nodelists.
1069  */
1070
1071 /* Copy a node mask from user space. */
1072 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1073                      unsigned long maxnode)
1074 {
1075         unsigned long k;
1076         unsigned long nlongs;
1077         unsigned long endmask;
1078
1079         --maxnode;
1080         nodes_clear(*nodes);
1081         if (maxnode == 0 || !nmask)
1082                 return 0;
1083         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1084                 return -EINVAL;
1085
1086         nlongs = BITS_TO_LONGS(maxnode);
1087         if ((maxnode % BITS_PER_LONG) == 0)
1088                 endmask = ~0UL;
1089         else
1090                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1091
1092         /* When the user specified more nodes than supported just check
1093            if the non supported part is all zero. */
1094         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1095                 if (nlongs > PAGE_SIZE/sizeof(long))
1096                         return -EINVAL;
1097                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1098                         unsigned long t;
1099                         if (get_user(t, nmask + k))
1100                                 return -EFAULT;
1101                         if (k == nlongs - 1) {
1102                                 if (t & endmask)
1103                                         return -EINVAL;
1104                         } else if (t)
1105                                 return -EINVAL;
1106                 }
1107                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1108                 endmask = ~0UL;
1109         }
1110
1111         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1112                 return -EFAULT;
1113         nodes_addr(*nodes)[nlongs-1] &= endmask;
1114         return 0;
1115 }
1116
1117 /* Copy a kernel node mask to user space */
1118 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1119                               nodemask_t *nodes)
1120 {
1121         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1122         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1123
1124         if (copy > nbytes) {
1125                 if (copy > PAGE_SIZE)
1126                         return -EINVAL;
1127                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1128                         return -EFAULT;
1129                 copy = nbytes;
1130         }
1131         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1132 }
1133
1134 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1135                 unsigned long, mode, unsigned long __user *, nmask,
1136                 unsigned long, maxnode, unsigned, flags)
1137 {
1138         nodemask_t nodes;
1139         int err;
1140         unsigned short mode_flags;
1141
1142         mode_flags = mode & MPOL_MODE_FLAGS;
1143         mode &= ~MPOL_MODE_FLAGS;
1144         if (mode >= MPOL_MAX)
1145                 return -EINVAL;
1146         if ((mode_flags & MPOL_F_STATIC_NODES) &&
1147             (mode_flags & MPOL_F_RELATIVE_NODES))
1148                 return -EINVAL;
1149         err = get_nodes(&nodes, nmask, maxnode);
1150         if (err)
1151                 return err;
1152         return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1153 }
1154
1155 /* Set the process memory policy */
1156 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1157                 unsigned long, maxnode)
1158 {
1159         int err;
1160         nodemask_t nodes;
1161         unsigned short flags;
1162
1163         flags = mode & MPOL_MODE_FLAGS;
1164         mode &= ~MPOL_MODE_FLAGS;
1165         if ((unsigned int)mode >= MPOL_MAX)
1166                 return -EINVAL;
1167         if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1168                 return -EINVAL;
1169         err = get_nodes(&nodes, nmask, maxnode);
1170         if (err)
1171                 return err;
1172         return do_set_mempolicy(mode, flags, &nodes);
1173 }
1174
1175 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1176                 const unsigned long __user *, old_nodes,
1177                 const unsigned long __user *, new_nodes)
1178 {
1179         const struct cred *cred = current_cred(), *tcred;
1180         struct mm_struct *mm;
1181         struct task_struct *task;
1182         nodemask_t old;
1183         nodemask_t new;
1184         nodemask_t task_nodes;
1185         int err;
1186
1187         err = get_nodes(&old, old_nodes, maxnode);
1188         if (err)
1189                 return err;
1190
1191         err = get_nodes(&new, new_nodes, maxnode);
1192         if (err)
1193                 return err;
1194
1195         /* Find the mm_struct */
1196         read_lock(&tasklist_lock);
1197         task = pid ? find_task_by_vpid(pid) : current;
1198         if (!task) {
1199                 read_unlock(&tasklist_lock);
1200                 return -ESRCH;
1201         }
1202         mm = get_task_mm(task);
1203         read_unlock(&tasklist_lock);
1204
1205         if (!mm)
1206                 return -EINVAL;
1207
1208         /*
1209          * Check if this process has the right to modify the specified
1210          * process. The right exists if the process has administrative
1211          * capabilities, superuser privileges or the same
1212          * userid as the target process.
1213          */
1214         rcu_read_lock();
1215         tcred = __task_cred(task);
1216         if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1217             cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
1218             !capable(CAP_SYS_NICE)) {
1219                 rcu_read_unlock();
1220                 err = -EPERM;
1221                 goto out;
1222         }
1223         rcu_read_unlock();
1224
1225         task_nodes = cpuset_mems_allowed(task);
1226         /* Is the user allowed to access the target nodes? */
1227         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
1228                 err = -EPERM;
1229                 goto out;
1230         }
1231
1232         if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
1233                 err = -EINVAL;
1234                 goto out;
1235         }
1236
1237         err = security_task_movememory(task);
1238         if (err)
1239                 goto out;
1240
1241         err = do_migrate_pages(mm, &old, &new,
1242                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1243 out:
1244         mmput(mm);
1245         return err;
1246 }
1247
1248
1249 /* Retrieve NUMA policy */
1250 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1251                 unsigned long __user *, nmask, unsigned long, maxnode,
1252                 unsigned long, addr, unsigned long, flags)
1253 {
1254         int err;
1255         int uninitialized_var(pval);
1256         nodemask_t nodes;
1257
1258         if (nmask != NULL && maxnode < MAX_NUMNODES)
1259                 return -EINVAL;
1260
1261         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1262
1263         if (err)
1264                 return err;
1265
1266         if (policy && put_user(pval, policy))
1267                 return -EFAULT;
1268
1269         if (nmask)
1270                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1271
1272         return err;
1273 }
1274
1275 #ifdef CONFIG_COMPAT
1276
1277 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1278                                      compat_ulong_t __user *nmask,
1279                                      compat_ulong_t maxnode,
1280                                      compat_ulong_t addr, compat_ulong_t flags)
1281 {
1282         long err;
1283         unsigned long __user *nm = NULL;
1284         unsigned long nr_bits, alloc_size;
1285         DECLARE_BITMAP(bm, MAX_NUMNODES);
1286
1287         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1288         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1289
1290         if (nmask)
1291                 nm = compat_alloc_user_space(alloc_size);
1292
1293         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1294
1295         if (!err && nmask) {
1296                 err = copy_from_user(bm, nm, alloc_size);
1297                 /* ensure entire bitmap is zeroed */
1298                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1299                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1300         }
1301
1302         return err;
1303 }
1304
1305 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1306                                      compat_ulong_t maxnode)
1307 {
1308         long err = 0;
1309         unsigned long __user *nm = NULL;
1310         unsigned long nr_bits, alloc_size;
1311         DECLARE_BITMAP(bm, MAX_NUMNODES);
1312
1313         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1314         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1315
1316         if (nmask) {
1317                 err = compat_get_bitmap(bm, nmask, nr_bits);
1318                 nm = compat_alloc_user_space(alloc_size);
1319                 err |= copy_to_user(nm, bm, alloc_size);
1320         }
1321
1322         if (err)
1323                 return -EFAULT;
1324
1325         return sys_set_mempolicy(mode, nm, nr_bits+1);
1326 }
1327
1328 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1329                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1330                              compat_ulong_t maxnode, compat_ulong_t flags)
1331 {
1332         long err = 0;
1333         unsigned long __user *nm = NULL;
1334         unsigned long nr_bits, alloc_size;
1335         nodemask_t bm;
1336
1337         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1338         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1339
1340         if (nmask) {
1341                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1342                 nm = compat_alloc_user_space(alloc_size);
1343                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1344         }
1345
1346         if (err)
1347                 return -EFAULT;
1348
1349         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1350 }
1351
1352 #endif
1353
1354 /*
1355  * get_vma_policy(@task, @vma, @addr)
1356  * @task - task for fallback if vma policy == default
1357  * @vma   - virtual memory area whose policy is sought
1358  * @addr  - address in @vma for shared policy lookup
1359  *
1360  * Returns effective policy for a VMA at specified address.
1361  * Falls back to @task or system default policy, as necessary.
1362  * Current or other task's task mempolicy and non-shared vma policies
1363  * are protected by the task's mmap_sem, which must be held for read by
1364  * the caller.
1365  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1366  * count--added by the get_policy() vm_op, as appropriate--to protect against
1367  * freeing by another task.  It is the caller's responsibility to free the
1368  * extra reference for shared policies.
1369  */
1370 static struct mempolicy *get_vma_policy(struct task_struct *task,
1371                 struct vm_area_struct *vma, unsigned long addr)
1372 {
1373         struct mempolicy *pol = task->mempolicy;
1374
1375         if (vma) {
1376                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1377                         struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1378                                                                         addr);
1379                         if (vpol)
1380                                 pol = vpol;
1381                 } else if (vma->vm_policy)
1382                         pol = vma->vm_policy;
1383         }
1384         if (!pol)
1385                 pol = &default_policy;
1386         return pol;
1387 }
1388
1389 /*
1390  * Return a nodemask representing a mempolicy for filtering nodes for
1391  * page allocation
1392  */
1393 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1394 {
1395         /* Lower zones don't get a nodemask applied for MPOL_BIND */
1396         if (unlikely(policy->mode == MPOL_BIND) &&
1397                         gfp_zone(gfp) >= policy_zone &&
1398                         cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1399                 return &policy->v.nodes;
1400
1401         return NULL;
1402 }
1403
1404 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1405 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
1406 {
1407         int nd = numa_node_id();
1408
1409         switch (policy->mode) {
1410         case MPOL_PREFERRED:
1411                 if (!(policy->flags & MPOL_F_LOCAL))
1412                         nd = policy->v.preferred_node;
1413                 break;
1414         case MPOL_BIND:
1415                 /*
1416                  * Normally, MPOL_BIND allocations are node-local within the
1417                  * allowed nodemask.  However, if __GFP_THISNODE is set and the
1418                  * current node is part of the mask, we use the zonelist for
1419                  * the first node in the mask instead.
1420                  */
1421                 if (unlikely(gfp & __GFP_THISNODE) &&
1422                                 unlikely(!node_isset(nd, policy->v.nodes)))
1423                         nd = first_node(policy->v.nodes);
1424                 break;
1425         case MPOL_INTERLEAVE: /* should not happen */
1426                 break;
1427         default:
1428                 BUG();
1429         }
1430         return node_zonelist(nd, gfp);
1431 }
1432
1433 /* Do dynamic interleaving for a process */
1434 static unsigned interleave_nodes(struct mempolicy *policy)
1435 {
1436         unsigned nid, next;
1437         struct task_struct *me = current;
1438
1439         nid = me->il_next;
1440         next = next_node(nid, policy->v.nodes);
1441         if (next >= MAX_NUMNODES)
1442                 next = first_node(policy->v.nodes);
1443         if (next < MAX_NUMNODES)
1444                 me->il_next = next;
1445         return nid;
1446 }
1447
1448 /*
1449  * Depending on the memory policy provide a node from which to allocate the
1450  * next slab entry.
1451  * @policy must be protected by freeing by the caller.  If @policy is
1452  * the current task's mempolicy, this protection is implicit, as only the
1453  * task can change it's policy.  The system default policy requires no
1454  * such protection.
1455  */
1456 unsigned slab_node(struct mempolicy *policy)
1457 {
1458         if (!policy || policy->flags & MPOL_F_LOCAL)
1459                 return numa_node_id();
1460
1461         switch (policy->mode) {
1462         case MPOL_PREFERRED:
1463                 /*
1464                  * handled MPOL_F_LOCAL above
1465                  */
1466                 return policy->v.preferred_node;
1467
1468         case MPOL_INTERLEAVE:
1469                 return interleave_nodes(policy);
1470
1471         case MPOL_BIND: {
1472                 /*
1473                  * Follow bind policy behavior and start allocation at the
1474                  * first node.
1475                  */
1476                 struct zonelist *zonelist;
1477                 struct zone *zone;
1478                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1479                 zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1480                 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1481                                                         &policy->v.nodes,
1482                                                         &zone);
1483                 return zone ? zone->node : numa_node_id();
1484         }
1485
1486         default:
1487                 BUG();
1488         }
1489 }
1490
1491 /* Do static interleaving for a VMA with known offset. */
1492 static unsigned offset_il_node(struct mempolicy *pol,
1493                 struct vm_area_struct *vma, unsigned long off)
1494 {
1495         unsigned nnodes = nodes_weight(pol->v.nodes);
1496         unsigned target;
1497         int c;
1498         int nid = -1;
1499
1500         if (!nnodes)
1501                 return numa_node_id();
1502         target = (unsigned int)off % nnodes;
1503         c = 0;
1504         do {
1505                 nid = next_node(nid, pol->v.nodes);
1506                 c++;
1507         } while (c <= target);
1508         return nid;
1509 }
1510
1511 /* Determine a node number for interleave */
1512 static inline unsigned interleave_nid(struct mempolicy *pol,
1513                  struct vm_area_struct *vma, unsigned long addr, int shift)
1514 {
1515         if (vma) {
1516                 unsigned long off;
1517
1518                 /*
1519                  * for small pages, there is no difference between
1520                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1521                  * for huge pages, since vm_pgoff is in units of small
1522                  * pages, we need to shift off the always 0 bits to get
1523                  * a useful offset.
1524                  */
1525                 BUG_ON(shift < PAGE_SHIFT);
1526                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1527                 off += (addr - vma->vm_start) >> shift;
1528                 return offset_il_node(pol, vma, off);
1529         } else
1530                 return interleave_nodes(pol);
1531 }
1532
1533 #ifdef CONFIG_HUGETLBFS
1534 /*
1535  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1536  * @vma = virtual memory area whose policy is sought
1537  * @addr = address in @vma for shared policy lookup and interleave policy
1538  * @gfp_flags = for requested zone
1539  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1540  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1541  *
1542  * Returns a zonelist suitable for a huge page allocation and a pointer
1543  * to the struct mempolicy for conditional unref after allocation.
1544  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1545  * @nodemask for filtering the zonelist.
1546  */
1547 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1548                                 gfp_t gfp_flags, struct mempolicy **mpol,
1549                                 nodemask_t **nodemask)
1550 {
1551         struct zonelist *zl;
1552
1553         *mpol = get_vma_policy(current, vma, addr);
1554         *nodemask = NULL;       /* assume !MPOL_BIND */
1555
1556         if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1557                 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1558                                 huge_page_shift(hstate_vma(vma))), gfp_flags);
1559         } else {
1560                 zl = policy_zonelist(gfp_flags, *mpol);
1561                 if ((*mpol)->mode == MPOL_BIND)
1562                         *nodemask = &(*mpol)->v.nodes;
1563         }
1564         return zl;
1565 }
1566
1567 /*
1568  * init_nodemask_of_mempolicy
1569  *
1570  * If the current task's mempolicy is "default" [NULL], return 'false'
1571  * to indicate default policy.  Otherwise, extract the policy nodemask
1572  * for 'bind' or 'interleave' policy into the argument nodemask, or
1573  * initialize the argument nodemask to contain the single node for
1574  * 'preferred' or 'local' policy and return 'true' to indicate presence
1575  * of non-default mempolicy.
1576  *
1577  * We don't bother with reference counting the mempolicy [mpol_get/put]
1578  * because the current task is examining it's own mempolicy and a task's
1579  * mempolicy is only ever changed by the task itself.
1580  *
1581  * N.B., it is the caller's responsibility to free a returned nodemask.
1582  */
1583 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1584 {
1585         struct mempolicy *mempolicy;
1586         int nid;
1587
1588         if (!(mask && current->mempolicy))
1589                 return false;
1590
1591         mempolicy = current->mempolicy;
1592         switch (mempolicy->mode) {
1593         case MPOL_PREFERRED:
1594                 if (mempolicy->flags & MPOL_F_LOCAL)
1595                         nid = numa_node_id();
1596                 else
1597                         nid = mempolicy->v.preferred_node;
1598                 init_nodemask_of_node(mask, nid);
1599                 break;
1600
1601         case MPOL_BIND:
1602                 /* Fall through */
1603         case MPOL_INTERLEAVE:
1604                 *mask =  mempolicy->v.nodes;
1605                 break;
1606
1607         default:
1608                 BUG();
1609         }
1610
1611         return true;
1612 }
1613 #endif
1614
1615 /* Allocate a page in interleaved policy.
1616    Own path because it needs to do special accounting. */
1617 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1618                                         unsigned nid)
1619 {
1620         struct zonelist *zl;
1621         struct page *page;
1622
1623         zl = node_zonelist(nid, gfp);
1624         page = __alloc_pages(gfp, order, zl);
1625         if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1626                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1627         return page;
1628 }
1629
1630 /**
1631  *      alloc_page_vma  - Allocate a page for a VMA.
1632  *
1633  *      @gfp:
1634  *      %GFP_USER    user allocation.
1635  *      %GFP_KERNEL  kernel allocations,
1636  *      %GFP_HIGHMEM highmem/user allocations,
1637  *      %GFP_FS      allocation should not call back into a file system.
1638  *      %GFP_ATOMIC  don't sleep.
1639  *
1640  *      @vma:  Pointer to VMA or NULL if not available.
1641  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1642  *
1643  *      This function allocates a page from the kernel page pool and applies
1644  *      a NUMA policy associated with the VMA or the current process.
1645  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1646  *      mm_struct of the VMA to prevent it from going away. Should be used for
1647  *      all allocations for pages that will be mapped into
1648  *      user space. Returns NULL when no page can be allocated.
1649  *
1650  *      Should be called with the mm_sem of the vma hold.
1651  */
1652 struct page *
1653 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1654 {
1655         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1656         struct zonelist *zl;
1657
1658         if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1659                 unsigned nid;
1660
1661                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1662                 mpol_cond_put(pol);
1663                 return alloc_page_interleave(gfp, 0, nid);
1664         }
1665         zl = policy_zonelist(gfp, pol);
1666         if (unlikely(mpol_needs_cond_ref(pol))) {
1667                 /*
1668                  * slow path: ref counted shared policy
1669                  */
1670                 struct page *page =  __alloc_pages_nodemask(gfp, 0,
1671                                                 zl, policy_nodemask(gfp, pol));
1672                 __mpol_put(pol);
1673                 return page;
1674         }
1675         /*
1676          * fast path:  default or task policy
1677          */
1678         return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
1679 }
1680
1681 /**
1682  *      alloc_pages_current - Allocate pages.
1683  *
1684  *      @gfp:
1685  *              %GFP_USER   user allocation,
1686  *              %GFP_KERNEL kernel allocation,
1687  *              %GFP_HIGHMEM highmem allocation,
1688  *              %GFP_FS     don't call back into a file system.
1689  *              %GFP_ATOMIC don't sleep.
1690  *      @order: Power of two of allocation size in pages. 0 is a single page.
1691  *
1692  *      Allocate a page from the kernel page pool.  When not in
1693  *      interrupt context and apply the current process NUMA policy.
1694  *      Returns NULL when no page can be allocated.
1695  *
1696  *      Don't call cpuset_update_task_memory_state() unless
1697  *      1) it's ok to take cpuset_sem (can WAIT), and
1698  *      2) allocating for current task (not interrupt).
1699  */
1700 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1701 {
1702         struct mempolicy *pol = current->mempolicy;
1703
1704         if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1705                 pol = &default_policy;
1706
1707         /*
1708          * No reference counting needed for current->mempolicy
1709          * nor system default_policy
1710          */
1711         if (pol->mode == MPOL_INTERLEAVE)
1712                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1713         return __alloc_pages_nodemask(gfp, order,
1714                         policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
1715 }
1716 EXPORT_SYMBOL(alloc_pages_current);
1717
1718 /*
1719  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
1720  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
1721  * with the mems_allowed returned by cpuset_mems_allowed().  This
1722  * keeps mempolicies cpuset relative after its cpuset moves.  See
1723  * further kernel/cpuset.c update_nodemask().
1724  */
1725
1726 /* Slow path of a mempolicy duplicate */
1727 struct mempolicy *__mpol_dup(struct mempolicy *old)
1728 {
1729         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1730
1731         if (!new)
1732                 return ERR_PTR(-ENOMEM);
1733         if (current_cpuset_is_being_rebound()) {
1734                 nodemask_t mems = cpuset_mems_allowed(current);
1735                 mpol_rebind_policy(old, &mems);
1736         }
1737         *new = *old;
1738         atomic_set(&new->refcnt, 1);
1739         return new;
1740 }
1741
1742 /*
1743  * If *frompol needs [has] an extra ref, copy *frompol to *tompol ,
1744  * eliminate the * MPOL_F_* flags that require conditional ref and
1745  * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
1746  * after return.  Use the returned value.
1747  *
1748  * Allows use of a mempolicy for, e.g., multiple allocations with a single
1749  * policy lookup, even if the policy needs/has extra ref on lookup.
1750  * shmem_readahead needs this.
1751  */
1752 struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1753                                                 struct mempolicy *frompol)
1754 {
1755         if (!mpol_needs_cond_ref(frompol))
1756                 return frompol;
1757
1758         *tompol = *frompol;
1759         tompol->flags &= ~MPOL_F_SHARED;        /* copy doesn't need unref */
1760         __mpol_put(frompol);
1761         return tompol;
1762 }
1763
1764 static int mpol_match_intent(const struct mempolicy *a,
1765                              const struct mempolicy *b)
1766 {
1767         if (a->flags != b->flags)
1768                 return 0;
1769         if (!mpol_store_user_nodemask(a))
1770                 return 1;
1771         return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1772 }
1773
1774 /* Slow path of a mempolicy comparison */
1775 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1776 {
1777         if (!a || !b)
1778                 return 0;
1779         if (a->mode != b->mode)
1780                 return 0;
1781         if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
1782                 return 0;
1783         switch (a->mode) {
1784         case MPOL_BIND:
1785                 /* Fall through */
1786         case MPOL_INTERLEAVE:
1787                 return nodes_equal(a->v.nodes, b->v.nodes);
1788         case MPOL_PREFERRED:
1789                 return a->v.preferred_node == b->v.preferred_node &&
1790                         a->flags == b->flags;
1791         default:
1792                 BUG();
1793                 return 0;
1794         }
1795 }
1796
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has the shared memory mapped.
 * The policies are kept in a red-black tree linked from the inode.
 * The tree is protected by the sp->lock spinlock, which should be held
 * for any access to the tree.
 */
1805
1806 /* lookup first element intersecting start-end */
1807 /* Caller holds sp->lock */
1808 static struct sp_node *
1809 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1810 {
1811         struct rb_node *n = sp->root.rb_node;
1812
1813         while (n) {
1814                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1815
1816                 if (start >= p->end)
1817                         n = n->rb_right;
1818                 else if (end <= p->start)
1819                         n = n->rb_left;
1820                 else
1821                         break;
1822         }
1823         if (!n)
1824                 return NULL;
1825         for (;;) {
1826                 struct sp_node *w = NULL;
1827                 struct rb_node *prev = rb_prev(n);
1828                 if (!prev)
1829                         break;
1830                 w = rb_entry(prev, struct sp_node, nd);
1831                 if (w->end <= start)
1832                         break;
1833                 n = prev;
1834         }
1835         return rb_entry(n, struct sp_node, nd);
1836 }
1837
1838 /* Insert a new shared policy into the list. */
1839 /* Caller holds sp->lock */
1840 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1841 {
1842         struct rb_node **p = &sp->root.rb_node;
1843         struct rb_node *parent = NULL;
1844         struct sp_node *nd;
1845
1846         while (*p) {
1847                 parent = *p;
1848                 nd = rb_entry(parent, struct sp_node, nd);
1849                 if (new->start < nd->start)
1850                         p = &(*p)->rb_left;
1851                 else if (new->end > nd->end)
1852                         p = &(*p)->rb_right;
1853                 else
1854                         BUG();
1855         }
1856         rb_link_node(&new->nd, parent, p);
1857         rb_insert_color(&new->nd, &sp->root);
1858         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1859                  new->policy ? new->policy->mode : 0);
1860 }
1861
1862 /* Find shared policy intersecting idx */
1863 struct mempolicy *
1864 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1865 {
1866         struct mempolicy *pol = NULL;
1867         struct sp_node *sn;
1868
1869         if (!sp->root.rb_node)
1870                 return NULL;
1871         spin_lock(&sp->lock);
1872         sn = sp_lookup(sp, idx, idx+1);
1873         if (sn) {
1874                 mpol_get(sn->policy);
1875                 pol = sn->policy;
1876         }
1877         spin_unlock(&sp->lock);
1878         return pol;
1879 }
1880
1881 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1882 {
1883         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
1884         rb_erase(&n->nd, &sp->root);
1885         mpol_put(n->policy);
1886         kmem_cache_free(sn_cache, n);
1887 }
1888
1889 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1890                                 struct mempolicy *pol)
1891 {
1892         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1893
1894         if (!n)
1895                 return NULL;
1896         n->start = start;
1897         n->end = end;
1898         mpol_get(pol);
1899         pol->flags |= MPOL_F_SHARED;    /* for unref */
1900         n->policy = pol;
1901         return n;
1902 }
1903
1904 /* Replace a policy range. */
1905 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1906                                  unsigned long end, struct sp_node *new)
1907 {
1908         struct sp_node *n, *new2 = NULL;
1909
1910 restart:
1911         spin_lock(&sp->lock);
1912         n = sp_lookup(sp, start, end);
1913         /* Take care of old policies in the same range. */
1914         while (n && n->start < end) {
1915                 struct rb_node *next = rb_next(&n->nd);
1916                 if (n->start >= start) {
1917                         if (n->end <= end)
1918                                 sp_delete(sp, n);
1919                         else
1920                                 n->start = end;
1921                 } else {
1922                         /* Old policy spanning whole new range. */
1923                         if (n->end > end) {
1924                                 if (!new2) {
1925                                         spin_unlock(&sp->lock);
1926                                         new2 = sp_alloc(end, n->end, n->policy);
1927                                         if (!new2)
1928                                                 return -ENOMEM;
1929                                         goto restart;
1930                                 }
1931                                 n->end = start;
1932                                 sp_insert(sp, new2);
1933                                 new2 = NULL;
1934                                 break;
1935                         } else
1936                                 n->end = start;
1937                 }
1938                 if (!next)
1939                         break;
1940                 n = rb_entry(next, struct sp_node, nd);
1941         }
1942         if (new)
1943                 sp_insert(sp, new);
1944         spin_unlock(&sp->lock);
1945         if (new2) {
1946                 mpol_put(new2->policy);
1947                 kmem_cache_free(sn_cache, new2);
1948         }
1949         return 0;
1950 }
1951
1952 /**
1953  * mpol_shared_policy_init - initialize shared policy for inode
1954  * @sp: pointer to inode shared policy
1955  * @mpol:  struct mempolicy to install
1956  *
1957  * Install non-NULL @mpol in inode's shared policy rb-tree.
1958  * On entry, the current task has a reference on a non-NULL @mpol.
1959  * This must be released on exit.
1960  * This is called at get_inode() calls and we can use GFP_KERNEL.
1961  */
1962 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1963 {
1964         int ret;
1965
1966         sp->root = RB_ROOT;             /* empty tree == default mempolicy */
1967         spin_lock_init(&sp->lock);
1968
1969         if (mpol) {
1970                 struct vm_area_struct pvma;
1971                 struct mempolicy *new;
1972                 NODEMASK_SCRATCH(scratch);
1973
1974                 if (!scratch)
1975                         return;
1976                 /* contextualize the tmpfs mount point mempolicy */
1977                 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1978                 if (IS_ERR(new)) {
1979                         mpol_put(mpol); /* drop our ref on sb mpol */
1980                         NODEMASK_SCRATCH_FREE(scratch);
1981                         return;         /* no valid nodemask intersection */
1982                 }
1983
1984                 task_lock(current);
1985                 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
1986                 task_unlock(current);
1987                 mpol_put(mpol); /* drop our ref on sb mpol */
1988                 if (ret) {
1989                         NODEMASK_SCRATCH_FREE(scratch);
1990                         mpol_put(new);
1991                         return;
1992                 }
1993
1994                 /* Create pseudo-vma that contains just the policy */
1995                 memset(&pvma, 0, sizeof(struct vm_area_struct));
1996                 pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
1997                 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
1998                 mpol_put(new);                  /* drop initial ref */
1999                 NODEMASK_SCRATCH_FREE(scratch);
2000         }
2001 }
2002
2003 int mpol_set_shared_policy(struct shared_policy *info,
2004                         struct vm_area_struct *vma, struct mempolicy *npol)
2005 {
2006         int err;
2007         struct sp_node *new = NULL;
2008         unsigned long sz = vma_pages(vma);
2009
2010         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2011                  vma->vm_pgoff,
2012                  sz, npol ? npol->mode : -1,
2013                  npol ? npol->flags : -1,
2014                  npol ? nodes_addr(npol->v.nodes)[0] : -1);
2015
2016         if (npol) {
2017                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2018                 if (!new)
2019                         return -ENOMEM;
2020         }
2021         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2022         if (err && new)
2023                 kmem_cache_free(sn_cache, new);
2024         return err;
2025 }
2026
2027 /* Free a backing policy store on inode delete. */
2028 void mpol_free_shared_policy(struct shared_policy *p)
2029 {
2030         struct sp_node *n;
2031         struct rb_node *next;
2032
2033         if (!p->root.rb_node)
2034                 return;
2035         spin_lock(&p->lock);
2036         next = rb_first(&p->root);
2037         while (next) {
2038                 n = rb_entry(next, struct sp_node, nd);
2039                 next = rb_next(&n->nd);
2040                 rb_erase(&n->nd, &p->root);
2041                 mpol_put(n->policy);
2042                 kmem_cache_free(sn_cache, n);
2043         }
2044         spin_unlock(&p->lock);
2045 }
2046
2047 /* assumes fs == KERNEL_DS */
2048 void __init numa_policy_init(void)
2049 {
2050         nodemask_t interleave_nodes;
2051         unsigned long largest = 0;
2052         int nid, prefer = 0;
2053
2054         policy_cache = kmem_cache_create("numa_policy",
2055                                          sizeof(struct mempolicy),
2056                                          0, SLAB_PANIC, NULL);
2057
2058         sn_cache = kmem_cache_create("shared_policy_node",
2059                                      sizeof(struct sp_node),
2060                                      0, SLAB_PANIC, NULL);
2061
2062         /*
2063          * Set interleaving policy for system init. Interleaving is only
2064          * enabled across suitably sized nodes (default is >= 16MB), or
2065          * fall back to the largest node if they're all smaller.
2066          */
2067         nodes_clear(interleave_nodes);
2068         for_each_node_state(nid, N_HIGH_MEMORY) {
2069                 unsigned long total_pages = node_present_pages(nid);
2070
2071                 /* Preserve the largest node */
2072                 if (largest < total_pages) {
2073                         largest = total_pages;
2074                         prefer = nid;
2075                 }
2076
2077                 /* Interleave this node? */
2078                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2079                         node_set(nid, interleave_nodes);
2080         }
2081
2082         /* All too small, use the largest */
2083         if (unlikely(nodes_empty(interleave_nodes)))
2084                 node_set(prefer, interleave_nodes);
2085
2086         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2087                 printk("numa_policy_init: interleaving failed\n");
2088 }
2089
2090 /* Reset policy of current process to default */
2091 void numa_default_policy(void)
2092 {
2093         do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2094 }
2095
2096 /*
2097  * Parse and format mempolicy from/to strings
2098  */
2099
/*
 * "local" is a pseudo-policy:  MPOL_PREFERRED with the MPOL_F_LOCAL flag.
 * It is used only by mpol_parse_str() and mpol_to_str().
 *
 * policy_types[] is indexed by mode value; the last slot, MPOL_LOCAL, is
 * the pseudo-policy.
 */
#define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
static const char * const policy_types[] = {
        "default",
        "prefer",
        "bind",
        "interleave",
        "local"
};
2107
2108
#ifdef CONFIG_TMPFS
/**
 * mpol_parse_str - parse string to mempolicy
 * @str:  string containing mempolicy to parse
 * @mpol:  pointer to struct mempolicy pointer, returned on success.
 * @no_context:  flag whether to "contextualize" the mempolicy
 *
 * Format of input:
 *      <mode>[=<flags>][:<nodelist>]
 *
 * if @no_context is true, save the input nodemask in w.user_nodemask in
 * the returned mempolicy.  This will be used to "clone" the mempolicy in
 * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
 * mount option.  Note that if 'static' or 'relative' mode flags were
 * specified, the input nodemask will already have been saved.  Saving
 * it again is redundant, but safe.
 *
 * @str is modified while parsing (the '=' and ':' separators are replaced
 * by NUL) and restored before returning, so that the caller can still print
 * the complete string in an error message.
 *
 * For mode "default" *@mpol is set to NULL on success.
 *
 * On success, returns 0, else 1
 */
int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
{
        struct mempolicy *new = NULL;
        unsigned short uninitialized_var(mode);
        unsigned short uninitialized_var(mode_flags);
        nodemask_t nodes;
        char *nodelist = strchr(str, ':');
        char *flags = strchr(str, '=');
        int i;
        int err = 1;

        /*
         * Only an '=' in front of the ':' introduces a flags field.  One
         * found inside the nodelist stays part of the nodelist, which then
         * fails to parse.
         */
        if (flags && nodelist && flags > nodelist)
                flags = NULL;

        /*
         * Split off the flags before anything can jump to "out":  the
         * restore code there undoes the increment done here, so it must
         * never run with "flags" still pointing at the '=' itself.
         */
        if (flags)
                *flags++ = '\0';        /* terminate mode string */

        if (nodelist) {
                /* NUL-terminate mode or flags string */
                *nodelist++ = '\0';
                if (nodelist_parse(nodelist, nodes))
                        goto out;
                if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
                        goto out;
        } else
                nodes_clear(nodes);

        for (i = 0; i <= MPOL_LOCAL; i++) {
                if (!strcmp(str, policy_types[i])) {
                        mode = i;
                        break;
                }
        }
        if (i > MPOL_LOCAL)
                goto out;

        switch (mode) {
        case MPOL_PREFERRED:
                /*
                 * Insist on a nodelist of one node only
                 */
                if (nodelist) {
                        char *rest = nodelist;
                        while (isdigit(*rest))
                                rest++;
                        if (*rest)
                                goto out;
                }
                break;
        case MPOL_INTERLEAVE:
                /*
                 * Default to online nodes with memory if no nodelist
                 */
                if (!nodelist)
                        nodes = node_states[N_HIGH_MEMORY];
                break;
        case MPOL_LOCAL:
                /*
                 * Don't allow a nodelist;  mpol_new() checks flags
                 */
                if (nodelist)
                        goto out;
                mode = MPOL_PREFERRED;
                break;
        case MPOL_DEFAULT:
                /*
                 * Insist on a empty nodelist
                 */
                if (!nodelist)
                        err = 0;
                goto out;
        case MPOL_BIND:
                /*
                 * Insist on a nodelist
                 */
                if (!nodelist)
                        goto out;
        }

        mode_flags = 0;
        if (flags) {
                /*
                 * Currently, we only support two mutually exclusive
                 * mode flags.
                 */
                if (!strcmp(flags, "static"))
                        mode_flags |= MPOL_F_STATIC_NODES;
                else if (!strcmp(flags, "relative"))
                        mode_flags |= MPOL_F_RELATIVE_NODES;
                else
                        goto out;
        }

        new = mpol_new(mode, mode_flags, &nodes);
        if (IS_ERR(new))
                goto out;

        {
                int ret;
                NODEMASK_SCRATCH(scratch);
                if (scratch) {
                        task_lock(current);
                        ret = mpol_set_nodemask(new, &nodes, scratch);
                        task_unlock(current);
                } else
                        ret = -ENOMEM;
                NODEMASK_SCRATCH_FREE(scratch);
                if (ret) {
                        mpol_put(new);
                        goto out;
                }
        }
        err = 0;
        if (no_context) {
                /* save for contextualization */
                new->w.user_nodemask = nodes;
        }

out:
        /* Restore string for error message */
        if (nodelist)
                *--nodelist = ':';
        if (flags)
                *--flags = '=';
        if (!err)
                *mpol = new;
        return err;
}
#endif /* CONFIG_TMPFS */
2254
2255 /**
2256  * mpol_to_str - format a mempolicy structure for printing
2257  * @buffer:  to contain formatted mempolicy string
2258  * @maxlen:  length of @buffer
2259  * @pol:  pointer to mempolicy to be formatted
2260  * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
2261  *
2262  * Convert a mempolicy into a string.
2263  * Returns the number of characters in buffer (if positive)
2264  * or an error (negative)
2265  */
2266 int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2267 {
2268         char *p = buffer;
2269         int l;
2270         nodemask_t nodes;
2271         unsigned short mode;
2272         unsigned short flags = pol ? pol->flags : 0;
2273
2274         /*
2275          * Sanity check:  room for longest mode, flag and some nodes
2276          */
2277         VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2278
2279         if (!pol || pol == &default_policy)
2280                 mode = MPOL_DEFAULT;
2281         else
2282                 mode = pol->mode;
2283
2284         switch (mode) {
2285         case MPOL_DEFAULT:
2286                 nodes_clear(nodes);
2287                 break;
2288
2289         case MPOL_PREFERRED:
2290                 nodes_clear(nodes);
2291                 if (flags & MPOL_F_LOCAL)
2292                         mode = MPOL_LOCAL;      /* pseudo-policy */
2293                 else
2294                         node_set(pol->v.preferred_node, nodes);
2295                 break;
2296
2297         case MPOL_BIND:
2298                 /* Fall through */
2299         case MPOL_INTERLEAVE:
2300                 if (no_context)
2301                         nodes = pol->w.user_nodemask;
2302                 else
2303                         nodes = pol->v.nodes;
2304                 break;
2305
2306         default:
2307                 BUG();
2308         }
2309
2310         l = strlen(policy_types[mode]);
2311         if (buffer + maxlen < p + l + 1)
2312                 return -ENOSPC;
2313
2314         strcpy(p, policy_types[mode]);
2315         p += l;
2316
2317         if (flags & MPOL_MODE_FLAGS) {
2318                 if (buffer + maxlen < p + 2)
2319                         return -ENOSPC;
2320                 *p++ = '=';
2321
2322                 /*
2323                  * Currently, the only defined flags are mutually exclusive
2324                  */
2325                 if (flags & MPOL_F_STATIC_NODES)
2326                         p += snprintf(p, buffer + maxlen - p, "static");
2327                 else if (flags & MPOL_F_RELATIVE_NODES)
2328                         p += snprintf(p, buffer + maxlen - p, "relative");
2329         }
2330
2331         if (!nodes_empty(nodes)) {
2332                 if (buffer + maxlen < p + 2)
2333                         return -ENOSPC;
2334                 *p++ = ':';
2335                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2336         }
2337         return p - buffer;
2338 }
2339
2340 struct numa_maps {
2341         unsigned long pages;
2342         unsigned long anon;
2343         unsigned long active;
2344         unsigned long writeback;
2345         unsigned long mapcount_max;
2346         unsigned long dirty;
2347         unsigned long swapcache;
2348         unsigned long node[MAX_NUMNODES];
2349 };
2350
2351 static void gather_stats(struct page *page, void *private, int pte_dirty)
2352 {
2353         struct numa_maps *md = private;
2354         int count = page_mapcount(page);
2355
2356         md->pages++;
2357         if (pte_dirty || PageDirty(page))
2358                 md->dirty++;
2359
2360         if (PageSwapCache(page))
2361                 md->swapcache++;
2362
2363         if (PageActive(page) || PageUnevictable(page))
2364                 md->active++;
2365
2366         if (PageWriteback(page))
2367                 md->writeback++;
2368
2369         if (PageAnon(page))
2370                 md->anon++;
2371
2372         if (count > md->mapcount_max)
2373                 md->mapcount_max = count;
2374
2375         md->node[page_to_nid(page)]++;
2376 }
2377
#ifdef CONFIG_HUGETLB_PAGE
/*
 * Walk [start, end) of a hugetlb VMA in huge-page sized steps and feed
 * every page that is mapped there to gather_stats().
 */
static void check_huge_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct numa_maps *md)
{
        struct hstate *h = hstate_vma(vma);
        unsigned long step = huge_page_size(h);
        unsigned long addr;

        for (addr = start; addr < end; addr += step) {
                struct page *page;
                pte_t *ptep;
                pte_t pte;

                ptep = huge_pte_offset(vma->vm_mm, addr & huge_page_mask(h));
                if (!ptep)
                        continue;

                pte = *ptep;
                if (pte_none(pte))
                        continue;

                page = pte_page(pte);
                if (page)
                        gather_stats(page, md, pte_dirty(*ptep));
        }
}
#else
/* Without hugetlb support there is nothing to walk. */
static inline void check_huge_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct numa_maps *md)
{
}
#endif
2414
2415 /*
2416  * Display pages allocated per node and memory policy via /proc.
2417  */
2418 int show_numa_map(struct seq_file *m, void *v)
2419 {
2420         struct proc_maps_private *priv = m->private;
2421         struct vm_area_struct *vma = v;
2422         struct numa_maps *md;
2423         struct file *file = vma->vm_file;
2424         struct mm_struct *mm = vma->vm_mm;
2425         struct mempolicy *pol;
2426         int n;
2427         char buffer[50];
2428
2429         if (!mm)
2430                 return 0;
2431
2432         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2433         if (!md)
2434                 return 0;
2435
2436         pol = get_vma_policy(priv->task, vma, vma->vm_start);
2437         mpol_to_str(buffer, sizeof(buffer), pol, 0);
2438         mpol_cond_put(pol);
2439
2440         seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2441
2442         if (file) {
2443                 seq_printf(m, " file=");
2444                 seq_path(m, &file->f_path, "\n\t= ");
2445         } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2446                 seq_printf(m, " heap");
2447         } else if (vma->vm_start <= mm->start_stack &&
2448                         vma->vm_end >= mm->start_stack) {
2449                 seq_printf(m, " stack");
2450         }
2451
2452         if (is_vm_hugetlb_page(vma)) {
2453                 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2454                 seq_printf(m, " huge");
2455         } else {
2456                 check_pgd_range(vma, vma->vm_start, vma->vm_end,
2457                         &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2458         }
2459
2460         if (!md->pages)
2461                 goto out;
2462
2463         if (md->anon)
2464                 seq_printf(m," anon=%lu",md->anon);
2465
2466         if (md->dirty)
2467                 seq_printf(m," dirty=%lu",md->dirty);
2468
2469         if (md->pages != md->anon && md->pages != md->dirty)
2470                 seq_printf(m, " mapped=%lu", md->pages);
2471
2472         if (md->mapcount_max > 1)
2473                 seq_printf(m, " mapmax=%lu", md->mapcount_max);
2474
2475         if (md->swapcache)
2476                 seq_printf(m," swapcache=%lu", md->swapcache);
2477
2478         if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2479                 seq_printf(m," active=%lu", md->active);
2480
2481         if (md->writeback)
2482                 seq_printf(m," writeback=%lu", md->writeback);
2483
2484         for_each_node_state(n, N_HIGH_MEMORY)
2485                 if (md->node[n])
2486                         seq_printf(m, " N%d=%lu", n, md->node[n]);
2487 out:
2488         seq_putc(m, '\n');
2489         kfree(md);
2490
2491         if (m->count < m->size)
2492                 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2493         return 0;
2494 }