mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
   19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
   49  * on systems with highmem, kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66 */
  67
  68 #include <linux/mempolicy.h>
  69 #include <linux/mm.h>
  70 #include <linux/highmem.h>
  71 #include <linux/hugetlb.h>
  72 #include <linux/kernel.h>
  73 #include <linux/sched.h>
  74 #include <linux/nodemask.h>
  75 #include <linux/cpuset.h>
  76 #include <linux/gfp.h>
  77 #include <linux/slab.h>
  78 #include <linux/string.h>
  79 #include <linux/module.h>
  80 #include <linux/nsproxy.h>
  81 #include <linux/interrupt.h>
  82 #include <linux/init.h>
  83 #include <linux/compat.h>
  84 #include <linux/swap.h>
  85 #include <linux/seq_file.h>
  86 #include <linux/proc_fs.h>
  87 #include <linux/migrate.h>
  88 #include <linux/rmap.h>
  89 #include <linux/security.h>
  90 #include <linux/syscalls.h>
  91 #include <linux/ctype.h>
  92
  93 #include <asm/tlbflush.h>
  94 #include <asm/uaccess.h>
  95
  96 /* Internal flags */
  97 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
  98 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
  99 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
 100
 101 static struct kmem_cache *policy_cache;
 102 static struct kmem_cache *sn_cache;
 103
 104 /* Highest zone. An specific allocation for a zone below that is not
 105    policied. */
 106 enum zone_type policy_zone = 0;
 107
 108 /*
 109  * run-time system-wide default policy => local allocation
 110  */
 111 struct mempolicy default_policy = {
 112         .refcnt = ATOMIC_INIT(1), /* never free it */
 113         .mode = MPOL_PREFERRED,
 114         .flags = MPOL_F_LOCAL,
 115 };
 116
 117 static const struct mempolicy_operations {
 118         int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 119         void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
 120 } mpol_ops[MPOL_MAX];
 121
 122 /* Check that the nodemask contains at least one populated zone */
 123 static int is_valid_nodemask(const nodemask_t *nodemask)
 124 {
 125         int nd, k;
 126
 127         /* Check that there is something useful in this mask */
 128         k = policy_zone;
 129
 130         for_each_node_mask(nd, *nodemask) {
 131                 struct zone *z;
 132
 133                 for (k = 0; k <= policy_zone; k++) {
 134                         z = &NODE_DATA(nd)->node_zones[k];
 135                         if (z->present_pages > 0)
 136                                 return 1;
 137                 }
 138         }
 139
 140         return 0;
 141 }
 142
 143 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 144 {
 145         return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
 146 }
 147
 148 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 149                                    const nodemask_t *rel)
 150 {
 151         nodemask_t tmp;
 152         nodes_fold(tmp, *orig, nodes_weight(*rel));
 153         nodes_onto(*ret, tmp, *rel);
 154 }
 155
 156 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 157 {
 158         if (nodes_empty(*nodes))
 159                 return -EINVAL;
 160         pol->v.nodes = *nodes;
 161         return 0;
 162 }
 163
 164 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 165 {
 166         if (!nodes)
 167                 pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 168         else if (nodes_empty(*nodes))
 169                 return -EINVAL;                 /*  no allowed nodes */
 170         else
 171                 pol->v.preferred_node = first_node(*nodes);
 172         return 0;
 173 }
 174
 175 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 176 {
 177         if (!is_valid_nodemask(nodes))
 178                 return -EINVAL;
 179         pol->v.nodes = *nodes;
 180         return 0;
 181 }
 182
 183 /* Create a new policy */
 184 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 185                                   nodemask_t *nodes)
 186 {
 187         struct mempolicy *policy;
 188         nodemask_t cpuset_context_nmask;
 189         int ret;
 190
 191         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 192                  mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
 193
 194         if (mode == MPOL_DEFAULT) {
 195                 if (nodes && !nodes_empty(*nodes))
 196                         return ERR_PTR(-EINVAL);
 197                 return NULL;    /* simply delete any existing policy */
 198         }
 199         VM_BUG_ON(!nodes);
 200
 201         /*
 202          * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 203          * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 204          * All other modes require a valid pointer to a non-empty nodemask.
 205          */
 206         if (mode == MPOL_PREFERRED) {
 207                 if (nodes_empty(*nodes)) {
 208                         if (((flags & MPOL_F_STATIC_NODES) ||
 209                              (flags & MPOL_F_RELATIVE_NODES)))
 210                                 return ERR_PTR(-EINVAL);
 211                         nodes = NULL;   /* flag local alloc */
 212                 }
 213         } else if (nodes_empty(*nodes))
 214                 return ERR_PTR(-EINVAL);
 215         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 216         if (!policy)
 217                 return ERR_PTR(-ENOMEM);
 218         atomic_set(&policy->refcnt, 1);
 219         policy->mode = mode;
 220         policy->flags = flags;
 221
 222         if (nodes) {
 223                 /*
 224                  * cpuset related setup doesn't apply to local allocation
 225                  */
 226                 cpuset_update_task_memory_state();
 227                 if (flags & MPOL_F_RELATIVE_NODES)
 228                         mpol_relative_nodemask(&cpuset_context_nmask, nodes,
 229                                                &cpuset_current_mems_allowed);
 230                 else
 231                         nodes_and(cpuset_context_nmask, *nodes,
 232                                   cpuset_current_mems_allowed);
 233                 if (mpol_store_user_nodemask(policy))
 234                         policy->w.user_nodemask = *nodes;
 235                 else
 236                         policy->w.cpuset_mems_allowed =
 237                                                 cpuset_mems_allowed(current);
 238         }
 239
 240         ret = mpol_ops[mode].create(policy,
 241                                 nodes ? &cpuset_context_nmask : NULL);
 242         if (ret < 0) {
 243                 kmem_cache_free(policy_cache, policy);
 244                 return ERR_PTR(ret);
 245         }
 246         return policy;
 247 }
 248
 249 /* Slow path of a mpol destructor. */
 250 void __mpol_put(struct mempolicy *p)
 251 {
 252         if (!atomic_dec_and_test(&p->refcnt))
 253                 return;
 254         kmem_cache_free(policy_cache, p);
 255 }
 256
 257 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 258 {
 259 }
 260
 261 static void mpol_rebind_nodemask(struct mempolicy *pol,
 262                                  const nodemask_t *nodes)
 263 {
 264         nodemask_t tmp;
 265
 266         if (pol->flags & MPOL_F_STATIC_NODES)
 267                 nodes_and(tmp, pol->w.user_nodemask, *nodes);
 268         else if (pol->flags & MPOL_F_RELATIVE_NODES)
 269                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 270         else {
 271                 nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
 272                             *nodes);
 273                 pol->w.cpuset_mems_allowed = *nodes;
 274         }
 275
 276         pol->v.nodes = tmp;
 277         if (!node_isset(current->il_next, tmp)) {
 278                 current->il_next = next_node(current->il_next, tmp);
 279                 if (current->il_next >= MAX_NUMNODES)
 280                         current->il_next = first_node(tmp);
 281                 if (current->il_next >= MAX_NUMNODES)
 282                         current->il_next = numa_node_id();
 283         }
 284 }
 285
 286 static void mpol_rebind_preferred(struct mempolicy *pol,
 287                                   const nodemask_t *nodes)
 288 {
 289         nodemask_t tmp;
 290
 291         if (pol->flags & MPOL_F_STATIC_NODES) {
 292                 int node = first_node(pol->w.user_nodemask);
 293
 294                 if (node_isset(node, *nodes)) {
 295                         pol->v.preferred_node = node;
 296                         pol->flags &= ~MPOL_F_LOCAL;
 297                 } else
 298                         pol->flags |= MPOL_F_LOCAL;
 299         } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 300                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 301                 pol->v.preferred_node = first_node(tmp);
 302         } else if (!(pol->flags & MPOL_F_LOCAL)) {
 303                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
 304                                                    pol->w.cpuset_mems_allowed,
 305                                                    *nodes);
 306                 pol->w.cpuset_mems_allowed = *nodes;
 307         }
 308 }
 309
 310 /* Migrate a policy to a different set of nodes */
 311 static void mpol_rebind_policy(struct mempolicy *pol,
 312                                const nodemask_t *newmask)
 313 {
 314         if (!pol)
 315                 return;
 316         if (!mpol_store_user_nodemask(pol) &&
 317             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 318                 return;
 319         mpol_ops[pol->mode].rebind(pol, newmask);
 320 }
 321
 322 /*
 323  * Wrapper for mpol_rebind_policy() that just requires task
 324  * pointer, and updates task mempolicy.
 325  */
 326
 327 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 328 {
 329         mpol_rebind_policy(tsk->mempolicy, new);
 330 }
 331
 332 /*
 333  * Rebind each vma in mm to new nodemask.
 334  *
 335  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 336  */
 337
 338 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 339 {
 340         struct vm_area_struct *vma;
 341
 342         down_write(&mm->mmap_sem);
 343         for (vma = mm->mmap; vma; vma = vma->vm_next)
 344                 mpol_rebind_policy(vma->vm_policy, new);
 345         up_write(&mm->mmap_sem);
 346 }
 347
 348 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 349         [MPOL_DEFAULT] = {
 350                 .rebind = mpol_rebind_default,
 351         },
 352         [MPOL_INTERLEAVE] = {
 353                 .create = mpol_new_interleave,
 354                 .rebind = mpol_rebind_nodemask,
 355         },
 356         [MPOL_PREFERRED] = {
 357                 .create = mpol_new_preferred,
 358                 .rebind = mpol_rebind_preferred,
 359         },
 360         [MPOL_BIND] = {
 361                 .create = mpol_new_bind,
 362                 .rebind = mpol_rebind_nodemask,
 363         },
 364 };
 365
 366 static void gather_stats(struct page *, void *, int pte_dirty);
 367 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 368                                 unsigned long flags);
 369
 370 /* Scan through pages checking if pages follow certain conditions. */
 371 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 372                 unsigned long addr, unsigned long end,
 373                 const nodemask_t *nodes, unsigned long flags,
 374                 void *private)
 375 {
 376         pte_t *orig_pte;
 377         pte_t *pte;
 378         spinlock_t *ptl;
 379
 380         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 381         do {
 382                 struct page *page;
 383                 int nid;
 384
 385                 if (!pte_present(*pte))
 386                         continue;
 387                 page = vm_normal_page(vma, addr, *pte);
 388                 if (!page)
 389                         continue;
 390                 /*
 391                  * The check for PageReserved here is important to avoid
 392                  * handling zero pages and other pages that may have been
 393                  * marked special by the system.
 394                  *
 395                  * If the PageReserved would not be checked here then f.e.
 396                  * the location of the zero page could have an influence
 397                  * on MPOL_MF_STRICT, zero pages would be counted for
 398                  * the per node stats, and there would be useless attempts
 399                  * to put zero pages on the migration list.
 400                  */
 401                 if (PageReserved(page))
 402                         continue;
 403                 nid = page_to_nid(page);
 404                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 405                         continue;
 406
 407                 if (flags & MPOL_MF_STATS)
 408                         gather_stats(page, private, pte_dirty(*pte));
 409                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 410                         migrate_page_add(page, private, flags);
 411                 else
 412                         break;
 413         } while (pte++, addr += PAGE_SIZE, addr != end);
 414         pte_unmap_unlock(orig_pte, ptl);
 415         return addr != end;
 416 }
 417
 418 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 419                 unsigned long addr, unsigned long end,
 420                 const nodemask_t *nodes, unsigned long flags,
 421                 void *private)
 422 {
 423         pmd_t *pmd;
 424         unsigned long next;
 425
 426         pmd = pmd_offset(pud, addr);
 427         do {
 428                 next = pmd_addr_end(addr, end);
 429                 if (pmd_none_or_clear_bad(pmd))
 430                         continue;
 431                 if (check_pte_range(vma, pmd, addr, next, nodes,
 432                                     flags, private))
 433                         return -EIO;
 434         } while (pmd++, addr = next, addr != end);
 435         return 0;
 436 }
 437
 438 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 439                 unsigned long addr, unsigned long end,
 440                 const nodemask_t *nodes, unsigned long flags,
 441                 void *private)
 442 {
 443         pud_t *pud;
 444         unsigned long next;
 445
 446         pud = pud_offset(pgd, addr);
 447         do {
 448                 next = pud_addr_end(addr, end);
 449                 if (pud_none_or_clear_bad(pud))
 450                         continue;
 451                 if (check_pmd_range(vma, pud, addr, next, nodes,
 452                                     flags, private))
 453                         return -EIO;
 454         } while (pud++, addr = next, addr != end);
 455         return 0;
 456 }
 457
 458 static inline int check_pgd_range(struct vm_area_struct *vma,
 459                 unsigned long addr, unsigned long end,
 460                 const nodemask_t *nodes, unsigned long flags,
 461                 void *private)
 462 {
 463         pgd_t *pgd;
 464         unsigned long next;
 465
 466         pgd = pgd_offset(vma->vm_mm, addr);
 467         do {
 468                 next = pgd_addr_end(addr, end);
 469                 if (pgd_none_or_clear_bad(pgd))
 470                         continue;
 471                 if (check_pud_range(vma, pgd, addr, next, nodes,
 472                                     flags, private))
 473                         return -EIO;
 474         } while (pgd++, addr = next, addr != end);
 475         return 0;
 476 }
 477
 478 /*
 479  * Check if all pages in a range are on a set of nodes.
 480  * If pagelist != NULL then isolate pages from the LRU and
 481  * put them on the pagelist.
 482  */
 483 static struct vm_area_struct *
 484 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 485                 const nodemask_t *nodes, unsigned long flags, void *private)
 486 {
 487         int err;
 488         struct vm_area_struct *first, *vma, *prev;
 489
 490         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 491
 492                 err = migrate_prep();
 493                 if (err)
 494                         return ERR_PTR(err);
 495         }
 496
 497         first = find_vma(mm, start);
 498         if (!first)
 499                 return ERR_PTR(-EFAULT);
 500         prev = NULL;
 501         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 502                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 503                         if (!vma->vm_next && vma->vm_end < end)
 504                                 return ERR_PTR(-EFAULT);
 505                         if (prev && prev->vm_end < vma->vm_start)
 506                                 return ERR_PTR(-EFAULT);
 507                 }
 508                 if (!is_vm_hugetlb_page(vma) &&
 509                     ((flags & MPOL_MF_STRICT) ||
 510                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 511                                 vma_migratable(vma)))) {
 512                         unsigned long endvma = vma->vm_end;
 513
 514                         if (endvma > end)
 515                                 endvma = end;
 516                         if (vma->vm_start > start)
 517                                 start = vma->vm_start;
 518                         err = check_pgd_range(vma, start, endvma, nodes,
 519                                                 flags, private);
 520                         if (err) {
 521                                 first = ERR_PTR(err);
 522                                 break;
 523                         }
 524                 }
 525                 prev = vma;
 526         }
 527         return first;
 528 }
 529
 530 /* Apply policy to a single VMA */
 531 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 532 {
 533         int err = 0;
 534         struct mempolicy *old = vma->vm_policy;
 535
 536         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 537                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 538                  vma->vm_ops, vma->vm_file,
 539                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 540
 541         if (vma->vm_ops && vma->vm_ops->set_policy)
 542                 err = vma->vm_ops->set_policy(vma, new);
 543         if (!err) {
 544                 mpol_get(new);
 545                 vma->vm_policy = new;
 546                 mpol_put(old);
 547         }
 548         return err;
 549 }
 550
 551 /* Step 2: apply policy to a range and do splits. */
 552 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 553                        unsigned long end, struct mempolicy *new)
 554 {
 555         struct vm_area_struct *next;
 556         int err;
 557
 558         err = 0;
 559         for (; vma && vma->vm_start < end; vma = next) {
 560                 next = vma->vm_next;
 561                 if (vma->vm_start < start)
 562                         err = split_vma(vma->vm_mm, vma, start, 1);
 563                 if (!err && vma->vm_end > end)
 564                         err = split_vma(vma->vm_mm, vma, end, 0);
 565                 if (!err)
 566                         err = policy_vma(vma, new);
 567                 if (err)
 568                         break;
 569         }
 570         return err;
 571 }
 572
 573 /*
 574  * Update task->flags PF_MEMPOLICY bit: set iff non-default
 575  * mempolicy.  Allows more rapid checking of this (combined perhaps
 576  * with other PF_* flag bits) on memory allocation hot code paths.
 577  *
 578  * If called from outside this file, the task 'p' should -only- be
 579  * a newly forked child not yet visible on the task list, because
 580  * manipulating the task flags of a visible task is not safe.
 581  *
 582  * The above limitation is why this routine has the funny name
 583  * mpol_fix_fork_child_flag().
 584  *
 585  * It is also safe to call this with a task pointer of current,
 586  * which the static wrapper mpol_set_task_struct_flag() does,
 587  * for use within this file.
 588  */
 589
 590 void mpol_fix_fork_child_flag(struct task_struct *p)
 591 {
 592         if (p->mempolicy)
 593                 p->flags |= PF_MEMPOLICY;
 594         else
 595                 p->flags &= ~PF_MEMPOLICY;
 596 }
 597
 598 static void mpol_set_task_struct_flag(void)
 599 {
 600         mpol_fix_fork_child_flag(current);
 601 }
 602
 603 /* Set the process memory policy */
 604 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 605                              nodemask_t *nodes)
 606 {
 607         struct mempolicy *new;
 608         struct mm_struct *mm = current->mm;
 609
 610         new = mpol_new(mode, flags, nodes);
 611         if (IS_ERR(new))
 612                 return PTR_ERR(new);
 613
 614         /*
 615          * prevent changing our mempolicy while show_numa_maps()
 616          * is using it.
 617          * Note:  do_set_mempolicy() can be called at init time
 618          * with no 'mm'.
 619          */
 620         if (mm)
 621                 down_write(&mm->mmap_sem);
 622         mpol_put(current->mempolicy);
 623         current->mempolicy = new;
 624         mpol_set_task_struct_flag();
 625         if (new && new->mode == MPOL_INTERLEAVE &&
 626             nodes_weight(new->v.nodes))
 627                 current->il_next = first_node(new->v.nodes);
 628         if (mm)
 629                 up_write(&mm->mmap_sem);
 630
 631         return 0;
 632 }
 633
 634 /*
 635  * Return nodemask for policy for get_mempolicy() query
 636  */
 637 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 638 {
 639         nodes_clear(*nodes);
 640         if (p == &default_policy)
 641                 return;
 642
 643         switch (p->mode) {
 644         case MPOL_BIND:
 645                 /* Fall through */
 646         case MPOL_INTERLEAVE:
 647                 *nodes = p->v.nodes;
 648                 break;
 649         case MPOL_PREFERRED:
 650                 if (!(p->flags & MPOL_F_LOCAL))
 651                         node_set(p->v.preferred_node, *nodes);
 652                 /* else return empty node mask for local allocation */
 653                 break;
 654         default:
 655                 BUG();
 656         }
 657 }
 658
 659 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 660 {
 661         struct page *p;
 662         int err;
 663
 664         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 665         if (err >= 0) {
 666                 err = page_to_nid(p);
 667                 put_page(p);
 668         }
 669         return err;
 670 }
 671
 672 /* Retrieve NUMA policy */
 673 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 674                              unsigned long addr, unsigned long flags)
 675 {
 676         int err;
 677         struct mm_struct *mm = current->mm;
 678         struct vm_area_struct *vma = NULL;
 679         struct mempolicy *pol = current->mempolicy;
 680
 681         cpuset_update_task_memory_state();
 682         if (flags &
 683                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 684                 return -EINVAL;
 685
 686         if (flags & MPOL_F_MEMS_ALLOWED) {
 687                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 688                         return -EINVAL;
 689                 *policy = 0;    /* just so it's initialized */
 690                 *nmask  = cpuset_current_mems_allowed;
 691                 return 0;
 692         }
 693
 694         if (flags & MPOL_F_ADDR) {
 695                 /*
 696                  * Do NOT fall back to task policy if the
 697                  * vma/shared policy at addr is NULL.  We
 698                  * want to return MPOL_DEFAULT in this case.
 699                  */
 700                 down_read(&mm->mmap_sem);
 701                 vma = find_vma_intersection(mm, addr, addr+1);
 702                 if (!vma) {
 703                         up_read(&mm->mmap_sem);
 704                         return -EFAULT;
 705                 }
 706                 if (vma->vm_ops && vma->vm_ops->get_policy)
 707                         pol = vma->vm_ops->get_policy(vma, addr);
 708                 else
 709                         pol = vma->vm_policy;
 710         } else if (addr)
 711                 return -EINVAL;
 712
 713         if (!pol)
 714                 pol = &default_policy;  /* indicates default behavior */
 715
 716         if (flags & MPOL_F_NODE) {
 717                 if (flags & MPOL_F_ADDR) {
 718                         err = lookup_node(mm, addr);
 719                         if (err < 0)
 720                                 goto out;
 721                         *policy = err;
 722                 } else if (pol == current->mempolicy &&
 723                                 pol->mode == MPOL_INTERLEAVE) {
 724                         *policy = current->il_next;
 725                 } else {
 726                         err = -EINVAL;
 727                         goto out;
 728                 }
 729         } else {
 730                 *policy = pol == &default_policy ? MPOL_DEFAULT :
 731                                                 pol->mode;
 732                 *policy |= pol->flags;
 733         }
 734
 735         if (vma) {
 736                 up_read(&current->mm->mmap_sem);
 737                 vma = NULL;
 738         }
 739
 740         err = 0;
 741         if (nmask)
 742                 get_policy_nodemask(pol, nmask);
 743
 744  out:
 745         mpol_cond_put(pol);
 746         if (vma)
 747                 up_read(&current->mm->mmap_sem);
 748         return err;
 749 }
 750
 751 #ifdef CONFIG_MIGRATION
 752 /*
 753  * page migration
 754  */
 755 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 756                                 unsigned long flags)
 757 {
 758         /*
 759          * Avoid migrating a page that is shared with others.
 760          */
 761         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
 762                 isolate_lru_page(page, pagelist);
 763 }
 764
 765 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 766 {
 767         return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
 768 }
 769
 770 /*
 771  * Migrate pages from one node to a target node.
 772  * Returns error or the number of pages not migrated.
 773  */
 774 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 775                            int flags)
 776 {
 777         nodemask_t nmask;
 778         LIST_HEAD(pagelist);
 779         int err = 0;
 780
 781         nodes_clear(nmask);
 782         node_set(source, nmask);
 783
 784         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 785                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 786
 787         if (!list_empty(&pagelist))
 788                 err = migrate_pages(&pagelist, new_node_page, dest);
 789
 790         return err;
 791 }
 792
 793 /*
 794  * Move pages between the two nodesets so as to preserve the physical
 795  * layout as much as possible.
 796  *
  797  * Returns the number of pages that could not be moved.
 798  */
 799 int do_migrate_pages(struct mm_struct *mm,
 800         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 801 {
 802         LIST_HEAD(pagelist);
 803         int busy = 0;
 804         int err = 0;
 805         nodemask_t tmp;
 806
 807         down_read(&mm->mmap_sem);
 808
 809         err = migrate_vmas(mm, from_nodes, to_nodes, flags);
 810         if (err)
 811                 goto out;
 812
 813 /*
 814  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 815  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 816  * bit in 'tmp', and return that <source, dest> pair for migration.
 817  * The pair of nodemasks 'to' and 'from' define the map.
 818  *
 819  * If no pair of bits is found that way, fallback to picking some
 820  * pair of 'source' and 'dest' bits that are not the same.  If the
 821  * 'source' and 'dest' bits are the same, this represents a node
 822  * that will be migrating to itself, so no pages need move.
 823  *
 824  * If no bits are left in 'tmp', or if all remaining bits left
 825  * in 'tmp' correspond to the same bit in 'to', return false
 826  * (nothing left to migrate).
 827  *
 828  * This lets us pick a pair of nodes to migrate between, such that
 829  * if possible the dest node is not already occupied by some other
 830  * source node, minimizing the risk of overloading the memory on a
 831  * node that would happen if we migrated incoming memory to a node
 832  * before migrating outgoing memory source that same node.
 833  *
 834  * A single scan of tmp is sufficient.  As we go, we remember the
 835  * most recent <s, d> pair that moved (s != d).  If we find a pair
 836  * that not only moved, but what's better, moved to an empty slot
 837  * (d is not set in tmp), then we break out then, with that pair.
 838  * Otherwise when we finish scannng from_tmp, we at least have the
 839  * most recent <s, d> pair that moved.  If we get all the way through
 840  * the scan of tmp without finding any node that moved, much less
 841  * moved to an empty node, then there is nothing left worth migrating.
 842  */
 843
 844         tmp = *from_nodes;
 845         while (!nodes_empty(tmp)) {
 846                 int s,d;
 847                 int source = -1;
 848                 int dest = 0;
 849
 850                 for_each_node_mask(s, tmp) {
 851                         d = node_remap(s, *from_nodes, *to_nodes);
 852                         if (s == d)
 853                                 continue;
 854
 855                         source = s;     /* Node moved. Memorize */
 856                         dest = d;
 857
 858                         /* dest not in remaining from nodes? */
 859                         if (!node_isset(dest, tmp))
 860                                 break;
 861                 }
 862                 if (source == -1)
 863                         break;
 864
 865                 node_clear(source, tmp);
 866                 err = migrate_to_node(mm, source, dest, flags);
 867                 if (err > 0)
 868                         busy += err;
 869                 if (err < 0)
 870                         break;
 871         }
 872 out:
 873         up_read(&mm->mmap_sem);
 874         if (err < 0)
 875                 return err;
 876         return busy;
 877
 878 }
 879
 880 /*
 881  * Allocate a new page for page migration based on vma policy.
 882  * Start assuming that page is mapped by vma pointed to by @private.
 883  * Search forward from there, if not.  N.B., this assumes that the
 884  * list of pages handed to migrate_pages()--which is how we get here--
 885  * is in virtual address order.
 886  */
 887 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 888 {
 889         struct vm_area_struct *vma = (struct vm_area_struct *)private;
 890         unsigned long uninitialized_var(address);
 891
 892         while (vma) {
 893                 address = page_address_in_vma(page, vma);
 894                 if (address != -EFAULT)
 895                         break;
 896                 vma = vma->vm_next;
 897         }
 898
 899         /*
 900          * if !vma, alloc_page_vma() will use task or system default policy
 901          */
 902         return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 903 }
 904 #else
 905
/*
 * Stub used on the #else side of the conditional above (the matching #if
 * is outside this view -- presumably the no-migration configuration).
 * The body is empty: @page is never placed on @pagelist.
 */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
                                unsigned long flags)
{
}
 910
/*
 * Stub counterpart of do_migrate_pages() for the #else configuration:
 * no pages are touched and the call always fails with -ENOSYS.
 */
int do_migrate_pages(struct mm_struct *mm,
        const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
        return -ENOSYS;
}
 916
/*
 * Stub counterpart of new_vma_page() for the #else configuration:
 * never allocates, always returns NULL.
 */
static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
        return NULL;
}
 921 #endif
 922
 923 static long do_mbind(unsigned long start, unsigned long len,
 924                      unsigned short mode, unsigned short mode_flags,
 925                      nodemask_t *nmask, unsigned long flags)
 926 {
 927         struct vm_area_struct *vma;
 928         struct mm_struct *mm = current->mm;
 929         struct mempolicy *new;
 930         unsigned long end;
 931         int err;
 932         LIST_HEAD(pagelist);
 933
 934         if (flags & ~(unsigned long)(MPOL_MF_STRICT |
 935                                      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 936                 return -EINVAL;
 937         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 938                 return -EPERM;
 939
 940         if (start & ~PAGE_MASK)
 941                 return -EINVAL;
 942
 943         if (mode == MPOL_DEFAULT)
 944                 flags &= ~MPOL_MF_STRICT;
 945
 946         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 947         end = start + len;
 948
 949         if (end < start)
 950                 return -EINVAL;
 951         if (end == start)
 952                 return 0;
 953
 954         new = mpol_new(mode, mode_flags, nmask);
 955         if (IS_ERR(new))
 956                 return PTR_ERR(new);
 957
 958         /*
 959          * If we are using the default policy then operation
 960          * on discontinuous address spaces is okay after all
 961          */
 962         if (!new)
 963                 flags |= MPOL_MF_DISCONTIG_OK;
 964
 965         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
 966                  start, start + len, mode, mode_flags,
 967                  nmask ? nodes_addr(*nmask)[0] : -1);
 968
 969         down_write(&mm->mmap_sem);
 970         vma = check_range(mm, start, end, nmask,
 971                           flags | MPOL_MF_INVERT, &pagelist);
 972
 973         err = PTR_ERR(vma);
 974         if (!IS_ERR(vma)) {
 975                 int nr_failed = 0;
 976
 977                 err = mbind_range(vma, start, end, new);
 978
 979                 if (!list_empty(&pagelist))
 980                         nr_failed = migrate_pages(&pagelist, new_vma_page,
 981                                                 (unsigned long)vma);
 982
 983                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 984                         err = -EIO;
 985         }
 986
 987         up_write(&mm->mmap_sem);
 988         mpol_put(new);
 989         return err;
 990 }
 991
 992 /*
 993  * User space interface with variable sized bitmaps for nodelists.
 994  */
 995
 996 /* Copy a node mask from user space. */
 997 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 998                      unsigned long maxnode)
 999 {
1000         unsigned long k;
1001         unsigned long nlongs;
1002         unsigned long endmask;
1003
1004         --maxnode;
1005         nodes_clear(*nodes);
1006         if (maxnode == 0 || !nmask)
1007                 return 0;
1008         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1009                 return -EINVAL;
1010
1011         nlongs = BITS_TO_LONGS(maxnode);
1012         if ((maxnode % BITS_PER_LONG) == 0)
1013                 endmask = ~0UL;
1014         else
1015                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1016
1017         /* When the user specified more nodes than supported just check
1018            if the non supported part is all zero. */
1019         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1020                 if (nlongs > PAGE_SIZE/sizeof(long))
1021                         return -EINVAL;
1022                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1023                         unsigned long t;
1024                         if (get_user(t, nmask + k))
1025                                 return -EFAULT;
1026                         if (k == nlongs - 1) {
1027                                 if (t & endmask)
1028                                         return -EINVAL;
1029                         } else if (t)
1030                                 return -EINVAL;
1031                 }
1032                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1033                 endmask = ~0UL;
1034         }
1035
1036         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1037                 return -EFAULT;
1038         nodes_addr(*nodes)[nlongs-1] &= endmask;
1039         return 0;
1040 }
1041
1042 /* Copy a kernel node mask to user space */
1043 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1044                               nodemask_t *nodes)
1045 {
1046         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1047         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1048
1049         if (copy > nbytes) {
1050                 if (copy > PAGE_SIZE)
1051                         return -EINVAL;
1052                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1053                         return -EFAULT;
1054                 copy = nbytes;
1055         }
1056         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1057 }
1058
1059 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
1060                         unsigned long mode,
1061                         unsigned long __user *nmask, unsigned long maxnode,
1062                         unsigned flags)
1063 {
1064         nodemask_t nodes;
1065         int err;
1066         unsigned short mode_flags;
1067
1068         mode_flags = mode & MPOL_MODE_FLAGS;
1069         mode &= ~MPOL_MODE_FLAGS;
1070         if (mode >= MPOL_MAX)
1071                 return -EINVAL;
1072         if ((mode_flags & MPOL_F_STATIC_NODES) &&
1073             (mode_flags & MPOL_F_RELATIVE_NODES))
1074                 return -EINVAL;
1075         err = get_nodes(&nodes, nmask, maxnode);
1076         if (err)
1077                 return err;
1078         return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1079 }
1080
1081 /* Set the process memory policy */
1082 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
1083                 unsigned long maxnode)
1084 {
1085         int err;
1086         nodemask_t nodes;
1087         unsigned short flags;
1088
1089         flags = mode & MPOL_MODE_FLAGS;
1090         mode &= ~MPOL_MODE_FLAGS;
1091         if ((unsigned int)mode >= MPOL_MAX)
1092                 return -EINVAL;
1093         if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1094                 return -EINVAL;
1095         err = get_nodes(&nodes, nmask, maxnode);
1096         if (err)
1097                 return err;
1098         return do_set_mempolicy(mode, flags, &nodes);
1099 }
1100
1101 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
1102                 const unsigned long __user *old_nodes,
1103                 const unsigned long __user *new_nodes)
1104 {
1105         struct mm_struct *mm;
1106         struct task_struct *task;
1107         nodemask_t old;
1108         nodemask_t new;
1109         nodemask_t task_nodes;
1110         int err;
1111
1112         err = get_nodes(&old, old_nodes, maxnode);
1113         if (err)
1114                 return err;
1115
1116         err = get_nodes(&new, new_nodes, maxnode);
1117         if (err)
1118                 return err;
1119
1120         /* Find the mm_struct */
1121         read_lock(&tasklist_lock);
1122         task = pid ? find_task_by_vpid(pid) : current;
1123         if (!task) {
1124                 read_unlock(&tasklist_lock);
1125                 return -ESRCH;
1126         }
1127         mm = get_task_mm(task);
1128         read_unlock(&tasklist_lock);
1129
1130         if (!mm)
1131                 return -EINVAL;
1132
1133         /*
1134          * Check if this process has the right to modify the specified
1135          * process. The right exists if the process has administrative
1136          * capabilities, superuser privileges or the same
1137          * userid as the target process.
1138          */
1139         if ((current->euid != task->suid) && (current->euid != task->uid) &&
1140             (current->uid != task->suid) && (current->uid != task->uid) &&
1141             !capable(CAP_SYS_NICE)) {
1142                 err = -EPERM;
1143                 goto out;
1144         }
1145
1146         task_nodes = cpuset_mems_allowed(task);
1147         /* Is the user allowed to access the target nodes? */
1148         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
1149                 err = -EPERM;
1150                 goto out;
1151         }
1152
1153         if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
1154                 err = -EINVAL;
1155                 goto out;
1156         }
1157
1158         err = security_task_movememory(task);
1159         if (err)
1160                 goto out;
1161
1162         err = do_migrate_pages(mm, &old, &new,
1163                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1164 out:
1165         mmput(mm);
1166         return err;
1167 }
1168
1169
1170 /* Retrieve NUMA policy */
1171 asmlinkage long sys_get_mempolicy(int __user *policy,
1172                                 unsigned long __user *nmask,
1173                                 unsigned long maxnode,
1174                                 unsigned long addr, unsigned long flags)
1175 {
1176         int err;
1177         int uninitialized_var(pval);
1178         nodemask_t nodes;
1179
1180         if (nmask != NULL && maxnode < MAX_NUMNODES)
1181                 return -EINVAL;
1182
1183         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1184
1185         if (err)
1186                 return err;
1187
1188         if (policy && put_user(pval, policy))
1189                 return -EFAULT;
1190
1191         if (nmask)
1192                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1193
1194         return err;
1195 }
1196
1197 #ifdef CONFIG_COMPAT
1198
1199 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1200                                      compat_ulong_t __user *nmask,
1201                                      compat_ulong_t maxnode,
1202                                      compat_ulong_t addr, compat_ulong_t flags)
1203 {
1204         long err;
1205         unsigned long __user *nm = NULL;
1206         unsigned long nr_bits, alloc_size;
1207         DECLARE_BITMAP(bm, MAX_NUMNODES);
1208
1209         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1210         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1211
1212         if (nmask)
1213                 nm = compat_alloc_user_space(alloc_size);
1214
1215         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1216
1217         if (!err && nmask) {
1218                 err = copy_from_user(bm, nm, alloc_size);
1219                 /* ensure entire bitmap is zeroed */
1220                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1221                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1222         }
1223
1224         return err;
1225 }
1226
1227 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1228                                      compat_ulong_t maxnode)
1229 {
1230         long err = 0;
1231         unsigned long __user *nm = NULL;
1232         unsigned long nr_bits, alloc_size;
1233         DECLARE_BITMAP(bm, MAX_NUMNODES);
1234
1235         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1236         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1237
1238         if (nmask) {
1239                 err = compat_get_bitmap(bm, nmask, nr_bits);
1240                 nm = compat_alloc_user_space(alloc_size);
1241                 err |= copy_to_user(nm, bm, alloc_size);
1242         }
1243
1244         if (err)
1245                 return -EFAULT;
1246
1247         return sys_set_mempolicy(mode, nm, nr_bits+1);
1248 }
1249
1250 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1251                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1252                              compat_ulong_t maxnode, compat_ulong_t flags)
1253 {
1254         long err = 0;
1255         unsigned long __user *nm = NULL;
1256         unsigned long nr_bits, alloc_size;
1257         nodemask_t bm;
1258
1259         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1260         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1261
1262         if (nmask) {
1263                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1264                 nm = compat_alloc_user_space(alloc_size);
1265                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1266         }
1267
1268         if (err)
1269                 return -EFAULT;
1270
1271         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1272 }
1273
1274 #endif
1275
1276 /*
1277  * get_vma_policy(@task, @vma, @addr)
1278  * @task - task for fallback if vma policy == default
1279  * @vma   - virtual memory area whose policy is sought
1280  * @addr  - address in @vma for shared policy lookup
1281  *
1282  * Returns effective policy for a VMA at specified address.
1283  * Falls back to @task or system default policy, as necessary.
1284  * Current or other task's task mempolicy and non-shared vma policies
1285  * are protected by the task's mmap_sem, which must be held for read by
1286  * the caller.
1287  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1288  * count--added by the get_policy() vm_op, as appropriate--to protect against
1289  * freeing by another task.  It is the caller's responsibility to free the
1290  * extra reference for shared policies.
1291  */
1292 static struct mempolicy *get_vma_policy(struct task_struct *task,
1293                 struct vm_area_struct *vma, unsigned long addr)
1294 {
1295         struct mempolicy *pol = task->mempolicy;
1296
1297         if (vma) {
1298                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1299                         struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1300                                                                         addr);
1301                         if (vpol)
1302                                 pol = vpol;
1303                 } else if (vma->vm_policy)
1304                         pol = vma->vm_policy;
1305         }
1306         if (!pol)
1307                 pol = &default_policy;
1308         return pol;
1309 }
1310
1311 /*
1312  * Return a nodemask representing a mempolicy for filtering nodes for
1313  * page allocation
1314  */
1315 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1316 {
1317         /* Lower zones don't get a nodemask applied for MPOL_BIND */
1318         if (unlikely(policy->mode == MPOL_BIND) &&
1319                         gfp_zone(gfp) >= policy_zone &&
1320                         cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1321                 return &policy->v.nodes;
1322
1323         return NULL;
1324 }
1325
/*
 * Return a zonelist indicated by gfp for node representing a mempolicy.
 *
 * The node defaults to the local one (numa_node_id()) and is overridden
 * as the policy mode dictates.  Any mode other than MPOL_PREFERRED,
 * MPOL_BIND or MPOL_INTERLEAVE hits BUG().
 */
static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
{
        int nd = numa_node_id();

        switch (policy->mode) {
        case MPOL_PREFERRED:
                /* MPOL_F_LOCAL keeps the local node, else use the preferred one */
                if (!(policy->flags & MPOL_F_LOCAL))
                        nd = policy->v.preferred_node;
                break;
        case MPOL_BIND:
                /*
                 * Normally, MPOL_BIND allocations are node-local within the
                 * allowed nodemask.  However, if __GFP_THISNODE is set and the
                 * current node is not part of the mask, we use the zonelist
                 * for the first node in the mask instead.
                 */
                if (unlikely(gfp & __GFP_THISNODE) &&
                                unlikely(!node_isset(nd, policy->v.nodes)))
                        nd = first_node(policy->v.nodes);
                break;
        case MPOL_INTERLEAVE: /* should not happen */
                break;
        default:
                BUG();
        }
        return node_zonelist(nd, gfp);
}
1354
1355 /* Do dynamic interleaving for a process */
1356 static unsigned interleave_nodes(struct mempolicy *policy)
1357 {
1358         unsigned nid, next;
1359         struct task_struct *me = current;
1360
1361         nid = me->il_next;
1362         next = next_node(nid, policy->v.nodes);
1363         if (next >= MAX_NUMNODES)
1364                 next = first_node(policy->v.nodes);
1365         if (next < MAX_NUMNODES)
1366                 me->il_next = next;
1367         return nid;
1368 }
1369
1370 /*
1371  * Depending on the memory policy provide a node from which to allocate the
1372  * next slab entry.
1373  * @policy must be protected by freeing by the caller.  If @policy is
1374  * the current task's mempolicy, this protection is implicit, as only the
1375  * task can change it's policy.  The system default policy requires no
1376  * such protection.
1377  */
1378 unsigned slab_node(struct mempolicy *policy)
1379 {
1380         if (!policy || policy->flags & MPOL_F_LOCAL)
1381                 return numa_node_id();
1382
1383         switch (policy->mode) {
1384         case MPOL_PREFERRED:
1385                 /*
1386                  * handled MPOL_F_LOCAL above
1387                  */
1388                 return policy->v.preferred_node;
1389
1390         case MPOL_INTERLEAVE:
1391                 return interleave_nodes(policy);
1392
1393         case MPOL_BIND: {
1394                 /*
1395                  * Follow bind policy behavior and start allocation at the
1396                  * first node.
1397                  */
1398                 struct zonelist *zonelist;
1399                 struct zone *zone;
1400                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1401                 zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1402                 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1403                                                         &policy->v.nodes,
1404                                                         &zone);
1405                 return zone->node;
1406         }
1407
1408         default:
1409                 BUG();
1410         }
1411 }
1412
1413 /* Do static interleaving for a VMA with known offset. */
1414 static unsigned offset_il_node(struct mempolicy *pol,
1415                 struct vm_area_struct *vma, unsigned long off)
1416 {
1417         unsigned nnodes = nodes_weight(pol->v.nodes);
1418         unsigned target;
1419         int c;
1420         int nid = -1;
1421
1422         if (!nnodes)
1423                 return numa_node_id();
1424         target = (unsigned int)off % nnodes;
1425         c = 0;
1426         do {
1427                 nid = next_node(nid, pol->v.nodes);
1428                 c++;
1429         } while (c <= target);
1430         return nid;
1431 }
1432
1433 /* Determine a node number for interleave */
1434 static inline unsigned interleave_nid(struct mempolicy *pol,
1435                  struct vm_area_struct *vma, unsigned long addr, int shift)
1436 {
1437         if (vma) {
1438                 unsigned long off;
1439
1440                 /*
1441                  * for small pages, there is no difference between
1442                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1443                  * for huge pages, since vm_pgoff is in units of small
1444                  * pages, we need to shift off the always 0 bits to get
1445                  * a useful offset.
1446                  */
1447                 BUG_ON(shift < PAGE_SHIFT);
1448                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1449                 off += (addr - vma->vm_start) >> shift;
1450                 return offset_il_node(pol, vma, off);
1451         } else
1452                 return interleave_nodes(pol);
1453 }
1454
1455 #ifdef CONFIG_HUGETLBFS
1456 /*
1457  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1458  * @vma = virtual memory area whose policy is sought
1459  * @addr = address in @vma for shared policy lookup and interleave policy
1460  * @gfp_flags = for requested zone
1461  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1462  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1463  *
1464  * Returns a zonelist suitable for a huge page allocation and a pointer
1465  * to the struct mempolicy for conditional unref after allocation.
1466  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1467  * @nodemask for filtering the zonelist.
1468  */
1469 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1470                                 gfp_t gfp_flags, struct mempolicy **mpol,
1471                                 nodemask_t **nodemask)
1472 {
1473         struct zonelist *zl;
1474
1475         *mpol = get_vma_policy(current, vma, addr);
1476         *nodemask = NULL;       /* assume !MPOL_BIND */
1477
1478         if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1479                 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1480                                                 HPAGE_SHIFT), gfp_flags);
1481         } else {
1482                 zl = policy_zonelist(gfp_flags, *mpol);
1483                 if ((*mpol)->mode == MPOL_BIND)
1484                         *nodemask = &(*mpol)->v.nodes;
1485         }
1486         return zl;
1487 }
1488 #endif
1489
1490 /* Allocate a page in interleaved policy.
1491    Own path because it needs to do special accounting. */
1492 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1493                                         unsigned nid)
1494 {
1495         struct zonelist *zl;
1496         struct page *page;
1497
1498         zl = node_zonelist(nid, gfp);
1499         page = __alloc_pages(gfp, order, zl);
1500         if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1501                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1502         return page;
1503 }
1504
1505 /**
1506  *      alloc_page_vma  - Allocate a page for a VMA.
1507  *
1508  *      @gfp:
1509  *      %GFP_USER    user allocation.
1510  *      %GFP_KERNEL  kernel allocations,
1511  *      %GFP_HIGHMEM highmem/user allocations,
1512  *      %GFP_FS      allocation should not call back into a file system.
1513  *      %GFP_ATOMIC  don't sleep.
1514  *
1515  *      @vma:  Pointer to VMA or NULL if not available.
1516  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1517  *
1518  *      This function allocates a page from the kernel page pool and applies
1519  *      a NUMA policy associated with the VMA or the current process.
1520  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1521  *      mm_struct of the VMA to prevent it from going away. Should be used for
1522  *      all allocations for pages that will be mapped into
1523  *      user space. Returns NULL when no page can be allocated.
1524  *
1525  *      Should be called with the mm_sem of the vma hold.
1526  */
1527 struct page *
1528 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1529 {
1530         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1531         struct zonelist *zl;
1532
1533         cpuset_update_task_memory_state();
1534
1535         if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1536                 unsigned nid;
1537
1538                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1539                 mpol_cond_put(pol);
1540                 return alloc_page_interleave(gfp, 0, nid);
1541         }
1542         zl = policy_zonelist(gfp, pol);
1543         if (unlikely(mpol_needs_cond_ref(pol))) {
1544                 /*
1545                  * slow path: ref counted shared policy
1546                  */
1547                 struct page *page =  __alloc_pages_nodemask(gfp, 0,
1548                                                 zl, policy_nodemask(gfp, pol));
1549                 __mpol_put(pol);
1550                 return page;
1551         }
1552         /*
1553          * fast path:  default or task policy
1554          */
1555         return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
1556 }
1557
1558 /**
1559  *      alloc_pages_current - Allocate pages.
1560  *
1561  *      @gfp:
1562  *              %GFP_USER   user allocation,
1563  *              %GFP_KERNEL kernel allocation,
1564  *              %GFP_HIGHMEM highmem allocation,
1565  *              %GFP_FS     don't call back into a file system.
1566  *              %GFP_ATOMIC don't sleep.
1567  *      @order: Power of two of allocation size in pages. 0 is a single page.
1568  *
1569  *      Allocate a page from the kernel page pool.  When not in
1570  *      interrupt context and apply the current process NUMA policy.
1571  *      Returns NULL when no page can be allocated.
1572  *
1573  *      Don't call cpuset_update_task_memory_state() unless
1574  *      1) it's ok to take cpuset_sem (can WAIT), and
1575  *      2) allocating for current task (not interrupt).
1576  */
1577 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1578 {
1579         struct mempolicy *pol = current->mempolicy;
1580
1581         if ((gfp & __GFP_WAIT) && !in_interrupt())
1582                 cpuset_update_task_memory_state();
1583         if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1584                 pol = &default_policy;
1585
1586         /*
1587          * No reference counting needed for current->mempolicy
1588          * nor system default_policy
1589          */
1590         if (pol->mode == MPOL_INTERLEAVE)
1591                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1592         return __alloc_pages_nodemask(gfp, order,
1593                         policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
1594 }
1595 EXPORT_SYMBOL(alloc_pages_current);
1596
/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after its cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 */
1604
1605 /* Slow path of a mempolicy duplicate */
1606 struct mempolicy *__mpol_dup(struct mempolicy *old)
1607 {
1608         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1609
1610         if (!new)
1611                 return ERR_PTR(-ENOMEM);
1612         if (current_cpuset_is_being_rebound()) {
1613                 nodemask_t mems = cpuset_mems_allowed(current);
1614                 mpol_rebind_policy(old, &mems);
1615         }
1616         *new = *old;
1617         atomic_set(&new->refcnt, 1);
1618         return new;
1619 }
1620
1621 /*
1622  * If *frompol needs [has] an extra ref, copy *frompol to *tompol ,
1623  * eliminate the * MPOL_F_* flags that require conditional ref and
1624  * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
1625  * after return.  Use the returned value.
1626  *
1627  * Allows use of a mempolicy for, e.g., multiple allocations with a single
1628  * policy lookup, even if the policy needs/has extra ref on lookup.
1629  * shmem_readahead needs this.
1630  */
1631 struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1632                                                 struct mempolicy *frompol)
1633 {
1634         if (!mpol_needs_cond_ref(frompol))
1635                 return frompol;
1636
1637         *tompol = *frompol;
1638         tompol->flags &= ~MPOL_F_SHARED;        /* copy doesn't need unref */
1639         __mpol_put(frompol);
1640         return tompol;
1641 }
1642
1643 static int mpol_match_intent(const struct mempolicy *a,
1644                              const struct mempolicy *b)
1645 {
1646         if (a->flags != b->flags)
1647                 return 0;
1648         if (!mpol_store_user_nodemask(a))
1649                 return 1;
1650         return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1651 }
1652
1653 /* Slow path of a mempolicy comparison */
1654 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1655 {
1656         if (!a || !b)
1657                 return 0;
1658         if (a->mode != b->mode)
1659                 return 0;
1660         if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
1661                 return 0;
1662         switch (a->mode) {
1663         case MPOL_BIND:
1664                 /* Fall through */
1665         case MPOL_INTERLEAVE:
1666                 return nodes_equal(a->v.nodes, b->v.nodes);
1667         case MPOL_PREFERRED:
1668                 return a->v.preferred_node == b->v.preferred_node &&
1669                         a->flags == b->flags;
1670         default:
1671                 BUG();
1672                 return 0;
1673         }
1674 }
1675
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a red-black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */
1684
1685 /* lookup first element intersecting start-end */
1686 /* Caller holds sp->lock */
1687 static struct sp_node *
1688 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1689 {
1690         struct rb_node *n = sp->root.rb_node;
1691
1692         while (n) {
1693                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1694
1695                 if (start >= p->end)
1696                         n = n->rb_right;
1697                 else if (end <= p->start)
1698                         n = n->rb_left;
1699                 else
1700                         break;
1701         }
1702         if (!n)
1703                 return NULL;
1704         for (;;) {
1705                 struct sp_node *w = NULL;
1706                 struct rb_node *prev = rb_prev(n);
1707                 if (!prev)
1708                         break;
1709                 w = rb_entry(prev, struct sp_node, nd);
1710                 if (w->end <= start)
1711                         break;
1712                 n = prev;
1713         }
1714         return rb_entry(n, struct sp_node, nd);
1715 }
1716
1717 /* Insert a new shared policy into the list. */
1718 /* Caller holds sp->lock */
1719 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1720 {
1721         struct rb_node **p = &sp->root.rb_node;
1722         struct rb_node *parent = NULL;
1723         struct sp_node *nd;
1724
1725         while (*p) {
1726                 parent = *p;
1727                 nd = rb_entry(parent, struct sp_node, nd);
1728                 if (new->start < nd->start)
1729                         p = &(*p)->rb_left;
1730                 else if (new->end > nd->end)
1731                         p = &(*p)->rb_right;
1732                 else
1733                         BUG();
1734         }
1735         rb_link_node(&new->nd, parent, p);
1736         rb_insert_color(&new->nd, &sp->root);
1737         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1738                  new->policy ? new->policy->mode : 0);
1739 }
1740
1741 /* Find shared policy intersecting idx */
1742 struct mempolicy *
1743 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1744 {
1745         struct mempolicy *pol = NULL;
1746         struct sp_node *sn;
1747
1748         if (!sp->root.rb_node)
1749                 return NULL;
1750         spin_lock(&sp->lock);
1751         sn = sp_lookup(sp, idx, idx+1);
1752         if (sn) {
1753                 mpol_get(sn->policy);
1754                 pol = sn->policy;
1755         }
1756         spin_unlock(&sp->lock);
1757         return pol;
1758 }
1759
1760 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1761 {
1762         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
1763         rb_erase(&n->nd, &sp->root);
1764         mpol_put(n->policy);
1765         kmem_cache_free(sn_cache, n);
1766 }
1767
1768 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1769                                 struct mempolicy *pol)
1770 {
1771         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1772
1773         if (!n)
1774                 return NULL;
1775         n->start = start;
1776         n->end = end;
1777         mpol_get(pol);
1778         pol->flags |= MPOL_F_SHARED;    /* for unref */
1779         n->policy = pol;
1780         return n;
1781 }
1782
1783 /* Replace a policy range. */
1784 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1785                                  unsigned long end, struct sp_node *new)
1786 {
1787         struct sp_node *n, *new2 = NULL;
1788
1789 restart:
1790         spin_lock(&sp->lock);
1791         n = sp_lookup(sp, start, end);
1792         /* Take care of old policies in the same range. */
1793         while (n && n->start < end) {
1794                 struct rb_node *next = rb_next(&n->nd);
1795                 if (n->start >= start) {
1796                         if (n->end <= end)
1797                                 sp_delete(sp, n);
1798                         else
1799                                 n->start = end;
1800                 } else {
1801                         /* Old policy spanning whole new range. */
1802                         if (n->end > end) {
1803                                 if (!new2) {
1804                                         spin_unlock(&sp->lock);
1805                                         new2 = sp_alloc(end, n->end, n->policy);
1806                                         if (!new2)
1807                                                 return -ENOMEM;
1808                                         goto restart;
1809                                 }
1810                                 n->end = start;
1811                                 sp_insert(sp, new2);
1812                                 new2 = NULL;
1813                                 break;
1814                         } else
1815                                 n->end = start;
1816                 }
1817                 if (!next)
1818                         break;
1819                 n = rb_entry(next, struct sp_node, nd);
1820         }
1821         if (new)
1822                 sp_insert(sp, new);
1823         spin_unlock(&sp->lock);
1824         if (new2) {
1825                 mpol_put(new2->policy);
1826                 kmem_cache_free(sn_cache, new2);
1827         }
1828         return 0;
1829 }
1830
1831 /**
1832  * mpol_shared_policy_init - initialize shared policy for inode
1833  * @sp: pointer to inode shared policy
1834  * @mpol:  struct mempolicy to install
1835  *
1836  * Install non-NULL @mpol in inode's shared policy rb-tree.
1837  * On entry, the current task has a reference on a non-NULL @mpol.
1838  * This must be released on exit.
1839  */
1840 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1841 {
1842         sp->root = RB_ROOT;             /* empty tree == default mempolicy */
1843         spin_lock_init(&sp->lock);
1844
1845         if (mpol) {
1846                 struct vm_area_struct pvma;
1847                 struct mempolicy *new;
1848
1849                 /* contextualize the tmpfs mount point mempolicy */
1850                 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1851                 mpol_put(mpol); /* drop our ref on sb mpol */
1852                 if (IS_ERR(new))
1853                         return;         /* no valid nodemask intersection */
1854
1855                 /* Create pseudo-vma that contains just the policy */
1856                 memset(&pvma, 0, sizeof(struct vm_area_struct));
1857                 pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
1858                 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
1859                 mpol_put(new);                  /* drop initial ref */
1860         }
1861 }
1862
1863 int mpol_set_shared_policy(struct shared_policy *info,
1864                         struct vm_area_struct *vma, struct mempolicy *npol)
1865 {
1866         int err;
1867         struct sp_node *new = NULL;
1868         unsigned long sz = vma_pages(vma);
1869
1870         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1871                  vma->vm_pgoff,
1872                  sz, npol ? npol->mode : -1,
1873                  npol ? npol->flags : -1,
1874                  npol ? nodes_addr(npol->v.nodes)[0] : -1);
1875
1876         if (npol) {
1877                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1878                 if (!new)
1879                         return -ENOMEM;
1880         }
1881         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1882         if (err && new)
1883                 kmem_cache_free(sn_cache, new);
1884         return err;
1885 }
1886
1887 /* Free a backing policy store on inode delete. */
1888 void mpol_free_shared_policy(struct shared_policy *p)
1889 {
1890         struct sp_node *n;
1891         struct rb_node *next;
1892
1893         if (!p->root.rb_node)
1894                 return;
1895         spin_lock(&p->lock);
1896         next = rb_first(&p->root);
1897         while (next) {
1898                 n = rb_entry(next, struct sp_node, nd);
1899                 next = rb_next(&n->nd);
1900                 rb_erase(&n->nd, &p->root);
1901                 mpol_put(n->policy);
1902                 kmem_cache_free(sn_cache, n);
1903         }
1904         spin_unlock(&p->lock);
1905 }
1906
1907 /* assumes fs == KERNEL_DS */
1908 void __init numa_policy_init(void)
1909 {
1910         nodemask_t interleave_nodes;
1911         unsigned long largest = 0;
1912         int nid, prefer = 0;
1913
1914         policy_cache = kmem_cache_create("numa_policy",
1915                                          sizeof(struct mempolicy),
1916                                          0, SLAB_PANIC, NULL);
1917
1918         sn_cache = kmem_cache_create("shared_policy_node",
1919                                      sizeof(struct sp_node),
1920                                      0, SLAB_PANIC, NULL);
1921
1922         /*
1923          * Set interleaving policy for system init. Interleaving is only
1924          * enabled across suitably sized nodes (default is >= 16MB), or
1925          * fall back to the largest node if they're all smaller.
1926          */
1927         nodes_clear(interleave_nodes);
1928         for_each_node_state(nid, N_HIGH_MEMORY) {
1929                 unsigned long total_pages = node_present_pages(nid);
1930
1931                 /* Preserve the largest node */
1932                 if (largest < total_pages) {
1933                         largest = total_pages;
1934                         prefer = nid;
1935                 }
1936
1937                 /* Interleave this node? */
1938                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1939                         node_set(nid, interleave_nodes);
1940         }
1941
1942         /* All too small, use the largest */
1943         if (unlikely(nodes_empty(interleave_nodes)))
1944                 node_set(prefer, interleave_nodes);
1945
1946         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
1947                 printk("numa_policy_init: interleaving failed\n");
1948 }
1949
1950 /* Reset policy of current process to default */
1951 void numa_default_policy(void)
1952 {
1953         do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1954 }
1955
1956 /*
1957  * Parse and format mempolicy from/to strings
1958  */
1959
/*
 * "local" is a pseudo-policy: MPOL_PREFERRED with the MPOL_F_LOCAL flag.
 * It is used only by mpol_parse_str() and mpol_to_str().
 */
#define MPOL_LOCAL (MPOL_INTERLEAVE + 1)

/*
 * Mode names, indexed by mode number; mpol_parse_str() and mpol_to_str()
 * translate between a mode and its name through this table.
 */
static const char * const policy_types[] = {
        "default",
        "prefer",
        "bind",
        "interleave",
        "local",
};
1967
1968
1969 #ifdef CONFIG_TMPFS
1970 /**
1971  * mpol_parse_str - parse string to mempolicy
1972  * @str:  string containing mempolicy to parse
1973  * @mpol:  pointer to struct mempolicy pointer, returned on success.
1974  * @no_context:  flag whether to "contextualize" the mempolicy
1975  *
1976  * Format of input:
1977  *      <mode>[=<flags>][:<nodelist>]
1978  *
1979  * if @no_context is true, save the input nodemask in w.user_nodemask in
1980  * the returned mempolicy.  This will be used to "clone" the mempolicy in
1981  * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
1982  * mount option.  Note that if 'static' or 'relative' mode flags were
1983  * specified, the input nodemask will already have been saved.  Saving
1984  * it again is redundant, but safe.
1985  *
1986  * On success, returns 0, else 1
1987  */
1988 int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
1989 {
1990         struct mempolicy *new = NULL;
1991         unsigned short uninitialized_var(mode);
1992         unsigned short uninitialized_var(mode_flags);
1993         nodemask_t nodes;
1994         char *nodelist = strchr(str, ':');
1995         char *flags = strchr(str, '=');
1996         int i;
1997         int err = 1;
1998
1999         if (nodelist) {
2000                 /* NUL-terminate mode or flags string */
2001                 *nodelist++ = '\0';
2002                 if (nodelist_parse(nodelist, nodes))
2003                         goto out;
2004                 if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
2005                         goto out;
2006         } else
2007                 nodes_clear(nodes);
2008
2009         if (flags)
2010                 *flags++ = '\0';        /* terminate mode string */
2011
2012         for (i = 0; i <= MPOL_LOCAL; i++) {
2013                 if (!strcmp(str, policy_types[i])) {
2014                         mode = i;
2015                         break;
2016                 }
2017         }
2018         if (i > MPOL_LOCAL)
2019                 goto out;
2020
2021         switch (mode) {
2022         case MPOL_PREFERRED:
2023                 /*
2024                  * Insist on a nodelist of one node only
2025                  */
2026                 if (nodelist) {
2027                         char *rest = nodelist;
2028                         while (isdigit(*rest))
2029                                 rest++;
2030                         if (!*rest)
2031                                 err = 0;
2032                 }
2033                 break;
2034         case MPOL_INTERLEAVE:
2035                 /*
2036                  * Default to online nodes with memory if no nodelist
2037                  */
2038                 if (!nodelist)
2039                         nodes = node_states[N_HIGH_MEMORY];
2040                 err = 0;
2041                 break;
2042         case MPOL_LOCAL:
2043                 /*
2044                  * Don't allow a nodelist;  mpol_new() checks flags
2045                  */
2046                 if (nodelist)
2047                         goto out;
2048                 mode = MPOL_PREFERRED;
2049                 break;
2050
2051         /*
2052          * case MPOL_BIND:    mpol_new() enforces non-empty nodemask.
2053          * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags.
2054          */
2055         }
2056
2057         mode_flags = 0;
2058         if (flags) {
2059                 /*
2060                  * Currently, we only support two mutually exclusive
2061                  * mode flags.
2062                  */
2063                 if (!strcmp(flags, "static"))
2064                         mode_flags |= MPOL_F_STATIC_NODES;
2065                 else if (!strcmp(flags, "relative"))
2066                         mode_flags |= MPOL_F_RELATIVE_NODES;
2067                 else
2068                         err = 1;
2069         }
2070
2071         new = mpol_new(mode, mode_flags, &nodes);
2072         if (IS_ERR(new))
2073                 err = 1;
2074         else if (no_context)
2075                 new->w.user_nodemask = nodes;   /* save for contextualization */
2076
2077 out:
2078         /* Restore string for error message */
2079         if (nodelist)
2080                 *--nodelist = ':';
2081         if (flags)
2082                 *--flags = '=';
2083         if (!err)
2084                 *mpol = new;
2085         return err;
2086 }
2087 #endif /* CONFIG_TMPFS */
2088
2089 /**
2090  * mpol_to_str - format a mempolicy structure for printing
2091  * @buffer:  to contain formatted mempolicy string
2092  * @maxlen:  length of @buffer
2093  * @pol:  pointer to mempolicy to be formatted
2094  * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
2095  *
2096  * Convert a mempolicy into a string.
2097  * Returns the number of characters in buffer (if positive)
2098  * or an error (negative)
2099  */
2100 int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2101 {
2102         char *p = buffer;
2103         int l;
2104         nodemask_t nodes;
2105         unsigned short mode;
2106         unsigned short flags = pol ? pol->flags : 0;
2107
2108         /*
2109          * Sanity check:  room for longest mode, flag and some nodes
2110          */
2111         VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2112
2113         if (!pol || pol == &default_policy)
2114                 mode = MPOL_DEFAULT;
2115         else
2116                 mode = pol->mode;
2117
2118         switch (mode) {
2119         case MPOL_DEFAULT:
2120                 nodes_clear(nodes);
2121                 break;
2122
2123         case MPOL_PREFERRED:
2124                 nodes_clear(nodes);
2125                 if (flags & MPOL_F_LOCAL)
2126                         mode = MPOL_LOCAL;      /* pseudo-policy */
2127                 else
2128                         node_set(pol->v.preferred_node, nodes);
2129                 break;
2130
2131         case MPOL_BIND:
2132                 /* Fall through */
2133         case MPOL_INTERLEAVE:
2134                 if (no_context)
2135                         nodes = pol->w.user_nodemask;
2136                 else
2137                         nodes = pol->v.nodes;
2138                 break;
2139
2140         default:
2141                 BUG();
2142         }
2143
2144         l = strlen(policy_types[mode]);
2145         if (buffer + maxlen < p + l + 1)
2146                 return -ENOSPC;
2147
2148         strcpy(p, policy_types[mode]);
2149         p += l;
2150
2151         if (flags & MPOL_MODE_FLAGS) {
2152                 if (buffer + maxlen < p + 2)
2153                         return -ENOSPC;
2154                 *p++ = '=';
2155
2156                 /*
2157                  * Currently, the only defined flags are mutually exclusive
2158                  */
2159                 if (flags & MPOL_F_STATIC_NODES)
2160                         p += snprintf(p, buffer + maxlen - p, "static");
2161                 else if (flags & MPOL_F_RELATIVE_NODES)
2162                         p += snprintf(p, buffer + maxlen - p, "relative");
2163         }
2164
2165         if (!nodes_empty(nodes)) {
2166                 if (buffer + maxlen < p + 2)
2167                         return -ENOSPC;
2168                 *p++ = ':';
2169                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2170         }
2171         return p - buffer;
2172 }
2173
/*
 * Per-vma page statistics accumulated by gather_stats() and printed by
 * show_numa_map().
 */
struct numa_maps {
        unsigned long pages;            /* pages counted in total */
        unsigned long anon;             /* pages for which PageAnon() is set */
        unsigned long active;           /* pages for which PageActive() is set */
        unsigned long writeback;        /* pages for which PageWriteback() is set */
        unsigned long mapcount_max;     /* largest page_mapcount() seen */
        unsigned long dirty;            /* pages with a dirty pte or PageDirty() */
        unsigned long swapcache;        /* pages for which PageSwapCache() is set */
        unsigned long node[MAX_NUMNODES];       /* pages counted per node id */
};
2184
2185 static void gather_stats(struct page *page, void *private, int pte_dirty)
2186 {
2187         struct numa_maps *md = private;
2188         int count = page_mapcount(page);
2189
2190         md->pages++;
2191         if (pte_dirty || PageDirty(page))
2192                 md->dirty++;
2193
2194         if (PageSwapCache(page))
2195                 md->swapcache++;
2196
2197         if (PageActive(page))
2198                 md->active++;
2199
2200         if (PageWriteback(page))
2201                 md->writeback++;
2202
2203         if (PageAnon(page))
2204                 md->anon++;
2205
2206         if (count > md->mapcount_max)
2207                 md->mapcount_max = count;
2208
2209         md->node[page_to_nid(page)]++;
2210 }
2211
#ifdef CONFIG_HUGETLB_PAGE
/*
 * Walk [start, end) in HPAGE_SIZE steps and feed every huge page that is
 * actually mapped to gather_stats().  Addresses without a huge pte, with an
 * empty pte or without a page behind the pte are skipped.
 */
static void check_huge_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct numa_maps *md)
{
        unsigned long addr;

        for (addr = start; addr < end; addr += HPAGE_SIZE) {
                pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
                pte_t entry;
                struct page *page;

                if (ptep == NULL)
                        continue;

                entry = *ptep;
                if (!pte_none(entry)) {
                        page = pte_page(entry);
                        if (page != NULL)
                                gather_stats(page, md, pte_dirty(*ptep));
                }
        }
}
#else
/* Without huge page support there is nothing to gather. */
static inline void check_huge_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct numa_maps *md)
{
}
#endif
2245
2246 /*
2247  * Display pages allocated per node and memory policy via /proc.
2248  */
2249 int show_numa_map(struct seq_file *m, void *v)
2250 {
2251         struct proc_maps_private *priv = m->private;
2252         struct vm_area_struct *vma = v;
2253         struct numa_maps *md;
2254         struct file *file = vma->vm_file;
2255         struct mm_struct *mm = vma->vm_mm;
2256         struct mempolicy *pol;
2257         int n;
2258         char buffer[50];
2259
2260         if (!mm)
2261                 return 0;
2262
2263         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2264         if (!md)
2265                 return 0;
2266
2267         pol = get_vma_policy(priv->task, vma, vma->vm_start);
2268         mpol_to_str(buffer, sizeof(buffer), pol, 0);
2269         mpol_cond_put(pol);
2270
2271         seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2272
2273         if (file) {
2274                 seq_printf(m, " file=");
2275                 seq_path(m, &file->f_path, "\n\t= ");
2276         } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2277                 seq_printf(m, " heap");
2278         } else if (vma->vm_start <= mm->start_stack &&
2279                         vma->vm_end >= mm->start_stack) {
2280                 seq_printf(m, " stack");
2281         }
2282
2283         if (is_vm_hugetlb_page(vma)) {
2284                 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2285                 seq_printf(m, " huge");
2286         } else {
2287                 check_pgd_range(vma, vma->vm_start, vma->vm_end,
2288                         &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2289         }
2290
2291         if (!md->pages)
2292                 goto out;
2293
2294         if (md->anon)
2295                 seq_printf(m," anon=%lu",md->anon);
2296
2297         if (md->dirty)
2298                 seq_printf(m," dirty=%lu",md->dirty);
2299
2300         if (md->pages != md->anon && md->pages != md->dirty)
2301                 seq_printf(m, " mapped=%lu", md->pages);
2302
2303         if (md->mapcount_max > 1)
2304                 seq_printf(m, " mapmax=%lu", md->mapcount_max);
2305
2306         if (md->swapcache)
2307                 seq_printf(m," swapcache=%lu", md->swapcache);
2308
2309         if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2310                 seq_printf(m," active=%lu", md->active);
2311
2312         if (md->writeback)
2313                 seq_printf(m," writeback=%lu", md->writeback);
2314
2315         for_each_node_state(n, N_HIGH_MEMORY)
2316                 if (md->node[n])
2317                         seq_printf(m, " N%d=%lu", n, md->node[n]);
2318 out:
2319         seq_putc(m, '\n');
2320         kfree(md);
2321
2322         if (m->count < m->size)
2323                 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2324         return 0;
2325 }