mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
   19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
   49  * on systems with highmem, kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66 */
  67
  68 #include <linux/mempolicy.h>
  69 #include <linux/mm.h>
  70 #include <linux/highmem.h>
  71 #include <linux/hugetlb.h>
  72 #include <linux/kernel.h>
  73 #include <linux/sched.h>
  74 #include <linux/nodemask.h>
  75 #include <linux/cpuset.h>
  76 #include <linux/gfp.h>
  77 #include <linux/slab.h>
  78 #include <linux/string.h>
  79 #include <linux/module.h>
  80 #include <linux/nsproxy.h>
  81 #include <linux/interrupt.h>
  82 #include <linux/init.h>
  83 #include <linux/compat.h>
  84 #include <linux/swap.h>
  85 #include <linux/seq_file.h>
  86 #include <linux/proc_fs.h>
  87 #include <linux/migrate.h>
  88 #include <linux/rmap.h>
  89 #include <linux/security.h>
  90 #include <linux/syscalls.h>
  91 #include <linux/ctype.h>
  92
  93 #include <asm/tlbflush.h>
  94 #include <asm/uaccess.h>
  95
  96 /* Internal flags */
  97 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
  98 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
  99 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
 100
 101 static struct kmem_cache *policy_cache;
 102 static struct kmem_cache *sn_cache;
 103
 104 /* Highest zone. An specific allocation for a zone below that is not
 105    policied. */
 106 enum zone_type policy_zone = 0;
 107
 108 /*
 109  * run-time system-wide default policy => local allocation
 110  */
 111 struct mempolicy default_policy = {
 112         .refcnt = ATOMIC_INIT(1), /* never free it */
 113         .mode = MPOL_PREFERRED,
 114         .flags = MPOL_F_LOCAL,
 115 };
 116
 117 static const struct mempolicy_operations {
 118         int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 119         void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
 120 } mpol_ops[MPOL_MAX];
 121
 122 /* Check that the nodemask contains at least one populated zone */
 123 static int is_valid_nodemask(const nodemask_t *nodemask)
 124 {
 125         int nd, k;
 126
 127         /* Check that there is something useful in this mask */
 128         k = policy_zone;
 129
 130         for_each_node_mask(nd, *nodemask) {
 131                 struct zone *z;
 132
 133                 for (k = 0; k <= policy_zone; k++) {
 134                         z = &NODE_DATA(nd)->node_zones[k];
 135                         if (z->present_pages > 0)
 136                                 return 1;
 137                 }
 138         }
 139
 140         return 0;
 141 }
 142
 143 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 144 {
 145         return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
 146 }
 147
 148 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 149                                    const nodemask_t *rel)
 150 {
 151         nodemask_t tmp;
 152         nodes_fold(tmp, *orig, nodes_weight(*rel));
 153         nodes_onto(*ret, tmp, *rel);
 154 }
 155
 156 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 157 {
 158         if (nodes_empty(*nodes))
 159                 return -EINVAL;
 160         pol->v.nodes = *nodes;
 161         return 0;
 162 }
 163
 164 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 165 {
 166         if (!nodes)
 167                 pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 168         else if (nodes_empty(*nodes))
 169                 return -EINVAL;                 /*  no allowed nodes */
 170         else
 171                 pol->v.preferred_node = first_node(*nodes);
 172         return 0;
 173 }
 174
 175 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 176 {
 177         if (!is_valid_nodemask(nodes))
 178                 return -EINVAL;
 179         pol->v.nodes = *nodes;
 180         return 0;
 181 }
 182
 183 /* Create a new policy */
 184 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 185                                   nodemask_t *nodes)
 186 {
 187         struct mempolicy *policy;
 188         nodemask_t cpuset_context_nmask;
 189         int ret;
 190
 191         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 192                  mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
 193
 194         if (mode == MPOL_DEFAULT) {
 195                 if (nodes && !nodes_empty(*nodes))
 196                         return ERR_PTR(-EINVAL);
 197                 return NULL;    /* simply delete any existing policy */
 198         }
 199         VM_BUG_ON(!nodes);
 200
 201         /*
 202          * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 203          * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 204          * All other modes require a valid pointer to a non-empty nodemask.
 205          */
 206         if (mode == MPOL_PREFERRED) {
 207                 if (nodes_empty(*nodes)) {
 208                         if (((flags & MPOL_F_STATIC_NODES) ||
 209                              (flags & MPOL_F_RELATIVE_NODES)))
 210                                 return ERR_PTR(-EINVAL);
 211                         nodes = NULL;   /* flag local alloc */
 212                 }
 213         } else if (nodes_empty(*nodes))
 214                 return ERR_PTR(-EINVAL);
 215         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 216         if (!policy)
 217                 return ERR_PTR(-ENOMEM);
 218         atomic_set(&policy->refcnt, 1);
 219         policy->mode = mode;
 220         policy->flags = flags;
 221
 222         if (nodes) {
 223                 /*
 224                  * cpuset related setup doesn't apply to local allocation
 225                  */
 226                 cpuset_update_task_memory_state();
 227                 if (flags & MPOL_F_RELATIVE_NODES)
 228                         mpol_relative_nodemask(&cpuset_context_nmask, nodes,
 229                                                &cpuset_current_mems_allowed);
 230                 else
 231                         nodes_and(cpuset_context_nmask, *nodes,
 232                                   cpuset_current_mems_allowed);
 233                 if (mpol_store_user_nodemask(policy))
 234                         policy->w.user_nodemask = *nodes;
 235                 else
 236                         policy->w.cpuset_mems_allowed =
 237                                                 cpuset_mems_allowed(current);
 238         }
 239
 240         ret = mpol_ops[mode].create(policy,
 241                                 nodes ? &cpuset_context_nmask : NULL);
 242         if (ret < 0) {
 243                 kmem_cache_free(policy_cache, policy);
 244                 return ERR_PTR(ret);
 245         }
 246         return policy;
 247 }
 248
 249 /* Slow path of a mpol destructor. */
 250 void __mpol_put(struct mempolicy *p)
 251 {
 252         if (!atomic_dec_and_test(&p->refcnt))
 253                 return;
 254         kmem_cache_free(policy_cache, p);
 255 }
 256
 257 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 258 {
 259 }
 260
 261 static void mpol_rebind_nodemask(struct mempolicy *pol,
 262                                  const nodemask_t *nodes)
 263 {
 264         nodemask_t tmp;
 265
 266         if (pol->flags & MPOL_F_STATIC_NODES)
 267                 nodes_and(tmp, pol->w.user_nodemask, *nodes);
 268         else if (pol->flags & MPOL_F_RELATIVE_NODES)
 269                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 270         else {
 271                 nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
 272                             *nodes);
 273                 pol->w.cpuset_mems_allowed = *nodes;
 274         }
 275
 276         pol->v.nodes = tmp;
 277         if (!node_isset(current->il_next, tmp)) {
 278                 current->il_next = next_node(current->il_next, tmp);
 279                 if (current->il_next >= MAX_NUMNODES)
 280                         current->il_next = first_node(tmp);
 281                 if (current->il_next >= MAX_NUMNODES)
 282                         current->il_next = numa_node_id();
 283         }
 284 }
 285
 286 static void mpol_rebind_preferred(struct mempolicy *pol,
 287                                   const nodemask_t *nodes)
 288 {
 289         nodemask_t tmp;
 290
 291         if (pol->flags & MPOL_F_STATIC_NODES) {
 292                 int node = first_node(pol->w.user_nodemask);
 293
 294                 if (node_isset(node, *nodes)) {
 295                         pol->v.preferred_node = node;
 296                         pol->flags &= ~MPOL_F_LOCAL;
 297                 } else
 298                         pol->flags |= MPOL_F_LOCAL;
 299         } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 300                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 301                 pol->v.preferred_node = first_node(tmp);
 302         } else if (!(pol->flags & MPOL_F_LOCAL)) {
 303                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
 304                                                    pol->w.cpuset_mems_allowed,
 305                                                    *nodes);
 306                 pol->w.cpuset_mems_allowed = *nodes;
 307         }
 308 }
 309
 310 /* Migrate a policy to a different set of nodes */
 311 static void mpol_rebind_policy(struct mempolicy *pol,
 312                                const nodemask_t *newmask)
 313 {
 314         if (!pol)
 315                 return;
 316         if (!mpol_store_user_nodemask(pol) &&
 317             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 318                 return;
 319         mpol_ops[pol->mode].rebind(pol, newmask);
 320 }
 321
 322 /*
 323  * Wrapper for mpol_rebind_policy() that just requires task
 324  * pointer, and updates task mempolicy.
 325  */
 326
 327 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 328 {
 329         mpol_rebind_policy(tsk->mempolicy, new);
 330 }
 331
 332 /*
 333  * Rebind each vma in mm to new nodemask.
 334  *
 335  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 336  */
 337
 338 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 339 {
 340         struct vm_area_struct *vma;
 341
 342         down_write(&mm->mmap_sem);
 343         for (vma = mm->mmap; vma; vma = vma->vm_next)
 344                 mpol_rebind_policy(vma->vm_policy, new);
 345         up_write(&mm->mmap_sem);
 346 }
 347
 348 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 349         [MPOL_DEFAULT] = {
 350                 .rebind = mpol_rebind_default,
 351         },
 352         [MPOL_INTERLEAVE] = {
 353                 .create = mpol_new_interleave,
 354                 .rebind = mpol_rebind_nodemask,
 355         },
 356         [MPOL_PREFERRED] = {
 357                 .create = mpol_new_preferred,
 358                 .rebind = mpol_rebind_preferred,
 359         },
 360         [MPOL_BIND] = {
 361                 .create = mpol_new_bind,
 362                 .rebind = mpol_rebind_nodemask,
 363         },
 364 };
 365
 366 static void gather_stats(struct page *, void *, int pte_dirty);
 367 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 368                                 unsigned long flags);
 369
 370 /* Scan through pages checking if pages follow certain conditions. */
 371 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 372                 unsigned long addr, unsigned long end,
 373                 const nodemask_t *nodes, unsigned long flags,
 374                 void *private)
 375 {
 376         pte_t *orig_pte;
 377         pte_t *pte;
 378         spinlock_t *ptl;
 379
 380         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 381         do {
 382                 struct page *page;
 383                 int nid;
 384
 385                 if (!pte_present(*pte))
 386                         continue;
 387                 page = vm_normal_page(vma, addr, *pte);
 388                 if (!page)
 389                         continue;
 390                 /*
 391                  * The check for PageReserved here is important to avoid
 392                  * handling zero pages and other pages that may have been
 393                  * marked special by the system.
 394                  *
 395                  * If the PageReserved would not be checked here then f.e.
 396                  * the location of the zero page could have an influence
 397                  * on MPOL_MF_STRICT, zero pages would be counted for
 398                  * the per node stats, and there would be useless attempts
 399                  * to put zero pages on the migration list.
 400                  */
 401                 if (PageReserved(page))
 402                         continue;
 403                 nid = page_to_nid(page);
 404                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 405                         continue;
 406
 407                 if (flags & MPOL_MF_STATS)
 408                         gather_stats(page, private, pte_dirty(*pte));
 409                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 410                         migrate_page_add(page, private, flags);
 411                 else
 412                         break;
 413         } while (pte++, addr += PAGE_SIZE, addr != end);
 414         pte_unmap_unlock(orig_pte, ptl);
 415         return addr != end;
 416 }
 417
 418 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 419                 unsigned long addr, unsigned long end,
 420                 const nodemask_t *nodes, unsigned long flags,
 421                 void *private)
 422 {
 423         pmd_t *pmd;
 424         unsigned long next;
 425
 426         pmd = pmd_offset(pud, addr);
 427         do {
 428                 next = pmd_addr_end(addr, end);
 429                 if (pmd_none_or_clear_bad(pmd))
 430                         continue;
 431                 if (check_pte_range(vma, pmd, addr, next, nodes,
 432                                     flags, private))
 433                         return -EIO;
 434         } while (pmd++, addr = next, addr != end);
 435         return 0;
 436 }
 437
 438 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 439                 unsigned long addr, unsigned long end,
 440                 const nodemask_t *nodes, unsigned long flags,
 441                 void *private)
 442 {
 443         pud_t *pud;
 444         unsigned long next;
 445
 446         pud = pud_offset(pgd, addr);
 447         do {
 448                 next = pud_addr_end(addr, end);
 449                 if (pud_none_or_clear_bad(pud))
 450                         continue;
 451                 if (check_pmd_range(vma, pud, addr, next, nodes,
 452                                     flags, private))
 453                         return -EIO;
 454         } while (pud++, addr = next, addr != end);
 455         return 0;
 456 }
 457
 458 static inline int check_pgd_range(struct vm_area_struct *vma,
 459                 unsigned long addr, unsigned long end,
 460                 const nodemask_t *nodes, unsigned long flags,
 461                 void *private)
 462 {
 463         pgd_t *pgd;
 464         unsigned long next;
 465
 466         pgd = pgd_offset(vma->vm_mm, addr);
 467         do {
 468                 next = pgd_addr_end(addr, end);
 469                 if (pgd_none_or_clear_bad(pgd))
 470                         continue;
 471                 if (check_pud_range(vma, pgd, addr, next, nodes,
 472                                     flags, private))
 473                         return -EIO;
 474         } while (pgd++, addr = next, addr != end);
 475         return 0;
 476 }
 477
 478 /*
 479  * Check if all pages in a range are on a set of nodes.
 480  * If pagelist != NULL then isolate pages from the LRU and
 481  * put them on the pagelist.
 482  */
 483 static struct vm_area_struct *
 484 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 485                 const nodemask_t *nodes, unsigned long flags, void *private)
 486 {
 487         int err;
 488         struct vm_area_struct *first, *vma, *prev;
 489
 490         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 491
 492                 err = migrate_prep();
 493                 if (err)
 494                         return ERR_PTR(err);
 495         }
 496
 497         first = find_vma(mm, start);
 498         if (!first)
 499                 return ERR_PTR(-EFAULT);
 500         prev = NULL;
 501         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 502                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 503                         if (!vma->vm_next && vma->vm_end < end)
 504                                 return ERR_PTR(-EFAULT);
 505                         if (prev && prev->vm_end < vma->vm_start)
 506                                 return ERR_PTR(-EFAULT);
 507                 }
 508                 if (!is_vm_hugetlb_page(vma) &&
 509                     ((flags & MPOL_MF_STRICT) ||
 510                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 511                                 vma_migratable(vma)))) {
 512                         unsigned long endvma = vma->vm_end;
 513
 514                         if (endvma > end)
 515                                 endvma = end;
 516                         if (vma->vm_start > start)
 517                                 start = vma->vm_start;
 518                         err = check_pgd_range(vma, start, endvma, nodes,
 519                                                 flags, private);
 520                         if (err) {
 521                                 first = ERR_PTR(err);
 522                                 break;
 523                         }
 524                 }
 525                 prev = vma;
 526         }
 527         return first;
 528 }
 529
 530 /* Apply policy to a single VMA */
 531 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 532 {
 533         int err = 0;
 534         struct mempolicy *old = vma->vm_policy;
 535
 536         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 537                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 538                  vma->vm_ops, vma->vm_file,
 539                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 540
 541         if (vma->vm_ops && vma->vm_ops->set_policy)
 542                 err = vma->vm_ops->set_policy(vma, new);
 543         if (!err) {
 544                 mpol_get(new);
 545                 vma->vm_policy = new;
 546                 mpol_put(old);
 547         }
 548         return err;
 549 }
 550
 551 /* Step 2: apply policy to a range and do splits. */
 552 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 553                        unsigned long end, struct mempolicy *new)
 554 {
 555         struct vm_area_struct *next;
 556         int err;
 557
 558         err = 0;
 559         for (; vma && vma->vm_start < end; vma = next) {
 560                 next = vma->vm_next;
 561                 if (vma->vm_start < start)
 562                         err = split_vma(vma->vm_mm, vma, start, 1);
 563                 if (!err && vma->vm_end > end)
 564                         err = split_vma(vma->vm_mm, vma, end, 0);
 565                 if (!err)
 566                         err = policy_vma(vma, new);
 567                 if (err)
 568                         break;
 569         }
 570         return err;
 571 }
 572
 573 /*
 574  * Update task->flags PF_MEMPOLICY bit: set iff non-default
 575  * mempolicy.  Allows more rapid checking of this (combined perhaps
 576  * with other PF_* flag bits) on memory allocation hot code paths.
 577  *
 578  * If called from outside this file, the task 'p' should -only- be
 579  * a newly forked child not yet visible on the task list, because
 580  * manipulating the task flags of a visible task is not safe.
 581  *
 582  * The above limitation is why this routine has the funny name
 583  * mpol_fix_fork_child_flag().
 584  *
 585  * It is also safe to call this with a task pointer of current,
 586  * which the static wrapper mpol_set_task_struct_flag() does,
 587  * for use within this file.
 588  */
 589
 590 void mpol_fix_fork_child_flag(struct task_struct *p)
 591 {
 592         if (p->mempolicy)
 593                 p->flags |= PF_MEMPOLICY;
 594         else
 595                 p->flags &= ~PF_MEMPOLICY;
 596 }
 597
 598 static void mpol_set_task_struct_flag(void)
 599 {
 600         mpol_fix_fork_child_flag(current);
 601 }
 602
 603 /* Set the process memory policy */
 604 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 605                              nodemask_t *nodes)
 606 {
 607         struct mempolicy *new;
 608         struct mm_struct *mm = current->mm;
 609
 610         new = mpol_new(mode, flags, nodes);
 611         if (IS_ERR(new))
 612                 return PTR_ERR(new);
 613
 614         /*
 615          * prevent changing our mempolicy while show_numa_maps()
 616          * is using it.
 617          * Note:  do_set_mempolicy() can be called at init time
 618          * with no 'mm'.
 619          */
 620         if (mm)
 621                 down_write(&mm->mmap_sem);
 622         mpol_put(current->mempolicy);
 623         current->mempolicy = new;
 624         mpol_set_task_struct_flag();
 625         if (new && new->mode == MPOL_INTERLEAVE &&
 626             nodes_weight(new->v.nodes))
 627                 current->il_next = first_node(new->v.nodes);
 628         if (mm)
 629                 up_write(&mm->mmap_sem);
 630
 631         return 0;
 632 }
 633
 634 /*
 635  * Return nodemask for policy for get_mempolicy() query
 636  */
 637 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 638 {
 639         nodes_clear(*nodes);
 640         if (p == &default_policy)
 641                 return;
 642
 643         switch (p->mode) {
 644         case MPOL_BIND:
 645                 /* Fall through */
 646         case MPOL_INTERLEAVE:
 647                 *nodes = p->v.nodes;
 648                 break;
 649         case MPOL_PREFERRED:
 650                 if (!(p->flags & MPOL_F_LOCAL))
 651                         node_set(p->v.preferred_node, *nodes);
 652                 /* else return empty node mask for local allocation */
 653                 break;
 654         default:
 655                 BUG();
 656         }
 657 }
 658
 659 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 660 {
 661         struct page *p;
 662         int err;
 663
 664         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 665         if (err >= 0) {
 666                 err = page_to_nid(p);
 667                 put_page(p);
 668         }
 669         return err;
 670 }
 671
 672 /* Retrieve NUMA policy */
 673 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 674                              unsigned long addr, unsigned long flags)
 675 {
 676         int err;
 677         struct mm_struct *mm = current->mm;
 678         struct vm_area_struct *vma = NULL;
 679         struct mempolicy *pol = current->mempolicy;
 680
 681         cpuset_update_task_memory_state();
 682         if (flags &
 683                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 684                 return -EINVAL;
 685
 686         if (flags & MPOL_F_MEMS_ALLOWED) {
 687                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 688                         return -EINVAL;
 689                 *policy = 0;    /* just so it's initialized */
 690                 *nmask  = cpuset_current_mems_allowed;
 691                 return 0;
 692         }
 693
 694         if (flags & MPOL_F_ADDR) {
 695                 /*
 696                  * Do NOT fall back to task policy if the
 697                  * vma/shared policy at addr is NULL.  We
 698                  * want to return MPOL_DEFAULT in this case.
 699                  */
 700                 down_read(&mm->mmap_sem);
 701                 vma = find_vma_intersection(mm, addr, addr+1);
 702                 if (!vma) {
 703                         up_read(&mm->mmap_sem);
 704                         return -EFAULT;
 705                 }
 706                 if (vma->vm_ops && vma->vm_ops->get_policy)
 707                         pol = vma->vm_ops->get_policy(vma, addr);
 708                 else
 709                         pol = vma->vm_policy;
 710         } else if (addr)
 711                 return -EINVAL;
 712
 713         if (!pol)
 714                 pol = &default_policy;  /* indicates default behavior */
 715
 716         if (flags & MPOL_F_NODE) {
 717                 if (flags & MPOL_F_ADDR) {
 718                         err = lookup_node(mm, addr);
 719                         if (err < 0)
 720                                 goto out;
 721                         *policy = err;
 722                 } else if (pol == current->mempolicy &&
 723                                 pol->mode == MPOL_INTERLEAVE) {
 724                         *policy = current->il_next;
 725                 } else {
 726                         err = -EINVAL;
 727                         goto out;
 728                 }
 729         } else {
 730                 *policy = pol == &default_policy ? MPOL_DEFAULT :
 731                                                 pol->mode;
 732                 /*
 733                  * Internal mempolicy flags must be masked off before exposing
 734                  * the policy to userspace.
 735                  */
 736                 *policy |= (pol->flags & MPOL_MODE_FLAGS);
 737         }
 738
 739         if (vma) {
 740                 up_read(&current->mm->mmap_sem);
 741                 vma = NULL;
 742         }
 743
 744         err = 0;
 745         if (nmask)
 746                 get_policy_nodemask(pol, nmask);
 747
 748  out:
 749         mpol_cond_put(pol);
 750         if (vma)
 751                 up_read(&current->mm->mmap_sem);
 752         return err;
 753 }
 754
 755 #ifdef CONFIG_MIGRATION
 756 /*
 757  * page migration
 758  */
 759 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 760                                 unsigned long flags)
 761 {
 762         /*
 763          * Avoid migrating a page that is shared with others.
 764          */
 765         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
 766                 isolate_lru_page(page, pagelist);
 767 }
 768
 769 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 770 {
 771         return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
 772 }
 773
 774 /*
 775  * Migrate pages from one node to a target node.
 776  * Returns error or the number of pages not migrated.
 777  */
 778 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 779                            int flags)
 780 {
 781         nodemask_t nmask;
 782         LIST_HEAD(pagelist);
 783         int err = 0;
 784
 785         nodes_clear(nmask);
 786         node_set(source, nmask);
 787
 788         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 789                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 790
 791         if (!list_empty(&pagelist))
 792                 err = migrate_pages(&pagelist, new_node_page, dest);
 793
 794         return err;
 795 }
 796
/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Takes mm->mmap_sem for read for the duration of the operation.
 *
 * Returns a negative error code if migrate_vmas() or migrate_to_node()
 * reported one, otherwise the accumulated number of pages that could not
 * be moved.  A positive return from migrate_vmas() skips the per-node
 * loop and results in 0 being returned.
 */
int do_migrate_pages(struct mm_struct *mm,
        const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
        int busy = 0;
        int err = 0;
        nodemask_t tmp;

        down_read(&mm->mmap_sem);

        err = migrate_vmas(mm, from_nodes, to_nodes, flags);
        if (err)
                goto out;

/*
 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 * bit in 'to_nodes' is not also set in 'tmp'.  Clear the found 'source'
 * bit in 'tmp', and use that <source, dest> pair for migration.
 * The pair of nodemasks 'to_nodes' and 'from_nodes' define the map.
 *
 * If no pair of bits is found that way, fall back to picking some
 * pair of 'source' and 'dest' bits that are not the same.  If the
 * 'source' and 'dest' bits are the same, this represents a node
 * that will be migrating to itself, so no pages need move.
 *
 * If no bits are left in 'tmp', or if all remaining bits left
 * in 'tmp' correspond to the same bit in 'to_nodes', stop
 * (nothing left to migrate).
 *
 * This lets us pick a pair of nodes to migrate between, such that
 * if possible the dest node is not already occupied by some other
 * source node, minimizing the risk of overloading the memory on a
 * node that would happen if we migrated incoming memory to a node
 * before migrating outgoing memory from that same node.
 *
 * A single scan of tmp is sufficient.  As we go, we remember the
 * most recent <s, d> pair that moved (s != d).  If we find a pair
 * that not only moved, but what's better, moved to an empty slot
 * (d is not set in tmp), then we break out then, with that pair.
 * Otherwise when we finish scanning tmp, we at least have the
 * most recent <s, d> pair that moved.  If we get all the way through
 * the scan of tmp without finding any node that moved, much less
 * moved to an empty node, then there is nothing left worth migrating.
 */

        tmp = *from_nodes;
        while (!nodes_empty(tmp)) {
                int s,d;
                int source = -1;        /* -1: no movable pair found yet */
                int dest = 0;

                for_each_node_mask(s, tmp) {
                        d = node_remap(s, *from_nodes, *to_nodes);
                        if (s == d)
                                continue;

                        source = s;     /* Node moved. Memorize */
                        dest = d;

                        /* dest not in remaining from nodes? */
                        if (!node_isset(dest, tmp))
                                break;
                }
                /* every remaining node maps to itself: done */
                if (source == -1)
                        break;

                /* each source node is handled at most once */
                node_clear(source, tmp);
                err = migrate_to_node(mm, source, dest, flags);
                if (err > 0)
                        busy += err;    /* pages left behind on this pass */
                if (err < 0)
                        break;
        }
out:
        up_read(&mm->mmap_sem);
        if (err < 0)
                return err;
        return busy;

}
 882
 883 /*
 884  * Allocate a new page for page migration based on vma policy.
 885  * Start assuming that page is mapped by vma pointed to by @private.
 886  * Search forward from there, if not.  N.B., this assumes that the
 887  * list of pages handed to migrate_pages()--which is how we get here--
 888  * is in virtual address order.
 889  */
 890 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 891 {
 892         struct vm_area_struct *vma = (struct vm_area_struct *)private;
 893         unsigned long uninitialized_var(address);
 894
 895         while (vma) {
 896                 address = page_address_in_vma(page, vma);
 897                 if (address != -EFAULT)
 898                         break;
 899                 vma = vma->vm_next;
 900         }
 901
 902         /*
 903          * if !vma, alloc_page_vma() will use task or system default policy
 904          */
 905         return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 906 }
 907 #else
 908
/*
 * Stub for the #else branch of the enclosing conditional (condition not
 * visible here; presumably page migration support): does nothing.
 */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
                                unsigned long flags)
{
}
 913
/*
 * Stub for the #else branch of the enclosing conditional: no pages are
 * moved and the caller always gets -ENOSYS.
 */
int do_migrate_pages(struct mm_struct *mm,
        const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
        return -ENOSYS;
}
 919
/*
 * Stub for the #else branch of the enclosing conditional: never allocates,
 * always returns NULL.
 */
static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
        return NULL;
}
 924 #endif
 925
 926 static long do_mbind(unsigned long start, unsigned long len,
 927                      unsigned short mode, unsigned short mode_flags,
 928                      nodemask_t *nmask, unsigned long flags)
 929 {
 930         struct vm_area_struct *vma;
 931         struct mm_struct *mm = current->mm;
 932         struct mempolicy *new;
 933         unsigned long end;
 934         int err;
 935         LIST_HEAD(pagelist);
 936
 937         if (flags & ~(unsigned long)(MPOL_MF_STRICT |
 938                                      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 939                 return -EINVAL;
 940         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 941                 return -EPERM;
 942
 943         if (start & ~PAGE_MASK)
 944                 return -EINVAL;
 945
 946         if (mode == MPOL_DEFAULT)
 947                 flags &= ~MPOL_MF_STRICT;
 948
 949         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 950         end = start + len;
 951
 952         if (end < start)
 953                 return -EINVAL;
 954         if (end == start)
 955                 return 0;
 956
 957         new = mpol_new(mode, mode_flags, nmask);
 958         if (IS_ERR(new))
 959                 return PTR_ERR(new);
 960
 961         /*
 962          * If we are using the default policy then operation
 963          * on discontinuous address spaces is okay after all
 964          */
 965         if (!new)
 966                 flags |= MPOL_MF_DISCONTIG_OK;
 967
 968         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
 969                  start, start + len, mode, mode_flags,
 970                  nmask ? nodes_addr(*nmask)[0] : -1);
 971
 972         down_write(&mm->mmap_sem);
 973         vma = check_range(mm, start, end, nmask,
 974                           flags | MPOL_MF_INVERT, &pagelist);
 975
 976         err = PTR_ERR(vma);
 977         if (!IS_ERR(vma)) {
 978                 int nr_failed = 0;
 979
 980                 err = mbind_range(vma, start, end, new);
 981
 982                 if (!list_empty(&pagelist))
 983                         nr_failed = migrate_pages(&pagelist, new_vma_page,
 984                                                 (unsigned long)vma);
 985
 986                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 987                         err = -EIO;
 988         }
 989
 990         up_write(&mm->mmap_sem);
 991         mpol_put(new);
 992         return err;
 993 }
 994
 995 /*
 996  * User space interface with variable sized bitmaps for nodelists.
 997  */
 998
/*
 * Copy a node mask from user space.
 *
 * @nodes:   kernel nodemask to fill; always cleared first
 * @nmask:   user bitmap, may be NULL
 * @maxnode: user supplied size; maxnode - 1 bits of @nmask are considered
 *
 * Returns 0 on success (also when @nmask is NULL or maxnode - 1 is 0, in
 * which case @nodes is left empty), -EINVAL when the mask is too large or
 * has bits set beyond what the kernel supports, -EFAULT when reading user
 * memory fails.
 */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
                     unsigned long maxnode)
{
        unsigned long k;
        unsigned long nlongs;
        unsigned long endmask;

        /* from here on maxnode is the number of bits to look at */
        --maxnode;
        nodes_clear(*nodes);
        if (maxnode == 0 || !nmask)
                return 0;
        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
                return -EINVAL;

        nlongs = BITS_TO_LONGS(maxnode);
        /* endmask selects the valid bits of the last user word */
        if ((maxnode % BITS_PER_LONG) == 0)
                endmask = ~0UL;
        else
                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

        /* When the user specified more nodes than supported just check
           if the non supported part is all zero. */
        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
                if (nlongs > PAGE_SIZE/sizeof(long))
                        return -EINVAL;
                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
                        unsigned long t;
                        if (get_user(t, nmask + k))
                                return -EFAULT;
                        /* only the endmask bits count in the last word */
                        if (k == nlongs - 1) {
                                if (t & endmask)
                                        return -EINVAL;
                        } else if (t)
                                return -EINVAL;
                }
                /* copy only what fits; every bit of those words is valid */
                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
                endmask = ~0UL;
        }

        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
                return -EFAULT;
        /* drop bits past maxnode in the last copied word */
        nodes_addr(*nodes)[nlongs-1] &= endmask;
        return 0;
}
1044
1045 /* Copy a kernel node mask to user space */
1046 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1047                               nodemask_t *nodes)
1048 {
1049         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1050         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1051
1052         if (copy > nbytes) {
1053                 if (copy > PAGE_SIZE)
1054                         return -EINVAL;
1055                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1056                         return -EFAULT;
1057                 copy = nbytes;
1058         }
1059         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1060 }
1061
1062 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1063                 unsigned long, mode, unsigned long __user *, nmask,
1064                 unsigned long, maxnode, unsigned, flags)
1065 {
1066         nodemask_t nodes;
1067         int err;
1068         unsigned short mode_flags;
1069
1070         mode_flags = mode & MPOL_MODE_FLAGS;
1071         mode &= ~MPOL_MODE_FLAGS;
1072         if (mode >= MPOL_MAX)
1073                 return -EINVAL;
1074         if ((mode_flags & MPOL_F_STATIC_NODES) &&
1075             (mode_flags & MPOL_F_RELATIVE_NODES))
1076                 return -EINVAL;
1077         err = get_nodes(&nodes, nmask, maxnode);
1078         if (err)
1079                 return err;
1080         return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1081 }
1082
1083 /* Set the process memory policy */
1084 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1085                 unsigned long, maxnode)
1086 {
1087         int err;
1088         nodemask_t nodes;
1089         unsigned short flags;
1090
1091         flags = mode & MPOL_MODE_FLAGS;
1092         mode &= ~MPOL_MODE_FLAGS;
1093         if ((unsigned int)mode >= MPOL_MAX)
1094                 return -EINVAL;
1095         if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1096                 return -EINVAL;
1097         err = get_nodes(&nodes, nmask, maxnode);
1098         if (err)
1099                 return err;
1100         return do_set_mempolicy(mode, flags, &nodes);
1101 }
1102
1103 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1104                 const unsigned long __user *, old_nodes,
1105                 const unsigned long __user *, new_nodes)
1106 {
1107         struct mm_struct *mm;
1108         struct task_struct *task;
1109         nodemask_t old;
1110         nodemask_t new;
1111         nodemask_t task_nodes;
1112         int err;
1113
1114         err = get_nodes(&old, old_nodes, maxnode);
1115         if (err)
1116                 return err;
1117
1118         err = get_nodes(&new, new_nodes, maxnode);
1119         if (err)
1120                 return err;
1121
1122         /* Find the mm_struct */
1123         read_lock(&tasklist_lock);
1124         task = pid ? find_task_by_vpid(pid) : current;
1125         if (!task) {
1126                 read_unlock(&tasklist_lock);
1127                 return -ESRCH;
1128         }
1129         mm = get_task_mm(task);
1130         read_unlock(&tasklist_lock);
1131
1132         if (!mm)
1133                 return -EINVAL;
1134
1135         /*
1136          * Check if this process has the right to modify the specified
1137          * process. The right exists if the process has administrative
1138          * capabilities, superuser privileges or the same
1139          * userid as the target process.
1140          */
1141         if ((current->euid != task->suid) && (current->euid != task->uid) &&
1142             (current->uid != task->suid) && (current->uid != task->uid) &&
1143             !capable(CAP_SYS_NICE)) {
1144                 err = -EPERM;
1145                 goto out;
1146         }
1147
1148         task_nodes = cpuset_mems_allowed(task);
1149         /* Is the user allowed to access the target nodes? */
1150         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
1151                 err = -EPERM;
1152                 goto out;
1153         }
1154
1155         if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
1156                 err = -EINVAL;
1157                 goto out;
1158         }
1159
1160         err = security_task_movememory(task);
1161         if (err)
1162                 goto out;
1163
1164         err = do_migrate_pages(mm, &old, &new,
1165                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1166 out:
1167         mmput(mm);
1168         return err;
1169 }
1170
1171
1172 /* Retrieve NUMA policy */
1173 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1174                 unsigned long __user *, nmask, unsigned long, maxnode,
1175                 unsigned long, addr, unsigned long, flags)
1176 {
1177         int err;
1178         int uninitialized_var(pval);
1179         nodemask_t nodes;
1180
1181         if (nmask != NULL && maxnode < MAX_NUMNODES)
1182                 return -EINVAL;
1183
1184         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1185
1186         if (err)
1187                 return err;
1188
1189         if (policy && put_user(pval, policy))
1190                 return -EFAULT;
1191
1192         if (nmask)
1193                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1194
1195         return err;
1196 }
1197
1198 #ifdef CONFIG_COMPAT
1199
1200 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1201                                      compat_ulong_t __user *nmask,
1202                                      compat_ulong_t maxnode,
1203                                      compat_ulong_t addr, compat_ulong_t flags)
1204 {
1205         long err;
1206         unsigned long __user *nm = NULL;
1207         unsigned long nr_bits, alloc_size;
1208         DECLARE_BITMAP(bm, MAX_NUMNODES);
1209
1210         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1211         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1212
1213         if (nmask)
1214                 nm = compat_alloc_user_space(alloc_size);
1215
1216         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1217
1218         if (!err && nmask) {
1219                 err = copy_from_user(bm, nm, alloc_size);
1220                 /* ensure entire bitmap is zeroed */
1221                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1222                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1223         }
1224
1225         return err;
1226 }
1227
1228 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1229                                      compat_ulong_t maxnode)
1230 {
1231         long err = 0;
1232         unsigned long __user *nm = NULL;
1233         unsigned long nr_bits, alloc_size;
1234         DECLARE_BITMAP(bm, MAX_NUMNODES);
1235
1236         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1237         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1238
1239         if (nmask) {
1240                 err = compat_get_bitmap(bm, nmask, nr_bits);
1241                 nm = compat_alloc_user_space(alloc_size);
1242                 err |= copy_to_user(nm, bm, alloc_size);
1243         }
1244
1245         if (err)
1246                 return -EFAULT;
1247
1248         return sys_set_mempolicy(mode, nm, nr_bits+1);
1249 }
1250
1251 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1252                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1253                              compat_ulong_t maxnode, compat_ulong_t flags)
1254 {
1255         long err = 0;
1256         unsigned long __user *nm = NULL;
1257         unsigned long nr_bits, alloc_size;
1258         nodemask_t bm;
1259
1260         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1261         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1262
1263         if (nmask) {
1264                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1265                 nm = compat_alloc_user_space(alloc_size);
1266                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1267         }
1268
1269         if (err)
1270                 return -EFAULT;
1271
1272         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1273 }
1274
1275 #endif
1276
1277 /*
1278  * get_vma_policy(@task, @vma, @addr)
1279  * @task - task for fallback if vma policy == default
1280  * @vma   - virtual memory area whose policy is sought
1281  * @addr  - address in @vma for shared policy lookup
1282  *
1283  * Returns effective policy for a VMA at specified address.
1284  * Falls back to @task or system default policy, as necessary.
1285  * Current or other task's task mempolicy and non-shared vma policies
1286  * are protected by the task's mmap_sem, which must be held for read by
1287  * the caller.
1288  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1289  * count--added by the get_policy() vm_op, as appropriate--to protect against
1290  * freeing by another task.  It is the caller's responsibility to free the
1291  * extra reference for shared policies.
1292  */
1293 static struct mempolicy *get_vma_policy(struct task_struct *task,
1294                 struct vm_area_struct *vma, unsigned long addr)
1295 {
1296         struct mempolicy *pol = task->mempolicy;
1297
1298         if (vma) {
1299                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1300                         struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1301                                                                         addr);
1302                         if (vpol)
1303                                 pol = vpol;
1304                 } else if (vma->vm_policy)
1305                         pol = vma->vm_policy;
1306         }
1307         if (!pol)
1308                 pol = &default_policy;
1309         return pol;
1310 }
1311
1312 /*
1313  * Return a nodemask representing a mempolicy for filtering nodes for
1314  * page allocation
1315  */
1316 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1317 {
1318         /* Lower zones don't get a nodemask applied for MPOL_BIND */
1319         if (unlikely(policy->mode == MPOL_BIND) &&
1320                         gfp_zone(gfp) >= policy_zone &&
1321                         cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1322                 return &policy->v.nodes;
1323
1324         return NULL;
1325 }
1326
1327 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1328 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
1329 {
1330         int nd = numa_node_id();
1331
1332         switch (policy->mode) {
1333         case MPOL_PREFERRED:
1334                 if (!(policy->flags & MPOL_F_LOCAL))
1335                         nd = policy->v.preferred_node;
1336                 break;
1337         case MPOL_BIND:
1338                 /*
1339                  * Normally, MPOL_BIND allocations are node-local within the
1340                  * allowed nodemask.  However, if __GFP_THISNODE is set and the
1341                  * current node is part of the mask, we use the zonelist for
1342                  * the first node in the mask instead.
1343                  */
1344                 if (unlikely(gfp & __GFP_THISNODE) &&
1345                                 unlikely(!node_isset(nd, policy->v.nodes)))
1346                         nd = first_node(policy->v.nodes);
1347                 break;
1348         case MPOL_INTERLEAVE: /* should not happen */
1349                 break;
1350         default:
1351                 BUG();
1352         }
1353         return node_zonelist(nd, gfp);
1354 }
1355
1356 /* Do dynamic interleaving for a process */
1357 static unsigned interleave_nodes(struct mempolicy *policy)
1358 {
1359         unsigned nid, next;
1360         struct task_struct *me = current;
1361
1362         nid = me->il_next;
1363         next = next_node(nid, policy->v.nodes);
1364         if (next >= MAX_NUMNODES)
1365                 next = first_node(policy->v.nodes);
1366         if (next < MAX_NUMNODES)
1367                 me->il_next = next;
1368         return nid;
1369 }
1370
1371 /*
1372  * Depending on the memory policy provide a node from which to allocate the
1373  * next slab entry.
1374  * @policy must be protected by freeing by the caller.  If @policy is
1375  * the current task's mempolicy, this protection is implicit, as only the
1376  * task can change it's policy.  The system default policy requires no
1377  * such protection.
1378  */
1379 unsigned slab_node(struct mempolicy *policy)
1380 {
1381         if (!policy || policy->flags & MPOL_F_LOCAL)
1382                 return numa_node_id();
1383
1384         switch (policy->mode) {
1385         case MPOL_PREFERRED:
1386                 /*
1387                  * handled MPOL_F_LOCAL above
1388                  */
1389                 return policy->v.preferred_node;
1390
1391         case MPOL_INTERLEAVE:
1392                 return interleave_nodes(policy);
1393
1394         case MPOL_BIND: {
1395                 /*
1396                  * Follow bind policy behavior and start allocation at the
1397                  * first node.
1398                  */
1399                 struct zonelist *zonelist;
1400                 struct zone *zone;
1401                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1402                 zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1403                 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1404                                                         &policy->v.nodes,
1405                                                         &zone);
1406                 return zone->node;
1407         }
1408
1409         default:
1410                 BUG();
1411         }
1412 }
1413
1414 /* Do static interleaving for a VMA with known offset. */
1415 static unsigned offset_il_node(struct mempolicy *pol,
1416                 struct vm_area_struct *vma, unsigned long off)
1417 {
1418         unsigned nnodes = nodes_weight(pol->v.nodes);
1419         unsigned target;
1420         int c;
1421         int nid = -1;
1422
1423         if (!nnodes)
1424                 return numa_node_id();
1425         target = (unsigned int)off % nnodes;
1426         c = 0;
1427         do {
1428                 nid = next_node(nid, pol->v.nodes);
1429                 c++;
1430         } while (c <= target);
1431         return nid;
1432 }
1433
1434 /* Determine a node number for interleave */
1435 static inline unsigned interleave_nid(struct mempolicy *pol,
1436                  struct vm_area_struct *vma, unsigned long addr, int shift)
1437 {
1438         if (vma) {
1439                 unsigned long off;
1440
1441                 /*
1442                  * for small pages, there is no difference between
1443                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1444                  * for huge pages, since vm_pgoff is in units of small
1445                  * pages, we need to shift off the always 0 bits to get
1446                  * a useful offset.
1447                  */
1448                 BUG_ON(shift < PAGE_SHIFT);
1449                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1450                 off += (addr - vma->vm_start) >> shift;
1451                 return offset_il_node(pol, vma, off);
1452         } else
1453                 return interleave_nodes(pol);
1454 }
1455
1456 #ifdef CONFIG_HUGETLBFS
1457 /*
1458  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1459  * @vma = virtual memory area whose policy is sought
1460  * @addr = address in @vma for shared policy lookup and interleave policy
1461  * @gfp_flags = for requested zone
1462  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1463  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1464  *
1465  * Returns a zonelist suitable for a huge page allocation and a pointer
1466  * to the struct mempolicy for conditional unref after allocation.
1467  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1468  * @nodemask for filtering the zonelist.
1469  */
1470 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1471                                 gfp_t gfp_flags, struct mempolicy **mpol,
1472                                 nodemask_t **nodemask)
1473 {
1474         struct zonelist *zl;
1475
1476         *mpol = get_vma_policy(current, vma, addr);
1477         *nodemask = NULL;       /* assume !MPOL_BIND */
1478
1479         if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1480                 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1481                                 huge_page_shift(hstate_vma(vma))), gfp_flags);
1482         } else {
1483                 zl = policy_zonelist(gfp_flags, *mpol);
1484                 if ((*mpol)->mode == MPOL_BIND)
1485                         *nodemask = &(*mpol)->v.nodes;
1486         }
1487         return zl;
1488 }
1489 #endif
1490
1491 /* Allocate a page in interleaved policy.
1492    Own path because it needs to do special accounting. */
1493 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1494                                         unsigned nid)
1495 {
1496         struct zonelist *zl;
1497         struct page *page;
1498
1499         zl = node_zonelist(nid, gfp);
1500         page = __alloc_pages(gfp, order, zl);
1501         if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1502                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1503         return page;
1504 }
1505
1506 /**
1507  *      alloc_page_vma  - Allocate a page for a VMA.
1508  *
1509  *      @gfp:
1510  *      %GFP_USER    user allocation.
1511  *      %GFP_KERNEL  kernel allocations,
1512  *      %GFP_HIGHMEM highmem/user allocations,
1513  *      %GFP_FS      allocation should not call back into a file system.
1514  *      %GFP_ATOMIC  don't sleep.
1515  *
1516  *      @vma:  Pointer to VMA or NULL if not available.
1517  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1518  *
1519  *      This function allocates a page from the kernel page pool and applies
1520  *      a NUMA policy associated with the VMA or the current process.
1521  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1522  *      mm_struct of the VMA to prevent it from going away. Should be used for
1523  *      all allocations for pages that will be mapped into
1524  *      user space. Returns NULL when no page can be allocated.
1525  *
1526  *      Should be called with the mm_sem of the vma hold.
1527  */
1528 struct page *
1529 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1530 {
1531         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1532         struct zonelist *zl;
1533
1534         cpuset_update_task_memory_state();
1535
1536         if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1537                 unsigned nid;
1538
1539                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1540                 mpol_cond_put(pol);
1541                 return alloc_page_interleave(gfp, 0, nid);
1542         }
1543         zl = policy_zonelist(gfp, pol);
1544         if (unlikely(mpol_needs_cond_ref(pol))) {
1545                 /*
1546                  * slow path: ref counted shared policy
1547                  */
1548                 struct page *page =  __alloc_pages_nodemask(gfp, 0,
1549                                                 zl, policy_nodemask(gfp, pol));
1550                 __mpol_put(pol);
1551                 return page;
1552         }
1553         /*
1554          * fast path:  default or task policy
1555          */
1556         return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
1557 }
1558
1559 /**
1560  *      alloc_pages_current - Allocate pages.
1561  *
1562  *      @gfp:
1563  *              %GFP_USER   user allocation,
1564  *              %GFP_KERNEL kernel allocation,
1565  *              %GFP_HIGHMEM highmem allocation,
1566  *              %GFP_FS     don't call back into a file system.
1567  *              %GFP_ATOMIC don't sleep.
1568  *      @order: Power of two of allocation size in pages. 0 is a single page.
1569  *
1570  *      Allocate a page from the kernel page pool.  When not in
1571  *      interrupt context and apply the current process NUMA policy.
1572  *      Returns NULL when no page can be allocated.
1573  *
1574  *      Don't call cpuset_update_task_memory_state() unless
1575  *      1) it's ok to take cpuset_sem (can WAIT), and
1576  *      2) allocating for current task (not interrupt).
1577  */
1578 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1579 {
1580         struct mempolicy *pol = current->mempolicy;
1581
1582         if ((gfp & __GFP_WAIT) && !in_interrupt())
1583                 cpuset_update_task_memory_state();
1584         if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1585                 pol = &default_policy;
1586
1587         /*
1588          * No reference counting needed for current->mempolicy
1589          * nor system default_policy
1590          */
1591         if (pol->mode == MPOL_INTERLEAVE)
1592                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1593         return __alloc_pages_nodemask(gfp, order,
1594                         policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
1595 }
1596 EXPORT_SYMBOL(alloc_pages_current);
1597
/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset-relative after their cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 */
1605
1606 /* Slow path of a mempolicy duplicate */
1607 struct mempolicy *__mpol_dup(struct mempolicy *old)
1608 {
1609         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1610
1611         if (!new)
1612                 return ERR_PTR(-ENOMEM);
1613         if (current_cpuset_is_being_rebound()) {
1614                 nodemask_t mems = cpuset_mems_allowed(current);
1615                 mpol_rebind_policy(old, &mems);
1616         }
1617         *new = *old;
1618         atomic_set(&new->refcnt, 1);
1619         return new;
1620 }
1621
1622 /*
1623  * If *frompol needs [has] an extra ref, copy *frompol to *tompol ,
1624  * eliminate the * MPOL_F_* flags that require conditional ref and
1625  * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
1626  * after return.  Use the returned value.
1627  *
1628  * Allows use of a mempolicy for, e.g., multiple allocations with a single
1629  * policy lookup, even if the policy needs/has extra ref on lookup.
1630  * shmem_readahead needs this.
1631  */
1632 struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1633                                                 struct mempolicy *frompol)
1634 {
1635         if (!mpol_needs_cond_ref(frompol))
1636                 return frompol;
1637
1638         *tompol = *frompol;
1639         tompol->flags &= ~MPOL_F_SHARED;        /* copy doesn't need unref */
1640         __mpol_put(frompol);
1641         return tompol;
1642 }
1643
1644 static int mpol_match_intent(const struct mempolicy *a,
1645                              const struct mempolicy *b)
1646 {
1647         if (a->flags != b->flags)
1648                 return 0;
1649         if (!mpol_store_user_nodemask(a))
1650                 return 1;
1651         return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1652 }
1653
1654 /* Slow path of a mempolicy comparison */
1655 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1656 {
1657         if (!a || !b)
1658                 return 0;
1659         if (a->mode != b->mode)
1660                 return 0;
1661         if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
1662                 return 0;
1663         switch (a->mode) {
1664         case MPOL_BIND:
1665                 /* Fall through */
1666         case MPOL_INTERLEAVE:
1667                 return nodes_equal(a->v.nodes, b->v.nodes);
1668         case MPOL_PREFERRED:
1669                 return a->v.preferred_node == b->v.preferred_node &&
1670                         a->flags == b->flags;
1671         default:
1672                 BUG();
1673                 return 0;
1674         }
1675 }
1676
1677 /*
1678  * Shared memory backing store policy support.
1679  *
1680  * Remember policies even when nobody has shared memory mapped.
1681  * The policies are kept in Red-Black tree linked from the inode.
1682  * They are protected by the sp->lock spinlock, which should be held
1683  * for any accesses to the tree.
1684  */
1685
1686 /* lookup first element intersecting start-end */
1687 /* Caller holds sp->lock */
1688 static struct sp_node *
1689 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1690 {
1691         struct rb_node *n = sp->root.rb_node;
1692
1693         while (n) {
1694                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1695
1696                 if (start >= p->end)
1697                         n = n->rb_right;
1698                 else if (end <= p->start)
1699                         n = n->rb_left;
1700                 else
1701                         break;
1702         }
1703         if (!n)
1704                 return NULL;
1705         for (;;) {
1706                 struct sp_node *w = NULL;
1707                 struct rb_node *prev = rb_prev(n);
1708                 if (!prev)
1709                         break;
1710                 w = rb_entry(prev, struct sp_node, nd);
1711                 if (w->end <= start)
1712                         break;
1713                 n = prev;
1714         }
1715         return rb_entry(n, struct sp_node, nd);
1716 }
1717
1718 /* Insert a new shared policy into the list. */
1719 /* Caller holds sp->lock */
1720 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1721 {
1722         struct rb_node **p = &sp->root.rb_node;
1723         struct rb_node *parent = NULL;
1724         struct sp_node *nd;
1725
1726         while (*p) {
1727                 parent = *p;
1728                 nd = rb_entry(parent, struct sp_node, nd);
1729                 if (new->start < nd->start)
1730                         p = &(*p)->rb_left;
1731                 else if (new->end > nd->end)
1732                         p = &(*p)->rb_right;
1733                 else
1734                         BUG();
1735         }
1736         rb_link_node(&new->nd, parent, p);
1737         rb_insert_color(&new->nd, &sp->root);
1738         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1739                  new->policy ? new->policy->mode : 0);
1740 }
1741
1742 /* Find shared policy intersecting idx */
1743 struct mempolicy *
1744 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1745 {
1746         struct mempolicy *pol = NULL;
1747         struct sp_node *sn;
1748
1749         if (!sp->root.rb_node)
1750                 return NULL;
1751         spin_lock(&sp->lock);
1752         sn = sp_lookup(sp, idx, idx+1);
1753         if (sn) {
1754                 mpol_get(sn->policy);
1755                 pol = sn->policy;
1756         }
1757         spin_unlock(&sp->lock);
1758         return pol;
1759 }
1760
1761 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1762 {
1763         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
1764         rb_erase(&n->nd, &sp->root);
1765         mpol_put(n->policy);
1766         kmem_cache_free(sn_cache, n);
1767 }
1768
1769 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1770                                 struct mempolicy *pol)
1771 {
1772         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1773
1774         if (!n)
1775                 return NULL;
1776         n->start = start;
1777         n->end = end;
1778         mpol_get(pol);
1779         pol->flags |= MPOL_F_SHARED;    /* for unref */
1780         n->policy = pol;
1781         return n;
1782 }
1783
1784 /* Replace a policy range. */
1785 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1786                                  unsigned long end, struct sp_node *new)
1787 {
1788         struct sp_node *n, *new2 = NULL;
1789
1790 restart:
1791         spin_lock(&sp->lock);
1792         n = sp_lookup(sp, start, end);
1793         /* Take care of old policies in the same range. */
1794         while (n && n->start < end) {
1795                 struct rb_node *next = rb_next(&n->nd);
1796                 if (n->start >= start) {
1797                         if (n->end <= end)
1798                                 sp_delete(sp, n);
1799                         else
1800                                 n->start = end;
1801                 } else {
1802                         /* Old policy spanning whole new range. */
1803                         if (n->end > end) {
1804                                 if (!new2) {
1805                                         spin_unlock(&sp->lock);
1806                                         new2 = sp_alloc(end, n->end, n->policy);
1807                                         if (!new2)
1808                                                 return -ENOMEM;
1809                                         goto restart;
1810                                 }
1811                                 n->end = start;
1812                                 sp_insert(sp, new2);
1813                                 new2 = NULL;
1814                                 break;
1815                         } else
1816                                 n->end = start;
1817                 }
1818                 if (!next)
1819                         break;
1820                 n = rb_entry(next, struct sp_node, nd);
1821         }
1822         if (new)
1823                 sp_insert(sp, new);
1824         spin_unlock(&sp->lock);
1825         if (new2) {
1826                 mpol_put(new2->policy);
1827                 kmem_cache_free(sn_cache, new2);
1828         }
1829         return 0;
1830 }
1831
1832 /**
1833  * mpol_shared_policy_init - initialize shared policy for inode
1834  * @sp: pointer to inode shared policy
1835  * @mpol:  struct mempolicy to install
1836  *
1837  * Install non-NULL @mpol in inode's shared policy rb-tree.
1838  * On entry, the current task has a reference on a non-NULL @mpol.
1839  * This must be released on exit.
1840  */
1841 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1842 {
1843         sp->root = RB_ROOT;             /* empty tree == default mempolicy */
1844         spin_lock_init(&sp->lock);
1845
1846         if (mpol) {
1847                 struct vm_area_struct pvma;
1848                 struct mempolicy *new;
1849
1850                 /* contextualize the tmpfs mount point mempolicy */
1851                 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1852                 mpol_put(mpol); /* drop our ref on sb mpol */
1853                 if (IS_ERR(new))
1854                         return;         /* no valid nodemask intersection */
1855
1856                 /* Create pseudo-vma that contains just the policy */
1857                 memset(&pvma, 0, sizeof(struct vm_area_struct));
1858                 pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
1859                 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
1860                 mpol_put(new);                  /* drop initial ref */
1861         }
1862 }
1863
1864 int mpol_set_shared_policy(struct shared_policy *info,
1865                         struct vm_area_struct *vma, struct mempolicy *npol)
1866 {
1867         int err;
1868         struct sp_node *new = NULL;
1869         unsigned long sz = vma_pages(vma);
1870
1871         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1872                  vma->vm_pgoff,
1873                  sz, npol ? npol->mode : -1,
1874                  npol ? npol->flags : -1,
1875                  npol ? nodes_addr(npol->v.nodes)[0] : -1);
1876
1877         if (npol) {
1878                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1879                 if (!new)
1880                         return -ENOMEM;
1881         }
1882         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1883         if (err && new)
1884                 kmem_cache_free(sn_cache, new);
1885         return err;
1886 }
1887
1888 /* Free a backing policy store on inode delete. */
1889 void mpol_free_shared_policy(struct shared_policy *p)
1890 {
1891         struct sp_node *n;
1892         struct rb_node *next;
1893
1894         if (!p->root.rb_node)
1895                 return;
1896         spin_lock(&p->lock);
1897         next = rb_first(&p->root);
1898         while (next) {
1899                 n = rb_entry(next, struct sp_node, nd);
1900                 next = rb_next(&n->nd);
1901                 rb_erase(&n->nd, &p->root);
1902                 mpol_put(n->policy);
1903                 kmem_cache_free(sn_cache, n);
1904         }
1905         spin_unlock(&p->lock);
1906 }
1907
1908 /* assumes fs == KERNEL_DS */
1909 void __init numa_policy_init(void)
1910 {
1911         nodemask_t interleave_nodes;
1912         unsigned long largest = 0;
1913         int nid, prefer = 0;
1914
1915         policy_cache = kmem_cache_create("numa_policy",
1916                                          sizeof(struct mempolicy),
1917                                          0, SLAB_PANIC, NULL);
1918
1919         sn_cache = kmem_cache_create("shared_policy_node",
1920                                      sizeof(struct sp_node),
1921                                      0, SLAB_PANIC, NULL);
1922
1923         /*
1924          * Set interleaving policy for system init. Interleaving is only
1925          * enabled across suitably sized nodes (default is >= 16MB), or
1926          * fall back to the largest node if they're all smaller.
1927          */
1928         nodes_clear(interleave_nodes);
1929         for_each_node_state(nid, N_HIGH_MEMORY) {
1930                 unsigned long total_pages = node_present_pages(nid);
1931
1932                 /* Preserve the largest node */
1933                 if (largest < total_pages) {
1934                         largest = total_pages;
1935                         prefer = nid;
1936                 }
1937
1938                 /* Interleave this node? */
1939                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1940                         node_set(nid, interleave_nodes);
1941         }
1942
1943         /* All too small, use the largest */
1944         if (unlikely(nodes_empty(interleave_nodes)))
1945                 node_set(prefer, interleave_nodes);
1946
1947         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
1948                 printk("numa_policy_init: interleaving failed\n");
1949 }
1950
1951 /* Reset policy of current process to default */
1952 void numa_default_policy(void)
1953 {
1954         do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1955 }
1956
1957 /*
1958  * Parse and format mempolicy from/to strings
1959  */
1960
/*
 * "local" is a pseudo-policy:  MPOL_PREFERRED with the MPOL_F_LOCAL flag.
 * It is used only by mpol_parse_str() and mpol_to_str().
 */
#define MPOL_LOCAL (MPOL_INTERLEAVE + 1)

/*
 * Policy names, indexed by mode value (mpol_parse_str() turns the index
 * of the matching name into the mode, mpol_to_str() indexes by mode).
 * The last entry belongs to the MPOL_LOCAL pseudo-policy.
 */
static const char * const policy_types[] = {
	"default",
	"prefer",
	"bind",
	"interleave",
	"local"
};
1968
1969
#ifdef CONFIG_TMPFS
/**
 * mpol_parse_str - parse string to mempolicy
 * @str:  string containing mempolicy to parse
 * @mpol:  pointer to struct mempolicy pointer, returned on success.
 * @no_context:  flag whether to "contextualize" the mempolicy
 *
 * Format of input:
 *	<mode>[=<flags>][:<nodelist>]
 *
 * @str is split in place while parsing and is restored to its original
 * contents before returning, on success and on every failure, so that the
 * caller can still print it in an error message.
 *
 * if @no_context is true, save the input nodemask in w.user_nodemask in
 * the returned mempolicy.  This will be used to "clone" the mempolicy in
 * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
 * mount option.  Note that if 'static' or 'relative' mode flags were
 * specified, the input nodemask will already have been saved.  Saving
 * it again is redundant, but safe.
 *
 * Per-mode rules enforced here:
 *   prefer      needs a nodelist naming exactly one node
 *   interleave  defaults to all nodes with memory if no nodelist is given
 *   local       must not have a nodelist; becomes MPOL_PREFERRED
 *   bind, default  are left to mpol_new() to validate
 *
 * On success, returns 0 and stores the result of mpol_new() in *@mpol
 * (presumably NULL for "default" -- mpol_new() is not visible here, so a
 * NULL result is tolerated rather than assumed).  On failure returns 1
 * and leaves *@mpol untouched; nothing is allocated in that case.
 */
int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
{
	struct mempolicy *new = NULL;
	unsigned short mode;
	unsigned short mode_flags = 0;
	nodemask_t nodes;
	char *nodelist = strchr(str, ':');
	char *flags;
	int i;
	int err = 1;

	/*
	 * Split the string up front, before anything can fail: the restore
	 * code at "out" relies on both pointers having been advanced past
	 * their separator.  Cutting off the nodelist first also makes sure
	 * that only an '=' in front of the ':' is taken as the flags
	 * separator.
	 */
	if (nodelist)
		*nodelist++ = '\0';	/* NUL-terminate mode or flags string */
	flags = strchr(str, '=');
	if (flags)
		*flags++ = '\0';	/* terminate mode string */

	if (nodelist) {
		if (nodelist_parse(nodelist, nodes))
			goto out;
		if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
			goto out;
	} else
		nodes_clear(nodes);

	for (i = 0; i <= MPOL_LOCAL; i++) {
		if (!strcmp(str, policy_types[i]))
			break;
	}
	if (i > MPOL_LOCAL)
		goto out;
	mode = i;

	switch (mode) {
	case MPOL_PREFERRED:
		/*
		 * Insist on a nodelist of one node only
		 */
		if (!nodelist)
			goto out;
		else {
			char *rest = nodelist;

			while (isdigit(*rest))
				rest++;
			if (*rest)
				goto out;
		}
		break;
	case MPOL_INTERLEAVE:
		/*
		 * Default to online nodes with memory if no nodelist
		 */
		if (!nodelist)
			nodes = node_states[N_HIGH_MEMORY];
		break;
	case MPOL_LOCAL:
		/*
		 * Don't allow a nodelist;  mpol_new() checks flags
		 */
		if (nodelist)
			goto out;
		mode = MPOL_PREFERRED;
		break;

	/*
	 * case MPOL_BIND:    mpol_new() enforces non-empty nodemask.
	 * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags.
	 */
	}

	if (flags) {
		/*
		 * Currently, we only support two mutually exclusive
		 * mode flags.
		 */
		if (!strcmp(flags, "static"))
			mode_flags |= MPOL_F_STATIC_NODES;
		else if (!strcmp(flags, "relative"))
			mode_flags |= MPOL_F_RELATIVE_NODES;
		else
			goto out;
	}

	/*
	 * Everything has been validated; creating the policy is the last
	 * step that can fail, so no error path has to free it again.
	 */
	new = mpol_new(mode, mode_flags, &nodes);
	if (IS_ERR(new))
		goto out;
	if (new && no_context)
		new->w.user_nodemask = nodes;	/* save for contextualization */

	*mpol = new;
	err = 0;

out:
	/* Restore string for error message */
	if (nodelist)
		*--nodelist = ':';
	if (flags)
		*--flags = '=';
	return err;
}
#endif /* CONFIG_TMPFS */
2089
2090 /**
2091  * mpol_to_str - format a mempolicy structure for printing
2092  * @buffer:  to contain formatted mempolicy string
2093  * @maxlen:  length of @buffer
2094  * @pol:  pointer to mempolicy to be formatted
2095  * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
2096  *
2097  * Convert a mempolicy into a string.
2098  * Returns the number of characters in buffer (if positive)
2099  * or an error (negative)
2100  */
2101 int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2102 {
2103         char *p = buffer;
2104         int l;
2105         nodemask_t nodes;
2106         unsigned short mode;
2107         unsigned short flags = pol ? pol->flags : 0;
2108
2109         /*
2110          * Sanity check:  room for longest mode, flag and some nodes
2111          */
2112         VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2113
2114         if (!pol || pol == &default_policy)
2115                 mode = MPOL_DEFAULT;
2116         else
2117                 mode = pol->mode;
2118
2119         switch (mode) {
2120         case MPOL_DEFAULT:
2121                 nodes_clear(nodes);
2122                 break;
2123
2124         case MPOL_PREFERRED:
2125                 nodes_clear(nodes);
2126                 if (flags & MPOL_F_LOCAL)
2127                         mode = MPOL_LOCAL;      /* pseudo-policy */
2128                 else
2129                         node_set(pol->v.preferred_node, nodes);
2130                 break;
2131
2132         case MPOL_BIND:
2133                 /* Fall through */
2134         case MPOL_INTERLEAVE:
2135                 if (no_context)
2136                         nodes = pol->w.user_nodemask;
2137                 else
2138                         nodes = pol->v.nodes;
2139                 break;
2140
2141         default:
2142                 BUG();
2143         }
2144
2145         l = strlen(policy_types[mode]);
2146         if (buffer + maxlen < p + l + 1)
2147                 return -ENOSPC;
2148
2149         strcpy(p, policy_types[mode]);
2150         p += l;
2151
2152         if (flags & MPOL_MODE_FLAGS) {
2153                 if (buffer + maxlen < p + 2)
2154                         return -ENOSPC;
2155                 *p++ = '=';
2156
2157                 /*
2158                  * Currently, the only defined flags are mutually exclusive
2159                  */
2160                 if (flags & MPOL_F_STATIC_NODES)
2161                         p += snprintf(p, buffer + maxlen - p, "static");
2162                 else if (flags & MPOL_F_RELATIVE_NODES)
2163                         p += snprintf(p, buffer + maxlen - p, "relative");
2164         }
2165
2166         if (!nodes_empty(nodes)) {
2167                 if (buffer + maxlen < p + 2)
2168                         return -ENOSPC;
2169                 *p++ = ':';
2170                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2171         }
2172         return p - buffer;
2173 }
2174
2175 struct numa_maps {
2176         unsigned long pages;
2177         unsigned long anon;
2178         unsigned long active;
2179         unsigned long writeback;
2180         unsigned long mapcount_max;
2181         unsigned long dirty;
2182         unsigned long swapcache;
2183         unsigned long node[MAX_NUMNODES];
2184 };
2185
2186 static void gather_stats(struct page *page, void *private, int pte_dirty)
2187 {
2188         struct numa_maps *md = private;
2189         int count = page_mapcount(page);
2190
2191         md->pages++;
2192         if (pte_dirty || PageDirty(page))
2193                 md->dirty++;
2194
2195         if (PageSwapCache(page))
2196                 md->swapcache++;
2197
2198         if (PageActive(page))
2199                 md->active++;
2200
2201         if (PageWriteback(page))
2202                 md->writeback++;
2203
2204         if (PageAnon(page))
2205                 md->anon++;
2206
2207         if (count > md->mapcount_max)
2208                 md->mapcount_max = count;
2209
2210         md->node[page_to_nid(page)]++;
2211 }
2212
#ifdef CONFIG_HUGETLB_PAGE
/*
 * Gather statistics for the huge pages mapped in [start, end) of @vma.
 * The range is walked in steps of the vma's huge page size; addresses
 * without a pte, with an empty pte or without a page are skipped.
 */
static void check_huge_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct numa_maps *md)
{
	struct hstate *hs = hstate_vma(vma);
	unsigned long step = huge_page_size(hs);
	unsigned long addr;

	for (addr = start; addr < end; addr += step) {
		struct page *page;
		pte_t *ptep;
		pte_t entry;

		ptep = huge_pte_offset(vma->vm_mm, addr & huge_page_mask(hs));
		if (!ptep)
			continue;

		entry = *ptep;
		if (pte_none(entry))
			continue;

		page = pte_page(entry);
		if (page)
			gather_stats(page, md, pte_dirty(*ptep));
	}
}
#else
/* Without hugetlb support there is nothing to gather. */
static inline void check_huge_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct numa_maps *md)
{
}
#endif
2249
2250 /*
2251  * Display pages allocated per node and memory policy via /proc.
2252  */
2253 int show_numa_map(struct seq_file *m, void *v)
2254 {
2255         struct proc_maps_private *priv = m->private;
2256         struct vm_area_struct *vma = v;
2257         struct numa_maps *md;
2258         struct file *file = vma->vm_file;
2259         struct mm_struct *mm = vma->vm_mm;
2260         struct mempolicy *pol;
2261         int n;
2262         char buffer[50];
2263
2264         if (!mm)
2265                 return 0;
2266
2267         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2268         if (!md)
2269                 return 0;
2270
2271         pol = get_vma_policy(priv->task, vma, vma->vm_start);
2272         mpol_to_str(buffer, sizeof(buffer), pol, 0);
2273         mpol_cond_put(pol);
2274
2275         seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2276
2277         if (file) {
2278                 seq_printf(m, " file=");
2279                 seq_path(m, &file->f_path, "\n\t= ");
2280         } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2281                 seq_printf(m, " heap");
2282         } else if (vma->vm_start <= mm->start_stack &&
2283                         vma->vm_end >= mm->start_stack) {
2284                 seq_printf(m, " stack");
2285         }
2286
2287         if (is_vm_hugetlb_page(vma)) {
2288                 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2289                 seq_printf(m, " huge");
2290         } else {
2291                 check_pgd_range(vma, vma->vm_start, vma->vm_end,
2292                         &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2293         }
2294
2295         if (!md->pages)
2296                 goto out;
2297
2298         if (md->anon)
2299                 seq_printf(m," anon=%lu",md->anon);
2300
2301         if (md->dirty)
2302                 seq_printf(m," dirty=%lu",md->dirty);
2303
2304         if (md->pages != md->anon && md->pages != md->dirty)
2305                 seq_printf(m, " mapped=%lu", md->pages);
2306
2307         if (md->mapcount_max > 1)
2308                 seq_printf(m, " mapmax=%lu", md->mapcount_max);
2309
2310         if (md->swapcache)
2311                 seq_printf(m," swapcache=%lu", md->swapcache);
2312
2313         if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2314                 seq_printf(m," active=%lu", md->active);
2315
2316         if (md->writeback)
2317                 seq_printf(m," writeback=%lu", md->writeback);
2318
2319         for_each_node_state(n, N_HIGH_MEMORY)
2320                 if (md->node[n])
2321                         seq_printf(m, " N%d=%lu", n, md->node[n]);
2322 out:
2323         seq_putc(m, '\n');
2324         kfree(md);
2325
2326         if (m->count < m->size)
2327                 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2328         return 0;
2329 }