mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem kernel lowmem allocation don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66 */
  67
  68 #include <linux/mempolicy.h>
  69 #include <linux/mm.h>
  70 #include <linux/highmem.h>
  71 #include <linux/hugetlb.h>
  72 #include <linux/kernel.h>
  73 #include <linux/sched.h>
  74 #include <linux/nodemask.h>
  75 #include <linux/cpuset.h>
  76 #include <linux/gfp.h>
  77 #include <linux/slab.h>
  78 #include <linux/string.h>
  79 #include <linux/module.h>
  80 #include <linux/nsproxy.h>
  81 #include <linux/interrupt.h>
  82 #include <linux/init.h>
  83 #include <linux/compat.h>
  84 #include <linux/swap.h>
  85 #include <linux/seq_file.h>
  86 #include <linux/proc_fs.h>
  87 #include <linux/migrate.h>
  88 #include <linux/rmap.h>
  89 #include <linux/security.h>
  90 #include <linux/syscalls.h>
  91
  92 #include <asm/tlbflush.h>
  93 #include <asm/uaccess.h>
  94
  95 /* Internal flags */
  96 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
  97 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
  98 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
  99
 100 static struct kmem_cache *policy_cache;
 101 static struct kmem_cache *sn_cache;
 102
 103 /* Highest zone. A specific allocation for a zone below that is not
 104    policied. */
 105 enum zone_type policy_zone = 0;
 106
 107 /*
 108  * run-time system-wide default policy => local allocation
 109  */
 110 struct mempolicy default_policy = {
 111         .refcnt = ATOMIC_INIT(1), /* never free it */
 112         .mode = MPOL_PREFERRED,
 113         .flags = MPOL_F_LOCAL,
 114 };
 115
 116 static const struct mempolicy_operations {
 117         int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 118         void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
 119 } mpol_ops[MPOL_MAX];
 120
 121 /* Check that the nodemask contains at least one populated zone */
 122 static int is_valid_nodemask(const nodemask_t *nodemask)
 123 {
 124         int nd, k;
 125
 126         /* Check that there is something useful in this mask */
 127         k = policy_zone;
 128
 129         for_each_node_mask(nd, *nodemask) {
 130                 struct zone *z;
 131
 132                 for (k = 0; k <= policy_zone; k++) {
 133                         z = &NODE_DATA(nd)->node_zones[k];
 134                         if (z->present_pages > 0)
 135                                 return 1;
 136                 }
 137         }
 138
 139         return 0;
 140 }
 141
 142 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 143 {
 144         return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
 145 }
 146
 147 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 148                                    const nodemask_t *rel)
 149 {
 150         nodemask_t tmp;
 151         nodes_fold(tmp, *orig, nodes_weight(*rel));
 152         nodes_onto(*ret, tmp, *rel);
 153 }
 154
 155 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 156 {
 157         if (nodes_empty(*nodes))
 158                 return -EINVAL;
 159         pol->v.nodes = *nodes;
 160         return 0;
 161 }
 162
 163 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 164 {
 165         if (!nodes)
 166                 pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 167         else if (nodes_empty(*nodes))
 168                 return -EINVAL;                 /*  no allowed nodes */
 169         else
 170                 pol->v.preferred_node = first_node(*nodes);
 171         return 0;
 172 }
 173
 174 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 175 {
 176         if (!is_valid_nodemask(nodes))
 177                 return -EINVAL;
 178         pol->v.nodes = *nodes;
 179         return 0;
 180 }
 181
 182 /* Create a new policy */
 183 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 184                                   nodemask_t *nodes)
 185 {
 186         struct mempolicy *policy;
 187         nodemask_t cpuset_context_nmask;
 188         int ret;
 189
 190         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 191                  mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
 192
 193         if (mode == MPOL_DEFAULT) {
 194                 if (nodes && !nodes_empty(*nodes))
 195                         return ERR_PTR(-EINVAL);
 196                 return NULL;    /* simply delete any existing policy */
 197         }
 198         VM_BUG_ON(!nodes);
 199
 200         /*
 201          * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 202          * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 203          * All other modes require a valid pointer to a non-empty nodemask.
 204          */
 205         if (mode == MPOL_PREFERRED) {
 206                 if (nodes_empty(*nodes)) {
 207                         if (((flags & MPOL_F_STATIC_NODES) ||
 208                              (flags & MPOL_F_RELATIVE_NODES)))
 209                                 return ERR_PTR(-EINVAL);
 210                         nodes = NULL;   /* flag local alloc */
 211                 }
 212         } else if (nodes_empty(*nodes))
 213                 return ERR_PTR(-EINVAL);
 214         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 215         if (!policy)
 216                 return ERR_PTR(-ENOMEM);
 217         atomic_set(&policy->refcnt, 1);
 218         policy->mode = mode;
 219         policy->flags = flags;
 220
 221         if (nodes) {
 222                 /*
 223                  * cpuset related setup doesn't apply to local allocation
 224                  */
 225                 cpuset_update_task_memory_state();
 226                 if (flags & MPOL_F_RELATIVE_NODES)
 227                         mpol_relative_nodemask(&cpuset_context_nmask, nodes,
 228                                                &cpuset_current_mems_allowed);
 229                 else
 230                         nodes_and(cpuset_context_nmask, *nodes,
 231                                   cpuset_current_mems_allowed);
 232                 if (mpol_store_user_nodemask(policy))
 233                         policy->w.user_nodemask = *nodes;
 234                 else
 235                         policy->w.cpuset_mems_allowed =
 236                                                 cpuset_mems_allowed(current);
 237         }
 238
 239         ret = mpol_ops[mode].create(policy,
 240                                 nodes ? &cpuset_context_nmask : NULL);
 241         if (ret < 0) {
 242                 kmem_cache_free(policy_cache, policy);
 243                 return ERR_PTR(ret);
 244         }
 245         return policy;
 246 }
 247
 248 /* Slow path of a mpol destructor. */
 249 void __mpol_put(struct mempolicy *p)
 250 {
 251         if (!atomic_dec_and_test(&p->refcnt))
 252                 return;
 253         kmem_cache_free(policy_cache, p);
 254 }
 255
 256 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 257 {
 258 }
 259
 260 static void mpol_rebind_nodemask(struct mempolicy *pol,
 261                                  const nodemask_t *nodes)
 262 {
 263         nodemask_t tmp;
 264
 265         if (pol->flags & MPOL_F_STATIC_NODES)
 266                 nodes_and(tmp, pol->w.user_nodemask, *nodes);
 267         else if (pol->flags & MPOL_F_RELATIVE_NODES)
 268                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 269         else {
 270                 nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
 271                             *nodes);
 272                 pol->w.cpuset_mems_allowed = *nodes;
 273         }
 274
 275         pol->v.nodes = tmp;
 276         if (!node_isset(current->il_next, tmp)) {
 277                 current->il_next = next_node(current->il_next, tmp);
 278                 if (current->il_next >= MAX_NUMNODES)
 279                         current->il_next = first_node(tmp);
 280                 if (current->il_next >= MAX_NUMNODES)
 281                         current->il_next = numa_node_id();
 282         }
 283 }
 284
 285 static void mpol_rebind_preferred(struct mempolicy *pol,
 286                                   const nodemask_t *nodes)
 287 {
 288         nodemask_t tmp;
 289
 290         if (pol->flags & MPOL_F_STATIC_NODES) {
 291                 int node = first_node(pol->w.user_nodemask);
 292
 293                 if (node_isset(node, *nodes)) {
 294                         pol->v.preferred_node = node;
 295                         pol->flags &= ~MPOL_F_LOCAL;
 296                 } else
 297                         pol->flags |= MPOL_F_LOCAL;
 298         } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 299                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 300                 pol->v.preferred_node = first_node(tmp);
 301         } else if (!(pol->flags & MPOL_F_LOCAL)) {
 302                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
 303                                                    pol->w.cpuset_mems_allowed,
 304                                                    *nodes);
 305                 pol->w.cpuset_mems_allowed = *nodes;
 306         }
 307 }
 308
 309 /* Migrate a policy to a different set of nodes */
 310 static void mpol_rebind_policy(struct mempolicy *pol,
 311                                const nodemask_t *newmask)
 312 {
 313         if (!pol)
 314                 return;
 315         if (!mpol_store_user_nodemask(pol) &&
 316             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 317                 return;
 318         mpol_ops[pol->mode].rebind(pol, newmask);
 319 }
 320
 321 /*
 322  * Wrapper for mpol_rebind_policy() that just requires task
 323  * pointer, and updates task mempolicy.
 324  */
 325
 326 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 327 {
 328         mpol_rebind_policy(tsk->mempolicy, new);
 329 }
 330
 331 /*
 332  * Rebind each vma in mm to new nodemask.
 333  *
 334  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 335  */
 336
 337 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 338 {
 339         struct vm_area_struct *vma;
 340
 341         down_write(&mm->mmap_sem);
 342         for (vma = mm->mmap; vma; vma = vma->vm_next)
 343                 mpol_rebind_policy(vma->vm_policy, new);
 344         up_write(&mm->mmap_sem);
 345 }
 346
 347 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 348         [MPOL_DEFAULT] = {
 349                 .rebind = mpol_rebind_default,
 350         },
 351         [MPOL_INTERLEAVE] = {
 352                 .create = mpol_new_interleave,
 353                 .rebind = mpol_rebind_nodemask,
 354         },
 355         [MPOL_PREFERRED] = {
 356                 .create = mpol_new_preferred,
 357                 .rebind = mpol_rebind_preferred,
 358         },
 359         [MPOL_BIND] = {
 360                 .create = mpol_new_bind,
 361                 .rebind = mpol_rebind_nodemask,
 362         },
 363 };
 364
 365 static void gather_stats(struct page *, void *, int pte_dirty);
 366 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 367                                 unsigned long flags);
 368
 369 /* Scan through pages checking if pages follow certain conditions. */
 370 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 371                 unsigned long addr, unsigned long end,
 372                 const nodemask_t *nodes, unsigned long flags,
 373                 void *private)
 374 {
 375         pte_t *orig_pte;
 376         pte_t *pte;
 377         spinlock_t *ptl;
 378
 379         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 380         do {
 381                 struct page *page;
 382                 int nid;
 383
 384                 if (!pte_present(*pte))
 385                         continue;
 386                 page = vm_normal_page(vma, addr, *pte);
 387                 if (!page)
 388                         continue;
 389                 /*
 390                  * The check for PageReserved here is important to avoid
 391                  * handling zero pages and other pages that may have been
 392                  * marked special by the system.
 393                  *
 394                  * If the PageReserved would not be checked here then f.e.
 395                  * the location of the zero page could have an influence
 396                  * on MPOL_MF_STRICT, zero pages would be counted for
 397                  * the per node stats, and there would be useless attempts
 398                  * to put zero pages on the migration list.
 399                  */
 400                 if (PageReserved(page))
 401                         continue;
 402                 nid = page_to_nid(page);
 403                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 404                         continue;
 405
 406                 if (flags & MPOL_MF_STATS)
 407                         gather_stats(page, private, pte_dirty(*pte));
 408                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 409                         migrate_page_add(page, private, flags);
 410                 else
 411                         break;
 412         } while (pte++, addr += PAGE_SIZE, addr != end);
 413         pte_unmap_unlock(orig_pte, ptl);
 414         return addr != end;
 415 }
 416
 417 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 418                 unsigned long addr, unsigned long end,
 419                 const nodemask_t *nodes, unsigned long flags,
 420                 void *private)
 421 {
 422         pmd_t *pmd;
 423         unsigned long next;
 424
 425         pmd = pmd_offset(pud, addr);
 426         do {
 427                 next = pmd_addr_end(addr, end);
 428                 if (pmd_none_or_clear_bad(pmd))
 429                         continue;
 430                 if (check_pte_range(vma, pmd, addr, next, nodes,
 431                                     flags, private))
 432                         return -EIO;
 433         } while (pmd++, addr = next, addr != end);
 434         return 0;
 435 }
 436
 437 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 438                 unsigned long addr, unsigned long end,
 439                 const nodemask_t *nodes, unsigned long flags,
 440                 void *private)
 441 {
 442         pud_t *pud;
 443         unsigned long next;
 444
 445         pud = pud_offset(pgd, addr);
 446         do {
 447                 next = pud_addr_end(addr, end);
 448                 if (pud_none_or_clear_bad(pud))
 449                         continue;
 450                 if (check_pmd_range(vma, pud, addr, next, nodes,
 451                                     flags, private))
 452                         return -EIO;
 453         } while (pud++, addr = next, addr != end);
 454         return 0;
 455 }
 456
 457 static inline int check_pgd_range(struct vm_area_struct *vma,
 458                 unsigned long addr, unsigned long end,
 459                 const nodemask_t *nodes, unsigned long flags,
 460                 void *private)
 461 {
 462         pgd_t *pgd;
 463         unsigned long next;
 464
 465         pgd = pgd_offset(vma->vm_mm, addr);
 466         do {
 467                 next = pgd_addr_end(addr, end);
 468                 if (pgd_none_or_clear_bad(pgd))
 469                         continue;
 470                 if (check_pud_range(vma, pgd, addr, next, nodes,
 471                                     flags, private))
 472                         return -EIO;
 473         } while (pgd++, addr = next, addr != end);
 474         return 0;
 475 }
 476
 477 /*
 478  * Check if all pages in a range are on a set of nodes.
 479  * If pagelist != NULL then isolate pages from the LRU and
 480  * put them on the pagelist.
 481  */
 482 static struct vm_area_struct *
 483 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 484                 const nodemask_t *nodes, unsigned long flags, void *private)
 485 {
 486         int err;
 487         struct vm_area_struct *first, *vma, *prev;
 488
 489         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 490
 491                 err = migrate_prep();
 492                 if (err)
 493                         return ERR_PTR(err);
 494         }
 495
 496         first = find_vma(mm, start);
 497         if (!first)
 498                 return ERR_PTR(-EFAULT);
 499         prev = NULL;
 500         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 501                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 502                         if (!vma->vm_next && vma->vm_end < end)
 503                                 return ERR_PTR(-EFAULT);
 504                         if (prev && prev->vm_end < vma->vm_start)
 505                                 return ERR_PTR(-EFAULT);
 506                 }
 507                 if (!is_vm_hugetlb_page(vma) &&
 508                     ((flags & MPOL_MF_STRICT) ||
 509                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 510                                 vma_migratable(vma)))) {
 511                         unsigned long endvma = vma->vm_end;
 512
 513                         if (endvma > end)
 514                                 endvma = end;
 515                         if (vma->vm_start > start)
 516                                 start = vma->vm_start;
 517                         err = check_pgd_range(vma, start, endvma, nodes,
 518                                                 flags, private);
 519                         if (err) {
 520                                 first = ERR_PTR(err);
 521                                 break;
 522                         }
 523                 }
 524                 prev = vma;
 525         }
 526         return first;
 527 }
 528
 529 /* Apply policy to a single VMA */
 530 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 531 {
 532         int err = 0;
 533         struct mempolicy *old = vma->vm_policy;
 534
 535         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 536                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 537                  vma->vm_ops, vma->vm_file,
 538                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 539
 540         if (vma->vm_ops && vma->vm_ops->set_policy)
 541                 err = vma->vm_ops->set_policy(vma, new);
 542         if (!err) {
 543                 mpol_get(new);
 544                 vma->vm_policy = new;
 545                 mpol_put(old);
 546         }
 547         return err;
 548 }
 549
 550 /* Step 2: apply policy to a range and do splits. */
 551 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 552                        unsigned long end, struct mempolicy *new)
 553 {
 554         struct vm_area_struct *next;
 555         int err;
 556
 557         err = 0;
 558         for (; vma && vma->vm_start < end; vma = next) {
 559                 next = vma->vm_next;
 560                 if (vma->vm_start < start)
 561                         err = split_vma(vma->vm_mm, vma, start, 1);
 562                 if (!err && vma->vm_end > end)
 563                         err = split_vma(vma->vm_mm, vma, end, 0);
 564                 if (!err)
 565                         err = policy_vma(vma, new);
 566                 if (err)
 567                         break;
 568         }
 569         return err;
 570 }
 571
 572 /*
 573  * Update task->flags PF_MEMPOLICY bit: set iff non-default
 574  * mempolicy.  Allows more rapid checking of this (combined perhaps
 575  * with other PF_* flag bits) on memory allocation hot code paths.
 576  *
 577  * If called from outside this file, the task 'p' should -only- be
 578  * a newly forked child not yet visible on the task list, because
 579  * manipulating the task flags of a visible task is not safe.
 580  *
 581  * The above limitation is why this routine has the funny name
 582  * mpol_fix_fork_child_flag().
 583  *
 584  * It is also safe to call this with a task pointer of current,
 585  * which the static wrapper mpol_set_task_struct_flag() does,
 586  * for use within this file.
 587  */
 588
 589 void mpol_fix_fork_child_flag(struct task_struct *p)
 590 {
 591         if (p->mempolicy)
 592                 p->flags |= PF_MEMPOLICY;
 593         else
 594                 p->flags &= ~PF_MEMPOLICY;
 595 }
 596
 597 static void mpol_set_task_struct_flag(void)
 598 {
 599         mpol_fix_fork_child_flag(current);
 600 }
 601
 602 /* Set the process memory policy */
 603 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 604                              nodemask_t *nodes)
 605 {
 606         struct mempolicy *new;
 607         struct mm_struct *mm = current->mm;
 608
 609         new = mpol_new(mode, flags, nodes);
 610         if (IS_ERR(new))
 611                 return PTR_ERR(new);
 612
 613         /*
 614          * prevent changing our mempolicy while show_numa_maps()
 615          * is using it.
 616          * Note:  do_set_mempolicy() can be called at init time
 617          * with no 'mm'.
 618          */
 619         if (mm)
 620                 down_write(&mm->mmap_sem);
 621         mpol_put(current->mempolicy);
 622         current->mempolicy = new;
 623         mpol_set_task_struct_flag();
 624         if (new && new->mode == MPOL_INTERLEAVE &&
 625             nodes_weight(new->v.nodes))
 626                 current->il_next = first_node(new->v.nodes);
 627         if (mm)
 628                 up_write(&mm->mmap_sem);
 629
 630         return 0;
 631 }
 632
 633 /*
 634  * Return nodemask for policy for get_mempolicy() query
 635  */
 636 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 637 {
 638         nodes_clear(*nodes);
 639         if (p == &default_policy)
 640                 return;
 641
 642         switch (p->mode) {
 643         case MPOL_BIND:
 644                 /* Fall through */
 645         case MPOL_INTERLEAVE:
 646                 *nodes = p->v.nodes;
 647                 break;
 648         case MPOL_PREFERRED:
 649                 if (!(p->flags & MPOL_F_LOCAL))
 650                         node_set(p->v.preferred_node, *nodes);
 651                 /* else return empty node mask for local allocation */
 652                 break;
 653         default:
 654                 BUG();
 655         }
 656 }
 657
 658 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 659 {
 660         struct page *p;
 661         int err;
 662
 663         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 664         if (err >= 0) {
 665                 err = page_to_nid(p);
 666                 put_page(p);
 667         }
 668         return err;
 669 }
 670
 671 /* Retrieve NUMA policy */
 672 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 673                              unsigned long addr, unsigned long flags)
 674 {
 675         int err;
 676         struct mm_struct *mm = current->mm;
 677         struct vm_area_struct *vma = NULL;
 678         struct mempolicy *pol = current->mempolicy;
 679
 680         cpuset_update_task_memory_state();
 681         if (flags &
 682                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 683                 return -EINVAL;
 684
 685         if (flags & MPOL_F_MEMS_ALLOWED) {
 686                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 687                         return -EINVAL;
 688                 *policy = 0;    /* just so it's initialized */
 689                 *nmask  = cpuset_current_mems_allowed;
 690                 return 0;
 691         }
 692
 693         if (flags & MPOL_F_ADDR) {
 694                 /*
 695                  * Do NOT fall back to task policy if the
 696                  * vma/shared policy at addr is NULL.  We
 697                  * want to return MPOL_DEFAULT in this case.
 698                  */
 699                 down_read(&mm->mmap_sem);
 700                 vma = find_vma_intersection(mm, addr, addr+1);
 701                 if (!vma) {
 702                         up_read(&mm->mmap_sem);
 703                         return -EFAULT;
 704                 }
 705                 if (vma->vm_ops && vma->vm_ops->get_policy)
 706                         pol = vma->vm_ops->get_policy(vma, addr);
 707                 else
 708                         pol = vma->vm_policy;
 709         } else if (addr)
 710                 return -EINVAL;
 711
 712         if (!pol)
 713                 pol = &default_policy;  /* indicates default behavior */
 714
 715         if (flags & MPOL_F_NODE) {
 716                 if (flags & MPOL_F_ADDR) {
 717                         err = lookup_node(mm, addr);
 718                         if (err < 0)
 719                                 goto out;
 720                         *policy = err;
 721                 } else if (pol == current->mempolicy &&
 722                                 pol->mode == MPOL_INTERLEAVE) {
 723                         *policy = current->il_next;
 724                 } else {
 725                         err = -EINVAL;
 726                         goto out;
 727                 }
 728         } else {
 729                 *policy = pol == &default_policy ? MPOL_DEFAULT :
 730                                                 pol->mode;
 731                 *policy |= pol->flags;
 732         }
 733
 734         if (vma) {
 735                 up_read(&current->mm->mmap_sem);
 736                 vma = NULL;
 737         }
 738
 739         err = 0;
 740         if (nmask)
 741                 get_policy_nodemask(pol, nmask);
 742
 743  out:
 744         mpol_cond_put(pol);
 745         if (vma)
 746                 up_read(&current->mm->mmap_sem);
 747         return err;
 748 }
 749
 750 #ifdef CONFIG_MIGRATION
 751 /*
 752  * page migration
 753  */
 754 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 755                                 unsigned long flags)
 756 {
 757         /*
 758          * Avoid migrating a page that is shared with others.
 759          */
 760         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
 761                 isolate_lru_page(page, pagelist);
 762 }
 763
 764 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 765 {
 766         return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
 767 }
 768
 769 /*
 770  * Migrate pages from one node to a target node.
 771  * Returns error or the number of pages not migrated.
 772  */
 773 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 774                            int flags)
 775 {
 776         nodemask_t nmask;
 777         LIST_HEAD(pagelist);
 778         int err = 0;
 779
 780         nodes_clear(nmask);
 781         node_set(source, nmask);
 782
 783         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 784                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 785
 786         if (!list_empty(&pagelist))
 787                 err = migrate_pages(&pagelist, new_node_page, dest);
 788
 789         return err;
 790 }
 791
 792 /*
 793  * Move pages between the two nodesets so as to preserve the physical
 794  * layout as much as possible.
 795  *
 796  * Returns the number of page that could not be moved.
 797  */
 798 int do_migrate_pages(struct mm_struct *mm,
 799         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 800 {
 801         LIST_HEAD(pagelist);
 802         int busy = 0;
 803         int err = 0;
 804         nodemask_t tmp;
 805
 806         down_read(&mm->mmap_sem);
 807
 808         err = migrate_vmas(mm, from_nodes, to_nodes, flags);
 809         if (err)
 810                 goto out;
 811
 812 /*
 813  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 814  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 815  * bit in 'tmp', and return that <source, dest> pair for migration.
 816  * The pair of nodemasks 'to' and 'from' define the map.
 817  *
 818  * If no pair of bits is found that way, fallback to picking some
 819  * pair of 'source' and 'dest' bits that are not the same.  If the
 820  * 'source' and 'dest' bits are the same, this represents a node
 821  * that will be migrating to itself, so no pages need move.
 822  *
 823  * If no bits are left in 'tmp', or if all remaining bits left
 824  * in 'tmp' correspond to the same bit in 'to', return false
 825  * (nothing left to migrate).
 826  *
 827  * This lets us pick a pair of nodes to migrate between, such that
 828  * if possible the dest node is not already occupied by some other
 829  * source node, minimizing the risk of overloading the memory on a
 830  * node that would happen if we migrated incoming memory to a node
 831  * before migrating outgoing memory source that same node.
 832  *
 833  * A single scan of tmp is sufficient.  As we go, we remember the
 834  * most recent <s, d> pair that moved (s != d).  If we find a pair
 835  * that not only moved, but what's better, moved to an empty slot
 836  * (d is not set in tmp), then we break out then, with that pair.
 837  * Otherwise when we finish scannng from_tmp, we at least have the
 838  * most recent <s, d> pair that moved.  If we get all the way through
 839  * the scan of tmp without finding any node that moved, much less
 840  * moved to an empty node, then there is nothing left worth migrating.
 841  */
 842
 843         tmp = *from_nodes;
 844         while (!nodes_empty(tmp)) {
 845                 int s,d;
 846                 int source = -1;
 847                 int dest = 0;
 848
 849                 for_each_node_mask(s, tmp) {
 850                         d = node_remap(s, *from_nodes, *to_nodes);
 851                         if (s == d)
 852                                 continue;
 853
 854                         source = s;     /* Node moved. Memorize */
 855                         dest = d;
 856
 857                         /* dest not in remaining from nodes? */
 858                         if (!node_isset(dest, tmp))
 859                                 break;
 860                 }
 861                 if (source == -1)
 862                         break;
 863
 864                 node_clear(source, tmp);
 865                 err = migrate_to_node(mm, source, dest, flags);
 866                 if (err > 0)
 867                         busy += err;
 868                 if (err < 0)
 869                         break;
 870         }
 871 out:
 872         up_read(&mm->mmap_sem);
 873         if (err < 0)
 874                 return err;
 875         return busy;
 876
 877 }
 878
 879 /*
 880  * Allocate a new page for page migration based on vma policy.
 881  * Start assuming that page is mapped by vma pointed to by @private.
 882  * Search forward from there, if not.  N.B., this assumes that the
 883  * list of pages handed to migrate_pages()--which is how we get here--
 884  * is in virtual address order.
 885  */
 886 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 887 {
 888         struct vm_area_struct *vma = (struct vm_area_struct *)private;
 889         unsigned long uninitialized_var(address);
 890
 891         while (vma) {
 892                 address = page_address_in_vma(page, vma);
 893                 if (address != -EFAULT)
 894                         break;
 895                 vma = vma->vm_next;
 896         }
 897
 898         /*
 899          * if !vma, alloc_page_vma() will use task or system default policy
 900          */
 901         return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 902 }
 903 #else
 904
 905 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 906                                 unsigned long flags)
 907 {
 908 }
 909
 910 int do_migrate_pages(struct mm_struct *mm,
 911         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 912 {
 913         return -ENOSYS;
 914 }
 915
 916 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 917 {
 918         return NULL;
 919 }
 920 #endif
 921
 922 static long do_mbind(unsigned long start, unsigned long len,
 923                      unsigned short mode, unsigned short mode_flags,
 924                      nodemask_t *nmask, unsigned long flags)
 925 {
 926         struct vm_area_struct *vma;
 927         struct mm_struct *mm = current->mm;
 928         struct mempolicy *new;
 929         unsigned long end;
 930         int err;
 931         LIST_HEAD(pagelist);
 932
 933         if (flags & ~(unsigned long)(MPOL_MF_STRICT |
 934                                      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 935                 return -EINVAL;
 936         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 937                 return -EPERM;
 938
 939         if (start & ~PAGE_MASK)
 940                 return -EINVAL;
 941
 942         if (mode == MPOL_DEFAULT)
 943                 flags &= ~MPOL_MF_STRICT;
 944
 945         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 946         end = start + len;
 947
 948         if (end < start)
 949                 return -EINVAL;
 950         if (end == start)
 951                 return 0;
 952
 953         new = mpol_new(mode, mode_flags, nmask);
 954         if (IS_ERR(new))
 955                 return PTR_ERR(new);
 956
 957         /*
 958          * If we are using the default policy then operation
 959          * on discontinuous address spaces is okay after all
 960          */
 961         if (!new)
 962                 flags |= MPOL_MF_DISCONTIG_OK;
 963
 964         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
 965                  start, start + len, mode, mode_flags,
 966                  nmask ? nodes_addr(*nmask)[0] : -1);
 967
 968         down_write(&mm->mmap_sem);
 969         vma = check_range(mm, start, end, nmask,
 970                           flags | MPOL_MF_INVERT, &pagelist);
 971
 972         err = PTR_ERR(vma);
 973         if (!IS_ERR(vma)) {
 974                 int nr_failed = 0;
 975
 976                 err = mbind_range(vma, start, end, new);
 977
 978                 if (!list_empty(&pagelist))
 979                         nr_failed = migrate_pages(&pagelist, new_vma_page,
 980                                                 (unsigned long)vma);
 981
 982                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 983                         err = -EIO;
 984         }
 985
 986         up_write(&mm->mmap_sem);
 987         mpol_put(new);
 988         return err;
 989 }
 990
 991 /*
 992  * User space interface with variable sized bitmaps for nodelists.
 993  */
 994
 995 /* Copy a node mask from user space. */
 996 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 997                      unsigned long maxnode)
 998 {
 999         unsigned long k;
1000         unsigned long nlongs;
1001         unsigned long endmask;
1002
1003         --maxnode;
1004         nodes_clear(*nodes);
1005         if (maxnode == 0 || !nmask)
1006                 return 0;
1007         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1008                 return -EINVAL;
1009
1010         nlongs = BITS_TO_LONGS(maxnode);
1011         if ((maxnode % BITS_PER_LONG) == 0)
1012                 endmask = ~0UL;
1013         else
1014                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1015
1016         /* When the user specified more nodes than supported just check
1017            if the non supported part is all zero. */
1018         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1019                 if (nlongs > PAGE_SIZE/sizeof(long))
1020                         return -EINVAL;
1021                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1022                         unsigned long t;
1023                         if (get_user(t, nmask + k))
1024                                 return -EFAULT;
1025                         if (k == nlongs - 1) {
1026                                 if (t & endmask)
1027                                         return -EINVAL;
1028                         } else if (t)
1029                                 return -EINVAL;
1030                 }
1031                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1032                 endmask = ~0UL;
1033         }
1034
1035         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1036                 return -EFAULT;
1037         nodes_addr(*nodes)[nlongs-1] &= endmask;
1038         return 0;
1039 }
1040
1041 /* Copy a kernel node mask to user space */
1042 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1043                               nodemask_t *nodes)
1044 {
1045         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1046         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1047
1048         if (copy > nbytes) {
1049                 if (copy > PAGE_SIZE)
1050                         return -EINVAL;
1051                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1052                         return -EFAULT;
1053                 copy = nbytes;
1054         }
1055         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1056 }
1057
1058 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
1059                         unsigned long mode,
1060                         unsigned long __user *nmask, unsigned long maxnode,
1061                         unsigned flags)
1062 {
1063         nodemask_t nodes;
1064         int err;
1065         unsigned short mode_flags;
1066
1067         mode_flags = mode & MPOL_MODE_FLAGS;
1068         mode &= ~MPOL_MODE_FLAGS;
1069         if (mode >= MPOL_MAX)
1070                 return -EINVAL;
1071         if ((mode_flags & MPOL_F_STATIC_NODES) &&
1072             (mode_flags & MPOL_F_RELATIVE_NODES))
1073                 return -EINVAL;
1074         err = get_nodes(&nodes, nmask, maxnode);
1075         if (err)
1076                 return err;
1077         return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1078 }
1079
1080 /* Set the process memory policy */
1081 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
1082                 unsigned long maxnode)
1083 {
1084         int err;
1085         nodemask_t nodes;
1086         unsigned short flags;
1087
1088         flags = mode & MPOL_MODE_FLAGS;
1089         mode &= ~MPOL_MODE_FLAGS;
1090         if ((unsigned int)mode >= MPOL_MAX)
1091                 return -EINVAL;
1092         if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1093                 return -EINVAL;
1094         err = get_nodes(&nodes, nmask, maxnode);
1095         if (err)
1096                 return err;
1097         return do_set_mempolicy(mode, flags, &nodes);
1098 }
1099
1100 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
1101                 const unsigned long __user *old_nodes,
1102                 const unsigned long __user *new_nodes)
1103 {
1104         struct mm_struct *mm;
1105         struct task_struct *task;
1106         nodemask_t old;
1107         nodemask_t new;
1108         nodemask_t task_nodes;
1109         int err;
1110
1111         err = get_nodes(&old, old_nodes, maxnode);
1112         if (err)
1113                 return err;
1114
1115         err = get_nodes(&new, new_nodes, maxnode);
1116         if (err)
1117                 return err;
1118
1119         /* Find the mm_struct */
1120         read_lock(&tasklist_lock);
1121         task = pid ? find_task_by_vpid(pid) : current;
1122         if (!task) {
1123                 read_unlock(&tasklist_lock);
1124                 return -ESRCH;
1125         }
1126         mm = get_task_mm(task);
1127         read_unlock(&tasklist_lock);
1128
1129         if (!mm)
1130                 return -EINVAL;
1131
1132         /*
1133          * Check if this process has the right to modify the specified
1134          * process. The right exists if the process has administrative
1135          * capabilities, superuser privileges or the same
1136          * userid as the target process.
1137          */
1138         if ((current->euid != task->suid) && (current->euid != task->uid) &&
1139             (current->uid != task->suid) && (current->uid != task->uid) &&
1140             !capable(CAP_SYS_NICE)) {
1141                 err = -EPERM;
1142                 goto out;
1143         }
1144
1145         task_nodes = cpuset_mems_allowed(task);
1146         /* Is the user allowed to access the target nodes? */
1147         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
1148                 err = -EPERM;
1149                 goto out;
1150         }
1151
1152         if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
1153                 err = -EINVAL;
1154                 goto out;
1155         }
1156
1157         err = security_task_movememory(task);
1158         if (err)
1159                 goto out;
1160
1161         err = do_migrate_pages(mm, &old, &new,
1162                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1163 out:
1164         mmput(mm);
1165         return err;
1166 }
1167
1168
1169 /* Retrieve NUMA policy */
1170 asmlinkage long sys_get_mempolicy(int __user *policy,
1171                                 unsigned long __user *nmask,
1172                                 unsigned long maxnode,
1173                                 unsigned long addr, unsigned long flags)
1174 {
1175         int err;
1176         int uninitialized_var(pval);
1177         nodemask_t nodes;
1178
1179         if (nmask != NULL && maxnode < MAX_NUMNODES)
1180                 return -EINVAL;
1181
1182         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1183
1184         if (err)
1185                 return err;
1186
1187         if (policy && put_user(pval, policy))
1188                 return -EFAULT;
1189
1190         if (nmask)
1191                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1192
1193         return err;
1194 }
1195
1196 #ifdef CONFIG_COMPAT
1197
1198 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1199                                      compat_ulong_t __user *nmask,
1200                                      compat_ulong_t maxnode,
1201                                      compat_ulong_t addr, compat_ulong_t flags)
1202 {
1203         long err;
1204         unsigned long __user *nm = NULL;
1205         unsigned long nr_bits, alloc_size;
1206         DECLARE_BITMAP(bm, MAX_NUMNODES);
1207
1208         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1209         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1210
1211         if (nmask)
1212                 nm = compat_alloc_user_space(alloc_size);
1213
1214         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1215
1216         if (!err && nmask) {
1217                 err = copy_from_user(bm, nm, alloc_size);
1218                 /* ensure entire bitmap is zeroed */
1219                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1220                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1221         }
1222
1223         return err;
1224 }
1225
1226 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1227                                      compat_ulong_t maxnode)
1228 {
1229         long err = 0;
1230         unsigned long __user *nm = NULL;
1231         unsigned long nr_bits, alloc_size;
1232         DECLARE_BITMAP(bm, MAX_NUMNODES);
1233
1234         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1235         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1236
1237         if (nmask) {
1238                 err = compat_get_bitmap(bm, nmask, nr_bits);
1239                 nm = compat_alloc_user_space(alloc_size);
1240                 err |= copy_to_user(nm, bm, alloc_size);
1241         }
1242
1243         if (err)
1244                 return -EFAULT;
1245
1246         return sys_set_mempolicy(mode, nm, nr_bits+1);
1247 }
1248
1249 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1250                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1251                              compat_ulong_t maxnode, compat_ulong_t flags)
1252 {
1253         long err = 0;
1254         unsigned long __user *nm = NULL;
1255         unsigned long nr_bits, alloc_size;
1256         nodemask_t bm;
1257
1258         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1259         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1260
1261         if (nmask) {
1262                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1263                 nm = compat_alloc_user_space(alloc_size);
1264                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1265         }
1266
1267         if (err)
1268                 return -EFAULT;
1269
1270         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1271 }
1272
1273 #endif
1274
1275 /*
1276  * get_vma_policy(@task, @vma, @addr)
1277  * @task - task for fallback if vma policy == default
1278  * @vma   - virtual memory area whose policy is sought
1279  * @addr  - address in @vma for shared policy lookup
1280  *
1281  * Returns effective policy for a VMA at specified address.
1282  * Falls back to @task or system default policy, as necessary.
1283  * Current or other task's task mempolicy and non-shared vma policies
1284  * are protected by the task's mmap_sem, which must be held for read by
1285  * the caller.
1286  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1287  * count--added by the get_policy() vm_op, as appropriate--to protect against
1288  * freeing by another task.  It is the caller's responsibility to free the
1289  * extra reference for shared policies.
1290  */
1291 static struct mempolicy *get_vma_policy(struct task_struct *task,
1292                 struct vm_area_struct *vma, unsigned long addr)
1293 {
1294         struct mempolicy *pol = task->mempolicy;
1295
1296         if (vma) {
1297                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1298                         struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1299                                                                         addr);
1300                         if (vpol)
1301                                 pol = vpol;
1302                 } else if (vma->vm_policy)
1303                         pol = vma->vm_policy;
1304         }
1305         if (!pol)
1306                 pol = &default_policy;
1307         return pol;
1308 }
1309
1310 /*
1311  * Return a nodemask representing a mempolicy for filtering nodes for
1312  * page allocation
1313  */
1314 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1315 {
1316         /* Lower zones don't get a nodemask applied for MPOL_BIND */
1317         if (unlikely(policy->mode == MPOL_BIND) &&
1318                         gfp_zone(gfp) >= policy_zone &&
1319                         cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1320                 return &policy->v.nodes;
1321
1322         return NULL;
1323 }
1324
1325 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1326 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
1327 {
1328         int nd = numa_node_id();
1329
1330         switch (policy->mode) {
1331         case MPOL_PREFERRED:
1332                 if (!(policy->flags & MPOL_F_LOCAL))
1333                         nd = policy->v.preferred_node;
1334                 break;
1335         case MPOL_BIND:
1336                 /*
1337                  * Normally, MPOL_BIND allocations are node-local within the
1338                  * allowed nodemask.  However, if __GFP_THISNODE is set and the
1339                  * current node is part of the mask, we use the zonelist for
1340                  * the first node in the mask instead.
1341                  */
1342                 if (unlikely(gfp & __GFP_THISNODE) &&
1343                                 unlikely(!node_isset(nd, policy->v.nodes)))
1344                         nd = first_node(policy->v.nodes);
1345                 break;
1346         case MPOL_INTERLEAVE: /* should not happen */
1347                 break;
1348         default:
1349                 BUG();
1350         }
1351         return node_zonelist(nd, gfp);
1352 }
1353
1354 /* Do dynamic interleaving for a process */
1355 static unsigned interleave_nodes(struct mempolicy *policy)
1356 {
1357         unsigned nid, next;
1358         struct task_struct *me = current;
1359
1360         nid = me->il_next;
1361         next = next_node(nid, policy->v.nodes);
1362         if (next >= MAX_NUMNODES)
1363                 next = first_node(policy->v.nodes);
1364         if (next < MAX_NUMNODES)
1365                 me->il_next = next;
1366         return nid;
1367 }
1368
1369 /*
1370  * Depending on the memory policy provide a node from which to allocate the
1371  * next slab entry.
1372  * @policy must be protected by freeing by the caller.  If @policy is
1373  * the current task's mempolicy, this protection is implicit, as only the
1374  * task can change it's policy.  The system default policy requires no
1375  * such protection.
1376  */
1377 unsigned slab_node(struct mempolicy *policy)
1378 {
1379         if (!policy || policy->flags & MPOL_F_LOCAL)
1380                 return numa_node_id();
1381
1382         switch (policy->mode) {
1383         case MPOL_PREFERRED:
1384                 /*
1385                  * handled MPOL_F_LOCAL above
1386                  */
1387                 return policy->v.preferred_node;
1388
1389         case MPOL_INTERLEAVE:
1390                 return interleave_nodes(policy);
1391
1392         case MPOL_BIND: {
1393                 /*
1394                  * Follow bind policy behavior and start allocation at the
1395                  * first node.
1396                  */
1397                 struct zonelist *zonelist;
1398                 struct zone *zone;
1399                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1400                 zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1401                 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1402                                                         &policy->v.nodes,
1403                                                         &zone);
1404                 return zone->node;
1405         }
1406
1407         default:
1408                 BUG();
1409         }
1410 }
1411
1412 /* Do static interleaving for a VMA with known offset. */
1413 static unsigned offset_il_node(struct mempolicy *pol,
1414                 struct vm_area_struct *vma, unsigned long off)
1415 {
1416         unsigned nnodes = nodes_weight(pol->v.nodes);
1417         unsigned target;
1418         int c;
1419         int nid = -1;
1420
1421         if (!nnodes)
1422                 return numa_node_id();
1423         target = (unsigned int)off % nnodes;
1424         c = 0;
1425         do {
1426                 nid = next_node(nid, pol->v.nodes);
1427                 c++;
1428         } while (c <= target);
1429         return nid;
1430 }
1431
1432 /* Determine a node number for interleave */
1433 static inline unsigned interleave_nid(struct mempolicy *pol,
1434                  struct vm_area_struct *vma, unsigned long addr, int shift)
1435 {
1436         if (vma) {
1437                 unsigned long off;
1438
1439                 /*
1440                  * for small pages, there is no difference between
1441                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1442                  * for huge pages, since vm_pgoff is in units of small
1443                  * pages, we need to shift off the always 0 bits to get
1444                  * a useful offset.
1445                  */
1446                 BUG_ON(shift < PAGE_SHIFT);
1447                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1448                 off += (addr - vma->vm_start) >> shift;
1449                 return offset_il_node(pol, vma, off);
1450         } else
1451                 return interleave_nodes(pol);
1452 }
1453
1454 #ifdef CONFIG_HUGETLBFS
1455 /*
1456  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1457  * @vma = virtual memory area whose policy is sought
1458  * @addr = address in @vma for shared policy lookup and interleave policy
1459  * @gfp_flags = for requested zone
1460  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1461  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1462  *
1463  * Returns a zonelist suitable for a huge page allocation and a pointer
1464  * to the struct mempolicy for conditional unref after allocation.
1465  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1466  * @nodemask for filtering the zonelist.
1467  */
1468 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1469                                 gfp_t gfp_flags, struct mempolicy **mpol,
1470                                 nodemask_t **nodemask)
1471 {
1472         struct zonelist *zl;
1473
1474         *mpol = get_vma_policy(current, vma, addr);
1475         *nodemask = NULL;       /* assume !MPOL_BIND */
1476
1477         if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1478                 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1479                                                 HPAGE_SHIFT), gfp_flags);
1480         } else {
1481                 zl = policy_zonelist(gfp_flags, *mpol);
1482                 if ((*mpol)->mode == MPOL_BIND)
1483                         *nodemask = &(*mpol)->v.nodes;
1484         }
1485         return zl;
1486 }
1487 #endif
1488
1489 /* Allocate a page in interleaved policy.
1490    Own path because it needs to do special accounting. */
1491 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1492                                         unsigned nid)
1493 {
1494         struct zonelist *zl;
1495         struct page *page;
1496
1497         zl = node_zonelist(nid, gfp);
1498         page = __alloc_pages(gfp, order, zl);
1499         if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1500                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1501         return page;
1502 }
1503
1504 /**
1505  *      alloc_page_vma  - Allocate a page for a VMA.
1506  *
1507  *      @gfp:
1508  *      %GFP_USER    user allocation.
1509  *      %GFP_KERNEL  kernel allocations,
1510  *      %GFP_HIGHMEM highmem/user allocations,
1511  *      %GFP_FS      allocation should not call back into a file system.
1512  *      %GFP_ATOMIC  don't sleep.
1513  *
1514  *      @vma:  Pointer to VMA or NULL if not available.
1515  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1516  *
1517  *      This function allocates a page from the kernel page pool and applies
1518  *      a NUMA policy associated with the VMA or the current process.
1519  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1520  *      mm_struct of the VMA to prevent it from going away. Should be used for
1521  *      all allocations for pages that will be mapped into
1522  *      user space. Returns NULL when no page can be allocated.
1523  *
1524  *      Should be called with the mm_sem of the vma hold.
1525  */
1526 struct page *
1527 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1528 {
1529         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1530         struct zonelist *zl;
1531
1532         cpuset_update_task_memory_state();
1533
1534         if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1535                 unsigned nid;
1536
1537                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1538                 mpol_cond_put(pol);
1539                 return alloc_page_interleave(gfp, 0, nid);
1540         }
1541         zl = policy_zonelist(gfp, pol);
1542         if (unlikely(mpol_needs_cond_ref(pol))) {
1543                 /*
1544                  * slow path: ref counted shared policy
1545                  */
1546                 struct page *page =  __alloc_pages_nodemask(gfp, 0,
1547                                                 zl, policy_nodemask(gfp, pol));
1548                 __mpol_put(pol);
1549                 return page;
1550         }
1551         /*
1552          * fast path:  default or task policy
1553          */
1554         return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
1555 }
1556
1557 /**
1558  *      alloc_pages_current - Allocate pages.
1559  *
1560  *      @gfp:
1561  *              %GFP_USER   user allocation,
1562  *              %GFP_KERNEL kernel allocation,
1563  *              %GFP_HIGHMEM highmem allocation,
1564  *              %GFP_FS     don't call back into a file system.
1565  *              %GFP_ATOMIC don't sleep.
1566  *      @order: Power of two of allocation size in pages. 0 is a single page.
1567  *
1568  *      Allocate a page from the kernel page pool.  When not in
1569  *      interrupt context and apply the current process NUMA policy.
1570  *      Returns NULL when no page can be allocated.
1571  *
1572  *      Don't call cpuset_update_task_memory_state() unless
1573  *      1) it's ok to take cpuset_sem (can WAIT), and
1574  *      2) allocating for current task (not interrupt).
1575  */
1576 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1577 {
1578         struct mempolicy *pol = current->mempolicy;
1579
1580         if ((gfp & __GFP_WAIT) && !in_interrupt())
1581                 cpuset_update_task_memory_state();
1582         if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1583                 pol = &default_policy;
1584
1585         /*
1586          * No reference counting needed for current->mempolicy
1587          * nor system default_policy
1588          */
1589         if (pol->mode == MPOL_INTERLEAVE)
1590                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1591         return __alloc_pages_nodemask(gfp, order,
1592                         policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
1593 }
1594 EXPORT_SYMBOL(alloc_pages_current);
1595
1596 /*
1597  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
1598  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
1599  * with the mems_allowed returned by cpuset_mems_allowed().  This
1600  * keeps mempolicies cpuset relative after its cpuset moves.  See
1601  * further kernel/cpuset.c update_nodemask().
1602  */
1603
1604 /* Slow path of a mempolicy duplicate */
1605 struct mempolicy *__mpol_dup(struct mempolicy *old)
1606 {
1607         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1608
1609         if (!new)
1610                 return ERR_PTR(-ENOMEM);
1611         if (current_cpuset_is_being_rebound()) {
1612                 nodemask_t mems = cpuset_mems_allowed(current);
1613                 mpol_rebind_policy(old, &mems);
1614         }
1615         *new = *old;
1616         atomic_set(&new->refcnt, 1);
1617         return new;
1618 }
1619
1620 /*
1621  * If *frompol needs [has] an extra ref, copy *frompol to *tompol ,
1622  * eliminate the * MPOL_F_* flags that require conditional ref and
1623  * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
1624  * after return.  Use the returned value.
1625  *
1626  * Allows use of a mempolicy for, e.g., multiple allocations with a single
1627  * policy lookup, even if the policy needs/has extra ref on lookup.
1628  * shmem_readahead needs this.
1629  */
1630 struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1631                                                 struct mempolicy *frompol)
1632 {
1633         if (!mpol_needs_cond_ref(frompol))
1634                 return frompol;
1635
1636         *tompol = *frompol;
1637         tompol->flags &= ~MPOL_F_SHARED;        /* copy doesn't need unref */
1638         __mpol_put(frompol);
1639         return tompol;
1640 }
1641
1642 static int mpol_match_intent(const struct mempolicy *a,
1643                              const struct mempolicy *b)
1644 {
1645         if (a->flags != b->flags)
1646                 return 0;
1647         if (!mpol_store_user_nodemask(a))
1648                 return 1;
1649         return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1650 }
1651
1652 /* Slow path of a mempolicy comparison */
1653 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1654 {
1655         if (!a || !b)
1656                 return 0;
1657         if (a->mode != b->mode)
1658                 return 0;
1659         if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
1660                 return 0;
1661         switch (a->mode) {
1662         case MPOL_BIND:
1663                 /* Fall through */
1664         case MPOL_INTERLEAVE:
1665                 return nodes_equal(a->v.nodes, b->v.nodes);
1666         case MPOL_PREFERRED:
1667                 return a->v.preferred_node == b->v.preferred_node &&
1668                         a->flags == b->flags;
1669         default:
1670                 BUG();
1671                 return 0;
1672         }
1673 }
1674
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has the shared memory mapped.
 * The policies are kept in a red-black tree linked from the inode.
 * The tree is protected by the sp->lock spinlock, which must be held
 * for any access to it.
 */
1683
1684 /* lookup first element intersecting start-end */
1685 /* Caller holds sp->lock */
1686 static struct sp_node *
1687 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1688 {
1689         struct rb_node *n = sp->root.rb_node;
1690
1691         while (n) {
1692                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1693
1694                 if (start >= p->end)
1695                         n = n->rb_right;
1696                 else if (end <= p->start)
1697                         n = n->rb_left;
1698                 else
1699                         break;
1700         }
1701         if (!n)
1702                 return NULL;
1703         for (;;) {
1704                 struct sp_node *w = NULL;
1705                 struct rb_node *prev = rb_prev(n);
1706                 if (!prev)
1707                         break;
1708                 w = rb_entry(prev, struct sp_node, nd);
1709                 if (w->end <= start)
1710                         break;
1711                 n = prev;
1712         }
1713         return rb_entry(n, struct sp_node, nd);
1714 }
1715
1716 /* Insert a new shared policy into the list. */
1717 /* Caller holds sp->lock */
1718 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1719 {
1720         struct rb_node **p = &sp->root.rb_node;
1721         struct rb_node *parent = NULL;
1722         struct sp_node *nd;
1723
1724         while (*p) {
1725                 parent = *p;
1726                 nd = rb_entry(parent, struct sp_node, nd);
1727                 if (new->start < nd->start)
1728                         p = &(*p)->rb_left;
1729                 else if (new->end > nd->end)
1730                         p = &(*p)->rb_right;
1731                 else
1732                         BUG();
1733         }
1734         rb_link_node(&new->nd, parent, p);
1735         rb_insert_color(&new->nd, &sp->root);
1736         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1737                  new->policy ? new->policy->mode : 0);
1738 }
1739
1740 /* Find shared policy intersecting idx */
1741 struct mempolicy *
1742 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1743 {
1744         struct mempolicy *pol = NULL;
1745         struct sp_node *sn;
1746
1747         if (!sp->root.rb_node)
1748                 return NULL;
1749         spin_lock(&sp->lock);
1750         sn = sp_lookup(sp, idx, idx+1);
1751         if (sn) {
1752                 mpol_get(sn->policy);
1753                 pol = sn->policy;
1754         }
1755         spin_unlock(&sp->lock);
1756         return pol;
1757 }
1758
1759 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1760 {
1761         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
1762         rb_erase(&n->nd, &sp->root);
1763         mpol_put(n->policy);
1764         kmem_cache_free(sn_cache, n);
1765 }
1766
1767 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1768                                 struct mempolicy *pol)
1769 {
1770         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1771
1772         if (!n)
1773                 return NULL;
1774         n->start = start;
1775         n->end = end;
1776         mpol_get(pol);
1777         pol->flags |= MPOL_F_SHARED;    /* for unref */
1778         n->policy = pol;
1779         return n;
1780 }
1781
1782 /* Replace a policy range. */
1783 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1784                                  unsigned long end, struct sp_node *new)
1785 {
1786         struct sp_node *n, *new2 = NULL;
1787
1788 restart:
1789         spin_lock(&sp->lock);
1790         n = sp_lookup(sp, start, end);
1791         /* Take care of old policies in the same range. */
1792         while (n && n->start < end) {
1793                 struct rb_node *next = rb_next(&n->nd);
1794                 if (n->start >= start) {
1795                         if (n->end <= end)
1796                                 sp_delete(sp, n);
1797                         else
1798                                 n->start = end;
1799                 } else {
1800                         /* Old policy spanning whole new range. */
1801                         if (n->end > end) {
1802                                 if (!new2) {
1803                                         spin_unlock(&sp->lock);
1804                                         new2 = sp_alloc(end, n->end, n->policy);
1805                                         if (!new2)
1806                                                 return -ENOMEM;
1807                                         goto restart;
1808                                 }
1809                                 n->end = start;
1810                                 sp_insert(sp, new2);
1811                                 new2 = NULL;
1812                                 break;
1813                         } else
1814                                 n->end = start;
1815                 }
1816                 if (!next)
1817                         break;
1818                 n = rb_entry(next, struct sp_node, nd);
1819         }
1820         if (new)
1821                 sp_insert(sp, new);
1822         spin_unlock(&sp->lock);
1823         if (new2) {
1824                 mpol_put(new2->policy);
1825                 kmem_cache_free(sn_cache, new2);
1826         }
1827         return 0;
1828 }
1829
1830 void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
1831                         unsigned short flags, nodemask_t *policy_nodes)
1832 {
1833         info->root = RB_ROOT;
1834         spin_lock_init(&info->lock);
1835
1836         if (policy != MPOL_DEFAULT) {
1837                 struct mempolicy *newpol;
1838
1839                 /* Falls back to NULL policy [MPOL_DEFAULT] on any error */
1840                 newpol = mpol_new(policy, flags, policy_nodes);
1841                 if (!IS_ERR(newpol)) {
1842                         /* Create pseudo-vma that contains just the policy */
1843                         struct vm_area_struct pvma;
1844
1845                         memset(&pvma, 0, sizeof(struct vm_area_struct));
1846                         /* Policy covers entire file */
1847                         pvma.vm_end = TASK_SIZE;
1848                         mpol_set_shared_policy(info, &pvma, newpol);
1849                         mpol_put(newpol);
1850                 }
1851         }
1852 }
1853
1854 int mpol_set_shared_policy(struct shared_policy *info,
1855                         struct vm_area_struct *vma, struct mempolicy *npol)
1856 {
1857         int err;
1858         struct sp_node *new = NULL;
1859         unsigned long sz = vma_pages(vma);
1860
1861         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1862                  vma->vm_pgoff,
1863                  sz, npol ? npol->mode : -1,
1864                  npol ? npol->flags : -1,
1865                  npol ? nodes_addr(npol->v.nodes)[0] : -1);
1866
1867         if (npol) {
1868                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1869                 if (!new)
1870                         return -ENOMEM;
1871         }
1872         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1873         if (err && new)
1874                 kmem_cache_free(sn_cache, new);
1875         return err;
1876 }
1877
1878 /* Free a backing policy store on inode delete. */
1879 void mpol_free_shared_policy(struct shared_policy *p)
1880 {
1881         struct sp_node *n;
1882         struct rb_node *next;
1883
1884         if (!p->root.rb_node)
1885                 return;
1886         spin_lock(&p->lock);
1887         next = rb_first(&p->root);
1888         while (next) {
1889                 n = rb_entry(next, struct sp_node, nd);
1890                 next = rb_next(&n->nd);
1891                 rb_erase(&n->nd, &p->root);
1892                 mpol_put(n->policy);
1893                 kmem_cache_free(sn_cache, n);
1894         }
1895         spin_unlock(&p->lock);
1896 }
1897
1898 /* assumes fs == KERNEL_DS */
1899 void __init numa_policy_init(void)
1900 {
1901         nodemask_t interleave_nodes;
1902         unsigned long largest = 0;
1903         int nid, prefer = 0;
1904
1905         policy_cache = kmem_cache_create("numa_policy",
1906                                          sizeof(struct mempolicy),
1907                                          0, SLAB_PANIC, NULL);
1908
1909         sn_cache = kmem_cache_create("shared_policy_node",
1910                                      sizeof(struct sp_node),
1911                                      0, SLAB_PANIC, NULL);
1912
1913         /*
1914          * Set interleaving policy for system init. Interleaving is only
1915          * enabled across suitably sized nodes (default is >= 16MB), or
1916          * fall back to the largest node if they're all smaller.
1917          */
1918         nodes_clear(interleave_nodes);
1919         for_each_node_state(nid, N_HIGH_MEMORY) {
1920                 unsigned long total_pages = node_present_pages(nid);
1921
1922                 /* Preserve the largest node */
1923                 if (largest < total_pages) {
1924                         largest = total_pages;
1925                         prefer = nid;
1926                 }
1927
1928                 /* Interleave this node? */
1929                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1930                         node_set(nid, interleave_nodes);
1931         }
1932
1933         /* All too small, use the largest */
1934         if (unlikely(nodes_empty(interleave_nodes)))
1935                 node_set(prefer, interleave_nodes);
1936
1937         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
1938                 printk("numa_policy_init: interleaving failed\n");
1939 }
1940
1941 /* Reset policy of current process to default */
1942 void numa_default_policy(void)
1943 {
1944         do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1945 }
1946
1947 /*
1948  * "local" is pseudo-policy:  MPOL_PREFERRED with MPOL_F_LOCAL flag
1949  * Used only for mpol_to_str()
1950  */
1951 #define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
1952 static const char * const policy_types[] =
1953         { "default", "prefer", "bind", "interleave", "local" };
1954
1955 /*
1956  * Convert a mempolicy into a string.
1957  * Returns the number of characters in buffer (if positive)
1958  * or an error (negative)
1959  */
1960 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1961 {
1962         char *p = buffer;
1963         int l;
1964         nodemask_t nodes;
1965         unsigned short mode;
1966         unsigned short flags = pol ? pol->flags : 0;
1967
1968         /*
1969          * Sanity check:  room for longest mode, flag and some nodes
1970          */
1971         VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
1972
1973         if (!pol || pol == &default_policy)
1974                 mode = MPOL_DEFAULT;
1975         else
1976                 mode = pol->mode;
1977
1978         switch (mode) {
1979         case MPOL_DEFAULT:
1980                 nodes_clear(nodes);
1981                 break;
1982
1983         case MPOL_PREFERRED:
1984                 nodes_clear(nodes);
1985                 if (flags & MPOL_F_LOCAL)
1986                         mode = MPOL_LOCAL;      /* pseudo-policy */
1987                 else
1988                         node_set(pol->v.preferred_node, nodes);
1989                 break;
1990
1991         case MPOL_BIND:
1992                 /* Fall through */
1993         case MPOL_INTERLEAVE:
1994                 nodes = pol->v.nodes;
1995                 break;
1996
1997         default:
1998                 BUG();
1999         }
2000
2001         l = strlen(policy_types[mode]);
2002         if (buffer + maxlen < p + l + 1)
2003                 return -ENOSPC;
2004
2005         strcpy(p, policy_types[mode]);
2006         p += l;
2007
2008         if (flags & MPOL_MODE_FLAGS) {
2009                 if (buffer + maxlen < p + 2)
2010                         return -ENOSPC;
2011                 *p++ = '=';
2012
2013                 /*
2014                  * Currently, the only defined flags are mutually exclusive
2015                  */
2016                 if (flags & MPOL_F_STATIC_NODES)
2017                         p += snprintf(p, buffer + maxlen - p, "static");
2018                 else if (flags & MPOL_F_RELATIVE_NODES)
2019                         p += snprintf(p, buffer + maxlen - p, "relative");
2020         }
2021
2022         if (!nodes_empty(nodes)) {
2023                 if (buffer + maxlen < p + 2)
2024                         return -ENOSPC;
2025                 *p++ = '=';
2026                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2027         }
2028         return p - buffer;
2029 }
2030
2031 struct numa_maps {
2032         unsigned long pages;
2033         unsigned long anon;
2034         unsigned long active;
2035         unsigned long writeback;
2036         unsigned long mapcount_max;
2037         unsigned long dirty;
2038         unsigned long swapcache;
2039         unsigned long node[MAX_NUMNODES];
2040 };
2041
2042 static void gather_stats(struct page *page, void *private, int pte_dirty)
2043 {
2044         struct numa_maps *md = private;
2045         int count = page_mapcount(page);
2046
2047         md->pages++;
2048         if (pte_dirty || PageDirty(page))
2049                 md->dirty++;
2050
2051         if (PageSwapCache(page))
2052                 md->swapcache++;
2053
2054         if (PageActive(page))
2055                 md->active++;
2056
2057         if (PageWriteback(page))
2058                 md->writeback++;
2059
2060         if (PageAnon(page))
2061                 md->anon++;
2062
2063         if (count > md->mapcount_max)
2064                 md->mapcount_max = count;
2065
2066         md->node[page_to_nid(page)]++;
2067 }
2068
#ifdef CONFIG_HUGETLB_PAGE
/*
 * Gather statistics for the huge pages mapped in [start, end) of @vma,
 * stepping HPAGE_SIZE at a time.  Addresses with no huge pte, an empty
 * pte or no page behind the pte are skipped.
 */
static void check_huge_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct numa_maps *md)
{
	unsigned long addr;

	for (addr = start; addr < end; addr += HPAGE_SIZE) {
		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
		struct page *page;
		pte_t pte;

		if (!ptep)
			continue;

		pte = *ptep;
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		if (page)
			gather_stats(page, md, pte_dirty(*ptep));
	}
}
#else
/* Without CONFIG_HUGETLB_PAGE there is nothing to gather. */
static inline void check_huge_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct numa_maps *md)
{
}
#endif
2102
2103 /*
2104  * Display pages allocated per node and memory policy via /proc.
2105  */
2106 int show_numa_map(struct seq_file *m, void *v)
2107 {
2108         struct proc_maps_private *priv = m->private;
2109         struct vm_area_struct *vma = v;
2110         struct numa_maps *md;
2111         struct file *file = vma->vm_file;
2112         struct mm_struct *mm = vma->vm_mm;
2113         struct mempolicy *pol;
2114         int n;
2115         char buffer[50];
2116
2117         if (!mm)
2118                 return 0;
2119
2120         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2121         if (!md)
2122                 return 0;
2123
2124         pol = get_vma_policy(priv->task, vma, vma->vm_start);
2125         mpol_to_str(buffer, sizeof(buffer), pol);
2126         mpol_cond_put(pol);
2127
2128         seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2129
2130         if (file) {
2131                 seq_printf(m, " file=");
2132                 seq_path(m, &file->f_path, "\n\t= ");
2133         } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2134                 seq_printf(m, " heap");
2135         } else if (vma->vm_start <= mm->start_stack &&
2136                         vma->vm_end >= mm->start_stack) {
2137                 seq_printf(m, " stack");
2138         }
2139
2140         if (is_vm_hugetlb_page(vma)) {
2141                 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2142                 seq_printf(m, " huge");
2143         } else {
2144                 check_pgd_range(vma, vma->vm_start, vma->vm_end,
2145                         &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2146         }
2147
2148         if (!md->pages)
2149                 goto out;
2150
2151         if (md->anon)
2152                 seq_printf(m," anon=%lu",md->anon);
2153
2154         if (md->dirty)
2155                 seq_printf(m," dirty=%lu",md->dirty);
2156
2157         if (md->pages != md->anon && md->pages != md->dirty)
2158                 seq_printf(m, " mapped=%lu", md->pages);
2159
2160         if (md->mapcount_max > 1)
2161                 seq_printf(m, " mapmax=%lu", md->mapcount_max);
2162
2163         if (md->swapcache)
2164                 seq_printf(m," swapcache=%lu", md->swapcache);
2165
2166         if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2167                 seq_printf(m," active=%lu", md->active);
2168
2169         if (md->writeback)
2170                 seq_printf(m," writeback=%lu", md->writeback);
2171
2172         for_each_node_state(n, N_HIGH_MEMORY)
2173                 if (md->node[n])
2174                         seq_printf(m, " N%d=%lu", n, md->node[n]);
2175 out:
2176         seq_putc(m, '\n');
2177         kfree(md);
2178
2179         if (m->count < m->size)
2180                 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2181         return 0;
2182 }