mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy a per-process
  20  *                counter is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use the default policy. This implies that
  49  * on systems with highmem, kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66    could replace all the switch()es with a mempolicy_ops structure.
  67 */
  68
  69 #include <linux/mempolicy.h>
  70 #include <linux/mm.h>
  71 #include <linux/highmem.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/kernel.h>
  74 #include <linux/sched.h>
  75 #include <linux/mm.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/gfp.h>
  79 #include <linux/slab.h>
  80 #include <linux/string.h>
  81 #include <linux/module.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/mempolicy.h>
  86 #include <linux/swap.h>
  87 #include <linux/seq_file.h>
  88 #include <linux/proc_fs.h>
  89 #include <linux/migrate.h>
  90 #include <linux/rmap.h>
  91 #include <linux/security.h>
  92
  93 #include <asm/tlbflush.h>
  94 #include <asm/uaccess.h>
  95
  96 /* Internal flags */
  97 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
  98 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
  99 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
 100
 101 static struct kmem_cache *policy_cache;
 102 static struct kmem_cache *sn_cache;
 103
 104 #define PDprintk(fmt...)
 105
 106 /* Highest zone. An specific allocation for a zone below that is not
 107    policied. */
 108 int policy_zone = ZONE_DMA;
 109
 110 struct mempolicy default_policy = {
 111         .refcnt = ATOMIC_INIT(1), /* never free it */
 112         .policy = MPOL_DEFAULT,
 113 };
 114
 115 /* Do sanity checking on a policy */
 116 static int mpol_check_policy(int mode, nodemask_t *nodes)
 117 {
 118         int empty = nodes_empty(*nodes);
 119
 120         switch (mode) {
 121         case MPOL_DEFAULT:
 122                 if (!empty)
 123                         return -EINVAL;
 124                 break;
 125         case MPOL_BIND:
 126         case MPOL_INTERLEAVE:
 127                 /* Preferred will only use the first bit, but allow
 128                    more for now. */
 129                 if (empty)
 130                         return -EINVAL;
 131                 break;
 132         }
 133         return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 134 }
 135
 136 /* Generate a custom zonelist for the BIND policy. */
 137 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 138 {
 139         struct zonelist *zl;
 140         int num, max, nd, k;
 141
 142         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 143         zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 144         if (!zl)
 145                 return NULL;
 146         num = 0;
 147         /* First put in the highest zones from all nodes, then all the next
 148            lower zones etc. Avoid empty zones because the memory allocator
 149            doesn't like them. If you implement node hot removal you
 150            have to fix that. */
 151         for (k = policy_zone; k >= 0; k--) {
 152                 for_each_node_mask(nd, *nodes) {
 153                         struct zone *z = &NODE_DATA(nd)->node_zones[k];
 154                         if (z->present_pages > 0)
 155                                 zl->zones[num++] = z;
 156                 }
 157         }
 158         zl->zones[num] = NULL;
 159         return zl;
 160 }
 161
 162 /* Create a new policy */
 163 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 164 {
 165         struct mempolicy *policy;
 166
 167         PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 168         if (mode == MPOL_DEFAULT)
 169                 return NULL;
 170         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 171         if (!policy)
 172                 return ERR_PTR(-ENOMEM);
 173         atomic_set(&policy->refcnt, 1);
 174         switch (mode) {
 175         case MPOL_INTERLEAVE:
 176                 policy->v.nodes = *nodes;
 177                 if (nodes_weight(*nodes) == 0) {
 178                         kmem_cache_free(policy_cache, policy);
 179                         return ERR_PTR(-EINVAL);
 180                 }
 181                 break;
 182         case MPOL_PREFERRED:
 183                 policy->v.preferred_node = first_node(*nodes);
 184                 if (policy->v.preferred_node >= MAX_NUMNODES)
 185                         policy->v.preferred_node = -1;
 186                 break;
 187         case MPOL_BIND:
 188                 policy->v.zonelist = bind_zonelist(nodes);
 189                 if (policy->v.zonelist == NULL) {
 190                         kmem_cache_free(policy_cache, policy);
 191                         return ERR_PTR(-ENOMEM);
 192                 }
 193                 break;
 194         }
 195         policy->policy = mode;
 196         policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 197         return policy;
 198 }
 199
 200 static void gather_stats(struct page *, void *, int pte_dirty);
 201 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 202                                 unsigned long flags);
 203
 204 /* Scan through pages checking if pages follow certain conditions. */
 205 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 206                 unsigned long addr, unsigned long end,
 207                 const nodemask_t *nodes, unsigned long flags,
 208                 void *private)
 209 {
 210         pte_t *orig_pte;
 211         pte_t *pte;
 212         spinlock_t *ptl;
 213
 214         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 215         do {
 216                 struct page *page;
 217                 unsigned int nid;
 218
 219                 if (!pte_present(*pte))
 220                         continue;
 221                 page = vm_normal_page(vma, addr, *pte);
 222                 if (!page)
 223                         continue;
 224                 /*
 225                  * The check for PageReserved here is important to avoid
 226                  * handling zero pages and other pages that may have been
 227                  * marked special by the system.
 228                  *
 229                  * If the PageReserved would not be checked here then f.e.
 230                  * the location of the zero page could have an influence
 231                  * on MPOL_MF_STRICT, zero pages would be counted for
 232                  * the per node stats, and there would be useless attempts
 233                  * to put zero pages on the migration list.
 234                  */
 235                 if (PageReserved(page))
 236                         continue;
 237                 nid = page_to_nid(page);
 238                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 239                         continue;
 240
 241                 if (flags & MPOL_MF_STATS)
 242                         gather_stats(page, private, pte_dirty(*pte));
 243                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 244                         migrate_page_add(page, private, flags);
 245                 else
 246                         break;
 247         } while (pte++, addr += PAGE_SIZE, addr != end);
 248         pte_unmap_unlock(orig_pte, ptl);
 249         return addr != end;
 250 }
 251
 252 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 253                 unsigned long addr, unsigned long end,
 254                 const nodemask_t *nodes, unsigned long flags,
 255                 void *private)
 256 {
 257         pmd_t *pmd;
 258         unsigned long next;
 259
 260         pmd = pmd_offset(pud, addr);
 261         do {
 262                 next = pmd_addr_end(addr, end);
 263                 if (pmd_none_or_clear_bad(pmd))
 264                         continue;
 265                 if (check_pte_range(vma, pmd, addr, next, nodes,
 266                                     flags, private))
 267                         return -EIO;
 268         } while (pmd++, addr = next, addr != end);
 269         return 0;
 270 }
 271
 272 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 273                 unsigned long addr, unsigned long end,
 274                 const nodemask_t *nodes, unsigned long flags,
 275                 void *private)
 276 {
 277         pud_t *pud;
 278         unsigned long next;
 279
 280         pud = pud_offset(pgd, addr);
 281         do {
 282                 next = pud_addr_end(addr, end);
 283                 if (pud_none_or_clear_bad(pud))
 284                         continue;
 285                 if (check_pmd_range(vma, pud, addr, next, nodes,
 286                                     flags, private))
 287                         return -EIO;
 288         } while (pud++, addr = next, addr != end);
 289         return 0;
 290 }
 291
 292 static inline int check_pgd_range(struct vm_area_struct *vma,
 293                 unsigned long addr, unsigned long end,
 294                 const nodemask_t *nodes, unsigned long flags,
 295                 void *private)
 296 {
 297         pgd_t *pgd;
 298         unsigned long next;
 299
 300         pgd = pgd_offset(vma->vm_mm, addr);
 301         do {
 302                 next = pgd_addr_end(addr, end);
 303                 if (pgd_none_or_clear_bad(pgd))
 304                         continue;
 305                 if (check_pud_range(vma, pgd, addr, next, nodes,
 306                                     flags, private))
 307                         return -EIO;
 308         } while (pgd++, addr = next, addr != end);
 309         return 0;
 310 }
 311
 312 /* Check if a vma is migratable */
 313 static inline int vma_migratable(struct vm_area_struct *vma)
 314 {
 315         if (vma->vm_flags & (
 316                 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
 317                 return 0;
 318         return 1;
 319 }
 320
 321 /*
 322  * Check if all pages in a range are on a set of nodes.
 323  * If pagelist != NULL then isolate pages from the LRU and
 324  * put them on the pagelist.
 325  */
 326 static struct vm_area_struct *
 327 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 328                 const nodemask_t *nodes, unsigned long flags, void *private)
 329 {
 330         int err;
 331         struct vm_area_struct *first, *vma, *prev;
 332
 333         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 334
 335                 err = migrate_prep();
 336                 if (err)
 337                         return ERR_PTR(err);
 338         }
 339
 340         first = find_vma(mm, start);
 341         if (!first)
 342                 return ERR_PTR(-EFAULT);
 343         prev = NULL;
 344         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 345                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 346                         if (!vma->vm_next && vma->vm_end < end)
 347                                 return ERR_PTR(-EFAULT);
 348                         if (prev && prev->vm_end < vma->vm_start)
 349                                 return ERR_PTR(-EFAULT);
 350                 }
 351                 if (!is_vm_hugetlb_page(vma) &&
 352                     ((flags & MPOL_MF_STRICT) ||
 353                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 354                                 vma_migratable(vma)))) {
 355                         unsigned long endvma = vma->vm_end;
 356
 357                         if (endvma > end)
 358                                 endvma = end;
 359                         if (vma->vm_start > start)
 360                                 start = vma->vm_start;
 361                         err = check_pgd_range(vma, start, endvma, nodes,
 362                                                 flags, private);
 363                         if (err) {
 364                                 first = ERR_PTR(err);
 365                                 break;
 366                         }
 367                 }
 368                 prev = vma;
 369         }
 370         return first;
 371 }
 372
 373 /* Apply policy to a single VMA */
 374 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 375 {
 376         int err = 0;
 377         struct mempolicy *old = vma->vm_policy;
 378
 379         PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 380                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 381                  vma->vm_ops, vma->vm_file,
 382                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 383
 384         if (vma->vm_ops && vma->vm_ops->set_policy)
 385                 err = vma->vm_ops->set_policy(vma, new);
 386         if (!err) {
 387                 mpol_get(new);
 388                 vma->vm_policy = new;
 389                 mpol_free(old);
 390         }
 391         return err;
 392 }
 393
 394 /* Step 2: apply policy to a range and do splits. */
 395 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 396                        unsigned long end, struct mempolicy *new)
 397 {
 398         struct vm_area_struct *next;
 399         int err;
 400
 401         err = 0;
 402         for (; vma && vma->vm_start < end; vma = next) {
 403                 next = vma->vm_next;
 404                 if (vma->vm_start < start)
 405                         err = split_vma(vma->vm_mm, vma, start, 1);
 406                 if (!err && vma->vm_end > end)
 407                         err = split_vma(vma->vm_mm, vma, end, 0);
 408                 if (!err)
 409                         err = policy_vma(vma, new);
 410                 if (err)
 411                         break;
 412         }
 413         return err;
 414 }
 415
 416 static int contextualize_policy(int mode, nodemask_t *nodes)
 417 {
 418         if (!nodes)
 419                 return 0;
 420
 421         cpuset_update_task_memory_state();
 422         if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
 423                 return -EINVAL;
 424         return mpol_check_policy(mode, nodes);
 425 }
 426
 427
 428 /*
 429  * Update task->flags PF_MEMPOLICY bit: set iff non-default
 430  * mempolicy.  Allows more rapid checking of this (combined perhaps
 431  * with other PF_* flag bits) on memory allocation hot code paths.
 432  *
 433  * If called from outside this file, the task 'p' should -only- be
 434  * a newly forked child not yet visible on the task list, because
 435  * manipulating the task flags of a visible task is not safe.
 436  *
 437  * The above limitation is why this routine has the funny name
 438  * mpol_fix_fork_child_flag().
 439  *
 440  * It is also safe to call this with a task pointer of current,
 441  * which the static wrapper mpol_set_task_struct_flag() does,
 442  * for use within this file.
 443  */
 444
 445 void mpol_fix_fork_child_flag(struct task_struct *p)
 446 {
 447         if (p->mempolicy)
 448                 p->flags |= PF_MEMPOLICY;
 449         else
 450                 p->flags &= ~PF_MEMPOLICY;
 451 }
 452
 453 static void mpol_set_task_struct_flag(void)
 454 {
 455         mpol_fix_fork_child_flag(current);
 456 }
 457
 458 /* Set the process memory policy */
 459 long do_set_mempolicy(int mode, nodemask_t *nodes)
 460 {
 461         struct mempolicy *new;
 462
 463         if (contextualize_policy(mode, nodes))
 464                 return -EINVAL;
 465         new = mpol_new(mode, nodes);
 466         if (IS_ERR(new))
 467                 return PTR_ERR(new);
 468         mpol_free(current->mempolicy);
 469         current->mempolicy = new;
 470         mpol_set_task_struct_flag();
 471         if (new && new->policy == MPOL_INTERLEAVE)
 472                 current->il_next = first_node(new->v.nodes);
 473         return 0;
 474 }
 475
 476 /* Fill a zone bitmap for a policy */
 477 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 478 {
 479         int i;
 480
 481         nodes_clear(*nodes);
 482         switch (p->policy) {
 483         case MPOL_BIND:
 484                 for (i = 0; p->v.zonelist->zones[i]; i++)
 485                         node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
 486                                 *nodes);
 487                 break;
 488         case MPOL_DEFAULT:
 489                 break;
 490         case MPOL_INTERLEAVE:
 491                 *nodes = p->v.nodes;
 492                 break;
 493         case MPOL_PREFERRED:
 494                 /* or use current node instead of online map? */
 495                 if (p->v.preferred_node < 0)
 496                         *nodes = node_online_map;
 497                 else
 498                         node_set(p->v.preferred_node, *nodes);
 499                 break;
 500         default:
 501                 BUG();
 502         }
 503 }
 504
 505 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 506 {
 507         struct page *p;
 508         int err;
 509
 510         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 511         if (err >= 0) {
 512                 err = page_to_nid(p);
 513                 put_page(p);
 514         }
 515         return err;
 516 }
 517
 518 /* Retrieve NUMA policy */
 519 long do_get_mempolicy(int *policy, nodemask_t *nmask,
 520                         unsigned long addr, unsigned long flags)
 521 {
 522         int err;
 523         struct mm_struct *mm = current->mm;
 524         struct vm_area_struct *vma = NULL;
 525         struct mempolicy *pol = current->mempolicy;
 526
 527         cpuset_update_task_memory_state();
 528         if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 529                 return -EINVAL;
 530         if (flags & MPOL_F_ADDR) {
 531                 down_read(&mm->mmap_sem);
 532                 vma = find_vma_intersection(mm, addr, addr+1);
 533                 if (!vma) {
 534                         up_read(&mm->mmap_sem);
 535                         return -EFAULT;
 536                 }
 537                 if (vma->vm_ops && vma->vm_ops->get_policy)
 538                         pol = vma->vm_ops->get_policy(vma, addr);
 539                 else
 540                         pol = vma->vm_policy;
 541         } else if (addr)
 542                 return -EINVAL;
 543
 544         if (!pol)
 545                 pol = &default_policy;
 546
 547         if (flags & MPOL_F_NODE) {
 548                 if (flags & MPOL_F_ADDR) {
 549                         err = lookup_node(mm, addr);
 550                         if (err < 0)
 551                                 goto out;
 552                         *policy = err;
 553                 } else if (pol == current->mempolicy &&
 554                                 pol->policy == MPOL_INTERLEAVE) {
 555                         *policy = current->il_next;
 556                 } else {
 557                         err = -EINVAL;
 558                         goto out;
 559                 }
 560         } else
 561                 *policy = pol->policy;
 562
 563         if (vma) {
 564                 up_read(&current->mm->mmap_sem);
 565                 vma = NULL;
 566         }
 567
 568         err = 0;
 569         if (nmask)
 570                 get_zonemask(pol, nmask);
 571
 572  out:
 573         if (vma)
 574                 up_read(&current->mm->mmap_sem);
 575         return err;
 576 }
 577
 578 #ifdef CONFIG_MIGRATION
 579 /*
 580  * page migration
 581  */
 582 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 583                                 unsigned long flags)
 584 {
 585         /*
 586          * Avoid migrating a page that is shared with others.
 587          */
 588         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
 589                 isolate_lru_page(page, pagelist);
 590 }
 591
 592 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 593 {
 594         return alloc_pages_node(node, GFP_HIGHUSER, 0);
 595 }
 596
 597 /*
 598  * Migrate pages from one node to a target node.
 599  * Returns error or the number of pages not migrated.
 600  */
 601 int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
 602 {
 603         nodemask_t nmask;
 604         LIST_HEAD(pagelist);
 605         int err = 0;
 606
 607         nodes_clear(nmask);
 608         node_set(source, nmask);
 609
 610         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 611                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 612
 613         if (!list_empty(&pagelist))
 614                 err = migrate_pages(&pagelist, new_node_page, dest);
 615
 616         return err;
 617 }
 618
 619 /*
 620  * Move pages between the two nodesets so as to preserve the physical
 621  * layout as much as possible.
 622  *
 623  * Returns the number of page that could not be moved.
 624  */
 625 int do_migrate_pages(struct mm_struct *mm,
 626         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 627 {
 628         LIST_HEAD(pagelist);
 629         int busy = 0;
 630         int err = 0;
 631         nodemask_t tmp;
 632
 633         down_read(&mm->mmap_sem);
 634
 635         err = migrate_vmas(mm, from_nodes, to_nodes, flags);
 636         if (err)
 637                 goto out;
 638
 639 /*
 640  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 641  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 642  * bit in 'tmp', and return that <source, dest> pair for migration.
 643  * The pair of nodemasks 'to' and 'from' define the map.
 644  *
 645  * If no pair of bits is found that way, fallback to picking some
 646  * pair of 'source' and 'dest' bits that are not the same.  If the
 647  * 'source' and 'dest' bits are the same, this represents a node
 648  * that will be migrating to itself, so no pages need move.
 649  *
 650  * If no bits are left in 'tmp', or if all remaining bits left
 651  * in 'tmp' correspond to the same bit in 'to', return false
 652  * (nothing left to migrate).
 653  *
 654  * This lets us pick a pair of nodes to migrate between, such that
 655  * if possible the dest node is not already occupied by some other
 656  * source node, minimizing the risk of overloading the memory on a
 657  * node that would happen if we migrated incoming memory to a node
 658  * before migrating outgoing memory source that same node.
 659  *
 660  * A single scan of tmp is sufficient.  As we go, we remember the
 661  * most recent <s, d> pair that moved (s != d).  If we find a pair
 662  * that not only moved, but what's better, moved to an empty slot
 663  * (d is not set in tmp), then we break out then, with that pair.
 664  * Otherwise when we finish scannng from_tmp, we at least have the
 665  * most recent <s, d> pair that moved.  If we get all the way through
 666  * the scan of tmp without finding any node that moved, much less
 667  * moved to an empty node, then there is nothing left worth migrating.
 668  */
 669
 670         tmp = *from_nodes;
 671         while (!nodes_empty(tmp)) {
 672                 int s,d;
 673                 int source = -1;
 674                 int dest = 0;
 675
 676                 for_each_node_mask(s, tmp) {
 677                         d = node_remap(s, *from_nodes, *to_nodes);
 678                         if (s == d)
 679                                 continue;
 680
 681                         source = s;     /* Node moved. Memorize */
 682                         dest = d;
 683
 684                         /* dest not in remaining from nodes? */
 685                         if (!node_isset(dest, tmp))
 686                                 break;
 687                 }
 688                 if (source == -1)
 689                         break;
 690
 691                 node_clear(source, tmp);
 692                 err = migrate_to_node(mm, source, dest, flags);
 693                 if (err > 0)
 694                         busy += err;
 695                 if (err < 0)
 696                         break;
 697         }
 698 out:
 699         up_read(&mm->mmap_sem);
 700         if (err < 0)
 701                 return err;
 702         return busy;
 703
 704 }
 705
 706 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 707 {
 708         struct vm_area_struct *vma = (struct vm_area_struct *)private;
 709
 710         return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma));
 711 }
 712 #else
 713
 714 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 715                                 unsigned long flags)
 716 {
 717 }
 718
 719 int do_migrate_pages(struct mm_struct *mm,
 720         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 721 {
 722         return -ENOSYS;
 723 }
 724
 725 static struct page *new_vma_page(struct page *page, unsigned long private)
 726 {
 727         return NULL;
 728 }
 729 #endif
 730
 731 long do_mbind(unsigned long start, unsigned long len,
 732                 unsigned long mode, nodemask_t *nmask, unsigned long flags)
 733 {
 734         struct vm_area_struct *vma;
 735         struct mm_struct *mm = current->mm;
 736         struct mempolicy *new;
 737         unsigned long end;
 738         int err;
 739         LIST_HEAD(pagelist);
 740
 741         if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
 742                                       MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 743             || mode > MPOL_MAX)
 744                 return -EINVAL;
 745         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 746                 return -EPERM;
 747
 748         if (start & ~PAGE_MASK)
 749                 return -EINVAL;
 750
 751         if (mode == MPOL_DEFAULT)
 752                 flags &= ~MPOL_MF_STRICT;
 753
 754         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 755         end = start + len;
 756
 757         if (end < start)
 758                 return -EINVAL;
 759         if (end == start)
 760                 return 0;
 761
 762         if (mpol_check_policy(mode, nmask))
 763                 return -EINVAL;
 764
 765         new = mpol_new(mode, nmask);
 766         if (IS_ERR(new))
 767                 return PTR_ERR(new);
 768
 769         /*
 770          * If we are using the default policy then operation
 771          * on discontinuous address spaces is okay after all
 772          */
 773         if (!new)
 774                 flags |= MPOL_MF_DISCONTIG_OK;
 775
 776         PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 777                         mode,nodes_addr(nodes)[0]);
 778
 779         down_write(&mm->mmap_sem);
 780         vma = check_range(mm, start, end, nmask,
 781                           flags | MPOL_MF_INVERT, &pagelist);
 782
 783         err = PTR_ERR(vma);
 784         if (!IS_ERR(vma)) {
 785                 int nr_failed = 0;
 786
 787                 err = mbind_range(vma, start, end, new);
 788
 789                 if (!list_empty(&pagelist))
 790                         nr_failed = migrate_pages(&pagelist, new_vma_page,
 791                                                 (unsigned long)vma);
 792
 793                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 794                         err = -EIO;
 795         }
 796
 797         up_write(&mm->mmap_sem);
 798         mpol_free(new);
 799         return err;
 800 }
 801
 802 /*
 803  * User space interface with variable sized bitmaps for nodelists.
 804  */
 805
 806 /* Copy a node mask from user space. */
 807 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 808                      unsigned long maxnode)
 809 {
 810         unsigned long k;
 811         unsigned long nlongs;
 812         unsigned long endmask;
 813
 814         --maxnode;
 815         nodes_clear(*nodes);
 816         if (maxnode == 0 || !nmask)
 817                 return 0;
 818         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
 819                 return -EINVAL;
 820
 821         nlongs = BITS_TO_LONGS(maxnode);
 822         if ((maxnode % BITS_PER_LONG) == 0)
 823                 endmask = ~0UL;
 824         else
 825                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 826
 827         /* When the user specified more nodes than supported just check
 828            if the non supported part is all zero. */
 829         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 830                 if (nlongs > PAGE_SIZE/sizeof(long))
 831                         return -EINVAL;
 832                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 833                         unsigned long t;
 834                         if (get_user(t, nmask + k))
 835                                 return -EFAULT;
 836                         if (k == nlongs - 1) {
 837                                 if (t & endmask)
 838                                         return -EINVAL;
 839                         } else if (t)
 840                                 return -EINVAL;
 841                 }
 842                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 843                 endmask = ~0UL;
 844         }
 845
 846         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 847                 return -EFAULT;
 848         nodes_addr(*nodes)[nlongs-1] &= endmask;
 849         return 0;
 850 }
 851
 852 /* Copy a kernel node mask to user space */
 853 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 854                               nodemask_t *nodes)
 855 {
 856         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 857         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 858
 859         if (copy > nbytes) {
 860                 if (copy > PAGE_SIZE)
 861                         return -EINVAL;
 862                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 863                         return -EFAULT;
 864                 copy = nbytes;
 865         }
 866         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 867 }
 868
 869 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 870                         unsigned long mode,
 871                         unsigned long __user *nmask, unsigned long maxnode,
 872                         unsigned flags)
 873 {
 874         nodemask_t nodes;
 875         int err;
 876
 877         err = get_nodes(&nodes, nmask, maxnode);
 878         if (err)
 879                 return err;
 880         return do_mbind(start, len, mode, &nodes, flags);
 881 }
 882
 883 /* Set the process memory policy */
 884 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 885                 unsigned long maxnode)
 886 {
 887         int err;
 888         nodemask_t nodes;
 889
 890         if (mode < 0 || mode > MPOL_MAX)
 891                 return -EINVAL;
 892         err = get_nodes(&nodes, nmask, maxnode);
 893         if (err)
 894                 return err;
 895         return do_set_mempolicy(mode, &nodes);
 896 }
 897
 898 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 899                 const unsigned long __user *old_nodes,
 900                 const unsigned long __user *new_nodes)
 901 {
 902         struct mm_struct *mm;
 903         struct task_struct *task;
 904         nodemask_t old;
 905         nodemask_t new;
 906         nodemask_t task_nodes;
 907         int err;
 908
 909         err = get_nodes(&old, old_nodes, maxnode);
 910         if (err)
 911                 return err;
 912
 913         err = get_nodes(&new, new_nodes, maxnode);
 914         if (err)
 915                 return err;
 916
 917         /* Find the mm_struct */
 918         read_lock(&tasklist_lock);
 919         task = pid ? find_task_by_pid(pid) : current;
 920         if (!task) {
 921                 read_unlock(&tasklist_lock);
 922                 return -ESRCH;
 923         }
 924         mm = get_task_mm(task);
 925         read_unlock(&tasklist_lock);
 926
 927         if (!mm)
 928                 return -EINVAL;
 929
 930         /*
 931          * Check if this process has the right to modify the specified
 932          * process. The right exists if the process has administrative
 933          * capabilities, superuser privileges or the same
 934          * userid as the target process.
 935          */
 936         if ((current->euid != task->suid) && (current->euid != task->uid) &&
 937             (current->uid != task->suid) && (current->uid != task->uid) &&
 938             !capable(CAP_SYS_NICE)) {
 939                 err = -EPERM;
 940                 goto out;
 941         }
 942
 943         task_nodes = cpuset_mems_allowed(task);
 944         /* Is the user allowed to access the target nodes? */
 945         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
 946                 err = -EPERM;
 947                 goto out;
 948         }
 949
 950         err = security_task_movememory(task);
 951         if (err)
 952                 goto out;
 953
 954         err = do_migrate_pages(mm, &old, &new,
 955                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
 956 out:
 957         mmput(mm);
 958         return err;
 959 }
 960
 961
 962 /* Retrieve NUMA policy */
 963 asmlinkage long sys_get_mempolicy(int __user *policy,
 964                                 unsigned long __user *nmask,
 965                                 unsigned long maxnode,
 966                                 unsigned long addr, unsigned long flags)
 967 {
 968         int err, pval;
 969         nodemask_t nodes;
 970
 971         if (nmask != NULL && maxnode < MAX_NUMNODES)
 972                 return -EINVAL;
 973
 974         err = do_get_mempolicy(&pval, &nodes, addr, flags);
 975
 976         if (err)
 977                 return err;
 978
 979         if (policy && put_user(pval, policy))
 980                 return -EFAULT;
 981
 982         if (nmask)
 983                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
 984
 985         return err;
 986 }
 987
 988 #ifdef CONFIG_COMPAT
 989
 990 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
 991                                      compat_ulong_t __user *nmask,
 992                                      compat_ulong_t maxnode,
 993                                      compat_ulong_t addr, compat_ulong_t flags)
 994 {
 995         long err;
 996         unsigned long __user *nm = NULL;
 997         unsigned long nr_bits, alloc_size;
 998         DECLARE_BITMAP(bm, MAX_NUMNODES);
 999
1000         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1001         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1002
1003         if (nmask)
1004                 nm = compat_alloc_user_space(alloc_size);
1005
1006         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1007
1008         if (!err && nmask) {
1009                 err = copy_from_user(bm, nm, alloc_size);
1010                 /* ensure entire bitmap is zeroed */
1011                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1012                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1013         }
1014
1015         return err;
1016 }
1017
1018 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1019                                      compat_ulong_t maxnode)
1020 {
1021         long err = 0;
1022         unsigned long __user *nm = NULL;
1023         unsigned long nr_bits, alloc_size;
1024         DECLARE_BITMAP(bm, MAX_NUMNODES);
1025
1026         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1027         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1028
1029         if (nmask) {
1030                 err = compat_get_bitmap(bm, nmask, nr_bits);
1031                 nm = compat_alloc_user_space(alloc_size);
1032                 err |= copy_to_user(nm, bm, alloc_size);
1033         }
1034
1035         if (err)
1036                 return -EFAULT;
1037
1038         return sys_set_mempolicy(mode, nm, nr_bits+1);
1039 }
1040
1041 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1042                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1043                              compat_ulong_t maxnode, compat_ulong_t flags)
1044 {
1045         long err = 0;
1046         unsigned long __user *nm = NULL;
1047         unsigned long nr_bits, alloc_size;
1048         nodemask_t bm;
1049
1050         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1051         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1052
1053         if (nmask) {
1054                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1055                 nm = compat_alloc_user_space(alloc_size);
1056                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1057         }
1058
1059         if (err)
1060                 return -EFAULT;
1061
1062         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1063 }
1064
1065 #endif
1066
1067 /* Return effective policy for a VMA */
1068 static struct mempolicy * get_vma_policy(struct task_struct *task,
1069                 struct vm_area_struct *vma, unsigned long addr)
1070 {
1071         struct mempolicy *pol = task->mempolicy;
1072
1073         if (vma) {
1074                 if (vma->vm_ops && vma->vm_ops->get_policy)
1075                         pol = vma->vm_ops->get_policy(vma, addr);
1076                 else if (vma->vm_policy &&
1077                                 vma->vm_policy->policy != MPOL_DEFAULT)
1078                         pol = vma->vm_policy;
1079         }
1080         if (!pol)
1081                 pol = &default_policy;
1082         return pol;
1083 }
1084
1085 /* Return a zonelist representing a mempolicy */
1086 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1087 {
1088         int nd;
1089
1090         switch (policy->policy) {
1091         case MPOL_PREFERRED:
1092                 nd = policy->v.preferred_node;
1093                 if (nd < 0)
1094                         nd = numa_node_id();
1095                 break;
1096         case MPOL_BIND:
1097                 /* Lower zones don't get a policy applied */
1098                 /* Careful: current->mems_allowed might have moved */
1099                 if (gfp_zone(gfp) >= policy_zone)
1100                         if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1101                                 return policy->v.zonelist;
1102                 /*FALL THROUGH*/
1103         case MPOL_INTERLEAVE: /* should not happen */
1104         case MPOL_DEFAULT:
1105                 nd = numa_node_id();
1106                 break;
1107         default:
1108                 nd = 0;
1109                 BUG();
1110         }
1111         return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1112 }
1113
1114 /* Do dynamic interleaving for a process */
1115 static unsigned interleave_nodes(struct mempolicy *policy)
1116 {
1117         unsigned nid, next;
1118         struct task_struct *me = current;
1119
1120         nid = me->il_next;
1121         next = next_node(nid, policy->v.nodes);
1122         if (next >= MAX_NUMNODES)
1123                 next = first_node(policy->v.nodes);
1124         me->il_next = next;
1125         return nid;
1126 }
1127
1128 /*
1129  * Depending on the memory policy provide a node from which to allocate the
1130  * next slab entry.
1131  */
1132 unsigned slab_node(struct mempolicy *policy)
1133 {
1134         switch (policy->policy) {
1135         case MPOL_INTERLEAVE:
1136                 return interleave_nodes(policy);
1137
1138         case MPOL_BIND:
1139                 /*
1140                  * Follow bind policy behavior and start allocation at the
1141                  * first node.
1142                  */
1143                 return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
1144
1145         case MPOL_PREFERRED:
1146                 if (policy->v.preferred_node >= 0)
1147                         return policy->v.preferred_node;
1148                 /* Fall through */
1149
1150         default:
1151                 return numa_node_id();
1152         }
1153 }
1154
1155 /* Do static interleaving for a VMA with known offset. */
1156 static unsigned offset_il_node(struct mempolicy *pol,
1157                 struct vm_area_struct *vma, unsigned long off)
1158 {
1159         unsigned nnodes = nodes_weight(pol->v.nodes);
1160         unsigned target = (unsigned)off % nnodes;
1161         int c;
1162         int nid = -1;
1163
1164         c = 0;
1165         do {
1166                 nid = next_node(nid, pol->v.nodes);
1167                 c++;
1168         } while (c <= target);
1169         return nid;
1170 }
1171
1172 /* Determine a node number for interleave */
1173 static inline unsigned interleave_nid(struct mempolicy *pol,
1174                  struct vm_area_struct *vma, unsigned long addr, int shift)
1175 {
1176         if (vma) {
1177                 unsigned long off;
1178
1179                 /*
1180                  * for small pages, there is no difference between
1181                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1182                  * for huge pages, since vm_pgoff is in units of small
1183                  * pages, we need to shift off the always 0 bits to get
1184                  * a useful offset.
1185                  */
1186                 BUG_ON(shift < PAGE_SHIFT);
1187                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1188                 off += (addr - vma->vm_start) >> shift;
1189                 return offset_il_node(pol, vma, off);
1190         } else
1191                 return interleave_nodes(pol);
1192 }
1193
1194 #ifdef CONFIG_HUGETLBFS
1195 /* Return a zonelist suitable for a huge page allocation. */
1196 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1197 {
1198         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1199
1200         if (pol->policy == MPOL_INTERLEAVE) {
1201                 unsigned nid;
1202
1203                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1204                 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1205         }
1206         return zonelist_policy(GFP_HIGHUSER, pol);
1207 }
1208 #endif
1209
1210 /* Allocate a page in interleaved policy.
1211    Own path because it needs to do special accounting. */
1212 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1213                                         unsigned nid)
1214 {
1215         struct zonelist *zl;
1216         struct page *page;
1217
1218         zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1219         page = __alloc_pages(gfp, order, zl);
1220         if (page && page_zone(page) == zl->zones[0])
1221                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1222         return page;
1223 }
1224
1225 /**
1226  *      alloc_page_vma  - Allocate a page for a VMA.
1227  *
1228  *      @gfp:
1229  *      %GFP_USER    user allocation.
1230  *      %GFP_KERNEL  kernel allocations,
1231  *      %GFP_HIGHMEM highmem/user allocations,
1232  *      %GFP_FS      allocation should not call back into a file system.
1233  *      %GFP_ATOMIC  don't sleep.
1234  *
1235  *      @vma:  Pointer to VMA or NULL if not available.
1236  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1237  *
1238  *      This function allocates a page from the kernel page pool and applies
1239  *      a NUMA policy associated with the VMA or the current process.
1240  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1241  *      mm_struct of the VMA to prevent it from going away. Should be used for
1242  *      all allocations for pages that will be mapped into
1243  *      user space. Returns NULL when no page can be allocated.
1244  *
1245  *      Should be called with the mm_sem of the vma hold.
1246  */
1247 struct page *
1248 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1249 {
1250         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1251
1252         cpuset_update_task_memory_state();
1253
1254         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1255                 unsigned nid;
1256
1257                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1258                 return alloc_page_interleave(gfp, 0, nid);
1259         }
1260         return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1261 }
1262
1263 /**
1264  *      alloc_pages_current - Allocate pages.
1265  *
1266  *      @gfp:
1267  *              %GFP_USER   user allocation,
1268  *              %GFP_KERNEL kernel allocation,
1269  *              %GFP_HIGHMEM highmem allocation,
1270  *              %GFP_FS     don't call back into a file system.
1271  *              %GFP_ATOMIC don't sleep.
1272  *      @order: Power of two of allocation size in pages. 0 is a single page.
1273  *
1274  *      Allocate a page from the kernel page pool.  When not in
1275  *      interrupt context and apply the current process NUMA policy.
1276  *      Returns NULL when no page can be allocated.
1277  *
1278  *      Don't call cpuset_update_task_memory_state() unless
1279  *      1) it's ok to take cpuset_sem (can WAIT), and
1280  *      2) allocating for current task (not interrupt).
1281  */
1282 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1283 {
1284         struct mempolicy *pol = current->mempolicy;
1285
1286         if ((gfp & __GFP_WAIT) && !in_interrupt())
1287                 cpuset_update_task_memory_state();
1288         if (!pol || in_interrupt())
1289                 pol = &default_policy;
1290         if (pol->policy == MPOL_INTERLEAVE)
1291                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1292         return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1293 }
1294 EXPORT_SYMBOL(alloc_pages_current);
1295
/*
 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset-relative after their cpuset moves.  See
 * also update_nodemask() in kernel/cpuset.c.
 */
1303 void *cpuset_being_rebound;
1304
1305 /* Slow path of a mempolicy copy */
1306 struct mempolicy *__mpol_copy(struct mempolicy *old)
1307 {
1308         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1309
1310         if (!new)
1311                 return ERR_PTR(-ENOMEM);
1312         if (current_cpuset_is_being_rebound()) {
1313                 nodemask_t mems = cpuset_mems_allowed(current);
1314                 mpol_rebind_policy(old, &mems);
1315         }
1316         *new = *old;
1317         atomic_set(&new->refcnt, 1);
1318         if (new->policy == MPOL_BIND) {
1319                 int sz = ksize(old->v.zonelist);
1320                 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
1321                 if (!new->v.zonelist) {
1322                         kmem_cache_free(policy_cache, new);
1323                         return ERR_PTR(-ENOMEM);
1324                 }
1325                 memcpy(new->v.zonelist, old->v.zonelist, sz);
1326         }
1327         return new;
1328 }
1329
1330 /* Slow path of a mempolicy comparison */
1331 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1332 {
1333         if (!a || !b)
1334                 return 0;
1335         if (a->policy != b->policy)
1336                 return 0;
1337         switch (a->policy) {
1338         case MPOL_DEFAULT:
1339                 return 1;
1340         case MPOL_INTERLEAVE:
1341                 return nodes_equal(a->v.nodes, b->v.nodes);
1342         case MPOL_PREFERRED:
1343                 return a->v.preferred_node == b->v.preferred_node;
1344         case MPOL_BIND: {
1345                 int i;
1346                 for (i = 0; a->v.zonelist->zones[i]; i++)
1347                         if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1348                                 return 0;
1349                 return b->v.zonelist->zones[i] == NULL;
1350         }
1351         default:
1352                 BUG();
1353                 return 0;
1354         }
1355 }
1356
1357 /* Slow path of a mpol destructor. */
1358 void __mpol_free(struct mempolicy *p)
1359 {
1360         if (!atomic_dec_and_test(&p->refcnt))
1361                 return;
1362         if (p->policy == MPOL_BIND)
1363                 kfree(p->v.zonelist);
1364         p->policy = MPOL_DEFAULT;
1365         kmem_cache_free(policy_cache, p);
1366 }
1367
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a red-black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which must be held
 * for any access to the tree.
 */
1376
1377 /* lookup first element intersecting start-end */
1378 /* Caller holds sp->lock */
1379 static struct sp_node *
1380 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1381 {
1382         struct rb_node *n = sp->root.rb_node;
1383
1384         while (n) {
1385                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1386
1387                 if (start >= p->end)
1388                         n = n->rb_right;
1389                 else if (end <= p->start)
1390                         n = n->rb_left;
1391                 else
1392                         break;
1393         }
1394         if (!n)
1395                 return NULL;
1396         for (;;) {
1397                 struct sp_node *w = NULL;
1398                 struct rb_node *prev = rb_prev(n);
1399                 if (!prev)
1400                         break;
1401                 w = rb_entry(prev, struct sp_node, nd);
1402                 if (w->end <= start)
1403                         break;
1404                 n = prev;
1405         }
1406         return rb_entry(n, struct sp_node, nd);
1407 }
1408
1409 /* Insert a new shared policy into the list. */
1410 /* Caller holds sp->lock */
1411 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1412 {
1413         struct rb_node **p = &sp->root.rb_node;
1414         struct rb_node *parent = NULL;
1415         struct sp_node *nd;
1416
1417         while (*p) {
1418                 parent = *p;
1419                 nd = rb_entry(parent, struct sp_node, nd);
1420                 if (new->start < nd->start)
1421                         p = &(*p)->rb_left;
1422                 else if (new->end > nd->end)
1423                         p = &(*p)->rb_right;
1424                 else
1425                         BUG();
1426         }
1427         rb_link_node(&new->nd, parent, p);
1428         rb_insert_color(&new->nd, &sp->root);
1429         PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1430                  new->policy ? new->policy->policy : 0);
1431 }
1432
1433 /* Find shared policy intersecting idx */
1434 struct mempolicy *
1435 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1436 {
1437         struct mempolicy *pol = NULL;
1438         struct sp_node *sn;
1439
1440         if (!sp->root.rb_node)
1441                 return NULL;
1442         spin_lock(&sp->lock);
1443         sn = sp_lookup(sp, idx, idx+1);
1444         if (sn) {
1445                 mpol_get(sn->policy);
1446                 pol = sn->policy;
1447         }
1448         spin_unlock(&sp->lock);
1449         return pol;
1450 }
1451
1452 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1453 {
1454         PDprintk("deleting %lx-l%x\n", n->start, n->end);
1455         rb_erase(&n->nd, &sp->root);
1456         mpol_free(n->policy);
1457         kmem_cache_free(sn_cache, n);
1458 }
1459
1460 struct sp_node *
1461 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1462 {
1463         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1464
1465         if (!n)
1466                 return NULL;
1467         n->start = start;
1468         n->end = end;
1469         mpol_get(pol);
1470         n->policy = pol;
1471         return n;
1472 }
1473
1474 /* Replace a policy range. */
1475 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1476                                  unsigned long end, struct sp_node *new)
1477 {
1478         struct sp_node *n, *new2 = NULL;
1479
1480 restart:
1481         spin_lock(&sp->lock);
1482         n = sp_lookup(sp, start, end);
1483         /* Take care of old policies in the same range. */
1484         while (n && n->start < end) {
1485                 struct rb_node *next = rb_next(&n->nd);
1486                 if (n->start >= start) {
1487                         if (n->end <= end)
1488                                 sp_delete(sp, n);
1489                         else
1490                                 n->start = end;
1491                 } else {
1492                         /* Old policy spanning whole new range. */
1493                         if (n->end > end) {
1494                                 if (!new2) {
1495                                         spin_unlock(&sp->lock);
1496                                         new2 = sp_alloc(end, n->end, n->policy);
1497                                         if (!new2)
1498                                                 return -ENOMEM;
1499                                         goto restart;
1500                                 }
1501                                 n->end = start;
1502                                 sp_insert(sp, new2);
1503                                 new2 = NULL;
1504                                 break;
1505                         } else
1506                                 n->end = start;
1507                 }
1508                 if (!next)
1509                         break;
1510                 n = rb_entry(next, struct sp_node, nd);
1511         }
1512         if (new)
1513                 sp_insert(sp, new);
1514         spin_unlock(&sp->lock);
1515         if (new2) {
1516                 mpol_free(new2->policy);
1517                 kmem_cache_free(sn_cache, new2);
1518         }
1519         return 0;
1520 }
1521
1522 void mpol_shared_policy_init(struct shared_policy *info, int policy,
1523                                 nodemask_t *policy_nodes)
1524 {
1525         info->root = RB_ROOT;
1526         spin_lock_init(&info->lock);
1527
1528         if (policy != MPOL_DEFAULT) {
1529                 struct mempolicy *newpol;
1530
1531                 /* Falls back to MPOL_DEFAULT on any error */
1532                 newpol = mpol_new(policy, policy_nodes);
1533                 if (!IS_ERR(newpol)) {
1534                         /* Create pseudo-vma that contains just the policy */
1535                         struct vm_area_struct pvma;
1536
1537                         memset(&pvma, 0, sizeof(struct vm_area_struct));
1538                         /* Policy covers entire file */
1539                         pvma.vm_end = TASK_SIZE;
1540                         mpol_set_shared_policy(info, &pvma, newpol);
1541                         mpol_free(newpol);
1542                 }
1543         }
1544 }
1545
1546 int mpol_set_shared_policy(struct shared_policy *info,
1547                         struct vm_area_struct *vma, struct mempolicy *npol)
1548 {
1549         int err;
1550         struct sp_node *new = NULL;
1551         unsigned long sz = vma_pages(vma);
1552
1553         PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1554                  vma->vm_pgoff,
1555                  sz, npol? npol->policy : -1,
1556                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1557
1558         if (npol) {
1559                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1560                 if (!new)
1561                         return -ENOMEM;
1562         }
1563         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1564         if (err && new)
1565                 kmem_cache_free(sn_cache, new);
1566         return err;
1567 }
1568
1569 /* Free a backing policy store on inode delete. */
1570 void mpol_free_shared_policy(struct shared_policy *p)
1571 {
1572         struct sp_node *n;
1573         struct rb_node *next;
1574
1575         if (!p->root.rb_node)
1576                 return;
1577         spin_lock(&p->lock);
1578         next = rb_first(&p->root);
1579         while (next) {
1580                 n = rb_entry(next, struct sp_node, nd);
1581                 next = rb_next(&n->nd);
1582                 rb_erase(&n->nd, &p->root);
1583                 mpol_free(n->policy);
1584                 kmem_cache_free(sn_cache, n);
1585         }
1586         spin_unlock(&p->lock);
1587 }
1588
1589 /* assumes fs == KERNEL_DS */
1590 void __init numa_policy_init(void)
1591 {
1592         policy_cache = kmem_cache_create("numa_policy",
1593                                          sizeof(struct mempolicy),
1594                                          0, SLAB_PANIC, NULL, NULL);
1595
1596         sn_cache = kmem_cache_create("shared_policy_node",
1597                                      sizeof(struct sp_node),
1598                                      0, SLAB_PANIC, NULL, NULL);
1599
1600         /* Set interleaving policy for system init. This way not all
1601            the data structures allocated at system boot end up in node zero. */
1602
1603         if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1604                 printk("numa_policy_init: interleaving failed\n");
1605 }
1606
1607 /* Reset policy of current process to default */
1608 void numa_default_policy(void)
1609 {
1610         do_set_mempolicy(MPOL_DEFAULT, NULL);
1611 }
1612
1613 /* Migrate a policy to a different set of nodes */
1614 void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1615 {
1616         nodemask_t *mpolmask;
1617         nodemask_t tmp;
1618
1619         if (!pol)
1620                 return;
1621         mpolmask = &pol->cpuset_mems_allowed;
1622         if (nodes_equal(*mpolmask, *newmask))
1623                 return;
1624
1625         switch (pol->policy) {
1626         case MPOL_DEFAULT:
1627                 break;
1628         case MPOL_INTERLEAVE:
1629                 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1630                 pol->v.nodes = tmp;
1631                 *mpolmask = *newmask;
1632                 current->il_next = node_remap(current->il_next,
1633                                                 *mpolmask, *newmask);
1634                 break;
1635         case MPOL_PREFERRED:
1636                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1637                                                 *mpolmask, *newmask);
1638                 *mpolmask = *newmask;
1639                 break;
1640         case MPOL_BIND: {
1641                 nodemask_t nodes;
1642                 struct zone **z;
1643                 struct zonelist *zonelist;
1644
1645                 nodes_clear(nodes);
1646                 for (z = pol->v.zonelist->zones; *z; z++)
1647                         node_set((*z)->zone_pgdat->node_id, nodes);
1648                 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1649                 nodes = tmp;
1650
1651                 zonelist = bind_zonelist(&nodes);
1652
1653                 /* If no mem, then zonelist is NULL and we keep old zonelist.
1654                  * If that old zonelist has no remaining mems_allowed nodes,
1655                  * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1656                  */
1657
1658                 if (zonelist) {
1659                         /* Good - got mem - substitute new zonelist */
1660                         kfree(pol->v.zonelist);
1661                         pol->v.zonelist = zonelist;
1662                 }
1663                 *mpolmask = *newmask;
1664                 break;
1665         }
1666         default:
1667                 BUG();
1668                 break;
1669         }
1670 }
1671
1672 /*
1673  * Wrapper for mpol_rebind_policy() that just requires task
1674  * pointer, and updates task mempolicy.
1675  */
1676
1677 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1678 {
1679         mpol_rebind_policy(tsk->mempolicy, new);
1680 }
1681
1682 /*
1683  * Rebind each vma in mm to new nodemask.
1684  *
1685  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1686  */
1687
1688 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1689 {
1690         struct vm_area_struct *vma;
1691
1692         down_write(&mm->mmap_sem);
1693         for (vma = mm->mmap; vma; vma = vma->vm_next)
1694                 mpol_rebind_policy(vma->vm_policy, new);
1695         up_write(&mm->mmap_sem);
1696 }
1697
/*
 * Display pages allocated per node and memory policy via /proc.
 *
 * Printable policy names.  mpol_to_str() indexes this table directly
 * with the policy mode, so the order of the entries must follow the
 * numeric values of the policy modes.
 */
static const char *policy_types[] = {
        "default",
        "prefer",
        "bind",
        "interleave"
};
1704
1705 /*
1706  * Convert a mempolicy into a string.
1707  * Returns the number of characters in buffer (if positive)
1708  * or an error (negative)
1709  */
1710 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1711 {
1712         char *p = buffer;
1713         int l;
1714         nodemask_t nodes;
1715         int mode = pol ? pol->policy : MPOL_DEFAULT;
1716
1717         switch (mode) {
1718         case MPOL_DEFAULT:
1719                 nodes_clear(nodes);
1720                 break;
1721
1722         case MPOL_PREFERRED:
1723                 nodes_clear(nodes);
1724                 node_set(pol->v.preferred_node, nodes);
1725                 break;
1726
1727         case MPOL_BIND:
1728                 get_zonemask(pol, &nodes);
1729                 break;
1730
1731         case MPOL_INTERLEAVE:
1732                 nodes = pol->v.nodes;
1733                 break;
1734
1735         default:
1736                 BUG();
1737                 return -EFAULT;
1738         }
1739
1740         l = strlen(policy_types[mode]);
1741         if (buffer + maxlen < p + l + 1)
1742                 return -ENOSPC;
1743
1744         strcpy(p, policy_types[mode]);
1745         p += l;
1746
1747         if (!nodes_empty(nodes)) {
1748                 if (buffer + maxlen < p + 2)
1749                         return -ENOSPC;
1750                 *p++ = '=';
1751                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1752         }
1753         return p - buffer;
1754 }
1755
1756 struct numa_maps {
1757         unsigned long pages;
1758         unsigned long anon;
1759         unsigned long active;
1760         unsigned long writeback;
1761         unsigned long mapcount_max;
1762         unsigned long dirty;
1763         unsigned long swapcache;
1764         unsigned long node[MAX_NUMNODES];
1765 };
1766
1767 static void gather_stats(struct page *page, void *private, int pte_dirty)
1768 {
1769         struct numa_maps *md = private;
1770         int count = page_mapcount(page);
1771
1772         md->pages++;
1773         if (pte_dirty || PageDirty(page))
1774                 md->dirty++;
1775
1776         if (PageSwapCache(page))
1777                 md->swapcache++;
1778
1779         if (PageActive(page))
1780                 md->active++;
1781
1782         if (PageWriteback(page))
1783                 md->writeback++;
1784
1785         if (PageAnon(page))
1786                 md->anon++;
1787
1788         if (count > md->mapcount_max)
1789                 md->mapcount_max = count;
1790
1791         md->node[page_to_nid(page)]++;
1792 }
1793
#ifdef CONFIG_HUGETLB_PAGE
/*
 * Gather statistics for the huge pages mapped in [start, end) of vma.
 *
 * The range is walked in HPAGE_SIZE steps; every populated huge pte
 * has its page passed to gather_stats() together with the pte's dirty
 * state.  Addresses without a huge pte, or with an empty one, are
 * skipped.
 */
static void check_huge_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct numa_maps *md)
{
        unsigned long va;

        for (va = start; va < end; va += HPAGE_SIZE) {
                pte_t *ptep = huge_pte_offset(vma->vm_mm, va & HPAGE_MASK);
                struct page *hpage;
                pte_t entry;

                /* No huge pte at this address. */
                if (!ptep)
                        continue;

                entry = *ptep;
                if (pte_none(entry))
                        continue;

                hpage = pte_page(entry);
                if (hpage)
                        gather_stats(hpage, md, pte_dirty(*ptep));
        }
}
#else
/* Without CONFIG_HUGETLB_PAGE there is nothing to gather. */
static inline void check_huge_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct numa_maps *md)
{
}
#endif
1827
1828 int show_numa_map(struct seq_file *m, void *v)
1829 {
1830         struct proc_maps_private *priv = m->private;
1831         struct vm_area_struct *vma = v;
1832         struct numa_maps *md;
1833         struct file *file = vma->vm_file;
1834         struct mm_struct *mm = vma->vm_mm;
1835         int n;
1836         char buffer[50];
1837
1838         if (!mm)
1839                 return 0;
1840
1841         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1842         if (!md)
1843                 return 0;
1844
1845         mpol_to_str(buffer, sizeof(buffer),
1846                             get_vma_policy(priv->task, vma, vma->vm_start));
1847
1848         seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1849
1850         if (file) {
1851                 seq_printf(m, " file=");
1852                 seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
1853         } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1854                 seq_printf(m, " heap");
1855         } else if (vma->vm_start <= mm->start_stack &&
1856                         vma->vm_end >= mm->start_stack) {
1857                 seq_printf(m, " stack");
1858         }
1859
1860         if (is_vm_hugetlb_page(vma)) {
1861                 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
1862                 seq_printf(m, " huge");
1863         } else {
1864                 check_pgd_range(vma, vma->vm_start, vma->vm_end,
1865                                 &node_online_map, MPOL_MF_STATS, md);
1866         }
1867
1868         if (!md->pages)
1869                 goto out;
1870
1871         if (md->anon)
1872                 seq_printf(m," anon=%lu",md->anon);
1873
1874         if (md->dirty)
1875                 seq_printf(m," dirty=%lu",md->dirty);
1876
1877         if (md->pages != md->anon && md->pages != md->dirty)
1878                 seq_printf(m, " mapped=%lu", md->pages);
1879
1880         if (md->mapcount_max > 1)
1881                 seq_printf(m, " mapmax=%lu", md->mapcount_max);
1882
1883         if (md->swapcache)
1884                 seq_printf(m," swapcache=%lu", md->swapcache);
1885
1886         if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1887                 seq_printf(m," active=%lu", md->active);
1888
1889         if (md->writeback)
1890                 seq_printf(m," writeback=%lu", md->writeback);
1891
1892         for_each_online_node(n)
1893                 if (md->node[n])
1894                         seq_printf(m, " N%d=%lu", n, md->node[n]);
1895 out:
1896         seq_putc(m, '\n');
1897         kfree(md);
1898
1899         if (m->count < m->size)
1900                 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
1901         return 0;
1902 }
1903