mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
   19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
   49  * on systems with highmem, kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
   65    kernel does not always cope gracefully with that.
  66    could replace all the switch()es with a mempolicy_ops structure.
  67 */
  68
  69 #include <linux/mempolicy.h>
  70 #include <linux/mm.h>
  71 #include <linux/highmem.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/kernel.h>
  74 #include <linux/sched.h>
  75 #include <linux/mm.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/gfp.h>
  79 #include <linux/slab.h>
  80 #include <linux/string.h>
  81 #include <linux/module.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/mempolicy.h>
  86 #include <linux/swap.h>
  87 #include <linux/seq_file.h>
  88 #include <linux/proc_fs.h>
  89 #include <linux/migrate.h>
  90 #include <linux/rmap.h>
  91 #include <linux/security.h>
  92
  93 #include <asm/tlbflush.h>
  94 #include <asm/uaccess.h>
  95
  96 /* Internal flags */
  97 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
  98 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
  99 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
 100
 101 static struct kmem_cache *policy_cache;
 102 static struct kmem_cache *sn_cache;
 103
 104 #define PDprintk(fmt...)
 105
 106 /* Highest zone. An specific allocation for a zone below that is not
 107    policied. */
 108 int policy_zone = ZONE_DMA;
 109
 110 struct mempolicy default_policy = {
 111         .refcnt = ATOMIC_INIT(1), /* never free it */
 112         .policy = MPOL_DEFAULT,
 113 };
 114
 115 /* Do sanity checking on a policy */
 116 static int mpol_check_policy(int mode, nodemask_t *nodes)
 117 {
 118         int empty = nodes_empty(*nodes);
 119
 120         switch (mode) {
 121         case MPOL_DEFAULT:
 122                 if (!empty)
 123                         return -EINVAL;
 124                 break;
 125         case MPOL_BIND:
 126         case MPOL_INTERLEAVE:
 127                 /* Preferred will only use the first bit, but allow
 128                    more for now. */
 129                 if (empty)
 130                         return -EINVAL;
 131                 break;
 132         }
 133         return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 134 }
 135
 136 /* Generate a custom zonelist for the BIND policy. */
 137 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 138 {
 139         struct zonelist *zl;
 140         int num, max, nd, k;
 141
 142         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 143         zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 144         if (!zl)
 145                 return NULL;
 146         num = 0;
 147         /* First put in the highest zones from all nodes, then all the next
 148            lower zones etc. Avoid empty zones because the memory allocator
 149            doesn't like them. If you implement node hot removal you
 150            have to fix that. */
 151         for (k = policy_zone; k >= 0; k--) {
 152                 for_each_node_mask(nd, *nodes) {
 153                         struct zone *z = &NODE_DATA(nd)->node_zones[k];
 154                         if (z->present_pages > 0)
 155                                 zl->zones[num++] = z;
 156                 }
 157         }
 158         zl->zones[num] = NULL;
 159         return zl;
 160 }
 161
 162 /* Create a new policy */
 163 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 164 {
 165         struct mempolicy *policy;
 166
 167         PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 168         if (mode == MPOL_DEFAULT)
 169                 return NULL;
 170         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 171         if (!policy)
 172                 return ERR_PTR(-ENOMEM);
 173         atomic_set(&policy->refcnt, 1);
 174         switch (mode) {
 175         case MPOL_INTERLEAVE:
 176                 policy->v.nodes = *nodes;
 177                 if (nodes_weight(*nodes) == 0) {
 178                         kmem_cache_free(policy_cache, policy);
 179                         return ERR_PTR(-EINVAL);
 180                 }
 181                 break;
 182         case MPOL_PREFERRED:
 183                 policy->v.preferred_node = first_node(*nodes);
 184                 if (policy->v.preferred_node >= MAX_NUMNODES)
 185                         policy->v.preferred_node = -1;
 186                 break;
 187         case MPOL_BIND:
 188                 policy->v.zonelist = bind_zonelist(nodes);
 189                 if (policy->v.zonelist == NULL) {
 190                         kmem_cache_free(policy_cache, policy);
 191                         return ERR_PTR(-ENOMEM);
 192                 }
 193                 break;
 194         }
 195         policy->policy = mode;
 196         policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 197         return policy;
 198 }
 199
 200 static void gather_stats(struct page *, void *, int pte_dirty);
 201 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 202                                 unsigned long flags);
 203
 204 /* Scan through pages checking if pages follow certain conditions. */
 205 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 206                 unsigned long addr, unsigned long end,
 207                 const nodemask_t *nodes, unsigned long flags,
 208                 void *private)
 209 {
 210         pte_t *orig_pte;
 211         pte_t *pte;
 212         spinlock_t *ptl;
 213
 214         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 215         do {
 216                 struct page *page;
 217                 unsigned int nid;
 218
 219                 if (!pte_present(*pte))
 220                         continue;
 221                 page = vm_normal_page(vma, addr, *pte);
 222                 if (!page)
 223                         continue;
 224                 /*
 225                  * The check for PageReserved here is important to avoid
 226                  * handling zero pages and other pages that may have been
 227                  * marked special by the system.
 228                  *
 229                  * If the PageReserved would not be checked here then f.e.
 230                  * the location of the zero page could have an influence
 231                  * on MPOL_MF_STRICT, zero pages would be counted for
 232                  * the per node stats, and there would be useless attempts
 233                  * to put zero pages on the migration list.
 234                  */
 235                 if (PageReserved(page))
 236                         continue;
 237                 nid = page_to_nid(page);
 238                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 239                         continue;
 240
 241                 if (flags & MPOL_MF_STATS)
 242                         gather_stats(page, private, pte_dirty(*pte));
 243                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 244                         migrate_page_add(page, private, flags);
 245                 else
 246                         break;
 247         } while (pte++, addr += PAGE_SIZE, addr != end);
 248         pte_unmap_unlock(orig_pte, ptl);
 249         return addr != end;
 250 }
 251
 252 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 253                 unsigned long addr, unsigned long end,
 254                 const nodemask_t *nodes, unsigned long flags,
 255                 void *private)
 256 {
 257         pmd_t *pmd;
 258         unsigned long next;
 259
 260         pmd = pmd_offset(pud, addr);
 261         do {
 262                 next = pmd_addr_end(addr, end);
 263                 if (pmd_none_or_clear_bad(pmd))
 264                         continue;
 265                 if (check_pte_range(vma, pmd, addr, next, nodes,
 266                                     flags, private))
 267                         return -EIO;
 268         } while (pmd++, addr = next, addr != end);
 269         return 0;
 270 }
 271
 272 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 273                 unsigned long addr, unsigned long end,
 274                 const nodemask_t *nodes, unsigned long flags,
 275                 void *private)
 276 {
 277         pud_t *pud;
 278         unsigned long next;
 279
 280         pud = pud_offset(pgd, addr);
 281         do {
 282                 next = pud_addr_end(addr, end);
 283                 if (pud_none_or_clear_bad(pud))
 284                         continue;
 285                 if (check_pmd_range(vma, pud, addr, next, nodes,
 286                                     flags, private))
 287                         return -EIO;
 288         } while (pud++, addr = next, addr != end);
 289         return 0;
 290 }
 291
 292 static inline int check_pgd_range(struct vm_area_struct *vma,
 293                 unsigned long addr, unsigned long end,
 294                 const nodemask_t *nodes, unsigned long flags,
 295                 void *private)
 296 {
 297         pgd_t *pgd;
 298         unsigned long next;
 299
 300         pgd = pgd_offset(vma->vm_mm, addr);
 301         do {
 302                 next = pgd_addr_end(addr, end);
 303                 if (pgd_none_or_clear_bad(pgd))
 304                         continue;
 305                 if (check_pud_range(vma, pgd, addr, next, nodes,
 306                                     flags, private))
 307                         return -EIO;
 308         } while (pgd++, addr = next, addr != end);
 309         return 0;
 310 }
 311
 312 /* Check if a vma is migratable */
 313 static inline int vma_migratable(struct vm_area_struct *vma)
 314 {
 315         if (vma->vm_flags & (
 316                 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
 317                 return 0;
 318         return 1;
 319 }
 320
 321 /*
 322  * Check if all pages in a range are on a set of nodes.
 323  * If pagelist != NULL then isolate pages from the LRU and
 324  * put them on the pagelist.
 325  */
 326 static struct vm_area_struct *
 327 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 328                 const nodemask_t *nodes, unsigned long flags, void *private)
 329 {
 330         int err;
 331         struct vm_area_struct *first, *vma, *prev;
 332
 333         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 334
 335                 err = migrate_prep();
 336                 if (err)
 337                         return ERR_PTR(err);
 338         }
 339
 340         first = find_vma(mm, start);
 341         if (!first)
 342                 return ERR_PTR(-EFAULT);
 343         prev = NULL;
 344         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 345                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 346                         if (!vma->vm_next && vma->vm_end < end)
 347                                 return ERR_PTR(-EFAULT);
 348                         if (prev && prev->vm_end < vma->vm_start)
 349                                 return ERR_PTR(-EFAULT);
 350                 }
 351                 if (!is_vm_hugetlb_page(vma) &&
 352                     ((flags & MPOL_MF_STRICT) ||
 353                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 354                                 vma_migratable(vma)))) {
 355                         unsigned long endvma = vma->vm_end;
 356
 357                         if (endvma > end)
 358                                 endvma = end;
 359                         if (vma->vm_start > start)
 360                                 start = vma->vm_start;
 361                         err = check_pgd_range(vma, start, endvma, nodes,
 362                                                 flags, private);
 363                         if (err) {
 364                                 first = ERR_PTR(err);
 365                                 break;
 366                         }
 367                 }
 368                 prev = vma;
 369         }
 370         return first;
 371 }
 372
 373 /* Apply policy to a single VMA */
 374 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 375 {
 376         int err = 0;
 377         struct mempolicy *old = vma->vm_policy;
 378
 379         PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 380                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 381                  vma->vm_ops, vma->vm_file,
 382                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 383
 384         if (vma->vm_ops && vma->vm_ops->set_policy)
 385                 err = vma->vm_ops->set_policy(vma, new);
 386         if (!err) {
 387                 mpol_get(new);
 388                 vma->vm_policy = new;
 389                 mpol_free(old);
 390         }
 391         return err;
 392 }
 393
 394 /* Step 2: apply policy to a range and do splits. */
 395 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 396                        unsigned long end, struct mempolicy *new)
 397 {
 398         struct vm_area_struct *next;
 399         int err;
 400
 401         err = 0;
 402         for (; vma && vma->vm_start < end; vma = next) {
 403                 next = vma->vm_next;
 404                 if (vma->vm_start < start)
 405                         err = split_vma(vma->vm_mm, vma, start, 1);
 406                 if (!err && vma->vm_end > end)
 407                         err = split_vma(vma->vm_mm, vma, end, 0);
 408                 if (!err)
 409                         err = policy_vma(vma, new);
 410                 if (err)
 411                         break;
 412         }
 413         return err;
 414 }
 415
 416 static int contextualize_policy(int mode, nodemask_t *nodes)
 417 {
 418         if (!nodes)
 419                 return 0;
 420
 421         cpuset_update_task_memory_state();
 422         if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
 423                 return -EINVAL;
 424         return mpol_check_policy(mode, nodes);
 425 }
 426
 427
 428 /*
 429  * Update task->flags PF_MEMPOLICY bit: set iff non-default
 430  * mempolicy.  Allows more rapid checking of this (combined perhaps
 431  * with other PF_* flag bits) on memory allocation hot code paths.
 432  *
 433  * If called from outside this file, the task 'p' should -only- be
 434  * a newly forked child not yet visible on the task list, because
 435  * manipulating the task flags of a visible task is not safe.
 436  *
 437  * The above limitation is why this routine has the funny name
 438  * mpol_fix_fork_child_flag().
 439  *
 440  * It is also safe to call this with a task pointer of current,
 441  * which the static wrapper mpol_set_task_struct_flag() does,
 442  * for use within this file.
 443  */
 444
 445 void mpol_fix_fork_child_flag(struct task_struct *p)
 446 {
 447         if (p->mempolicy)
 448                 p->flags |= PF_MEMPOLICY;
 449         else
 450                 p->flags &= ~PF_MEMPOLICY;
 451 }
 452
 453 static void mpol_set_task_struct_flag(void)
 454 {
 455         mpol_fix_fork_child_flag(current);
 456 }
 457
 458 /* Set the process memory policy */
 459 long do_set_mempolicy(int mode, nodemask_t *nodes)
 460 {
 461         struct mempolicy *new;
 462
 463         if (contextualize_policy(mode, nodes))
 464                 return -EINVAL;
 465         new = mpol_new(mode, nodes);
 466         if (IS_ERR(new))
 467                 return PTR_ERR(new);
 468         mpol_free(current->mempolicy);
 469         current->mempolicy = new;
 470         mpol_set_task_struct_flag();
 471         if (new && new->policy == MPOL_INTERLEAVE)
 472                 current->il_next = first_node(new->v.nodes);
 473         return 0;
 474 }
 475
 476 /* Fill a zone bitmap for a policy */
 477 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 478 {
 479         int i;
 480
 481         nodes_clear(*nodes);
 482         switch (p->policy) {
 483         case MPOL_BIND:
 484                 for (i = 0; p->v.zonelist->zones[i]; i++)
 485                         node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
 486                                 *nodes);
 487                 break;
 488         case MPOL_DEFAULT:
 489                 break;
 490         case MPOL_INTERLEAVE:
 491                 *nodes = p->v.nodes;
 492                 break;
 493         case MPOL_PREFERRED:
 494                 /* or use current node instead of online map? */
 495                 if (p->v.preferred_node < 0)
 496                         *nodes = node_online_map;
 497                 else
 498                         node_set(p->v.preferred_node, *nodes);
 499                 break;
 500         default:
 501                 BUG();
 502         }
 503 }
 504
 505 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 506 {
 507         struct page *p;
 508         int err;
 509
 510         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 511         if (err >= 0) {
 512                 err = page_to_nid(p);
 513                 put_page(p);
 514         }
 515         return err;
 516 }
 517
 518 /* Retrieve NUMA policy */
 519 long do_get_mempolicy(int *policy, nodemask_t *nmask,
 520                         unsigned long addr, unsigned long flags)
 521 {
 522         int err;
 523         struct mm_struct *mm = current->mm;
 524         struct vm_area_struct *vma = NULL;
 525         struct mempolicy *pol = current->mempolicy;
 526
 527         cpuset_update_task_memory_state();
 528         if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 529                 return -EINVAL;
 530         if (flags & MPOL_F_ADDR) {
 531                 down_read(&mm->mmap_sem);
 532                 vma = find_vma_intersection(mm, addr, addr+1);
 533                 if (!vma) {
 534                         up_read(&mm->mmap_sem);
 535                         return -EFAULT;
 536                 }
 537                 if (vma->vm_ops && vma->vm_ops->get_policy)
 538                         pol = vma->vm_ops->get_policy(vma, addr);
 539                 else
 540                         pol = vma->vm_policy;
 541         } else if (addr)
 542                 return -EINVAL;
 543
 544         if (!pol)
 545                 pol = &default_policy;
 546
 547         if (flags & MPOL_F_NODE) {
 548                 if (flags & MPOL_F_ADDR) {
 549                         err = lookup_node(mm, addr);
 550                         if (err < 0)
 551                                 goto out;
 552                         *policy = err;
 553                 } else if (pol == current->mempolicy &&
 554                                 pol->policy == MPOL_INTERLEAVE) {
 555                         *policy = current->il_next;
 556                 } else {
 557                         err = -EINVAL;
 558                         goto out;
 559                 }
 560         } else
 561                 *policy = pol->policy;
 562
 563         if (vma) {
 564                 up_read(&current->mm->mmap_sem);
 565                 vma = NULL;
 566         }
 567
 568         err = 0;
 569         if (nmask)
 570                 get_zonemask(pol, nmask);
 571
 572  out:
 573         if (vma)
 574                 up_read(&current->mm->mmap_sem);
 575         return err;
 576 }
 577
 578 #ifdef CONFIG_MIGRATION
 579 /*
 580  * page migration
 581  */
 582 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 583                                 unsigned long flags)
 584 {
 585         /*
 586          * Avoid migrating a page that is shared with others.
 587          */
 588         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
 589                 isolate_lru_page(page, pagelist);
 590 }
 591
 592 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 593 {
 594         return alloc_pages_node(node, GFP_HIGHUSER, 0);
 595 }
 596
 597 /*
 598  * Migrate pages from one node to a target node.
 599  * Returns error or the number of pages not migrated.
 600  */
 601 int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
 602 {
 603         nodemask_t nmask;
 604         LIST_HEAD(pagelist);
 605         int err = 0;
 606
 607         nodes_clear(nmask);
 608         node_set(source, nmask);
 609
 610         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 611                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 612
 613         if (!list_empty(&pagelist))
 614                 err = migrate_pages(&pagelist, new_node_page, dest);
 615
 616         return err;
 617 }
 618
 619 /*
 620  * Move pages between the two nodesets so as to preserve the physical
 621  * layout as much as possible.
 622  *
 623  * Returns the number of page that could not be moved.
 624  */
 625 int do_migrate_pages(struct mm_struct *mm,
 626         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 627 {
 628         LIST_HEAD(pagelist);
 629         int busy = 0;
 630         int err = 0;
 631         nodemask_t tmp;
 632
 633         down_read(&mm->mmap_sem);
 634
 635         err = migrate_vmas(mm, from_nodes, to_nodes, flags);
 636         if (err)
 637                 goto out;
 638
 639 /*
 640  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 641  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 642  * bit in 'tmp', and return that <source, dest> pair for migration.
 643  * The pair of nodemasks 'to' and 'from' define the map.
 644  *
 645  * If no pair of bits is found that way, fallback to picking some
 646  * pair of 'source' and 'dest' bits that are not the same.  If the
 647  * 'source' and 'dest' bits are the same, this represents a node
 648  * that will be migrating to itself, so no pages need move.
 649  *
 650  * If no bits are left in 'tmp', or if all remaining bits left
 651  * in 'tmp' correspond to the same bit in 'to', return false
 652  * (nothing left to migrate).
 653  *
 654  * This lets us pick a pair of nodes to migrate between, such that
 655  * if possible the dest node is not already occupied by some other
 656  * source node, minimizing the risk of overloading the memory on a
 657  * node that would happen if we migrated incoming memory to a node
 658  * before migrating outgoing memory source that same node.
 659  *
 660  * A single scan of tmp is sufficient.  As we go, we remember the
 661  * most recent <s, d> pair that moved (s != d).  If we find a pair
 662  * that not only moved, but what's better, moved to an empty slot
 663  * (d is not set in tmp), then we break out then, with that pair.
 664  * Otherwise when we finish scannng from_tmp, we at least have the
 665  * most recent <s, d> pair that moved.  If we get all the way through
 666  * the scan of tmp without finding any node that moved, much less
 667  * moved to an empty node, then there is nothing left worth migrating.
 668  */
 669
 670         tmp = *from_nodes;
 671         while (!nodes_empty(tmp)) {
 672                 int s,d;
 673                 int source = -1;
 674                 int dest = 0;
 675
 676                 for_each_node_mask(s, tmp) {
 677                         d = node_remap(s, *from_nodes, *to_nodes);
 678                         if (s == d)
 679                                 continue;
 680
 681                         source = s;     /* Node moved. Memorize */
 682                         dest = d;
 683
 684                         /* dest not in remaining from nodes? */
 685                         if (!node_isset(dest, tmp))
 686                                 break;
 687                 }
 688                 if (source == -1)
 689                         break;
 690
 691                 node_clear(source, tmp);
 692                 err = migrate_to_node(mm, source, dest, flags);
 693                 if (err > 0)
 694                         busy += err;
 695                 if (err < 0)
 696                         break;
 697         }
 698 out:
 699         up_read(&mm->mmap_sem);
 700         if (err < 0)
 701                 return err;
 702         return busy;
 703
 704 }
 705
 706 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 707 {
 708         struct vm_area_struct *vma = (struct vm_area_struct *)private;
 709
 710         return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma));
 711 }
 712 #else
 713
 714 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 715                                 unsigned long flags)
 716 {
 717 }
 718
 719 int do_migrate_pages(struct mm_struct *mm,
 720         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 721 {
 722         return -ENOSYS;
 723 }
 724
 725 static struct page *new_vma_page(struct page *page, unsigned long private)
 726 {
 727         return NULL;
 728 }
 729 #endif
 730
 731 long do_mbind(unsigned long start, unsigned long len,
 732                 unsigned long mode, nodemask_t *nmask, unsigned long flags)
 733 {
 734         struct vm_area_struct *vma;
 735         struct mm_struct *mm = current->mm;
 736         struct mempolicy *new;
 737         unsigned long end;
 738         int err;
 739         LIST_HEAD(pagelist);
 740
 741         if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
 742                                       MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 743             || mode > MPOL_MAX)
 744                 return -EINVAL;
 745         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 746                 return -EPERM;
 747
 748         if (start & ~PAGE_MASK)
 749                 return -EINVAL;
 750
 751         if (mode == MPOL_DEFAULT)
 752                 flags &= ~MPOL_MF_STRICT;
 753
 754         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 755         end = start + len;
 756
 757         if (end < start)
 758                 return -EINVAL;
 759         if (end == start)
 760                 return 0;
 761
 762         if (mpol_check_policy(mode, nmask))
 763                 return -EINVAL;
 764
 765         new = mpol_new(mode, nmask);
 766         if (IS_ERR(new))
 767                 return PTR_ERR(new);
 768
 769         /*
 770          * If we are using the default policy then operation
 771          * on discontinuous address spaces is okay after all
 772          */
 773         if (!new)
 774                 flags |= MPOL_MF_DISCONTIG_OK;
 775
 776         PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 777                         mode,nodes_addr(nodes)[0]);
 778
 779         down_write(&mm->mmap_sem);
 780         vma = check_range(mm, start, end, nmask,
 781                           flags | MPOL_MF_INVERT, &pagelist);
 782
 783         err = PTR_ERR(vma);
 784         if (!IS_ERR(vma)) {
 785                 int nr_failed = 0;
 786
 787                 err = mbind_range(vma, start, end, new);
 788
 789                 if (!list_empty(&pagelist))
 790                         nr_failed = migrate_pages(&pagelist, new_vma_page,
 791                                                 (unsigned long)vma);
 792
 793                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 794                         err = -EIO;
 795         }
 796
 797         up_write(&mm->mmap_sem);
 798         mpol_free(new);
 799         return err;
 800 }
 801
 802 /*
 803  * User space interface with variable sized bitmaps for nodelists.
 804  */
 805
 806 /* Copy a node mask from user space. */
 807 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 808                      unsigned long maxnode)
 809 {
 810         unsigned long k;
 811         unsigned long nlongs;
 812         unsigned long endmask;
 813
 814         --maxnode;
 815         nodes_clear(*nodes);
 816         if (maxnode == 0 || !nmask)
 817                 return 0;
 818         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
 819                 return -EINVAL;
 820
 821         nlongs = BITS_TO_LONGS(maxnode);
 822         if ((maxnode % BITS_PER_LONG) == 0)
 823                 endmask = ~0UL;
 824         else
 825                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 826
 827         /* When the user specified more nodes than supported just check
 828            if the non supported part is all zero. */
 829         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 830                 if (nlongs > PAGE_SIZE/sizeof(long))
 831                         return -EINVAL;
 832                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 833                         unsigned long t;
 834                         if (get_user(t, nmask + k))
 835                                 return -EFAULT;
 836                         if (k == nlongs - 1) {
 837                                 if (t & endmask)
 838                                         return -EINVAL;
 839                         } else if (t)
 840                                 return -EINVAL;
 841                 }
 842                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 843                 endmask = ~0UL;
 844         }
 845
 846         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 847                 return -EFAULT;
 848         nodes_addr(*nodes)[nlongs-1] &= endmask;
 849         return 0;
 850 }
 851
 852 /* Copy a kernel node mask to user space */
 853 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 854                               nodemask_t *nodes)
 855 {
 856         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 857         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 858
 859         if (copy > nbytes) {
 860                 if (copy > PAGE_SIZE)
 861                         return -EINVAL;
 862                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 863                         return -EFAULT;
 864                 copy = nbytes;
 865         }
 866         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 867 }
 868
 869 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 870                         unsigned long mode,
 871                         unsigned long __user *nmask, unsigned long maxnode,
 872                         unsigned flags)
 873 {
 874         nodemask_t nodes;
 875         int err;
 876
 877         err = get_nodes(&nodes, nmask, maxnode);
 878         if (err)
 879                 return err;
 880         return do_mbind(start, len, mode, &nodes, flags);
 881 }
 882
 883 /* Set the process memory policy */
 884 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 885                 unsigned long maxnode)
 886 {
 887         int err;
 888         nodemask_t nodes;
 889
 890         if (mode < 0 || mode > MPOL_MAX)
 891                 return -EINVAL;
 892         err = get_nodes(&nodes, nmask, maxnode);
 893         if (err)
 894                 return err;
 895         return do_set_mempolicy(mode, &nodes);
 896 }
 897
 898 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 899                 const unsigned long __user *old_nodes,
 900                 const unsigned long __user *new_nodes)
 901 {
 902         struct mm_struct *mm;
 903         struct task_struct *task;
 904         nodemask_t old;
 905         nodemask_t new;
 906         nodemask_t task_nodes;
 907         int err;
 908
 909         err = get_nodes(&old, old_nodes, maxnode);
 910         if (err)
 911                 return err;
 912
 913         err = get_nodes(&new, new_nodes, maxnode);
 914         if (err)
 915                 return err;
 916
 917         /* Find the mm_struct */
 918         read_lock(&tasklist_lock);
 919         task = pid ? find_task_by_pid(pid) : current;
 920         if (!task) {
 921                 read_unlock(&tasklist_lock);
 922                 return -ESRCH;
 923         }
 924         mm = get_task_mm(task);
 925         read_unlock(&tasklist_lock);
 926
 927         if (!mm)
 928                 return -EINVAL;
 929
 930         /*
 931          * Check if this process has the right to modify the specified
 932          * process. The right exists if the process has administrative
 933          * capabilities, superuser privileges or the same
 934          * userid as the target process.
 935          */
 936         if ((current->euid != task->suid) && (current->euid != task->uid) &&
 937             (current->uid != task->suid) && (current->uid != task->uid) &&
 938             !capable(CAP_SYS_NICE)) {
 939                 err = -EPERM;
 940                 goto out;
 941         }
 942
 943         task_nodes = cpuset_mems_allowed(task);
 944         /* Is the user allowed to access the target nodes? */
 945         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
 946                 err = -EPERM;
 947                 goto out;
 948         }
 949
 950         err = security_task_movememory(task);
 951         if (err)
 952                 goto out;
 953
 954         err = do_migrate_pages(mm, &old, &new,
 955                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
 956 out:
 957         mmput(mm);
 958         return err;
 959 }
 960
 961
 962 /* Retrieve NUMA policy */
 963 asmlinkage long sys_get_mempolicy(int __user *policy,
 964                                 unsigned long __user *nmask,
 965                                 unsigned long maxnode,
 966                                 unsigned long addr, unsigned long flags)
 967 {
 968         int err, pval;
 969         nodemask_t nodes;
 970
 971         if (nmask != NULL && maxnode < MAX_NUMNODES)
 972                 return -EINVAL;
 973
 974         err = do_get_mempolicy(&pval, &nodes, addr, flags);
 975
 976         if (err)
 977                 return err;
 978
 979         if (policy && put_user(pval, policy))
 980                 return -EFAULT;
 981
 982         if (nmask)
 983                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
 984
 985         return err;
 986 }
 987
 988 #ifdef CONFIG_COMPAT
 989
 990 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
 991                                      compat_ulong_t __user *nmask,
 992                                      compat_ulong_t maxnode,
 993                                      compat_ulong_t addr, compat_ulong_t flags)
 994 {
 995         long err;
 996         unsigned long __user *nm = NULL;
 997         unsigned long nr_bits, alloc_size;
 998         DECLARE_BITMAP(bm, MAX_NUMNODES);
 999
1000         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1001         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1002
1003         if (nmask)
1004                 nm = compat_alloc_user_space(alloc_size);
1005
1006         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1007
1008         if (!err && nmask) {
1009                 err = copy_from_user(bm, nm, alloc_size);
1010                 /* ensure entire bitmap is zeroed */
1011                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1012                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1013         }
1014
1015         return err;
1016 }
1017
1018 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1019                                      compat_ulong_t maxnode)
1020 {
1021         long err = 0;
1022         unsigned long __user *nm = NULL;
1023         unsigned long nr_bits, alloc_size;
1024         DECLARE_BITMAP(bm, MAX_NUMNODES);
1025
1026         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1027         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1028
1029         if (nmask) {
1030                 err = compat_get_bitmap(bm, nmask, nr_bits);
1031                 nm = compat_alloc_user_space(alloc_size);
1032                 err |= copy_to_user(nm, bm, alloc_size);
1033         }
1034
1035         if (err)
1036                 return -EFAULT;
1037
1038         return sys_set_mempolicy(mode, nm, nr_bits+1);
1039 }
1040
1041 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1042                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1043                              compat_ulong_t maxnode, compat_ulong_t flags)
1044 {
1045         long err = 0;
1046         unsigned long __user *nm = NULL;
1047         unsigned long nr_bits, alloc_size;
1048         nodemask_t bm;
1049
1050         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1051         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1052
1053         if (nmask) {
1054                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1055                 nm = compat_alloc_user_space(alloc_size);
1056                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1057         }
1058
1059         if (err)
1060                 return -EFAULT;
1061
1062         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1063 }
1064
1065 #endif
1066
1067 /* Return effective policy for a VMA */
1068 static struct mempolicy * get_vma_policy(struct task_struct *task,
1069                 struct vm_area_struct *vma, unsigned long addr)
1070 {
1071         struct mempolicy *pol = task->mempolicy;
1072
1073         if (vma) {
1074                 if (vma->vm_ops && vma->vm_ops->get_policy)
1075                         pol = vma->vm_ops->get_policy(vma, addr);
1076                 else if (vma->vm_policy &&
1077                                 vma->vm_policy->policy != MPOL_DEFAULT)
1078                         pol = vma->vm_policy;
1079         }
1080         if (!pol)
1081                 pol = &default_policy;
1082         return pol;
1083 }
1084
1085 /* Return a zonelist representing a mempolicy */
1086 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1087 {
1088         int nd;
1089
1090         switch (policy->policy) {
1091         case MPOL_PREFERRED:
1092                 nd = policy->v.preferred_node;
1093                 if (nd < 0)
1094                         nd = numa_node_id();
1095                 break;
1096         case MPOL_BIND:
1097                 /* Lower zones don't get a policy applied */
1098                 /* Careful: current->mems_allowed might have moved */
1099                 if (gfp_zone(gfp) >= policy_zone)
1100                         if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1101                                 return policy->v.zonelist;
1102                 /*FALL THROUGH*/
1103         case MPOL_INTERLEAVE: /* should not happen */
1104         case MPOL_DEFAULT:
1105                 nd = numa_node_id();
1106                 break;
1107         default:
1108                 nd = 0;
1109                 BUG();
1110         }
1111         return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1112 }
1113
1114 /* Do dynamic interleaving for a process */
1115 static unsigned interleave_nodes(struct mempolicy *policy)
1116 {
1117         unsigned nid, next;
1118         struct task_struct *me = current;
1119
1120         nid = me->il_next;
1121         next = next_node(nid, policy->v.nodes);
1122         if (next >= MAX_NUMNODES)
1123                 next = first_node(policy->v.nodes);
1124         me->il_next = next;
1125         return nid;
1126 }
1127
1128 /*
1129  * Depending on the memory policy provide a node from which to allocate the
1130  * next slab entry.
1131  */
1132 unsigned slab_node(struct mempolicy *policy)
1133 {
1134         switch (policy->policy) {
1135         case MPOL_INTERLEAVE:
1136                 return interleave_nodes(policy);
1137
1138         case MPOL_BIND:
1139                 /*
1140                  * Follow bind policy behavior and start allocation at the
1141                  * first node.
1142                  */
1143                 return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
1144
1145         case MPOL_PREFERRED:
1146                 if (policy->v.preferred_node >= 0)
1147                         return policy->v.preferred_node;
1148                 /* Fall through */
1149
1150         default:
1151                 return numa_node_id();
1152         }
1153 }
1154
1155 /* Do static interleaving for a VMA with known offset. */
1156 static unsigned offset_il_node(struct mempolicy *pol,
1157                 struct vm_area_struct *vma, unsigned long off)
1158 {
1159         unsigned nnodes = nodes_weight(pol->v.nodes);
1160         unsigned target = (unsigned)off % nnodes;
1161         int c;
1162         int nid = -1;
1163
1164         c = 0;
1165         do {
1166                 nid = next_node(nid, pol->v.nodes);
1167                 c++;
1168         } while (c <= target);
1169         return nid;
1170 }
1171
1172 /* Determine a node number for interleave */
1173 static inline unsigned interleave_nid(struct mempolicy *pol,
1174                  struct vm_area_struct *vma, unsigned long addr, int shift)
1175 {
1176         if (vma) {
1177                 unsigned long off;
1178
1179                 off = vma->vm_pgoff;
1180                 off += (addr - vma->vm_start) >> shift;
1181                 return offset_il_node(pol, vma, off);
1182         } else
1183                 return interleave_nodes(pol);
1184 }
1185
1186 #ifdef CONFIG_HUGETLBFS
1187 /* Return a zonelist suitable for a huge page allocation. */
1188 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1189 {
1190         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1191
1192         if (pol->policy == MPOL_INTERLEAVE) {
1193                 unsigned nid;
1194
1195                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1196                 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1197         }
1198         return zonelist_policy(GFP_HIGHUSER, pol);
1199 }
1200 #endif
1201
1202 /* Allocate a page in interleaved policy.
1203    Own path because it needs to do special accounting. */
1204 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1205                                         unsigned nid)
1206 {
1207         struct zonelist *zl;
1208         struct page *page;
1209
1210         zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1211         page = __alloc_pages(gfp, order, zl);
1212         if (page && page_zone(page) == zl->zones[0])
1213                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1214         return page;
1215 }
1216
1217 /**
1218  *      alloc_page_vma  - Allocate a page for a VMA.
1219  *
1220  *      @gfp:
1221  *      %GFP_USER    user allocation.
1222  *      %GFP_KERNEL  kernel allocations,
1223  *      %GFP_HIGHMEM highmem/user allocations,
1224  *      %GFP_FS      allocation should not call back into a file system.
1225  *      %GFP_ATOMIC  don't sleep.
1226  *
1227  *      @vma:  Pointer to VMA or NULL if not available.
1228  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1229  *
1230  *      This function allocates a page from the kernel page pool and applies
1231  *      a NUMA policy associated with the VMA or the current process.
1232  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1233  *      mm_struct of the VMA to prevent it from going away. Should be used for
1234  *      all allocations for pages that will be mapped into
1235  *      user space. Returns NULL when no page can be allocated.
1236  *
1237  *      Should be called with the mm_sem of the vma hold.
1238  */
1239 struct page *
1240 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1241 {
1242         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1243
1244         cpuset_update_task_memory_state();
1245
1246         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1247                 unsigned nid;
1248
1249                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1250                 return alloc_page_interleave(gfp, 0, nid);
1251         }
1252         return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1253 }
1254
1255 /**
1256  *      alloc_pages_current - Allocate pages.
1257  *
1258  *      @gfp:
1259  *              %GFP_USER   user allocation,
1260  *              %GFP_KERNEL kernel allocation,
1261  *              %GFP_HIGHMEM highmem allocation,
1262  *              %GFP_FS     don't call back into a file system.
1263  *              %GFP_ATOMIC don't sleep.
1264  *      @order: Power of two of allocation size in pages. 0 is a single page.
1265  *
1266  *      Allocate a page from the kernel page pool.  When not in
1267  *      interrupt context and apply the current process NUMA policy.
1268  *      Returns NULL when no page can be allocated.
1269  *
1270  *      Don't call cpuset_update_task_memory_state() unless
1271  *      1) it's ok to take cpuset_sem (can WAIT), and
1272  *      2) allocating for current task (not interrupt).
1273  */
1274 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1275 {
1276         struct mempolicy *pol = current->mempolicy;
1277
1278         if ((gfp & __GFP_WAIT) && !in_interrupt())
1279                 cpuset_update_task_memory_state();
1280         if (!pol || in_interrupt())
1281                 pol = &default_policy;
1282         if (pol->policy == MPOL_INTERLEAVE)
1283                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1284         return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1285 }
1286 EXPORT_SYMBOL(alloc_pages_current);
1287
/*
 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after their cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 *
 * Nothing in this part of the file assigns the pointer; presumably it is
 * set by the cpuset code referenced above -- verify there.
 */
void *cpuset_being_rebound;
1296
1297 /* Slow path of a mempolicy copy */
1298 struct mempolicy *__mpol_copy(struct mempolicy *old)
1299 {
1300         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1301
1302         if (!new)
1303                 return ERR_PTR(-ENOMEM);
1304         if (current_cpuset_is_being_rebound()) {
1305                 nodemask_t mems = cpuset_mems_allowed(current);
1306                 mpol_rebind_policy(old, &mems);
1307         }
1308         *new = *old;
1309         atomic_set(&new->refcnt, 1);
1310         if (new->policy == MPOL_BIND) {
1311                 int sz = ksize(old->v.zonelist);
1312                 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
1313                 if (!new->v.zonelist) {
1314                         kmem_cache_free(policy_cache, new);
1315                         return ERR_PTR(-ENOMEM);
1316                 }
1317                 memcpy(new->v.zonelist, old->v.zonelist, sz);
1318         }
1319         return new;
1320 }
1321
1322 /* Slow path of a mempolicy comparison */
1323 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1324 {
1325         if (!a || !b)
1326                 return 0;
1327         if (a->policy != b->policy)
1328                 return 0;
1329         switch (a->policy) {
1330         case MPOL_DEFAULT:
1331                 return 1;
1332         case MPOL_INTERLEAVE:
1333                 return nodes_equal(a->v.nodes, b->v.nodes);
1334         case MPOL_PREFERRED:
1335                 return a->v.preferred_node == b->v.preferred_node;
1336         case MPOL_BIND: {
1337                 int i;
1338                 for (i = 0; a->v.zonelist->zones[i]; i++)
1339                         if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1340                                 return 0;
1341                 return b->v.zonelist->zones[i] == NULL;
1342         }
1343         default:
1344                 BUG();
1345                 return 0;
1346         }
1347 }
1348
1349 /* Slow path of a mpol destructor. */
1350 void __mpol_free(struct mempolicy *p)
1351 {
1352         if (!atomic_dec_and_test(&p->refcnt))
1353                 return;
1354         if (p->policy == MPOL_BIND)
1355                 kfree(p->v.zonelist);
1356         p->policy = MPOL_DEFAULT;
1357         kmem_cache_free(policy_cache, p);
1358 }
1359
1360 /*
1361  * Shared memory backing store policy support.
1362  *
1363  * Remember policies even when nobody has shared memory mapped.
1364  * The policies are kept in Red-Black tree linked from the inode.
1365  * They are protected by the sp->lock spinlock, which should be held
1366  * for any accesses to the tree.
1367  */
1368
1369 /* lookup first element intersecting start-end */
1370 /* Caller holds sp->lock */
1371 static struct sp_node *
1372 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1373 {
1374         struct rb_node *n = sp->root.rb_node;
1375
1376         while (n) {
1377                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1378
1379                 if (start >= p->end)
1380                         n = n->rb_right;
1381                 else if (end <= p->start)
1382                         n = n->rb_left;
1383                 else
1384                         break;
1385         }
1386         if (!n)
1387                 return NULL;
1388         for (;;) {
1389                 struct sp_node *w = NULL;
1390                 struct rb_node *prev = rb_prev(n);
1391                 if (!prev)
1392                         break;
1393                 w = rb_entry(prev, struct sp_node, nd);
1394                 if (w->end <= start)
1395                         break;
1396                 n = prev;
1397         }
1398         return rb_entry(n, struct sp_node, nd);
1399 }
1400
1401 /* Insert a new shared policy into the list. */
1402 /* Caller holds sp->lock */
1403 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1404 {
1405         struct rb_node **p = &sp->root.rb_node;
1406         struct rb_node *parent = NULL;
1407         struct sp_node *nd;
1408
1409         while (*p) {
1410                 parent = *p;
1411                 nd = rb_entry(parent, struct sp_node, nd);
1412                 if (new->start < nd->start)
1413                         p = &(*p)->rb_left;
1414                 else if (new->end > nd->end)
1415                         p = &(*p)->rb_right;
1416                 else
1417                         BUG();
1418         }
1419         rb_link_node(&new->nd, parent, p);
1420         rb_insert_color(&new->nd, &sp->root);
1421         PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1422                  new->policy ? new->policy->policy : 0);
1423 }
1424
1425 /* Find shared policy intersecting idx */
1426 struct mempolicy *
1427 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1428 {
1429         struct mempolicy *pol = NULL;
1430         struct sp_node *sn;
1431
1432         if (!sp->root.rb_node)
1433                 return NULL;
1434         spin_lock(&sp->lock);
1435         sn = sp_lookup(sp, idx, idx+1);
1436         if (sn) {
1437                 mpol_get(sn->policy);
1438                 pol = sn->policy;
1439         }
1440         spin_unlock(&sp->lock);
1441         return pol;
1442 }
1443
1444 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1445 {
1446         PDprintk("deleting %lx-l%x\n", n->start, n->end);
1447         rb_erase(&n->nd, &sp->root);
1448         mpol_free(n->policy);
1449         kmem_cache_free(sn_cache, n);
1450 }
1451
1452 struct sp_node *
1453 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1454 {
1455         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1456
1457         if (!n)
1458                 return NULL;
1459         n->start = start;
1460         n->end = end;
1461         mpol_get(pol);
1462         n->policy = pol;
1463         return n;
1464 }
1465
1466 /* Replace a policy range. */
1467 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1468                                  unsigned long end, struct sp_node *new)
1469 {
1470         struct sp_node *n, *new2 = NULL;
1471
1472 restart:
1473         spin_lock(&sp->lock);
1474         n = sp_lookup(sp, start, end);
1475         /* Take care of old policies in the same range. */
1476         while (n && n->start < end) {
1477                 struct rb_node *next = rb_next(&n->nd);
1478                 if (n->start >= start) {
1479                         if (n->end <= end)
1480                                 sp_delete(sp, n);
1481                         else
1482                                 n->start = end;
1483                 } else {
1484                         /* Old policy spanning whole new range. */
1485                         if (n->end > end) {
1486                                 if (!new2) {
1487                                         spin_unlock(&sp->lock);
1488                                         new2 = sp_alloc(end, n->end, n->policy);
1489                                         if (!new2)
1490                                                 return -ENOMEM;
1491                                         goto restart;
1492                                 }
1493                                 n->end = start;
1494                                 sp_insert(sp, new2);
1495                                 new2 = NULL;
1496                                 break;
1497                         } else
1498                                 n->end = start;
1499                 }
1500                 if (!next)
1501                         break;
1502                 n = rb_entry(next, struct sp_node, nd);
1503         }
1504         if (new)
1505                 sp_insert(sp, new);
1506         spin_unlock(&sp->lock);
1507         if (new2) {
1508                 mpol_free(new2->policy);
1509                 kmem_cache_free(sn_cache, new2);
1510         }
1511         return 0;
1512 }
1513
1514 void mpol_shared_policy_init(struct shared_policy *info, int policy,
1515                                 nodemask_t *policy_nodes)
1516 {
1517         info->root = RB_ROOT;
1518         spin_lock_init(&info->lock);
1519
1520         if (policy != MPOL_DEFAULT) {
1521                 struct mempolicy *newpol;
1522
1523                 /* Falls back to MPOL_DEFAULT on any error */
1524                 newpol = mpol_new(policy, policy_nodes);
1525                 if (!IS_ERR(newpol)) {
1526                         /* Create pseudo-vma that contains just the policy */
1527                         struct vm_area_struct pvma;
1528
1529                         memset(&pvma, 0, sizeof(struct vm_area_struct));
1530                         /* Policy covers entire file */
1531                         pvma.vm_end = TASK_SIZE;
1532                         mpol_set_shared_policy(info, &pvma, newpol);
1533                         mpol_free(newpol);
1534                 }
1535         }
1536 }
1537
1538 int mpol_set_shared_policy(struct shared_policy *info,
1539                         struct vm_area_struct *vma, struct mempolicy *npol)
1540 {
1541         int err;
1542         struct sp_node *new = NULL;
1543         unsigned long sz = vma_pages(vma);
1544
1545         PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1546                  vma->vm_pgoff,
1547                  sz, npol? npol->policy : -1,
1548                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1549
1550         if (npol) {
1551                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1552                 if (!new)
1553                         return -ENOMEM;
1554         }
1555         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1556         if (err && new)
1557                 kmem_cache_free(sn_cache, new);
1558         return err;
1559 }
1560
1561 /* Free a backing policy store on inode delete. */
1562 void mpol_free_shared_policy(struct shared_policy *p)
1563 {
1564         struct sp_node *n;
1565         struct rb_node *next;
1566
1567         if (!p->root.rb_node)
1568                 return;
1569         spin_lock(&p->lock);
1570         next = rb_first(&p->root);
1571         while (next) {
1572                 n = rb_entry(next, struct sp_node, nd);
1573                 next = rb_next(&n->nd);
1574                 rb_erase(&n->nd, &p->root);
1575                 mpol_free(n->policy);
1576                 kmem_cache_free(sn_cache, n);
1577         }
1578         spin_unlock(&p->lock);
1579 }
1580
1581 /* assumes fs == KERNEL_DS */
1582 void __init numa_policy_init(void)
1583 {
1584         policy_cache = kmem_cache_create("numa_policy",
1585                                          sizeof(struct mempolicy),
1586                                          0, SLAB_PANIC, NULL, NULL);
1587
1588         sn_cache = kmem_cache_create("shared_policy_node",
1589                                      sizeof(struct sp_node),
1590                                      0, SLAB_PANIC, NULL, NULL);
1591
1592         /* Set interleaving policy for system init. This way not all
1593            the data structures allocated at system boot end up in node zero. */
1594
1595         if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1596                 printk("numa_policy_init: interleaving failed\n");
1597 }
1598
/*
 * Reset policy of current process to default by installing MPOL_DEFAULT
 * with no nodemask through do_set_mempolicy(). The return value of
 * do_set_mempolicy() is ignored.
 */
void numa_default_policy(void)
{
	do_set_mempolicy(MPOL_DEFAULT, NULL);
}
1604
1605 /* Migrate a policy to a different set of nodes */
1606 void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1607 {
1608         nodemask_t *mpolmask;
1609         nodemask_t tmp;
1610
1611         if (!pol)
1612                 return;
1613         mpolmask = &pol->cpuset_mems_allowed;
1614         if (nodes_equal(*mpolmask, *newmask))
1615                 return;
1616
1617         switch (pol->policy) {
1618         case MPOL_DEFAULT:
1619                 break;
1620         case MPOL_INTERLEAVE:
1621                 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1622                 pol->v.nodes = tmp;
1623                 *mpolmask = *newmask;
1624                 current->il_next = node_remap(current->il_next,
1625                                                 *mpolmask, *newmask);
1626                 break;
1627         case MPOL_PREFERRED:
1628                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1629                                                 *mpolmask, *newmask);
1630                 *mpolmask = *newmask;
1631                 break;
1632         case MPOL_BIND: {
1633                 nodemask_t nodes;
1634                 struct zone **z;
1635                 struct zonelist *zonelist;
1636
1637                 nodes_clear(nodes);
1638                 for (z = pol->v.zonelist->zones; *z; z++)
1639                         node_set((*z)->zone_pgdat->node_id, nodes);
1640                 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1641                 nodes = tmp;
1642
1643                 zonelist = bind_zonelist(&nodes);
1644
1645                 /* If no mem, then zonelist is NULL and we keep old zonelist.
1646                  * If that old zonelist has no remaining mems_allowed nodes,
1647                  * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1648                  */
1649
1650                 if (zonelist) {
1651                         /* Good - got mem - substitute new zonelist */
1652                         kfree(pol->v.zonelist);
1653                         pol->v.zonelist = zonelist;
1654                 }
1655                 *mpolmask = *newmask;
1656                 break;
1657         }
1658         default:
1659                 BUG();
1660                 break;
1661         }
1662 }
1663
/*
 * Wrapper for mpol_rebind_policy() that just requires a task
 * pointer: rebinds tsk->mempolicy to the nodemask @new. A task without
 * a policy is fine, since mpol_rebind_policy() ignores a NULL policy.
 */

void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
	mpol_rebind_policy(tsk->mempolicy, new);
}
1673
1674 /*
1675  * Rebind each vma in mm to new nodemask.
1676  *
1677  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1678  */
1679
1680 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1681 {
1682         struct vm_area_struct *vma;
1683
1684         down_write(&mm->mmap_sem);
1685         for (vma = mm->mmap; vma; vma = vma->vm_next)
1686                 mpol_rebind_policy(vma->vm_policy, new);
1687         up_write(&mm->mmap_sem);
1688 }
1689
1690 /*
1691  * Display pages allocated per node and memory policy via /proc.
1692  */
1693
/*
 * Printable names of the policy modes.  mpol_to_str() indexes this table
 * directly with the policy mode, so the order of the entries must follow
 * the numeric order of the modes.  Neither the strings nor the table are
 * ever written to, hence both levels are const.
 */
static const char * const policy_types[] = {
        "default",
        "prefer",
        "bind",
        "interleave",
};
1696
1697 /*
1698  * Convert a mempolicy into a string.
1699  * Returns the number of characters in buffer (if positive)
1700  * or an error (negative)
1701  */
1702 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1703 {
1704         char *p = buffer;
1705         int l;
1706         nodemask_t nodes;
1707         int mode = pol ? pol->policy : MPOL_DEFAULT;
1708
1709         switch (mode) {
1710         case MPOL_DEFAULT:
1711                 nodes_clear(nodes);
1712                 break;
1713
1714         case MPOL_PREFERRED:
1715                 nodes_clear(nodes);
1716                 node_set(pol->v.preferred_node, nodes);
1717                 break;
1718
1719         case MPOL_BIND:
1720                 get_zonemask(pol, &nodes);
1721                 break;
1722
1723         case MPOL_INTERLEAVE:
1724                 nodes = pol->v.nodes;
1725                 break;
1726
1727         default:
1728                 BUG();
1729                 return -EFAULT;
1730         }
1731
1732         l = strlen(policy_types[mode]);
1733         if (buffer + maxlen < p + l + 1)
1734                 return -ENOSPC;
1735
1736         strcpy(p, policy_types[mode]);
1737         p += l;
1738
1739         if (!nodes_empty(nodes)) {
1740                 if (buffer + maxlen < p + 2)
1741                         return -ENOSPC;
1742                 *p++ = '=';
1743                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1744         }
1745         return p - buffer;
1746 }
1747
1748 struct numa_maps {
1749         unsigned long pages;
1750         unsigned long anon;
1751         unsigned long active;
1752         unsigned long writeback;
1753         unsigned long mapcount_max;
1754         unsigned long dirty;
1755         unsigned long swapcache;
1756         unsigned long node[MAX_NUMNODES];
1757 };
1758
1759 static void gather_stats(struct page *page, void *private, int pte_dirty)
1760 {
1761         struct numa_maps *md = private;
1762         int count = page_mapcount(page);
1763
1764         md->pages++;
1765         if (pte_dirty || PageDirty(page))
1766                 md->dirty++;
1767
1768         if (PageSwapCache(page))
1769                 md->swapcache++;
1770
1771         if (PageActive(page))
1772                 md->active++;
1773
1774         if (PageWriteback(page))
1775                 md->writeback++;
1776
1777         if (PageAnon(page))
1778                 md->anon++;
1779
1780         if (count > md->mapcount_max)
1781                 md->mapcount_max = count;
1782
1783         md->node[page_to_nid(page)]++;
1784 }
1785
#ifdef CONFIG_HUGETLB_PAGE
/*
 * Gather numa_maps statistics for the huge pages mapped in the range
 * [start, end) of @vma.  The range is walked in HPAGE_SIZE steps;
 * addresses that have no huge pte, an empty pte or no page behind the
 * pte are skipped.  Every page found is passed to gather_stats() with
 * @md as the statistics to update.
 */
static void check_huge_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct numa_maps *md)
{
        unsigned long addr;

        for (addr = start; addr < end; addr += HPAGE_SIZE) {
                pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
                struct page *page;
                pte_t pte;

                if (!ptep)
                        continue;

                /*
                 * Read the entry once and base every decision on that
                 * snapshot, so the page and its dirty state always come
                 * from the same pte value.
                 */
                pte = *ptep;
                if (pte_none(pte))
                        continue;

                page = pte_page(pte);
                if (!page)
                        continue;

                gather_stats(page, md, pte_dirty(pte));
        }
}
#else
/* No huge page support configured: nothing to account. */
static inline void check_huge_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct numa_maps *md)
{
}
#endif
1819
1820 int show_numa_map(struct seq_file *m, void *v)
1821 {
1822         struct proc_maps_private *priv = m->private;
1823         struct vm_area_struct *vma = v;
1824         struct numa_maps *md;
1825         struct file *file = vma->vm_file;
1826         struct mm_struct *mm = vma->vm_mm;
1827         int n;
1828         char buffer[50];
1829
1830         if (!mm)
1831                 return 0;
1832
1833         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1834         if (!md)
1835                 return 0;
1836
1837         mpol_to_str(buffer, sizeof(buffer),
1838                             get_vma_policy(priv->task, vma, vma->vm_start));
1839
1840         seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1841
1842         if (file) {
1843                 seq_printf(m, " file=");
1844                 seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
1845         } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1846                 seq_printf(m, " heap");
1847         } else if (vma->vm_start <= mm->start_stack &&
1848                         vma->vm_end >= mm->start_stack) {
1849                 seq_printf(m, " stack");
1850         }
1851
1852         if (is_vm_hugetlb_page(vma)) {
1853                 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
1854                 seq_printf(m, " huge");
1855         } else {
1856                 check_pgd_range(vma, vma->vm_start, vma->vm_end,
1857                                 &node_online_map, MPOL_MF_STATS, md);
1858         }
1859
1860         if (!md->pages)
1861                 goto out;
1862
1863         if (md->anon)
1864                 seq_printf(m," anon=%lu",md->anon);
1865
1866         if (md->dirty)
1867                 seq_printf(m," dirty=%lu",md->dirty);
1868
1869         if (md->pages != md->anon && md->pages != md->dirty)
1870                 seq_printf(m, " mapped=%lu", md->pages);
1871
1872         if (md->mapcount_max > 1)
1873                 seq_printf(m, " mapmax=%lu", md->mapcount_max);
1874
1875         if (md->swapcache)
1876                 seq_printf(m," swapcache=%lu", md->swapcache);
1877
1878         if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1879                 seq_printf(m," active=%lu", md->active);
1880
1881         if (md->writeback)
1882                 seq_printf(m," writeback=%lu", md->writeback);
1883
1884         for_each_online_node(n)
1885                 if (md->node[n])
1886                         seq_printf(m, " N%d=%lu", n, md->node[n]);
1887 out:
1888         seq_putc(m, '\n');
1889         kfree(md);
1890
1891         if (m->count < m->size)
1892                 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
1893         return 0;
1894 }
1895