mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem, kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66    could replace all the switch()es with a mempolicy_ops structure.
  67 */
  68
  69 #include <linux/mempolicy.h>
  70 #include <linux/mm.h>
  71 #include <linux/highmem.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/kernel.h>
  74 #include <linux/sched.h>
  75 #include <linux/mm.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/gfp.h>
  79 #include <linux/slab.h>
  80 #include <linux/string.h>
  81 #include <linux/module.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/mempolicy.h>
  86 #include <linux/swap.h>
  87 #include <linux/seq_file.h>
  88 #include <linux/proc_fs.h>
  89
  90 #include <asm/tlbflush.h>
  91 #include <asm/uaccess.h>
  92
/* Internal flags (built from MPOL_MF_INTERNAL, consumed by the range checks below) */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */

/* The number of pages to migrate per call to migrate_pages() */
#define MIGRATE_CHUNK_SIZE 256

/* Slab caches; policy_cache backs the struct mempolicy objects from mpol_new() */
static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;

/* Debug print hook: expands to nothing, so its arguments are never compiled */
#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
int policy_zone = ZONE_DMA;

/* Stand-in policy used by do_get_mempolicy() when no policy is set */
struct mempolicy default_policy = {
        .refcnt = ATOMIC_INIT(1), /* never free it */
        .policy = MPOL_DEFAULT,
};
 114
 115 /* Do sanity checking on a policy */
 116 static int mpol_check_policy(int mode, nodemask_t *nodes)
 117 {
 118         int empty = nodes_empty(*nodes);
 119
 120         switch (mode) {
 121         case MPOL_DEFAULT:
 122                 if (!empty)
 123                         return -EINVAL;
 124                 break;
 125         case MPOL_BIND:
 126         case MPOL_INTERLEAVE:
 127                 /* Preferred will only use the first bit, but allow
 128                    more for now. */
 129                 if (empty)
 130                         return -EINVAL;
 131                 break;
 132         }
 133         return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 134 }
 135 /* Generate a custom zonelist for the BIND policy. */
 136 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 137 {
 138         struct zonelist *zl;
 139         int num, max, nd;
 140
 141         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 142         zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
 143         if (!zl)
 144                 return NULL;
 145         num = 0;
 146         for_each_node_mask(nd, *nodes)
 147                 zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
 148         zl->zones[num] = NULL;
 149         return zl;
 150 }
 151
 152 /* Create a new policy */
 153 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 154 {
 155         struct mempolicy *policy;
 156
 157         PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 158         if (mode == MPOL_DEFAULT)
 159                 return NULL;
 160         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 161         if (!policy)
 162                 return ERR_PTR(-ENOMEM);
 163         atomic_set(&policy->refcnt, 1);
 164         switch (mode) {
 165         case MPOL_INTERLEAVE:
 166                 policy->v.nodes = *nodes;
 167                 if (nodes_weight(*nodes) == 0) {
 168                         kmem_cache_free(policy_cache, policy);
 169                         return ERR_PTR(-EINVAL);
 170                 }
 171                 break;
 172         case MPOL_PREFERRED:
 173                 policy->v.preferred_node = first_node(*nodes);
 174                 if (policy->v.preferred_node >= MAX_NUMNODES)
 175                         policy->v.preferred_node = -1;
 176                 break;
 177         case MPOL_BIND:
 178                 policy->v.zonelist = bind_zonelist(nodes);
 179                 if (policy->v.zonelist == NULL) {
 180                         kmem_cache_free(policy_cache, policy);
 181                         return ERR_PTR(-ENOMEM);
 182                 }
 183                 break;
 184         }
 185         policy->policy = mode;
 186         policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 187         return policy;
 188 }
 189
 190 static void gather_stats(struct page *, void *);
 191 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 192                                 unsigned long flags);
 193
 194 /* Scan through pages checking if pages follow certain conditions. */
 195 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 196                 unsigned long addr, unsigned long end,
 197                 const nodemask_t *nodes, unsigned long flags,
 198                 void *private)
 199 {
 200         pte_t *orig_pte;
 201         pte_t *pte;
 202         spinlock_t *ptl;
 203
 204         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 205         do {
 206                 struct page *page;
 207                 unsigned int nid;
 208
 209                 if (!pte_present(*pte))
 210                         continue;
 211                 page = vm_normal_page(vma, addr, *pte);
 212                 if (!page)
 213                         continue;
 214                 /*
 215                  * The check for PageReserved here is important to avoid
 216                  * handling zero pages and other pages that may have been
 217                  * marked special by the system.
 218                  *
 219                  * If the PageReserved would not be checked here then f.e.
 220                  * the location of the zero page could have an influence
 221                  * on MPOL_MF_STRICT, zero pages would be counted for
 222                  * the per node stats, and there would be useless attempts
 223                  * to put zero pages on the migration list.
 224                  */
 225                 if (PageReserved(page))
 226                         continue;
 227                 nid = page_to_nid(page);
 228                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 229                         continue;
 230
 231                 if (flags & MPOL_MF_STATS)
 232                         gather_stats(page, private);
 233                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 234                         migrate_page_add(page, private, flags);
 235                 else
 236                         break;
 237         } while (pte++, addr += PAGE_SIZE, addr != end);
 238         pte_unmap_unlock(orig_pte, ptl);
 239         return addr != end;
 240 }
 241
 242 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 243                 unsigned long addr, unsigned long end,
 244                 const nodemask_t *nodes, unsigned long flags,
 245                 void *private)
 246 {
 247         pmd_t *pmd;
 248         unsigned long next;
 249
 250         pmd = pmd_offset(pud, addr);
 251         do {
 252                 next = pmd_addr_end(addr, end);
 253                 if (pmd_none_or_clear_bad(pmd))
 254                         continue;
 255                 if (check_pte_range(vma, pmd, addr, next, nodes,
 256                                     flags, private))
 257                         return -EIO;
 258         } while (pmd++, addr = next, addr != end);
 259         return 0;
 260 }
 261
 262 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 263                 unsigned long addr, unsigned long end,
 264                 const nodemask_t *nodes, unsigned long flags,
 265                 void *private)
 266 {
 267         pud_t *pud;
 268         unsigned long next;
 269
 270         pud = pud_offset(pgd, addr);
 271         do {
 272                 next = pud_addr_end(addr, end);
 273                 if (pud_none_or_clear_bad(pud))
 274                         continue;
 275                 if (check_pmd_range(vma, pud, addr, next, nodes,
 276                                     flags, private))
 277                         return -EIO;
 278         } while (pud++, addr = next, addr != end);
 279         return 0;
 280 }
 281
 282 static inline int check_pgd_range(struct vm_area_struct *vma,
 283                 unsigned long addr, unsigned long end,
 284                 const nodemask_t *nodes, unsigned long flags,
 285                 void *private)
 286 {
 287         pgd_t *pgd;
 288         unsigned long next;
 289
 290         pgd = pgd_offset(vma->vm_mm, addr);
 291         do {
 292                 next = pgd_addr_end(addr, end);
 293                 if (pgd_none_or_clear_bad(pgd))
 294                         continue;
 295                 if (check_pud_range(vma, pgd, addr, next, nodes,
 296                                     flags, private))
 297                         return -EIO;
 298         } while (pgd++, addr = next, addr != end);
 299         return 0;
 300 }
 301
 302 /* Check if a vma is migratable */
 303 static inline int vma_migratable(struct vm_area_struct *vma)
 304 {
 305         if (vma->vm_flags & (
 306                 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
 307                 return 0;
 308         return 1;
 309 }
 310
 311 /*
 312  * Check if all pages in a range are on a set of nodes.
 313  * If pagelist != NULL then isolate pages from the LRU and
 314  * put them on the pagelist.
 315  */
 316 static struct vm_area_struct *
 317 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 318                 const nodemask_t *nodes, unsigned long flags, void *private)
 319 {
 320         int err;
 321         struct vm_area_struct *first, *vma, *prev;
 322
 323         /* Clear the LRU lists so pages can be isolated */
 324         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 325                 lru_add_drain_all();
 326
 327         first = find_vma(mm, start);
 328         if (!first)
 329                 return ERR_PTR(-EFAULT);
 330         prev = NULL;
 331         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 332                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 333                         if (!vma->vm_next && vma->vm_end < end)
 334                                 return ERR_PTR(-EFAULT);
 335                         if (prev && prev->vm_end < vma->vm_start)
 336                                 return ERR_PTR(-EFAULT);
 337                 }
 338                 if (!is_vm_hugetlb_page(vma) &&
 339                     ((flags & MPOL_MF_STRICT) ||
 340                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 341                                 vma_migratable(vma)))) {
 342                         unsigned long endvma = vma->vm_end;
 343
 344                         if (endvma > end)
 345                                 endvma = end;
 346                         if (vma->vm_start > start)
 347                                 start = vma->vm_start;
 348                         err = check_pgd_range(vma, start, endvma, nodes,
 349                                                 flags, private);
 350                         if (err) {
 351                                 first = ERR_PTR(err);
 352                                 break;
 353                         }
 354                 }
 355                 prev = vma;
 356         }
 357         return first;
 358 }
 359
 360 /* Apply policy to a single VMA */
 361 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 362 {
 363         int err = 0;
 364         struct mempolicy *old = vma->vm_policy;
 365
 366         PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 367                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 368                  vma->vm_ops, vma->vm_file,
 369                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 370
 371         if (vma->vm_ops && vma->vm_ops->set_policy)
 372                 err = vma->vm_ops->set_policy(vma, new);
 373         if (!err) {
 374                 mpol_get(new);
 375                 vma->vm_policy = new;
 376                 mpol_free(old);
 377         }
 378         return err;
 379 }
 380
 381 /* Step 2: apply policy to a range and do splits. */
 382 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 383                        unsigned long end, struct mempolicy *new)
 384 {
 385         struct vm_area_struct *next;
 386         int err;
 387
 388         err = 0;
 389         for (; vma && vma->vm_start < end; vma = next) {
 390                 next = vma->vm_next;
 391                 if (vma->vm_start < start)
 392                         err = split_vma(vma->vm_mm, vma, start, 1);
 393                 if (!err && vma->vm_end > end)
 394                         err = split_vma(vma->vm_mm, vma, end, 0);
 395                 if (!err)
 396                         err = policy_vma(vma, new);
 397                 if (err)
 398                         break;
 399         }
 400         return err;
 401 }
 402
 403 static int contextualize_policy(int mode, nodemask_t *nodes)
 404 {
 405         if (!nodes)
 406                 return 0;
 407
 408         cpuset_update_task_memory_state();
 409         if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
 410                 return -EINVAL;
 411         return mpol_check_policy(mode, nodes);
 412 }
 413
 414 /* Set the process memory policy */
 415 long do_set_mempolicy(int mode, nodemask_t *nodes)
 416 {
 417         struct mempolicy *new;
 418
 419         if (contextualize_policy(mode, nodes))
 420                 return -EINVAL;
 421         new = mpol_new(mode, nodes);
 422         if (IS_ERR(new))
 423                 return PTR_ERR(new);
 424         mpol_free(current->mempolicy);
 425         current->mempolicy = new;
 426         if (new && new->policy == MPOL_INTERLEAVE)
 427                 current->il_next = first_node(new->v.nodes);
 428         return 0;
 429 }
 430
 431 /* Fill a zone bitmap for a policy */
 432 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 433 {
 434         int i;
 435
 436         nodes_clear(*nodes);
 437         switch (p->policy) {
 438         case MPOL_BIND:
 439                 for (i = 0; p->v.zonelist->zones[i]; i++)
 440                         node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
 441                                 *nodes);
 442                 break;
 443         case MPOL_DEFAULT:
 444                 break;
 445         case MPOL_INTERLEAVE:
 446                 *nodes = p->v.nodes;
 447                 break;
 448         case MPOL_PREFERRED:
 449                 /* or use current node instead of online map? */
 450                 if (p->v.preferred_node < 0)
 451                         *nodes = node_online_map;
 452                 else
 453                         node_set(p->v.preferred_node, *nodes);
 454                 break;
 455         default:
 456                 BUG();
 457         }
 458 }
 459
 460 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 461 {
 462         struct page *p;
 463         int err;
 464
 465         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 466         if (err >= 0) {
 467                 err = page_to_nid(p);
 468                 put_page(p);
 469         }
 470         return err;
 471 }
 472
 473 /* Retrieve NUMA policy */
 474 long do_get_mempolicy(int *policy, nodemask_t *nmask,
 475                         unsigned long addr, unsigned long flags)
 476 {
 477         int err;
 478         struct mm_struct *mm = current->mm;
 479         struct vm_area_struct *vma = NULL;
 480         struct mempolicy *pol = current->mempolicy;
 481
 482         cpuset_update_task_memory_state();
 483         if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 484                 return -EINVAL;
 485         if (flags & MPOL_F_ADDR) {
 486                 down_read(&mm->mmap_sem);
 487                 vma = find_vma_intersection(mm, addr, addr+1);
 488                 if (!vma) {
 489                         up_read(&mm->mmap_sem);
 490                         return -EFAULT;
 491                 }
 492                 if (vma->vm_ops && vma->vm_ops->get_policy)
 493                         pol = vma->vm_ops->get_policy(vma, addr);
 494                 else
 495                         pol = vma->vm_policy;
 496         } else if (addr)
 497                 return -EINVAL;
 498
 499         if (!pol)
 500                 pol = &default_policy;
 501
 502         if (flags & MPOL_F_NODE) {
 503                 if (flags & MPOL_F_ADDR) {
 504                         err = lookup_node(mm, addr);
 505                         if (err < 0)
 506                                 goto out;
 507                         *policy = err;
 508                 } else if (pol == current->mempolicy &&
 509                                 pol->policy == MPOL_INTERLEAVE) {
 510                         *policy = current->il_next;
 511                 } else {
 512                         err = -EINVAL;
 513                         goto out;
 514                 }
 515         } else
 516                 *policy = pol->policy;
 517
 518         if (vma) {
 519                 up_read(&current->mm->mmap_sem);
 520                 vma = NULL;
 521         }
 522
 523         err = 0;
 524         if (nmask)
 525                 get_zonemask(pol, nmask);
 526
 527  out:
 528         if (vma)
 529                 up_read(&current->mm->mmap_sem);
 530         return err;
 531 }
 532
 533 /*
 534  * page migration
 535  */
 536
 537 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 538                                 unsigned long flags)
 539 {
 540         /*
 541          * Avoid migrating a page that is shared with others.
 542          */
 543         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 544                 if (isolate_lru_page(page))
 545                         list_add(&page->lru, pagelist);
 546         }
 547 }
 548
 549 /*
 550  * Migrate the list 'pagelist' of pages to a certain destination.
 551  *
 552  * Specify destination with either non-NULL vma or dest_node >= 0
 553  * Return the number of pages not migrated or error code
 554  */
 555 static int migrate_pages_to(struct list_head *pagelist,
 556                         struct vm_area_struct *vma, int dest)
 557 {
 558         LIST_HEAD(newlist);
 559         LIST_HEAD(moved);
 560         LIST_HEAD(failed);
 561         int err = 0;
 562         int nr_pages;
 563         struct page *page;
 564         struct list_head *p;
 565
 566 redo:
 567         nr_pages = 0;
 568         list_for_each(p, pagelist) {
 569                 if (vma)
 570                         page = alloc_page_vma(GFP_HIGHUSER, vma, vma->vm_start);
 571                 else
 572                         page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
 573
 574                 if (!page) {
 575                         err = -ENOMEM;
 576                         goto out;
 577                 }
 578                 list_add(&page->lru, &newlist);
 579                 nr_pages++;
 580                 if (nr_pages > MIGRATE_CHUNK_SIZE);
 581                         break;
 582         }
 583         err = migrate_pages(pagelist, &newlist, &moved, &failed);
 584
 585         putback_lru_pages(&moved);      /* Call release pages instead ?? */
 586
 587         if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
 588                 goto redo;
 589 out:
 590         /* Return leftover allocated pages */
 591         while (!list_empty(&newlist)) {
 592                 page = list_entry(newlist.next, struct page, lru);
 593                 list_del(&page->lru);
 594                 __free_page(page);
 595         }
 596         list_splice(&failed, pagelist);
 597         if (err < 0)
 598                 return err;
 599
 600         /* Calculate number of leftover pages */
 601         nr_pages = 0;
 602         list_for_each(p, pagelist)
 603                 nr_pages++;
 604         return nr_pages;
 605 }
 606
 607 /*
 608  * Migrate pages from one node to a target node.
 609  * Returns error or the number of pages not migrated.
 610  */
 611 int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
 612 {
 613         nodemask_t nmask;
 614         LIST_HEAD(pagelist);
 615         int err = 0;
 616
 617         nodes_clear(nmask);
 618         node_set(source, nmask);
 619
 620         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 621                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 622
 623         if (!list_empty(&pagelist)) {
 624                 err = migrate_pages_to(&pagelist, NULL, dest);
 625                 if (!list_empty(&pagelist))
 626                         putback_lru_pages(&pagelist);
 627         }
 628         return err;
 629 }
 630
 631 /*
 632  * Move pages between the two nodesets so as to preserve the physical
 633  * layout as much as possible.
 634  *
 635  * Returns the number of page that could not be moved.
 636  */
 637 int do_migrate_pages(struct mm_struct *mm,
 638         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 639 {
 640         LIST_HEAD(pagelist);
 641         int busy = 0;
 642         int err = 0;
 643         nodemask_t tmp;
 644
 645         down_read(&mm->mmap_sem);
 646
 647 /*
 648  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 649  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 650  * bit in 'tmp', and return that <source, dest> pair for migration.
 651  * The pair of nodemasks 'to' and 'from' define the map.
 652  *
 653  * If no pair of bits is found that way, fallback to picking some
 654  * pair of 'source' and 'dest' bits that are not the same.  If the
 655  * 'source' and 'dest' bits are the same, this represents a node
 656  * that will be migrating to itself, so no pages need move.
 657  *
 658  * If no bits are left in 'tmp', or if all remaining bits left
 659  * in 'tmp' correspond to the same bit in 'to', return false
 660  * (nothing left to migrate).
 661  *
 662  * This lets us pick a pair of nodes to migrate between, such that
 663  * if possible the dest node is not already occupied by some other
 664  * source node, minimizing the risk of overloading the memory on a
 665  * node that would happen if we migrated incoming memory to a node
 666  * before migrating outgoing memory source that same node.
 667  *
 668  * A single scan of tmp is sufficient.  As we go, we remember the
 669  * most recent <s, d> pair that moved (s != d).  If we find a pair
 670  * that not only moved, but what's better, moved to an empty slot
 671  * (d is not set in tmp), then we break out then, with that pair.
 672  * Otherwise when we finish scannng from_tmp, we at least have the
 673  * most recent <s, d> pair that moved.  If we get all the way through
 674  * the scan of tmp without finding any node that moved, much less
 675  * moved to an empty node, then there is nothing left worth migrating.
 676  */
 677
 678         tmp = *from_nodes;
 679         while (!nodes_empty(tmp)) {
 680                 int s,d;
 681                 int source = -1;
 682                 int dest = 0;
 683
 684                 for_each_node_mask(s, tmp) {
 685                         d = node_remap(s, *from_nodes, *to_nodes);
 686                         if (s == d)
 687                                 continue;
 688
 689                         source = s;     /* Node moved. Memorize */
 690                         dest = d;
 691
 692                         /* dest not in remaining from nodes? */
 693                         if (!node_isset(dest, tmp))
 694                                 break;
 695                 }
 696                 if (source == -1)
 697                         break;
 698
 699                 node_clear(source, tmp);
 700                 err = migrate_to_node(mm, source, dest, flags);
 701                 if (err > 0)
 702                         busy += err;
 703                 if (err < 0)
 704                         break;
 705         }
 706
 707         up_read(&mm->mmap_sem);
 708         if (err < 0)
 709                 return err;
 710         return busy;
 711 }
 712
 713 long do_mbind(unsigned long start, unsigned long len,
 714                 unsigned long mode, nodemask_t *nmask, unsigned long flags)
 715 {
 716         struct vm_area_struct *vma;
 717         struct mm_struct *mm = current->mm;
 718         struct mempolicy *new;
 719         unsigned long end;
 720         int err;
 721         LIST_HEAD(pagelist);
 722
 723         if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
 724                                       MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 725             || mode > MPOL_MAX)
 726                 return -EINVAL;
 727         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
 728                 return -EPERM;
 729
 730         if (start & ~PAGE_MASK)
 731                 return -EINVAL;
 732
 733         if (mode == MPOL_DEFAULT)
 734                 flags &= ~MPOL_MF_STRICT;
 735
 736         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 737         end = start + len;
 738
 739         if (end < start)
 740                 return -EINVAL;
 741         if (end == start)
 742                 return 0;
 743
 744         if (mpol_check_policy(mode, nmask))
 745                 return -EINVAL;
 746
 747         new = mpol_new(mode, nmask);
 748         if (IS_ERR(new))
 749                 return PTR_ERR(new);
 750
 751         /*
 752          * If we are using the default policy then operation
 753          * on discontinuous address spaces is okay after all
 754          */
 755         if (!new)
 756                 flags |= MPOL_MF_DISCONTIG_OK;
 757
 758         PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 759                         mode,nodes_addr(nodes)[0]);
 760
 761         down_write(&mm->mmap_sem);
 762         vma = check_range(mm, start, end, nmask,
 763                           flags | MPOL_MF_INVERT, &pagelist);
 764
 765         err = PTR_ERR(vma);
 766         if (!IS_ERR(vma)) {
 767                 int nr_failed = 0;
 768
 769                 err = mbind_range(vma, start, end, new);
 770
 771                 if (!list_empty(&pagelist))
 772                         nr_failed = migrate_pages_to(&pagelist, vma, -1);
 773
 774                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 775                         err = -EIO;
 776         }
 777         if (!list_empty(&pagelist))
 778                 putback_lru_pages(&pagelist);
 779
 780         up_write(&mm->mmap_sem);
 781         mpol_free(new);
 782         return err;
 783 }
 784
 785 /*
 786  * User space interface with variable sized bitmaps for nodelists.
 787  */
 788
 789 /* Copy a node mask from user space. */
 790 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 791                      unsigned long maxnode)
 792 {
 793         unsigned long k;
 794         unsigned long nlongs;
 795         unsigned long endmask;
 796
 797         --maxnode;
 798         nodes_clear(*nodes);
 799         if (maxnode == 0 || !nmask)
 800                 return 0;
 801
 802         nlongs = BITS_TO_LONGS(maxnode);
 803         if ((maxnode % BITS_PER_LONG) == 0)
 804                 endmask = ~0UL;
 805         else
 806                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 807
 808         /* When the user specified more nodes than supported just check
 809            if the non supported part is all zero. */
 810         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 811                 if (nlongs > PAGE_SIZE/sizeof(long))
 812                         return -EINVAL;
 813                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 814                         unsigned long t;
 815                         if (get_user(t, nmask + k))
 816                                 return -EFAULT;
 817                         if (k == nlongs - 1) {
 818                                 if (t & endmask)
 819                                         return -EINVAL;
 820                         } else if (t)
 821                                 return -EINVAL;
 822                 }
 823                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 824                 endmask = ~0UL;
 825         }
 826
 827         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 828                 return -EFAULT;
 829         nodes_addr(*nodes)[nlongs-1] &= endmask;
 830         return 0;
 831 }
 832
 833 /* Copy a kernel node mask to user space */
 834 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 835                               nodemask_t *nodes)
 836 {
 837         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 838         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 839
 840         if (copy > nbytes) {
 841                 if (copy > PAGE_SIZE)
 842                         return -EINVAL;
 843                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 844                         return -EFAULT;
 845                 copy = nbytes;
 846         }
 847         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 848 }
 849
 850 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 851                         unsigned long mode,
 852                         unsigned long __user *nmask, unsigned long maxnode,
 853                         unsigned flags)
 854 {
 855         nodemask_t nodes;
 856         int err;
 857
 858         err = get_nodes(&nodes, nmask, maxnode);
 859         if (err)
 860                 return err;
 861         return do_mbind(start, len, mode, &nodes, flags);
 862 }
 863
 864 /* Set the process memory policy */
 865 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 866                 unsigned long maxnode)
 867 {
 868         int err;
 869         nodemask_t nodes;
 870
 871         if (mode < 0 || mode > MPOL_MAX)
 872                 return -EINVAL;
 873         err = get_nodes(&nodes, nmask, maxnode);
 874         if (err)
 875                 return err;
 876         return do_set_mempolicy(mode, &nodes);
 877 }
 878
 879 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 880                 const unsigned long __user *old_nodes,
 881                 const unsigned long __user *new_nodes)
 882 {
 883         struct mm_struct *mm;
 884         struct task_struct *task;
 885         nodemask_t old;
 886         nodemask_t new;
 887         nodemask_t task_nodes;
 888         int err;
 889
 890         err = get_nodes(&old, old_nodes, maxnode);
 891         if (err)
 892                 return err;
 893
 894         err = get_nodes(&new, new_nodes, maxnode);
 895         if (err)
 896                 return err;
 897
 898         /* Find the mm_struct */
 899         read_lock(&tasklist_lock);
 900         task = pid ? find_task_by_pid(pid) : current;
 901         if (!task) {
 902                 read_unlock(&tasklist_lock);
 903                 return -ESRCH;
 904         }
 905         mm = get_task_mm(task);
 906         read_unlock(&tasklist_lock);
 907
 908         if (!mm)
 909                 return -EINVAL;
 910
 911         /*
 912          * Check if this process has the right to modify the specified
 913          * process. The right exists if the process has administrative
 914          * capabilities, superuser priviledges or the same
 915          * userid as the target process.
 916          */
 917         if ((current->euid != task->suid) && (current->euid != task->uid) &&
 918             (current->uid != task->suid) && (current->uid != task->uid) &&
 919             !capable(CAP_SYS_ADMIN)) {
 920                 err = -EPERM;
 921                 goto out;
 922         }
 923
 924         task_nodes = cpuset_mems_allowed(task);
 925         /* Is the user allowed to access the target nodes? */
 926         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
 927                 err = -EPERM;
 928                 goto out;
 929         }
 930
 931         err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
 932 out:
 933         mmput(mm);
 934         return err;
 935 }
 936
 937
 938 /* Retrieve NUMA policy */
 939 asmlinkage long sys_get_mempolicy(int __user *policy,
 940                                 unsigned long __user *nmask,
 941                                 unsigned long maxnode,
 942                                 unsigned long addr, unsigned long flags)
 943 {
 944         int err, pval;
 945         nodemask_t nodes;
 946
 947         if (nmask != NULL && maxnode < MAX_NUMNODES)
 948                 return -EINVAL;
 949
 950         err = do_get_mempolicy(&pval, &nodes, addr, flags);
 951
 952         if (err)
 953                 return err;
 954
 955         if (policy && put_user(pval, policy))
 956                 return -EFAULT;
 957
 958         if (nmask)
 959                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
 960
 961         return err;
 962 }
 963
 964 #ifdef CONFIG_COMPAT
 965
 966 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
 967                                      compat_ulong_t __user *nmask,
 968                                      compat_ulong_t maxnode,
 969                                      compat_ulong_t addr, compat_ulong_t flags)
 970 {
 971         long err;
 972         unsigned long __user *nm = NULL;
 973         unsigned long nr_bits, alloc_size;
 974         DECLARE_BITMAP(bm, MAX_NUMNODES);
 975
 976         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 977         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 978
 979         if (nmask)
 980                 nm = compat_alloc_user_space(alloc_size);
 981
 982         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
 983
 984         if (!err && nmask) {
 985                 err = copy_from_user(bm, nm, alloc_size);
 986                 /* ensure entire bitmap is zeroed */
 987                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
 988                 err |= compat_put_bitmap(nmask, bm, nr_bits);
 989         }
 990
 991         return err;
 992 }
 993
 994 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
 995                                      compat_ulong_t maxnode)
 996 {
 997         long err = 0;
 998         unsigned long __user *nm = NULL;
 999         unsigned long nr_bits, alloc_size;
1000         DECLARE_BITMAP(bm, MAX_NUMNODES);
1001
1002         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1003         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1004
1005         if (nmask) {
1006                 err = compat_get_bitmap(bm, nmask, nr_bits);
1007                 nm = compat_alloc_user_space(alloc_size);
1008                 err |= copy_to_user(nm, bm, alloc_size);
1009         }
1010
1011         if (err)
1012                 return -EFAULT;
1013
1014         return sys_set_mempolicy(mode, nm, nr_bits+1);
1015 }
1016
1017 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1018                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1019                              compat_ulong_t maxnode, compat_ulong_t flags)
1020 {
1021         long err = 0;
1022         unsigned long __user *nm = NULL;
1023         unsigned long nr_bits, alloc_size;
1024         nodemask_t bm;
1025
1026         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1027         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1028
1029         if (nmask) {
1030                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1031                 nm = compat_alloc_user_space(alloc_size);
1032                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1033         }
1034
1035         if (err)
1036                 return -EFAULT;
1037
1038         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1039 }
1040
1041 #endif
1042
1043 /* Return effective policy for a VMA */
1044 static struct mempolicy * get_vma_policy(struct task_struct *task,
1045                 struct vm_area_struct *vma, unsigned long addr)
1046 {
1047         struct mempolicy *pol = task->mempolicy;
1048
1049         if (vma) {
1050                 if (vma->vm_ops && vma->vm_ops->get_policy)
1051                         pol = vma->vm_ops->get_policy(vma, addr);
1052                 else if (vma->vm_policy &&
1053                                 vma->vm_policy->policy != MPOL_DEFAULT)
1054                         pol = vma->vm_policy;
1055         }
1056         if (!pol)
1057                 pol = &default_policy;
1058         return pol;
1059 }
1060
1061 /* Return a zonelist representing a mempolicy */
1062 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1063 {
1064         int nd;
1065
1066         switch (policy->policy) {
1067         case MPOL_PREFERRED:
1068                 nd = policy->v.preferred_node;
1069                 if (nd < 0)
1070                         nd = numa_node_id();
1071                 break;
1072         case MPOL_BIND:
1073                 /* Lower zones don't get a policy applied */
1074                 /* Careful: current->mems_allowed might have moved */
1075                 if (gfp_zone(gfp) >= policy_zone)
1076                         if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1077                                 return policy->v.zonelist;
1078                 /*FALL THROUGH*/
1079         case MPOL_INTERLEAVE: /* should not happen */
1080         case MPOL_DEFAULT:
1081                 nd = numa_node_id();
1082                 break;
1083         default:
1084                 nd = 0;
1085                 BUG();
1086         }
1087         return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1088 }
1089
1090 /* Do dynamic interleaving for a process */
1091 static unsigned interleave_nodes(struct mempolicy *policy)
1092 {
1093         unsigned nid, next;
1094         struct task_struct *me = current;
1095
1096         nid = me->il_next;
1097         next = next_node(nid, policy->v.nodes);
1098         if (next >= MAX_NUMNODES)
1099                 next = first_node(policy->v.nodes);
1100         me->il_next = next;
1101         return nid;
1102 }
1103
1104 /*
1105  * Depending on the memory policy provide a node from which to allocate the
1106  * next slab entry.
1107  */
1108 unsigned slab_node(struct mempolicy *policy)
1109 {
1110         switch (policy->policy) {
1111         case MPOL_INTERLEAVE:
1112                 return interleave_nodes(policy);
1113
1114         case MPOL_BIND:
1115                 /*
1116                  * Follow bind policy behavior and start allocation at the
1117                  * first node.
1118                  */
1119                 return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
1120
1121         case MPOL_PREFERRED:
1122                 if (policy->v.preferred_node >= 0)
1123                         return policy->v.preferred_node;
1124                 /* Fall through */
1125
1126         default:
1127                 return numa_node_id();
1128         }
1129 }
1130
1131 /* Do static interleaving for a VMA with known offset. */
1132 static unsigned offset_il_node(struct mempolicy *pol,
1133                 struct vm_area_struct *vma, unsigned long off)
1134 {
1135         unsigned nnodes = nodes_weight(pol->v.nodes);
1136         unsigned target = (unsigned)off % nnodes;
1137         int c;
1138         int nid = -1;
1139
1140         c = 0;
1141         do {
1142                 nid = next_node(nid, pol->v.nodes);
1143                 c++;
1144         } while (c <= target);
1145         return nid;
1146 }
1147
1148 /* Determine a node number for interleave */
1149 static inline unsigned interleave_nid(struct mempolicy *pol,
1150                  struct vm_area_struct *vma, unsigned long addr, int shift)
1151 {
1152         if (vma) {
1153                 unsigned long off;
1154
1155                 off = vma->vm_pgoff;
1156                 off += (addr - vma->vm_start) >> shift;
1157                 return offset_il_node(pol, vma, off);
1158         } else
1159                 return interleave_nodes(pol);
1160 }
1161
1162 #ifdef CONFIG_HUGETLBFS
1163 /* Return a zonelist suitable for a huge page allocation. */
1164 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1165 {
1166         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1167
1168         if (pol->policy == MPOL_INTERLEAVE) {
1169                 unsigned nid;
1170
1171                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1172                 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1173         }
1174         return zonelist_policy(GFP_HIGHUSER, pol);
1175 }
1176 #endif
1177
1178 /* Allocate a page in interleaved policy.
1179    Own path because it needs to do special accounting. */
1180 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1181                                         unsigned nid)
1182 {
1183         struct zonelist *zl;
1184         struct page *page;
1185
1186         zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1187         page = __alloc_pages(gfp, order, zl);
1188         if (page && page_zone(page) == zl->zones[0]) {
1189                 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
1190                 put_cpu();
1191         }
1192         return page;
1193 }
1194
1195 /**
1196  *      alloc_page_vma  - Allocate a page for a VMA.
1197  *
1198  *      @gfp:
1199  *      %GFP_USER    user allocation.
1200  *      %GFP_KERNEL  kernel allocations,
1201  *      %GFP_HIGHMEM highmem/user allocations,
1202  *      %GFP_FS      allocation should not call back into a file system.
1203  *      %GFP_ATOMIC  don't sleep.
1204  *
1205  *      @vma:  Pointer to VMA or NULL if not available.
1206  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1207  *
1208  *      This function allocates a page from the kernel page pool and applies
1209  *      a NUMA policy associated with the VMA or the current process.
1210  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1211  *      mm_struct of the VMA to prevent it from going away. Should be used for
1212  *      all allocations for pages that will be mapped into
1213  *      user space. Returns NULL when no page can be allocated.
1214  *
1215  *      Should be called with the mm_sem of the vma hold.
1216  */
1217 struct page *
1218 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1219 {
1220         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1221
1222         cpuset_update_task_memory_state();
1223
1224         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1225                 unsigned nid;
1226
1227                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1228                 return alloc_page_interleave(gfp, 0, nid);
1229         }
1230         return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1231 }
1232
1233 /**
1234  *      alloc_pages_current - Allocate pages.
1235  *
1236  *      @gfp:
1237  *              %GFP_USER   user allocation,
1238  *              %GFP_KERNEL kernel allocation,
1239  *              %GFP_HIGHMEM highmem allocation,
1240  *              %GFP_FS     don't call back into a file system.
1241  *              %GFP_ATOMIC don't sleep.
1242  *      @order: Power of two of allocation size in pages. 0 is a single page.
1243  *
1244  *      Allocate a page from the kernel page pool.  When not in
1245  *      interrupt context and apply the current process NUMA policy.
1246  *      Returns NULL when no page can be allocated.
1247  *
1248  *      Don't call cpuset_update_task_memory_state() unless
1249  *      1) it's ok to take cpuset_sem (can WAIT), and
1250  *      2) allocating for current task (not interrupt).
1251  */
1252 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1253 {
1254         struct mempolicy *pol = current->mempolicy;
1255
1256         if ((gfp & __GFP_WAIT) && !in_interrupt())
1257                 cpuset_update_task_memory_state();
1258         if (!pol || in_interrupt())
1259                 pol = &default_policy;
1260         if (pol->policy == MPOL_INTERLEAVE)
1261                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1262         return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1263 }
1264 EXPORT_SYMBOL(alloc_pages_current);
1265
1266 /*
1267  * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
1268  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
1269  * with the mems_allowed returned by cpuset_mems_allowed().  This
1270  * keeps mempolicies cpuset relative after its cpuset moves.  See
1271  * further kernel/cpuset.c update_nodemask().
1272  */
1273 void *cpuset_being_rebound;
1274
1275 /* Slow path of a mempolicy copy */
1276 struct mempolicy *__mpol_copy(struct mempolicy *old)
1277 {
1278         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1279
1280         if (!new)
1281                 return ERR_PTR(-ENOMEM);
1282         if (current_cpuset_is_being_rebound()) {
1283                 nodemask_t mems = cpuset_mems_allowed(current);
1284                 mpol_rebind_policy(old, &mems);
1285         }
1286         *new = *old;
1287         atomic_set(&new->refcnt, 1);
1288         if (new->policy == MPOL_BIND) {
1289                 int sz = ksize(old->v.zonelist);
1290                 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
1291                 if (!new->v.zonelist) {
1292                         kmem_cache_free(policy_cache, new);
1293                         return ERR_PTR(-ENOMEM);
1294                 }
1295                 memcpy(new->v.zonelist, old->v.zonelist, sz);
1296         }
1297         return new;
1298 }
1299
1300 /* Slow path of a mempolicy comparison */
1301 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1302 {
1303         if (!a || !b)
1304                 return 0;
1305         if (a->policy != b->policy)
1306                 return 0;
1307         switch (a->policy) {
1308         case MPOL_DEFAULT:
1309                 return 1;
1310         case MPOL_INTERLEAVE:
1311                 return nodes_equal(a->v.nodes, b->v.nodes);
1312         case MPOL_PREFERRED:
1313                 return a->v.preferred_node == b->v.preferred_node;
1314         case MPOL_BIND: {
1315                 int i;
1316                 for (i = 0; a->v.zonelist->zones[i]; i++)
1317                         if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1318                                 return 0;
1319                 return b->v.zonelist->zones[i] == NULL;
1320         }
1321         default:
1322                 BUG();
1323                 return 0;
1324         }
1325 }
1326
1327 /* Slow path of a mpol destructor. */
1328 void __mpol_free(struct mempolicy *p)
1329 {
1330         if (!atomic_dec_and_test(&p->refcnt))
1331                 return;
1332         if (p->policy == MPOL_BIND)
1333                 kfree(p->v.zonelist);
1334         p->policy = MPOL_DEFAULT;
1335         kmem_cache_free(policy_cache, p);
1336 }
1337
1338 /*
1339  * Shared memory backing store policy support.
1340  *
1341  * Remember policies even when nobody has shared memory mapped.
1342  * The policies are kept in Red-Black tree linked from the inode.
1343  * They are protected by the sp->lock spinlock, which should be held
1344  * for any accesses to the tree.
1345  */
1346
1347 /* lookup first element intersecting start-end */
1348 /* Caller holds sp->lock */
1349 static struct sp_node *
1350 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1351 {
1352         struct rb_node *n = sp->root.rb_node;
1353
1354         while (n) {
1355                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1356
1357                 if (start >= p->end)
1358                         n = n->rb_right;
1359                 else if (end <= p->start)
1360                         n = n->rb_left;
1361                 else
1362                         break;
1363         }
1364         if (!n)
1365                 return NULL;
1366         for (;;) {
1367                 struct sp_node *w = NULL;
1368                 struct rb_node *prev = rb_prev(n);
1369                 if (!prev)
1370                         break;
1371                 w = rb_entry(prev, struct sp_node, nd);
1372                 if (w->end <= start)
1373                         break;
1374                 n = prev;
1375         }
1376         return rb_entry(n, struct sp_node, nd);
1377 }
1378
1379 /* Insert a new shared policy into the list. */
1380 /* Caller holds sp->lock */
1381 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1382 {
1383         struct rb_node **p = &sp->root.rb_node;
1384         struct rb_node *parent = NULL;
1385         struct sp_node *nd;
1386
1387         while (*p) {
1388                 parent = *p;
1389                 nd = rb_entry(parent, struct sp_node, nd);
1390                 if (new->start < nd->start)
1391                         p = &(*p)->rb_left;
1392                 else if (new->end > nd->end)
1393                         p = &(*p)->rb_right;
1394                 else
1395                         BUG();
1396         }
1397         rb_link_node(&new->nd, parent, p);
1398         rb_insert_color(&new->nd, &sp->root);
1399         PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1400                  new->policy ? new->policy->policy : 0);
1401 }
1402
1403 /* Find shared policy intersecting idx */
1404 struct mempolicy *
1405 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1406 {
1407         struct mempolicy *pol = NULL;
1408         struct sp_node *sn;
1409
1410         if (!sp->root.rb_node)
1411                 return NULL;
1412         spin_lock(&sp->lock);
1413         sn = sp_lookup(sp, idx, idx+1);
1414         if (sn) {
1415                 mpol_get(sn->policy);
1416                 pol = sn->policy;
1417         }
1418         spin_unlock(&sp->lock);
1419         return pol;
1420 }
1421
1422 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1423 {
1424         PDprintk("deleting %lx-l%x\n", n->start, n->end);
1425         rb_erase(&n->nd, &sp->root);
1426         mpol_free(n->policy);
1427         kmem_cache_free(sn_cache, n);
1428 }
1429
1430 struct sp_node *
1431 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1432 {
1433         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1434
1435         if (!n)
1436                 return NULL;
1437         n->start = start;
1438         n->end = end;
1439         mpol_get(pol);
1440         n->policy = pol;
1441         return n;
1442 }
1443
1444 /* Replace a policy range. */
1445 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1446                                  unsigned long end, struct sp_node *new)
1447 {
1448         struct sp_node *n, *new2 = NULL;
1449
1450 restart:
1451         spin_lock(&sp->lock);
1452         n = sp_lookup(sp, start, end);
1453         /* Take care of old policies in the same range. */
1454         while (n && n->start < end) {
1455                 struct rb_node *next = rb_next(&n->nd);
1456                 if (n->start >= start) {
1457                         if (n->end <= end)
1458                                 sp_delete(sp, n);
1459                         else
1460                                 n->start = end;
1461                 } else {
1462                         /* Old policy spanning whole new range. */
1463                         if (n->end > end) {
1464                                 if (!new2) {
1465                                         spin_unlock(&sp->lock);
1466                                         new2 = sp_alloc(end, n->end, n->policy);
1467                                         if (!new2)
1468                                                 return -ENOMEM;
1469                                         goto restart;
1470                                 }
1471                                 n->end = start;
1472                                 sp_insert(sp, new2);
1473                                 new2 = NULL;
1474                                 break;
1475                         } else
1476                                 n->end = start;
1477                 }
1478                 if (!next)
1479                         break;
1480                 n = rb_entry(next, struct sp_node, nd);
1481         }
1482         if (new)
1483                 sp_insert(sp, new);
1484         spin_unlock(&sp->lock);
1485         if (new2) {
1486                 mpol_free(new2->policy);
1487                 kmem_cache_free(sn_cache, new2);
1488         }
1489         return 0;
1490 }
1491
1492 void mpol_shared_policy_init(struct shared_policy *info, int policy,
1493                                 nodemask_t *policy_nodes)
1494 {
1495         info->root = RB_ROOT;
1496         spin_lock_init(&info->lock);
1497
1498         if (policy != MPOL_DEFAULT) {
1499                 struct mempolicy *newpol;
1500
1501                 /* Falls back to MPOL_DEFAULT on any error */
1502                 newpol = mpol_new(policy, policy_nodes);
1503                 if (!IS_ERR(newpol)) {
1504                         /* Create pseudo-vma that contains just the policy */
1505                         struct vm_area_struct pvma;
1506
1507                         memset(&pvma, 0, sizeof(struct vm_area_struct));
1508                         /* Policy covers entire file */
1509                         pvma.vm_end = TASK_SIZE;
1510                         mpol_set_shared_policy(info, &pvma, newpol);
1511                         mpol_free(newpol);
1512                 }
1513         }
1514 }
1515
1516 int mpol_set_shared_policy(struct shared_policy *info,
1517                         struct vm_area_struct *vma, struct mempolicy *npol)
1518 {
1519         int err;
1520         struct sp_node *new = NULL;
1521         unsigned long sz = vma_pages(vma);
1522
1523         PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1524                  vma->vm_pgoff,
1525                  sz, npol? npol->policy : -1,
1526                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1527
1528         if (npol) {
1529                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1530                 if (!new)
1531                         return -ENOMEM;
1532         }
1533         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1534         if (err && new)
1535                 kmem_cache_free(sn_cache, new);
1536         return err;
1537 }
1538
1539 /* Free a backing policy store on inode delete. */
1540 void mpol_free_shared_policy(struct shared_policy *p)
1541 {
1542         struct sp_node *n;
1543         struct rb_node *next;
1544
1545         if (!p->root.rb_node)
1546                 return;
1547         spin_lock(&p->lock);
1548         next = rb_first(&p->root);
1549         while (next) {
1550                 n = rb_entry(next, struct sp_node, nd);
1551                 next = rb_next(&n->nd);
1552                 rb_erase(&n->nd, &p->root);
1553                 mpol_free(n->policy);
1554                 kmem_cache_free(sn_cache, n);
1555         }
1556         spin_unlock(&p->lock);
1557 }
1558
1559 /* assumes fs == KERNEL_DS */
1560 void __init numa_policy_init(void)
1561 {
1562         policy_cache = kmem_cache_create("numa_policy",
1563                                          sizeof(struct mempolicy),
1564                                          0, SLAB_PANIC, NULL, NULL);
1565
1566         sn_cache = kmem_cache_create("shared_policy_node",
1567                                      sizeof(struct sp_node),
1568                                      0, SLAB_PANIC, NULL, NULL);
1569
1570         /* Set interleaving policy for system init. This way not all
1571            the data structures allocated at system boot end up in node zero. */
1572
1573         if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1574                 printk("numa_policy_init: interleaving failed\n");
1575 }
1576
1577 /* Reset policy of current process to default */
1578 void numa_default_policy(void)
1579 {
1580         do_set_mempolicy(MPOL_DEFAULT, NULL);
1581 }
1582
1583 /* Migrate a policy to a different set of nodes */
1584 void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1585 {
1586         nodemask_t *mpolmask;
1587         nodemask_t tmp;
1588
1589         if (!pol)
1590                 return;
1591         mpolmask = &pol->cpuset_mems_allowed;
1592         if (nodes_equal(*mpolmask, *newmask))
1593                 return;
1594
1595         switch (pol->policy) {
1596         case MPOL_DEFAULT:
1597                 break;
1598         case MPOL_INTERLEAVE:
1599                 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1600                 pol->v.nodes = tmp;
1601                 *mpolmask = *newmask;
1602                 current->il_next = node_remap(current->il_next,
1603                                                 *mpolmask, *newmask);
1604                 break;
1605         case MPOL_PREFERRED:
1606                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1607                                                 *mpolmask, *newmask);
1608                 *mpolmask = *newmask;
1609                 break;
1610         case MPOL_BIND: {
1611                 nodemask_t nodes;
1612                 struct zone **z;
1613                 struct zonelist *zonelist;
1614
1615                 nodes_clear(nodes);
1616                 for (z = pol->v.zonelist->zones; *z; z++)
1617                         node_set((*z)->zone_pgdat->node_id, nodes);
1618                 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1619                 nodes = tmp;
1620
1621                 zonelist = bind_zonelist(&nodes);
1622
1623                 /* If no mem, then zonelist is NULL and we keep old zonelist.
1624                  * If that old zonelist has no remaining mems_allowed nodes,
1625                  * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1626                  */
1627
1628                 if (zonelist) {
1629                         /* Good - got mem - substitute new zonelist */
1630                         kfree(pol->v.zonelist);
1631                         pol->v.zonelist = zonelist;
1632                 }
1633                 *mpolmask = *newmask;
1634                 break;
1635         }
1636         default:
1637                 BUG();
1638                 break;
1639         }
1640 }
1641
1642 /*
1643  * Wrapper for mpol_rebind_policy() that just requires task
1644  * pointer, and updates task mempolicy.
1645  */
1646
1647 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1648 {
1649         mpol_rebind_policy(tsk->mempolicy, new);
1650 }
1651
1652 /*
1653  * Rebind each vma in mm to new nodemask.
1654  *
1655  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1656  */
1657
1658 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1659 {
1660         struct vm_area_struct *vma;
1661
1662         down_write(&mm->mmap_sem);
1663         for (vma = mm->mmap; vma; vma = vma->vm_next)
1664                 mpol_rebind_policy(vma->vm_policy, new);
1665         up_write(&mm->mmap_sem);
1666 }
1667
1668 /*
1669  * Display pages allocated per node and memory policy via /proc.
1670  */
1671
1672 static const char *policy_types[] = { "default", "prefer", "bind",
1673                                       "interleave" };
1674
1675 /*
1676  * Convert a mempolicy into a string.
1677  * Returns the number of characters in buffer (if positive)
1678  * or an error (negative)
1679  */
1680 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1681 {
1682         char *p = buffer;
1683         int l;
1684         nodemask_t nodes;
1685         int mode = pol ? pol->policy : MPOL_DEFAULT;
1686
1687         switch (mode) {
1688         case MPOL_DEFAULT:
1689                 nodes_clear(nodes);
1690                 break;
1691
1692         case MPOL_PREFERRED:
1693                 nodes_clear(nodes);
1694                 node_set(pol->v.preferred_node, nodes);
1695                 break;
1696
1697         case MPOL_BIND:
1698                 get_zonemask(pol, &nodes);
1699                 break;
1700
1701         case MPOL_INTERLEAVE:
1702                 nodes = pol->v.nodes;
1703                 break;
1704
1705         default:
1706                 BUG();
1707                 return -EFAULT;
1708         }
1709
1710         l = strlen(policy_types[mode]);
1711         if (buffer + maxlen < p + l + 1)
1712                 return -ENOSPC;
1713
1714         strcpy(p, policy_types[mode]);
1715         p += l;
1716
1717         if (!nodes_empty(nodes)) {
1718                 if (buffer + maxlen < p + 2)
1719                         return -ENOSPC;
1720                 *p++ = '=';
1721                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1722         }
1723         return p - buffer;
1724 }
1725
1726 struct numa_maps {
1727         unsigned long pages;
1728         unsigned long anon;
1729         unsigned long mapped;
1730         unsigned long mapcount_max;
1731         unsigned long node[MAX_NUMNODES];
1732 };
1733
1734 static void gather_stats(struct page *page, void *private)
1735 {
1736         struct numa_maps *md = private;
1737         int count = page_mapcount(page);
1738
1739         if (count)
1740                 md->mapped++;
1741
1742         if (count > md->mapcount_max)
1743                 md->mapcount_max = count;
1744
1745         md->pages++;
1746
1747         if (PageAnon(page))
1748                 md->anon++;
1749
1750         md->node[page_to_nid(page)]++;
1751         cond_resched();
1752 }
1753
1754 int show_numa_map(struct seq_file *m, void *v)
1755 {
1756         struct task_struct *task = m->private;
1757         struct vm_area_struct *vma = v;
1758         struct numa_maps *md;
1759         int n;
1760         char buffer[50];
1761
1762         if (!vma->vm_mm)
1763                 return 0;
1764
1765         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1766         if (!md)
1767                 return 0;
1768
1769         check_pgd_range(vma, vma->vm_start, vma->vm_end,
1770                     &node_online_map, MPOL_MF_STATS, md);
1771
1772         if (md->pages) {
1773                 mpol_to_str(buffer, sizeof(buffer),
1774                             get_vma_policy(task, vma, vma->vm_start));
1775
1776                 seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
1777                            vma->vm_start, buffer, md->pages,
1778                            md->mapped, md->mapcount_max);
1779
1780                 if (md->anon)
1781                         seq_printf(m," anon=%lu",md->anon);
1782
1783                 for_each_online_node(n)
1784                         if (md->node[n])
1785                                 seq_printf(m, " N%d=%lu", n, md->node[n]);
1786
1787                 seq_putc(m, '\n');
1788         }
1789         kfree(md);
1790
1791         if (m->count < m->size)
1792                 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
1793         return 0;
1794 }
1795