mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem, kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66    could replace all the switch()es with a mempolicy_ops structure.
  67 */
  68
  69 #include <linux/mempolicy.h>
  70 #include <linux/mm.h>
  71 #include <linux/highmem.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/kernel.h>
  74 #include <linux/sched.h>
  75 #include <linux/mm.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/gfp.h>
  79 #include <linux/slab.h>
  80 #include <linux/string.h>
  81 #include <linux/module.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/mempolicy.h>
  86 #include <linux/swap.h>
  87 #include <linux/seq_file.h>
  88 #include <linux/proc_fs.h>
  89 #include <linux/migrate.h>
  90
  91 #include <asm/tlbflush.h>
  92 #include <asm/uaccess.h>
  93
  94 /* Internal flags for check_range() and the page-table walkers below it */
  95 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for contiguous vmas */
  96 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
  97 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
  98
  99 static struct kmem_cache *policy_cache;	/* mpol_new() allocates struct mempolicy from here */
 100 static struct kmem_cache *sn_cache;	/* NOTE(review): not referenced in this part of the file */
 101
 102 #define PDprintk(fmt...)	/* debug print hook; expands to nothing, so its arguments are never compiled */
 103
 104 /* Highest zone. A specific allocation for a zone below that is not
 105    policied. bind_zonelist() starts its zone scan at this index. */
 106 int policy_zone = ZONE_DMA;
 107
 108 struct mempolicy default_policy = {	/* do_get_mempolicy() falls back to this when no policy is set */
 109         .refcnt = ATOMIC_INIT(1), /* never free it */
 110         .policy = MPOL_DEFAULT,
 111 };
 112
 113 /* Do sanity checking on a policy */
 114 static int mpol_check_policy(int mode, nodemask_t *nodes)
 115 {
 116         int empty = nodes_empty(*nodes);
 117
 118         switch (mode) {
 119         case MPOL_DEFAULT:
 120                 if (!empty)
 121                         return -EINVAL;
 122                 break;
 123         case MPOL_BIND:
 124         case MPOL_INTERLEAVE:
 125                 /* Preferred will only use the first bit, but allow
 126                    more for now. */
 127                 if (empty)
 128                         return -EINVAL;
 129                 break;
 130         }
 131         return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 132 }
 133
 134 /* Generate a custom zonelist for the BIND policy. */
 135 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 136 {
 137         struct zonelist *zl;
 138         int num, max, nd, k;
 139
 140         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 141         zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 142         if (!zl)
 143                 return NULL;
 144         num = 0;
 145         /* First put in the highest zones from all nodes, then all the next
 146            lower zones etc. Avoid empty zones because the memory allocator
 147            doesn't like them. If you implement node hot removal you
 148            have to fix that. */
 149         for (k = policy_zone; k >= 0; k--) {
 150                 for_each_node_mask(nd, *nodes) {
 151                         struct zone *z = &NODE_DATA(nd)->node_zones[k];
 152                         if (z->present_pages > 0)
 153                                 zl->zones[num++] = z;
 154                 }
 155         }
 156         zl->zones[num] = NULL;
 157         return zl;
 158 }
 159
 160 /* Create a new policy */
 161 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 162 {
 163         struct mempolicy *policy;
 164
 165         PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 166         if (mode == MPOL_DEFAULT)
 167                 return NULL;
 168         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 169         if (!policy)
 170                 return ERR_PTR(-ENOMEM);
 171         atomic_set(&policy->refcnt, 1);
 172         switch (mode) {
 173         case MPOL_INTERLEAVE:
 174                 policy->v.nodes = *nodes;
 175                 if (nodes_weight(*nodes) == 0) {
 176                         kmem_cache_free(policy_cache, policy);
 177                         return ERR_PTR(-EINVAL);
 178                 }
 179                 break;
 180         case MPOL_PREFERRED:
 181                 policy->v.preferred_node = first_node(*nodes);
 182                 if (policy->v.preferred_node >= MAX_NUMNODES)
 183                         policy->v.preferred_node = -1;
 184                 break;
 185         case MPOL_BIND:
 186                 policy->v.zonelist = bind_zonelist(nodes);
 187                 if (policy->v.zonelist == NULL) {
 188                         kmem_cache_free(policy_cache, policy);
 189                         return ERR_PTR(-ENOMEM);
 190                 }
 191                 break;
 192         }
 193         policy->policy = mode;
 194         policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 195         return policy;
 196 }
 197
 198 static void gather_stats(struct page *, void *, int pte_dirty);
 199 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 200                                 unsigned long flags);
 201
 202 /* Scan through pages checking if pages follow certain conditions. */
 203 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 204                 unsigned long addr, unsigned long end,
 205                 const nodemask_t *nodes, unsigned long flags,
 206                 void *private)
 207 {
 208         pte_t *orig_pte;
 209         pte_t *pte;
 210         spinlock_t *ptl;
 211
 212         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 213         do {
 214                 struct page *page;
 215                 unsigned int nid;
 216
 217                 if (!pte_present(*pte))
 218                         continue;
 219                 page = vm_normal_page(vma, addr, *pte);
 220                 if (!page)
 221                         continue;
 222                 /*
 223                  * The check for PageReserved here is important to avoid
 224                  * handling zero pages and other pages that may have been
 225                  * marked special by the system.
 226                  *
 227                  * If the PageReserved would not be checked here then f.e.
 228                  * the location of the zero page could have an influence
 229                  * on MPOL_MF_STRICT, zero pages would be counted for
 230                  * the per node stats, and there would be useless attempts
 231                  * to put zero pages on the migration list.
 232                  */
 233                 if (PageReserved(page))
 234                         continue;
 235                 nid = page_to_nid(page);
 236                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 237                         continue;
 238
 239                 if (flags & MPOL_MF_STATS)
 240                         gather_stats(page, private, pte_dirty(*pte));
 241                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 242                         migrate_page_add(page, private, flags);
 243                 else
 244                         break;
 245         } while (pte++, addr += PAGE_SIZE, addr != end);
 246         pte_unmap_unlock(orig_pte, ptl);
 247         return addr != end;
 248 }
 249
 250 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 251                 unsigned long addr, unsigned long end,
 252                 const nodemask_t *nodes, unsigned long flags,
 253                 void *private)
 254 {
 255         pmd_t *pmd;
 256         unsigned long next;
 257
 258         pmd = pmd_offset(pud, addr);
 259         do {
 260                 next = pmd_addr_end(addr, end);
 261                 if (pmd_none_or_clear_bad(pmd))
 262                         continue;
 263                 if (check_pte_range(vma, pmd, addr, next, nodes,
 264                                     flags, private))
 265                         return -EIO;
 266         } while (pmd++, addr = next, addr != end);
 267         return 0;
 268 }
 269
 270 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 271                 unsigned long addr, unsigned long end,
 272                 const nodemask_t *nodes, unsigned long flags,
 273                 void *private)
 274 {
 275         pud_t *pud;
 276         unsigned long next;
 277
 278         pud = pud_offset(pgd, addr);
 279         do {
 280                 next = pud_addr_end(addr, end);
 281                 if (pud_none_or_clear_bad(pud))
 282                         continue;
 283                 if (check_pmd_range(vma, pud, addr, next, nodes,
 284                                     flags, private))
 285                         return -EIO;
 286         } while (pud++, addr = next, addr != end);
 287         return 0;
 288 }
 289
 290 static inline int check_pgd_range(struct vm_area_struct *vma,
 291                 unsigned long addr, unsigned long end,
 292                 const nodemask_t *nodes, unsigned long flags,
 293                 void *private)
 294 {
 295         pgd_t *pgd;
 296         unsigned long next;
 297
 298         pgd = pgd_offset(vma->vm_mm, addr);
 299         do {
 300                 next = pgd_addr_end(addr, end);
 301                 if (pgd_none_or_clear_bad(pgd))
 302                         continue;
 303                 if (check_pud_range(vma, pgd, addr, next, nodes,
 304                                     flags, private))
 305                         return -EIO;
 306         } while (pgd++, addr = next, addr != end);
 307         return 0;
 308 }
 309
 310 /* Check if a vma is migratable */
 311 static inline int vma_migratable(struct vm_area_struct *vma)
 312 {
 313         if (vma->vm_flags & (
 314                 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
 315                 return 0;
 316         return 1;
 317 }
 318
 319 /*
 320  * Check if all pages in a range are on a set of nodes.
 321  * If pagelist != NULL then isolate pages from the LRU and
 322  * put them on the pagelist.
 323  */
 324 static struct vm_area_struct *
 325 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 326                 const nodemask_t *nodes, unsigned long flags, void *private)
 327 {
 328         int err;
 329         struct vm_area_struct *first, *vma, *prev;
 330
 331         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 332
 333                 err = migrate_prep();
 334                 if (err)
 335                         return ERR_PTR(err);
 336         }
 337
 338         first = find_vma(mm, start);
 339         if (!first)
 340                 return ERR_PTR(-EFAULT);
 341         prev = NULL;
 342         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 343                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 344                         if (!vma->vm_next && vma->vm_end < end)
 345                                 return ERR_PTR(-EFAULT);
 346                         if (prev && prev->vm_end < vma->vm_start)
 347                                 return ERR_PTR(-EFAULT);
 348                 }
 349                 if (!is_vm_hugetlb_page(vma) &&
 350                     ((flags & MPOL_MF_STRICT) ||
 351                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 352                                 vma_migratable(vma)))) {
 353                         unsigned long endvma = vma->vm_end;
 354
 355                         if (endvma > end)
 356                                 endvma = end;
 357                         if (vma->vm_start > start)
 358                                 start = vma->vm_start;
 359                         err = check_pgd_range(vma, start, endvma, nodes,
 360                                                 flags, private);
 361                         if (err) {
 362                                 first = ERR_PTR(err);
 363                                 break;
 364                         }
 365                 }
 366                 prev = vma;
 367         }
 368         return first;
 369 }
 370
 371 /* Apply policy to a single VMA */
 372 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 373 {
 374         int err = 0;
 375         struct mempolicy *old = vma->vm_policy;
 376
 377         PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 378                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 379                  vma->vm_ops, vma->vm_file,
 380                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 381
 382         if (vma->vm_ops && vma->vm_ops->set_policy)
 383                 err = vma->vm_ops->set_policy(vma, new);
 384         if (!err) {
 385                 mpol_get(new);
 386                 vma->vm_policy = new;
 387                 mpol_free(old);
 388         }
 389         return err;
 390 }
 391
 392 /* Step 2: apply policy to a range and do splits. */
 393 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 394                        unsigned long end, struct mempolicy *new)
 395 {
 396         struct vm_area_struct *next;
 397         int err;
 398
 399         err = 0;
 400         for (; vma && vma->vm_start < end; vma = next) {
 401                 next = vma->vm_next;
 402                 if (vma->vm_start < start)
 403                         err = split_vma(vma->vm_mm, vma, start, 1);
 404                 if (!err && vma->vm_end > end)
 405                         err = split_vma(vma->vm_mm, vma, end, 0);
 406                 if (!err)
 407                         err = policy_vma(vma, new);
 408                 if (err)
 409                         break;
 410         }
 411         return err;
 412 }
 413
 414 static int contextualize_policy(int mode, nodemask_t *nodes)
 415 {
 416         if (!nodes)
 417                 return 0;
 418
 419         cpuset_update_task_memory_state();
 420         if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
 421                 return -EINVAL;
 422         return mpol_check_policy(mode, nodes);
 423 }
 424
 425
 426 /*
 427  * Update task->flags PF_MEMPOLICY bit: set iff non-default
 428  * mempolicy.  Allows more rapid checking of this (combined perhaps
 429  * with other PF_* flag bits) on memory allocation hot code paths.
 430  *
 431  * If called from outside this file, the task 'p' should -only- be
 432  * a newly forked child not yet visible on the task list, because
 433  * manipulating the task flags of a visible task is not safe.
 434  *
 435  * The above limitation is why this routine has the funny name
 436  * mpol_fix_fork_child_flag().
 437  *
 438  * It is also safe to call this with a task pointer of current,
 439  * which the static wrapper mpol_set_task_struct_flag() does,
 440  * for use within this file.
 441  */
 442
 443 void mpol_fix_fork_child_flag(struct task_struct *p)
 444 {
 445         if (p->mempolicy)
 446                 p->flags |= PF_MEMPOLICY;
 447         else
 448                 p->flags &= ~PF_MEMPOLICY;
 449 }
 450
 451 static void mpol_set_task_struct_flag(void)
 452 {
 453         mpol_fix_fork_child_flag(current);
 454 }
 455
 456 /* Set the process memory policy */
 457 long do_set_mempolicy(int mode, nodemask_t *nodes)
 458 {
 459         struct mempolicy *new;
 460
 461         if (contextualize_policy(mode, nodes))
 462                 return -EINVAL;
 463         new = mpol_new(mode, nodes);
 464         if (IS_ERR(new))
 465                 return PTR_ERR(new);
 466         mpol_free(current->mempolicy);
 467         current->mempolicy = new;
 468         mpol_set_task_struct_flag();
 469         if (new && new->policy == MPOL_INTERLEAVE)
 470                 current->il_next = first_node(new->v.nodes);
 471         return 0;
 472 }
 473
 474 /* Fill a zone bitmap for a policy */
 475 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 476 {
 477         int i;
 478
 479         nodes_clear(*nodes);
 480         switch (p->policy) {
 481         case MPOL_BIND:
 482                 for (i = 0; p->v.zonelist->zones[i]; i++)
 483                         node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
 484                                 *nodes);
 485                 break;
 486         case MPOL_DEFAULT:
 487                 break;
 488         case MPOL_INTERLEAVE:
 489                 *nodes = p->v.nodes;
 490                 break;
 491         case MPOL_PREFERRED:
 492                 /* or use current node instead of online map? */
 493                 if (p->v.preferred_node < 0)
 494                         *nodes = node_online_map;
 495                 else
 496                         node_set(p->v.preferred_node, *nodes);
 497                 break;
 498         default:
 499                 BUG();
 500         }
 501 }
 502
 503 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 504 {
 505         struct page *p;
 506         int err;
 507
 508         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 509         if (err >= 0) {
 510                 err = page_to_nid(p);
 511                 put_page(p);
 512         }
 513         return err;
 514 }
 515
 516 /* Retrieve NUMA policy */
 517 long do_get_mempolicy(int *policy, nodemask_t *nmask,
 518                         unsigned long addr, unsigned long flags)
 519 {
 520         int err;
 521         struct mm_struct *mm = current->mm;
 522         struct vm_area_struct *vma = NULL;
 523         struct mempolicy *pol = current->mempolicy;
 524
 525         cpuset_update_task_memory_state();
 526         if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 527                 return -EINVAL;
 528         if (flags & MPOL_F_ADDR) {
 529                 down_read(&mm->mmap_sem);
 530                 vma = find_vma_intersection(mm, addr, addr+1);
 531                 if (!vma) {
 532                         up_read(&mm->mmap_sem);
 533                         return -EFAULT;
 534                 }
 535                 if (vma->vm_ops && vma->vm_ops->get_policy)
 536                         pol = vma->vm_ops->get_policy(vma, addr);
 537                 else
 538                         pol = vma->vm_policy;
 539         } else if (addr)
 540                 return -EINVAL;
 541
 542         if (!pol)
 543                 pol = &default_policy;
 544
 545         if (flags & MPOL_F_NODE) {
 546                 if (flags & MPOL_F_ADDR) {
 547                         err = lookup_node(mm, addr);
 548                         if (err < 0)
 549                                 goto out;
 550                         *policy = err;
 551                 } else if (pol == current->mempolicy &&
 552                                 pol->policy == MPOL_INTERLEAVE) {
 553                         *policy = current->il_next;
 554                 } else {
 555                         err = -EINVAL;
 556                         goto out;
 557                 }
 558         } else
 559                 *policy = pol->policy;
 560
 561         if (vma) {
 562                 up_read(&current->mm->mmap_sem);
 563                 vma = NULL;
 564         }
 565
 566         err = 0;
 567         if (nmask)
 568                 get_zonemask(pol, nmask);
 569
 570  out:
 571         if (vma)
 572                 up_read(&current->mm->mmap_sem);
 573         return err;
 574 }
 575
 576 #ifdef CONFIG_MIGRATION
 577 /*
 578  * page migration
 579  */
 580 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 581                                 unsigned long flags)
 582 {
 583         /*
 584          * Avoid migrating a page that is shared with others.
 585          */
 586         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
 587                 isolate_lru_page(page, pagelist);
 588 }
 589
 590 /*
 591  * Migrate pages from one node to a target node.
 592  * Returns error or the number of pages not migrated.
 593  */
 594 int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
 595 {
 596         nodemask_t nmask;
 597         LIST_HEAD(pagelist);
 598         int err = 0;
 599
 600         nodes_clear(nmask);
 601         node_set(source, nmask);
 602
 603         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 604                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 605
 606         if (!list_empty(&pagelist)) {
 607                 err = migrate_pages_to(&pagelist, NULL, dest);
 608                 if (!list_empty(&pagelist))
 609                         putback_lru_pages(&pagelist);
 610         }
 611         return err;
 612 }
 613
 614 /*
 615  * Move pages between the two nodesets so as to preserve the physical
 616  * layout as much as possible.
 617  *
 618  * Returns the number of page that could not be moved.
 619  */
 620 int do_migrate_pages(struct mm_struct *mm,
 621         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 622 {
 623         LIST_HEAD(pagelist);
 624         int busy = 0;
 625         int err = 0;
 626         nodemask_t tmp;
 627
 628         down_read(&mm->mmap_sem);
 629
 630 /*
 631  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 632  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 633  * bit in 'tmp', and return that <source, dest> pair for migration.
 634  * The pair of nodemasks 'to' and 'from' define the map.
 635  *
 636  * If no pair of bits is found that way, fallback to picking some
 637  * pair of 'source' and 'dest' bits that are not the same.  If the
 638  * 'source' and 'dest' bits are the same, this represents a node
 639  * that will be migrating to itself, so no pages need move.
 640  *
 641  * If no bits are left in 'tmp', or if all remaining bits left
 642  * in 'tmp' correspond to the same bit in 'to', return false
 643  * (nothing left to migrate).
 644  *
 645  * This lets us pick a pair of nodes to migrate between, such that
 646  * if possible the dest node is not already occupied by some other
 647  * source node, minimizing the risk of overloading the memory on a
 648  * node that would happen if we migrated incoming memory to a node
 649  * before migrating outgoing memory source that same node.
 650  *
 651  * A single scan of tmp is sufficient.  As we go, we remember the
 652  * most recent <s, d> pair that moved (s != d).  If we find a pair
 653  * that not only moved, but what's better, moved to an empty slot
 654  * (d is not set in tmp), then we break out then, with that pair.
 655  * Otherwise when we finish scannng from_tmp, we at least have the
 656  * most recent <s, d> pair that moved.  If we get all the way through
 657  * the scan of tmp without finding any node that moved, much less
 658  * moved to an empty node, then there is nothing left worth migrating.
 659  */
 660
 661         tmp = *from_nodes;
 662         while (!nodes_empty(tmp)) {
 663                 int s,d;
 664                 int source = -1;
 665                 int dest = 0;
 666
 667                 for_each_node_mask(s, tmp) {
 668                         d = node_remap(s, *from_nodes, *to_nodes);
 669                         if (s == d)
 670                                 continue;
 671
 672                         source = s;     /* Node moved. Memorize */
 673                         dest = d;
 674
 675                         /* dest not in remaining from nodes? */
 676                         if (!node_isset(dest, tmp))
 677                                 break;
 678                 }
 679                 if (source == -1)
 680                         break;
 681
 682                 node_clear(source, tmp);
 683                 err = migrate_to_node(mm, source, dest, flags);
 684                 if (err > 0)
 685                         busy += err;
 686                 if (err < 0)
 687                         break;
 688         }
 689
 690         up_read(&mm->mmap_sem);
 691         if (err < 0)
 692                 return err;
 693         return busy;
 694
 695 }
 696
 697 #else
 698
 699 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 700                                 unsigned long flags)
 701 {
 702 }
 703
 704 int do_migrate_pages(struct mm_struct *mm,
 705         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 706 {
 707         return -ENOSYS;
 708 }
 709 #endif
 710
 711 long do_mbind(unsigned long start, unsigned long len,
 712                 unsigned long mode, nodemask_t *nmask, unsigned long flags)
 713 {
 714         struct vm_area_struct *vma;
 715         struct mm_struct *mm = current->mm;
 716         struct mempolicy *new;
 717         unsigned long end;
 718         int err;
 719         LIST_HEAD(pagelist);
 720
 721         if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
 722                                       MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 723             || mode > MPOL_MAX)
 724                 return -EINVAL;
 725         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 726                 return -EPERM;
 727
 728         if (start & ~PAGE_MASK)
 729                 return -EINVAL;
 730
 731         if (mode == MPOL_DEFAULT)
 732                 flags &= ~MPOL_MF_STRICT;
 733
 734         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 735         end = start + len;
 736
 737         if (end < start)
 738                 return -EINVAL;
 739         if (end == start)
 740                 return 0;
 741
 742         if (mpol_check_policy(mode, nmask))
 743                 return -EINVAL;
 744
 745         new = mpol_new(mode, nmask);
 746         if (IS_ERR(new))
 747                 return PTR_ERR(new);
 748
 749         /*
 750          * If we are using the default policy then operation
 751          * on discontinuous address spaces is okay after all
 752          */
 753         if (!new)
 754                 flags |= MPOL_MF_DISCONTIG_OK;
 755
 756         PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 757                         mode,nodes_addr(nodes)[0]);
 758
 759         down_write(&mm->mmap_sem);
 760         vma = check_range(mm, start, end, nmask,
 761                           flags | MPOL_MF_INVERT, &pagelist);
 762
 763         err = PTR_ERR(vma);
 764         if (!IS_ERR(vma)) {
 765                 int nr_failed = 0;
 766
 767                 err = mbind_range(vma, start, end, new);
 768
 769                 if (!list_empty(&pagelist))
 770                         nr_failed = migrate_pages_to(&pagelist, vma, -1);
 771
 772                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 773                         err = -EIO;
 774         }
 775
 776         if (!list_empty(&pagelist))
 777                 putback_lru_pages(&pagelist);
 778
 779         up_write(&mm->mmap_sem);
 780         mpol_free(new);
 781         return err;
 782 }
 783
 784 /*
 785  * User space interface with variable sized bitmaps for nodelists.
 786  */
 787
 788 /* Copy a node mask from user space. */
 789 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 790                      unsigned long maxnode)
 791 {
 792         unsigned long k;
 793         unsigned long nlongs;
 794         unsigned long endmask;
 795
 796         --maxnode;
 797         nodes_clear(*nodes);
 798         if (maxnode == 0 || !nmask)
 799                 return 0;
 800         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
 801                 return -EINVAL;
 802
 803         nlongs = BITS_TO_LONGS(maxnode);
 804         if ((maxnode % BITS_PER_LONG) == 0)
 805                 endmask = ~0UL;
 806         else
 807                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 808
 809         /* When the user specified more nodes than supported just check
 810            if the non supported part is all zero. */
 811         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 812                 if (nlongs > PAGE_SIZE/sizeof(long))
 813                         return -EINVAL;
 814                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 815                         unsigned long t;
 816                         if (get_user(t, nmask + k))
 817                                 return -EFAULT;
 818                         if (k == nlongs - 1) {
 819                                 if (t & endmask)
 820                                         return -EINVAL;
 821                         } else if (t)
 822                                 return -EINVAL;
 823                 }
 824                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 825                 endmask = ~0UL;
 826         }
 827
 828         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 829                 return -EFAULT;
 830         nodes_addr(*nodes)[nlongs-1] &= endmask;
 831         return 0;
 832 }
 833
 834 /* Copy a kernel node mask to user space */
 835 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 836                               nodemask_t *nodes)
 837 {
 838         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 839         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 840
 841         if (copy > nbytes) {
 842                 if (copy > PAGE_SIZE)
 843                         return -EINVAL;
 844                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 845                         return -EFAULT;
 846                 copy = nbytes;
 847         }
 848         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 849 }
 850
 851 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 852                         unsigned long mode,
 853                         unsigned long __user *nmask, unsigned long maxnode,
 854                         unsigned flags)
 855 {
 856         nodemask_t nodes;
 857         int err;
 858
 859         err = get_nodes(&nodes, nmask, maxnode);
 860         if (err)
 861                 return err;
 862         return do_mbind(start, len, mode, &nodes, flags);
 863 }
 864
 865 /* Set the process memory policy */
 866 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 867                 unsigned long maxnode)
 868 {
 869         int err;
 870         nodemask_t nodes;
 871
 872         if (mode < 0 || mode > MPOL_MAX)
 873                 return -EINVAL;
 874         err = get_nodes(&nodes, nmask, maxnode);
 875         if (err)
 876                 return err;
 877         return do_set_mempolicy(mode, &nodes);
 878 }
 879
 880 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 881                 const unsigned long __user *old_nodes,
 882                 const unsigned long __user *new_nodes)
 883 {
 884         struct mm_struct *mm;
 885         struct task_struct *task;
 886         nodemask_t old;
 887         nodemask_t new;
 888         nodemask_t task_nodes;
 889         int err;
 890
 891         err = get_nodes(&old, old_nodes, maxnode);
 892         if (err)
 893                 return err;
 894
 895         err = get_nodes(&new, new_nodes, maxnode);
 896         if (err)
 897                 return err;
 898
 899         /* Find the mm_struct */
 900         read_lock(&tasklist_lock);
 901         task = pid ? find_task_by_pid(pid) : current;
 902         if (!task) {
 903                 read_unlock(&tasklist_lock);
 904                 return -ESRCH;
 905         }
 906         mm = get_task_mm(task);
 907         read_unlock(&tasklist_lock);
 908
 909         if (!mm)
 910                 return -EINVAL;
 911
 912         /*
 913          * Check if this process has the right to modify the specified
 914          * process. The right exists if the process has administrative
 915          * capabilities, superuser priviledges or the same
 916          * userid as the target process.
 917          */
 918         if ((current->euid != task->suid) && (current->euid != task->uid) &&
 919             (current->uid != task->suid) && (current->uid != task->uid) &&
 920             !capable(CAP_SYS_NICE)) {
 921                 err = -EPERM;
 922                 goto out;
 923         }
 924
 925         task_nodes = cpuset_mems_allowed(task);
 926         /* Is the user allowed to access the target nodes? */
 927         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
 928                 err = -EPERM;
 929                 goto out;
 930         }
 931
 932         err = do_migrate_pages(mm, &old, &new,
 933                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
 934 out:
 935         mmput(mm);
 936         return err;
 937 }
 938
 939
 940 /* Retrieve NUMA policy */
 941 asmlinkage long sys_get_mempolicy(int __user *policy,
 942                                 unsigned long __user *nmask,
 943                                 unsigned long maxnode,
 944                                 unsigned long addr, unsigned long flags)
 945 {
 946         int err, pval;
 947         nodemask_t nodes;
 948
 949         if (nmask != NULL && maxnode < MAX_NUMNODES)
 950                 return -EINVAL;
 951
 952         err = do_get_mempolicy(&pval, &nodes, addr, flags);
 953
 954         if (err)
 955                 return err;
 956
 957         if (policy && put_user(pval, policy))
 958                 return -EFAULT;
 959
 960         if (nmask)
 961                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
 962
 963         return err;
 964 }
 965
 966 #ifdef CONFIG_COMPAT
 967
 968 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
 969                                      compat_ulong_t __user *nmask,
 970                                      compat_ulong_t maxnode,
 971                                      compat_ulong_t addr, compat_ulong_t flags)
 972 {
 973         long err;
 974         unsigned long __user *nm = NULL;
 975         unsigned long nr_bits, alloc_size;
 976         DECLARE_BITMAP(bm, MAX_NUMNODES);
 977
 978         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 979         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 980
 981         if (nmask)
 982                 nm = compat_alloc_user_space(alloc_size);
 983
 984         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
 985
 986         if (!err && nmask) {
 987                 err = copy_from_user(bm, nm, alloc_size);
 988                 /* ensure entire bitmap is zeroed */
 989                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
 990                 err |= compat_put_bitmap(nmask, bm, nr_bits);
 991         }
 992
 993         return err;
 994 }
 995
 996 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
 997                                      compat_ulong_t maxnode)
 998 {
 999         long err = 0;
1000         unsigned long __user *nm = NULL;
1001         unsigned long nr_bits, alloc_size;
1002         DECLARE_BITMAP(bm, MAX_NUMNODES);
1003
1004         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1005         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1006
1007         if (nmask) {
1008                 err = compat_get_bitmap(bm, nmask, nr_bits);
1009                 nm = compat_alloc_user_space(alloc_size);
1010                 err |= copy_to_user(nm, bm, alloc_size);
1011         }
1012
1013         if (err)
1014                 return -EFAULT;
1015
1016         return sys_set_mempolicy(mode, nm, nr_bits+1);
1017 }
1018
1019 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1020                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1021                              compat_ulong_t maxnode, compat_ulong_t flags)
1022 {
1023         long err = 0;
1024         unsigned long __user *nm = NULL;
1025         unsigned long nr_bits, alloc_size;
1026         nodemask_t bm;
1027
1028         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1029         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1030
1031         if (nmask) {
1032                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1033                 nm = compat_alloc_user_space(alloc_size);
1034                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1035         }
1036
1037         if (err)
1038                 return -EFAULT;
1039
1040         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1041 }
1042
1043 #endif
1044
1045 /* Return effective policy for a VMA */
1046 static struct mempolicy * get_vma_policy(struct task_struct *task,
1047                 struct vm_area_struct *vma, unsigned long addr)
1048 {
1049         struct mempolicy *pol = task->mempolicy;
1050
1051         if (vma) {
1052                 if (vma->vm_ops && vma->vm_ops->get_policy)
1053                         pol = vma->vm_ops->get_policy(vma, addr);
1054                 else if (vma->vm_policy &&
1055                                 vma->vm_policy->policy != MPOL_DEFAULT)
1056                         pol = vma->vm_policy;
1057         }
1058         if (!pol)
1059                 pol = &default_policy;
1060         return pol;
1061 }
1062
1063 /* Return a zonelist representing a mempolicy */
1064 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1065 {
1066         int nd;
1067
1068         switch (policy->policy) {
1069         case MPOL_PREFERRED:
1070                 nd = policy->v.preferred_node;
1071                 if (nd < 0)
1072                         nd = numa_node_id();
1073                 break;
1074         case MPOL_BIND:
1075                 /* Lower zones don't get a policy applied */
1076                 /* Careful: current->mems_allowed might have moved */
1077                 if (gfp_zone(gfp) >= policy_zone)
1078                         if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1079                                 return policy->v.zonelist;
1080                 /*FALL THROUGH*/
1081         case MPOL_INTERLEAVE: /* should not happen */
1082         case MPOL_DEFAULT:
1083                 nd = numa_node_id();
1084                 break;
1085         default:
1086                 nd = 0;
1087                 BUG();
1088         }
1089         return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1090 }
1091
1092 /* Do dynamic interleaving for a process */
1093 static unsigned interleave_nodes(struct mempolicy *policy)
1094 {
1095         unsigned nid, next;
1096         struct task_struct *me = current;
1097
1098         nid = me->il_next;
1099         next = next_node(nid, policy->v.nodes);
1100         if (next >= MAX_NUMNODES)
1101                 next = first_node(policy->v.nodes);
1102         me->il_next = next;
1103         return nid;
1104 }
1105
1106 /*
1107  * Depending on the memory policy provide a node from which to allocate the
1108  * next slab entry.
1109  */
1110 unsigned slab_node(struct mempolicy *policy)
1111 {
1112         switch (policy->policy) {
1113         case MPOL_INTERLEAVE:
1114                 return interleave_nodes(policy);
1115
1116         case MPOL_BIND:
1117                 /*
1118                  * Follow bind policy behavior and start allocation at the
1119                  * first node.
1120                  */
1121                 return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
1122
1123         case MPOL_PREFERRED:
1124                 if (policy->v.preferred_node >= 0)
1125                         return policy->v.preferred_node;
1126                 /* Fall through */
1127
1128         default:
1129                 return numa_node_id();
1130         }
1131 }
1132
1133 /* Do static interleaving for a VMA with known offset. */
1134 static unsigned offset_il_node(struct mempolicy *pol,
1135                 struct vm_area_struct *vma, unsigned long off)
1136 {
1137         unsigned nnodes = nodes_weight(pol->v.nodes);
1138         unsigned target = (unsigned)off % nnodes;
1139         int c;
1140         int nid = -1;
1141
1142         c = 0;
1143         do {
1144                 nid = next_node(nid, pol->v.nodes);
1145                 c++;
1146         } while (c <= target);
1147         return nid;
1148 }
1149
1150 /* Determine a node number for interleave */
1151 static inline unsigned interleave_nid(struct mempolicy *pol,
1152                  struct vm_area_struct *vma, unsigned long addr, int shift)
1153 {
1154         if (vma) {
1155                 unsigned long off;
1156
1157                 off = vma->vm_pgoff;
1158                 off += (addr - vma->vm_start) >> shift;
1159                 return offset_il_node(pol, vma, off);
1160         } else
1161                 return interleave_nodes(pol);
1162 }
1163
1164 #ifdef CONFIG_HUGETLBFS
1165 /* Return a zonelist suitable for a huge page allocation. */
1166 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1167 {
1168         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1169
1170         if (pol->policy == MPOL_INTERLEAVE) {
1171                 unsigned nid;
1172
1173                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1174                 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1175         }
1176         return zonelist_policy(GFP_HIGHUSER, pol);
1177 }
1178 #endif
1179
1180 /* Allocate a page in interleaved policy.
1181    Own path because it needs to do special accounting. */
1182 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1183                                         unsigned nid)
1184 {
1185         struct zonelist *zl;
1186         struct page *page;
1187
1188         zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1189         page = __alloc_pages(gfp, order, zl);
1190         if (page && page_zone(page) == zl->zones[0]) {
1191                 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
1192                 put_cpu();
1193         }
1194         return page;
1195 }
1196
1197 /**
1198  *      alloc_page_vma  - Allocate a page for a VMA.
1199  *
1200  *      @gfp:
1201  *      %GFP_USER    user allocation.
1202  *      %GFP_KERNEL  kernel allocations,
1203  *      %GFP_HIGHMEM highmem/user allocations,
1204  *      %GFP_FS      allocation should not call back into a file system.
1205  *      %GFP_ATOMIC  don't sleep.
1206  *
1207  *      @vma:  Pointer to VMA or NULL if not available.
1208  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1209  *
1210  *      This function allocates a page from the kernel page pool and applies
1211  *      a NUMA policy associated with the VMA or the current process.
1212  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1213  *      mm_struct of the VMA to prevent it from going away. Should be used for
1214  *      all allocations for pages that will be mapped into
1215  *      user space. Returns NULL when no page can be allocated.
1216  *
1217  *      Should be called with the mm_sem of the vma hold.
1218  */
1219 struct page *
1220 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1221 {
1222         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1223
1224         cpuset_update_task_memory_state();
1225
1226         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1227                 unsigned nid;
1228
1229                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1230                 return alloc_page_interleave(gfp, 0, nid);
1231         }
1232         return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1233 }
1234
1235 /**
1236  *      alloc_pages_current - Allocate pages.
1237  *
1238  *      @gfp:
1239  *              %GFP_USER   user allocation,
1240  *              %GFP_KERNEL kernel allocation,
1241  *              %GFP_HIGHMEM highmem allocation,
1242  *              %GFP_FS     don't call back into a file system.
1243  *              %GFP_ATOMIC don't sleep.
1244  *      @order: Power of two of allocation size in pages. 0 is a single page.
1245  *
1246  *      Allocate a page from the kernel page pool.  When not in
1247  *      interrupt context and apply the current process NUMA policy.
1248  *      Returns NULL when no page can be allocated.
1249  *
1250  *      Don't call cpuset_update_task_memory_state() unless
1251  *      1) it's ok to take cpuset_sem (can WAIT), and
1252  *      2) allocating for current task (not interrupt).
1253  */
1254 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1255 {
1256         struct mempolicy *pol = current->mempolicy;
1257
1258         if ((gfp & __GFP_WAIT) && !in_interrupt())
1259                 cpuset_update_task_memory_state();
1260         if (!pol || in_interrupt())
1261                 pol = &default_policy;
1262         if (pol->policy == MPOL_INTERLEAVE)
1263                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1264         return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1265 }
1266 EXPORT_SYMBOL(alloc_pages_current);
1267
/*
 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after their cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 */
1275 void *cpuset_being_rebound;
1276
1277 /* Slow path of a mempolicy copy */
1278 struct mempolicy *__mpol_copy(struct mempolicy *old)
1279 {
1280         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1281
1282         if (!new)
1283                 return ERR_PTR(-ENOMEM);
1284         if (current_cpuset_is_being_rebound()) {
1285                 nodemask_t mems = cpuset_mems_allowed(current);
1286                 mpol_rebind_policy(old, &mems);
1287         }
1288         *new = *old;
1289         atomic_set(&new->refcnt, 1);
1290         if (new->policy == MPOL_BIND) {
1291                 int sz = ksize(old->v.zonelist);
1292                 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
1293                 if (!new->v.zonelist) {
1294                         kmem_cache_free(policy_cache, new);
1295                         return ERR_PTR(-ENOMEM);
1296                 }
1297                 memcpy(new->v.zonelist, old->v.zonelist, sz);
1298         }
1299         return new;
1300 }
1301
1302 /* Slow path of a mempolicy comparison */
1303 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1304 {
1305         if (!a || !b)
1306                 return 0;
1307         if (a->policy != b->policy)
1308                 return 0;
1309         switch (a->policy) {
1310         case MPOL_DEFAULT:
1311                 return 1;
1312         case MPOL_INTERLEAVE:
1313                 return nodes_equal(a->v.nodes, b->v.nodes);
1314         case MPOL_PREFERRED:
1315                 return a->v.preferred_node == b->v.preferred_node;
1316         case MPOL_BIND: {
1317                 int i;
1318                 for (i = 0; a->v.zonelist->zones[i]; i++)
1319                         if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1320                                 return 0;
1321                 return b->v.zonelist->zones[i] == NULL;
1322         }
1323         default:
1324                 BUG();
1325                 return 0;
1326         }
1327 }
1328
1329 /* Slow path of a mpol destructor. */
1330 void __mpol_free(struct mempolicy *p)
1331 {
1332         if (!atomic_dec_and_test(&p->refcnt))
1333                 return;
1334         if (p->policy == MPOL_BIND)
1335                 kfree(p->v.zonelist);
1336         p->policy = MPOL_DEFAULT;
1337         kmem_cache_free(policy_cache, p);
1338 }
1339
1340 /*
1341  * Shared memory backing store policy support.
1342  *
1343  * Remember policies even when nobody has shared memory mapped.
1344  * The policies are kept in Red-Black tree linked from the inode.
1345  * They are protected by the sp->lock spinlock, which should be held
1346  * for any accesses to the tree.
1347  */
1348
1349 /* lookup first element intersecting start-end */
1350 /* Caller holds sp->lock */
1351 static struct sp_node *
1352 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1353 {
1354         struct rb_node *n = sp->root.rb_node;
1355
1356         while (n) {
1357                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1358
1359                 if (start >= p->end)
1360                         n = n->rb_right;
1361                 else if (end <= p->start)
1362                         n = n->rb_left;
1363                 else
1364                         break;
1365         }
1366         if (!n)
1367                 return NULL;
1368         for (;;) {
1369                 struct sp_node *w = NULL;
1370                 struct rb_node *prev = rb_prev(n);
1371                 if (!prev)
1372                         break;
1373                 w = rb_entry(prev, struct sp_node, nd);
1374                 if (w->end <= start)
1375                         break;
1376                 n = prev;
1377         }
1378         return rb_entry(n, struct sp_node, nd);
1379 }
1380
1381 /* Insert a new shared policy into the list. */
1382 /* Caller holds sp->lock */
1383 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1384 {
1385         struct rb_node **p = &sp->root.rb_node;
1386         struct rb_node *parent = NULL;
1387         struct sp_node *nd;
1388
1389         while (*p) {
1390                 parent = *p;
1391                 nd = rb_entry(parent, struct sp_node, nd);
1392                 if (new->start < nd->start)
1393                         p = &(*p)->rb_left;
1394                 else if (new->end > nd->end)
1395                         p = &(*p)->rb_right;
1396                 else
1397                         BUG();
1398         }
1399         rb_link_node(&new->nd, parent, p);
1400         rb_insert_color(&new->nd, &sp->root);
1401         PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1402                  new->policy ? new->policy->policy : 0);
1403 }
1404
1405 /* Find shared policy intersecting idx */
1406 struct mempolicy *
1407 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1408 {
1409         struct mempolicy *pol = NULL;
1410         struct sp_node *sn;
1411
1412         if (!sp->root.rb_node)
1413                 return NULL;
1414         spin_lock(&sp->lock);
1415         sn = sp_lookup(sp, idx, idx+1);
1416         if (sn) {
1417                 mpol_get(sn->policy);
1418                 pol = sn->policy;
1419         }
1420         spin_unlock(&sp->lock);
1421         return pol;
1422 }
1423
1424 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1425 {
1426         PDprintk("deleting %lx-l%x\n", n->start, n->end);
1427         rb_erase(&n->nd, &sp->root);
1428         mpol_free(n->policy);
1429         kmem_cache_free(sn_cache, n);
1430 }
1431
1432 struct sp_node *
1433 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1434 {
1435         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1436
1437         if (!n)
1438                 return NULL;
1439         n->start = start;
1440         n->end = end;
1441         mpol_get(pol);
1442         n->policy = pol;
1443         return n;
1444 }
1445
/*
 * Replace a policy range.
 *
 * Removes or trims every existing node that overlaps [start, end) and then
 * inserts @new (if non-NULL) for that range.  An old node that spans the
 * whole new range is split in two, which needs a second node (new2); it is
 * allocated with sp->lock dropped and the scan is then restarted.
 *
 * Returns 0 on success or -ENOMEM if the split node cannot be allocated.
 * On -ENOMEM @new has not been inserted and still belongs to the caller.
 */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
                                 unsigned long end, struct sp_node *new)
{
        struct sp_node *n, *new2 = NULL;

restart:
        spin_lock(&sp->lock);
        n = sp_lookup(sp, start, end);
        /* Take care of old policies in the same range. */
        while (n && n->start < end) {
                /* Fetch the successor first: sp_delete() below frees n. */
                struct rb_node *next = rb_next(&n->nd);
                if (n->start >= start) {
                        /* Old node starts inside the new range. */
                        if (n->end <= end)
                                sp_delete(sp, n);
                        else
                                n->start = end;
                } else {
                        /* Old policy spanning whole new range. */
                        if (n->end > end) {
                                if (!new2) {
                                        /*
                                         * Allocation uses GFP_KERNEL (see
                                         * sp_alloc()), so the spinlock is
                                         * released first.
                                         * NOTE(review): n->end and n->policy
                                         * are read after the unlock - confirm
                                         * nothing can free n meanwhile.
                                         */
                                        spin_unlock(&sp->lock);
                                        new2 = sp_alloc(end, n->end, n->policy);
                                        if (!new2)
                                                return -ENOMEM;
                                        goto restart;
                                }
                                /* Keep the head in n, put the tail in new2. */
                                n->end = start;
                                sp_insert(sp, new2);
                                new2 = NULL;
                                break;
                        } else
                                n->end = start;
                }
                if (!next)
                        break;
                n = rb_entry(next, struct sp_node, nd);
        }
        if (new)
                sp_insert(sp, new);
        spin_unlock(&sp->lock);
        /* A split node allocated before a restart may have gone unused. */
        if (new2) {
                mpol_free(new2->policy);
                kmem_cache_free(sn_cache, new2);
        }
        return 0;
}
1493
1494 void mpol_shared_policy_init(struct shared_policy *info, int policy,
1495                                 nodemask_t *policy_nodes)
1496 {
1497         info->root = RB_ROOT;
1498         spin_lock_init(&info->lock);
1499
1500         if (policy != MPOL_DEFAULT) {
1501                 struct mempolicy *newpol;
1502
1503                 /* Falls back to MPOL_DEFAULT on any error */
1504                 newpol = mpol_new(policy, policy_nodes);
1505                 if (!IS_ERR(newpol)) {
1506                         /* Create pseudo-vma that contains just the policy */
1507                         struct vm_area_struct pvma;
1508
1509                         memset(&pvma, 0, sizeof(struct vm_area_struct));
1510                         /* Policy covers entire file */
1511                         pvma.vm_end = TASK_SIZE;
1512                         mpol_set_shared_policy(info, &pvma, newpol);
1513                         mpol_free(newpol);
1514                 }
1515         }
1516 }
1517
1518 int mpol_set_shared_policy(struct shared_policy *info,
1519                         struct vm_area_struct *vma, struct mempolicy *npol)
1520 {
1521         int err;
1522         struct sp_node *new = NULL;
1523         unsigned long sz = vma_pages(vma);
1524
1525         PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1526                  vma->vm_pgoff,
1527                  sz, npol? npol->policy : -1,
1528                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1529
1530         if (npol) {
1531                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1532                 if (!new)
1533                         return -ENOMEM;
1534         }
1535         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1536         if (err && new)
1537                 kmem_cache_free(sn_cache, new);
1538         return err;
1539 }
1540
1541 /* Free a backing policy store on inode delete. */
1542 void mpol_free_shared_policy(struct shared_policy *p)
1543 {
1544         struct sp_node *n;
1545         struct rb_node *next;
1546
1547         if (!p->root.rb_node)
1548                 return;
1549         spin_lock(&p->lock);
1550         next = rb_first(&p->root);
1551         while (next) {
1552                 n = rb_entry(next, struct sp_node, nd);
1553                 next = rb_next(&n->nd);
1554                 rb_erase(&n->nd, &p->root);
1555                 mpol_free(n->policy);
1556                 kmem_cache_free(sn_cache, n);
1557         }
1558         spin_unlock(&p->lock);
1559 }
1560
1561 /* assumes fs == KERNEL_DS */
1562 void __init numa_policy_init(void)
1563 {
1564         policy_cache = kmem_cache_create("numa_policy",
1565                                          sizeof(struct mempolicy),
1566                                          0, SLAB_PANIC, NULL, NULL);
1567
1568         sn_cache = kmem_cache_create("shared_policy_node",
1569                                      sizeof(struct sp_node),
1570                                      0, SLAB_PANIC, NULL, NULL);
1571
1572         /* Set interleaving policy for system init. This way not all
1573            the data structures allocated at system boot end up in node zero. */
1574
1575         if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1576                 printk("numa_policy_init: interleaving failed\n");
1577 }
1578
1579 /* Reset policy of current process to default */
1580 void numa_default_policy(void)
1581 {
1582         do_set_mempolicy(MPOL_DEFAULT, NULL);
1583 }
1584
1585 /* Migrate a policy to a different set of nodes */
1586 void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1587 {
1588         nodemask_t *mpolmask;
1589         nodemask_t tmp;
1590
1591         if (!pol)
1592                 return;
1593         mpolmask = &pol->cpuset_mems_allowed;
1594         if (nodes_equal(*mpolmask, *newmask))
1595                 return;
1596
1597         switch (pol->policy) {
1598         case MPOL_DEFAULT:
1599                 break;
1600         case MPOL_INTERLEAVE:
1601                 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1602                 pol->v.nodes = tmp;
1603                 *mpolmask = *newmask;
1604                 current->il_next = node_remap(current->il_next,
1605                                                 *mpolmask, *newmask);
1606                 break;
1607         case MPOL_PREFERRED:
1608                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1609                                                 *mpolmask, *newmask);
1610                 *mpolmask = *newmask;
1611                 break;
1612         case MPOL_BIND: {
1613                 nodemask_t nodes;
1614                 struct zone **z;
1615                 struct zonelist *zonelist;
1616
1617                 nodes_clear(nodes);
1618                 for (z = pol->v.zonelist->zones; *z; z++)
1619                         node_set((*z)->zone_pgdat->node_id, nodes);
1620                 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1621                 nodes = tmp;
1622
1623                 zonelist = bind_zonelist(&nodes);
1624
1625                 /* If no mem, then zonelist is NULL and we keep old zonelist.
1626                  * If that old zonelist has no remaining mems_allowed nodes,
1627                  * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1628                  */
1629
1630                 if (zonelist) {
1631                         /* Good - got mem - substitute new zonelist */
1632                         kfree(pol->v.zonelist);
1633                         pol->v.zonelist = zonelist;
1634                 }
1635                 *mpolmask = *newmask;
1636                 break;
1637         }
1638         default:
1639                 BUG();
1640                 break;
1641         }
1642 }
1643
1644 /*
1645  * Wrapper for mpol_rebind_policy() that just requires task
1646  * pointer, and updates task mempolicy.
1647  */
1648
1649 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1650 {
1651         mpol_rebind_policy(tsk->mempolicy, new);
1652 }
1653
1654 /*
1655  * Rebind each vma in mm to new nodemask.
1656  *
1657  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1658  */
1659
1660 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1661 {
1662         struct vm_area_struct *vma;
1663
1664         down_write(&mm->mmap_sem);
1665         for (vma = mm->mmap; vma; vma = vma->vm_next)
1666                 mpol_rebind_policy(vma->vm_policy, new);
1667         up_write(&mm->mmap_sem);
1668 }
1669
1670 /*
1671  * Display pages allocated per node and memory policy via /proc.
1672  */
1673
/*
 * Printable policy names.  mpol_to_str() indexes this table directly
 * with the policy mode, so the order here has to follow the numeric
 * MPOL_* values (presumably default, preferred, bind, interleave -
 * confirm against the MPOL_* definitions).
 */
static const char *policy_types[] = { "default", "prefer", "bind",
                                      "interleave" };
1676
1677 /*
1678  * Convert a mempolicy into a string.
1679  * Returns the number of characters in buffer (if positive)
1680  * or an error (negative)
1681  */
1682 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1683 {
1684         char *p = buffer;
1685         int l;
1686         nodemask_t nodes;
1687         int mode = pol ? pol->policy : MPOL_DEFAULT;
1688
1689         switch (mode) {
1690         case MPOL_DEFAULT:
1691                 nodes_clear(nodes);
1692                 break;
1693
1694         case MPOL_PREFERRED:
1695                 nodes_clear(nodes);
1696                 node_set(pol->v.preferred_node, nodes);
1697                 break;
1698
1699         case MPOL_BIND:
1700                 get_zonemask(pol, &nodes);
1701                 break;
1702
1703         case MPOL_INTERLEAVE:
1704                 nodes = pol->v.nodes;
1705                 break;
1706
1707         default:
1708                 BUG();
1709                 return -EFAULT;
1710         }
1711
1712         l = strlen(policy_types[mode]);
1713         if (buffer + maxlen < p + l + 1)
1714                 return -ENOSPC;
1715
1716         strcpy(p, policy_types[mode]);
1717         p += l;
1718
1719         if (!nodes_empty(nodes)) {
1720                 if (buffer + maxlen < p + 2)
1721                         return -ENOSPC;
1722                 *p++ = '=';
1723                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1724         }
1725         return p - buffer;
1726 }
1727
1728 struct numa_maps {
1729         unsigned long pages;
1730         unsigned long anon;
1731         unsigned long active;
1732         unsigned long writeback;
1733         unsigned long mapcount_max;
1734         unsigned long dirty;
1735         unsigned long swapcache;
1736         unsigned long node[MAX_NUMNODES];
1737 };
1738
1739 static void gather_stats(struct page *page, void *private, int pte_dirty)
1740 {
1741         struct numa_maps *md = private;
1742         int count = page_mapcount(page);
1743
1744         md->pages++;
1745         if (pte_dirty || PageDirty(page))
1746                 md->dirty++;
1747
1748         if (PageSwapCache(page))
1749                 md->swapcache++;
1750
1751         if (PageActive(page))
1752                 md->active++;
1753
1754         if (PageWriteback(page))
1755                 md->writeback++;
1756
1757         if (PageAnon(page))
1758                 md->anon++;
1759
1760         if (count > md->mapcount_max)
1761                 md->mapcount_max = count;
1762
1763         md->node[page_to_nid(page)]++;
1764         cond_resched();
1765 }
1766
#ifdef CONFIG_HUGETLB_PAGE
/*
 * Gather numa_maps statistics for the huge pages mapped in the range
 * [start, end) of @vma.
 *
 * The range is walked in HPAGE_SIZE steps.  Addresses without a huge
 * pte, with an empty pte, or without a page behind the pte are skipped;
 * every other page is passed to gather_stats() along with the dirty
 * state of its pte.
 */
static void check_huge_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct numa_maps *md)
{
        unsigned long addr;

        for (addr = start; addr < end; addr += HPAGE_SIZE) {
                pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
                struct page *page;
                pte_t pte;

                if (!ptep)
                        continue;

                /*
                 * Read the entry once and use that snapshot for every
                 * check below, so the page and its dirty state always
                 * come from the same pte value.
                 */
                pte = *ptep;
                if (pte_none(pte))
                        continue;

                page = pte_page(pte);
                if (!page)
                        continue;

                gather_stats(page, md, pte_dirty(pte));
        }
}
#else
/* Stub for kernels built without CONFIG_HUGETLB_PAGE: nothing to gather. */
static inline void check_huge_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct numa_maps *md)
{
}
#endif
1800
1801 int show_numa_map(struct seq_file *m, void *v)
1802 {
1803         struct task_struct *task = m->private;
1804         struct vm_area_struct *vma = v;
1805         struct numa_maps *md;
1806         struct file *file = vma->vm_file;
1807         struct mm_struct *mm = vma->vm_mm;
1808         int n;
1809         char buffer[50];
1810
1811         if (!mm)
1812                 return 0;
1813
1814         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1815         if (!md)
1816                 return 0;
1817
1818         mpol_to_str(buffer, sizeof(buffer),
1819                         get_vma_policy(task, vma, vma->vm_start));
1820
1821         seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1822
1823         if (file) {
1824                 seq_printf(m, " file=");
1825                 seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
1826         } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1827                 seq_printf(m, " heap");
1828         } else if (vma->vm_start <= mm->start_stack &&
1829                         vma->vm_end >= mm->start_stack) {
1830                 seq_printf(m, " stack");
1831         }
1832
1833         if (is_vm_hugetlb_page(vma)) {
1834                 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
1835                 seq_printf(m, " huge");
1836         } else {
1837                 check_pgd_range(vma, vma->vm_start, vma->vm_end,
1838                                 &node_online_map, MPOL_MF_STATS, md);
1839         }
1840
1841         if (!md->pages)
1842                 goto out;
1843
1844         if (md->anon)
1845                 seq_printf(m," anon=%lu",md->anon);
1846
1847         if (md->dirty)
1848                 seq_printf(m," dirty=%lu",md->dirty);
1849
1850         if (md->pages != md->anon && md->pages != md->dirty)
1851                 seq_printf(m, " mapped=%lu", md->pages);
1852
1853         if (md->mapcount_max > 1)
1854                 seq_printf(m, " mapmax=%lu", md->mapcount_max);
1855
1856         if (md->swapcache)
1857                 seq_printf(m," swapcache=%lu", md->swapcache);
1858
1859         if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1860                 seq_printf(m," active=%lu", md->active);
1861
1862         if (md->writeback)
1863                 seq_printf(m," writeback=%lu", md->writeback);
1864
1865         for_each_online_node(n)
1866                 if (md->node[n])
1867                         seq_printf(m, " N%d=%lu", n, md->node[n]);
1868 out:
1869         seq_putc(m, '\n');
1870         kfree(md);
1871
1872         if (m->count < m->size)
1873                 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
1874         return 0;
1875 }
1876