mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
   19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
   49  * on systems with highmem, kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66    could replace all the switch()es with a mempolicy_ops structure.
  67 */
  68
  69 #include <linux/mempolicy.h>
  70 #include <linux/mm.h>
  71 #include <linux/highmem.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/kernel.h>
  74 #include <linux/sched.h>
  75 #include <linux/mm.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/gfp.h>
  79 #include <linux/slab.h>
  80 #include <linux/string.h>
  81 #include <linux/module.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/mempolicy.h>
  86 #include <linux/swap.h>
  87 #include <linux/seq_file.h>
  88 #include <linux/proc_fs.h>
  89
  90 #include <asm/tlbflush.h>
  91 #include <asm/uaccess.h>
  92
   93 /* Internal flags: private to this file, built by shifting MPOL_MF_INTERNAL */
   94 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* check_range(): do not fail on holes between vmas */
   95 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* check_pte_range(): invert the nodemask test */
   96 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* check_pte_range(): call gather_stats() on each page */
   97
   98 static kmem_cache_t *policy_cache; /* mpol_new() allocates struct mempolicy from this cache */
   99 static kmem_cache_t *sn_cache; /* NOTE(review): not referenced in the visible part of the file - confirm its user */
  100
  101 #define PDprintk(fmt...) /* debug print hook; expands to nothing, so its arguments are never compiled */
  102
  103 /* Highest zone. A specific allocation for a zone below that is not
  104    policied. */
  105 int policy_zone = ZONE_DMA;
  106
  107 struct mempolicy default_policy = {
  108         .refcnt = ATOMIC_INIT(1), /* never free it */
  109         .policy = MPOL_DEFAULT,
  110 };
 111
 112 /* Do sanity checking on a policy */
 113 static int mpol_check_policy(int mode, nodemask_t *nodes)
 114 {
 115         int empty = nodes_empty(*nodes);
 116
 117         switch (mode) {
 118         case MPOL_DEFAULT:
 119                 if (!empty)
 120                         return -EINVAL;
 121                 break;
 122         case MPOL_BIND:
 123         case MPOL_INTERLEAVE:
 124                 /* Preferred will only use the first bit, but allow
 125                    more for now. */
 126                 if (empty)
 127                         return -EINVAL;
 128                 break;
 129         }
 130         return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 131 }
 132 /* Generate a custom zonelist for the BIND policy. */
 133 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 134 {
 135         struct zonelist *zl;
 136         int num, max, nd;
 137
 138         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 139         zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
 140         if (!zl)
 141                 return NULL;
 142         num = 0;
 143         for_each_node_mask(nd, *nodes)
 144                 zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
 145         zl->zones[num] = NULL;
 146         return zl;
 147 }
 148
 149 /* Create a new policy */
 150 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 151 {
 152         struct mempolicy *policy;
 153
 154         PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 155         if (mode == MPOL_DEFAULT)
 156                 return NULL;
 157         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 158         if (!policy)
 159                 return ERR_PTR(-ENOMEM);
 160         atomic_set(&policy->refcnt, 1);
 161         switch (mode) {
 162         case MPOL_INTERLEAVE:
 163                 policy->v.nodes = *nodes;
 164                 if (nodes_weight(*nodes) == 0) {
 165                         kmem_cache_free(policy_cache, policy);
 166                         return ERR_PTR(-EINVAL);
 167                 }
 168                 break;
 169         case MPOL_PREFERRED:
 170                 policy->v.preferred_node = first_node(*nodes);
 171                 if (policy->v.preferred_node >= MAX_NUMNODES)
 172                         policy->v.preferred_node = -1;
 173                 break;
 174         case MPOL_BIND:
 175                 policy->v.zonelist = bind_zonelist(nodes);
 176                 if (policy->v.zonelist == NULL) {
 177                         kmem_cache_free(policy_cache, policy);
 178                         return ERR_PTR(-ENOMEM);
 179                 }
 180                 break;
 181         }
 182         policy->policy = mode;
 183         policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 184         return policy;
 185 }
 186
 187 static void gather_stats(struct page *, void *);
 188 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 189                                 unsigned long flags);
 190
 191 /* Scan through pages checking if pages follow certain conditions. */
 192 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 193                 unsigned long addr, unsigned long end,
 194                 const nodemask_t *nodes, unsigned long flags,
 195                 void *private)
 196 {
 197         pte_t *orig_pte;
 198         pte_t *pte;
 199         spinlock_t *ptl;
 200
 201         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 202         do {
 203                 struct page *page;
 204                 unsigned int nid;
 205
 206                 if (!pte_present(*pte))
 207                         continue;
 208                 page = vm_normal_page(vma, addr, *pte);
 209                 if (!page)
 210                         continue;
 211                 /*
 212                  * The check for PageReserved here is important to avoid
 213                  * handling zero pages and other pages that may have been
 214                  * marked special by the system.
 215                  *
 216                  * If the PageReserved would not be checked here then f.e.
 217                  * the location of the zero page could have an influence
 218                  * on MPOL_MF_STRICT, zero pages would be counted for
 219                  * the per node stats, and there would be useless attempts
 220                  * to put zero pages on the migration list.
 221                  */
 222                 if (PageReserved(page))
 223                         continue;
 224                 nid = page_to_nid(page);
 225                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 226                         continue;
 227
 228                 if (flags & MPOL_MF_STATS)
 229                         gather_stats(page, private);
 230                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 231                         migrate_page_add(page, private, flags);
 232                 else
 233                         break;
 234         } while (pte++, addr += PAGE_SIZE, addr != end);
 235         pte_unmap_unlock(orig_pte, ptl);
 236         return addr != end;
 237 }
 238
 239 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 240                 unsigned long addr, unsigned long end,
 241                 const nodemask_t *nodes, unsigned long flags,
 242                 void *private)
 243 {
 244         pmd_t *pmd;
 245         unsigned long next;
 246
 247         pmd = pmd_offset(pud, addr);
 248         do {
 249                 next = pmd_addr_end(addr, end);
 250                 if (pmd_none_or_clear_bad(pmd))
 251                         continue;
 252                 if (check_pte_range(vma, pmd, addr, next, nodes,
 253                                     flags, private))
 254                         return -EIO;
 255         } while (pmd++, addr = next, addr != end);
 256         return 0;
 257 }
 258
 259 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 260                 unsigned long addr, unsigned long end,
 261                 const nodemask_t *nodes, unsigned long flags,
 262                 void *private)
 263 {
 264         pud_t *pud;
 265         unsigned long next;
 266
 267         pud = pud_offset(pgd, addr);
 268         do {
 269                 next = pud_addr_end(addr, end);
 270                 if (pud_none_or_clear_bad(pud))
 271                         continue;
 272                 if (check_pmd_range(vma, pud, addr, next, nodes,
 273                                     flags, private))
 274                         return -EIO;
 275         } while (pud++, addr = next, addr != end);
 276         return 0;
 277 }
 278
 279 static inline int check_pgd_range(struct vm_area_struct *vma,
 280                 unsigned long addr, unsigned long end,
 281                 const nodemask_t *nodes, unsigned long flags,
 282                 void *private)
 283 {
 284         pgd_t *pgd;
 285         unsigned long next;
 286
 287         pgd = pgd_offset(vma->vm_mm, addr);
 288         do {
 289                 next = pgd_addr_end(addr, end);
 290                 if (pgd_none_or_clear_bad(pgd))
 291                         continue;
 292                 if (check_pud_range(vma, pgd, addr, next, nodes,
 293                                     flags, private))
 294                         return -EIO;
 295         } while (pgd++, addr = next, addr != end);
 296         return 0;
 297 }
 298
 299 /* Check if a vma is migratable */
 300 static inline int vma_migratable(struct vm_area_struct *vma)
 301 {
 302         if (vma->vm_flags & (
 303                 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
 304                 return 0;
 305         return 1;
 306 }
 307
 308 /*
 309  * Check if all pages in a range are on a set of nodes.
 310  * If pagelist != NULL then isolate pages from the LRU and
 311  * put them on the pagelist.
 312  */
 313 static struct vm_area_struct *
 314 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 315                 const nodemask_t *nodes, unsigned long flags, void *private)
 316 {
 317         int err;
 318         struct vm_area_struct *first, *vma, *prev;
 319
 320         /* Clear the LRU lists so pages can be isolated */
 321         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 322                 lru_add_drain_all();
 323
 324         first = find_vma(mm, start);
 325         if (!first)
 326                 return ERR_PTR(-EFAULT);
 327         prev = NULL;
 328         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 329                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 330                         if (!vma->vm_next && vma->vm_end < end)
 331                                 return ERR_PTR(-EFAULT);
 332                         if (prev && prev->vm_end < vma->vm_start)
 333                                 return ERR_PTR(-EFAULT);
 334                 }
 335                 if (!is_vm_hugetlb_page(vma) &&
 336                     ((flags & MPOL_MF_STRICT) ||
 337                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 338                                 vma_migratable(vma)))) {
 339                         unsigned long endvma = vma->vm_end;
 340
 341                         if (endvma > end)
 342                                 endvma = end;
 343                         if (vma->vm_start > start)
 344                                 start = vma->vm_start;
 345                         err = check_pgd_range(vma, start, endvma, nodes,
 346                                                 flags, private);
 347                         if (err) {
 348                                 first = ERR_PTR(err);
 349                                 break;
 350                         }
 351                 }
 352                 prev = vma;
 353         }
 354         return first;
 355 }
 356
 357 /* Apply policy to a single VMA */
 358 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 359 {
 360         int err = 0;
 361         struct mempolicy *old = vma->vm_policy;
 362
 363         PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 364                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 365                  vma->vm_ops, vma->vm_file,
 366                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 367
 368         if (vma->vm_ops && vma->vm_ops->set_policy)
 369                 err = vma->vm_ops->set_policy(vma, new);
 370         if (!err) {
 371                 mpol_get(new);
 372                 vma->vm_policy = new;
 373                 mpol_free(old);
 374         }
 375         return err;
 376 }
 377
 378 /* Step 2: apply policy to a range and do splits. */
 379 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 380                        unsigned long end, struct mempolicy *new)
 381 {
 382         struct vm_area_struct *next;
 383         int err;
 384
 385         err = 0;
 386         for (; vma && vma->vm_start < end; vma = next) {
 387                 next = vma->vm_next;
 388                 if (vma->vm_start < start)
 389                         err = split_vma(vma->vm_mm, vma, start, 1);
 390                 if (!err && vma->vm_end > end)
 391                         err = split_vma(vma->vm_mm, vma, end, 0);
 392                 if (!err)
 393                         err = policy_vma(vma, new);
 394                 if (err)
 395                         break;
 396         }
 397         return err;
 398 }
 399
 400 static int contextualize_policy(int mode, nodemask_t *nodes)
 401 {
 402         if (!nodes)
 403                 return 0;
 404
 405         cpuset_update_task_memory_state();
 406         if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
 407                 return -EINVAL;
 408         return mpol_check_policy(mode, nodes);
 409 }
 410
 411 /* Set the process memory policy */
 412 long do_set_mempolicy(int mode, nodemask_t *nodes)
 413 {
 414         struct mempolicy *new;
 415
 416         if (contextualize_policy(mode, nodes))
 417                 return -EINVAL;
 418         new = mpol_new(mode, nodes);
 419         if (IS_ERR(new))
 420                 return PTR_ERR(new);
 421         mpol_free(current->mempolicy);
 422         current->mempolicy = new;
 423         if (new && new->policy == MPOL_INTERLEAVE)
 424                 current->il_next = first_node(new->v.nodes);
 425         return 0;
 426 }
 427
 428 /* Fill a zone bitmap for a policy */
 429 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 430 {
 431         int i;
 432
 433         nodes_clear(*nodes);
 434         switch (p->policy) {
 435         case MPOL_BIND:
 436                 for (i = 0; p->v.zonelist->zones[i]; i++)
 437                         node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
 438                                 *nodes);
 439                 break;
 440         case MPOL_DEFAULT:
 441                 break;
 442         case MPOL_INTERLEAVE:
 443                 *nodes = p->v.nodes;
 444                 break;
 445         case MPOL_PREFERRED:
 446                 /* or use current node instead of online map? */
 447                 if (p->v.preferred_node < 0)
 448                         *nodes = node_online_map;
 449                 else
 450                         node_set(p->v.preferred_node, *nodes);
 451                 break;
 452         default:
 453                 BUG();
 454         }
 455 }
 456
 457 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 458 {
 459         struct page *p;
 460         int err;
 461
 462         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 463         if (err >= 0) {
 464                 err = page_to_nid(p);
 465                 put_page(p);
 466         }
 467         return err;
 468 }
 469
 470 /* Retrieve NUMA policy */
 471 long do_get_mempolicy(int *policy, nodemask_t *nmask,
 472                         unsigned long addr, unsigned long flags)
 473 {
 474         int err;
 475         struct mm_struct *mm = current->mm;
 476         struct vm_area_struct *vma = NULL;
 477         struct mempolicy *pol = current->mempolicy;
 478
 479         cpuset_update_task_memory_state();
 480         if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 481                 return -EINVAL;
 482         if (flags & MPOL_F_ADDR) {
 483                 down_read(&mm->mmap_sem);
 484                 vma = find_vma_intersection(mm, addr, addr+1);
 485                 if (!vma) {
 486                         up_read(&mm->mmap_sem);
 487                         return -EFAULT;
 488                 }
 489                 if (vma->vm_ops && vma->vm_ops->get_policy)
 490                         pol = vma->vm_ops->get_policy(vma, addr);
 491                 else
 492                         pol = vma->vm_policy;
 493         } else if (addr)
 494                 return -EINVAL;
 495
 496         if (!pol)
 497                 pol = &default_policy;
 498
 499         if (flags & MPOL_F_NODE) {
 500                 if (flags & MPOL_F_ADDR) {
 501                         err = lookup_node(mm, addr);
 502                         if (err < 0)
 503                                 goto out;
 504                         *policy = err;
 505                 } else if (pol == current->mempolicy &&
 506                                 pol->policy == MPOL_INTERLEAVE) {
 507                         *policy = current->il_next;
 508                 } else {
 509                         err = -EINVAL;
 510                         goto out;
 511                 }
 512         } else
 513                 *policy = pol->policy;
 514
 515         if (vma) {
 516                 up_read(&current->mm->mmap_sem);
 517                 vma = NULL;
 518         }
 519
 520         err = 0;
 521         if (nmask)
 522                 get_zonemask(pol, nmask);
 523
 524  out:
 525         if (vma)
 526                 up_read(&current->mm->mmap_sem);
 527         return err;
 528 }
 529
 530 /*
 531  * page migration
 532  */
 533
 534 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 535                                 unsigned long flags)
 536 {
 537         /*
 538          * Avoid migrating a page that is shared with others.
 539          */
 540         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 541                 if (isolate_lru_page(page))
 542                         list_add(&page->lru, pagelist);
 543         }
 544 }
 545
 546 static int swap_pages(struct list_head *pagelist)
 547 {
 548         LIST_HEAD(moved);
 549         LIST_HEAD(failed);
 550         int n;
 551
 552         n = migrate_pages(pagelist, NULL, &moved, &failed);
 553         putback_lru_pages(&failed);
 554         putback_lru_pages(&moved);
 555
 556         return n;
 557 }
 558
 559 /*
 560  * For now migrate_pages simply swaps out the pages from nodes that are in
 561  * the source set but not in the target set. In the future, we would
 562  * want a function that moves pages between the two nodesets in such
 563  * a way as to preserve the physical layout as much as possible.
 564  *
 565  * Returns the number of page that could not be moved.
 566  */
 567 int do_migrate_pages(struct mm_struct *mm,
 568         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 569 {
 570         LIST_HEAD(pagelist);
 571         int count = 0;
 572         nodemask_t nodes;
 573
 574         nodes_andnot(nodes, *from_nodes, *to_nodes);
 575
 576         down_read(&mm->mmap_sem);
 577         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
 578                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 579
 580         if (!list_empty(&pagelist)) {
 581                 count = swap_pages(&pagelist);
 582                 putback_lru_pages(&pagelist);
 583         }
 584
 585         up_read(&mm->mmap_sem);
 586         return count;
 587 }
 588
 589 long do_mbind(unsigned long start, unsigned long len,
 590                 unsigned long mode, nodemask_t *nmask, unsigned long flags)
 591 {
 592         struct vm_area_struct *vma;
 593         struct mm_struct *mm = current->mm;
 594         struct mempolicy *new;
 595         unsigned long end;
 596         int err;
 597         LIST_HEAD(pagelist);
 598
 599         if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
 600                                       MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 601             || mode > MPOL_MAX)
 602                 return -EINVAL;
 603         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
 604                 return -EPERM;
 605
 606         if (start & ~PAGE_MASK)
 607                 return -EINVAL;
 608
 609         if (mode == MPOL_DEFAULT)
 610                 flags &= ~MPOL_MF_STRICT;
 611
 612         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 613         end = start + len;
 614
 615         if (end < start)
 616                 return -EINVAL;
 617         if (end == start)
 618                 return 0;
 619
 620         if (mpol_check_policy(mode, nmask))
 621                 return -EINVAL;
 622
 623         new = mpol_new(mode, nmask);
 624         if (IS_ERR(new))
 625                 return PTR_ERR(new);
 626
 627         /*
 628          * If we are using the default policy then operation
 629          * on discontinuous address spaces is okay after all
 630          */
 631         if (!new)
 632                 flags |= MPOL_MF_DISCONTIG_OK;
 633
 634         PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 635                         mode,nodes_addr(nodes)[0]);
 636
 637         down_write(&mm->mmap_sem);
 638         vma = check_range(mm, start, end, nmask,
 639                           flags | MPOL_MF_INVERT, &pagelist);
 640
 641         err = PTR_ERR(vma);
 642         if (!IS_ERR(vma)) {
 643                 int nr_failed = 0;
 644
 645                 err = mbind_range(vma, start, end, new);
 646                 if (!list_empty(&pagelist))
 647                         nr_failed = swap_pages(&pagelist);
 648
 649                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 650                         err = -EIO;
 651         }
 652         if (!list_empty(&pagelist))
 653                 putback_lru_pages(&pagelist);
 654
 655         up_write(&mm->mmap_sem);
 656         mpol_free(new);
 657         return err;
 658 }
 659
 660 /*
 661  * User space interface with variable sized bitmaps for nodelists.
 662  */
 663
 664 /* Copy a node mask from user space. */
 665 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 666                      unsigned long maxnode)
 667 {
 668         unsigned long k;
 669         unsigned long nlongs;
 670         unsigned long endmask;
 671
 672         --maxnode;
 673         nodes_clear(*nodes);
 674         if (maxnode == 0 || !nmask)
 675                 return 0;
 676
 677         nlongs = BITS_TO_LONGS(maxnode);
 678         if ((maxnode % BITS_PER_LONG) == 0)
 679                 endmask = ~0UL;
 680         else
 681                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 682
 683         /* When the user specified more nodes than supported just check
 684            if the non supported part is all zero. */
 685         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 686                 if (nlongs > PAGE_SIZE/sizeof(long))
 687                         return -EINVAL;
 688                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 689                         unsigned long t;
 690                         if (get_user(t, nmask + k))
 691                                 return -EFAULT;
 692                         if (k == nlongs - 1) {
 693                                 if (t & endmask)
 694                                         return -EINVAL;
 695                         } else if (t)
 696                                 return -EINVAL;
 697                 }
 698                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 699                 endmask = ~0UL;
 700         }
 701
 702         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 703                 return -EFAULT;
 704         nodes_addr(*nodes)[nlongs-1] &= endmask;
 705         return 0;
 706 }
 707
 708 /* Copy a kernel node mask to user space */
 709 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 710                               nodemask_t *nodes)
 711 {
 712         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 713         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 714
 715         if (copy > nbytes) {
 716                 if (copy > PAGE_SIZE)
 717                         return -EINVAL;
 718                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 719                         return -EFAULT;
 720                 copy = nbytes;
 721         }
 722         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 723 }
 724
 725 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 726                         unsigned long mode,
 727                         unsigned long __user *nmask, unsigned long maxnode,
 728                         unsigned flags)
 729 {
 730         nodemask_t nodes;
 731         int err;
 732
 733         err = get_nodes(&nodes, nmask, maxnode);
 734         if (err)
 735                 return err;
 736         return do_mbind(start, len, mode, &nodes, flags);
 737 }
 738
 739 /* Set the process memory policy */
 740 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 741                 unsigned long maxnode)
 742 {
 743         int err;
 744         nodemask_t nodes;
 745
 746         if (mode < 0 || mode > MPOL_MAX)
 747                 return -EINVAL;
 748         err = get_nodes(&nodes, nmask, maxnode);
 749         if (err)
 750                 return err;
 751         return do_set_mempolicy(mode, &nodes);
 752 }
 753
 754 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 755                 const unsigned long __user *old_nodes,
 756                 const unsigned long __user *new_nodes)
 757 {
 758         struct mm_struct *mm;
 759         struct task_struct *task;
 760         nodemask_t old;
 761         nodemask_t new;
 762         nodemask_t task_nodes;
 763         int err;
 764
 765         err = get_nodes(&old, old_nodes, maxnode);
 766         if (err)
 767                 return err;
 768
 769         err = get_nodes(&new, new_nodes, maxnode);
 770         if (err)
 771                 return err;
 772
 773         /* Find the mm_struct */
 774         read_lock(&tasklist_lock);
 775         task = pid ? find_task_by_pid(pid) : current;
 776         if (!task) {
 777                 read_unlock(&tasklist_lock);
 778                 return -ESRCH;
 779         }
 780         mm = get_task_mm(task);
 781         read_unlock(&tasklist_lock);
 782
 783         if (!mm)
 784                 return -EINVAL;
 785
 786         /*
 787          * Check if this process has the right to modify the specified
 788          * process. The right exists if the process has administrative
 789          * capabilities, superuser priviledges or the same
 790          * userid as the target process.
 791          */
 792         if ((current->euid != task->suid) && (current->euid != task->uid) &&
 793             (current->uid != task->suid) && (current->uid != task->uid) &&
 794             !capable(CAP_SYS_ADMIN)) {
 795                 err = -EPERM;
 796                 goto out;
 797         }
 798
 799         task_nodes = cpuset_mems_allowed(task);
 800         /* Is the user allowed to access the target nodes? */
 801         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
 802                 err = -EPERM;
 803                 goto out;
 804         }
 805
 806         err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
 807 out:
 808         mmput(mm);
 809         return err;
 810 }
 811
 812
 813 /* Retrieve NUMA policy */
 814 asmlinkage long sys_get_mempolicy(int __user *policy,
 815                                 unsigned long __user *nmask,
 816                                 unsigned long maxnode,
 817                                 unsigned long addr, unsigned long flags)
 818 {
 819         int err, pval;
 820         nodemask_t nodes;
 821
 822         if (nmask != NULL && maxnode < MAX_NUMNODES)
 823                 return -EINVAL;
 824
 825         err = do_get_mempolicy(&pval, &nodes, addr, flags);
 826
 827         if (err)
 828                 return err;
 829
 830         if (policy && put_user(pval, policy))
 831                 return -EFAULT;
 832
 833         if (nmask)
 834                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
 835
 836         return err;
 837 }
 838
 839 #ifdef CONFIG_COMPAT
 840
 841 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
 842                                      compat_ulong_t __user *nmask,
 843                                      compat_ulong_t maxnode,
 844                                      compat_ulong_t addr, compat_ulong_t flags)
 845 {
 846         long err;
 847         unsigned long __user *nm = NULL;
 848         unsigned long nr_bits, alloc_size;
 849         DECLARE_BITMAP(bm, MAX_NUMNODES);
 850
 851         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 852         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 853
 854         if (nmask)
 855                 nm = compat_alloc_user_space(alloc_size);
 856
 857         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
 858
 859         if (!err && nmask) {
 860                 err = copy_from_user(bm, nm, alloc_size);
 861                 /* ensure entire bitmap is zeroed */
 862                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
 863                 err |= compat_put_bitmap(nmask, bm, nr_bits);
 864         }
 865
 866         return err;
 867 }
 868
 869 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
 870                                      compat_ulong_t maxnode)
 871 {
 872         long err = 0;
 873         unsigned long __user *nm = NULL;
 874         unsigned long nr_bits, alloc_size;
 875         DECLARE_BITMAP(bm, MAX_NUMNODES);
 876
 877         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 878         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 879
 880         if (nmask) {
 881                 err = compat_get_bitmap(bm, nmask, nr_bits);
 882                 nm = compat_alloc_user_space(alloc_size);
 883                 err |= copy_to_user(nm, bm, alloc_size);
 884         }
 885
 886         if (err)
 887                 return -EFAULT;
 888
 889         return sys_set_mempolicy(mode, nm, nr_bits+1);
 890 }
 891
 892 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 893                              compat_ulong_t mode, compat_ulong_t __user *nmask,
 894                              compat_ulong_t maxnode, compat_ulong_t flags)
 895 {
 896         long err = 0;
 897         unsigned long __user *nm = NULL;
 898         unsigned long nr_bits, alloc_size;
 899         nodemask_t bm;
 900
 901         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 902         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 903
 904         if (nmask) {
 905                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
 906                 nm = compat_alloc_user_space(alloc_size);
 907                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
 908         }
 909
 910         if (err)
 911                 return -EFAULT;
 912
 913         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
 914 }
 915
 916 #endif
 917
 918 /* Return effective policy for a VMA */
 919 static struct mempolicy * get_vma_policy(struct task_struct *task,
 920                 struct vm_area_struct *vma, unsigned long addr)
 921 {
 922         struct mempolicy *pol = task->mempolicy;
 923
 924         if (vma) {
 925                 if (vma->vm_ops && vma->vm_ops->get_policy)
 926                         pol = vma->vm_ops->get_policy(vma, addr);
 927                 else if (vma->vm_policy &&
 928                                 vma->vm_policy->policy != MPOL_DEFAULT)
 929                         pol = vma->vm_policy;
 930         }
 931         if (!pol)
 932                 pol = &default_policy;
 933         return pol;
 934 }
 935
 936 /* Return a zonelist representing a mempolicy */
 937 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 938 {
 939         int nd;
 940
 941         switch (policy->policy) {
 942         case MPOL_PREFERRED:
 943                 nd = policy->v.preferred_node;
 944                 if (nd < 0)
 945                         nd = numa_node_id();
 946                 break;
 947         case MPOL_BIND:
 948                 /* Lower zones don't get a policy applied */
 949                 /* Careful: current->mems_allowed might have moved */
 950                 if (gfp_zone(gfp) >= policy_zone)
 951                         if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
 952                                 return policy->v.zonelist;
 953                 /*FALL THROUGH*/
 954         case MPOL_INTERLEAVE: /* should not happen */
 955         case MPOL_DEFAULT:
 956                 nd = numa_node_id();
 957                 break;
 958         default:
 959                 nd = 0;
 960                 BUG();
 961         }
 962         return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
 963 }
 964
 965 /* Do dynamic interleaving for a process */
 966 static unsigned interleave_nodes(struct mempolicy *policy)
 967 {
 968         unsigned nid, next;
 969         struct task_struct *me = current;
 970
 971         nid = me->il_next;
 972         next = next_node(nid, policy->v.nodes);
 973         if (next >= MAX_NUMNODES)
 974                 next = first_node(policy->v.nodes);
 975         me->il_next = next;
 976         return nid;
 977 }
 978
 979 /*
 980  * Depending on the memory policy provide a node from which to allocate the
 981  * next slab entry.
 982  */
 983 unsigned slab_node(struct mempolicy *policy)
 984 {
 985         switch (policy->policy) {
 986         case MPOL_INTERLEAVE:
 987                 return interleave_nodes(policy);
 988
 989         case MPOL_BIND:
 990                 /*
 991                  * Follow bind policy behavior and start allocation at the
 992                  * first node.
 993                  */
 994                 return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
 995
 996         case MPOL_PREFERRED:
 997                 if (policy->v.preferred_node >= 0)
 998                         return policy->v.preferred_node;
 999                 /* Fall through */
1000
1001         default:
1002                 return numa_node_id();
1003         }
1004 }
1005
1006 /* Do static interleaving for a VMA with known offset. */
1007 static unsigned offset_il_node(struct mempolicy *pol,
1008                 struct vm_area_struct *vma, unsigned long off)
1009 {
1010         unsigned nnodes = nodes_weight(pol->v.nodes);
1011         unsigned target = (unsigned)off % nnodes;
1012         int c;
1013         int nid = -1;
1014
1015         c = 0;
1016         do {
1017                 nid = next_node(nid, pol->v.nodes);
1018                 c++;
1019         } while (c <= target);
1020         return nid;
1021 }
1022
1023 /* Determine a node number for interleave */
1024 static inline unsigned interleave_nid(struct mempolicy *pol,
1025                  struct vm_area_struct *vma, unsigned long addr, int shift)
1026 {
1027         if (vma) {
1028                 unsigned long off;
1029
1030                 off = vma->vm_pgoff;
1031                 off += (addr - vma->vm_start) >> shift;
1032                 return offset_il_node(pol, vma, off);
1033         } else
1034                 return interleave_nodes(pol);
1035 }
1036
1037 /* Return a zonelist suitable for a huge page allocation. */
1038 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1039 {
1040         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1041
1042         if (pol->policy == MPOL_INTERLEAVE) {
1043                 unsigned nid;
1044
1045                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1046                 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1047         }
1048         return zonelist_policy(GFP_HIGHUSER, pol);
1049 }
1050
1051 /* Allocate a page in interleaved policy.
1052    Own path because it needs to do special accounting. */
1053 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1054                                         unsigned nid)
1055 {
1056         struct zonelist *zl;
1057         struct page *page;
1058
1059         zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1060         page = __alloc_pages(gfp, order, zl);
1061         if (page && page_zone(page) == zl->zones[0]) {
1062                 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
1063                 put_cpu();
1064         }
1065         return page;
1066 }
1067
1068 /**
1069  *      alloc_page_vma  - Allocate a page for a VMA.
1070  *
1071  *      @gfp:
1072  *      %GFP_USER    user allocation.
1073  *      %GFP_KERNEL  kernel allocations,
1074  *      %GFP_HIGHMEM highmem/user allocations,
1075  *      %GFP_FS      allocation should not call back into a file system.
1076  *      %GFP_ATOMIC  don't sleep.
1077  *
1078  *      @vma:  Pointer to VMA or NULL if not available.
1079  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1080  *
1081  *      This function allocates a page from the kernel page pool and applies
1082  *      a NUMA policy associated with the VMA or the current process.
1083  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1084  *      mm_struct of the VMA to prevent it from going away. Should be used for
1085  *      all allocations for pages that will be mapped into
1086  *      user space. Returns NULL when no page can be allocated.
1087  *
1088  *      Should be called with the mm_sem of the vma hold.
1089  */
1090 struct page *
1091 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1092 {
1093         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1094
1095         cpuset_update_task_memory_state();
1096
1097         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1098                 unsigned nid;
1099
1100                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1101                 return alloc_page_interleave(gfp, 0, nid);
1102         }
1103         return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1104 }
1105
1106 /**
1107  *      alloc_pages_current - Allocate pages.
1108  *
1109  *      @gfp:
1110  *              %GFP_USER   user allocation,
1111  *              %GFP_KERNEL kernel allocation,
1112  *              %GFP_HIGHMEM highmem allocation,
1113  *              %GFP_FS     don't call back into a file system.
1114  *              %GFP_ATOMIC don't sleep.
1115  *      @order: Power of two of allocation size in pages. 0 is a single page.
1116  *
1117  *      Allocate a page from the kernel page pool.  When not in
1118  *      interrupt context and apply the current process NUMA policy.
1119  *      Returns NULL when no page can be allocated.
1120  *
1121  *      Don't call cpuset_update_task_memory_state() unless
1122  *      1) it's ok to take cpuset_sem (can WAIT), and
1123  *      2) allocating for current task (not interrupt).
1124  */
1125 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1126 {
1127         struct mempolicy *pol = current->mempolicy;
1128
1129         if ((gfp & __GFP_WAIT) && !in_interrupt())
1130                 cpuset_update_task_memory_state();
1131         if (!pol || in_interrupt())
1132                 pol = &default_policy;
1133         if (pol->policy == MPOL_INTERLEAVE)
1134                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1135         return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1136 }
1137 EXPORT_SYMBOL(alloc_pages_current);
1138
/*
 * When mpol_copy() sees that current->cpuset == cpuset_being_rebound, it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This keeps
 * mempolicies cpuset-relative after their cpuset moves.  See also
 * update_nodemask() in kernel/cpuset.c.
 */
void *cpuset_being_rebound;
1147
1148 /* Slow path of a mempolicy copy */
1149 struct mempolicy *__mpol_copy(struct mempolicy *old)
1150 {
1151         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1152
1153         if (!new)
1154                 return ERR_PTR(-ENOMEM);
1155         if (current_cpuset_is_being_rebound()) {
1156                 nodemask_t mems = cpuset_mems_allowed(current);
1157                 mpol_rebind_policy(old, &mems);
1158         }
1159         *new = *old;
1160         atomic_set(&new->refcnt, 1);
1161         if (new->policy == MPOL_BIND) {
1162                 int sz = ksize(old->v.zonelist);
1163                 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
1164                 if (!new->v.zonelist) {
1165                         kmem_cache_free(policy_cache, new);
1166                         return ERR_PTR(-ENOMEM);
1167                 }
1168                 memcpy(new->v.zonelist, old->v.zonelist, sz);
1169         }
1170         return new;
1171 }
1172
1173 /* Slow path of a mempolicy comparison */
1174 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1175 {
1176         if (!a || !b)
1177                 return 0;
1178         if (a->policy != b->policy)
1179                 return 0;
1180         switch (a->policy) {
1181         case MPOL_DEFAULT:
1182                 return 1;
1183         case MPOL_INTERLEAVE:
1184                 return nodes_equal(a->v.nodes, b->v.nodes);
1185         case MPOL_PREFERRED:
1186                 return a->v.preferred_node == b->v.preferred_node;
1187         case MPOL_BIND: {
1188                 int i;
1189                 for (i = 0; a->v.zonelist->zones[i]; i++)
1190                         if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1191                                 return 0;
1192                 return b->v.zonelist->zones[i] == NULL;
1193         }
1194         default:
1195                 BUG();
1196                 return 0;
1197         }
1198 }
1199
1200 /* Slow path of a mpol destructor. */
1201 void __mpol_free(struct mempolicy *p)
1202 {
1203         if (!atomic_dec_and_test(&p->refcnt))
1204                 return;
1205         if (p->policy == MPOL_BIND)
1206                 kfree(p->v.zonelist);
1207         p->policy = MPOL_DEFAULT;
1208         kmem_cache_free(policy_cache, p);
1209 }
1210
1211 /*
1212  * Shared memory backing store policy support.
1213  *
1214  * Remember policies even when nobody has shared memory mapped.
1215  * The policies are kept in Red-Black tree linked from the inode.
1216  * They are protected by the sp->lock spinlock, which should be held
1217  * for any accesses to the tree.
1218  */
1219
1220 /* lookup first element intersecting start-end */
1221 /* Caller holds sp->lock */
1222 static struct sp_node *
1223 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1224 {
1225         struct rb_node *n = sp->root.rb_node;
1226
1227         while (n) {
1228                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1229
1230                 if (start >= p->end)
1231                         n = n->rb_right;
1232                 else if (end <= p->start)
1233                         n = n->rb_left;
1234                 else
1235                         break;
1236         }
1237         if (!n)
1238                 return NULL;
1239         for (;;) {
1240                 struct sp_node *w = NULL;
1241                 struct rb_node *prev = rb_prev(n);
1242                 if (!prev)
1243                         break;
1244                 w = rb_entry(prev, struct sp_node, nd);
1245                 if (w->end <= start)
1246                         break;
1247                 n = prev;
1248         }
1249         return rb_entry(n, struct sp_node, nd);
1250 }
1251
1252 /* Insert a new shared policy into the list. */
1253 /* Caller holds sp->lock */
1254 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1255 {
1256         struct rb_node **p = &sp->root.rb_node;
1257         struct rb_node *parent = NULL;
1258         struct sp_node *nd;
1259
1260         while (*p) {
1261                 parent = *p;
1262                 nd = rb_entry(parent, struct sp_node, nd);
1263                 if (new->start < nd->start)
1264                         p = &(*p)->rb_left;
1265                 else if (new->end > nd->end)
1266                         p = &(*p)->rb_right;
1267                 else
1268                         BUG();
1269         }
1270         rb_link_node(&new->nd, parent, p);
1271         rb_insert_color(&new->nd, &sp->root);
1272         PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1273                  new->policy ? new->policy->policy : 0);
1274 }
1275
1276 /* Find shared policy intersecting idx */
1277 struct mempolicy *
1278 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1279 {
1280         struct mempolicy *pol = NULL;
1281         struct sp_node *sn;
1282
1283         if (!sp->root.rb_node)
1284                 return NULL;
1285         spin_lock(&sp->lock);
1286         sn = sp_lookup(sp, idx, idx+1);
1287         if (sn) {
1288                 mpol_get(sn->policy);
1289                 pol = sn->policy;
1290         }
1291         spin_unlock(&sp->lock);
1292         return pol;
1293 }
1294
1295 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1296 {
1297         PDprintk("deleting %lx-l%x\n", n->start, n->end);
1298         rb_erase(&n->nd, &sp->root);
1299         mpol_free(n->policy);
1300         kmem_cache_free(sn_cache, n);
1301 }
1302
1303 struct sp_node *
1304 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1305 {
1306         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1307
1308         if (!n)
1309                 return NULL;
1310         n->start = start;
1311         n->end = end;
1312         mpol_get(pol);
1313         n->policy = pol;
1314         return n;
1315 }
1316
1317 /* Replace a policy range. */
1318 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1319                                  unsigned long end, struct sp_node *new)
1320 {
1321         struct sp_node *n, *new2 = NULL;
1322
1323 restart:
1324         spin_lock(&sp->lock);
1325         n = sp_lookup(sp, start, end);
1326         /* Take care of old policies in the same range. */
1327         while (n && n->start < end) {
1328                 struct rb_node *next = rb_next(&n->nd);
1329                 if (n->start >= start) {
1330                         if (n->end <= end)
1331                                 sp_delete(sp, n);
1332                         else
1333                                 n->start = end;
1334                 } else {
1335                         /* Old policy spanning whole new range. */
1336                         if (n->end > end) {
1337                                 if (!new2) {
1338                                         spin_unlock(&sp->lock);
1339                                         new2 = sp_alloc(end, n->end, n->policy);
1340                                         if (!new2)
1341                                                 return -ENOMEM;
1342                                         goto restart;
1343                                 }
1344                                 n->end = start;
1345                                 sp_insert(sp, new2);
1346                                 new2 = NULL;
1347                                 break;
1348                         } else
1349                                 n->end = start;
1350                 }
1351                 if (!next)
1352                         break;
1353                 n = rb_entry(next, struct sp_node, nd);
1354         }
1355         if (new)
1356                 sp_insert(sp, new);
1357         spin_unlock(&sp->lock);
1358         if (new2) {
1359                 mpol_free(new2->policy);
1360                 kmem_cache_free(sn_cache, new2);
1361         }
1362         return 0;
1363 }
1364
1365 void mpol_shared_policy_init(struct shared_policy *info, int policy,
1366                                 nodemask_t *policy_nodes)
1367 {
1368         info->root = RB_ROOT;
1369         spin_lock_init(&info->lock);
1370
1371         if (policy != MPOL_DEFAULT) {
1372                 struct mempolicy *newpol;
1373
1374                 /* Falls back to MPOL_DEFAULT on any error */
1375                 newpol = mpol_new(policy, policy_nodes);
1376                 if (!IS_ERR(newpol)) {
1377                         /* Create pseudo-vma that contains just the policy */
1378                         struct vm_area_struct pvma;
1379
1380                         memset(&pvma, 0, sizeof(struct vm_area_struct));
1381                         /* Policy covers entire file */
1382                         pvma.vm_end = TASK_SIZE;
1383                         mpol_set_shared_policy(info, &pvma, newpol);
1384                         mpol_free(newpol);
1385                 }
1386         }
1387 }
1388
1389 int mpol_set_shared_policy(struct shared_policy *info,
1390                         struct vm_area_struct *vma, struct mempolicy *npol)
1391 {
1392         int err;
1393         struct sp_node *new = NULL;
1394         unsigned long sz = vma_pages(vma);
1395
1396         PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1397                  vma->vm_pgoff,
1398                  sz, npol? npol->policy : -1,
1399                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1400
1401         if (npol) {
1402                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1403                 if (!new)
1404                         return -ENOMEM;
1405         }
1406         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1407         if (err && new)
1408                 kmem_cache_free(sn_cache, new);
1409         return err;
1410 }
1411
1412 /* Free a backing policy store on inode delete. */
1413 void mpol_free_shared_policy(struct shared_policy *p)
1414 {
1415         struct sp_node *n;
1416         struct rb_node *next;
1417
1418         if (!p->root.rb_node)
1419                 return;
1420         spin_lock(&p->lock);
1421         next = rb_first(&p->root);
1422         while (next) {
1423                 n = rb_entry(next, struct sp_node, nd);
1424                 next = rb_next(&n->nd);
1425                 rb_erase(&n->nd, &p->root);
1426                 mpol_free(n->policy);
1427                 kmem_cache_free(sn_cache, n);
1428         }
1429         spin_unlock(&p->lock);
1430 }
1431
1432 /* assumes fs == KERNEL_DS */
1433 void __init numa_policy_init(void)
1434 {
1435         policy_cache = kmem_cache_create("numa_policy",
1436                                          sizeof(struct mempolicy),
1437                                          0, SLAB_PANIC, NULL, NULL);
1438
1439         sn_cache = kmem_cache_create("shared_policy_node",
1440                                      sizeof(struct sp_node),
1441                                      0, SLAB_PANIC, NULL, NULL);
1442
1443         /* Set interleaving policy for system init. This way not all
1444            the data structures allocated at system boot end up in node zero. */
1445
1446         if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1447                 printk("numa_policy_init: interleaving failed\n");
1448 }
1449
1450 /* Reset policy of current process to default */
1451 void numa_default_policy(void)
1452 {
1453         do_set_mempolicy(MPOL_DEFAULT, NULL);
1454 }
1455
1456 /* Migrate a policy to a different set of nodes */
1457 void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1458 {
1459         nodemask_t *mpolmask;
1460         nodemask_t tmp;
1461
1462         if (!pol)
1463                 return;
1464         mpolmask = &pol->cpuset_mems_allowed;
1465         if (nodes_equal(*mpolmask, *newmask))
1466                 return;
1467
1468         switch (pol->policy) {
1469         case MPOL_DEFAULT:
1470                 break;
1471         case MPOL_INTERLEAVE:
1472                 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1473                 pol->v.nodes = tmp;
1474                 *mpolmask = *newmask;
1475                 current->il_next = node_remap(current->il_next,
1476                                                 *mpolmask, *newmask);
1477                 break;
1478         case MPOL_PREFERRED:
1479                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1480                                                 *mpolmask, *newmask);
1481                 *mpolmask = *newmask;
1482                 break;
1483         case MPOL_BIND: {
1484                 nodemask_t nodes;
1485                 struct zone **z;
1486                 struct zonelist *zonelist;
1487
1488                 nodes_clear(nodes);
1489                 for (z = pol->v.zonelist->zones; *z; z++)
1490                         node_set((*z)->zone_pgdat->node_id, nodes);
1491                 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1492                 nodes = tmp;
1493
1494                 zonelist = bind_zonelist(&nodes);
1495
1496                 /* If no mem, then zonelist is NULL and we keep old zonelist.
1497                  * If that old zonelist has no remaining mems_allowed nodes,
1498                  * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1499                  */
1500
1501                 if (zonelist) {
1502                         /* Good - got mem - substitute new zonelist */
1503                         kfree(pol->v.zonelist);
1504                         pol->v.zonelist = zonelist;
1505                 }
1506                 *mpolmask = *newmask;
1507                 break;
1508         }
1509         default:
1510                 BUG();
1511                 break;
1512         }
1513 }
1514
1515 /*
1516  * Wrapper for mpol_rebind_policy() that just requires task
1517  * pointer, and updates task mempolicy.
1518  */
1519
1520 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1521 {
1522         mpol_rebind_policy(tsk->mempolicy, new);
1523 }
1524
1525 /*
1526  * Rebind each vma in mm to new nodemask.
1527  *
1528  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1529  */
1530
1531 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1532 {
1533         struct vm_area_struct *vma;
1534
1535         down_write(&mm->mmap_sem);
1536         for (vma = mm->mmap; vma; vma = vma->vm_next)
1537                 mpol_rebind_policy(vma->vm_policy, new);
1538         up_write(&mm->mmap_sem);
1539 }
1540
1541 /*
1542  * Display pages allocated per node and memory policy via /proc.
1543  */
1544
/* Printable policy names; mpol_to_str() indexes this by policy mode. */
static const char *policy_types[] = {
	"default",
	"prefer",
	"bind",
	"interleave",
};
1547
1548 /*
1549  * Convert a mempolicy into a string.
1550  * Returns the number of characters in buffer (if positive)
1551  * or an error (negative)
1552  */
1553 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1554 {
1555         char *p = buffer;
1556         int l;
1557         nodemask_t nodes;
1558         int mode = pol ? pol->policy : MPOL_DEFAULT;
1559
1560         switch (mode) {
1561         case MPOL_DEFAULT:
1562                 nodes_clear(nodes);
1563                 break;
1564
1565         case MPOL_PREFERRED:
1566                 nodes_clear(nodes);
1567                 node_set(pol->v.preferred_node, nodes);
1568                 break;
1569
1570         case MPOL_BIND:
1571                 get_zonemask(pol, &nodes);
1572                 break;
1573
1574         case MPOL_INTERLEAVE:
1575                 nodes = pol->v.nodes;
1576                 break;
1577
1578         default:
1579                 BUG();
1580                 return -EFAULT;
1581         }
1582
1583         l = strlen(policy_types[mode]);
1584         if (buffer + maxlen < p + l + 1)
1585                 return -ENOSPC;
1586
1587         strcpy(p, policy_types[mode]);
1588         p += l;
1589
1590         if (!nodes_empty(nodes)) {
1591                 if (buffer + maxlen < p + 2)
1592                         return -ENOSPC;
1593                 *p++ = '=';
1594                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1595         }
1596         return p - buffer;
1597 }
1598
1599 struct numa_maps {
1600         unsigned long pages;
1601         unsigned long anon;
1602         unsigned long mapped;
1603         unsigned long mapcount_max;
1604         unsigned long node[MAX_NUMNODES];
1605 };
1606
1607 static void gather_stats(struct page *page, void *private)
1608 {
1609         struct numa_maps *md = private;
1610         int count = page_mapcount(page);
1611
1612         if (count)
1613                 md->mapped++;
1614
1615         if (count > md->mapcount_max)
1616                 md->mapcount_max = count;
1617
1618         md->pages++;
1619
1620         if (PageAnon(page))
1621                 md->anon++;
1622
1623         md->node[page_to_nid(page)]++;
1624         cond_resched();
1625 }
1626
1627 int show_numa_map(struct seq_file *m, void *v)
1628 {
1629         struct task_struct *task = m->private;
1630         struct vm_area_struct *vma = v;
1631         struct numa_maps *md;
1632         int n;
1633         char buffer[50];
1634
1635         if (!vma->vm_mm)
1636                 return 0;
1637
1638         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1639         if (!md)
1640                 return 0;
1641
1642         check_pgd_range(vma, vma->vm_start, vma->vm_end,
1643                     &node_online_map, MPOL_MF_STATS, md);
1644
1645         if (md->pages) {
1646                 mpol_to_str(buffer, sizeof(buffer),
1647                             get_vma_policy(task, vma, vma->vm_start));
1648
1649                 seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
1650                            vma->vm_start, buffer, md->pages,
1651                            md->mapped, md->mapcount_max);
1652
1653                 if (md->anon)
1654                         seq_printf(m," anon=%lu",md->anon);
1655
1656                 for_each_online_node(n)
1657                         if (md->node[n])
1658                                 seq_printf(m, " N%d=%lu", n, md->node[n]);
1659
1660                 seq_putc(m, '\n');
1661         }
1662         kfree(md);
1663
1664         if (m->count < m->size)
1665                 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
1666         return 0;
1667 }
1668