/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave	Allocate memory interleaved over a set of nodes,
 *		with normal fallback if it fails.
 *		For VMA based allocations this interleaves based on the
 *		offset into the backing object or offset into the mapping
 *		for anonymous memory. For process policy a process counter
 *		is used.
 *
 * bind		Only allocate memory on a specific set of nodes,
 *		no fallback.
 *		FIXME: memory is allocated starting with the first node
 *		to the last. It would be better if bind would truly restrict
 *		the allocation to memory nodes instead.
 *
 * preferred	Try a specific node first before normal fallback.
 *		As a special case node -1 here means do the allocation
 *		on the local CPU. This is normally identical to default,
 *		but useful to set in a VMA when you have a non default
 *		process policy.
 *
 * default	Allocate on the local node first, or when on a VMA
 *		use the process policy. This is what Linux always did
 *		in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
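/*
 * Illustration only, not part of this file: a minimal userspace sketch of
 * the policy interface described above, assuming the set_mempolicy() and
 * mbind() wrappers from libnuma's <numaif.h>. The node numbers are made up
 * for the example (bit i of the mask == node i).
 *
 *	#include <numaif.h>		// link with -lnuma
 *	#include <sys/mman.h>
 *
 *	int main(void)
 *	{
 *		// Interleave all future allocations of this process
 *		// across nodes 0 and 1.
 *		unsigned long nodes = 0x3;
 *		if (set_mempolicy(MPOL_INTERLEAVE, &nodes, 8 * sizeof(nodes)))
 *			return 1;
 *
 *		// Bind one mapping to node 0 only, overriding the
 *		// process policy for that VMA.
 *		unsigned long node0 = 0x1;
 *		void *p = mmap(0, 1 << 20, PROT_READ | PROT_WRITE,
 *			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *		if (mbind(p, 1 << 20, MPOL_BIND, &node0,
 *			  8 * sizeof(node0), 0))
 *			return 1;
 *		return 0;
 *	}
 */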
/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
   could replace all the switch()es with a mempolicy_ops structure.
*/
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>
/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous vmas */
#define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */
#define MPOL_MF_STATS        (MPOL_MF_INTERNAL << 2)	/* Gather statistics */
static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;

#define PDprintk(fmt...)
/* Highest zone. A specific allocation for a zone below that is not
   policied. */
int policy_zone = ZONE_DMA;
struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.policy = MPOL_DEFAULT,
};
/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, nodemask_t *nodes)
{
	int empty = nodes_empty(*nodes);

	switch (mode) {
	case MPOL_DEFAULT:
		if (!empty)
			return -EINVAL;
		break;
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
		/* Preferred will only use the first bit, but allow
		   more for now. */
		if (empty)
			return -EINVAL;
		break;
	}
	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}
/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
	struct zonelist *zl;
	int num, max, nd;

	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
	if (!zl)
		return NULL;
	num = 0;
	for_each_node_mask(nd, *nodes)
		zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
	zl->zones[num] = NULL;
	return zl;
}
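/*
 * Worked example (illustrative numbers): with MAX_NR_ZONES == 3 and a
 * two-node mask, max = 1 + 3 * 2 = 7, leaving room for one zone pointer
 * per possible zone per node plus the NULL terminator. Only one zone
 * (policy_zone) per node is actually filled in above.
 */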
/* Create a new policy */
static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
{
	struct mempolicy *policy;

	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
	if (mode == MPOL_DEFAULT)
		return NULL;
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	switch (mode) {
	case MPOL_INTERLEAVE:
		policy->v.nodes = *nodes;
		if (nodes_weight(*nodes) == 0) {
			kmem_cache_free(policy_cache, policy);
			return ERR_PTR(-EINVAL);
		}
		break;
	case MPOL_PREFERRED:
		policy->v.preferred_node = first_node(*nodes);
		if (policy->v.preferred_node >= MAX_NUMNODES)
			policy->v.preferred_node = -1;
		break;
	case MPOL_BIND:
		policy->v.zonelist = bind_zonelist(nodes);
		if (policy->v.zonelist == NULL) {
			kmem_cache_free(policy_cache, policy);
			return ERR_PTR(-ENOMEM);
		}
		break;
	}
	policy->policy = mode;
	return policy;
}
/* Check if we are the only process mapping the page in question */
static inline int single_mm_mapping(struct mm_struct *mm,
		struct address_space *mapping)
{
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	int rc = 1;

	spin_lock(&mapping->i_mmap_lock);
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
		if (mm != vma->vm_mm) {
			rc = 0;
			goto out;
		}
	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
		if (mm != vma->vm_mm) {
			rc = 0;
			goto out;
		}
out:
	spin_unlock(&mapping->i_mmap_lock);
	return rc;
}
/*
 * Add a page to be migrated to the pagelist
 */
static void migrate_page_add(struct vm_area_struct *vma,
	struct page *page, struct list_head *pagelist, unsigned long flags)
{
	/*
	 * Avoid migrating a page that is shared by others and not writable.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
	    mapping_writably_mapped(page->mapping) ||
	    single_mm_mapping(vma->vm_mm, page->mapping)) {
		int rc = isolate_lru_page(page);

		if (rc == 1)
			list_add(&page->lru, pagelist);
		/*
		 * If the isolate attempt was not successful then we just
		 * encountered an unswappable page. Something must be wrong.
		 */
		WARN_ON(rc == 0);
	}
}
static void gather_stats(struct page *, void *);
/* Scan through pages checking if pages follow certain conditions. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pte_t *orig_pte;
	pte_t *pte;
	spinlock_t *ptl;

	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		struct page *page;
		unsigned int nid;

		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		nid = page_to_nid(page);
		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
			continue;

		if (flags & MPOL_MF_STATS)
			gather_stats(page, private);
		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
			spin_unlock(ptl);
			migrate_page_add(vma, page, private, flags);
			spin_lock(ptl);
		} else
			break;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(orig_pte, ptl);
	return addr != end;
}
static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		if (check_pte_range(vma, pmd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pmd++, addr = next, addr != end);
	return 0;
}
static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		if (check_pmd_range(vma, pud, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pud++, addr = next, addr != end);
	return 0;
}
static inline int check_pgd_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pgd_t *pgd;
	unsigned long next;

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		if (check_pud_range(vma, pgd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pgd++, addr = next, addr != end);
	return 0;
}
/* Check if a vma is migratable */
static inline int vma_migratable(struct vm_area_struct *vma)
{
	if (vma->vm_flags & (
		VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP))
		return 0;
	return 1;
}
/*
 * Check if all pages in a range are on a set of nodes.
 * If pagelist != NULL then isolate pages from the LRU and
 * put them on the pagelist.
 */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		const nodemask_t *nodes, unsigned long flags, void *private)
{
	int err;
	struct vm_area_struct *first, *vma, *prev;

	first = find_vma(mm, start);
	if (!first)
		return ERR_PTR(-EFAULT);
	prev = NULL;
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
			if (!vma->vm_next && vma->vm_end < end)
				return ERR_PTR(-EFAULT);
			if (prev && prev->vm_end < vma->vm_start)
				return ERR_PTR(-EFAULT);
		}
		if (!is_vm_hugetlb_page(vma) &&
		    ((flags & MPOL_MF_STRICT) ||
		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
				vma_migratable(vma)))) {
			unsigned long endvma = vma->vm_end;

			if (endvma > end)
				endvma = end;
			if (vma->vm_start > start)
				start = vma->vm_start;
			err = check_pgd_range(vma, start, endvma, nodes,
						flags, private);
			if (err) {
				first = ERR_PTR(err);
				break;
			}
		}
		prev = vma;
	}
	return first;
}
/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
	int err = 0;
	struct mempolicy *old = vma->vm_policy;

	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	if (vma->vm_ops && vma->vm_ops->set_policy)
		err = vma->vm_ops->set_policy(vma, new);
	if (!err) {
		mpol_get(new);
		vma->vm_policy = new;
		mpol_free(old);
	}
	return err;
}
/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, struct mempolicy *new)
{
	struct vm_area_struct *next;
	int err;

	err = 0;
	for (; vma && vma->vm_start < end; vma = next) {
		next = vma->vm_next;
		if (vma->vm_start < start)
			err = split_vma(vma->vm_mm, vma, start, 1);
		if (!err && vma->vm_end > end)
			err = split_vma(vma->vm_mm, vma, end, 0);
		if (!err)
			err = policy_vma(vma, new);
		if (err)
			break;
	}
	return err;
}
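/*
 * Worked example (hypothetical addresses): applying a policy to
 * [0x2000, 0x5000) over a single VMA [0x1000, 0x6000) first splits off
 * [0x1000, 0x2000) via split_vma(..., start, 1), then [0x5000, 0x6000)
 * via split_vma(..., end, 0), so policy_vma() only touches the VMA that
 * exactly covers the requested range.
 */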
static int contextualize_policy(int mode, nodemask_t *nodes)
{
	if (!nodes)
		return 0;

	/* Update current mems_allowed */
	cpuset_update_current_mems_allowed();
	/* Ignore nodes not set in current->mems_allowed */
	cpuset_restrict_to_mems_allowed(nodes->bits);
	return mpol_check_policy(mode, nodes);
}
static int swap_pages(struct list_head *pagelist)
{
	LIST_HEAD(moved);
	LIST_HEAD(failed);
	int n;

	n = migrate_pages(pagelist, NULL, &moved, &failed);
	putback_lru_pages(&failed);
	putback_lru_pages(&moved);

	return n;
}
long do_mbind(unsigned long start, unsigned long len,
		unsigned long mode, nodemask_t *nmask, unsigned long flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int err;
	LIST_HEAD(pagelist);

	if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
				      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
	    || mode > MPOL_MAX)
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	if (mpol_check_policy(mode, nmask))
		return -EINVAL;

	new = mpol_new(mode, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
		 mode, nodes_addr(*nmask)[0]);

	down_write(&mm->mmap_sem);
	vma = check_range(mm, start, end, nmask,
			  flags | MPOL_MF_INVERT, &pagelist);

	err = PTR_ERR(vma);
	if (!IS_ERR(vma)) {
		int nr_failed = 0;

		err = mbind_range(vma, start, end, new);
		if (!list_empty(&pagelist))
			nr_failed = swap_pages(&pagelist);

		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
			err = -EIO;
	}
	if (!list_empty(&pagelist))
		putback_lru_pages(&pagelist);

	up_write(&mm->mmap_sem);
	mpol_free(new);
	return err;
}
/* Set the process memory policy */
long do_set_mempolicy(int mode, nodemask_t *nodes)
{
	struct mempolicy *new;

	if (contextualize_policy(mode, nodes))
		return -EINVAL;
	new = mpol_new(mode, nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);
	mpol_free(current->mempolicy);
	current->mempolicy = new;
	if (new && new->policy == MPOL_INTERLEAVE)
		current->il_next = first_node(new->v.nodes);
	return 0;
}
/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
{
	int i;

	nodes_clear(*nodes);
	switch (p->policy) {
	case MPOL_BIND:
		for (i = 0; p->v.zonelist->zones[i]; i++)
			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
				 *nodes);
		break;
	case MPOL_DEFAULT:
		break;
	case MPOL_INTERLEAVE:
		*nodes = p->v.nodes;
		break;
	case MPOL_PREFERRED:
		/* or use current node instead of online map? */
		if (p->v.preferred_node < 0)
			*nodes = node_online_map;
		else
			node_set(p->v.preferred_node, *nodes);
		break;
	default:
		BUG();
	}
}
static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}
/* Retrieve NUMA policy */
long do_get_mempolicy(int *policy, nodemask_t *nmask,
			unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	cpuset_update_current_mems_allowed();
	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
		return -EINVAL;
	if (flags & MPOL_F_ADDR) {
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->policy == MPOL_INTERLEAVE) {
			*policy = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else
		*policy = pol->policy;

	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	err = 0;
	if (nmask)
		get_zonemask(pol, nmask);

out:
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}
/*
 * For now migrate_pages simply swaps out the pages from nodes that are in
 * the source set but not in the target set. In the future, we would
 * want a function that moves pages between the two nodesets in such
 * a way as to preserve the physical layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm,
	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
	LIST_HEAD(pagelist);
	int count = 0;
	nodemask_t nodes;

	nodes_andnot(nodes, *from_nodes, *to_nodes);

	down_read(&mm->mmap_sem);
	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
		    flags | MPOL_MF_DISCONTIG_OK, &pagelist);

	if (!list_empty(&pagelist)) {
		count = swap_pages(&pagelist);
		putback_lru_pages(&pagelist);
	}

	up_read(&mm->mmap_sem);
	return count;
}
/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	unsigned long k;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/* When the user specified more nodes than supported just check
	   if the non supported part is all zero. */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		if (nlongs > PAGE_SIZE/sizeof(long))
			return -EINVAL;
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			unsigned long t;
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes_addr(*nodes)[nlongs-1] &= endmask;
	return 0;
}
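/*
 * Worked example (illustrative values, 64-bit kernel): a caller passing
 * maxnode = 5 gives, after --maxnode, maxnode = 4, so nlongs =
 * BITS_TO_LONGS(4) = 1 and endmask = (1UL << 4) - 1 = 0xf. Only the bits
 * for nodes 0-3 survive the final "&= endmask"; higher bits the user may
 * have set are never copied in.
 */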
/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
			unsigned long mode,
			unsigned long __user *nmask, unsigned long maxnode,
			unsigned flags)
{
	nodemask_t nodes;
	int err;

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_mbind(start, len, mode, &nodes, flags);
}
/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
		unsigned long maxnode)
{
	int err;
	nodemask_t nodes;

	if (mode < 0 || mode > MPOL_MAX)
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_set_mempolicy(mode, &nodes);
}
/* Macro needed until Paul implements this function in kernel/cpusets.c */
#define cpuset_mems_allowed(task) node_online_map
asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
		const unsigned long __user *old_nodes,
		const unsigned long __user *new_nodes)
{
	struct mm_struct *mm;
	struct task_struct *task;
	nodemask_t old;
	nodemask_t new;
	nodemask_t task_nodes;
	int err;

	err = get_nodes(&old, old_nodes, maxnode);
	if (err)
		return err;

	err = get_nodes(&new, new_nodes, maxnode);
	if (err)
		return err;

	/* Find the mm_struct */
	read_lock(&tasklist_lock);
	task = pid ? find_task_by_pid(pid) : current;
	if (!task) {
		read_unlock(&tasklist_lock);
		return -ESRCH;
	}
	mm = get_task_mm(task);
	read_unlock(&tasklist_lock);

	if (!mm)
		return -EINVAL;

	/*
	 * Check if this process has the right to modify the specified
	 * process. The right exists if the process has administrative
	 * capabilities, superuser privileges or the same
	 * userid as the target process.
	 */
	if ((current->euid != task->suid) && (current->euid != task->uid) &&
	    (current->uid != task->suid) && (current->uid != task->uid) &&
	    !capable(CAP_SYS_ADMIN)) {
		err = -EPERM;
		goto out;
	}

	task_nodes = cpuset_mems_allowed(task);
	/* Is the user allowed to access the target nodes? */
	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
		err = -EPERM;
		goto out;
	}

	err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
out:
	mmput(mm);
	return err;
}
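/*
 * Illustration only: this syscall moves another process's pages between
 * nodes. A minimal userspace sketch, assuming a libnuma whose <numaif.h>
 * exposes a migrate_pages() wrapper (otherwise syscall(2) with the
 * arch-specific syscall number would be needed):
 *
 *	#include <numaif.h>	// link with -lnuma
 *
 *	// Move pid 1234's pages from node 0 to node 1.
 *	unsigned long from = 0x1, to = 0x2;
 *	long left = migrate_pages(1234, 8 * sizeof(unsigned long),
 *				  &from, &to);
 *	// left > 0 means that many pages could not be moved.
 */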
/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
				unsigned long __user *nmask,
				unsigned long maxnode,
				unsigned long addr, unsigned long flags)
{
	int err, pval;
	nodemask_t nodes;

	if (nmask != NULL && maxnode < MAX_NUMNODES)
		return -EINVAL;

	err = do_get_mempolicy(&pval, &nodes, addr, flags);
	if (err)
		return err;

	if (policy && put_user(pval, policy))
		return -EFAULT;

	if (nmask)
		err = copy_nodes_to_user(nmask, maxnode, &nodes);

	return err;
}
#ifdef CONFIG_COMPAT

asmlinkage long compat_sys_get_mempolicy(int __user *policy,
				     compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode,
				     compat_ulong_t addr, compat_ulong_t flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		err = copy_from_user(bm, nm, alloc_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}
asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_set_mempolicy(mode, nm, nr_bits+1);
}
asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
			     compat_ulong_t mode, compat_ulong_t __user *nmask,
			     compat_ulong_t maxnode, compat_ulong_t flags)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	nodemask_t bm;

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif
/* Return effective policy for a VMA */
struct mempolicy *
get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = task->mempolicy;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else if (vma->vm_policy &&
				vma->vm_policy->policy != MPOL_DEFAULT)
			pol = vma->vm_policy;
	}
	if (!pol)
		pol = &default_policy;
	return pol;
}
/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
{
	int nd;

	switch (policy->policy) {
	case MPOL_PREFERRED:
		nd = policy->v.preferred_node;
		if (nd < 0)
			nd = numa_node_id();
		break;
	case MPOL_BIND:
		/* Lower zones don't get a policy applied */
		/* Careful: current->mems_allowed might have moved */
		if (gfp_zone(gfp) >= policy_zone)
			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
				return policy->v.zonelist;
		/*FALL THROUGH*/
	case MPOL_INTERLEAVE: /* should not happen */
	case MPOL_DEFAULT:
		nd = numa_node_id();
		break;
	default:
		nd = 0;
		BUG();
	}
	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
}
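/*
 * Example (illustrative): for an MPOL_PREFERRED policy with
 * preferred_node == 2 and a GFP_HIGHUSER allocation, this returns
 * NODE_DATA(2)->node_zonelists + gfp_zone(GFP_HIGHUSER), i.e. node 2's
 * standard fallback list. Only MPOL_BIND ever hands back the custom
 * zonelist built by bind_zonelist().
 */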
/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
	unsigned nid, next;
	struct task_struct *me = current;

	nid = me->il_next;
	next = next_node(nid, policy->v.nodes);
	if (next >= MAX_NUMNODES)
		next = first_node(policy->v.nodes);
	me->il_next = next;
	return nid;
}
/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
		struct vm_area_struct *vma, unsigned long off)
{
	unsigned nnodes = nodes_weight(pol->v.nodes);
	unsigned target = (unsigned)off % nnodes;
	int c;
	int nid = -1;

	c = 0;
	do {
		nid = next_node(nid, pol->v.nodes);
		c++;
	} while (c <= target);
	return nid;
}
/* Determine a node number for interleave */
static inline unsigned interleave_nid(struct mempolicy *pol,
		struct vm_area_struct *vma, unsigned long addr, int shift)
{
	if (vma) {
		unsigned long off;

		off = vma->vm_pgoff;
		off += (addr - vma->vm_start) >> shift;
		return offset_il_node(pol, vma, off);
	} else
		return interleave_nodes(pol);
}
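/*
 * Worked example (hypothetical VMA): with pol->v.nodes = {0,1,2},
 * vm_pgoff = 3 and a fault at vm_start + 4 * PAGE_SIZE, the page offset
 * is off = 3 + 4 = 7, so target = 7 % 3 = 1 and offset_il_node() walks
 * to the second node in the mask, node 1. The same page always maps to
 * the same node, which is what makes VMA interleaving stable across
 * faults and swap.
 */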
/* Return a zonelist suitable for a huge page allocation. */
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	if (pol->policy == MPOL_INTERLEAVE) {
		unsigned nid;

		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
		return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
	}
	return zonelist_policy(GFP_HIGHUSER, pol);
}
/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
					  unsigned nid)
{
	struct zonelist *zl;
	struct page *page;

	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
	page = __alloc_pages(gfp, order, zl);
	if (page && page_zone(page) == zl->zones[0]) {
		zone_pcp(zl->zones[0], get_cpu())->interleave_hit++;
		put_cpu();
	}
	return page;
}
/**
 * alloc_page_vma	- Allocate a page for a VMA.
 *
 * @gfp:
 *	%GFP_USER	user allocation.
 *	%GFP_KERNEL	kernel allocations,
 *	%GFP_HIGHMEM	highmem/user allocations,
 *	%GFP_FS		allocation should not call back into a file system.
 *	%GFP_ATOMIC	don't sleep.
 *
 * @vma:  Pointer to VMA or NULL if not available.
 * @addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
 * When VMA is not NULL the caller must hold down_read on the mmap_sem of
 * the mm_struct of the VMA to prevent it from going away. Should be used
 * for all allocations for pages that will be mapped into user space.
 * Returns NULL when no page can be allocated.
 */
struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	cpuset_update_current_mems_allowed();

	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
		unsigned nid;

		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
		return alloc_page_interleave(gfp, 0, nid);
	}
	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}
/**
 * alloc_pages_current - Allocate pages.
 *
 * @gfp:
 *	%GFP_USER	user allocation,
 *	%GFP_KERNEL	kernel allocation,
 *	%GFP_HIGHMEM	highmem allocation,
 *	%GFP_FS		don't call back into a file system.
 *	%GFP_ATOMIC	don't sleep.
 * @order: Power of two of allocation size in pages. 0 is a single page.
 *
 * Allocate a page from the kernel page pool. When not in interrupt
 * context, the current process' NUMA policy is applied.
 * Returns NULL when no page can be allocated.
 *
 * Don't call cpuset_update_current_mems_allowed() unless
 * 1) it's ok to take cpuset_sem (can WAIT), and
 * 2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol = current->mempolicy;

	if ((gfp & __GFP_WAIT) && !in_interrupt())
		cpuset_update_current_mems_allowed();
	if (!pol || in_interrupt())
		pol = &default_policy;
	if (pol->policy == MPOL_INTERLEAVE)
		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);
/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);
	*new = *old;
	atomic_set(&new->refcnt, 1);
	if (new->policy == MPOL_BIND) {
		int sz = ksize(old->v.zonelist);
		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
		if (!new->v.zonelist) {
			kmem_cache_free(policy_cache, new);
			return ERR_PTR(-ENOMEM);
		}
		memcpy(new->v.zonelist, old->v.zonelist, sz);
	}
	return new;
}
/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	if (!a || !b)
		return 0;
	if (a->policy != b->policy)
		return 0;
	switch (a->policy) {
	case MPOL_DEFAULT:
		return 1;
	case MPOL_INTERLEAVE:
		return nodes_equal(a->v.nodes, b->v.nodes);
	case MPOL_PREFERRED:
		return a->v.preferred_node == b->v.preferred_node;
	case MPOL_BIND: {
		int i;
		for (i = 0; a->v.zonelist->zones[i]; i++)
			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
				return 0;
		return b->v.zonelist->zones[i] == NULL;
	}
	default:
		BUG();
		return 0;
	}
}
/* Slow path of a mpol destructor. */
void __mpol_free(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	if (p->policy == MPOL_BIND)
		kfree(p->v.zonelist);
	p->policy = MPOL_DEFAULT;
	kmem_cache_free(policy_cache, p);
}
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */
/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
	struct rb_node *n = sp->root.rb_node;

	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);

		if (start >= p->end)
			n = n->rb_right;
		else if (end <= p->start)
			n = n->rb_left;
		else
			break;
	}
	if (!n)
		return NULL;
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}
/* Insert a new shared policy into the list. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;
	struct sp_node *nd;

	while (*p) {
		parent = *p;
		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
			p = &(*p)->rb_left;
		else if (new->end > nd->end)
			p = &(*p)->rb_right;
		else
			BUG();
	}
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
		 new->policy ? new->policy->policy : 0);
}
/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
	struct mempolicy *pol = NULL;
	struct sp_node *sn;

	if (!sp->root.rb_node)
		return NULL;
	spin_lock(&sp->lock);
	sn = sp_lookup(sp, idx, idx+1);
	if (sn) {
		mpol_get(sn->policy);
		pol = sn->policy;
	}
	spin_unlock(&sp->lock);
	return pol;
}
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	PDprintk("deleting %lx-%lx\n", n->start, n->end);
	rb_erase(&n->nd, &sp->root);
	mpol_free(n->policy);
	kmem_cache_free(sn_cache, n);
}
static struct sp_node *
sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
{
	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

	if (!n)
		return NULL;
	n->start = start;
	n->end = end;
	mpol_get(pol);
	n->policy = pol;
	return n;
}
/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				 unsigned long end, struct sp_node *new)
{
	struct sp_node *n, *new2 = NULL;

restart:
	spin_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				if (!new2) {
					spin_unlock(&sp->lock);
					new2 = sp_alloc(end, n->end, n->policy);
					if (!new2)
						return -ENOMEM;
					goto restart;
				}
				n->end = start;
				sp_insert(sp, new2);
				new2 = NULL;
				break;
			} else
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	spin_unlock(&sp->lock);
	if (new2) {
		mpol_free(new2->policy);
		kmem_cache_free(sn_cache, new2);
	}
	return 0;
}
int mpol_set_shared_policy(struct shared_policy *info,
			struct vm_area_struct *vma, struct mempolicy *npol)
{
	int err;
	struct sp_node *new = NULL;
	unsigned long sz = vma_pages(vma);

	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
		 vma->vm_pgoff,
		 sz, npol ? npol->policy : -1,
		 npol ? nodes_addr(npol->v.nodes)[0] : -1);

	if (npol) {
		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
		if (!new)
			return -ENOMEM;
	}
	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
	if (err && new)
		kmem_cache_free(sn_cache, new);
	return err;
}
/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
	struct sp_node *n;
	struct rb_node *next;

	if (!p->root.rb_node)
		return;
	spin_lock(&p->lock);
	next = rb_first(&p->root);
	while (next) {
		n = rb_entry(next, struct sp_node, nd);
		next = rb_next(&n->nd);
		rb_erase(&n->nd, &p->root);
		mpol_free(n->policy);
		kmem_cache_free(sn_cache, n);
	}
	spin_unlock(&p->lock);
}
/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL, NULL);

	/* Set interleaving policy for system init. This way not all
	   the data structures allocated at system boot end up in node zero. */

	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
		printk("numa_policy_init: interleaving failed\n");
}
/* Reset policy of current process to default */
void numa_default_policy(void)
{
	do_set_mempolicy(MPOL_DEFAULT, NULL);
}
/* Migrate a policy to a different set of nodes */
static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
			  const nodemask_t *new)
{
	nodemask_t tmp;

	if (!pol)
		return;

	switch (pol->policy) {
	case MPOL_DEFAULT:
		break;
	case MPOL_INTERLEAVE:
		nodes_remap(tmp, pol->v.nodes, *old, *new);
		pol->v.nodes = tmp;
		current->il_next = node_remap(current->il_next, *old, *new);
		break;
	case MPOL_PREFERRED:
		pol->v.preferred_node = node_remap(pol->v.preferred_node,
						   *old, *new);
		break;
	case MPOL_BIND: {
		nodemask_t nodes;
		struct zone **z;
		struct zonelist *zonelist;

		nodes_clear(nodes);
		for (z = pol->v.zonelist->zones; *z; z++)
			node_set((*z)->zone_pgdat->node_id, nodes);
		nodes_remap(tmp, nodes, *old, *new);
		nodes = tmp;

		zonelist = bind_zonelist(&nodes);

		/* If no mem, then zonelist is NULL and we keep old zonelist.
		 * If that old zonelist has no remaining mems_allowed nodes,
		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
		 */

		if (zonelist) {
			/* Good - got mem - substitute new zonelist */
			kfree(pol->v.zonelist);
			pol->v.zonelist = zonelist;
		}
		break;
	}
	default:
		BUG();
		break;
	}
}
/*
 * Someone moved this task to different nodes. Fixup mempolicies.
 *
 * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
 * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
 */
void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
{
	rebind_policy(current->mempolicy, old, new);
}
/*
 * Display pages allocated per node and memory policy via /proc.
 */

static const char *policy_types[] = { "default", "prefer", "bind",
				      "interleave" };
/*
 * Convert a mempolicy into a string.
 * Returns the number of characters in buffer (if positive)
 * or an error (negative)
 */
static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
	char *p = buffer;
	int l;
	nodemask_t nodes;
	int mode = pol ? pol->policy : MPOL_DEFAULT;

	switch (mode) {
	case MPOL_DEFAULT:
		nodes_clear(nodes);
		break;

	case MPOL_PREFERRED:
		nodes_clear(nodes);
		node_set(pol->v.preferred_node, nodes);
		break;

	case MPOL_BIND:
		get_zonemask(pol, &nodes);
		break;

	case MPOL_INTERLEAVE:
		nodes = pol->v.nodes;
		break;

	default:
		BUG();
		return -1;
	}

	l = strlen(policy_types[mode]);
	if (buffer + maxlen < p + l + 1)
		return -ENOSPC;

	strcpy(p, policy_types[mode]);
	p += l;

	if (!nodes_empty(nodes)) {
		if (buffer + maxlen < p + 2)
			return -ENOSPC;
		*p++ = '=';
		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
	}
	return p - buffer;
}
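/*
 * Example outputs (illustrative): "default", "prefer=2", "bind=0,2" and
 * "interleave=0-3": the policy name from policy_types[], followed for
 * non-empty masks by '=' and the node list rendered by
 * nodelist_scnprintf().
 */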
struct numa_maps {
	unsigned long pages;
	unsigned long anon;
	unsigned long mapped;
	unsigned long mapcount_max;
	unsigned long node[MAX_NUMNODES];
};
static void gather_stats(struct page *page, void *private)
{
	struct numa_maps *md = private;
	int count = page_mapcount(page);

	if (count)
		md->mapped++;

	if (count > md->mapcount_max)
		md->mapcount_max = count;

	md->pages++;

	if (PageAnon(page))
		md->anon++;

	md->node[page_to_nid(page)]++;
}
int show_numa_map(struct seq_file *m, void *v)
{
	struct task_struct *task = m->private;
	struct vm_area_struct *vma = v;
	struct numa_maps *md;
	int n;
	char buffer[50];

	if (!vma->vm_mm)
		return 0;

	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
	if (!md)
		return 0;

	check_pgd_range(vma, vma->vm_start, vma->vm_end,
			&node_online_map, MPOL_MF_STATS, md);

	if (md->pages) {
		mpol_to_str(buffer, sizeof(buffer),
			    get_vma_policy(task, vma, vma->vm_start));

		seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
			   vma->vm_start, buffer, md->pages,
			   md->mapped, md->mapcount_max);

		if (md->anon)
			seq_printf(m, " anon=%lu", md->anon);

		for_each_online_node(n)
			if (md->node[n])
				seq_printf(m, " N%d=%lu", n, md->node[n]);

		seq_putc(m, '\n');
	}
	kfree(md);

	if (m->count < m->size)
		m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
	return 0;
}
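/*
 * Illustration only, a hypothetical /proc/<pid>/numa_maps line produced
 * by show_numa_map() for an interleaved anonymous VMA on a four-node
 * machine:
 *
 *	2aaaaac00000 interleave=0-3 pages=512 mapped=512 maxref=1 \
 *		anon=512 N0=128 N1=128 N2=128 N3=128
 *
 * The fields come from struct numa_maps: total pages, pages with at
 * least one mapping, the highest mapcount seen, anonymous pages, and a
 * per-node page count.
 */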