mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
   19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
   49  * on systems with highmem, kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66    could replace all the switch()es with a mempolicy_ops structure.
  67 */
  68
  69 #include <linux/mempolicy.h>
  70 #include <linux/mm.h>
  71 #include <linux/highmem.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/kernel.h>
  74 #include <linux/sched.h>
  75 #include <linux/mm.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/gfp.h>
  79 #include <linux/slab.h>
  80 #include <linux/string.h>
  81 #include <linux/module.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/mempolicy.h>
  86 #include <linux/swap.h>
  87 #include <linux/seq_file.h>
  88 #include <linux/proc_fs.h>
  89
  90 #include <asm/tlbflush.h>
  91 #include <asm/uaccess.h>
  92
  93 /* Internal flags */
  94 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
  95 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
  96 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
  97
  98 static kmem_cache_t *policy_cache;
  99 static kmem_cache_t *sn_cache;
 100
 101 #define PDprintk(fmt...)
 102
 103 /* Highest zone. An specific allocation for a zone below that is not
 104    policied. */
 105 int policy_zone = ZONE_DMA;
 106
 107 struct mempolicy default_policy = {
 108         .refcnt = ATOMIC_INIT(1), /* never free it */
 109         .policy = MPOL_DEFAULT,
 110 };
 111
 112 /* Do sanity checking on a policy */
 113 static int mpol_check_policy(int mode, nodemask_t *nodes)
 114 {
 115         int empty = nodes_empty(*nodes);
 116
 117         switch (mode) {
 118         case MPOL_DEFAULT:
 119                 if (!empty)
 120                         return -EINVAL;
 121                 break;
 122         case MPOL_BIND:
 123         case MPOL_INTERLEAVE:
 124                 /* Preferred will only use the first bit, but allow
 125                    more for now. */
 126                 if (empty)
 127                         return -EINVAL;
 128                 break;
 129         }
 130         return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 131 }
 132 /* Generate a custom zonelist for the BIND policy. */
 133 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 134 {
 135         struct zonelist *zl;
 136         int num, max, nd;
 137
 138         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 139         zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
 140         if (!zl)
 141                 return NULL;
 142         num = 0;
 143         for_each_node_mask(nd, *nodes)
 144                 zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
 145         zl->zones[num] = NULL;
 146         return zl;
 147 }
 148
 149 /* Create a new policy */
 150 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 151 {
 152         struct mempolicy *policy;
 153
 154         PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 155         if (mode == MPOL_DEFAULT)
 156                 return NULL;
 157         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 158         if (!policy)
 159                 return ERR_PTR(-ENOMEM);
 160         atomic_set(&policy->refcnt, 1);
 161         switch (mode) {
 162         case MPOL_INTERLEAVE:
 163                 policy->v.nodes = *nodes;
 164                 if (nodes_weight(*nodes) == 0) {
 165                         kmem_cache_free(policy_cache, policy);
 166                         return ERR_PTR(-EINVAL);
 167                 }
 168                 break;
 169         case MPOL_PREFERRED:
 170                 policy->v.preferred_node = first_node(*nodes);
 171                 if (policy->v.preferred_node >= MAX_NUMNODES)
 172                         policy->v.preferred_node = -1;
 173                 break;
 174         case MPOL_BIND:
 175                 policy->v.zonelist = bind_zonelist(nodes);
 176                 if (policy->v.zonelist == NULL) {
 177                         kmem_cache_free(policy_cache, policy);
 178                         return ERR_PTR(-ENOMEM);
 179                 }
 180                 break;
 181         }
 182         policy->policy = mode;
 183         policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 184         return policy;
 185 }
 186
 187 static void gather_stats(struct page *, void *);
 188 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 189                                 unsigned long flags);
 190
 191 /* Scan through pages checking if pages follow certain conditions. */
 192 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 193                 unsigned long addr, unsigned long end,
 194                 const nodemask_t *nodes, unsigned long flags,
 195                 void *private)
 196 {
 197         pte_t *orig_pte;
 198         pte_t *pte;
 199         spinlock_t *ptl;
 200
 201         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 202         do {
 203                 struct page *page;
 204                 unsigned int nid;
 205
 206                 if (!pte_present(*pte))
 207                         continue;
 208                 page = vm_normal_page(vma, addr, *pte);
 209                 if (!page)
 210                         continue;
 211                 /*
 212                  * The check for PageReserved here is important to avoid
 213                  * handling zero pages and other pages that may have been
 214                  * marked special by the system.
 215                  *
 216                  * If the PageReserved would not be checked here then f.e.
 217                  * the location of the zero page could have an influence
 218                  * on MPOL_MF_STRICT, zero pages would be counted for
 219                  * the per node stats, and there would be useless attempts
 220                  * to put zero pages on the migration list.
 221                  */
 222                 if (PageReserved(page))
 223                         continue;
 224                 nid = page_to_nid(page);
 225                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 226                         continue;
 227
 228                 if (flags & MPOL_MF_STATS)
 229                         gather_stats(page, private);
 230                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 231                         migrate_page_add(page, private, flags);
 232                 else
 233                         break;
 234         } while (pte++, addr += PAGE_SIZE, addr != end);
 235         pte_unmap_unlock(orig_pte, ptl);
 236         return addr != end;
 237 }
 238
 239 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 240                 unsigned long addr, unsigned long end,
 241                 const nodemask_t *nodes, unsigned long flags,
 242                 void *private)
 243 {
 244         pmd_t *pmd;
 245         unsigned long next;
 246
 247         pmd = pmd_offset(pud, addr);
 248         do {
 249                 next = pmd_addr_end(addr, end);
 250                 if (pmd_none_or_clear_bad(pmd))
 251                         continue;
 252                 if (check_pte_range(vma, pmd, addr, next, nodes,
 253                                     flags, private))
 254                         return -EIO;
 255         } while (pmd++, addr = next, addr != end);
 256         return 0;
 257 }
 258
 259 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 260                 unsigned long addr, unsigned long end,
 261                 const nodemask_t *nodes, unsigned long flags,
 262                 void *private)
 263 {
 264         pud_t *pud;
 265         unsigned long next;
 266
 267         pud = pud_offset(pgd, addr);
 268         do {
 269                 next = pud_addr_end(addr, end);
 270                 if (pud_none_or_clear_bad(pud))
 271                         continue;
 272                 if (check_pmd_range(vma, pud, addr, next, nodes,
 273                                     flags, private))
 274                         return -EIO;
 275         } while (pud++, addr = next, addr != end);
 276         return 0;
 277 }
 278
 279 static inline int check_pgd_range(struct vm_area_struct *vma,
 280                 unsigned long addr, unsigned long end,
 281                 const nodemask_t *nodes, unsigned long flags,
 282                 void *private)
 283 {
 284         pgd_t *pgd;
 285         unsigned long next;
 286
 287         pgd = pgd_offset(vma->vm_mm, addr);
 288         do {
 289                 next = pgd_addr_end(addr, end);
 290                 if (pgd_none_or_clear_bad(pgd))
 291                         continue;
 292                 if (check_pud_range(vma, pgd, addr, next, nodes,
 293                                     flags, private))
 294                         return -EIO;
 295         } while (pgd++, addr = next, addr != end);
 296         return 0;
 297 }
 298
 299 /* Check if a vma is migratable */
 300 static inline int vma_migratable(struct vm_area_struct *vma)
 301 {
 302         if (vma->vm_flags & (
 303                 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
 304                 return 0;
 305         return 1;
 306 }
 307
 308 /*
 309  * Check if all pages in a range are on a set of nodes.
 310  * If pagelist != NULL then isolate pages from the LRU and
 311  * put them on the pagelist.
 312  */
 313 static struct vm_area_struct *
 314 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 315                 const nodemask_t *nodes, unsigned long flags, void *private)
 316 {
 317         int err;
 318         struct vm_area_struct *first, *vma, *prev;
 319
 320         /* Clear the LRU lists so pages can be isolated */
 321         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 322                 lru_add_drain_all();
 323
 324         first = find_vma(mm, start);
 325         if (!first)
 326                 return ERR_PTR(-EFAULT);
 327         prev = NULL;
 328         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 329                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 330                         if (!vma->vm_next && vma->vm_end < end)
 331                                 return ERR_PTR(-EFAULT);
 332                         if (prev && prev->vm_end < vma->vm_start)
 333                                 return ERR_PTR(-EFAULT);
 334                 }
 335                 if (!is_vm_hugetlb_page(vma) &&
 336                     ((flags & MPOL_MF_STRICT) ||
 337                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 338                                 vma_migratable(vma)))) {
 339                         unsigned long endvma = vma->vm_end;
 340
 341                         if (endvma > end)
 342                                 endvma = end;
 343                         if (vma->vm_start > start)
 344                                 start = vma->vm_start;
 345                         err = check_pgd_range(vma, start, endvma, nodes,
 346                                                 flags, private);
 347                         if (err) {
 348                                 first = ERR_PTR(err);
 349                                 break;
 350                         }
 351                 }
 352                 prev = vma;
 353         }
 354         return first;
 355 }
 356
 357 /* Apply policy to a single VMA */
 358 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 359 {
 360         int err = 0;
 361         struct mempolicy *old = vma->vm_policy;
 362
 363         PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 364                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 365                  vma->vm_ops, vma->vm_file,
 366                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 367
 368         if (vma->vm_ops && vma->vm_ops->set_policy)
 369                 err = vma->vm_ops->set_policy(vma, new);
 370         if (!err) {
 371                 mpol_get(new);
 372                 vma->vm_policy = new;
 373                 mpol_free(old);
 374         }
 375         return err;
 376 }
 377
 378 /* Step 2: apply policy to a range and do splits. */
 379 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 380                        unsigned long end, struct mempolicy *new)
 381 {
 382         struct vm_area_struct *next;
 383         int err;
 384
 385         err = 0;
 386         for (; vma && vma->vm_start < end; vma = next) {
 387                 next = vma->vm_next;
 388                 if (vma->vm_start < start)
 389                         err = split_vma(vma->vm_mm, vma, start, 1);
 390                 if (!err && vma->vm_end > end)
 391                         err = split_vma(vma->vm_mm, vma, end, 0);
 392                 if (!err)
 393                         err = policy_vma(vma, new);
 394                 if (err)
 395                         break;
 396         }
 397         return err;
 398 }
 399
 400 static int contextualize_policy(int mode, nodemask_t *nodes)
 401 {
 402         if (!nodes)
 403                 return 0;
 404
 405         cpuset_update_task_memory_state();
 406         if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
 407                 return -EINVAL;
 408         return mpol_check_policy(mode, nodes);
 409 }
 410
 411 /* Set the process memory policy */
 412 long do_set_mempolicy(int mode, nodemask_t *nodes)
 413 {
 414         struct mempolicy *new;
 415
 416         if (contextualize_policy(mode, nodes))
 417                 return -EINVAL;
 418         new = mpol_new(mode, nodes);
 419         if (IS_ERR(new))
 420                 return PTR_ERR(new);
 421         mpol_free(current->mempolicy);
 422         current->mempolicy = new;
 423         if (new && new->policy == MPOL_INTERLEAVE)
 424                 current->il_next = first_node(new->v.nodes);
 425         return 0;
 426 }
 427
 428 /* Fill a zone bitmap for a policy */
 429 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 430 {
 431         int i;
 432
 433         nodes_clear(*nodes);
 434         switch (p->policy) {
 435         case MPOL_BIND:
 436                 for (i = 0; p->v.zonelist->zones[i]; i++)
 437                         node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
 438                                 *nodes);
 439                 break;
 440         case MPOL_DEFAULT:
 441                 break;
 442         case MPOL_INTERLEAVE:
 443                 *nodes = p->v.nodes;
 444                 break;
 445         case MPOL_PREFERRED:
 446                 /* or use current node instead of online map? */
 447                 if (p->v.preferred_node < 0)
 448                         *nodes = node_online_map;
 449                 else
 450                         node_set(p->v.preferred_node, *nodes);
 451                 break;
 452         default:
 453                 BUG();
 454         }
 455 }
 456
 457 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 458 {
 459         struct page *p;
 460         int err;
 461
 462         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 463         if (err >= 0) {
 464                 err = page_to_nid(p);
 465                 put_page(p);
 466         }
 467         return err;
 468 }
 469
 470 /* Retrieve NUMA policy */
 471 long do_get_mempolicy(int *policy, nodemask_t *nmask,
 472                         unsigned long addr, unsigned long flags)
 473 {
 474         int err;
 475         struct mm_struct *mm = current->mm;
 476         struct vm_area_struct *vma = NULL;
 477         struct mempolicy *pol = current->mempolicy;
 478
 479         cpuset_update_task_memory_state();
 480         if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 481                 return -EINVAL;
 482         if (flags & MPOL_F_ADDR) {
 483                 down_read(&mm->mmap_sem);
 484                 vma = find_vma_intersection(mm, addr, addr+1);
 485                 if (!vma) {
 486                         up_read(&mm->mmap_sem);
 487                         return -EFAULT;
 488                 }
 489                 if (vma->vm_ops && vma->vm_ops->get_policy)
 490                         pol = vma->vm_ops->get_policy(vma, addr);
 491                 else
 492                         pol = vma->vm_policy;
 493         } else if (addr)
 494                 return -EINVAL;
 495
 496         if (!pol)
 497                 pol = &default_policy;
 498
 499         if (flags & MPOL_F_NODE) {
 500                 if (flags & MPOL_F_ADDR) {
 501                         err = lookup_node(mm, addr);
 502                         if (err < 0)
 503                                 goto out;
 504                         *policy = err;
 505                 } else if (pol == current->mempolicy &&
 506                                 pol->policy == MPOL_INTERLEAVE) {
 507                         *policy = current->il_next;
 508                 } else {
 509                         err = -EINVAL;
 510                         goto out;
 511                 }
 512         } else
 513                 *policy = pol->policy;
 514
 515         if (vma) {
 516                 up_read(&current->mm->mmap_sem);
 517                 vma = NULL;
 518         }
 519
 520         err = 0;
 521         if (nmask)
 522                 get_zonemask(pol, nmask);
 523
 524  out:
 525         if (vma)
 526                 up_read(&current->mm->mmap_sem);
 527         return err;
 528 }
 529
 530 /*
 531  * page migration
 532  */
 533
 534 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 535                                 unsigned long flags)
 536 {
 537         /*
 538          * Avoid migrating a page that is shared with others.
 539          */
 540         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 541                 if (isolate_lru_page(page))
 542                         list_add(&page->lru, pagelist);
 543         }
 544 }
 545
 546 static int swap_pages(struct list_head *pagelist)
 547 {
 548         LIST_HEAD(moved);
 549         LIST_HEAD(failed);
 550         int n;
 551
 552         n = migrate_pages(pagelist, NULL, &moved, &failed);
 553         putback_lru_pages(&failed);
 554         putback_lru_pages(&moved);
 555
 556         return n;
 557 }
 558
 559 /*
 560  * For now migrate_pages simply swaps out the pages from nodes that are in
 561  * the source set but not in the target set. In the future, we would
 562  * want a function that moves pages between the two nodesets in such
 563  * a way as to preserve the physical layout as much as possible.
 564  *
 565  * Returns the number of page that could not be moved.
 566  */
 567 int do_migrate_pages(struct mm_struct *mm,
 568         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 569 {
 570         LIST_HEAD(pagelist);
 571         int count = 0;
 572         nodemask_t nodes;
 573
 574         nodes_andnot(nodes, *from_nodes, *to_nodes);
 575
 576         down_read(&mm->mmap_sem);
 577         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
 578                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 579
 580         if (!list_empty(&pagelist)) {
 581                 count = swap_pages(&pagelist);
 582                 putback_lru_pages(&pagelist);
 583         }
 584
 585         up_read(&mm->mmap_sem);
 586         return count;
 587 }
 588
 589 long do_mbind(unsigned long start, unsigned long len,
 590                 unsigned long mode, nodemask_t *nmask, unsigned long flags)
 591 {
 592         struct vm_area_struct *vma;
 593         struct mm_struct *mm = current->mm;
 594         struct mempolicy *new;
 595         unsigned long end;
 596         int err;
 597         LIST_HEAD(pagelist);
 598
 599         if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
 600                                       MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 601             || mode > MPOL_MAX)
 602                 return -EINVAL;
 603         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
 604                 return -EPERM;
 605
 606         if (start & ~PAGE_MASK)
 607                 return -EINVAL;
 608
 609         if (mode == MPOL_DEFAULT)
 610                 flags &= ~MPOL_MF_STRICT;
 611
 612         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 613         end = start + len;
 614
 615         if (end < start)
 616                 return -EINVAL;
 617         if (end == start)
 618                 return 0;
 619
 620         if (mpol_check_policy(mode, nmask))
 621                 return -EINVAL;
 622
 623         new = mpol_new(mode, nmask);
 624         if (IS_ERR(new))
 625                 return PTR_ERR(new);
 626
 627         /*
 628          * If we are using the default policy then operation
 629          * on discontinuous address spaces is okay after all
 630          */
 631         if (!new)
 632                 flags |= MPOL_MF_DISCONTIG_OK;
 633
 634         PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 635                         mode,nodes_addr(nodes)[0]);
 636
 637         down_write(&mm->mmap_sem);
 638         vma = check_range(mm, start, end, nmask,
 639                           flags | MPOL_MF_INVERT, &pagelist);
 640
 641         err = PTR_ERR(vma);
 642         if (!IS_ERR(vma)) {
 643                 int nr_failed = 0;
 644
 645                 err = mbind_range(vma, start, end, new);
 646                 if (!list_empty(&pagelist))
 647                         nr_failed = swap_pages(&pagelist);
 648
 649                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 650                         err = -EIO;
 651         }
 652         if (!list_empty(&pagelist))
 653                 putback_lru_pages(&pagelist);
 654
 655         up_write(&mm->mmap_sem);
 656         mpol_free(new);
 657         return err;
 658 }
 659
 660 /*
 661  * User space interface with variable sized bitmaps for nodelists.
 662  */
 663
 664 /* Copy a node mask from user space. */
 665 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 666                      unsigned long maxnode)
 667 {
 668         unsigned long k;
 669         unsigned long nlongs;
 670         unsigned long endmask;
 671
 672         --maxnode;
 673         nodes_clear(*nodes);
 674         if (maxnode == 0 || !nmask)
 675                 return 0;
 676
 677         nlongs = BITS_TO_LONGS(maxnode);
 678         if ((maxnode % BITS_PER_LONG) == 0)
 679                 endmask = ~0UL;
 680         else
 681                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 682
 683         /* When the user specified more nodes than supported just check
 684            if the non supported part is all zero. */
 685         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 686                 if (nlongs > PAGE_SIZE/sizeof(long))
 687                         return -EINVAL;
 688                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 689                         unsigned long t;
 690                         if (get_user(t, nmask + k))
 691                                 return -EFAULT;
 692                         if (k == nlongs - 1) {
 693                                 if (t & endmask)
 694                                         return -EINVAL;
 695                         } else if (t)
 696                                 return -EINVAL;
 697                 }
 698                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 699                 endmask = ~0UL;
 700         }
 701
 702         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 703                 return -EFAULT;
 704         nodes_addr(*nodes)[nlongs-1] &= endmask;
 705         return 0;
 706 }
 707
 708 /* Copy a kernel node mask to user space */
 709 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 710                               nodemask_t *nodes)
 711 {
 712         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 713         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 714
 715         if (copy > nbytes) {
 716                 if (copy > PAGE_SIZE)
 717                         return -EINVAL;
 718                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 719                         return -EFAULT;
 720                 copy = nbytes;
 721         }
 722         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 723 }
 724
 725 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 726                         unsigned long mode,
 727                         unsigned long __user *nmask, unsigned long maxnode,
 728                         unsigned flags)
 729 {
 730         nodemask_t nodes;
 731         int err;
 732
 733         err = get_nodes(&nodes, nmask, maxnode);
 734         if (err)
 735                 return err;
 736         return do_mbind(start, len, mode, &nodes, flags);
 737 }
 738
 739 /* Set the process memory policy */
 740 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 741                 unsigned long maxnode)
 742 {
 743         int err;
 744         nodemask_t nodes;
 745
 746         if (mode < 0 || mode > MPOL_MAX)
 747                 return -EINVAL;
 748         err = get_nodes(&nodes, nmask, maxnode);
 749         if (err)
 750                 return err;
 751         return do_set_mempolicy(mode, &nodes);
 752 }
 753
 754 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 755                 const unsigned long __user *old_nodes,
 756                 const unsigned long __user *new_nodes)
 757 {
 758         struct mm_struct *mm;
 759         struct task_struct *task;
 760         nodemask_t old;
 761         nodemask_t new;
 762         nodemask_t task_nodes;
 763         int err;
 764
 765         err = get_nodes(&old, old_nodes, maxnode);
 766         if (err)
 767                 return err;
 768
 769         err = get_nodes(&new, new_nodes, maxnode);
 770         if (err)
 771                 return err;
 772
 773         /* Find the mm_struct */
 774         read_lock(&tasklist_lock);
 775         task = pid ? find_task_by_pid(pid) : current;
 776         if (!task) {
 777                 read_unlock(&tasklist_lock);
 778                 return -ESRCH;
 779         }
 780         mm = get_task_mm(task);
 781         read_unlock(&tasklist_lock);
 782
 783         if (!mm)
 784                 return -EINVAL;
 785
 786         /*
 787          * Check if this process has the right to modify the specified
 788          * process. The right exists if the process has administrative
 789          * capabilities, superuser priviledges or the same
 790          * userid as the target process.
 791          */
 792         if ((current->euid != task->suid) && (current->euid != task->uid) &&
 793             (current->uid != task->suid) && (current->uid != task->uid) &&
 794             !capable(CAP_SYS_ADMIN)) {
 795                 err = -EPERM;
 796                 goto out;
 797         }
 798
 799         task_nodes = cpuset_mems_allowed(task);
 800         /* Is the user allowed to access the target nodes? */
 801         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
 802                 err = -EPERM;
 803                 goto out;
 804         }
 805
 806         err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
 807 out:
 808         mmput(mm);
 809         return err;
 810 }
 811
 812
 813 /* Retrieve NUMA policy */
 814 asmlinkage long sys_get_mempolicy(int __user *policy,
 815                                 unsigned long __user *nmask,
 816                                 unsigned long maxnode,
 817                                 unsigned long addr, unsigned long flags)
 818 {
 819         int err, pval;
 820         nodemask_t nodes;
 821
 822         if (nmask != NULL && maxnode < MAX_NUMNODES)
 823                 return -EINVAL;
 824
 825         err = do_get_mempolicy(&pval, &nodes, addr, flags);
 826
 827         if (err)
 828                 return err;
 829
 830         if (policy && put_user(pval, policy))
 831                 return -EFAULT;
 832
 833         if (nmask)
 834                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
 835
 836         return err;
 837 }
 838
 839 #ifdef CONFIG_COMPAT
 840
 841 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
 842                                      compat_ulong_t __user *nmask,
 843                                      compat_ulong_t maxnode,
 844                                      compat_ulong_t addr, compat_ulong_t flags)
 845 {
 846         long err;
 847         unsigned long __user *nm = NULL;
 848         unsigned long nr_bits, alloc_size;
 849         DECLARE_BITMAP(bm, MAX_NUMNODES);
 850
 851         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 852         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 853
 854         if (nmask)
 855                 nm = compat_alloc_user_space(alloc_size);
 856
 857         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
 858
 859         if (!err && nmask) {
 860                 err = copy_from_user(bm, nm, alloc_size);
 861                 /* ensure entire bitmap is zeroed */
 862                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
 863                 err |= compat_put_bitmap(nmask, bm, nr_bits);
 864         }
 865
 866         return err;
 867 }
 868
 869 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
 870                                      compat_ulong_t maxnode)
 871 {
 872         long err = 0;
 873         unsigned long __user *nm = NULL;
 874         unsigned long nr_bits, alloc_size;
 875         DECLARE_BITMAP(bm, MAX_NUMNODES);
 876
 877         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 878         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 879
 880         if (nmask) {
 881                 err = compat_get_bitmap(bm, nmask, nr_bits);
 882                 nm = compat_alloc_user_space(alloc_size);
 883                 err |= copy_to_user(nm, bm, alloc_size);
 884         }
 885
 886         if (err)
 887                 return -EFAULT;
 888
 889         return sys_set_mempolicy(mode, nm, nr_bits+1);
 890 }
 891
 892 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 893                              compat_ulong_t mode, compat_ulong_t __user *nmask,
 894                              compat_ulong_t maxnode, compat_ulong_t flags)
 895 {
 896         long err = 0;
 897         unsigned long __user *nm = NULL;
 898         unsigned long nr_bits, alloc_size;
 899         nodemask_t bm;
 900
 901         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 902         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 903
 904         if (nmask) {
 905                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
 906                 nm = compat_alloc_user_space(alloc_size);
 907                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
 908         }
 909
 910         if (err)
 911                 return -EFAULT;
 912
 913         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
 914 }
 915
 916 #endif
 917
 918 /* Return effective policy for a VMA */
 919 static struct mempolicy * get_vma_policy(struct task_struct *task,
 920                 struct vm_area_struct *vma, unsigned long addr)
 921 {
 922         struct mempolicy *pol = task->mempolicy;
 923
 924         if (vma) {
 925                 if (vma->vm_ops && vma->vm_ops->get_policy)
 926                         pol = vma->vm_ops->get_policy(vma, addr);
 927                 else if (vma->vm_policy &&
 928                                 vma->vm_policy->policy != MPOL_DEFAULT)
 929                         pol = vma->vm_policy;
 930         }
 931         if (!pol)
 932                 pol = &default_policy;
 933         return pol;
 934 }
 935
 936 /* Return a zonelist representing a mempolicy */
 937 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 938 {
 939         int nd;
 940
 941         switch (policy->policy) {
 942         case MPOL_PREFERRED:
 943                 nd = policy->v.preferred_node;
 944                 if (nd < 0)
 945                         nd = numa_node_id();
 946                 break;
 947         case MPOL_BIND:
 948                 /* Lower zones don't get a policy applied */
 949                 /* Careful: current->mems_allowed might have moved */
 950                 if (gfp_zone(gfp) >= policy_zone)
 951                         if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
 952                                 return policy->v.zonelist;
 953                 /*FALL THROUGH*/
 954         case MPOL_INTERLEAVE: /* should not happen */
 955         case MPOL_DEFAULT:
 956                 nd = numa_node_id();
 957                 break;
 958         default:
 959                 nd = 0;
 960                 BUG();
 961         }
 962         return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
 963 }
 964
 965 /* Do dynamic interleaving for a process */
 966 static unsigned interleave_nodes(struct mempolicy *policy)
 967 {
 968         unsigned nid, next;
 969         struct task_struct *me = current;
 970
 971         nid = me->il_next;
 972         next = next_node(nid, policy->v.nodes);
 973         if (next >= MAX_NUMNODES)
 974                 next = first_node(policy->v.nodes);
 975         me->il_next = next;
 976         return nid;
 977 }
 978
 979 /* Do static interleaving for a VMA with known offset. */
 980 static unsigned offset_il_node(struct mempolicy *pol,
 981                 struct vm_area_struct *vma, unsigned long off)
 982 {
 983         unsigned nnodes = nodes_weight(pol->v.nodes);
 984         unsigned target = (unsigned)off % nnodes;
 985         int c;
 986         int nid = -1;
 987
 988         c = 0;
 989         do {
 990                 nid = next_node(nid, pol->v.nodes);
 991                 c++;
 992         } while (c <= target);
 993         return nid;
 994 }
 995
 996 /* Determine a node number for interleave */
 997 static inline unsigned interleave_nid(struct mempolicy *pol,
 998                  struct vm_area_struct *vma, unsigned long addr, int shift)
 999 {
1000         if (vma) {
1001                 unsigned long off;
1002
1003                 off = vma->vm_pgoff;
1004                 off += (addr - vma->vm_start) >> shift;
1005                 return offset_il_node(pol, vma, off);
1006         } else
1007                 return interleave_nodes(pol);
1008 }
1009
1010 /* Return a zonelist suitable for a huge page allocation. */
1011 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1012 {
1013         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1014
1015         if (pol->policy == MPOL_INTERLEAVE) {
1016                 unsigned nid;
1017
1018                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1019                 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1020         }
1021         return zonelist_policy(GFP_HIGHUSER, pol);
1022 }
1023
1024 /* Allocate a page in interleaved policy.
1025    Own path because it needs to do special accounting. */
1026 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1027                                         unsigned nid)
1028 {
1029         struct zonelist *zl;
1030         struct page *page;
1031
1032         zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1033         page = __alloc_pages(gfp, order, zl);
1034         if (page && page_zone(page) == zl->zones[0]) {
1035                 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
1036                 put_cpu();
1037         }
1038         return page;
1039 }
1040
1041 /**
1042  *      alloc_page_vma  - Allocate a page for a VMA.
1043  *
1044  *      @gfp:
1045  *      %GFP_USER    user allocation.
1046  *      %GFP_KERNEL  kernel allocations,
1047  *      %GFP_HIGHMEM highmem/user allocations,
1048  *      %GFP_FS      allocation should not call back into a file system.
1049  *      %GFP_ATOMIC  don't sleep.
1050  *
1051  *      @vma:  Pointer to VMA or NULL if not available.
1052  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1053  *
1054  *      This function allocates a page from the kernel page pool and applies
1055  *      a NUMA policy associated with the VMA or the current process.
1056  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1057  *      mm_struct of the VMA to prevent it from going away. Should be used for
1058  *      all allocations for pages that will be mapped into
1059  *      user space. Returns NULL when no page can be allocated.
1060  *
1061  *      Should be called with the mm_sem of the vma hold.
1062  */
1063 struct page *
1064 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1065 {
1066         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1067
1068         cpuset_update_task_memory_state();
1069
1070         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1071                 unsigned nid;
1072
1073                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1074                 return alloc_page_interleave(gfp, 0, nid);
1075         }
1076         return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1077 }
1078
1079 /**
1080  *      alloc_pages_current - Allocate pages.
1081  *
1082  *      @gfp:
1083  *              %GFP_USER   user allocation,
1084  *              %GFP_KERNEL kernel allocation,
1085  *              %GFP_HIGHMEM highmem allocation,
1086  *              %GFP_FS     don't call back into a file system.
1087  *              %GFP_ATOMIC don't sleep.
1088  *      @order: Power of two of allocation size in pages. 0 is a single page.
1089  *
1090  *      Allocate a page from the kernel page pool.  When not in
1091  *      interrupt context and apply the current process NUMA policy.
1092  *      Returns NULL when no page can be allocated.
1093  *
1094  *      Don't call cpuset_update_task_memory_state() unless
1095  *      1) it's ok to take cpuset_sem (can WAIT), and
1096  *      2) allocating for current task (not interrupt).
1097  */
1098 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1099 {
1100         struct mempolicy *pol = current->mempolicy;
1101
1102         if ((gfp & __GFP_WAIT) && !in_interrupt())
1103                 cpuset_update_task_memory_state();
1104         if (!pol || in_interrupt())
1105                 pol = &default_policy;
1106         if (pol->policy == MPOL_INTERLEAVE)
1107                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1108         return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1109 }
1110 EXPORT_SYMBOL(alloc_pages_current);
1111
/*
 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset-relative after their cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 *
 * NOTE(review): this variable is not written anywhere in this part of
 * the file; presumably it is set and cleared by the cpuset code named
 * above - confirm there.
 */
void *cpuset_being_rebound;
1120
1121 /* Slow path of a mempolicy copy */
1122 struct mempolicy *__mpol_copy(struct mempolicy *old)
1123 {
1124         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1125
1126         if (!new)
1127                 return ERR_PTR(-ENOMEM);
1128         if (current_cpuset_is_being_rebound()) {
1129                 nodemask_t mems = cpuset_mems_allowed(current);
1130                 mpol_rebind_policy(old, &mems);
1131         }
1132         *new = *old;
1133         atomic_set(&new->refcnt, 1);
1134         if (new->policy == MPOL_BIND) {
1135                 int sz = ksize(old->v.zonelist);
1136                 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
1137                 if (!new->v.zonelist) {
1138                         kmem_cache_free(policy_cache, new);
1139                         return ERR_PTR(-ENOMEM);
1140                 }
1141                 memcpy(new->v.zonelist, old->v.zonelist, sz);
1142         }
1143         return new;
1144 }
1145
1146 /* Slow path of a mempolicy comparison */
1147 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1148 {
1149         if (!a || !b)
1150                 return 0;
1151         if (a->policy != b->policy)
1152                 return 0;
1153         switch (a->policy) {
1154         case MPOL_DEFAULT:
1155                 return 1;
1156         case MPOL_INTERLEAVE:
1157                 return nodes_equal(a->v.nodes, b->v.nodes);
1158         case MPOL_PREFERRED:
1159                 return a->v.preferred_node == b->v.preferred_node;
1160         case MPOL_BIND: {
1161                 int i;
1162                 for (i = 0; a->v.zonelist->zones[i]; i++)
1163                         if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1164                                 return 0;
1165                 return b->v.zonelist->zones[i] == NULL;
1166         }
1167         default:
1168                 BUG();
1169                 return 0;
1170         }
1171 }
1172
1173 /* Slow path of a mpol destructor. */
1174 void __mpol_free(struct mempolicy *p)
1175 {
1176         if (!atomic_dec_and_test(&p->refcnt))
1177                 return;
1178         if (p->policy == MPOL_BIND)
1179                 kfree(p->v.zonelist);
1180         p->policy = MPOL_DEFAULT;
1181         kmem_cache_free(policy_cache, p);
1182 }
1183
1184 /*
1185  * Shared memory backing store policy support.
1186  *
1187  * Remember policies even when nobody has shared memory mapped.
1188  * The policies are kept in Red-Black tree linked from the inode.
1189  * They are protected by the sp->lock spinlock, which should be held
1190  * for any accesses to the tree.
1191  */
1192
1193 /* lookup first element intersecting start-end */
1194 /* Caller holds sp->lock */
1195 static struct sp_node *
1196 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1197 {
1198         struct rb_node *n = sp->root.rb_node;
1199
1200         while (n) {
1201                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1202
1203                 if (start >= p->end)
1204                         n = n->rb_right;
1205                 else if (end <= p->start)
1206                         n = n->rb_left;
1207                 else
1208                         break;
1209         }
1210         if (!n)
1211                 return NULL;
1212         for (;;) {
1213                 struct sp_node *w = NULL;
1214                 struct rb_node *prev = rb_prev(n);
1215                 if (!prev)
1216                         break;
1217                 w = rb_entry(prev, struct sp_node, nd);
1218                 if (w->end <= start)
1219                         break;
1220                 n = prev;
1221         }
1222         return rb_entry(n, struct sp_node, nd);
1223 }
1224
1225 /* Insert a new shared policy into the list. */
1226 /* Caller holds sp->lock */
1227 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1228 {
1229         struct rb_node **p = &sp->root.rb_node;
1230         struct rb_node *parent = NULL;
1231         struct sp_node *nd;
1232
1233         while (*p) {
1234                 parent = *p;
1235                 nd = rb_entry(parent, struct sp_node, nd);
1236                 if (new->start < nd->start)
1237                         p = &(*p)->rb_left;
1238                 else if (new->end > nd->end)
1239                         p = &(*p)->rb_right;
1240                 else
1241                         BUG();
1242         }
1243         rb_link_node(&new->nd, parent, p);
1244         rb_insert_color(&new->nd, &sp->root);
1245         PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1246                  new->policy ? new->policy->policy : 0);
1247 }
1248
1249 /* Find shared policy intersecting idx */
1250 struct mempolicy *
1251 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1252 {
1253         struct mempolicy *pol = NULL;
1254         struct sp_node *sn;
1255
1256         if (!sp->root.rb_node)
1257                 return NULL;
1258         spin_lock(&sp->lock);
1259         sn = sp_lookup(sp, idx, idx+1);
1260         if (sn) {
1261                 mpol_get(sn->policy);
1262                 pol = sn->policy;
1263         }
1264         spin_unlock(&sp->lock);
1265         return pol;
1266 }
1267
1268 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1269 {
1270         PDprintk("deleting %lx-l%x\n", n->start, n->end);
1271         rb_erase(&n->nd, &sp->root);
1272         mpol_free(n->policy);
1273         kmem_cache_free(sn_cache, n);
1274 }
1275
1276 struct sp_node *
1277 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1278 {
1279         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1280
1281         if (!n)
1282                 return NULL;
1283         n->start = start;
1284         n->end = end;
1285         mpol_get(pol);
1286         n->policy = pol;
1287         return n;
1288 }
1289
/*
 * Replace a policy range.
 *
 * Clears [start, end) of existing nodes - deleting nodes that lie
 * entirely inside the range and trimming or splitting those that only
 * partly overlap - and then inserts @new, if one is given.
 *
 * Takes and releases sp->lock itself.  Returns 0 on success or -ENOMEM
 * if the extra node needed to split an existing range cannot be
 * allocated; @new is not inserted in that case.
 */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
                                 unsigned long end, struct sp_node *new)
{
        struct sp_node *n, *new2 = NULL;

restart:
        spin_lock(&sp->lock);
        n = sp_lookup(sp, start, end);
        /* Take care of old policies in the same range. */
        while (n && n->start < end) {
                /* fetch the successor first: n may be deleted below */
                struct rb_node *next = rb_next(&n->nd);
                if (n->start >= start) {
                        /* n starts inside the range: drop it or trim its head */
                        if (n->end <= end)
                                sp_delete(sp, n);
                        else
                                n->start = end;
                } else {
                        /* Old policy spanning whole new range. */
                        if (n->end > end) {
                                if (!new2) {
                                        /*
                                         * A second node is needed for the
                                         * part after the range.  The lock is
                                         * dropped for the allocation and the
                                         * whole scan is redone afterwards.
                                         *
                                         * NOTE(review): n->end and n->policy
                                         * are read after sp->lock has been
                                         * released - confirm nothing can
                                         * free or modify n concurrently.
                                         */
                                        spin_unlock(&sp->lock);
                                        new2 = sp_alloc(end, n->end, n->policy);
                                        if (!new2)
                                                return -ENOMEM;
                                        goto restart;
                                }
                                /* split: n keeps the head, new2 takes the tail */
                                n->end = start;
                                sp_insert(sp, new2);
                                new2 = NULL;
                                break;
                        } else
                                /* n only overlaps at its tail: trim it */
                                n->end = start;
                }
                if (!next)
                        break;
                n = rb_entry(next, struct sp_node, nd);
        }
        if (new)
                sp_insert(sp, new);
        spin_unlock(&sp->lock);
        /* new2 was allocated but turned out not to be needed after the restart */
        if (new2) {
                mpol_free(new2->policy);
                kmem_cache_free(sn_cache, new2);
        }
        return 0;
}
1337
1338 void mpol_shared_policy_init(struct shared_policy *info, int policy,
1339                                 nodemask_t *policy_nodes)
1340 {
1341         info->root = RB_ROOT;
1342         spin_lock_init(&info->lock);
1343
1344         if (policy != MPOL_DEFAULT) {
1345                 struct mempolicy *newpol;
1346
1347                 /* Falls back to MPOL_DEFAULT on any error */
1348                 newpol = mpol_new(policy, policy_nodes);
1349                 if (!IS_ERR(newpol)) {
1350                         /* Create pseudo-vma that contains just the policy */
1351                         struct vm_area_struct pvma;
1352
1353                         memset(&pvma, 0, sizeof(struct vm_area_struct));
1354                         /* Policy covers entire file */
1355                         pvma.vm_end = TASK_SIZE;
1356                         mpol_set_shared_policy(info, &pvma, newpol);
1357                         mpol_free(newpol);
1358                 }
1359         }
1360 }
1361
1362 int mpol_set_shared_policy(struct shared_policy *info,
1363                         struct vm_area_struct *vma, struct mempolicy *npol)
1364 {
1365         int err;
1366         struct sp_node *new = NULL;
1367         unsigned long sz = vma_pages(vma);
1368
1369         PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1370                  vma->vm_pgoff,
1371                  sz, npol? npol->policy : -1,
1372                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1373
1374         if (npol) {
1375                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1376                 if (!new)
1377                         return -ENOMEM;
1378         }
1379         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1380         if (err && new)
1381                 kmem_cache_free(sn_cache, new);
1382         return err;
1383 }
1384
1385 /* Free a backing policy store on inode delete. */
1386 void mpol_free_shared_policy(struct shared_policy *p)
1387 {
1388         struct sp_node *n;
1389         struct rb_node *next;
1390
1391         if (!p->root.rb_node)
1392                 return;
1393         spin_lock(&p->lock);
1394         next = rb_first(&p->root);
1395         while (next) {
1396                 n = rb_entry(next, struct sp_node, nd);
1397                 next = rb_next(&n->nd);
1398                 rb_erase(&n->nd, &p->root);
1399                 mpol_free(n->policy);
1400                 kmem_cache_free(sn_cache, n);
1401         }
1402         spin_unlock(&p->lock);
1403 }
1404
1405 /* assumes fs == KERNEL_DS */
1406 void __init numa_policy_init(void)
1407 {
1408         policy_cache = kmem_cache_create("numa_policy",
1409                                          sizeof(struct mempolicy),
1410                                          0, SLAB_PANIC, NULL, NULL);
1411
1412         sn_cache = kmem_cache_create("shared_policy_node",
1413                                      sizeof(struct sp_node),
1414                                      0, SLAB_PANIC, NULL, NULL);
1415
1416         /* Set interleaving policy for system init. This way not all
1417            the data structures allocated at system boot end up in node zero. */
1418
1419         if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1420                 printk("numa_policy_init: interleaving failed\n");
1421 }
1422
1423 /* Reset policy of current process to default */
1424 void numa_default_policy(void)
1425 {
1426         do_set_mempolicy(MPOL_DEFAULT, NULL);
1427 }
1428
1429 /* Migrate a policy to a different set of nodes */
1430 void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1431 {
1432         nodemask_t *mpolmask;
1433         nodemask_t tmp;
1434
1435         if (!pol)
1436                 return;
1437         mpolmask = &pol->cpuset_mems_allowed;
1438         if (nodes_equal(*mpolmask, *newmask))
1439                 return;
1440
1441         switch (pol->policy) {
1442         case MPOL_DEFAULT:
1443                 break;
1444         case MPOL_INTERLEAVE:
1445                 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1446                 pol->v.nodes = tmp;
1447                 *mpolmask = *newmask;
1448                 current->il_next = node_remap(current->il_next,
1449                                                 *mpolmask, *newmask);
1450                 break;
1451         case MPOL_PREFERRED:
1452                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1453                                                 *mpolmask, *newmask);
1454                 *mpolmask = *newmask;
1455                 break;
1456         case MPOL_BIND: {
1457                 nodemask_t nodes;
1458                 struct zone **z;
1459                 struct zonelist *zonelist;
1460
1461                 nodes_clear(nodes);
1462                 for (z = pol->v.zonelist->zones; *z; z++)
1463                         node_set((*z)->zone_pgdat->node_id, nodes);
1464                 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1465                 nodes = tmp;
1466
1467                 zonelist = bind_zonelist(&nodes);
1468
1469                 /* If no mem, then zonelist is NULL and we keep old zonelist.
1470                  * If that old zonelist has no remaining mems_allowed nodes,
1471                  * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1472                  */
1473
1474                 if (zonelist) {
1475                         /* Good - got mem - substitute new zonelist */
1476                         kfree(pol->v.zonelist);
1477                         pol->v.zonelist = zonelist;
1478                 }
1479                 *mpolmask = *newmask;
1480                 break;
1481         }
1482         default:
1483                 BUG();
1484                 break;
1485         }
1486 }
1487
1488 /*
1489  * Wrapper for mpol_rebind_policy() that just requires task
1490  * pointer, and updates task mempolicy.
1491  */
1492
1493 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1494 {
1495         mpol_rebind_policy(tsk->mempolicy, new);
1496 }
1497
1498 /*
1499  * Rebind each vma in mm to new nodemask.
1500  *
1501  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1502  */
1503
1504 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1505 {
1506         struct vm_area_struct *vma;
1507
1508         down_write(&mm->mmap_sem);
1509         for (vma = mm->mmap; vma; vma = vma->vm_next)
1510                 mpol_rebind_policy(vma->vm_policy, new);
1511         up_write(&mm->mmap_sem);
1512 }
1513
/*
 * Display pages allocated per node and memory policy via /proc.
 *
 * Policy names as printed by mpol_to_str(), which indexes this table
 * directly with the policy mode.
 * NOTE(review): the order must therefore match the numeric values of
 * MPOL_DEFAULT, MPOL_PREFERRED, MPOL_BIND and MPOL_INTERLEAVE, which
 * are defined outside this file - confirm against the header.
 */

static const char *policy_types[] = { "default", "prefer", "bind",
                                      "interleave" };
1520
1521 /*
1522  * Convert a mempolicy into a string.
1523  * Returns the number of characters in buffer (if positive)
1524  * or an error (negative)
1525  */
1526 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1527 {
1528         char *p = buffer;
1529         int l;
1530         nodemask_t nodes;
1531         int mode = pol ? pol->policy : MPOL_DEFAULT;
1532
1533         switch (mode) {
1534         case MPOL_DEFAULT:
1535                 nodes_clear(nodes);
1536                 break;
1537
1538         case MPOL_PREFERRED:
1539                 nodes_clear(nodes);
1540                 node_set(pol->v.preferred_node, nodes);
1541                 break;
1542
1543         case MPOL_BIND:
1544                 get_zonemask(pol, &nodes);
1545                 break;
1546
1547         case MPOL_INTERLEAVE:
1548                 nodes = pol->v.nodes;
1549                 break;
1550
1551         default:
1552                 BUG();
1553                 return -EFAULT;
1554         }
1555
1556         l = strlen(policy_types[mode]);
1557         if (buffer + maxlen < p + l + 1)
1558                 return -ENOSPC;
1559
1560         strcpy(p, policy_types[mode]);
1561         p += l;
1562
1563         if (!nodes_empty(nodes)) {
1564                 if (buffer + maxlen < p + 2)
1565                         return -ENOSPC;
1566                 *p++ = '=';
1567                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1568         }
1569         return p - buffer;
1570 }
1571
/* Per-VMA page statistics, filled in by gather_stats(). */
struct numa_maps {
        unsigned long pages;            /* pages seen */
        unsigned long anon;             /* pages for which PageAnon() is true */
        unsigned long mapped;           /* pages with a non-zero page_mapcount() */
        unsigned long mapcount_max;     /* highest page_mapcount() seen */
        unsigned long node[MAX_NUMNODES];       /* pages seen per node id */
};
1579
1580 static void gather_stats(struct page *page, void *private)
1581 {
1582         struct numa_maps *md = private;
1583         int count = page_mapcount(page);
1584
1585         if (count)
1586                 md->mapped++;
1587
1588         if (count > md->mapcount_max)
1589                 md->mapcount_max = count;
1590
1591         md->pages++;
1592
1593         if (PageAnon(page))
1594                 md->anon++;
1595
1596         md->node[page_to_nid(page)]++;
1597         cond_resched();
1598 }
1599
1600 int show_numa_map(struct seq_file *m, void *v)
1601 {
1602         struct task_struct *task = m->private;
1603         struct vm_area_struct *vma = v;
1604         struct numa_maps *md;
1605         int n;
1606         char buffer[50];
1607
1608         if (!vma->vm_mm)
1609                 return 0;
1610
1611         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1612         if (!md)
1613                 return 0;
1614
1615         check_pgd_range(vma, vma->vm_start, vma->vm_end,
1616                     &node_online_map, MPOL_MF_STATS, md);
1617
1618         if (md->pages) {
1619                 mpol_to_str(buffer, sizeof(buffer),
1620                             get_vma_policy(task, vma, vma->vm_start));
1621
1622                 seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
1623                            vma->vm_start, buffer, md->pages,
1624                            md->mapped, md->mapcount_max);
1625
1626                 if (md->anon)
1627                         seq_printf(m," anon=%lu",md->anon);
1628
1629                 for_each_online_node(n)
1630                         if (md->node[n])
1631                                 seq_printf(m, " N%d=%lu", n, md->node[n]);
1632
1633                 seq_putc(m, '\n');
1634         }
1635         kfree(md);
1636
1637         if (m->count < m->size)
1638                 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
1639         return 0;
1640 }
1641