/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 * preferred      Try a specific node first before normal fallback.
 *                As a special case node -1 here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use the default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
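
/*
 * Illustrative sketch only (not part of the original file): how the
 * policies described above are typically requested from user space through
 * the syscalls implemented below.  The raw syscall(2) invocations stand in
 * for a libnuma-style wrapper and are an assumption for illustration; the
 * MPOL_* constants and argument order match sys_set_mempolicy()/sys_mbind()
 * further down.
 *
 *        unsigned long nodes01 = (1UL << 0) | (1UL << 1);   nodes 0 and 1
 *        unsigned long node0   = 1UL << 0;                   node 0 only
 *
 *        Interleave all future allocations of this process over nodes 0,1:
 *        syscall(__NR_set_mempolicy, MPOL_INTERLEAVE, &nodes01,
 *                sizeof(nodes01) * 8);
 *
 *        Bind one existing mapping to node 0, checking pages already there:
 *        syscall(__NR_mbind, start, len, MPOL_BIND, &node0,
 *                sizeof(node0) * 8, MPOL_MF_STRICT);
 */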

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
   could replace all the switch()es with a mempolicy_ops structure.
*/

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>

static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;

#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
static int policy_zone;

static struct mempolicy default_policy = {
        .refcnt = ATOMIC_INIT(1), /* never free it */
        .policy = MPOL_DEFAULT,
};

/* Check if all specified nodes are online */
static int nodes_online(unsigned long *nodes)
{
        DECLARE_BITMAP(online2, MAX_NUMNODES);

        bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES);
        if (bitmap_empty(online2, MAX_NUMNODES))
                set_bit(0, online2);
        if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
                return -EINVAL;
        return 0;
}

/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, unsigned long *nodes)
{
        int empty = bitmap_empty(nodes, MAX_NUMNODES);

        switch (mode) {
        case MPOL_DEFAULT:
                if (!empty)
                        return -EINVAL;
                break;
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
                /* Preferred will only use the first bit, but allow
                   more for now. */
                if (empty)
                        return -EINVAL;
                break;
        }
        return nodes_online(nodes);
}

/* Copy a node mask from user space. */
static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
                     unsigned long maxnode, int mode)
{
        unsigned long k;
        unsigned long nlongs;
        unsigned long endmask;

        bitmap_zero(nodes, MAX_NUMNODES);
        if (maxnode == 0 || !nmask)
                return 0;

        nlongs = BITS_TO_LONGS(maxnode);
        if ((maxnode % BITS_PER_LONG) == 0)
                endmask = ~0UL;
        else
                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

        /* When the user specified more nodes than supported just check
           if the non supported part is all zero. */
        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
                if (nlongs > PAGE_SIZE/sizeof(long))
                        return -EINVAL;
                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
                        unsigned long t;
                        if (get_user(t, nmask + k))
                                return -EFAULT;
                        if (k == nlongs - 1) {
                                if (t & endmask)
                                        return -EINVAL;
                        } else if (t)
                                return -EINVAL;
                }
                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
                endmask = ~0UL;
        }

        if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
                return -EFAULT;
        nodes[nlongs-1] &= endmask;
        /* Update current mems_allowed */
        cpuset_update_current_mems_allowed();
        /* Ignore nodes not set in current->mems_allowed */
        cpuset_restrict_to_mems_allowed(nodes);
        return mpol_check_policy(mode, nodes);
}
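
/*
 * Worked example for the truncation above (numbers assumed for
 * illustration): with maxnode = 10 and BITS_PER_LONG = 64, nlongs = 1 and
 * endmask = (1UL << 10) - 1 = 0x3ff, so "nodes[nlongs-1] &= endmask" keeps
 * only the low 10 bits of the last long copied from user space.
 */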

/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(unsigned long *nodes)
{
        struct zonelist *zl;
        int num, max, nd;

        max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
        zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
        if (!zl)
                return NULL;
        num = 0;
        for (nd = find_first_bit(nodes, MAX_NUMNODES);
             nd < MAX_NUMNODES;
             nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
                int k;
                for (k = MAX_NR_ZONES-1; k >= 0; k--) {
                        struct zone *z = &NODE_DATA(nd)->node_zones[k];
                        if (!z->present_pages)
                                continue;
                        zl->zones[num++] = z;
                        if (k > policy_zone)
                                policy_zone = k;
                }
        }
        zl->zones[num] = NULL;
        return zl;
}
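
/*
 * Sketch of the resulting layout (example configuration assumed): for a
 * nodemask of {0,1} where each node has ZONE_NORMAL and ZONE_DMA present,
 * the loop above emits zones highest-first per node, so zl->zones[] is
 *
 *        { node0/ZONE_NORMAL, node0/ZONE_DMA,
 *          node1/ZONE_NORMAL, node1/ZONE_DMA, NULL }
 *
 * i.e. the BIND fallback order walks all zones of one node before moving
 * on to the next node in the mask.
 */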

/* Create a new policy */
static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
{
        struct mempolicy *policy;

        PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
        if (mode == MPOL_DEFAULT)
                return NULL;
        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
        if (!policy)
                return ERR_PTR(-ENOMEM);
        atomic_set(&policy->refcnt, 1);
        switch (mode) {
        case MPOL_INTERLEAVE:
                bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
                break;
        case MPOL_PREFERRED:
                policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
                if (policy->v.preferred_node >= MAX_NUMNODES)
                        policy->v.preferred_node = -1;
                break;
        case MPOL_BIND:
                policy->v.zonelist = bind_zonelist(nodes);
                if (policy->v.zonelist == NULL) {
                        kmem_cache_free(policy_cache, policy);
                        return ERR_PTR(-ENOMEM);
                }
                break;
        }
        policy->policy = mode;
        return policy;
}

/* Ensure all existing pages follow the policy. */
static int
verify_pages(struct mm_struct *mm,
             unsigned long addr, unsigned long end, unsigned long *nodes)
{
        while (addr < end) {
                struct page *p;
                pte_t *pte;
                pmd_t *pmd;
                pud_t *pud;
                pgd_t *pgd;

                pgd = pgd_offset(mm, addr);
                if (pgd_none(*pgd)) {
                        unsigned long next = (addr + PGDIR_SIZE) & PGDIR_MASK;
                        if (next > addr)
                                break;
                        addr = next;
                        continue;
                }
                pud = pud_offset(pgd, addr);
                if (pud_none(*pud)) {
                        addr = (addr + PUD_SIZE) & PUD_MASK;
                        continue;
                }
                pmd = pmd_offset(pud, addr);
                if (pmd_none(*pmd)) {
                        addr = (addr + PMD_SIZE) & PMD_MASK;
                        continue;
                }
                p = NULL;
                pte = pte_offset_map(pmd, addr);
                if (pte_present(*pte))
                        p = pte_page(*pte);
                pte_unmap(pte);
                if (p) {
                        unsigned nid = page_to_nid(p);
                        if (!test_bit(nid, nodes))
                                return -EIO;
                }
                addr += PAGE_SIZE;
        }
        return 0;
}

/* Step 1: check the range */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
            unsigned long *nodes, unsigned long flags)
{
        int err;
        struct vm_area_struct *first, *vma, *prev;

        first = find_vma(mm, start);
        if (!first)
                return ERR_PTR(-EFAULT);
        prev = NULL;
        for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
                if (!vma->vm_next && vma->vm_end < end)
                        return ERR_PTR(-EFAULT);
                if (prev && prev->vm_end < vma->vm_start)
                        return ERR_PTR(-EFAULT);
                if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
                        err = verify_pages(vma->vm_mm,
                                           vma->vm_start, vma->vm_end, nodes);
                        if (err) {
                                first = ERR_PTR(err);
                                break;
                        }
                }
                prev = vma;
        }
        return first;
}

/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
        int err = 0;
        struct mempolicy *old = vma->vm_policy;

        PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
                 vma->vm_ops, vma->vm_file,
                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

        if (vma->vm_ops && vma->vm_ops->set_policy)
                err = vma->vm_ops->set_policy(vma, new);
        if (!err) {
                mpol_get(new);
                vma->vm_policy = new;
                mpol_free(old);
        }
        return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
                       unsigned long end, struct mempolicy *new)
{
        struct vm_area_struct *next;
        int err;

        err = 0;
        for (; vma && vma->vm_start < end; vma = next) {
                next = vma->vm_next;
                if (vma->vm_start < start)
                        err = split_vma(vma->vm_mm, vma, start, 1);
                if (!err && vma->vm_end > end)
                        err = split_vma(vma->vm_mm, vma, end, 0);
                if (!err)
                        err = policy_vma(vma, new);
                if (err)
                        break;
        }
        return err;
}
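
/*
 * Worked example of the splitting above (addresses assumed): mbind() on
 * [0x2000, 0x5000) against a single VMA covering [0x1000, 0x7000) first
 * splits at 0x2000 (split_vma(..., start, 1) keeps the policy target in
 * 'vma'), then at 0x5000 (split_vma(..., end, 0)), and finally applies the
 * new policy only to the middle VMA [0x2000, 0x5000).
 */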

/* Change policy for a memory range */
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
                          unsigned long mode,
                          unsigned long __user *nmask, unsigned long maxnode,
                          unsigned flags)
{
        struct vm_area_struct *vma;
        struct mm_struct *mm = current->mm;
        struct mempolicy *new;
        unsigned long end;
        DECLARE_BITMAP(nodes, MAX_NUMNODES);
        int err;

        if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
                return -EINVAL;
        if (start & ~PAGE_MASK)
                return -EINVAL;
        if (mode == MPOL_DEFAULT)
                flags &= ~MPOL_MF_STRICT;
        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
        end = start + len;
        if (end < start)
                return -EINVAL;
        if (end == start)
                return 0;

        err = get_nodes(nodes, nmask, maxnode, mode);
        if (err)
                return err;

        new = mpol_new(mode, nodes);
        if (IS_ERR(new))
                return PTR_ERR(new);

        PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
                        mode,nodes[0]);

        down_write(&mm->mmap_sem);
        vma = check_range(mm, start, end, nodes, flags);
        err = PTR_ERR(vma);
        if (!IS_ERR(vma))
                err = mbind_range(vma, start, end, new);
        up_write(&mm->mmap_sem);
        mpol_free(new);
        return err;
}

/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
                                  unsigned long maxnode)
{
        int err;
        struct mempolicy *new;
        DECLARE_BITMAP(nodes, MAX_NUMNODES);

        if (mode > MPOL_MAX)
                return -EINVAL;
        err = get_nodes(nodes, nmask, maxnode, mode);
        if (err)
                return err;
        new = mpol_new(mode, nodes);
        if (IS_ERR(new))
                return PTR_ERR(new);
        mpol_free(current->mempolicy);
        current->mempolicy = new;
        if (new && new->policy == MPOL_INTERLEAVE)
                current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
        return 0;
}

/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
{
        int i;

        bitmap_zero(nodes, MAX_NUMNODES);
        switch (p->policy) {
        case MPOL_BIND:
                for (i = 0; p->v.zonelist->zones[i]; i++)
                        __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
                break;
        case MPOL_DEFAULT:
                break;
        case MPOL_INTERLEAVE:
                bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
                break;
        case MPOL_PREFERRED:
                /* or use current node instead of online map? */
                if (p->v.preferred_node < 0)
                        bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES);
                else
                        __set_bit(p->v.preferred_node, nodes);
                break;
        default:
                BUG();
        }
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
        struct page *p;
        int err;

        err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
        if (err >= 0) {
                err = page_to_nid(p);
                put_page(p);
        }
        return err;
}

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
                              void *nodes, unsigned nbytes)
{
        unsigned long copy = ALIGN(maxnode-1, 64) / 8;

        if (copy > nbytes) {
                if (copy > PAGE_SIZE)
                        return -EINVAL;
                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
                        return -EFAULT;
                copy = nbytes;
        }
        return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
}
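
/*
 * Size arithmetic above, worked through (numbers assumed): for
 * maxnode = 1025 the caller asked for 1024 mask bits, so
 * copy = ALIGN(1024, 64) / 8 = 128 bytes.  If the kernel bitmap is only
 * nbytes = 64 bytes wide, the trailing 64 bytes of the user buffer are
 * cleared with clear_user() before the real bits are copied out.
 */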

/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
                                  unsigned long __user *nmask,
                                  unsigned long maxnode,
                                  unsigned long addr, unsigned long flags)
{
        int err, pval;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = NULL;
        struct mempolicy *pol = current->mempolicy;

        if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
                return -EINVAL;
        if (nmask != NULL && maxnode < MAX_NUMNODES)
                return -EINVAL;
        if (flags & MPOL_F_ADDR) {
                down_read(&mm->mmap_sem);
                vma = find_vma_intersection(mm, addr, addr+1);
                if (!vma) {
                        up_read(&mm->mmap_sem);
                        return -EFAULT;
                }
                if (vma->vm_ops && vma->vm_ops->get_policy)
                        pol = vma->vm_ops->get_policy(vma, addr);
                else
                        pol = vma->vm_policy;
        } else if (addr)
                return -EINVAL;

        if (!pol)
                pol = &default_policy;

        if (flags & MPOL_F_NODE) {
                if (flags & MPOL_F_ADDR) {
                        err = lookup_node(mm, addr);
                        if (err < 0)
                                goto out;
                        pval = err;
                } else if (pol == current->mempolicy &&
                                pol->policy == MPOL_INTERLEAVE) {
                        pval = current->il_next;
                } else {
                        err = -EINVAL;
                        goto out;
                }
        } else
                pval = pol->policy;

        if (vma) {
                up_read(&current->mm->mmap_sem);
                vma = NULL;
        }

        if (policy && put_user(pval, policy))
                return -EFAULT;

        err = 0;
        if (nmask) {
                DECLARE_BITMAP(nodes, MAX_NUMNODES);
                get_zonemask(pol, nodes);
                err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
        }

 out:
        if (vma)
                up_read(&current->mm->mmap_sem);
        return err;
}

#ifdef CONFIG_COMPAT

asmlinkage long compat_sys_get_mempolicy(int __user *policy,
                                     compat_ulong_t __user *nmask,
                                     compat_ulong_t maxnode,
                                     compat_ulong_t addr, compat_ulong_t flags)
{
        long err;
        unsigned long __user *nm = NULL;
        unsigned long nr_bits, alloc_size;
        DECLARE_BITMAP(bm, MAX_NUMNODES);

        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

        if (nmask)
                nm = compat_alloc_user_space(alloc_size);

        err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

        if (!err && nmask) {
                err = copy_from_user(bm, nm, alloc_size);
                /* ensure entire bitmap is zeroed */
                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
                err |= compat_put_bitmap(nmask, bm, nr_bits);
        }

        return err;
}

asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
                                     compat_ulong_t maxnode)
{
        long err = 0;
        unsigned long __user *nm = NULL;
        unsigned long nr_bits, alloc_size;
        DECLARE_BITMAP(bm, MAX_NUMNODES);

        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

        if (nmask) {
                err = compat_get_bitmap(bm, nmask, nr_bits);
                nm = compat_alloc_user_space(alloc_size);
                err |= copy_to_user(nm, bm, alloc_size);
        }

        if (err)
                return -EFAULT;

        return sys_set_mempolicy(mode, nm, nr_bits+1);
}

asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
                             compat_ulong_t mode, compat_ulong_t __user *nmask,
                             compat_ulong_t maxnode, compat_ulong_t flags)
{
        long err = 0;
        unsigned long __user *nm = NULL;
        unsigned long nr_bits, alloc_size;
        DECLARE_BITMAP(bm, MAX_NUMNODES);

        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

        if (nmask) {
                err = compat_get_bitmap(bm, nmask, nr_bits);
                nm = compat_alloc_user_space(alloc_size);
                err |= copy_to_user(nm, bm, alloc_size);
        }

        if (err)
                return -EFAULT;

        return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif

/* Return effective policy for a VMA */
static struct mempolicy *
get_vma_policy(struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = current->mempolicy;

        if (vma) {
                if (vma->vm_ops && vma->vm_ops->get_policy)
                        pol = vma->vm_ops->get_policy(vma, addr);
                else if (vma->vm_policy &&
                                vma->vm_policy->policy != MPOL_DEFAULT)
                        pol = vma->vm_policy;
        }
        if (!pol)
                pol = &default_policy;
        return pol;
}

/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(unsigned int __nocast gfp, struct mempolicy *policy)
{
        int nd;

        switch (policy->policy) {
        case MPOL_PREFERRED:
                nd = policy->v.preferred_node;
                if (nd < 0)
                        nd = numa_node_id();
                break;
        case MPOL_BIND:
                /* Lower zones don't get a policy applied */
                /* Careful: current->mems_allowed might have moved */
                if ((gfp & GFP_ZONEMASK) >= policy_zone)
                        if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
                                return policy->v.zonelist;
                /*FALL THROUGH*/
        case MPOL_INTERLEAVE: /* should not happen */
        case MPOL_DEFAULT:
                nd = numa_node_id();
                break;
        default:
                nd = 0;
                BUG();
        }
        return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK);
}

/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
        unsigned nid, next;
        struct task_struct *me = current;

        nid = me->il_next;
        BUG_ON(nid >= MAX_NUMNODES);
        next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
        if (next >= MAX_NUMNODES)
                next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
        me->il_next = next;
        return nid;
}

/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
                struct vm_area_struct *vma, unsigned long off)
{
        unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
        unsigned target = (unsigned)off % nnodes;
        int c;
        int nid = -1;

        c = 0;
        do {
                nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
                c++;
        } while (c <= target);
        BUG_ON(nid >= MAX_NUMNODES);
        BUG_ON(!test_bit(nid, pol->v.nodes));
        return nid;
}
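
/*
 * Worked example (values assumed): with pol->v.nodes = {1,3,5} and off = 7,
 * nnodes = 3 and target = 7 % 3 = 1, so the do/while above walks to the
 * second set bit and returns node 3.  The same offset therefore always maps
 * to the same node, which keeps file-backed interleaving stable across
 * faults.
 */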

/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned order, unsigned nid)
{
        struct zonelist *zl;
        struct page *page;

        BUG_ON(!node_online(nid));
        zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
        page = __alloc_pages(gfp, order, zl);
        if (page && page_zone(page) == zl->zones[0]) {
                zl->zones[0]->pageset[get_cpu()].interleave_hit++;
                put_cpu();
        }
        return page;
}

/**
 *	alloc_page_vma	- Allocate a page for a VMA.
 *
 *	@gfp:
 *	%GFP_USER    user allocation.
 *	%GFP_KERNEL  kernel allocations,
 *	%GFP_HIGHMEM highmem/user allocations,
 *	%GFP_FS      allocation should not call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 *
 *	@vma:  Pointer to VMA or NULL if not available.
 *	@addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 *	This function allocates a page from the kernel page pool and applies
 *	a NUMA policy associated with the VMA or the current process.
 *	When VMA is not NULL the caller must hold down_read on the mmap_sem
 *	of the mm_struct of the VMA to prevent it from going away.
 *	Should be used for all allocations for pages that will be mapped into
 *	user space. Returns NULL when no page can be allocated.
 *
 *	Should be called with the mmap_sem of the vma held.
 */
struct page *
alloc_page_vma(unsigned int __nocast gfp, struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = get_vma_policy(vma, addr);

        cpuset_update_current_mems_allowed();

        if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
                unsigned nid;
                if (vma) {
                        unsigned long off;
                        BUG_ON(addr >= vma->vm_end);
                        BUG_ON(addr < vma->vm_start);
                        off = vma->vm_pgoff;
                        off += (addr - vma->vm_start) >> PAGE_SHIFT;
                        nid = offset_il_node(pol, vma, off);
                } else {
                        /* fall back to process interleaving */
                        nid = interleave_nodes(pol);
                }
                return alloc_page_interleave(gfp, 0, nid);
        }
        return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}
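
/*
 * Typical call site sketch (assumed, not from this file): an anonymous
 * fault handler holding down_read(&mm->mmap_sem) would allocate with
 *
 *	page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 *
 * so the page lands on the node chosen by the VMA policy (or the process
 * policy when the VMA has none).
 */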

/**
 *	alloc_pages_current - Allocate pages.
 *
 *	@gfp:
 *	%GFP_USER    user allocation,
 *	%GFP_KERNEL  kernel allocation,
 *	%GFP_HIGHMEM highmem allocation,
 *	%GFP_FS      don't call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 *	@order: Power of two of allocation size in pages. 0 is a single page.
 *
 *	Allocate a page from the kernel page pool and, when not in interrupt
 *	context, apply the current process' NUMA policy.
 *	Returns NULL when no page can be allocated.
 *
 *	Don't call cpuset_update_current_mems_allowed() unless
 *	1) it's ok to take cpuset_sem (can WAIT), and
 *	2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(unsigned int __nocast gfp, unsigned order)
{
        struct mempolicy *pol = current->mempolicy;

        if ((gfp & __GFP_WAIT) && !in_interrupt())
                cpuset_update_current_mems_allowed();
        if (!pol || in_interrupt())
                pol = &default_policy;
        if (pol->policy == MPOL_INTERLEAVE)
                return alloc_page_interleave(gfp, order, interleave_nodes(pol));
        return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);
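
/*
 * Note (assumption about the surrounding tree, see <linux/gfp.h>): on
 * CONFIG_NUMA builds the generic alloc_pages(gfp, order) helper is routed
 * to alloc_pages_current(), so e.g. a plain
 *
 *	struct page *p = alloc_pages(GFP_KERNEL, 0);
 *
 * from process context already honours the caller's process policy set via
 * sys_set_mempolicy().
 */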

/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

        if (!new)
                return ERR_PTR(-ENOMEM);
        *new = *old;
        atomic_set(&new->refcnt, 1);
        if (new->policy == MPOL_BIND) {
                int sz = ksize(old->v.zonelist);
                new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
                if (!new->v.zonelist) {
                        kmem_cache_free(policy_cache, new);
                        return ERR_PTR(-ENOMEM);
                }
                memcpy(new->v.zonelist, old->v.zonelist, sz);
        }
        return new;
}

/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
        if (!a || !b)
                return 0;
        if (a->policy != b->policy)
                return 0;
        switch (a->policy) {
        case MPOL_DEFAULT:
                return 1;
        case MPOL_INTERLEAVE:
                return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
        case MPOL_PREFERRED:
                return a->v.preferred_node == b->v.preferred_node;
        case MPOL_BIND: {
                int i;
                for (i = 0; a->v.zonelist->zones[i]; i++)
                        if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
                                return 0;
                return b->v.zonelist->zones[i] == NULL;
        }
        default:
                BUG();
                return 0;
        }
}

/* Slow path of a mpol destructor. */
void __mpol_free(struct mempolicy *p)
{
        if (!atomic_dec_and_test(&p->refcnt))
                return;
        if (p->policy == MPOL_BIND)
                kfree(p->v.zonelist);
        p->policy = MPOL_DEFAULT;
        kmem_cache_free(policy_cache, p);
}

/*
 * Hugetlb policy. Same as above, just works with node numbers instead of
 * zonelists.
 */

/* Find first node suitable for an allocation */
int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = get_vma_policy(vma, addr);

        switch (pol->policy) {
        case MPOL_DEFAULT:
                return numa_node_id();
        case MPOL_BIND:
                return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
        case MPOL_INTERLEAVE:
                return interleave_nodes(pol);
        case MPOL_PREFERRED:
                return pol->v.preferred_node >= 0 ?
                                pol->v.preferred_node : numa_node_id();
        }
        BUG();
        return 0;
}

/* Find secondary valid nodes for an allocation */
int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = get_vma_policy(vma, addr);

        switch (pol->policy) {
        case MPOL_PREFERRED:
        case MPOL_DEFAULT:
        case MPOL_INTERLEAVE:
                return 1;
        case MPOL_BIND: {
                struct zone **z;
                for (z = pol->v.zonelist->zones; *z; z++)
                        if ((*z)->zone_pgdat->node_id == nid)
                                return 1;
                return 0;
        }
        default:
                BUG();
                return 0;
        }
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a red-black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */
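
/*
 * Example of the resulting structure (offsets assumed): after a tmpfs file
 * gets mbind() calls for pages [0,4) with MPOL_INTERLEAVE and pages [4,8)
 * with MPOL_BIND, the inode's shared_policy tree holds two sp_node ranges,
 *
 *	[0,4) -> MPOL_INTERLEAVE    and    [4,8) -> MPOL_BIND,
 *
 * and a later fault at page index 5 finds the MPOL_BIND policy via
 * mpol_shared_policy_lookup(sp, 5).
 */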

/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
        struct rb_node *n = sp->root.rb_node;

        while (n) {
                struct sp_node *p = rb_entry(n, struct sp_node, nd);

                if (start >= p->end)
                        n = n->rb_right;
                else if (end <= p->start)
                        n = n->rb_left;
                else
                        break;
        }
        if (!n)
                return NULL;
        for (;;) {
                struct sp_node *w = NULL;
                struct rb_node *prev = rb_prev(n);
                if (!prev)
                        break;
                w = rb_entry(prev, struct sp_node, nd);
                if (w->end <= start)
                        break;
                n = prev;
        }
        return rb_entry(n, struct sp_node, nd);
}

/* Insert a new shared policy into the list. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
        struct rb_node **p = &sp->root.rb_node;
        struct rb_node *parent = NULL;
        struct sp_node *nd;

        while (*p) {
                parent = *p;
                nd = rb_entry(parent, struct sp_node, nd);
                if (new->start < nd->start)
                        p = &(*p)->rb_left;
                else if (new->end > nd->end)
                        p = &(*p)->rb_right;
                else
                        BUG();
        }
        rb_link_node(&new->nd, parent, p);
        rb_insert_color(&new->nd, &sp->root);
        PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
                 new->policy ? new->policy->policy : 0);
}

/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
        struct mempolicy *pol = NULL;
        struct sp_node *sn;

        if (!sp->root.rb_node)
                return NULL;
        spin_lock(&sp->lock);
        sn = sp_lookup(sp, idx, idx+1);
        if (sn) {
                mpol_get(sn->policy);
                pol = sn->policy;
        }
        spin_unlock(&sp->lock);
        return pol;
}

static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
        PDprintk("deleting %lx-%lx\n", n->start, n->end);
        rb_erase(&n->nd, &sp->root);
        mpol_free(n->policy);
        kmem_cache_free(sn_cache, n);
}

static struct sp_node *
sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
{
        struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

        if (!n)
                return NULL;
        n->start = start;
        n->end = end;
        mpol_get(pol);
        n->policy = pol;
        return n;
}

/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
                                 unsigned long end, struct sp_node *new)
{
        struct sp_node *n, *new2 = NULL;

restart:
        spin_lock(&sp->lock);
        n = sp_lookup(sp, start, end);
        /* Take care of old policies in the same range. */
        while (n && n->start < end) {
                struct rb_node *next = rb_next(&n->nd);
                if (n->start >= start) {
                        if (n->end <= end)
                                sp_delete(sp, n);
                        else
                                n->start = end;
                } else {
                        /* Old policy spanning whole new range. */
                        if (n->end > end) {
                                if (!new2) {
                                        spin_unlock(&sp->lock);
                                        new2 = sp_alloc(end, n->end, n->policy);
                                        if (!new2)
                                                return -ENOMEM;
                                        goto restart;
                                }
                                n->end = start;
                                sp_insert(sp, new2);
                                new2 = NULL;
                                break;
                        } else
                                n->end = start;
                }
                if (!next)
                        break;
                n = rb_entry(next, struct sp_node, nd);
        }
        if (new)
                sp_insert(sp, new);
        spin_unlock(&sp->lock);
        if (new2) {
                mpol_free(new2->policy);
                kmem_cache_free(sn_cache, new2);
        }
        return 0;
}

int mpol_set_shared_policy(struct shared_policy *info,
                        struct vm_area_struct *vma, struct mempolicy *npol)
{
        int err;
        struct sp_node *new = NULL;
        unsigned long sz = vma_pages(vma);

        PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
                 vma->vm_pgoff,
                 sz, npol ? npol->policy : -1,
                 npol ? npol->v.nodes[0] : -1);

        if (npol) {
                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
                if (!new)
                        return -ENOMEM;
        }
        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
        if (err && new)
                kmem_cache_free(sn_cache, new);
        return err;
}

/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
        struct sp_node *n;
        struct rb_node *next;

        if (!p->root.rb_node)
                return;
        spin_lock(&p->lock);
        next = rb_first(&p->root);
        while (next) {
                n = rb_entry(next, struct sp_node, nd);
                next = rb_next(&n->nd);
                mpol_free(n->policy);
                kmem_cache_free(sn_cache, n);
        }
        spin_unlock(&p->lock);
}

/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
        policy_cache = kmem_cache_create("numa_policy",
                                         sizeof(struct mempolicy),
                                         0, SLAB_PANIC, NULL, NULL);

        sn_cache = kmem_cache_create("shared_policy_node",
                                     sizeof(struct sp_node),
                                     0, SLAB_PANIC, NULL, NULL);

        /* Set interleaving policy for system init. This way not all
           the data structures allocated at system boot end up in node zero. */

        if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
                                                        MAX_NUMNODES) < 0)
                printk("numa_policy_init: interleaving failed\n");
}

/* Reset policy of current process to default.
 * Assumes fs == KERNEL_DS */
void numa_default_policy(void)
{
        sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
}