mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy an process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem kernel lowmem allocation don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66    could replace all the switch()es with a mempolicy_ops structure.
  67 */
  68
  69 #include <linux/mempolicy.h>
  70 #include <linux/mm.h>
  71 #include <linux/highmem.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/kernel.h>
  74 #include <linux/sched.h>
  75 #include <linux/mm.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/gfp.h>
  79 #include <linux/slab.h>
  80 #include <linux/string.h>
  81 #include <linux/module.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/mempolicy.h>
  86 #include <asm/tlbflush.h>
  87 #include <asm/uaccess.h>
  88
  89 static kmem_cache_t *policy_cache;
  90 static kmem_cache_t *sn_cache;
  91
  92 #define PDprintk(fmt...)
  93
  94 /* Highest zone. An specific allocation for a zone below that is not
  95    policied. */
  96 static int policy_zone;
  97
  98 struct mempolicy default_policy = {
  99         .refcnt = ATOMIC_INIT(1), /* never free it */
 100         .policy = MPOL_DEFAULT,
 101 };
 102
 103 /* Do sanity checking on a policy */
 104 static int mpol_check_policy(int mode, nodemask_t *nodes)
 105 {
 106         int empty = nodes_empty(*nodes);
 107
 108         switch (mode) {
 109         case MPOL_DEFAULT:
 110                 if (!empty)
 111                         return -EINVAL;
 112                 break;
 113         case MPOL_BIND:
 114         case MPOL_INTERLEAVE:
 115                 /* Preferred will only use the first bit, but allow
 116                    more for now. */
 117                 if (empty)
 118                         return -EINVAL;
 119                 break;
 120         }
 121         return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 122 }
 123 /* Generate a custom zonelist for the BIND policy. */
 124 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 125 {
 126         struct zonelist *zl;
 127         int num, max, nd;
 128
 129         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 130         zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
 131         if (!zl)
 132                 return NULL;
 133         num = 0;
 134         for_each_node_mask(nd, *nodes) {
 135                 int k;
 136                 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
 137                         struct zone *z = &NODE_DATA(nd)->node_zones[k];
 138                         if (!z->present_pages)
 139                                 continue;
 140                         zl->zones[num++] = z;
 141                         if (k > policy_zone)
 142                                 policy_zone = k;
 143                 }
 144         }
 145         zl->zones[num] = NULL;
 146         return zl;
 147 }
 148
 149 /* Create a new policy */
 150 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 151 {
 152         struct mempolicy *policy;
 153
 154         PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 155         if (mode == MPOL_DEFAULT)
 156                 return NULL;
 157         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 158         if (!policy)
 159                 return ERR_PTR(-ENOMEM);
 160         atomic_set(&policy->refcnt, 1);
 161         switch (mode) {
 162         case MPOL_INTERLEAVE:
 163                 policy->v.nodes = *nodes;
 164                 if (nodes_weight(*nodes) == 0) {
 165                         kmem_cache_free(policy_cache, policy);
 166                         return ERR_PTR(-EINVAL);
 167                 }
 168                 break;
 169         case MPOL_PREFERRED:
 170                 policy->v.preferred_node = first_node(*nodes);
 171                 if (policy->v.preferred_node >= MAX_NUMNODES)
 172                         policy->v.preferred_node = -1;
 173                 break;
 174         case MPOL_BIND:
 175                 policy->v.zonelist = bind_zonelist(nodes);
 176                 if (policy->v.zonelist == NULL) {
 177                         kmem_cache_free(policy_cache, policy);
 178                         return ERR_PTR(-ENOMEM);
 179                 }
 180                 break;
 181         }
 182         policy->policy = mode;
 183         return policy;
 184 }
 185
 186 /* Ensure all existing pages follow the policy. */
 187 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 188                 unsigned long addr, unsigned long end, nodemask_t *nodes)
 189 {
 190         pte_t *orig_pte;
 191         pte_t *pte;
 192         spinlock_t *ptl;
 193
 194         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 195         do {
 196                 struct page *page;
 197                 unsigned int nid;
 198
 199                 if (!pte_present(*pte))
 200                         continue;
 201                 page = vm_normal_page(vma, addr, *pte);
 202                 if (!page)
 203                         continue;
 204                 nid = page_to_nid(page);
 205                 if (!node_isset(nid, *nodes))
 206                         break;
 207         } while (pte++, addr += PAGE_SIZE, addr != end);
 208         pte_unmap_unlock(orig_pte, ptl);
 209         return addr != end;
 210 }
 211
 212 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 213                 unsigned long addr, unsigned long end, nodemask_t *nodes)
 214 {
 215         pmd_t *pmd;
 216         unsigned long next;
 217
 218         pmd = pmd_offset(pud, addr);
 219         do {
 220                 next = pmd_addr_end(addr, end);
 221                 if (pmd_none_or_clear_bad(pmd))
 222                         continue;
 223                 if (check_pte_range(vma, pmd, addr, next, nodes))
 224                         return -EIO;
 225         } while (pmd++, addr = next, addr != end);
 226         return 0;
 227 }
 228
 229 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 230                 unsigned long addr, unsigned long end, nodemask_t *nodes)
 231 {
 232         pud_t *pud;
 233         unsigned long next;
 234
 235         pud = pud_offset(pgd, addr);
 236         do {
 237                 next = pud_addr_end(addr, end);
 238                 if (pud_none_or_clear_bad(pud))
 239                         continue;
 240                 if (check_pmd_range(vma, pud, addr, next, nodes))
 241                         return -EIO;
 242         } while (pud++, addr = next, addr != end);
 243         return 0;
 244 }
 245
 246 static inline int check_pgd_range(struct vm_area_struct *vma,
 247                 unsigned long addr, unsigned long end, nodemask_t *nodes)
 248 {
 249         pgd_t *pgd;
 250         unsigned long next;
 251
 252         pgd = pgd_offset(vma->vm_mm, addr);
 253         do {
 254                 next = pgd_addr_end(addr, end);
 255                 if (pgd_none_or_clear_bad(pgd))
 256                         continue;
 257                 if (check_pud_range(vma, pgd, addr, next, nodes))
 258                         return -EIO;
 259         } while (pgd++, addr = next, addr != end);
 260         return 0;
 261 }
 262
 263 /* Step 1: check the range */
 264 static struct vm_area_struct *
 265 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 266             nodemask_t *nodes, unsigned long flags)
 267 {
 268         int err;
 269         struct vm_area_struct *first, *vma, *prev;
 270
 271         first = find_vma(mm, start);
 272         if (!first)
 273                 return ERR_PTR(-EFAULT);
 274         prev = NULL;
 275         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 276                 if (!vma->vm_next && vma->vm_end < end)
 277                         return ERR_PTR(-EFAULT);
 278                 if (prev && prev->vm_end < vma->vm_start)
 279                         return ERR_PTR(-EFAULT);
 280                 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
 281                         unsigned long endvma = vma->vm_end;
 282                         if (endvma > end)
 283                                 endvma = end;
 284                         if (vma->vm_start > start)
 285                                 start = vma->vm_start;
 286                         err = check_pgd_range(vma, start, endvma, nodes);
 287                         if (err) {
 288                                 first = ERR_PTR(err);
 289                                 break;
 290                         }
 291                 }
 292                 prev = vma;
 293         }
 294         return first;
 295 }
 296
 297 /* Apply policy to a single VMA */
 298 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 299 {
 300         int err = 0;
 301         struct mempolicy *old = vma->vm_policy;
 302
 303         PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 304                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 305                  vma->vm_ops, vma->vm_file,
 306                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 307
 308         if (vma->vm_ops && vma->vm_ops->set_policy)
 309                 err = vma->vm_ops->set_policy(vma, new);
 310         if (!err) {
 311                 mpol_get(new);
 312                 vma->vm_policy = new;
 313                 mpol_free(old);
 314         }
 315         return err;
 316 }
 317
 318 /* Step 2: apply policy to a range and do splits. */
 319 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 320                        unsigned long end, struct mempolicy *new)
 321 {
 322         struct vm_area_struct *next;
 323         int err;
 324
 325         err = 0;
 326         for (; vma && vma->vm_start < end; vma = next) {
 327                 next = vma->vm_next;
 328                 if (vma->vm_start < start)
 329                         err = split_vma(vma->vm_mm, vma, start, 1);
 330                 if (!err && vma->vm_end > end)
 331                         err = split_vma(vma->vm_mm, vma, end, 0);
 332                 if (!err)
 333                         err = policy_vma(vma, new);
 334                 if (err)
 335                         break;
 336         }
 337         return err;
 338 }
 339
 340 static int contextualize_policy(int mode, nodemask_t *nodes)
 341 {
 342         if (!nodes)
 343                 return 0;
 344
 345         /* Update current mems_allowed */
 346         cpuset_update_current_mems_allowed();
 347         /* Ignore nodes not set in current->mems_allowed */
 348         cpuset_restrict_to_mems_allowed(nodes->bits);
 349         return mpol_check_policy(mode, nodes);
 350 }
 351
 352 long do_mbind(unsigned long start, unsigned long len,
 353                 unsigned long mode, nodemask_t *nmask, unsigned long flags)
 354 {
 355         struct vm_area_struct *vma;
 356         struct mm_struct *mm = current->mm;
 357         struct mempolicy *new;
 358         unsigned long end;
 359         int err;
 360
 361         if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
 362                 return -EINVAL;
 363         if (start & ~PAGE_MASK)
 364                 return -EINVAL;
 365         if (mode == MPOL_DEFAULT)
 366                 flags &= ~MPOL_MF_STRICT;
 367         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 368         end = start + len;
 369         if (end < start)
 370                 return -EINVAL;
 371         if (end == start)
 372                 return 0;
 373         if (mpol_check_policy(mode, nmask))
 374                 return -EINVAL;
 375         new = mpol_new(mode, nmask);
 376         if (IS_ERR(new))
 377                 return PTR_ERR(new);
 378
 379         PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 380                         mode,nodes_addr(nodes)[0]);
 381
 382         down_write(&mm->mmap_sem);
 383         vma = check_range(mm, start, end, nmask, flags);
 384         err = PTR_ERR(vma);
 385         if (!IS_ERR(vma))
 386                 err = mbind_range(vma, start, end, new);
 387         up_write(&mm->mmap_sem);
 388         mpol_free(new);
 389         return err;
 390 }
 391
 392 /* Set the process memory policy */
 393 long do_set_mempolicy(int mode, nodemask_t *nodes)
 394 {
 395         struct mempolicy *new;
 396
 397         if (contextualize_policy(mode, nodes))
 398                 return -EINVAL;
 399         new = mpol_new(mode, nodes);
 400         if (IS_ERR(new))
 401                 return PTR_ERR(new);
 402         mpol_free(current->mempolicy);
 403         current->mempolicy = new;
 404         if (new && new->policy == MPOL_INTERLEAVE)
 405                 current->il_next = first_node(new->v.nodes);
 406         return 0;
 407 }
 408
 409 /* Fill a zone bitmap for a policy */
 410 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 411 {
 412         int i;
 413
 414         nodes_clear(*nodes);
 415         switch (p->policy) {
 416         case MPOL_BIND:
 417                 for (i = 0; p->v.zonelist->zones[i]; i++)
 418                         node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
 419                                 *nodes);
 420                 break;
 421         case MPOL_DEFAULT:
 422                 break;
 423         case MPOL_INTERLEAVE:
 424                 *nodes = p->v.nodes;
 425                 break;
 426         case MPOL_PREFERRED:
 427                 /* or use current node instead of online map? */
 428                 if (p->v.preferred_node < 0)
 429                         *nodes = node_online_map;
 430                 else
 431                         node_set(p->v.preferred_node, *nodes);
 432                 break;
 433         default:
 434                 BUG();
 435         }
 436 }
 437
 438 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 439 {
 440         struct page *p;
 441         int err;
 442
 443         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 444         if (err >= 0) {
 445                 err = page_to_nid(p);
 446                 put_page(p);
 447         }
 448         return err;
 449 }
 450
 451 /* Retrieve NUMA policy */
 452 long do_get_mempolicy(int *policy, nodemask_t *nmask,
 453                         unsigned long addr, unsigned long flags)
 454 {
 455         int err;
 456         struct mm_struct *mm = current->mm;
 457         struct vm_area_struct *vma = NULL;
 458         struct mempolicy *pol = current->mempolicy;
 459
 460         cpuset_update_current_mems_allowed();
 461         if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 462                 return -EINVAL;
 463         if (flags & MPOL_F_ADDR) {
 464                 down_read(&mm->mmap_sem);
 465                 vma = find_vma_intersection(mm, addr, addr+1);
 466                 if (!vma) {
 467                         up_read(&mm->mmap_sem);
 468                         return -EFAULT;
 469                 }
 470                 if (vma->vm_ops && vma->vm_ops->get_policy)
 471                         pol = vma->vm_ops->get_policy(vma, addr);
 472                 else
 473                         pol = vma->vm_policy;
 474         } else if (addr)
 475                 return -EINVAL;
 476
 477         if (!pol)
 478                 pol = &default_policy;
 479
 480         if (flags & MPOL_F_NODE) {
 481                 if (flags & MPOL_F_ADDR) {
 482                         err = lookup_node(mm, addr);
 483                         if (err < 0)
 484                                 goto out;
 485                         *policy = err;
 486                 } else if (pol == current->mempolicy &&
 487                                 pol->policy == MPOL_INTERLEAVE) {
 488                         *policy = current->il_next;
 489                 } else {
 490                         err = -EINVAL;
 491                         goto out;
 492                 }
 493         } else
 494                 *policy = pol->policy;
 495
 496         if (vma) {
 497                 up_read(&current->mm->mmap_sem);
 498                 vma = NULL;
 499         }
 500
 501         err = 0;
 502         if (nmask)
 503                 get_zonemask(pol, nmask);
 504
 505  out:
 506         if (vma)
 507                 up_read(&current->mm->mmap_sem);
 508         return err;
 509 }
 510
 511 /*
 512  * User space interface with variable sized bitmaps for nodelists.
 513  */
 514
 515 /* Copy a node mask from user space. */
 516 static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
 517                      unsigned long maxnode)
 518 {
 519         unsigned long k;
 520         unsigned long nlongs;
 521         unsigned long endmask;
 522
 523         --maxnode;
 524         nodes_clear(*nodes);
 525         if (maxnode == 0 || !nmask)
 526                 return 0;
 527
 528         nlongs = BITS_TO_LONGS(maxnode);
 529         if ((maxnode % BITS_PER_LONG) == 0)
 530                 endmask = ~0UL;
 531         else
 532                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 533
 534         /* When the user specified more nodes than supported just check
 535            if the non supported part is all zero. */
 536         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 537                 if (nlongs > PAGE_SIZE/sizeof(long))
 538                         return -EINVAL;
 539                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 540                         unsigned long t;
 541                         if (get_user(t, nmask + k))
 542                                 return -EFAULT;
 543                         if (k == nlongs - 1) {
 544                                 if (t & endmask)
 545                                         return -EINVAL;
 546                         } else if (t)
 547                                 return -EINVAL;
 548                 }
 549                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 550                 endmask = ~0UL;
 551         }
 552
 553         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 554                 return -EFAULT;
 555         nodes_addr(*nodes)[nlongs-1] &= endmask;
 556         return 0;
 557 }
 558
 559 /* Copy a kernel node mask to user space */
 560 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 561                               nodemask_t *nodes)
 562 {
 563         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 564         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 565
 566         if (copy > nbytes) {
 567                 if (copy > PAGE_SIZE)
 568                         return -EINVAL;
 569                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 570                         return -EFAULT;
 571                 copy = nbytes;
 572         }
 573         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 574 }
 575
 576 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 577                         unsigned long mode,
 578                         unsigned long __user *nmask, unsigned long maxnode,
 579                         unsigned flags)
 580 {
 581         nodemask_t nodes;
 582         int err;
 583
 584         err = get_nodes(&nodes, nmask, maxnode);
 585         if (err)
 586                 return err;
 587         return do_mbind(start, len, mode, &nodes, flags);
 588 }
 589
 590 /* Set the process memory policy */
 591 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 592                 unsigned long maxnode)
 593 {
 594         int err;
 595         nodemask_t nodes;
 596
 597         if (mode < 0 || mode > MPOL_MAX)
 598                 return -EINVAL;
 599         err = get_nodes(&nodes, nmask, maxnode);
 600         if (err)
 601                 return err;
 602         return do_set_mempolicy(mode, &nodes);
 603 }
 604
 605 /* Retrieve NUMA policy */
 606 asmlinkage long sys_get_mempolicy(int __user *policy,
 607                                 unsigned long __user *nmask,
 608                                 unsigned long maxnode,
 609                                 unsigned long addr, unsigned long flags)
 610 {
 611         int err, pval;
 612         nodemask_t nodes;
 613
 614         if (nmask != NULL && maxnode < MAX_NUMNODES)
 615                 return -EINVAL;
 616
 617         err = do_get_mempolicy(&pval, &nodes, addr, flags);
 618
 619         if (err)
 620                 return err;
 621
 622         if (policy && put_user(pval, policy))
 623                 return -EFAULT;
 624
 625         if (nmask)
 626                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
 627
 628         return err;
 629 }
 630
 631 #ifdef CONFIG_COMPAT
 632
 633 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
 634                                      compat_ulong_t __user *nmask,
 635                                      compat_ulong_t maxnode,
 636                                      compat_ulong_t addr, compat_ulong_t flags)
 637 {
 638         long err;
 639         unsigned long __user *nm = NULL;
 640         unsigned long nr_bits, alloc_size;
 641         DECLARE_BITMAP(bm, MAX_NUMNODES);
 642
 643         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 644         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 645
 646         if (nmask)
 647                 nm = compat_alloc_user_space(alloc_size);
 648
 649         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
 650
 651         if (!err && nmask) {
 652                 err = copy_from_user(bm, nm, alloc_size);
 653                 /* ensure entire bitmap is zeroed */
 654                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
 655                 err |= compat_put_bitmap(nmask, bm, nr_bits);
 656         }
 657
 658         return err;
 659 }
 660
 661 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
 662                                      compat_ulong_t maxnode)
 663 {
 664         long err = 0;
 665         unsigned long __user *nm = NULL;
 666         unsigned long nr_bits, alloc_size;
 667         DECLARE_BITMAP(bm, MAX_NUMNODES);
 668
 669         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 670         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 671
 672         if (nmask) {
 673                 err = compat_get_bitmap(bm, nmask, nr_bits);
 674                 nm = compat_alloc_user_space(alloc_size);
 675                 err |= copy_to_user(nm, bm, alloc_size);
 676         }
 677
 678         if (err)
 679                 return -EFAULT;
 680
 681         return sys_set_mempolicy(mode, nm, nr_bits+1);
 682 }
 683
 684 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 685                              compat_ulong_t mode, compat_ulong_t __user *nmask,
 686                              compat_ulong_t maxnode, compat_ulong_t flags)
 687 {
 688         long err = 0;
 689         unsigned long __user *nm = NULL;
 690         unsigned long nr_bits, alloc_size;
 691         nodemask_t bm;
 692
 693         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 694         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 695
 696         if (nmask) {
 697                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
 698                 nm = compat_alloc_user_space(alloc_size);
 699                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
 700         }
 701
 702         if (err)
 703                 return -EFAULT;
 704
 705         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
 706 }
 707
 708 #endif
 709
 710 /* Return effective policy for a VMA */
 711 struct mempolicy *
 712 get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
 713 {
 714         struct mempolicy *pol = task->mempolicy;
 715
 716         if (vma) {
 717                 if (vma->vm_ops && vma->vm_ops->get_policy)
 718                         pol = vma->vm_ops->get_policy(vma, addr);
 719                 else if (vma->vm_policy &&
 720                                 vma->vm_policy->policy != MPOL_DEFAULT)
 721                         pol = vma->vm_policy;
 722         }
 723         if (!pol)
 724                 pol = &default_policy;
 725         return pol;
 726 }
 727
 728 /* Return a zonelist representing a mempolicy */
 729 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 730 {
 731         int nd;
 732
 733         switch (policy->policy) {
 734         case MPOL_PREFERRED:
 735                 nd = policy->v.preferred_node;
 736                 if (nd < 0)
 737                         nd = numa_node_id();
 738                 break;
 739         case MPOL_BIND:
 740                 /* Lower zones don't get a policy applied */
 741                 /* Careful: current->mems_allowed might have moved */
 742                 if (gfp_zone(gfp) >= policy_zone)
 743                         if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
 744                                 return policy->v.zonelist;
 745                 /*FALL THROUGH*/
 746         case MPOL_INTERLEAVE: /* should not happen */
 747         case MPOL_DEFAULT:
 748                 nd = numa_node_id();
 749                 break;
 750         default:
 751                 nd = 0;
 752                 BUG();
 753         }
 754         return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
 755 }
 756
 757 /* Do dynamic interleaving for a process */
 758 static unsigned interleave_nodes(struct mempolicy *policy)
 759 {
 760         unsigned nid, next;
 761         struct task_struct *me = current;
 762
 763         nid = me->il_next;
 764         next = next_node(nid, policy->v.nodes);
 765         if (next >= MAX_NUMNODES)
 766                 next = first_node(policy->v.nodes);
 767         me->il_next = next;
 768         return nid;
 769 }
 770
 771 /* Do static interleaving for a VMA with known offset. */
 772 static unsigned offset_il_node(struct mempolicy *pol,
 773                 struct vm_area_struct *vma, unsigned long off)
 774 {
 775         unsigned nnodes = nodes_weight(pol->v.nodes);
 776         unsigned target = (unsigned)off % nnodes;
 777         int c;
 778         int nid = -1;
 779
 780         c = 0;
 781         do {
 782                 nid = next_node(nid, pol->v.nodes);
 783                 c++;
 784         } while (c <= target);
 785         return nid;
 786 }
 787
 788 /* Allocate a page in interleaved policy.
 789    Own path because it needs to do special accounting. */
 790 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
 791                                         unsigned nid)
 792 {
 793         struct zonelist *zl;
 794         struct page *page;
 795
 796         zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
 797         page = __alloc_pages(gfp, order, zl);
 798         if (page && page_zone(page) == zl->zones[0]) {
 799                 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
 800                 put_cpu();
 801         }
 802         return page;
 803 }
 804
 805 /**
 806  *      alloc_page_vma  - Allocate a page for a VMA.
 807  *
 808  *      @gfp:
 809  *      %GFP_USER    user allocation.
 810  *      %GFP_KERNEL  kernel allocations,
 811  *      %GFP_HIGHMEM highmem/user allocations,
 812  *      %GFP_FS      allocation should not call back into a file system.
 813  *      %GFP_ATOMIC  don't sleep.
 814  *
 815  *      @vma:  Pointer to VMA or NULL if not available.
 816  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
 817  *
 818  *      This function allocates a page from the kernel page pool and applies
 819  *      a NUMA policy associated with the VMA or the current process.
 820  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
 821  *      mm_struct of the VMA to prevent it from going away. Should be used for
 822  *      all allocations for pages that will be mapped into
 823  *      user space. Returns NULL when no page can be allocated.
 824  *
 825  *      Should be called with the mm_sem of the vma hold.
 826  */
 827 struct page *
 828 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 829 {
 830         struct mempolicy *pol = get_vma_policy(current, vma, addr);
 831
 832         cpuset_update_current_mems_allowed();
 833
 834         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
 835                 unsigned nid;
 836                 if (vma) {
 837                         unsigned long off;
 838                         off = vma->vm_pgoff;
 839                         off += (addr - vma->vm_start) >> PAGE_SHIFT;
 840                         nid = offset_il_node(pol, vma, off);
 841                 } else {
 842                         /* fall back to process interleaving */
 843                         nid = interleave_nodes(pol);
 844                 }
 845                 return alloc_page_interleave(gfp, 0, nid);
 846         }
 847         return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
 848 }
 849
 850 /**
 851  *      alloc_pages_current - Allocate pages.
 852  *
 853  *      @gfp:
 854  *              %GFP_USER   user allocation,
 855  *              %GFP_KERNEL kernel allocation,
 856  *              %GFP_HIGHMEM highmem allocation,
 857  *              %GFP_FS     don't call back into a file system.
 858  *              %GFP_ATOMIC don't sleep.
 859  *      @order: Power of two of allocation size in pages. 0 is a single page.
 860  *
 861  *      Allocate a page from the kernel page pool.  When not in
 862  *      interrupt context and apply the current process NUMA policy.
 863  *      Returns NULL when no page can be allocated.
 864  *
 865  *      Don't call cpuset_update_current_mems_allowed() unless
 866  *      1) it's ok to take cpuset_sem (can WAIT), and
 867  *      2) allocating for current task (not interrupt).
 868  */
 869 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 870 {
 871         struct mempolicy *pol = current->mempolicy;
 872
 873         if ((gfp & __GFP_WAIT) && !in_interrupt())
 874                 cpuset_update_current_mems_allowed();
 875         if (!pol || in_interrupt())
 876                 pol = &default_policy;
 877         if (pol->policy == MPOL_INTERLEAVE)
 878                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
 879         return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
 880 }
 881 EXPORT_SYMBOL(alloc_pages_current);
 882
 883 /* Slow path of a mempolicy copy */
 884 struct mempolicy *__mpol_copy(struct mempolicy *old)
 885 {
 886         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 887
 888         if (!new)
 889                 return ERR_PTR(-ENOMEM);
 890         *new = *old;
 891         atomic_set(&new->refcnt, 1);
 892         if (new->policy == MPOL_BIND) {
 893                 int sz = ksize(old->v.zonelist);
 894                 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
 895                 if (!new->v.zonelist) {
 896                         kmem_cache_free(policy_cache, new);
 897                         return ERR_PTR(-ENOMEM);
 898                 }
 899                 memcpy(new->v.zonelist, old->v.zonelist, sz);
 900         }
 901         return new;
 902 }
 903
 904 /* Slow path of a mempolicy comparison */
 905 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 906 {
 907         if (!a || !b)
 908                 return 0;
 909         if (a->policy != b->policy)
 910                 return 0;
 911         switch (a->policy) {
 912         case MPOL_DEFAULT:
 913                 return 1;
 914         case MPOL_INTERLEAVE:
 915                 return nodes_equal(a->v.nodes, b->v.nodes);
 916         case MPOL_PREFERRED:
 917                 return a->v.preferred_node == b->v.preferred_node;
 918         case MPOL_BIND: {
 919                 int i;
 920                 for (i = 0; a->v.zonelist->zones[i]; i++)
 921                         if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
 922                                 return 0;
 923                 return b->v.zonelist->zones[i] == NULL;
 924         }
 925         default:
 926                 BUG();
 927                 return 0;
 928         }
 929 }
 930
 931 /* Slow path of a mpol destructor. */
 932 void __mpol_free(struct mempolicy *p)
 933 {
 934         if (!atomic_dec_and_test(&p->refcnt))
 935                 return;
 936         if (p->policy == MPOL_BIND)
 937                 kfree(p->v.zonelist);
 938         p->policy = MPOL_DEFAULT;
 939         kmem_cache_free(policy_cache, p);
 940 }
 941
 942 /*
 943  * Hugetlb policy. Same as above, just works with node numbers instead of
 944  * zonelists.
 945  */
 946
 947 /* Find first node suitable for an allocation */
 948 int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
 949 {
 950         struct mempolicy *pol = get_vma_policy(current, vma, addr);
 951
 952         switch (pol->policy) {
 953         case MPOL_DEFAULT:
 954                 return numa_node_id();
 955         case MPOL_BIND:
 956                 return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
 957         case MPOL_INTERLEAVE:
 958                 return interleave_nodes(pol);
 959         case MPOL_PREFERRED:
 960                 return pol->v.preferred_node >= 0 ?
 961                                 pol->v.preferred_node : numa_node_id();
 962         }
 963         BUG();
 964         return 0;
 965 }
 966
 967 /* Find secondary valid nodes for an allocation */
 968 int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
 969 {
 970         struct mempolicy *pol = get_vma_policy(current, vma, addr);
 971
 972         switch (pol->policy) {
 973         case MPOL_PREFERRED:
 974         case MPOL_DEFAULT:
 975         case MPOL_INTERLEAVE:
 976                 return 1;
 977         case MPOL_BIND: {
 978                 struct zone **z;
 979                 for (z = pol->v.zonelist->zones; *z; z++)
 980                         if ((*z)->zone_pgdat->node_id == nid)
 981                                 return 1;
 982                 return 0;
 983         }
 984         default:
 985                 BUG();
 986                 return 0;
 987         }
 988 }
 989
 990 /*
 991  * Shared memory backing store policy support.
 992  *
 993  * Remember policies even when nobody has shared memory mapped.
 994  * The policies are kept in Red-Black tree linked from the inode.
 995  * They are protected by the sp->lock spinlock, which should be held
 996  * for any accesses to the tree.
 997  */
 998
 999 /* lookup first element intersecting start-end */
1000 /* Caller holds sp->lock */
1001 static struct sp_node *
1002 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1003 {
1004         struct rb_node *n = sp->root.rb_node;
1005
1006         while (n) {
1007                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1008
1009                 if (start >= p->end)
1010                         n = n->rb_right;
1011                 else if (end <= p->start)
1012                         n = n->rb_left;
1013                 else
1014                         break;
1015         }
1016         if (!n)
1017                 return NULL;
1018         for (;;) {
1019                 struct sp_node *w = NULL;
1020                 struct rb_node *prev = rb_prev(n);
1021                 if (!prev)
1022                         break;
1023                 w = rb_entry(prev, struct sp_node, nd);
1024                 if (w->end <= start)
1025                         break;
1026                 n = prev;
1027         }
1028         return rb_entry(n, struct sp_node, nd);
1029 }
1030
1031 /* Insert a new shared policy into the list. */
1032 /* Caller holds sp->lock */
1033 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1034 {
1035         struct rb_node **p = &sp->root.rb_node;
1036         struct rb_node *parent = NULL;
1037         struct sp_node *nd;
1038
1039         while (*p) {
1040                 parent = *p;
1041                 nd = rb_entry(parent, struct sp_node, nd);
1042                 if (new->start < nd->start)
1043                         p = &(*p)->rb_left;
1044                 else if (new->end > nd->end)
1045                         p = &(*p)->rb_right;
1046                 else
1047                         BUG();
1048         }
1049         rb_link_node(&new->nd, parent, p);
1050         rb_insert_color(&new->nd, &sp->root);
1051         PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1052                  new->policy ? new->policy->policy : 0);
1053 }
1054
1055 /* Find shared policy intersecting idx */
1056 struct mempolicy *
1057 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1058 {
1059         struct mempolicy *pol = NULL;
1060         struct sp_node *sn;
1061
1062         if (!sp->root.rb_node)
1063                 return NULL;
1064         spin_lock(&sp->lock);
1065         sn = sp_lookup(sp, idx, idx+1);
1066         if (sn) {
1067                 mpol_get(sn->policy);
1068                 pol = sn->policy;
1069         }
1070         spin_unlock(&sp->lock);
1071         return pol;
1072 }
1073
1074 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1075 {
1076         PDprintk("deleting %lx-l%x\n", n->start, n->end);
1077         rb_erase(&n->nd, &sp->root);
1078         mpol_free(n->policy);
1079         kmem_cache_free(sn_cache, n);
1080 }
1081
1082 struct sp_node *
1083 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1084 {
1085         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1086
1087         if (!n)
1088                 return NULL;
1089         n->start = start;
1090         n->end = end;
1091         mpol_get(pol);
1092         n->policy = pol;
1093         return n;
1094 }
1095
1096 /* Replace a policy range. */
1097 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1098                                  unsigned long end, struct sp_node *new)
1099 {
1100         struct sp_node *n, *new2 = NULL;
1101
1102 restart:
1103         spin_lock(&sp->lock);
1104         n = sp_lookup(sp, start, end);
1105         /* Take care of old policies in the same range. */
1106         while (n && n->start < end) {
1107                 struct rb_node *next = rb_next(&n->nd);
1108                 if (n->start >= start) {
1109                         if (n->end <= end)
1110                                 sp_delete(sp, n);
1111                         else
1112                                 n->start = end;
1113                 } else {
1114                         /* Old policy spanning whole new range. */
1115                         if (n->end > end) {
1116                                 if (!new2) {
1117                                         spin_unlock(&sp->lock);
1118                                         new2 = sp_alloc(end, n->end, n->policy);
1119                                         if (!new2)
1120                                                 return -ENOMEM;
1121                                         goto restart;
1122                                 }
1123                                 n->end = start;
1124                                 sp_insert(sp, new2);
1125                                 new2 = NULL;
1126                                 break;
1127                         } else
1128                                 n->end = start;
1129                 }
1130                 if (!next)
1131                         break;
1132                 n = rb_entry(next, struct sp_node, nd);
1133         }
1134         if (new)
1135                 sp_insert(sp, new);
1136         spin_unlock(&sp->lock);
1137         if (new2) {
1138                 mpol_free(new2->policy);
1139                 kmem_cache_free(sn_cache, new2);
1140         }
1141         return 0;
1142 }
1143
1144 int mpol_set_shared_policy(struct shared_policy *info,
1145                         struct vm_area_struct *vma, struct mempolicy *npol)
1146 {
1147         int err;
1148         struct sp_node *new = NULL;
1149         unsigned long sz = vma_pages(vma);
1150
1151         PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1152                  vma->vm_pgoff,
1153                  sz, npol? npol->policy : -1,
1154                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1155
1156         if (npol) {
1157                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1158                 if (!new)
1159                         return -ENOMEM;
1160         }
1161         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1162         if (err && new)
1163                 kmem_cache_free(sn_cache, new);
1164         return err;
1165 }
1166
1167 /* Free a backing policy store on inode delete. */
1168 void mpol_free_shared_policy(struct shared_policy *p)
1169 {
1170         struct sp_node *n;
1171         struct rb_node *next;
1172
1173         if (!p->root.rb_node)
1174                 return;
1175         spin_lock(&p->lock);
1176         next = rb_first(&p->root);
1177         while (next) {
1178                 n = rb_entry(next, struct sp_node, nd);
1179                 next = rb_next(&n->nd);
1180                 rb_erase(&n->nd, &p->root);
1181                 mpol_free(n->policy);
1182                 kmem_cache_free(sn_cache, n);
1183         }
1184         spin_unlock(&p->lock);
1185 }
1186
1187 /* assumes fs == KERNEL_DS */
1188 void __init numa_policy_init(void)
1189 {
1190         policy_cache = kmem_cache_create("numa_policy",
1191                                          sizeof(struct mempolicy),
1192                                          0, SLAB_PANIC, NULL, NULL);
1193
1194         sn_cache = kmem_cache_create("shared_policy_node",
1195                                      sizeof(struct sp_node),
1196                                      0, SLAB_PANIC, NULL, NULL);
1197
1198         /* Set interleaving policy for system init. This way not all
1199            the data structures allocated at system boot end up in node zero. */
1200
1201         if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1202                 printk("numa_policy_init: interleaving failed\n");
1203 }
1204
1205 /* Reset policy of current process to default */
1206 void numa_default_policy(void)
1207 {
1208         do_set_mempolicy(MPOL_DEFAULT, NULL);
1209 }
1210
1211 /* Migrate a policy to a different set of nodes */
1212 static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1213                                                         const nodemask_t *new)
1214 {
1215         nodemask_t tmp;
1216
1217         if (!pol)
1218                 return;
1219
1220         switch (pol->policy) {
1221         case MPOL_DEFAULT:
1222                 break;
1223         case MPOL_INTERLEAVE:
1224                 nodes_remap(tmp, pol->v.nodes, *old, *new);
1225                 pol->v.nodes = tmp;
1226                 current->il_next = node_remap(current->il_next, *old, *new);
1227                 break;
1228         case MPOL_PREFERRED:
1229                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1230                                                                 *old, *new);
1231                 break;
1232         case MPOL_BIND: {
1233                 nodemask_t nodes;
1234                 struct zone **z;
1235                 struct zonelist *zonelist;
1236
1237                 nodes_clear(nodes);
1238                 for (z = pol->v.zonelist->zones; *z; z++)
1239                         node_set((*z)->zone_pgdat->node_id, nodes);
1240                 nodes_remap(tmp, nodes, *old, *new);
1241                 nodes = tmp;
1242
1243                 zonelist = bind_zonelist(&nodes);
1244
1245                 /* If no mem, then zonelist is NULL and we keep old zonelist.
1246                  * If that old zonelist has no remaining mems_allowed nodes,
1247                  * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1248                  */
1249
1250                 if (zonelist) {
1251                         /* Good - got mem - substitute new zonelist */
1252                         kfree(pol->v.zonelist);
1253                         pol->v.zonelist = zonelist;
1254                 }
1255                 break;
1256         }
1257         default:
1258                 BUG();
1259                 break;
1260         }
1261 }
1262
1263 /*
1264  * Someone moved this task to different nodes.  Fixup mempolicies.
1265  *
1266  * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
1267  * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
1268  */
1269 void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
1270 {
1271         rebind_policy(current->mempolicy, old, new);
1272 }