mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
   19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
   49  * on systems with highmem, kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66    could replace all the switch()es with a mempolicy_ops structure.
  67 */
  68
  69 #include <linux/mempolicy.h>
  70 #include <linux/mm.h>
  71 #include <linux/highmem.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/kernel.h>
  74 #include <linux/sched.h>
  75 #include <linux/mm.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/gfp.h>
  79 #include <linux/slab.h>
  80 #include <linux/string.h>
  81 #include <linux/module.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/mempolicy.h>
  86 #include <asm/tlbflush.h>
  87 #include <asm/uaccess.h>
  88
  89 static kmem_cache_t *policy_cache;
  90 static kmem_cache_t *sn_cache;
  91
  92 #define PDprintk(fmt...)
  93
  94 /* Highest zone. An specific allocation for a zone below that is not
  95    policied. */
  96 static int policy_zone;
  97
  98 struct mempolicy default_policy = {
  99         .refcnt = ATOMIC_INIT(1), /* never free it */
 100         .policy = MPOL_DEFAULT,
 101 };
 102
 103 /* Do sanity checking on a policy */
 104 static int mpol_check_policy(int mode, nodemask_t *nodes)
 105 {
 106         int empty = nodes_empty(*nodes);
 107
 108         switch (mode) {
 109         case MPOL_DEFAULT:
 110                 if (!empty)
 111                         return -EINVAL;
 112                 break;
 113         case MPOL_BIND:
 114         case MPOL_INTERLEAVE:
 115                 /* Preferred will only use the first bit, but allow
 116                    more for now. */
 117                 if (empty)
 118                         return -EINVAL;
 119                 break;
 120         }
 121         return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 122 }
 123 /* Generate a custom zonelist for the BIND policy. */
 124 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 125 {
 126         struct zonelist *zl;
 127         int num, max, nd;
 128
 129         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 130         zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
 131         if (!zl)
 132                 return NULL;
 133         num = 0;
 134         for_each_node_mask(nd, *nodes) {
 135                 int k;
 136                 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
 137                         struct zone *z = &NODE_DATA(nd)->node_zones[k];
 138                         if (!z->present_pages)
 139                                 continue;
 140                         zl->zones[num++] = z;
 141                         if (k > policy_zone)
 142                                 policy_zone = k;
 143                 }
 144         }
 145         zl->zones[num] = NULL;
 146         return zl;
 147 }
 148
 149 /* Create a new policy */
 150 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 151 {
 152         struct mempolicy *policy;
 153
 154         PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 155         if (mode == MPOL_DEFAULT)
 156                 return NULL;
 157         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 158         if (!policy)
 159                 return ERR_PTR(-ENOMEM);
 160         atomic_set(&policy->refcnt, 1);
 161         switch (mode) {
 162         case MPOL_INTERLEAVE:
 163                 policy->v.nodes = *nodes;
 164                 if (nodes_weight(*nodes) == 0) {
 165                         kmem_cache_free(policy_cache, policy);
 166                         return ERR_PTR(-EINVAL);
 167                 }
 168                 break;
 169         case MPOL_PREFERRED:
 170                 policy->v.preferred_node = first_node(*nodes);
 171                 if (policy->v.preferred_node >= MAX_NUMNODES)
 172                         policy->v.preferred_node = -1;
 173                 break;
 174         case MPOL_BIND:
 175                 policy->v.zonelist = bind_zonelist(nodes);
 176                 if (policy->v.zonelist == NULL) {
 177                         kmem_cache_free(policy_cache, policy);
 178                         return ERR_PTR(-ENOMEM);
 179                 }
 180                 break;
 181         }
 182         policy->policy = mode;
 183         return policy;
 184 }
 185
 186 /* Ensure all existing pages follow the policy. */
 187 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 188                 unsigned long addr, unsigned long end, nodemask_t *nodes)
 189 {
 190         pte_t *orig_pte;
 191         pte_t *pte;
 192         spinlock_t *ptl;
 193
 194         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 195         do {
 196                 struct page *page;
 197                 unsigned int nid;
 198
 199                 if (!pte_present(*pte))
 200                         continue;
 201                 page = vm_normal_page(vma, addr, *pte);
 202                 if (!page)
 203                         continue;
 204                 nid = page_to_nid(page);
 205                 if (!node_isset(nid, *nodes))
 206                         break;
 207         } while (pte++, addr += PAGE_SIZE, addr != end);
 208         pte_unmap_unlock(orig_pte, ptl);
 209         return addr != end;
 210 }
 211
 212 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 213                 unsigned long addr, unsigned long end, nodemask_t *nodes)
 214 {
 215         pmd_t *pmd;
 216         unsigned long next;
 217
 218         pmd = pmd_offset(pud, addr);
 219         do {
 220                 next = pmd_addr_end(addr, end);
 221                 if (pmd_none_or_clear_bad(pmd))
 222                         continue;
 223                 if (check_pte_range(vma, pmd, addr, next, nodes))
 224                         return -EIO;
 225         } while (pmd++, addr = next, addr != end);
 226         return 0;
 227 }
 228
 229 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 230                 unsigned long addr, unsigned long end, nodemask_t *nodes)
 231 {
 232         pud_t *pud;
 233         unsigned long next;
 234
 235         pud = pud_offset(pgd, addr);
 236         do {
 237                 next = pud_addr_end(addr, end);
 238                 if (pud_none_or_clear_bad(pud))
 239                         continue;
 240                 if (check_pmd_range(vma, pud, addr, next, nodes))
 241                         return -EIO;
 242         } while (pud++, addr = next, addr != end);
 243         return 0;
 244 }
 245
 246 static inline int check_pgd_range(struct vm_area_struct *vma,
 247                 unsigned long addr, unsigned long end, nodemask_t *nodes)
 248 {
 249         pgd_t *pgd;
 250         unsigned long next;
 251
 252         pgd = pgd_offset(vma->vm_mm, addr);
 253         do {
 254                 next = pgd_addr_end(addr, end);
 255                 if (pgd_none_or_clear_bad(pgd))
 256                         continue;
 257                 if (check_pud_range(vma, pgd, addr, next, nodes))
 258                         return -EIO;
 259         } while (pgd++, addr = next, addr != end);
 260         return 0;
 261 }
 262
 263 /* Step 1: check the range */
 264 static struct vm_area_struct *
 265 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 266             nodemask_t *nodes, unsigned long flags)
 267 {
 268         int err;
 269         struct vm_area_struct *first, *vma, *prev;
 270
 271         first = find_vma(mm, start);
 272         if (!first)
 273                 return ERR_PTR(-EFAULT);
 274         prev = NULL;
 275         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 276                 if (!vma->vm_next && vma->vm_end < end)
 277                         return ERR_PTR(-EFAULT);
 278                 if (prev && prev->vm_end < vma->vm_start)
 279                         return ERR_PTR(-EFAULT);
 280                 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
 281                         unsigned long endvma = vma->vm_end;
 282                         if (endvma > end)
 283                                 endvma = end;
 284                         if (vma->vm_start > start)
 285                                 start = vma->vm_start;
 286                         err = check_pgd_range(vma, start, endvma, nodes);
 287                         if (err) {
 288                                 first = ERR_PTR(err);
 289                                 break;
 290                         }
 291                 }
 292                 prev = vma;
 293         }
 294         return first;
 295 }
 296
 297 /* Apply policy to a single VMA */
 298 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 299 {
 300         int err = 0;
 301         struct mempolicy *old = vma->vm_policy;
 302
 303         PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 304                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 305                  vma->vm_ops, vma->vm_file,
 306                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 307
 308         if (vma->vm_ops && vma->vm_ops->set_policy)
 309                 err = vma->vm_ops->set_policy(vma, new);
 310         if (!err) {
 311                 mpol_get(new);
 312                 vma->vm_policy = new;
 313                 mpol_free(old);
 314         }
 315         return err;
 316 }
 317
 318 /* Step 2: apply policy to a range and do splits. */
 319 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 320                        unsigned long end, struct mempolicy *new)
 321 {
 322         struct vm_area_struct *next;
 323         int err;
 324
 325         err = 0;
 326         for (; vma && vma->vm_start < end; vma = next) {
 327                 next = vma->vm_next;
 328                 if (vma->vm_start < start)
 329                         err = split_vma(vma->vm_mm, vma, start, 1);
 330                 if (!err && vma->vm_end > end)
 331                         err = split_vma(vma->vm_mm, vma, end, 0);
 332                 if (!err)
 333                         err = policy_vma(vma, new);
 334                 if (err)
 335                         break;
 336         }
 337         return err;
 338 }
 339
 340 static int contextualize_policy(int mode, nodemask_t *nodes)
 341 {
 342         if (!nodes)
 343                 return 0;
 344
 345         /* Update current mems_allowed */
 346         cpuset_update_current_mems_allowed();
 347         /* Ignore nodes not set in current->mems_allowed */
 348         cpuset_restrict_to_mems_allowed(nodes->bits);
 349         return mpol_check_policy(mode, nodes);
 350 }
 351
 352 long do_mbind(unsigned long start, unsigned long len,
 353                 unsigned long mode, nodemask_t *nmask, unsigned long flags)
 354 {
 355         struct vm_area_struct *vma;
 356         struct mm_struct *mm = current->mm;
 357         struct mempolicy *new;
 358         unsigned long end;
 359         int err;
 360
 361         if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
 362                 return -EINVAL;
 363         if (start & ~PAGE_MASK)
 364                 return -EINVAL;
 365         if (mode == MPOL_DEFAULT)
 366                 flags &= ~MPOL_MF_STRICT;
 367         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 368         end = start + len;
 369         if (end < start)
 370                 return -EINVAL;
 371         if (end == start)
 372                 return 0;
 373         if (mpol_check_policy(mode, nmask))
 374                 return -EINVAL;
 375         new = mpol_new(mode, nmask);
 376         if (IS_ERR(new))
 377                 return PTR_ERR(new);
 378
 379         PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 380                         mode,nodes_addr(nodes)[0]);
 381
 382         down_write(&mm->mmap_sem);
 383         vma = check_range(mm, start, end, nmask, flags);
 384         err = PTR_ERR(vma);
 385         if (!IS_ERR(vma))
 386                 err = mbind_range(vma, start, end, new);
 387         up_write(&mm->mmap_sem);
 388         mpol_free(new);
 389         return err;
 390 }
 391
 392 /* Set the process memory policy */
 393 long do_set_mempolicy(int mode, nodemask_t *nodes)
 394 {
 395         struct mempolicy *new;
 396
 397         if (contextualize_policy(mode, nodes))
 398                 return -EINVAL;
 399         new = mpol_new(mode, nodes);
 400         if (IS_ERR(new))
 401                 return PTR_ERR(new);
 402         mpol_free(current->mempolicy);
 403         current->mempolicy = new;
 404         if (new && new->policy == MPOL_INTERLEAVE)
 405                 current->il_next = first_node(new->v.nodes);
 406         return 0;
 407 }
 408
 409 /* Fill a zone bitmap for a policy */
 410 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 411 {
 412         int i;
 413
 414         nodes_clear(*nodes);
 415         switch (p->policy) {
 416         case MPOL_BIND:
 417                 for (i = 0; p->v.zonelist->zones[i]; i++)
 418                         node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
 419                                 *nodes);
 420                 break;
 421         case MPOL_DEFAULT:
 422                 break;
 423         case MPOL_INTERLEAVE:
 424                 *nodes = p->v.nodes;
 425                 break;
 426         case MPOL_PREFERRED:
 427                 /* or use current node instead of online map? */
 428                 if (p->v.preferred_node < 0)
 429                         *nodes = node_online_map;
 430                 else
 431                         node_set(p->v.preferred_node, *nodes);
 432                 break;
 433         default:
 434                 BUG();
 435         }
 436 }
 437
 438 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 439 {
 440         struct page *p;
 441         int err;
 442
 443         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 444         if (err >= 0) {
 445                 err = page_to_nid(p);
 446                 put_page(p);
 447         }
 448         return err;
 449 }
 450
 451 /* Retrieve NUMA policy */
 452 long do_get_mempolicy(int *policy, nodemask_t *nmask,
 453                         unsigned long addr, unsigned long flags)
 454 {
 455         int err;
 456         struct mm_struct *mm = current->mm;
 457         struct vm_area_struct *vma = NULL;
 458         struct mempolicy *pol = current->mempolicy;
 459
 460         cpuset_update_current_mems_allowed();
 461         if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 462                 return -EINVAL;
 463         if (flags & MPOL_F_ADDR) {
 464                 down_read(&mm->mmap_sem);
 465                 vma = find_vma_intersection(mm, addr, addr+1);
 466                 if (!vma) {
 467                         up_read(&mm->mmap_sem);
 468                         return -EFAULT;
 469                 }
 470                 if (vma->vm_ops && vma->vm_ops->get_policy)
 471                         pol = vma->vm_ops->get_policy(vma, addr);
 472                 else
 473                         pol = vma->vm_policy;
 474         } else if (addr)
 475                 return -EINVAL;
 476
 477         if (!pol)
 478                 pol = &default_policy;
 479
 480         if (flags & MPOL_F_NODE) {
 481                 if (flags & MPOL_F_ADDR) {
 482                         err = lookup_node(mm, addr);
 483                         if (err < 0)
 484                                 goto out;
 485                         *policy = err;
 486                 } else if (pol == current->mempolicy &&
 487                                 pol->policy == MPOL_INTERLEAVE) {
 488                         *policy = current->il_next;
 489                 } else {
 490                         err = -EINVAL;
 491                         goto out;
 492                 }
 493         } else
 494                 *policy = pol->policy;
 495
 496         if (vma) {
 497                 up_read(&current->mm->mmap_sem);
 498                 vma = NULL;
 499         }
 500
 501         err = 0;
 502         if (nmask)
 503                 get_zonemask(pol, nmask);
 504
 505  out:
 506         if (vma)
 507                 up_read(&current->mm->mmap_sem);
 508         return err;
 509 }
 510
 511 /*
 512  * User space interface with variable sized bitmaps for nodelists.
 513  */
 514
 515 /* Copy a node mask from user space. */
 516 static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
 517                      unsigned long maxnode)
 518 {
 519         unsigned long k;
 520         unsigned long nlongs;
 521         unsigned long endmask;
 522
 523         --maxnode;
 524         nodes_clear(*nodes);
 525         if (maxnode == 0 || !nmask)
 526                 return 0;
 527
 528         nlongs = BITS_TO_LONGS(maxnode);
 529         if ((maxnode % BITS_PER_LONG) == 0)
 530                 endmask = ~0UL;
 531         else
 532                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 533
 534         /* When the user specified more nodes than supported just check
 535            if the non supported part is all zero. */
 536         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 537                 if (nlongs > PAGE_SIZE/sizeof(long))
 538                         return -EINVAL;
 539                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 540                         unsigned long t;
 541                         if (get_user(t, nmask + k))
 542                                 return -EFAULT;
 543                         if (k == nlongs - 1) {
 544                                 if (t & endmask)
 545                                         return -EINVAL;
 546                         } else if (t)
 547                                 return -EINVAL;
 548                 }
 549                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 550                 endmask = ~0UL;
 551         }
 552
 553         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 554                 return -EFAULT;
 555         nodes_addr(*nodes)[nlongs-1] &= endmask;
 556         return 0;
 557 }
 558
 559 /* Copy a kernel node mask to user space */
 560 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 561                               nodemask_t *nodes)
 562 {
 563         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 564         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 565
 566         if (copy > nbytes) {
 567                 if (copy > PAGE_SIZE)
 568                         return -EINVAL;
 569                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 570                         return -EFAULT;
 571                 copy = nbytes;
 572         }
 573         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 574 }
 575
 576 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 577                         unsigned long mode,
 578                         unsigned long __user *nmask, unsigned long maxnode,
 579                         unsigned flags)
 580 {
 581         nodemask_t nodes;
 582         int err;
 583
 584         err = get_nodes(&nodes, nmask, maxnode);
 585         if (err)
 586                 return err;
 587         return do_mbind(start, len, mode, &nodes, flags);
 588 }
 589
 590 /* Set the process memory policy */
 591 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 592                 unsigned long maxnode)
 593 {
 594         int err;
 595         nodemask_t nodes;
 596
 597         if (mode < 0 || mode > MPOL_MAX)
 598                 return -EINVAL;
 599         err = get_nodes(&nodes, nmask, maxnode);
 600         if (err)
 601                 return err;
 602         return do_set_mempolicy(mode, &nodes);
 603 }
 604
 605 /* Retrieve NUMA policy */
 606 asmlinkage long sys_get_mempolicy(int __user *policy,
 607                                 unsigned long __user *nmask,
 608                                 unsigned long maxnode,
 609                                 unsigned long addr, unsigned long flags)
 610 {
 611         int err, pval;
 612         nodemask_t nodes;
 613
 614         if (nmask != NULL && maxnode < MAX_NUMNODES)
 615                 return -EINVAL;
 616
 617         err = do_get_mempolicy(&pval, &nodes, addr, flags);
 618
 619         if (err)
 620                 return err;
 621
 622         if (policy && put_user(pval, policy))
 623                 return -EFAULT;
 624
 625         if (nmask)
 626                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
 627
 628         return err;
 629 }
 630
 631 #ifdef CONFIG_COMPAT
 632
 633 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
 634                                      compat_ulong_t __user *nmask,
 635                                      compat_ulong_t maxnode,
 636                                      compat_ulong_t addr, compat_ulong_t flags)
 637 {
 638         long err;
 639         unsigned long __user *nm = NULL;
 640         unsigned long nr_bits, alloc_size;
 641         DECLARE_BITMAP(bm, MAX_NUMNODES);
 642
 643         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 644         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 645
 646         if (nmask)
 647                 nm = compat_alloc_user_space(alloc_size);
 648
 649         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
 650
 651         if (!err && nmask) {
 652                 err = copy_from_user(bm, nm, alloc_size);
 653                 /* ensure entire bitmap is zeroed */
 654                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
 655                 err |= compat_put_bitmap(nmask, bm, nr_bits);
 656         }
 657
 658         return err;
 659 }
 660
 661 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
 662                                      compat_ulong_t maxnode)
 663 {
 664         long err = 0;
 665         unsigned long __user *nm = NULL;
 666         unsigned long nr_bits, alloc_size;
 667         DECLARE_BITMAP(bm, MAX_NUMNODES);
 668
 669         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 670         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 671
 672         if (nmask) {
 673                 err = compat_get_bitmap(bm, nmask, nr_bits);
 674                 nm = compat_alloc_user_space(alloc_size);
 675                 err |= copy_to_user(nm, bm, alloc_size);
 676         }
 677
 678         if (err)
 679                 return -EFAULT;
 680
 681         return sys_set_mempolicy(mode, nm, nr_bits+1);
 682 }
 683
 684 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 685                              compat_ulong_t mode, compat_ulong_t __user *nmask,
 686                              compat_ulong_t maxnode, compat_ulong_t flags)
 687 {
 688         long err = 0;
 689         unsigned long __user *nm = NULL;
 690         unsigned long nr_bits, alloc_size;
 691         nodemask_t bm;
 692
 693         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 694         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 695
 696         if (nmask) {
 697                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
 698                 nm = compat_alloc_user_space(alloc_size);
 699                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
 700         }
 701
 702         if (err)
 703                 return -EFAULT;
 704
 705         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
 706 }
 707
 708 #endif
 709
 710 /* Return effective policy for a VMA */
 711 struct mempolicy *
 712 get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
 713 {
 714         struct mempolicy *pol = task->mempolicy;
 715
 716         if (vma) {
 717                 if (vma->vm_ops && vma->vm_ops->get_policy)
 718                         pol = vma->vm_ops->get_policy(vma, addr);
 719                 else if (vma->vm_policy &&
 720                                 vma->vm_policy->policy != MPOL_DEFAULT)
 721                         pol = vma->vm_policy;
 722         }
 723         if (!pol)
 724                 pol = &default_policy;
 725         return pol;
 726 }
 727
 728 /* Return a zonelist representing a mempolicy */
 729 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 730 {
 731         int nd;
 732
 733         switch (policy->policy) {
 734         case MPOL_PREFERRED:
 735                 nd = policy->v.preferred_node;
 736                 if (nd < 0)
 737                         nd = numa_node_id();
 738                 break;
 739         case MPOL_BIND:
 740                 /* Lower zones don't get a policy applied */
 741                 /* Careful: current->mems_allowed might have moved */
 742                 if (gfp_zone(gfp) >= policy_zone)
 743                         if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
 744                                 return policy->v.zonelist;
 745                 /*FALL THROUGH*/
 746         case MPOL_INTERLEAVE: /* should not happen */
 747         case MPOL_DEFAULT:
 748                 nd = numa_node_id();
 749                 break;
 750         default:
 751                 nd = 0;
 752                 BUG();
 753         }
 754         return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
 755 }
 756
 757 /* Do dynamic interleaving for a process */
 758 static unsigned interleave_nodes(struct mempolicy *policy)
 759 {
 760         unsigned nid, next;
 761         struct task_struct *me = current;
 762
 763         nid = me->il_next;
 764         next = next_node(nid, policy->v.nodes);
 765         if (next >= MAX_NUMNODES)
 766                 next = first_node(policy->v.nodes);
 767         me->il_next = next;
 768         return nid;
 769 }
 770
 771 /* Do static interleaving for a VMA with known offset. */
 772 static unsigned offset_il_node(struct mempolicy *pol,
 773                 struct vm_area_struct *vma, unsigned long off)
 774 {
 775         unsigned nnodes = nodes_weight(pol->v.nodes);
 776         unsigned target = (unsigned)off % nnodes;
 777         int c;
 778         int nid = -1;
 779
 780         c = 0;
 781         do {
 782                 nid = next_node(nid, pol->v.nodes);
 783                 c++;
 784         } while (c <= target);
 785         return nid;
 786 }
 787
 788 /* Determine a node number for interleave */
 789 static inline unsigned interleave_nid(struct mempolicy *pol,
 790                  struct vm_area_struct *vma, unsigned long addr, int shift)
 791 {
 792         if (vma) {
 793                 unsigned long off;
 794
 795                 off = vma->vm_pgoff;
 796                 off += (addr - vma->vm_start) >> shift;
 797                 return offset_il_node(pol, vma, off);
 798         } else
 799                 return interleave_nodes(pol);
 800 }
 801
 802 /* Return a zonelist suitable for a huge page allocation. */
 803 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
 804 {
 805         struct mempolicy *pol = get_vma_policy(current, vma, addr);
 806
 807         if (pol->policy == MPOL_INTERLEAVE) {
 808                 unsigned nid;
 809
 810                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
 811                 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
 812         }
 813         return zonelist_policy(GFP_HIGHUSER, pol);
 814 }
 815
 816 /* Allocate a page in interleaved policy.
 817    Own path because it needs to do special accounting. */
 818 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
 819                                         unsigned nid)
 820 {
 821         struct zonelist *zl;
 822         struct page *page;
 823
 824         zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
 825         page = __alloc_pages(gfp, order, zl);
 826         if (page && page_zone(page) == zl->zones[0]) {
 827                 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
 828                 put_cpu();
 829         }
 830         return page;
 831 }
 832
 833 /**
 834  *      alloc_page_vma  - Allocate a page for a VMA.
 835  *
 836  *      @gfp:
 837  *      %GFP_USER    user allocation.
 838  *      %GFP_KERNEL  kernel allocations,
 839  *      %GFP_HIGHMEM highmem/user allocations,
 840  *      %GFP_FS      allocation should not call back into a file system.
 841  *      %GFP_ATOMIC  don't sleep.
 842  *
 843  *      @vma:  Pointer to VMA or NULL if not available.
 844  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
 845  *
 846  *      This function allocates a page from the kernel page pool and applies
 847  *      a NUMA policy associated with the VMA or the current process.
 848  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
 849  *      mm_struct of the VMA to prevent it from going away. Should be used for
 850  *      all allocations for pages that will be mapped into
 851  *      user space. Returns NULL when no page can be allocated.
 852  *
 853  *      Should be called with the mm_sem of the vma hold.
 854  */
 855 struct page *
 856 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 857 {
 858         struct mempolicy *pol = get_vma_policy(current, vma, addr);
 859
 860         cpuset_update_current_mems_allowed();
 861
 862         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
 863                 unsigned nid;
 864
 865                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
 866                 return alloc_page_interleave(gfp, 0, nid);
 867         }
 868         return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
 869 }
 870
 871 /**
 872  *      alloc_pages_current - Allocate pages.
 873  *
 874  *      @gfp:
 875  *              %GFP_USER   user allocation,
 876  *              %GFP_KERNEL kernel allocation,
 877  *              %GFP_HIGHMEM highmem allocation,
 878  *              %GFP_FS     don't call back into a file system.
 879  *              %GFP_ATOMIC don't sleep.
 880  *      @order: Power of two of allocation size in pages. 0 is a single page.
 881  *
 882  *      Allocate a page from the kernel page pool.  When not in
 883  *      interrupt context and apply the current process NUMA policy.
 884  *      Returns NULL when no page can be allocated.
 885  *
 886  *      Don't call cpuset_update_current_mems_allowed() unless
 887  *      1) it's ok to take cpuset_sem (can WAIT), and
 888  *      2) allocating for current task (not interrupt).
 889  */
 890 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 891 {
 892         struct mempolicy *pol = current->mempolicy;
 893
 894         if ((gfp & __GFP_WAIT) && !in_interrupt())
 895                 cpuset_update_current_mems_allowed();
 896         if (!pol || in_interrupt())
 897                 pol = &default_policy;
 898         if (pol->policy == MPOL_INTERLEAVE)
 899                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
 900         return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
 901 }
 902 EXPORT_SYMBOL(alloc_pages_current);
 903
 904 /* Slow path of a mempolicy copy */
 905 struct mempolicy *__mpol_copy(struct mempolicy *old)
 906 {
 907         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 908
 909         if (!new)
 910                 return ERR_PTR(-ENOMEM);
 911         *new = *old;
 912         atomic_set(&new->refcnt, 1);
 913         if (new->policy == MPOL_BIND) {
 914                 int sz = ksize(old->v.zonelist);
 915                 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
 916                 if (!new->v.zonelist) {
 917                         kmem_cache_free(policy_cache, new);
 918                         return ERR_PTR(-ENOMEM);
 919                 }
 920                 memcpy(new->v.zonelist, old->v.zonelist, sz);
 921         }
 922         return new;
 923 }
 924
 925 /* Slow path of a mempolicy comparison */
 926 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 927 {
 928         if (!a || !b)
 929                 return 0;
 930         if (a->policy != b->policy)
 931                 return 0;
 932         switch (a->policy) {
 933         case MPOL_DEFAULT:
 934                 return 1;
 935         case MPOL_INTERLEAVE:
 936                 return nodes_equal(a->v.nodes, b->v.nodes);
 937         case MPOL_PREFERRED:
 938                 return a->v.preferred_node == b->v.preferred_node;
 939         case MPOL_BIND: {
 940                 int i;
 941                 for (i = 0; a->v.zonelist->zones[i]; i++)
 942                         if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
 943                                 return 0;
 944                 return b->v.zonelist->zones[i] == NULL;
 945         }
 946         default:
 947                 BUG();
 948                 return 0;
 949         }
 950 }
 951
 952 /* Slow path of a mpol destructor. */
 953 void __mpol_free(struct mempolicy *p)
 954 {
 955         if (!atomic_dec_and_test(&p->refcnt))
 956                 return;
 957         if (p->policy == MPOL_BIND)
 958                 kfree(p->v.zonelist);
 959         p->policy = MPOL_DEFAULT;
 960         kmem_cache_free(policy_cache, p);
 961 }
 962
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has the shared memory mapped.
 * The policies are kept in a red-black tree linked from the inode.
 * The tree is protected by the sp->lock spinlock, which must be held
 * for any access to it.
 */
 971
 972 /* lookup first element intersecting start-end */
 973 /* Caller holds sp->lock */
 974 static struct sp_node *
 975 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
 976 {
 977         struct rb_node *n = sp->root.rb_node;
 978
 979         while (n) {
 980                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
 981
 982                 if (start >= p->end)
 983                         n = n->rb_right;
 984                 else if (end <= p->start)
 985                         n = n->rb_left;
 986                 else
 987                         break;
 988         }
 989         if (!n)
 990                 return NULL;
 991         for (;;) {
 992                 struct sp_node *w = NULL;
 993                 struct rb_node *prev = rb_prev(n);
 994                 if (!prev)
 995                         break;
 996                 w = rb_entry(prev, struct sp_node, nd);
 997                 if (w->end <= start)
 998                         break;
 999                 n = prev;
1000         }
1001         return rb_entry(n, struct sp_node, nd);
1002 }
1003
1004 /* Insert a new shared policy into the list. */
1005 /* Caller holds sp->lock */
1006 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1007 {
1008         struct rb_node **p = &sp->root.rb_node;
1009         struct rb_node *parent = NULL;
1010         struct sp_node *nd;
1011
1012         while (*p) {
1013                 parent = *p;
1014                 nd = rb_entry(parent, struct sp_node, nd);
1015                 if (new->start < nd->start)
1016                         p = &(*p)->rb_left;
1017                 else if (new->end > nd->end)
1018                         p = &(*p)->rb_right;
1019                 else
1020                         BUG();
1021         }
1022         rb_link_node(&new->nd, parent, p);
1023         rb_insert_color(&new->nd, &sp->root);
1024         PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1025                  new->policy ? new->policy->policy : 0);
1026 }
1027
1028 /* Find shared policy intersecting idx */
1029 struct mempolicy *
1030 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1031 {
1032         struct mempolicy *pol = NULL;
1033         struct sp_node *sn;
1034
1035         if (!sp->root.rb_node)
1036                 return NULL;
1037         spin_lock(&sp->lock);
1038         sn = sp_lookup(sp, idx, idx+1);
1039         if (sn) {
1040                 mpol_get(sn->policy);
1041                 pol = sn->policy;
1042         }
1043         spin_unlock(&sp->lock);
1044         return pol;
1045 }
1046
1047 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1048 {
1049         PDprintk("deleting %lx-l%x\n", n->start, n->end);
1050         rb_erase(&n->nd, &sp->root);
1051         mpol_free(n->policy);
1052         kmem_cache_free(sn_cache, n);
1053 }
1054
1055 struct sp_node *
1056 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1057 {
1058         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1059
1060         if (!n)
1061                 return NULL;
1062         n->start = start;
1063         n->end = end;
1064         mpol_get(pol);
1065         n->policy = pol;
1066         return n;
1067 }
1068
/*
 * Replace the policy for the range [start, end).
 *
 * Existing nodes overlapping the range are deleted, trimmed or split so
 * that nothing covers [start, end) any more; then @new (if not NULL) is
 * inserted.  Takes and releases sp->lock itself.
 *
 * Returns 0 on success, or -ENOMEM if the extra node needed to split an
 * existing range could not be allocated.  On -ENOMEM @new has not been
 * inserted and remains owned by the caller.
 */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
                                 unsigned long end, struct sp_node *new)
{
        struct sp_node *n, *new2 = NULL;

restart:
        spin_lock(&sp->lock);
        n = sp_lookup(sp, start, end);
        /* Take care of old policies in the same range. */
        while (n && n->start < end) {
                /* Fetch the successor first: n may be freed by sp_delete(). */
                struct rb_node *next = rb_next(&n->nd);
                if (n->start >= start) {
                        /* Old node begins inside the new range. */
                        if (n->end <= end)
                                /* Fully covered: drop it. */
                                sp_delete(sp, n);
                        else
                                /* Sticks out at the end: trim its front. */
                                n->start = end;
                } else {
                        /* Old policy spanning whole new range. */
                        if (n->end > end) {
                                if (!new2) {
                                        /*
                                         * A second node is needed for the
                                         * tail [end, n->end).  sp_alloc()
                                         * allocates with GFP_KERNEL, so the
                                         * spinlock is dropped first and the
                                         * whole scan is redone afterwards.
                                         *
                                         * NOTE(review): n->end and n->policy
                                         * are read after sp->lock has been
                                         * released, and after the restart
                                         * new2 is inserted without checking
                                         * that it still matches n -- confirm
                                         * that nothing can change the tree
                                         * in between.
                                         */
                                        spin_unlock(&sp->lock);
                                        new2 = sp_alloc(end, n->end, n->policy);
                                        if (!new2)
                                                return -ENOMEM;
                                        goto restart;
                                }
                                /* Split: n keeps the head, new2 the tail. */
                                n->end = start;
                                sp_insert(sp, new2);
                                new2 = NULL;
                                break;
                        } else
                                /* Old node overlaps only the start: trim its tail. */
                                n->end = start;
                }
                if (!next)
                        break;
                n = rb_entry(next, struct sp_node, nd);
        }
        if (new)
                sp_insert(sp, new);
        spin_unlock(&sp->lock);
        /* new2 was allocated on an earlier pass but not used after the restart. */
        if (new2) {
                mpol_free(new2->policy);
                kmem_cache_free(sn_cache, new2);
        }
        return 0;
}
1116
1117 int mpol_set_shared_policy(struct shared_policy *info,
1118                         struct vm_area_struct *vma, struct mempolicy *npol)
1119 {
1120         int err;
1121         struct sp_node *new = NULL;
1122         unsigned long sz = vma_pages(vma);
1123
1124         PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1125                  vma->vm_pgoff,
1126                  sz, npol? npol->policy : -1,
1127                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1128
1129         if (npol) {
1130                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1131                 if (!new)
1132                         return -ENOMEM;
1133         }
1134         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1135         if (err && new)
1136                 kmem_cache_free(sn_cache, new);
1137         return err;
1138 }
1139
1140 /* Free a backing policy store on inode delete. */
1141 void mpol_free_shared_policy(struct shared_policy *p)
1142 {
1143         struct sp_node *n;
1144         struct rb_node *next;
1145
1146         if (!p->root.rb_node)
1147                 return;
1148         spin_lock(&p->lock);
1149         next = rb_first(&p->root);
1150         while (next) {
1151                 n = rb_entry(next, struct sp_node, nd);
1152                 next = rb_next(&n->nd);
1153                 rb_erase(&n->nd, &p->root);
1154                 mpol_free(n->policy);
1155                 kmem_cache_free(sn_cache, n);
1156         }
1157         spin_unlock(&p->lock);
1158 }
1159
1160 /* assumes fs == KERNEL_DS */
1161 void __init numa_policy_init(void)
1162 {
1163         policy_cache = kmem_cache_create("numa_policy",
1164                                          sizeof(struct mempolicy),
1165                                          0, SLAB_PANIC, NULL, NULL);
1166
1167         sn_cache = kmem_cache_create("shared_policy_node",
1168                                      sizeof(struct sp_node),
1169                                      0, SLAB_PANIC, NULL, NULL);
1170
1171         /* Set interleaving policy for system init. This way not all
1172            the data structures allocated at system boot end up in node zero. */
1173
1174         if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1175                 printk("numa_policy_init: interleaving failed\n");
1176 }
1177
1178 /* Reset policy of current process to default */
1179 void numa_default_policy(void)
1180 {
1181         do_set_mempolicy(MPOL_DEFAULT, NULL);
1182 }
1183
1184 /* Migrate a policy to a different set of nodes */
1185 static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1186                                                         const nodemask_t *new)
1187 {
1188         nodemask_t tmp;
1189
1190         if (!pol)
1191                 return;
1192
1193         switch (pol->policy) {
1194         case MPOL_DEFAULT:
1195                 break;
1196         case MPOL_INTERLEAVE:
1197                 nodes_remap(tmp, pol->v.nodes, *old, *new);
1198                 pol->v.nodes = tmp;
1199                 current->il_next = node_remap(current->il_next, *old, *new);
1200                 break;
1201         case MPOL_PREFERRED:
1202                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1203                                                                 *old, *new);
1204                 break;
1205         case MPOL_BIND: {
1206                 nodemask_t nodes;
1207                 struct zone **z;
1208                 struct zonelist *zonelist;
1209
1210                 nodes_clear(nodes);
1211                 for (z = pol->v.zonelist->zones; *z; z++)
1212                         node_set((*z)->zone_pgdat->node_id, nodes);
1213                 nodes_remap(tmp, nodes, *old, *new);
1214                 nodes = tmp;
1215
1216                 zonelist = bind_zonelist(&nodes);
1217
1218                 /* If no mem, then zonelist is NULL and we keep old zonelist.
1219                  * If that old zonelist has no remaining mems_allowed nodes,
1220                  * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1221                  */
1222
1223                 if (zonelist) {
1224                         /* Good - got mem - substitute new zonelist */
1225                         kfree(pol->v.zonelist);
1226                         pol->v.zonelist = zonelist;
1227                 }
1228                 break;
1229         }
1230         default:
1231                 BUG();
1232                 break;
1233         }
1234 }
1235
1236 /*
1237  * Someone moved this task to different nodes.  Fixup mempolicies.
1238  *
1239  * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
1240  * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
1241  */
1242 void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
1243 {
1244         rebind_policy(current->mempolicy, old, new);
1245 }