/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave	Allocate memory interleaved over a set of nodes,
 *		with normal fallback if it fails.
 *		For VMA based allocations this interleaves based on the
 *		offset into the backing object or offset into the mapping
 *		for anonymous memory. For process policy a process counter
 *		is used.
 *
 * bind		Only allocate memory on a specific set of nodes,
 *		no fallback.
 *		FIXME: memory is allocated starting with the first node
 *		to the last. It would be better if bind would truly restrict
 *		the allocation to memory nodes instead.
 *
 * preferred	Try a specific node first before normal fallback.
 *		As a special case node -1 here means do the allocation
 *		on the local CPU. This is normally identical to default,
 *		but useful to set in a VMA when you have a non default
 *		process policy.
 *
 * default	Allocate on the local node first, or when on a VMA
 *		use the process policy. This is what Linux always did
 *		in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
   could replace all the switch()es with a mempolicy_ops structure.
*/
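
/*
 * Illustrative userspace sketch, not part of this file: the policies above
 * are normally requested through the syscall interface declared in
 * <numaif.h> (libnuma). The mask value and sizes below are made-up example
 * values, not taken from this code.
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);	   (nodes 0 and 1)
 *
 *	Interleave all future allocations of the calling process:
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8);
 *
 *	Bind one mapping, reporting EIO if existing pages already violate it:
 *	void *p = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	mbind(p, 1 << 20, MPOL_BIND, &mask, sizeof(mask) * 8, MPOL_MF_STRICT);
 */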
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;

#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
static int policy_zone;

struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.policy = MPOL_DEFAULT,
};
/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, nodemask_t *nodes)
{
	int empty = nodes_empty(*nodes);

	switch (mode) {
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
		/* Preferred will only use the first bit, but allow
		   more for now. */
		if (empty)
			return -EINVAL;
	}
	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}
/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
	struct zonelist *zl;
	int num, max, nd, k;

	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
	if (!zl)
		return NULL;
	num = 0;
	for_each_node_mask(nd, *nodes) {
		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
			struct zone *z = &NODE_DATA(nd)->node_zones[k];
			if (!z->present_pages)
				continue;
			zl->zones[num++] = z;
		}
	}
	zl->zones[num] = NULL;
	return zl;
}
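
/*
 * Worked example (illustrative; the zone count is an assumption): with
 * MAX_NR_ZONES == 3 and nodes = {0,2}, max is 1 + 3*2 = 7 pointer slots.
 * The loops then append node 0's populated zones from highest to lowest,
 * followed by node 2's, and NULL-terminate the list, e.g.:
 *
 *	zl->zones = { node0/HIGHMEM, node0/NORMAL, node0/DMA,
 *		      node2/NORMAL, node2/DMA, NULL }
 */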
/* Create a new policy */
static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
{
	struct mempolicy *policy;

	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
	if (mode == MPOL_DEFAULT)
		return NULL;
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	switch (mode) {
	case MPOL_INTERLEAVE:
		policy->v.nodes = *nodes;
		break;
	case MPOL_PREFERRED:
		policy->v.preferred_node = first_node(*nodes);
		if (policy->v.preferred_node >= MAX_NUMNODES)
			policy->v.preferred_node = -1;
		break;
	case MPOL_BIND:
		policy->v.zonelist = bind_zonelist(nodes);
		if (policy->v.zonelist == NULL) {
			kmem_cache_free(policy_cache, policy);
			return ERR_PTR(-ENOMEM);
		}
		break;
	}
	policy->policy = mode;
	return policy;
}
/* Ensure all existing pages follow the policy. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
{
	pte_t *orig_pte;
	pte_t *pte;
	spinlock_t *ptl;

	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		unsigned long pfn;
		unsigned int nid;

		if (!pte_present(*pte))
			continue;
		pfn = pte_pfn(*pte);
		if (!pfn_valid(pfn)) {
			print_bad_pte(vma, *pte, addr);
			continue;
		}
		nid = pfn_to_nid(pfn);
		if (!node_isset(nid, *nodes))
			break;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(orig_pte, ptl);
	return addr != end;
}
static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		if (check_pte_range(vma, pmd, addr, next, nodes))
			return -EIO;
	} while (pmd++, addr = next, addr != end);
	return 0;
}
static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		if (check_pmd_range(vma, pud, addr, next, nodes))
			return -EIO;
	} while (pud++, addr = next, addr != end);
	return 0;
}
static inline int check_pgd_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
{
	pgd_t *pgd;
	unsigned long next;

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		if (check_pud_range(vma, pgd, addr, next, nodes))
			return -EIO;
	} while (pgd++, addr = next, addr != end);
	return 0;
}
/* Step 1: check the range */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
	    nodemask_t *nodes, unsigned long flags)
{
	int err;
	struct vm_area_struct *first, *vma, *prev;

	first = find_vma(mm, start);
	if (!first)
		return ERR_PTR(-EFAULT);
	if (first->vm_flags & VM_RESERVED)
		return ERR_PTR(-EACCES);
	prev = NULL;
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		if (!vma->vm_next && vma->vm_end < end)
			return ERR_PTR(-EFAULT);
		if (prev && prev->vm_end < vma->vm_start)
			return ERR_PTR(-EFAULT);
		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
			unsigned long endvma = vma->vm_end;
			if (endvma > end)
				endvma = end;
			if (vma->vm_start > start)
				start = vma->vm_start;
			err = check_pgd_range(vma, start, endvma, nodes);
			if (err) {
				first = ERR_PTR(err);
				break;
			}
		}
		prev = vma;
	}
	return first;
}
/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
	int err = 0;
	struct mempolicy *old = vma->vm_policy;

	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	if (vma->vm_ops && vma->vm_ops->set_policy)
		err = vma->vm_ops->set_policy(vma, new);
	if (!err) {
		mpol_get(new);
		vma->vm_policy = new;
		mpol_free(old);
	}
	return err;
}
/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, struct mempolicy *new)
{
	struct vm_area_struct *next;
	int err;

	err = 0;
	for (; vma && vma->vm_start < end; vma = next) {
		next = vma->vm_next;
		if (vma->vm_start < start)
			err = split_vma(vma->vm_mm, vma, start, 1);
		if (!err && vma->vm_end > end)
			err = split_vma(vma->vm_mm, vma, end, 0);
		if (!err)
			err = policy_vma(vma, new);
		if (err)
			break;
	}
	return err;
}
static int contextualize_policy(int mode, nodemask_t *nodes)
{
	if (!nodes)
		return 0;

	/* Update current mems_allowed */
	cpuset_update_current_mems_allowed();
	/* Ignore nodes not set in current->mems_allowed */
	cpuset_restrict_to_mems_allowed(nodes->bits);
	return mpol_check_policy(mode, nodes);
}
long do_mbind(unsigned long start, unsigned long len,
		unsigned long mode, nodemask_t *nmask, unsigned long flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int err;

	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
		return -EINVAL;
	if (start & ~PAGE_MASK)
		return -EINVAL;
	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;
	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;
	if (mpol_check_policy(mode, nmask))
		return -EINVAL;
	new = mpol_new(mode, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
			mode, nodes_addr(*nmask)[0]);

	down_write(&mm->mmap_sem);
	vma = check_range(mm, start, end, nmask, flags);
	err = PTR_ERR(vma);
	if (!IS_ERR(vma))
		err = mbind_range(vma, start, end, new);
	up_write(&mm->mmap_sem);
	mpol_free(new);
	return err;
}
/* Set the process memory policy */
long do_set_mempolicy(int mode, nodemask_t *nodes)
{
	struct mempolicy *new;

	if (contextualize_policy(mode, nodes))
		return -EINVAL;
	new = mpol_new(mode, nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);
	mpol_free(current->mempolicy);
	current->mempolicy = new;
	if (new && new->policy == MPOL_INTERLEAVE)
		current->il_next = first_node(new->v.nodes);
	return 0;
}
/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
{
	int i;

	nodes_clear(*nodes);
	switch (p->policy) {
	case MPOL_BIND:
		for (i = 0; p->v.zonelist->zones[i]; i++)
			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
				*nodes);
		break;
	case MPOL_INTERLEAVE:
		*nodes = p->v.nodes;
		break;
	case MPOL_PREFERRED:
		/* or use current node instead of online map? */
		if (p->v.preferred_node < 0)
			*nodes = node_online_map;
		else
			node_set(p->v.preferred_node, *nodes);
		break;
	}
}
static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}
/* Retrieve NUMA policy */
long do_get_mempolicy(int *policy, nodemask_t *nmask,
			unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
		return -EINVAL;
	if (flags & MPOL_F_ADDR) {
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	}

	if (!pol)
		pol = &default_policy;

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->policy == MPOL_INTERLEAVE) {
			*policy = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else
		*policy = pol->policy;

	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	err = 0;
	if (nmask)
		get_zonemask(pol, nmask);

 out:
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}
/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
		     unsigned long maxnode)
{
	unsigned long k;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/* When the user specified more nodes than supported just check
	   if the non supported part is all zero. */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		if (nlongs > PAGE_SIZE/sizeof(long))
			return -EINVAL;
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			unsigned long t;
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes_addr(*nodes)[nlongs-1] &= endmask;
	return 0;
}
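
/*
 * Worked example (illustrative): after the --maxnode adjustment, a caller
 * passing maxnode = 65 on a 64-bit kernel describes 64 valid bits, so
 * nlongs = 1 and endmask = ~0UL; maxnode = 11 gives 10 valid bits, so
 * nlongs = 1 and endmask = (1UL << 10) - 1 = 0x3ff, and the final
 * "&= endmask" clears bits the caller never meant to pass.
 */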
/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
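
/*
 * Worked example (illustrative, assuming MAX_NUMNODES = 64): the kernel
 * mask is nbytes = 8 bytes. A caller asking for maxnode = 1024 gets
 * copy = ALIGN(1023, 64) / 8 = 128 bytes, so the 120 bytes beyond the
 * kernel mask are cleared with clear_user() before the 8 real bytes are
 * copied out.
 */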
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
			unsigned long mode,
			unsigned long __user *nmask, unsigned long maxnode,
			unsigned flags)
{
	nodemask_t nodes;
	int err;

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_mbind(start, len, mode, &nodes, flags);
}
/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
		unsigned long maxnode)
{
	int err;
	nodemask_t nodes;

	if (mode < 0 || mode > MPOL_MAX)
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_set_mempolicy(mode, &nodes);
}
/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
				unsigned long __user *nmask,
				unsigned long maxnode,
				unsigned long addr, unsigned long flags)
{
	int err, pval;
	nodemask_t nodes;

	if (nmask != NULL && maxnode < MAX_NUMNODES)
		return -EINVAL;

	err = do_get_mempolicy(&pval, &nodes, addr, flags);
	if (err)
		return err;

	if (policy && put_user(pval, policy))
		return -EFAULT;

	if (nmask)
		err = copy_nodes_to_user(nmask, maxnode, &nodes);

	return err;
}
asmlinkage long compat_sys_get_mempolicy(int __user *policy,
				     compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode,
				     compat_ulong_t addr, compat_ulong_t flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		err = copy_from_user(bm, nm, alloc_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}
asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_set_mempolicy(mode, nm, nr_bits+1);
}
asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
			     compat_ulong_t mode, compat_ulong_t __user *nmask,
			     compat_ulong_t maxnode, compat_ulong_t flags)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	nodemask_t bm;

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}
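
/*
 * Worked example (illustrative, assuming MAX_NUMNODES >= 32): a 32-bit task
 * passing maxnode = 33 yields nr_bits = 32 and alloc_size = ALIGN(32, 64)/8
 * = 8 on a 64-bit kernel, so one native word is staged with
 * compat_alloc_user_space(), filled from the 32-bit bitmap by
 * compat_get_bitmap(), and handed to sys_mbind() with maxnode = nr_bits + 1.
 */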
/* Return effective policy for a VMA */
static struct mempolicy *
get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = task->mempolicy;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else if (vma->vm_policy &&
				vma->vm_policy->policy != MPOL_DEFAULT)
			pol = vma->vm_policy;
	}
	if (!pol)
		pol = &default_policy;
	return pol;
}
/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
{
	int nd;

	switch (policy->policy) {
	case MPOL_PREFERRED:
		nd = policy->v.preferred_node;
		if (nd < 0)
			nd = numa_node_id();
		break;
	case MPOL_BIND:
		/* Lower zones don't get a policy applied */
		/* Careful: current->mems_allowed might have moved */
		if (gfp_zone(gfp) >= policy_zone)
			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
				return policy->v.zonelist;
		/*FALL THROUGH*/
	case MPOL_INTERLEAVE: /* should not happen */
	default:
		nd = numa_node_id();
		break;
	}
	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
}
/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
	unsigned nid, next;
	struct task_struct *me = current;

	nid = me->il_next;
	next = next_node(nid, policy->v.nodes);
	if (next >= MAX_NUMNODES)
		next = first_node(policy->v.nodes);
	me->il_next = next;
	return nid;
}
/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
		struct vm_area_struct *vma, unsigned long off)
{
	unsigned nnodes = nodes_weight(pol->v.nodes);
	unsigned target = (unsigned)off % nnodes;
	int c;
	int nid = -1;

	c = 0;
	do {
		nid = next_node(nid, pol->v.nodes);
		c++;
	} while (c <= target);
	return nid;
}
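
/*
 * Worked example (illustrative): with pol->v.nodes = {1,3,5} (nnodes = 3)
 * and a VMA page at logical offset off = 7, target = 7 % 3 = 1, so the
 * do/while walks the nodemask once past node 1 and returns node 3, the
 * second node in the mask. Consecutive offsets therefore map to nodes
 * 1, 3, 5, 1, 3, 5, ... regardless of which CPU takes the fault.
 */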
/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
					unsigned nid)
{
	struct zonelist *zl;
	struct page *page;

	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
	page = __alloc_pages(gfp, order, zl);
	if (page && page_zone(page) == zl->zones[0]) {
		zone_pcp(zl->zones[0], get_cpu())->interleave_hit++;
		put_cpu();
	}
	return page;
}
/**
 * alloc_page_vma - Allocate a page for a VMA.
 *
 * @gfp:
 *	%GFP_USER    user allocation.
 *	%GFP_KERNEL  kernel allocations,
 *	%GFP_HIGHMEM highmem/user allocations,
 *	%GFP_FS      allocation should not call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 *
 * @vma:  Pointer to VMA or NULL if not available.
 * @addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
 * When VMA is not NULL the caller must hold down_read on the mmap_sem of the
 * mm_struct of the VMA to prevent it from going away. Should be used for
 * all allocations for pages that will be mapped into
 * user space. Returns NULL when no page can be allocated.
 */
struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	cpuset_update_current_mems_allowed();

	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
		unsigned nid;
		if (vma) {
			unsigned long off = vma->vm_pgoff;
			off += (addr - vma->vm_start) >> PAGE_SHIFT;
			nid = offset_il_node(pol, vma, off);
		} else {
			/* fall back to process interleaving */
			nid = interleave_nodes(pol);
		}
		return alloc_page_interleave(gfp, 0, nid);
	}
	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}
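
/*
 * Illustrative caller sketch (an assumption, not code from this file): a
 * fault handler already holding mmap_sem for read could allocate the
 * backing page for a user address like this:
 *
 *	struct page *page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 *	if (!page)
 *		return VM_FAULT_OOM;
 */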
/**
 * alloc_pages_current - Allocate pages.
 *
 * @gfp:
 *	%GFP_USER    user allocation,
 *	%GFP_KERNEL  kernel allocation,
 *	%GFP_HIGHMEM highmem allocation,
 *	%GFP_FS      don't call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 * @order: Power of two of allocation size in pages. 0 is a single page.
 *
 * Allocate a page from the kernel page pool. When not in interrupt
 * context, apply the current process NUMA policy.
 * Returns NULL when no page can be allocated.
 *
 * Don't call cpuset_update_current_mems_allowed() unless
 * 1) it's ok to take cpuset_sem (can WAIT), and
 * 2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol = current->mempolicy;

	if ((gfp & __GFP_WAIT) && !in_interrupt())
		cpuset_update_current_mems_allowed();
	if (!pol || in_interrupt())
		pol = &default_policy;
	if (pol->policy == MPOL_INTERLEAVE)
		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);
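
/*
 * Illustrative note: on NUMA kernels the generic alloc_pages(gfp, order)
 * helper resolves to alloc_pages_current(), so e.g.
 *
 *	struct page *p = alloc_pages(GFP_KERNEL, 2);	(4 contiguous pages)
 *
 * picks its zonelist through the calling task's mempolicy, unless called
 * from interrupt context, where default_policy is used instead.
 */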
/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);
	*new = *old;
	atomic_set(&new->refcnt, 1);
	if (new->policy == MPOL_BIND) {
		int sz = ksize(old->v.zonelist);
		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
		if (!new->v.zonelist) {
			kmem_cache_free(policy_cache, new);
			return ERR_PTR(-ENOMEM);
		}
		memcpy(new->v.zonelist, old->v.zonelist, sz);
	}
	return new;
}
/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	int i;

	if (!a || !b)
		return 0;
	if (a->policy != b->policy)
		return 0;
	switch (a->policy) {
	case MPOL_DEFAULT:
		return 1;
	case MPOL_INTERLEAVE:
		return nodes_equal(a->v.nodes, b->v.nodes);
	case MPOL_PREFERRED:
		return a->v.preferred_node == b->v.preferred_node;
	case MPOL_BIND:
		for (i = 0; a->v.zonelist->zones[i]; i++)
			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
				return 0;
		return b->v.zonelist->zones[i] == NULL;
	}
	return 0;
}
/* Slow path of a mpol destructor. */
void __mpol_free(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	if (p->policy == MPOL_BIND)
		kfree(p->v.zonelist);
	p->policy = MPOL_DEFAULT;
	kmem_cache_free(policy_cache, p);
}
/*
 * Hugetlb policy. Same as above, just works with node numbers instead of
 * zonelists.
 */

/* Find first node suitable for an allocation */
int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	switch (pol->policy) {
	case MPOL_DEFAULT:
		return numa_node_id();
	case MPOL_BIND:
		return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
	case MPOL_INTERLEAVE:
		return interleave_nodes(pol);
	case MPOL_PREFERRED:
		return pol->v.preferred_node >= 0 ?
				pol->v.preferred_node : numa_node_id();
	}
	BUG();
	return 0;
}
/* Find secondary valid nodes for an allocation */
int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);
	struct zone **z;

	switch (pol->policy) {
	case MPOL_PREFERRED:
	case MPOL_DEFAULT:
	case MPOL_INTERLEAVE:
		return 1;
	case MPOL_BIND:
		for (z = pol->v.zonelist->zones; *z; z++)
			if ((*z)->zone_pgdat->node_id == nid)
				return 1;
		return 0;
	default:
		BUG();
		return 0;
	}
}
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */
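
/*
 * Worked example (illustrative): after an application mbind()s two ranges
 * of a tmpfs file, the tree indexed by page offset might hold
 *
 *	[0, 16)  -> MPOL_BIND       {0}
 *	[16, 32) -> MPOL_INTERLEAVE {0,1,2,3}
 *
 * sp_lookup(sp, 20, 21) returns the second node; ranges never overlap, so
 * shared_policy_replace() splits or trims old entries as needed.
 */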
/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
	struct rb_node *n = sp->root.rb_node;

	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);

		if (start >= p->end)
			n = n->rb_right;
		else if (end <= p->start)
			n = n->rb_left;
		else
			break;
	}
	if (!n)
		return NULL;
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}
/* Insert a new shared policy into the list. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;
	struct sp_node *nd;

	while (*p) {
		parent = *p;
		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
			p = &(*p)->rb_left;
		else if (new->end > nd->end)
			p = &(*p)->rb_right;
		else
			BUG();
	}
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
		 new->policy ? new->policy->policy : 0);
}
/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
	struct mempolicy *pol = NULL;
	struct sp_node *sn;

	if (!sp->root.rb_node)
		return NULL;
	spin_lock(&sp->lock);
	sn = sp_lookup(sp, idx, idx+1);
	if (sn) {
		mpol_get(sn->policy);
		pol = sn->policy;
	}
	spin_unlock(&sp->lock);
	return pol;
}
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	PDprintk("deleting %lx-%lx\n", n->start, n->end);
	rb_erase(&n->nd, &sp->root);
	mpol_free(n->policy);
	kmem_cache_free(sn_cache, n);
}
static struct sp_node *
sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
{
	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

	if (!n)
		return NULL;
	n->start = start;
	n->end = end;
	mpol_get(pol);
	n->policy = pol;
	return n;
}
/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				 unsigned long end, struct sp_node *new)
{
	struct sp_node *n, *new2 = NULL;

restart:
	spin_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				if (!new2) {
					spin_unlock(&sp->lock);
					new2 = sp_alloc(end, n->end, n->policy);
					if (!new2)
						return -ENOMEM;
					goto restart;
				}
				n->end = start;
				sp_insert(sp, new2);
				new2 = NULL;
				break;
			} else
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	spin_unlock(&sp->lock);
	if (new2) {
		mpol_free(new2->policy);
		kmem_cache_free(sn_cache, new2);
	}
	return 0;
}
int mpol_set_shared_policy(struct shared_policy *info,
			struct vm_area_struct *vma, struct mempolicy *npol)
{
	int err;
	struct sp_node *new = NULL;
	unsigned long sz = vma_pages(vma);

	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
		 vma->vm_pgoff,
		 sz, npol ? npol->policy : -1,
		 npol ? nodes_addr(npol->v.nodes)[0] : -1);

	if (npol) {
		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
		if (!new)
			return -ENOMEM;
	}
	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
	if (err && new)
		kmem_cache_free(sn_cache, new);
	return err;
}
/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
	struct sp_node *n;
	struct rb_node *next;

	if (!p->root.rb_node)
		return;
	spin_lock(&p->lock);
	next = rb_first(&p->root);
	while (next) {
		n = rb_entry(next, struct sp_node, nd);
		next = rb_next(&n->nd);
		rb_erase(&n->nd, &p->root);
		mpol_free(n->policy);
		kmem_cache_free(sn_cache, n);
	}
	spin_unlock(&p->lock);
}
/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL, NULL);

	/* Set interleaving policy for system init. This way not all
	   the data structures allocated at system boot end up in node zero. */
	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
		printk("numa_policy_init: interleaving failed\n");
}
/* Reset policy of current process to default */
void numa_default_policy(void)
{
	do_set_mempolicy(MPOL_DEFAULT, NULL);
}