release/src-rt-6.x.4708/linux/linux-2.6.36/mm/mempolicy.c

   1
   2
   3 /* Notebook:
   4    fix mmap readahead to honour policy and enable policy for any page cache
   5    object
   6    statistics for bigpages
   7    global policy for page cache? currently it uses process policy. Requires
   8    first item above.
   9    handle mremap for shared memory (currently ignored for the policy)
  10    grows down?
  11    make bind policy root only? It can trigger oom much faster and the
  12    kernel is not always grateful with that.
  13 */
  14
  15 #include <linux/mempolicy.h>
  16 #include <linux/mm.h>
  17 #include <linux/highmem.h>
  18 #include <linux/hugetlb.h>
  19 #include <linux/kernel.h>
  20 #include <linux/sched.h>
  21 #include <linux/nodemask.h>
  22 #include <linux/cpuset.h>
  23 #include <linux/slab.h>
  24 #include <linux/string.h>
  25 #include <linux/module.h>
  26 #include <linux/nsproxy.h>
  27 #include <linux/interrupt.h>
  28 #include <linux/init.h>
  29 #include <linux/compat.h>
  30 #include <linux/swap.h>
  31 #include <linux/seq_file.h>
  32 #include <linux/proc_fs.h>
  33 #include <linux/migrate.h>
  34 #include <linux/ksm.h>
  35 #include <linux/rmap.h>
  36 #include <linux/security.h>
  37 #include <linux/syscalls.h>
  38 #include <linux/ctype.h>
  39 #include <linux/mm_inline.h>
  40
  41 #include <asm/tlbflush.h>
  42 #include <asm/uaccess.h>
  43
  44 #include "internal.h"
  45
  46 /* Internal flags */
  47 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
  48 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
  49 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
  50
  51 static struct kmem_cache *policy_cache;
  52 static struct kmem_cache *sn_cache;
  53
  54 /* Highest zone. An specific allocation for a zone below that is not
  55    policied. */
  56 enum zone_type policy_zone = 0;
  57
  58 /*
  59  * run-time system-wide default policy => local allocation
  60  */
  61 struct mempolicy default_policy = {
  62         .refcnt = ATOMIC_INIT(1), /* never free it */
  63         .mode = MPOL_PREFERRED,
  64         .flags = MPOL_F_LOCAL,
  65 };
  66
  67 static const struct mempolicy_operations {
  68         int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
  69         /*
  70          * If read-side task has no lock to protect task->mempolicy, write-side
  71          * task will rebind the task->mempolicy by two step. The first step is
  72          * setting all the newly nodes, and the second step is cleaning all the
  73          * disallowed nodes. In this way, we can avoid finding no node to alloc
  74          * page.
  75          * If we have a lock to protect task->mempolicy in read-side, we do
  76          * rebind directly.
  77          *
  78          * step:
  79          *      MPOL_REBIND_ONCE - do rebind work at once
  80          *      MPOL_REBIND_STEP1 - set all the newly nodes
  81          *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
  82          */
  83         void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
  84                         enum mpol_rebind_step step);
  85 } mpol_ops[MPOL_MAX];
  86
  87 /* Check that the nodemask contains at least one populated zone */
  88 static int is_valid_nodemask(const nodemask_t *nodemask)
  89 {
  90         int nd, k;
  91
  92         for_each_node_mask(nd, *nodemask) {
  93                 struct zone *z;
  94
  95                 for (k = 0; k <= policy_zone; k++) {
  96                         z = &NODE_DATA(nd)->node_zones[k];
  97                         if (z->present_pages > 0)
  98                                 return 1;
  99                 }
 100         }
 101
 102         return 0;
 103 }
 104
 105 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 106 {
 107         return pol->flags & MPOL_MODE_FLAGS;
 108 }
 109
 110 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 111                                    const nodemask_t *rel)
 112 {
 113         nodemask_t tmp;
 114         nodes_fold(tmp, *orig, nodes_weight(*rel));
 115         nodes_onto(*ret, tmp, *rel);
 116 }
 117
 118 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 119 {
 120         if (nodes_empty(*nodes))
 121                 return -EINVAL;
 122         pol->v.nodes = *nodes;
 123         return 0;
 124 }
 125
 126 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 127 {
 128         if (!nodes)
 129                 pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 130         else if (nodes_empty(*nodes))
 131                 return -EINVAL;                 /*  no allowed nodes */
 132         else
 133                 pol->v.preferred_node = first_node(*nodes);
 134         return 0;
 135 }
 136
 137 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 138 {
 139         if (!is_valid_nodemask(nodes))
 140                 return -EINVAL;
 141         pol->v.nodes = *nodes;
 142         return 0;
 143 }
 144
 145 /*
 146  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 147  * any, for the new policy.  mpol_new() has already validated the nodes
 148  * parameter with respect to the policy mode and flags.  But, we need to
 149  * handle an empty nodemask with MPOL_PREFERRED here.
 150  *
 151  * Must be called holding task's alloc_lock to protect task's mems_allowed
 152  * and mempolicy.  May also be called holding the mmap_semaphore for write.
 153  */
 154 static int mpol_set_nodemask(struct mempolicy *pol,
 155                      const nodemask_t *nodes, struct nodemask_scratch *nsc)
 156 {
 157         int ret;
 158
 159         /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 160         if (pol == NULL)
 161                 return 0;
 162         /* Check N_HIGH_MEMORY */
 163         nodes_and(nsc->mask1,
 164                   cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
 165
 166         VM_BUG_ON(!nodes);
 167         if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 168                 nodes = NULL;   /* explicit local allocation */
 169         else {
 170                 if (pol->flags & MPOL_F_RELATIVE_NODES)
 171                         mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
 172                 else
 173                         nodes_and(nsc->mask2, *nodes, nsc->mask1);
 174
 175                 if (mpol_store_user_nodemask(pol))
 176                         pol->w.user_nodemask = *nodes;
 177                 else
 178                         pol->w.cpuset_mems_allowed =
 179                                                 cpuset_current_mems_allowed;
 180         }
 181
 182         if (nodes)
 183                 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 184         else
 185                 ret = mpol_ops[pol->mode].create(pol, NULL);
 186         return ret;
 187 }
 188
 189 /*
 190  * This function just creates a new policy, does some check and simple
 191  * initialization. You must invoke mpol_set_nodemask() to set nodes.
 192  */
 193 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 194                                   nodemask_t *nodes)
 195 {
 196         struct mempolicy *policy;
 197
 198         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 199                  mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
 200
 201         if (mode == MPOL_DEFAULT) {
 202                 if (nodes && !nodes_empty(*nodes))
 203                         return ERR_PTR(-EINVAL);
 204                 return NULL;    /* simply delete any existing policy */
 205         }
 206         VM_BUG_ON(!nodes);
 207
 208         /*
 209          * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 210          * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 211          * All other modes require a valid pointer to a non-empty nodemask.
 212          */
 213         if (mode == MPOL_PREFERRED) {
 214                 if (nodes_empty(*nodes)) {
 215                         if (((flags & MPOL_F_STATIC_NODES) ||
 216                              (flags & MPOL_F_RELATIVE_NODES)))
 217                                 return ERR_PTR(-EINVAL);
 218                 }
 219         } else if (nodes_empty(*nodes))
 220                 return ERR_PTR(-EINVAL);
 221         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 222         if (!policy)
 223                 return ERR_PTR(-ENOMEM);
 224         atomic_set(&policy->refcnt, 1);
 225         policy->mode = mode;
 226         policy->flags = flags;
 227
 228         return policy;
 229 }
 230
 231 /* Slow path of a mpol destructor. */
 232 void __mpol_put(struct mempolicy *p)
 233 {
 234         if (!atomic_dec_and_test(&p->refcnt))
 235                 return;
 236         kmem_cache_free(policy_cache, p);
 237 }
 238
 239 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
 240                                 enum mpol_rebind_step step)
 241 {
 242 }
 243
 244 /*
 245  * step:
 246  *      MPOL_REBIND_ONCE  - do rebind work at once
 247  *      MPOL_REBIND_STEP1 - set all the newly nodes
 248  *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 249  */
 250 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
 251                                  enum mpol_rebind_step step)
 252 {
 253         nodemask_t tmp;
 254
 255         if (pol->flags & MPOL_F_STATIC_NODES)
 256                 nodes_and(tmp, pol->w.user_nodemask, *nodes);
 257         else if (pol->flags & MPOL_F_RELATIVE_NODES)
 258                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 259         else {
 260                 /*
 261                  * if step == 1, we use ->w.cpuset_mems_allowed to cache the
 262                  * result
 263                  */
 264                 if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
 265                         nodes_remap(tmp, pol->v.nodes,
 266                                         pol->w.cpuset_mems_allowed, *nodes);
 267                         pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
 268                 } else if (step == MPOL_REBIND_STEP2) {
 269                         tmp = pol->w.cpuset_mems_allowed;
 270                         pol->w.cpuset_mems_allowed = *nodes;
 271                 } else
 272                         BUG();
 273         }
 274
 275         if (nodes_empty(tmp))
 276                 tmp = *nodes;
 277
 278         if (step == MPOL_REBIND_STEP1)
 279                 nodes_or(pol->v.nodes, pol->v.nodes, tmp);
 280         else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
 281                 pol->v.nodes = tmp;
 282         else
 283                 BUG();
 284
 285         if (!node_isset(current->il_next, tmp)) {
 286                 current->il_next = next_node(current->il_next, tmp);
 287                 if (current->il_next >= MAX_NUMNODES)
 288                         current->il_next = first_node(tmp);
 289                 if (current->il_next >= MAX_NUMNODES)
 290                         current->il_next = numa_node_id();
 291         }
 292 }
 293
 294 static void mpol_rebind_preferred(struct mempolicy *pol,
 295                                   const nodemask_t *nodes,
 296                                   enum mpol_rebind_step step)
 297 {
 298         nodemask_t tmp;
 299
 300         if (pol->flags & MPOL_F_STATIC_NODES) {
 301                 int node = first_node(pol->w.user_nodemask);
 302
 303                 if (node_isset(node, *nodes)) {
 304                         pol->v.preferred_node = node;
 305                         pol->flags &= ~MPOL_F_LOCAL;
 306                 } else
 307                         pol->flags |= MPOL_F_LOCAL;
 308         } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 309                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 310                 pol->v.preferred_node = first_node(tmp);
 311         } else if (!(pol->flags & MPOL_F_LOCAL)) {
 312                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
 313                                                    pol->w.cpuset_mems_allowed,
 314                                                    *nodes);
 315                 pol->w.cpuset_mems_allowed = *nodes;
 316         }
 317 }
 318
 319 /*
 320  * mpol_rebind_policy - Migrate a policy to a different set of nodes
 321  *
 322  * If read-side task has no lock to protect task->mempolicy, write-side
 323  * task will rebind the task->mempolicy by two step. The first step is
 324  * setting all the newly nodes, and the second step is cleaning all the
 325  * disallowed nodes. In this way, we can avoid finding no node to alloc
 326  * page.
 327  * If we have a lock to protect task->mempolicy in read-side, we do
 328  * rebind directly.
 329  *
 330  * step:
 331  *      MPOL_REBIND_ONCE  - do rebind work at once
 332  *      MPOL_REBIND_STEP1 - set all the newly nodes
 333  *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 334  */
 335 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
 336                                 enum mpol_rebind_step step)
 337 {
 338         if (!pol)
 339                 return;
 340         if (!mpol_store_user_nodemask(pol) && step == 0 &&
 341             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 342                 return;
 343
 344         if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
 345                 return;
 346
 347         if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
 348                 BUG();
 349
 350         if (step == MPOL_REBIND_STEP1)
 351                 pol->flags |= MPOL_F_REBINDING;
 352         else if (step == MPOL_REBIND_STEP2)
 353                 pol->flags &= ~MPOL_F_REBINDING;
 354         else if (step >= MPOL_REBIND_NSTEP)
 355                 BUG();
 356
 357         mpol_ops[pol->mode].rebind(pol, newmask, step);
 358 }
 359
 360 /*
 361  * Wrapper for mpol_rebind_policy() that just requires task
 362  * pointer, and updates task mempolicy.
 363  *
 364  * Called with task's alloc_lock held.
 365  */
 366
 367 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
 368                         enum mpol_rebind_step step)
 369 {
 370         mpol_rebind_policy(tsk->mempolicy, new, step);
 371 }
 372
 373 /*
 374  * Rebind each vma in mm to new nodemask.
 375  *
 376  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 377  */
 378
 379 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 380 {
 381         struct vm_area_struct *vma;
 382
 383         down_write(&mm->mmap_sem);
 384         for (vma = mm->mmap; vma; vma = vma->vm_next)
 385                 mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
 386         up_write(&mm->mmap_sem);
 387 }
 388
 389 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 390         [MPOL_DEFAULT] = {
 391                 .rebind = mpol_rebind_default,
 392         },
 393         [MPOL_INTERLEAVE] = {
 394                 .create = mpol_new_interleave,
 395                 .rebind = mpol_rebind_nodemask,
 396         },
 397         [MPOL_PREFERRED] = {
 398                 .create = mpol_new_preferred,
 399                 .rebind = mpol_rebind_preferred,
 400         },
 401         [MPOL_BIND] = {
 402                 .create = mpol_new_bind,
 403                 .rebind = mpol_rebind_nodemask,
 404         },
 405 };
 406
 407 static void gather_stats(struct page *, void *, int pte_dirty);
 408 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 409                                 unsigned long flags);
 410
 411 /* Scan through pages checking if pages follow certain conditions. */
 412 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 413                 unsigned long addr, unsigned long end,
 414                 const nodemask_t *nodes, unsigned long flags,
 415                 void *private)
 416 {
 417         pte_t *orig_pte;
 418         pte_t *pte;
 419         spinlock_t *ptl;
 420
 421         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 422         do {
 423                 struct page *page;
 424                 int nid;
 425
 426                 if (!pte_present(*pte))
 427                         continue;
 428                 page = vm_normal_page(vma, addr, *pte);
 429                 if (!page)
 430                         continue;
 431                 /*
 432                  * vm_normal_page() filters out zero pages, but there might
 433                  * still be PageReserved pages to skip, perhaps in a VDSO.
 434                  * And we cannot move PageKsm pages sensibly or safely yet.
 435                  */
 436                 if (PageReserved(page) || PageKsm(page))
 437                         continue;
 438                 nid = page_to_nid(page);
 439                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 440                         continue;
 441
 442                 if (flags & MPOL_MF_STATS)
 443                         gather_stats(page, private, pte_dirty(*pte));
 444                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 445                         migrate_page_add(page, private, flags);
 446                 else
 447                         break;
 448         } while (pte++, addr += PAGE_SIZE, addr != end);
 449         pte_unmap_unlock(orig_pte, ptl);
 450         return addr != end;
 451 }
 452
 453 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 454                 unsigned long addr, unsigned long end,
 455                 const nodemask_t *nodes, unsigned long flags,
 456                 void *private)
 457 {
 458         pmd_t *pmd;
 459         unsigned long next;
 460
 461         pmd = pmd_offset(pud, addr);
 462         do {
 463                 next = pmd_addr_end(addr, end);
 464                 if (pmd_none_or_clear_bad(pmd))
 465                         continue;
 466                 if (check_pte_range(vma, pmd, addr, next, nodes,
 467                                     flags, private))
 468                         return -EIO;
 469         } while (pmd++, addr = next, addr != end);
 470         return 0;
 471 }
 472
 473 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 474                 unsigned long addr, unsigned long end,
 475                 const nodemask_t *nodes, unsigned long flags,
 476                 void *private)
 477 {
 478         pud_t *pud;
 479         unsigned long next;
 480
 481         pud = pud_offset(pgd, addr);
 482         do {
 483                 next = pud_addr_end(addr, end);
 484                 if (pud_none_or_clear_bad(pud))
 485                         continue;
 486                 if (check_pmd_range(vma, pud, addr, next, nodes,
 487                                     flags, private))
 488                         return -EIO;
 489         } while (pud++, addr = next, addr != end);
 490         return 0;
 491 }
 492
 493 static inline int check_pgd_range(struct vm_area_struct *vma,
 494                 unsigned long addr, unsigned long end,
 495                 const nodemask_t *nodes, unsigned long flags,
 496                 void *private)
 497 {
 498         pgd_t *pgd;
 499         unsigned long next;
 500
 501         pgd = pgd_offset(vma->vm_mm, addr);
 502         do {
 503                 next = pgd_addr_end(addr, end);
 504                 if (pgd_none_or_clear_bad(pgd))
 505                         continue;
 506                 if (check_pud_range(vma, pgd, addr, next, nodes,
 507                                     flags, private))
 508                         return -EIO;
 509         } while (pgd++, addr = next, addr != end);
 510         return 0;
 511 }
 512
 513 /*
 514  * Check if all pages in a range are on a set of nodes.
 515  * If pagelist != NULL then isolate pages from the LRU and
 516  * put them on the pagelist.
 517  */
 518 static struct vm_area_struct *
 519 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 520                 const nodemask_t *nodes, unsigned long flags, void *private)
 521 {
 522         int err;
 523         struct vm_area_struct *first, *vma, *prev;
 524
 525
 526         first = find_vma(mm, start);
 527         if (!first)
 528                 return ERR_PTR(-EFAULT);
 529         prev = NULL;
 530         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 531                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 532                         if (!vma->vm_next && vma->vm_end < end)
 533                                 return ERR_PTR(-EFAULT);
 534                         if (prev && prev->vm_end < vma->vm_start)
 535                                 return ERR_PTR(-EFAULT);
 536                 }
 537                 if (!is_vm_hugetlb_page(vma) &&
 538                     ((flags & MPOL_MF_STRICT) ||
 539                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 540                                 vma_migratable(vma)))) {
 541                         unsigned long endvma = vma->vm_end;
 542
 543                         if (endvma > end)
 544                                 endvma = end;
 545                         if (vma->vm_start > start)
 546                                 start = vma->vm_start;
 547                         err = check_pgd_range(vma, start, endvma, nodes,
 548                                                 flags, private);
 549                         if (err) {
 550                                 first = ERR_PTR(err);
 551                                 break;
 552                         }
 553                 }
 554                 prev = vma;
 555         }
 556         return first;
 557 }
 558
 559 /* Apply policy to a single VMA */
 560 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 561 {
 562         int err = 0;
 563         struct mempolicy *old = vma->vm_policy;
 564
 565         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 566                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 567                  vma->vm_ops, vma->vm_file,
 568                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 569
 570         if (vma->vm_ops && vma->vm_ops->set_policy)
 571                 err = vma->vm_ops->set_policy(vma, new);
 572         if (!err) {
 573                 mpol_get(new);
 574                 vma->vm_policy = new;
 575                 mpol_put(old);
 576         }
 577         return err;
 578 }
 579
 580 /* Step 2: apply policy to a range and do splits. */
 581 static int mbind_range(struct mm_struct *mm, unsigned long start,
 582                        unsigned long end, struct mempolicy *new_pol)
 583 {
 584         struct vm_area_struct *next;
 585         struct vm_area_struct *prev;
 586         struct vm_area_struct *vma;
 587         int err = 0;
 588         pgoff_t pgoff;
 589         unsigned long vmstart;
 590         unsigned long vmend;
 591
 592         vma = find_vma_prev(mm, start, &prev);
 593         if (!vma || vma->vm_start > start)
 594                 return -EFAULT;
 595
 596         for (; vma && vma->vm_start < end; prev = vma, vma = next) {
 597                 next = vma->vm_next;
 598                 vmstart = max(start, vma->vm_start);
 599                 vmend   = min(end, vma->vm_end);
 600
 601                 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 602                 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 603                                   vma->anon_vma, vma->vm_file, pgoff, new_pol);
 604                 if (prev) {
 605                         vma = prev;
 606                         next = vma->vm_next;
 607                         continue;
 608                 }
 609                 if (vma->vm_start != vmstart) {
 610                         err = split_vma(vma->vm_mm, vma, vmstart, 1);
 611                         if (err)
 612                                 goto out;
 613                 }
 614                 if (vma->vm_end != vmend) {
 615                         err = split_vma(vma->vm_mm, vma, vmend, 0);
 616                         if (err)
 617                                 goto out;
 618                 }
 619                 err = policy_vma(vma, new_pol);
 620                 if (err)
 621                         goto out;
 622         }
 623
 624  out:
 625         return err;
 626 }
 627
 628 /*
 629  * Update task->flags PF_MEMPOLICY bit: set iff non-default
 630  * mempolicy.  Allows more rapid checking of this (combined perhaps
 631  * with other PF_* flag bits) on memory allocation hot code paths.
 632  *
 633  * If called from outside this file, the task 'p' should -only- be
 634  * a newly forked child not yet visible on the task list, because
 635  * manipulating the task flags of a visible task is not safe.
 636  *
 637  * The above limitation is why this routine has the funny name
 638  * mpol_fix_fork_child_flag().
 639  *
 640  * It is also safe to call this with a task pointer of current,
 641  * which the static wrapper mpol_set_task_struct_flag() does,
 642  * for use within this file.
 643  */
 644
 645 void mpol_fix_fork_child_flag(struct task_struct *p)
 646 {
 647         if (p->mempolicy)
 648                 p->flags |= PF_MEMPOLICY;
 649         else
 650                 p->flags &= ~PF_MEMPOLICY;
 651 }
 652
 653 static void mpol_set_task_struct_flag(void)
 654 {
 655         mpol_fix_fork_child_flag(current);
 656 }
 657
 658 /* Set the process memory policy */
 659 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 660                              nodemask_t *nodes)
 661 {
 662         struct mempolicy *new, *old;
 663         struct mm_struct *mm = current->mm;
 664         NODEMASK_SCRATCH(scratch);
 665         int ret;
 666
 667         if (!scratch)
 668                 return -ENOMEM;
 669
 670         new = mpol_new(mode, flags, nodes);
 671         if (IS_ERR(new)) {
 672                 ret = PTR_ERR(new);
 673                 goto out;
 674         }
 675         /*
 676          * prevent changing our mempolicy while show_numa_maps()
 677          * is using it.
 678          * Note:  do_set_mempolicy() can be called at init time
 679          * with no 'mm'.
 680          */
 681         if (mm)
 682                 down_write(&mm->mmap_sem);
 683         task_lock(current);
 684         ret = mpol_set_nodemask(new, nodes, scratch);
 685         if (ret) {
 686                 task_unlock(current);
 687                 if (mm)
 688                         up_write(&mm->mmap_sem);
 689                 mpol_put(new);
 690                 goto out;
 691         }
 692         old = current->mempolicy;
 693         current->mempolicy = new;
 694         mpol_set_task_struct_flag();
 695         if (new && new->mode == MPOL_INTERLEAVE &&
 696             nodes_weight(new->v.nodes))
 697                 current->il_next = first_node(new->v.nodes);
 698         task_unlock(current);
 699         if (mm)
 700                 up_write(&mm->mmap_sem);
 701
 702         mpol_put(old);
 703         ret = 0;
 704 out:
 705         NODEMASK_SCRATCH_FREE(scratch);
 706         return ret;
 707 }
 708
 709 /*
 710  * Return nodemask for policy for get_mempolicy() query
 711  *
 712  * Called with task's alloc_lock held
 713  */
 714 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 715 {
 716         nodes_clear(*nodes);
 717         if (p == &default_policy)
 718                 return;
 719
 720         switch (p->mode) {
 721         case MPOL_BIND:
 722                 /* Fall through */
 723         case MPOL_INTERLEAVE:
 724                 *nodes = p->v.nodes;
 725                 break;
 726         case MPOL_PREFERRED:
 727                 if (!(p->flags & MPOL_F_LOCAL))
 728                         node_set(p->v.preferred_node, *nodes);
 729                 /* else return empty node mask for local allocation */
 730                 break;
 731         default:
 732                 BUG();
 733         }
 734 }
 735
 736 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 737 {
 738         struct page *p;
 739         int err;
 740
 741         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 742         if (err >= 0) {
 743                 err = page_to_nid(p);
 744                 put_page(p);
 745         }
 746         return err;
 747 }
 748
 749 /* Retrieve NUMA policy */
 750 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 751                              unsigned long addr, unsigned long flags)
 752 {
 753         int err;
 754         struct mm_struct *mm = current->mm;
 755         struct vm_area_struct *vma = NULL;
 756         struct mempolicy *pol = current->mempolicy;
 757
 758         if (flags &
 759                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 760                 return -EINVAL;
 761
 762         if (flags & MPOL_F_MEMS_ALLOWED) {
 763                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 764                         return -EINVAL;
 765                 *policy = 0;    /* just so it's initialized */
 766                 task_lock(current);
 767                 *nmask  = cpuset_current_mems_allowed;
 768                 task_unlock(current);
 769                 return 0;
 770         }
 771
 772         if (flags & MPOL_F_ADDR) {
 773                 /*
 774                  * Do NOT fall back to task policy if the
 775                  * vma/shared policy at addr is NULL.  We
 776                  * want to return MPOL_DEFAULT in this case.
 777                  */
 778                 down_read(&mm->mmap_sem);
 779                 vma = find_vma_intersection(mm, addr, addr+1);
 780                 if (!vma) {
 781                         up_read(&mm->mmap_sem);
 782                         return -EFAULT;
 783                 }
 784                 if (vma->vm_ops && vma->vm_ops->get_policy)
 785                         pol = vma->vm_ops->get_policy(vma, addr);
 786                 else
 787                         pol = vma->vm_policy;
 788         } else if (addr)
 789                 return -EINVAL;
 790
 791         if (!pol)
 792                 pol = &default_policy;  /* indicates default behavior */
 793
 794         if (flags & MPOL_F_NODE) {
 795                 if (flags & MPOL_F_ADDR) {
 796                         err = lookup_node(mm, addr);
 797                         if (err < 0)
 798                                 goto out;
 799                         *policy = err;
 800                 } else if (pol == current->mempolicy &&
 801                                 pol->mode == MPOL_INTERLEAVE) {
 802                         *policy = current->il_next;
 803                 } else {
 804                         err = -EINVAL;
 805                         goto out;
 806                 }
 807         } else {
 808                 *policy = pol == &default_policy ? MPOL_DEFAULT :
 809                                                 pol->mode;
 810                 /*
 811                  * Internal mempolicy flags must be masked off before exposing
 812                  * the policy to userspace.
 813                  */
 814                 *policy |= (pol->flags & MPOL_MODE_FLAGS);
 815         }
 816
 817         if (vma) {
 818                 up_read(&current->mm->mmap_sem);
 819                 vma = NULL;
 820         }
 821
 822         err = 0;
 823         if (nmask) {
 824                 if (mpol_store_user_nodemask(pol)) {
 825                         *nmask = pol->w.user_nodemask;
 826                 } else {
 827                         task_lock(current);
 828                         get_policy_nodemask(pol, nmask);
 829                         task_unlock(current);
 830                 }
 831         }
 832
 833  out:
 834         mpol_cond_put(pol);
 835         if (vma)
 836                 up_read(&current->mm->mmap_sem);
 837         return err;
 838 }
 839
 840 #ifdef CONFIG_MIGRATION
 841 /*
 842  * page migration
 843  */
 844 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 845                                 unsigned long flags)
 846 {
 847         /*
 848          * Avoid migrating a page that is shared with others.
 849          */
 850         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 851                 if (!isolate_lru_page(page)) {
 852                         list_add_tail(&page->lru, pagelist);
 853                         inc_zone_page_state(page, NR_ISOLATED_ANON +
 854                                             page_is_file_cache(page));
 855                 }
 856         }
 857 }
 858
 859 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 860 {
 861         return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
 862 }
 863
 864 /*
 865  * Migrate pages from one node to a target node.
 866  * Returns error or the number of pages not migrated.
 867  */
 868 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 869                            int flags)
 870 {
 871         nodemask_t nmask;
 872         LIST_HEAD(pagelist);
 873         int err = 0;
 874
 875         nodes_clear(nmask);
 876         node_set(source, nmask);
 877
 878         check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
 879                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 880
 881         if (!list_empty(&pagelist))
 882                 err = migrate_pages(&pagelist, new_node_page, dest, 0);
 883
 884         return err;
 885 }
 886
 887 /*
 888  * Move pages between the two nodesets so as to preserve the physical
 889  * layout as much as possible.
 890  *
 891  * Returns the number of page that could not be moved.
 892  */
 893 int do_migrate_pages(struct mm_struct *mm,
 894         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 895 {
 896         int busy = 0;
 897         int err;
 898         nodemask_t tmp;
 899
 900         err = migrate_prep();
 901         if (err)
 902                 return err;
 903
 904         down_read(&mm->mmap_sem);
 905
 906         err = migrate_vmas(mm, from_nodes, to_nodes, flags);
 907         if (err)
 908                 goto out;
 909
 910         /*
 911          * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 912          * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 913          * bit in 'tmp', and return that <source, dest> pair for migration.
 914          * The pair of nodemasks 'to' and 'from' define the map.
 915          *
 916          * If no pair of bits is found that way, fallback to picking some
 917          * pair of 'source' and 'dest' bits that are not the same.  If the
 918          * 'source' and 'dest' bits are the same, this represents a node
 919          * that will be migrating to itself, so no pages need move.
 920          *
 921          * If no bits are left in 'tmp', or if all remaining bits left
 922          * in 'tmp' correspond to the same bit in 'to', return false
 923          * (nothing left to migrate).
 924          *
 925          * This lets us pick a pair of nodes to migrate between, such that
 926          * if possible the dest node is not already occupied by some other
 927          * source node, minimizing the risk of overloading the memory on a
 928          * node that would happen if we migrated incoming memory to a node
 929          * before migrating outgoing memory source that same node.
 930          *
 931          * A single scan of tmp is sufficient.  As we go, we remember the
 932          * most recent <s, d> pair that moved (s != d).  If we find a pair
 933          * that not only moved, but what's better, moved to an empty slot
 934          * (d is not set in tmp), then we break out then, with that pair.
 935          * Otherwise when we finish scannng from_tmp, we at least have the
 936          * most recent <s, d> pair that moved.  If we get all the way through
 937          * the scan of tmp without finding any node that moved, much less
 938          * moved to an empty node, then there is nothing left worth migrating.
 939          */
 940
 941         tmp = *from_nodes;
 942         while (!nodes_empty(tmp)) {
 943                 int s,d;
 944                 int source = -1;
 945                 int dest = 0;
 946
 947                 for_each_node_mask(s, tmp) {
 948                         d = node_remap(s, *from_nodes, *to_nodes);
 949                         if (s == d)
 950                                 continue;
 951
 952                         source = s;     /* Node moved. Memorize */
 953                         dest = d;
 954
 955                         /* dest not in remaining from nodes? */
 956                         if (!node_isset(dest, tmp))
 957                                 break;
 958                 }
 959                 if (source == -1)
 960                         break;
 961
 962                 node_clear(source, tmp);
 963                 err = migrate_to_node(mm, source, dest, flags);
 964                 if (err > 0)
 965                         busy += err;
 966                 if (err < 0)
 967                         break;
 968         }
 969 out:
 970         up_read(&mm->mmap_sem);
 971         if (err < 0)
 972                 return err;
 973         return busy;
 974
 975 }
 976
 977 /*
 978  * Allocate a new page for page migration based on vma policy.
 979  * Start assuming that page is mapped by vma pointed to by @private.
 980  * Search forward from there, if not.  N.B., this assumes that the
 981  * list of pages handed to migrate_pages()--which is how we get here--
 982  * is in virtual address order.
 983  */
 984 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 985 {
 986         struct vm_area_struct *vma = (struct vm_area_struct *)private;
 987         unsigned long uninitialized_var(address);
 988
 989         while (vma) {
 990                 address = page_address_in_vma(page, vma);
 991                 if (address != -EFAULT)
 992                         break;
 993                 vma = vma->vm_next;
 994         }
 995
 996         /*
 997          * if !vma, alloc_page_vma() will use task or system default policy
 998          */
 999         return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1000 }
1001 #else
1002
1003 static void migrate_page_add(struct page *page, struct list_head *pagelist,
1004                                 unsigned long flags)
1005 {
1006 }
1007
1008 int do_migrate_pages(struct mm_struct *mm,
1009         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
1010 {
1011         return -ENOSYS;
1012 }
1013
1014 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1015 {
1016         return NULL;
1017 }
1018 #endif
1019
1020 static long do_mbind(unsigned long start, unsigned long len,
1021                      unsigned short mode, unsigned short mode_flags,
1022                      nodemask_t *nmask, unsigned long flags)
1023 {
1024         struct vm_area_struct *vma;
1025         struct mm_struct *mm = current->mm;
1026         struct mempolicy *new;
1027         unsigned long end;
1028         int err;
1029         LIST_HEAD(pagelist);
1030
1031         if (flags & ~(unsigned long)(MPOL_MF_STRICT |
1032                                      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1033                 return -EINVAL;
1034         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1035                 return -EPERM;
1036
1037         if (start & ~PAGE_MASK)
1038                 return -EINVAL;
1039
1040         if (mode == MPOL_DEFAULT)
1041                 flags &= ~MPOL_MF_STRICT;
1042
1043         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1044         end = start + len;
1045
1046         if (end < start)
1047                 return -EINVAL;
1048         if (end == start)
1049                 return 0;
1050
1051         new = mpol_new(mode, mode_flags, nmask);
1052         if (IS_ERR(new))
1053                 return PTR_ERR(new);
1054
1055         /*
1056          * If we are using the default policy then operation
1057          * on discontinuous address spaces is okay after all
1058          */
1059         if (!new)
1060                 flags |= MPOL_MF_DISCONTIG_OK;
1061
1062         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1063                  start, start + len, mode, mode_flags,
1064                  nmask ? nodes_addr(*nmask)[0] : -1);
1065
1066         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1067
1068                 err = migrate_prep();
1069                 if (err)
1070                         goto mpol_out;
1071         }
1072         {
1073                 NODEMASK_SCRATCH(scratch);
1074                 if (scratch) {
1075                         down_write(&mm->mmap_sem);
1076                         task_lock(current);
1077                         err = mpol_set_nodemask(new, nmask, scratch);
1078                         task_unlock(current);
1079                         if (err)
1080                                 up_write(&mm->mmap_sem);
1081                 } else
1082                         err = -ENOMEM;
1083                 NODEMASK_SCRATCH_FREE(scratch);
1084         }
1085         if (err)
1086                 goto mpol_out;
1087
1088         vma = check_range(mm, start, end, nmask,
1089                           flags | MPOL_MF_INVERT, &pagelist);
1090
1091         err = PTR_ERR(vma);
1092         if (!IS_ERR(vma)) {
1093                 int nr_failed = 0;
1094
1095                 err = mbind_range(mm, start, end, new);
1096
1097                 if (!list_empty(&pagelist))
1098                         nr_failed = migrate_pages(&pagelist, new_vma_page,
1099                                                 (unsigned long)vma, 0);
1100
1101                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1102                         err = -EIO;
1103         } else
1104                 putback_lru_pages(&pagelist);
1105
1106         up_write(&mm->mmap_sem);
1107  mpol_out:
1108         mpol_put(new);
1109         return err;
1110 }
1111
1112 /*
1113  * User space interface with variable sized bitmaps for nodelists.
1114  */
1115
1116 /* Copy a node mask from user space. */
1117 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1118                      unsigned long maxnode)
1119 {
1120         unsigned long k;
1121         unsigned long nlongs;
1122         unsigned long endmask;
1123
1124         --maxnode;
1125         nodes_clear(*nodes);
1126         if (maxnode == 0 || !nmask)
1127                 return 0;
1128         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1129                 return -EINVAL;
1130
1131         nlongs = BITS_TO_LONGS(maxnode);
1132         if ((maxnode % BITS_PER_LONG) == 0)
1133                 endmask = ~0UL;
1134         else
1135                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1136
1137         /* When the user specified more nodes than supported just check
1138            if the non supported part is all zero. */
1139         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1140                 if (nlongs > PAGE_SIZE/sizeof(long))
1141                         return -EINVAL;
1142                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1143                         unsigned long t;
1144                         if (get_user(t, nmask + k))
1145                                 return -EFAULT;
1146                         if (k == nlongs - 1) {
1147                                 if (t & endmask)
1148                                         return -EINVAL;
1149                         } else if (t)
1150                                 return -EINVAL;
1151                 }
1152                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1153                 endmask = ~0UL;
1154         }
1155
1156         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1157                 return -EFAULT;
1158         nodes_addr(*nodes)[nlongs-1] &= endmask;
1159         return 0;
1160 }
1161
1162 /* Copy a kernel node mask to user space */
1163 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1164                               nodemask_t *nodes)
1165 {
1166         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1167         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1168
1169         if (copy > nbytes) {
1170                 if (copy > PAGE_SIZE)
1171                         return -EINVAL;
1172                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1173                         return -EFAULT;
1174                 copy = nbytes;
1175         }
1176         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1177 }
1178
1179 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1180                 unsigned long, mode, unsigned long __user *, nmask,
1181                 unsigned long, maxnode, unsigned, flags)
1182 {
1183         nodemask_t nodes;
1184         int err;
1185         unsigned short mode_flags;
1186
1187         mode_flags = mode & MPOL_MODE_FLAGS;
1188         mode &= ~MPOL_MODE_FLAGS;
1189         if (mode >= MPOL_MAX)
1190                 return -EINVAL;
1191         if ((mode_flags & MPOL_F_STATIC_NODES) &&
1192             (mode_flags & MPOL_F_RELATIVE_NODES))
1193                 return -EINVAL;
1194         err = get_nodes(&nodes, nmask, maxnode);
1195         if (err)
1196                 return err;
1197         return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1198 }
1199
1200 /* Set the process memory policy */
1201 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1202                 unsigned long, maxnode)
1203 {
1204         int err;
1205         nodemask_t nodes;
1206         unsigned short flags;
1207
1208         flags = mode & MPOL_MODE_FLAGS;
1209         mode &= ~MPOL_MODE_FLAGS;
1210         if ((unsigned int)mode >= MPOL_MAX)
1211                 return -EINVAL;
1212         if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1213                 return -EINVAL;
1214         err = get_nodes(&nodes, nmask, maxnode);
1215         if (err)
1216                 return err;
1217         return do_set_mempolicy(mode, flags, &nodes);
1218 }
1219
1220 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1221                 const unsigned long __user *, old_nodes,
1222                 const unsigned long __user *, new_nodes)
1223 {
1224         const struct cred *cred = current_cred(), *tcred;
1225         struct mm_struct *mm = NULL;
1226         struct task_struct *task;
1227         nodemask_t task_nodes;
1228         int err;
1229         nodemask_t *old;
1230         nodemask_t *new;
1231         NODEMASK_SCRATCH(scratch);
1232
1233         if (!scratch)
1234                 return -ENOMEM;
1235
1236         old = &scratch->mask1;
1237         new = &scratch->mask2;
1238
1239         err = get_nodes(old, old_nodes, maxnode);
1240         if (err)
1241                 goto out;
1242
1243         err = get_nodes(new, new_nodes, maxnode);
1244         if (err)
1245                 goto out;
1246
1247         /* Find the mm_struct */
1248         read_lock(&tasklist_lock);
1249         task = pid ? find_task_by_vpid(pid) : current;
1250         if (!task) {
1251                 read_unlock(&tasklist_lock);
1252                 err = -ESRCH;
1253                 goto out;
1254         }
1255         mm = get_task_mm(task);
1256         read_unlock(&tasklist_lock);
1257
1258         err = -EINVAL;
1259         if (!mm)
1260                 goto out;
1261
1262         /*
1263          * Check if this process has the right to modify the specified
1264          * process. The right exists if the process has administrative
1265          * capabilities, superuser privileges or the same
1266          * userid as the target process.
1267          */
1268         rcu_read_lock();
1269         tcred = __task_cred(task);
1270         if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1271             cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
1272             !capable(CAP_SYS_NICE)) {
1273                 rcu_read_unlock();
1274                 err = -EPERM;
1275                 goto out;
1276         }
1277         rcu_read_unlock();
1278
1279         task_nodes = cpuset_mems_allowed(task);
1280         /* Is the user allowed to access the target nodes? */
1281         if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1282                 err = -EPERM;
1283                 goto out;
1284         }
1285
1286         if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
1287                 err = -EINVAL;
1288                 goto out;
1289         }
1290
1291         err = security_task_movememory(task);
1292         if (err)
1293                 goto out;
1294
1295         err = do_migrate_pages(mm, old, new,
1296                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1297 out:
1298         if (mm)
1299                 mmput(mm);
1300         NODEMASK_SCRATCH_FREE(scratch);
1301
1302         return err;
1303 }
1304
1305
1306 /* Retrieve NUMA policy */
1307 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1308                 unsigned long __user *, nmask, unsigned long, maxnode,
1309                 unsigned long, addr, unsigned long, flags)
1310 {
1311         int err;
1312         int uninitialized_var(pval);
1313         nodemask_t nodes;
1314
1315         if (nmask != NULL && maxnode < MAX_NUMNODES)
1316                 return -EINVAL;
1317
1318         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1319
1320         if (err)
1321                 return err;
1322
1323         if (policy && put_user(pval, policy))
1324                 return -EFAULT;
1325
1326         if (nmask)
1327                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1328
1329         return err;
1330 }
1331
1332 #ifdef CONFIG_COMPAT
1333
1334 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1335                                      compat_ulong_t __user *nmask,
1336                                      compat_ulong_t maxnode,
1337                                      compat_ulong_t addr, compat_ulong_t flags)
1338 {
1339         long err;
1340         unsigned long __user *nm = NULL;
1341         unsigned long nr_bits, alloc_size;
1342         DECLARE_BITMAP(bm, MAX_NUMNODES);
1343
1344         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1345         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1346
1347         if (nmask)
1348                 nm = compat_alloc_user_space(alloc_size);
1349
1350         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1351
1352         if (!err && nmask) {
1353                 err = copy_from_user(bm, nm, alloc_size);
1354                 /* ensure entire bitmap is zeroed */
1355                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1356                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1357         }
1358
1359         return err;
1360 }
1361
1362 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1363                                      compat_ulong_t maxnode)
1364 {
1365         long err = 0;
1366         unsigned long __user *nm = NULL;
1367         unsigned long nr_bits, alloc_size;
1368         DECLARE_BITMAP(bm, MAX_NUMNODES);
1369
1370         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1371         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1372
1373         if (nmask) {
1374                 err = compat_get_bitmap(bm, nmask, nr_bits);
1375                 nm = compat_alloc_user_space(alloc_size);
1376                 err |= copy_to_user(nm, bm, alloc_size);
1377         }
1378
1379         if (err)
1380                 return -EFAULT;
1381
1382         return sys_set_mempolicy(mode, nm, nr_bits+1);
1383 }
1384
1385 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1386                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1387                              compat_ulong_t maxnode, compat_ulong_t flags)
1388 {
1389         long err = 0;
1390         unsigned long __user *nm = NULL;
1391         unsigned long nr_bits, alloc_size;
1392         nodemask_t bm;
1393
1394         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1395         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1396
1397         if (nmask) {
1398                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1399                 nm = compat_alloc_user_space(alloc_size);
1400                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1401         }
1402
1403         if (err)
1404                 return -EFAULT;
1405
1406         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1407 }
1408
1409 #endif
1410
1411 /*
1412  * get_vma_policy(@task, @vma, @addr)
1413  * @task - task for fallback if vma policy == default
1414  * @vma   - virtual memory area whose policy is sought
1415  * @addr  - address in @vma for shared policy lookup
1416  *
1417  * Returns effective policy for a VMA at specified address.
1418  * Falls back to @task or system default policy, as necessary.
1419  * Current or other task's task mempolicy and non-shared vma policies
1420  * are protected by the task's mmap_sem, which must be held for read by
1421  * the caller.
1422  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1423  * count--added by the get_policy() vm_op, as appropriate--to protect against
1424  * freeing by another task.  It is the caller's responsibility to free the
1425  * extra reference for shared policies.
1426  */
1427 static struct mempolicy *get_vma_policy(struct task_struct *task,
1428                 struct vm_area_struct *vma, unsigned long addr)
1429 {
1430         struct mempolicy *pol = task->mempolicy;
1431
1432         if (vma) {
1433                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1434                         struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1435                                                                         addr);
1436                         if (vpol)
1437                                 pol = vpol;
1438                 } else if (vma->vm_policy)
1439                         pol = vma->vm_policy;
1440         }
1441         if (!pol)
1442                 pol = &default_policy;
1443         return pol;
1444 }
1445
1446 /*
1447  * Return a nodemask representing a mempolicy for filtering nodes for
1448  * page allocation
1449  */
1450 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1451 {
1452         /* Lower zones don't get a nodemask applied for MPOL_BIND */
1453         if (unlikely(policy->mode == MPOL_BIND) &&
1454                         gfp_zone(gfp) >= policy_zone &&
1455                         cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1456                 return &policy->v.nodes;
1457
1458         return NULL;
1459 }
1460
1461 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1462 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
1463 {
1464         int nd = numa_node_id();
1465
1466         switch (policy->mode) {
1467         case MPOL_PREFERRED:
1468                 if (!(policy->flags & MPOL_F_LOCAL))
1469                         nd = policy->v.preferred_node;
1470                 break;
1471         case MPOL_BIND:
1472                 /*
1473                  * Normally, MPOL_BIND allocations are node-local within the
1474                  * allowed nodemask.  However, if __GFP_THISNODE is set and the
1475                  * current node isn't part of the mask, we use the zonelist for
1476                  * the first node in the mask instead.
1477                  */
1478                 if (unlikely(gfp & __GFP_THISNODE) &&
1479                                 unlikely(!node_isset(nd, policy->v.nodes)))
1480                         nd = first_node(policy->v.nodes);
1481                 break;
1482         default:
1483                 BUG();
1484         }
1485         return node_zonelist(nd, gfp);
1486 }
1487
1488 /* Do dynamic interleaving for a process */
1489 static unsigned interleave_nodes(struct mempolicy *policy)
1490 {
1491         unsigned nid, next;
1492         struct task_struct *me = current;
1493
1494         nid = me->il_next;
1495         next = next_node(nid, policy->v.nodes);
1496         if (next >= MAX_NUMNODES)
1497                 next = first_node(policy->v.nodes);
1498         if (next < MAX_NUMNODES)
1499                 me->il_next = next;
1500         return nid;
1501 }
1502
1503 /*
1504  * Depending on the memory policy provide a node from which to allocate the
1505  * next slab entry.
1506  * @policy must be protected by freeing by the caller.  If @policy is
1507  * the current task's mempolicy, this protection is implicit, as only the
1508  * task can change it's policy.  The system default policy requires no
1509  * such protection.
1510  */
1511 unsigned slab_node(struct mempolicy *policy)
1512 {
1513         if (!policy || policy->flags & MPOL_F_LOCAL)
1514                 return numa_node_id();
1515
1516         switch (policy->mode) {
1517         case MPOL_PREFERRED:
1518                 /*
1519                  * handled MPOL_F_LOCAL above
1520                  */
1521                 return policy->v.preferred_node;
1522
1523         case MPOL_INTERLEAVE:
1524                 return interleave_nodes(policy);
1525
1526         case MPOL_BIND: {
1527                 /*
1528                  * Follow bind policy behavior and start allocation at the
1529                  * first node.
1530                  */
1531                 struct zonelist *zonelist;
1532                 struct zone *zone;
1533                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1534                 zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1535                 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1536                                                         &policy->v.nodes,
1537                                                         &zone);
1538                 return zone ? zone->node : numa_node_id();
1539         }
1540
1541         default:
1542                 BUG();
1543         }
1544 }
1545
1546 /* Do static interleaving for a VMA with known offset. */
1547 static unsigned offset_il_node(struct mempolicy *pol,
1548                 struct vm_area_struct *vma, unsigned long off)
1549 {
1550         unsigned nnodes = nodes_weight(pol->v.nodes);
1551         unsigned target;
1552         int c;
1553         int nid = -1;
1554
1555         if (!nnodes)
1556                 return numa_node_id();
1557         target = (unsigned int)off % nnodes;
1558         c = 0;
1559         do {
1560                 nid = next_node(nid, pol->v.nodes);
1561                 c++;
1562         } while (c <= target);
1563         return nid;
1564 }
1565
1566 /* Determine a node number for interleave */
1567 static inline unsigned interleave_nid(struct mempolicy *pol,
1568                  struct vm_area_struct *vma, unsigned long addr, int shift)
1569 {
1570         if (vma) {
1571                 unsigned long off;
1572
1573                 /*
1574                  * for small pages, there is no difference between
1575                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1576                  * for huge pages, since vm_pgoff is in units of small
1577                  * pages, we need to shift off the always 0 bits to get
1578                  * a useful offset.
1579                  */
1580                 BUG_ON(shift < PAGE_SHIFT);
1581                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1582                 off += (addr - vma->vm_start) >> shift;
1583                 return offset_il_node(pol, vma, off);
1584         } else
1585                 return interleave_nodes(pol);
1586 }
1587
1588 #ifdef CONFIG_HUGETLBFS
1589 /*
1590  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1591  * @vma = virtual memory area whose policy is sought
1592  * @addr = address in @vma for shared policy lookup and interleave policy
1593  * @gfp_flags = for requested zone
1594  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1595  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1596  *
1597  * Returns a zonelist suitable for a huge page allocation and a pointer
1598  * to the struct mempolicy for conditional unref after allocation.
1599  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1600  * @nodemask for filtering the zonelist.
1601  *
1602  * Must be protected by get_mems_allowed()
1603  */
1604 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1605                                 gfp_t gfp_flags, struct mempolicy **mpol,
1606                                 nodemask_t **nodemask)
1607 {
1608         struct zonelist *zl;
1609
1610         *mpol = get_vma_policy(current, vma, addr);
1611         *nodemask = NULL;       /* assume !MPOL_BIND */
1612
1613         if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1614                 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1615                                 huge_page_shift(hstate_vma(vma))), gfp_flags);
1616         } else {
1617                 zl = policy_zonelist(gfp_flags, *mpol);
1618                 if ((*mpol)->mode == MPOL_BIND)
1619                         *nodemask = &(*mpol)->v.nodes;
1620         }
1621         return zl;
1622 }
1623
1624 /*
1625  * init_nodemask_of_mempolicy
1626  *
1627  * If the current task's mempolicy is "default" [NULL], return 'false'
1628  * to indicate default policy.  Otherwise, extract the policy nodemask
1629  * for 'bind' or 'interleave' policy into the argument nodemask, or
1630  * initialize the argument nodemask to contain the single node for
1631  * 'preferred' or 'local' policy and return 'true' to indicate presence
1632  * of non-default mempolicy.
1633  *
1634  * We don't bother with reference counting the mempolicy [mpol_get/put]
1635  * because the current task is examining it's own mempolicy and a task's
1636  * mempolicy is only ever changed by the task itself.
1637  *
1638  * N.B., it is the caller's responsibility to free a returned nodemask.
1639  */
1640 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1641 {
1642         struct mempolicy *mempolicy;
1643         int nid;
1644
1645         if (!(mask && current->mempolicy))
1646                 return false;
1647
1648         task_lock(current);
1649         mempolicy = current->mempolicy;
1650         switch (mempolicy->mode) {
1651         case MPOL_PREFERRED:
1652                 if (mempolicy->flags & MPOL_F_LOCAL)
1653                         nid = numa_node_id();
1654                 else
1655                         nid = mempolicy->v.preferred_node;
1656                 init_nodemask_of_node(mask, nid);
1657                 break;
1658
1659         case MPOL_BIND:
1660                 /* Fall through */
1661         case MPOL_INTERLEAVE:
1662                 *mask =  mempolicy->v.nodes;
1663                 break;
1664
1665         default:
1666                 BUG();
1667         }
1668         task_unlock(current);
1669
1670         return true;
1671 }
1672 #endif
1673
1674 /*
1675  * mempolicy_nodemask_intersects
1676  *
1677  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1678  * policy.  Otherwise, check for intersection between mask and the policy
1679  * nodemask for 'bind' or 'interleave' policy.  For 'perferred' or 'local'
1680  * policy, always return true since it may allocate elsewhere on fallback.
1681  *
1682  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1683  */
1684 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1685                                         const nodemask_t *mask)
1686 {
1687         struct mempolicy *mempolicy;
1688         bool ret = true;
1689
1690         if (!mask)
1691                 return ret;
1692         task_lock(tsk);
1693         mempolicy = tsk->mempolicy;
1694         if (!mempolicy)
1695                 goto out;
1696
1697         switch (mempolicy->mode) {
1698         case MPOL_PREFERRED:
1699                 /*
1700                  * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1701                  * allocate from, they may fallback to other nodes when oom.
1702                  * Thus, it's possible for tsk to have allocated memory from
1703                  * nodes in mask.
1704                  */
1705                 break;
1706         case MPOL_BIND:
1707         case MPOL_INTERLEAVE:
1708                 ret = nodes_intersects(mempolicy->v.nodes, *mask);
1709                 break;
1710         default:
1711                 BUG();
1712         }
1713 out:
1714         task_unlock(tsk);
1715         return ret;
1716 }
1717
1718 /* Allocate a page in interleaved policy.
1719    Own path because it needs to do special accounting. */
1720 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1721                                         unsigned nid)
1722 {
1723         struct zonelist *zl;
1724         struct page *page;
1725
1726         zl = node_zonelist(nid, gfp);
1727         page = __alloc_pages(gfp, order, zl);
1728         if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1729                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1730         return page;
1731 }
1732
1733 /**
1734  *      alloc_page_vma  - Allocate a page for a VMA.
1735  *
1736  *      @gfp:
1737  *      %GFP_USER    user allocation.
1738  *      %GFP_KERNEL  kernel allocations,
1739  *      %GFP_HIGHMEM highmem/user allocations,
1740  *      %GFP_FS      allocation should not call back into a file system.
1741  *      %GFP_ATOMIC  don't sleep.
1742  *
1743  *      @vma:  Pointer to VMA or NULL if not available.
1744  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1745  *
1746  *      This function allocates a page from the kernel page pool and applies
1747  *      a NUMA policy associated with the VMA or the current process.
1748  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1749  *      mm_struct of the VMA to prevent it from going away. Should be used for
1750  *      all allocations for pages that will be mapped into
1751  *      user space. Returns NULL when no page can be allocated.
1752  *
1753  *      Should be called with the mm_sem of the vma hold.
1754  */
1755 struct page *
1756 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1757 {
1758         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1759         struct zonelist *zl;
1760         struct page *page;
1761
1762         get_mems_allowed();
1763         if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1764                 unsigned nid;
1765
1766                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1767                 mpol_cond_put(pol);
1768                 page = alloc_page_interleave(gfp, 0, nid);
1769                 put_mems_allowed();
1770                 return page;
1771         }
1772         zl = policy_zonelist(gfp, pol);
1773         if (unlikely(mpol_needs_cond_ref(pol))) {
1774                 /*
1775                  * slow path: ref counted shared policy
1776                  */
1777                 struct page *page =  __alloc_pages_nodemask(gfp, 0,
1778                                                 zl, policy_nodemask(gfp, pol));
1779                 __mpol_put(pol);
1780                 put_mems_allowed();
1781                 return page;
1782         }
1783         /*
1784          * fast path:  default or task policy
1785          */
1786         page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
1787         put_mems_allowed();
1788         return page;
1789 }
1790
1791 /**
1792  *      alloc_pages_current - Allocate pages.
1793  *
1794  *      @gfp:
1795  *              %GFP_USER   user allocation,
1796  *              %GFP_KERNEL kernel allocation,
1797  *              %GFP_HIGHMEM highmem allocation,
1798  *              %GFP_FS     don't call back into a file system.
1799  *              %GFP_ATOMIC don't sleep.
1800  *      @order: Power of two of allocation size in pages. 0 is a single page.
1801  *
1802  *      Allocate a page from the kernel page pool.  When not in
1803  *      interrupt context and apply the current process NUMA policy.
1804  *      Returns NULL when no page can be allocated.
1805  *
1806  *      Don't call cpuset_update_task_memory_state() unless
1807  *      1) it's ok to take cpuset_sem (can WAIT), and
1808  *      2) allocating for current task (not interrupt).
1809  */
1810 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1811 {
1812         struct mempolicy *pol = current->mempolicy;
1813         struct page *page;
1814
1815         if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1816                 pol = &default_policy;
1817
1818         get_mems_allowed();
1819         /*
1820          * No reference counting needed for current->mempolicy
1821          * nor system default_policy
1822          */
1823         if (pol->mode == MPOL_INTERLEAVE)
1824                 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
1825         else
1826                 page = __alloc_pages_nodemask(gfp, order,
1827                         policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
1828         put_mems_allowed();
1829         return page;
1830 }
1831 EXPORT_SYMBOL(alloc_pages_current);
1832
1833 /*
1834  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
1835  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
1836  * with the mems_allowed returned by cpuset_mems_allowed().  This
1837  * keeps mempolicies cpuset relative after its cpuset moves.  See
1838  * further kernel/cpuset.c update_nodemask().
1839  *
1840  * current's mempolicy may be rebinded by the other task(the task that changes
1841  * cpuset's mems), so we needn't do rebind work for current task.
1842  */
1843
1844 /* Slow path of a mempolicy duplicate */
1845 struct mempolicy *__mpol_dup(struct mempolicy *old)
1846 {
1847         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1848
1849         if (!new)
1850                 return ERR_PTR(-ENOMEM);
1851
1852         /* task's mempolicy is protected by alloc_lock */
1853         if (old == current->mempolicy) {
1854                 task_lock(current);
1855                 *new = *old;
1856                 task_unlock(current);
1857         } else
1858                 *new = *old;
1859
1860         rcu_read_lock();
1861         if (current_cpuset_is_being_rebound()) {
1862                 nodemask_t mems = cpuset_mems_allowed(current);
1863                 if (new->flags & MPOL_F_REBINDING)
1864                         mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
1865                 else
1866                         mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
1867         }
1868         rcu_read_unlock();
1869         atomic_set(&new->refcnt, 1);
1870         return new;
1871 }
1872
1873 /*
1874  * If *frompol needs [has] an extra ref, copy *frompol to *tompol ,
1875  * eliminate the * MPOL_F_* flags that require conditional ref and
1876  * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
1877  * after return.  Use the returned value.
1878  *
1879  * Allows use of a mempolicy for, e.g., multiple allocations with a single
1880  * policy lookup, even if the policy needs/has extra ref on lookup.
1881  * shmem_readahead needs this.
1882  */
1883 struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1884                                                 struct mempolicy *frompol)
1885 {
1886         if (!mpol_needs_cond_ref(frompol))
1887                 return frompol;
1888
1889         *tompol = *frompol;
1890         tompol->flags &= ~MPOL_F_SHARED;        /* copy doesn't need unref */
1891         __mpol_put(frompol);
1892         return tompol;
1893 }
1894
1895 /* Slow path of a mempolicy comparison */
1896 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1897 {
1898         if (!a || !b)
1899                 return 0;
1900         if (a->mode != b->mode)
1901                 return 0;
1902         if (a->flags != b->flags)
1903                 return 0;
1904         if (mpol_store_user_nodemask(a))
1905                 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
1906                         return 0;
1907
1908         switch (a->mode) {
1909         case MPOL_BIND:
1910                 /* Fall through */
1911         case MPOL_INTERLEAVE:
1912                 return nodes_equal(a->v.nodes, b->v.nodes);
1913         case MPOL_PREFERRED:
1914                 return a->v.preferred_node == b->v.preferred_node &&
1915                         a->flags == b->flags;
1916         default:
1917                 BUG();
1918                 return 0;
1919         }
1920 }
1921
1922 /*
1923  * Shared memory backing store policy support.
1924  *
1925  * Remember policies even when nobody has shared memory mapped.
1926  * The policies are kept in Red-Black tree linked from the inode.
1927  * They are protected by the sp->lock spinlock, which should be held
1928  * for any accesses to the tree.
1929  */
1930
1931 /* lookup first element intersecting start-end */
1932 /* Caller holds sp->lock */
1933 static struct sp_node *
1934 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1935 {
1936         struct rb_node *n = sp->root.rb_node;
1937
1938         while (n) {
1939                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1940
1941                 if (start >= p->end)
1942                         n = n->rb_right;
1943                 else if (end <= p->start)
1944                         n = n->rb_left;
1945                 else
1946                         break;
1947         }
1948         if (!n)
1949                 return NULL;
1950         for (;;) {
1951                 struct sp_node *w = NULL;
1952                 struct rb_node *prev = rb_prev(n);
1953                 if (!prev)
1954                         break;
1955                 w = rb_entry(prev, struct sp_node, nd);
1956                 if (w->end <= start)
1957                         break;
1958                 n = prev;
1959         }
1960         return rb_entry(n, struct sp_node, nd);
1961 }
1962
1963 /* Insert a new shared policy into the list. */
1964 /* Caller holds sp->lock */
1965 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1966 {
1967         struct rb_node **p = &sp->root.rb_node;
1968         struct rb_node *parent = NULL;
1969         struct sp_node *nd;
1970
1971         while (*p) {
1972                 parent = *p;
1973                 nd = rb_entry(parent, struct sp_node, nd);
1974                 if (new->start < nd->start)
1975                         p = &(*p)->rb_left;
1976                 else if (new->end > nd->end)
1977                         p = &(*p)->rb_right;
1978                 else
1979                         BUG();
1980         }
1981         rb_link_node(&new->nd, parent, p);
1982         rb_insert_color(&new->nd, &sp->root);
1983         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1984                  new->policy ? new->policy->mode : 0);
1985 }
1986
1987 /* Find shared policy intersecting idx */
1988 struct mempolicy *
1989 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1990 {
1991         struct mempolicy *pol = NULL;
1992         struct sp_node *sn;
1993
1994         if (!sp->root.rb_node)
1995                 return NULL;
1996         spin_lock(&sp->lock);
1997         sn = sp_lookup(sp, idx, idx+1);
1998         if (sn) {
1999                 mpol_get(sn->policy);
2000                 pol = sn->policy;
2001         }
2002         spin_unlock(&sp->lock);
2003         return pol;
2004 }
2005
2006 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2007 {
2008         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2009         rb_erase(&n->nd, &sp->root);
2010         mpol_put(n->policy);
2011         kmem_cache_free(sn_cache, n);
2012 }
2013
2014 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2015                                 struct mempolicy *pol)
2016 {
2017         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2018
2019         if (!n)
2020                 return NULL;
2021         n->start = start;
2022         n->end = end;
2023         mpol_get(pol);
2024         pol->flags |= MPOL_F_SHARED;    /* for unref */
2025         n->policy = pol;
2026         return n;
2027 }
2028
2029 /* Replace a policy range. */
2030 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2031                                  unsigned long end, struct sp_node *new)
2032 {
2033         struct sp_node *n, *new2 = NULL;
2034
2035 restart:
2036         spin_lock(&sp->lock);
2037         n = sp_lookup(sp, start, end);
2038         /* Take care of old policies in the same range. */
2039         while (n && n->start < end) {
2040                 struct rb_node *next = rb_next(&n->nd);
2041                 if (n->start >= start) {
2042                         if (n->end <= end)
2043                                 sp_delete(sp, n);
2044                         else
2045                                 n->start = end;
2046                 } else {
2047                         /* Old policy spanning whole new range. */
2048                         if (n->end > end) {
2049                                 if (!new2) {
2050                                         spin_unlock(&sp->lock);
2051                                         new2 = sp_alloc(end, n->end, n->policy);
2052                                         if (!new2)
2053                                                 return -ENOMEM;
2054                                         goto restart;
2055                                 }
2056                                 n->end = start;
2057                                 sp_insert(sp, new2);
2058                                 new2 = NULL;
2059                                 break;
2060                         } else
2061                                 n->end = start;
2062                 }
2063                 if (!next)
2064                         break;
2065                 n = rb_entry(next, struct sp_node, nd);
2066         }
2067         if (new)
2068                 sp_insert(sp, new);
2069         spin_unlock(&sp->lock);
2070         if (new2) {
2071                 mpol_put(new2->policy);
2072                 kmem_cache_free(sn_cache, new2);
2073         }
2074         return 0;
2075 }
2076
2077 /**
2078  * mpol_shared_policy_init - initialize shared policy for inode
2079  * @sp: pointer to inode shared policy
2080  * @mpol:  struct mempolicy to install
2081  *
2082  * Install non-NULL @mpol in inode's shared policy rb-tree.
2083  * On entry, the current task has a reference on a non-NULL @mpol.
2084  * This must be released on exit.
2085  * This is called at get_inode() calls and we can use GFP_KERNEL.
2086  */
2087 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2088 {
2089         int ret;
2090
2091         sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2092         spin_lock_init(&sp->lock);
2093
2094         if (mpol) {
2095                 struct vm_area_struct pvma;
2096                 struct mempolicy *new;
2097                 NODEMASK_SCRATCH(scratch);
2098
2099                 if (!scratch)
2100                         goto put_mpol;
2101                 /* contextualize the tmpfs mount point mempolicy */
2102                 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2103                 if (IS_ERR(new))
2104                         goto free_scratch; /* no valid nodemask intersection */
2105
2106                 task_lock(current);
2107                 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2108                 task_unlock(current);
2109                 if (ret)
2110                         goto put_new;
2111
2112                 /* Create pseudo-vma that contains just the policy */
2113                 memset(&pvma, 0, sizeof(struct vm_area_struct));
2114                 pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2115                 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2116
2117 put_new:
2118                 mpol_put(new);                  /* drop initial ref */
2119 free_scratch:
2120                 NODEMASK_SCRATCH_FREE(scratch);
2121 put_mpol:
2122                 mpol_put(mpol); /* drop our incoming ref on sb mpol */
2123         }
2124 }
2125
2126 int mpol_set_shared_policy(struct shared_policy *info,
2127                         struct vm_area_struct *vma, struct mempolicy *npol)
2128 {
2129         int err;
2130         struct sp_node *new = NULL;
2131         unsigned long sz = vma_pages(vma);
2132
2133         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2134                  vma->vm_pgoff,
2135                  sz, npol ? npol->mode : -1,
2136                  npol ? npol->flags : -1,
2137                  npol ? nodes_addr(npol->v.nodes)[0] : -1);
2138
2139         if (npol) {
2140                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2141                 if (!new)
2142                         return -ENOMEM;
2143         }
2144         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2145         if (err && new)
2146                 kmem_cache_free(sn_cache, new);
2147         return err;
2148 }
2149
2150 /* Free a backing policy store on inode delete. */
2151 void mpol_free_shared_policy(struct shared_policy *p)
2152 {
2153         struct sp_node *n;
2154         struct rb_node *next;
2155
2156         if (!p->root.rb_node)
2157                 return;
2158         spin_lock(&p->lock);
2159         next = rb_first(&p->root);
2160         while (next) {
2161                 n = rb_entry(next, struct sp_node, nd);
2162                 next = rb_next(&n->nd);
2163                 rb_erase(&n->nd, &p->root);
2164                 mpol_put(n->policy);
2165                 kmem_cache_free(sn_cache, n);
2166         }
2167         spin_unlock(&p->lock);
2168 }
2169
2170 /* assumes fs == KERNEL_DS */
2171 void __init numa_policy_init(void)
2172 {
2173         nodemask_t interleave_nodes;
2174         unsigned long largest = 0;
2175         int nid, prefer = 0;
2176
2177         policy_cache = kmem_cache_create("numa_policy",
2178                                          sizeof(struct mempolicy),
2179                                          0, SLAB_PANIC, NULL);
2180
2181         sn_cache = kmem_cache_create("shared_policy_node",
2182                                      sizeof(struct sp_node),
2183                                      0, SLAB_PANIC, NULL);
2184
2185         /*
2186          * Set interleaving policy for system init. Interleaving is only
2187          * enabled across suitably sized nodes (default is >= 16MB), or
2188          * fall back to the largest node if they're all smaller.
2189          */
2190         nodes_clear(interleave_nodes);
2191         for_each_node_state(nid, N_HIGH_MEMORY) {
2192                 unsigned long total_pages = node_present_pages(nid);
2193
2194                 /* Preserve the largest node */
2195                 if (largest < total_pages) {
2196                         largest = total_pages;
2197                         prefer = nid;
2198                 }
2199
2200                 /* Interleave this node? */
2201                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2202                         node_set(nid, interleave_nodes);
2203         }
2204
2205         /* All too small, use the largest */
2206         if (unlikely(nodes_empty(interleave_nodes)))
2207                 node_set(prefer, interleave_nodes);
2208
2209         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2210                 printk("numa_policy_init: interleaving failed\n");
2211 }
2212
2213 /* Reset policy of current process to default */
2214 void numa_default_policy(void)
2215 {
2216         do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2217 }
2218
2219 /*
2220  * Parse and format mempolicy from/to strings
2221  */
2222
2223 /*
2224  * "local" is pseudo-policy:  MPOL_PREFERRED with MPOL_F_LOCAL flag
2225  * Used only for mpol_parse_str() and mpol_to_str()
2226  */
2227 #define MPOL_LOCAL MPOL_MAX
2228 static const char * const policy_modes[] =
2229 {
2230         [MPOL_DEFAULT]    = "default",
2231         [MPOL_PREFERRED]  = "prefer",
2232         [MPOL_BIND]       = "bind",
2233         [MPOL_INTERLEAVE] = "interleave",
2234         [MPOL_LOCAL]      = "local"
2235 };
2236
2237
2238 #ifdef CONFIG_TMPFS
2239 /**
2240  * mpol_parse_str - parse string to mempolicy
2241  * @str:  string containing mempolicy to parse
2242  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2243  * @no_context:  flag whether to "contextualize" the mempolicy
2244  *
2245  * Format of input:
2246  *      <mode>[=<flags>][:<nodelist>]
2247  *
2248  * if @no_context is true, save the input nodemask in w.user_nodemask in
2249  * the returned mempolicy.  This will be used to "clone" the mempolicy in
2250  * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
2251  * mount option.  Note that if 'static' or 'relative' mode flags were
2252  * specified, the input nodemask will already have been saved.  Saving
2253  * it again is redundant, but safe.
2254  *
2255  * On success, returns 0, else 1
2256  */
2257 int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2258 {
2259         struct mempolicy *new = NULL;
2260         unsigned short mode;
2261         unsigned short uninitialized_var(mode_flags);
2262         nodemask_t nodes;
2263         char *nodelist = strchr(str, ':');
2264         char *flags = strchr(str, '=');
2265         int err = 1;
2266
2267         if (nodelist) {
2268                 /* NUL-terminate mode or flags string */
2269                 *nodelist++ = '\0';
2270                 if (nodelist_parse(nodelist, nodes))
2271                         goto out;
2272                 if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
2273                         goto out;
2274         } else
2275                 nodes_clear(nodes);
2276
2277         if (flags)
2278                 *flags++ = '\0';        /* terminate mode string */
2279
2280         for (mode = 0; mode <= MPOL_LOCAL; mode++) {
2281                 if (!strcmp(str, policy_modes[mode])) {
2282                         break;
2283                 }
2284         }
2285         if (mode > MPOL_LOCAL)
2286                 goto out;
2287
2288         switch (mode) {
2289         case MPOL_PREFERRED:
2290                 /*
2291                  * Insist on a nodelist of one node only
2292                  */
2293                 if (nodelist) {
2294                         char *rest = nodelist;
2295                         while (isdigit(*rest))
2296                                 rest++;
2297                         if (*rest)
2298                                 goto out;
2299                 }
2300                 break;
2301         case MPOL_INTERLEAVE:
2302                 /*
2303                  * Default to online nodes with memory if no nodelist
2304                  */
2305                 if (!nodelist)
2306                         nodes = node_states[N_HIGH_MEMORY];
2307                 break;
2308         case MPOL_LOCAL:
2309                 /*
2310                  * Don't allow a nodelist;  mpol_new() checks flags
2311                  */
2312                 if (nodelist)
2313                         goto out;
2314                 mode = MPOL_PREFERRED;
2315                 break;
2316         case MPOL_DEFAULT:
2317                 /*
2318                  * Insist on a empty nodelist
2319                  */
2320                 if (!nodelist)
2321                         err = 0;
2322                 goto out;
2323         case MPOL_BIND:
2324                 /*
2325                  * Insist on a nodelist
2326                  */
2327                 if (!nodelist)
2328                         goto out;
2329         }
2330
2331         mode_flags = 0;
2332         if (flags) {
2333                 /*
2334                  * Currently, we only support two mutually exclusive
2335                  * mode flags.
2336                  */
2337                 if (!strcmp(flags, "static"))
2338                         mode_flags |= MPOL_F_STATIC_NODES;
2339                 else if (!strcmp(flags, "relative"))
2340                         mode_flags |= MPOL_F_RELATIVE_NODES;
2341                 else
2342                         goto out;
2343         }
2344
2345         new = mpol_new(mode, mode_flags, &nodes);
2346         if (IS_ERR(new))
2347                 goto out;
2348
2349         if (no_context) {
2350                 /* save for contextualization */
2351                 new->w.user_nodemask = nodes;
2352         } else {
2353                 int ret;
2354                 NODEMASK_SCRATCH(scratch);
2355                 if (scratch) {
2356                         task_lock(current);
2357                         ret = mpol_set_nodemask(new, &nodes, scratch);
2358                         task_unlock(current);
2359                 } else
2360                         ret = -ENOMEM;
2361                 NODEMASK_SCRATCH_FREE(scratch);
2362                 if (ret) {
2363                         mpol_put(new);
2364                         goto out;
2365                 }
2366         }
2367         err = 0;
2368
2369 out:
2370         /* Restore string for error message */
2371         if (nodelist)
2372                 *--nodelist = ':';
2373         if (flags)
2374                 *--flags = '=';
2375         if (!err)
2376                 *mpol = new;
2377         return err;
2378 }
2379 #endif /* CONFIG_TMPFS */
2380
2381 /**
2382  * mpol_to_str - format a mempolicy structure for printing
2383  * @buffer:  to contain formatted mempolicy string
2384  * @maxlen:  length of @buffer
2385  * @pol:  pointer to mempolicy to be formatted
2386  * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
2387  *
2388  * Convert a mempolicy into a string.
2389  * Returns the number of characters in buffer (if positive)
2390  * or an error (negative)
2391  */
2392 int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2393 {
2394         char *p = buffer;
2395         int l;
2396         nodemask_t nodes;
2397         unsigned short mode;
2398         unsigned short flags = pol ? pol->flags : 0;
2399
2400         /*
2401          * Sanity check:  room for longest mode, flag and some nodes
2402          */
2403         VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2404
2405         if (!pol || pol == &default_policy)
2406                 mode = MPOL_DEFAULT;
2407         else
2408                 mode = pol->mode;
2409
2410         switch (mode) {
2411         case MPOL_DEFAULT:
2412                 nodes_clear(nodes);
2413                 break;
2414
2415         case MPOL_PREFERRED:
2416                 nodes_clear(nodes);
2417                 if (flags & MPOL_F_LOCAL)
2418                         mode = MPOL_LOCAL;      /* pseudo-policy */
2419                 else
2420                         node_set(pol->v.preferred_node, nodes);
2421                 break;
2422
2423         case MPOL_BIND:
2424                 /* Fall through */
2425         case MPOL_INTERLEAVE:
2426                 if (no_context)
2427                         nodes = pol->w.user_nodemask;
2428                 else
2429                         nodes = pol->v.nodes;
2430                 break;
2431
2432         default:
2433                 BUG();
2434         }
2435
2436         l = strlen(policy_modes[mode]);
2437         if (buffer + maxlen < p + l + 1)
2438                 return -ENOSPC;
2439
2440         strcpy(p, policy_modes[mode]);
2441         p += l;
2442
2443         if (flags & MPOL_MODE_FLAGS) {
2444                 if (buffer + maxlen < p + 2)
2445                         return -ENOSPC;
2446                 *p++ = '=';
2447
2448                 /*
2449                  * Currently, the only defined flags are mutually exclusive
2450                  */
2451                 if (flags & MPOL_F_STATIC_NODES)
2452                         p += snprintf(p, buffer + maxlen - p, "static");
2453                 else if (flags & MPOL_F_RELATIVE_NODES)
2454                         p += snprintf(p, buffer + maxlen - p, "relative");
2455         }
2456
2457         if (!nodes_empty(nodes)) {
2458                 if (buffer + maxlen < p + 2)
2459                         return -ENOSPC;
2460                 *p++ = ':';
2461                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2462         }
2463         return p - buffer;
2464 }
2465
2466 struct numa_maps {
2467         unsigned long pages;
2468         unsigned long anon;
2469         unsigned long active;
2470         unsigned long writeback;
2471         unsigned long mapcount_max;
2472         unsigned long dirty;
2473         unsigned long swapcache;
2474         unsigned long node[MAX_NUMNODES];
2475 };
2476
2477 static void gather_stats(struct page *page, void *private, int pte_dirty)
2478 {
2479         struct numa_maps *md = private;
2480         int count = page_mapcount(page);
2481
2482         md->pages++;
2483         if (pte_dirty || PageDirty(page))
2484                 md->dirty++;
2485
2486         if (PageSwapCache(page))
2487                 md->swapcache++;
2488
2489         if (PageActive(page) || PageUnevictable(page))
2490                 md->active++;
2491
2492         if (PageWriteback(page))
2493                 md->writeback++;
2494
2495         if (PageAnon(page))
2496                 md->anon++;
2497
2498         if (count > md->mapcount_max)
2499                 md->mapcount_max = count;
2500
2501         md->node[page_to_nid(page)]++;
2502 }
2503
2504 #ifdef CONFIG_HUGETLB_PAGE
2505 static void check_huge_range(struct vm_area_struct *vma,
2506                 unsigned long start, unsigned long end,
2507                 struct numa_maps *md)
2508 {
2509         unsigned long addr;
2510         struct page *page;
2511         struct hstate *h = hstate_vma(vma);
2512         unsigned long sz = huge_page_size(h);
2513
2514         for (addr = start; addr < end; addr += sz) {
2515                 pte_t *ptep = huge_pte_offset(vma->vm_mm,
2516                                                 addr & huge_page_mask(h));
2517                 pte_t pte;
2518
2519                 if (!ptep)
2520                         continue;
2521
2522                 pte = *ptep;
2523                 if (pte_none(pte))
2524                         continue;
2525
2526                 page = pte_page(pte);
2527                 if (!page)
2528                         continue;
2529
2530                 gather_stats(page, md, pte_dirty(*ptep));
2531         }
2532 }
2533 #else
2534 static inline void check_huge_range(struct vm_area_struct *vma,
2535                 unsigned long start, unsigned long end,
2536                 struct numa_maps *md)
2537 {
2538 }
2539 #endif
2540
2541 /*
2542  * Display pages allocated per node and memory policy via /proc.
2543  */
2544 int show_numa_map(struct seq_file *m, void *v)
2545 {
2546         struct proc_maps_private *priv = m->private;
2547         struct vm_area_struct *vma = v;
2548         struct numa_maps *md;
2549         struct file *file = vma->vm_file;
2550         struct mm_struct *mm = vma->vm_mm;
2551         struct mempolicy *pol;
2552         int n;
2553         char buffer[50];
2554
2555         if (!mm)
2556                 return 0;
2557
2558         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2559         if (!md)
2560                 return 0;
2561
2562         pol = get_vma_policy(priv->task, vma, vma->vm_start);
2563         mpol_to_str(buffer, sizeof(buffer), pol, 0);
2564         mpol_cond_put(pol);
2565
2566         seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2567
2568         if (file) {
2569                 seq_printf(m, " file=");
2570                 seq_path(m, &file->f_path, "\n\t= ");
2571         } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2572                 seq_printf(m, " heap");
2573         } else if (vma->vm_start <= mm->start_stack &&
2574                         vma->vm_end >= mm->start_stack) {
2575                 seq_printf(m, " stack");
2576         }
2577
2578         if (is_vm_hugetlb_page(vma)) {
2579                 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2580                 seq_printf(m, " huge");
2581         } else {
2582                 check_pgd_range(vma, vma->vm_start, vma->vm_end,
2583                         &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2584         }
2585
2586         if (!md->pages)
2587                 goto out;
2588
2589         if (md->anon)
2590                 seq_printf(m," anon=%lu",md->anon);
2591
2592         if (md->dirty)
2593                 seq_printf(m," dirty=%lu",md->dirty);
2594
2595         if (md->pages != md->anon && md->pages != md->dirty)
2596                 seq_printf(m, " mapped=%lu", md->pages);
2597
2598         if (md->mapcount_max > 1)
2599                 seq_printf(m, " mapmax=%lu", md->mapcount_max);
2600
2601         if (md->swapcache)
2602                 seq_printf(m," swapcache=%lu", md->swapcache);
2603
2604         if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2605                 seq_printf(m," active=%lu", md->active);
2606
2607         if (md->writeback)
2608                 seq_printf(m," writeback=%lu", md->writeback);
2609
2610         for_each_node_state(n, N_HIGH_MEMORY)
2611                 if (md->node[n])
2612                         seq_printf(m, " N%d=%lu", n, md->node[n]);
2613 out:
2614         seq_putc(m, '\n');
2615         kfree(md);
2616
2617         if (m->count < m->size)
2618                 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2619         return 0;
2620 }