MOXA linux-2.6.x / linux-2.6.9-uc0 from sdlinux-moxaart.tgz
[linux-2.6.9-moxart.git] / mm / mempolicy.c

/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave	Allocate memory interleaved over a set of nodes,
 *		with normal fallback if it fails.
 *		For VMA based allocations this interleaves based on the
 *		offset into the backing object or offset into the mapping
 *		for anonymous memory. For process policy a process counter
 *		is used.
 * bind		Only allocate memory on a specific set of nodes,
 *		no fallback.
 * preferred	Try a specific node first before normal fallback.
 *		As a special case node -1 here means do the allocation
 *		on the local CPU. This is normally identical to default,
 *		but useful to set in a VMA when you have a non default
 *		process policy.
 * default	Allocate on the local node first, or when on a VMA
 *		use the process policy. This is what Linux always did
 *		in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
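/*
 * Illustrative sketch of how user space drives these policies (assumes the
 * mbind()/set_mempolicy() syscall wrappers from glibc or libnuma; the buffer
 * and mask values below are made up for the example):
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);	// nodes 0 and 1
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask));
 *	mbind(buf, len, MPOL_BIND, &mask, 8 * sizeof(mask), MPOL_MF_STRICT);
 */
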
/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
   could replace all the switch()es with a mempolicy_ops structure.
*/

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/mempolicy.h>
#include <asm/uaccess.h>

static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;

#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
static int policy_zone;

static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.policy = MPOL_DEFAULT,
};

/* Check if all specified nodes are online */
static int nodes_online(unsigned long *nodes)
{
	DECLARE_BITMAP(online2, MAX_NUMNODES);

	bitmap_copy(online2, node_online_map, MAX_NUMNODES);
	if (bitmap_empty(online2, MAX_NUMNODES))
		set_bit(0, online2);
	if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
		return -EINVAL;
	return 0;
}

/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, unsigned long *nodes)
{
	int empty = bitmap_empty(nodes, MAX_NUMNODES);

	switch (mode) {
	case MPOL_DEFAULT:
		if (!empty)
			return -EINVAL;
		break;
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
		/* Preferred will only use the first bit, but allow
		   more for now. */
		if (empty)
			return -EINVAL;
		break;
	}
	return nodes_online(nodes);
}

/* Copy a node mask from user space. */
static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
		     unsigned long maxnode, int mode)
{
	unsigned long k;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	bitmap_zero(nodes, MAX_NUMNODES);
	if (maxnode == 0 || !nmask)
		return 0;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/* When the user specified more nodes than supported just check
	   if the non supported part is all zero. */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		if (nlongs > PAGE_SIZE/sizeof(long))
			return -EINVAL;
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			unsigned long t;
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes[nlongs-1] &= endmask;
	return mpol_check_policy(mode, nodes);
}

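/*
 * get_nodes() example: a caller passing maxnode == 17 has 16 significant
 * bits after the decrement, so endmask == 0xffff and only nodes 0-15 of the
 * first word are kept; any words beyond MAX_NUMNODES must be all zero or the
 * call fails with -EINVAL.
 */
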
/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(unsigned long *nodes)
{
	struct zonelist *zl;
	int num, max, nd;

	max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
	if (!zl)
		return NULL;
	num = 0;
	for (nd = find_first_bit(nodes, MAX_NUMNODES);
	     nd < MAX_NUMNODES;
	     nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
		int k;
		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
			struct zone *z = &NODE_DATA(nd)->node_zones[k];
			if (!z->present_pages)
				continue;
			zl->zones[num++] = z;
			if (k > policy_zone)
				policy_zone = k;
		}
	}
	BUG_ON(num >= max);
	zl->zones[num] = NULL;
	return zl;
}

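/*
 * bind_zonelist() example: for a node mask of {0, 2} the resulting list is
 * node 0's zones from highest to lowest, then node 2's, skipping zones with
 * no present pages, terminated by a NULL entry.
 */
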
/* Create a new policy */
static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
{
	struct mempolicy *policy;

	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
	if (mode == MPOL_DEFAULT)
		return NULL;
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	switch (mode) {
	case MPOL_INTERLEAVE:
		bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
		break;
	case MPOL_PREFERRED:
		policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
		if (policy->v.preferred_node >= MAX_NUMNODES)
			policy->v.preferred_node = -1;
		break;
	case MPOL_BIND:
		policy->v.zonelist = bind_zonelist(nodes);
		if (policy->v.zonelist == NULL) {
			kmem_cache_free(policy_cache, policy);
			return ERR_PTR(-ENOMEM);
		}
		break;
	}
	policy->policy = mode;
	return policy;
}

/* Ensure all existing pages follow the policy. */
static int
verify_pages(unsigned long addr, unsigned long end, unsigned long *nodes)
{
	while (addr < end) {
		struct page *p;
		pte_t *pte;
		pmd_t *pmd;
		pgd_t *pgd = pgd_offset_k(addr);
		if (pgd_none(*pgd)) {
			addr = (addr + PGDIR_SIZE) & PGDIR_MASK;
			continue;
		}
		pmd = pmd_offset(pgd, addr);
		if (pmd_none(*pmd)) {
			addr = (addr + PMD_SIZE) & PMD_MASK;
			continue;
		}
		p = NULL;
		pte = pte_offset_map(pmd, addr);
		if (pte_present(*pte))
			p = pte_page(*pte);
		pte_unmap(pte);
		if (p) {
			unsigned nid = page_to_nid(p);
			if (!test_bit(nid, nodes))
				return -EIO;
		}
		addr += PAGE_SIZE;
	}
	return 0;
}

/* Step 1: check the range */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
	    unsigned long *nodes, unsigned long flags)
{
	int err;
	struct vm_area_struct *first, *vma, *prev;

	first = find_vma(mm, start);
	if (!first)
		return ERR_PTR(-EFAULT);
	prev = NULL;
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		if (!vma->vm_next && vma->vm_end < end)
			return ERR_PTR(-EFAULT);
		if (prev && prev->vm_end < vma->vm_start)
			return ERR_PTR(-EFAULT);
		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
			err = verify_pages(vma->vm_start, vma->vm_end, nodes);
			if (err) {
				first = ERR_PTR(err);
				break;
			}
		}
		prev = vma;
	}
	return first;
}

/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
	int err = 0;
	struct mempolicy *old = vma->vm_policy;

	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	if (vma->vm_ops && vma->vm_ops->set_policy)
		err = vma->vm_ops->set_policy(vma, new);
	if (!err) {
		mpol_get(new);
		vma->vm_policy = new;
		mpol_free(old);
	}
	return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, struct mempolicy *new)
{
	struct vm_area_struct *next;
	int err;

	err = 0;
	for (; vma && vma->vm_start < end; vma = next) {
		next = vma->vm_next;
		if (vma->vm_start < start)
			err = split_vma(vma->vm_mm, vma, start, 1);
		if (!err && vma->vm_end > end)
			err = split_vma(vma->vm_mm, vma, end, 0);
		if (!err)
			err = policy_vma(vma, new);
		if (err)
			break;
	}
	return err;
}

/* Change policy for a memory range */
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
			  unsigned long mode,
			  unsigned long __user *nmask, unsigned long maxnode,
			  unsigned flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	DECLARE_BITMAP(nodes, MAX_NUMNODES);
	int err;

	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
		return -EINVAL;
	if (start & ~PAGE_MASK)
		return -EINVAL;
	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;
	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;
	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	err = get_nodes(nodes, nmask, maxnode, mode);
	if (err)
		return err;

	new = mpol_new(mode, nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);

	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
		 mode, nodes[0]);

	down_write(&mm->mmap_sem);
	vma = check_range(mm, start, end, nodes, flags);
	err = PTR_ERR(vma);
	if (!IS_ERR(vma))
		err = mbind_range(vma, start, end, new);
	up_write(&mm->mmap_sem);
	mpol_free(new);
	return err;
}

/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
				  unsigned long maxnode)
{
	int err;
	struct mempolicy *new;
	DECLARE_BITMAP(nodes, MAX_NUMNODES);

	if (mode > MPOL_MAX)
		return -EINVAL;
	err = get_nodes(nodes, nmask, maxnode, mode);
	if (err)
		return err;
	new = mpol_new(mode, nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);
	mpol_free(current->mempolicy);
	current->mempolicy = new;
	if (new && new->policy == MPOL_INTERLEAVE)
		current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
	return 0;
}

/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
{
	int i;

	bitmap_zero(nodes, MAX_NUMNODES);
	switch (p->policy) {
	case MPOL_BIND:
		for (i = 0; p->v.zonelist->zones[i]; i++)
			__set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
		break;
	case MPOL_DEFAULT:
		break;
	case MPOL_INTERLEAVE:
		bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
		break;
	case MPOL_PREFERRED:
		/* or use current node instead of online map? */
		if (p->v.preferred_node < 0)
			bitmap_copy(nodes, node_online_map, MAX_NUMNODES);
		else
			__set_bit(p->v.preferred_node, nodes);
		break;
	default:
		BUG();
	}
}

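/* Return the node id of the page backing addr, faulting it in if needed. */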
static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      void *nodes, unsigned nbytes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
}

/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
				  unsigned long __user *nmask,
				  unsigned long maxnode,
				  unsigned long addr, unsigned long flags)
{
	int err, pval;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
		return -EINVAL;
	if (nmask != NULL && maxnode < numnodes)
		return -EINVAL;
	if (flags & MPOL_F_ADDR) {
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			pval = err;
		} else if (pol == current->mempolicy &&
			   pol->policy == MPOL_INTERLEAVE) {
			pval = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else
		pval = pol->policy;

	err = -EFAULT;
	if (policy && put_user(pval, policy))
		goto out;

	err = 0;
	if (nmask) {
		DECLARE_BITMAP(nodes, MAX_NUMNODES);
		get_zonemask(pol, nodes);
		err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
	}

 out:
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}

#ifdef CONFIG_COMPAT

asmlinkage long compat_get_mempolicy(int __user *policy,
				     compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode,
				     compat_ulong_t addr, compat_ulong_t flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		err = copy_from_user(bm, nm, alloc_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}

asmlinkage long compat_set_mempolicy(int mode, compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_set_mempolicy(mode, nm, nr_bits+1);
}

asmlinkage long compat_mbind(compat_ulong_t start, compat_ulong_t len,
			     compat_ulong_t mode, compat_ulong_t __user *nmask,
			     compat_ulong_t maxnode, compat_ulong_t flags)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif

/* Return effective policy for a VMA */
static struct mempolicy *
get_vma_policy(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = current->mempolicy;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else if (vma->vm_policy &&
			 vma->vm_policy->policy != MPOL_DEFAULT)
			pol = vma->vm_policy;
	}
	if (!pol)
		pol = &default_policy;
	return pol;
}

/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(unsigned gfp, struct mempolicy *policy)
{
	int nd;

	switch (policy->policy) {
	case MPOL_PREFERRED:
		nd = policy->v.preferred_node;
		if (nd < 0)
			nd = numa_node_id();
		break;
	case MPOL_BIND:
		/* Lower zones don't get a policy applied */
		if (gfp >= policy_zone)
			return policy->v.zonelist;
		/*FALL THROUGH*/
	case MPOL_INTERLEAVE: /* should not happen */
	case MPOL_DEFAULT:
		nd = numa_node_id();
		break;
	default:
		nd = 0;
		BUG();
	}
	return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK);
}

/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
	unsigned nid, next;
	struct task_struct *me = current;

	nid = me->il_next;
	BUG_ON(nid >= MAX_NUMNODES);
	next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
	if (next >= MAX_NUMNODES)
		next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
	me->il_next = next;
	return nid;
}

/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
			       struct vm_area_struct *vma, unsigned long off)
{
	unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
	unsigned target = (unsigned)off % nnodes;
	int c;
	int nid = -1;

	c = 0;
	do {
		nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
		c++;
	} while (c <= target);
	BUG_ON(nid >= MAX_NUMNODES);
	BUG_ON(!test_bit(nid, pol->v.nodes));
	return nid;
}

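/*
 * offset_il_node() example: with nodes {1, 3, 5} and off == 4, nnodes == 3
 * and target == 1, so the walk stops at the second set bit and node 3 is
 * returned.
 */
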
/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(unsigned gfp, unsigned order, unsigned nid)
{
	struct zonelist *zl;
	struct page *page;

	BUG_ON(!test_bit(nid, node_online_map));
	zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
	page = __alloc_pages(gfp, order, zl);
	if (page && page_zone(page) == zl->zones[0]) {
		zl->zones[0]->pageset[get_cpu()].interleave_hit++;
		put_cpu();
	}
	return page;
}

/**
 *	alloc_page_vma - Allocate a page for a VMA.
 *
 *	@gfp:
 *	%GFP_USER	user allocation.
 *	%GFP_KERNEL	kernel allocations,
 *	%GFP_HIGHMEM	highmem/user allocations,
 *	%GFP_FS		allocation should not call back into a file system.
 *	%GFP_ATOMIC	don't sleep.
 *
 *	@vma:  Pointer to VMA or NULL if not available.
 *	@addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 *	This function allocates a page from the kernel page pool and applies
 *	a NUMA policy associated with the VMA or the current process.
 *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
 *	mm_struct of the VMA to prevent it from going away. Should be used for
 *	all allocations for pages that will be mapped into
 *	user space. Returns NULL when no page can be allocated.
 *
 *	Should be called with the mmap_sem of the vma held.
 */
struct page *
alloc_page_vma(unsigned gfp, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(vma, addr);

	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
		unsigned nid;
		if (vma) {
			unsigned long off;
			BUG_ON(addr >= vma->vm_end);
			BUG_ON(addr < vma->vm_start);
			off = vma->vm_pgoff;
			off += (addr - vma->vm_start) >> PAGE_SHIFT;
			nid = offset_il_node(pol, vma, off);
		} else {
			/* fall back to process interleaving */
			nid = interleave_nodes(pol);
		}
		return alloc_page_interleave(gfp, 0, nid);
	}
	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}

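/*
 * Typical alloc_page_vma() use, e.g. from a fault path (sketch; the exact
 * GFP mask depends on the caller):
 *
 *	page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 */
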
/**
 *	alloc_pages_current - Allocate pages.
 *
 *	@gfp:
 *	%GFP_USER	user allocation,
 *	%GFP_KERNEL	kernel allocation,
 *	%GFP_HIGHMEM	highmem allocation,
 *	%GFP_FS		don't call back into a file system.
 *	%GFP_ATOMIC	don't sleep.
 *	@order: Power of two of allocation size in pages. 0 is a single page.
 *
 *	Allocate a page from the kernel page pool. When not in
 *	interrupt context, apply the current process NUMA policy.
 *	Returns NULL when no page can be allocated.
 */
struct page *alloc_pages_current(unsigned gfp, unsigned order)
{
	struct mempolicy *pol = current->mempolicy;

	if (!pol || in_interrupt())
		pol = &default_policy;
	if (pol->policy == MPOL_INTERLEAVE)
		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);

/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);
	*new = *old;
	atomic_set(&new->refcnt, 1);
	if (new->policy == MPOL_BIND) {
		int sz = ksize(old->v.zonelist);
		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
		if (!new->v.zonelist) {
			kmem_cache_free(policy_cache, new);
			return ERR_PTR(-ENOMEM);
		}
		memcpy(new->v.zonelist, old->v.zonelist, sz);
	}
	return new;
}

/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	if (!a || !b)
		return 0;
	if (a->policy != b->policy)
		return 0;
	switch (a->policy) {
	case MPOL_DEFAULT:
		return 1;
	case MPOL_INTERLEAVE:
		return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
	case MPOL_PREFERRED:
		return a->v.preferred_node == b->v.preferred_node;
	case MPOL_BIND: {
		int i;
		for (i = 0; a->v.zonelist->zones[i]; i++)
			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
				return 0;
		return b->v.zonelist->zones[i] == NULL;
	}
	default:
		BUG();
		return 0;
	}
}

/* Slow path of a mpol destructor. */
void __mpol_free(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	if (p->policy == MPOL_BIND)
		kfree(p->v.zonelist);
	p->policy = MPOL_DEFAULT;
	kmem_cache_free(policy_cache, p);
}

/*
 * Hugetlb policy. Same as above, just works with node numbers instead of
 * zonelists.
 */

/* Find first node suitable for an allocation */
int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(vma, addr);

	switch (pol->policy) {
	case MPOL_DEFAULT:
		return numa_node_id();
	case MPOL_BIND:
		return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
	case MPOL_INTERLEAVE:
		return interleave_nodes(pol);
	case MPOL_PREFERRED:
		return pol->v.preferred_node >= 0 ?
			pol->v.preferred_node : numa_node_id();
	}
	BUG();
	return 0;
}

/* Find secondary valid nodes for an allocation */
int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(vma, addr);

	switch (pol->policy) {
	case MPOL_PREFERRED:
	case MPOL_DEFAULT:
	case MPOL_INTERLEAVE:
		return 1;
	case MPOL_BIND: {
		struct zone **z;
		for (z = pol->v.zonelist->zones; *z; z++)
			if ((*z)->zone_pgdat->node_id == nid)
				return 1;
		return 0;
	}
	default:
		BUG();
		return 0;
	}
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->sem semaphore, which should be held
 * for any accesses to the tree.
 */

/* lookup first element intersecting start-end */
/* Caller holds sp->sem */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
	struct rb_node *n = sp->root.rb_node;

	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);
		if (start >= p->end) {
			n = n->rb_right;
		} else if (end < p->start) {
			n = n->rb_left;
		} else {
			break;
		}
	}
	if (!n)
		return NULL;
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}

/* Insert a new shared policy into the list. */
/* Caller holds sp->sem */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;
	struct sp_node *nd;

	while (*p) {
		parent = *p;
		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
			p = &(*p)->rb_left;
		else if (new->end > nd->end)
			p = &(*p)->rb_right;
		else
			BUG();
	}
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
		 new->policy ? new->policy->policy : 0);
}

/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
	struct mempolicy *pol = NULL;
	struct sp_node *sn;

	down(&sp->sem);
	sn = sp_lookup(sp, idx, idx+1);
	if (sn) {
		mpol_get(sn->policy);
		pol = sn->policy;
	}
	up(&sp->sem);
	return pol;
}

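/* Remove a node from the shared policy tree and drop its policy reference. */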
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	PDprintk("deleting %lx-%lx\n", n->start, n->end);
	rb_erase(&n->nd, &sp->root);
	mpol_free(n->policy);
	kmem_cache_free(sn_cache, n);
}

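/* Allocate a shared policy node for [start, end) and take a reference on pol. */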
struct sp_node *
sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
{
	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

	if (!n)
		return NULL;
	n->start = start;
	n->end = end;
	mpol_get(pol);
	n->policy = pol;
	return n;
}

/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				 unsigned long end, struct sp_node *new)
{
	struct sp_node *n, *new2;

	down(&sp->sem);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				new2 = sp_alloc(end, n->end, n->policy);
				if (!new2) {
					up(&sp->sem);
					return -ENOMEM;
				}
				n->end = end;
				sp_insert(sp, new2);
			}
			/* Old crossing beginning, but not end (easy) */
			if (n->start < start && n->end > start)
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	up(&sp->sem);
	return 0;
}

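/* Install npol as the shared policy for the pgoff range covered by vma;
   a NULL npol removes any existing policy over that range. */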
int mpol_set_shared_policy(struct shared_policy *info,
			   struct vm_area_struct *vma, struct mempolicy *npol)
{
	int err;
	struct sp_node *new = NULL;
	unsigned long sz = vma_pages(vma);

	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
		 vma->vm_pgoff,
		 sz, npol ? npol->policy : -1,
		 npol ? npol->v.nodes[0] : -1);

	if (npol) {
		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
		if (!new)
			return -ENOMEM;
	}
	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
	if (err && new)
		kmem_cache_free(sn_cache, new);
	return err;
}

/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
	struct sp_node *n;
	struct rb_node *next;

	down(&p->sem);
	next = rb_first(&p->root);
	while (next) {
		n = rb_entry(next, struct sp_node, nd);
		next = rb_next(&n->nd);
		rb_erase(&n->nd, &p->root);
		mpol_free(n->policy);
		kmem_cache_free(sn_cache, n);
	}
	up(&p->sem);
}

/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL, NULL);

	/* Set interleaving policy for system init. This way not all
	   the data structures allocated at system boot end up in node zero. */

	if (sys_set_mempolicy(MPOL_INTERLEAVE, node_online_map, MAX_NUMNODES) < 0)
		printk("numa_policy_init: interleaving failed\n");
}

/* Reset policy of current process to default.
 * Assumes fs == KERNEL_DS */
void numa_default_policy(void)
{
	sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
}