/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave	Allocate memory interleaved over a set of nodes,
 *		with normal fallback if it fails.
 *		For VMA based allocations this interleaves based on the
 *		offset into the backing object or offset into the mapping
 *		for anonymous memory. For process policy a process counter
 *		is used.
 * bind		Only allocate memory on a specific set of nodes,
 *		no fallback.
 * preferred	Try a specific node first before normal fallback.
 *		As a special case node -1 here means do the allocation
 *		on the local CPU. This is normally identical to default,
 *		but useful to set in a VMA when you have a non default
 *		process policy.
 * default	Allocate on the local node first, or when on a VMA
 *		use the process policy. This is what Linux always did
 *		in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
   could replace all the switch()es with a mempolicy_ops structure.
*/
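/*
 * Illustrative userspace sketch (not part of this file): how a process would
 * request the policies described above, assuming the set_mempolicy()/mbind()
 * wrappers from libnuma's <numaif.h> are available. The node numbers and the
 * 1 MB mapping are arbitrary.
 */
#include <numaif.h>		/* set_mempolicy(), mbind(), MPOL_* */
#include <sys/mman.h>
#include <stdio.h>

int example(void)
{
	unsigned long nodes = 0x3;	/* node mask: nodes 0 and 1 */
	void *buf = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	/* Process policy: interleave future allocations over nodes 0 and 1. */
	if (set_mempolicy(MPOL_INTERLEAVE, &nodes, 8 * sizeof(nodes) + 1))
		perror("set_mempolicy");

	/* VMA policy: bind this mapping to node 0; MPOL_MF_STRICT makes the
	   call fail if already-present pages violate the policy. */
	nodes = 0x1;
	if (mbind(buf, 1 << 20, MPOL_BIND, &nodes, 8 * sizeof(nodes) + 1,
		  MPOL_MF_STRICT))
		perror("mbind");
	return 0;
}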
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <asm/uaccess.h>
static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;

#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
static int policy_zone;

static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.policy = MPOL_DEFAULT,
};
/* Check if all specified nodes are online */
static int nodes_online(unsigned long *nodes)
{
	DECLARE_BITMAP(online2, MAX_NUMNODES);

	bitmap_copy(online2, node_online_map, MAX_NUMNODES);
	if (bitmap_empty(online2, MAX_NUMNODES))
		set_bit(0, online2);
	if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
		return -EINVAL;
	return 0;
}
/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, unsigned long *nodes)
{
	int empty = bitmap_empty(nodes, MAX_NUMNODES);

	switch (mode) {
	case MPOL_DEFAULT:
		if (!empty)
			return -EINVAL;
		break;
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
		/* Preferred will only use the first bit, but allow
		   more for now. */
		if (empty)
			return -EINVAL;
	}
	return nodes_online(nodes);
}
/* Copy a node mask from user space. */
static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
		     unsigned long maxnode, int mode)
{
	unsigned long k;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	bitmap_zero(nodes, MAX_NUMNODES);
	if (maxnode == 0 || !nmask)
		return 0;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/* When the user specified more nodes than supported just check
	   if the non supported part is all zero. */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		unsigned long t;
		if (nlongs > PAGE_SIZE/sizeof(long))
			return -EINVAL;
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes[nlongs-1] &= endmask;
	return mpol_check_policy(mode, nodes);
}
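/*
 * Worked example of the mask ABI above (added for clarity, not in the
 * original): a 64-bit caller passing maxnode = 65 ends up with maxnode = 64
 * after the decrement, so nlongs = 1 and endmask = ~0UL -- exactly one
 * unsigned long is copied from user space and all 64 bits of it are kept.
 * With maxnode = 33 instead, endmask = (1UL << 32) - 1 and the upper half
 * of that word is discarded.
 */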
/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(unsigned long *nodes)
{
	struct zonelist *zl;
	int num, max, nd, k;

	max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
	if (!zl)
		return NULL;
	num = 0;
	for (nd = find_first_bit(nodes, MAX_NUMNODES);
	     nd < MAX_NUMNODES;
	     nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
			struct zone *z = &NODE_DATA(nd)->node_zones[k];
			if (!z->present_pages)
				continue;
			zl->zones[num++] = z;
		}
	}
	zl->zones[num] = NULL;
	return zl;
}
/* Create a new policy */
static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
{
	struct mempolicy *policy;

	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
	if (mode == MPOL_DEFAULT)
		return NULL;
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	switch (mode) {
	case MPOL_INTERLEAVE:
		bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
		break;
	case MPOL_PREFERRED:
		policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
		if (policy->v.preferred_node >= MAX_NUMNODES)
			policy->v.preferred_node = -1;
		break;
	case MPOL_BIND:
		policy->v.zonelist = bind_zonelist(nodes);
		if (policy->v.zonelist == NULL) {
			kmem_cache_free(policy_cache, policy);
			return ERR_PTR(-ENOMEM);
		}
		break;
	}
	policy->policy = mode;
	return policy;
}
/* Ensure all existing pages follow the policy. */
static int
verify_pages(unsigned long addr, unsigned long end, unsigned long *nodes)
{
	while (addr < end) {
		struct page *p;
		pte_t *pte;
		pmd_t *pmd;
		pgd_t *pgd = pgd_offset_k(addr);
		if (pgd_none(*pgd)) {
			addr = (addr + PGDIR_SIZE) & PGDIR_MASK;
			continue;
		}
		pmd = pmd_offset(pgd, addr);
		if (pmd_none(*pmd)) {
			addr = (addr + PMD_SIZE) & PMD_MASK;
			continue;
		}
		p = NULL;
		pte = pte_offset_map(pmd, addr);
		if (pte_present(*pte))
			p = pte_page(*pte);
		pte_unmap(pte);
		if (p) {
			unsigned nid = page_to_nid(p);
			if (!test_bit(nid, nodes))
				return -EIO;
		}
		addr += PAGE_SIZE;
	}
	return 0;
}
/* Step 1: check the range */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
	    unsigned long *nodes, unsigned long flags)
{
	int err;
	struct vm_area_struct *first, *vma, *prev;

	first = find_vma(mm, start);
	if (!first)
		return ERR_PTR(-EFAULT);
	prev = NULL;
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		if (!vma->vm_next && vma->vm_end < end)
			return ERR_PTR(-EFAULT);
		if (prev && prev->vm_end < vma->vm_start)
			return ERR_PTR(-EFAULT);
		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
			err = verify_pages(vma->vm_start, vma->vm_end, nodes);
			if (err) {
				first = ERR_PTR(err);
				break;
			}
		}
		prev = vma;
	}
	return first;
}
/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
	int err = 0;
	struct mempolicy *old = vma->vm_policy;

	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	if (vma->vm_ops && vma->vm_ops->set_policy)
		err = vma->vm_ops->set_policy(vma, new);
	if (!err) {
		mpol_get(new);
		vma->vm_policy = new;
		mpol_free(old);
	}
	return err;
}
/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, struct mempolicy *new)
{
	struct vm_area_struct *next;
	int err;

	err = 0;
	for (; vma && vma->vm_start < end; vma = next) {
		next = vma->vm_next;
		if (vma->vm_start < start)
			err = split_vma(vma->vm_mm, vma, start, 1);
		if (!err && vma->vm_end > end)
			err = split_vma(vma->vm_mm, vma, end, 0);
		if (!err)
			err = policy_vma(vma, new);
		if (err)
			break;
	}
	return err;
}
/* Change policy for a memory range */
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
			  unsigned long mode,
			  unsigned long __user *nmask, unsigned long maxnode,
			  unsigned flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	DECLARE_BITMAP(nodes, MAX_NUMNODES);
	int err;

	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
		return -EINVAL;
	if (start & ~PAGE_MASK)
		return -EINVAL;
	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;
	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;
	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	err = get_nodes(nodes, nmask, maxnode, mode);
	if (err)
		return err;

	new = mpol_new(mode, nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);

	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
		 mode, nodes[0]);

	down_write(&mm->mmap_sem);
	vma = check_range(mm, start, end, nodes, flags);
	err = PTR_ERR(vma);
	if (!IS_ERR(vma))
		err = mbind_range(vma, start, end, new);
	up_write(&mm->mmap_sem);
	mpol_free(new);
	return err;
}
/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
				  unsigned long maxnode)
{
	int err;
	struct mempolicy *new;
	DECLARE_BITMAP(nodes, MAX_NUMNODES);

	if (mode > MPOL_MAX)
		return -EINVAL;
	err = get_nodes(nodes, nmask, maxnode, mode);
	if (err)
		return err;
	new = mpol_new(mode, nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);
	mpol_free(current->mempolicy);
	current->mempolicy = new;
	if (new && new->policy == MPOL_INTERLEAVE)
		current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
	return 0;
}
/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
{
	int i;

	bitmap_zero(nodes, MAX_NUMNODES);
	switch (p->policy) {
	case MPOL_BIND:
		for (i = 0; p->v.zonelist->zones[i]; i++)
			__set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
		break;
	case MPOL_DEFAULT:
		break;
	case MPOL_INTERLEAVE:
		bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
		break;
	case MPOL_PREFERRED:
		/* or use current node instead of online map? */
		if (p->v.preferred_node < 0)
			bitmap_copy(nodes, node_online_map, MAX_NUMNODES);
		else
			__set_bit(p->v.preferred_node, nodes);
		break;
	default:
		BUG();
	}
}
static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}
/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      void *nodes, unsigned nbytes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
}
/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
				  unsigned long __user *nmask,
				  unsigned long maxnode,
				  unsigned long addr, unsigned long flags)
{
	int err, pval;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
		return -EINVAL;
	if (nmask != NULL && maxnode < numnodes)
		return -EINVAL;
	if (flags & MPOL_F_ADDR) {
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			pval = err;
		} else if (pol == current->mempolicy &&
				pol->policy == MPOL_INTERLEAVE) {
			pval = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else
		pval = pol->policy;

	err = -EFAULT;
	if (policy && put_user(pval, policy))
		goto out;

	err = 0;
	if (nmask) {
		DECLARE_BITMAP(nodes, MAX_NUMNODES);
		get_zonemask(pol, nodes);
		err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
	}

 out:
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}
asmlinkage long compat_get_mempolicy(int __user *policy,
				     compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode,
				     compat_ulong_t addr, compat_ulong_t flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		err = copy_from_user(bm, nm, alloc_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}
asmlinkage long compat_set_mempolicy(int mode, compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_set_mempolicy(mode, nm, nr_bits+1);
}
asmlinkage long compat_mbind(compat_ulong_t start, compat_ulong_t len,
			     compat_ulong_t mode, compat_ulong_t __user *nmask,
			     compat_ulong_t maxnode, compat_ulong_t flags)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}
/* Return effective policy for a VMA */
static struct mempolicy *
get_vma_policy(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = current->mempolicy;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else if (vma->vm_policy &&
				vma->vm_policy->policy != MPOL_DEFAULT)
			pol = vma->vm_policy;
	}
	if (!pol)
		pol = &default_policy;
	return pol;
}
/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(unsigned gfp, struct mempolicy *policy)
{
	int nd;

	switch (policy->policy) {
	case MPOL_PREFERRED:
		nd = policy->v.preferred_node;
		if (nd < 0)
			nd = numa_node_id();
		break;
	case MPOL_BIND:
		/* Lower zones don't get a policy applied */
		if (gfp >= policy_zone)
			return policy->v.zonelist;
		/*FALL THROUGH*/
	case MPOL_INTERLEAVE: /* should not happen */
	case MPOL_DEFAULT:
		nd = numa_node_id();
		break;
	default:
		nd = 0;
		BUG();
	}
	return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK);
}
/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
	unsigned nid, next;
	struct task_struct *me = current;

	nid = me->il_next;
	BUG_ON(nid >= MAX_NUMNODES);
	next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
	if (next >= MAX_NUMNODES)
		next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
	me->il_next = next;
	return nid;
}
/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
			       struct vm_area_struct *vma, unsigned long off)
{
	unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
	unsigned target = (unsigned)off % nnodes;
	int c;
	int nid = -1;

	c = 0;
	do {
		nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
		c++;
	} while (c <= target);
	BUG_ON(nid >= MAX_NUMNODES);
	BUG_ON(!test_bit(nid, pol->v.nodes));
	return nid;
}
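/*
 * Worked example (added for illustration, not in the original): with
 * pol->v.nodes = {0, 2, 5} and off = 7, nnodes = 3 and target = 7 % 3 = 1,
 * so the loop above stops after the second set bit and returns node 2.
 * The same offset therefore always maps to the same node, which is what
 * makes the VMA interleaving stable across faults.
 */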
/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(unsigned gfp, unsigned order, unsigned nid)
{
	struct zonelist *zl;
	struct page *page;

	BUG_ON(!test_bit(nid, node_online_map));
	zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
	page = __alloc_pages(gfp, order, zl);
	if (page && page_zone(page) == zl->zones[0]) {
		zl->zones[0]->pageset[get_cpu()].interleave_hit++;
		put_cpu();
	}
	return page;
}
/**
 *	alloc_page_vma	- Allocate a page for a VMA.
 *
 *	@gfp:
 *	%GFP_USER    user allocation.
 *	%GFP_KERNEL  kernel allocations,
 *	%GFP_HIGHMEM highmem/user allocations,
 *	%GFP_FS      allocation should not call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 *
 *	@vma:  Pointer to VMA or NULL if not available.
 *	@addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 *	This function allocates a page from the kernel page pool and applies
 *	a NUMA policy associated with the VMA or the current process.
 *	When VMA is not NULL the caller must hold down_read on the mmap_sem of
 *	the mm_struct of the VMA to prevent it from going away. Should be used
 *	for all allocations for pages that will be mapped into user space.
 *	Returns NULL when no page can be allocated.
 *
 *	Should be called with the mm_sem of the vma held.
 */
struct page *
alloc_page_vma(unsigned gfp, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(vma, addr);

	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
		unsigned nid;
		if (vma) {
			unsigned long off = vma->vm_pgoff;
			BUG_ON(addr >= vma->vm_end);
			BUG_ON(addr < vma->vm_start);
			off += (addr - vma->vm_start) >> PAGE_SHIFT;
			nid = offset_il_node(pol, vma, off);
		} else {
			/* fall back to process interleaving */
			nid = interleave_nodes(pol);
		}
		return alloc_page_interleave(gfp, 0, nid);
	}
	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}
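/*
 * Sketch of a typical caller (illustrative, not code from this file): an
 * anonymous fault handler already holds mmap_sem for read, so it can hand
 * the faulting VMA and address straight to alloc_page_vma() and let the VMA
 * or process policy pick the node. example_anon_fault is a made-up name.
 */
static int example_anon_fault(struct vm_area_struct *vma, unsigned long address)
{
	struct page *page = alloc_page_vma(GFP_HIGHUSER, vma, address);

	if (!page)
		return VM_FAULT_OOM;
	clear_user_highpage(page, address);
	/* ... map the page into the page tables ... */
	return VM_FAULT_MINOR;
}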
/**
 *	alloc_pages_current - Allocate pages.
 *
 *	@gfp:
 *	%GFP_USER    user allocation,
 *	%GFP_KERNEL  kernel allocation,
 *	%GFP_HIGHMEM highmem allocation,
 *	%GFP_FS      don't call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 *	@order: Power of two of allocation size in pages. 0 is a single page.
 *
 *	Allocate pages from the kernel page pool, applying the current
 *	process' NUMA policy when not in interrupt context.
 *	Returns NULL when no page can be allocated.
 */
struct page *alloc_pages_current(unsigned gfp, unsigned order)
{
	struct mempolicy *pol = current->mempolicy;

	if (!pol || in_interrupt())
		pol = &default_policy;
	if (pol->policy == MPOL_INTERLEAVE)
		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);
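/*
 * Usage note (added for illustration): on NUMA kernels the generic
 * alloc_pages() helper resolves to alloc_pages_current(), so an ordinary
 * allocation like the sketch below follows the calling process' policy
 * without the caller doing anything NUMA specific. example_alloc is a
 * made-up name.
 */
static void example_alloc(void)
{
	struct page *pages = alloc_pages(GFP_KERNEL, 2);	/* 2^2 = 4 pages */

	if (pages)
		__free_pages(pages, 2);
}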
/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);
	*new = *old;
	atomic_set(&new->refcnt, 1);
	if (new->policy == MPOL_BIND) {
		int sz = ksize(old->v.zonelist);
		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
		if (!new->v.zonelist) {
			kmem_cache_free(policy_cache, new);
			return ERR_PTR(-ENOMEM);
		}
		memcpy(new->v.zonelist, old->v.zonelist, sz);
	}
	return new;
}
/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	if (!a || !b)
		return 0;
	if (a->policy != b->policy)
		return 0;
	switch (a->policy) {
	case MPOL_DEFAULT:
		return 1;
	case MPOL_INTERLEAVE:
		return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
	case MPOL_PREFERRED:
		return a->v.preferred_node == b->v.preferred_node;
	case MPOL_BIND: {
		int i;
		for (i = 0; a->v.zonelist->zones[i]; i++)
			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
				return 0;
		return b->v.zonelist->zones[i] == NULL;
	}
	default:
		BUG();
		return 0;
	}
}
/* Slow path of a mpol destructor. */
void __mpol_free(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	if (p->policy == MPOL_BIND)
		kfree(p->v.zonelist);
	p->policy = MPOL_DEFAULT;
	kmem_cache_free(policy_cache, p);
}
/*
 * Hugetlb policy. Same as above, just works with node numbers instead of
 * zonelists.
 */

/* Find first node suitable for an allocation */
int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(vma, addr);

	switch (pol->policy) {
	case MPOL_DEFAULT:
		return numa_node_id();
	case MPOL_BIND:
		return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
	case MPOL_INTERLEAVE:
		return interleave_nodes(pol);
	case MPOL_PREFERRED:
		return pol->v.preferred_node >= 0 ?
			pol->v.preferred_node : numa_node_id();
	}
	BUG();
	return 0;
}
/* Find secondary valid nodes for an allocation */
int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(vma, addr);

	switch (pol->policy) {
	case MPOL_PREFERRED:
	case MPOL_DEFAULT:
	case MPOL_INTERLEAVE:
		return 1;
	case MPOL_BIND: {
		struct zone **z;
		for (z = pol->v.zonelist->zones; *z; z++)
			if ((*z)->zone_pgdat->node_id == nid)
				return 1;
		return 0;
	}
	default:
		BUG();
		return 0;
	}
}
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->sem semaphore, which should be held
 * for any accesses to the tree.
 */
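/*
 * Sketch (not from this file) of the vm_operations_struct hooks a filesystem
 * such as tmpfs wires up so that VMA policies on its mappings land in the
 * shared tree; example_sp(), standing in for "fetch the shared_policy from
 * the per-inode info", is a made-up helper.
 */
static int example_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
{
	return mpol_set_shared_policy(example_sp(vma), vma, new);
}

static struct mempolicy *example_get_policy(struct vm_area_struct *vma,
					    unsigned long addr)
{
	unsigned long idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

	return mpol_shared_policy_lookup(example_sp(vma), idx);
}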
/* lookup first element intersecting start-end */
/* Caller holds sp->sem */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
	struct rb_node *n = sp->root.rb_node;

	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);
		if (start >= p->end) {
			n = n->rb_right;
		} else if (end < p->start) {
			n = n->rb_left;
		} else
			break;
	}
	if (!n)
		return NULL;
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}
/* Insert a new shared policy into the list. */
/* Caller holds sp->sem */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;
	struct sp_node *nd;

	while (*p) {
		parent = *p;
		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
			p = &(*p)->rb_left;
		else if (new->end > nd->end)
			p = &(*p)->rb_right;
		else
			BUG();
	}
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
		 new->policy ? new->policy->policy : 0);
}
/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
	struct mempolicy *pol = NULL;
	struct sp_node *sn;

	down(&sp->sem);
	sn = sp_lookup(sp, idx, idx+1);
	if (sn) {
		mpol_get(sn->policy);
		pol = sn->policy;
	}
	up(&sp->sem);
	return pol;
}
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	PDprintk("deleting %lx-%lx\n", n->start, n->end);
	rb_erase(&n->nd, &sp->root);
	mpol_free(n->policy);
	kmem_cache_free(sn_cache, n);
}
static struct sp_node *
sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
{
	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

	if (!n)
		return NULL;
	n->start = start;
	n->end = end;
	mpol_get(pol);
	n->policy = pol;
	return n;
}
/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				 unsigned long end, struct sp_node *new)
{
	struct sp_node *n, *new2;

	down(&sp->sem);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				new2 = sp_alloc(end, n->end, n->policy);
				if (!new2) {
					up(&sp->sem);
					return -ENOMEM;
				}
				n->end = start;
				sp_insert(sp, new2);
				break;
			}
			/* Old crossing beginning, but not end (easy) */
			if (n->start < start && n->end > start)
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	up(&sp->sem);
	return 0;
}
int mpol_set_shared_policy(struct shared_policy *info,
			   struct vm_area_struct *vma, struct mempolicy *npol)
{
	int err;
	struct sp_node *new = NULL;
	unsigned long sz = vma_pages(vma);

	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
		 vma->vm_pgoff,
		 sz, npol ? npol->policy : -1,
		 npol ? npol->v.nodes[0] : -1);

	if (npol) {
		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
		if (!new)
			return -ENOMEM;
	}
	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
	if (err && new)
		kmem_cache_free(sn_cache, new);
	return err;
}
/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
	struct sp_node *n;
	struct rb_node *next;

	down(&p->sem);
	next = rb_first(&p->root);
	while (next) {
		n = rb_entry(next, struct sp_node, nd);
		next = rb_next(&n->nd);
		rb_erase(&n->nd, &p->root);
		mpol_free(n->policy);
		kmem_cache_free(sn_cache, n);
	}
	up(&p->sem);
}
/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL, NULL);

	/* Set interleaving policy for system init. This way not all
	   the data structures allocated at system boot end up in node zero. */
	if (sys_set_mempolicy(MPOL_INTERLEAVE, node_online_map, MAX_NUMNODES) < 0)
		printk("numa_policy_init: interleaving failed\n");
}
/* Reset policy of current process to default.
 * Assumes fs == KERNEL_DS */
void numa_default_policy(void)
{
	sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
}