/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave	Allocate memory interleaved over a set of nodes,
 *		with normal fallback if it fails.
 *		For VMA based allocations this interleaves based on the
 *		offset into the backing object or offset into the mapping
 *		for anonymous memory. For process policy a process counter
 *		is used.
 *
 * bind		Only allocate memory on a specific set of nodes,
 *		no fallback.
 *		FIXME: memory is allocated starting with the first node
 *		to the last. It would be better if bind would truly restrict
 *		the allocation to memory nodes instead.
 *
 * preferred	Try a specific node first before normal fallback.
 *		As a special case node -1 here means do the allocation
 *		on the local CPU. This is normally identical to default,
 *		but useful to set in a VMA when you have a non default
 *		process policy.
 *
 * default	Allocate on the local node first, or when on a VMA
 *		use the process policy. This is what Linux always did
 *		in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
   could replace all the switch()es with a mempolicy_ops structure.
*/
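
/*
 * Illustrative userspace sketch, not part of this file: the policies above
 * are normally requested through the syscall interface declared in
 * <numaif.h> (libnuma). The mask value and sizes below are made-up example
 * values, not taken from this code.
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);	   (nodes 0 and 1)
 *
 *	Interleave all future allocations of the calling process:
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8);
 *
 *	Bind one mapping, reporting EIO if existing pages already violate it:
 *	void *p = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	mbind(p, 1 << 20, MPOL_BIND, &mask, sizeof(mask) * 8, MPOL_MF_STRICT);
 */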
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;

#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
static int policy_zone;

struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.policy = MPOL_DEFAULT,
};
/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, nodemask_t *nodes)
{
	int empty = nodes_empty(*nodes);

	switch (mode) {
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
		/* Preferred will only use the first bit, but allow
		   more for now. */
		if (empty)
			return -EINVAL;
	}
	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}
/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
	struct zonelist *zl;
	int num, max, nd, k;

	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
	if (!zl)
		return NULL;
	num = 0;
	for_each_node_mask(nd, *nodes) {
		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
			struct zone *z = &NODE_DATA(nd)->node_zones[k];
			if (!z->present_pages)
				continue;
			zl->zones[num++] = z;
		}
	}
	zl->zones[num] = NULL;
	return zl;
}
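
/*
 * Worked example (illustrative; the zone count is an assumption): with
 * MAX_NR_ZONES == 3 and nodes = {0,2}, max is 1 + 3*2 = 7 pointer slots.
 * The loops then append node 0's populated zones from highest to lowest,
 * followed by node 2's, and NULL-terminate the list, e.g.:
 *
 *	zl->zones = { node0/HIGHMEM, node0/NORMAL, node0/DMA,
 *		      node2/NORMAL, node2/DMA, NULL }
 */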
/* Create a new policy */
static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
{
	struct mempolicy *policy;

	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
	if (mode == MPOL_DEFAULT)
		return NULL;
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	switch (mode) {
	case MPOL_INTERLEAVE:
		policy->v.nodes = *nodes;
		break;
	case MPOL_PREFERRED:
		policy->v.preferred_node = first_node(*nodes);
		if (policy->v.preferred_node >= MAX_NUMNODES)
			policy->v.preferred_node = -1;
		break;
	case MPOL_BIND:
		policy->v.zonelist = bind_zonelist(nodes);
		if (policy->v.zonelist == NULL) {
			kmem_cache_free(policy_cache, policy);
			return ERR_PTR(-ENOMEM);
		}
		break;
	}
	policy->policy = mode;
	return policy;
}
/* Ensure all existing pages follow the policy. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
{
	pte_t *orig_pte;
	pte_t *pte;
	spinlock_t *ptl;

	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		unsigned long pfn;
		unsigned int nid;

		if (!pte_present(*pte))
			continue;
		pfn = pte_pfn(*pte);
		if (!pfn_valid(pfn)) {
			print_bad_pte(vma, *pte, addr);
			continue;
		}
		nid = pfn_to_nid(pfn);
		if (!node_isset(nid, *nodes))
			break;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(orig_pte, ptl);
	return addr != end;
}
static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		if (check_pte_range(vma, pmd, addr, next, nodes))
			return -EIO;
	} while (pmd++, addr = next, addr != end);
	return 0;
}
static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		if (check_pmd_range(vma, pud, addr, next, nodes))
			return -EIO;
	} while (pud++, addr = next, addr != end);
	return 0;
}
static inline int check_pgd_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
{
	pgd_t *pgd;
	unsigned long next;

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		if (check_pud_range(vma, pgd, addr, next, nodes))
			return -EIO;
	} while (pgd++, addr = next, addr != end);
	return 0;
}
/* Step 1: check the range */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
	    nodemask_t *nodes, unsigned long flags)
{
	int err;
	struct vm_area_struct *first, *vma, *prev;

	first = find_vma(mm, start);
	if (!first)
		return ERR_PTR(-EFAULT);
	if (first->vm_flags & VM_RESERVED)
		return ERR_PTR(-EACCES);
	prev = NULL;
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		if (!vma->vm_next && vma->vm_end < end)
			return ERR_PTR(-EFAULT);
		if (prev && prev->vm_end < vma->vm_start)
			return ERR_PTR(-EFAULT);
		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
			unsigned long endvma = vma->vm_end;
			if (endvma > end)
				endvma = end;
			if (vma->vm_start > start)
				start = vma->vm_start;
			err = check_pgd_range(vma, start, endvma, nodes);
			if (err) {
				first = ERR_PTR(err);
				break;
			}
		}
		prev = vma;
	}
	return first;
}
/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
	int err = 0;
	struct mempolicy *old = vma->vm_policy;

	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	if (vma->vm_ops && vma->vm_ops->set_policy)
		err = vma->vm_ops->set_policy(vma, new);
	if (!err) {
		mpol_get(new);
		vma->vm_policy = new;
		mpol_free(old);
	}
	return err;
}
/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, struct mempolicy *new)
{
	struct vm_area_struct *next;
	int err;

	err = 0;
	for (; vma && vma->vm_start < end; vma = next) {
		next = vma->vm_next;
		if (vma->vm_start < start)
			err = split_vma(vma->vm_mm, vma, start, 1);
		if (!err && vma->vm_end > end)
			err = split_vma(vma->vm_mm, vma, end, 0);
		if (!err)
			err = policy_vma(vma, new);
		if (err)
			break;
	}
	return err;
}
static int contextualize_policy(int mode, nodemask_t *nodes)
{
	if (!nodes)
		return 0;

	/* Update current mems_allowed */
	cpuset_update_current_mems_allowed();
	/* Ignore nodes not set in current->mems_allowed */
	cpuset_restrict_to_mems_allowed(nodes->bits);
	return mpol_check_policy(mode, nodes);
}
long do_mbind(unsigned long start, unsigned long len,
		unsigned long mode, nodemask_t *nmask, unsigned long flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int err;

	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
		return -EINVAL;
	if (start & ~PAGE_MASK)
		return -EINVAL;
	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;
	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;
	if (mpol_check_policy(mode, nmask))
		return -EINVAL;
	new = mpol_new(mode, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
			mode, nodes_addr(*nmask)[0]);

	down_write(&mm->mmap_sem);
	vma = check_range(mm, start, end, nmask, flags);
	err = PTR_ERR(vma);
	if (!IS_ERR(vma))
		err = mbind_range(vma, start, end, new);
	up_write(&mm->mmap_sem);
	mpol_free(new);
	return err;
}
/* Set the process memory policy */
long do_set_mempolicy(int mode, nodemask_t *nodes)
{
	struct mempolicy *new;

	if (contextualize_policy(mode, nodes))
		return -EINVAL;
	new = mpol_new(mode, nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);
	mpol_free(current->mempolicy);
	current->mempolicy = new;
	if (new && new->policy == MPOL_INTERLEAVE)
		current->il_next = first_node(new->v.nodes);
	return 0;
}
/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
{
	int i;

	nodes_clear(*nodes);
	switch (p->policy) {
	case MPOL_BIND:
		for (i = 0; p->v.zonelist->zones[i]; i++)
			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
				*nodes);
		break;
	case MPOL_INTERLEAVE:
		*nodes = p->v.nodes;
		break;
	case MPOL_PREFERRED:
		/* or use current node instead of online map? */
		if (p->v.preferred_node < 0)
			*nodes = node_online_map;
		else
			node_set(p->v.preferred_node, *nodes);
		break;
	}
}
static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}
/* Retrieve NUMA policy */
long do_get_mempolicy(int *policy, nodemask_t *nmask,
			unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
		return -EINVAL;
	if (flags & MPOL_F_ADDR) {
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	}

	if (!pol)
		pol = &default_policy;

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->policy == MPOL_INTERLEAVE) {
			*policy = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else
		*policy = pol->policy;

	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	err = 0;
	if (nmask)
		get_zonemask(pol, nmask);

 out:
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}
/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
		     unsigned long maxnode)
{
	unsigned long k;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/* When the user specified more nodes than supported just check
	   if the non supported part is all zero. */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		if (nlongs > PAGE_SIZE/sizeof(long))
			return -EINVAL;
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			unsigned long t;
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes_addr(*nodes)[nlongs-1] &= endmask;
	return 0;
}
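
/*
 * Worked example (illustrative): after the --maxnode adjustment, a caller
 * passing maxnode = 65 on a 64-bit kernel describes 64 valid bits, so
 * nlongs = 1 and endmask = ~0UL; maxnode = 11 gives 10 valid bits, so
 * nlongs = 1 and endmask = (1UL << 10) - 1 = 0x3ff, and the final
 * "&= endmask" clears bits the caller never meant to pass.
 */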
/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
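
/*
 * Worked example (illustrative, assuming MAX_NUMNODES = 64): the kernel
 * mask is nbytes = 8 bytes. A caller asking for maxnode = 1024 gets
 * copy = ALIGN(1023, 64) / 8 = 128 bytes, so the 120 bytes beyond the
 * kernel mask are cleared with clear_user() before the 8 real bytes are
 * copied out.
 */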
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
			unsigned long mode,
			unsigned long __user *nmask, unsigned long maxnode,
			unsigned flags)
{
	nodemask_t nodes;
	int err;

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_mbind(start, len, mode, &nodes, flags);
}
/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
		unsigned long maxnode)
{
	int err;
	nodemask_t nodes;

	if (mode < 0 || mode > MPOL_MAX)
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_set_mempolicy(mode, &nodes);
}
/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
				unsigned long __user *nmask,
				unsigned long maxnode,
				unsigned long addr, unsigned long flags)
{
	int err, pval;
	nodemask_t nodes;

	if (nmask != NULL && maxnode < MAX_NUMNODES)
		return -EINVAL;

	err = do_get_mempolicy(&pval, &nodes, addr, flags);
	if (err)
		return err;

	if (policy && put_user(pval, policy))
		return -EFAULT;

	if (nmask)
		err = copy_nodes_to_user(nmask, maxnode, &nodes);

	return err;
}
asmlinkage long compat_sys_get_mempolicy(int __user *policy,
				     compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode,
				     compat_ulong_t addr, compat_ulong_t flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		err = copy_from_user(bm, nm, alloc_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}
asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_set_mempolicy(mode, nm, nr_bits+1);
}
asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
			     compat_ulong_t mode, compat_ulong_t __user *nmask,
			     compat_ulong_t maxnode, compat_ulong_t flags)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	nodemask_t bm;

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}
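
/*
 * Worked example (illustrative, assuming MAX_NUMNODES >= 32): a 32-bit task
 * passing maxnode = 33 yields nr_bits = 32 and alloc_size = ALIGN(32, 64)/8
 * = 8 on a 64-bit kernel, so one native word is staged with
 * compat_alloc_user_space(), filled from the 32-bit bitmap by
 * compat_get_bitmap(), and handed to sys_mbind() with maxnode = nr_bits + 1.
 */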
/* Return effective policy for a VMA */
static struct mempolicy *
get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = task->mempolicy;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else if (vma->vm_policy &&
				vma->vm_policy->policy != MPOL_DEFAULT)
			pol = vma->vm_policy;
	}
	if (!pol)
		pol = &default_policy;
	return pol;
}
/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
{
	int nd;

	switch (policy->policy) {
	case MPOL_PREFERRED:
		nd = policy->v.preferred_node;
		if (nd < 0)
			nd = numa_node_id();
		break;
	case MPOL_BIND:
		/* Lower zones don't get a policy applied */
		/* Careful: current->mems_allowed might have moved */
		if (gfp_zone(gfp) >= policy_zone)
			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
				return policy->v.zonelist;
		/*FALL THROUGH*/
	case MPOL_INTERLEAVE: /* should not happen */
	default:
		nd = numa_node_id();
		break;
	}
	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
}
/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
	unsigned nid, next;
	struct task_struct *me = current;

	nid = me->il_next;
	next = next_node(nid, policy->v.nodes);
	if (next >= MAX_NUMNODES)
		next = first_node(policy->v.nodes);
	me->il_next = next;
	return nid;
}
/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
		struct vm_area_struct *vma, unsigned long off)
{
	unsigned nnodes = nodes_weight(pol->v.nodes);
	unsigned target = (unsigned)off % nnodes;
	int c;
	int nid = -1;

	c = 0;
	do {
		nid = next_node(nid, pol->v.nodes);
		c++;
	} while (c <= target);
	return nid;
}
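
/*
 * Worked example (illustrative): with pol->v.nodes = {1,3,5} (nnodes = 3)
 * and a VMA page at logical offset off = 7, target = 7 % 3 = 1, so the
 * do/while walks the nodemask once past node 1 and returns node 3, the
 * second node in the mask. Consecutive offsets therefore map to nodes
 * 1, 3, 5, 1, 3, 5, ... regardless of which CPU takes the fault.
 */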
/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
					unsigned nid)
{
	struct zonelist *zl;
	struct page *page;

	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
	page = __alloc_pages(gfp, order, zl);
	if (page && page_zone(page) == zl->zones[0]) {
		zone_pcp(zl->zones[0], get_cpu())->interleave_hit++;
		put_cpu();
	}
	return page;
}
/**
 * alloc_page_vma - Allocate a page for a VMA.
 *
 * @gfp:
 *	%GFP_USER    user allocation.
 *	%GFP_KERNEL  kernel allocations,
 *	%GFP_HIGHMEM highmem/user allocations,
 *	%GFP_FS      allocation should not call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 *
 * @vma:  Pointer to VMA or NULL if not available.
 * @addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
 * When VMA is not NULL the caller must hold down_read on the mmap_sem of the
 * mm_struct of the VMA to prevent it from going away. Should be used for
 * all allocations for pages that will be mapped into
 * user space. Returns NULL when no page can be allocated.
 */
struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	cpuset_update_current_mems_allowed();

	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
		unsigned nid;
		if (vma) {
			unsigned long off = vma->vm_pgoff;
			off += (addr - vma->vm_start) >> PAGE_SHIFT;
			nid = offset_il_node(pol, vma, off);
		} else {
			/* fall back to process interleaving */
			nid = interleave_nodes(pol);
		}
		return alloc_page_interleave(gfp, 0, nid);
	}
	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}
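
/*
 * Illustrative caller sketch (an assumption, not code from this file): a
 * fault handler already holding mmap_sem for read could allocate the
 * backing page for a user address like this:
 *
 *	struct page *page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 *	if (!page)
 *		return VM_FAULT_OOM;
 */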
/**
 * alloc_pages_current - Allocate pages.
 *
 * @gfp:
 *	%GFP_USER    user allocation,
 *	%GFP_KERNEL  kernel allocation,
 *	%GFP_HIGHMEM highmem allocation,
 *	%GFP_FS      don't call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 * @order: Power of two of allocation size in pages. 0 is a single page.
 *
 * Allocate a page from the kernel page pool. When not in interrupt
 * context, apply the current process NUMA policy.
 * Returns NULL when no page can be allocated.
 *
 * Don't call cpuset_update_current_mems_allowed() unless
 * 1) it's ok to take cpuset_sem (can WAIT), and
 * 2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol = current->mempolicy;

	if ((gfp & __GFP_WAIT) && !in_interrupt())
		cpuset_update_current_mems_allowed();
	if (!pol || in_interrupt())
		pol = &default_policy;
	if (pol->policy == MPOL_INTERLEAVE)
		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);
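
/*
 * Illustrative note: on NUMA kernels the generic alloc_pages(gfp, order)
 * helper resolves to alloc_pages_current(), so e.g.
 *
 *	struct page *p = alloc_pages(GFP_KERNEL, 2);	(4 contiguous pages)
 *
 * picks its zonelist through the calling task's mempolicy, unless called
 * from interrupt context, where default_policy is used instead.
 */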
/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);
	*new = *old;
	atomic_set(&new->refcnt, 1);
	if (new->policy == MPOL_BIND) {
		int sz = ksize(old->v.zonelist);
		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
		if (!new->v.zonelist) {
			kmem_cache_free(policy_cache, new);
			return ERR_PTR(-ENOMEM);
		}
		memcpy(new->v.zonelist, old->v.zonelist, sz);
	}
	return new;
}
/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	int i;

	if (!a || !b)
		return 0;
	if (a->policy != b->policy)
		return 0;
	switch (a->policy) {
	case MPOL_DEFAULT:
		return 1;
	case MPOL_INTERLEAVE:
		return nodes_equal(a->v.nodes, b->v.nodes);
	case MPOL_PREFERRED:
		return a->v.preferred_node == b->v.preferred_node;
	case MPOL_BIND:
		for (i = 0; a->v.zonelist->zones[i]; i++)
			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
				return 0;
		return b->v.zonelist->zones[i] == NULL;
	}
	return 0;
}
/* Slow path of a mpol destructor. */
void __mpol_free(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	if (p->policy == MPOL_BIND)
		kfree(p->v.zonelist);
	p->policy = MPOL_DEFAULT;
	kmem_cache_free(policy_cache, p);
}
/*
 * Hugetlb policy. Same as above, just works with node numbers instead of
 * zonelists.
 */

/* Find first node suitable for an allocation */
int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	switch (pol->policy) {
	case MPOL_DEFAULT:
		return numa_node_id();
	case MPOL_BIND:
		return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
	case MPOL_INTERLEAVE:
		return interleave_nodes(pol);
	case MPOL_PREFERRED:
		return pol->v.preferred_node >= 0 ?
				pol->v.preferred_node : numa_node_id();
	}
	BUG();
	return 0;
}
/* Find secondary valid nodes for an allocation */
int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);
	struct zone **z;

	switch (pol->policy) {
	case MPOL_PREFERRED:
	case MPOL_DEFAULT:
	case MPOL_INTERLEAVE:
		return 1;
	case MPOL_BIND:
		for (z = pol->v.zonelist->zones; *z; z++)
			if ((*z)->zone_pgdat->node_id == nid)
				return 1;
		return 0;
	default:
		BUG();
		return 0;
	}
}
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */
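
/*
 * Worked example (illustrative): after an application mbind()s two ranges
 * of a tmpfs file, the tree indexed by page offset might hold
 *
 *	[0, 16)  -> MPOL_BIND       {0}
 *	[16, 32) -> MPOL_INTERLEAVE {0,1,2,3}
 *
 * sp_lookup(sp, 20, 21) returns the second node; ranges never overlap, so
 * shared_policy_replace() splits or trims old entries as needed.
 */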
/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
	struct rb_node *n = sp->root.rb_node;

	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);

		if (start >= p->end)
			n = n->rb_right;
		else if (end <= p->start)
			n = n->rb_left;
		else
			break;
	}
	if (!n)
		return NULL;
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}
/* Insert a new shared policy into the list. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;
	struct sp_node *nd;

	while (*p) {
		parent = *p;
		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
			p = &(*p)->rb_left;
		else if (new->end > nd->end)
			p = &(*p)->rb_right;
		else
			BUG();
	}
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
		 new->policy ? new->policy->policy : 0);
}
/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
	struct mempolicy *pol = NULL;
	struct sp_node *sn;

	if (!sp->root.rb_node)
		return NULL;
	spin_lock(&sp->lock);
	sn = sp_lookup(sp, idx, idx+1);
	if (sn) {
		mpol_get(sn->policy);
		pol = sn->policy;
	}
	spin_unlock(&sp->lock);
	return pol;
}
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	PDprintk("deleting %lx-%lx\n", n->start, n->end);
	rb_erase(&n->nd, &sp->root);
	mpol_free(n->policy);
	kmem_cache_free(sn_cache, n);
}
static struct sp_node *
sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
{
	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

	if (!n)
		return NULL;
	n->start = start;
	n->end = end;
	mpol_get(pol);
	n->policy = pol;
	return n;
}
/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				 unsigned long end, struct sp_node *new)
{
	struct sp_node *n, *new2 = NULL;

restart:
	spin_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				if (!new2) {
					spin_unlock(&sp->lock);
					new2 = sp_alloc(end, n->end, n->policy);
					if (!new2)
						return -ENOMEM;
					goto restart;
				}
				n->end = start;
				sp_insert(sp, new2);
				new2 = NULL;
				break;
			} else
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	spin_unlock(&sp->lock);
	if (new2) {
		mpol_free(new2->policy);
		kmem_cache_free(sn_cache, new2);
	}
	return 0;
}
int mpol_set_shared_policy(struct shared_policy *info,
			struct vm_area_struct *vma, struct mempolicy *npol)
{
	int err;
	struct sp_node *new = NULL;
	unsigned long sz = vma_pages(vma);

	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
		 vma->vm_pgoff,
		 sz, npol ? npol->policy : -1,
		 npol ? nodes_addr(npol->v.nodes)[0] : -1);

	if (npol) {
		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
		if (!new)
			return -ENOMEM;
	}
	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
	if (err && new)
		kmem_cache_free(sn_cache, new);
	return err;
}
/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
	struct sp_node *n;
	struct rb_node *next;

	if (!p->root.rb_node)
		return;
	spin_lock(&p->lock);
	next = rb_first(&p->root);
	while (next) {
		n = rb_entry(next, struct sp_node, nd);
		next = rb_next(&n->nd);
		rb_erase(&n->nd, &p->root);
		mpol_free(n->policy);
		kmem_cache_free(sn_cache, n);
	}
	spin_unlock(&p->lock);
}
/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL, NULL);

	/* Set interleaving policy for system init. This way not all
	   the data structures allocated at system boot end up in node zero. */
	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
		printk("numa_policy_init: interleaving failed\n");
}
/* Reset policy of current process to default */
void numa_default_policy(void)
{
	do_set_mempolicy(MPOL_DEFAULT, NULL);
}