/*
 * Copyright (C) 2009 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */

#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/mm_inline.h>
#include <linux/kthread.h>
#include <linux/khugepaged.h>
#include <asm/pgalloc.h>
/*
 * By default transparent hugepage support is enabled for all mappings
 * and khugepaged scans all mappings. Defrag is only invoked by
 * khugepaged hugepage allocations and by page faults inside
 * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
 * allocations.
 */
unsigned long transparent_hugepage_flags __read_mostly =
	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
/* default scan 8*512 pte (or vmas) every 30 second */
static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
static unsigned int khugepaged_pages_collapsed;
static unsigned int khugepaged_full_scans;
static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
/* during fragmentation poll the hugepage allocator once every minute */
static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
static struct task_struct *khugepaged_thread __read_mostly;
static DEFINE_MUTEX(khugepaged_mutex);
static DEFINE_SPINLOCK(khugepaged_mm_lock);
static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
/*
 * default collapse hugepages if there is at least one pte mapped like
 * it would have happened if the vma was large enough during page
 * fault.
 */
static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;

static int khugepaged(void *none);
static int mm_slots_hash_init(void);
static int khugepaged_slab_init(void);
static void khugepaged_slab_free(void);
#define MM_SLOTS_HASH_HEADS 1024
static struct hlist_head *mm_slots_hash __read_mostly;
static struct kmem_cache *mm_slot_cache __read_mostly;
/**
 * struct mm_slot - hash lookup from mm to mm_slot
 * @hash: hash collision list
 * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
 * @mm: the mm that this information is valid for
 */
struct mm_slot {
	struct hlist_node hash;
	struct list_head mm_node;
	struct mm_struct *mm;
};
/**
 * struct khugepaged_scan - cursor for scanning
 * @mm_head: the head of the mm list to scan
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 *
 * There is only the one khugepaged_scan instance of this cursor structure.
 */
struct khugepaged_scan {
	struct list_head mm_head;
	struct mm_slot *mm_slot;
	unsigned long address;
} khugepaged_scan = {
	.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};
static int start_khugepaged(void)
	if (khugepaged_enabled()) {
		if (unlikely(!mm_slot_cache || !mm_slots_hash)) {
		mutex_lock(&khugepaged_mutex);
		if (!khugepaged_thread)
			khugepaged_thread = kthread_run(khugepaged, NULL,
		if (unlikely(IS_ERR(khugepaged_thread))) {
			       "khugepaged: kthread_run(khugepaged) failed\n");
			err = PTR_ERR(khugepaged_thread);
			khugepaged_thread = NULL;
		wakeup = !list_empty(&khugepaged_scan.mm_head);
		mutex_unlock(&khugepaged_mutex);
			wake_up_interruptible(&khugepaged_wait);
		wake_up_interruptible(&khugepaged_wait);
static ssize_t double_flag_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf,
				enum transparent_hugepage_flag enabled,
				enum transparent_hugepage_flag req_madv)
	if (test_bit(enabled, &transparent_hugepage_flags)) {
		VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
		return sprintf(buf, "[always] madvise never\n");
	} else if (test_bit(req_madv, &transparent_hugepage_flags))
		return sprintf(buf, "always [madvise] never\n");
	return sprintf(buf, "always madvise [never]\n");
static ssize_t double_flag_store(struct kobject *kobj,
				 struct kobj_attribute *attr,
				 const char *buf, size_t count,
				 enum transparent_hugepage_flag enabled,
				 enum transparent_hugepage_flag req_madv)
	if (!memcmp("always", buf,
		    min(sizeof("always")-1, count))) {
		set_bit(enabled, &transparent_hugepage_flags);
		clear_bit(req_madv, &transparent_hugepage_flags);
	} else if (!memcmp("madvise", buf,
			   min(sizeof("madvise")-1, count))) {
		clear_bit(enabled, &transparent_hugepage_flags);
		set_bit(req_madv, &transparent_hugepage_flags);
	} else if (!memcmp("never", buf,
			   min(sizeof("never")-1, count))) {
		clear_bit(enabled, &transparent_hugepage_flags);
		clear_bit(req_madv, &transparent_hugepage_flags);
static ssize_t enabled_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
	return double_flag_show(kobj, attr, buf,
				TRANSPARENT_HUGEPAGE_FLAG,
				TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
static ssize_t enabled_store(struct kobject *kobj,
			     struct kobj_attribute *attr,
			     const char *buf, size_t count)
	ret = double_flag_store(kobj, attr, buf, count,
				TRANSPARENT_HUGEPAGE_FLAG,
				TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
		int err = start_khugepaged();

static struct kobj_attribute enabled_attr =
	__ATTR(enabled, 0644, enabled_show, enabled_store);
static ssize_t single_flag_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf,
				enum transparent_hugepage_flag flag)
	if (test_bit(flag, &transparent_hugepage_flags))
		return sprintf(buf, "[yes] no\n");
	return sprintf(buf, "yes [no]\n");
static ssize_t single_flag_store(struct kobject *kobj,
				 struct kobj_attribute *attr,
				 const char *buf, size_t count,
				 enum transparent_hugepage_flag flag)
	if (!memcmp("yes", buf,
		    min(sizeof("yes")-1, count))) {
		set_bit(flag, &transparent_hugepage_flags);
	} else if (!memcmp("no", buf,
		   min(sizeof("no")-1, count))) {
		clear_bit(flag, &transparent_hugepage_flags);
/*
 * Currently defrag only disables __GFP_NOWAIT for allocation. A blind
 * __GFP_REPEAT is too aggressive, it's never worth swapping tons of
 * memory just to allocate one more hugepage.
 */
static ssize_t defrag_show(struct kobject *kobj,
			   struct kobj_attribute *attr, char *buf)
	return double_flag_show(kobj, attr, buf,
				TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
				TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
static ssize_t defrag_store(struct kobject *kobj,
			    struct kobj_attribute *attr,
			    const char *buf, size_t count)
	return double_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
				 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);

static struct kobj_attribute defrag_attr =
	__ATTR(defrag, 0644, defrag_show, defrag_store);
#ifdef CONFIG_DEBUG_VM
static ssize_t debug_cow_show(struct kobject *kobj,
			      struct kobj_attribute *attr, char *buf)
	return single_flag_show(kobj, attr, buf,
				TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);

static ssize_t debug_cow_store(struct kobject *kobj,
			       struct kobj_attribute *attr,
			       const char *buf, size_t count)
	return single_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);

static struct kobj_attribute debug_cow_attr =
	__ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
#endif /* CONFIG_DEBUG_VM */
static struct attribute *hugepage_attr[] = {
#ifdef CONFIG_DEBUG_VM
	&debug_cow_attr.attr,

static struct attribute_group hugepage_attr_group = {
	.attrs = hugepage_attr,
static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
					 struct kobj_attribute *attr,
	return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);

static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
					  struct kobj_attribute *attr,
					  const char *buf, size_t count)
	err = strict_strtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
	khugepaged_scan_sleep_millisecs = msecs;
	wake_up_interruptible(&khugepaged_wait);

static struct kobj_attribute scan_sleep_millisecs_attr =
	__ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
	       scan_sleep_millisecs_store);
static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
					  struct kobj_attribute *attr,
	return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);

static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
					   struct kobj_attribute *attr,
					   const char *buf, size_t count)
	err = strict_strtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
	khugepaged_alloc_sleep_millisecs = msecs;
	wake_up_interruptible(&khugepaged_wait);

static struct kobj_attribute alloc_sleep_millisecs_attr =
	__ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
	       alloc_sleep_millisecs_store);
static ssize_t pages_to_scan_show(struct kobject *kobj,
				  struct kobj_attribute *attr,
	return sprintf(buf, "%u\n", khugepaged_pages_to_scan);

static ssize_t pages_to_scan_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
	err = strict_strtoul(buf, 10, &pages);
	if (err || !pages || pages > UINT_MAX)
	khugepaged_pages_to_scan = pages;

static struct kobj_attribute pages_to_scan_attr =
	__ATTR(pages_to_scan, 0644, pages_to_scan_show,
	       pages_to_scan_store);
static ssize_t pages_collapsed_show(struct kobject *kobj,
				    struct kobj_attribute *attr,
	return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
static struct kobj_attribute pages_collapsed_attr =
	__ATTR_RO(pages_collapsed);

static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr,
	return sprintf(buf, "%u\n", khugepaged_full_scans);
static struct kobj_attribute full_scans_attr =
	__ATTR_RO(full_scans);
static ssize_t khugepaged_defrag_show(struct kobject *kobj,
				      struct kobj_attribute *attr, char *buf)
	return single_flag_show(kobj, attr, buf,
				TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);

static ssize_t khugepaged_defrag_store(struct kobject *kobj,
				       struct kobj_attribute *attr,
				       const char *buf, size_t count)
	return single_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);

static struct kobj_attribute khugepaged_defrag_attr =
	__ATTR(defrag, 0644, khugepaged_defrag_show,
	       khugepaged_defrag_store);
/*
 * max_ptes_none controls if khugepaged should collapse hugepages over
 * any unmapped ptes in turn potentially increasing the memory
 * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
 * reduce the available free memory in the system as it
 * runs. Increasing max_ptes_none will instead potentially reduce the
 * free memory in the system during the khugepaged scan.
 */
static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
					     struct kobj_attribute *attr,
	return sprintf(buf, "%u\n", khugepaged_max_ptes_none);

static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
					      struct kobj_attribute *attr,
					      const char *buf, size_t count)
	unsigned long max_ptes_none;
	err = strict_strtoul(buf, 10, &max_ptes_none);
	if (err || max_ptes_none > HPAGE_PMD_NR-1)
	khugepaged_max_ptes_none = max_ptes_none;

static struct kobj_attribute khugepaged_max_ptes_none_attr =
	__ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
	       khugepaged_max_ptes_none_store);
static struct attribute *khugepaged_attr[] = {
	&khugepaged_defrag_attr.attr,
	&khugepaged_max_ptes_none_attr.attr,
	&pages_to_scan_attr.attr,
	&pages_collapsed_attr.attr,
	&full_scans_attr.attr,
	&scan_sleep_millisecs_attr.attr,
	&alloc_sleep_millisecs_attr.attr,

static struct attribute_group khugepaged_attr_group = {
	.attrs = khugepaged_attr,
	.name = "khugepaged",
#endif /* CONFIG_SYSFS */
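
/*
 * Illustrative usage of the sysfs interface defined above (the groups
 * are registered by hugepage_init() below on mm_kobj, i.e. under
 * /sys/kernel/mm/transparent_hugepage/); a sketch, assuming a kernel
 * built with CONFIG_SYSFS and this support enabled:
 *
 *	echo madvise > /sys/kernel/mm/transparent_hugepage/enabled
 *	echo always > /sys/kernel/mm/transparent_hugepage/defrag
 *	cat /sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
 *
 * The "khugepaged" subdirectory name comes from khugepaged_attr_group.name.
 */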
static int __init hugepage_init(void)
	static struct kobject *hugepage_kobj;

	hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
	if (unlikely(!hugepage_kobj)) {
		printk(KERN_ERR "hugepage: failed kobject create\n");
	err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group);
		printk(KERN_ERR "hugepage: failed to register hugepage group\n");
	err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group);
		printk(KERN_ERR "hugepage: failed to register hugepage group\n");
	err = khugepaged_slab_init();
	err = mm_slots_hash_init();
		khugepaged_slab_free();
module_init(hugepage_init)
static int __init setup_transparent_hugepage(char *str)
	if (!strcmp(str, "always")) {
		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
			&transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
	} else if (!strcmp(str, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			&transparent_hugepage_flags);
	} else if (!strcmp(str, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		"transparent_hugepage= cannot parse, ignored\n");
__setup("transparent_hugepage=", setup_transparent_hugepage);
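
/*
 * Illustrative kernel command line usage, parsed by
 * setup_transparent_hugepage() above:
 *
 *	transparent_hugepage=always
 *	transparent_hugepage=madvise
 *	transparent_hugepage=never
 *
 * Any other value is rejected with "transparent_hugepage= cannot parse".
 */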
static void prepare_pmd_huge_pte(pgtable_t pgtable,
				 struct mm_struct *mm)
	assert_spin_locked(&mm->page_table_lock);

	if (!mm->pmd_huge_pte)
		INIT_LIST_HEAD(&pgtable->lru);
	else
		list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
	mm->pmd_huge_pte = pgtable;
static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
	if (likely(vma->vm_flags & VM_WRITE))
		pmd = pmd_mkwrite(pmd);
static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
					struct vm_area_struct *vma,
					unsigned long haddr, pmd_t *pmd,
	VM_BUG_ON(!PageCompound(page));
	pgtable = pte_alloc_one(mm, haddr);
	if (unlikely(!pgtable)) {
		mem_cgroup_uncharge_page(page);

	clear_huge_page(page, haddr, HPAGE_PMD_NR);
	__SetPageUptodate(page);

	spin_lock(&mm->page_table_lock);
	if (unlikely(!pmd_none(*pmd))) {
		spin_unlock(&mm->page_table_lock);
		mem_cgroup_uncharge_page(page);
		pte_free(mm, pgtable);
		entry = mk_pmd(page, vma->vm_page_prot);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		entry = pmd_mkhuge(entry);
		/*
		 * The spinlocking to take the lru_lock inside
		 * page_add_new_anon_rmap() acts as a full memory
		 * barrier to be sure clear_huge_page writes become
		 * visible after the set_pmd_at() write.
		 */
		page_add_new_anon_rmap(page, vma, haddr);
		set_pmd_at(mm, haddr, pmd, entry);
		prepare_pmd_huge_pte(pgtable, mm);
		add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
		spin_unlock(&mm->page_table_lock);
static inline struct page *alloc_hugepage(int defrag)
	return alloc_pages(GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT),
int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
			       unsigned long address, pmd_t *pmd,
	unsigned long haddr = address & HPAGE_PMD_MASK;

	if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
		if (unlikely(anon_vma_prepare(vma)))
		if (unlikely(khugepaged_enter(vma)))
		page = alloc_hugepage(transparent_hugepage_defrag(vma));
		if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
		return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page);
	/*
	 * Use __pte_alloc instead of pte_alloc_map, because we can't
	 * run pte_offset_map on the pmd, if a huge pmd could
	 * materialize from under us from a different thread.
	 */
	if (unlikely(__pte_alloc(mm, vma, pmd, address)))
	/* if a huge pmd materialized from under us just retry later */
	if (unlikely(pmd_trans_huge(*pmd)))
	/*
	 * A regular pmd is established and it can't morph into a huge pmd
	 * from under us anymore at this point because we hold the mmap_sem
	 * read mode and khugepaged takes it in write mode. So now it's
	 * safe to run pte_offset_map().
	 */
	pte = pte_offset_map(pmd, address);
	return handle_pte_fault(mm, vma, address, pte, pmd, flags);
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
		  struct vm_area_struct *vma)
	struct page *src_page;

	pgtable = pte_alloc_one(dst_mm, addr);
	if (unlikely(!pgtable))

	spin_lock(&dst_mm->page_table_lock);
	spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);

	if (unlikely(!pmd_trans_huge(pmd))) {
		pte_free(dst_mm, pgtable);
	if (unlikely(pmd_trans_splitting(pmd))) {
		/* split huge page running from under us */
		spin_unlock(&src_mm->page_table_lock);
		spin_unlock(&dst_mm->page_table_lock);
		pte_free(dst_mm, pgtable);

		wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
	src_page = pmd_page(pmd);
	VM_BUG_ON(!PageHead(src_page));
	page_dup_rmap(src_page);
	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);

	pmdp_set_wrprotect(src_mm, addr, src_pmd);
	pmd = pmd_mkold(pmd_wrprotect(pmd));
	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
	prepare_pmd_huge_pte(pgtable, dst_mm);

	spin_unlock(&src_mm->page_table_lock);
	spin_unlock(&dst_mm->page_table_lock);
671 /* no "address" argument so destroys page coloring of some arch */
672 pgtable_t
get_pmd_huge_pte(struct mm_struct
*mm
)
676 assert_spin_locked(&mm
->page_table_lock
);
679 pgtable
= mm
->pmd_huge_pte
;
680 if (list_empty(&pgtable
->lru
))
681 mm
->pmd_huge_pte
= NULL
;
683 mm
->pmd_huge_pte
= list_entry(pgtable
->lru
.next
,
685 list_del(&pgtable
->lru
);
static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
					struct vm_area_struct *vma,
					unsigned long address,
					pmd_t *pmd, pmd_t orig_pmd,
	pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
	if (unlikely(!pages)) {

	for (i = 0; i < HPAGE_PMD_NR; i++) {
		pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
		if (unlikely(!pages[i] ||
			     mem_cgroup_newpage_charge(pages[i], mm,
			mem_cgroup_uncharge_start();
				mem_cgroup_uncharge_page(pages[i]);
			mem_cgroup_uncharge_end();

	for (i = 0; i < HPAGE_PMD_NR; i++) {
		copy_user_highpage(pages[i], page + i,
				   haddr + PAGE_SIZE*i, vma);
		__SetPageUptodate(pages[i]);

	spin_lock(&mm->page_table_lock);
	if (unlikely(!pmd_same(*pmd, orig_pmd)))
	VM_BUG_ON(!PageHead(page));

	pmdp_clear_flush_notify(vma, haddr, pmd);
	/* leave pmd empty until pte is filled */

	pgtable = get_pmd_huge_pte(mm);
	pmd_populate(mm, &_pmd, pgtable);

	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
		entry = mk_pte(pages[i], vma->vm_page_prot);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		page_add_new_anon_rmap(pages[i], vma, haddr);
		pte = pte_offset_map(&_pmd, haddr);
		VM_BUG_ON(!pte_none(*pte));
		set_pte_at(mm, haddr, pte, entry);

	smp_wmb(); /* make pte visible before pmd */
	pmd_populate(mm, pmd, pgtable);
	page_remove_rmap(page);
	spin_unlock(&mm->page_table_lock);

	ret |= VM_FAULT_WRITE;

	spin_unlock(&mm->page_table_lock);
	mem_cgroup_uncharge_start();
	for (i = 0; i < HPAGE_PMD_NR; i++) {
		mem_cgroup_uncharge_page(pages[i]);
	mem_cgroup_uncharge_end();
int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
	struct page *page, *new_page;

	VM_BUG_ON(!vma->anon_vma);
	spin_lock(&mm->page_table_lock);
	if (unlikely(!pmd_same(*pmd, orig_pmd)))

	page = pmd_page(orig_pmd);
	VM_BUG_ON(!PageCompound(page) || !PageHead(page));
	haddr = address & HPAGE_PMD_MASK;
	if (page_mapcount(page) == 1) {
		entry = pmd_mkyoung(orig_pmd);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
			update_mmu_cache(vma, address, entry);
		ret |= VM_FAULT_WRITE;
	spin_unlock(&mm->page_table_lock);

	if (transparent_hugepage_enabled(vma) &&
	    !transparent_hugepage_debug_cow())
		new_page = alloc_hugepage(transparent_hugepage_defrag(vma));

	if (unlikely(!new_page)) {
		ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
						   pmd, orig_pmd, page, haddr);

	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {

	copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
	__SetPageUptodate(new_page);

	spin_lock(&mm->page_table_lock);
	if (unlikely(!pmd_same(*pmd, orig_pmd))) {
		mem_cgroup_uncharge_page(new_page);
		VM_BUG_ON(!PageHead(page));
		entry = mk_pmd(new_page, vma->vm_page_prot);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		entry = pmd_mkhuge(entry);
		pmdp_clear_flush_notify(vma, haddr, pmd);
		page_add_new_anon_rmap(new_page, vma, haddr);
		set_pmd_at(mm, haddr, pmd, entry);
		update_mmu_cache(vma, address, entry);
		page_remove_rmap(page);
		ret |= VM_FAULT_WRITE;
	spin_unlock(&mm->page_table_lock);
struct page *follow_trans_huge_pmd(struct mm_struct *mm,
	struct page *page = NULL;

	assert_spin_locked(&mm->page_table_lock);

	if (flags & FOLL_WRITE && !pmd_write(*pmd))

	page = pmd_page(*pmd);
	VM_BUG_ON(!PageHead(page));
	if (flags & FOLL_TOUCH) {
		/*
		 * We should set the dirty bit only for FOLL_WRITE but
		 * for now the dirty bit in the pmd is meaningless.
		 * And if the dirty bit will become meaningful and
		 * we'll only set it with FOLL_WRITE, an atomic
		 * set_bit will be required on the pmd to set the
		 * young bit, instead of the current set_pmd_at.
		 */
		_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
		set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
	VM_BUG_ON(!PageCompound(page));
	if (flags & FOLL_GET)
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
	spin_lock(&tlb->mm->page_table_lock);
	if (likely(pmd_trans_huge(*pmd))) {
		if (unlikely(pmd_trans_splitting(*pmd))) {
			spin_unlock(&tlb->mm->page_table_lock);
			wait_split_huge_page(vma->anon_vma,
			pgtable = get_pmd_huge_pte(tlb->mm);
			page = pmd_page(*pmd);
			page_remove_rmap(page);
			VM_BUG_ON(page_mapcount(page) < 0);
			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
			VM_BUG_ON(!PageHead(page));
			spin_unlock(&tlb->mm->page_table_lock);
			tlb_remove_page(tlb, page);
			pte_free(tlb->mm, pgtable);
		spin_unlock(&tlb->mm->page_table_lock);
pmd_t *page_check_address_pmd(struct page *page,
			      struct mm_struct *mm,
			      unsigned long address,
			      enum page_check_address_pmd_flag flag)
	pmd_t *pmd, *ret = NULL;

	if (address & ~HPAGE_PMD_MASK)

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
	pmd = pmd_offset(pud, address);
	if (pmd_page(*pmd) != page)
	VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
		  pmd_trans_splitting(*pmd));
	if (pmd_trans_huge(*pmd)) {
		VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
			  !pmd_trans_splitting(*pmd));
static int __split_huge_page_splitting(struct page *page,
				       struct vm_area_struct *vma,
				       unsigned long address)
	struct mm_struct *mm = vma->vm_mm;

	spin_lock(&mm->page_table_lock);
	pmd = page_check_address_pmd(page, mm, address,
				     PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
		/*
		 * We can't temporarily set the pmd to null in order
		 * to split it, the pmd must remain marked huge at all
		 * times or the VM won't take the pmd_trans_huge paths
		 * and it won't wait on the anon_vma->root->lock to
		 * serialize against split_huge_page*.
		 */
		pmdp_splitting_flush_notify(vma, address, pmd);
	spin_unlock(&mm->page_table_lock);
static void __split_huge_page_refcount(struct page *page)
	unsigned long head_index = page->index;
	struct zone *zone = page_zone(page);

	/* prevent PageLRU to go away from under us, and freeze lru stats */
	spin_lock_irq(&zone->lru_lock);

	for (i = 1; i < HPAGE_PMD_NR; i++) {
		struct page *page_tail = page + i;

		/* tail_page->_count cannot change */
		atomic_sub(atomic_read(&page_tail->_count), &page->_count);
		BUG_ON(page_count(page) <= 0);
		atomic_add(page_mapcount(page) + 1, &page_tail->_count);
		BUG_ON(atomic_read(&page_tail->_count) <= 0);

		/* after clearing PageTail the gup refcount can be released */

		page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
		page_tail->flags |= (page->flags &
				     ((1L << PG_referenced) |
				      (1L << PG_swapbacked) |
				      (1L << PG_mlocked) |
				      (1L << PG_uptodate)));
		page_tail->flags |= (1L << PG_dirty);

		/*
		 * 1) clear PageTail before overwriting first_page
		 * 2) clear PageTail before clearing PageHead for VM_BUG_ON
		 */

		/*
		 * __split_huge_page_splitting() already set the
		 * splitting bit in all pmd that could map this
		 * hugepage, that will ensure no CPU can alter the
		 * mapcount on the head page. The mapcount is only
		 * accounted in the head page and it has to be
		 * transferred to all tail pages in the below code. So
		 * for this code to be safe, the mapcount can't change
		 * during the split. But that doesn't mean userland can't
		 * keep changing and reading the page contents while
		 * we transfer the mapcount, so the pmd splitting
		 * status is achieved setting a reserved bit in the
		 * pmd, not by clearing the present bit.
		 */
		BUG_ON(page_mapcount(page_tail));
		page_tail->_mapcount = page->_mapcount;

		BUG_ON(page_tail->mapping);
		page_tail->mapping = page->mapping;

		page_tail->index = ++head_index;

		BUG_ON(!PageAnon(page_tail));
		BUG_ON(!PageUptodate(page_tail));
		BUG_ON(!PageDirty(page_tail));
		BUG_ON(!PageSwapBacked(page_tail));

		lru_add_page_tail(zone, page, page_tail);

	__dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
	__mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);

	ClearPageCompound(page);
	compound_unlock(page);
	spin_unlock_irq(&zone->lru_lock);

	for (i = 1; i < HPAGE_PMD_NR; i++) {
		struct page *page_tail = page + i;
		BUG_ON(page_count(page_tail) <= 0);
		/*
		 * Tail pages may be freed if there wasn't any mapping
		 * like if add_to_swap() is running on a lru page that
		 * had its mapping zapped. And freeing these pages
		 * requires taking the lru_lock so we do the put_page
		 * of the tail pages after the split is complete.
		 */
		put_page(page_tail);

	/*
	 * Only the head page (now become a regular page) is required
	 * to be pinned by the caller.
	 */
	BUG_ON(page_count(page) <= 0);
static int __split_huge_page_map(struct page *page,
				 struct vm_area_struct *vma,
				 unsigned long address)
	struct mm_struct *mm = vma->vm_mm;
	unsigned long haddr;

	spin_lock(&mm->page_table_lock);
	pmd = page_check_address_pmd(page, mm, address,
				     PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
		pgtable = get_pmd_huge_pte(mm);
		pmd_populate(mm, &_pmd, pgtable);

		for (i = 0, haddr = address; i < HPAGE_PMD_NR;
		     i++, haddr += PAGE_SIZE) {
			BUG_ON(PageCompound(page+i));
			entry = mk_pte(page + i, vma->vm_page_prot);
			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
			if (!pmd_write(*pmd))
				entry = pte_wrprotect(entry);
			BUG_ON(page_mapcount(page) != 1);
			if (!pmd_young(*pmd))
				entry = pte_mkold(entry);
			pte = pte_offset_map(&_pmd, haddr);
			BUG_ON(!pte_none(*pte));
			set_pte_at(mm, haddr, pte, entry);

		smp_wmb(); /* make pte visible before pmd */
		/*
		 * Up to this point the pmd is present and huge and
		 * userland has the whole access to the hugepage
		 * during the split (which happens in place). If we
		 * overwrite the pmd with the not-huge version
		 * pointing to the pte here (which of course we could
		 * if all CPUs were bug free), userland could trigger
		 * a small page size TLB miss on the small sized TLB
		 * while the hugepage TLB entry is still established
		 * in the huge TLB. Some CPU doesn't like that. See
		 * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
		 * Erratum 383 on page 93. Intel should be safe but
		 * also warns that it's only safe if the permission
		 * and cache attributes of the two entries loaded in
		 * the two TLB are identical (which should be the case
		 * here). But it is generally safer to never allow
		 * small and huge TLB entries for the same virtual
		 * address to be loaded simultaneously. So instead of
		 * doing "pmd_populate(); flush_tlb_range();" we first
		 * mark the current pmd notpresent (atomically because
		 * here the pmd_trans_huge and pmd_trans_splitting
		 * must remain set at all times on the pmd until the
		 * split is complete for this pmd), then we flush the
		 * SMP TLB and finally we write the non-huge version
		 * of the pmd entry with pmd_populate.
		 */
		set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
		pmd_populate(mm, pmd, pgtable);
	spin_unlock(&mm->page_table_lock);
/* must be called with anon_vma->root->lock held */
static void __split_huge_page(struct page *page,
			      struct anon_vma *anon_vma)
	int mapcount, mapcount2;
	struct anon_vma_chain *avc;

	BUG_ON(!PageHead(page));
	BUG_ON(PageTail(page));

	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
		struct vm_area_struct *vma = avc->vma;
		unsigned long addr = vma_address(page, vma);
		BUG_ON(is_vma_temporary_stack(vma));
		if (addr == -EFAULT)
		mapcount += __split_huge_page_splitting(page, vma, addr);
	/*
	 * It is critical that new vmas are added to the tail of the
	 * anon_vma list. This guarantees that if copy_huge_pmd() runs
	 * and establishes a child pmd before
	 * __split_huge_page_splitting() freezes the parent pmd (so if
	 * we fail to prevent copy_huge_pmd() from running until the
	 * whole __split_huge_page() is complete), we will still see
	 * the newly established pmd of the child later during the
	 * walk, to be able to set it as pmd_trans_splitting too.
	 */
	if (mapcount != page_mapcount(page))
		printk(KERN_ERR "mapcount %d page_mapcount %d\n",
		       mapcount, page_mapcount(page));
	BUG_ON(mapcount != page_mapcount(page));

	__split_huge_page_refcount(page);

	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
		struct vm_area_struct *vma = avc->vma;
		unsigned long addr = vma_address(page, vma);
		BUG_ON(is_vma_temporary_stack(vma));
		if (addr == -EFAULT)
		mapcount2 += __split_huge_page_map(page, vma, addr);
	if (mapcount != mapcount2)
		printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n",
		       mapcount, mapcount2, page_mapcount(page));
	BUG_ON(mapcount != mapcount2);
int split_huge_page(struct page *page)
	struct anon_vma *anon_vma;

	BUG_ON(!PageAnon(page));
	anon_vma = page_lock_anon_vma(page);
	if (!PageCompound(page))

	BUG_ON(!PageSwapBacked(page));
	__split_huge_page(page, anon_vma);

	BUG_ON(PageCompound(page));
	page_unlock_anon_vma(anon_vma);
int hugepage_madvise(unsigned long *vm_flags)
	/*
	 * Be somewhat over-protective like KSM for now!
	 */
	if (*vm_flags & (VM_HUGEPAGE | VM_SHARED | VM_MAYSHARE |
			 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
			 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
			 VM_MIXEDMAP | VM_SAO))
	*vm_flags |= VM_HUGEPAGE;
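
/*
 * Illustrative caller's view (a sketch, not part of the code paths in
 * this file): hugepage_madvise() is reached from the madvise(MADV_HUGEPAGE)
 * path and sets VM_HUGEPAGE on the vma flags unless one of the
 * incompatible VM_* flags above is present. From userspace, for example:
 *
 *	posix_memalign(&buf, 2UL << 20, 2UL << 20);
 *	madvise(buf, 2UL << 20, MADV_HUGEPAGE);
 */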
static int __init khugepaged_slab_init(void)
	mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
					  sizeof(struct mm_slot),
					  __alignof__(struct mm_slot), 0, NULL);

static void __init khugepaged_slab_free(void)
	kmem_cache_destroy(mm_slot_cache);
	mm_slot_cache = NULL;
static inline struct mm_slot *alloc_mm_slot(void)
	if (!mm_slot_cache)	/* initialization failed */
	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);

static inline void free_mm_slot(struct mm_slot *mm_slot)
	kmem_cache_free(mm_slot_cache, mm_slot);
static int __init mm_slots_hash_init(void)
	mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),

static void __init mm_slots_hash_free(void)
	kfree(mm_slots_hash);
	mm_slots_hash = NULL;
static struct mm_slot *get_mm_slot(struct mm_struct *mm)
	struct mm_slot *mm_slot;
	struct hlist_head *bucket;
	struct hlist_node *node;

	bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
				% MM_SLOTS_HASH_HEADS];
	hlist_for_each_entry(mm_slot, node, bucket, hash) {
		if (mm == mm_slot->mm)
static void insert_to_mm_slots_hash(struct mm_struct *mm,
				    struct mm_slot *mm_slot)
	struct hlist_head *bucket;

	bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
				% MM_SLOTS_HASH_HEADS];
	hlist_add_head(&mm_slot->hash, bucket);
static inline int khugepaged_test_exit(struct mm_struct *mm)
	return atomic_read(&mm->mm_users) == 0;
int __khugepaged_enter(struct mm_struct *mm)
	struct mm_slot *mm_slot;

	mm_slot = alloc_mm_slot();

	/* __khugepaged_exit() must not run from under us */
	VM_BUG_ON(khugepaged_test_exit(mm));
	if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
		free_mm_slot(mm_slot);

	spin_lock(&khugepaged_mm_lock);
	insert_to_mm_slots_hash(mm, mm_slot);
	/*
	 * Insert just behind the scanning cursor, to let the area settle
	 * down a little.
	 */
	wakeup = list_empty(&khugepaged_scan.mm_head);
	list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
	spin_unlock(&khugepaged_mm_lock);

	atomic_inc(&mm->mm_count);
		wake_up_interruptible(&khugepaged_wait);
int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
	unsigned long hstart, hend;
		/*
		 * Not yet faulted in so we will register later in the
		 * page fault if needed.
		 */
	if (vma->vm_file || vma->vm_ops)
		/* khugepaged not yet working on file or special mappings */
	VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
	hend = vma->vm_end & HPAGE_PMD_MASK;
		return khugepaged_enter(vma);
void __khugepaged_exit(struct mm_struct *mm)
	struct mm_slot *mm_slot;

	spin_lock(&khugepaged_mm_lock);
	mm_slot = get_mm_slot(mm);
	if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
		hlist_del(&mm_slot->hash);
		list_del(&mm_slot->mm_node);

		spin_unlock(&khugepaged_mm_lock);
		clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
		free_mm_slot(mm_slot);
	} else if (mm_slot) {
		spin_unlock(&khugepaged_mm_lock);
		/*
		 * This is required to serialize against
		 * khugepaged_test_exit() (which is guaranteed to run
		 * under mmap sem read mode). Stop here (after we
		 * return all pagetables will be destroyed) until
		 * khugepaged has finished working on the pagetables
		 * under the mmap_sem.
		 */
		down_write(&mm->mmap_sem);
		up_write(&mm->mmap_sem);
		spin_unlock(&khugepaged_mm_lock);
static void release_pte_page(struct page *page)
	/* 0 stands for page_is_file_cache(page) == false */
	dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
	putback_lru_page(page);

static void release_pte_pages(pte_t *pte, pte_t *_pte)
	while (--_pte >= pte) {
		pte_t pteval = *_pte;
		if (!pte_none(pteval))
			release_pte_page(pte_page(pteval));

static void release_all_pte_pages(pte_t *pte)
	release_pte_pages(pte, pte + HPAGE_PMD_NR);
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
					unsigned long address,
	int referenced = 0, isolated = 0, none = 0;
	for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
	     _pte++, address += PAGE_SIZE) {
		pte_t pteval = *_pte;
		if (pte_none(pteval)) {
			if (++none <= khugepaged_max_ptes_none)
			release_pte_pages(pte, _pte);
		if (!pte_present(pteval) || !pte_write(pteval)) {
			release_pte_pages(pte, _pte);
		page = vm_normal_page(vma, address, pteval);
		if (unlikely(!page)) {
			release_pte_pages(pte, _pte);
		VM_BUG_ON(PageCompound(page));
		BUG_ON(!PageAnon(page));
		VM_BUG_ON(!PageSwapBacked(page));

		/* cannot use mapcount: can't collapse if there's a gup pin */
		if (page_count(page) != 1) {
			release_pte_pages(pte, _pte);
		/*
		 * We can do it before isolate_lru_page because the
		 * page can't be freed from under us. NOTE: PG_lock
		 * is needed to serialize against split_huge_page
		 * when invoked from the VM.
		 */
		if (!trylock_page(page)) {
			release_pte_pages(pte, _pte);
		/*
		 * Isolate the page to avoid collapsing a hugepage
		 * currently in use by the VM.
		 */
		if (isolate_lru_page(page)) {
			release_pte_pages(pte, _pte);
		/* 0 stands for page_is_file_cache(page) == false */
		inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
		VM_BUG_ON(!PageLocked(page));
		VM_BUG_ON(PageLRU(page));

		/* If there is no mapped pte young don't collapse the page */
		if (pte_young(pteval))
	if (unlikely(!referenced))
		release_all_pte_pages(pte);
static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
				      struct vm_area_struct *vma,
				      unsigned long address,
	for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
		pte_t pteval = *_pte;
		struct page *src_page;

		if (pte_none(pteval)) {
			clear_user_highpage(page, address);
			add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
			src_page = pte_page(pteval);
			copy_user_highpage(page, src_page, address, vma);
			VM_BUG_ON(page_mapcount(src_page) != 1);
			VM_BUG_ON(page_count(src_page) != 2);
			release_pte_page(src_page);
			/*
			 * ptl mostly unnecessary, but preempt has to
			 * be disabled to update the per-cpu stats
			 * inside page_remove_rmap().
			 */
			/*
			 * paravirt calls inside pte_clear here are
			 * superfluous.
			 */
			pte_clear(vma->vm_mm, address, _pte);
			page_remove_rmap(src_page);
			free_page_and_swap_cache(src_page);

		address += PAGE_SIZE;
static void collapse_huge_page(struct mm_struct *mm,
			       unsigned long address,
			       struct page **hpage)
	struct vm_area_struct *vma;
	struct page *new_page;
	unsigned long hstart, hend;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	/*
	 * Prevent all access to pagetables with the exception of
	 * gup_fast later handled by the ptep_clear_flush and the VM
	 * handled by the anon_vma lock + PG_lock.
	 */
	down_write(&mm->mmap_sem);
	if (unlikely(khugepaged_test_exit(mm)))

	vma = find_vma(mm, address);
	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
	hend = vma->vm_end & HPAGE_PMD_MASK;
	if (address < hstart || address + HPAGE_PMD_SIZE > hend)

	if (!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always())

	/* VM_PFNMAP vmas may have vm_ops null but vm_file set */
	if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
	VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
	pmd = pmd_offset(pud, address);
	/* pmd can't go away or become huge under us */
	if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))

	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))

	anon_vma_lock(vma->anon_vma);

	pte = pte_offset_map(pmd, address);
	ptl = pte_lockptr(mm, pmd);

	spin_lock(&mm->page_table_lock); /* probably unnecessary */
	/*
	 * After this gup_fast can't run anymore. This also removes
	 * any huge TLB entry from the CPU so we won't allow
	 * huge and small TLB entries for the same virtual address
	 * to avoid the risk of CPU bugs in that area.
	 */
	_pmd = pmdp_clear_flush_notify(vma, address, pmd);
	spin_unlock(&mm->page_table_lock);

	isolated = __collapse_huge_page_isolate(vma, address, pte);

	if (unlikely(!isolated)) {
		spin_lock(&mm->page_table_lock);
		BUG_ON(!pmd_none(*pmd));
		set_pmd_at(mm, address, pmd, _pmd);
		spin_unlock(&mm->page_table_lock);
		anon_vma_unlock(vma->anon_vma);
		mem_cgroup_uncharge_page(new_page);

	/*
	 * All pages are isolated and locked so anon_vma rmap
	 * can't run anymore.
	 */
	anon_vma_unlock(vma->anon_vma);

	__collapse_huge_page_copy(pte, new_page, vma, address, ptl);
	__SetPageUptodate(new_page);
	pgtable = pmd_pgtable(_pmd);
	VM_BUG_ON(page_count(pgtable) != 1);
	VM_BUG_ON(page_mapcount(pgtable) != 0);

	_pmd = mk_pmd(new_page, vma->vm_page_prot);
	_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
	_pmd = pmd_mkhuge(_pmd);

	/*
	 * spin_lock() below is not the equivalent of smp_wmb(), so
	 * this is needed to avoid the copy_huge_page writes becoming
	 * visible after the set_pmd_at() write.
	 */
	spin_lock(&mm->page_table_lock);
	BUG_ON(!pmd_none(*pmd));
	page_add_new_anon_rmap(new_page, vma, address);
	set_pmd_at(mm, address, pmd, _pmd);
	update_mmu_cache(vma, address, entry);
	prepare_pmd_huge_pte(pgtable, mm);
	spin_unlock(&mm->page_table_lock);

	khugepaged_pages_collapsed++;

	up_write(&mm->mmap_sem);
static int khugepaged_scan_pmd(struct mm_struct *mm,
			       struct vm_area_struct *vma,
			       unsigned long address,
			       struct page **hpage)
	int ret = 0, referenced = 0, none = 0;
	unsigned long _address;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))

	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
	for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
	     _pte++, _address += PAGE_SIZE) {
		pte_t pteval = *_pte;
		if (pte_none(pteval)) {
			if (++none <= khugepaged_max_ptes_none)
		if (!pte_present(pteval) || !pte_write(pteval))
		page = vm_normal_page(vma, _address, pteval);
		if (unlikely(!page))
		VM_BUG_ON(PageCompound(page));
		if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
		/* cannot use mapcount: can't collapse if there's a gup pin */
		if (page_count(page) != 1)
		if (pte_young(pteval))
	pte_unmap_unlock(pte, ptl);
		up_read(&mm->mmap_sem);
		collapse_huge_page(mm, address, hpage);
static void collect_mm_slot(struct mm_slot *mm_slot)
	struct mm_struct *mm = mm_slot->mm;

	VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));

	if (khugepaged_test_exit(mm)) {
		hlist_del(&mm_slot->hash);
		list_del(&mm_slot->mm_node);

		/*
		 * Not strictly needed because the mm exited already.
		 *
		 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
		 */

		/* khugepaged_mm_lock actually not necessary for the below */
		free_mm_slot(mm_slot);
static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
					    struct page **hpage)
	struct mm_slot *mm_slot;
	struct mm_struct *mm;
	struct vm_area_struct *vma;

	VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));

	if (khugepaged_scan.mm_slot)
		mm_slot = khugepaged_scan.mm_slot;
	else {
		mm_slot = list_entry(khugepaged_scan.mm_head.next,
				     struct mm_slot, mm_node);
		khugepaged_scan.address = 0;
		khugepaged_scan.mm_slot = mm_slot;
	spin_unlock(&khugepaged_mm_lock);

	down_read(&mm->mmap_sem);
	if (unlikely(khugepaged_test_exit(mm)))
		vma = find_vma(mm, khugepaged_scan.address);

	for (; vma; vma = vma->vm_next) {
		unsigned long hstart, hend;

		if (unlikely(khugepaged_test_exit(mm))) {
		if (!(vma->vm_flags & VM_HUGEPAGE) &&
		    !khugepaged_always()) {
		/* VM_PFNMAP vmas may have vm_ops null but vm_file set */
		if (!vma->anon_vma || vma->vm_ops || vma->vm_file) {
			khugepaged_scan.address = vma->vm_end;
		VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));

		hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
		hend = vma->vm_end & HPAGE_PMD_MASK;
		if (hstart >= hend) {
		if (khugepaged_scan.address < hstart)
			khugepaged_scan.address = hstart;
		if (khugepaged_scan.address > hend) {
			khugepaged_scan.address = hend + HPAGE_PMD_SIZE;
		BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);

		while (khugepaged_scan.address < hend) {
			if (unlikely(khugepaged_test_exit(mm)))
				goto breakouterloop;
			VM_BUG_ON(khugepaged_scan.address < hstart ||
				  khugepaged_scan.address + HPAGE_PMD_SIZE >
			ret = khugepaged_scan_pmd(mm, vma,
						  khugepaged_scan.address,
			/* move to next address */
			khugepaged_scan.address += HPAGE_PMD_SIZE;
			progress += HPAGE_PMD_NR;
				/* we released mmap_sem so break loop */
				goto breakouterloop_mmap_sem;
			if (progress >= pages)
				goto breakouterloop;

	up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
breakouterloop_mmap_sem:

	spin_lock(&khugepaged_mm_lock);
	BUG_ON(khugepaged_scan.mm_slot != mm_slot);
	/*
	 * Release the current mm_slot if this mm is about to die, or
	 * if we scanned all vmas of this mm.
	 */
	if (khugepaged_test_exit(mm) || !vma) {
		/*
		 * Make sure that if mm_users is reaching zero while
		 * khugepaged runs here, khugepaged_exit will find
		 * mm_slot not pointing to the exiting mm.
		 */
		if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
			khugepaged_scan.mm_slot = list_entry(
				mm_slot->mm_node.next,
				struct mm_slot, mm_node);
			khugepaged_scan.address = 0;
		} else {
			khugepaged_scan.mm_slot = NULL;
			khugepaged_full_scans++;

		collect_mm_slot(mm_slot);
static int khugepaged_has_work(void)
	return !list_empty(&khugepaged_scan.mm_head) &&
		khugepaged_enabled();

static int khugepaged_wait_event(void)
	return !list_empty(&khugepaged_scan.mm_head) ||
		!khugepaged_enabled();
static void khugepaged_do_scan(struct page **hpage)
	unsigned int progress = 0, pass_through_head = 0;
	unsigned int pages = khugepaged_pages_to_scan;

	barrier(); /* write khugepaged_pages_to_scan to local stack */

	while (progress < pages) {
			*hpage = alloc_hugepage(khugepaged_defrag());
			if (unlikely(!*hpage))

		spin_lock(&khugepaged_mm_lock);
		if (!khugepaged_scan.mm_slot)
			pass_through_head++;
		if (khugepaged_has_work() &&
		    pass_through_head < 2)
			progress += khugepaged_scan_mm_slot(pages - progress,
		spin_unlock(&khugepaged_mm_lock);
static struct page *khugepaged_alloc_hugepage(void)
		hpage = alloc_hugepage(khugepaged_defrag());
			add_wait_queue(&khugepaged_wait, &wait);
			schedule_timeout_interruptible(
				khugepaged_alloc_sleep_millisecs));
			remove_wait_queue(&khugepaged_wait, &wait);
	} while (unlikely(!hpage) &&
		 likely(khugepaged_enabled()));
static void khugepaged_loop(void)
	while (likely(khugepaged_enabled())) {
		hpage = khugepaged_alloc_hugepage();
		if (unlikely(!hpage))

		khugepaged_do_scan(&hpage);
		if (khugepaged_has_work()) {
			if (!khugepaged_scan_sleep_millisecs)
			add_wait_queue(&khugepaged_wait, &wait);
			schedule_timeout_interruptible(
				khugepaged_scan_sleep_millisecs));
			remove_wait_queue(&khugepaged_wait, &wait);
		} else if (khugepaged_enabled())
			wait_event_interruptible(khugepaged_wait,
						 khugepaged_wait_event());
static int khugepaged(void *none)
	struct mm_slot *mm_slot;

	set_user_nice(current, 19);

	/* serialize with start_khugepaged() */
	mutex_lock(&khugepaged_mutex);
	mutex_unlock(&khugepaged_mutex);
	BUG_ON(khugepaged_thread != current);
	BUG_ON(khugepaged_thread != current);

	mutex_lock(&khugepaged_mutex);
	if (!khugepaged_enabled())

	spin_lock(&khugepaged_mm_lock);
	mm_slot = khugepaged_scan.mm_slot;
	khugepaged_scan.mm_slot = NULL;
		collect_mm_slot(mm_slot);
	spin_unlock(&khugepaged_mm_lock);

	khugepaged_thread = NULL;
	mutex_unlock(&khugepaged_mutex);
void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
	spin_lock(&mm->page_table_lock);
	if (unlikely(!pmd_trans_huge(*pmd))) {
		spin_unlock(&mm->page_table_lock);
	page = pmd_page(*pmd);
	VM_BUG_ON(!page_count(page));
	spin_unlock(&mm->page_table_lock);

	split_huge_page(page);

	BUG_ON(pmd_trans_huge(*pmd));