sys-kernel/xanmod-hybrid: Update To v5.12.9 (#948)
gentoo-zh.git: sys-kernel/xanmod-hybrid/files/v1-uksm.patch
blob 3321eaa8ee58867857a93d1d12f3b75d0e69194c
1 From 9a42006b641bc8e0c333174a9bf269ac9450d521 Mon Sep 17 00:00:00 2001
2 From: Piotr Gorski <lucjan.lucjanov@gmail.com>
3 Date: Tue, 13 Apr 2021 16:27:12 +0200
4 Subject: [PATCH] UKSM for 5.12
6 Signed-off-by: Piotr Gorski <lucjan.lucjanov@gmail.com>
7 ---
8 Documentation/vm/uksm.txt | 61 +
9 fs/exec.c | 1 +
10 fs/proc/meminfo.c | 4 +
11 include/linux/ksm.h | 43 +-
12 include/linux/mm_types.h | 3 +
13 include/linux/mmzone.h | 3 +
14 include/linux/pgtable.h | 17 +-
15 include/linux/sradix-tree.h | 77 +
16 include/linux/uksm.h | 149 +
17 kernel/fork.c | 2 +-
18 lib/Makefile | 2 +-
19 lib/sradix-tree.c | 476 +++
20 mm/Kconfig | 26 +
21 mm/Makefile | 3 +-
22 mm/ksm.c | 11 -
23 mm/memory.c | 33 +-
24 mm/mmap.c | 37 +
25 mm/uksm.c | 5614 +++++++++++++++++++++++++++++++++++
26 mm/vmstat.c | 3 +
27 19 files changed, 6539 insertions(+), 26 deletions(-)
28 create mode 100644 Documentation/vm/uksm.txt
29 create mode 100644 include/linux/sradix-tree.h
30 create mode 100644 include/linux/uksm.h
31 create mode 100644 lib/sradix-tree.c
32 create mode 100644 mm/uksm.c
34 diff --git a/Documentation/vm/uksm.txt b/Documentation/vm/uksm.txt
35 new file mode 100644
36 index 000000000..be19a3127
37 --- /dev/null
38 +++ b/Documentation/vm/uksm.txt
39 @@ -0,0 +1,61 @@
40 +The Ultra Kernel Samepage Merging feature
41 +----------------------------------------------
42 +/*
43 + * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
44 + *
45 + * This is an improvement upon KSM. Some basic data structures and routines
46 + * are borrowed from ksm.c .
47 + *
48 + * Its new features:
49 + * 1. Full system scan:
50 + * It automatically scans all user processes' anonymous VMAs. Kernel-user
51 + * interaction to submit a memory area to KSM is no longer needed.
52 + *
53 + * 2. Rich area detection:
54 + * It automatically detects rich areas containing abundant duplicated
55 + * pages. Rich areas are given a full scan speed. Poor areas are
56 + * sampled at a reasonable speed with very low CPU consumption.
57 + *
58 + * 3. Ultra Per-page scan speed improvement:
59 + * A new hash algorithm is proposed. As a result, on a machine with
60 + * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it
61 + * can scan memory areas that do not contain duplicated pages at a speed of
62 + * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of
63 + * 477MB/sec ~ 923MB/sec.
64 + *
65 + * 4. Thrashing area avoidance:
66 + * Thrashing areas (VMAs that have frequent KSM page break-outs) can be
67 + * filtered out. My benchmark shows it's more efficient than KSM's per-page
68 + * hash value based volatile page detection.
69 + *
70 + *
71 + * 5. Misc changes upon KSM:
72 + * * It has a fully x86-optimized memcmp dedicated for 4-byte-aligned page
73 + * comparison. It's much faster than default C version on x86.
74 + * * rmap_item now has a struct page *page member to loosely cache an
75 + * address->page mapping, which avoids many time-costly
76 + * follow_page() calls.
77 + * * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
78 + * * try_to_merge_two_pages() now can revert a pte if it fails. No break_
79 + * ksm is needed for this case.
80 + *
81 + * 6. Full Zero Page consideration (contributed by Figo Zhang)
82 + * Now uksmd considers full zero pages as special pages and merges them into a
83 + * special unswappable uksm zero page.
84 + */
86 +ChangeLog:
88 +2012-05-05 The creation of this Doc
89 +2012-05-08 UKSM 0.1.1.1 libc crash bug fix, api clean up, doc clean up.
90 +2012-05-28 UKSM 0.1.1.2 bug fix release
91 +2012-06-26 UKSM 0.1.2-beta1 first beta release for 0.1.2
92 +2012-07-02 UKSM 0.1.2-beta2
93 +2012-07-10 UKSM 0.1.2-beta3
94 +2012-07-26 UKSM 0.1.2 Fine grained speed control, more scan optimization.
95 +2012-10-13 UKSM 0.1.2.1 Bug fixes.
96 +2012-12-31 UKSM 0.1.2.2 Minor bug fixes.
97 +2014-07-02 UKSM 0.1.2.3 Fix a "__this_cpu_read() in preemptible" bug.
98 +2015-04-22 UKSM 0.1.2.4 Fix a race condition that can sometimes trigger annoying warnings.
99 +2016-09-10 UKSM 0.1.2.5 Fix a bug in dedup ratio calculation.
100 +2017-02-26 UKSM 0.1.2.6 Fix a bug in hugetlbpage handling and a race bug with page migration.
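Because UKSM scans every anonymous VMA on its own, user space never needs madvise(MADV_MERGEABLE). A minimal user-space sketch of that behaviour follows (illustrative only, not part of the patch; the /sys/kernel/mm/uksm/ path in the comment is an assumption, since UKSM's sysfs interface is not shown in this hunk):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 64UL << 20;	/* 64 MiB of identical pages */
	unsigned char *a = mmap(NULL, len, PROT_READ | PROT_WRITE,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	unsigned char *b = mmap(NULL, len, PROT_READ | PROT_WRITE,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (a == MAP_FAILED || b == MAP_FAILED)
		return 1;
	memset(a, 0xAB, len);		/* same non-zero pattern in both buffers */
	memset(b, 0xAB, len);
	/* No madvise() here: uksmd picks these VMAs up by itself. While this
	 * sleeps, the pages_sharing counter (assumed to live under
	 * /sys/kernel/mm/uksm/) should grow. */
	puts("buffers filled, sleeping while uksmd scans...");
	sleep(60);
	return 0;
}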
101 diff --git a/fs/exec.c b/fs/exec.c
102 index 18594f11c..aee636fd4 100644
103 --- a/fs/exec.c
104 +++ b/fs/exec.c
105 @@ -65,6 +65,7 @@
106 #include <linux/vmalloc.h>
107 #include <linux/io_uring.h>
108 #include <linux/syscall_user_dispatch.h>
109 +#include <linux/ksm.h>
111 #include <linux/uaccess.h>
112 #include <asm/mmu_context.h>
113 diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
114 index 6fa761c9c..45fd59a0d 100644
115 --- a/fs/proc/meminfo.c
116 +++ b/fs/proc/meminfo.c
117 @@ -108,6 +108,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
118 #endif
119 show_val_kb(m, "PageTables: ",
120 global_node_page_state(NR_PAGETABLE));
121 +#ifdef CONFIG_UKSM
122 + show_val_kb(m, "KsmZeroPages: ",
123 + global_zone_page_state(NR_UKSM_ZERO_PAGES));
124 +#endif
126 show_val_kb(m, "NFS_Unstable: ", 0);
127 show_val_kb(m, "Bounce: ",
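The hunk above adds a "KsmZeroPages" line to /proc/meminfo when CONFIG_UKSM is enabled. A small user-space sketch (not part of the patch) that reads it back:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		/* Field name matches the show_val_kb() call added above;
		 * it is only present on CONFIG_UKSM=y kernels. */
		if (!strncmp(line, "KsmZeroPages:", 13)) {
			fputs(line, stdout);
			break;
		}
	}
	fclose(f);
	return 0;
}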
128 diff --git a/include/linux/ksm.h b/include/linux/ksm.h
129 index 161e8164a..f0dbdf3c9 100644
130 --- a/include/linux/ksm.h
131 +++ b/include/linux/ksm.h
132 @@ -21,20 +21,16 @@ struct mem_cgroup;
133 #ifdef CONFIG_KSM
134 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
135 unsigned long end, int advice, unsigned long *vm_flags);
136 -int __ksm_enter(struct mm_struct *mm);
137 -void __ksm_exit(struct mm_struct *mm);
139 -static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
140 +static inline struct stable_node *page_stable_node(struct page *page)
142 - if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
143 - return __ksm_enter(mm);
144 - return 0;
145 + return PageKsm(page) ? page_rmapping(page) : NULL;
148 -static inline void ksm_exit(struct mm_struct *mm)
149 +static inline void set_page_stable_node(struct page *page,
150 + struct stable_node *stable_node)
152 - if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
153 - __ksm_exit(mm);
154 + page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
158 @@ -54,6 +50,33 @@ struct page *ksm_might_need_to_copy(struct page *page,
159 void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
160 void ksm_migrate_page(struct page *newpage, struct page *oldpage);
162 +#ifdef CONFIG_KSM_LEGACY
163 +int __ksm_enter(struct mm_struct *mm);
164 +void __ksm_exit(struct mm_struct *mm);
165 +static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
167 + if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
168 + return __ksm_enter(mm);
169 + return 0;
172 +static inline void ksm_exit(struct mm_struct *mm)
174 + if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
175 + __ksm_exit(mm);
178 +#elif defined(CONFIG_UKSM)
179 +static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
181 + return 0;
184 +static inline void ksm_exit(struct mm_struct *mm)
187 +#endif /* !CONFIG_UKSM */
189 #else /* !CONFIG_KSM */
191 static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
192 @@ -89,4 +112,6 @@ static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage)
193 #endif /* CONFIG_MMU */
194 #endif /* !CONFIG_KSM */
196 +#include <linux/uksm.h>
198 #endif /* __LINUX_KSM_H */
199 diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
200 index 6613b26a8..82e18e41b 100644
201 --- a/include/linux/mm_types.h
202 +++ b/include/linux/mm_types.h
203 @@ -370,6 +370,9 @@ struct vm_area_struct {
204 struct mempolicy *vm_policy; /* NUMA policy for the VMA */
205 #endif
206 struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
207 +#ifdef CONFIG_UKSM
208 + struct vma_slot *uksm_vma_slot;
209 +#endif
210 } __randomize_layout;
212 struct core_thread {
213 diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
214 index 47946cec7..a6ce64844 100644
215 --- a/include/linux/mmzone.h
216 +++ b/include/linux/mmzone.h
217 @@ -157,6 +157,9 @@ enum zone_stat_item {
218 NR_ZSPAGES, /* allocated in zsmalloc */
219 #endif
220 NR_FREE_CMA_PAGES,
221 +#ifdef CONFIG_UKSM
222 + NR_UKSM_ZERO_PAGES,
223 +#endif
224 NR_VM_ZONE_STAT_ITEMS };
226 enum node_stat_item {
227 diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
228 index 5e772392a..9d733540d 100644
229 --- a/include/linux/pgtable.h
230 +++ b/include/linux/pgtable.h
231 @@ -1111,12 +1111,25 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
232 extern void untrack_pfn_moved(struct vm_area_struct *vma);
233 #endif
235 +#ifdef CONFIG_UKSM
236 +static inline int is_uksm_zero_pfn(unsigned long pfn)
238 + extern unsigned long uksm_zero_pfn;
239 + return pfn == uksm_zero_pfn;
241 +#else
242 +static inline int is_uksm_zero_pfn(unsigned long pfn)
244 + return 0;
246 +#endif
248 #ifdef __HAVE_COLOR_ZERO_PAGE
249 static inline int is_zero_pfn(unsigned long pfn)
251 extern unsigned long zero_pfn;
252 unsigned long offset_from_zero_pfn = pfn - zero_pfn;
253 - return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
254 + return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT) || is_uksm_zero_pfn(pfn);
257 #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr))
258 @@ -1125,7 +1138,7 @@ static inline int is_zero_pfn(unsigned long pfn)
259 static inline int is_zero_pfn(unsigned long pfn)
261 extern unsigned long zero_pfn;
262 - return pfn == zero_pfn;
263 + return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn));
266 static inline unsigned long my_zero_pfn(unsigned long addr)
267 diff --git a/include/linux/sradix-tree.h b/include/linux/sradix-tree.h
268 new file mode 100644
269 index 000000000..d71edba6b
270 --- /dev/null
271 +++ b/include/linux/sradix-tree.h
272 @@ -0,0 +1,77 @@
273 +#ifndef _LINUX_SRADIX_TREE_H
274 +#define _LINUX_SRADIX_TREE_H
277 +#define INIT_SRADIX_TREE(root, mask) \
278 +do { \
279 + (root)->height = 0; \
280 + (root)->gfp_mask = (mask); \
281 + (root)->rnode = NULL; \
282 +} while (0)
284 +#define ULONG_BITS (sizeof(unsigned long) * 8)
285 +#define SRADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
286 +//#define SRADIX_TREE_MAP_SHIFT 6
287 +//#define SRADIX_TREE_MAP_SIZE (1UL << SRADIX_TREE_MAP_SHIFT)
288 +//#define SRADIX_TREE_MAP_MASK (SRADIX_TREE_MAP_SIZE-1)
290 +struct sradix_tree_node {
291 + unsigned int height; /* Height from the bottom */
292 + unsigned int count;
293 + unsigned int fulls; /* Number of full sublevel trees */
294 + struct sradix_tree_node *parent;
295 + void *stores[0];
298 +/* A simple radix tree implementation */
299 +struct sradix_tree_root {
300 + unsigned int height;
301 + struct sradix_tree_node *rnode;
303 + /* Where found to have available empty stores in its sublevels */
304 + struct sradix_tree_node *enter_node;
305 + unsigned int shift;
306 + unsigned int stores_size;
307 + unsigned int mask;
308 + unsigned long min; /* The first hole index */
309 + unsigned long num;
310 + //unsigned long *height_to_maxindex;
312 + /* How the node is allocated and freed. */
313 + struct sradix_tree_node *(*alloc)(void);
314 + void (*free)(struct sradix_tree_node *node);
316 + /* When a new node is added and removed */
317 + void (*extend)(struct sradix_tree_node *parent, struct sradix_tree_node *child);
318 + void (*assign)(struct sradix_tree_node *node, unsigned int index, void *item);
319 + void (*rm)(struct sradix_tree_node *node, unsigned int offset);
322 +struct sradix_tree_path {
323 + struct sradix_tree_node *node;
324 + int offset;
327 +static inline
328 +void init_sradix_tree_root(struct sradix_tree_root *root, unsigned long shift)
330 + root->height = 0;
331 + root->rnode = NULL;
332 + root->shift = shift;
333 + root->stores_size = 1UL << shift;
334 + root->mask = root->stores_size - 1;
338 +extern void *sradix_tree_next(struct sradix_tree_root *root,
339 + struct sradix_tree_node *node, unsigned long index,
340 + int (*iter)(void *, unsigned long));
342 +extern int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num);
344 +extern void sradix_tree_delete_from_leaf(struct sradix_tree_root *root,
345 + struct sradix_tree_node *node, unsigned long index);
347 +extern void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index);
349 +#endif /* _LINUX_SRADIX_TREE_H */
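The header above only declares the tree root, its node type and the alloc/free/extend/assign/rm callbacks; the wiring a client has to do is easiest to see in slot_tree_init_root() in the mm/uksm.c hunk further down. A condensed kernel-style sketch of that wiring (illustrative only; my_node, my_alloc and my_free are made-up names):

#include <linux/slab.h>
#include <linux/sradix-tree.h>

/* Per-node storage: stores[] must directly follow snode so that the
 * flexible snode.stores[] array lands on it (mm/uksm.c's slot_tree_node
 * uses the same layout trick). */
struct my_node {
	struct sradix_tree_node snode;
	void *stores[1UL << 4];		/* 2^shift slots per node */
};

static struct sradix_tree_node *my_alloc(void)
{
	struct my_node *n = kzalloc(sizeof(*n), GFP_KERNEL);

	return n ? &n->snode : NULL;
}

static void my_free(struct sradix_tree_node *node)
{
	kfree(container_of(node, struct my_node, snode));
}

/* root should start out zeroed (static storage or kzalloc), because
 * init_sradix_tree_root() above only fills in height/shift/stores_size/mask. */
static void my_tree_setup(struct sradix_tree_root *root)
{
	init_sradix_tree_root(root, 4);	/* 16 stores per node */
	root->alloc = my_alloc;
	root->free = my_free;
	/* extend/assign/rm are optional; the library checks them for NULL. */
}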
350 diff --git a/include/linux/uksm.h b/include/linux/uksm.h
351 new file mode 100644
352 index 000000000..bb8651f53
353 --- /dev/null
354 +++ b/include/linux/uksm.h
355 @@ -0,0 +1,149 @@
356 +#ifndef __LINUX_UKSM_H
357 +#define __LINUX_UKSM_H
359 + * Memory merging support.
361 + * This code enables dynamic sharing of identical pages found in different
362 + * memory areas, even if they are not shared by fork().
363 + */
365 +/* if !CONFIG_UKSM this file should not be compiled at all. */
366 +#ifdef CONFIG_UKSM
368 +#include <linux/bitops.h>
369 +#include <linux/mm.h>
370 +#include <linux/pagemap.h>
371 +#include <linux/rmap.h>
372 +#include <linux/sched.h>
374 +extern unsigned long zero_pfn __read_mostly;
375 +extern unsigned long uksm_zero_pfn __read_mostly;
376 +extern struct page *empty_uksm_zero_page;
378 +/* must be done before linked to mm */
379 +extern void uksm_vma_add_new(struct vm_area_struct *vma);
380 +extern void uksm_remove_vma(struct vm_area_struct *vma);
382 +#define UKSM_SLOT_NEED_SORT (1 << 0)
383 +#define UKSM_SLOT_NEED_RERAND (1 << 1)
384 +#define UKSM_SLOT_SCANNED (1 << 2) /* It's scanned in this round */
385 +#define UKSM_SLOT_FUL_SCANNED (1 << 3)
386 +#define UKSM_SLOT_IN_UKSM (1 << 4)
388 +struct vma_slot {
389 + struct sradix_tree_node *snode;
390 + unsigned long sindex;
392 + struct list_head slot_list;
393 + unsigned long fully_scanned_round;
394 + unsigned long dedup_num;
395 + unsigned long pages_scanned;
396 + unsigned long this_sampled;
397 + unsigned long last_scanned;
398 + unsigned long pages_to_scan;
399 + struct scan_rung *rung;
400 + struct page **rmap_list_pool;
401 + unsigned int *pool_counts;
402 + unsigned long pool_size;
403 + struct vm_area_struct *vma;
404 + struct mm_struct *mm;
405 + unsigned long ctime_j;
406 + unsigned long pages;
407 + unsigned long flags;
408 + unsigned long pages_cowed; /* pages cowed this round */
409 + unsigned long pages_merged; /* pages merged this round */
410 + unsigned long pages_bemerged;
412 + /* when it has page merged in this eval round */
413 + struct list_head dedup_list;
416 +static inline void uksm_unmap_zero_page(pte_t pte)
418 + if (pte_pfn(pte) == uksm_zero_pfn)
419 + __dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES);
422 +static inline void uksm_map_zero_page(pte_t pte)
424 + if (pte_pfn(pte) == uksm_zero_pfn)
425 + __inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES);
428 +static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page)
430 + if (vma->uksm_vma_slot && PageKsm(page))
431 + vma->uksm_vma_slot->pages_cowed++;
434 +static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte)
436 + if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn)
437 + vma->uksm_vma_slot->pages_cowed++;
440 +static inline int uksm_flags_can_scan(unsigned long vm_flags)
442 +#ifdef VM_SAO
443 + if (vm_flags & VM_SAO)
444 + return 0;
445 +#endif
447 + return !(vm_flags & (VM_PFNMAP | VM_IO | VM_DONTEXPAND |
448 + VM_HUGETLB | VM_MIXEDMAP | VM_SHARED
449 + | VM_MAYSHARE | VM_GROWSUP | VM_GROWSDOWN));
452 +static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p)
454 + if (uksm_flags_can_scan(*vm_flags_p))
455 + *vm_flags_p |= VM_MERGEABLE;
459 + * Just a BUG_ON wrapper for places where the ksm zero page must not appear. TODO: it will
460 + * be removed when the uksm zero page patch is stable enough.
461 + */
462 +static inline void uksm_bugon_zeropage(pte_t pte)
464 + BUG_ON(pte_pfn(pte) == uksm_zero_pfn);
466 +#else
467 +static inline void uksm_vma_add_new(struct vm_area_struct *vma)
471 +static inline void uksm_remove_vma(struct vm_area_struct *vma)
475 +static inline void uksm_unmap_zero_page(pte_t pte)
479 +static inline void uksm_map_zero_page(pte_t pte)
483 +static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page)
487 +static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte)
491 +static inline int uksm_flags_can_scan(unsigned long vm_flags)
493 + return 0;
496 +static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p)
500 +static inline void uksm_bugon_zeropage(pte_t pte)
503 +#endif /* !CONFIG_UKSM */
504 +#endif /* __LINUX_UKSM_H */
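uksm_flags_can_scan() above rejects shared, PFN-mapped, I/O, hugetlb, mixed-map and growable mappings, so only ordinary private anonymous VMAs receive VM_MERGEABLE through uksm_vm_flags_mod() (called from do_mmap() in the mm/mmap.c hunk below). A small user-space sketch (not part of the patch) contrasting an eligible and an ineligible mapping:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4UL << 20;
	/* Private anonymous: passes uksm_flags_can_scan(), so UKSM scans it. */
	void *scanned = mmap(NULL, len, PROT_READ | PROT_WRITE,
			     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	/* Shared anonymous: VM_SHARED/VM_MAYSHARE are set, so it is skipped. */
	void *skipped = mmap(NULL, len, PROT_READ | PROT_WRITE,
			     MAP_SHARED | MAP_ANONYMOUS, -1, 0);

	printf("scanned by UKSM: %p, ignored by UKSM: %p\n", scanned, skipped);
	return 0;
}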
505 diff --git a/kernel/fork.c b/kernel/fork.c
506 index 426cd0c51..5fd356ca7 100644
507 --- a/kernel/fork.c
508 +++ b/kernel/fork.c
509 @@ -588,7 +588,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
510 __vma_link_rb(mm, tmp, rb_link, rb_parent);
511 rb_link = &tmp->vm_rb.rb_right;
512 rb_parent = &tmp->vm_rb;
514 + uksm_vma_add_new(tmp);
515 mm->map_count++;
516 if (!(tmp->vm_flags & VM_WIPEONFORK))
517 retval = copy_page_range(tmp, mpnt);
518 diff --git a/lib/Makefile b/lib/Makefile
519 index b5307d3ee..480b099e1 100644
520 --- a/lib/Makefile
521 +++ b/lib/Makefile
522 @@ -28,7 +28,7 @@ CFLAGS_string.o += -fno-stack-protector
523 endif
525 lib-y := ctype.o string.o vsprintf.o cmdline.o \
526 - rbtree.o radix-tree.o timerqueue.o xarray.o \
527 + rbtree.o radix-tree.o sradix-tree.o timerqueue.o xarray.o \
528 idr.o extable.o sha1.o irq_regs.o argv_split.o \
529 flex_proportions.o ratelimit.o show_mem.o \
530 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
531 diff --git a/lib/sradix-tree.c b/lib/sradix-tree.c
532 new file mode 100644
533 index 000000000..ab21e6309
534 --- /dev/null
535 +++ b/lib/sradix-tree.c
536 @@ -0,0 +1,476 @@
537 +#include <linux/errno.h>
538 +#include <linux/mm.h>
539 +#include <linux/mman.h>
540 +#include <linux/spinlock.h>
541 +#include <linux/slab.h>
542 +#include <linux/gcd.h>
543 +#include <linux/sradix-tree.h>
545 +static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node)
547 + return node->fulls == root->stores_size ||
548 + (node->height == 1 && node->count == root->stores_size);
552 + * Extend a sradix tree so it can store key @index.
553 + */
554 +static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index)
556 + struct sradix_tree_node *node;
557 + unsigned int height;
559 + if (unlikely(root->rnode == NULL)) {
560 + if (!(node = root->alloc()))
561 + return -ENOMEM;
563 + node->height = 1;
564 + root->rnode = node;
565 + root->height = 1;
568 + /* Figure out what the height should be. */
569 + height = root->height;
570 + index >>= root->shift * height;
572 + while (index) {
573 + index >>= root->shift;
574 + height++;
577 + while (height > root->height) {
578 + unsigned int newheight;
580 + if (!(node = root->alloc()))
581 + return -ENOMEM;
583 + /* Increase the height. */
584 + node->stores[0] = root->rnode;
585 + root->rnode->parent = node;
586 + if (root->extend)
587 + root->extend(node, root->rnode);
589 + newheight = root->height + 1;
590 + node->height = newheight;
591 + node->count = 1;
592 + if (sradix_node_full(root, root->rnode))
593 + node->fulls = 1;
595 + root->rnode = node;
596 + root->height = newheight;
599 + return 0;
603 + * Search for the next item from the current node that is not NULL
604 + * and can satisfy root->iter().
605 + */
606 +void *sradix_tree_next(struct sradix_tree_root *root,
607 + struct sradix_tree_node *node, unsigned long index,
608 + int (*iter)(void *item, unsigned long height))
610 + unsigned long offset;
611 + void *item;
613 + if (unlikely(node == NULL)) {
614 + node = root->rnode;
615 + for (offset = 0; offset < root->stores_size; offset++) {
616 + item = node->stores[offset];
617 + if (item && (!iter || iter(item, node->height)))
618 + break;
621 + if (unlikely(offset >= root->stores_size))
622 + return NULL;
624 + if (node->height == 1)
625 + return item;
626 + else
627 + goto go_down;
630 + while (node) {
631 + offset = (index & root->mask) + 1;
632 + for (; offset < root->stores_size; offset++) {
633 + item = node->stores[offset];
634 + if (item && (!iter || iter(item, node->height)))
635 + break;
638 + if (offset < root->stores_size)
639 + break;
641 + node = node->parent;
642 + index >>= root->shift;
645 + if (!node)
646 + return NULL;
648 + while (node->height > 1) {
649 +go_down:
650 + node = item;
651 + for (offset = 0; offset < root->stores_size; offset++) {
652 + item = node->stores[offset];
653 + if (item && (!iter || iter(item, node->height)))
654 + break;
657 + if (unlikely(offset >= root->stores_size))
658 + return NULL;
661 + BUG_ON(offset > root->stores_size);
663 + return item;
667 + * Blindly insert the item into the tree. Typically, we reuse the
668 + * first empty store item.
669 + */
670 +int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num)
672 + unsigned long index;
673 + unsigned int height;
674 + struct sradix_tree_node *node, *tmp = NULL;
675 + int offset, offset_saved;
676 + void **store = NULL;
677 + int error, i, j, shift;
679 +go_on:
680 + index = root->min;
682 + if (root->enter_node && !sradix_node_full(root, root->enter_node)) {
683 + node = root->enter_node;
684 + BUG_ON((index >> (root->shift * root->height)));
685 + } else {
686 + node = root->rnode;
687 + if (node == NULL || (index >> (root->shift * root->height))
688 + || sradix_node_full(root, node)) {
689 + error = sradix_tree_extend(root, index);
690 + if (error)
691 + return error;
693 + node = root->rnode;
698 + height = node->height;
699 + shift = (height - 1) * root->shift;
700 + offset = (index >> shift) & root->mask;
701 + while (shift > 0) {
702 + offset_saved = offset;
703 + for (; offset < root->stores_size; offset++) {
704 + store = &node->stores[offset];
705 + tmp = *store;
707 + if (!tmp || !sradix_node_full(root, tmp))
708 + break;
710 + BUG_ON(offset >= root->stores_size);
712 + if (offset != offset_saved) {
713 + index += (offset - offset_saved) << shift;
714 + index &= ~((1UL << shift) - 1);
717 + if (!tmp) {
718 + if (!(tmp = root->alloc()))
719 + return -ENOMEM;
721 + tmp->height = shift / root->shift;
722 + *store = tmp;
723 + tmp->parent = node;
724 + node->count++;
725 +// if (root->extend)
726 +// root->extend(node, tmp);
729 + node = tmp;
730 + shift -= root->shift;
731 + offset = (index >> shift) & root->mask;
734 + BUG_ON(node->height != 1);
737 + store = &node->stores[offset];
738 + for (i = 0, j = 0;
739 + j < root->stores_size - node->count &&
740 + i < root->stores_size - offset && j < num; i++) {
741 + if (!store[i]) {
742 + store[i] = item[j];
743 + if (root->assign)
744 + root->assign(node, index + i, item[j]);
745 + j++;
749 + node->count += j;
750 + root->num += j;
751 + num -= j;
753 + while (sradix_node_full(root, node)) {
754 + node = node->parent;
755 + if (!node)
756 + break;
758 + node->fulls++;
761 + if (unlikely(!node)) {
762 + /* All nodes are full */
763 + root->min = 1 << (root->height * root->shift);
764 + root->enter_node = NULL;
765 + } else {
766 + root->min = index + i - 1;
767 + root->min |= (1UL << (node->height - 1)) - 1;
768 + root->min++;
769 + root->enter_node = node;
772 + if (num) {
773 + item += j;
774 + goto go_on;
777 + return 0;
781 +/**
782 + * sradix_tree_shrink - shrink height of a sradix tree to minimal
783 + * @root sradix tree root
785 + */
786 +static inline void sradix_tree_shrink(struct sradix_tree_root *root)
788 + /* try to shrink tree height */
789 + while (root->height > 1) {
790 + struct sradix_tree_node *to_free = root->rnode;
792 + /*
793 + * The candidate node has more than one child, or its child
794 + * is not at the leftmost store, we cannot shrink.
795 + */
796 + if (to_free->count != 1 || !to_free->stores[0])
797 + break;
799 + root->rnode = to_free->stores[0];
800 + root->rnode->parent = NULL;
801 + root->height--;
802 + if (unlikely(root->enter_node == to_free))
803 + root->enter_node = NULL;
804 + root->free(to_free);
809 + * Del the item on the known leaf node and index
810 + */
811 +void sradix_tree_delete_from_leaf(struct sradix_tree_root *root,
812 + struct sradix_tree_node *node, unsigned long index)
814 + unsigned int offset;
815 + struct sradix_tree_node *start, *end;
817 + BUG_ON(node->height != 1);
819 + start = node;
820 + while (node && !(--node->count))
821 + node = node->parent;
823 + end = node;
824 + if (!node) {
825 + root->rnode = NULL;
826 + root->height = 0;
827 + root->min = 0;
828 + root->num = 0;
829 + root->enter_node = NULL;
830 + } else {
831 + offset = (index >> (root->shift * (node->height - 1))) & root->mask;
832 + if (root->rm)
833 + root->rm(node, offset);
834 + node->stores[offset] = NULL;
835 + root->num--;
836 + if (root->min > index) {
837 + root->min = index;
838 + root->enter_node = node;
842 + if (start != end) {
843 + do {
844 + node = start;
845 + start = start->parent;
846 + if (unlikely(root->enter_node == node))
847 + root->enter_node = end;
848 + root->free(node);
849 + } while (start != end);
851 + /*
852 + * Note that shrink may free "end", so enter_node still needs to
853 + * be checked inside.
854 + */
855 + sradix_tree_shrink(root);
856 + } else if (node->count == root->stores_size - 1) {
857 + /* It WAS a full leaf node. Update the ancestors */
858 + node = node->parent;
859 + while (node) {
860 + node->fulls--;
861 + if (node->fulls != root->stores_size - 1)
862 + break;
864 + node = node->parent;
869 +void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index)
871 + unsigned int height, offset;
872 + struct sradix_tree_node *node;
873 + int shift;
875 + node = root->rnode;
876 + if (node == NULL || (index >> (root->shift * root->height)))
877 + return NULL;
879 + height = root->height;
880 + shift = (height - 1) * root->shift;
882 + do {
883 + offset = (index >> shift) & root->mask;
884 + node = node->stores[offset];
885 + if (!node)
886 + return NULL;
888 + shift -= root->shift;
889 + } while (shift >= 0);
891 + return node;
895 + * Return the item if it exists, otherwise create it in place
896 + * and return the created item.
897 + */
898 +void *sradix_tree_lookup_create(struct sradix_tree_root *root,
899 + unsigned long index, void *(*item_alloc)(void))
901 + unsigned int height, offset;
902 + struct sradix_tree_node *node, *tmp;
903 + void *item;
904 + int shift, error;
906 + if (root->rnode == NULL || (index >> (root->shift * root->height))) {
907 + if (item_alloc) {
908 + error = sradix_tree_extend(root, index);
909 + if (error)
910 + return NULL;
911 + } else {
912 + return NULL;
916 + node = root->rnode;
917 + height = root->height;
918 + shift = (height - 1) * root->shift;
920 + do {
921 + offset = (index >> shift) & root->mask;
922 + if (!node->stores[offset]) {
923 + if (!(tmp = root->alloc()))
924 + return NULL;
926 + tmp->height = shift / root->shift;
927 + node->stores[offset] = tmp;
928 + tmp->parent = node;
929 + node->count++;
930 + node = tmp;
931 + } else {
932 + node = node->stores[offset];
935 + shift -= root->shift;
936 + } while (shift > 0);
938 + BUG_ON(node->height != 1);
939 + offset = index & root->mask;
940 + if (node->stores[offset]) {
941 + return node->stores[offset];
942 + } else if (item_alloc) {
943 + if (!(item = item_alloc()))
944 + return NULL;
946 + node->stores[offset] = item;
948 + /*
949 + * NOTE: we do NOT call root->assign here, since this item is
950 + * newly created by us and has no meaning yet. The caller can call it
951 + * if necessary.
952 + */
954 + node->count++;
955 + root->num++;
957 + while (sradix_node_full(root, node)) {
958 + node = node->parent;
959 + if (!node)
960 + break;
962 + node->fulls++;
965 + if (unlikely(!node)) {
966 + /* All nodes are full */
967 + root->min = 1 << (root->height * root->shift);
968 + } else {
969 + if (root->min == index) {
970 + root->min |= (1UL << (node->height - 1)) - 1;
971 + root->min++;
972 + root->enter_node = node;
976 + return item;
977 + } else {
978 + return NULL;
983 +int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index)
985 + unsigned int height, offset;
986 + struct sradix_tree_node *node;
987 + int shift;
989 + node = root->rnode;
990 + if (node == NULL || (index >> (root->shift * root->height)))
991 + return -ENOENT;
993 + height = root->height;
994 + shift = (height - 1) * root->shift;
996 + do {
997 + offset = (index >> shift) & root->mask;
998 + node = node->stores[offset];
999 + if (!node)
1000 + return -ENOENT;
1002 + shift -= root->shift;
1003 + } while (shift > 0);
1005 + offset = index & root->mask;
1006 + if (!node->stores[offset])
1007 + return -ENOENT;
1009 + sradix_tree_delete_from_leaf(root, node, index);
1011 + return 0;
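Every walk above reduces an index to one offset per level with offset = (index >> shift) & root->mask, starting from shift = (height - 1) * root->shift. A standalone sketch of that arithmetic (not part of the patch) for a node shift of 8, the SLOT_TREE_NODE_SHIFT that mm/uksm.c's slot tree uses further down:

#include <stdio.h>

int main(void)
{
	unsigned long index = 0x12345;		/* arbitrary key */
	unsigned int node_shift = 8;		/* root->shift */
	unsigned long mask = (1UL << node_shift) - 1;
	unsigned int height = 3;		/* enough levels for this key */
	int shift;

	/* Same descent as sradix_tree_lookup(): one masked slice per level. */
	for (shift = (height - 1) * node_shift; shift >= 0; shift -= node_shift)
		printf("level shift %2d -> offset %lu\n",
		       shift, (index >> shift) & mask);
	/* Prints 1, 35, 69: 0x12345 == (1 << 16) | (0x23 << 8) | 0x45. */
	return 0;
}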
1013 diff --git a/mm/Kconfig b/mm/Kconfig
1014 index 24c045b24..3ce98ecc2 100644
1015 --- a/mm/Kconfig
1016 +++ b/mm/Kconfig
1017 @@ -317,6 +317,32 @@ config KSM
1018 See Documentation/vm/ksm.rst for more information: KSM is inactive
1019 until a program has madvised that an area is MADV_MERGEABLE, and
1020 root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
1021 +choice
1022 + prompt "Choose UKSM/KSM strategy"
1023 + default UKSM
1024 + depends on KSM
1025 + help
1026 + This option allows you to select a UKSM/KSM strategy.
1028 +config UKSM
1029 + bool "Ultra-KSM for page merging"
1030 + depends on KSM
1031 + help
1032 + UKSM is inspired by the Linux kernel project KSM (Kernel Samepage
1033 + Merging), but with a fundamentally rewritten core algorithm. With
1034 + an advanced algorithm, UKSM can now transparently scan all anonymously
1035 + mapped user space applications with significantly improved scan speed
1036 + and CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit from
1037 + UKSM. UKSM now has its first stable release and its first real-world enterprise user.
1038 + For more information, please go to its project page:
1039 + (github.com/dolohow/uksm)
1041 +config KSM_LEGACY
1042 + bool "Legacy KSM implementation"
1043 + depends on KSM
1044 + help
1045 + The legacy KSM implementation from Red Hat.
1046 +endchoice
1048 config DEFAULT_MMAP_MIN_ADDR
1049 int "Low address space to protect from user allocation"
1050 diff --git a/mm/Makefile b/mm/Makefile
1051 index 72227b24a..fd50a3a51 100644
1052 --- a/mm/Makefile
1053 +++ b/mm/Makefile
1054 @@ -76,7 +76,8 @@ obj-$(CONFIG_SPARSEMEM) += sparse.o
1055 obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
1056 obj-$(CONFIG_SLOB) += slob.o
1057 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
1058 -obj-$(CONFIG_KSM) += ksm.o
1059 +obj-$(CONFIG_KSM_LEGACY) += ksm.o
1060 +obj-$(CONFIG_UKSM) += uksm.o
1061 obj-$(CONFIG_PAGE_POISONING) += page_poison.o
1062 obj-$(CONFIG_SLAB) += slab.o
1063 obj-$(CONFIG_SLUB) += slub.o
1064 diff --git a/mm/ksm.c b/mm/ksm.c
1065 index 9694ee2c7..63af6a528 100644
1066 --- a/mm/ksm.c
1067 +++ b/mm/ksm.c
1068 @@ -858,17 +858,6 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
1069 return err;
1072 -static inline struct stable_node *page_stable_node(struct page *page)
1074 - return PageKsm(page) ? page_rmapping(page) : NULL;
1077 -static inline void set_page_stable_node(struct page *page,
1078 - struct stable_node *stable_node)
1080 - page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
1083 #ifdef CONFIG_SYSFS
1085 * Only called through the sysfs control interface:
1086 diff --git a/mm/memory.c b/mm/memory.c
1087 index 550405fc3..b4005b195 100644
1088 --- a/mm/memory.c
1089 +++ b/mm/memory.c
1090 @@ -158,6 +158,25 @@ EXPORT_SYMBOL(zero_pfn);
1092 unsigned long highest_memmap_pfn __read_mostly;
1094 +#ifdef CONFIG_UKSM
1095 +unsigned long uksm_zero_pfn __read_mostly;
1096 +EXPORT_SYMBOL_GPL(uksm_zero_pfn);
1097 +struct page *empty_uksm_zero_page;
1099 +static int __init setup_uksm_zero_page(void)
1101 + empty_uksm_zero_page = alloc_pages(__GFP_ZERO & ~__GFP_MOVABLE, 0);
1102 + if (!empty_uksm_zero_page)
1103 + panic("Oh boy, that early out of memory?");
1105 + SetPageReserved(empty_uksm_zero_page);
1106 + uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page);
1108 + return 0;
1110 +core_initcall(setup_uksm_zero_page);
1111 +#endif
1114 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
1116 @@ -173,6 +192,7 @@ void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
1117 trace_rss_stat(mm, member, count);
1121 #if defined(SPLIT_RSS_COUNTING)
1123 void sync_mm_rss(struct mm_struct *mm)
1124 @@ -875,6 +895,11 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1125 get_page(page);
1126 page_dup_rmap(page, false);
1127 rss[mm_counter(page)]++;
1129 + /* Should return NULL in vm_normal_page() */
1130 + uksm_bugon_zeropage(pte);
1131 + } else {
1132 + uksm_map_zero_page(pte);
1136 @@ -1254,8 +1279,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
1137 ptent = ptep_get_and_clear_full(mm, addr, pte,
1138 tlb->fullmm);
1139 tlb_remove_tlb_entry(tlb, pte, addr);
1140 - if (unlikely(!page))
1141 + if (unlikely(!page)) {
1142 + uksm_unmap_zero_page(ptent);
1143 continue;
1146 if (!PageAnon(page)) {
1147 if (pte_dirty(ptent)) {
1148 @@ -2603,6 +2630,7 @@ static inline bool cow_user_page(struct page *dst, struct page *src,
1150 if (likely(src)) {
1151 copy_user_highpage(dst, src, addr, vma);
1152 + uksm_cow_page(vma, src);
1153 return true;
1156 @@ -2849,6 +2877,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
1157 vmf->address);
1158 if (!new_page)
1159 goto oom;
1160 + uksm_cow_pte(vma, vmf->orig_pte);
1161 } else {
1162 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
1163 vmf->address);
1164 @@ -2891,7 +2920,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
1165 mm_counter_file(old_page));
1166 inc_mm_counter_fast(mm, MM_ANONPAGES);
1168 + uksm_bugon_zeropage(vmf->orig_pte);
1169 } else {
1170 + uksm_unmap_zero_page(vmf->orig_pte);
1171 inc_mm_counter_fast(mm, MM_ANONPAGES);
1173 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
1174 diff --git a/mm/mmap.c b/mm/mmap.c
1175 index 3f287599a..dc719db43 100644
1176 --- a/mm/mmap.c
1177 +++ b/mm/mmap.c
1178 @@ -46,6 +46,7 @@
1179 #include <linux/moduleparam.h>
1180 #include <linux/pkeys.h>
1181 #include <linux/oom.h>
1182 +#include <linux/ksm.h>
1183 #include <linux/sched/mm.h>
1185 #include <linux/uaccess.h>
1186 @@ -181,6 +182,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
1187 if (vma->vm_file)
1188 fput(vma->vm_file);
1189 mpol_put(vma_policy(vma));
1190 + uksm_remove_vma(vma);
1191 vm_area_free(vma);
1192 return next;
1194 @@ -748,9 +750,16 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
1195 long adjust_next = 0;
1196 int remove_next = 0;
1199 + * to avoid deadlock, uksm_remove_vma must be done before any spin_lock is
1200 + * acquired
1201 + */
1202 + uksm_remove_vma(vma);
1204 if (next && !insert) {
1205 struct vm_area_struct *exporter = NULL, *importer = NULL;
1207 + uksm_remove_vma(next);
1208 if (end >= next->vm_end) {
1210 * vma expands, overlapping all the next, and
1211 @@ -881,6 +890,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
1212 end_changed = true;
1214 vma->vm_pgoff = pgoff;
1216 if (adjust_next) {
1217 next->vm_start += adjust_next;
1218 next->vm_pgoff += adjust_next >> PAGE_SHIFT;
1219 @@ -985,6 +995,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
1220 if (remove_next == 2) {
1221 remove_next = 1;
1222 end = next->vm_end;
1223 + uksm_remove_vma(next);
1224 goto again;
1226 else if (next)
1227 @@ -1011,10 +1022,14 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
1229 VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma));
1231 + } else {
1232 + if (next && !insert)
1233 + uksm_vma_add_new(next);
1235 if (insert && file)
1236 uprobe_mmap(insert);
1238 + uksm_vma_add_new(vma);
1239 validate_mm(mm);
1241 return 0;
1242 @@ -1470,6 +1485,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
1243 vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
1244 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
1246 + /* If uksm is enabled, we add VM_MERGEABLE to new VMAs. */
1247 + uksm_vm_flags_mod(&vm_flags);
1249 if (flags & MAP_LOCKED)
1250 if (!can_do_mlock())
1251 return -EPERM;
1252 @@ -1865,6 +1883,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
1253 allow_write_access(file);
1255 file = vma->vm_file;
1256 + uksm_vma_add_new(vma);
1257 out:
1258 perf_event_mmap(vma);
1260 @@ -1907,6 +1926,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
1261 if (vm_flags & VM_DENYWRITE)
1262 allow_write_access(file);
1263 free_vma:
1264 + uksm_remove_vma(vma);
1265 vm_area_free(vma);
1266 unacct_error:
1267 if (charged)
1268 @@ -2766,6 +2786,8 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
1269 else
1270 err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
1272 + uksm_vma_add_new(new);
1274 /* Success. */
1275 if (!err)
1276 return 0;
1277 @@ -3073,6 +3095,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
1278 if ((flags & (~VM_EXEC)) != 0)
1279 return -EINVAL;
1280 flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
1281 + uksm_vm_flags_mod(&flags);
1283 mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
1284 if (IS_ERR_VALUE(mapped_addr))
1285 @@ -3118,6 +3141,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
1286 vma->vm_flags = flags;
1287 vma->vm_page_prot = vm_get_page_prot(flags);
1288 vma_link(mm, vma, prev, rb_link, rb_parent);
1289 + uksm_vma_add_new(vma);
1290 out:
1291 perf_event_mmap(vma);
1292 mm->total_vm += len >> PAGE_SHIFT;
1293 @@ -3195,6 +3219,12 @@ void exit_mmap(struct mm_struct *mm)
1294 mmap_write_unlock(mm);
1297 + /*
1298 + * Taking write lock on mmap does not harm others,
1299 + * but it's crucial for uksm to avoid races.
1300 + */
1301 + mmap_write_lock(mm);
1303 if (mm->locked_vm) {
1304 vma = mm->mmap;
1305 while (vma) {
1306 @@ -3230,6 +3260,11 @@ void exit_mmap(struct mm_struct *mm)
1307 cond_resched();
1309 vm_unacct_memory(nr_accounted);
1311 + mm->mmap = NULL;
1312 + mm->mm_rb = RB_ROOT;
1313 + vmacache_invalidate(mm);
1314 + mmap_write_unlock(mm);
1317 /* Insert vm structure into process list sorted by address
1318 @@ -3337,6 +3372,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
1319 new_vma->vm_ops->open(new_vma);
1320 vma_link(mm, new_vma, prev, rb_link, rb_parent);
1321 *need_rmap_locks = false;
1322 + uksm_vma_add_new(new_vma);
1324 return new_vma;
1326 @@ -3505,6 +3541,7 @@ static struct vm_area_struct *__install_special_mapping(
1327 vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);
1329 perf_event_mmap(vma);
1330 + uksm_vma_add_new(vma);
1332 return vma;
1334 diff --git a/mm/uksm.c b/mm/uksm.c
1335 new file mode 100644
1336 index 000000000..e4732c00b
1337 --- /dev/null
1338 +++ b/mm/uksm.c
1339 @@ -0,0 +1,5614 @@
1341 + * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
1343 + * This is an improvement upon KSM. Some basic data structures and routines
1344 + * are borrowed from ksm.c .
1346 + * Its new features:
1347 + * 1. Full system scan:
1348 + * It automatically scans all user processes' anonymous VMAs. Kernel-user
1349 + * interaction to submit a memory area to KSM is no longer needed.
1351 + * 2. Rich area detection:
1352 + * It automatically detects rich areas containing abundant duplicated
1353 + * pages. Rich areas are given a full scan speed. Poor areas are
1354 + * sampled at a reasonable speed with very low CPU consumption.
1356 + * 3. Ultra Per-page scan speed improvement:
1357 + * A new hash algorithm is proposed. As a result, on a machine with
1358 + * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it
1359 + * can scan memory areas that do not contain duplicated pages at a speed of
1360 + * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of
1361 + * 477MB/sec ~ 923MB/sec.
1363 + * 4. Thrashing area avoidance:
1364 + * Thrashing areas (VMAs that have frequent KSM page break-outs) can be
1365 + * filtered out. My benchmark shows it's more efficient than KSM's per-page
1366 + * hash value based volatile page detection.
1369 + * 5. Misc changes upon KSM:
1370 + * * It has a fully x86-optimized memcmp dedicated for 4-byte-aligned page
1371 + * comparison. It's much faster than default C version on x86.
1372 + * * rmap_item now has a struct page *page member to loosely cache an
1373 + * address->page mapping, which avoids many time-costly
1374 + * follow_page() calls.
1375 + * * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
1376 + * * try_to_merge_two_pages() now can revert a pte if it fails. No break_
1377 + * ksm is needed for this case.
1379 + * 6. Full Zero Page consideration (contributed by Figo Zhang)
1380 + * Now uksmd considers full zero pages as special pages and merges them into a
1381 + * special unswappable uksm zero page.
1382 + */
1384 +#include <linux/errno.h>
1385 +#include <linux/mm.h>
1386 +#include <linux/fs.h>
1387 +#include <linux/mman.h>
1388 +#include <linux/sched.h>
1389 +#include <linux/sched/mm.h>
1390 +#include <linux/sched/coredump.h>
1391 +#include <linux/sched/cputime.h>
1392 +#include <linux/rwsem.h>
1393 +#include <linux/pagemap.h>
1394 +#include <linux/rmap.h>
1395 +#include <linux/spinlock.h>
1396 +#include <linux/jhash.h>
1397 +#include <linux/delay.h>
1398 +#include <linux/kthread.h>
1399 +#include <linux/wait.h>
1400 +#include <linux/slab.h>
1401 +#include <linux/rbtree.h>
1402 +#include <linux/memory.h>
1403 +#include <linux/mmu_notifier.h>
1404 +#include <linux/swap.h>
1405 +#include <linux/ksm.h>
1406 +#include <linux/crypto.h>
1407 +#include <linux/scatterlist.h>
1408 +#include <crypto/hash.h>
1409 +#include <linux/random.h>
1410 +#include <linux/math64.h>
1411 +#include <linux/gcd.h>
1412 +#include <linux/freezer.h>
1413 +#include <linux/oom.h>
1414 +#include <linux/numa.h>
1415 +#include <linux/sradix-tree.h>
1417 +#include <asm/tlbflush.h>
1418 +#include "internal.h"
1420 +#ifdef CONFIG_X86
1421 +#undef memcmp
1423 +#ifdef CONFIG_X86_32
1424 +#define memcmp memcmpx86_32
1426 + * Compare 4-byte-aligned addresses s1 and s2, with length n
1427 + */
1428 +int memcmpx86_32(void *s1, void *s2, size_t n)
1430 + size_t num = n / 4;
1431 + register int res;
1433 + __asm__ __volatile__
1435 + "testl %3,%3\n\t"
1436 + "repe; cmpsd\n\t"
1437 + "je 1f\n\t"
1438 + "sbbl %0,%0\n\t"
1439 + "orl $1,%0\n"
1440 + "1:"
1441 + : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
1442 + : "0" (0)
1443 + : "cc");
1445 + return res;
1449 + * Check whether the page is all zero.
1450 + */
1451 +static int is_full_zero(const void *s1, size_t len)
1453 + unsigned char same;
1455 + len /= 4;
1457 + __asm__ __volatile__
1458 + ("repe; scasl;"
1459 + "sete %0"
1460 + : "=qm" (same), "+D" (s1), "+c" (len)
1461 + : "a" (0)
1462 + : "cc");
1464 + return same;
1468 +#elif defined(CONFIG_X86_64)
1469 +#define memcmp memcmpx86_64
1471 + * Compare 8-byte-aligned addresses s1 and s2, with length n
1472 + */
1473 +int memcmpx86_64(void *s1, void *s2, size_t n)
1475 + size_t num = n / 8;
1476 + register int res;
1478 + __asm__ __volatile__
1480 + "testq %q3,%q3\n\t"
1481 + "repe; cmpsq\n\t"
1482 + "je 1f\n\t"
1483 + "sbbq %q0,%q0\n\t"
1484 + "orq $1,%q0\n"
1485 + "1:"
1486 + : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
1487 + : "0" (0)
1488 + : "cc");
1490 + return res;
1493 +static int is_full_zero(const void *s1, size_t len)
1495 + unsigned char same;
1497 + len /= 8;
1499 + __asm__ __volatile__
1500 + ("repe; scasq;"
1501 + "sete %0"
1502 + : "=qm" (same), "+D" (s1), "+c" (len)
1503 + : "a" (0)
1504 + : "cc");
1506 + return same;
1509 +#endif
1510 +#else
1511 +static int is_full_zero(const void *s1, size_t len)
1513 + unsigned long *src = s1;
1514 + int i;
1516 + len /= sizeof(*src);
1518 + for (i = 0; i < len; i++) {
1519 + if (src[i])
1520 + return 0;
1523 + return 1;
1525 +#endif
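The x86 assembly above only accelerates two primitives: a word-wise memcmp for page comparison and a repe/scas based all-zero test. A user-space copy of the generic is_full_zero() fallback (not part of the patch), run against a clean and a dirtied page:

#include <stdio.h>
#include <stdlib.h>

#define MY_PAGE_SIZE 4096

static int is_full_zero(const void *s1, size_t len)
{
	const unsigned long *src = s1;
	size_t i;

	len /= sizeof(*src);
	for (i = 0; i < len; i++)
		if (src[i])
			return 0;
	return 1;
}

int main(void)
{
	unsigned char *page = calloc(1, MY_PAGE_SIZE);

	if (!page)
		return 1;
	printf("zeroed page: %d\n", is_full_zero(page, MY_PAGE_SIZE));	/* 1 */
	page[123] = 1;
	printf("dirty page:  %d\n", is_full_zero(page, MY_PAGE_SIZE));	/* 0 */
	free(page);
	return 0;
}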
1527 +#define UKSM_RUNG_ROUND_FINISHED (1 << 0)
1528 +#define TIME_RATIO_SCALE 10000
1530 +#define SLOT_TREE_NODE_SHIFT 8
1531 +#define SLOT_TREE_NODE_STORE_SIZE (1UL << SLOT_TREE_NODE_SHIFT)
1532 +struct slot_tree_node {
1533 + unsigned long size;
1534 + struct sradix_tree_node snode;
1535 + void *stores[SLOT_TREE_NODE_STORE_SIZE];
1538 +static struct kmem_cache *slot_tree_node_cachep;
1540 +static struct sradix_tree_node *slot_tree_node_alloc(void)
1542 + struct slot_tree_node *p;
1544 + p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL |
1545 + __GFP_NORETRY | __GFP_NOWARN);
1546 + if (!p)
1547 + return NULL;
1549 + return &p->snode;
1552 +static void slot_tree_node_free(struct sradix_tree_node *node)
1554 + struct slot_tree_node *p;
1556 + p = container_of(node, struct slot_tree_node, snode);
1557 + kmem_cache_free(slot_tree_node_cachep, p);
1560 +static void slot_tree_node_extend(struct sradix_tree_node *parent,
1561 + struct sradix_tree_node *child)
1563 + struct slot_tree_node *p, *c;
1565 + p = container_of(parent, struct slot_tree_node, snode);
1566 + c = container_of(child, struct slot_tree_node, snode);
1568 + p->size += c->size;
1571 +void slot_tree_node_assign(struct sradix_tree_node *node,
1572 + unsigned int index, void *item)
1574 + struct vma_slot *slot = item;
1575 + struct slot_tree_node *cur;
1577 + slot->snode = node;
1578 + slot->sindex = index;
1580 + while (node) {
1581 + cur = container_of(node, struct slot_tree_node, snode);
1582 + cur->size += slot->pages;
1583 + node = node->parent;
1587 +void slot_tree_node_rm(struct sradix_tree_node *node, unsigned int offset)
1589 + struct vma_slot *slot;
1590 + struct slot_tree_node *cur;
1591 + unsigned long pages;
1593 + if (node->height == 1) {
1594 + slot = node->stores[offset];
1595 + pages = slot->pages;
1596 + } else {
1597 + cur = container_of(node->stores[offset],
1598 + struct slot_tree_node, snode);
1599 + pages = cur->size;
1602 + while (node) {
1603 + cur = container_of(node, struct slot_tree_node, snode);
1604 + cur->size -= pages;
1605 + node = node->parent;
1609 +unsigned long slot_iter_index;
1610 +int slot_iter(void *item, unsigned long height)
1612 + struct slot_tree_node *node;
1613 + struct vma_slot *slot;
1615 + if (height == 1) {
1616 + slot = item;
1617 + if (slot_iter_index < slot->pages) {
1618 + /*in this one*/
1619 + return 1;
1620 + } else {
1621 + slot_iter_index -= slot->pages;
1622 + return 0;
1625 + } else {
1626 + node = container_of(item, struct slot_tree_node, snode);
1627 + if (slot_iter_index < node->size) {
1628 + /*in this one*/
1629 + return 1;
1630 + } else {
1631 + slot_iter_index -= node->size;
1632 + return 0;
1638 +static inline void slot_tree_init_root(struct sradix_tree_root *root)
1640 + init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT);
1641 + root->alloc = slot_tree_node_alloc;
1642 + root->free = slot_tree_node_free;
1643 + root->extend = slot_tree_node_extend;
1644 + root->assign = slot_tree_node_assign;
1645 + root->rm = slot_tree_node_rm;
1648 +void slot_tree_init(void)
1650 + slot_tree_node_cachep = kmem_cache_create("slot_tree_node",
1651 + sizeof(struct slot_tree_node), 0,
1652 + SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
1653 + NULL);
1657 +/* Each rung of this ladder is a list of VMAs having the same scan ratio */
1658 +struct scan_rung {
1659 + //struct list_head scanned_list;
1660 + struct sradix_tree_root vma_root;
1661 + struct sradix_tree_root vma_root2;
1663 + struct vma_slot *current_scan;
1664 + unsigned long current_offset;
1666 + /*
1667 + * The initial value for current_offset; it should loop over
1668 + * [0 ~ step - 1] so that every slot has a chance to be scanned.
1669 + */
1670 + unsigned long offset_init;
1671 + unsigned long step; /* dynamic step for current_offset */
1672 + unsigned int flags;
1673 + unsigned long pages_to_scan;
1674 + //unsigned long fully_scanned_slots;
1675 + /*
1676 + * a little bit tricky - if cpu_time_ratio > 0, then the value is
1677 + * the cpu time ratio it can spend in rung_i for every scan
1678 + * period. if < 0, then it is the cpu time ratio relative to the
1679 + * max cpu percentage user specified. Both in unit of
1680 + * 1/TIME_RATIO_SCALE
1681 + */
1682 + int cpu_ratio;
1684 + /*
1685 + * How long will it take for all slots in this rung to be fully
1686 + * scanned? If it's zero, we don't care about the cover time:
1687 + * it's fully scanned.
1688 + */
1689 + unsigned int cover_msecs;
1690 + //unsigned long vma_num;
1691 + //unsigned long pages; /* Sum of all slot's pages in rung */
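The cpu_ratio comment above is easiest to read with numbers: values are in units of 1/TIME_RATIO_SCALE (10000), and a negative value is taken as a fraction of the user's max CPU percentage; with the uksm_cpu_preset row {20, 40, -2500, -10000} and max_cpu 95 further down, that works out to 0.2%, 0.4%, then 25% and 100% of the 95% budget. A small arithmetic sketch, not part of the patch:

#include <stdio.h>

#define TIME_RATIO_SCALE 10000

static double effective_cpu_percent(int cpu_ratio, unsigned int max_cpu)
{
	if (cpu_ratio >= 0)		/* absolute share of scan time */
		return 100.0 * cpu_ratio / TIME_RATIO_SCALE;
	/* negative: |ratio|/TIME_RATIO_SCALE of the user's max percentage */
	return (double)max_cpu * -cpu_ratio / TIME_RATIO_SCALE;
}

int main(void)
{
	int preset[4] = {20, 40, -2500, -10000};	/* "full" governor row */
	unsigned int max_cpu = 95;
	int i;

	for (i = 0; i < 4; i++)
		printf("rung %d: cpu_ratio %6d -> %.2f%% CPU\n",
		       i, preset[i], effective_cpu_percent(preset[i], max_cpu));
	return 0;
}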
1694 +/**
1695 + * node of either the stable or unstable rbtree
1697 + */
1698 +struct tree_node {
1699 + struct rb_node node; /* link in the main (un)stable rbtree */
1700 + struct rb_root sub_root; /* rb_root for sublevel collision rbtree */
1701 + u32 hash;
1702 + unsigned long count; /* TODO: merged with sub_root */
1703 + struct list_head all_list; /* all tree nodes in stable/unstable tree */
1706 +/**
1707 + * struct stable_node - node of the stable rbtree
1708 + * @node: rb node of this ksm page in the stable tree
1709 + * @hlist: hlist head of rmap_items using this ksm page
1710 + * @kpfn: page frame number of this ksm page
1711 + */
1712 +struct stable_node {
1713 + struct rb_node node; /* link in sub-rbtree */
1714 + struct tree_node *tree_node; /* its tree_node root in the stable tree, NULL if it's in the hell list */
1715 + struct hlist_head hlist;
1716 + unsigned long kpfn;
1717 + u32 hash_max; /* if ==0 then it's not been calculated yet */
1718 + struct list_head all_list; /* in a list for all stable nodes */
1721 +/**
1722 + * struct node_vma - group rmap_items linked in a same stable
1723 + * node together.
1724 + */
1725 +struct node_vma {
1726 + union {
1727 + struct vma_slot *slot;
1728 + unsigned long key; /* slot is used as key sorted on hlist */
1729 + };
1730 + struct hlist_node hlist;
1731 + struct hlist_head rmap_hlist;
1732 + struct stable_node *head;
1735 +/**
1736 + * struct rmap_item - reverse mapping item for virtual addresses
1737 + * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
1738 + * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
1739 + * @mm: the memory structure this rmap_item is pointing into
1740 + * @address: the virtual address this rmap_item tracks (+ flags in low bits)
1741 + * @node: rb node of this rmap_item in the unstable tree
1742 + * @head: pointer to stable_node heading this list in the stable tree
1743 + * @hlist: link into hlist of rmap_items hanging off that stable_node
1744 + */
1745 +struct rmap_item {
1746 + struct vma_slot *slot;
1747 + struct page *page;
1748 + unsigned long address; /* + low bits used for flags below */
1749 + unsigned long hash_round;
1750 + unsigned long entry_index;
1751 + union {
1752 + struct {/* when in unstable tree */
1753 + struct rb_node node;
1754 + struct tree_node *tree_node;
1755 + u32 hash_max;
1756 + };
1757 + struct { /* when in stable tree */
1758 + struct node_vma *head;
1759 + struct hlist_node hlist;
1760 + struct anon_vma *anon_vma;
1761 + };
1762 + };
1763 +} __aligned(4);
1765 +struct rmap_list_entry {
1766 + union {
1767 + struct rmap_item *item;
1768 + unsigned long addr;
1769 + };
1770 + /* lowest bit is used for is_addr tag */
1771 +} __aligned(4); /* 4-byte aligned to fit into pages */
1774 +/* Basic data structure definition ends */
1778 + * Flags for rmap_item to judge if it's listed in the stable/unstable tree.
1779 + * The flags use the low bits of rmap_item.address
1780 + */
1781 +#define UNSTABLE_FLAG 0x1
1782 +#define STABLE_FLAG 0x2
1783 +#define get_rmap_addr(x) ((x)->address & PAGE_MASK)
1786 + * rmap_list_entry helpers
1787 + */
1788 +#define IS_ADDR_FLAG 1
1789 +#define is_addr(ptr) ((unsigned long)(ptr) & IS_ADDR_FLAG)
1790 +#define set_is_addr(ptr) ((ptr) |= IS_ADDR_FLAG)
1791 +#define get_clean_addr(ptr) (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG))
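rmap_list_entry above packs either a struct rmap_item pointer or a raw address into one word and uses the word's low bit as the is_addr tag, which is safe because both values are at least 4-byte aligned. A standalone sketch of that tagging trick (not part of the patch), mirroring the macros above:

#include <assert.h>
#include <stdio.h>

#define IS_ADDR_FLAG		1UL
#define is_addr(ptr)		((unsigned long)(ptr) & IS_ADDR_FLAG)
#define set_is_addr(ptr)	((ptr) |= IS_ADDR_FLAG)
#define get_clean_addr(ptr)	((ptr) & ~IS_ADDR_FLAG)

int main(void)
{
	unsigned long slot;			/* plays the rmap_list_entry role */
	int item;				/* stands in for a struct rmap_item */
	unsigned long addr = 0x10001000UL;	/* a page-aligned address */

	slot = (unsigned long)&item;		/* store a pointer: bit 0 clear */
	assert(!is_addr(slot));

	slot = addr;				/* store an address: set the tag */
	set_is_addr(slot);
	assert(is_addr(slot));
	assert(get_clean_addr(slot) == addr);

	printf("tagged 0x%lx, clean 0x%lx\n", slot, get_clean_addr(slot));
	return 0;
}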
1795 + * High speed caches for frequently allocated and freed structs
1796 + */
1797 +static struct kmem_cache *rmap_item_cache;
1798 +static struct kmem_cache *stable_node_cache;
1799 +static struct kmem_cache *node_vma_cache;
1800 +static struct kmem_cache *vma_slot_cache;
1801 +static struct kmem_cache *tree_node_cache;
1802 +#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\
1803 + sizeof(struct __struct), __alignof__(struct __struct),\
1804 + (__flags), NULL)
1806 +/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */
1807 +#define SCAN_LADDER_SIZE 4
1808 +static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE];
1810 +/* The evaluation rounds uksmd has finished */
1811 +static unsigned long long uksm_eval_round = 1;
1814 + * we add 1 to this var when we consider we should rebuild the whole
1815 + * unstable tree.
1816 + */
1817 +static unsigned long uksm_hash_round = 1;
1820 + * How many times the whole memory is scanned.
1821 + */
1822 +static unsigned long long fully_scanned_round = 1;
1824 +/* The total number of virtual pages of all vma slots */
1825 +static u64 uksm_pages_total;
1827 +/* The number of pages that have been scanned since startup */
1828 +static u64 uksm_pages_scanned;
1830 +static u64 scanned_virtual_pages;
1832 +/* The number of pages that have been scanned since the last encode_benefit call */
1833 +static u64 uksm_pages_scanned_last;
1835 +/* If the scanned number is too large, we encode it here */
1836 +static u64 pages_scanned_stored;
1838 +static unsigned long pages_scanned_base;
1840 +/* The number of nodes in the stable tree */
1841 +static unsigned long uksm_pages_shared;
1843 +/* The number of page slots additionally sharing those nodes */
1844 +static unsigned long uksm_pages_sharing;
1846 +/* The number of nodes in the unstable tree */
1847 +static unsigned long uksm_pages_unshared;
1850 + * Milliseconds ksmd should sleep between scans,
1851 + * >= 100ms to be consistent with
1852 + * scan_time_to_sleep_msec()
1853 + */
1854 +static unsigned int uksm_sleep_jiffies;
1856 +/* The real value for the uksmd next sleep */
1857 +static unsigned int uksm_sleep_real;
1859 +/* Saved value for user input uksm_sleep_jiffies when it's enlarged */
1860 +static unsigned int uksm_sleep_saved;
1862 +/* Max percentage of cpu utilization ksmd can take to scan in one batch */
1863 +static unsigned int uksm_max_cpu_percentage;
1865 +static int uksm_cpu_governor;
1867 +static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" };
1869 +struct uksm_cpu_preset_s {
1870 + int cpu_ratio[SCAN_LADDER_SIZE];
1871 + unsigned int cover_msecs[SCAN_LADDER_SIZE];
1872 + unsigned int max_cpu; /* percentage */
1875 +struct uksm_cpu_preset_s uksm_cpu_preset[4] = {
1876 + { {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95},
1877 + { {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50},
1878 + { {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20},
1879 + { {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1},
1882 +/* The default value for uksm_ema_page_time if it's not initialized */
1883 +#define UKSM_PAGE_TIME_DEFAULT 500
1885 +/* cost to scan one page, as an exponential moving average, in nsecs */
1886 +static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
1888 +/* The exponential moving average alpha weight, in percentage. */
1889 +#define EMA_ALPHA 20
1892 + * The threshold used to filter out thrashing areas.
1893 + * If it is 0, filtering is disabled; otherwise it is the percentage upper bound
1894 + * on the thrashing ratio of all areas. Any area with a bigger thrashing ratio
1895 + * will be considered as having a zero duplication ratio.
1896 + */
1897 +static unsigned int uksm_thrash_threshold = 50;
1899 +/* How much dedup ratio is considered abundant */
1900 +static unsigned int uksm_abundant_threshold = 10;
1902 +/* All slots having merged pages in this eval round. */
1903 +struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup);
1905 +/* How many times the ksmd has slept since startup */
1906 +static unsigned long long uksm_sleep_times;
1908 +#define UKSM_RUN_STOP 0
1909 +#define UKSM_RUN_MERGE 1
1910 +static unsigned int uksm_run = 1;
1912 +static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait);
1913 +static DEFINE_MUTEX(uksm_thread_mutex);
1916 + * List vma_slot_new is for newly created vma_slot waiting to be added by
1917 + * ksmd. If one cannot be added (e.g. because it is too small), it is moved to
1918 + * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding
1919 + * VMA has been removed/freed.
1920 + */
1921 +struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new);
1922 +struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd);
1923 +struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del);
1924 +static DEFINE_SPINLOCK(vma_slot_list_lock);
1926 +/* The unstable tree heads */
1927 +static struct rb_root root_unstable_tree = RB_ROOT;
1930 + * All tree_nodes are in a list to be freed at once when unstable tree is
1931 + * freed after each scan round.
1932 + */
1933 +static struct list_head unstable_tree_node_list =
1934 + LIST_HEAD_INIT(unstable_tree_node_list);
1936 +/* List contains all stable nodes */
1937 +static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list);
1940 + * When the hash strength is changed, the stable tree must be delta_hashed and
1941 + * re-structured. We use two sets of the structs below to speed up the
1942 + * re-structuring of the stable tree.
1943 + */
1944 +static struct list_head
1945 +stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]),
1946 + LIST_HEAD_INIT(stable_tree_node_list[1])};
1948 +static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0];
1949 +static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT};
1950 +static struct rb_root *root_stable_treep = &root_stable_tree[0];
1951 +static unsigned long stable_tree_index;
1953 +/* The hash strength needed to hash a full page */
1954 +#define HASH_STRENGTH_FULL (PAGE_SIZE / sizeof(u32))
1956 +/* The hash strength needed for loop-back hashing */
1957 +#define HASH_STRENGTH_MAX (HASH_STRENGTH_FULL + 10)
1959 +/* The random offsets in a page */
1960 +static u32 *random_nums;
1962 +/* The hash strength */
1963 +static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4;
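/*
 * Worked example, assuming 4 KiB pages: HASH_STRENGTH_FULL is
 * 4096 / sizeof(u32) == 1024 sampled u32 positions, HASH_STRENGTH_MAX is
 * 1024 + 10 == 1034, and the default hash_strength of
 * HASH_STRENGTH_FULL >> 4 samples only 64 random u32 offsets per page.
 */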
1965 +/* The delta value each time the hash strength increases or decreases */
1966 +static unsigned long hash_strength_delta;
1967 +#define HASH_STRENGTH_DELTA_MAX 5
1969 +/* The time we have saved due to random_sample_hash */
1970 +static u64 rshash_pos;
1972 +/* The time we have wasted due to hash collision */
1973 +static u64 rshash_neg;
1975 +struct uksm_benefit {
1976 + u64 pos;
1977 + u64 neg;
1978 + u64 scanned;
1979 + unsigned long base;
1980 +} benefit;
1983 + * The relative cost of memcmp, compared to 1 time unit of random sample
1984 + * hash; this value is determined when the ksm module is initialized
1985 + */
1986 +static unsigned long memcmp_cost;
1988 +static unsigned long rshash_neg_cont_zero;
1989 +static unsigned long rshash_cont_obscure;
1991 +/* The possible states of hash strength adjustment heuristic */
1992 +enum rshash_states {
1993 + RSHASH_STILL,
1994 + RSHASH_TRYUP,
1995 + RSHASH_TRYDOWN,
1996 + RSHASH_NEW,
1997 + RSHASH_PRE_STILL,
2000 +/* The possible direction we are about to adjust hash strength */
2001 +enum rshash_direct {
2002 + GO_UP,
2003 + GO_DOWN,
2004 + OBSCURE,
2005 + STILL,
2008 +/* random sampling hash state machine */
2009 +static struct {
2010 + enum rshash_states state;
2011 + enum rshash_direct pre_direct;
2012 + u8 below_count;
2013 + /* Keep a lookup window of size 5; if above_count/below_count > 3
2014 + * in this window, we stop trying.
2015 + */
2016 + u8 lookup_window_index;
2017 + u64 stable_benefit;
2018 + unsigned long turn_point_down;
2019 + unsigned long turn_benefit_down;
2020 + unsigned long turn_point_up;
2021 + unsigned long turn_benefit_up;
2022 + unsigned long stable_point;
2023 +} rshash_state;
2025 +/* zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX] */
2026 +static u32 *zero_hash_table;
2028 +static inline struct node_vma *alloc_node_vma(void)
2030 + struct node_vma *node_vma;
2032 + node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL |
2033 + __GFP_NORETRY | __GFP_NOWARN);
2034 + if (node_vma) {
2035 + INIT_HLIST_HEAD(&node_vma->rmap_hlist);
2036 + INIT_HLIST_NODE(&node_vma->hlist);
2038 + return node_vma;
2041 +static inline void free_node_vma(struct node_vma *node_vma)
2043 + kmem_cache_free(node_vma_cache, node_vma);
2047 +static inline struct vma_slot *alloc_vma_slot(void)
2049 + struct vma_slot *slot;
2051 + /*
2052 + * In case ksm has not been initialized yet.
2053 + * We need to reconsider the call site of uksm_init() in the future.
2054 + */
2055 + if (!vma_slot_cache)
2056 + return NULL;
2058 + slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL |
2059 + __GFP_NORETRY | __GFP_NOWARN);
2060 + if (slot) {
2061 + INIT_LIST_HEAD(&slot->slot_list);
2062 + INIT_LIST_HEAD(&slot->dedup_list);
2063 + slot->flags |= UKSM_SLOT_NEED_RERAND;
2065 + return slot;
2068 +static inline void free_vma_slot(struct vma_slot *vma_slot)
2070 + kmem_cache_free(vma_slot_cache, vma_slot);
2075 +static inline struct rmap_item *alloc_rmap_item(void)
2077 + struct rmap_item *rmap_item;
2079 + rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
2080 + __GFP_NORETRY | __GFP_NOWARN);
2081 + if (rmap_item) {
2082 + /* BUG if the lowest bit is not clear; it's reserved for flag use */
2083 + BUG_ON(is_addr(rmap_item));
2085 + return rmap_item;
2088 +static inline void free_rmap_item(struct rmap_item *rmap_item)
2090 + rmap_item->slot = NULL; /* debug safety */
2091 + kmem_cache_free(rmap_item_cache, rmap_item);
2094 +static inline struct stable_node *alloc_stable_node(void)
2096 + struct stable_node *node;
2098 + node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL |
2099 + __GFP_NORETRY | __GFP_NOWARN);
2100 + if (!node)
2101 + return NULL;
2103 + INIT_HLIST_HEAD(&node->hlist);
2104 + list_add(&node->all_list, &stable_node_list);
2105 + return node;
2108 +static inline void free_stable_node(struct stable_node *stable_node)
2110 + list_del(&stable_node->all_list);
2111 + kmem_cache_free(stable_node_cache, stable_node);
2114 +static inline struct tree_node *alloc_tree_node(struct list_head *list)
2116 + struct tree_node *node;
2118 + node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL |
2119 + __GFP_NORETRY | __GFP_NOWARN);
2120 + if (!node)
2121 + return NULL;
2123 + list_add(&node->all_list, list);
2124 + return node;
2127 +static inline void free_tree_node(struct tree_node *node)
2129 + list_del(&node->all_list);
2130 + kmem_cache_free(tree_node_cache, node);
2133 +static void uksm_drop_anon_vma(struct rmap_item *rmap_item)
2135 + struct anon_vma *anon_vma = rmap_item->anon_vma;
2137 + put_anon_vma(anon_vma);
2141 +/**
2142 + * Remove a stable node from stable_tree, may unlink from its tree_node and
2143 + * may remove its parent tree_node if no other stable node is pending.
2145 + * @stable_node The node to be removed
2146 + * @unlink_rb Will this node be unlinked from the rbtree?
2147 + * @remove_tree_node Will its tree_node be removed if empty?
2148 + */
2149 +static void remove_node_from_stable_tree(struct stable_node *stable_node,
2150 + int unlink_rb, int remove_tree_node)
2152 + struct node_vma *node_vma;
2153 + struct rmap_item *rmap_item;
2154 + struct hlist_node *n;
2156 + if (!hlist_empty(&stable_node->hlist)) {
2157 + hlist_for_each_entry_safe(node_vma, n,
2158 + &stable_node->hlist, hlist) {
2159 + hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) {
2160 + uksm_pages_sharing--;
2162 + uksm_drop_anon_vma(rmap_item);
2163 + rmap_item->address &= PAGE_MASK;
2165 + free_node_vma(node_vma);
2166 + cond_resched();
2169 + /* the last one is counted as shared */
2170 + uksm_pages_shared--;
2171 + uksm_pages_sharing++;
2174 + if (stable_node->tree_node && unlink_rb) {
2175 + rb_erase(&stable_node->node,
2176 + &stable_node->tree_node->sub_root);
2178 + if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) &&
2179 + remove_tree_node) {
2180 + rb_erase(&stable_node->tree_node->node,
2181 + root_stable_treep);
2182 + free_tree_node(stable_node->tree_node);
2183 + } else {
2184 + stable_node->tree_node->count--;
2188 + free_stable_node(stable_node);
2193 + * get_uksm_page: checks if the page indicated by the stable node
2194 + * is still its ksm page, despite having held no reference to it.
2195 + * In which case we can trust the content of the page, and it
2196 + * returns the gotten page; but if the page has now been zapped,
2197 + * remove the stale node from the stable tree and return NULL.
2199 + * You would expect the stable_node to hold a reference to the ksm page.
2200 + * But if it increments the page's count, swapping out has to wait for
2201 + * ksmd to come around again before it can free the page, which may take
2202 + * seconds or even minutes: much too unresponsive. So instead we use a
2203 + * "keyhole reference": access to the ksm page from the stable node peeps
2204 + * out through its keyhole to see if that page still holds the right key,
2205 + * pointing back to this stable node. This relies on freeing a PageAnon
2206 + * page to reset its page->mapping to NULL, and relies on no other use of
2207 + * a page to put something that might look like our key in page->mapping.
2209 + * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
2210 + * but this is different - made simpler by uksm_thread_mutex being held, but
2211 + * interesting for assuming that no other use of the struct page could ever
2212 + * put our expected_mapping into page->mapping (or a field of the union which
2213 + * coincides with page->mapping). The RCU calls are not for KSM at all, but
2214 + * to keep the page_count protocol described with page_cache_get_speculative.
2216 + * Note: it is possible that get_uksm_page() will return NULL one moment,
2217 + * then page the next, if the page is in between page_freeze_refs() and
2218 + * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
2219 + * is on its way to being freed; but it is an anomaly to bear in mind.
2221 + * @unlink_rb: whether the removal of this node will first unlink it from
2222 + * its rbtree. stable_node_reinsert will prevent this when restructuring the
2223 + * node from its old tree.
2225 + * @remove_tree_node: if this is the last one of its tree_node, will the
2226 + * tree_node be freed? If we are inserting a stable node, this tree_node may
2227 + * be reused, so don't free it.
2228 + */
2229 +static struct page *get_uksm_page(struct stable_node *stable_node,
2230 + int unlink_rb, int remove_tree_node)
2232 + struct page *page;
2233 + void *expected_mapping;
2234 + unsigned long kpfn;
2236 + expected_mapping = (void *)((unsigned long)stable_node |
2237 + PAGE_MAPPING_KSM);
2238 +again:
2239 + kpfn = READ_ONCE(stable_node->kpfn);
2240 + page = pfn_to_page(kpfn);
2242 + /*
2243 + * page is computed from kpfn, so on most architectures reading
2244 + * page->mapping is naturally ordered after reading node->kpfn,
2245 + * but on Alpha we need to be more careful.
2246 + */
2247 + smp_rmb();
2249 + if (READ_ONCE(page->mapping) != expected_mapping)
2250 + goto stale;
2252 + /*
2253 + * We cannot do anything with the page while its refcount is 0.
2254 + * Usually 0 means free, or tail of a higher-order page: in which
2255 + * case this node is no longer referenced, and should be freed;
2256 + * however, it might mean that the page is under page_freeze_refs().
2257 + * The __remove_mapping() case is easy, again the node is now stale;
2258 + * but if page is swapcache in migrate_page_move_mapping(), it might
2259 + * still be our page, in which case it's essential to keep the node.
2260 + */
2261 + while (!get_page_unless_zero(page)) {
2262 + /*
2263 + * Another check for page->mapping != expected_mapping would
2264 + * work here too. We have chosen the !PageSwapCache test to
2265 + * optimize the common case, when the page is or is about to
2266 + * be freed: PageSwapCache is cleared (under spin_lock_irq)
2267 + * in the freeze_refs section of __remove_mapping(); but Anon
2268 + * page->mapping reset to NULL later, in free_pages_prepare().
2269 + */
2270 + if (!PageSwapCache(page))
2271 + goto stale;
2272 + cpu_relax();
2275 + if (READ_ONCE(page->mapping) != expected_mapping) {
2276 + put_page(page);
2277 + goto stale;
2280 + lock_page(page);
2281 + if (READ_ONCE(page->mapping) != expected_mapping) {
2282 + unlock_page(page);
2283 + put_page(page);
2284 + goto stale;
2286 + unlock_page(page);
2287 + return page;
2288 +stale:
2289 + /*
2290 + * We come here from above when page->mapping or !PageSwapCache
2291 + * suggests that the node is stale; but it might be under migration.
2292 + * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
2293 + * before checking whether node->kpfn has been changed.
2294 + */
2295 + smp_rmb();
2296 + if (stable_node->kpfn != kpfn)
2297 + goto again;
2299 + remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node);
2301 + return NULL;
2305 + * Removing rmap_item from stable or unstable tree.
2306 + * This function will clean the information from the stable/unstable tree.
2307 + */
2308 +static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
2310 + if (rmap_item->address & STABLE_FLAG) {
2311 + struct stable_node *stable_node;
2312 + struct node_vma *node_vma;
2313 + struct page *page;
2315 + node_vma = rmap_item->head;
2316 + stable_node = node_vma->head;
2317 + page = get_uksm_page(stable_node, 1, 1);
2318 + if (!page)
2319 + goto out;
2321 + /*
2322 + * page lock is needed because it's racing with
2323 + * try_to_unmap_ksm(), etc.
2324 + */
2325 + lock_page(page);
2326 + hlist_del(&rmap_item->hlist);
2328 + if (hlist_empty(&node_vma->rmap_hlist)) {
2329 + hlist_del(&node_vma->hlist);
2330 + free_node_vma(node_vma);
2332 + unlock_page(page);
2334 + put_page(page);
2335 + if (hlist_empty(&stable_node->hlist)) {
2336 + /* do NOT call remove_node_from_stable_tree() here,
2337 + * it's possible for a forked rmap_item not to be in
2338 + * the stable tree while the in-tree rmap_items have been
2339 + * deleted.
2340 + */
2341 + uksm_pages_shared--;
2342 + } else
2343 + uksm_pages_sharing--;
2346 + uksm_drop_anon_vma(rmap_item);
2347 + } else if (rmap_item->address & UNSTABLE_FLAG) {
2348 + if (rmap_item->hash_round == uksm_hash_round) {
2350 + rb_erase(&rmap_item->node,
2351 + &rmap_item->tree_node->sub_root);
2352 + if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) {
2353 + rb_erase(&rmap_item->tree_node->node,
2354 + &root_unstable_tree);
2356 + free_tree_node(rmap_item->tree_node);
2357 + } else
2358 + rmap_item->tree_node->count--;
2360 + uksm_pages_unshared--;
2363 + rmap_item->address &= PAGE_MASK;
2364 + rmap_item->hash_max = 0;
2366 +out:
2367 + cond_resched(); /* we're called from many long loops */
2370 +static inline int slot_in_uksm(struct vma_slot *slot)
2372 + return list_empty(&slot->slot_list);
2376 + * Test if the mm is exiting
2377 + */
2378 +static inline bool uksm_test_exit(struct mm_struct *mm)
2380 + return atomic_read(&mm->mm_users) == 0;
2383 +static inline unsigned long vma_pool_size(struct vma_slot *slot)
2385 + return round_up(sizeof(struct rmap_list_entry) * slot->pages,
2386 + PAGE_SIZE) >> PAGE_SHIFT;
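/*
 * Worked example, assuming 4 KiB pages and a (hypothetical) 16-byte
 * struct rmap_list_entry: a 1024-page VMA needs
 * round_up(16 * 1024, 4096) >> PAGE_SHIFT == 4 pool pages for its
 * rmap_list entries.
 */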
2389 +#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta))
2391 +/* must be done with sem locked */
2392 +static int slot_pool_alloc(struct vma_slot *slot)
2394 + unsigned long pool_size;
2396 + if (slot->rmap_list_pool)
2397 + return 0;
2399 + pool_size = vma_pool_size(slot);
2400 + slot->rmap_list_pool = kcalloc(pool_size, sizeof(struct page *),
2401 + GFP_KERNEL);
2402 + if (!slot->rmap_list_pool)
2403 + return -ENOMEM;
2405 + slot->pool_counts = kcalloc(pool_size, sizeof(unsigned int),
2406 + GFP_KERNEL);
2407 + if (!slot->pool_counts) {
2408 + kfree(slot->rmap_list_pool);
2409 + return -ENOMEM;
2412 + slot->pool_size = pool_size;
2413 + BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages));
2414 + slot->flags |= UKSM_SLOT_IN_UKSM;
2415 + uksm_pages_total += slot->pages;
2417 + return 0;
2421 + * Called after vma is unlinked from its mm
2422 + */
2423 +void uksm_remove_vma(struct vm_area_struct *vma)
2425 + struct vma_slot *slot;
2427 + if (!vma->uksm_vma_slot)
2428 + return;
2430 + spin_lock(&vma_slot_list_lock);
2431 + slot = vma->uksm_vma_slot;
2432 + if (!slot)
2433 + goto out;
2435 + if (slot_in_uksm(slot)) {
2436 + /**
2437 + * This slot has been added by ksmd, so move to the del list
2438 + * waiting for ksmd to free it.
2439 + */
2440 + list_add_tail(&slot->slot_list, &vma_slot_del);
2441 + } else {
2442 + /**
2443 + * It's still on the new list. It's OK to free the slot directly.
2444 + */
2445 + list_del(&slot->slot_list);
2446 + free_vma_slot(slot);
2448 +out:
2449 + vma->uksm_vma_slot = NULL;
2450 + spin_unlock(&vma_slot_list_lock);
2453 +/**
2454 + * Need to do two things:
2455 + * 1. check if slot was moved to del list
2456 + * 2. make sure the mmap_sem is manipulated under valid vma.
2458 + * My concern here is that in some cases, this may make
2459 + * vma_slot_list_lock waiters be serialized further by some
2460 + * sem->wait_lock; can this really be expensive?
2463 + * @return
2464 + * 0: if successfully locked mmap_sem
2465 + * -ENOENT: this slot was moved to del list
2466 + * -EBUSY: vma lock failed
2467 + */
2468 +static int try_down_read_slot_mmap_sem(struct vma_slot *slot)
2470 + struct vm_area_struct *vma;
2471 + struct mm_struct *mm;
2472 + struct rw_semaphore *sem;
2474 + spin_lock(&vma_slot_list_lock);
2476 + /* the slot_list was removed and re-inited from the new list when it entered
2477 + * the uksm list. If it's not empty now, then it must have been moved to the del list
2478 + */
2479 + if (!slot_in_uksm(slot)) {
2480 + spin_unlock(&vma_slot_list_lock);
2481 + return -ENOENT;
2484 + BUG_ON(slot->pages != vma_pages(slot->vma));
2485 + /* Ok, vma still valid */
2486 + vma = slot->vma;
2487 + mm = vma->vm_mm;
2488 + sem = &mm->mmap_lock;
2490 + if (uksm_test_exit(mm)) {
2491 + spin_unlock(&vma_slot_list_lock);
2492 + return -ENOENT;
2495 + if (down_read_trylock(sem)) {
2496 + spin_unlock(&vma_slot_list_lock);
2497 + if (slot_pool_alloc(slot)) {
2498 + uksm_remove_vma(vma);
2499 + up_read(sem);
2500 + return -ENOENT;
2502 + return 0;
2505 + spin_unlock(&vma_slot_list_lock);
2506 + return -EBUSY;
2509 +static inline unsigned long
2510 +vma_page_address(struct page *page, struct vm_area_struct *vma)
2512 + pgoff_t pgoff = page->index;
2513 + unsigned long address;
2515 + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
2516 + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
2517 + /* page should be within @vma mapping range */
2518 + return -EFAULT;
2520 + return address;
2524 +/* return 0 on success with the item's mmap_sem locked */
2525 +static inline int get_mergeable_page_lock_mmap(struct rmap_item *item)
2527 + struct mm_struct *mm;
2528 + struct vma_slot *slot = item->slot;
2529 + int err = -EINVAL;
2531 + struct page *page;
2533 + /*
2534 + * try_down_read_slot_mmap_sem() returns non-zero if the slot
2535 + * has been removed by uksm_remove_vma().
2536 + */
2537 + if (try_down_read_slot_mmap_sem(slot))
2538 + return -EBUSY;
2540 + mm = slot->vma->vm_mm;
2542 + if (uksm_test_exit(mm))
2543 + goto failout_up;
2545 + page = item->page;
2546 + rcu_read_lock();
2547 + if (!get_page_unless_zero(page)) {
2548 + rcu_read_unlock();
2549 + goto failout_up;
2552 + /* No need to consider huge page here. */
2553 + if (item->slot->vma->anon_vma != page_anon_vma(page) ||
2554 + vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) {
2555 + /*
2556 + * TODO:
2557 + * should we release this item because of its stale page
2558 + * mapping?
2559 + */
2560 + put_page(page);
2561 + rcu_read_unlock();
2562 + goto failout_up;
2564 + rcu_read_unlock();
2565 + return 0;
2567 +failout_up:
2568 + mmap_read_unlock(mm);
2569 + return err;
2573 + * What kind of VMA is considered?
2574 + */
2575 +static inline int vma_can_enter(struct vm_area_struct *vma)
2577 + return uksm_flags_can_scan(vma->vm_flags);
2581 + * Called whenever a fresh new vma is created. A new vma_slot
2582 + * is created and inserted into a global list. Must be called
2583 + * after the vma is inserted into its mm.
2584 + */
2585 +void uksm_vma_add_new(struct vm_area_struct *vma)
2587 + struct vma_slot *slot;
2589 + if (!vma_can_enter(vma)) {
2590 + vma->uksm_vma_slot = NULL;
2591 + return;
2594 + slot = alloc_vma_slot();
2595 + if (!slot) {
2596 + vma->uksm_vma_slot = NULL;
2597 + return;
2600 + vma->uksm_vma_slot = slot;
2601 + vma->vm_flags |= VM_MERGEABLE;
2602 + slot->vma = vma;
2603 + slot->mm = vma->vm_mm;
2604 + slot->ctime_j = jiffies;
2605 + slot->pages = vma_pages(vma);
2606 + spin_lock(&vma_slot_list_lock);
2607 + list_add_tail(&slot->slot_list, &vma_slot_new);
2608 + spin_unlock(&vma_slot_list_lock);
2611 +/* 32/3 < they < 32/2 */
2612 +#define shiftl 8
2613 +#define shiftr 12
2615 +#define HASH_FROM_TO(from, to) \
2616 +for (index = from; index < to; index++) { \
2617 + pos = random_nums[index]; \
2618 + hash += key[pos]; \
2619 + hash += (hash << shiftl); \
2620 + hash ^= (hash >> shiftr); \
2624 +#define HASH_FROM_DOWN_TO(from, to) \
2625 +for (index = from - 1; index >= to; index--) { \
2626 + hash ^= (hash >> shiftr); \
2627 + hash ^= (hash >> (shiftr*2)); \
2628 + hash -= (hash << shiftl); \
2629 + hash += (hash << (shiftl*2)); \
2630 + pos = random_nums[index]; \
2631 + hash -= key[pos]; \
2635 + * The main random sample hash function.
2636 + */
2637 +static u32 random_sample_hash(void *addr, u32 hash_strength)
2639 + u32 hash = 0xdeadbeef;
2640 + int index, pos, loop = hash_strength;
2641 + u32 *key = (u32 *)addr;
2643 + if (loop > HASH_STRENGTH_FULL)
2644 + loop = HASH_STRENGTH_FULL;
2646 + HASH_FROM_TO(0, loop);
2648 + if (hash_strength > HASH_STRENGTH_FULL) {
2649 + loop = hash_strength - HASH_STRENGTH_FULL;
2650 + HASH_FROM_TO(0, loop);
2653 + return hash;
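/*
 * Reading note on random_sample_hash() above: with hash_strength <=
 * HASH_STRENGTH_FULL it mixes in that many random u32 offsets taken from
 * random_nums[]; with hash_strength == HASH_STRENGTH_MAX
 * (HASH_STRENGTH_FULL + 10) the whole page is hashed once and the first 10
 * random offsets are folded in a second time ("loop-back" hashing).
 * HASH_FROM_DOWN_TO is designed as the inverse of HASH_FROM_TO, which is
 * what lets delta_hash() below adjust an existing hash when the strength
 * decreases.
 */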
2657 +/**
2658 + * It's used when hash strength is adjusted
2660 + * @addr The page's virtual address
2661 + * @from The original hash strength
2662 + * @to The hash strength changed to
2663 + * @hash The hash value generated with the "from" hash strength
2665 + * return the hash value
2666 + */
2667 +static u32 delta_hash(void *addr, int from, int to, u32 hash)
2669 + u32 *key = (u32 *)addr;
2670 + int index, pos; /* make sure they are int type */
2672 + if (to > from) {
2673 + if (from >= HASH_STRENGTH_FULL) {
2674 + from -= HASH_STRENGTH_FULL;
2675 + to -= HASH_STRENGTH_FULL;
2676 + HASH_FROM_TO(from, to);
2677 + } else if (to <= HASH_STRENGTH_FULL) {
2678 + HASH_FROM_TO(from, to);
2679 + } else {
2680 + HASH_FROM_TO(from, HASH_STRENGTH_FULL);
2681 + HASH_FROM_TO(0, to - HASH_STRENGTH_FULL);
2683 + } else {
2684 + if (from <= HASH_STRENGTH_FULL) {
2685 + HASH_FROM_DOWN_TO(from, to);
2686 + } else if (to >= HASH_STRENGTH_FULL) {
2687 + from -= HASH_STRENGTH_FULL;
2688 + to -= HASH_STRENGTH_FULL;
2689 + HASH_FROM_DOWN_TO(from, to);
2690 + } else {
2691 + HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0);
2692 + HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to);
2696 + return hash;
2699 +/**
2701 + * Called when: rshash_pos or rshash_neg is about to overflow or a scan round
2702 + * has finished.
2704 + * return 0 if no page has been scanned since last call, 1 otherwise.
2705 + */
2706 +static inline int encode_benefit(void)
2708 + u64 scanned_delta, pos_delta, neg_delta;
2709 + unsigned long base = benefit.base;
2711 + scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last;
2713 + if (!scanned_delta)
2714 + return 0;
2716 + scanned_delta >>= base;
2717 + pos_delta = rshash_pos >> base;
2718 + neg_delta = rshash_neg >> base;
2720 + if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) ||
2721 + CAN_OVERFLOW_U64(benefit.neg, neg_delta) ||
2722 + CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) {
2723 + benefit.scanned >>= 1;
2724 + benefit.neg >>= 1;
2725 + benefit.pos >>= 1;
2726 + benefit.base++;
2727 + scanned_delta >>= 1;
2728 + pos_delta >>= 1;
2729 + neg_delta >>= 1;
2732 + benefit.pos += pos_delta;
2733 + benefit.neg += neg_delta;
2734 + benefit.scanned += scanned_delta;
2736 + BUG_ON(!benefit.scanned);
2738 + rshash_pos = rshash_neg = 0;
2739 + uksm_pages_scanned_last = uksm_pages_scanned;
2741 + return 1;
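/*
 * Worked example of the rescaling above: with benefit.base == 2, new deltas
 * are first shifted down by 2; if any accumulator would still overflow,
 * every accumulator and the deltas are halved once more and base becomes 3.
 * Halving everything together keeps the relative proportions of pos, neg
 * and scanned intact (up to rounding).
 */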
2744 +static inline void reset_benefit(void)
2746 + benefit.pos = 0;
2747 + benefit.neg = 0;
2748 + benefit.base = 0;
2749 + benefit.scanned = 0;
2752 +static inline void inc_rshash_pos(unsigned long delta)
2754 + if (CAN_OVERFLOW_U64(rshash_pos, delta))
2755 + encode_benefit();
2757 + rshash_pos += delta;
2760 +static inline void inc_rshash_neg(unsigned long delta)
2762 + if (CAN_OVERFLOW_U64(rshash_neg, delta))
2763 + encode_benefit();
2765 + rshash_neg += delta;
2769 +static inline u32 page_hash(struct page *page, unsigned long hash_strength,
2770 + int cost_accounting)
2772 + u32 val;
2773 + unsigned long delta;
2775 + void *addr = kmap_atomic(page);
2777 + val = random_sample_hash(addr, hash_strength);
2778 + kunmap_atomic(addr);
2780 + if (cost_accounting) {
2781 + if (hash_strength < HASH_STRENGTH_FULL)
2782 + delta = HASH_STRENGTH_FULL - hash_strength;
2783 + else
2784 + delta = 0;
2786 + inc_rshash_pos(delta);
2789 + return val;
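/*
 * Cost-accounting note on page_hash() above: when hash_strength is below
 * HASH_STRENGTH_FULL, the work avoided (HASH_STRENGTH_FULL - hash_strength
 * samples) is credited to rshash_pos as time saved by sampling; the cost of
 * hash collisions is charged to rshash_neg elsewhere (see
 * memcmp_pages_with_cost() and check_collision()).
 */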
2792 +static int memcmp_pages_with_cost(struct page *page1, struct page *page2,
2793 + int cost_accounting)
2795 + char *addr1, *addr2;
2796 + int ret;
2798 + addr1 = kmap_atomic(page1);
2799 + addr2 = kmap_atomic(page2);
2800 + ret = memcmp(addr1, addr2, PAGE_SIZE);
2801 + kunmap_atomic(addr2);
2802 + kunmap_atomic(addr1);
2804 + if (cost_accounting)
2805 + inc_rshash_neg(memcmp_cost);
2807 + return ret;
2810 +static inline int pages_identical_with_cost(struct page *page1, struct page *page2)
2812 + return !memcmp_pages_with_cost(page1, page2, 0);
2815 +static inline int is_page_full_zero(struct page *page)
2817 + char *addr;
2818 + int ret;
2820 + addr = kmap_atomic(page);
2821 + ret = is_full_zero(addr, PAGE_SIZE);
2822 + kunmap_atomic(addr);
2824 + return ret;
2827 +static int write_protect_page(struct vm_area_struct *vma, struct page *page,
2828 + pte_t *orig_pte, pte_t *old_pte)
2830 + struct mm_struct *mm = vma->vm_mm;
2831 + struct page_vma_mapped_walk pvmw = {
2832 + .page = page,
2833 + .vma = vma,
2834 + };
2835 + struct mmu_notifier_range range;
2836 + int swapped;
2837 + int err = -EFAULT;
2839 + pvmw.address = page_address_in_vma(page, vma);
2840 + if (pvmw.address == -EFAULT)
2841 + goto out;
2843 + BUG_ON(PageTransCompound(page));
2845 + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, pvmw.address,
2846 + pvmw.address + PAGE_SIZE);
2847 + mmu_notifier_invalidate_range_start(&range);
2849 + if (!page_vma_mapped_walk(&pvmw))
2850 + goto out_mn;
2851 + if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
2852 + goto out_unlock;
2854 + if (old_pte)
2855 + *old_pte = *pvmw.pte;
2857 + if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
2858 + (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) || mm_tlb_flush_pending(mm)) {
2859 + pte_t entry;
2861 + swapped = PageSwapCache(page);
2862 + flush_cache_page(vma, pvmw.address, page_to_pfn(page));
2863 + /*
2864 + * Ok this is tricky: when get_user_pages_fast() runs it doesn't
2865 + * take any lock, therefore the check that we are going to make
2866 + * with the pagecount against the mapcount is racy and
2867 + * O_DIRECT can happen right after the check.
2868 + * So we clear the pte and flush the tlb before the check;
2869 + * this assures us that no O_DIRECT can happen after the check
2870 + * or in the middle of the check.
2871 + */
2872 + entry = ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte);
2873 + /*
2874 + * Check that no O_DIRECT or similar I/O is in progress on the
2875 + * page
2876 + */
2877 + if (page_mapcount(page) + 1 + swapped != page_count(page)) {
2878 + set_pte_at(mm, pvmw.address, pvmw.pte, entry);
2879 + goto out_unlock;
2881 + if (pte_dirty(entry))
2882 + set_page_dirty(page);
2884 + if (pte_protnone(entry))
2885 + entry = pte_mkclean(pte_clear_savedwrite(entry));
2886 + else
2887 + entry = pte_mkclean(pte_wrprotect(entry));
2889 + set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
2891 + *orig_pte = *pvmw.pte;
2892 + err = 0;
2894 +out_unlock:
2895 + page_vma_mapped_walk_done(&pvmw);
2896 +out_mn:
2897 + mmu_notifier_invalidate_range_end(&range);
2898 +out:
2899 + return err;
2902 +#define MERGE_ERR_PGERR 1 /* the page is invalid, cannot continue */
2903 +#define MERGE_ERR_COLLI 2 /* there is a collision */
2904 +#define MERGE_ERR_COLLI_MAX 3 /* collision at the max hash strength */
2905 +#define MERGE_ERR_CHANGED 4 /* the page has changed since last hash */
2908 +/**
2909 + * replace_page - replace page in vma by new ksm page
2910 + * @vma: vma that holds the pte pointing to page
2911 + * @page: the page we are replacing by kpage
2912 + * @kpage: the ksm page we replace page by
2913 + * @orig_pte: the original value of the pte
2915 + * Returns 0 on success, MERGE_ERR_PGERR on failure.
2916 + */
2917 +static int replace_page(struct vm_area_struct *vma, struct page *page,
2918 + struct page *kpage, pte_t orig_pte)
2920 + struct mm_struct *mm = vma->vm_mm;
2921 + struct mmu_notifier_range range;
2922 + pgd_t *pgd;
2923 + p4d_t *p4d;
2924 + pud_t *pud;
2925 + pmd_t *pmd;
2926 + pte_t *ptep;
2927 + spinlock_t *ptl;
2928 + pte_t entry;
2930 + unsigned long addr;
2931 + int err = MERGE_ERR_PGERR;
2933 + addr = page_address_in_vma(page, vma);
2934 + if (addr == -EFAULT)
2935 + goto out;
2937 + pgd = pgd_offset(mm, addr);
2938 + if (!pgd_present(*pgd))
2939 + goto out;
2941 + p4d = p4d_offset(pgd, addr);
2942 + pud = pud_offset(p4d, addr);
2943 + if (!pud_present(*pud))
2944 + goto out;
2946 + pmd = pmd_offset(pud, addr);
2947 + BUG_ON(pmd_trans_huge(*pmd));
2948 + if (!pmd_present(*pmd))
2949 + goto out;
2951 + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
2952 + addr + PAGE_SIZE);
2953 + mmu_notifier_invalidate_range_start(&range);
2955 + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
2956 + if (!pte_same(*ptep, orig_pte)) {
2957 + pte_unmap_unlock(ptep, ptl);
2958 + goto out_mn;
2961 + flush_cache_page(vma, addr, pte_pfn(*ptep));
2962 + ptep_clear_flush_notify(vma, addr, ptep);
2963 + entry = mk_pte(kpage, vma->vm_page_prot);
2965 + /* special treatment is needed for zero_page */
2966 + if ((page_to_pfn(kpage) == uksm_zero_pfn) ||
2967 + (page_to_pfn(kpage) == zero_pfn)) {
2968 + entry = pte_mkspecial(entry);
2969 + dec_mm_counter(mm, MM_ANONPAGES);
2970 + inc_zone_page_state(page, NR_UKSM_ZERO_PAGES);
2971 + } else {
2972 + get_page(kpage);
2973 + page_add_anon_rmap(kpage, vma, addr, false);
2976 + set_pte_at_notify(mm, addr, ptep, entry);
2978 + page_remove_rmap(page, false);
2979 + if (!page_mapped(page))
2980 + try_to_free_swap(page);
2981 + put_page(page);
2983 + pte_unmap_unlock(ptep, ptl);
2984 + err = 0;
2985 +out_mn:
2986 + mmu_notifier_invalidate_range_end(&range);
2987 +out:
2988 + return err;
2992 +/**
2993 + * Fully hash a page with HASH_STRENGTH_MAX and return a non-zero hash value. The
2994 + * zero hash value at HASH_STRENGTH_MAX is used to indicate that its
2995 + * hash_max member has not been calculated.
2997 + * @page The page to be hashed
2998 + * @hash_old The hash value calculated with current hash strength
3000 + * return the new hash value calculated at HASH_STRENGTH_MAX
3001 + */
3002 +static inline u32 page_hash_max(struct page *page, u32 hash_old)
3004 + u32 hash_max = 0;
3005 + void *addr;
3007 + addr = kmap_atomic(page);
3008 + hash_max = delta_hash(addr, hash_strength,
3009 + HASH_STRENGTH_MAX, hash_old);
3011 + kunmap_atomic(addr);
3013 + if (!hash_max)
3014 + hash_max = 1;
3016 + inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
3017 + return hash_max;
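/*
 * Note on the zero remap above: a hash_max of 0 is reserved to mean "not
 * yet computed" (see rmap_item_hash_max() and stable_node_hash_max() later
 * in this file), so a genuine full-strength hash of 0 is stored as 1.
 */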
3021 + * We compare the hash again, to ensure that it is really a hash collision
3022 + * instead of being caused by a page write.
3023 + */
3024 +static inline int check_collision(struct rmap_item *rmap_item,
3025 + u32 hash)
3027 + int err;
3028 + struct page *page = rmap_item->page;
3030 + /* if this rmap_item has already been hash_maxed, then the collision
3031 + * must appear in the second-level rbtree search. In this case we check
3032 + * if its hash_max value has been changed. Otherwise, the collision
3033 + * happens in the first-level rbtree search, so we check against its
3034 + * current hash value.
3035 + */
3036 + if (rmap_item->hash_max) {
3037 + inc_rshash_neg(memcmp_cost);
3038 + inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
3040 + if (rmap_item->hash_max == page_hash_max(page, hash))
3041 + err = MERGE_ERR_COLLI;
3042 + else
3043 + err = MERGE_ERR_CHANGED;
3044 + } else {
3045 + inc_rshash_neg(memcmp_cost + hash_strength);
3047 + if (page_hash(page, hash_strength, 0) == hash)
3048 + err = MERGE_ERR_COLLI;
3049 + else
3050 + err = MERGE_ERR_CHANGED;
3053 + return err;
3056 +/**
3057 + * Try to merge a rmap_item.page with a kpage in stable node. kpage must
3058 + * already be a ksm page.
3060 + * @return 0 if the pages were merged, -EFAULT otherwise.
3061 + */
3062 +static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item,
3063 + struct page *kpage, u32 hash)
3065 + struct vm_area_struct *vma = rmap_item->slot->vma;
3066 + struct mm_struct *mm = vma->vm_mm;
3067 + pte_t orig_pte = __pte(0);
3068 + int err = MERGE_ERR_PGERR;
3069 + struct page *page;
3071 + if (uksm_test_exit(mm))
3072 + goto out;
3074 + page = rmap_item->page;
3076 + if (page == kpage) { /* ksm page forked */
3077 + err = 0;
3078 + goto out;
3081 + /*
3082 + * We need the page lock to read a stable PageSwapCache in
3083 + * write_protect_page(). We use trylock_page() instead of
3084 + * lock_page() because we don't want to wait here - we
3085 + * prefer to continue scanning and merging different pages,
3086 + * then come back to this page when it is unlocked.
3087 + */
3088 + if (!trylock_page(page))
3089 + goto out;
3091 + if (!PageAnon(page) || !PageKsm(kpage))
3092 + goto out_unlock;
3094 + if (PageTransCompound(page)) {
3095 + err = split_huge_page(page);
3096 + if (err)
3097 + goto out_unlock;
3100 + /*
3101 + * If this anonymous page is mapped only here, its pte may need
3102 + * to be write-protected. If it's mapped elsewhere, all of its
3103 + * ptes are necessarily already write-protected. But in either
3104 + * case, we need to lock and check page_count is not raised.
3105 + */
3106 + if (write_protect_page(vma, page, &orig_pte, NULL) == 0) {
3107 + if (pages_identical_with_cost(page, kpage))
3108 + err = replace_page(vma, page, kpage, orig_pte);
3109 + else
3110 + err = check_collision(rmap_item, hash);
3113 + if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
3114 + munlock_vma_page(page);
3115 + if (!PageMlocked(kpage)) {
3116 + unlock_page(page);
3117 + lock_page(kpage);
3118 + mlock_vma_page(kpage);
3119 + page = kpage; /* for final unlock */
3123 +out_unlock:
3124 + unlock_page(page);
3125 +out:
3126 + return err;
3131 +/**
3132 + * If two pages fail to merge in try_to_merge_two_pages, then we have a chance
3133 + * to restore a page mapping that has been changed in try_to_merge_two_pages.
3135 + * @return 0 on success.
3136 + */
3137 +static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr,
3138 + pte_t orig_pte, pte_t wprt_pte)
3140 + struct mm_struct *mm = vma->vm_mm;
3141 + pgd_t *pgd;
3142 + p4d_t *p4d;
3143 + pud_t *pud;
3144 + pmd_t *pmd;
3145 + pte_t *ptep;
3146 + spinlock_t *ptl;
3148 + int err = -EFAULT;
3150 + pgd = pgd_offset(mm, addr);
3151 + if (!pgd_present(*pgd))
3152 + goto out;
3154 + p4d = p4d_offset(pgd, addr);
3155 + pud = pud_offset(p4d, addr);
3156 + if (!pud_present(*pud))
3157 + goto out;
3159 + pmd = pmd_offset(pud, addr);
3160 + if (!pmd_present(*pmd))
3161 + goto out;
3163 + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
3164 + if (!pte_same(*ptep, wprt_pte)) {
3165 + /* already copied, let it be */
3166 + pte_unmap_unlock(ptep, ptl);
3167 + goto out;
3170 + /*
3171 + * Good boy, still here. While we still hold the ksm page, it does not
3172 + * return to the free page pool, so there is no way that a pte was changed
3173 + * to another page and then changed back to this page. And remember that ksm
3174 + * pages are not reused in do_wp_page(). So it's safe to restore the original
3175 + * pte.
3176 + */
3177 + flush_cache_page(vma, addr, pte_pfn(*ptep));
3178 + ptep_clear_flush_notify(vma, addr, ptep);
3179 + set_pte_at_notify(mm, addr, ptep, orig_pte);
3181 + pte_unmap_unlock(ptep, ptl);
3182 + err = 0;
3183 +out:
3184 + return err;
3187 +/**
3188 + * try_to_merge_two_pages() - take two identical pages and prepare
3189 + * them to be merged into one page (rmap_item->page)
3191 + * @return 0 if we successfully merged two identical pages into
3192 + * one ksm page. MERGE_ERR_COLLI if it's only a hash collision
3193 + * search in rbtree. MERGE_ERR_CHANGED if rmap_item has been
3194 + * changed since it was hashed. MERGE_ERR_PGERR otherwise.
3196 + */
3197 +static int try_to_merge_two_pages(struct rmap_item *rmap_item,
3198 + struct rmap_item *tree_rmap_item,
3199 + u32 hash)
3201 + pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0);
3202 + pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0);
3203 + struct vm_area_struct *vma1 = rmap_item->slot->vma;
3204 + struct vm_area_struct *vma2 = tree_rmap_item->slot->vma;
3205 + struct page *page = rmap_item->page;
3206 + struct page *tree_page = tree_rmap_item->page;
3207 + int err = MERGE_ERR_PGERR;
3208 + struct address_space *saved_mapping;
3211 + if (rmap_item->page == tree_rmap_item->page)
3212 + goto out;
3214 + if (!trylock_page(page))
3215 + goto out;
3217 + if (!PageAnon(page))
3218 + goto out_unlock;
3220 + if (PageTransCompound(page)) {
3221 + err = split_huge_page(page);
3222 + if (err)
3223 + goto out_unlock;
3226 + if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) {
3227 + unlock_page(page);
3228 + goto out;
3231 + /*
3232 + * While we hold page lock, upgrade page from
3233 + * PageAnon+anon_vma to PageKsm+NULL stable_node:
3234 + * stable_tree_insert() will update stable_node.
3235 + */
3236 + saved_mapping = page->mapping;
3237 + set_page_stable_node(page, NULL);
3238 + mark_page_accessed(page);
3239 + if (!PageDirty(page))
3240 + SetPageDirty(page);
3242 + unlock_page(page);
3244 + if (!trylock_page(tree_page))
3245 + goto restore_out;
3247 + if (!PageAnon(tree_page)) {
3248 + unlock_page(tree_page);
3249 + goto restore_out;
3252 + if (PageTransCompound(tree_page)) {
3253 + err = split_huge_page(tree_page);
3254 + if (err) {
3255 + unlock_page(tree_page);
3256 + goto restore_out;
3260 + if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) {
3261 + unlock_page(tree_page);
3262 + goto restore_out;
3265 + if (pages_identical_with_cost(page, tree_page)) {
3266 + err = replace_page(vma2, tree_page, page, wprt_pte2);
3267 + if (err) {
3268 + unlock_page(tree_page);
3269 + goto restore_out;
3272 + if ((vma2->vm_flags & VM_LOCKED)) {
3273 + munlock_vma_page(tree_page);
3274 + if (!PageMlocked(page)) {
3275 + unlock_page(tree_page);
3276 + lock_page(page);
3277 + mlock_vma_page(page);
3278 + tree_page = page; /* for final unlock */
3282 + unlock_page(tree_page);
3284 + goto out; /* success */
3286 + } else {
3287 + if (tree_rmap_item->hash_max &&
3288 + tree_rmap_item->hash_max == rmap_item->hash_max) {
3289 + err = MERGE_ERR_COLLI_MAX;
3290 + } else if (page_hash(page, hash_strength, 0) ==
3291 + page_hash(tree_page, hash_strength, 0)) {
3292 + inc_rshash_neg(memcmp_cost + hash_strength * 2);
3293 + err = MERGE_ERR_COLLI;
3294 + } else {
3295 + err = MERGE_ERR_CHANGED;
3298 + unlock_page(tree_page);
3301 +restore_out:
3302 + lock_page(page);
3303 + if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item),
3304 + orig_pte1, wprt_pte1))
3305 + page->mapping = saved_mapping;
3307 +out_unlock:
3308 + unlock_page(page);
3309 +out:
3310 + return err;
3313 +static inline int hash_cmp(u32 new_val, u32 node_val)
3315 + if (new_val > node_val)
3316 + return 1;
3317 + else if (new_val < node_val)
3318 + return -1;
3319 + else
3320 + return 0;
3323 +static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash)
3325 + u32 hash_max = item->hash_max;
3327 + if (!hash_max) {
3328 + hash_max = page_hash_max(item->page, hash);
3330 + item->hash_max = hash_max;
3333 + return hash_max;
3338 +/**
3339 + * stable_tree_search() - search the stable tree for a page
3341 + * @item: the rmap_item we are comparing with
3342 + * @hash: the hash value of this item->page already calculated
3344 + * @return the page we have found, NULL otherwise. The page returned has
3345 + * been gotten (its refcount has been raised).
3346 + */
3347 +static struct page *stable_tree_search(struct rmap_item *item, u32 hash)
3349 + struct rb_node *node = root_stable_treep->rb_node;
3350 + struct tree_node *tree_node;
3351 + unsigned long hash_max;
3352 + struct page *page = item->page;
3353 + struct stable_node *stable_node;
3355 + stable_node = page_stable_node(page);
3356 + if (stable_node) {
3357 + /* ksm page forked, that is
3358 + * if (PageKsm(page) && !in_stable_tree(rmap_item))
3359 + * it's actually gotten once outside.
3360 + */
3361 + get_page(page);
3362 + return page;
3365 + while (node) {
3366 + int cmp;
3368 + tree_node = rb_entry(node, struct tree_node, node);
3370 + cmp = hash_cmp(hash, tree_node->hash);
3372 + if (cmp < 0)
3373 + node = node->rb_left;
3374 + else if (cmp > 0)
3375 + node = node->rb_right;
3376 + else
3377 + break;
3380 + if (!node)
3381 + return NULL;
3383 + if (tree_node->count == 1) {
3384 + stable_node = rb_entry(tree_node->sub_root.rb_node,
3385 + struct stable_node, node);
3386 + BUG_ON(!stable_node);
3388 + goto get_page_out;
3391 + /*
3392 + * ok, we have to search the second
3393 + * level subtree, hash the page to a
3394 + * full strength.
3395 + */
3396 + node = tree_node->sub_root.rb_node;
3397 + BUG_ON(!node);
3398 + hash_max = rmap_item_hash_max(item, hash);
3400 + while (node) {
3401 + int cmp;
3403 + stable_node = rb_entry(node, struct stable_node, node);
3405 + cmp = hash_cmp(hash_max, stable_node->hash_max);
3407 + if (cmp < 0)
3408 + node = node->rb_left;
3409 + else if (cmp > 0)
3410 + node = node->rb_right;
3411 + else
3412 + goto get_page_out;
3415 + return NULL;
3417 +get_page_out:
3418 + page = get_uksm_page(stable_node, 1, 1);
3419 + return page;
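/*
 * Structure note on the search above: the stable tree is two-level. The
 * first-level rbtree of tree_nodes is keyed by the sampled hash; when a
 * tree_node holds more than one stable_node, its sub_root rbtree is keyed
 * by the full-strength hash_max, computed lazily via rmap_item_hash_max().
 */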
3422 +static int try_merge_rmap_item(struct rmap_item *item,
3423 + struct page *kpage,
3424 + struct page *tree_page)
3426 + struct vm_area_struct *vma = item->slot->vma;
3427 + struct page_vma_mapped_walk pvmw = {
3428 + .page = kpage,
3429 + .vma = vma,
3430 + };
3432 + pvmw.address = get_rmap_addr(item);
3433 + if (!page_vma_mapped_walk(&pvmw))
3434 + return 0;
3436 + if (pte_write(*pvmw.pte)) {
3437 + /* has changed, abort! */
3438 + page_vma_mapped_walk_done(&pvmw);
3439 + return 0;
3442 + get_page(tree_page);
3443 + page_add_anon_rmap(tree_page, vma, pvmw.address, false);
3445 + flush_cache_page(vma, pvmw.address, page_to_pfn(kpage));
3446 + ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte);
3447 + set_pte_at_notify(vma->vm_mm, pvmw.address, pvmw.pte,
3448 + mk_pte(tree_page, vma->vm_page_prot));
3450 + page_remove_rmap(kpage, false);
3451 + put_page(kpage);
3453 + page_vma_mapped_walk_done(&pvmw);
3455 + return 1;
3458 +/**
3459 + * try_to_merge_with_stable_page() - when two rmap_items need to be inserted
3460 + * into the stable tree, but the page was found to be identical to a stable ksm
3461 + * page; this is the last chance to merge them into one.
3463 + * @item1: the rmap_item holding the page which we wanted to insert
3464 + * into stable tree.
3466 + * @item2: the other rmap_item we found during the unstable tree search
3466 + * @oldpage: the page currently mapped by the two rmap_items
3467 + * @tree_page: the page we found identical in stable tree node
3468 + * @success1: return if item1 is successfully merged
3469 + * @success2: return if item2 is successfully merged
3470 + */
3471 +static void try_merge_with_stable(struct rmap_item *item1,
3472 + struct rmap_item *item2,
3473 + struct page **kpage,
3474 + struct page *tree_page,
3475 + int *success1, int *success2)
3477 + struct vm_area_struct *vma1 = item1->slot->vma;
3478 + struct vm_area_struct *vma2 = item2->slot->vma;
3479 + *success1 = 0;
3480 + *success2 = 0;
3482 + if (unlikely(*kpage == tree_page)) {
3483 + /* I don't think this can really happen */
3484 + pr_warn("UKSM: unexpected condition detected in "
3485 + "%s -- *kpage == tree_page !\n", __func__);
3486 + *success1 = 1;
3487 + *success2 = 1;
3488 + return;
3491 + if (!PageAnon(*kpage) || !PageKsm(*kpage))
3492 + goto failed;
3494 + if (!trylock_page(tree_page))
3495 + goto failed;
3497 + /* If the oldpage is still ksm and still pointed
3498 + * to in the right place, and still write protected,
3499 + * we are confident it's not changed, no need to
3500 + * memcmp anymore.
3501 + * Beware: we cannot take nested pte locks,
3502 + * deadlock risk.
3503 + */
3504 + if (!try_merge_rmap_item(item1, *kpage, tree_page))
3505 + goto unlock_failed;
3507 + /* ok, then vma2, remind that pte1 already set */
3508 + if (!try_merge_rmap_item(item2, *kpage, tree_page))
3509 + goto success_1;
3511 + *success2 = 1;
3512 +success_1:
3513 + *success1 = 1;
3516 + if ((*success1 && vma1->vm_flags & VM_LOCKED) ||
3517 + (*success2 && vma2->vm_flags & VM_LOCKED)) {
3518 + munlock_vma_page(*kpage);
3519 + if (!PageMlocked(tree_page))
3520 + mlock_vma_page(tree_page);
3523 + /*
3524 + * We do not need oldpage any more in the caller, so can break the lock
3525 + * now.
3526 + */
3527 + unlock_page(*kpage);
3528 + *kpage = tree_page; /* Get unlocked outside. */
3529 + return;
3531 +unlock_failed:
3532 + unlock_page(tree_page);
3533 +failed:
3534 + return;
3537 +static inline void stable_node_hash_max(struct stable_node *node,
3538 + struct page *page, u32 hash)
3540 + u32 hash_max = node->hash_max;
3542 + if (!hash_max) {
3543 + hash_max = page_hash_max(page, hash);
3544 + node->hash_max = hash_max;
3548 +static inline
3549 +struct stable_node *new_stable_node(struct tree_node *tree_node,
3550 + struct page *kpage, u32 hash_max)
3552 + struct stable_node *new_stable_node;
3554 + new_stable_node = alloc_stable_node();
3555 + if (!new_stable_node)
3556 + return NULL;
3558 + new_stable_node->kpfn = page_to_pfn(kpage);
3559 + new_stable_node->hash_max = hash_max;
3560 + new_stable_node->tree_node = tree_node;
3561 + set_page_stable_node(kpage, new_stable_node);
3563 + return new_stable_node;
3566 +static inline
3567 +struct stable_node *first_level_insert(struct tree_node *tree_node,
3568 + struct rmap_item *rmap_item,
3569 + struct rmap_item *tree_rmap_item,
3570 + struct page **kpage, u32 hash,
3571 + int *success1, int *success2)
3573 + int cmp;
3574 + struct page *tree_page;
3575 + u32 hash_max = 0;
3576 + struct stable_node *stable_node, *new_snode;
3577 + struct rb_node *parent = NULL, **new;
3579 + /* this tree node contains no sub-tree yet */
3580 + stable_node = rb_entry(tree_node->sub_root.rb_node,
3581 + struct stable_node, node);
3583 + tree_page = get_uksm_page(stable_node, 1, 0);
3584 + if (tree_page) {
3585 + cmp = memcmp_pages_with_cost(*kpage, tree_page, 1);
3586 + if (!cmp) {
3587 + try_merge_with_stable(rmap_item, tree_rmap_item, kpage,
3588 + tree_page, success1, success2);
3589 + put_page(tree_page);
3590 + if (!*success1 && !*success2)
3591 + goto failed;
3593 + return stable_node;
3595 + } else {
3596 + /*
3597 + * Collision in the first level; try to create a subtree.
3598 + * A new node needs to be created.
3599 + */
3600 + put_page(tree_page);
3602 + stable_node_hash_max(stable_node, tree_page,
3603 + tree_node->hash);
3604 + hash_max = rmap_item_hash_max(rmap_item, hash);
3605 + cmp = hash_cmp(hash_max, stable_node->hash_max);
3607 + parent = &stable_node->node;
3608 + if (cmp < 0)
3609 + new = &parent->rb_left;
3610 + else if (cmp > 0)
3611 + new = &parent->rb_right;
3612 + else
3613 + goto failed;
3616 + } else {
3617 + /* the only stable_node was deleted, we reuse its tree_node.
3618 + */
3619 + parent = NULL;
3620 + new = &tree_node->sub_root.rb_node;
3623 + new_snode = new_stable_node(tree_node, *kpage, hash_max);
3624 + if (!new_snode)
3625 + goto failed;
3627 + rb_link_node(&new_snode->node, parent, new);
3628 + rb_insert_color(&new_snode->node, &tree_node->sub_root);
3629 + tree_node->count++;
3630 + *success1 = *success2 = 1;
3632 + return new_snode;
3634 +failed:
3635 + return NULL;
3638 +static inline
3639 +struct stable_node *stable_subtree_insert(struct tree_node *tree_node,
3640 + struct rmap_item *rmap_item,
3641 + struct rmap_item *tree_rmap_item,
3642 + struct page **kpage, u32 hash,
3643 + int *success1, int *success2)
3645 + struct page *tree_page;
3646 + u32 hash_max;
3647 + struct stable_node *stable_node, *new_snode;
3648 + struct rb_node *parent, **new;
3650 +research:
3651 + parent = NULL;
3652 + new = &tree_node->sub_root.rb_node;
3653 + BUG_ON(!*new);
3654 + hash_max = rmap_item_hash_max(rmap_item, hash);
3655 + while (*new) {
3656 + int cmp;
3658 + stable_node = rb_entry(*new, struct stable_node, node);
3660 + cmp = hash_cmp(hash_max, stable_node->hash_max);
3662 + if (cmp < 0) {
3663 + parent = *new;
3664 + new = &parent->rb_left;
3665 + } else if (cmp > 0) {
3666 + parent = *new;
3667 + new = &parent->rb_right;
3668 + } else {
3669 + tree_page = get_uksm_page(stable_node, 1, 0);
3670 + if (tree_page) {
3671 + cmp = memcmp_pages_with_cost(*kpage, tree_page, 1);
3672 + if (!cmp) {
3673 + try_merge_with_stable(rmap_item,
3674 + tree_rmap_item, kpage,
3675 + tree_page, success1, success2);
3677 + put_page(tree_page);
3678 + if (!*success1 && !*success2)
3679 + goto failed;
3680 + /*
3681 + * successfully merged with a stable
3682 + * node
3683 + */
3684 + return stable_node;
3685 + } else {
3686 + put_page(tree_page);
3687 + goto failed;
3689 + } else {
3690 + /*
3691 + * the stable node may have been deleted,
3692 + * and the subtree may have been
3693 + * restructured; we cannot
3694 + * continue, so search it again.
3695 + */
3696 + if (tree_node->count) {
3697 + goto research;
3698 + } else {
3699 + /* reuse the tree node*/
3700 + parent = NULL;
3701 + new = &tree_node->sub_root.rb_node;
3707 + new_snode = new_stable_node(tree_node, *kpage, hash_max);
3708 + if (!new_snode)
3709 + goto failed;
3711 + rb_link_node(&new_snode->node, parent, new);
3712 + rb_insert_color(&new_snode->node, &tree_node->sub_root);
3713 + tree_node->count++;
3714 + *success1 = *success2 = 1;
3716 + return new_snode;
3718 +failed:
3719 + return NULL;
3723 +/**
3724 + * stable_tree_insert() - try to insert a merged page in unstable tree to
3725 + * the stable tree
3727 + * @kpage: the page to be inserted
3728 + * @hash: the current hash of this page
3729 + * @rmap_item: the rmap_item being scanned
3730 + * @tree_rmap_item: the rmap_item found on unstable tree
3731 + * @success1: return if rmap_item is merged
3732 + * @success2: return if tree_rmap_item is merged
3734 + * @return the stable_node on stable tree if at least one
3735 + * rmap_item is inserted into stable tree, NULL
3736 + * otherwise.
3737 + */
3738 +static struct stable_node *
3739 +stable_tree_insert(struct page **kpage, u32 hash,
3740 + struct rmap_item *rmap_item,
3741 + struct rmap_item *tree_rmap_item,
3742 + int *success1, int *success2)
3744 + struct rb_node **new = &root_stable_treep->rb_node;
3745 + struct rb_node *parent = NULL;
3746 + struct stable_node *stable_node;
3747 + struct tree_node *tree_node;
3748 + u32 hash_max = 0;
3750 + *success1 = *success2 = 0;
3752 + while (*new) {
3753 + int cmp;
3755 + tree_node = rb_entry(*new, struct tree_node, node);
3757 + cmp = hash_cmp(hash, tree_node->hash);
3759 + if (cmp < 0) {
3760 + parent = *new;
3761 + new = &parent->rb_left;
3762 + } else if (cmp > 0) {
3763 + parent = *new;
3764 + new = &parent->rb_right;
3765 + } else
3766 + break;
3769 + if (*new) {
3770 + if (tree_node->count == 1) {
3771 + stable_node = first_level_insert(tree_node, rmap_item,
3772 + tree_rmap_item, kpage,
3773 + hash, success1, success2);
3774 + } else {
3775 + stable_node = stable_subtree_insert(tree_node,
3776 + rmap_item, tree_rmap_item, kpage,
3777 + hash, success1, success2);
3779 + } else {
3781 + /* no tree node found */
3782 + tree_node = alloc_tree_node(stable_tree_node_listp);
3783 + if (!tree_node) {
3784 + stable_node = NULL;
3785 + goto out;
3788 + stable_node = new_stable_node(tree_node, *kpage, hash_max);
3789 + if (!stable_node) {
3790 + free_tree_node(tree_node);
3791 + goto out;
3794 + tree_node->hash = hash;
3795 + rb_link_node(&tree_node->node, parent, new);
3796 + rb_insert_color(&tree_node->node, root_stable_treep);
3797 + parent = NULL;
3798 + new = &tree_node->sub_root.rb_node;
3800 + rb_link_node(&stable_node->node, parent, new);
3801 + rb_insert_color(&stable_node->node, &tree_node->sub_root);
3802 + tree_node->count++;
3803 + *success1 = *success2 = 1;
3806 +out:
3807 + return stable_node;
3811 +/**
3812 + * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem
3814 + * @return 0 on success, -EBUSY if unable to lock the mmap_sem,
3815 + * -EINVAL if the page mapping has been changed.
3816 + */
3817 +static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item)
3819 + int err;
3821 + err = get_mergeable_page_lock_mmap(tree_rmap_item);
3823 + if (err == -EINVAL) {
3824 + /* its page map has been changed, remove it */
3825 + remove_rmap_item_from_tree(tree_rmap_item);
3828 + /* The page is gotten and mmap_sem is locked now. */
3829 + return err;
3833 +/**
3834 + * unstable_tree_search_insert() - search an unstable tree rmap_item with the
3835 + * same hash value. Get its page and trylock the mmap_sem
3836 + */
3837 +static inline
3838 +struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
3839 + u32 hash)
3842 + struct rb_node **new = &root_unstable_tree.rb_node;
3843 + struct rb_node *parent = NULL;
3844 + struct tree_node *tree_node;
3845 + u32 hash_max;
3846 + struct rmap_item *tree_rmap_item;
3848 + while (*new) {
3849 + int cmp;
3851 + tree_node = rb_entry(*new, struct tree_node, node);
3853 + cmp = hash_cmp(hash, tree_node->hash);
3855 + if (cmp < 0) {
3856 + parent = *new;
3857 + new = &parent->rb_left;
3858 + } else if (cmp > 0) {
3859 + parent = *new;
3860 + new = &parent->rb_right;
3861 + } else
3862 + break;
3865 + if (*new) {
3866 + /* got the tree_node */
3867 + if (tree_node->count == 1) {
3868 + tree_rmap_item = rb_entry(tree_node->sub_root.rb_node,
3869 + struct rmap_item, node);
3870 + BUG_ON(!tree_rmap_item);
3872 + goto get_page_out;
3875 + /* well, search the collision subtree */
3876 + new = &tree_node->sub_root.rb_node;
3877 + BUG_ON(!*new);
3878 + hash_max = rmap_item_hash_max(rmap_item, hash);
3880 + while (*new) {
3881 + int cmp;
3883 + tree_rmap_item = rb_entry(*new, struct rmap_item,
3884 + node);
3886 + cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
3887 + parent = *new;
3888 + if (cmp < 0)
3889 + new = &parent->rb_left;
3890 + else if (cmp > 0)
3891 + new = &parent->rb_right;
3892 + else
3893 + goto get_page_out;
3895 + } else {
3896 + /* alloc a new tree_node */
3897 + tree_node = alloc_tree_node(&unstable_tree_node_list);
3898 + if (!tree_node)
3899 + return NULL;
3901 + tree_node->hash = hash;
3902 + rb_link_node(&tree_node->node, parent, new);
3903 + rb_insert_color(&tree_node->node, &root_unstable_tree);
3904 + parent = NULL;
3905 + new = &tree_node->sub_root.rb_node;
3908 + /* not found even in the sub-tree */
3909 + rmap_item->tree_node = tree_node;
3910 + rmap_item->address |= UNSTABLE_FLAG;
3911 + rmap_item->hash_round = uksm_hash_round;
3912 + rb_link_node(&rmap_item->node, parent, new);
3913 + rb_insert_color(&rmap_item->node, &tree_node->sub_root);
3915 + uksm_pages_unshared++;
3916 + return NULL;
3918 +get_page_out:
3919 + if (tree_rmap_item->page == rmap_item->page)
3920 + return NULL;
3922 + if (get_tree_rmap_item_page(tree_rmap_item))
3923 + return NULL;
3925 + return tree_rmap_item;
3928 +static void hold_anon_vma(struct rmap_item *rmap_item,
3929 + struct anon_vma *anon_vma)
3931 + rmap_item->anon_vma = anon_vma;
3932 + get_anon_vma(anon_vma);
3936 +/**
3937 + * stable_tree_append() - append a rmap_item to a stable node. Deduplication
3938 + * ratio statistics are updated in this function.
3940 + */
3941 +static void stable_tree_append(struct rmap_item *rmap_item,
3942 + struct stable_node *stable_node, int logdedup)
3944 + struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_cont = NULL;
3945 + unsigned long key = (unsigned long)rmap_item->slot;
3946 + unsigned long factor = rmap_item->slot->rung->step;
3948 + BUG_ON(!stable_node);
3949 + rmap_item->address |= STABLE_FLAG;
3951 + if (hlist_empty(&stable_node->hlist)) {
3952 + uksm_pages_shared++;
3953 + goto node_vma_new;
3954 + } else {
3955 + uksm_pages_sharing++;
3958 + hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) {
3959 + if (node_vma->key >= key)
3960 + break;
3962 + if (logdedup) {
3963 + node_vma->slot->pages_bemerged += factor;
3964 + if (list_empty(&node_vma->slot->dedup_list))
3965 + list_add(&node_vma->slot->dedup_list,
3966 + &vma_slot_dedup);
3970 + if (node_vma) {
3971 + if (node_vma->key == key) {
3972 + node_vma_cont = hlist_entry_safe(node_vma->hlist.next, struct node_vma, hlist);
3973 + goto node_vma_ok;
3974 + } else if (node_vma->key > key) {
3975 + node_vma_cont = node_vma;
3979 +node_vma_new:
3980 + /* no same vma already in node, alloc a new node_vma */
3981 + new_node_vma = alloc_node_vma();
3982 + BUG_ON(!new_node_vma);
3983 + new_node_vma->head = stable_node;
3984 + new_node_vma->slot = rmap_item->slot;
3986 + if (!node_vma) {
3987 + hlist_add_head(&new_node_vma->hlist, &stable_node->hlist);
3988 + } else if (node_vma->key != key) {
3989 + if (node_vma->key < key)
3990 + hlist_add_behind(&new_node_vma->hlist, &node_vma->hlist);
3991 + else {
3992 + hlist_add_before(&new_node_vma->hlist,
3993 + &node_vma->hlist);
3997 + node_vma = new_node_vma;
3999 +node_vma_ok: /* ok, ready to add to the list */
4000 + rmap_item->head = node_vma;
4001 + hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist);
4002 + hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma);
4003 + if (logdedup) {
4004 + rmap_item->slot->pages_merged++;
4005 + if (node_vma_cont) {
4006 + node_vma = node_vma_cont;
4007 + hlist_for_each_entry_continue(node_vma, hlist) {
4008 + node_vma->slot->pages_bemerged += factor;
4009 + if (list_empty(&node_vma->slot->dedup_list))
4010 + list_add(&node_vma->slot->dedup_list,
4011 + &vma_slot_dedup);
4018 + * We use break_ksm to break COW on a ksm page: it's a stripped down
4020 + * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1)
4021 + * put_page(page);
4023 + * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
4024 + * in case the application has unmapped and remapped mm,addr meanwhile.
4025 + * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
4026 + * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
4027 + */
4028 +static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
4030 + struct page *page;
4031 + int ret = 0;
4033 + do {
4034 + cond_resched();
4035 + page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
4036 + if (IS_ERR_OR_NULL(page))
4037 + break;
4038 + if (PageKsm(page)) {
4039 + ret = handle_mm_fault(vma, addr,
4040 + FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
4041 + NULL);
4042 + } else
4043 + ret = VM_FAULT_WRITE;
4044 + put_page(page);
4045 + } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
4046 + /*
4047 + * We must loop because handle_mm_fault() may back out if there's
4048 + * any difficulty e.g. if pte accessed bit gets updated concurrently.
4050 + * VM_FAULT_WRITE is what we have been hoping for: it indicates that
4051 + * COW has been broken, even if the vma does not permit VM_WRITE;
4052 + * but note that a concurrent fault might break PageKsm for us.
4054 + * VM_FAULT_SIGBUS could occur if we race with truncation of the
4055 + * backing file, which also invalidates anonymous pages: that's
4056 + * okay, that truncation will have unmapped the PageKsm for us.
4058 + * VM_FAULT_OOM: at the time of writing (late July 2009), setting
4059 + * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
4060 + * current task has TIF_MEMDIE set, and will be OOM killed on return
4061 + * to user; and ksmd, having no mm, would never be chosen for that.
4063 + * But if the mm is in a limited mem_cgroup, then the fault may fail
4064 + * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
4065 + * even ksmd can fail in this way - though it's usually breaking ksm
4066 + * just to undo a merge it made a moment before, so unlikely to oom.
4068 + * That's a pity: we might therefore have more kernel pages allocated
4069 + * than we're counting as nodes in the stable tree; but uksm_do_scan
4070 + * will retry to break_cow on each pass, so should recover the page
4071 + * in due course. The important thing is to not let VM_MERGEABLE
4072 + * be cleared while any such pages might remain in the area.
4073 + */
4074 + return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
4077 +static void break_cow(struct rmap_item *rmap_item)
4079 + struct vm_area_struct *vma = rmap_item->slot->vma;
4080 + struct mm_struct *mm = vma->vm_mm;
4081 + unsigned long addr = get_rmap_addr(rmap_item);
4083 + if (uksm_test_exit(mm))
4084 + goto out;
4086 + break_ksm(vma, addr);
4087 +out:
4088 + return;
4092 + * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
4093 + * than check every pte of a given vma, the locking doesn't quite work for
4094 + * that - an rmap_item is assigned to the stable tree after inserting ksm
4095 + * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
4096 + * rmap_items from parent to child at fork time (so as not to waste time
4097 + * if exit comes before the next scan reaches it).
4099 + * Similarly, although we'd like to remove rmap_items (so updating counts
4100 + * and freeing memory) when unmerging an area, it's easier to leave that
4101 + * to the next pass of ksmd - consider, for example, how ksmd might be
4102 + * in cmp_and_merge_page on one of the rmap_items we would be removing.
4103 + */
4104 +inline int unmerge_uksm_pages(struct vm_area_struct *vma,
4105 + unsigned long start, unsigned long end)
4107 + unsigned long addr;
4108 + int err = 0;
4110 + for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
4111 + if (uksm_test_exit(vma->vm_mm))
4112 + break;
4113 + if (signal_pending(current))
4114 + err = -ERESTARTSYS;
4115 + else
4116 + err = break_ksm(vma, addr);
4118 + return err;
4121 +static inline void inc_uksm_pages_scanned(void)
4123 + u64 delta;
4126 + if (uksm_pages_scanned == U64_MAX) {
4127 + encode_benefit();
4129 + delta = uksm_pages_scanned >> pages_scanned_base;
4131 + if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) {
4132 + pages_scanned_stored >>= 1;
4133 + delta >>= 1;
4134 + pages_scanned_base++;
4137 + pages_scanned_stored += delta;
4139 + uksm_pages_scanned = uksm_pages_scanned_last = 0;
4142 + uksm_pages_scanned++;
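
The folding above keeps a lifetime total that can never overflow: the true count is approximately pages_scanned_stored << pages_scanned_base, and whenever adding a new round's contribution would overflow, both the stored value and the delta are halved and the base bumped. A minimal userspace sketch of that bookkeeping, with hypothetical names and no kernel dependencies:

    #include <stdint.h>
    #include <stdio.h>

    /* lifetime total is approximately: stored << base */
    static uint64_t stored, base;

    static void fold_round(uint64_t round_pages)
    {
        uint64_t delta = round_pages >> base;

        while (delta && stored > UINT64_MAX - delta) {
            stored >>= 1;    /* adding would overflow: halve both sides       */
            delta  >>= 1;
            base++;          /* and remember the lost precision in the shift  */
        }
        stored += delta;
    }

    int main(void)
    {
        fold_round(1000000);
        fold_round(2500000);
        printf("~%llu pages scanned (stored=%llu, base=%llu)\n",
               (unsigned long long)(stored << base),
               (unsigned long long)stored, (unsigned long long)base);
        return 0;
    }
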
4145 +static inline int find_zero_page_hash(int strength, u32 hash)
4147 + return (zero_hash_table[strength] == hash);
4150 +static
4151 +int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page)
4153 + struct page *zero_page = empty_uksm_zero_page;
4154 + struct mm_struct *mm = vma->vm_mm;
4155 + pte_t orig_pte = __pte(0);
4156 + int err = -EFAULT;
4158 + if (uksm_test_exit(mm))
4159 + goto out;
4161 + if (!trylock_page(page))
4162 + goto out;
4164 + if (!PageAnon(page))
4165 + goto out_unlock;
4167 + if (PageTransCompound(page)) {
4168 + err = split_huge_page(page);
4169 + if (err)
4170 + goto out_unlock;
4173 + if (write_protect_page(vma, page, &orig_pte, 0) == 0) {
4174 + if (is_page_full_zero(page))
4175 + err = replace_page(vma, page, zero_page, orig_pte);
4178 +out_unlock:
4179 + unlock_page(page);
4180 +out:
4181 + return err;
4185 + * cmp_and_merge_page() - first see if page can be merged into the stable
4186 + * tree; if not, compare hash to previous and if it's the same, see if page
4187 + * can be inserted into the unstable tree, or merged with a page already there
4188 + * and both transferred to the stable tree.
4190 + * @page: the page for which we are searching an identical page.
4191 + * @rmap_item: the reverse mapping into the virtual address of this page
4192 + */
4193 +static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash)
4195 + struct rmap_item *tree_rmap_item;
4196 + struct page *page;
4197 + struct page *kpage = NULL;
4198 + u32 hash_max;
4199 + int err;
4200 + unsigned int success1, success2;
4201 + struct stable_node *snode;
4202 + int cmp;
4203 + struct rb_node *parent = NULL, **new;
4205 + remove_rmap_item_from_tree(rmap_item);
4206 + page = rmap_item->page;
4208 + /* We first start with searching the page inside the stable tree */
4209 + kpage = stable_tree_search(rmap_item, hash);
4210 + if (kpage) {
4211 + err = try_to_merge_with_uksm_page(rmap_item, kpage,
4212 + hash);
4213 + if (!err) {
4214 + /*
4215 + * The page was successfully merged, add
4216 + * its rmap_item to the stable tree.
4217 + * page lock is needed because it's
4218 + * racing with try_to_unmap_ksm(), etc.
4219 + */
4220 + lock_page(kpage);
4221 + snode = page_stable_node(kpage);
4222 + stable_tree_append(rmap_item, snode, 1);
4223 + unlock_page(kpage);
4224 + put_page(kpage);
4225 + return; /* success */
4227 + put_page(kpage);
4229 + /*
4230 + * if it's a collision and it has been searched in the sub-rbtree
4231 + * (hash_max != 0), we want to abort, because if it is
4232 + * successfully merged in the unstable tree, the collision tends to
4233 + * happen again.
4234 + */
4235 + if (err == MERGE_ERR_COLLI && rmap_item->hash_max)
4236 + return;
4239 + tree_rmap_item =
4240 + unstable_tree_search_insert(rmap_item, hash);
4241 + if (tree_rmap_item) {
4242 + err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash);
4243 + /*
4244 + * As soon as we merge this page, we want to remove the
4245 + * rmap_item of the page we have merged with from the unstable
4246 + * tree, and insert it instead as new node in the stable tree.
4247 + */
4248 + if (!err) {
4249 + kpage = page;
4250 + remove_rmap_item_from_tree(tree_rmap_item);
4251 + lock_page(kpage);
4252 + snode = stable_tree_insert(&kpage, hash,
4253 + rmap_item, tree_rmap_item,
4254 + &success1, &success2);
4256 + /*
4257 + * Do not log dedup for tree item, it's not counted as
4258 + * scanned in this round.
4259 + */
4260 + if (success2)
4261 + stable_tree_append(tree_rmap_item, snode, 0);
4263 + /*
4264 + * The order of these two stable_tree_append() calls is important:
4265 + * we are scanning rmap_item.
4266 + */
4267 + if (success1)
4268 + stable_tree_append(rmap_item, snode, 1);
4270 + /*
4271 + * The original kpage may be unlocked inside
4272 + * stable_tree_insert() already. This page
4273 + * should be unlocked before doing
4274 + * break_cow().
4275 + */
4276 + unlock_page(kpage);
4278 + if (!success1)
4279 + break_cow(rmap_item);
4281 + if (!success2)
4282 + break_cow(tree_rmap_item);
4284 + } else if (err == MERGE_ERR_COLLI) {
4285 + BUG_ON(tree_rmap_item->tree_node->count > 1);
4287 + rmap_item_hash_max(tree_rmap_item,
4288 + tree_rmap_item->tree_node->hash);
4290 + hash_max = rmap_item_hash_max(rmap_item, hash);
4291 + cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
4292 + parent = &tree_rmap_item->node;
4293 + if (cmp < 0)
4294 + new = &parent->rb_left;
4295 + else if (cmp > 0)
4296 + new = &parent->rb_right;
4297 + else
4298 + goto put_up_out;
4300 + rmap_item->tree_node = tree_rmap_item->tree_node;
4301 + rmap_item->address |= UNSTABLE_FLAG;
4302 + rmap_item->hash_round = uksm_hash_round;
4303 + rb_link_node(&rmap_item->node, parent, new);
4304 + rb_insert_color(&rmap_item->node,
4305 + &tree_rmap_item->tree_node->sub_root);
4306 + rmap_item->tree_node->count++;
4307 + } else {
4308 + /*
4309 + * either one of the pages has changed or they collide
4310 + * at the max hash; we consider them ill items.
4311 + */
4312 + remove_rmap_item_from_tree(tree_rmap_item);
4314 +put_up_out:
4315 + put_page(tree_rmap_item->page);
4316 + mmap_read_unlock(tree_rmap_item->slot->vma->vm_mm);
4323 +static inline unsigned long get_pool_index(struct vma_slot *slot,
4324 + unsigned long index)
4326 + unsigned long pool_index;
4328 + pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT;
4329 + if (pool_index >= slot->pool_size)
4330 + BUG();
4331 + return pool_index;
4334 +static inline unsigned long index_page_offset(unsigned long index)
4336 + return offset_in_page(sizeof(struct rmap_list_entry *) * index);
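
A slot's rmap_list is not one contiguous array but a pool of independently allocated pages; get_pool_index() and index_page_offset() split a flat entry index into (pool page, byte offset). A small userspace sketch of that split, assuming 4 KiB pages and 8-byte entries purely for illustration:

    #include <stdio.h>

    #define PAGE_SHIFT_ 12
    #define PAGE_SIZE_  (1UL << PAGE_SHIFT_)
    #define ENTRY_SIZE  8UL                 /* assumed pointer-sized entry */

    static unsigned long pool_page(unsigned long index)
    {
        return (ENTRY_SIZE * index) >> PAGE_SHIFT_;      /* which pool page  */
    }

    static unsigned long pool_offset(unsigned long index)
    {
        return (ENTRY_SIZE * index) & (PAGE_SIZE_ - 1);  /* offset inside it */
    }

    int main(void)
    {
        unsigned long idx = 1000;

        printf("entry %lu -> pool page %lu, offset %lu\n",
               idx, pool_page(idx), pool_offset(idx));   /* page 1, offset 3904 */
        return 0;
    }
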
4339 +static inline
4340 +struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot,
4341 + unsigned long index, int need_alloc)
4343 + unsigned long pool_index;
4344 + struct page *page;
4345 + void *addr;
4348 + pool_index = get_pool_index(slot, index);
4349 + if (!slot->rmap_list_pool[pool_index]) {
4350 + if (!need_alloc)
4351 + return NULL;
4353 + page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN);
4354 + if (!page)
4355 + return NULL;
4357 + slot->rmap_list_pool[pool_index] = page;
4360 + addr = kmap(slot->rmap_list_pool[pool_index]);
4361 + addr += index_page_offset(index);
4363 + return addr;
4366 +static inline void put_rmap_list_entry(struct vma_slot *slot,
4367 + unsigned long index)
4369 + unsigned long pool_index;
4371 + pool_index = get_pool_index(slot, index);
4372 + BUG_ON(!slot->rmap_list_pool[pool_index]);
4373 + kunmap(slot->rmap_list_pool[pool_index]);
4376 +static inline int entry_is_new(struct rmap_list_entry *entry)
4378 + return !entry->item;
4381 +static inline unsigned long get_index_orig_addr(struct vma_slot *slot,
4382 + unsigned long index)
4384 + return slot->vma->vm_start + (index << PAGE_SHIFT);
4387 +static inline unsigned long get_entry_address(struct rmap_list_entry *entry)
4389 + unsigned long addr;
4391 + if (is_addr(entry->addr))
4392 + addr = get_clean_addr(entry->addr);
4393 + else if (entry->item)
4394 + addr = get_rmap_addr(entry->item);
4395 + else
4396 + BUG();
4398 + return addr;
4401 +static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry)
4403 + if (is_addr(entry->addr))
4404 + return NULL;
4406 + return entry->item;
4409 +static inline void inc_rmap_list_pool_count(struct vma_slot *slot,
4410 + unsigned long index)
4412 + unsigned long pool_index;
4414 + pool_index = get_pool_index(slot, index);
4415 + BUG_ON(!slot->rmap_list_pool[pool_index]);
4416 + slot->pool_counts[pool_index]++;
4419 +static inline void dec_rmap_list_pool_count(struct vma_slot *slot,
4420 + unsigned long index)
4422 + unsigned long pool_index;
4424 + pool_index = get_pool_index(slot, index);
4425 + BUG_ON(!slot->rmap_list_pool[pool_index]);
4426 + BUG_ON(!slot->pool_counts[pool_index]);
4427 + slot->pool_counts[pool_index]--;
4430 +static inline int entry_has_rmap(struct rmap_list_entry *entry)
4432 + return !is_addr(entry->addr) && entry->item;
4435 +static inline void swap_entries(struct rmap_list_entry *entry1,
4436 + unsigned long index1,
4437 + struct rmap_list_entry *entry2,
4438 + unsigned long index2)
4440 + struct rmap_list_entry tmp;
4442 + /* swapping two new entries is meaningless */
4443 + BUG_ON(entry_is_new(entry1) && entry_is_new(entry2));
4445 + tmp = *entry1;
4446 + *entry1 = *entry2;
4447 + *entry2 = tmp;
4449 + if (entry_has_rmap(entry1))
4450 + entry1->item->entry_index = index1;
4452 + if (entry_has_rmap(entry2))
4453 + entry2->item->entry_index = index2;
4455 + if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) {
4456 + inc_rmap_list_pool_count(entry1->item->slot, index1);
4457 + dec_rmap_list_pool_count(entry1->item->slot, index2);
4458 + } else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) {
4459 + inc_rmap_list_pool_count(entry2->item->slot, index2);
4460 + dec_rmap_list_pool_count(entry2->item->slot, index1);
4464 +static inline void free_entry_item(struct rmap_list_entry *entry)
4466 + unsigned long index;
4467 + struct rmap_item *item;
4469 + if (!is_addr(entry->addr)) {
4470 + BUG_ON(!entry->item);
4471 + item = entry->item;
4472 + entry->addr = get_rmap_addr(item);
4473 + set_is_addr(entry->addr);
4474 + index = item->entry_index;
4475 + remove_rmap_item_from_tree(item);
4476 + dec_rmap_list_pool_count(item->slot, index);
4477 + free_rmap_item(item);
4481 +static inline int pool_entry_boundary(unsigned long index)
4483 + unsigned long linear_addr;
4485 + linear_addr = sizeof(struct rmap_list_entry *) * index;
4486 + return index && !offset_in_page(linear_addr);
4489 +static inline void try_free_last_pool(struct vma_slot *slot,
4490 + unsigned long index)
4492 + unsigned long pool_index;
4494 + pool_index = get_pool_index(slot, index);
4495 + if (slot->rmap_list_pool[pool_index] &&
4496 + !slot->pool_counts[pool_index]) {
4497 + __free_page(slot->rmap_list_pool[pool_index]);
4498 + slot->rmap_list_pool[pool_index] = NULL;
4499 + slot->flags |= UKSM_SLOT_NEED_SORT;
4504 +static inline unsigned long vma_item_index(struct vm_area_struct *vma,
4505 + struct rmap_item *item)
4507 + return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT;
4510 +static int within_same_pool(struct vma_slot *slot,
4511 + unsigned long i, unsigned long j)
4513 + unsigned long pool_i, pool_j;
4515 + pool_i = get_pool_index(slot, i);
4516 + pool_j = get_pool_index(slot, j);
4518 + return (pool_i == pool_j);
4521 +static void sort_rmap_entry_list(struct vma_slot *slot)
4523 + unsigned long i, j;
4524 + struct rmap_list_entry *entry, *swap_entry;
4526 + entry = get_rmap_list_entry(slot, 0, 0);
4527 + for (i = 0; i < slot->pages; ) {
4529 + if (!entry)
4530 + goto skip_whole_pool;
4532 + if (entry_is_new(entry))
4533 + goto next_entry;
4535 + if (is_addr(entry->addr)) {
4536 + entry->addr = 0;
4537 + goto next_entry;
4540 + j = vma_item_index(slot->vma, entry->item);
4541 + if (j == i)
4542 + goto next_entry;
4544 + if (within_same_pool(slot, i, j))
4545 + swap_entry = entry + j - i;
4546 + else
4547 + swap_entry = get_rmap_list_entry(slot, j, 1);
4549 + swap_entries(entry, i, swap_entry, j);
4550 + if (!within_same_pool(slot, i, j))
4551 + put_rmap_list_entry(slot, j);
4552 + continue;
4554 +skip_whole_pool:
4555 + i += PAGE_SIZE / sizeof(*entry);
4556 + if (i < slot->pages)
4557 + entry = get_rmap_list_entry(slot, i, 0);
4558 + continue;
4560 +next_entry:
4561 + if (i >= slot->pages - 1 ||
4562 + !within_same_pool(slot, i, i + 1)) {
4563 + put_rmap_list_entry(slot, i);
4564 + if (i + 1 < slot->pages)
4565 + entry = get_rmap_list_entry(slot, i + 1, 0);
4566 + } else
4567 + entry++;
4568 + i++;
4569 + continue;
4572 + /* free empty pool entries which contain no rmap_item */
4573 + /* CAN be simplified to rely on pool_counts alone once proven bug-free */
4574 + for (i = 0; i < slot->pool_size; i++) {
4575 + unsigned char has_rmap;
4576 + void *addr;
4578 + if (!slot->rmap_list_pool[i])
4579 + continue;
4581 + has_rmap = 0;
4582 + addr = kmap(slot->rmap_list_pool[i]);
4583 + BUG_ON(!addr);
4584 + for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
4585 + entry = (struct rmap_list_entry *)addr + j;
4586 + if (is_addr(entry->addr))
4587 + continue;
4588 + if (!entry->item)
4589 + continue;
4590 + has_rmap = 1;
4592 + kunmap(slot->rmap_list_pool[i]);
4593 + if (!has_rmap) {
4594 + BUG_ON(slot->pool_counts[i]);
4595 + __free_page(slot->rmap_list_pool[i]);
4596 + slot->rmap_list_pool[i] = NULL;
4600 + slot->flags &= ~UKSM_SLOT_NEED_SORT;
4604 + * vma_fully_scanned() - return true if all the pages in this slot have been scanned.
4605 + */
4606 +static inline int vma_fully_scanned(struct vma_slot *slot)
4608 + return slot->pages_scanned == slot->pages;
4611 +/**
4612 + * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to
4613 + * its random permutation. This function also contains the random
4614 + * permutation index management code.
4615 + */
4616 +static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash)
4618 + unsigned long rand_range, addr, swap_index, scan_index;
4619 + struct rmap_item *item = NULL;
4620 + struct rmap_list_entry *scan_entry, *swap_entry = NULL;
4621 + struct page *page;
4623 + scan_index = swap_index = slot->pages_scanned % slot->pages;
4625 + if (pool_entry_boundary(scan_index))
4626 + try_free_last_pool(slot, scan_index - 1);
4628 + if (vma_fully_scanned(slot)) {
4629 + if (slot->flags & UKSM_SLOT_NEED_SORT)
4630 + slot->flags |= UKSM_SLOT_NEED_RERAND;
4631 + else
4632 + slot->flags &= ~UKSM_SLOT_NEED_RERAND;
4633 + if (slot->flags & UKSM_SLOT_NEED_SORT)
4634 + sort_rmap_entry_list(slot);
4637 + scan_entry = get_rmap_list_entry(slot, scan_index, 1);
4638 + if (!scan_entry)
4639 + return NULL;
4641 + if (entry_is_new(scan_entry)) {
4642 + scan_entry->addr = get_index_orig_addr(slot, scan_index);
4643 + set_is_addr(scan_entry->addr);
4646 + if (slot->flags & UKSM_SLOT_NEED_RERAND) {
4647 + rand_range = slot->pages - scan_index;
4648 + BUG_ON(!rand_range);
4649 + swap_index = scan_index + (prandom_u32() % rand_range);
4652 + if (swap_index != scan_index) {
4653 + swap_entry = get_rmap_list_entry(slot, swap_index, 1);
4655 + if (!swap_entry)
4656 + return NULL;
4658 + if (entry_is_new(swap_entry)) {
4659 + swap_entry->addr = get_index_orig_addr(slot,
4660 + swap_index);
4661 + set_is_addr(swap_entry->addr);
4663 + swap_entries(scan_entry, scan_index, swap_entry, swap_index);
4666 + addr = get_entry_address(scan_entry);
4667 + item = get_entry_item(scan_entry);
4668 + BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start);
4670 + page = follow_page(slot->vma, addr, FOLL_GET);
4671 + if (IS_ERR_OR_NULL(page))
4672 + goto nopage;
4674 + if (!PageAnon(page))
4675 + goto putpage;
4677 + /* check whether this is the zero_page pfn or the uksm_zero_page */
4678 + if ((page_to_pfn(page) == zero_pfn)
4679 + || (page_to_pfn(page) == uksm_zero_pfn))
4680 + goto putpage;
4682 + flush_anon_page(slot->vma, page, addr);
4683 + flush_dcache_page(page);
4686 + *hash = page_hash(page, hash_strength, 1);
4687 + inc_uksm_pages_scanned();
4688 + /* if the page content is all zero, re-map it to the zero page */
4689 + if (find_zero_page_hash(hash_strength, *hash)) {
4690 + if (!cmp_and_merge_zero_page(slot->vma, page)) {
4691 + slot->pages_merged++;
4693 + /* For full-zero pages, no need to create rmap item */
4694 + goto putpage;
4695 + } else {
4696 + inc_rshash_neg(memcmp_cost / 2);
4700 + if (!item) {
4701 + item = alloc_rmap_item();
4702 + if (item) {
4703 + /* It has already been zeroed */
4704 + item->slot = slot;
4705 + item->address = addr;
4706 + item->entry_index = scan_index;
4707 + scan_entry->item = item;
4708 + inc_rmap_list_pool_count(slot, scan_index);
4709 + } else
4710 + goto putpage;
4713 + BUG_ON(item->slot != slot);
4714 + /* the page may have changed */
4715 + item->page = page;
4716 + put_rmap_list_entry(slot, scan_index);
4717 + if (swap_entry)
4718 + put_rmap_list_entry(slot, swap_index);
4719 + return item;
4721 +putpage:
4722 + put_page(page);
4723 + page = NULL;
4724 +nopage:
4725 + /* no page, store addr back and free rmap_item if possible */
4726 + free_entry_item(scan_entry);
4727 + put_rmap_list_entry(slot, scan_index);
4728 + if (swap_entry)
4729 + put_rmap_list_entry(slot, swap_index);
4730 + return NULL;
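
get_next_rmap_item() above visits a slot in random order by swapping the current scan index with a randomly chosen later index before visiting it, which is an incremental Fisher-Yates shuffle. A self-contained sketch of just that ordering; the kernel code performs the swap on lazily allocated rmap_list entries rather than a plain array:

    #include <stdio.h>
    #include <stdlib.h>

    static void scan_in_random_order(unsigned long *idx, unsigned long n)
    {
        for (unsigned long i = 0; i < n; i++) {
            unsigned long j = i + (unsigned long)rand() % (n - i);  /* j in [i, n) */
            unsigned long tmp = idx[i];

            idx[i] = idx[j];            /* swap, then visit position i */
            idx[j] = tmp;
            printf("visit page index %lu\n", idx[i]);
        }
    }

    int main(void)
    {
        unsigned long idx[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };

        srand(42);
        scan_in_random_order(idx, 8);
        return 0;
    }
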
4733 +static inline int in_stable_tree(struct rmap_item *rmap_item)
4735 + return rmap_item->address & STABLE_FLAG;
4738 +/**
4739 + * scan_vma_one_page() - scan the next page in a vma_slot. Called with
4740 + * mmap_sem locked.
4741 + */
4742 +static noinline void scan_vma_one_page(struct vma_slot *slot)
4744 + u32 hash;
4745 + struct mm_struct *mm;
4746 + struct rmap_item *rmap_item = NULL;
4747 + struct vm_area_struct *vma = slot->vma;
4749 + mm = vma->vm_mm;
4750 + BUG_ON(!mm);
4751 + BUG_ON(!slot);
4753 + rmap_item = get_next_rmap_item(slot, &hash);
4754 + if (!rmap_item)
4755 + goto out1;
4757 + if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item))
4758 + goto out2;
4760 + cmp_and_merge_page(rmap_item, hash);
4761 +out2:
4762 + put_page(rmap_item->page);
4763 +out1:
4764 + slot->pages_scanned++;
4765 + slot->this_sampled++;
4766 + if (slot->fully_scanned_round != fully_scanned_round)
4767 + scanned_virtual_pages++;
4769 + if (vma_fully_scanned(slot))
4770 + slot->fully_scanned_round = fully_scanned_round;
4773 +static inline unsigned long rung_get_pages(struct scan_rung *rung)
4775 + struct slot_tree_node *node;
4777 + if (!rung->vma_root.rnode)
4778 + return 0;
4780 + node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode);
4782 + return node->size;
4785 +#define RUNG_SAMPLED_MIN 3
4787 +static inline
4788 +void uksm_calc_rung_step(struct scan_rung *rung,
4789 + unsigned long page_time, unsigned long ratio)
4791 + unsigned long sampled, pages;
4793 + /* will be fully scanned ? */
4794 + if (!rung->cover_msecs) {
4795 + rung->step = 1;
4796 + return;
4799 + sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE)
4800 + * ratio / page_time;
4802 + /*
4803 + * Before we finish a scan round and its expensive per-round jobs,
4804 + * we need a chance to estimate the per-page time. So
4805 + * the sampled number cannot be too small.
4806 + */
4807 + if (sampled < RUNG_SAMPLED_MIN)
4808 + sampled = RUNG_SAMPLED_MIN;
4810 + pages = rung_get_pages(rung);
4811 + if (likely(pages > sampled))
4812 + rung->step = pages / sampled;
4813 + else
4814 + rung->step = 1;
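
In other words, the step is chosen so that roughly "sampled" pages are visited per cover period: the period's CPU-time budget divided by the per-page cost, clamped to at least RUNG_SAMPLED_MIN samples. A rough standalone recreation of that arithmetic; TIME_RATIO_SCALE = 1000 and the numbers in main() are assumptions made only for illustration:

    #include <stdio.h>

    #define NSEC_PER_MSEC     1000000UL
    #define TIME_RATIO_SCALE  1000UL    /* assumed scale, illustration only */
    #define RUNG_SAMPLED_MIN  3UL

    static unsigned long calc_step(unsigned long pages, unsigned long cover_msecs,
                                   unsigned long ratio, unsigned long page_time_ns)
    {
        unsigned long sampled;

        if (!cover_msecs)
            return 1;                   /* "cover everything": full scan speed */

        /* CPU-time budget for one cover period, divided by the cost per page */
        sampled = cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE)
                  * ratio / page_time_ns;
        if (sampled < RUNG_SAMPLED_MIN)
            sampled = RUNG_SAMPLED_MIN;

        return pages > sampled ? pages / sampled : 1;
    }

    int main(void)
    {
        /* 100000 pages, 2000 ms cover time, ratio 20/1000 of CPU, 500 ns/page */
        printf("step = %lu\n", calc_step(100000, 2000, 20, 500));
        return 0;
    }
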
4817 +static inline int step_need_recalc(struct scan_rung *rung)
4819 + unsigned long pages, stepmax;
4821 + pages = rung_get_pages(rung);
4822 + stepmax = pages / RUNG_SAMPLED_MIN;
4824 + return pages && (rung->step > pages ||
4825 + (stepmax && rung->step > stepmax));
4828 +static inline
4829 +void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc)
4831 + struct vma_slot *slot;
4833 + if (finished)
4834 + rung->flags |= UKSM_RUNG_ROUND_FINISHED;
4836 + if (step_recalc || step_need_recalc(rung)) {
4837 + uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
4838 + BUG_ON(step_need_recalc(rung));
4841 + slot_iter_index = prandom_u32() % rung->step;
4842 + BUG_ON(!rung->vma_root.rnode);
4843 + slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter);
4844 + BUG_ON(!slot);
4846 + rung->current_scan = slot;
4847 + rung->current_offset = slot_iter_index;
4850 +static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot)
4852 + return &slot->rung->vma_root;
4856 + * Return 1 if the current scan position was reset.
4857 + */
4858 +static int advance_current_scan(struct scan_rung *rung)
4860 + unsigned short n;
4861 + struct vma_slot *slot, *next = NULL;
4863 + BUG_ON(!rung->vma_root.num);
4865 + slot = rung->current_scan;
4866 + n = (slot->pages - rung->current_offset) % rung->step;
4867 + slot_iter_index = rung->step - n;
4868 + next = sradix_tree_next(&rung->vma_root, slot->snode,
4869 + slot->sindex, slot_iter);
4871 + if (next) {
4872 + rung->current_offset = slot_iter_index;
4873 + rung->current_scan = next;
4874 + return 0;
4875 + } else {
4876 + reset_current_scan(rung, 1, 0);
4877 + return 1;
4881 +static inline void rung_rm_slot(struct vma_slot *slot)
4883 + struct scan_rung *rung = slot->rung;
4884 + struct sradix_tree_root *root;
4886 + if (rung->current_scan == slot)
4887 + advance_current_scan(rung);
4889 + root = slot_get_root(slot);
4890 + sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex);
4891 + slot->snode = NULL;
4892 + if (step_need_recalc(rung)) {
4893 + uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
4894 + BUG_ON(step_need_recalc(rung));
4897 + /* In case advance_current_scan() looped back to this slot again */
4898 + if (rung->vma_root.num && rung->current_scan == slot)
4899 + reset_current_scan(slot->rung, 1, 0);
4902 +static inline void rung_add_new_slots(struct scan_rung *rung,
4903 + struct vma_slot **slots, unsigned long num)
4905 + int err;
4906 + struct vma_slot *slot;
4907 + unsigned long i;
4908 + struct sradix_tree_root *root = &rung->vma_root;
4910 + err = sradix_tree_enter(root, (void **)slots, num);
4911 + BUG_ON(err);
4913 + for (i = 0; i < num; i++) {
4914 + slot = slots[i];
4915 + slot->rung = rung;
4916 + BUG_ON(vma_fully_scanned(slot));
4919 + if (rung->vma_root.num == num)
4920 + reset_current_scan(rung, 0, 1);
4923 +static inline int rung_add_one_slot(struct scan_rung *rung,
4924 + struct vma_slot *slot)
4926 + int err;
4928 + err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1);
4929 + if (err)
4930 + return err;
4932 + slot->rung = rung;
4933 + if (rung->vma_root.num == 1)
4934 + reset_current_scan(rung, 0, 1);
4936 + return 0;
4940 + * Return true if the slot is deleted from its rung.
4941 + */
4942 +static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung)
4944 + struct scan_rung *old_rung = slot->rung;
4945 + int err;
4947 + if (old_rung == rung)
4948 + return 0;
4950 + rung_rm_slot(slot);
4951 + err = rung_add_one_slot(rung, slot);
4952 + if (err) {
4953 + err = rung_add_one_slot(old_rung, slot);
4954 + WARN_ON(err); /* OOPS, badly OOM, we lost this slot */
4957 + return 1;
4960 +static inline int vma_rung_up(struct vma_slot *slot)
4962 + struct scan_rung *rung;
4964 + rung = slot->rung;
4965 + if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1])
4966 + rung++;
4968 + return vma_rung_enter(slot, rung);
4971 +static inline int vma_rung_down(struct vma_slot *slot)
4973 + struct scan_rung *rung;
4975 + rung = slot->rung;
4976 + if (slot->rung != &uksm_scan_ladder[0])
4977 + rung--;
4979 + return vma_rung_enter(slot, rung);
4982 +/**
4983 + * cal_dedup_ratio() - Calculate the deduplication ratio for this slot.
4984 + */
4985 +static unsigned long cal_dedup_ratio(struct vma_slot *slot)
4987 + unsigned long ret;
4988 + unsigned long pages;
4990 + pages = slot->this_sampled;
4991 + if (!pages)
4992 + return 0;
4994 + BUG_ON(slot->pages_scanned == slot->last_scanned);
4996 + ret = slot->pages_merged;
4998 + /* Thrashing area filtering */
4999 + if (ret && uksm_thrash_threshold) {
5000 + if (slot->pages_cowed * 100 / slot->pages_merged
5001 + > uksm_thrash_threshold) {
5002 + ret = 0;
5003 + } else {
5004 + ret = slot->pages_merged - slot->pages_cowed;
5008 + return ret * 100 / pages;
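
In plain terms: pages merged minus pages later broken back by COW, as a percentage of the pages sampled this round, with the whole slot zeroed out once its cowed/merged proportion crosses the thrash threshold. A small standalone sketch of that calculation, using made-up numbers:

    #include <stdio.h>

    static unsigned long dedup_ratio(unsigned long merged, unsigned long cowed,
                                     unsigned long sampled,
                                     unsigned long thrash_threshold)
    {
        unsigned long ret = merged;

        if (!sampled)
            return 0;

        if (ret && thrash_threshold) {
            if (cowed * 100 / merged > thrash_threshold)
                ret = 0;                     /* thrashing area: ignore it  */
            else
                ret = merged - cowed;        /* count only the stable gain */
        }
        return ret * 100 / sampled;
    }

    int main(void)
    {
        /* 400 merged, 40 broken back by COW, 1000 pages sampled, threshold 50 */
        printf("%lu%%\n", dedup_ratio(400, 40, 1000, 50));   /* prints 36% */
        return 0;
    }
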
5011 +/**
5012 + * cal_dedup_ratio_old() - Calculate the deduplication ratio for this slot.
5013 + */
5014 +static unsigned long cal_dedup_ratio_old(struct vma_slot *slot)
5016 + unsigned long ret;
5017 + unsigned long pages;
5019 + pages = slot->pages;
5020 + if (!pages)
5021 + return 0;
5023 + ret = slot->pages_bemerged;
5025 + /* Thrashing area filtering */
5026 + if (ret && uksm_thrash_threshold) {
5027 + if (slot->pages_cowed * 100 / slot->pages_bemerged
5028 + > uksm_thrash_threshold) {
5029 + ret = 0;
5030 + } else {
5031 + ret = slot->pages_bemerged - slot->pages_cowed;
5035 + return ret * 100 / pages;
5038 +/**
5039 + * stable_node_reinsert() - When the hash_strength has been adjusted, the
5040 + * stable tree needs to be restructured; this is the function re-inserting the
5041 + * stable node.
5042 + */
5043 +static inline void stable_node_reinsert(struct stable_node *new_node,
5044 + struct page *page,
5045 + struct rb_root *root_treep,
5046 + struct list_head *tree_node_listp,
5047 + u32 hash)
5049 + struct rb_node **new = &root_treep->rb_node;
5050 + struct rb_node *parent = NULL;
5051 + struct stable_node *stable_node;
5052 + struct tree_node *tree_node;
5053 + struct page *tree_page;
5054 + int cmp;
5056 + while (*new) {
5057 + int cmp;
5059 + tree_node = rb_entry(*new, struct tree_node, node);
5061 + cmp = hash_cmp(hash, tree_node->hash);
5063 + if (cmp < 0) {
5064 + parent = *new;
5065 + new = &parent->rb_left;
5066 + } else if (cmp > 0) {
5067 + parent = *new;
5068 + new = &parent->rb_right;
5069 + } else
5070 + break;
5073 + if (*new) {
5074 + /* found a stable tree node with the same first-level hash value */
5075 + stable_node_hash_max(new_node, page, hash);
5076 + if (tree_node->count == 1) {
5077 + stable_node = rb_entry(tree_node->sub_root.rb_node,
5078 + struct stable_node, node);
5079 + tree_page = get_uksm_page(stable_node, 1, 0);
5080 + if (tree_page) {
5081 + stable_node_hash_max(stable_node,
5082 + tree_page, hash);
5083 + put_page(tree_page);
5085 + /* prepare for stable node insertion */
5087 + cmp = hash_cmp(new_node->hash_max,
5088 + stable_node->hash_max);
5089 + parent = &stable_node->node;
5090 + if (cmp < 0)
5091 + new = &parent->rb_left;
5092 + else if (cmp > 0)
5093 + new = &parent->rb_right;
5094 + else
5095 + goto failed;
5097 + goto add_node;
5098 + } else {
5099 + /* the only stable_node was deleted, but the tree node
5100 + * was not.
5101 + */
5102 + goto tree_node_reuse;
5106 + /* well, search the collision subtree */
5107 + new = &tree_node->sub_root.rb_node;
5108 + parent = NULL;
5109 + BUG_ON(!*new);
5110 + while (*new) {
5111 + int cmp;
5113 + stable_node = rb_entry(*new, struct stable_node, node);
5115 + cmp = hash_cmp(new_node->hash_max,
5116 + stable_node->hash_max);
5118 + if (cmp < 0) {
5119 + parent = *new;
5120 + new = &parent->rb_left;
5121 + } else if (cmp > 0) {
5122 + parent = *new;
5123 + new = &parent->rb_right;
5124 + } else {
5125 + /* oh, no, still a collision */
5126 + goto failed;
5130 + goto add_node;
5133 + /* no tree node found */
5134 + tree_node = alloc_tree_node(tree_node_listp);
5135 + if (!tree_node) {
5136 + pr_err("UKSM: memory allocation error!\n");
5137 + goto failed;
5138 + } else {
5139 + tree_node->hash = hash;
5140 + rb_link_node(&tree_node->node, parent, new);
5141 + rb_insert_color(&tree_node->node, root_treep);
5143 +tree_node_reuse:
5144 + /* prepare for stable node insertion */
5145 + parent = NULL;
5146 + new = &tree_node->sub_root.rb_node;
5149 +add_node:
5150 + rb_link_node(&new_node->node, parent, new);
5151 + rb_insert_color(&new_node->node, &tree_node->sub_root);
5152 + new_node->tree_node = tree_node;
5153 + tree_node->count++;
5154 + return;
5156 +failed:
5157 + /* This can only happen when two nodes have collided
5158 + * at both levels.
5159 + */
5160 + new_node->tree_node = NULL;
5161 + return;
5164 +static inline void free_all_tree_nodes(struct list_head *list)
5166 + struct tree_node *node, *tmp;
5168 + list_for_each_entry_safe(node, tmp, list, all_list) {
5169 + free_tree_node(node);
5173 +/**
5174 + * stable_tree_delta_hash() - Delta hash the stable tree from previous hash
5175 + * strength to the current hash_strength. It re-structures the whole tree.
5176 + */
5177 +static inline void stable_tree_delta_hash(u32 prev_hash_strength)
5179 + struct stable_node *node, *tmp;
5180 + struct rb_root *root_new_treep;
5181 + struct list_head *new_tree_node_listp;
5183 + stable_tree_index = (stable_tree_index + 1) % 2;
5184 + root_new_treep = &root_stable_tree[stable_tree_index];
5185 + new_tree_node_listp = &stable_tree_node_list[stable_tree_index];
5186 + *root_new_treep = RB_ROOT;
5187 + BUG_ON(!list_empty(new_tree_node_listp));
5189 + /*
5190 + * we need to be safe, the node could be removed by get_uksm_page()
5191 + */
5192 + list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) {
5193 + void *addr;
5194 + struct page *node_page;
5195 + u32 hash;
5197 + /*
5198 + * We are completely re-structuring the stable nodes to a new
5199 + * stable tree. We don't bother unlinking from the old tree or touching the
5200 + * old tree_nodes; they will all be freed at once.
5201 + */
5202 + node_page = get_uksm_page(node, 0, 0);
5203 + if (!node_page)
5204 + continue;
5206 + if (node->tree_node) {
5207 + hash = node->tree_node->hash;
5209 + addr = kmap_atomic(node_page);
5211 + hash = delta_hash(addr, prev_hash_strength,
5212 + hash_strength, hash);
5213 + kunmap_atomic(addr);
5214 + } else {
5215 + /*
5216 + * it was not inserted into the rbtree due to a collision in the
5217 + * last scan round.
5218 + */
5219 + hash = page_hash(node_page, hash_strength, 0);
5222 + stable_node_reinsert(node, node_page, root_new_treep,
5223 + new_tree_node_listp, hash);
5224 + put_page(node_page);
5227 + root_stable_treep = root_new_treep;
5228 + free_all_tree_nodes(stable_tree_node_listp);
5229 + BUG_ON(!list_empty(stable_tree_node_listp));
5230 + stable_tree_node_listp = new_tree_node_listp;
5233 +static inline void inc_hash_strength(unsigned long delta)
5235 + hash_strength += 1 << delta;
5236 + if (hash_strength > HASH_STRENGTH_MAX)
5237 + hash_strength = HASH_STRENGTH_MAX;
5240 +static inline void dec_hash_strength(unsigned long delta)
5242 + unsigned long change = 1 << delta;
5244 + if (hash_strength <= change + 1)
5245 + hash_strength = 1;
5246 + else
5247 + hash_strength -= change;
5250 +static inline void inc_hash_strength_delta(void)
5252 + hash_strength_delta++;
5253 + if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX)
5254 + hash_strength_delta = HASH_STRENGTH_DELTA_MAX;
5257 +static inline unsigned long get_current_neg_ratio(void)
5259 + u64 pos = benefit.pos;
5260 + u64 neg = benefit.neg;
5262 + if (!neg)
5263 + return 0;
5265 + if (!pos || neg > pos)
5266 + return 100;
5268 + if (neg > div64_u64(U64_MAX, 100))
5269 + pos = div64_u64(pos, 100);
5270 + else
5271 + neg *= 100;
5273 + return div64_u64(neg, pos);
5276 +static inline unsigned long get_current_benefit(void)
5278 + u64 pos = benefit.pos;
5279 + u64 neg = benefit.neg;
5280 + u64 scanned = benefit.scanned;
5282 + if (neg > pos)
5283 + return 0;
5285 + return div64_u64((pos - neg), scanned);
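
Both helpers reduce the accumulated benefit counters to simple numbers: the negative ratio is neg as a percentage of pos (rescaling one side when neg * 100 would overflow 64 bits), and the benefit is the net gain per page scanned. A userspace sketch of the same arithmetic, with plain 64-bit division standing in for div64_u64():

    #include <stdint.h>
    #include <stdio.h>

    static unsigned long neg_ratio(uint64_t pos, uint64_t neg)
    {
        if (!neg)
            return 0;
        if (!pos || neg > pos)
            return 100;
        if (neg > UINT64_MAX / 100)     /* neg * 100 would overflow:         */
            pos /= 100;                 /* scale the other side down instead */
        else
            neg *= 100;
        return (unsigned long)(neg / pos);
    }

    static uint64_t benefit_per_page(uint64_t pos, uint64_t neg, uint64_t scanned)
    {
        if (neg > pos || !scanned)
            return 0;
        return (pos - neg) / scanned;
    }

    int main(void)
    {
        printf("neg ratio = %lu%%\n", neg_ratio(1000, 150));             /* 15% */
        printf("benefit   = %llu\n",
               (unsigned long long)benefit_per_page(1000, 150, 50));     /* 17  */
        return 0;
    }
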
5288 +static inline int judge_rshash_direction(void)
5290 + u64 current_neg_ratio, stable_benefit;
5291 + u64 current_benefit, delta = 0;
5292 + int ret = STILL;
5294 + /*
5295 + * Try to probe a value shortly after boot, and again in case the system
5296 + * stays still for a long time.
5297 + */
5298 + if ((fully_scanned_round & 0xFFULL) == 10) {
5299 + ret = OBSCURE;
5300 + goto out;
5303 + current_neg_ratio = get_current_neg_ratio();
5305 + if (current_neg_ratio == 0) {
5306 + rshash_neg_cont_zero++;
5307 + if (rshash_neg_cont_zero > 2)
5308 + return GO_DOWN;
5309 + else
5310 + return STILL;
5312 + rshash_neg_cont_zero = 0;
5314 + if (current_neg_ratio > 90) {
5315 + ret = GO_UP;
5316 + goto out;
5319 + current_benefit = get_current_benefit();
5320 + stable_benefit = rshash_state.stable_benefit;
5322 + if (!stable_benefit) {
5323 + ret = OBSCURE;
5324 + goto out;
5327 + if (current_benefit > stable_benefit)
5328 + delta = current_benefit - stable_benefit;
5329 + else if (current_benefit < stable_benefit)
5330 + delta = stable_benefit - current_benefit;
5332 + delta = div64_u64(100 * delta, stable_benefit);
5334 + if (delta > 50) {
5335 + rshash_cont_obscure++;
5336 + if (rshash_cont_obscure > 2)
5337 + return OBSCURE;
5338 + else
5339 + return STILL;
5342 +out:
5343 + rshash_cont_obscure = 0;
5344 + return ret;
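
Stripped of its probing phases, the controller that consumes these verdicts is a hill climber: keep moving hash_strength in the current direction while the measured benefit improves, and reverse (resetting the growing step) once it stops paying off. A toy sketch of that core idea only; the real rshash_adjust() below additionally runs explicit TRYDOWN/TRYUP probes around a remembered stable point:

    #include <stdio.h>

    static unsigned long strength = 8;
    static unsigned long step = 1;
    static int direction = +1;

    static void adjust(unsigned long prev_benefit, unsigned long cur_benefit)
    {
        if (cur_benefit < prev_benefit) {
            direction = -direction;     /* got worse: turn around        */
            step = 1;                   /* and restart with a small step */
        } else {
            step *= 2;                  /* still improving: accelerate   */
        }

        if (direction > 0)
            strength += step;
        else
            strength = strength > step ? strength - step : 1;
    }

    int main(void)
    {
        adjust(100, 120);   /* better -> keep climbing, step grows to 2 */
        adjust(120, 90);    /* worse  -> reverse, step resets to 1      */
        printf("strength=%lu step=%lu dir=%+d\n", strength, step, direction);
        return 0;
    }
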
5347 +/**
5348 + * rshash_adjust() - The main function to control the random sampling state
5349 + * machine for hash strength adaptation.
5351 + * Return true if hash_strength has changed.
5352 + */
5353 +static inline int rshash_adjust(void)
5355 + unsigned long prev_hash_strength = hash_strength;
5357 + if (!encode_benefit())
5358 + return 0;
5360 + switch (rshash_state.state) {
5361 + case RSHASH_STILL:
5362 + switch (judge_rshash_direction()) {
5363 + case GO_UP:
5364 + if (rshash_state.pre_direct == GO_DOWN)
5365 + hash_strength_delta = 0;
5367 + inc_hash_strength(hash_strength_delta);
5368 + inc_hash_strength_delta();
5369 + rshash_state.stable_benefit = get_current_benefit();
5370 + rshash_state.pre_direct = GO_UP;
5371 + break;
5373 + case GO_DOWN:
5374 + if (rshash_state.pre_direct == GO_UP)
5375 + hash_strength_delta = 0;
5377 + dec_hash_strength(hash_strength_delta);
5378 + inc_hash_strength_delta();
5379 + rshash_state.stable_benefit = get_current_benefit();
5380 + rshash_state.pre_direct = GO_DOWN;
5381 + break;
5383 + case OBSCURE:
5384 + rshash_state.stable_point = hash_strength;
5385 + rshash_state.turn_point_down = hash_strength;
5386 + rshash_state.turn_point_up = hash_strength;
5387 + rshash_state.turn_benefit_down = get_current_benefit();
5388 + rshash_state.turn_benefit_up = get_current_benefit();
5389 + rshash_state.lookup_window_index = 0;
5390 + rshash_state.state = RSHASH_TRYDOWN;
5391 + dec_hash_strength(hash_strength_delta);
5392 + inc_hash_strength_delta();
5393 + break;
5395 + case STILL:
5396 + break;
5397 + default:
5398 + BUG();
5400 + break;
5402 + case RSHASH_TRYDOWN:
5403 + if (rshash_state.lookup_window_index++ % 5 == 0)
5404 + rshash_state.below_count = 0;
5406 + if (get_current_benefit() < rshash_state.stable_benefit)
5407 + rshash_state.below_count++;
5408 + else if (get_current_benefit() >
5409 + rshash_state.turn_benefit_down) {
5410 + rshash_state.turn_point_down = hash_strength;
5411 + rshash_state.turn_benefit_down = get_current_benefit();
5414 + if (rshash_state.below_count >= 3 ||
5415 + judge_rshash_direction() == GO_UP ||
5416 + hash_strength == 1) {
5417 + hash_strength = rshash_state.stable_point;
5418 + hash_strength_delta = 0;
5419 + inc_hash_strength(hash_strength_delta);
5420 + inc_hash_strength_delta();
5421 + rshash_state.lookup_window_index = 0;
5422 + rshash_state.state = RSHASH_TRYUP;
5423 + hash_strength_delta = 0;
5424 + } else {
5425 + dec_hash_strength(hash_strength_delta);
5426 + inc_hash_strength_delta();
5428 + break;
5430 + case RSHASH_TRYUP:
5431 + if (rshash_state.lookup_window_index++ % 5 == 0)
5432 + rshash_state.below_count = 0;
5434 + if (get_current_benefit() < rshash_state.turn_benefit_down)
5435 + rshash_state.below_count++;
5436 + else if (get_current_benefit() > rshash_state.turn_benefit_up) {
5437 + rshash_state.turn_point_up = hash_strength;
5438 + rshash_state.turn_benefit_up = get_current_benefit();
5441 + if (rshash_state.below_count >= 3 ||
5442 + judge_rshash_direction() == GO_DOWN ||
5443 + hash_strength == HASH_STRENGTH_MAX) {
5444 + hash_strength = rshash_state.turn_benefit_up >
5445 + rshash_state.turn_benefit_down ?
5446 + rshash_state.turn_point_up :
5447 + rshash_state.turn_point_down;
5449 + rshash_state.state = RSHASH_PRE_STILL;
5450 + } else {
5451 + inc_hash_strength(hash_strength_delta);
5452 + inc_hash_strength_delta();
5455 + break;
5457 + case RSHASH_NEW:
5458 + case RSHASH_PRE_STILL:
5459 + rshash_state.stable_benefit = get_current_benefit();
5460 + rshash_state.state = RSHASH_STILL;
5461 + hash_strength_delta = 0;
5462 + break;
5463 + default:
5464 + BUG();
5467 + /* rshash_neg = rshash_pos = 0; */
5468 + reset_benefit();
5470 + if (prev_hash_strength != hash_strength)
5471 + stable_tree_delta_hash(prev_hash_strength);
5473 + return prev_hash_strength != hash_strength;
5476 +/**
5477 + * round_update_ladder() - The main function that updates all the
5478 + * adjustments whenever a scan round is finished.
5479 + */
5480 +static noinline void round_update_ladder(void)
5482 + int i;
5483 + unsigned long dedup;
5484 + struct vma_slot *slot, *tmp_slot;
5486 + for (i = 0; i < SCAN_LADDER_SIZE; i++)
5487 + uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED;
5489 + list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) {
5491 + /* the slot may have been removed by rung_rm_slot() when its mm exited */
5492 + if (slot->snode) {
5493 + dedup = cal_dedup_ratio_old(slot);
5494 + if (dedup && dedup >= uksm_abundant_threshold)
5495 + vma_rung_up(slot);
5498 + slot->pages_bemerged = 0;
5499 + slot->pages_cowed = 0;
5501 + list_del_init(&slot->dedup_list);
5505 +static void uksm_del_vma_slot(struct vma_slot *slot)
5507 + int i, j;
5508 + struct rmap_list_entry *entry;
5510 + if (slot->snode) {
5511 + /*
5512 + * If the slot failed when entering the rung, snode is NULL and
5513 + * removal is not necessary.
5514 + */
5515 + rung_rm_slot(slot);
5518 + if (!list_empty(&slot->dedup_list))
5519 + list_del(&slot->dedup_list);
5521 + if (!slot->rmap_list_pool || !slot->pool_counts) {
5522 + /* In case it OOMed in uksm_vma_enter() */
5523 + goto out;
5526 + for (i = 0; i < slot->pool_size; i++) {
5527 + void *addr;
5529 + if (!slot->rmap_list_pool[i])
5530 + continue;
5532 + addr = kmap(slot->rmap_list_pool[i]);
5533 + for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
5534 + entry = (struct rmap_list_entry *)addr + j;
5535 + if (is_addr(entry->addr))
5536 + continue;
5537 + if (!entry->item)
5538 + continue;
5540 + remove_rmap_item_from_tree(entry->item);
5541 + free_rmap_item(entry->item);
5542 + slot->pool_counts[i]--;
5544 + BUG_ON(slot->pool_counts[i]);
5545 + kunmap(slot->rmap_list_pool[i]);
5546 + __free_page(slot->rmap_list_pool[i]);
5548 + kfree(slot->rmap_list_pool);
5549 + kfree(slot->pool_counts);
5551 +out:
5552 + slot->rung = NULL;
5553 + if (slot->flags & UKSM_SLOT_IN_UKSM) {
5554 + BUG_ON(uksm_pages_total < slot->pages);
5555 + uksm_pages_total -= slot->pages;
5558 + if (slot->fully_scanned_round == fully_scanned_round)
5559 + scanned_virtual_pages -= slot->pages;
5560 + else
5561 + scanned_virtual_pages -= slot->pages_scanned;
5562 + free_vma_slot(slot);
5566 +#define SPIN_LOCK_PERIOD 32
5567 +static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD];
5568 +static inline void cleanup_vma_slots(void)
5570 + struct vma_slot *slot;
5571 + int i;
5573 + i = 0;
5574 + spin_lock(&vma_slot_list_lock);
5575 + while (!list_empty(&vma_slot_del)) {
5576 + slot = list_entry(vma_slot_del.next,
5577 + struct vma_slot, slot_list);
5578 + list_del(&slot->slot_list);
5579 + cleanup_slots[i++] = slot;
5580 + if (i == SPIN_LOCK_PERIOD) {
5581 + spin_unlock(&vma_slot_list_lock);
5582 + while (--i >= 0)
5583 + uksm_del_vma_slot(cleanup_slots[i]);
5584 + i = 0;
5585 + spin_lock(&vma_slot_list_lock);
5588 + spin_unlock(&vma_slot_list_lock);
5590 + while (--i >= 0)
5591 + uksm_del_vma_slot(cleanup_slots[i]);
5595 + * Exponential moving average formula
5596 + */
5597 +static inline unsigned long ema(unsigned long curr, unsigned long last_ema)
5599 + /*
5600 + * For a very high burst, even the ema cannot work well: a falsely very
5601 + * high per-page time estimate can feed back into very high
5602 + * overhead of context switches and rung updates -- this then leads
5603 + * to an even higher per-page time, which may not converge.
5605 + * Instead, we try to approach this value in a binary manner.
5606 + */
5607 + if (curr > last_ema * 10)
5608 + return last_ema * 2;
5610 + return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100;
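
A standalone rendering of the smoothing rule: new = (alpha * curr + (100 - alpha) * last) / 100, with the burst clamp limiting any single sample to at most doubling the estimate. EMA_ALPHA = 20 is an assumed weight chosen only for this illustration:

    #include <stdio.h>

    #define EMA_ALPHA 20    /* assumed weight, illustration only */

    static unsigned long ema(unsigned long curr, unsigned long last)
    {
        if (curr > last * 10)            /* burst clamp */
            return last * 2;
        return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last) / 100;
    }

    int main(void)
    {
        unsigned long t = 500;           /* say, ns per page */

        t = ema(600, t);                 /* gentle update -> 520  */
        t = ema(100000, t);              /* huge burst    -> 1040 */
        printf("%lu\n", t);
        return 0;
    }
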
5614 + * convert a cpu ratio (in units of 1/TIME_RATIO_SCALE, configured by the user)
5615 + * to nanoseconds based on the current uksm_sleep_jiffies.
5616 + */
5617 +static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio)
5619 + return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) /
5620 + (TIME_RATIO_SCALE - ratio) * ratio;
5624 +static inline unsigned long rung_real_ratio(int cpu_time_ratio)
5626 + unsigned long ret;
5628 + BUG_ON(!cpu_time_ratio);
5630 + if (cpu_time_ratio > 0)
5631 + ret = cpu_time_ratio;
5632 + else
5633 + ret = (unsigned long)(-cpu_time_ratio) *
5634 + uksm_max_cpu_percentage / 100UL;
5636 + return ret ? ret : 1;
5639 +static noinline void uksm_calc_scan_pages(void)
5641 + struct scan_rung *ladder = uksm_scan_ladder;
5642 + unsigned long sleep_usecs, nsecs;
5643 + unsigned long ratio;
5644 + int i;
5645 + unsigned long per_page;
5647 + if (uksm_ema_page_time > 100000 ||
5648 + (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL))
5649 + uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
5651 + per_page = uksm_ema_page_time;
5652 + BUG_ON(!per_page);
5654 + /*
5655 + * Every 8 eval rounds, we try to probe a uksm_sleep_jiffies value
5656 + * based on the saved user input.
5657 + */
5658 + if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL)
5659 + uksm_sleep_jiffies = uksm_sleep_saved;
5661 + /* We require that a rung scans at least 1 page in a period. */
5662 + nsecs = per_page;
5663 + ratio = rung_real_ratio(ladder[0].cpu_ratio);
5664 + if (cpu_ratio_to_nsec(ratio) < nsecs) {
5665 + sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio
5666 + / NSEC_PER_USEC;
5667 + uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1;
5670 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
5671 + ratio = rung_real_ratio(ladder[i].cpu_ratio);
5672 + ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) /
5673 + per_page;
5674 + BUG_ON(!ladder[i].pages_to_scan);
5675 + uksm_calc_rung_step(&ladder[i], per_page, ratio);
5680 + * Convert the scan time of this round (ns) to the next expected minimum sleep
5681 + * time (ms), being careful of possible overflows. ratio is taken from
5682 + * rung_real_ratio()
5683 + */
5684 +static inline
5685 +unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio)
5687 + scan_time >>= 20; /* to msec level now */
5688 + BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE));
5690 + return (unsigned int) ((unsigned long) scan_time *
5691 + (TIME_RATIO_SCALE - ratio) / ratio);
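
This conversion is the complement of cpu_ratio_to_nsec(): having just spent scan_time at an allowed CPU share of ratio/TIME_RATIO_SCALE, the scanner should now sleep roughly scan_time * (TIME_RATIO_SCALE - ratio) / ratio. A quick numeric sketch, again assuming TIME_RATIO_SCALE = 1000 purely for illustration:

    #include <stdio.h>

    #define TIME_RATIO_SCALE 1000UL     /* assumed scale, illustration only */

    /* scan_time in ns, result in ms; ">> 20" mirrors the coarse ns-to-ms step */
    static unsigned long scan_time_to_sleep_ms(unsigned long long scan_time_ns,
                                               unsigned long ratio)
    {
        unsigned long long scan_ms = scan_time_ns >> 20;

        return (unsigned long)(scan_ms * (TIME_RATIO_SCALE - ratio) / ratio);
    }

    int main(void)
    {
        /* 50 ms of scanning at a 5% CPU budget (ratio 50 out of 1000) */
        printf("sleep ~%lu ms\n",
               scan_time_to_sleep_ms(50ULL * 1000 * 1000, 50));
        return 0;
    }
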
5694 +#define __round_mask(x, y) ((__typeof__(x))((y)-1))
5695 +#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
5697 +static void uksm_vma_enter(struct vma_slot **slots, unsigned long num)
5699 + struct scan_rung *rung;
5701 + rung = &uksm_scan_ladder[0];
5702 + rung_add_new_slots(rung, slots, num);
5705 +static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE];
5707 +static void uksm_enter_all_slots(void)
5709 + struct vma_slot *slot;
5710 + unsigned long index;
5711 + struct list_head empty_vma_list;
5712 + int i;
5714 + i = 0;
5715 + index = 0;
5716 + INIT_LIST_HEAD(&empty_vma_list);
5718 + spin_lock(&vma_slot_list_lock);
5719 + while (!list_empty(&vma_slot_new)) {
5720 + slot = list_entry(vma_slot_new.next,
5721 + struct vma_slot, slot_list);
5723 + if (!slot->vma->anon_vma) {
5724 + list_move(&slot->slot_list, &empty_vma_list);
5725 + } else if (vma_can_enter(slot->vma)) {
5726 + batch_slots[index++] = slot;
5727 + list_del_init(&slot->slot_list);
5728 + } else {
5729 + list_move(&slot->slot_list, &vma_slot_noadd);
5732 + if (++i == SPIN_LOCK_PERIOD ||
5733 + (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) {
5734 + spin_unlock(&vma_slot_list_lock);
5736 + if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) {
5737 + uksm_vma_enter(batch_slots, index);
5738 + index = 0;
5740 + i = 0;
5741 + cond_resched();
5742 + spin_lock(&vma_slot_list_lock);
5746 + list_splice(&empty_vma_list, &vma_slot_new);
5748 + spin_unlock(&vma_slot_list_lock);
5750 + if (index)
5751 + uksm_vma_enter(batch_slots, index);
5755 +static inline int rung_round_finished(struct scan_rung *rung)
5757 + return rung->flags & UKSM_RUNG_ROUND_FINISHED;
5760 +static inline void judge_slot(struct vma_slot *slot)
5762 + struct scan_rung *rung = slot->rung;
5763 + unsigned long dedup;
5764 + int deleted;
5766 + dedup = cal_dedup_ratio(slot);
5767 + if (vma_fully_scanned(slot) && uksm_thrash_threshold)
5768 + deleted = vma_rung_enter(slot, &uksm_scan_ladder[0]);
5769 + else if (dedup && dedup >= uksm_abundant_threshold)
5770 + deleted = vma_rung_up(slot);
5771 + else
5772 + deleted = vma_rung_down(slot);
5774 + slot->pages_merged = 0;
5775 + slot->pages_cowed = 0;
5776 + slot->this_sampled = 0;
5778 + if (vma_fully_scanned(slot))
5779 + slot->pages_scanned = 0;
5781 + slot->last_scanned = slot->pages_scanned;
5783 + /* If it was deleted above, then the rung was already advanced. */
5784 + if (!deleted)
5785 + advance_current_scan(rung);
5789 +static inline int hash_round_finished(void)
5791 + if (scanned_virtual_pages > (uksm_pages_total >> 2)) {
5792 + scanned_virtual_pages = 0;
5793 + if (uksm_pages_scanned)
5794 + fully_scanned_round++;
5796 + return 1;
5797 + } else {
5798 + return 0;
5802 +#define UKSM_MMSEM_BATCH 5
5803 +#define BUSY_RETRY 100
5805 +/**
5806 + * uksm_do_scan() - the main worker function.
5807 + */
5808 +static noinline void uksm_do_scan(void)
5810 + struct vma_slot *slot, *iter;
5811 + struct mm_struct *busy_mm;
5812 + unsigned char round_finished, all_rungs_emtpy;
5813 + int i, err, mmsem_batch;
5814 + unsigned long pcost;
5815 + long long delta_exec;
5816 + unsigned long vpages, max_cpu_ratio;
5817 + unsigned long long start_time, end_time, scan_time;
5818 + unsigned int expected_jiffies;
5820 + might_sleep();
5822 + vpages = 0;
5824 + start_time = task_sched_runtime(current);
5825 + max_cpu_ratio = 0;
5826 + mmsem_batch = 0;
5828 + for (i = 0; i < SCAN_LADDER_SIZE;) {
5829 + struct scan_rung *rung = &uksm_scan_ladder[i];
5830 + unsigned long ratio;
5831 + int busy_retry;
5833 + if (!rung->pages_to_scan) {
5834 + i++;
5835 + continue;
5838 + if (!rung->vma_root.num) {
5839 + rung->pages_to_scan = 0;
5840 + i++;
5841 + continue;
5844 + ratio = rung_real_ratio(rung->cpu_ratio);
5845 + if (ratio > max_cpu_ratio)
5846 + max_cpu_ratio = ratio;
5848 + busy_retry = BUSY_RETRY;
5849 + /*
5850 + * Do not consider rung_round_finished() here, just use up the
5851 + * rung->pages_to_scan quota.
5852 + */
5853 + while (rung->pages_to_scan && rung->vma_root.num &&
5854 + likely(!freezing(current))) {
5855 + int reset = 0;
5857 + slot = rung->current_scan;
5859 + BUG_ON(vma_fully_scanned(slot));
5861 + if (mmsem_batch)
5862 + err = 0;
5863 + else
5864 + err = try_down_read_slot_mmap_sem(slot);
5866 + if (err == -ENOENT) {
5867 +rm_slot:
5868 + rung_rm_slot(slot);
5869 + continue;
5872 + busy_mm = slot->mm;
5874 + if (err == -EBUSY) {
5875 + /* skip other vmas on the same mm */
5876 + do {
5877 + reset = advance_current_scan(rung);
5878 + iter = rung->current_scan;
5879 + busy_retry--;
5880 + if (iter->vma->vm_mm != busy_mm ||
5881 + !busy_retry || reset)
5882 + break;
5883 + } while (1);
5885 + if (iter->vma->vm_mm != busy_mm) {
5886 + continue;
5887 + } else {
5888 + /* scan round finished */
5889 + break;
5893 + BUG_ON(!vma_can_enter(slot->vma));
5894 + if (uksm_test_exit(slot->vma->vm_mm)) {
5895 + mmsem_batch = 0;
5896 + mmap_read_unlock(slot->vma->vm_mm);
5897 + goto rm_slot;
5900 + if (mmsem_batch)
5901 + mmsem_batch--;
5902 + else
5903 + mmsem_batch = UKSM_MMSEM_BATCH;
5905 + /* Ok, we have taken the mmap_sem, ready to scan */
5906 + scan_vma_one_page(slot);
5907 + rung->pages_to_scan--;
5908 + vpages++;
5910 + if (rung->current_offset + rung->step > slot->pages - 1
5911 + || vma_fully_scanned(slot)) {
5912 + mmap_read_unlock(slot->vma->vm_mm);
5913 + judge_slot(slot);
5914 + mmsem_batch = 0;
5915 + } else {
5916 + rung->current_offset += rung->step;
5917 + if (!mmsem_batch)
5918 + mmap_read_unlock(slot->vma->vm_mm);
5921 + busy_retry = BUSY_RETRY;
5922 + cond_resched();
5925 + if (mmsem_batch) {
5926 + mmap_read_unlock(slot->vma->vm_mm);
5927 + mmsem_batch = 0;
5930 + if (freezing(current))
5931 + break;
5933 + cond_resched();
5935 + end_time = task_sched_runtime(current);
5936 + delta_exec = end_time - start_time;
5938 + if (freezing(current))
5939 + return;
5941 + cleanup_vma_slots();
5942 + uksm_enter_all_slots();
5944 + round_finished = 1;
5945 + all_rungs_emtpy = 1;
5946 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
5947 + struct scan_rung *rung = &uksm_scan_ladder[i];
5949 + if (rung->vma_root.num) {
5950 + all_rungs_emtpy = 0;
5951 + if (!rung_round_finished(rung))
5952 + round_finished = 0;
5956 + if (all_rungs_emtpy)
5957 + round_finished = 0;
5959 + if (round_finished) {
5960 + round_update_ladder();
5961 + uksm_eval_round++;
5963 + if (hash_round_finished() && rshash_adjust()) {
5964 + /* Reset the unstable root iff hash strength changed */
5965 + uksm_hash_round++;
5966 + root_unstable_tree = RB_ROOT;
5967 + free_all_tree_nodes(&unstable_tree_node_list);
5970 + /*
5971 + * A number of pages can hang around indefinitely on per-cpu
5972 + * pagevecs, raised page count preventing write_protect_page
5973 + * from merging them. Though it doesn't really matter much,
5974 + * it is puzzling to see some stuck in pages_volatile until
5975 + * other activity jostles them out, and they also prevented
5976 + * LTP's KSM test from succeeding deterministically; so drain
5977 + * them here (here rather than on entry to uksm_do_scan(),
5978 + * so we don't IPI too often when pages_to_scan is set low).
5979 + */
5980 + lru_add_drain_all();
5984 + if (vpages && delta_exec > 0) {
5985 + pcost = (unsigned long) delta_exec / vpages;
5986 + if (likely(uksm_ema_page_time))
5987 + uksm_ema_page_time = ema(pcost, uksm_ema_page_time);
5988 + else
5989 + uksm_ema_page_time = pcost;
5992 + uksm_calc_scan_pages();
5993 + uksm_sleep_real = uksm_sleep_jiffies;
5994 + /* in case of radical cpu bursts, apply the upper bound */
5995 + end_time = task_sched_runtime(current);
5996 + if (max_cpu_ratio && end_time > start_time) {
5997 + scan_time = end_time - start_time;
5998 + expected_jiffies = msecs_to_jiffies(
5999 + scan_time_to_sleep(scan_time, max_cpu_ratio));
6001 + if (expected_jiffies > uksm_sleep_real)
6002 + uksm_sleep_real = expected_jiffies;
6004 + /* We have a 1 second upper bound for responsiveness. */
6005 + if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC)
6006 + uksm_sleep_real = msecs_to_jiffies(1000);
6009 + return;
6012 +static int ksmd_should_run(void)
6014 + return uksm_run & UKSM_RUN_MERGE;
6017 +static int uksm_scan_thread(void *nothing)
6019 + set_freezable();
6020 + set_user_nice(current, 5);
6022 + while (!kthread_should_stop()) {
6023 + mutex_lock(&uksm_thread_mutex);
6024 + if (ksmd_should_run())
6025 + uksm_do_scan();
6026 + mutex_unlock(&uksm_thread_mutex);
6028 + try_to_freeze();
6030 + if (ksmd_should_run()) {
6031 + schedule_timeout_interruptible(uksm_sleep_real);
6032 + uksm_sleep_times++;
6033 + } else {
6034 + wait_event_freezable(uksm_thread_wait,
6035 + ksmd_should_run() || kthread_should_stop());
6038 + return 0;
6041 +void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
6043 + struct stable_node *stable_node;
6044 + struct node_vma *node_vma;
6045 + struct rmap_item *rmap_item;
6046 + int search_new_forks = 0;
6047 + unsigned long address;
6049 + VM_BUG_ON_PAGE(!PageKsm(page), page);
6050 + VM_BUG_ON_PAGE(!PageLocked(page), page);
6052 + stable_node = page_stable_node(page);
6053 + if (!stable_node)
6054 + return;
6055 +again:
6056 + hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) {
6057 + hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) {
6058 + struct anon_vma *anon_vma = rmap_item->anon_vma;
6059 + struct anon_vma_chain *vmac;
6060 + struct vm_area_struct *vma;
6062 + cond_resched();
6063 + anon_vma_lock_read(anon_vma);
6064 + anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
6065 + 0, ULONG_MAX) {
6066 + cond_resched();
6067 + vma = vmac->vma;
6068 + address = get_rmap_addr(rmap_item);
6070 + if (address < vma->vm_start ||
6071 + address >= vma->vm_end)
6072 + continue;
6074 + if ((rmap_item->slot->vma == vma) ==
6075 + search_new_forks)
6076 + continue;
6078 + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
6079 + continue;
6081 + if (!rwc->rmap_one(page, vma, address, rwc->arg)) {
6082 + anon_vma_unlock_read(anon_vma);
6083 + return;
6086 + if (rwc->done && rwc->done(page)) {
6087 + anon_vma_unlock_read(anon_vma);
6088 + return;
6091 + anon_vma_unlock_read(anon_vma);
6094 + if (!search_new_forks++)
6095 + goto again;
6098 +#ifdef CONFIG_MIGRATION
6099 +/* Common ksm interface, but the implementation may be uksm-specific */
6100 +void ksm_migrate_page(struct page *newpage, struct page *oldpage)
6102 + struct stable_node *stable_node;
6104 + VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
6105 + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
6106 + VM_BUG_ON(newpage->mapping != oldpage->mapping);
6108 + stable_node = page_stable_node(newpage);
6109 + if (stable_node) {
6110 + VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
6111 + stable_node->kpfn = page_to_pfn(newpage);
6112 + /*
6113 + * newpage->mapping was set in advance; now we need smp_wmb()
6114 + * to make sure that the new stable_node->kpfn is visible
6115 + * to get_ksm_page() before it can see that oldpage->mapping
6116 + * has gone stale (or that PageSwapCache has been cleared).
6117 + */
6118 + smp_wmb();
6119 + set_page_stable_node(oldpage, NULL);
6122 +#endif /* CONFIG_MIGRATION */
6124 +#ifdef CONFIG_MEMORY_HOTREMOVE
6125 +static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn,
6126 + unsigned long end_pfn)
6128 + struct rb_node *node;
6130 + for (node = rb_first(root_stable_treep); node; node = rb_next(node)) {
6131 + struct stable_node *stable_node;
6133 + stable_node = rb_entry(node, struct stable_node, node);
6134 + if (stable_node->kpfn >= start_pfn &&
6135 + stable_node->kpfn < end_pfn)
6136 + return stable_node;
6138 + return NULL;
6141 +static int uksm_memory_callback(struct notifier_block *self,
6142 + unsigned long action, void *arg)
6144 + struct memory_notify *mn = arg;
6145 + struct stable_node *stable_node;
6147 + switch (action) {
6148 + case MEM_GOING_OFFLINE:
6149 + /*
6150 + * Keep it very simple for now: just lock out ksmd and
6151 + * MADV_UNMERGEABLE while any memory is going offline.
6152 + * mutex_lock_nested() is necessary because lockdep was alarmed
6153 + * that here we take uksm_thread_mutex inside notifier chain
6154 + * mutex, and later take notifier chain mutex inside
6155 + * uksm_thread_mutex to unlock it. But that's safe because both
6156 + * are inside mem_hotplug_mutex.
6157 + */
6158 + mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING);
6159 + break;
6161 + case MEM_OFFLINE:
6162 + /*
6163 + * Most of the work is done by page migration; but there might
6164 + * be a few stable_nodes left over, still pointing to struct
6165 + * pages which have been offlined: prune those from the tree.
6166 + */
6167 + while ((stable_node = uksm_check_stable_tree(mn->start_pfn,
6168 + mn->start_pfn + mn->nr_pages)) != NULL)
6169 + remove_node_from_stable_tree(stable_node, 1, 1);
6170 + /* fallthrough */
6172 + case MEM_CANCEL_OFFLINE:
6173 + mutex_unlock(&uksm_thread_mutex);
6174 + break;
6176 + return NOTIFY_OK;
6178 +#endif /* CONFIG_MEMORY_HOTREMOVE */
6180 +#ifdef CONFIG_SYSFS
6181 +/*
6182 + * This all compiles without CONFIG_SYSFS, but is a waste of space.
6183 + */
6185 +#define UKSM_ATTR_RO(_name) \
6186 + static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
6187 +#define UKSM_ATTR(_name) \
6188 + static struct kobj_attribute _name##_attr = \
6189 + __ATTR(_name, 0644, _name##_show, _name##_store)
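For reference, in kernel context the wrapper above expands as follows for the run attribute defined below, i.e. a read-write kobj_attribute wired to the matching _show/_store pair:

static struct kobj_attribute run_attr =
	__ATTR(run, 0644, run_show, run_store);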
6191 +static ssize_t max_cpu_percentage_show(struct kobject *kobj,
6192 + struct kobj_attribute *attr, char *buf)
6194 + return sprintf(buf, "%u\n", uksm_max_cpu_percentage);
6197 +static ssize_t max_cpu_percentage_store(struct kobject *kobj,
6198 + struct kobj_attribute *attr,
6199 + const char *buf, size_t count)
6201 + unsigned long max_cpu_percentage;
6202 + int err;
6204 + err = kstrtoul(buf, 10, &max_cpu_percentage);
6205 + if (err || max_cpu_percentage > 100)
6206 + return -EINVAL;
6208 + if (max_cpu_percentage == 100)
6209 + max_cpu_percentage = 99;
6210 + else if (max_cpu_percentage < 10)
6211 + max_cpu_percentage = 10;
6213 + uksm_max_cpu_percentage = max_cpu_percentage;
6215 + return count;
6217 +UKSM_ATTR(max_cpu_percentage);
6219 +static ssize_t sleep_millisecs_show(struct kobject *kobj,
6220 + struct kobj_attribute *attr, char *buf)
6222 + return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies));
6225 +static ssize_t sleep_millisecs_store(struct kobject *kobj,
6226 + struct kobj_attribute *attr,
6227 + const char *buf, size_t count)
6229 + unsigned long msecs;
6230 + int err;
6232 + err = kstrtoul(buf, 10, &msecs);
6233 + if (err || msecs > MSEC_PER_SEC)
6234 + return -EINVAL;
6236 + uksm_sleep_jiffies = msecs_to_jiffies(msecs);
6237 + uksm_sleep_saved = uksm_sleep_jiffies;
6239 + return count;
6241 +UKSM_ATTR(sleep_millisecs);
6244 +static ssize_t cpu_governor_show(struct kobject *kobj,
6245 + struct kobj_attribute *attr, char *buf)
6247 + int n = sizeof(uksm_cpu_governor_str) / sizeof(char *);
6248 + int i;
6250 + buf[0] = '\0';
6251 + for (i = 0; i < n ; i++) {
6252 + if (uksm_cpu_governor == i)
6253 + strcat(buf, "[");
6255 + strcat(buf, uksm_cpu_governor_str[i]);
6257 + if (uksm_cpu_governor == i)
6258 + strcat(buf, "]");
6260 + strcat(buf, " ");
6262 + strcat(buf, "\n");
6264 + return strlen(buf);
6267 +static inline void init_performance_values(void)
6269 + int i;
6270 + struct scan_rung *rung;
6271 + struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor;
6274 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6275 + rung = uksm_scan_ladder + i;
6276 + rung->cpu_ratio = preset->cpu_ratio[i];
6277 + rung->cover_msecs = preset->cover_msecs[i];
6280 + uksm_max_cpu_percentage = preset->max_cpu;
6283 +static ssize_t cpu_governor_store(struct kobject *kobj,
6284 + struct kobj_attribute *attr,
6285 + const char *buf, size_t count)
6287 + int n = sizeof(uksm_cpu_governor_str) / sizeof(char *);
6289 + for (n--; n >= 0 ; n--) {
6290 + if (!strncmp(buf, uksm_cpu_governor_str[n],
6291 + strlen(uksm_cpu_governor_str[n])))
6292 + break;
6295 + if (n < 0)
6296 + return -EINVAL;
6297 + else
6298 + uksm_cpu_governor = n;
6300 + init_performance_values();
6302 + return count;
6304 +UKSM_ATTR(cpu_governor);
6306 +static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
6307 + char *buf)
6309 + return sprintf(buf, "%u\n", uksm_run);
6312 +static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
6313 + const char *buf, size_t count)
6315 + int err;
6316 + unsigned long flags;
6318 + err = kstrtoul(buf, 10, &flags);
6319 + if (err || flags > UINT_MAX)
6320 + return -EINVAL;
6321 + if (flags > UKSM_RUN_MERGE)
6322 + return -EINVAL;
6324 + mutex_lock(&uksm_thread_mutex);
6325 + if (uksm_run != flags)
6326 + uksm_run = flags;
6327 + mutex_unlock(&uksm_thread_mutex);
6329 + if (flags & UKSM_RUN_MERGE)
6330 + wake_up_interruptible(&uksm_thread_wait);
6332 + return count;
6334 +UKSM_ATTR(run);
6336 +static ssize_t abundant_threshold_show(struct kobject *kobj,
6337 + struct kobj_attribute *attr, char *buf)
6339 + return sprintf(buf, "%u\n", uksm_abundant_threshold);
6342 +static ssize_t abundant_threshold_store(struct kobject *kobj,
6343 + struct kobj_attribute *attr,
6344 + const char *buf, size_t count)
6346 + int err;
6347 + unsigned long flags;
6349 + err = kstrtoul(buf, 10, &flags);
6350 + if (err || flags > 99)
6351 + return -EINVAL;
6353 + uksm_abundant_threshold = flags;
6355 + return count;
6357 +UKSM_ATTR(abundant_threshold);
6359 +static ssize_t thrash_threshold_show(struct kobject *kobj,
6360 + struct kobj_attribute *attr, char *buf)
6362 + return sprintf(buf, "%u\n", uksm_thrash_threshold);
6365 +static ssize_t thrash_threshold_store(struct kobject *kobj,
6366 + struct kobj_attribute *attr,
6367 + const char *buf, size_t count)
6369 + int err;
6370 + unsigned long flags;
6372 + err = kstrtoul(buf, 10, &flags);
6373 + if (err || flags > 99)
6374 + return -EINVAL;
6376 + uksm_thrash_threshold = flags;
6378 + return count;
6380 +UKSM_ATTR(thrash_threshold);
6382 +static ssize_t cpu_ratios_show(struct kobject *kobj,
6383 + struct kobj_attribute *attr, char *buf)
6385 + int i, size;
6386 + struct scan_rung *rung;
6387 + char *p = buf;
6389 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6390 + rung = &uksm_scan_ladder[i];
6392 + if (rung->cpu_ratio > 0)
6393 + size = sprintf(p, "%d ", rung->cpu_ratio);
6394 + else
6395 + size = sprintf(p, "MAX/%d ",
6396 + TIME_RATIO_SCALE / -rung->cpu_ratio);
6398 + p += size;
6401 + *p++ = '\n';
6402 + *p = '\0';
6404 + return p - buf;
6407 +static ssize_t cpu_ratios_store(struct kobject *kobj,
6408 + struct kobj_attribute *attr,
6409 + const char *buf, size_t count)
6411 + int i, cpuratios[SCAN_LADDER_SIZE], err;
6412 + unsigned long value;
6413 + struct scan_rung *rung;
6414 + char *p, *base, *end = NULL;
+ ssize_t ret = count;
6416 + base = p = kzalloc(count + 2, GFP_KERNEL);
6417 + if (!p)
6418 + return -ENOMEM;
6420 + memcpy(p, buf, count);
6422 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6423 + if (i != SCAN_LADDER_SIZE - 1) {
6424 + end = strchr(p, ' ');
6425 + if (!end) {
6426 + ret = -EINVAL;
+ goto out;
+ }
6428 + *end = '\0';
+ }
6431 + if (strstr(p, "MAX/")) {
6432 + p = strchr(p, '/') + 1;
6433 + err = kstrtoul(p, 10, &value);
6434 + if (err || value > TIME_RATIO_SCALE || !value) {
6435 + ret = -EINVAL;
+ goto out;
+ }
6437 + cpuratios[i] = -(int) (TIME_RATIO_SCALE / value);
6438 + } else {
6439 + err = kstrtoul(p, 10, &value);
6440 + if (err || value > TIME_RATIO_SCALE || !value) {
6441 + ret = -EINVAL;
+ goto out;
+ }
6443 + cpuratios[i] = value;
+ }
6446 + p = end + 1;
+ }
6449 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6450 + rung = &uksm_scan_ladder[i];
6452 + rung->cpu_ratio = cpuratios[i];
+ }
+out:
+ /* free the original allocation, not the advanced cursor */
+ kfree(base);
6455 + return ret;
6457 +UKSM_ATTR(cpu_ratios);
6459 +static ssize_t eval_intervals_show(struct kobject *kobj,
6460 + struct kobj_attribute *attr, char *buf)
6462 + int i, size;
6463 + struct scan_rung *rung;
6464 + char *p = buf;
6466 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6467 + rung = &uksm_scan_ladder[i];
6468 + size = sprintf(p, "%u ", rung->cover_msecs);
6469 + p += size;
6472 + *p++ = '\n';
6473 + *p = '\0';
6475 + return p - buf;
6478 +static ssize_t eval_intervals_store(struct kobject *kobj,
6479 + struct kobj_attribute *attr,
6480 + const char *buf, size_t count)
6482 + int i, err;
6483 + unsigned long values[SCAN_LADDER_SIZE];
6484 + struct scan_rung *rung;
6485 + char *p, *base, *end = NULL;
6486 + ssize_t ret = count;
6488 + base = p = kzalloc(count + 2, GFP_KERNEL);
6489 + if (!p)
6490 + return -ENOMEM;
6492 + memcpy(p, buf, count);
6494 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6495 + if (i != SCAN_LADDER_SIZE - 1) {
6496 + end = strchr(p, ' ');
6497 + if (!end) {
6498 + ret = -EINVAL;
6499 + goto out;
6502 + *end = '\0';
6505 + err = kstrtoul(p, 10, &values[i]);
6506 + if (err) {
6507 + ret = -EINVAL;
6508 + goto out;
6511 + p = end + 1;
6514 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6515 + rung = &uksm_scan_ladder[i];
6517 + rung->cover_msecs = values[i];
6520 +out:
6521 + kfree(base);
6522 + return ret;
6524 +UKSM_ATTR(eval_intervals);
6526 +static ssize_t ema_per_page_time_show(struct kobject *kobj,
6527 + struct kobj_attribute *attr, char *buf)
6529 + return sprintf(buf, "%lu\n", uksm_ema_page_time);
6531 +UKSM_ATTR_RO(ema_per_page_time);
6533 +static ssize_t pages_shared_show(struct kobject *kobj,
6534 + struct kobj_attribute *attr, char *buf)
6536 + return sprintf(buf, "%lu\n", uksm_pages_shared);
6538 +UKSM_ATTR_RO(pages_shared);
6540 +static ssize_t pages_sharing_show(struct kobject *kobj,
6541 + struct kobj_attribute *attr, char *buf)
6543 + return sprintf(buf, "%lu\n", uksm_pages_sharing);
6545 +UKSM_ATTR_RO(pages_sharing);
6547 +static ssize_t pages_unshared_show(struct kobject *kobj,
6548 + struct kobj_attribute *attr, char *buf)
6550 + return sprintf(buf, "%lu\n", uksm_pages_unshared);
6552 +UKSM_ATTR_RO(pages_unshared);
6554 +static ssize_t full_scans_show(struct kobject *kobj,
6555 + struct kobj_attribute *attr, char *buf)
6557 + return sprintf(buf, "%llu\n", fully_scanned_round);
6559 +UKSM_ATTR_RO(full_scans);
6561 +static ssize_t pages_scanned_show(struct kobject *kobj,
6562 + struct kobj_attribute *attr, char *buf)
6564 + unsigned long base = 0;
6565 + u64 delta, ret;
6567 + if (pages_scanned_stored) {
6568 + base = pages_scanned_base;
6569 + ret = pages_scanned_stored;
6570 + delta = uksm_pages_scanned >> base;
6571 + if (CAN_OVERFLOW_U64(ret, delta)) {
6572 + ret >>= 1;
6573 + delta >>= 1;
6574 + base++;
6575 + ret += delta;
6577 + } else {
6578 + ret = uksm_pages_scanned;
6581 + while (ret > ULONG_MAX) {
6582 + ret >>= 1;
6583 + base++;
6586 + if (base)
6587 + return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base);
6588 + else
6589 + return sprintf(buf, "%lu\n", (unsigned long)ret);
6591 +UKSM_ATTR_RO(pages_scanned);
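The scaled reading above can be decoded by shifting; a tiny sketch with sample values (a reading of "300 * 2^4" means roughly 4800 pages):

#include <stdio.h>

int main(void)
{
	unsigned long mantissa = 300, base = 4;	/* sample reading: "300 * 2^4" */

	printf("pages scanned ~= %lu\n", mantissa << base);	/* prints 4800 */
	return 0;
}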
6593 +static ssize_t hash_strength_show(struct kobject *kobj,
6594 + struct kobj_attribute *attr, char *buf)
6596 + return sprintf(buf, "%lu\n", hash_strength);
6598 +UKSM_ATTR_RO(hash_strength);
6600 +static ssize_t sleep_times_show(struct kobject *kobj,
6601 + struct kobj_attribute *attr, char *buf)
6603 + return sprintf(buf, "%llu\n", uksm_sleep_times);
6605 +UKSM_ATTR_RO(sleep_times);
6608 +static struct attribute *uksm_attrs[] = {
6609 + &max_cpu_percentage_attr.attr,
6610 + &sleep_millisecs_attr.attr,
6611 + &cpu_governor_attr.attr,
6612 + &run_attr.attr,
6613 + &ema_per_page_time_attr.attr,
6614 + &pages_shared_attr.attr,
6615 + &pages_sharing_attr.attr,
6616 + &pages_unshared_attr.attr,
6617 + &full_scans_attr.attr,
6618 + &pages_scanned_attr.attr,
6619 + &hash_strength_attr.attr,
6620 + &sleep_times_attr.attr,
6621 + &thrash_threshold_attr.attr,
6622 + &abundant_threshold_attr.attr,
6623 + &cpu_ratios_attr.attr,
6624 + &eval_intervals_attr.attr,
6625 + NULL,
6628 +static struct attribute_group uksm_attr_group = {
6629 + .attrs = uksm_attrs,
6630 + .name = "uksm",
6632 +#endif /* CONFIG_SYSFS */
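A minimal userspace sketch of using these knobs once uksm_init() below registers the group on mm_kobj; the /sys/kernel/mm/uksm/ path follows from the group name above, and the program assumes it runs as root:

#include <stdio.h>

int main(void)
{
	unsigned long pages_sharing = 0;
	FILE *f;

	/* start uksmd scanning (UKSM_RUN_MERGE) */
	f = fopen("/sys/kernel/mm/uksm/run", "w");
	if (!f)
		return 1;
	fputs("1\n", f);
	fclose(f);

	/* read one of the read-only statistics exported above */
	f = fopen("/sys/kernel/mm/uksm/pages_sharing", "r");
	if (!f)
		return 1;
	if (fscanf(f, "%lu", &pages_sharing) == 1)
		printf("pages_sharing=%lu\n", pages_sharing);
	fclose(f);
	return 0;
}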
6634 +static inline void init_scan_ladder(void)
6636 + int i;
6637 + struct scan_rung *rung;
6639 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6640 + rung = uksm_scan_ladder + i;
6641 + slot_tree_init_root(&rung->vma_root);
6644 + init_performance_values();
6645 + uksm_calc_scan_pages();
6648 +static inline int cal_positive_negative_costs(void)
6650 + struct page *p1, *p2;
6651 + unsigned char *addr1, *addr2;
6652 + unsigned long i, time_start, hash_cost;
6653 + unsigned long loopnum = 0;
6655 + /* IMPORTANT: volatile is needed to prevent over-optimization by gcc. */
6656 + volatile u32 hash;
6657 + volatile int ret;
6659 + p1 = alloc_page(GFP_KERNEL);
6660 + if (!p1)
6661 + return -ENOMEM;
6663 + p2 = alloc_page(GFP_KERNEL);
6664 + if (!p2) {
+ __free_page(p1);
6665 + return -ENOMEM;
+ }
6667 + addr1 = kmap_atomic(p1);
6668 + addr2 = kmap_atomic(p2);
6669 + memset(addr1, prandom_u32(), PAGE_SIZE);
6670 + memcpy(addr2, addr1, PAGE_SIZE);
6672 + /* make sure that the two pages differ in the last byte */
6673 + addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1];
6674 + kunmap_atomic(addr2);
6675 + kunmap_atomic(addr1);
6677 + time_start = jiffies;
6678 + while (jiffies - time_start < 100) {
6679 + for (i = 0; i < 100; i++)
6680 + hash = page_hash(p1, HASH_STRENGTH_FULL, 0);
6681 + loopnum += 100;
6683 + hash_cost = (jiffies - time_start);
6685 + time_start = jiffies;
6686 + for (i = 0; i < loopnum; i++)
6687 + ret = pages_identical_with_cost(p1, p2);
6688 + memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start);
6689 + memcmp_cost /= hash_cost;
6690 + pr_info("UKSM: relative memcmp_cost = %lu "
6691 + "hash=%u cmp_ret=%d.\n",
6692 + memcmp_cost, hash, ret);
6694 + __free_page(p1);
6695 + __free_page(p2);
6696 + return 0;
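A worked illustration of the scaling above with made-up timings; HASH_STRENGTH_FULL is defined earlier in this patch and is assumed here to be PAGE_SIZE / sizeof(u32), i.e. 1024 on 4 KiB pages:

#include <stdio.h>

#define HASH_STRENGTH_FULL_EXAMPLE	1024	/* assumed: PAGE_SIZE / sizeof(u32) */

int main(void)
{
	unsigned long hash_jiffies = 100;	/* time for N full-strength hashes */
	unsigned long cmp_jiffies = 25;		/* time for N page comparisons */
	unsigned long memcmp_cost;

	/* comparison cost in units of hash strength: 1024 * 25 / 100 = 256 */
	memcmp_cost = HASH_STRENGTH_FULL_EXAMPLE * cmp_jiffies / hash_jiffies;
	printf("relative memcmp_cost = %lu\n", memcmp_cost);
	return 0;
}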
6699 +static int init_zeropage_hash_table(void)
6701 + struct page *page;
6702 + char *addr;
6703 + int i;
6705 + page = alloc_page(GFP_KERNEL);
6706 + if (!page)
6707 + return -ENOMEM;
6709 + addr = kmap_atomic(page);
6710 + memset(addr, 0, PAGE_SIZE);
6711 + kunmap_atomic(addr);
6713 + zero_hash_table = kmalloc_array(HASH_STRENGTH_MAX, sizeof(u32),
6714 + GFP_KERNEL);
6715 + if (!zero_hash_table) {
+ __free_page(page);
6716 + return -ENOMEM;
+ }
6718 + for (i = 0; i < HASH_STRENGTH_MAX; i++)
6719 + zero_hash_table[i] = page_hash(page, i, 0);
6721 + __free_page(page);
6723 + return 0;
6726 +static inline int init_random_sampling(void)
6728 + unsigned long i;
6730 + random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL);
6731 + if (!random_nums)
6732 + return -ENOMEM;
6734 + for (i = 0; i < HASH_STRENGTH_FULL; i++)
6735 + random_nums[i] = i;
6737 + for (i = 0; i < HASH_STRENGTH_FULL; i++) {
6738 + unsigned long rand_range, swap_index, tmp;
6740 + rand_range = HASH_STRENGTH_FULL - i;
6741 + swap_index = i + prandom_u32() % rand_range;
6742 + tmp = random_nums[i];
6743 + random_nums[i] = random_nums[swap_index];
6744 + random_nums[swap_index] = tmp;
6747 + rshash_state.state = RSHASH_NEW;
6748 + rshash_state.below_count = 0;
6749 + rshash_state.lookup_window_index = 0;
6751 + return cal_positive_negative_costs();
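The permutation loop above is a Fisher-Yates shuffle of the word indices used for sampled hashing; a generic standalone sketch of the same scheme:

#include <stdlib.h>

/* shuffle a[0..n-1] in place, as done above for random_nums */
static void shuffle(unsigned long *a, unsigned long n)
{
	unsigned long i, j, tmp;

	for (i = 0; i < n; i++) {
		j = i + (unsigned long)rand() % (n - i);	/* pick from the unshuffled tail */
		tmp = a[i];
		a[i] = a[j];
		a[j] = tmp;
	}
}

int main(void)
{
	unsigned long idx[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };

	shuffle(idx, 8);
	return 0;
}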
6754 +static int __init uksm_slab_init(void)
6756 + rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0);
6757 + if (!rmap_item_cache)
6758 + goto out;
6760 + stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0);
6761 + if (!stable_node_cache)
6762 + goto out_free1;
6764 + node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0);
6765 + if (!node_vma_cache)
6766 + goto out_free2;
6768 + vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0);
6769 + if (!vma_slot_cache)
6770 + goto out_free3;
6772 + tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0);
6773 + if (!tree_node_cache)
6774 + goto out_free4;
6776 + return 0;
6778 +out_free4:
6779 + kmem_cache_destroy(vma_slot_cache);
6780 +out_free3:
6781 + kmem_cache_destroy(node_vma_cache);
6782 +out_free2:
6783 + kmem_cache_destroy(stable_node_cache);
6784 +out_free1:
6785 + kmem_cache_destroy(rmap_item_cache);
6786 +out:
6787 + return -ENOMEM;
6790 +static void __init uksm_slab_free(void)
6792 + kmem_cache_destroy(stable_node_cache);
6793 + kmem_cache_destroy(rmap_item_cache);
6794 + kmem_cache_destroy(node_vma_cache);
6795 + kmem_cache_destroy(vma_slot_cache);
6796 + kmem_cache_destroy(tree_node_cache);
6799 +/* Common interface shared with ksm, but the behaviour here differs from ksm's. */
6800 +int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
6801 + unsigned long end, int advice, unsigned long *vm_flags)
6803 + int err;
6805 + switch (advice) {
6806 + case MADV_MERGEABLE:
6807 + return 0; /* just ignore the advice */
6809 + case MADV_UNMERGEABLE:
6810 + if (!(*vm_flags & VM_MERGEABLE) || !uksm_flags_can_scan(*vm_flags))
6811 + return 0; /* just ignore the advice */
6813 + if (vma->anon_vma) {
6814 + err = unmerge_uksm_pages(vma, start, end);
6815 + if (err)
6816 + return err;
6819 + uksm_remove_vma(vma);
6820 + *vm_flags &= ~VM_MERGEABLE;
6821 + break;
6824 + return 0;
6827 +/* Common interface to ksm; the implementation is the same as ksm's. */
6828 +struct page *ksm_might_need_to_copy(struct page *page,
6829 + struct vm_area_struct *vma, unsigned long address)
6831 + struct anon_vma *anon_vma = page_anon_vma(page);
6832 + struct page *new_page;
6834 + if (PageKsm(page)) {
6835 + if (page_stable_node(page))
6836 + return page; /* no need to copy it */
6837 + } else if (!anon_vma) {
6838 + return page; /* no need to copy it */
6839 + } else if (anon_vma->root == vma->anon_vma->root &&
6840 + page->index == linear_page_index(vma, address)) {
6841 + return page; /* still no need to copy it */
6843 + if (!PageUptodate(page))
6844 + return page; /* let do_swap_page report the error */
6846 + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
6847 + if (new_page) {
6848 + copy_user_highpage(new_page, page, address, vma);
6850 + SetPageDirty(new_page);
6851 + __SetPageUptodate(new_page);
6852 + __SetPageLocked(new_page);
6855 + return new_page;
6858 +/* Copied from mm/ksm.c; required since kernel 5.1 */
6859 +bool reuse_ksm_page(struct page *page,
6860 + struct vm_area_struct *vma,
6861 + unsigned long address)
6863 +#ifdef CONFIG_DEBUG_VM
6864 + if (WARN_ON(is_zero_pfn(page_to_pfn(page))) ||
6865 + WARN_ON(!page_mapped(page)) ||
6866 + WARN_ON(!PageLocked(page))) {
6867 + dump_page(page, "reuse_ksm_page");
6868 + return false;
6870 +#endif
6872 + if (PageSwapCache(page) || !page_stable_node(page))
6873 + return false;
6874 + /* Prohibit parallel get_ksm_page() */
6875 + if (!page_ref_freeze(page, 1))
6876 + return false;
6878 + page_move_anon_rmap(page, vma);
6879 + page->index = linear_page_index(vma, address);
6880 + page_ref_unfreeze(page, 1);
6882 + return true;
6885 +static int __init uksm_init(void)
6887 + struct task_struct *uksm_thread;
6888 + int err;
6890 + uksm_sleep_jiffies = msecs_to_jiffies(100);
6891 + uksm_sleep_saved = uksm_sleep_jiffies;
6893 + slot_tree_init();
6894 + init_scan_ladder();
6897 + err = init_random_sampling();
6898 + if (err)
6899 + goto out_free2;
6901 + err = uksm_slab_init();
6902 + if (err)
6903 + goto out_free1;
6905 + err = init_zeropage_hash_table();
6906 + if (err)
6907 + goto out_free0;
6909 + uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd");
6910 + if (IS_ERR(uksm_thread)) {
6911 + pr_err("uksm: creating kthread failed\n");
6912 + err = PTR_ERR(uksm_thread);
6913 + goto out_free;
6916 +#ifdef CONFIG_SYSFS
6917 + err = sysfs_create_group(mm_kobj, &uksm_attr_group);
6918 + if (err) {
6919 + pr_err("uksm: register sysfs failed\n");
6920 + kthread_stop(uksm_thread);
6921 + goto out_free;
6923 +#else
6924 + uksm_run = UKSM_RUN_MERGE; /* no way for user to start it */
6926 +#endif /* CONFIG_SYSFS */
6928 +#ifdef CONFIG_MEMORY_HOTREMOVE
6929 + /*
6930 + * Choose a high priority since the callback takes uksm_thread_mutex:
6931 + * later callbacks could only be taking locks which nest within that.
6932 + */
6933 + hotplug_memory_notifier(uksm_memory_callback, 100);
6934 +#endif
6935 + return 0;
6937 +out_free:
6938 + kfree(zero_hash_table);
6939 +out_free0:
6940 + uksm_slab_free();
6941 +out_free1:
6942 + kfree(random_nums);
6943 +out_free2:
6944 + kfree(uksm_scan_ladder);
6945 + return err;
6948 +#ifdef MODULE
6949 +subsys_initcall(uksm_init);
6950 +#else
6951 +late_initcall(uksm_init);
6952 +#endif
6954 diff --git a/mm/vmstat.c b/mm/vmstat.c
6955 index 74b2c374b..ae42103a8 100644
6956 --- a/mm/vmstat.c
6957 +++ b/mm/vmstat.c
6958 @@ -1231,6 +1231,9 @@ const char * const vmstat_text[] = {
6959 "nr_swapcached",
6960 #endif
6962 +#ifdef CONFIG_UKSM
6963 + "nr_uksm_zero_pages",
6964 +#endif
6965 /* enum writeback_stat_item counters */
6966 "nr_dirty_threshold",
6967 "nr_dirty_background_threshold",
6969 2.31.1.305.gd1b10fc6d8