[aur-mirror.git] / kernel26-uksm / uksm-2.6.38-20111223.patch
blob ba8efaf0d076432f30948055b9e37d5e7dc1c8bf
1 diff -Nur linux-2.6.38/arch/x86/kernel/entry_32.S uksm-2.6.38/arch/x86/kernel/entry_32.S
2 --- linux-2.6.38/arch/x86/kernel/entry_32.S 2011-03-15 09:20:32.000000000 +0800
3 +++ uksm-2.6.38/arch/x86/kernel/entry_32.S 2011-12-16 01:09:35.000000000 +0800
4 @@ -1413,7 +1413,7 @@
5 CFI_ADJUST_CFA_OFFSET 4
6 jmp error_code
7 CFI_ENDPROC
8 -END(apf_page_fault)
9 +END(async_page_fault)
10 #endif
13 diff -Nur linux-2.6.38/arch/x86/kernel/entry_64.S uksm-2.6.38/arch/x86/kernel/entry_64.S
14 --- linux-2.6.38/arch/x86/kernel/entry_64.S 2011-03-15 09:20:32.000000000 +0800
15 +++ uksm-2.6.38/arch/x86/kernel/entry_64.S 2011-12-16 01:09:35.000000000 +0800
16 @@ -1248,7 +1248,7 @@
17 decl PER_CPU_VAR(irq_count)
18 jmp error_exit
19 CFI_ENDPROC
20 -END(do_hypervisor_callback)
21 +END(xen_do_hypervisor_callback)
24 * Hypervisor uses this for application faults while it executes.
25 diff -Nur linux-2.6.38/fs/exec.c uksm-2.6.38/fs/exec.c
26 --- linux-2.6.38/fs/exec.c 2011-03-15 09:20:32.000000000 +0800
27 +++ uksm-2.6.38/fs/exec.c 2011-12-16 01:10:09.000000000 +0800
28 @@ -19,7 +19,7 @@
29 * current->executable is only used by the procfs. This allows a dispatch
30 * table to check for several different types of binary formats. We keep
31 * trying until we recognize the file or we run out of supported binary
32 - * formats.
33 + * formats.
36 #include <linux/slab.h>
37 @@ -55,6 +55,7 @@
38 #include <linux/fs_struct.h>
39 #include <linux/pipe_fs_i.h>
40 #include <linux/oom.h>
41 +#include <linux/ksm.h>
43 #include <asm/uaccess.h>
44 #include <asm/mmu_context.h>
45 @@ -85,7 +86,7 @@
46 insert ? list_add(&fmt->lh, &formats) :
47 list_add_tail(&fmt->lh, &formats);
48 write_unlock(&binfmt_lock);
49 - return 0;
50 + return 0;
53 EXPORT_SYMBOL(__register_binfmt);
54 @@ -1106,7 +1107,7 @@
55 group */
57 current->self_exec_id++;
60 flush_signal_handlers(current, 0);
61 flush_old_files(current->files);
63 @@ -1196,8 +1197,8 @@
64 return res;
67 -/*
68 - * Fill the binprm structure from the inode.
69 +/*
70 + * Fill the binprm structure from the inode.
71 * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
73 * This may be called multiple times for binary chains (scripts for example).
74 diff -Nur linux-2.6.38/fs/proc/meminfo.c uksm-2.6.38/fs/proc/meminfo.c
75 --- linux-2.6.38/fs/proc/meminfo.c 2011-03-15 09:20:32.000000000 +0800
76 +++ uksm-2.6.38/fs/proc/meminfo.c 2011-12-16 01:10:10.000000000 +0800
77 @@ -87,6 +87,9 @@
78 "SUnreclaim: %8lu kB\n"
79 "KernelStack: %8lu kB\n"
80 "PageTables: %8lu kB\n"
81 +#ifdef CONFIG_KSM
82 + "KsmSharing: %8lu kB\n"
83 +#endif
84 #ifdef CONFIG_QUICKLIST
85 "Quicklists: %8lu kB\n"
86 #endif
87 @@ -145,6 +148,9 @@
88 K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
89 global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024,
90 K(global_page_state(NR_PAGETABLE)),
91 +#ifdef CONFIG_KSM
92 + K(global_page_state(NR_KSM_PAGES_SHARING)),
93 +#endif
94 #ifdef CONFIG_QUICKLIST
95 K(quicklist_total_size()),
96 #endif
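With this hunk applied and CONFIG_KSM enabled, /proc/meminfo reports the new NR_KSM_PAGES_SHARING counter as one extra line; the value shown here is illustrative only:

    KsmSharing:         1024 kB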
97 diff -Nur linux-2.6.38/include/linux/ksm.h uksm-2.6.38/include/linux/ksm.h
98 --- linux-2.6.38/include/linux/ksm.h 2011-03-15 09:20:32.000000000 +0800
99 +++ uksm-2.6.38/include/linux/ksm.h 2011-12-22 17:46:52.213988023 +0800
100 @@ -20,24 +20,6 @@
101 struct vm_area_struct *vma, unsigned long address);
103 #ifdef CONFIG_KSM
104 -int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
105 - unsigned long end, int advice, unsigned long *vm_flags);
106 -int __ksm_enter(struct mm_struct *mm);
107 -void __ksm_exit(struct mm_struct *mm);
109 -static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
111 - if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
112 - return __ksm_enter(mm);
113 - return 0;
116 -static inline void ksm_exit(struct mm_struct *mm)
118 - if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
119 - __ksm_exit(mm);
123 * A KSM page is one of those write-protected "shared pages" or "merged pages"
124 * which KSM maps into multiple mms, wherever identical anonymous page content
125 @@ -62,6 +44,13 @@
126 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
129 +/* must be done before linked to mm */
130 +extern void ksm_vma_add_new(struct vm_area_struct *vma);
132 +extern void ksm_remove_vma(struct vm_area_struct *vma);
133 +extern int unmerge_ksm_pages(struct vm_area_struct *vma,
134 + unsigned long start, unsigned long end);
137 * When do_swap_page() first faults in from swap what used to be a KSM page,
138 * no problem, it will be assigned to this vma's anon_vma; but thereafter,
139 @@ -90,16 +79,184 @@
140 struct vm_area_struct *, unsigned long, void *), void *arg);
141 void ksm_migrate_page(struct page *newpage, struct page *oldpage);
143 -#else /* !CONFIG_KSM */
144 +/* Each rung of this ladder is a list of VMAs having a same scan ratio */
145 +struct scan_rung {
146 + struct list_head vma_list;
147 + //spinlock_t vma_list_lock;
148 + //struct semaphore sem;
149 + struct list_head *current_scan;
150 + unsigned int pages_to_scan;
151 + unsigned char round_finished; /* rung is ready for the next round */
152 + unsigned char busy_searched;
153 + unsigned long fully_scanned_slots;
154 + unsigned long scan_ratio;
155 + unsigned long vma_num;
156 + //unsigned long vma_finished;
157 + unsigned long scan_turn;
160 +struct vma_slot {
161 + struct list_head ksm_list;
162 + struct list_head slot_list;
163 + unsigned long dedup_ratio;
164 + unsigned long dedup_num;
165 + int ksm_index; /* -1 if vma is not in inter-table,
166 + positive otherwise */
167 + unsigned long pages_scanned;
168 + unsigned long last_scanned;
169 + unsigned long pages_to_scan;
170 + struct scan_rung *rung;
171 + struct page **rmap_list_pool;
172 + unsigned long *pool_counts;
173 + unsigned long pool_size;
174 + struct vm_area_struct *vma;
175 + struct mm_struct *mm;
176 + unsigned long ctime_j;
177 + unsigned long pages;
178 + unsigned char need_sort;
179 + unsigned char need_rerand;
180 + unsigned long slot_scanned; /* It's scanned in this round */
181 + unsigned long fully_scanned; /* the above four to be merged to status bits */
182 + unsigned long pages_cowed; /* pages cowed this round */
183 + unsigned long pages_merged; /* pages merged this round */
185 + /* used for dup vma pair */
186 + struct radix_tree_root dup_tree;
189 -static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
191 - return 0;
194 + * A few notes about the KSM scanning process,
195 + * to make it easier to understand the data structures below:
197 + * In order to reduce excessive scanning, KSM sorts the memory pages by their
198 + * contents into a data structure that holds pointers to the pages' locations.
200 + * Since the contents of the pages may change at any moment, KSM cannot just
201 + * insert the pages into a normal sorted tree and expect it to find anything.
202 + * Therefore KSM uses two data structures - the stable and the unstable tree.
204 + * The stable tree holds pointers to all the merged pages (ksm pages), sorted
205 + * by their contents. Because each such page is write-protected, searching on
206 + * this tree is fully assured to be working (except when pages are unmapped),
207 + * and therefore this tree is called the stable tree.
209 + * In addition to the stable tree, KSM uses a second data structure called the
210 + * unstable tree: this tree holds pointers to pages which have been found to
211 + * be "unchanged for a period of time". The unstable tree sorts these pages
212 + * by their contents, but since they are not write-protected, KSM cannot rely
213 + * upon the unstable tree to work correctly - the unstable tree is liable to
214 + * be corrupted as its contents are modified, and so it is called unstable.
216 + * KSM solves this problem by several techniques:
218 + * 1) The unstable tree is flushed every time KSM completes scanning all
219 + * memory areas, and then the tree is rebuilt again from the beginning.
220 + * 2) KSM will only insert into the unstable tree, pages whose hash value
221 + * has not changed since the previous scan of all memory areas.
222 + * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
223 + * colors of the nodes and not on their contents, assuring that even when
224 + * the tree gets "corrupted" it won't get out of balance, so scanning time
225 + * remains the same (also, searching and inserting nodes in an rbtree uses
226 + * the same algorithm, so we have no overhead when we flush and rebuild).
227 + * 4) KSM never flushes the stable tree, which means that even if it were to
228 + * take 10 attempts to find a page in the unstable tree, once it is found,
229 + * it is secured in the stable tree. (When we scan a new page, we first
230 + * compare it against the stable tree, and then against the unstable tree.)
231 + */
233 -static inline void ksm_exit(struct mm_struct *mm)
237 +/**
238 + * node of either the stable or unstable rbtree
240 + */
241 +struct tree_node {
242 + struct rb_node node; /* link in the main (un)stable rbtree */
243 + struct rb_root sub_root; /* rb_root for sublevel collision rbtree */
244 + u32 hash;
245 + unsigned long count; /* how many sublevel tree nodes */
246 + struct list_head all_list; /* all tree nodes in stable/unstable tree */
250 +/**
251 + * struct stable_node - node of the stable rbtree
252 + * @node: rb node of this ksm page in the stable tree
253 + * @hlist: hlist head of rmap_items using this ksm page
254 + * @kpfn: page frame number of this ksm page
255 + */
256 +struct stable_node {
257 + struct rb_node node; /* link in sub-rbtree */
258 + struct tree_node *tree_node; /* it's tree node root in stable tree, NULL if it's in hell list */
259 + struct hlist_head hlist;
260 + unsigned long kpfn;
261 + u32 hash_max; /* if ==0 then it's not been calculated yet */
262 + //struct vm_area_struct *old_vma;
263 + struct list_head all_list; /* in a list for all stable nodes */
269 +/**
270 + * struct node_vma - group rmap_items linked in a same stable
271 + * node together.
272 + */
273 +struct node_vma {
274 + union {
275 + struct vma_slot *slot;
276 + unsigned long key; /* slot is used as key sorted on hlist */
277 + };
278 + struct hlist_node hlist;
279 + struct hlist_head rmap_hlist;
280 + struct stable_node *head;
281 + unsigned long last_update;
284 +/**
285 + * struct rmap_item - reverse mapping item for virtual addresses
286 + * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
287 + * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
288 + * @mm: the memory structure this rmap_item is pointing into
289 + * @address: the virtual address this rmap_item tracks (+ flags in low bits)
290 + * @node: rb node of this rmap_item in the unstable tree
291 + * @head: pointer to stable_node heading this list in the stable tree
292 + * @hlist: link into hlist of rmap_items hanging off that stable_node
293 + */
294 +struct rmap_item {
295 + struct vma_slot *slot;
296 + struct page *page;
297 + unsigned long address; /* + low bits used for flags below */
298 + /* The scan round on which it was appended to the (un)stable tree */
299 + unsigned long append_round;
301 + /* Which rung scan turn it was last scanned */
302 + //unsigned long last_scan;
303 + unsigned long entry_index;
304 + union {
305 + struct {/* when in unstable tree */
306 + struct rb_node node;
307 + struct tree_node *tree_node;
308 + u32 hash_max;
309 + };
310 + struct { /* when in stable tree */
311 + struct node_vma *head;
312 + struct hlist_node hlist;
313 + struct anon_vma *anon_vma;
314 + };
315 + };
316 +} __attribute__((aligned(4)));
318 +struct rmap_list_entry {
319 + union {
320 + struct rmap_item *item;
321 + unsigned long addr;
322 + };
323 + // lowest bit is used for is_addr tag
324 + //unsigned char is_addr;
325 +} __attribute__((aligned(4))); // 4-byte aligned so entries fit into pages
327 +//extern struct semaphore ksm_scan_sem;
328 +#else /* !CONFIG_KSM */
330 static inline int PageKsm(struct page *page)
332 @@ -107,8 +264,9 @@
335 #ifdef CONFIG_MMU
336 -static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
337 - unsigned long end, int advice, unsigned long *vm_flags)
339 +extern inline int unmerge_ksm_pages(struct vm_area_struct *vma,
340 + unsigned long start, unsigned long end)
342 return 0;
344 diff -Nur linux-2.6.38/include/linux/mm_types.h uksm-2.6.38/include/linux/mm_types.h
345 --- linux-2.6.38/include/linux/mm_types.h 2011-03-15 09:20:32.000000000 +0800
346 +++ uksm-2.6.38/include/linux/mm_types.h 2011-12-16 01:10:13.000000000 +0800
347 @@ -183,6 +183,9 @@
348 #ifdef CONFIG_NUMA
349 struct mempolicy *vm_policy; /* NUMA policy for the VMA */
350 #endif
351 +#ifdef CONFIG_KSM
352 + struct vma_slot *ksm_vma_slot;
353 +#endif
356 struct core_thread {
357 diff -Nur linux-2.6.38/include/linux/mmzone.h uksm-2.6.38/include/linux/mmzone.h
358 --- linux-2.6.38/include/linux/mmzone.h 2011-03-15 09:20:32.000000000 +0800
359 +++ uksm-2.6.38/include/linux/mmzone.h 2011-12-16 01:10:13.000000000 +0800
360 @@ -115,6 +115,9 @@
361 NUMA_OTHER, /* allocation from other node */
362 #endif
363 NR_ANON_TRANSPARENT_HUGEPAGES,
364 +#ifdef CONFIG_KSM
365 + NR_KSM_PAGES_SHARING,
366 +#endif
367 NR_VM_ZONE_STAT_ITEMS };
370 @@ -344,7 +347,7 @@
371 ZONE_PADDING(_pad1_)
373 /* Fields commonly accessed by the page reclaim scanner */
374 - spinlock_t lru_lock;
375 + spinlock_t lru_lock;
376 struct zone_lru {
377 struct list_head list;
378 } lru[NR_LRU_LISTS];
379 @@ -722,7 +725,7 @@
383 - * is_highmem - helper function to quickly check if a struct zone is a
384 + * is_highmem - helper function to quickly check if a struct zone is a
385 * highmem zone or not. This is an attempt to keep references
386 * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum.
387 * @zone - pointer to struct zone variable
388 diff -Nur linux-2.6.38/include/linux/sched.h uksm-2.6.38/include/linux/sched.h
389 --- linux-2.6.38/include/linux/sched.h 2011-03-15 09:20:32.000000000 +0800
390 +++ uksm-2.6.38/include/linux/sched.h 2011-12-16 01:10:13.000000000 +0800
391 @@ -433,7 +433,6 @@
392 # define MMF_DUMP_MASK_DEFAULT_ELF 0
393 #endif
394 /* leave room for more dump flags */
395 -#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */
396 #define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */
398 #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
399 @@ -1280,9 +1279,9 @@
400 unsigned long stack_canary;
401 #endif
403 - /*
404 + /*
405 * pointers to (original) parent process, youngest child, younger sibling,
406 - * older sibling, respectively. (p->father can be replaced with
407 + * older sibling, respectively. (p->father can be replaced with
408 * p->real_parent->pid)
410 struct task_struct *real_parent; /* real parent process */
411 @@ -2080,7 +2079,7 @@
412 spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
414 return ret;
418 extern void block_all_signals(int (*notifier)(void *priv), void *priv,
419 sigset_t *mask);
420 diff -Nur linux-2.6.38/kernel/fork.c uksm-2.6.38/kernel/fork.c
421 --- linux-2.6.38/kernel/fork.c 2011-03-15 09:20:32.000000000 +0800
422 +++ uksm-2.6.38/kernel/fork.c 2011-12-16 01:10:14.000000000 +0800
423 @@ -328,9 +328,6 @@
424 rb_link = &mm->mm_rb.rb_node;
425 rb_parent = NULL;
426 pprev = &mm->mmap;
427 - retval = ksm_fork(mm, oldmm);
428 - if (retval)
429 - goto out;
430 retval = khugepaged_fork(mm, oldmm);
431 if (retval)
432 goto out;
433 @@ -353,7 +350,7 @@
434 goto fail_nomem;
435 charge = len;
437 - tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
438 + tmp = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
439 if (!tmp)
440 goto fail_nomem;
441 *tmp = *mpnt;
442 @@ -406,7 +403,9 @@
443 __vma_link_rb(mm, tmp, rb_link, rb_parent);
444 rb_link = &tmp->vm_rb.rb_right;
445 rb_parent = &tmp->vm_rb;
447 +#ifdef CONFIG_KSM
448 + ksm_vma_add_new(tmp);
449 +#endif
450 mm->map_count++;
451 retval = copy_page_range(mm, oldmm, mpnt);
453 @@ -549,7 +548,6 @@
455 if (atomic_dec_and_test(&mm->mm_users)) {
456 exit_aio(mm);
457 - ksm_exit(mm);
458 khugepaged_exit(mm); /* must run before exit_mmap */
459 exit_mmap(mm);
460 set_mm_exe_file(mm, NULL);
461 diff -Nur linux-2.6.38/mm/ksm.c uksm-2.6.38/mm/ksm.c
462 --- linux-2.6.38/mm/ksm.c 2011-03-15 09:20:32.000000000 +0800
463 +++ uksm-2.6.38/mm/ksm.c 2011-12-22 17:46:27.967320547 +0800
464 @@ -12,6 +12,47 @@
465 * Hugh Dickins
467 * This work is licensed under the terms of the GNU GPL, version 2.
471 + * Ultra KSM. Copyright (C) 2011 Nai Xia
473 + * This is an improvement upon KSM. Its features:
474 + * 1. Full system scan:
475 + * It automatically scans all user processes' anonymous VMAs. Kernel-user
476 + * interaction to submit a memory area to KSM is no longer needed.
478 + * 2. Rich area detection based on random sampling:
479 + * It automatically detects rich areas containing abundant duplicated
480 + * pages based on their randomly-sampled history. Rich areas are given
481 + * a full scan speed. Poor areas are sampled at a reasonable speed with
482 + * very low CPU consumption.
484 + * 3. Per-page scan speed improvement:
485 + * A new hash algorithm (random_sample_hash) is proposed. Quite often,
486 + * it is enough to distinguish pages by hashing part of their content
487 + * instead of full pages. This algorithm can automatically adapt to this
488 + * situation. For the best case, only one 32-bit-word/page is needed to
489 + * get the hash value for distinguishing pages. For the worst case, it's as
490 + * fast as SuperFastHash.
492 + * 4. Thrashing area avoidance:
493 + * A thrashing area (a VMA with frequent KSM page break-outs) can be
494 + * filtered out. My benchmark shows it's more efficient than KSM's per-page
495 + * hash value based volatile page detection.
497 + * 5. Hash-value-based identical page detection:
498 + * It no longer uses "memcmp"-based page detection.
500 + * 6. Misc changes upon KSM:
501 + * * It has a fully x86-optimized memcmp dedicated to 4-byte-aligned page
502 + * comparison. It's much faster than the default C version on x86.
503 + * * rmap_item now has a struct page *page member to loosely cache an
504 + * address-->page mapping, which avoids many costly calls to
505 + * follow_page().
506 + * * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
507 + * * try_to_merge_two_pages() now can revert a pte if it fails. No break_
508 + * ksm is needed for this case.
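The random_sample_hash idea in point 3 can be illustrated with a small, self-contained userspace sketch; it is not code from this patch. Only `strength` sampled 32-bit words of a page are fed to the hash, so a low strength is cheap while a high strength approaches hashing the whole page. The offsets array plays the role of the patch's random_nums[]; the multiply-and-add mixer is a deliberately trivial placeholder, not the patch's hash function.

#include <stddef.h>
#include <stdint.h>

#define TOY_PAGE_WORDS (4096 / sizeof(uint32_t))	/* 1024 words per 4 KiB page */

/* Hash only `strength` sampled words of a page. */
static uint32_t sample_hash(const uint32_t *page, const uint32_t *offsets,
			    unsigned long strength)
{
	uint32_t hash = 0;
	unsigned long i;

	for (i = 0; i < strength; i++)
		hash = hash * 31 + page[offsets[i] % TOY_PAGE_WORDS];
	return hash;
}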
511 #include <linux/errno.h>
512 @@ -33,142 +74,157 @@
513 #include <linux/mmu_notifier.h>
514 #include <linux/swap.h>
515 #include <linux/ksm.h>
516 -#include <linux/hash.h>
517 +#include <linux/crypto.h>
518 +#include <linux/scatterlist.h>
519 +#include <crypto/hash.h>
520 +#include <linux/random.h>
521 +#include <linux/math64.h>
522 +#include <linux/gcd.h>
523 #include <linux/freezer.h>
525 #include <asm/tlbflush.h>
526 #include "internal.h"
528 +#ifdef CONFIG_X86
529 +#undef memcmp
531 +#ifdef CONFIG_X86_32
532 +#define memcmp memcmpx86_32
534 - * A few notes about the KSM scanning process,
535 - * to make it easier to understand the data structures below:
537 - * In order to reduce excessive scanning, KSM sorts the memory pages by their
538 - * contents into a data structure that holds pointers to the pages' locations.
540 - * Since the contents of the pages may change at any moment, KSM cannot just
541 - * insert the pages into a normal sorted tree and expect it to find anything.
542 - * Therefore KSM uses two data structures - the stable and the unstable tree.
544 - * The stable tree holds pointers to all the merged pages (ksm pages), sorted
545 - * by their contents. Because each such page is write-protected, searching on
546 - * this tree is fully assured to be working (except when pages are unmapped),
547 - * and therefore this tree is called the stable tree.
549 - * In addition to the stable tree, KSM uses a second data structure called the
550 - * unstable tree: this tree holds pointers to pages which have been found to
551 - * be "unchanged for a period of time". The unstable tree sorts these pages
552 - * by their contents, but since they are not write-protected, KSM cannot rely
553 - * upon the unstable tree to work correctly - the unstable tree is liable to
554 - * be corrupted as its contents are modified, and so it is called unstable.
556 - * KSM solves this problem by several techniques:
558 - * 1) The unstable tree is flushed every time KSM completes scanning all
559 - * memory areas, and then the tree is rebuilt again from the beginning.
560 - * 2) KSM will only insert into the unstable tree, pages whose hash value
561 - * has not changed since the previous scan of all memory areas.
562 - * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
563 - * colors of the nodes and not on their contents, assuring that even when
564 - * the tree gets "corrupted" it won't get out of balance, so scanning time
565 - * remains the same (also, searching and inserting nodes in an rbtree uses
566 - * the same algorithm, so we have no overhead when we flush and rebuild).
567 - * 4) KSM never flushes the stable tree, which means that even if it were to
568 - * take 10 attempts to find a page in the unstable tree, once it is found,
569 - * it is secured in the stable tree. (When we scan a new page, we first
570 - * compare it against the stable tree, and then against the unstable tree.)
571 + * Compare the 4-byte-aligned addresses s1 and s2, of length n bytes
573 +int memcmpx86_32(void *s1, void *s2, size_t n)
575 + size_t num = n / 4;
576 + register int res;
577 + __asm__ __volatile__
578 + ("cld\n\t"
579 + "testl %3,%3\n\t"
580 + "repe; cmpsd\n\t"
581 + "je 1f\n\t"
582 + "sbbl %0,%0\n\t"
583 + "orl $1,%0\n"
584 + "1:"
585 + : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
586 + : "0" (0)
587 + : "cc");
589 -/**
590 - * struct mm_slot - ksm information per mm that is being scanned
591 - * @link: link to the mm_slots hash list
592 - * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
593 - * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
594 - * @mm: the mm that this information is valid for
595 - */
596 -struct mm_slot {
597 - struct hlist_node link;
598 - struct list_head mm_list;
599 - struct rmap_item *rmap_list;
600 - struct mm_struct *mm;
602 + return res;
605 -/**
606 - * struct ksm_scan - cursor for scanning
607 - * @mm_slot: the current mm_slot we are scanning
608 - * @address: the next address inside that to be scanned
609 - * @rmap_list: link to the next rmap to be scanned in the rmap_list
610 - * @seqnr: count of completed full scans (needed when removing unstable node)
612 - * There is only the one ksm_scan instance of this cursor structure.
613 +#elif defined(CONFIG_X86_64)
614 +#define memcmp memcmpx86_64
616 + * Compare the 8-byte-aligned addresses s1 and s2, of length n bytes
618 -struct ksm_scan {
619 - struct mm_slot *mm_slot;
620 - unsigned long address;
621 - struct rmap_item **rmap_list;
622 - unsigned long seqnr;
624 +int memcmpx86_64(void *s1, void *s2, size_t n)
626 + size_t num = n / 8;
627 + register int res;
628 + __asm__ __volatile__
629 + ("cld\n\t"
630 + "testq %q3,%q3\n\t"
631 + "repe; cmpsq\n\t"
632 + "je 1f\n\t"
633 + "sbbq %q0,%q0\n\t"
634 + "orq $1,%q0\n"
635 + "1:"
636 + : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
637 + : "0" (0)
638 + : "cc");
640 -/**
641 - * struct stable_node - node of the stable rbtree
642 - * @node: rb node of this ksm page in the stable tree
643 - * @hlist: hlist head of rmap_items using this ksm page
644 - * @kpfn: page frame number of this ksm page
645 - */
646 -struct stable_node {
647 - struct rb_node node;
648 - struct hlist_head hlist;
649 - unsigned long kpfn;
651 + return res;
653 +#endif
654 +#endif
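For readers who do not read x86 assembly, the following self-contained C sketch (not part of the patch) does the same job as the two routines above: compare two word-aligned buffers word by word and return zero only if every word matches. The patch keeps the asm versions on x86 and leaves the stock memcmp in place on other architectures.

#include <stddef.h>
#include <stdint.h>

/* Word-wise comparison of two 4-byte-aligned buffers of n bytes; returns 0
 * only if every word matches.  The sign of a non-zero result only loosely
 * mirrors what the sbb/or trick in the asm versions produces.
 */
static int memcmp_words(const void *s1, const void *s2, size_t n)
{
	const uint32_t *a = s1, *b = s2;
	size_t i, num = n / 4;

	for (i = 0; i < num; i++)
		if (a[i] != b[i])
			return a[i] < b[i] ? -1 : 1;
	return 0;
}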
656 -/**
657 - * struct rmap_item - reverse mapping item for virtual addresses
658 - * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
659 - * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
660 - * @mm: the memory structure this rmap_item is pointing into
661 - * @address: the virtual address this rmap_item tracks (+ flags in low bits)
662 - * @oldchecksum: previous checksum of the page at that virtual address
663 - * @node: rb node of this rmap_item in the unstable tree
664 - * @head: pointer to stable_node heading this list in the stable tree
665 - * @hlist: link into hlist of rmap_items hanging off that stable_node
666 - */
667 -struct rmap_item {
668 - struct rmap_item *rmap_list;
669 - struct anon_vma *anon_vma; /* when stable */
670 - struct mm_struct *mm;
671 - unsigned long address; /* + low bits used for flags below */
672 - unsigned int oldchecksum; /* when unstable */
673 - union {
674 - struct rb_node node; /* when node of unstable tree */
675 - struct { /* when listed from stable tree */
676 - struct stable_node *head;
677 - struct hlist_node hlist;
678 - };
679 - };
681 +#ifdef CONFIG_X86
682 +#ifdef CONFIG_X86_32
684 + * Check whether the page is all zeros
685 + */
686 +static int check_zero_page(const void *s1, size_t len)
688 + unsigned char diff;
689 + len = len>>2;
690 + asm("repe; scasl; setnz %0"
691 + : "=qm" (diff), "+D" (s1), "+c" (len)
692 + : "a" (0)
693 + :"cc");
694 + return diff;
697 +#elif defined(CONFIG_X86_64)
698 +static int check_zero_page(const void *s1, size_t len)
700 + unsigned char diff;
701 + len = len>>3;
702 + asm("repe; scasq; setnz %0"
703 + : "=qm" (diff), "+D" (s1), "+c" (len)
704 + : "a" (0)
705 + :"cc");
706 + return diff;
708 +#endif
709 +#else
710 +static int check_zero_page(const void *s1, size_t len)
712 + int ret = 0;
713 + u32 *src = (u32 *)s1;
714 + u32 z = 0;
715 + len = len>>2;
716 + while(len--)
717 + if ((ret = *src++ - z) != 0)
718 + break;
720 -#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
721 -#define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */
722 -#define STABLE_FLAG 0x200 /* is listed from the stable tree */
723 + return ret;
725 +#endif
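A short userspace demonstration of the same convention the three variants above share (0 means "all zero", non-zero otherwise); this is an illustration only, not code from the patch.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Returns 0 iff every 32-bit word of the buffer is zero. */
static int check_zero_words(const uint32_t *src, size_t len)
{
	size_t i, words = len >> 2;

	for (i = 0; i < words; i++)
		if (src[i])
			return 1;
	return 0;
}

int main(void)
{
	uint32_t page[1024];		/* one 4 KiB page worth of 32-bit words */

	memset(page, 0, sizeof(page));
	printf("%d\n", check_zero_words(page, sizeof(page)));	/* 0: all zero */
	page[7] = 42;
	printf("%d\n", check_zero_words(page, sizeof(page)));	/* 1: not zero */
	return 0;
}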
727 -/* The stable and unstable tree heads */
728 -static struct rb_root root_stable_tree = RB_ROOT;
729 -static struct rb_root root_unstable_tree = RB_ROOT;
730 +#define U64_MAX (~((u64)0))
732 -#define MM_SLOTS_HASH_SHIFT 10
733 -#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT)
734 -static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS];
736 -static struct mm_slot ksm_mm_head = {
737 - .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
739 -static struct ksm_scan ksm_scan = {
740 - .mm_slot = &ksm_mm_head,
743 + * Flags for rmap_item to indicate whether it is listed in the stable/unstable tree.
744 + * The flags use the low bits of rmap_item.address
745 + */
746 +#define UNSTABLE_FLAG 0x1
747 +#define STABLE_FLAG 0x2
748 +#define get_rmap_addr(x) ((x)->address & PAGE_MASK)
751 + * rmap_list_entry helpers
752 + */
753 +#define IS_ADDR_FLAG 1
754 +#define is_addr(ptr) ((unsigned long)(ptr) & IS_ADDR_FLAG)
755 +#define set_is_addr(ptr) ((ptr) |= IS_ADDR_FLAG)
756 +#define get_clean_addr(ptr) (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG))
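The four helpers above implement a common kernel trick: a single unsigned long field in rmap_list_entry holds either a pointer to a 4-byte-aligned rmap_item or an address, and the lowest bit serves as the tag that tells them apart. A tiny self-contained demonstration (the macro bodies are copied verbatim from the patch; the rest is illustrative only and builds with GCC, as the kernel does):

#include <assert.h>

#define IS_ADDR_FLAG	1
#define is_addr(ptr)		((unsigned long)(ptr) & IS_ADDR_FLAG)
#define set_is_addr(ptr)	((ptr) |= IS_ADDR_FLAG)
#define get_clean_addr(ptr)	(((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG))

int main(void)
{
	unsigned long entry = 0x1000;	/* a page-aligned address, low bit clear */

	set_is_addr(entry);		/* tag the entry as an address ... */
	assert(is_addr(entry));		/* ... which is_addr() can detect ... */
	assert(get_clean_addr(entry) == 0x1000);	/* ... and strip again */
	return 0;
}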
760 + * High speed caches for frequently allocated and freed structs
761 + */
762 static struct kmem_cache *rmap_item_cache;
763 static struct kmem_cache *stable_node_cache;
764 -static struct kmem_cache *mm_slot_cache;
765 +static struct kmem_cache *node_vma_cache;
766 +static struct kmem_cache *vma_slot_cache;
767 +static struct kmem_cache *tree_node_cache;
768 +#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
769 + sizeof(struct __struct), __alignof__(struct __struct),\
770 + (__flags), NULL)
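For example (matching a use that appears later in this patch), rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0) expands to:

	rmap_item_cache = kmem_cache_create("ksm_rmap_item",
				sizeof(struct rmap_item),
				__alignof__(struct rmap_item),
				0, NULL);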
772 +/* The scan rounds ksmd is currently in */
773 +static unsigned long long ksm_scan_round = 1;
775 +/* The number of pages scanned since startup */
776 +static u64 ksm_pages_scanned;
778 +/* The number of pages scanned when the last scan round finished */
779 +static u64 ksm_pages_scanned_last;
781 +/* If the scanned number grows too large, we encode it here */
782 +static u64 pages_scanned_stored;
783 +static unsigned long pages_scanned_base;
785 /* The number of nodes in the stable tree */
786 static unsigned long ksm_pages_shared;
787 @@ -179,345 +235,408 @@
788 /* The number of nodes in the unstable tree */
789 static unsigned long ksm_pages_unshared;
791 -/* The number of rmap_items in use: to calculate pages_volatile */
792 -static unsigned long ksm_rmap_items;
793 +/* The number of pages remapped to zero pages */
794 +static unsigned long ksm_remap_zero_pages;
796 -/* Number of pages ksmd should scan in one batch */
797 -static unsigned int ksm_thread_pages_to_scan = 100;
799 + * Number of pages ksmd should scan in one batch. This is the top speed for
800 + * richly duplicated areas.
801 + */
802 +static unsigned long ksm_scan_batch_pages = 60000;
804 /* Milliseconds ksmd should sleep between batches */
805 -static unsigned int ksm_thread_sleep_millisecs = 20;
806 +static unsigned int ksm_sleep_jiffies = 2;
809 + * The threshold used to filter out thrashing areas.
810 + * If it is 0, filtering is disabled; otherwise it is the percentage upper bound
811 + * of the thrashing ratio of all areas. Any area with a bigger thrashing ratio
812 + * is considered to have a zero duplication ratio.
813 + */
814 +static unsigned int ksm_thrash_threshold = 50;
816 +/* To avoid floating point arithmetic, this is the scale factor of a
817 + * deduplication ratio number.
818 + */
819 +#define KSM_DEDUP_RATIO_SCALE 100
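A worked example of this fixed-point convention, using the macro defined just above (illustration only; the exact formula used elsewhere in the patch is not part of this excerpt):

	unsigned long pages_merged = 37, pages_scanned = 1000;
	unsigned long dedup_ratio = pages_merged * KSM_DEDUP_RATIO_SCALE / pages_scanned;
	/* dedup_ratio == 3, i.e. roughly 3%, with no floating point involved */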
822 +#define KSM_SCAN_RATIO_MAX 125
824 +/* minimum scan ratio for a vma, in unit of 1/KSM_SCAN_RATIO_MAX */
825 +static unsigned int ksm_min_scan_ratio = 1;
828 + * After each scan round, the scan ratio of an area with a big deduplication
829 + * ratio is upgraded by *=ksm_scan_ratio_delta
830 + */
831 +static unsigned int ksm_scan_ratio_delta = 5;
834 + * Inter-vma duplication number table page pointer array, initialized at
835 + * startup. Whenever ksmd finds that two areas have an identical page,
836 + * their corresponding table entry is increased. After each scan round
837 + * is finished, this table is scanned to calculate the estimated
838 + * duplication ratio for VMAs. Only a limited number (2048) of VMAs is
839 + * supported for now. We will migrate it to more scalable data structures
840 + * in the future.
841 + */
842 +#define KSM_DUP_VMA_MAX 2048
844 +#define INDIRECT_OFFSET 1
847 + * For mapping of vma_slot and its index in inter-vma duplication number
848 + * table
849 + */
850 +static struct radix_tree_root ksm_vma_tree;
851 +static unsigned long ksm_vma_tree_num;
852 +static unsigned long ksm_vma_tree_index_end;
854 +/* Array of all scan_rung, ksm_scan_ladder[0] having the minimum scan ratio */
855 +static struct scan_rung *ksm_scan_ladder;
856 +static unsigned int ksm_scan_ladder_size;
858 +/* The number of VMAs we are keeping track of */
859 +static unsigned long ksm_vma_slot_num;
861 +/* How many times the ksmd has slept since startup */
862 +static u64 ksm_sleep_times;
864 #define KSM_RUN_STOP 0
865 #define KSM_RUN_MERGE 1
866 -#define KSM_RUN_UNMERGE 2
867 -static unsigned int ksm_run = KSM_RUN_STOP;
868 +static unsigned int ksm_run = KSM_RUN_MERGE;
870 static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
871 static DEFINE_MUTEX(ksm_thread_mutex);
872 -static DEFINE_SPINLOCK(ksm_mmlist_lock);
874 -#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
875 - sizeof(struct __struct), __alignof__(struct __struct),\
876 - (__flags), NULL)
878 + * List vma_slot_new is for newly created vma_slot waiting to be added by
879 + * ksmd. If one cannot be added (e.g. because it is too small), it is moved to
880 + * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding
881 + * VMA has been removed/freed.
882 + */
883 +struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new);
884 +struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd);
885 +struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del);
886 +static DEFINE_SPINLOCK(vma_slot_list_lock);
888 -static int __init ksm_slab_init(void)
889 +/* The unstable tree heads */
890 +static struct rb_root root_unstable_tree = RB_ROOT;
893 + * All tree_nodes are in a list to be freed at once when unstable tree is
894 + * freed after each scan round.
895 + */
896 +static struct list_head unstable_tree_node_list =
897 + LIST_HEAD_INIT(unstable_tree_node_list);
899 +/* List contains all stable nodes */
900 +static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list);
903 + * When the hash strength is changed, the stable tree must be delta_hashed and
904 + * re-structured. We use two sets of the structs below to speed up the
905 + * re-structuring of the stable tree.
906 + */
907 +static struct list_head
908 +stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]),
909 + LIST_HEAD_INIT(stable_tree_node_list[1])};
911 +static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0];
912 +static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT};
913 +static struct rb_root *root_stable_treep = &root_stable_tree[0];
914 +static unsigned long stable_tree_index;
916 +/* The hash strength needed to hash a full page */
917 +#define HASH_STRENGTH_FULL (PAGE_SIZE / sizeof(u32))
919 +/* The hash strength needed for loop-back hashing */
920 +#define HASH_STRENGTH_MAX (HASH_STRENGTH_FULL + 10)
922 +/* The random offsets in a page */
923 +static u32 *random_nums;
925 +/* The hash strength */
926 +static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4;
928 +/* The delta value each time the hash strength increases or decreases */
929 +static unsigned long hash_strength_delta;
930 +#define HASH_STRENGTH_DELTA_MAX 5
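Worked numbers for the definitions above, assuming a 4 KiB page size: HASH_STRENGTH_FULL = 4096 / sizeof(u32) = 1024 sampled words, HASH_STRENGTH_MAX = 1024 + 10 = 1034, and the initial hash_strength of HASH_STRENGTH_FULL >> 4 = 64 means ksmd starts out sampling only 64 of the 1024 words in each page.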
932 +/* The time we have saved due to random_sample_hash */
933 +static u64 rshash_pos;
935 +/* The time we have wasted due to hash collision */
936 +static u64 rshash_neg;
938 +struct ksm_benefit {
939 + u64 pos;
940 + u64 neg;
941 + u64 scanned;
942 + unsigned long base;
943 +} benefit;
946 + * The relative cost of memcmp, compared to 1 time unit of random sample
947 + * hash. This value is measured when the ksm module is initialized.
948 + */
949 +static unsigned long memcmp_cost;
951 +static unsigned long rshash_neg_cont_zero;
952 +static unsigned long rshash_cont_obscure;
954 +/* The possible states of hash strength adjustment heuristic */
955 +enum rshash_states {
956 + RSHASH_STILL,
957 + RSHASH_TRYUP,
958 + RSHASH_TRYDOWN,
959 + RSHASH_NEW,
960 + RSHASH_PRE_STILL,
963 +/* The possible direction we are about to adjust hash strength */
964 +enum rshash_direct {
965 + GO_UP,
966 + GO_DOWN,
967 + OBSCURE,
968 + STILL,
971 +/* random sampling hash state machine */
972 +static struct {
973 + enum rshash_states state;
974 + enum rshash_direct pre_direct;
975 + u8 below_count;
976 + /* Keep a lookup window of size 5; if above_count/below_count > 3
977 + * in this window, we stop trying.
978 + */
979 + u8 lookup_window_index;
980 + u64 stable_benefit;
981 + unsigned long turn_point_down;
982 + unsigned long turn_benefit_down;
983 + unsigned long turn_point_up;
984 + unsigned long turn_benefit_up;
985 + unsigned long stable_point;
986 +} rshash_state;
988 +/*zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX]*/
989 +static u32 *zero_hash_table;
991 +extern unsigned long zero_pfn __read_mostly;
993 +static inline struct node_vma *alloc_node_vma(void)
995 - rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
996 - if (!rmap_item_cache)
997 - goto out;
998 + struct node_vma *node_vma;
999 + node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL);
1000 + if (node_vma) {
1001 + INIT_HLIST_HEAD(&node_vma->rmap_hlist);
1002 + INIT_HLIST_NODE(&node_vma->hlist);
1003 + node_vma->last_update = 0;
1005 + return node_vma;
1008 - stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
1009 - if (!stable_node_cache)
1010 - goto out_free1;
1011 +static inline void free_node_vma(struct node_vma *node_vma)
1013 + kmem_cache_free(node_vma_cache, node_vma);
1016 - mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
1017 - if (!mm_slot_cache)
1018 - goto out_free2;
1020 - return 0;
1021 +static inline struct vma_slot *alloc_vma_slot(void)
1023 + struct vma_slot *slot;
1025 -out_free2:
1026 - kmem_cache_destroy(stable_node_cache);
1027 -out_free1:
1028 - kmem_cache_destroy(rmap_item_cache);
1029 -out:
1030 - return -ENOMEM;
1031 + /*
1032 + * In case ksm has not been initialized by now.
1033 + * We still need to reconsider the call site of ksm_init() in the future.
1034 + */
1035 + if (!vma_slot_cache)
1036 + return NULL;
1038 + slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL);
1039 + if (slot) {
1040 + INIT_LIST_HEAD(&slot->ksm_list);
1041 + INIT_LIST_HEAD(&slot->slot_list);
1042 + INIT_RADIX_TREE(&slot->dup_tree, GFP_KERNEL);
1043 + slot->ksm_index = -1;
1044 + slot->need_rerand = 1;
1046 + return slot;
1049 -static void __init ksm_slab_free(void)
1050 +static inline void free_vma_slot(struct vma_slot *vma_slot)
1052 - kmem_cache_destroy(mm_slot_cache);
1053 - kmem_cache_destroy(stable_node_cache);
1054 - kmem_cache_destroy(rmap_item_cache);
1055 - mm_slot_cache = NULL;
1056 + kmem_cache_free(vma_slot_cache, vma_slot);
1061 static inline struct rmap_item *alloc_rmap_item(void)
1063 struct rmap_item *rmap_item;
1065 rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
1066 - if (rmap_item)
1067 - ksm_rmap_items++;
1068 + if (rmap_item) {
1069 + /* BUG if the lowest bit is not clear; it is reserved for flag use */
1070 + BUG_ON(is_addr(rmap_item));
1072 return rmap_item;
1075 static inline void free_rmap_item(struct rmap_item *rmap_item)
1077 - ksm_rmap_items--;
1078 - rmap_item->mm = NULL; /* debug safety */
1079 + rmap_item->slot = NULL; /* debug safety */
1080 kmem_cache_free(rmap_item_cache, rmap_item);
1083 static inline struct stable_node *alloc_stable_node(void)
1085 - return kmem_cache_alloc(stable_node_cache, GFP_KERNEL);
1086 + struct stable_node *node;
1087 + node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | GFP_ATOMIC);
1088 + if (!node)
1089 + return NULL;
1091 + INIT_HLIST_HEAD(&node->hlist);
1092 + list_add(&node->all_list, &stable_node_list);
1093 + return node;
1096 static inline void free_stable_node(struct stable_node *stable_node)
1098 + list_del(&stable_node->all_list);
1099 kmem_cache_free(stable_node_cache, stable_node);
1102 -static inline struct mm_slot *alloc_mm_slot(void)
1103 +static inline struct tree_node *alloc_tree_node(struct list_head *list)
1105 - if (!mm_slot_cache) /* initialization failed */
1106 + struct tree_node *node;
1107 + node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | GFP_ATOMIC);
1108 + if (!node)
1109 return NULL;
1110 - return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
1112 + list_add(&node->all_list, list);
1113 + return node;
1116 -static inline void free_mm_slot(struct mm_slot *mm_slot)
1117 +static inline void free_tree_node(struct tree_node *node)
1119 - kmem_cache_free(mm_slot_cache, mm_slot);
1120 + list_del(&node->all_list);
1121 + kmem_cache_free(tree_node_cache, node);
1124 -static struct mm_slot *get_mm_slot(struct mm_struct *mm)
1125 +static void ksm_drop_anon_vma(struct rmap_item *rmap_item)
1127 - struct mm_slot *mm_slot;
1128 - struct hlist_head *bucket;
1129 - struct hlist_node *node;
1130 + struct anon_vma *anon_vma = rmap_item->anon_vma;
1132 - bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
1133 - hlist_for_each_entry(mm_slot, node, bucket, link) {
1134 - if (mm == mm_slot->mm)
1135 - return mm_slot;
1137 - return NULL;
1138 + drop_anon_vma(anon_vma);
1141 -static void insert_to_mm_slots_hash(struct mm_struct *mm,
1142 - struct mm_slot *mm_slot)
1144 +/**
1145 + * Remove a stable node from stable_tree, may unlink from its tree_node and
1146 + * may remove its parent tree_node if no other stable node is pending.
1148 + * @stable_node The node to be removed
1149 + * @unlink_rb Will this node be unlinked from the rbtree?
1150 + * @remove_tree_node Will its tree_node be removed if empty?
1151 + */
1152 +static void remove_node_from_stable_tree(struct stable_node *stable_node,
1153 + int unlink_rb, int remove_tree_node)
1155 - struct hlist_head *bucket;
1156 + struct node_vma *node_vma;
1157 + struct rmap_item *rmap_item;
1158 + struct hlist_node *hlist, *rmap_hlist, *n;
1160 - bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
1161 - mm_slot->mm = mm;
1162 - hlist_add_head(&mm_slot->link, bucket);
1164 + if (!hlist_empty(&stable_node->hlist)) {
1165 + hlist_for_each_entry_safe(node_vma, hlist, n,
1166 + &stable_node->hlist, hlist) {
1167 + hlist_for_each_entry(rmap_item, rmap_hlist,
1168 + &node_vma->rmap_hlist, hlist) {
1169 + ksm_pages_sharing--;
1171 -static inline int in_stable_tree(struct rmap_item *rmap_item)
1173 - return rmap_item->address & STABLE_FLAG;
1175 + ksm_drop_anon_vma(rmap_item);
1176 + rmap_item->address &= PAGE_MASK;
1178 + free_node_vma(node_vma);
1179 + cond_resched();
1182 -static void hold_anon_vma(struct rmap_item *rmap_item,
1183 - struct anon_vma *anon_vma)
1185 - rmap_item->anon_vma = anon_vma;
1186 - get_anon_vma(anon_vma);
1188 + /* the last one is counted as shared */
1189 + ksm_pages_shared--;
1190 + ksm_pages_sharing++;
1193 -static void ksm_drop_anon_vma(struct rmap_item *rmap_item)
1195 - struct anon_vma *anon_vma = rmap_item->anon_vma;
1196 + if (stable_node->tree_node && unlink_rb) {
1197 + rb_erase(&stable_node->node,
1198 + &stable_node->tree_node->sub_root);
1200 + if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) &&
1201 + remove_tree_node) {
1202 + rb_erase(&stable_node->tree_node->node,
1203 + root_stable_treep);
1204 + free_tree_node(stable_node->tree_node);
1205 + } else {
1206 + stable_node->tree_node->count--;
1210 - drop_anon_vma(anon_vma);
1211 + free_stable_node(stable_node);
1215 - * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
1216 - * page tables after it has passed through ksm_exit() - which, if necessary,
1217 - * takes mmap_sem briefly to serialize against them. ksm_exit() does not set
1218 - * a special flag: they can just back out as soon as mm_users goes to zero.
1219 - * ksm_test_exit() is used throughout to make this test for exit: in some
1220 - * places for correctness, in some places just to avoid unnecessary work.
1221 - */
1222 -static inline bool ksm_test_exit(struct mm_struct *mm)
1224 - return atomic_read(&mm->mm_users) == 0;
1228 - * We use break_ksm to break COW on a ksm page: it's a stripped down
1229 + * get_ksm_page: checks if the page indicated by the stable node
1230 + * is still its ksm page, despite having held no reference to it.
1231 + * In which case we can trust the content of the page, and it
1232 + * returns the gotten page; but if the page has now been zapped,
1233 + * remove the stale node from the stable tree and return NULL.
1235 - * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
1236 - * put_page(page);
1237 + * You would expect the stable_node to hold a reference to the ksm page.
1238 + * But if it increments the page's count, swapping out has to wait for
1239 + * ksmd to come around again before it can free the page, which may take
1240 + * seconds or even minutes: much too unresponsive. So instead we use a
1241 + * "keyhole reference": access to the ksm page from the stable node peeps
1242 + * out through its keyhole to see if that page still holds the right key,
1243 + * pointing back to this stable node. This relies on freeing a PageAnon
1244 + * page to reset its page->mapping to NULL, and relies on no other use of
1245 + * a page to put something that might look like our key in page->mapping.
1247 - * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
1248 - * in case the application has unmapped and remapped mm,addr meanwhile.
1249 - * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
1250 - * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
1251 + * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
1252 + * but this is different - made simpler by ksm_thread_mutex being held, but
1253 + * interesting for assuming that no other use of the struct page could ever
1254 + * put our expected_mapping into page->mapping (or a field of the union which
1255 + * coincides with page->mapping). The RCU calls are not for KSM at all, but
1256 + * to keep the page_count protocol described with page_cache_get_speculative.
1258 + * Note: it is possible that get_ksm_page() will return NULL one moment,
1259 + * then page the next, if the page is in between page_freeze_refs() and
1260 + * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
1261 + * is on its way to being freed; but it is an anomaly to bear in mind.
1263 + * @unlink_rb: whether removal of this node should first unlink it from
1264 + * its rbtree. stable_node_reinsert will prevent this when restructuring the
1265 + * node from its old tree.
1267 + * @remove_tree_node: if this is the last one of its tree_node, will the
1268 + * tree_node be freed? If we are inserting a stable node, this tree_node may
1269 + * be reused, so don't free it.
1271 -static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
1272 +static struct page *get_ksm_page(struct stable_node *stable_node,
1273 + int unlink_rb, int remove_tree_node)
1275 struct page *page;
1276 - int ret = 0;
1277 + void *expected_mapping;
1279 - do {
1280 - cond_resched();
1281 - page = follow_page(vma, addr, FOLL_GET);
1282 - if (IS_ERR_OR_NULL(page))
1283 - break;
1284 - if (PageKsm(page))
1285 - ret = handle_mm_fault(vma->vm_mm, vma, addr,
1286 - FAULT_FLAG_WRITE);
1287 - else
1288 - ret = VM_FAULT_WRITE;
1289 - put_page(page);
1290 - } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
1291 - /*
1292 - * We must loop because handle_mm_fault() may back out if there's
1293 - * any difficulty e.g. if pte accessed bit gets updated concurrently.
1295 - * VM_FAULT_WRITE is what we have been hoping for: it indicates that
1296 - * COW has been broken, even if the vma does not permit VM_WRITE;
1297 - * but note that a concurrent fault might break PageKsm for us.
1299 - * VM_FAULT_SIGBUS could occur if we race with truncation of the
1300 - * backing file, which also invalidates anonymous pages: that's
1301 - * okay, that truncation will have unmapped the PageKsm for us.
1303 - * VM_FAULT_OOM: at the time of writing (late July 2009), setting
1304 - * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
1305 - * current task has TIF_MEMDIE set, and will be OOM killed on return
1306 - * to user; and ksmd, having no mm, would never be chosen for that.
1308 - * But if the mm is in a limited mem_cgroup, then the fault may fail
1309 - * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
1310 - * even ksmd can fail in this way - though it's usually breaking ksm
1311 - * just to undo a merge it made a moment before, so unlikely to oom.
1313 - * That's a pity: we might therefore have more kernel pages allocated
1314 - * than we're counting as nodes in the stable tree; but ksm_do_scan
1315 - * will retry to break_cow on each pass, so should recover the page
1316 - * in due course. The important thing is to not let VM_MERGEABLE
1317 - * be cleared while any such pages might remain in the area.
1318 - */
1319 - return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
1322 -static void break_cow(struct rmap_item *rmap_item)
1324 - struct mm_struct *mm = rmap_item->mm;
1325 - unsigned long addr = rmap_item->address;
1326 - struct vm_area_struct *vma;
1328 - /*
1329 - * It is not an accident that whenever we want to break COW
1330 - * to undo, we also need to drop a reference to the anon_vma.
1331 - */
1332 - ksm_drop_anon_vma(rmap_item);
1334 - down_read(&mm->mmap_sem);
1335 - if (ksm_test_exit(mm))
1336 - goto out;
1337 - vma = find_vma(mm, addr);
1338 - if (!vma || vma->vm_start > addr)
1339 - goto out;
1340 - if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
1341 - goto out;
1342 - break_ksm(vma, addr);
1343 -out:
1344 - up_read(&mm->mmap_sem);
1347 -static struct page *page_trans_compound_anon(struct page *page)
1349 - if (PageTransCompound(page)) {
1350 - struct page *head = compound_trans_head(page);
1351 - /*
1352 - * head may actually be splitted and freed from under
1353 - * us but it's ok here.
1354 - */
1355 - if (PageAnon(head))
1356 - return head;
1358 - return NULL;
1361 -static struct page *get_mergeable_page(struct rmap_item *rmap_item)
1363 - struct mm_struct *mm = rmap_item->mm;
1364 - unsigned long addr = rmap_item->address;
1365 - struct vm_area_struct *vma;
1366 - struct page *page;
1368 - down_read(&mm->mmap_sem);
1369 - if (ksm_test_exit(mm))
1370 - goto out;
1371 - vma = find_vma(mm, addr);
1372 - if (!vma || vma->vm_start > addr)
1373 - goto out;
1374 - if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
1375 - goto out;
1377 - page = follow_page(vma, addr, FOLL_GET);
1378 - if (IS_ERR_OR_NULL(page))
1379 - goto out;
1380 - if (PageAnon(page) || page_trans_compound_anon(page)) {
1381 - flush_anon_page(vma, page, addr);
1382 - flush_dcache_page(page);
1383 - } else {
1384 - put_page(page);
1385 -out: page = NULL;
1387 - up_read(&mm->mmap_sem);
1388 - return page;
1391 -static void remove_node_from_stable_tree(struct stable_node *stable_node)
1393 - struct rmap_item *rmap_item;
1394 - struct hlist_node *hlist;
1396 - hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
1397 - if (rmap_item->hlist.next)
1398 - ksm_pages_sharing--;
1399 - else
1400 - ksm_pages_shared--;
1401 - ksm_drop_anon_vma(rmap_item);
1402 - rmap_item->address &= PAGE_MASK;
1403 - cond_resched();
1406 - rb_erase(&stable_node->node, &root_stable_tree);
1407 - free_stable_node(stable_node);
1411 - * get_ksm_page: checks if the page indicated by the stable node
1412 - * is still its ksm page, despite having held no reference to it.
1413 - * In which case we can trust the content of the page, and it
1414 - * returns the gotten page; but if the page has now been zapped,
1415 - * remove the stale node from the stable tree and return NULL.
1417 - * You would expect the stable_node to hold a reference to the ksm page.
1418 - * But if it increments the page's count, swapping out has to wait for
1419 - * ksmd to come around again before it can free the page, which may take
1420 - * seconds or even minutes: much too unresponsive. So instead we use a
1421 - * "keyhole reference": access to the ksm page from the stable node peeps
1422 - * out through its keyhole to see if that page still holds the right key,
1423 - * pointing back to this stable node. This relies on freeing a PageAnon
1424 - * page to reset its page->mapping to NULL, and relies on no other use of
1425 - * a page to put something that might look like our key in page->mapping.
1427 - * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
1428 - * but this is different - made simpler by ksm_thread_mutex being held, but
1429 - * interesting for assuming that no other use of the struct page could ever
1430 - * put our expected_mapping into page->mapping (or a field of the union which
1431 - * coincides with page->mapping). The RCU calls are not for KSM at all, but
1432 - * to keep the page_count protocol described with page_cache_get_speculative.
1434 - * Note: it is possible that get_ksm_page() will return NULL one moment,
1435 - * then page the next, if the page is in between page_freeze_refs() and
1436 - * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
1437 - * is on its way to being freed; but it is an anomaly to bear in mind.
1438 - */
1439 -static struct page *get_ksm_page(struct stable_node *stable_node)
1441 - struct page *page;
1442 - void *expected_mapping;
1444 - page = pfn_to_page(stable_node->kpfn);
1445 - expected_mapping = (void *)stable_node +
1446 - (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
1447 - rcu_read_lock();
1448 - if (page->mapping != expected_mapping)
1449 - goto stale;
1450 - if (!get_page_unless_zero(page))
1451 - goto stale;
1452 - if (page->mapping != expected_mapping) {
1453 + page = pfn_to_page(stable_node->kpfn);
1454 + expected_mapping = (void *)stable_node +
1455 + (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
1456 + rcu_read_lock();
1457 + if (page->mapping != expected_mapping)
1458 + goto stale;
1459 + if (!get_page_unless_zero(page))
1460 + goto stale;
1461 + if (page->mapping != expected_mapping) {
1462 put_page(page);
1463 goto stale;
1465 @@ -525,7 +644,8 @@
1466 return page;
1467 stale:
1468 rcu_read_unlock();
1469 - remove_node_from_stable_tree(stable_node);
1470 + remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node);
1472 return NULL;
1475 @@ -533,32 +653,46 @@
1476 * Removing rmap_item from stable or unstable tree.
1477 * This function will clean the information from the stable/unstable tree.
1479 -static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
1480 +static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
1482 if (rmap_item->address & STABLE_FLAG) {
1483 struct stable_node *stable_node;
1484 + struct node_vma *node_vma;
1485 struct page *page;
1487 - stable_node = rmap_item->head;
1488 - page = get_ksm_page(stable_node);
1489 + node_vma = rmap_item->head;
1490 + stable_node = node_vma->head;
1491 + page = get_ksm_page(stable_node, 1, 1);
1492 if (!page)
1493 goto out;
1495 + /*
1496 + * page lock is needed because it's racing with
1497 + * try_to_unmap_ksm(), etc.
1498 + */
1499 lock_page(page);
1500 hlist_del(&rmap_item->hlist);
1502 + if (hlist_empty(&node_vma->rmap_hlist)) {
1503 + hlist_del(&node_vma->hlist);
1504 + free_node_vma(node_vma);
1506 unlock_page(page);
1507 - put_page(page);
1509 - if (stable_node->hlist.first)
1510 - ksm_pages_sharing--;
1511 - else
1512 + put_page(page);
1513 + if (hlist_empty(&stable_node->hlist)) {
1514 + /* do NOT call remove_node_from_stable_tree() here,
1515 + * it's possible for a forked rmap_item to not be in the
1516 + * stable tree while the in-tree rmap_items have been
1517 + * deleted.
1518 + */
1519 ksm_pages_shared--;
1520 + } else
1521 + ksm_pages_sharing--;
1523 - ksm_drop_anon_vma(rmap_item);
1524 - rmap_item->address &= PAGE_MASK;
1526 + ksm_drop_anon_vma(rmap_item);
1527 } else if (rmap_item->address & UNSTABLE_FLAG) {
1528 - unsigned char age;
1530 * Usually ksmd can and must skip the rb_erase, because
1531 * root_unstable_tree was already reset to RB_ROOT.
1532 @@ -566,173 +700,458 @@
1533 * if this rmap_item was inserted by this scan, rather
1534 * than left over from before.
1536 - age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
1537 - BUG_ON(age > 1);
1538 - if (!age)
1539 - rb_erase(&rmap_item->node, &root_unstable_tree);
1541 + if (rmap_item->append_round == ksm_scan_round) {
1542 + rb_erase(&rmap_item->node,
1543 + &rmap_item->tree_node->sub_root);
1544 + if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) {
1545 + rb_erase(&rmap_item->tree_node->node,
1546 + &root_unstable_tree);
1548 + free_tree_node(rmap_item->tree_node);
1549 + } else
1550 + rmap_item->tree_node->count--;
1552 ksm_pages_unshared--;
1553 - rmap_item->address &= PAGE_MASK;
1556 + rmap_item->address &= PAGE_MASK;
1557 + rmap_item->hash_max = 0;
1559 out:
1560 cond_resched(); /* we're called from many long loops */
1563 -static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
1564 - struct rmap_item **rmap_list)
1565 +/**
1566 + * Need to do two things:
1567 + * 1. check if slot was moved to del list
1568 + * 2. make sure the mmap_sem is manipulated under valid vma.
1570 + * My concern here is that in some cases, this may cause
1571 + * vma_slot_list_lock() waiters to be serialized further by some
1572 + * sem->wait_lock; can this really be expensive?
1575 + * @return
1576 + * 0: if successfully locked mmap_sem
1577 + * -ENOENT: this slot was moved to del list
1578 + * -EBUSY: vma lock failed
1579 + */
1580 +static int try_down_read_slot_mmap_sem(struct vma_slot *slot)
1582 - while (*rmap_list) {
1583 - struct rmap_item *rmap_item = *rmap_list;
1584 - *rmap_list = rmap_item->rmap_list;
1585 - remove_rmap_item_from_tree(rmap_item);
1586 - free_rmap_item(rmap_item);
1587 + struct vm_area_struct *vma;
1588 + struct mm_struct *mm;
1589 + struct rw_semaphore *sem;
1591 + spin_lock(&vma_slot_list_lock);
1593 + /* The slot_list entry is removed and re-inited from the new list when the slot
1594 + * enters ksm_list. If it is now not empty, the slot must have been moved to the del list
1595 + */
1596 + if (!list_empty(&slot->slot_list)) {
1597 + spin_unlock(&vma_slot_list_lock);
1598 + return -ENOENT;
1601 + BUG_ON(slot->pages != vma_pages(slot->vma));
1602 + /* Ok, vma still valid */
1603 + vma = slot->vma;
1604 + mm = vma->vm_mm;
1605 + sem = &mm->mmap_sem;
1606 + if (down_read_trylock(sem)) {
1607 + spin_unlock(&vma_slot_list_lock);
1608 + return 0;
1611 + spin_unlock(&vma_slot_list_lock);
1612 + return -EBUSY;
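A hypothetical caller, sketched here only to show how the three documented return values would typically be handled; it is not code from this patch.

/* Illustrative sketch of a caller of try_down_read_slot_mmap_sem(). */
static void scan_one_slot_sketch(struct vma_slot *slot)
{
	int err = try_down_read_slot_mmap_sem(slot);

	switch (err) {
	case 0:
		/* mmap_sem is now held for read; slot->vma may be walked. */
		/* ... do the per-VMA scanning work here ... */
		up_read(&slot->vma->vm_mm->mmap_sem);
		break;
	case -ENOENT:
		/* Slot was moved to the del list; stop tracking it. */
		break;
	case -EBUSY:
		/* Could not take the lock; retry the slot on a later pass. */
		break;
	}
}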
1616 - * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
1617 - * than check every pte of a given vma, the locking doesn't quite work for
1618 - * that - an rmap_item is assigned to the stable tree after inserting ksm
1619 - * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
1620 - * rmap_items from parent to child at fork time (so as not to waste time
1621 - * if exit comes before the next scan reaches it).
1623 - * Similarly, although we'd like to remove rmap_items (so updating counts
1624 - * and freeing memory) when unmerging an area, it's easier to leave that
1625 - * to the next pass of ksmd - consider, for example, how ksmd might be
1626 - * in cmp_and_merge_page on one of the rmap_items we would be removing.
1627 - */
1628 -static int unmerge_ksm_pages(struct vm_area_struct *vma,
1629 - unsigned long start, unsigned long end)
1630 +static inline unsigned long
1631 +vma_page_address(struct page *page, struct vm_area_struct *vma)
1633 - unsigned long addr;
1634 - int err = 0;
1635 + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1636 + unsigned long address;
1638 - for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
1639 - if (ksm_test_exit(vma->vm_mm))
1640 - break;
1641 - if (signal_pending(current))
1642 - err = -ERESTARTSYS;
1643 - else
1644 - err = break_ksm(vma, addr);
1645 + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
1646 + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
1647 + /* page should be within @vma mapping range */
1648 + return -EFAULT;
1650 - return err;
1651 + return address;
1654 -#ifdef CONFIG_SYSFS
1656 - * Only called through the sysfs control interface:
1657 + * Test if the mm is exiting
1659 -static int unmerge_and_remove_all_rmap_items(void)
1660 +static inline bool ksm_test_exit(struct mm_struct *mm)
1662 + return atomic_read(&mm->mm_users) == 0;
1665 +/* return 0 on success with the item's mmap_sem locked */
1666 +static inline int get_mergeable_page_lock_mmap(struct rmap_item *item)
1668 - struct mm_slot *mm_slot;
1669 struct mm_struct *mm;
1670 struct vm_area_struct *vma;
1671 - int err = 0;
1672 + struct vma_slot *slot = item->slot;
1673 + int err = -EINVAL;
1675 - spin_lock(&ksm_mmlist_lock);
1676 - ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
1677 - struct mm_slot, mm_list);
1678 - spin_unlock(&ksm_mmlist_lock);
1680 - for (mm_slot = ksm_scan.mm_slot;
1681 - mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
1682 - mm = mm_slot->mm;
1683 - down_read(&mm->mmap_sem);
1684 - for (vma = mm->mmap; vma; vma = vma->vm_next) {
1685 - if (ksm_test_exit(mm))
1686 - break;
1687 - if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
1688 - continue;
1689 - err = unmerge_ksm_pages(vma,
1690 - vma->vm_start, vma->vm_end);
1691 - if (err)
1692 - goto error;
1695 - remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
1697 - spin_lock(&ksm_mmlist_lock);
1698 - ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
1699 - struct mm_slot, mm_list);
1700 - if (ksm_test_exit(mm)) {
1701 - hlist_del(&mm_slot->link);
1702 - list_del(&mm_slot->mm_list);
1703 - spin_unlock(&ksm_mmlist_lock);
1705 - free_mm_slot(mm_slot);
1706 - clear_bit(MMF_VM_MERGEABLE, &mm->flags);
1707 - up_read(&mm->mmap_sem);
1708 - mmdrop(mm);
1709 - } else {
1710 - spin_unlock(&ksm_mmlist_lock);
1711 - up_read(&mm->mmap_sem);
1713 + struct page *page;
1715 + BUG_ON(!item->slot);
1716 + /*
1717 + * try_down_read_slot_mmap_sem() returns non-zero if the slot
1718 + * has been removed by ksm_remove_vma().
1719 + */
1720 + if (try_down_read_slot_mmap_sem(slot))
1721 + return -EBUSY;
1723 + mm = slot->vma->vm_mm;
1724 + vma = slot->vma;
1726 + if (ksm_test_exit(mm))
1727 + goto failout_up;
1729 + page = item->page;
1730 + rcu_read_lock();
1731 + if (!get_page_unless_zero(page)) {
1732 + rcu_read_unlock();
1733 + goto failout_up;
1736 - ksm_scan.seqnr = 0;
1737 + /* No need to consider huge pages here. */
1738 + if (item->slot->vma->anon_vma != page_anon_vma(page) ||
1739 + vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) {
1740 + /*
1741 + * TODO:
1742 + * should we release this item because of its stale page
1743 + * mapping?
1744 + */
1745 + put_page(page);
1746 + rcu_read_unlock();
1747 + goto failout_up;
1749 + rcu_read_unlock();
1750 return 0;
1752 -error:
1753 +failout_up:
1754 up_read(&mm->mmap_sem);
1755 - spin_lock(&ksm_mmlist_lock);
1756 - ksm_scan.mm_slot = &ksm_mm_head;
1757 - spin_unlock(&ksm_mmlist_lock);
1758 return err;
1760 -#endif /* CONFIG_SYSFS */
1762 -static u32 calc_checksum(struct page *page)
1764 + * What kind of VMA is considered?
1765 + */
1766 +static inline int vma_can_enter(struct vm_area_struct *vma)
1768 - u32 checksum;
1769 - void *addr = kmap_atomic(page, KM_USER0);
1770 - checksum = jhash2(addr, PAGE_SIZE / 4, 17);
1771 - kunmap_atomic(addr, KM_USER0);
1772 - return checksum;
1773 + return !(vma->vm_flags & (VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1774 + VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
1775 + VM_NONLINEAR | VM_MIXEDMAP | VM_SAO |
1776 + VM_SHARED | VM_MAYSHARE | VM_GROWSUP
1777 + | VM_GROWSDOWN));
1780 -static int memcmp_pages(struct page *page1, struct page *page2)
1782 + * Called whenever a fresh new vma is created. A new vma_slot
1783 + * is created and inserted into a global list. Must be called
1784 + * after the vma is inserted into its mm.
1785 + */
1786 +inline void ksm_vma_add_new(struct vm_area_struct *vma)
1788 - char *addr1, *addr2;
1789 - int ret;
1790 + struct vma_slot *slot;
1792 - addr1 = kmap_atomic(page1, KM_USER0);
1793 - addr2 = kmap_atomic(page2, KM_USER1);
1794 - ret = memcmp(addr1, addr2, PAGE_SIZE);
1795 - kunmap_atomic(addr2, KM_USER1);
1796 - kunmap_atomic(addr1, KM_USER0);
1797 - return ret;
1798 + if (!vma_can_enter(vma)) {
1799 + vma->ksm_vma_slot = NULL;
1800 + return;
1803 + slot = alloc_vma_slot();
1804 + if (!slot) {
1805 + vma->ksm_vma_slot = NULL;
1806 + return;
1809 + vma->ksm_vma_slot = slot;
1810 + slot->vma = vma;
1811 + slot->mm = vma->vm_mm;
1812 + slot->ctime_j = jiffies;
1813 + slot->pages = vma_pages(vma);
1814 + spin_lock(&vma_slot_list_lock);
1815 + list_add_tail(&slot->slot_list, &vma_slot_new);
1816 + spin_unlock(&vma_slot_list_lock);
1819 -static inline int pages_identical(struct page *page1, struct page *page2)
1821 + * Called after vma is unlinked from its mm
1822 + */
1823 +void ksm_remove_vma(struct vm_area_struct *vma)
1825 - return !memcmp_pages(page1, page2);
1826 + struct vma_slot *slot;
1828 + if (!vma->ksm_vma_slot)
1829 + return;
1831 + slot = vma->ksm_vma_slot;
1832 + spin_lock(&vma_slot_list_lock);
1833 + if (list_empty(&slot->slot_list)) {
1834 + /**
1835 + * This slot has been added by ksmd, so move to the del list
1836 + * waiting for ksmd to free it.
1837 + */
1838 + list_add_tail(&slot->slot_list, &vma_slot_del);
1839 + } else {
1840 + /**
1841 + * It's still on new list. It's ok to free slot directly.
1842 + */
1843 + list_del(&slot->slot_list);
1844 + free_vma_slot(slot);
1846 + spin_unlock(&vma_slot_list_lock);
1847 + vma->ksm_vma_slot = NULL;
1850 -static int write_protect_page(struct vm_area_struct *vma, struct page *page,
1851 - pte_t *orig_pte)
1852 +/* 32/3 < they < 32/2 */
1853 +#define shiftl 8
1854 +#define shiftr 12
1856 +#define HASH_FROM_TO(from, to) \
1857 +for (index = from; index < to; index++) { \
1858 + pos = random_nums[index]; \
1859 + hash += key[pos]; \
1860 + hash += (hash << shiftl); \
1861 + hash ^= (hash >> shiftr); \
1865 +#define HASH_FROM_DOWN_TO(from, to) \
1866 +for (index = from - 1; index >= to; index--) { \
1867 + hash ^= (hash >> shiftr); \
1868 + hash ^= (hash >> (shiftr*2)); \
1869 + hash -= (hash << shiftl); \
1870 + hash += (hash << (shiftl*2)); \
1871 + pos = random_nums[index]; \
1872 + hash -= key[pos]; \
1876 + * The main random sample hash function.
1877 + */
1878 +static u32 random_sample_hash(void *addr, u32 hash_strength)
1880 - struct mm_struct *mm = vma->vm_mm;
1881 - unsigned long addr;
1882 - pte_t *ptep;
1883 - spinlock_t *ptl;
1884 - int swapped;
1885 - int err = -EFAULT;
1886 + u32 hash = 0xdeadbeef;
1887 + int index, pos, loop = hash_strength;
1888 + u32 *key = (u32 *)addr;
1890 - addr = page_address_in_vma(page, vma);
1891 - if (addr == -EFAULT)
1892 - goto out;
1893 + if (loop > HASH_STRENGTH_FULL)
1894 + loop = HASH_STRENGTH_FULL;
1896 - BUG_ON(PageTransCompound(page));
1897 - ptep = page_check_address(page, mm, addr, &ptl, 0);
1898 - if (!ptep)
1899 - goto out;
1900 + HASH_FROM_TO(0, loop);
1902 - if (pte_write(*ptep) || pte_dirty(*ptep)) {
1903 - pte_t entry;
1904 + if (hash_strength > HASH_STRENGTH_FULL) {
1905 + loop = hash_strength - HASH_STRENGTH_FULL;
1906 + HASH_FROM_TO(0, loop);
1909 - swapped = PageSwapCache(page);
1910 - flush_cache_page(vma, addr, page_to_pfn(page));
1911 + return hash;
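
For illustration, the random sample hash above mixes only hash_strength randomly chosen 32-bit words of the page, so the per-page cost scales with the strength rather than with the page size. Below is a minimal user-space sketch of the same idea, assuming a 4 KiB page, a Fisher-Yates initialisation of the sample table, and simple clamping of over-full strengths (the patch instead re-mixes words a second time); the sketch_ names are illustrative only, not the patch's identifiers.

#include <stdint.h>
#include <stdlib.h>

#define SKETCH_PAGE_SIZE     4096
#define SKETCH_STRENGTH_FULL (SKETCH_PAGE_SIZE / sizeof(uint32_t))
#define SKETCH_SHIFTL        8
#define SKETCH_SHIFTR        12

static uint32_t sketch_random_nums[SKETCH_STRENGTH_FULL];

static void sketch_init_random_nums(void)
{
        /* identity permutation followed by a Fisher-Yates shuffle */
        for (uint32_t i = 0; i < SKETCH_STRENGTH_FULL; i++)
                sketch_random_nums[i] = i;
        for (uint32_t i = SKETCH_STRENGTH_FULL - 1; i > 0; i--) {
                uint32_t j = (uint32_t)rand() % (i + 1);
                uint32_t tmp = sketch_random_nums[i];

                sketch_random_nums[i] = sketch_random_nums[j];
                sketch_random_nums[j] = tmp;
        }
}

static uint32_t sketch_sample_hash(const void *addr, uint32_t strength)
{
        const uint32_t *key = addr;
        uint32_t hash = 0xdeadbeef;

        if (strength > SKETCH_STRENGTH_FULL)
                strength = SKETCH_STRENGTH_FULL;        /* simplification */

        /* mix only `strength` randomly chosen 32-bit words of the page */
        for (uint32_t i = 0; i < strength; i++) {
                hash += key[sketch_random_nums[i]];
                hash += hash << SKETCH_SHIFTL;
                hash ^= hash >> SKETCH_SHIFTR;
        }
        return hash;
}

A weaker strength makes scanning cheaper but raises the collision risk, which is what the rshash_pos/rshash_neg accounting further down tries to balance.
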
1915 +/**
1916 + * Used when the hash strength is adjusted.
1918 + * @addr The page's virtual address
1919 + * @from The original hash strength
1920 + * @to The hash strength changed to
1921 + * @hash The hash value generated with the "from" hash strength
1923 + * @return the new hash value
1924 + */
1925 +static u32 delta_hash(void *addr, int from, int to, u32 hash)
1927 + u32 *key = (u32 *)addr;
1928 + int index, pos; /* make sure they are int type */
1930 + if (to > from) {
1931 + if (from >= HASH_STRENGTH_FULL) {
1932 + from -= HASH_STRENGTH_FULL;
1933 + to -= HASH_STRENGTH_FULL;
1934 + HASH_FROM_TO(from, to);
1935 + } else if (to <= HASH_STRENGTH_FULL) {
1936 + HASH_FROM_TO(from, to);
1937 + } else {
1938 + HASH_FROM_TO(from, HASH_STRENGTH_FULL);
1939 + HASH_FROM_TO(0, to - HASH_STRENGTH_FULL);
1941 + } else {
1942 + if (from <= HASH_STRENGTH_FULL) {
1943 + HASH_FROM_DOWN_TO(from, to);
1944 + } else if (to >= HASH_STRENGTH_FULL) {
1945 + from -= HASH_STRENGTH_FULL;
1946 + to -= HASH_STRENGTH_FULL;
1947 + HASH_FROM_DOWN_TO(from, to);
1948 + } else {
1949 + HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0);
1950 + HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to);
1954 + return hash;
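
delta_hash() above avoids re-hashing the whole sample when the strength changes: raising the strength only mixes in the newly sampled words, while lowering it un-mixes the dropped words step by step (HASH_FROM_DOWN_TO inverts each add/shift/xor). A minimal sketch of the cheap raising direction is shown below, reusing the helpers from the previous sketch and assuming both strengths stay at or below the full strength; it is not the patch's code.

/* extend a hash computed over `from` sampled words to `to` sampled words
 * (from <= to <= SKETCH_STRENGTH_FULL); only the extra words are mixed in */
static uint32_t sketch_extend_hash(const void *addr, uint32_t from,
                                   uint32_t to, uint32_t hash)
{
        const uint32_t *key = addr;

        for (uint32_t i = from; i < to; i++) {
                hash += key[sketch_random_nums[i]];
                hash += hash << SKETCH_SHIFTL;
                hash ^= hash >> SKETCH_SHIFTR;
        }
        return hash;
}
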
1960 +#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta))
1962 +/**
1964 + * Called when: rshash_pos or rshash_neg is about to overflow or a scan round
1965 + * has finished.
1967 + */
1968 +static inline void encode_benefit(void)
1970 + u64 scanned_delta, pos_delta, neg_delta;
1971 + unsigned long base = benefit.base;
1973 + scanned_delta = (ksm_pages_scanned - ksm_pages_scanned_last) >> base;
1974 + pos_delta = rshash_pos >> base;
1975 + neg_delta = rshash_neg >> base;
1977 + if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) ||
1978 + CAN_OVERFLOW_U64(benefit.neg, neg_delta) ||
1979 + CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) {
1980 + benefit.scanned >>= 1;
1981 + benefit.neg >>= 1;
1982 + benefit.pos >>= 1;
1983 + benefit.base++;
1984 + scanned_delta >>= 1;
1985 + pos_delta >>= 1;
1986 + neg_delta >>= 1;
1989 + benefit.pos += pos_delta;
1990 + benefit.neg += neg_delta;
1991 + benefit.scanned += scanned_delta;
1993 + BUG_ON(!benefit.scanned);
1995 + rshash_pos = rshash_neg = 0;
1997 + /* -1 to make rshash_adjust() work */
1998 + ksm_pages_scanned_last = ksm_pages_scanned - 1;
2001 +static inline void reset_benefit(void)
2003 + benefit.pos = 0;
2004 + benefit.neg = 0;
2005 + benefit.base = 0;
2006 + benefit.scanned = 0;
2009 +static inline void inc_rshash_pos(unsigned long delta)
2011 + if (CAN_OVERFLOW_U64(rshash_pos, delta))
2012 + encode_benefit();
2014 + rshash_pos += delta;
2017 +static inline void inc_rshash_neg(unsigned long delta)
2019 + if (CAN_OVERFLOW_U64(rshash_neg, delta))
2020 + encode_benefit();
2022 + rshash_neg += delta;
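
rshash_pos roughly accounts the hashing work saved by a weakened hash and rshash_neg the work wasted on memcmp and re-hashing; encode_benefit() folds both, together with the pages-scanned delta, into a fixed-point accumulator whose fields share one exponent (base) and are halved whenever any of them would overflow a u64. A self-contained sketch of that rescaling pattern, with illustrative names only:

#include <stdint.h>

struct benefit_acc {
        uint64_t pos, neg, scanned;     /* all three scaled down by 2^base */
        unsigned int base;
};

#define ACC_CAN_OVERFLOW(x, d)  (UINT64_MAX - (x) < (d))

/* fold one round's raw deltas into the accumulator, halving everything
 * (and bumping the shared exponent) when any field would overflow */
static void benefit_add(struct benefit_acc *b,
                        uint64_t pos, uint64_t neg, uint64_t scanned)
{
        pos     >>= b->base;
        neg     >>= b->base;
        scanned >>= b->base;

        if (ACC_CAN_OVERFLOW(b->pos, pos) ||
            ACC_CAN_OVERFLOW(b->neg, neg) ||
            ACC_CAN_OVERFLOW(b->scanned, scanned)) {
                b->pos >>= 1;
                b->neg >>= 1;
                b->scanned >>= 1;
                pos >>= 1;
                neg >>= 1;
                scanned >>= 1;
                b->base++;
        }
        b->pos += pos;
        b->neg += neg;
        b->scanned += scanned;
}
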
2026 +static inline u32 page_hash(struct page *page, unsigned long hash_strength,
2027 + int cost_accounting)
2029 + u32 val;
2030 + unsigned long delta;
2032 + void *addr = kmap_atomic(page, KM_USER0);
2034 + val = random_sample_hash(addr, hash_strength);
2035 + kunmap_atomic(addr, KM_USER0);
2037 + if (cost_accounting) {
2038 + if (HASH_STRENGTH_FULL > hash_strength)
2039 + delta = HASH_STRENGTH_FULL - hash_strength;
2040 + else
2041 + delta = 0;
2043 + inc_rshash_pos(delta);
2046 + return val;
2049 +static int memcmp_pages(struct page *page1, struct page *page2,
2050 + int cost_accounting)
2052 + char *addr1, *addr2;
2053 + int ret;
2055 + addr1 = kmap_atomic(page1, KM_USER0);
2056 + addr2 = kmap_atomic(page2, KM_USER1);
2057 + ret = memcmp(addr1, addr2, PAGE_SIZE);
2058 + kunmap_atomic(addr2, KM_USER1);
2059 + kunmap_atomic(addr1, KM_USER0);
2061 + if (cost_accounting)
2062 + inc_rshash_neg(memcmp_cost);
2064 + return ret;
2067 +static inline int pages_identical(struct page *page1, struct page *page2)
2069 + return !memcmp_pages(page1, page2, 0);
2072 +static inline int is_zero_page(struct page *page)
2074 + char *addr;
2075 + int ret;
2077 + addr = kmap_atomic(page, KM_USER0);
2078 + ret = check_zero_page(addr, PAGE_SIZE);
2079 + kunmap_atomic(addr, KM_USER0);
2081 + return ret;
2084 +static int write_protect_page(struct vm_area_struct *vma, struct page *page,
2085 + pte_t *orig_pte, pte_t *old_pte)
2087 + struct mm_struct *mm = vma->vm_mm;
2088 + unsigned long addr;
2089 + pte_t *ptep;
2090 + spinlock_t *ptl;
2091 + int swapped;
2092 + int err = -EFAULT;
2094 + addr = page_address_in_vma(page, vma);
2095 + if (addr == -EFAULT)
2096 + goto out;
2098 + BUG_ON(PageTransCompound(page));
2099 + ptep = page_check_address(page, mm, addr, &ptl, 0);
2100 + if (!ptep)
2101 + goto out;
2103 + if (old_pte)
2104 + *old_pte = *ptep;
2106 + if (pte_write(*ptep) || pte_dirty(*ptep)) {
2107 + pte_t entry;
2109 + swapped = PageSwapCache(page);
2110 + flush_cache_page(vma, addr, page_to_pfn(page));
2112 * Ok this is tricky, when get_user_pages_fast() run it doesnt
2113 * take any lock, therefore the check that we are going to make
2114 @@ -765,6 +1184,11 @@
2115 return err;
2118 +#define MERGE_ERR_PGERR 1 /* the page is invalid cannot continue */
2119 +#define MERGE_ERR_COLLI 2 /* there is a collision */
2120 +#define MERGE_ERR_CHANGED 3 /* the page has changed since last hash */
2124 * replace_page - replace page in vma by new ksm page
2125 * @vma: vma that holds the pte pointing to page
2126 @@ -772,7 +1196,7 @@
2127 * @kpage: the ksm page we replace page by
2128 * @orig_pte: the original value of the pte
2130 - * Returns 0 on success, -EFAULT on failure.
2131 + * Returns 0 on success, MERGE_ERR_PGERR on failure.
2133 static int replace_page(struct vm_area_struct *vma, struct page *page,
2134 struct page *kpage, pte_t orig_pte)
2135 @@ -784,7 +1208,7 @@
2136 pte_t *ptep;
2137 spinlock_t *ptl;
2138 unsigned long addr;
2139 - int err = -EFAULT;
2140 + int err = MERGE_ERR_PGERR;
2142 addr = page_address_in_vma(page, vma);
2143 if (addr == -EFAULT)
2144 @@ -827,6 +1251,85 @@
2145 return err;
2149 +/**
2150 + * Fully hash a page with HASH_STRENGTH_MAX and return a non-zero hash value.
2151 + * A zero hash value at HASH_STRENGTH_MAX is used to indicate that the
2152 + * hash_max member has not been calculated yet.
2154 + * @page The page that needs to be hashed
2155 + * @hash_old The hash value calculated with the current hash strength
2157 + * @return the new hash value calculated at HASH_STRENGTH_MAX
2158 + */
2159 +static inline u32 page_hash_max(struct page *page, u32 hash_old)
2161 + u32 hash_max = 0;
2162 + void *addr;
2164 + addr = kmap_atomic(page, KM_USER0);
2165 + hash_max = delta_hash(addr, hash_strength,
2166 + HASH_STRENGTH_MAX, hash_old);
2168 + kunmap_atomic(addr, KM_USER0);
2170 + if (!hash_max)
2171 + hash_max = 1;
2173 + inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
2174 + return hash_max;
2178 + * We compare the hash again, to ensure that it is really a hash collision
2179 + * instead of being caused by a page write.
2180 + */
2181 +static inline int check_collision(struct rmap_item *rmap_item,
2182 + u32 hash)
2184 + int err;
2185 + struct page *page = rmap_item->page;
2187 + /* if this rmap_item has already been hash_maxed, then the collision
2188 + * must appear in the second-level rbtree search. In this case we check
2189 + * whether its hash_max value has changed. Otherwise, the collision
2190 + * happened in the first-level rbtree search, so we check against its
2191 + * current hash value.
2192 + */
2193 + if (rmap_item->hash_max) {
2194 + inc_rshash_neg(memcmp_cost);
2195 + inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
2197 + if (rmap_item->hash_max == page_hash_max(page, hash))
2198 + err = MERGE_ERR_COLLI;
2199 + else
2200 + err = MERGE_ERR_CHANGED;
2201 + } else {
2202 + inc_rshash_neg(memcmp_cost + hash_strength);
2204 + if (page_hash(page, hash_strength, 0) == hash)
2205 + err = MERGE_ERR_COLLI;
2206 + else
2207 + err = MERGE_ERR_CHANGED;
2210 + return err;
2213 +static struct page *page_trans_compound_anon(struct page *page)
2215 + if (PageTransCompound(page)) {
2216 + struct page *head = compound_trans_head(page);
2217 + /*
2218 + * head may actually be split and freed from under
2219 + * us, but that's ok here.
2220 + */
2221 + if (PageAnon(head))
2222 + return head;
2224 + return NULL;
2227 static int page_trans_compound_anon_split(struct page *page)
2229 int ret = 0;
2230 @@ -854,30 +1357,36 @@
2231 return ret;
2235 - * try_to_merge_one_page - take two pages and merge them into one
2236 - * @vma: the vma that holds the pte pointing to page
2237 - * @page: the PageAnon page that we want to replace with kpage
2238 - * @kpage: the PageKsm page that we want to map instead of page,
2239 - * or NULL the first time when we want to use page as kpage.
2240 +/**
2241 + * Try to merge a rmap_item.page with a kpage in a stable node. kpage must
2242 + * already be a ksm page.
2244 - * This function returns 0 if the pages were merged, -EFAULT otherwise.
2245 + * @return 0 if the pages were merged, -EFAULT otherwise.
2247 -static int try_to_merge_one_page(struct vm_area_struct *vma,
2248 - struct page *page, struct page *kpage)
2249 +static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
2250 + struct page *kpage, u32 hash)
2252 + struct vm_area_struct *vma = rmap_item->slot->vma;
2253 + struct mm_struct *mm = vma->vm_mm;
2254 pte_t orig_pte = __pte(0);
2255 - int err = -EFAULT;
2256 + int err = MERGE_ERR_PGERR;
2257 + struct page *page;
2259 - if (page == kpage) /* ksm page forked */
2260 - return 0;
2261 + if (ksm_test_exit(mm))
2262 + goto out;
2264 + page = rmap_item->page;
2266 - if (!(vma->vm_flags & VM_MERGEABLE))
2267 + if (page == kpage) { /* ksm page forked */
2268 + err = 0;
2269 goto out;
2272 if (PageTransCompound(page) && page_trans_compound_anon_split(page))
2273 goto out;
2274 BUG_ON(PageTransCompound(page));
2275 - if (!PageAnon(page))
2277 + if (!PageAnon(page) || !PageKsm(kpage))
2278 goto out;
2281 @@ -895,18 +1404,27 @@
2282 * ptes are necessarily already write-protected. But in either
2283 * case, we need to lock and check page_count is not raised.
2285 - if (write_protect_page(vma, page, &orig_pte) == 0) {
2286 + if (write_protect_page(vma, page, &orig_pte, NULL) == 0) {
2287 if (!kpage) {
2288 + long map_sharing = atomic_read(&page->_mapcount);
2290 * While we hold page lock, upgrade page from
2291 * PageAnon+anon_vma to PageKsm+NULL stable_node:
2292 * stable_tree_insert() will update stable_node.
2294 set_page_stable_node(page, NULL);
2295 + if (map_sharing)
2296 + add_zone_page_state(page_zone(page),
2297 + NR_KSM_PAGES_SHARING,
2298 + map_sharing);
2299 mark_page_accessed(page);
2300 err = 0;
2301 - } else if (pages_identical(page, kpage))
2302 - err = replace_page(vma, page, kpage, orig_pte);
2303 + } else {
2304 + if (pages_identical(page, kpage))
2305 + err = replace_page(vma, page, kpage, orig_pte);
2306 + else
2307 + err = check_collision(rmap_item, hash);
2311 if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
2312 @@ -924,378 +1442,2683 @@
2313 return err;
2317 - * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
2318 - * but no new kernel page is allocated: kpage must already be a ksm page.
2321 +/**
2322 + * If two pages fail to merge in try_to_merge_two_pages, then we have a chance
2323 + * to restore a page mapping that has been changed in try_to_merge_two_pages.
2325 - * This function returns 0 if the pages were merged, -EFAULT otherwise.
2326 + * @return 0 on success.
2328 -static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
2329 - struct page *page, struct page *kpage)
2330 +static int restore_ksm_page_pte(struct vm_area_struct *vma, unsigned long addr,
2331 + pte_t orig_pte, pte_t wprt_pte)
2333 - struct mm_struct *mm = rmap_item->mm;
2334 - struct vm_area_struct *vma;
2335 + struct mm_struct *mm = vma->vm_mm;
2336 + pgd_t *pgd;
2337 + pud_t *pud;
2338 + pmd_t *pmd;
2339 + pte_t *ptep;
2340 + spinlock_t *ptl;
2342 int err = -EFAULT;
2344 - down_read(&mm->mmap_sem);
2345 - if (ksm_test_exit(mm))
2346 - goto out;
2347 - vma = find_vma(mm, rmap_item->address);
2348 - if (!vma || vma->vm_start > rmap_item->address)
2349 + pgd = pgd_offset(mm, addr);
2350 + if (!pgd_present(*pgd))
2351 + goto out;
2353 + pud = pud_offset(pgd, addr);
2354 + if (!pud_present(*pud))
2355 + goto out;
2357 + pmd = pmd_offset(pud, addr);
2358 + if (!pmd_present(*pmd))
2359 + goto out;
2361 + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
2362 + if (!pte_same(*ptep, wprt_pte)) {
2363 + /* already copied, let it be */
2364 + pte_unmap_unlock(ptep, ptl);
2365 + goto out;
2368 + /*
2369 + * Good boy, still here. While we still hold the ksm page, it cannot
2370 + * return to the free page pool, so there is no way a pte was changed
2371 + * to another page and then back to this page. And remember that a ksm
2372 + * page is not reused in do_wp_page(). So it's safe to restore the
2373 + * original pte.
2374 + */
2375 + flush_cache_page(vma, addr, pte_pfn(*ptep));
2376 + ptep_clear_flush(vma, addr, ptep);
2377 + set_pte_at_notify(mm, addr, ptep, orig_pte);
2379 + pte_unmap_unlock(ptep, ptl);
2380 + err = 0;
2381 +out:
2382 + return err;
2385 +/**
2386 + * try_to_merge_two_pages() - take two identical pages and prepare
2387 + * them to be merged into one page (rmap_item->page)
2389 + * @return 0 if we successfully merged two identical pages into
2390 + * one ksm page. MERGE_ERR_COLLI if it was only a hash collision
2391 + * during the rbtree search. MERGE_ERR_CHANGED if rmap_item has been
2392 + * changed since it was hashed. MERGE_ERR_PGERR otherwise.
2394 + */
2395 +static int try_to_merge_two_pages(struct rmap_item *rmap_item,
2396 + struct rmap_item *tree_rmap_item,
2397 + u32 hash)
2399 + pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0);
2400 + pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0);
2401 + struct vm_area_struct *vma1 = rmap_item->slot->vma;
2402 + struct vm_area_struct *vma2 = tree_rmap_item->slot->vma;
2403 + struct page *page = rmap_item->page;
2404 + struct page *tree_page = tree_rmap_item->page;
2405 + int err = MERGE_ERR_PGERR;
2407 + long map_sharing;
2408 + struct address_space *saved_mapping;
2411 + if (rmap_item->page == tree_rmap_item->page)
2412 + goto out;
2414 + if (PageTransCompound(page) && page_trans_compound_anon_split(page))
2415 + goto out;
2416 + BUG_ON(PageTransCompound(page));
2418 + if (PageTransCompound(tree_page) && page_trans_compound_anon_split(tree_page))
2419 + goto out;
2420 + BUG_ON(PageTransCompound(tree_page));
2422 + if (!PageAnon(page) || !PageAnon(tree_page))
2423 + goto out;
2425 + if (!trylock_page(page))
2426 + goto out;
2429 + if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) {
2430 + unlock_page(page);
2431 + goto out;
2434 + /*
2435 + * While we hold page lock, upgrade page from
2436 + * PageAnon+anon_vma to PageKsm+NULL stable_node:
2437 + * stable_tree_insert() will update stable_node.
2438 + */
2439 + saved_mapping = page->mapping;
2440 + map_sharing = atomic_read(&page->_mapcount);
2441 + set_page_stable_node(page, NULL);
2442 + if (map_sharing)
2443 + add_zone_page_state(page_zone(page),
2444 + NR_KSM_PAGES_SHARING,
2445 + map_sharing);
2446 + mark_page_accessed(page);
2447 + unlock_page(page);
2449 + if (!trylock_page(tree_page))
2450 + goto restore_out;
2452 + if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) {
2453 + unlock_page(tree_page);
2454 + goto restore_out;
2457 + if (pages_identical(page, tree_page)) {
2458 + err = replace_page(vma2, tree_page, page, wprt_pte2);
2459 + if (err)
2460 + goto restore_out;
2462 + if ((vma2->vm_flags & VM_LOCKED)) {
2463 + munlock_vma_page(tree_page);
2464 + if (!PageMlocked(page)) {
2465 + unlock_page(tree_page);
2466 + lock_page(page);
2467 + mlock_vma_page(page);
2468 + tree_page = page; /* for final unlock */
2472 + unlock_page(tree_page);
2474 + goto out; /* success */
2476 + } else {
2477 + if (page_hash(page, hash_strength, 0) ==
2478 + page_hash(tree_page, hash_strength, 0)) {
2479 + inc_rshash_neg(memcmp_cost + hash_strength * 2);
2480 + err = MERGE_ERR_COLLI;
2481 + } else
2482 + err = MERGE_ERR_CHANGED;
2484 + unlock_page(tree_page);
2487 +restore_out:
2488 + lock_page(page);
2489 + if (!restore_ksm_page_pte(vma1, get_rmap_addr(rmap_item),
2490 + orig_pte1, wprt_pte1))
2491 + page->mapping = saved_mapping;
2493 + unlock_page(page);
2494 +out:
2495 + return err;
2498 +static inline int hash_cmp(u32 new_val, u32 node_val)
2500 + if (new_val > node_val)
2501 + return 1;
2502 + else if (new_val < node_val)
2503 + return -1;
2504 + else
2505 + return 0;
2508 +static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash)
2510 + u32 hash_max = item->hash_max;
2512 + if (!hash_max) {
2513 + hash_max = page_hash_max(item->page, hash);
2515 + item->hash_max = hash_max;
2518 + return hash_max;
2523 +/**
2524 + * stable_tree_search() - search the stable tree for a page
2526 + * @item: the rmap_item we are comparing with
2527 + * @hash: the hash value of this item->page already calculated
2529 + * @return the page we have found, NULL otherwise. The page returned has
2530 + * been gotten.
2531 + */
2532 +static struct page *stable_tree_search(struct rmap_item *item, u32 hash)
2534 + struct rb_node *node = root_stable_treep->rb_node;
2535 + struct tree_node *tree_node;
2536 + unsigned long hash_max;
2537 + struct page *page = item->page;
2538 + struct stable_node *stable_node;
2540 + stable_node = page_stable_node(page);
2541 + if (stable_node) {
2542 + /* ksm page forked, that is
2543 + * if (PageKsm(page) && !in_stable_tree(rmap_item))
2544 + * it's actually gotten once outside.
2545 + */
2546 + get_page(page);
2547 + return page;
2550 + while (node) {
2551 + int cmp;
2553 + tree_node = rb_entry(node, struct tree_node, node);
2555 + cmp = hash_cmp(hash, tree_node->hash);
2557 + if (cmp < 0)
2558 + node = node->rb_left;
2559 + else if (cmp > 0)
2560 + node = node->rb_right;
2561 + else
2562 + break;
2565 + if (!node)
2566 + return NULL;
2568 + if (tree_node->count == 1) {
2569 + stable_node = rb_entry(tree_node->sub_root.rb_node,
2570 + struct stable_node, node);
2571 + BUG_ON(!stable_node);
2573 + goto get_page_out;
2576 + /*
2577 + * ok, we have to search the second
2578 + * level subtree, hash the page to a
2579 + * full strength.
2580 + */
2581 + node = tree_node->sub_root.rb_node;
2582 + BUG_ON(!node);
2583 + hash_max = rmap_item_hash_max(item, hash);
2585 + while (node) {
2586 + int cmp;
2588 + stable_node = rb_entry(node, struct stable_node, node);
2590 + cmp = hash_cmp(hash_max, stable_node->hash_max);
2592 + if (cmp < 0)
2593 + node = node->rb_left;
2594 + else if (cmp > 0)
2595 + node = node->rb_right;
2596 + else
2597 + goto get_page_out;
2600 + return NULL;
2602 +get_page_out:
2603 + page = get_ksm_page(stable_node, 1, 1);
2604 + return page;
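
Both the stable and unstable trees in this patch are two-level structures: the first-level rbtree is keyed by the cheap partial-strength hash, and only when several pages tie on that key is the expensive full-strength hash (hash_max) computed and used to order a per-node sub-tree. The sketch below shows that comparison logic on a plain binary search tree; compute_hash_max() here is only a stand-in that derives a value from the partial hash, whereas the patch re-hashes the page at HASH_STRENGTH_MAX, and the names are illustrative, not the patch's.

#include <stdint.h>
#include <stddef.h>

struct two_level_key {
        uint32_t hash;          /* partial-strength hash, always available  */
        uint32_t hash_max;      /* full-strength hash, 0 = not computed yet */
};

struct tl_node {
        struct two_level_key key;
        struct tl_node *left, *right;
};

/* stand-in for page_hash_max(); the real code re-hashes the page itself */
static uint32_t compute_hash_max(struct two_level_key *k)
{
        if (!k->hash_max) {
                k->hash_max = k->hash * 2654435761u + 1;
                if (!k->hash_max)
                        k->hash_max = 1;        /* keep 0 as "not computed" */
        }
        return k->hash_max;
}

static int two_level_cmp(struct two_level_key *a, struct two_level_key *b)
{
        if (a->hash != b->hash)
                return a->hash < b->hash ? -1 : 1;
        /* first-level tie: fall back to the full-strength hash */
        if (compute_hash_max(a) != compute_hash_max(b))
                return compute_hash_max(a) < compute_hash_max(b) ? -1 : 1;
        return 0;
}

static struct tl_node *two_level_search(struct tl_node *root,
                                        struct two_level_key *k)
{
        while (root) {
                int cmp = two_level_cmp(k, &root->key);

                if (!cmp)
                        return root;
                root = cmp < 0 ? root->left : root->right;
        }
        return NULL;
}
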
2608 +/**
2609 + * try_merge_with_stable() - when two rmap_items need to be inserted
2610 + * into the stable tree and the page was found to be identical to a stable
2611 + * ksm page, this is the last chance we have to merge them into one.
2613 + * @item1: the rmap_item holding the page which we wanted to insert
2614 + * into the stable tree.
2615 + * @item2: the other rmap_item we found during the unstable tree search
2616 + * @kpage: the page currently mapped by the two rmap_items
2617 + * @tree_page: the page we found identical in stable tree node
2618 + * @success1: return if item1 is successfully merged
2619 + * @success2: return if item2 is successfully merged
2620 + */
2621 +static void try_merge_with_stable(struct rmap_item *item1,
2622 + struct rmap_item *item2,
2623 + struct page **kpage,
2624 + struct page *tree_page,
2625 + int *success1, int *success2)
2627 + spinlock_t *ptl1, *ptl2;
2628 + pte_t *ptep1, *ptep2;
2629 + unsigned long addr1, addr2;
2630 + struct vm_area_struct *vma1 = item1->slot->vma;
2631 + struct vm_area_struct *vma2 = item2->slot->vma;
2633 + *success1 = 0;
2634 + *success2 = 0;
2636 + if (unlikely(*kpage == tree_page)) {
2637 + /* I don't think this can really happen */
2638 + goto success_both;
2641 + if (!PageAnon(*kpage) || !PageKsm(*kpage))
2642 + goto failed;
2644 + if (!trylock_page(tree_page))
2645 + goto failed;
2647 + /* If the oldpage is still ksm and still pointed
2648 + * to in the right place, and still write protected,
2649 + * we are confident it has not changed, no need to
2650 + * memcmp anymore.
2651 + * Beware: we cannot take nested pte locks,
2652 + * deadlock risk.
2653 + */
2654 + addr1 = get_rmap_addr(item1);
2656 + ptep1 = page_check_address(*kpage, vma1->vm_mm, addr1, &ptl1, 0);
2657 + if (!ptep1)
2658 + goto failed;
2660 + if (pte_write(*ptep1)) {
2661 + /* has changed, abort! */
2662 + pte_unmap_unlock(ptep1, ptl1);
2663 + goto failed;
2666 + get_page(tree_page);
2667 + page_add_anon_rmap(tree_page, vma1, addr1);
2669 + flush_cache_page(vma1, addr1, pte_pfn(*ptep1));
2670 + ptep_clear_flush(vma1, addr1, ptep1);
2671 + set_pte_at_notify(vma1->vm_mm, addr1, ptep1,
2672 + mk_pte(tree_page, vma1->vm_page_prot));
2674 + page_remove_rmap(*kpage);
2675 + put_page(*kpage);
2677 + pte_unmap_unlock(ptep1, ptl1);
2680 + /* ok, then vma2, remind that pte1 already set */
2681 + addr2 = get_rmap_addr(item2);
2683 + ptep2 = page_check_address(*kpage, vma2->vm_mm, addr2, &ptl2, 0);
2684 + if (!ptep2)
2685 + goto success1;
2687 + if (pte_write(*ptep2)) {
2688 + /* has changed, abort! */
2689 + pte_unmap_unlock(ptep2, ptl2);
2690 + goto success1;
2693 + get_page(tree_page);
2694 + page_add_anon_rmap(tree_page, vma2, addr2);
2696 + flush_cache_page(vma2, addr2, pte_pfn(*ptep2));
2697 + ptep_clear_flush(vma2, addr2, ptep2);
2698 + set_pte_at_notify(vma2->vm_mm, addr2, ptep2,
2699 + mk_pte(tree_page, vma2->vm_page_prot));
2701 + page_remove_rmap(*kpage);
2702 + put_page(*kpage);
2704 + pte_unmap_unlock(ptep2, ptl2);
2707 +success_both:
2708 + *success2 = 1;
2709 +success1:
2710 + *success1 = 1;
2713 + if ((*success1 && vma1->vm_flags & VM_LOCKED) ||
2714 + (*success2 && vma2->vm_flags & VM_LOCKED)) {
2715 + munlock_vma_page(*kpage);
2716 + if (!PageMlocked(tree_page))
2717 + mlock_vma_page(tree_page);
2720 + /*
2721 + * We do not need oldpage any more in the caller, so we can drop the lock
2722 + * now.
2723 + */
2724 + unlock_page(*kpage);
2725 + *kpage = tree_page; /* Get unlocked outside. */
2726 +failed:
2727 + return;
2730 +static inline void stable_node_hash_max(struct stable_node *node,
2731 + struct page *page, u32 hash)
2733 + u32 hash_max = node->hash_max;
2735 + if (!hash_max) {
2736 + hash_max = page_hash_max(page, hash);
2737 + node->hash_max = hash_max;
2741 +static inline
2742 +struct stable_node *new_stable_node(struct tree_node *tree_node,
2743 + struct page *kpage, u32 hash_max)
2745 + struct stable_node *new_stable_node;
2747 + new_stable_node = alloc_stable_node();
2748 + if (!new_stable_node)
2749 + return NULL;
2751 + new_stable_node->kpfn = page_to_pfn(kpage);
2752 + new_stable_node->hash_max = hash_max;
2753 + new_stable_node->tree_node = tree_node;
2754 + set_page_stable_node(kpage, new_stable_node);
2756 + return new_stable_node;
2759 +static inline
2760 +struct stable_node *first_level_insert(struct tree_node *tree_node,
2761 + struct rmap_item *rmap_item,
2762 + struct rmap_item *tree_rmap_item,
2763 + struct page **kpage, u32 hash,
2764 + int *success1, int *success2)
2766 + int cmp;
2767 + struct page *tree_page;
2768 + u32 hash_max = 0;
2769 + struct stable_node *stable_node, *new_snode;
2770 + struct rb_node *parent = NULL, **new;
2772 + /* this tree node contains no sub-tree yet */
2773 + stable_node = rb_entry(tree_node->sub_root.rb_node,
2774 + struct stable_node, node);
2776 + tree_page = get_ksm_page(stable_node, 1, 0);
2777 + if (tree_page) {
2778 + cmp = memcmp_pages(*kpage, tree_page, 1);
2779 + if (!cmp) {
2780 + try_merge_with_stable(rmap_item, tree_rmap_item, kpage,
2781 + tree_page, success1, success2);
2782 + put_page(tree_page);
2783 + if (!*success1 && !*success2)
2784 + goto failed;
2786 + return stable_node;
2788 + } else {
2789 + /*
2790 + * collision at the first level, try to create a subtree.
2791 + * A new node needs to be created.
2792 + */
2793 + put_page(tree_page);
2795 + stable_node_hash_max(stable_node, tree_page,
2796 + tree_node->hash);
2797 + hash_max = rmap_item_hash_max(rmap_item, hash);
2798 + cmp = hash_cmp(hash_max, stable_node->hash_max);
2800 + parent = &stable_node->node;
2801 + if (cmp < 0) {
2802 + new = &parent->rb_left;
2803 + } else if (cmp > 0) {
2804 + new = &parent->rb_right;
2805 + } else {
2806 + goto failed;
2810 + } else {
2811 + /* the only stable_node was deleted, so we reuse its tree_node.
2812 + */
2813 + parent = NULL;
2814 + new = &tree_node->sub_root.rb_node;
2817 + new_snode = new_stable_node(tree_node, *kpage, hash_max);
2818 + if (!new_snode)
2819 + goto failed;
2821 + rb_link_node(&new_snode->node, parent, new);
2822 + rb_insert_color(&new_snode->node, &tree_node->sub_root);
2823 + tree_node->count++;
2824 + *success1 = *success2 = 1;
2826 + return new_snode;
2828 +failed:
2829 + return NULL;
2832 +static inline
2833 +struct stable_node *stable_subtree_insert(struct tree_node *tree_node,
2834 + struct rmap_item *rmap_item,
2835 + struct rmap_item *tree_rmap_item,
2836 + struct page **kpage, u32 hash,
2837 + int *success1, int *success2)
2839 + struct page *tree_page;
2840 + u32 hash_max;
2841 + struct stable_node *stable_node, *new_snode;
2842 + struct rb_node *parent, **new;
2844 +research:
2845 + parent = NULL;
2846 + new = &tree_node->sub_root.rb_node;
2847 + BUG_ON(!*new);
2848 + hash_max = rmap_item_hash_max(rmap_item, hash);
2849 + while (*new) {
2850 + int cmp;
2852 + stable_node = rb_entry(*new, struct stable_node, node);
2854 + cmp = hash_cmp(hash_max, stable_node->hash_max);
2856 + if (cmp < 0) {
2857 + parent = *new;
2858 + new = &parent->rb_left;
2859 + } else if (cmp > 0) {
2860 + parent = *new;
2861 + new = &parent->rb_right;
2862 + } else {
2863 + tree_page = get_ksm_page(stable_node, 1, 0);
2864 + if (tree_page) {
2865 + cmp = memcmp_pages(*kpage, tree_page, 1);
2866 + if (!cmp) {
2867 + try_merge_with_stable(rmap_item,
2868 + tree_rmap_item, kpage,
2869 + tree_page, success1, success2);
2871 + put_page(tree_page);
2872 + if (!*success1 && !*success2)
2873 + goto failed;
2874 + /*
2875 + * successfully merged with a stable
2876 + * node
2877 + */
2878 + return stable_node;
2879 + } else {
2880 + put_page(tree_page);
2881 + goto failed;
2883 + } else {
2884 + /*
2885 + * the stable node may have been deleted,
2886 + * and the subtree may have been
2887 + * restructured; we cannot
2888 + * continue, so re-search it.
2889 + */
2890 + if (tree_node->count) {
2891 + goto research;
2892 + } else {
2893 + /* reuse the tree node */
2894 + parent = NULL;
2895 + new = &tree_node->sub_root.rb_node;
2901 + new_snode = new_stable_node(tree_node, *kpage, hash_max);
2902 + if (!new_snode)
2903 + goto failed;
2905 + rb_link_node(&new_snode->node, parent, new);
2906 + rb_insert_color(&new_snode->node, &tree_node->sub_root);
2907 + tree_node->count++;
2908 + *success1 = *success2 = 1;
2910 + return new_snode;
2912 +failed:
2913 + return NULL;
2917 +/**
2918 + * stable_tree_insert() - try to insert a page merged in the unstable tree
2919 + * into the stable tree
2921 + * @kpage: the page that needs to be inserted
2922 + * @hash: the current hash of this page
2923 + * @rmap_item: the rmap_item being scanned
2924 + * @tree_rmap_item: the rmap_item found on unstable tree
2925 + * @success1: return if rmap_item is merged
2926 + * @success2: return if tree_rmap_item is merged
2928 + * @return the stable_node on stable tree if at least one
2929 + * rmap_item is inserted into stable tree, NULL
2930 + * otherwise.
2931 + */
2932 +static struct stable_node *
2933 +stable_tree_insert(struct page **kpage, u32 hash,
2934 + struct rmap_item *rmap_item,
2935 + struct rmap_item *tree_rmap_item,
2936 + int *success1, int *success2)
2938 + struct rb_node **new = &root_stable_treep->rb_node;
2939 + struct rb_node *parent = NULL;
2940 + struct stable_node *stable_node;
2941 + struct tree_node *tree_node;
2942 + u32 hash_max = 0;
2944 + *success1 = *success2 = 0;
2946 + while (*new) {
2947 + int cmp;
2949 + tree_node = rb_entry(*new, struct tree_node, node);
2951 + cmp = hash_cmp(hash, tree_node->hash);
2953 + if (cmp < 0) {
2954 + parent = *new;
2955 + new = &parent->rb_left;
2956 + } else if (cmp > 0) {
2957 + parent = *new;
2958 + new = &parent->rb_right;
2959 + } else
2960 + break;
2963 + if (*new) {
2964 + if (tree_node->count == 1) {
2965 + stable_node = first_level_insert(tree_node, rmap_item,
2966 + tree_rmap_item, kpage,
2967 + hash, success1, success2);
2968 + } else {
2969 + stable_node = stable_subtree_insert(tree_node,
2970 + rmap_item, tree_rmap_item, kpage,
2971 + hash, success1, success2);
2973 + } else {
2975 + /* no tree node found */
2976 + tree_node = alloc_tree_node(stable_tree_node_listp);
2977 + if (!tree_node) {
2978 + stable_node = NULL;
2979 + goto out;
2982 + stable_node = new_stable_node(tree_node, *kpage, hash_max);
2983 + if (!stable_node) {
2984 + free_tree_node(tree_node);
2985 + goto out;
2988 + tree_node->hash = hash;
2989 + rb_link_node(&tree_node->node, parent, new);
2990 + rb_insert_color(&tree_node->node, root_stable_treep);
2991 + parent = NULL;
2992 + new = &tree_node->sub_root.rb_node;
2994 + rb_link_node(&stable_node->node, parent, new);
2995 + rb_insert_color(&stable_node->node, &tree_node->sub_root);
2996 + tree_node->count++;
2997 + *success1 = *success2 = 1;
3000 +out:
3001 + return stable_node;
3005 +/**
3006 + * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem
3008 + * @return 0 on success, -EBUSY if unable to lock the mmap_sem,
3009 + * -EINVAL if the page mapping has been changed.
3010 + */
3011 +static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item)
3013 + int err;
3015 + err = get_mergeable_page_lock_mmap(tree_rmap_item);
3017 + if (err == -EINVAL) {
3018 + /* its page map has been changed, remove it */
3019 + remove_rmap_item_from_tree(tree_rmap_item);
3022 + /* The page is gotten and mmap_sem is locked now. */
3023 + return err;
3027 +/**
3028 + * unstable_tree_search_insert() - search the unstable tree for an rmap_item
3029 + * with the same hash value. Get its page and trylock the mmap_sem.
3030 + */
3031 +static inline
3032 +struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
3033 + u32 hash)
3036 + struct rb_node **new = &root_unstable_tree.rb_node;
3037 + struct rb_node *parent = NULL;
3038 + struct tree_node *tree_node;
3039 + u32 hash_max;
3040 + struct rmap_item *tree_rmap_item;
3042 + while (*new) {
3043 + int cmp;
3045 + tree_node = rb_entry(*new, struct tree_node, node);
3047 + cmp = hash_cmp(hash, tree_node->hash);
3049 + if (cmp < 0) {
3050 + parent = *new;
3051 + new = &parent->rb_left;
3052 + } else if (cmp > 0) {
3053 + parent = *new;
3054 + new = &parent->rb_right;
3055 + } else
3056 + break;
3059 + if (*new) {
3060 + /* got the tree_node */
3061 + if (tree_node->count == 1) {
3062 + tree_rmap_item = rb_entry(tree_node->sub_root.rb_node,
3063 + struct rmap_item, node);
3064 + BUG_ON(!tree_rmap_item);
3066 + goto get_page_out;
3069 + /* well, search the collision subtree */
3070 + new = &tree_node->sub_root.rb_node;
3071 + BUG_ON(!*new);
3072 + hash_max = rmap_item_hash_max(rmap_item, hash);
3074 + while (*new) {
3075 + int cmp;
3077 + tree_rmap_item = rb_entry(*new, struct rmap_item,
3078 + node);
3080 + cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
3081 + parent = *new;
3082 + if (cmp < 0)
3083 + new = &parent->rb_left;
3084 + else if (cmp > 0)
3085 + new = &parent->rb_right;
3086 + else
3087 + goto get_page_out;
3089 + } else {
3090 + /* alloc a new tree_node */
3091 + tree_node = alloc_tree_node(&unstable_tree_node_list);
3092 + if (!tree_node)
3093 + return NULL;
3095 + tree_node->hash = hash;
3096 + rb_link_node(&tree_node->node, parent, new);
3097 + rb_insert_color(&tree_node->node, &root_unstable_tree);
3098 + parent = NULL;
3099 + new = &tree_node->sub_root.rb_node;
3102 + /* not found even in the sub-tree */
3103 + rmap_item->tree_node = tree_node;
3104 + rmap_item->address |= UNSTABLE_FLAG;
3105 + rmap_item->append_round = ksm_scan_round;
3106 + rb_link_node(&rmap_item->node, parent, new);
3107 + rb_insert_color(&rmap_item->node, &tree_node->sub_root);
3109 + ksm_pages_unshared++;
3110 + return NULL;
3112 +get_page_out:
3113 + if (tree_rmap_item->page == rmap_item->page)
3114 + return NULL;
3116 + if (get_tree_rmap_item_page(tree_rmap_item))
3117 + return NULL;
3119 + return tree_rmap_item;
3122 +static void enter_vma_tree(struct vma_slot *slot)
3124 + unsigned long i;
3125 + int ret;
3127 + i = ksm_vma_tree_index_end;
3129 + ret = radix_tree_insert(&ksm_vma_tree, i, slot);
3130 + BUG_ON(ret);
3132 + slot->ksm_index = i;
3133 + ksm_vma_tree_num++;
3134 + ksm_vma_tree_index_end++;
3137 +static inline void get_sub_dup_vma(struct vma_slot **slot,
3138 + struct vma_slot **sub_slot)
3140 + struct vma_slot *tmp;
3142 + if ((*slot)->ksm_index > (*sub_slot)->ksm_index) {
3143 + tmp = *slot;
3144 + *slot = *sub_slot;
3145 + *sub_slot = tmp;
3150 + * Inc or dec the dup page count stored in a slot, and return the count
3151 + * after the operation.
3152 + */
3153 +static inline unsigned long dup_pages_mod(void **slot, int inc)
3155 + unsigned long item, ret;
3157 + item = (unsigned long)(*slot) >> INDIRECT_OFFSET;
3158 + if (inc) {
3159 + item++;
3160 + BUG_ON(!item);
3161 + } else {
3162 + BUG_ON(!item);
3163 + item--;
3165 + ret = item;
3166 + item <<= INDIRECT_OFFSET;
3167 + *slot = (void *)item;
3169 + return ret;
3172 +static void inc_dup_vma(struct vma_slot *slot, struct vma_slot *sub_slot)
3174 + void **dup_slot;
3175 + unsigned long dup_pages;
3176 + int ret;
3178 + if (slot->ksm_index == -1)
3179 + enter_vma_tree(slot);
3181 + if (sub_slot->ksm_index == -1)
3182 + enter_vma_tree(sub_slot);
3184 + get_sub_dup_vma(&slot, &sub_slot);
3186 + dup_slot = radix_tree_lookup_slot(&slot->dup_tree, sub_slot->ksm_index);
3187 + if (dup_slot)
3188 + goto found;
3190 + /*
3191 + * In order to store dup_pages in radix tree, we must make
3192 + * radix_tree_is_indirect_ptr() happy.
3193 + */
3194 + dup_pages = 1 << INDIRECT_OFFSET;
3196 + /* no such entry yet, insert one */
3197 + ret = radix_tree_insert(&slot->dup_tree, sub_slot->ksm_index,
3198 + (void *)dup_pages);
3199 + BUG_ON(ret);
3201 + return;
3203 +found:
3204 + dup_pages_mod(dup_slot, 1);
3207 +static void dec_dup_vma(struct vma_slot *slot, struct vma_slot *sub_slot)
3209 + void **dup_slot;
3210 + unsigned long dup_pages;
3212 + BUG_ON(slot->ksm_index == -1 || sub_slot->ksm_index == -1);
3214 + get_sub_dup_vma(&slot, &sub_slot);
3216 + dup_slot = radix_tree_lookup_slot(&slot->dup_tree, sub_slot->ksm_index);
3217 + BUG_ON(!dup_slot);
3219 + dup_pages = dup_pages_mod(dup_slot, 0);
3221 + /* dup_pages == 0, we need to kick it out */
3222 + if (!dup_pages)
3223 + radix_tree_delete(&slot->dup_tree, sub_slot->ksm_index);
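
inc_dup_vma()/dec_dup_vma() store a small per-VMA-pair duplicate count directly in a radix-tree slot instead of allocating a separate object, shifting the count left by INDIRECT_OFFSET so the stored value cannot be mistaken for an indirect pointer. The sketch below shows only the packing arithmetic; the offset value of 1 is an assumption for illustration (the patch defines its own constant elsewhere), no radix tree is involved, and the sketch_ names are not the patch's.

#define SKETCH_INDIRECT_OFFSET 1        /* assumed tag width, see note above */

static inline void *sketch_count_to_slot(unsigned long count)
{
        return (void *)(count << SKETCH_INDIRECT_OFFSET);
}

static inline unsigned long sketch_slot_to_count(void *slot)
{
        return (unsigned long)slot >> SKETCH_INDIRECT_OFFSET;
}

/* increment or decrement the packed counter and return the new count */
static unsigned long sketch_dup_pages_mod(void **slot, int inc)
{
        unsigned long count = sketch_slot_to_count(*slot);

        count = inc ? count + 1 : count - 1;
        *slot = sketch_count_to_slot(count);
        return count;
}

When the count drops to zero, the caller removes the slot from the tree, as dec_dup_vma() does above.
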
3226 +static void hold_anon_vma(struct rmap_item *rmap_item,
3227 + struct anon_vma *anon_vma)
3229 + rmap_item->anon_vma = anon_vma;
3230 + get_anon_vma(anon_vma);
3234 +/**
3235 + * stable_tree_append() - append a rmap_item to a stable node. Deduplication
3236 + * ratio statistics are updated in this function.
3238 + */
3239 +static void stable_tree_append(struct rmap_item *rmap_item,
3240 + struct stable_node *stable_node)
3242 + struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_iter = NULL;
3243 + struct hlist_node *hlist, *cont_p = NULL;
3244 + unsigned long key = (unsigned long)rmap_item->slot;
3246 + BUG_ON(!stable_node);
3247 + rmap_item->address |= STABLE_FLAG;
3248 + rmap_item->append_round = ksm_scan_round;
3250 + if (hlist_empty(&stable_node->hlist)) {
3251 + ksm_pages_shared++;
3252 + goto node_vma_new;
3253 + } else {
3254 + ksm_pages_sharing++;
3257 + hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
3258 + if (node_vma->last_update == ksm_scan_round)
3259 + inc_dup_vma(rmap_item->slot, node_vma->slot);
3261 + if (node_vma->key >= key)
3262 + break;
3265 + cont_p = hlist;
3267 + if (node_vma && node_vma->key == key) {
3268 + if (node_vma->last_update == ksm_scan_round) {
3269 + /**
3270 + * we consider this page an inner duplicate, cancel
3271 + * the other updates
3272 + */
3273 + hlist_for_each_entry(node_vma_iter, hlist,
3274 + &stable_node->hlist, hlist) {
3275 + if (node_vma_iter->key == key)
3276 + break;
3278 + /* only need to increase the same vma */
3279 + if (node_vma_iter->last_update ==
3280 + ksm_scan_round) {
3281 + dec_dup_vma(rmap_item->slot,
3282 + node_vma_iter->slot);
3285 + } else {
3286 + /**
3287 + * Although it's the same vma, it contains no duplicate for this
3288 + * round. Continue scanning the other vmas.
3289 + */
3290 + hlist_for_each_entry_continue(node_vma_iter,
3291 + hlist, hlist) {
3292 + if (node_vma_iter->last_update ==
3293 + ksm_scan_round) {
3294 + inc_dup_vma(rmap_item->slot,
3295 + node_vma_iter->slot);
3301 + goto node_vma_ok;
3304 +node_vma_new:
3305 + /* no same vma already in node, alloc a new node_vma */
3306 + new_node_vma = alloc_node_vma();
3307 + BUG_ON(!new_node_vma);
3308 + new_node_vma->head = stable_node;
3309 + new_node_vma->slot = rmap_item->slot;
3311 + if (!node_vma) {
3312 + hlist_add_head(&new_node_vma->hlist, &stable_node->hlist);
3313 + } else if (node_vma->key != key) {
3314 + if (node_vma->key < key)
3315 + hlist_add_after(&node_vma->hlist, &new_node_vma->hlist);
3316 + else {
3317 + hlist_for_each_entry_continue(node_vma_iter, cont_p,
3318 + hlist) {
3319 + if (node_vma_iter->last_update ==
3320 + ksm_scan_round) {
3321 + inc_dup_vma(rmap_item->slot,
3322 + node_vma_iter->slot);
3325 + hlist_add_before(&new_node_vma->hlist,
3326 + &node_vma->hlist);
3330 + node_vma = new_node_vma;
3332 +node_vma_ok: /* ok, ready to add to the list */
3333 + rmap_item->head = node_vma;
3334 + hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist);
3335 + node_vma->last_update = ksm_scan_round;
3336 + hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma);
3337 + rmap_item->slot->pages_merged++;
3341 + * We use break_ksm to break COW on a ksm page: it's a stripped down
3343 + * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
3344 + * put_page(page);
3346 + * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
3347 + * in case the application has unmapped and remapped mm,addr meanwhile.
3348 + * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
3349 + * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
3350 + */
3351 +static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
3353 + struct page *page;
3354 + int ret = 0;
3356 + do {
3357 + cond_resched();
3358 + page = follow_page(vma, addr, FOLL_GET);
3359 + if (IS_ERR_OR_NULL(page))
3360 + break;
3361 + if (PageKsm(page)) {
3362 + ret = handle_mm_fault(vma->vm_mm, vma, addr,
3363 + FAULT_FLAG_WRITE);
3364 + } else
3365 + ret = VM_FAULT_WRITE;
3366 + put_page(page);
3367 + } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
3368 + /*
3369 + * We must loop because handle_mm_fault() may back out if there's
3370 + * any difficulty e.g. if pte accessed bit gets updated concurrently.
3372 + * VM_FAULT_WRITE is what we have been hoping for: it indicates that
3373 + * COW has been broken, even if the vma does not permit VM_WRITE;
3374 + * but note that a concurrent fault might break PageKsm for us.
3376 + * VM_FAULT_SIGBUS could occur if we race with truncation of the
3377 + * backing file, which also invalidates anonymous pages: that's
3378 + * okay, that truncation will have unmapped the PageKsm for us.
3380 + * VM_FAULT_OOM: at the time of writing (late July 2009), setting
3381 + * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
3382 + * current task has TIF_MEMDIE set, and will be OOM killed on return
3383 + * to user; and ksmd, having no mm, would never be chosen for that.
3385 + * But if the mm is in a limited mem_cgroup, then the fault may fail
3386 + * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
3387 + * even ksmd can fail in this way - though it's usually breaking ksm
3388 + * just to undo a merge it made a moment before, so unlikely to oom.
3390 + * That's a pity: we might therefore have more kernel pages allocated
3391 + * than we're counting as nodes in the stable tree; but ksm_do_scan
3392 + * will retry to break_cow on each pass, so should recover the page
3393 + * in due course. The important thing is to not let VM_MERGEABLE
3394 + * be cleared while any such pages might remain in the area.
3395 + */
3396 + return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
3399 +static void break_cow(struct rmap_item *rmap_item)
3401 + struct vm_area_struct *vma = rmap_item->slot->vma;
3402 + struct mm_struct *mm = vma->vm_mm;
3403 + unsigned long addr = get_rmap_addr(rmap_item);
3405 + if (ksm_test_exit(mm))
3406 + goto out;
3408 + break_ksm(vma, addr);
3409 +out:
3410 + return;
3414 + * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
3415 + * than check every pte of a given vma, the locking doesn't quite work for
3416 + * that - an rmap_item is assigned to the stable tree after inserting ksm
3417 + * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
3418 + * rmap_items from parent to child at fork time (so as not to waste time
3419 + * if exit comes before the next scan reaches it).
3421 + * Similarly, although we'd like to remove rmap_items (so updating counts
3422 + * and freeing memory) when unmerging an area, it's easier to leave that
3423 + * to the next pass of ksmd - consider, for example, how ksmd might be
3424 + * in cmp_and_merge_page on one of the rmap_items we would be removing.
3425 + */
3426 +inline int unmerge_ksm_pages(struct vm_area_struct *vma,
3427 + unsigned long start, unsigned long end)
3429 + unsigned long addr;
3430 + int err = 0;
3432 + for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
3433 + if (ksm_test_exit(vma->vm_mm))
3434 + break;
3435 + if (signal_pending(current))
3436 + err = -ERESTARTSYS;
3437 + else
3438 + err = break_ksm(vma, addr);
3440 + return err;
3443 +static inline void inc_ksm_pages_scanned(void)
3445 + u64 delta;
3448 + if (ksm_pages_scanned == U64_MAX) {
3449 + encode_benefit();
3451 + delta = ksm_pages_scanned >> pages_scanned_base;
3453 + if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) {
3454 + pages_scanned_stored >>= 1;
3455 + delta >>= 1;
3456 + pages_scanned_base++;
3459 + pages_scanned_stored += delta;
3461 + ksm_pages_scanned = ksm_pages_scanned_last = 0;
3464 + ksm_pages_scanned++;
3467 +static int find_zero_page_hash(int strength, u32 hash)
3469 + return (zero_hash_table[strength] == hash);
3472 +static int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page)
3474 + struct page *zero_page = ZERO_PAGE(0);
3475 + pte_t orig_pte = __pte(0);
3477 + int err = -EFAULT;
3479 + if (!trylock_page(page))
3480 + goto out;
3482 + if (write_protect_page(vma, page, &orig_pte, 0) == 0) {
3483 + if (is_zero_page(page) == 0)
3484 + err = replace_page(vma, page, zero_page, orig_pte);
3487 + unlock_page(page);
3488 +out:
3489 + return err;
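
find_zero_page_hash() and cmp_and_merge_zero_page() first compare the page's hash against a precomputed hash of the all-zero page and only then verify the bytes, since an equal hash alone could still be a collision. A minimal user-space version of that check, assuming zero_hash was precomputed with the same hash function and strength as the page's hash:

#include <stdint.h>
#include <string.h>

#define SKETCH_PAGE_BYTES 4096

/* returns 1 if the page is all zeroes, 0 otherwise; the hash test is only
 * a cheap filter, the memcmp is what actually decides */
static int sketch_page_is_zero(const void *addr, uint32_t hash,
                               uint32_t zero_hash)
{
        static const unsigned char zeroes[SKETCH_PAGE_BYTES];

        if (hash != zero_hash)
                return 0;
        return memcmp(addr, zeroes, SKETCH_PAGE_BYTES) == 0;
}
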
3493 + * cmp_and_merge_page() - first see if page can be merged into the stable
3494 + * tree; if not, compare hash to previous and if it's the same, see if page
3495 + * can be inserted into the unstable tree, or merged with a page already there
3496 + * and both transferred to the stable tree.
3498 + * @page: the page that we are searching identical page to.
3499 + * @rmap_item: the reverse mapping into the virtual address of this page
3500 + */
3501 +static void cmp_and_merge_page(struct rmap_item *rmap_item)
3503 + struct rmap_item *tree_rmap_item;
3504 + struct page *page;
3505 + struct page *kpage = NULL;
3506 + u32 hash, hash_max;
3507 + int err;
3508 + unsigned int success1, success2;
3509 + struct stable_node *snode;
3510 + int cmp;
3511 + struct rb_node *parent = NULL, **new;
3513 + remove_rmap_item_from_tree(rmap_item);
3515 + page = rmap_item->page;
3517 + hash = page_hash(page, hash_strength, 1);
3519 + /* if the page content is all zero, re-map it to the zero page */
3520 + if (find_zero_page_hash(hash_strength, hash)) {
3521 + if (!cmp_and_merge_zero_page(rmap_item->slot->vma, page)) {
3522 + ksm_remap_zero_pages++;
3523 + return;
3526 + //ksm_pages_scanned++;
3527 + inc_ksm_pages_scanned();
3529 + /* We first start with searching the page inside the stable tree */
3530 + kpage = stable_tree_search(rmap_item, hash);
3531 + if (kpage) {
3532 + err = try_to_merge_with_ksm_page(rmap_item, kpage,
3533 + hash);
3534 + if (!err) {
3535 + /*
3536 + * The page was successfully merged, add
3537 + * its rmap_item to the stable tree.
3538 + * page lock is needed because it's
3539 + * racing with try_to_unmap_ksm(), etc.
3540 + */
3541 + lock_page(kpage);
3542 + stable_tree_append(rmap_item, page_stable_node(kpage));
3543 + unlock_page(kpage);
3544 + put_page(kpage);
3545 + return; /* success */
3547 + put_page(kpage);
3549 + /*
3550 + * if it's a collision and it has been searched in the sub-rbtree
3551 + * (hash_max != 0), we want to abort, because if it is
3552 + * successfully merged in the unstable tree, the collision tends to
3553 + * happen again.
3554 + */
3555 + if (err == MERGE_ERR_COLLI && rmap_item->hash_max)
3556 + return;
3559 + tree_rmap_item =
3560 + unstable_tree_search_insert(rmap_item, hash);
3561 + if (tree_rmap_item) {
3562 + err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash);
3563 + /*
3564 + * As soon as we merge this page, we want to remove the
3565 + * rmap_item of the page we have merged with from the unstable
3566 + * tree, and insert it instead as new node in the stable tree.
3567 + */
3568 + if (!err) {
3569 + kpage = page;
3570 + remove_rmap_item_from_tree(tree_rmap_item);
3571 + lock_page(kpage);
3572 + snode = stable_tree_insert(&kpage, hash,
3573 + rmap_item, tree_rmap_item,
3574 + &success1, &success2);
3576 + if (success1)
3577 + stable_tree_append(rmap_item, snode);
3578 + else
3579 + break_cow(rmap_item);
3581 + if (success2)
3582 + stable_tree_append(tree_rmap_item, snode);
3583 + else
3584 + break_cow(tree_rmap_item);
3586 + /*
3587 + * The original kpage may already have been unlocked inside
3588 + * stable_tree_insert().
3589 + */
3590 + unlock_page(kpage);
3592 + } else if (err == MERGE_ERR_COLLI) {
3593 + if (tree_rmap_item->tree_node->count == 1) {
3594 + rmap_item_hash_max(tree_rmap_item,
3595 + tree_rmap_item->tree_node->hash);
3596 + } else
3597 + BUG_ON(!(tree_rmap_item->hash_max));
3599 + hash_max = rmap_item_hash_max(rmap_item, hash);
3600 + cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
3601 + parent = &tree_rmap_item->node;
3602 + if (cmp < 0)
3603 + new = &parent->rb_left;
3604 + else if (cmp > 0)
3605 + new = &parent->rb_right;
3606 + else
3607 + goto put_up_out;
3609 + rmap_item->tree_node = tree_rmap_item->tree_node;
3610 + rmap_item->address |= UNSTABLE_FLAG;
3611 + rmap_item->append_round = ksm_scan_round;
3612 + rb_link_node(&rmap_item->node, parent, new);
3613 + rb_insert_color(&rmap_item->node,
3614 + &tree_rmap_item->tree_node->sub_root);
3615 + rmap_item->tree_node->count++;
3617 +put_up_out:
3618 + put_page(tree_rmap_item->page);
3619 + up_read(&tree_rmap_item->slot->vma->vm_mm->mmap_sem);
3626 +static inline unsigned long get_pool_index(struct vma_slot *slot,
3627 + unsigned long index)
3629 + unsigned long pool_index;
3631 + pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT;
3632 + if (pool_index >= slot->pool_size)
3633 + BUG();
3634 + return pool_index;
3637 +static inline unsigned long index_page_offset(unsigned long index)
3639 + return offset_in_page(sizeof(struct rmap_list_entry *) * index);
3642 +static inline
3643 +struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot,
3644 + unsigned long index, int need_alloc)
3646 + unsigned long pool_index;
3647 + void *addr;
3650 + pool_index = get_pool_index(slot, index);
3651 + if (!slot->rmap_list_pool[pool_index]) {
3652 + if (!need_alloc)
3653 + return NULL;
3655 + slot->rmap_list_pool[pool_index] =
3656 + alloc_page(GFP_KERNEL | __GFP_ZERO);
3657 + BUG_ON(!slot->rmap_list_pool[pool_index]);
3660 + addr = kmap(slot->rmap_list_pool[pool_index]);
3661 + addr += index_page_offset(index);
3663 + return addr;
3666 +static inline void put_rmap_list_entry(struct vma_slot *slot,
3667 + unsigned long index)
3669 + unsigned long pool_index;
3671 + pool_index = get_pool_index(slot, index);
3672 + BUG_ON(!slot->rmap_list_pool[pool_index]);
3673 + kunmap(slot->rmap_list_pool[pool_index]);
3676 +static inline int entry_is_new(struct rmap_list_entry *entry)
3678 + return !entry->item;
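3680 +/* A zeroed, never-initialized entry carries neither an rmap_item nor an address. */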
3681 +static inline unsigned long get_index_orig_addr(struct vma_slot *slot,
3682 + unsigned long index)
3684 + return slot->vma->vm_start + (index << PAGE_SHIFT);
3687 +static inline unsigned long get_entry_address(struct rmap_list_entry *entry)
3689 + unsigned long addr;
3691 + if (is_addr(entry->addr))
3692 + addr = get_clean_addr(entry->addr);
3693 + else if (entry->item)
3694 + addr = get_rmap_addr(entry->item);
3695 + else
3696 + BUG();
3698 + return addr;
3701 +static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry)
3703 + if (is_addr(entry->addr))
3704 + return NULL;
3706 + return entry->item;
3709 +static inline void inc_rmap_list_pool_count(struct vma_slot *slot,
3710 + unsigned long index)
3712 + unsigned long pool_index;
3714 + pool_index = get_pool_index(slot, index);
3715 + BUG_ON(!slot->rmap_list_pool[pool_index]);
3716 + slot->pool_counts[pool_index]++;
3719 +static inline void dec_rmap_list_pool_count(struct vma_slot *slot,
3720 + unsigned long index)
3722 + unsigned long pool_index;
3724 + pool_index = get_pool_index(slot, index);
3725 + BUG_ON(!slot->rmap_list_pool[pool_index]);
3726 + BUG_ON(!slot->pool_counts[pool_index]);
3727 + slot->pool_counts[pool_index]--;
3730 +static inline int entry_has_rmap(struct rmap_list_entry *entry)
3732 + return !is_addr(entry->addr) && entry->item;
3735 +static inline void swap_entries(struct rmap_list_entry *entry1,
3736 + unsigned long index1,
3737 + struct rmap_list_entry *entry2,
3738 + unsigned long index2)
3740 + struct rmap_list_entry tmp;
3742 + /* swapping two new entries is meaningless */
3743 + BUG_ON(entry_is_new(entry1) && entry_is_new(entry2));
3745 + tmp = *entry1;
3746 + *entry1 = *entry2;
3747 + *entry2 = tmp;
3749 + if (entry_has_rmap(entry1))
3750 + entry1->item->entry_index = index1;
3752 + if (entry_has_rmap(entry2))
3753 + entry2->item->entry_index = index2;
3755 + if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) {
3756 + inc_rmap_list_pool_count(entry1->item->slot, index1);
3757 + dec_rmap_list_pool_count(entry1->item->slot, index2);
3758 + } else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) {
3759 + inc_rmap_list_pool_count(entry2->item->slot, index2);
3760 + dec_rmap_list_pool_count(entry2->item->slot, index1);
3764 +static inline void free_entry_item(struct rmap_list_entry *entry)
3766 + unsigned long index;
3767 + struct rmap_item *item;
3769 + if (!is_addr(entry->addr)) {
3770 + BUG_ON(!entry->item);
3771 + item = entry->item;
3772 + entry->addr = get_rmap_addr(item);
3773 + set_is_addr(entry->addr);
3774 + index = item->entry_index;
3775 + remove_rmap_item_from_tree(item);
3776 + dec_rmap_list_pool_count(item->slot, index);
3777 + free_rmap_item(item);
3781 +static inline int pool_entry_boundary(unsigned long index)
3783 + unsigned long linear_addr;
3785 + linear_addr = sizeof(struct rmap_list_entry *) * index;
3786 + return index && !offset_in_page(linear_addr);
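3787 +/* True when this index is the first entry of a new pool page (index 0 excluded). */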
3789 +static inline void try_free_last_pool(struct vma_slot *slot,
3790 + unsigned long index)
3792 + unsigned long pool_index;
3794 + pool_index = get_pool_index(slot, index);
3795 + if (slot->rmap_list_pool[pool_index] &&
3796 + !slot->pool_counts[pool_index]) {
3797 + __free_page(slot->rmap_list_pool[pool_index]);
3798 + slot->rmap_list_pool[pool_index] = NULL;
3799 + slot->need_sort = 1;
3804 +static inline unsigned long vma_item_index(struct vm_area_struct *vma,
3805 + struct rmap_item *item)
3807 + return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT;
3810 +static int within_same_pool(struct vma_slot *slot,
3811 + unsigned long i, unsigned long j)
3813 + unsigned long pool_i, pool_j;
3815 + pool_i = get_pool_index(slot, i);
3816 + pool_j = get_pool_index(slot, j);
3818 + return (pool_i == pool_j);
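3819 +/* Re-sort the rmap_list entries back to their natural, address-ordered
3820 + * positions, then free any pool page left holding no rmap_item. */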
3821 +static void sort_rmap_entry_list(struct vma_slot *slot)
3823 + unsigned long i, j;
3824 + struct rmap_list_entry *entry, *swap_entry;
3826 + entry = get_rmap_list_entry(slot, 0, 0);
3827 + for (i = 0; i < slot->pages; ) {
3829 + if (!entry)
3830 + goto skip_whole_pool;
3832 + if (entry_is_new(entry))
3833 + goto next_entry;
3835 + if (is_addr(entry->addr)) {
3836 + entry->addr = 0;
3837 + goto next_entry;
3840 + j = vma_item_index(slot->vma, entry->item);
3841 + if (j == i)
3842 + goto next_entry;
3844 + if (within_same_pool(slot, i, j))
3845 + swap_entry = entry + j - i;
3846 + else
3847 + swap_entry = get_rmap_list_entry(slot, j, 1);
3849 + swap_entries(entry, i, swap_entry, j);
3850 + if (!within_same_pool(slot, i, j))
3851 + put_rmap_list_entry(slot, j);
3852 + continue;
3854 +skip_whole_pool:
3855 + i += PAGE_SIZE / sizeof(*entry);
3856 + if (i < slot->pages)
3857 + entry = get_rmap_list_entry(slot, i, 0);
3858 + continue;
3860 +next_entry:
3861 + if (i >= slot->pages - 1 ||
3862 + !within_same_pool(slot, i, i + 1)) {
3863 + put_rmap_list_entry(slot, i);
3864 + if (i + 1 < slot->pages)
3865 + entry = get_rmap_list_entry(slot, i + 1, 0);
3866 + } else
3867 + entry++;
3868 + i++;
3869 + continue;
3872 +	/* free pool pages that contain no rmap_item */
3873 +	/* This can be simplified to rely only on pool_counts once it is known to be bug-free. */
3874 + for (i = 0; i < slot->pool_size; i++) {
3875 + unsigned char has_rmap;
3876 + void *addr;
3878 + if (!slot->rmap_list_pool[i])
3879 + continue;
3881 + has_rmap = 0;
3882 + addr = kmap(slot->rmap_list_pool[i]);
3883 + BUG_ON(!addr);
3884 + for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
3885 + entry = (struct rmap_list_entry *)addr + j;
3886 + if (is_addr(entry->addr))
3887 + continue;
3888 + if (!entry->item)
3889 + continue;
3890 + has_rmap = 1;
3892 + kunmap(slot->rmap_list_pool[i]);
3893 + if (!has_rmap) {
3894 + BUG_ON(slot->pool_counts[i]);
3895 + __free_page(slot->rmap_list_pool[i]);
3896 + slot->rmap_list_pool[i] = NULL;
3900 + slot->need_sort = 0;
3904 + * vma_fully_scanned() - return true if all the pages in this slot have been scanned.
3905 + */
3906 +static inline int vma_fully_scanned(struct vma_slot *slot)
3908 + return slot->pages_scanned && !(slot->pages_scanned % slot->pages);
3911 +/**
3912 + * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to
3913 + * its random permutation. The random permutation index management
3914 + * code is embedded in this function.
3915 + */
3916 +static struct rmap_item *get_next_rmap_item(struct vma_slot *slot)
3918 + unsigned long rand_range, addr, swap_index, scan_index;
3919 + struct rmap_item *item = NULL;
3920 + struct rmap_list_entry *scan_entry, *swap_entry = NULL;
3921 + struct page *page;
3923 + scan_index = swap_index = slot->pages_scanned % slot->pages;
3925 + if (pool_entry_boundary(scan_index))
3926 + try_free_last_pool(slot, scan_index - 1);
3928 + if (vma_fully_scanned(slot)) {
3929 + slot->need_rerand = slot->need_sort;
3930 + if (slot->need_sort)
3931 + sort_rmap_entry_list(slot);
3934 + scan_entry = get_rmap_list_entry(slot, scan_index, 1);
3935 + if (entry_is_new(scan_entry)) {
3936 + scan_entry->addr = get_index_orig_addr(slot, scan_index);
3937 + set_is_addr(scan_entry->addr);
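3938 +	/* When re-randomization is needed, pick a random index in
3939 +	 * [scan_index, slot->pages) and swap the two entries. */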
3940 + if (slot->need_rerand) {
3941 + rand_range = slot->pages - scan_index;
3942 + BUG_ON(!rand_range);
3943 + swap_index = scan_index + (random32() % rand_range);
3946 + if (swap_index != scan_index) {
3947 + swap_entry = get_rmap_list_entry(slot, swap_index, 1);
3948 + if (entry_is_new(swap_entry)) {
3949 + swap_entry->addr = get_index_orig_addr(slot,
3950 + swap_index);
3951 + set_is_addr(swap_entry->addr);
3953 + swap_entries(scan_entry, scan_index, swap_entry, swap_index);
3956 + addr = get_entry_address(scan_entry);
3957 + item = get_entry_item(scan_entry);
3958 + BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start);
3960 + page = follow_page(slot->vma, addr, FOLL_GET);
3961 + if (IS_ERR_OR_NULL(page))
3962 + goto nopage;
3964 + if (!PageAnon(page) && !page_trans_compound_anon(page))
3965 + goto putpage;
3967 +	/* check whether this is the zero page's pfn */
3968 + if (page_to_pfn(page) == zero_pfn)
3969 + goto putpage;
3971 + flush_anon_page(slot->vma, page, addr);
3972 + flush_dcache_page(page);
3974 + if (!item) {
3975 + item = alloc_rmap_item();
3976 + if (item) {
3977 + /* It has already been zeroed */
3978 + item->slot = slot;
3979 + item->address = addr;
3980 + item->entry_index = scan_index;
3981 + scan_entry->item = item;
3982 + inc_rmap_list_pool_count(slot, scan_index);
3983 + } else
3984 + goto putpage;
3987 + BUG_ON(item->slot != slot);
3988 + /* the page may have changed */
3989 + item->page = page;
3990 + put_rmap_list_entry(slot, scan_index);
3991 + if (swap_entry)
3992 + put_rmap_list_entry(slot, swap_index);
3993 + return item;
3995 +putpage:
3996 + put_page(page);
3997 + page = NULL;
3998 +nopage:
3999 + /* no page, store addr back and free rmap_item if possible */
4000 + free_entry_item(scan_entry);
4001 + put_rmap_list_entry(slot, scan_index);
4002 + if (swap_entry)
4003 + put_rmap_list_entry(slot, swap_index);
4004 + return NULL;
4007 +static inline int in_stable_tree(struct rmap_item *rmap_item)
4009 + return rmap_item->address & STABLE_FLAG;
4012 +/**
4013 + * scan_vma_one_page() - scan the next page in a vma_slot. Called with
4014 + * mmap_sem locked.
4015 + */
4016 +static void scan_vma_one_page(struct vma_slot *slot)
4018 + struct mm_struct *mm;
4019 + struct rmap_item *rmap_item = NULL;
4020 + struct vm_area_struct *vma = slot->vma;
4022 + mm = vma->vm_mm;
4023 + BUG_ON(!mm);
4024 + BUG_ON(!slot);
4026 + rmap_item = get_next_rmap_item(slot);
4027 + if (!rmap_item)
4028 + goto out1;
4030 + if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item))
4031 + goto out2;
4033 + cmp_and_merge_page(rmap_item);
4034 +out2:
4035 + put_page(rmap_item->page);
4036 +out1:
4037 + slot->pages_scanned++;
4038 + slot->slot_scanned = 1;
4039 + if (vma_fully_scanned(slot)) {
4040 + slot->fully_scanned = 1;
4041 + slot->rung->fully_scanned_slots++;
4042 + BUG_ON(!slot->rung->fully_scanned_slots);
4046 +static unsigned long get_vma_random_scan_num(struct vma_slot *slot,
4047 + unsigned long scan_ratio)
4049 + return slot->pages * scan_ratio / KSM_SCAN_RATIO_MAX;
4052 +static inline void vma_rung_enter(struct vma_slot *slot,
4053 + struct scan_rung *rung)
4055 + unsigned long pages_to_scan;
4056 + struct scan_rung *old_rung = slot->rung;
4058 + /* leave the old rung it was in */
4059 + BUG_ON(list_empty(&slot->ksm_list));
4061 + if (old_rung->current_scan == &slot->ksm_list)
4062 + old_rung->current_scan = slot->ksm_list.next;
4063 + list_del_init(&slot->ksm_list);
4064 + old_rung->vma_num--;
4065 + if (slot->fully_scanned)
4066 + old_rung->fully_scanned_slots--;
4068 + if (old_rung->current_scan == &old_rung->vma_list) {
4069 + /* This rung finishes a round */
4070 + old_rung->round_finished = 1;
4071 + old_rung->current_scan = old_rung->vma_list.next;
4072 + BUG_ON(old_rung->current_scan == &old_rung->vma_list &&
4073 + !list_empty(&old_rung->vma_list));
4076 + /* enter the new rung */
4077 + while (!(pages_to_scan =
4078 + get_vma_random_scan_num(slot, rung->scan_ratio))) {
4079 + rung++;
4080 + BUG_ON(rung > &ksm_scan_ladder[ksm_scan_ladder_size - 1]);
4082 + if (list_empty(&rung->vma_list))
4083 + rung->current_scan = &slot->ksm_list;
4084 + list_add(&slot->ksm_list, &rung->vma_list);
4085 + slot->rung = rung;
4086 + slot->pages_to_scan = pages_to_scan;
4087 + slot->rung->vma_num++;
4088 + if (slot->fully_scanned)
4089 + rung->fully_scanned_slots++;
4091 + BUG_ON(rung->current_scan == &rung->vma_list &&
4092 + !list_empty(&rung->vma_list));
4095 +static inline void vma_rung_up(struct vma_slot *slot)
4097 + if (slot->rung == &ksm_scan_ladder[ksm_scan_ladder_size-1])
4098 + return;
4100 + vma_rung_enter(slot, slot->rung + 1);
4103 +static inline void vma_rung_down(struct vma_slot *slot)
4105 + if (slot->rung == &ksm_scan_ladder[0])
4106 + return;
4108 + vma_rung_enter(slot, slot->rung - 1);
4111 +/**
4112 + * cal_dedup_ratio() - Calculate the deduplication ratio for this slot.
4113 + */
4114 +static unsigned long cal_dedup_ratio(struct vma_slot *slot)
4116 + struct vma_slot *slot2;
4117 + void **dup_slot;
4118 + unsigned long dup_pages;
4119 + unsigned long dedup_num, pages1, scanned1;
4120 + unsigned long ret;
4121 + int i;
4123 + if (!slot->pages_scanned)
4124 + return 0;
4126 + pages1 = slot->pages;
4127 + scanned1 = slot->pages_scanned - slot->last_scanned;
4128 + BUG_ON(scanned1 > slot->pages_scanned);
4130 + for (i = slot->ksm_index; i < ksm_vma_tree_index_end; i++) {
4131 + unsigned long pages2, scanned2;
4133 + dup_slot = radix_tree_lookup_slot(&slot->dup_tree, i);
4134 + if (!dup_slot)
4135 + continue;
4137 + dup_pages = (unsigned long)(*dup_slot) >> INDIRECT_OFFSET;
4139 + slot2 = radix_tree_lookup(&ksm_vma_tree, i);
4140 + BUG_ON(!slot2 || !slot2->pages_scanned);
4142 + pages2 = slot2->pages;
4143 + scanned2 = slot2->pages_scanned - slot2->last_scanned;
4144 + BUG_ON(scanned2 > slot2->pages_scanned);
4146 + BUG_ON(!scanned1 || !scanned2);
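4147 +		/* extrapolate the sampled dup_pages by each slot's pages/scanned ratio */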
4148 + dedup_num = dup_pages * pages1 / scanned1 * pages2 / scanned2;
4149 + slot->dedup_num += dedup_num;
4150 + slot2->dedup_num += dedup_num;
4153 + ret = (slot->dedup_num * KSM_DEDUP_RATIO_SCALE / pages1);
4155 + /* Thrashing area filtering */
4156 + if (ksm_thrash_threshold) {
4157 + if (slot->pages_cowed * 100 / slot->pages_merged
4158 + > ksm_thrash_threshold) {
4159 + ret = 0;
4160 + } else {
4161 + ret = ret * (slot->pages_merged - slot->pages_cowed)
4162 + / slot->pages_merged;
4166 + return ret;
4170 +/**
4171 + * stable_node_reinsert() - When the hash_strength has been adjusted, the
4172 + * stable tree needs to be restructured; this is the function that re-inserts a
4173 + * stable node.
4174 + */
4175 +static inline void stable_node_reinsert(struct stable_node *new_node,
4176 + struct page *page,
4177 + struct rb_root *root_treep,
4178 + struct list_head *tree_node_listp,
4179 + u32 hash)
4181 + struct rb_node **new = &root_treep->rb_node;
4182 + struct rb_node *parent = NULL;
4183 + struct stable_node *stable_node;
4184 + struct tree_node *tree_node;
4185 + struct page *tree_page;
4186 + int cmp;
4188 + while (*new) {
4189 + int cmp;
4191 + tree_node = rb_entry(*new, struct tree_node, node);
4193 + cmp = hash_cmp(hash, tree_node->hash);
4195 + if (cmp < 0) {
4196 + parent = *new;
4197 + new = &parent->rb_left;
4198 + } else if (cmp > 0) {
4199 + parent = *new;
4200 + new = &parent->rb_right;
4201 + } else
4202 + break;
4205 + if (*new) {
4206 +		/* found a stable tree node with the same first-level hash value */
4207 + stable_node_hash_max(new_node, page, hash);
4208 + if (tree_node->count == 1) {
4209 + stable_node = rb_entry(tree_node->sub_root.rb_node,
4210 + struct stable_node, node);
4211 + tree_page = get_ksm_page(stable_node, 1, 0);
4212 + if (tree_page) {
4213 + stable_node_hash_max(stable_node,
4214 + tree_page, hash);
4215 + put_page(tree_page);
4217 + /* prepare for stable node insertion */
4219 + cmp = hash_cmp(new_node->hash_max,
4220 + stable_node->hash_max);
4221 + parent = &stable_node->node;
4222 + if (cmp < 0)
4223 + new = &parent->rb_left;
4224 + else if (cmp > 0)
4225 + new = &parent->rb_right;
4226 + else
4227 + goto failed;
4229 + goto add_node;
4230 + } else {
4231 +			/* the only stable_node was deleted, but the
4232 +			 * tree_node was not.
4233 + */
4234 + goto tree_node_reuse;
4238 + /* well, search the collision subtree */
4239 + new = &tree_node->sub_root.rb_node;
4240 + parent = NULL;
4241 + BUG_ON(!*new);
4242 + while (*new) {
4243 + int cmp;
4245 + stable_node = rb_entry(*new, struct stable_node, node);
4247 + cmp = hash_cmp(new_node->hash_max,
4248 + stable_node->hash_max);
4250 + if (cmp < 0) {
4251 + parent = *new;
4252 + new = &parent->rb_left;
4253 + } else if (cmp > 0) {
4254 + parent = *new;
4255 + new = &parent->rb_right;
4256 + } else {
4257 + /* oh, no, still a collision */
4258 + goto failed;
4262 + goto add_node;
4265 + /* no tree node found */
4266 + tree_node = alloc_tree_node(tree_node_listp);
4267 + if (!tree_node) {
4268 + printk(KERN_ERR "UKSM: memory allocation error!\n");
4269 + goto failed;
4270 + } else {
4271 + tree_node->hash = hash;
4272 + rb_link_node(&tree_node->node, parent, new);
4273 + rb_insert_color(&tree_node->node, root_treep);
4275 +tree_node_reuse:
4276 + /* prepare for stable node insertion */
4277 + parent = NULL;
4278 + new = &tree_node->sub_root.rb_node;
4281 +add_node:
4282 + rb_link_node(&new_node->node, parent, new);
4283 + rb_insert_color(&new_node->node, &tree_node->sub_root);
4284 + new_node->tree_node = tree_node;
4285 + tree_node->count++;
4286 + return;
4288 +failed:
4289 + /* This can only happen when two nodes have collided
4290 +	 * at both hash levels.
4291 + */
4292 + new_node->tree_node = NULL;
4293 + return;
4296 +static inline void free_all_tree_nodes(struct list_head *list)
4298 + struct tree_node *node, *tmp;
4300 + list_for_each_entry_safe(node, tmp, list, all_list) {
4301 + free_tree_node(node);
4305 +/**
4306 + * stable_tree_delta_hash() - Delta hash the stable tree from previous hash
4307 + * strength to the current hash_strength. It restructures the whole tree.
4308 + */
4309 +static inline void stable_tree_delta_hash(u32 prev_hash_strength)
4311 + struct stable_node *node, *tmp;
4312 + struct rb_root *root_new_treep;
4313 + struct list_head *new_tree_node_listp;
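4314 +	/* flip to the other of the two pre-allocated stable tree roots and node lists */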
4315 + stable_tree_index = (stable_tree_index + 1) % 2;
4316 + root_new_treep = &root_stable_tree[stable_tree_index];
4317 + new_tree_node_listp = &stable_tree_node_list[stable_tree_index];
4318 + *root_new_treep = RB_ROOT;
4319 + BUG_ON(!list_empty(new_tree_node_listp));
4321 + /*
4322 +	 * We must iterate safely: the node could be removed by get_ksm_page().
4323 + */
4324 + list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) {
4325 + void *addr;
4326 + struct page *node_page;
4327 + u32 hash;
4329 + /*
4330 + * We are completely re-structuring the stable nodes to a new
4331 +		 * stable tree. We don't bother to unlink from the old tree or touch
4332 +		 * its tree_nodes; the old tree_nodes will be freed all at once.
4333 + */
4334 + node_page = get_ksm_page(node, 0, 0);
4335 + if (!node_page)
4336 + continue;
4338 + if (node->tree_node) {
4339 + hash = node->tree_node->hash;
4341 + addr = kmap_atomic(node_page, KM_USER0);
4343 + hash = delta_hash(addr, prev_hash_strength,
4344 + hash_strength, hash);
4345 + kunmap_atomic(addr, KM_USER0);
4346 + } else {
4347 + /*
4348 +			 * It was not inserted into the rbtree due to a collision
4349 +			 * in the last scan round.
4350 + */
4351 + hash = page_hash(node_page, hash_strength, 0);
4354 + stable_node_reinsert(node, node_page, root_new_treep,
4355 + new_tree_node_listp, hash);
4356 + put_page(node_page);
4359 + root_stable_treep = root_new_treep;
4360 + free_all_tree_nodes(stable_tree_node_listp);
4361 + BUG_ON(!list_empty(stable_tree_node_listp));
4362 + stable_tree_node_listp = new_tree_node_listp;
4365 +static inline void inc_hash_strength(unsigned long delta)
4367 + hash_strength += 1 << delta;
4368 + if (hash_strength > HASH_STRENGTH_MAX)
4369 + hash_strength = HASH_STRENGTH_MAX;
4372 +static inline void dec_hash_strength(unsigned long delta)
4374 + unsigned long change = 1 << delta;
4376 + if (hash_strength <= change + 1)
4377 + hash_strength = 1;
4378 + else
4379 + hash_strength -= change;
4382 +static inline void inc_hash_strength_delta(void)
4384 + hash_strength_delta++;
4385 + if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX)
4386 + hash_strength_delta = HASH_STRENGTH_DELTA_MAX;
4390 +static inline unsigned long get_current_neg_ratio(void)
4392 + if (!rshash_pos || rshash_neg > rshash_pos)
4393 + return 100;
4395 + return div64_u64(100 * rshash_neg , rshash_pos);
4399 +static inline unsigned long get_current_neg_ratio(void)
4401 + u64 pos = benefit.pos;
4402 + u64 neg = benefit.neg;
4404 + if (!neg)
4405 + return 0;
4407 + if (!pos || neg > pos)
4408 + return 100;
4410 + if (neg > div64_u64(U64_MAX, 100))
4411 + pos = div64_u64(pos, 100);
4412 + else
4413 + neg *= 100;
4415 + return div64_u64(neg, pos);
4418 +static inline unsigned long get_current_benefit(void)
4420 + u64 pos = benefit.pos;
4421 + u64 neg = benefit.neg;
4422 + u64 scanned = benefit.scanned;
4424 + if (neg > pos)
4425 + return 0;
4427 + return div64_u64((pos - neg), scanned);
4430 +static inline int judge_rshash_direction(void)
4432 + u64 current_neg_ratio, stable_benefit;
4433 + u64 current_benefit, delta = 0;
4434 + int ret = STILL;
4436 +	/* In case the system stays still for a long time. */
4437 + if (ksm_scan_round % 1024 == 3) {
4438 + ret = OBSCURE;
4439 + goto out;
4442 + current_neg_ratio = get_current_neg_ratio();
4444 + if (current_neg_ratio == 0) {
4445 + rshash_neg_cont_zero++;
4446 + if (rshash_neg_cont_zero > 2)
4447 + return GO_DOWN;
4448 + else
4449 + return STILL;
4451 + rshash_neg_cont_zero = 0;
4453 + if (current_neg_ratio > 90) {
4454 + ret = GO_UP;
4455 goto out;
4458 - err = try_to_merge_one_page(vma, page, kpage);
4459 - if (err)
4460 + current_benefit = get_current_benefit();
4461 + stable_benefit = rshash_state.stable_benefit;
4463 + if (!stable_benefit) {
4464 + ret = OBSCURE;
4465 goto out;
4468 + if (current_benefit > stable_benefit)
4469 + delta = current_benefit - stable_benefit;
4470 + else if (current_benefit < stable_benefit)
4471 + delta = stable_benefit - current_benefit;
4473 + delta = div64_u64(100 * delta , stable_benefit);
4475 + if (delta > 50) {
4476 + rshash_cont_obscure++;
4477 + if (rshash_cont_obscure > 2)
4478 + return OBSCURE;
4479 + else
4480 + return STILL;
4483 - /* Must get reference to anon_vma while still holding mmap_sem */
4484 - hold_anon_vma(rmap_item, vma->anon_vma);
4485 out:
4486 - up_read(&mm->mmap_sem);
4487 - return err;
4488 + rshash_cont_obscure = 0;
4489 + return ret;
4493 - * try_to_merge_two_pages - take two identical pages and prepare them
4494 - * to be merged into one page.
4496 - * This function returns the kpage if we successfully merged two identical
4497 - * pages into one ksm page, NULL otherwise.
4499 - * Note that this function upgrades page to ksm page: if one of the pages
4500 - * is already a ksm page, try_to_merge_with_ksm_page should be used.
4501 +/**
4502 + * rshash_adjust() - The main function to control the random sampling state
4503 + * machine for hash strength adaptation.
4505 -static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
4506 - struct page *page,
4507 - struct rmap_item *tree_rmap_item,
4508 - struct page *tree_page)
4509 +static void rshash_adjust(void)
4511 - int err;
4512 + unsigned long prev_hash_strength = hash_strength;
4514 - err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
4515 - if (!err) {
4516 - err = try_to_merge_with_ksm_page(tree_rmap_item,
4517 - tree_page, page);
4518 - /*
4519 - * If that fails, we have a ksm page with only one pte
4520 - * pointing to it: so break it.
4521 - */
4522 - if (err)
4523 - break_cow(rmap_item);
4525 - return err ? NULL : page;
4527 + if (ksm_pages_scanned == ksm_pages_scanned_last)
4528 + return;
4531 - * stable_tree_search - search for page inside the stable tree
4533 - * This function checks if there is a page inside the stable tree
4534 - * with identical content to the page that we are scanning right now.
4536 - * This function returns the stable tree node of identical content if found,
4537 - * NULL otherwise.
4538 - */
4539 -static struct page *stable_tree_search(struct page *page)
4541 - struct rb_node *node = root_stable_tree.rb_node;
4542 - struct stable_node *stable_node;
4543 + encode_benefit();
4545 - stable_node = page_stable_node(page);
4546 - if (stable_node) { /* ksm page forked */
4547 - get_page(page);
4548 - return page;
4549 + switch (rshash_state.state) {
4550 + case RSHASH_STILL:
4551 + switch (judge_rshash_direction()) {
4552 + case GO_UP:
4553 + if (rshash_state.pre_direct == GO_DOWN)
4554 + hash_strength_delta = 0;
4556 + inc_hash_strength(hash_strength_delta);
4557 + inc_hash_strength_delta();
4558 + rshash_state.stable_benefit = get_current_benefit();
4559 + rshash_state.pre_direct = GO_UP;
4560 + break;
4562 + case GO_DOWN:
4563 + if (rshash_state.pre_direct == GO_UP)
4564 + hash_strength_delta = 0;
4566 + dec_hash_strength(hash_strength_delta);
4567 + inc_hash_strength_delta();
4568 + rshash_state.stable_benefit = get_current_benefit();
4569 + rshash_state.pre_direct = GO_DOWN;
4570 + break;
4572 + case OBSCURE:
4573 + rshash_state.stable_point = hash_strength;
4574 + rshash_state.turn_point_down = hash_strength;
4575 + rshash_state.turn_point_up = hash_strength;
4576 + rshash_state.turn_benefit_down = get_current_benefit();
4577 + rshash_state.turn_benefit_up = get_current_benefit();
4578 + rshash_state.lookup_window_index = 0;
4579 + rshash_state.state = RSHASH_TRYDOWN;
4580 + dec_hash_strength(hash_strength_delta);
4581 + inc_hash_strength_delta();
4582 + break;
4584 + case STILL:
4585 + break;
4586 + default:
4587 + BUG();
4589 + break;
4591 + case RSHASH_TRYDOWN:
4592 + if (rshash_state.lookup_window_index++ % 5 == 0)
4593 + rshash_state.below_count = 0;
4595 + if (get_current_benefit() < rshash_state.stable_benefit)
4596 + rshash_state.below_count++;
4597 + else if (get_current_benefit() >
4598 + rshash_state.turn_benefit_down) {
4599 + rshash_state.turn_point_down = hash_strength;
4600 + rshash_state.turn_benefit_down = get_current_benefit();
4603 + if (rshash_state.below_count >= 3 ||
4604 + judge_rshash_direction() == GO_UP ||
4605 + hash_strength == 1) {
4606 + hash_strength = rshash_state.stable_point;
4607 + hash_strength_delta = 0;
4608 + inc_hash_strength(hash_strength_delta);
4609 + inc_hash_strength_delta();
4610 + rshash_state.lookup_window_index = 0;
4611 + rshash_state.state = RSHASH_TRYUP;
4612 + hash_strength_delta = 0;
4613 + } else {
4614 + dec_hash_strength(hash_strength_delta);
4615 + inc_hash_strength_delta();
4617 + break;
4619 + case RSHASH_TRYUP:
4620 + if (rshash_state.lookup_window_index++ % 5 == 0)
4621 + rshash_state.below_count = 0;
4623 + if (get_current_benefit() < rshash_state.turn_benefit_down)
4624 + rshash_state.below_count++;
4625 + else if (get_current_benefit() > rshash_state.turn_benefit_up) {
4626 + rshash_state.turn_point_up = hash_strength;
4627 + rshash_state.turn_benefit_up = get_current_benefit();
4630 + if (rshash_state.below_count >= 3 ||
4631 + judge_rshash_direction() == GO_DOWN ||
4632 + hash_strength == HASH_STRENGTH_MAX) {
4633 + hash_strength = rshash_state.turn_benefit_up >
4634 + rshash_state.turn_benefit_down ?
4635 + rshash_state.turn_point_up :
4636 + rshash_state.turn_point_down;
4638 + rshash_state.state = RSHASH_PRE_STILL;
4639 + } else {
4640 + inc_hash_strength(hash_strength_delta);
4641 + inc_hash_strength_delta();
4644 + break;
4646 + case RSHASH_NEW:
4647 + case RSHASH_PRE_STILL:
4648 + rshash_state.stable_benefit = get_current_benefit();
4649 + rshash_state.state = RSHASH_STILL;
4650 + hash_strength_delta = 0;
4651 + break;
4652 + default:
4653 + BUG();
4656 - while (node) {
4657 - struct page *tree_page;
4658 - int ret;
4659 + /* rshash_neg = rshash_pos = 0; */
4660 + reset_benefit();
4662 - cond_resched();
4663 - stable_node = rb_entry(node, struct stable_node, node);
4664 - tree_page = get_ksm_page(stable_node);
4665 - if (!tree_page)
4666 - return NULL;
4667 + if (prev_hash_strength != hash_strength)
4668 + stable_tree_delta_hash(prev_hash_strength);
4671 - ret = memcmp_pages(page, tree_page);
4672 +static void free_vma_dup_tree(struct vma_slot *slot)
4674 + struct vma_slot *tmp_slot;
4675 + int i;
4677 - if (ret < 0) {
4678 - put_page(tree_page);
4679 - node = node->rb_left;
4680 - } else if (ret > 0) {
4681 - put_page(tree_page);
4682 - node = node->rb_right;
4683 - } else
4684 - return tree_page;
4685 + /* step 1: free entries in smaller vmas' dup tree */
4686 + for (i = 0; i < slot->ksm_index; i++) {
4687 + tmp_slot = radix_tree_lookup(&ksm_vma_tree, i);
4688 + if (tmp_slot)
4689 + radix_tree_delete(&tmp_slot->dup_tree, slot->ksm_index);
4692 - return NULL;
4693 + /* step 2: free my own dup tree */
4694 + for (i = slot->ksm_index; i < ksm_vma_tree_index_end; i++)
4695 + radix_tree_delete(&slot->dup_tree, i);
4697 + BUG_ON(slot->dup_tree.rnode);
4701 - * stable_tree_insert - insert rmap_item pointing to new ksm page
4702 - * into the stable tree.
4704 - * This function returns the stable tree node just allocated on success,
4705 - * NULL otherwise.
4706 +/**
4707 + * round_update_ladder() - The main function that updates all the
4708 + * adjustments whenever a scan round is finished.
4710 -static struct stable_node *stable_tree_insert(struct page *kpage)
4711 +static void round_update_ladder(void)
4713 - struct rb_node **new = &root_stable_tree.rb_node;
4714 - struct rb_node *parent = NULL;
4715 - struct stable_node *stable_node;
4716 + int i;
4717 + struct vma_slot *slot, *tmp_slot;
4718 + unsigned long dedup_ratio_max = 0, dedup_ratio_mean = 0;
4719 + unsigned long threshold;
4721 + for (i = 0; i < ksm_vma_tree_index_end; i++) {
4722 + slot = radix_tree_lookup(&ksm_vma_tree, i);
4724 + if (slot) {
4725 + slot->dedup_ratio = cal_dedup_ratio(slot);
4726 + if (dedup_ratio_max < slot->dedup_ratio)
4727 + dedup_ratio_max = slot->dedup_ratio;
4728 + dedup_ratio_mean += slot->dedup_ratio;
4732 - while (*new) {
4733 - struct page *tree_page;
4734 - int ret;
4735 + dedup_ratio_mean /= ksm_vma_slot_num;
4736 + threshold = dedup_ratio_mean;
4738 - cond_resched();
4739 - stable_node = rb_entry(*new, struct stable_node, node);
4740 - tree_page = get_ksm_page(stable_node);
4741 - if (!tree_page)
4742 - return NULL;
4743 + for (i = 0; i < ksm_vma_tree_index_end; i++) {
4744 + slot = radix_tree_lookup(&ksm_vma_tree, i);
4746 - ret = memcmp_pages(kpage, tree_page);
4747 - put_page(tree_page);
4748 + if (slot) {
4749 + if (slot->dedup_ratio &&
4750 + slot->dedup_ratio >= threshold) {
4751 + vma_rung_up(slot);
4752 + } else {
4753 + vma_rung_down(slot);
4756 - parent = *new;
4757 - if (ret < 0)
4758 - new = &parent->rb_left;
4759 - else if (ret > 0)
4760 - new = &parent->rb_right;
4761 - else {
4762 + free_vma_dup_tree(slot);
4763 + radix_tree_delete(&ksm_vma_tree, i);
4764 + ksm_vma_tree_num--;
4765 + slot->ksm_index = -1;
4766 + slot->slot_scanned = 0;
4767 + slot->dedup_ratio = 0;
4768 + slot->dedup_num = 0;
4772 + for (i = 0; i < ksm_scan_ladder_size; i++) {
4773 + list_for_each_entry_safe(slot, tmp_slot,
4774 + &ksm_scan_ladder[i].vma_list,
4775 + ksm_list) {
4777 - * It is not a bug that stable_tree_search() didn't
4778 - * find this node: because at that time our page was
4779 - * not yet write-protected, so may have changed since.
4780 +			 * The slots were scanned but are not in inter_tab, so their
4781 +			 * dedup ratio must be 0.
4783 - return NULL;
4784 + if (slot->slot_scanned) {
4785 + BUG_ON(slot->dedup_ratio != 0);
4786 + vma_rung_down(slot);
4789 + slot->dedup_ratio = 0;
4793 - stable_node = alloc_stable_node();
4794 - if (!stable_node)
4795 - return NULL;
4796 + BUG_ON(ksm_vma_tree_num != 0);
4797 + ksm_vma_tree_index_end = 0;
4799 - rb_link_node(&stable_node->node, parent, new);
4800 - rb_insert_color(&stable_node->node, &root_stable_tree);
4801 + for (i = 0; i < ksm_scan_ladder_size; i++) {
4802 + ksm_scan_ladder[i].round_finished = 0;
4803 + ksm_scan_ladder[i].busy_searched = 0;
4805 + list_for_each_entry(slot, &ksm_scan_ladder[i].vma_list,
4806 + ksm_list) {
4807 + slot->last_scanned = slot->pages_scanned;
4808 + slot->slot_scanned = 0;
4809 + slot->pages_cowed = 0;
4810 + slot->pages_merged = 0;
4811 + if (slot->fully_scanned) {
4812 + slot->fully_scanned = 0;
4813 + ksm_scan_ladder[i].fully_scanned_slots--;
4815 + BUG_ON(slot->ksm_index != -1);
4818 - INIT_HLIST_HEAD(&stable_node->hlist);
4819 + BUG_ON(ksm_scan_ladder[i].fully_scanned_slots);
4822 - stable_node->kpfn = page_to_pfn(kpage);
4823 - set_page_stable_node(kpage, stable_node);
4824 + rshash_adjust();
4826 - return stable_node;
4827 + //ksm_pages_scanned_last = ksm_pages_scanned;
4831 - * unstable_tree_search_insert - search for identical page,
4832 - * else insert rmap_item into the unstable tree.
4834 - * This function searches for a page in the unstable tree identical to the
4835 - * page currently being scanned; and if no identical page is found in the
4836 - * tree, we insert rmap_item as a new object into the unstable tree.
4838 - * This function returns pointer to rmap_item found to be identical
4839 - * to the currently scanned page, NULL otherwise.
4841 - * This function does both searching and inserting, because they share
4842 - * the same walking algorithm in an rbtree.
4843 - */
4844 -static
4845 -struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
4846 - struct page *page,
4847 - struct page **tree_pagep)
4848 +static inline unsigned int ksm_pages_to_scan(unsigned int batch_pages)
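4849 +	/* batch_pages is a fraction of total RAM, expressed in parts per million */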
4850 + return totalram_pages * batch_pages / 1000000;
4853 +static inline void cal_ladder_pages_to_scan(unsigned int num)
4855 - struct rb_node **new = &root_unstable_tree.rb_node;
4856 - struct rb_node *parent = NULL;
4857 + int i;
4859 - while (*new) {
4860 - struct rmap_item *tree_rmap_item;
4861 - struct page *tree_page;
4862 - int ret;
4863 + for (i = 0; i < ksm_scan_ladder_size; i++) {
4864 + ksm_scan_ladder[i].pages_to_scan = num
4865 + * ksm_scan_ladder[i].scan_ratio / KSM_SCAN_RATIO_MAX;
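4866 +	/* the two lowest rungs get an extra cut in their per-batch quota */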
4867 + ksm_scan_ladder[0].pages_to_scan /= 16;
4868 + ksm_scan_ladder[1].pages_to_scan /= 4;
4871 - cond_resched();
4872 - tree_rmap_item = rb_entry(*new, struct rmap_item, node);
4873 - tree_page = get_mergeable_page(tree_rmap_item);
4874 - if (IS_ERR_OR_NULL(tree_page))
4875 - return NULL;
4876 +static inline void ksm_del_vma_slot(struct vma_slot *slot)
4878 + int i, j;
4879 + struct rmap_list_entry *entry;
4880 + struct vma_slot *tmp;
4882 - /*
4883 - * Don't substitute a ksm page for a forked page.
4884 - */
4885 - if (page == tree_page) {
4886 - put_page(tree_page);
4887 - return NULL;
4889 +	/* mutex lock contention may be intensive; any better idea? */
4890 + BUG_ON(list_empty(&slot->ksm_list) || !slot->rung);
4892 - ret = memcmp_pages(page, tree_page);
4893 + if (slot->rung->current_scan == &slot->ksm_list)
4894 + slot->rung->current_scan = slot->rung->current_scan->next;
4896 - parent = *new;
4897 - if (ret < 0) {
4898 - put_page(tree_page);
4899 - new = &parent->rb_left;
4900 - } else if (ret > 0) {
4901 - put_page(tree_page);
4902 - new = &parent->rb_right;
4903 - } else {
4904 - *tree_pagep = tree_page;
4905 - return tree_rmap_item;
4907 + list_del_init(&slot->ksm_list);
4908 + slot->rung->vma_num--;
4909 + if (slot->fully_scanned)
4910 + slot->rung->fully_scanned_slots--;
4912 + if (slot->rung->current_scan == &slot->rung->vma_list) {
4913 + /* This rung finishes a round */
4914 + slot->rung->round_finished = 1;
4915 + slot->rung->current_scan = slot->rung->vma_list.next;
4916 + BUG_ON(slot->rung->current_scan == &slot->rung->vma_list
4917 + && !list_empty(&slot->rung->vma_list));
4920 - rmap_item->address |= UNSTABLE_FLAG;
4921 - rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
4922 - rb_link_node(&rmap_item->node, parent, new);
4923 - rb_insert_color(&rmap_item->node, &root_unstable_tree);
4924 + if (slot->ksm_index == -1)
4925 + goto skip;
4927 - ksm_pages_unshared++;
4928 - return NULL;
4929 + tmp = radix_tree_delete(&ksm_vma_tree, slot->ksm_index);
4930 + BUG_ON(!tmp || tmp != slot);
4931 + free_vma_dup_tree(slot);
4932 + ksm_vma_tree_num--;
4933 + if (slot->ksm_index == ksm_vma_tree_index_end - 1)
4934 + ksm_vma_tree_index_end--;
4936 +skip:
4937 + if (!slot->rmap_list_pool)
4938 + goto out;
4940 + for (i = 0; i < slot->pool_size; i++) {
4941 + void *addr;
4943 + if (!slot->rmap_list_pool[i])
4944 + continue;
4946 + addr = kmap(slot->rmap_list_pool[i]);
4947 + BUG_ON(!addr);
4948 + for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
4949 + entry = (struct rmap_list_entry *)addr + j;
4950 + if (is_addr(entry->addr))
4951 + continue;
4952 + if (!entry->item)
4953 + continue;
4955 + remove_rmap_item_from_tree(entry->item);
4956 + free_rmap_item(entry->item);
4957 + slot->pool_counts[i]--;
4959 + BUG_ON(slot->pool_counts[i]);
4960 + kunmap(slot->rmap_list_pool[i]);
4961 + __free_page(slot->rmap_list_pool[i]);
4963 + kfree(slot->rmap_list_pool);
4964 + kfree(slot->pool_counts);
4966 +out:
4967 + slot->rung = NULL;
4968 + free_vma_slot(slot);
4969 + BUG_ON(!ksm_vma_slot_num);
4970 + ksm_vma_slot_num--;
4974 - * stable_tree_append - add another rmap_item to the linked list of
4975 - * rmap_items hanging off a given node of the stable tree, all sharing
4976 - * the same ksm page.
4977 - */
4978 -static void stable_tree_append(struct rmap_item *rmap_item,
4979 - struct stable_node *stable_node)
4981 +static inline void cleanup_vma_slots(void)
4983 - rmap_item->head = stable_node;
4984 - rmap_item->address |= STABLE_FLAG;
4985 - hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
4986 + struct vma_slot *slot;
4988 - if (rmap_item->hlist.next)
4989 - ksm_pages_sharing++;
4990 - else
4991 - ksm_pages_shared++;
4992 + spin_lock(&vma_slot_list_lock);
4993 + while (!list_empty(&vma_slot_del)) {
4994 + slot = list_entry(vma_slot_del.next,
4995 + struct vma_slot, slot_list);
4996 + list_del(&slot->slot_list);
4997 + spin_unlock(&vma_slot_list_lock);
4998 + ksm_del_vma_slot(slot);
4999 + spin_lock(&vma_slot_list_lock);
5001 + spin_unlock(&vma_slot_list_lock);
5005 - * cmp_and_merge_page - first see if page can be merged into the stable tree;
5006 - * if not, compare checksum to previous and if it's the same, see if page can
5007 - * be inserted into the unstable tree, or merged with a page already there and
5008 - * both transferred to the stable tree.
5010 - * @page: the page that we are searching identical page to.
5011 - * @rmap_item: the reverse mapping into the virtual address of this page
5012 +static inline int rung_fully_scanned(struct scan_rung *rung)
5014 + return (rung->fully_scanned_slots == rung->vma_num &&
5015 + rung->fully_scanned_slots);
5018 +/**
5019 + * ksm_do_scan() - the main worker function.
5021 -static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
5022 +static void ksm_do_scan(void)
5024 - struct rmap_item *tree_rmap_item;
5025 - struct page *tree_page = NULL;
5026 - struct stable_node *stable_node;
5027 - struct page *kpage;
5028 - unsigned int checksum;
5029 - int err;
5030 + struct vma_slot *slot, *iter;
5031 + struct list_head *next_scan, *iter_head;
5032 + struct mm_struct *busy_mm;
5033 + unsigned char round_finished, all_rungs_emtpy;
5034 + int i, err;
5035 + unsigned long rest_pages;
5037 + might_sleep();
5039 + rest_pages = 0;
5040 +repeat_all:
5041 + for (i = ksm_scan_ladder_size - 1; i >= 0; i--) {
5042 + struct scan_rung *rung = &ksm_scan_ladder[i];
5044 - remove_rmap_item_from_tree(rmap_item);
5045 + if (!rung->pages_to_scan)
5046 + continue;
5048 - /* We first start with searching the page inside the stable tree */
5049 - kpage = stable_tree_search(page);
5050 - if (kpage) {
5051 - err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
5052 - if (!err) {
5053 - /*
5054 - * The page was successfully merged:
5055 - * add its rmap_item to the stable tree.
5056 - */
5057 - lock_page(kpage);
5058 - stable_tree_append(rmap_item, page_stable_node(kpage));
5059 - unlock_page(kpage);
5060 + if (list_empty(&rung->vma_list)) {
5061 + rung->pages_to_scan = 0;
5062 + continue;
5064 - put_page(kpage);
5065 - return;
5068 - /*
5069 - * If the hash value of the page has changed from the last time
5070 - * we calculated it, this page is changing frequently: therefore we
5071 - * don't want to insert it in the unstable tree, and we don't want
5072 - * to waste our time searching for something identical to it there.
5073 - */
5074 - checksum = calc_checksum(page);
5075 - if (rmap_item->oldchecksum != checksum) {
5076 - rmap_item->oldchecksum = checksum;
5077 - return;
5080 - tree_rmap_item =
5081 - unstable_tree_search_insert(rmap_item, page, &tree_page);
5082 - if (tree_rmap_item) {
5083 - kpage = try_to_merge_two_pages(rmap_item, page,
5084 - tree_rmap_item, tree_page);
5085 - put_page(tree_page);
5087 - * As soon as we merge this page, we want to remove the
5088 - * rmap_item of the page we have merged with from the unstable
5089 - * tree, and insert it instead as new node in the stable tree.
5090 +		 * If a higher rung is fully scanned, its remaining pages should be
5091 +		 * propagated to the lower rungs. This prevents the higher
5092 +		 * rung from waiting a long time while it still has its
5093 + * pages_to_scan quota.
5096 - if (kpage) {
5097 - remove_rmap_item_from_tree(tree_rmap_item);
5098 + if (rung_fully_scanned(rung)) {
5099 + rest_pages += rung->pages_to_scan;
5100 + rung->pages_to_scan = 0;
5101 + continue;
5104 - lock_page(kpage);
5105 - stable_node = stable_tree_insert(kpage);
5106 - if (stable_node) {
5107 - stable_tree_append(tree_rmap_item, stable_node);
5108 - stable_tree_append(rmap_item, stable_node);
5109 + rung->pages_to_scan += rest_pages;
5110 + rest_pages = 0;
5111 + while (rung->pages_to_scan && likely(!freezing(current))) {
5112 +cleanup:
5113 + cleanup_vma_slots();
5115 + if (list_empty(&rung->vma_list))
5116 + break;
5118 +rescan:
5119 + BUG_ON(rung->current_scan == &rung->vma_list &&
5120 + !list_empty(&rung->vma_list));
5122 + slot = list_entry(rung->current_scan,
5123 + struct vma_slot, ksm_list);
5126 + if (slot->fully_scanned)
5127 + goto next_scan;
5129 + err = try_down_read_slot_mmap_sem(slot);
5130 + if (err == -ENOENT)
5131 + goto cleanup;
5133 + busy_mm = slot->mm;
5135 +busy:
5136 + if (err == -EBUSY) {
5137 + /* skip other vmas on the same mm */
5138 + rung->busy_searched = 1;
5139 + iter = slot;
5140 + iter_head = slot->ksm_list.next;
5142 + while (iter_head != &rung->vma_list) {
5143 + iter = list_entry(iter_head,
5144 + struct vma_slot,
5145 + ksm_list);
5146 + if (iter->vma->vm_mm != busy_mm)
5147 + break;
5148 + iter_head = iter_head->next;
5151 + if (iter->vma->vm_mm != busy_mm) {
5152 + rung->current_scan = &iter->ksm_list;
5153 + goto rescan;
5154 + } else {
5155 + /* at the end, but still busy */
5156 + rung->current_scan = iter->ksm_list.next;
5157 + goto next_scan;
5158 + break;
5161 - unlock_page(kpage);
5163 - /*
5164 - * If we fail to insert the page into the stable tree,
5165 - * we will have 2 virtual addresses that are pointing
5166 - * to a ksm page left outside the stable tree,
5167 - * in which case we need to break_cow on both.
5168 - */
5169 - if (!stable_node) {
5170 - break_cow(tree_rmap_item);
5171 - break_cow(rmap_item);
5172 + BUG_ON(!vma_can_enter(slot->vma));
5173 + if (ksm_test_exit(slot->vma->vm_mm)) {
5174 + busy_mm = slot->vma->vm_mm;
5175 + up_read(&slot->vma->vm_mm->mmap_sem);
5176 + err = -EBUSY;
5177 + goto busy;
5180 + if (rung->busy_searched)
5181 + rung->busy_searched = 0;
5182 +			/* OK, we have taken the mmap_sem, ready to scan */
5183 + scan_vma_one_page(slot);
5184 + up_read(&slot->vma->vm_mm->mmap_sem);
5185 + rung->pages_to_scan--;
5187 + if ((slot->pages_scanned &&
5188 + slot->pages_scanned % slot->pages_to_scan == 0)
5189 + || slot->fully_scanned) {
5190 +next_scan:
5191 + next_scan = rung->current_scan->next;
5192 + if (next_scan == &rung->vma_list) {
5193 + /*
5194 + * All the slots in this rung
5195 +					 * have been traversed in this
5196 + * round.
5197 + */
5198 + rung->round_finished = 1;
5199 + rung->current_scan =
5200 + rung->vma_list.next;
5201 + if (rung_fully_scanned(rung) ||
5202 + rung->busy_searched) {
5203 + /*
5204 + * All the pages in all slots
5205 + * have been scanned. Or we
5206 + * did not make any progress
5207 +					 * because of a busy mm.
5208 + */
5209 + rest_pages +=
5210 + rung->pages_to_scan;
5211 + rung->pages_to_scan = 0;
5212 + break;
5214 + } else {
5215 + rung->current_scan = next_scan;
5219 + cond_resched();
5222 + if (freezing(current))
5223 + break;
5227 -static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
5228 - struct rmap_item **rmap_list,
5229 - unsigned long addr)
5231 - struct rmap_item *rmap_item;
5232 + if (freezing(current))
5233 + return;
5235 - while (*rmap_list) {
5236 - rmap_item = *rmap_list;
5237 - if ((rmap_item->address & PAGE_MASK) == addr)
5238 - return rmap_item;
5239 - if (rmap_item->address > addr)
5240 + round_finished = 1;
5241 + all_rungs_emtpy = 1;
5242 + for (i = 0; i < ksm_scan_ladder_size; i++) {
5243 + struct scan_rung *rung = &ksm_scan_ladder[i];
5245 + if (!list_empty(&rung->vma_list)) {
5246 + all_rungs_emtpy = 0;
5247 + if (!rung->round_finished)
5248 + round_finished = 0;
5249 break;
5250 - *rmap_list = rmap_item->rmap_list;
5251 - remove_rmap_item_from_tree(rmap_item);
5252 - free_rmap_item(rmap_item);
5256 - rmap_item = alloc_rmap_item();
5257 - if (rmap_item) {
5258 - /* It has already been zeroed */
5259 - rmap_item->mm = mm_slot->mm;
5260 - rmap_item->address = addr;
5261 - rmap_item->rmap_list = *rmap_list;
5262 - *rmap_list = rmap_item;
5264 - return rmap_item;
5266 + if (all_rungs_emtpy)
5267 + round_finished = 0;
5269 -static struct rmap_item *scan_get_next_rmap_item(struct page **page)
5271 - struct mm_struct *mm;
5272 - struct mm_slot *slot;
5273 - struct vm_area_struct *vma;
5274 - struct rmap_item *rmap_item;
5275 + cleanup_vma_slots();
5277 - if (list_empty(&ksm_mm_head.mm_list))
5278 - return NULL;
5279 + if (round_finished) {
5280 + round_update_ladder();
5282 - slot = ksm_scan.mm_slot;
5283 - if (slot == &ksm_mm_head) {
5285 * A number of pages can hang around indefinitely on per-cpu
5286 * pagevecs, raised page count preventing write_protect_page
5287 @@ -1308,266 +4131,160 @@
5289 lru_add_drain_all();
5291 + /* sync with ksm_remove_vma for rb_erase */
5292 + ksm_scan_round++;
5293 root_unstable_tree = RB_ROOT;
5295 - spin_lock(&ksm_mmlist_lock);
5296 - slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
5297 - ksm_scan.mm_slot = slot;
5298 - spin_unlock(&ksm_mmlist_lock);
5299 -next_mm:
5300 - ksm_scan.address = 0;
5301 - ksm_scan.rmap_list = &slot->rmap_list;
5304 - mm = slot->mm;
5305 - down_read(&mm->mmap_sem);
5306 - if (ksm_test_exit(mm))
5307 - vma = NULL;
5308 - else
5309 - vma = find_vma(mm, ksm_scan.address);
5311 - for (; vma; vma = vma->vm_next) {
5312 - if (!(vma->vm_flags & VM_MERGEABLE))
5313 - continue;
5314 - if (ksm_scan.address < vma->vm_start)
5315 - ksm_scan.address = vma->vm_start;
5316 - if (!vma->anon_vma)
5317 - ksm_scan.address = vma->vm_end;
5319 - while (ksm_scan.address < vma->vm_end) {
5320 - if (ksm_test_exit(mm))
5321 - break;
5322 - *page = follow_page(vma, ksm_scan.address, FOLL_GET);
5323 - if (IS_ERR_OR_NULL(*page)) {
5324 - ksm_scan.address += PAGE_SIZE;
5325 - cond_resched();
5326 - continue;
5328 - if (PageAnon(*page) ||
5329 - page_trans_compound_anon(*page)) {
5330 - flush_anon_page(vma, *page, ksm_scan.address);
5331 - flush_dcache_page(*page);
5332 - rmap_item = get_next_rmap_item(slot,
5333 - ksm_scan.rmap_list, ksm_scan.address);
5334 - if (rmap_item) {
5335 - ksm_scan.rmap_list =
5336 - &rmap_item->rmap_list;
5337 - ksm_scan.address += PAGE_SIZE;
5338 - } else
5339 - put_page(*page);
5340 - up_read(&mm->mmap_sem);
5341 - return rmap_item;
5343 - put_page(*page);
5344 - ksm_scan.address += PAGE_SIZE;
5345 - cond_resched();
5349 - if (ksm_test_exit(mm)) {
5350 - ksm_scan.address = 0;
5351 - ksm_scan.rmap_list = &slot->rmap_list;
5353 - /*
5354 - * Nuke all the rmap_items that are above this current rmap:
5355 - * because there were no VM_MERGEABLE vmas with such addresses.
5356 - */
5357 - remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
5359 - spin_lock(&ksm_mmlist_lock);
5360 - ksm_scan.mm_slot = list_entry(slot->mm_list.next,
5361 - struct mm_slot, mm_list);
5362 - if (ksm_scan.address == 0) {
5363 - /*
5364 - * We've completed a full scan of all vmas, holding mmap_sem
5365 - * throughout, and found no VM_MERGEABLE: so do the same as
5366 - * __ksm_exit does to remove this mm from all our lists now.
5367 - * This applies either when cleaning up after __ksm_exit
5368 - * (but beware: we can reach here even before __ksm_exit),
5369 - * or when all VM_MERGEABLE areas have been unmapped (and
5370 - * mmap_sem then protects against race with MADV_MERGEABLE).
5371 - */
5372 - hlist_del(&slot->link);
5373 - list_del(&slot->mm_list);
5374 - spin_unlock(&ksm_mmlist_lock);
5376 - free_mm_slot(slot);
5377 - clear_bit(MMF_VM_MERGEABLE, &mm->flags);
5378 - up_read(&mm->mmap_sem);
5379 - mmdrop(mm);
5380 - } else {
5381 - spin_unlock(&ksm_mmlist_lock);
5382 - up_read(&mm->mmap_sem);
5383 + free_all_tree_nodes(&unstable_tree_node_list);
5386 - /* Repeat until we've completed scanning the whole list */
5387 - slot = ksm_scan.mm_slot;
5388 - if (slot != &ksm_mm_head)
5389 - goto next_mm;
5391 - ksm_scan.seqnr++;
5392 - return NULL;
5395 -/**
5396 - * ksm_do_scan - the ksm scanner main worker function.
5397 - * @scan_npages - number of pages we want to scan before we return.
5398 - */
5399 -static void ksm_do_scan(unsigned int scan_npages)
5401 - struct rmap_item *rmap_item;
5402 - struct page *uninitialized_var(page);
5404 - while (scan_npages-- && likely(!freezing(current))) {
5405 - cond_resched();
5406 - rmap_item = scan_get_next_rmap_item(&page);
5407 - if (!rmap_item)
5408 - return;
5409 - if (!PageKsm(page) || !in_stable_tree(rmap_item))
5410 - cmp_and_merge_page(page, rmap_item);
5411 - put_page(page);
5412 + for (i = 0; i < ksm_scan_ladder_size; i++) {
5413 + struct scan_rung *rung = &ksm_scan_ladder[i];
5415 + /*
5416 +		 * Before we can go to sleep, we should make sure that all the
5417 +		 * pages_to_scan quota for this scan round has been consumed.
5418 + */
5419 + if (!list_empty(&rung->vma_list) && rung->pages_to_scan)
5420 + goto repeat_all;
5423 + cal_ladder_pages_to_scan(ksm_scan_batch_pages);
5426 static int ksmd_should_run(void)
5428 - return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
5429 + return ksm_run & KSM_RUN_MERGE;
5432 -static int ksm_scan_thread(void *nothing)
5434 - set_freezable();
5435 - set_user_nice(current, 5);
5437 - while (!kthread_should_stop()) {
5438 - mutex_lock(&ksm_thread_mutex);
5439 - if (ksmd_should_run())
5440 - ksm_do_scan(ksm_thread_pages_to_scan);
5441 - mutex_unlock(&ksm_thread_mutex);
5443 - try_to_freeze();
5444 +#define __round_mask(x, y) ((__typeof__(x))((y)-1))
5445 +#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
5447 - if (ksmd_should_run()) {
5448 - schedule_timeout_interruptible(
5449 - msecs_to_jiffies(ksm_thread_sleep_millisecs));
5450 - } else {
5451 - wait_event_freezable(ksm_thread_wait,
5452 - ksmd_should_run() || kthread_should_stop());
5455 - return 0;
5456 +static inline unsigned long vma_pool_size(struct vm_area_struct *vma)
5458 + return round_up(sizeof(struct rmap_list_entry) * vma_pages(vma),
5459 + PAGE_SIZE) >> PAGE_SHIFT;
5462 -int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
5463 - unsigned long end, int advice, unsigned long *vm_flags)
5464 +/**
5468 + * @param slot	the new vma_slot to be added to the lowest scan rung
5470 + * @return 1 on success, 0 on failure
5471 + */
5472 +static int ksm_vma_enter(struct vma_slot *slot)
5474 - struct mm_struct *mm = vma->vm_mm;
5475 - int err;
5477 - switch (advice) {
5478 - case MADV_MERGEABLE:
5479 - /*
5480 - * Be somewhat over-protective for now!
5481 - */
5482 - if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
5483 - VM_PFNMAP | VM_IO | VM_DONTEXPAND |
5484 - VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
5485 - VM_NONLINEAR | VM_MIXEDMAP | VM_SAO))
5486 - return 0; /* just ignore the advice */
5488 - if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
5489 - err = __ksm_enter(mm);
5490 - if (err)
5491 - return err;
5494 - *vm_flags |= VM_MERGEABLE;
5495 - break;
5496 + struct scan_rung *rung;
5497 + unsigned long pages_to_scan, pool_size;
5499 - case MADV_UNMERGEABLE:
5500 - if (!(*vm_flags & VM_MERGEABLE))
5501 - return 0; /* just ignore the advice */
5502 + BUG_ON(slot->pages != vma_pages(slot->vma));
5503 + rung = &ksm_scan_ladder[0];
5505 - if (vma->anon_vma) {
5506 - err = unmerge_ksm_pages(vma, start, end);
5507 - if (err)
5508 - return err;
5509 + pages_to_scan = get_vma_random_scan_num(slot, rung->scan_ratio);
5510 + if (pages_to_scan) {
5511 + if (list_empty(&rung->vma_list))
5512 + rung->current_scan = &slot->ksm_list;
5513 + BUG_ON(!list_empty(&slot->ksm_list));
5515 + list_add(&slot->ksm_list, &rung->vma_list);
5516 + slot->rung = rung;
5517 + slot->pages_to_scan = pages_to_scan;
5518 + slot->rung->vma_num++;
5519 + BUG_ON(PAGE_SIZE % sizeof(struct rmap_list_entry) != 0);
5521 + pool_size = vma_pool_size(slot->vma);
5523 + slot->rmap_list_pool = kzalloc(sizeof(struct page *) *
5524 + pool_size, GFP_NOWAIT);
5525 + slot->pool_counts = kzalloc(sizeof(unsigned long) * pool_size,
5526 + GFP_NOWAIT);
5527 + slot->pool_size = pool_size;
5528 + if (!slot->rmap_list_pool)
5529 + goto failed;
5531 + if (!slot->pool_counts) {
5532 + kfree(slot->rmap_list_pool);
5533 + goto failed;
5536 - *vm_flags &= ~VM_MERGEABLE;
5537 - break;
5538 + BUG_ON(rung->current_scan == &rung->vma_list &&
5539 + !list_empty(&rung->vma_list));
5541 + ksm_vma_slot_num++;
5542 + BUG_ON(!ksm_vma_slot_num);
5543 + return 1;
5546 +failed:
5547 return 0;
5550 -int __ksm_enter(struct mm_struct *mm)
5552 - struct mm_slot *mm_slot;
5553 - int needs_wakeup;
5555 - mm_slot = alloc_mm_slot();
5556 - if (!mm_slot)
5557 - return -ENOMEM;
5559 - /* Check ksm_run too? Would need tighter locking */
5560 - needs_wakeup = list_empty(&ksm_mm_head.mm_list);
5562 - spin_lock(&ksm_mmlist_lock);
5563 - insert_to_mm_slots_hash(mm, mm_slot);
5564 - /*
5565 - * Insert just behind the scanning cursor, to let the area settle
5566 - * down a little; when fork is followed by immediate exec, we don't
5567 - * want ksmd to waste time setting up and tearing down an rmap_list.
5568 - */
5569 - list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
5570 - spin_unlock(&ksm_mmlist_lock);
5571 +static void ksm_enter_all_slots(void)
5573 + struct vma_slot *slot;
5574 + int added;
5576 - set_bit(MMF_VM_MERGEABLE, &mm->flags);
5577 - atomic_inc(&mm->mm_count);
5578 + spin_lock(&vma_slot_list_lock);
5579 + while (!list_empty(&vma_slot_new)) {
5580 + slot = list_entry(vma_slot_new.next,
5581 + struct vma_slot, slot_list);
5582 + /**
5583 + * Slots are sorted by ctime_j; if one is found to be too
5584 + * young, just stop scanning the remaining ones.
5585 + */
5586 + /*
5588 - if (needs_wakeup)
5589 - wake_up_interruptible(&ksm_thread_wait);
5590 + if (time_before(jiffies, slot->ctime_j +
5591 + msecs_to_jiffies(1000))) {
5592 + spin_unlock(&vma_slot_list_lock);
5593 + return;
5595 + */
5597 - return 0;
5598 + list_del_init(&slot->slot_list);
5599 + added = 0;
5600 + if (vma_can_enter(slot->vma))
5601 + added = ksm_vma_enter(slot);
5603 + if (!added) {
5604 +			/* Put back to be deleted by its creator */
5605 + slot->ctime_j = jiffies;
5606 + list_del(&slot->slot_list);
5607 + list_add_tail(&slot->slot_list, &vma_slot_noadd);
5609 + spin_unlock(&vma_slot_list_lock);
5610 + cond_resched();
5611 + spin_lock(&vma_slot_list_lock);
5613 + spin_unlock(&vma_slot_list_lock);
5616 -void __ksm_exit(struct mm_struct *mm)
5617 +static int ksm_scan_thread(void *nothing)
5619 - struct mm_slot *mm_slot;
5620 - int easy_to_free = 0;
5621 + set_freezable();
5622 + set_user_nice(current, 5);
5624 - /*
5625 - * This process is exiting: if it's straightforward (as is the
5626 - * case when ksmd was never running), free mm_slot immediately.
5627 - * But if it's at the cursor or has rmap_items linked to it, use
5628 - * mmap_sem to synchronize with any break_cows before pagetables
5629 - * are freed, and leave the mm_slot on the list for ksmd to free.
5630 - * Beware: ksm may already have noticed it exiting and freed the slot.
5631 - */
5632 + while (!kthread_should_stop()) {
5633 + mutex_lock(&ksm_thread_mutex);
5634 + if (ksmd_should_run()) {
5635 + ksm_enter_all_slots();
5636 + ksm_do_scan();
5638 + mutex_unlock(&ksm_thread_mutex);
5640 + try_to_freeze();
5642 - spin_lock(&ksm_mmlist_lock);
5643 - mm_slot = get_mm_slot(mm);
5644 - if (mm_slot && ksm_scan.mm_slot != mm_slot) {
5645 - if (!mm_slot->rmap_list) {
5646 - hlist_del(&mm_slot->link);
5647 - list_del(&mm_slot->mm_list);
5648 - easy_to_free = 1;
5649 + if (ksmd_should_run()) {
5650 + schedule_timeout_interruptible(ksm_sleep_jiffies);
5651 + ksm_sleep_times++;
5652 } else {
5653 - list_move(&mm_slot->mm_list,
5654 - &ksm_scan.mm_slot->mm_list);
5655 + wait_event_freezable(ksm_thread_wait,
5656 + ksmd_should_run() || kthread_should_stop());
5659 - spin_unlock(&ksm_mmlist_lock);
5661 - if (easy_to_free) {
5662 - free_mm_slot(mm_slot);
5663 - clear_bit(MMF_VM_MERGEABLE, &mm->flags);
5664 - mmdrop(mm);
5665 - } else if (mm_slot) {
5666 - down_write(&mm->mmap_sem);
5667 - up_write(&mm->mmap_sem);
5669 + return 0;
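ksm_scan_thread() above follows the standard freezable-kthread pattern: do a batch of work, honor the freezer, then either sleep for a fixed interval or block until woken. Below is a stripped-down module sketch of that pattern only; it is not UKSM itself, and demo_enabled / demo_do_work() are placeholders for ksmd_should_run() and the real scan (the ksm_thread_mutex serialization is omitted).

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/jiffies.h>
#include <linux/err.h>

static struct task_struct *demo_task;
static DECLARE_WAIT_QUEUE_HEAD(demo_wait);
static int demo_enabled = 1;            /* placeholder for ksmd_should_run() */

static void demo_do_work(void)
{
    /* placeholder for ksm_enter_all_slots() + ksm_do_scan() */
}

static int demo_thread(void *unused)
{
    set_freezable();                 /* let suspend/hibernate freeze us */
    set_user_nice(current, 5);       /* run at a slightly lower priority */

    while (!kthread_should_stop()) {
        if (demo_enabled)
            demo_do_work();

        try_to_freeze();

        if (demo_enabled)
            schedule_timeout_interruptible(msecs_to_jiffies(100));
        else
            wait_event_freezable(demo_wait,
                    demo_enabled || kthread_should_stop());
    }
    return 0;
}

static int __init demo_init(void)
{
    demo_task = kthread_run(demo_thread, NULL, "demo_scand");
    return IS_ERR(demo_task) ? PTR_ERR(demo_task) : 0;
}

static void __exit demo_exit(void)
{
    kthread_stop(demo_task);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");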
5672 struct page *ksm_does_need_to_copy(struct page *page,
5673 @@ -1597,11 +4314,13 @@
5674 unsigned long *vm_flags)
5676 struct stable_node *stable_node;
5677 + struct node_vma *node_vma;
5678 struct rmap_item *rmap_item;
5679 - struct hlist_node *hlist;
5680 + struct hlist_node *hlist, *rmap_hlist;
5681 unsigned int mapcount = page_mapcount(page);
5682 int referenced = 0;
5683 int search_new_forks = 0;
5684 + unsigned long address;
5686 VM_BUG_ON(!PageKsm(page));
5687 VM_BUG_ON(!PageLocked(page));
5688 @@ -1609,38 +4328,51 @@
5689 stable_node = page_stable_node(page);
5690 if (!stable_node)
5691 return 0;
5692 -again:
5693 - hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
5694 - struct anon_vma *anon_vma = rmap_item->anon_vma;
5695 - struct anon_vma_chain *vmac;
5696 - struct vm_area_struct *vma;
5698 - anon_vma_lock(anon_vma);
5699 - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
5700 - vma = vmac->vma;
5701 - if (rmap_item->address < vma->vm_start ||
5702 - rmap_item->address >= vma->vm_end)
5703 - continue;
5704 - /*
5705 - * Initially we examine only the vma which covers this
5706 - * rmap_item; but later, if there is still work to do,
5707 - * we examine covering vmas in other mms: in case they
5708 - * were forked from the original since ksmd passed.
5709 - */
5710 - if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
5711 - continue;
5713 - if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
5714 - continue;
5716 - referenced += page_referenced_one(page, vma,
5717 - rmap_item->address, &mapcount, vm_flags);
5718 - if (!search_new_forks || !mapcount)
5719 - break;
5720 +again:
5721 + hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
5722 + hlist_for_each_entry(rmap_item, rmap_hlist,
5723 + &node_vma->rmap_hlist, hlist) {
5724 + struct anon_vma *anon_vma = rmap_item->anon_vma;
5725 + struct anon_vma_chain *vmac;
5726 + struct vm_area_struct *vma;
5728 + anon_vma_lock(anon_vma);
5729 + list_for_each_entry(vmac, &anon_vma->head,
5730 + same_anon_vma) {
5731 + vma = vmac->vma;
5732 + address = get_rmap_addr(rmap_item);
5734 + if (address < vma->vm_start ||
5735 + address >= vma->vm_end)
5736 + continue;
5737 + /*
5738 + * Initially we examine only the vma which
5739 + * covers this rmap_item; but later, if there
5740 + * is still work to do, we examine covering
5741 + * vmas in other mms: in case they were forked
5742 + * from the original since ksmd passed.
5743 + */
5744 + if ((rmap_item->slot->vma == vma) ==
5745 + search_new_forks)
5746 + continue;
5748 + if (memcg &&
5749 + !mm_match_cgroup(vma->vm_mm, memcg))
5750 + continue;
5752 + referenced +=
5753 + page_referenced_one(page, vma,
5754 + address, &mapcount, vm_flags);
5755 + if (!search_new_forks || !mapcount)
5756 + break;
5759 + anon_vma_unlock(anon_vma);
5760 + if (!mapcount)
5761 + goto out;
5763 - anon_vma_unlock(anon_vma);
5764 - if (!mapcount)
5765 - goto out;
5767 if (!search_new_forks++)
5768 goto again;
5769 @@ -1651,10 +4383,12 @@
5770 int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
5772 struct stable_node *stable_node;
5773 - struct hlist_node *hlist;
5774 + struct node_vma *node_vma;
5775 + struct hlist_node *hlist, *rmap_hlist;
5776 struct rmap_item *rmap_item;
5777 int ret = SWAP_AGAIN;
5778 int search_new_forks = 0;
5779 + unsigned long address;
5781 VM_BUG_ON(!PageKsm(page));
5782 VM_BUG_ON(!PageLocked(page));
5783 @@ -1663,34 +4397,42 @@
5784 if (!stable_node)
5785 return SWAP_FAIL;
5786 again:
5787 - hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
5788 - struct anon_vma *anon_vma = rmap_item->anon_vma;
5789 - struct anon_vma_chain *vmac;
5790 - struct vm_area_struct *vma;
5792 - anon_vma_lock(anon_vma);
5793 - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
5794 - vma = vmac->vma;
5795 - if (rmap_item->address < vma->vm_start ||
5796 - rmap_item->address >= vma->vm_end)
5797 - continue;
5798 - /*
5799 - * Initially we examine only the vma which covers this
5800 - * rmap_item; but later, if there is still work to do,
5801 - * we examine covering vmas in other mms: in case they
5802 - * were forked from the original since ksmd passed.
5803 - */
5804 - if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
5805 - continue;
5807 - ret = try_to_unmap_one(page, vma,
5808 - rmap_item->address, flags);
5809 - if (ret != SWAP_AGAIN || !page_mapped(page)) {
5810 - anon_vma_unlock(anon_vma);
5811 - goto out;
5812 + hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
5813 + hlist_for_each_entry(rmap_item, rmap_hlist,
5814 + &node_vma->rmap_hlist, hlist) {
5815 + struct anon_vma *anon_vma = rmap_item->anon_vma;
5816 + struct anon_vma_chain *vmac;
5817 + struct vm_area_struct *vma;
5819 + anon_vma_lock(anon_vma);
5820 + list_for_each_entry(vmac, &anon_vma->head,
5821 + same_anon_vma) {
5822 + vma = vmac->vma;
5823 + address = get_rmap_addr(rmap_item);
5825 + if (address < vma->vm_start ||
5826 + address >= vma->vm_end)
5827 + continue;
5828 + /*
5829 + * Initially we examine only the vma which
5830 + * covers this rmap_item; but later, if there
5831 + * is still work to do, we examine covering
5832 + * vmas in other mms: in case they were forked
5833 + * from the original since ksmd passed.
5834 + */
5835 + if ((rmap_item->slot->vma == vma) ==
5836 + search_new_forks)
5837 + continue;
5839 + ret = try_to_unmap_one(page, vma,
5840 + address, flags);
5841 + if (ret != SWAP_AGAIN || !page_mapped(page)) {
5842 + anon_vma_unlock(anon_vma);
5843 + goto out;
5846 + anon_vma_unlock(anon_vma);
5848 - anon_vma_unlock(anon_vma);
5850 if (!search_new_forks++)
5851 goto again;
5852 @@ -1703,10 +4445,12 @@
5853 struct vm_area_struct *, unsigned long, void *), void *arg)
5855 struct stable_node *stable_node;
5856 - struct hlist_node *hlist;
5857 + struct node_vma *node_vma;
5858 + struct hlist_node *hlist, *rmap_hlist;
5859 struct rmap_item *rmap_item;
5860 int ret = SWAP_AGAIN;
5861 int search_new_forks = 0;
5862 + unsigned long address;
5864 VM_BUG_ON(!PageKsm(page));
5865 VM_BUG_ON(!PageLocked(page));
5866 @@ -1715,33 +4459,35 @@
5867 if (!stable_node)
5868 return ret;
5869 again:
5870 - hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
5871 - struct anon_vma *anon_vma = rmap_item->anon_vma;
5872 - struct anon_vma_chain *vmac;
5873 - struct vm_area_struct *vma;
5875 - anon_vma_lock(anon_vma);
5876 - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
5877 - vma = vmac->vma;
5878 - if (rmap_item->address < vma->vm_start ||
5879 - rmap_item->address >= vma->vm_end)
5880 - continue;
5881 - /*
5882 - * Initially we examine only the vma which covers this
5883 - * rmap_item; but later, if there is still work to do,
5884 - * we examine covering vmas in other mms: in case they
5885 - * were forked from the original since ksmd passed.
5886 - */
5887 - if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
5888 - continue;
5890 - ret = rmap_one(page, vma, rmap_item->address, arg);
5891 - if (ret != SWAP_AGAIN) {
5892 - anon_vma_unlock(anon_vma);
5893 - goto out;
5894 + hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
5895 + hlist_for_each_entry(rmap_item, rmap_hlist,
5896 + &node_vma->rmap_hlist, hlist) {
5897 + struct anon_vma *anon_vma = rmap_item->anon_vma;
5898 + struct anon_vma_chain *vmac;
5899 + struct vm_area_struct *vma;
5901 + anon_vma_lock(anon_vma);
5902 + list_for_each_entry(vmac, &anon_vma->head,
5903 + same_anon_vma) {
5904 + vma = vmac->vma;
5905 + address = get_rmap_addr(rmap_item);
5907 + if (address < vma->vm_start ||
5908 + address >= vma->vm_end)
5909 + continue;
5911 + if ((rmap_item->slot->vma == vma) ==
5912 + search_new_forks)
5913 + continue;
5915 + ret = rmap_one(page, vma, address, arg);
5916 + if (ret != SWAP_AGAIN) {
5917 + anon_vma_unlock(anon_vma);
5918 + goto out;
5921 + anon_vma_unlock(anon_vma);
5923 - anon_vma_unlock(anon_vma);
5925 if (!search_new_forks++)
5926 goto again;
5927 @@ -1771,7 +4517,7 @@
5929 struct rb_node *node;
5931 - for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) {
5932 + for (node = rb_first(root_stable_treep); node; node = rb_next(node)) {
5933 struct stable_node *stable_node;
5935 stable_node = rb_entry(node, struct stable_node, node);
5936 @@ -1810,7 +4556,7 @@
5938 while ((stable_node = ksm_check_stable_tree(mn->start_pfn,
5939 mn->start_pfn + mn->nr_pages)) != NULL)
5940 - remove_node_from_stable_tree(stable_node);
5941 + remove_node_from_stable_tree(stable_node, 1, 1);
5942 /* fallthrough */
5944 case MEM_CANCEL_OFFLINE:
5945 @@ -1835,7 +4581,7 @@
5946 static ssize_t sleep_millisecs_show(struct kobject *kobj,
5947 struct kobj_attribute *attr, char *buf)
5949 - return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
5950 + return sprintf(buf, "%u\n", jiffies_to_msecs(ksm_sleep_jiffies));
5953 static ssize_t sleep_millisecs_store(struct kobject *kobj,
5954 @@ -1849,34 +4595,58 @@
5955 if (err || msecs > UINT_MAX)
5956 return -EINVAL;
5958 - ksm_thread_sleep_millisecs = msecs;
5959 + ksm_sleep_jiffies = msecs_to_jiffies(msecs);
5961 return count;
5963 KSM_ATTR(sleep_millisecs);
5965 -static ssize_t pages_to_scan_show(struct kobject *kobj,
5966 +static ssize_t min_scan_ratio_show(struct kobject *kobj,
5967 + struct kobj_attribute *attr, char *buf)
5969 + return sprintf(buf, "%u\n", ksm_min_scan_ratio);
5972 +static ssize_t min_scan_ratio_store(struct kobject *kobj,
5973 + struct kobj_attribute *attr,
5974 + const char *buf, size_t count)
5976 + unsigned long msr;
5977 + int err;
5979 + err = strict_strtoul(buf, 10, &msr);
5980 + if (err || msr > UINT_MAX)
5981 + return -EINVAL;
5983 + ksm_min_scan_ratio = msr;
5985 + return count;
5987 +KSM_ATTR(min_scan_ratio);
5989 +static ssize_t scan_batch_pages_show(struct kobject *kobj,
5990 struct kobj_attribute *attr, char *buf)
5992 - return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
5993 + return sprintf(buf, "%lu\n", ksm_scan_batch_pages);
5996 -static ssize_t pages_to_scan_store(struct kobject *kobj,
5997 +static ssize_t scan_batch_pages_store(struct kobject *kobj,
5998 struct kobj_attribute *attr,
5999 const char *buf, size_t count)
6001 int err;
6002 - unsigned long nr_pages;
6003 + unsigned long batch_pages;
6005 - err = strict_strtoul(buf, 10, &nr_pages);
6006 - if (err || nr_pages > UINT_MAX)
6007 + err = strict_strtoul(buf, 10, &batch_pages);
6008 + if (err || batch_pages > UINT_MAX)
6009 return -EINVAL;
6011 - ksm_thread_pages_to_scan = nr_pages;
6012 + ksm_scan_batch_pages = batch_pages;
6013 + cal_ladder_pages_to_scan(ksm_scan_batch_pages);
6015 return count;
6017 -KSM_ATTR(pages_to_scan);
6018 +KSM_ATTR(scan_batch_pages);
6020 static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
6021 char *buf)
6022 @@ -1893,28 +4663,12 @@
6023 err = strict_strtoul(buf, 10, &flags);
6024 if (err || flags > UINT_MAX)
6025 return -EINVAL;
6026 - if (flags > KSM_RUN_UNMERGE)
6027 + if (flags > KSM_RUN_MERGE)
6028 return -EINVAL;
6030 - /*
6031 - * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
6032 - * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
6033 - * breaking COW to free the pages_shared (but leaves mm_slots
6034 - * on the list for when ksmd may be set running again).
6035 - */
6037 mutex_lock(&ksm_thread_mutex);
6038 if (ksm_run != flags) {
6039 ksm_run = flags;
6040 - if (flags & KSM_RUN_UNMERGE) {
6041 - current->flags |= PF_OOM_ORIGIN;
6042 - err = unmerge_and_remove_all_rmap_items();
6043 - current->flags &= ~PF_OOM_ORIGIN;
6044 - if (err) {
6045 - ksm_run = KSM_RUN_STOP;
6046 - count = err;
6050 mutex_unlock(&ksm_thread_mutex);
6052 @@ -1925,6 +4679,30 @@
6054 KSM_ATTR(run);
6057 +static ssize_t thrash_threshold_show(struct kobject *kobj,
6058 + struct kobj_attribute *attr, char *buf)
6060 + return sprintf(buf, "%u\n", ksm_thrash_threshold);
6063 +static ssize_t thrash_threshold_store(struct kobject *kobj,
6064 + struct kobj_attribute *attr,
6065 + const char *buf, size_t count)
6067 + int err;
6068 + unsigned long flags;
6070 + err = strict_strtoul(buf, 10, &flags);
6071 + if (err || flags > 99)
6072 + return -EINVAL;
6074 + ksm_thrash_threshold = flags;
6076 + return count;
6078 +KSM_ATTR(thrash_threshold);
6080 static ssize_t pages_shared_show(struct kobject *kobj,
6081 struct kobj_attribute *attr, char *buf)
6083 @@ -1946,60 +4724,300 @@
6085 KSM_ATTR_RO(pages_unshared);
6087 -static ssize_t pages_volatile_show(struct kobject *kobj,
6088 +static ssize_t pages_remap_zeropage_show(struct kobject *kobj,
6089 struct kobj_attribute *attr, char *buf)
6091 - long ksm_pages_volatile;
6093 - ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
6094 - - ksm_pages_sharing - ksm_pages_unshared;
6095 - /*
6096 - * It was not worth any locking to calculate that statistic,
6097 - * but it might therefore sometimes be negative: conceal that.
6098 - */
6099 - if (ksm_pages_volatile < 0)
6100 - ksm_pages_volatile = 0;
6101 - return sprintf(buf, "%ld\n", ksm_pages_volatile);
6102 + return sprintf(buf, "%lu\n", ksm_remap_zero_pages);
6104 -KSM_ATTR_RO(pages_volatile);
6105 +KSM_ATTR_RO(pages_remap_zeropage);
6107 static ssize_t full_scans_show(struct kobject *kobj,
6108 struct kobj_attribute *attr, char *buf)
6110 - return sprintf(buf, "%lu\n", ksm_scan.seqnr);
6111 + return sprintf(buf, "%llu\n", ksm_scan_round);
6113 KSM_ATTR_RO(full_scans);
6115 +static ssize_t pages_scanned_show(struct kobject *kobj,
6116 + struct kobj_attribute *attr, char *buf)
6118 + unsigned long base = 0;
6119 + u64 delta, ret;
6121 + if (pages_scanned_stored) {
6122 + base = pages_scanned_base;
6123 + ret = pages_scanned_stored;
6124 + delta = ksm_pages_scanned >> base;
6125 + if (CAN_OVERFLOW_U64(ret, delta)) {
6126 + ret >>= 1;
6127 + delta >>= 1;
6128 + base++;
6129 + ret += delta;
6131 + } else {
6132 + ret = ksm_pages_scanned;
6135 + while (ret > ULONG_MAX) {
6136 + ret >>= 1;
6137 + base++;
6140 + if (base)
6141 + return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base);
6142 + else
6143 + return sprintf(buf, "%lu\n", (unsigned long)ret);
6145 +KSM_ATTR_RO(pages_scanned);
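pages_scanned_show() reports very large totals as "value * 2^base" so the running count never overflows a u64. A userspace sketch of the same accumulate-and-halve idea; the CAN_OVERFLOW_U64 macro used here is an assumed definition, since the patch's own macro is defined elsewhere:

#include <stdio.h>
#include <stdint.h>

#define CAN_OVERFLOW_U64(x, delta) (UINT64_MAX - (x) < (delta))

static uint64_t stored;          /* compressed count: stored * 2^base */
static unsigned long base;

/* Fold 'delta' newly scanned pages into the compressed counter. */
static void account_scanned(uint64_t delta)
{
    delta >>= base;              /* bring delta into the current scale */
    if (CAN_OVERFLOW_U64(stored, delta)) {
        stored >>= 1;            /* halve both and bump the exponent */
        delta >>= 1;
        base++;
    }
    stored += delta;
}

int main(void)
{
    int i;

    for (i = 0; i < 5; i++)
        account_scanned(UINT64_MAX / 2);  /* force the overflow path */

    printf("pages scanned ~= %llu * 2^%lu\n",
           (unsigned long long)stored, base);
    return 0;
}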
6147 +static ssize_t hash_strength_show(struct kobject *kobj,
6148 + struct kobj_attribute *attr, char *buf)
6150 + return sprintf(buf, "%lu\n", hash_strength);
6152 +KSM_ATTR_RO(hash_strength);
6154 +static ssize_t sleep_times_show(struct kobject *kobj,
6155 + struct kobj_attribute *attr, char *buf)
6157 + return sprintf(buf, "%llu\n", ksm_sleep_times);
6159 +KSM_ATTR_RO(sleep_times);
6162 static struct attribute *ksm_attrs[] = {
6163 &sleep_millisecs_attr.attr,
6164 - &pages_to_scan_attr.attr,
6165 + &scan_batch_pages_attr.attr,
6166 &run_attr.attr,
6167 &pages_shared_attr.attr,
6168 &pages_sharing_attr.attr,
6169 &pages_unshared_attr.attr,
6170 - &pages_volatile_attr.attr,
6171 + &pages_remap_zeropage_attr.attr,
6172 &full_scans_attr.attr,
6173 + &min_scan_ratio_attr.attr,
6174 + &pages_scanned_attr.attr,
6175 + &hash_strength_attr.attr,
6176 + &sleep_times_attr.attr,
6177 + &thrash_threshold_attr.attr,
6178 NULL,
6181 static struct attribute_group ksm_attr_group = {
6182 .attrs = ksm_attrs,
6183 - .name = "ksm",
6184 + .name = "uksm",
6186 #endif /* CONFIG_SYSFS */
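Since the attribute group is registered on mm_kobj with the name "uksm", these knobs should appear under /sys/kernel/mm/uksm/. A small userspace sketch that reads one attribute and writes another; the path and the value written are illustrative and require the patched kernel plus root privileges:

#include <stdio.h>
#include <string.h>

/* Assumes the patched kernel exposing the "uksm" group on mm_kobj. */
#define UKSM_DIR "/sys/kernel/mm/uksm/"

/* Read a single sysfs value into buf; returns 0 on success. */
static int sysfs_read(const char *name, char *buf, size_t len)
{
    char path[256];
    FILE *f;

    snprintf(path, sizeof(path), UKSM_DIR "%s", name);
    f = fopen(path, "r");
    if (!f)
        return -1;
    if (!fgets(buf, len, f)) {
        fclose(f);
        return -1;
    }
    fclose(f);
    buf[strcspn(buf, "\n")] = '\0';
    return 0;
}

/* Write a value to a sysfs file; returns 0 on success. */
static int sysfs_write(const char *name, const char *value)
{
    char path[256];
    FILE *f;
    int ok;

    snprintf(path, sizeof(path), UKSM_DIR "%s", name);
    f = fopen(path, "w");
    if (!f)
        return -1;
    ok = fputs(value, f) >= 0;
    return (fclose(f) == 0 && ok) ? 0 : -1;
}

int main(void)
{
    char buf[64];

    if (sysfs_read("sleep_millisecs", buf, sizeof(buf)) == 0)
        printf("sleep_millisecs = %s\n", buf);

    /* Needs root; "1" corresponds to KSM_RUN_MERGE in this patch. */
    if (sysfs_write("run", "1") != 0)
        perror("write run");

    return 0;
}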
6188 +static inline void init_scan_ladder(void)
6190 + int i;
6191 + unsigned long mul = 1;
6193 + unsigned long pages_to_scan;
6195 + pages_to_scan = ksm_scan_batch_pages;
6197 + for (i = 0; i < ksm_scan_ladder_size; i++,
6198 + mul *= ksm_scan_ratio_delta) {
6200 + ksm_scan_ladder[i].scan_ratio = ksm_min_scan_ratio * mul;
6201 + INIT_LIST_HEAD(&ksm_scan_ladder[i].vma_list);
6202 + ksm_scan_ladder[i].vma_num = 0;
6203 + ksm_scan_ladder[i].round_finished = 0;
6204 + ksm_scan_ladder[i].fully_scanned_slots = 0;
6205 + ksm_scan_ladder[i].busy_searched = 0;
6208 + cal_ladder_pages_to_scan(ksm_scan_batch_pages);
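init_scan_ladder() builds a ladder of scan rungs whose scan_ratio grows geometrically from ksm_min_scan_ratio by a factor of ksm_scan_ratio_delta per rung, and ksm_init() sizes the ladder by multiplying until the ratio would exceed the maximum. A userspace sketch of that sizing and the resulting ratios, using assumed example constants (the real defaults are defined elsewhere in the patch):

#include <stdio.h>

/* Example values only; the patch defines the real defaults elsewhere. */
#define SCAN_RATIO_MAX 125
static unsigned int min_scan_ratio = 1;
static unsigned int scan_ratio_delta = 5;

int main(void)
{
    unsigned int sr = min_scan_ratio;
    unsigned long mul = 1;
    int ladder_size = 1, i;

    /* Same sizing loop as ksm_init(): multiply until the ratio
     * would pass the maximum. */
    while (sr < SCAN_RATIO_MAX) {
        sr *= scan_ratio_delta;
        ladder_size++;
    }

    printf("ladder size = %d\n", ladder_size);
    for (i = 0; i < ladder_size; i++, mul *= scan_ratio_delta)
        printf("rung %d: scan_ratio = %lu\n", i, min_scan_ratio * mul);

    return 0;
}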
6211 +static inline int cal_positive_negative_costs(void)
6213 + struct page *p1, *p2;
6214 + unsigned char *addr1, *addr2;
6215 + unsigned long i, time_start, hash_cost;
6216 + unsigned long loopnum = 0;
6218 + /* IMPORTANT: volatile is needed so gcc does not optimize the benchmark away. */
6219 + volatile u32 hash;
6220 + volatile int ret;
6222 + p1 = alloc_page(GFP_KERNEL);
6223 + if (!p1)
6224 + return -ENOMEM;
6226 + p2 = alloc_page(GFP_KERNEL);
6227 + if (!p2) {
+ __free_page(p1); /* don't leak p1 on the error path */
6228 + return -ENOMEM;
+ }
6230 + addr1 = kmap_atomic(p1, KM_USER0);
6231 + addr2 = kmap_atomic(p2, KM_USER1);
6232 + memset(addr1, random32(), PAGE_SIZE);
6233 + memcpy(addr2, addr1, PAGE_SIZE);
6236 + /* make sure that the two pages differ in the last byte */
6236 + addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1];
6237 + kunmap_atomic(addr2, KM_USER1);
6238 + kunmap_atomic(addr1, KM_USER0);
6240 + time_start = jiffies;
6241 + while (jiffies - time_start < 100) {
6242 + for (i = 0; i < 100; i++)
6243 + hash = page_hash(p1, HASH_STRENGTH_FULL, 0);
6244 + loopnum += 100;
6246 + hash_cost = (jiffies - time_start);
6248 + time_start = jiffies;
6249 + for (i = 0; i < loopnum; i++)
6250 + ret = pages_identical(p1, p2);
6251 + memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start);
6252 + memcmp_cost /= hash_cost;
6253 + printk(KERN_INFO "UKSM: relative memcmp_cost = %lu.\n", memcmp_cost);
6255 + __free_page(p1);
6256 + __free_page(p2);
6257 + return 0;
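cal_positive_negative_costs() benchmarks the partial hash against a full page compare at startup, so the sampler knows how many hash-strength steps one memcmp is worth. A userspace analogue of the same calibration; the toy_hash() stand-in and the loop counts are assumptions, since page_hash() and HASH_STRENGTH_FULL are not reproduced here:

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>

#define PAGE_SIZE 4096
#define HASH_STRENGTH_FULL (PAGE_SIZE / 4)

/* Toy stand-in for page_hash(): mix the first 'strength' bytes. */
static unsigned int toy_hash(const unsigned char *page, unsigned long strength)
{
    unsigned int h = 0;
    unsigned long i;

    for (i = 0; i < strength && i < PAGE_SIZE; i++)
        h = h * 31 + page[i];
    return h;
}

int main(void)
{
    static unsigned char p1[PAGE_SIZE], p2[PAGE_SIZE];
    volatile unsigned int hash;  /* volatile keeps the work from being optimized out */
    volatile int ret;
    unsigned long i, loops = 200000;
    clock_t start;
    double hash_cost, memcmp_cost;

    memset(p1, rand() & 0xff, PAGE_SIZE);
    memcpy(p2, p1, PAGE_SIZE);
    p2[PAGE_SIZE - 1] = ~p2[PAGE_SIZE - 1];  /* differ only in the last byte */

    start = clock();
    for (i = 0; i < loops; i++)
        hash = toy_hash(p1, HASH_STRENGTH_FULL);
    hash_cost = (double)(clock() - start);
    if (hash_cost < 1)
        hash_cost = 1;           /* guard against clock() granularity */

    start = clock();
    for (i = 0; i < loops; i++)
        ret = memcmp(p1, p2, PAGE_SIZE);
    memcmp_cost = (double)(clock() - start);

    (void)hash;
    (void)ret;

    /* Relative cost of a full compare in units of one hash-strength step,
     * analogous to the patch's memcmp_cost value. */
    printf("relative memcmp cost = %.1f\n",
           HASH_STRENGTH_FULL * memcmp_cost / hash_cost);
    return 0;
}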
6260 +static int init_zeropage_hash_table(void)
6262 + struct page *page;
6263 + char *addr;
6264 + int i;
6266 + page = alloc_page(GFP_KERNEL);
6267 + if (!page)
6268 + return -ENOMEM;
6270 + addr = kmap_atomic(page, KM_USER0);
6271 + memset(addr, 0, PAGE_SIZE);
6272 + kunmap_atomic(addr, KM_USER0);
6274 + zero_hash_table = kmalloc(HASH_STRENGTH_MAX * sizeof(u32),
6275 + GFP_KERNEL);
6276 + if (!zero_hash_table)
6277 + return -ENOMEM;
6279 + for (i = 0; i < HASH_STRENGTH_MAX; i++) {
6280 + zero_hash_table[i] = page_hash(page, i, 0);
6283 + __free_page(page);
6285 + return 0;
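init_zeropage_hash_table() precomputes the hash of an all-zero page at every sampling strength, so a scanned page whose hash matches the table entry for the current strength can be treated as a likely zero page. A userspace sketch of the same precompute-and-lookup idea with a toy byte-sum hash (page_hash() itself is not reproduced here):

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096
#define STRENGTH_MAX 8       /* stands in for HASH_STRENGTH_MAX */

/* Toy stand-in for page_hash(): mix the first 'strength' bytes. */
static unsigned int toy_hash(const unsigned char *page, int strength)
{
    unsigned int h = 0;
    int i;

    for (i = 0; i < strength && i < PAGE_SIZE; i++)
        h = h * 31 + page[i];
    return h;
}

int main(void)
{
    static unsigned char zero_page[PAGE_SIZE];   /* all zeroes */
    static unsigned char candidate[PAGE_SIZE];
    unsigned int zero_hash_table[STRENGTH_MAX];
    int i, strength = 4;

    /* Precompute the zero-page hash once per strength, as the patch does. */
    for (i = 0; i < STRENGTH_MAX; i++)
        zero_hash_table[i] = toy_hash(zero_page, i);

    memset(candidate, 0, PAGE_SIZE);
    if (toy_hash(candidate, strength) == zero_hash_table[strength])
        printf("candidate hashes like the zero page at strength %d\n",
               strength);
    return 0;
}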
6288 +static inline int init_random_sampling(void)
6290 + unsigned long i;
6291 + random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL);
6292 + if (!random_nums)
6293 + return -ENOMEM;
6295 + for (i = 0; i < HASH_STRENGTH_FULL; i++)
6296 + random_nums[i] = i;
6298 + for (i = 0; i < HASH_STRENGTH_FULL; i++) {
6299 + unsigned long rand_range, swap_index, tmp;
6301 + rand_range = HASH_STRENGTH_FULL - i;
6302 + swap_index = i + random32() % rand_range;
6303 + tmp = random_nums[i];
6304 + random_nums[i] = random_nums[swap_index];
6305 + random_nums[swap_index] = tmp;
6308 + rshash_state.state = RSHASH_NEW;
6309 + rshash_state.below_count = 0;
6310 + rshash_state.lookup_window_index = 0;
6312 + return cal_positive_negative_costs();
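init_random_sampling() fills random_nums with 0..HASH_STRENGTH_FULL-1 and then applies a Fisher-Yates style shuffle, so the hash samples page offsets in a random order. The same shuffle in plain userspace C; the array size and RNG are illustrative (rand() has a small modulo bias, which is fine for a sketch):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define N 16   /* stands in for HASH_STRENGTH_FULL */

int main(void)
{
    unsigned long nums[N];
    unsigned long i, rand_range, swap_index, tmp;

    srand((unsigned)time(NULL));

    for (i = 0; i < N; i++)
        nums[i] = i;

    /* Fisher-Yates: swap each slot with a random slot at or after it. */
    for (i = 0; i < N; i++) {
        rand_range = N - i;
        swap_index = i + (unsigned long)rand() % rand_range;
        tmp = nums[i];
        nums[i] = nums[swap_index];
        nums[swap_index] = tmp;
    }

    for (i = 0; i < N; i++)
        printf("%lu ", nums[i]);
    printf("\n");
    return 0;
}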
6315 +static int __init ksm_slab_init(void)
6317 + rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
6318 + if (!rmap_item_cache)
6319 + goto out;
6321 + stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
6322 + if (!stable_node_cache)
6323 + goto out_free1;
6325 + node_vma_cache = KSM_KMEM_CACHE(node_vma, 0);
6326 + if (!node_vma_cache)
6327 + goto out_free2;
6329 + vma_slot_cache = KSM_KMEM_CACHE(vma_slot, 0);
6330 + if (!vma_slot_cache)
6331 + goto out_free3;
6333 + tree_node_cache = KSM_KMEM_CACHE(tree_node, 0);
6334 + if (!tree_node_cache)
6335 + goto out_free4;
6337 + return 0;
6339 +out_free4:
6340 + kmem_cache_destroy(vma_slot_cache);
6341 +out_free3:
6342 + kmem_cache_destroy(node_vma_cache);
6343 +out_free2:
6344 + kmem_cache_destroy(stable_node_cache);
6345 +out_free1:
6346 + kmem_cache_destroy(rmap_item_cache);
6347 +out:
6348 + return -ENOMEM;
6351 +static void __init ksm_slab_free(void)
6353 + kmem_cache_destroy(stable_node_cache);
6354 + kmem_cache_destroy(rmap_item_cache);
6355 + kmem_cache_destroy(node_vma_cache);
6356 + kmem_cache_destroy(vma_slot_cache);
6357 + kmem_cache_destroy(tree_node_cache);
6360 static int __init ksm_init(void)
6362 struct task_struct *ksm_thread;
6363 int err;
6364 + unsigned int sr = ksm_min_scan_ratio;
6366 + ksm_scan_ladder_size = 1;
6367 + while (sr < KSM_SCAN_RATIO_MAX) {
6368 + sr *= ksm_scan_ratio_delta;
6369 + ksm_scan_ladder_size++;
6371 + ksm_scan_ladder = kzalloc(sizeof(struct scan_rung) *
6372 + ksm_scan_ladder_size, GFP_KERNEL);
6373 + if (!ksm_scan_ladder) {
6374 + printk(KERN_ERR "uksm scan ladder allocation failed, size=%d\n",
6375 + ksm_scan_ladder_size);
6376 + err = -ENOMEM;
6377 + goto out;
6379 + init_scan_ladder();
6381 + INIT_RADIX_TREE(&ksm_vma_tree, GFP_KERNEL);
6383 + err = init_random_sampling();
6384 + if (err)
6385 + goto out_free2;
6387 err = ksm_slab_init();
6388 if (err)
6389 - goto out;
6390 + goto out_free1;
6392 - ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
6393 + err = init_zeropage_hash_table();
6394 + if (err)
6395 + goto out_free0;
6397 + ksm_thread = kthread_run(ksm_scan_thread, NULL, "uksmd");
6398 if (IS_ERR(ksm_thread)) {
6399 - printk(KERN_ERR "ksm: creating kthread failed\n");
6400 + printk(KERN_ERR "uksm: creating kthread failed\n");
6401 err = PTR_ERR(ksm_thread);
6402 goto out_free;
6404 @@ -2007,7 +5025,7 @@
6405 #ifdef CONFIG_SYSFS
6406 err = sysfs_create_group(mm_kobj, &ksm_attr_group);
6407 if (err) {
6408 - printk(KERN_ERR "ksm: register sysfs failed\n");
6409 + printk(KERN_ERR "uksm: register sysfs failed\n");
6410 kthread_stop(ksm_thread);
6411 goto out_free;
6413 @@ -2027,7 +5045,19 @@
6415 out_free:
6416 ksm_slab_free();
6417 +out_free0:
6418 + kfree(zero_hash_table);
6419 +out_free1:
6420 + kfree(random_nums);
6421 +out_free2:
6422 + kfree(ksm_scan_ladder);
6423 out:
6424 return err;
6427 +#ifdef MODULE
6428 module_init(ksm_init)
6429 +#else
6430 +late_initcall(ksm_init);
6431 +#endif
6434 diff -Nur linux-2.6.38/mm/madvise.c uksm-2.6.38/mm/madvise.c
6435 --- linux-2.6.38/mm/madvise.c 2011-03-15 09:20:32.000000000 +0800
6436 +++ uksm-2.6.38/mm/madvise.c 2011-12-16 01:10:14.000000000 +0800
6437 @@ -65,12 +65,6 @@
6439 new_flags &= ~VM_DONTCOPY;
6440 break;
6441 - case MADV_MERGEABLE:
6442 - case MADV_UNMERGEABLE:
6443 - error = ksm_madvise(vma, start, end, behavior, &new_flags);
6444 - if (error)
6445 - goto out;
6446 - break;
6447 case MADV_HUGEPAGE:
6448 case MADV_NOHUGEPAGE:
6449 error = hugepage_madvise(vma, &new_flags, behavior);
6450 @@ -285,10 +279,6 @@
6451 case MADV_REMOVE:
6452 case MADV_WILLNEED:
6453 case MADV_DONTNEED:
6454 -#ifdef CONFIG_KSM
6455 - case MADV_MERGEABLE:
6456 - case MADV_UNMERGEABLE:
6457 -#endif
6458 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
6459 case MADV_HUGEPAGE:
6460 case MADV_NOHUGEPAGE:
6461 diff -Nur linux-2.6.38/mm/memory.c uksm-2.6.38/mm/memory.c
6462 --- linux-2.6.38/mm/memory.c 2011-03-15 09:20:32.000000000 +0800
6463 +++ uksm-2.6.38/mm/memory.c 2011-12-16 01:10:14.000000000 +0800
6464 @@ -719,6 +719,10 @@
6465 rss[MM_ANONPAGES]++;
6466 else
6467 rss[MM_FILEPAGES]++;
6468 +#ifdef CONFIG_KSM
6469 + if (PageKsm(page)) /* follows page_dup_rmap() */
6470 + inc_zone_page_state(page, NR_KSM_PAGES_SHARING);
6471 +#endif
6474 out_set_pte:
6475 @@ -1423,7 +1427,7 @@
6477 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
6479 - /*
6480 + /*
6481 * Require read or write permissions.
6482 * If FOLL_FORCE is set, we only require the "MAY" flags.
6484 @@ -2158,8 +2162,13 @@
6485 clear_page(kaddr);
6486 kunmap_atomic(kaddr, KM_USER0);
6487 flush_dcache_page(dst);
6488 - } else
6489 + } else {
6490 copy_user_highpage(dst, src, va, vma);
6491 +#ifdef CONFIG_KSM
6492 + if (vma->ksm_vma_slot && PageKsm(src))
6493 + vma->ksm_vma_slot->pages_cowed++;
6494 +#endif
6499 diff -Nur linux-2.6.38/mm/mmap.c uksm-2.6.38/mm/mmap.c
6500 --- linux-2.6.38/mm/mmap.c 2011-03-15 09:20:32.000000000 +0800
6501 +++ uksm-2.6.38/mm/mmap.c 2011-12-16 01:10:14.000000000 +0800
6502 @@ -30,6 +30,7 @@
6503 #include <linux/perf_event.h>
6504 #include <linux/audit.h>
6505 #include <linux/khugepaged.h>
6506 +#include <linux/ksm.h>
6508 #include <asm/uaccess.h>
6509 #include <asm/cacheflush.h>
6510 @@ -65,7 +66,7 @@
6511 * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes
6512 * w: (no) no w: (no) no w: (yes) yes w: (no) no
6513 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
6514 - *
6516 * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes
6517 * w: (no) no w: (no) no w: (copy) copy w: (no) no
6518 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
6519 @@ -240,6 +241,9 @@
6520 removed_exe_file_vma(vma->vm_mm);
6522 mpol_put(vma_policy(vma));
6523 +#ifdef CONFIG_KSM
6524 + ksm_remove_vma(vma);
6525 +#endif
6526 kmem_cache_free(vm_area_cachep, vma);
6527 return next;
6529 @@ -529,9 +533,20 @@
6530 long adjust_next = 0;
6531 int remove_next = 0;
6534 + * To avoid deadlock, ksm_remove_vma() must be called before any
6535 + * spinlock is acquired.
6536 + */
6537 +#ifdef CONFIG_KSM
6538 + ksm_remove_vma(vma);
6539 +#endif
6541 if (next && !insert) {
6542 struct vm_area_struct *exporter = NULL;
6544 +#ifdef CONFIG_KSM
6545 + ksm_remove_vma(next);
6546 +#endif
6547 if (end >= next->vm_end) {
6549 * vma expands, overlapping all the next, and
6550 @@ -616,10 +631,10 @@
6551 if (adjust_next)
6552 vma_prio_tree_remove(next, root);
6555 vma->vm_start = start;
6556 vma->vm_end = end;
6557 vma->vm_pgoff = pgoff;
6559 if (adjust_next) {
6560 next->vm_start += adjust_next << PAGE_SHIFT;
6561 next->vm_pgoff += adjust_next;
6562 @@ -672,10 +687,22 @@
6564 if (remove_next == 2) {
6565 next = vma->vm_next;
6566 +#ifdef CONFIG_KSM
6567 + ksm_remove_vma(next);
6568 +#endif
6569 goto again;
6571 + } else {
6572 +#ifdef CONFIG_KSM
6573 + if (next && !insert)
6574 + ksm_vma_add_new(next);
6575 +#endif
6578 +#ifdef CONFIG_KSM
6579 + ksm_vma_add_new(vma);
6580 +#endif
6582 validate_mm(mm);
6584 return 0;
6585 @@ -1352,6 +1379,9 @@
6587 vma_link(mm, vma, prev, rb_link, rb_parent);
6588 file = vma->vm_file;
6589 +#ifdef CONFIG_KSM
6590 + ksm_vma_add_new(vma);
6591 +#endif
6593 /* Once vma denies write, undo our temporary denial count */
6594 if (correct_wcount)
6595 @@ -1378,6 +1408,9 @@
6596 unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
6597 charged = 0;
6598 free_vma:
6599 +#ifdef CONFIG_KSM
6600 + ksm_remove_vma(vma);
6601 +#endif
6602 kmem_cache_free(vm_area_cachep, vma);
6603 unacct_error:
6604 if (charged)
6605 @@ -1453,7 +1486,7 @@
6606 addr = vma->vm_end;
6609 -#endif
6610 +#endif
6612 void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
6614 @@ -2014,6 +2047,10 @@
6615 else
6616 err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
6618 +#ifdef CONFIG_KSM
6619 + ksm_vma_add_new(new);
6620 +#endif
6622 /* Success. */
6623 if (!err)
6624 return 0;
6625 @@ -2250,6 +2287,9 @@
6626 vma->vm_flags = flags;
6627 vma->vm_page_prot = vm_get_page_prot(flags);
6628 vma_link(mm, vma, prev, rb_link, rb_parent);
6629 +#ifdef CONFIG_KSM
6630 + ksm_vma_add_new(vma);
6631 +#endif
6632 out:
6633 perf_event_mmap(vma);
6634 mm->total_vm += len >> PAGE_SHIFT;
6635 @@ -2273,6 +2313,12 @@
6636 /* mm's last user has gone, and its about to be pulled down */
6637 mmu_notifier_release(mm);
6639 + /*
6640 + * Taking the write lock on mmap_sem does not harm others, but it is
6641 + * crucial for uksm to avoid races.
6642 + */
6643 + down_write(&mm->mmap_sem);
6645 if (mm->locked_vm) {
6646 vma = mm->mmap;
6647 while (vma) {
6648 @@ -2306,6 +2352,11 @@
6649 while (vma)
6650 vma = remove_vma(vma);
6652 + mm->mmap = NULL;
6653 + mm->mm_rb = RB_ROOT;
6654 + mm->mmap_cache = NULL;
6655 + up_write(&mm->mmap_sem);
6657 BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
6660 @@ -2397,6 +2448,9 @@
6661 if (new_vma->vm_ops && new_vma->vm_ops->open)
6662 new_vma->vm_ops->open(new_vma);
6663 vma_link(mm, new_vma, prev, rb_link, rb_parent);
6664 +#ifdef CONFIG_KSM
6665 + ksm_vma_add_new(new_vma);
6666 +#endif
6669 return new_vma;
6670 @@ -2502,11 +2556,14 @@
6671 ret = insert_vm_struct(mm, vma);
6672 if (ret)
6673 goto out;
6675 mm->total_vm += len >> PAGE_SHIFT;
6677 perf_event_mmap(vma);
6679 +#ifdef CONFIG_KSM
6680 + ksm_vma_add_new(vma);
6681 +#endif
6683 return 0;
6685 out:
6686 diff -Nur linux-2.6.38/mm/mremap.c uksm-2.6.38/mm/mremap.c
6687 --- linux-2.6.38/mm/mremap.c 2011-03-15 09:20:32.000000000 +0800
6688 +++ uksm-2.6.38/mm/mremap.c 2011-12-16 01:10:14.000000000 +0800
6689 @@ -191,8 +191,7 @@
6690 * pages recently unmapped. But leave vma->vm_flags as it was,
6691 * so KSM can come around to merge on vma and new_vma afterwards.
6693 - err = ksm_madvise(vma, old_addr, old_addr + old_len,
6694 - MADV_UNMERGEABLE, &vm_flags);
6695 + err = unmerge_ksm_pages(vma, old_addr, old_addr + old_len);
6696 if (err)
6697 return err;
6699 diff -Nur linux-2.6.38/mm/rmap.c uksm-2.6.38/mm/rmap.c
6700 --- linux-2.6.38/mm/rmap.c 2011-03-15 09:20:32.000000000 +0800
6701 +++ uksm-2.6.38/mm/rmap.c 2011-12-16 01:10:14.000000000 +0800
6702 @@ -817,9 +817,9 @@
6705 * __page_set_anon_rmap - set up new anonymous rmap
6706 - * @page: Page to add to rmap
6707 + * @page: Page to add to rmap
6708 * @vma: VM area to add page to.
6709 - * @address: User virtual address of the mapping
6710 + * @address: User virtual address of the mapping
6711 * @exclusive: the page is exclusively owned by the current process
6713 static void __page_set_anon_rmap(struct page *page,
6714 @@ -905,9 +905,12 @@
6715 __inc_zone_page_state(page,
6716 NR_ANON_TRANSPARENT_HUGEPAGES);
6718 - if (unlikely(PageKsm(page)))
6719 +#ifdef CONFIG_KSM
6720 + if (unlikely(PageKsm(page))) {
6721 + __inc_zone_page_state(page, NR_KSM_PAGES_SHARING);
6722 return;
6725 +#endif
6726 VM_BUG_ON(!PageLocked(page));
6727 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
6728 if (first)
6729 @@ -965,6 +968,10 @@
6731 void page_remove_rmap(struct page *page)
6733 +#ifdef CONFIG_KSM
6734 + if (PageKsm(page))
6735 + __dec_zone_page_state(page, NR_KSM_PAGES_SHARING);
6736 +#endif
6737 /* page still mapped by someone else? */
6738 if (!atomic_add_negative(-1, &page->_mapcount))
6739 return;