sys-kernel/xanmod-hybrid: Update To v5.12.9 (#948)
gentoo-zh.git: sys-kernel/xanmod-hybrid/files/v1-uksm.patch
blob 3321eaa8ee58867857a93d1d12f3b75d0e69194c
1 From 9a42006b641bc8e0c333174a9bf269ac9450d521 Mon Sep 17 00:00:00 2001
2 From: Piotr Gorski <lucjan.lucjanov@gmail.com>
3 Date: Tue, 13 Apr 2021 16:27:12 +0200
4 Subject: [PATCH] UKSM for 5.12
6 Signed-off-by: Piotr Gorski <lucjan.lucjanov@gmail.com>
7 ---
8 Documentation/vm/uksm.txt | 61 +
9 fs/exec.c | 1 +
10 fs/proc/meminfo.c | 4 +
11 include/linux/ksm.h | 43 +-
12 include/linux/mm_types.h | 3 +
13 include/linux/mmzone.h | 3 +
14 include/linux/pgtable.h | 17 +-
15 include/linux/sradix-tree.h | 77 +
16 include/linux/uksm.h | 149 +
17 kernel/fork.c | 2 +-
18 lib/Makefile | 2 +-
19 lib/sradix-tree.c | 476 +++
20 mm/Kconfig | 26 +
21 mm/Makefile | 3 +-
22 mm/ksm.c | 11 -
23 mm/memory.c | 33 +-
24 mm/mmap.c | 37 +
25 mm/uksm.c | 5614 +++++++++++++++++++++++++++++++++++
26 mm/vmstat.c | 3 +
27 19 files changed, 6539 insertions(+), 26 deletions(-)
28 create mode 100644 Documentation/vm/uksm.txt
29 create mode 100644 include/linux/sradix-tree.h
30 create mode 100644 include/linux/uksm.h
31 create mode 100644 lib/sradix-tree.c
32 create mode 100644 mm/uksm.c
34 diff --git a/Documentation/vm/uksm.txt b/Documentation/vm/uksm.txt
35 new file mode 100644
36 index 000000000..be19a3127
37 --- /dev/null
38 +++ b/Documentation/vm/uksm.txt
39 @@ -0,0 +1,61 @@
40 +The Ultra Kernel Samepage Merging feature
41 +----------------------------------------------
42 +/*
43 + * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
44 + *
45 + * This is an improvement upon KSM. Some basic data structures and routines
46 + * are borrowed from ksm.c .
47 + *
48 + * Its new features:
49 + * 1. Full system scan:
50 + * It automatically scans all user processes' anonymous VMAs. Kernel-user
51 + * interaction to submit a memory area to KSM is no longer needed.
52 + *
53 + * 2. Rich area detection:
54 + * It automatically detects rich areas containing abundant duplicated
55 + * pages. Rich areas are given a full scan speed. Poor areas are
56 + * sampled at a reasonable speed with very low CPU consumption.
57 + *
58 + * 3. Ultra Per-page scan speed improvement:
59 + * A new hash algorithm is proposed. As a result, on a machine with
60 + * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it
61 + * can scan memory areas that do not contain duplicated pages at a speed of
62 + * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of
63 + * 477MB/sec ~ 923MB/sec.
64 + *
65 + * 4. Thrashing area avoidance:
66 + * Thrashing areas (VMAs that have frequent KSM page break-outs) can be
67 + * filtered out. My benchmark shows it's more efficient than KSM's per-page
68 + * hash value based volatile page detection.
69 + *
70 + *
71 + * 5. Misc changes upon KSM:
72 + * * It has a fully x86-optimized memcmp dedicated for 4-byte-aligned page
73 + * comparison. It's much faster than default C version on x86.
74 + * * rmap_item now has a struct page *page member to loosely cache an
75 + * address->page mapping, which avoids many time-costly
76 + * follow_page() calls.
77 + * * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
78 + * * try_to_merge_two_pages() now can revert a pte if it fails. No break_
79 + * ksm is needed for this case.
80 + *
81 + * 6. Full Zero Page consideration (contributed by Figo Zhang)
82 + * Now uksmd considers full zero pages as special pages and merges them into a
83 + * special unswappable uksm zero page.
84 + */
86 +ChangeLog:
88 +2012-05-05 The creation of this Doc
89 +2012-05-08 UKSM 0.1.1.1 libc crash bug fix, api clean up, doc clean up.
90 +2012-05-28 UKSM 0.1.1.2 bug fix release
91 +2012-06-26 UKSM 0.1.2-beta1 first beta release for 0.1.2
92 +2012-07-02 UKSM 0.1.2-beta2
93 +2012-07-10 UKSM 0.1.2-beta3
94 +2012-07-26 UKSM 0.1.2 Fine grained speed control, more scan optimization.
95 +2012-10-13 UKSM 0.1.2.1 Bug fixes.
96 +2012-12-31 UKSM 0.1.2.2 Minor bug fixes.
97 +2014-07-02 UKSM 0.1.2.3 Fix a "__this_cpu_read() in preemptible" bug.
98 +2015-04-22 UKSM 0.1.2.4 Fix a race condition that can sometimes trigger annoying warnings.
99 +2016-09-10 UKSM 0.1.2.5 Fix a bug in dedup ratio calculation.
100 +2017-02-26 UKSM 0.1.2.6 Fix a bug in hugetlbpage handling and a race bug with page migration.
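Because UKSM scans every anonymous VMA on its own, user space never needs madvise(MADV_MERGEABLE). A minimal user-space sketch of that behaviour follows (illustrative only, not part of the patch; the /sys/kernel/mm/uksm/ path in the comment is an assumption, since UKSM's sysfs interface is not shown in this hunk):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 64UL << 20;	/* 64 MiB of identical pages */
	unsigned char *a = mmap(NULL, len, PROT_READ | PROT_WRITE,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	unsigned char *b = mmap(NULL, len, PROT_READ | PROT_WRITE,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (a == MAP_FAILED || b == MAP_FAILED)
		return 1;
	memset(a, 0xAB, len);		/* same non-zero pattern in both buffers */
	memset(b, 0xAB, len);
	/* No madvise() here: uksmd picks these VMAs up by itself. While this
	 * sleeps, the pages_sharing counter (assumed to live under
	 * /sys/kernel/mm/uksm/) should grow. */
	puts("buffers filled, sleeping while uksmd scans...");
	sleep(60);
	return 0;
}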
101 diff --git a/fs/exec.c b/fs/exec.c
102 index 18594f11c..aee636fd4 100644
103 --- a/fs/exec.c
104 +++ b/fs/exec.c
105 @@ -65,6 +65,7 @@
106 #include <linux/vmalloc.h>
107 #include <linux/io_uring.h>
108 #include <linux/syscall_user_dispatch.h>
109 +#include <linux/ksm.h>
111 #include <linux/uaccess.h>
112 #include <asm/mmu_context.h>
113 diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
114 index 6fa761c9c..45fd59a0d 100644
115 --- a/fs/proc/meminfo.c
116 +++ b/fs/proc/meminfo.c
117 @@ -108,6 +108,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
118 #endif
119 show_val_kb(m, "PageTables: ",
120 global_node_page_state(NR_PAGETABLE));
121 +#ifdef CONFIG_UKSM
122 + show_val_kb(m, "KsmZeroPages: ",
123 + global_zone_page_state(NR_UKSM_ZERO_PAGES));
124 +#endif
126 show_val_kb(m, "NFS_Unstable: ", 0);
127 show_val_kb(m, "Bounce: ",
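The hunk above adds a "KsmZeroPages" line to /proc/meminfo when CONFIG_UKSM is enabled. A small user-space sketch (not part of the patch) that reads it back:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		/* Field name matches the show_val_kb() call added above;
		 * it is only present on CONFIG_UKSM=y kernels. */
		if (!strncmp(line, "KsmZeroPages:", 13)) {
			fputs(line, stdout);
			break;
		}
	}
	fclose(f);
	return 0;
}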
128 diff --git a/include/linux/ksm.h b/include/linux/ksm.h
129 index 161e8164a..f0dbdf3c9 100644
130 --- a/include/linux/ksm.h
131 +++ b/include/linux/ksm.h
132 @@ -21,20 +21,16 @@ struct mem_cgroup;
133 #ifdef CONFIG_KSM
134 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
135 unsigned long end, int advice, unsigned long *vm_flags);
136 -int __ksm_enter(struct mm_struct *mm);
137 -void __ksm_exit(struct mm_struct *mm);
139 -static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
140 +static inline struct stable_node *page_stable_node(struct page *page)
142 - if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
143 - return __ksm_enter(mm);
144 - return 0;
145 + return PageKsm(page) ? page_rmapping(page) : NULL;
148 -static inline void ksm_exit(struct mm_struct *mm)
149 +static inline void set_page_stable_node(struct page *page,
150 + struct stable_node *stable_node)
152 - if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
153 - __ksm_exit(mm);
154 + page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
158 @@ -54,6 +50,33 @@ struct page *ksm_might_need_to_copy(struct page *page,
159 void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
160 void ksm_migrate_page(struct page *newpage, struct page *oldpage);
162 +#ifdef CONFIG_KSM_LEGACY
163 +int __ksm_enter(struct mm_struct *mm);
164 +void __ksm_exit(struct mm_struct *mm);
165 +static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
167 + if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
168 + return __ksm_enter(mm);
169 + return 0;
172 +static inline void ksm_exit(struct mm_struct *mm)
174 + if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
175 + __ksm_exit(mm);
178 +#elif defined(CONFIG_UKSM)
179 +static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
181 + return 0;
184 +static inline void ksm_exit(struct mm_struct *mm)
187 +#endif /* !CONFIG_UKSM */
189 #else /* !CONFIG_KSM */
191 static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
192 @@ -89,4 +112,6 @@ static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage)
193 #endif /* CONFIG_MMU */
194 #endif /* !CONFIG_KSM */
196 +#include <linux/uksm.h>
198 #endif /* __LINUX_KSM_H */
199 diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
200 index 6613b26a8..82e18e41b 100644
201 --- a/include/linux/mm_types.h
202 +++ b/include/linux/mm_types.h
203 @@ -370,6 +370,9 @@ struct vm_area_struct {
204 struct mempolicy *vm_policy; /* NUMA policy for the VMA */
205 #endif
206 struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
207 +#ifdef CONFIG_UKSM
208 + struct vma_slot *uksm_vma_slot;
209 +#endif
210 } __randomize_layout;
212 struct core_thread {
213 diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
214 index 47946cec7..a6ce64844 100644
215 --- a/include/linux/mmzone.h
216 +++ b/include/linux/mmzone.h
217 @@ -157,6 +157,9 @@ enum zone_stat_item {
218 NR_ZSPAGES, /* allocated in zsmalloc */
219 #endif
220 NR_FREE_CMA_PAGES,
221 +#ifdef CONFIG_UKSM
222 + NR_UKSM_ZERO_PAGES,
223 +#endif
224 NR_VM_ZONE_STAT_ITEMS };
226 enum node_stat_item {
227 diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
228 index 5e772392a..9d733540d 100644
229 --- a/include/linux/pgtable.h
230 +++ b/include/linux/pgtable.h
231 @@ -1111,12 +1111,25 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
232 extern void untrack_pfn_moved(struct vm_area_struct *vma);
233 #endif
235 +#ifdef CONFIG_UKSM
236 +static inline int is_uksm_zero_pfn(unsigned long pfn)
238 + extern unsigned long uksm_zero_pfn;
239 + return pfn == uksm_zero_pfn;
241 +#else
242 +static inline int is_uksm_zero_pfn(unsigned long pfn)
244 + return 0;
246 +#endif
248 #ifdef __HAVE_COLOR_ZERO_PAGE
249 static inline int is_zero_pfn(unsigned long pfn)
251 extern unsigned long zero_pfn;
252 unsigned long offset_from_zero_pfn = pfn - zero_pfn;
253 - return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
254 + return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT) || is_uksm_zero_pfn(pfn);
257 #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr))
258 @@ -1125,7 +1138,7 @@ static inline int is_zero_pfn(unsigned long pfn)
259 static inline int is_zero_pfn(unsigned long pfn)
261 extern unsigned long zero_pfn;
262 - return pfn == zero_pfn;
263 + return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn));
266 static inline unsigned long my_zero_pfn(unsigned long addr)
267 diff --git a/include/linux/sradix-tree.h b/include/linux/sradix-tree.h
268 new file mode 100644
269 index 000000000..d71edba6b
270 --- /dev/null
271 +++ b/include/linux/sradix-tree.h
272 @@ -0,0 +1,77 @@
273 +#ifndef _LINUX_SRADIX_TREE_H
274 +#define _LINUX_SRADIX_TREE_H
277 +#define INIT_SRADIX_TREE(root, mask) \
278 +do { \
279 + (root)->height = 0; \
280 + (root)->gfp_mask = (mask); \
281 + (root)->rnode = NULL; \
282 +} while (0)
284 +#define ULONG_BITS (sizeof(unsigned long) * 8)
285 +#define SRADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
286 +//#define SRADIX_TREE_MAP_SHIFT 6
287 +//#define SRADIX_TREE_MAP_SIZE (1UL << SRADIX_TREE_MAP_SHIFT)
288 +//#define SRADIX_TREE_MAP_MASK (SRADIX_TREE_MAP_SIZE-1)
290 +struct sradix_tree_node {
291 + unsigned int height; /* Height from the bottom */
292 + unsigned int count;
293 + unsigned int fulls; /* Number of full sublevel trees */
294 + struct sradix_tree_node *parent;
295 + void *stores[0];
298 +/* A simple radix tree implementation */
299 +struct sradix_tree_root {
300 + unsigned int height;
301 + struct sradix_tree_node *rnode;
303 + /* Where found to have available empty stores in its sublevels */
304 + struct sradix_tree_node *enter_node;
305 + unsigned int shift;
306 + unsigned int stores_size;
307 + unsigned int mask;
308 + unsigned long min; /* The first hole index */
309 + unsigned long num;
310 + //unsigned long *height_to_maxindex;
312 + /* How the node is allocated and freed. */
313 + struct sradix_tree_node *(*alloc)(void);
314 + void (*free)(struct sradix_tree_node *node);
316 + /* When a new node is added and removed */
317 + void (*extend)(struct sradix_tree_node *parent, struct sradix_tree_node *child);
318 + void (*assign)(struct sradix_tree_node *node, unsigned int index, void *item);
319 + void (*rm)(struct sradix_tree_node *node, unsigned int offset);
322 +struct sradix_tree_path {
323 + struct sradix_tree_node *node;
324 + int offset;
327 +static inline
328 +void init_sradix_tree_root(struct sradix_tree_root *root, unsigned long shift)
330 + root->height = 0;
331 + root->rnode = NULL;
332 + root->shift = shift;
333 + root->stores_size = 1UL << shift;
334 + root->mask = root->stores_size - 1;
338 +extern void *sradix_tree_next(struct sradix_tree_root *root,
339 + struct sradix_tree_node *node, unsigned long index,
340 + int (*iter)(void *, unsigned long));
342 +extern int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num);
344 +extern void sradix_tree_delete_from_leaf(struct sradix_tree_root *root,
345 + struct sradix_tree_node *node, unsigned long index);
347 +extern void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index);
349 +#endif /* _LINUX_SRADIX_TREE_H */
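The header above only declares the tree root, its node type and the alloc/free/extend/assign/rm callbacks; the wiring a client has to do is easiest to see in slot_tree_init_root() in the mm/uksm.c hunk further down. A condensed kernel-style sketch of that wiring (illustrative only; my_node, my_alloc and my_free are made-up names):

#include <linux/slab.h>
#include <linux/sradix-tree.h>

/* Per-node storage: stores[] must directly follow snode so that the
 * flexible snode.stores[] array lands on it (mm/uksm.c's slot_tree_node
 * uses the same layout trick). */
struct my_node {
	struct sradix_tree_node snode;
	void *stores[1UL << 4];		/* 2^shift slots per node */
};

static struct sradix_tree_node *my_alloc(void)
{
	struct my_node *n = kzalloc(sizeof(*n), GFP_KERNEL);

	return n ? &n->snode : NULL;
}

static void my_free(struct sradix_tree_node *node)
{
	kfree(container_of(node, struct my_node, snode));
}

/* root should start out zeroed (static storage or kzalloc), because
 * init_sradix_tree_root() above only fills in height/shift/stores_size/mask. */
static void my_tree_setup(struct sradix_tree_root *root)
{
	init_sradix_tree_root(root, 4);	/* 16 stores per node */
	root->alloc = my_alloc;
	root->free = my_free;
	/* extend/assign/rm are optional; the library checks them for NULL. */
}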
350 diff --git a/include/linux/uksm.h b/include/linux/uksm.h
351 new file mode 100644
352 index 000000000..bb8651f53
353 --- /dev/null
354 +++ b/include/linux/uksm.h
355 @@ -0,0 +1,149 @@
356 +#ifndef __LINUX_UKSM_H
357 +#define __LINUX_UKSM_H
359 + * Memory merging support.
361 + * This code enables dynamic sharing of identical pages found in different
362 + * memory areas, even if they are not shared by fork().
363 + */
365 +/* if !CONFIG_UKSM this file should not be compiled at all. */
366 +#ifdef CONFIG_UKSM
368 +#include <linux/bitops.h>
369 +#include <linux/mm.h>
370 +#include <linux/pagemap.h>
371 +#include <linux/rmap.h>
372 +#include <linux/sched.h>
374 +extern unsigned long zero_pfn __read_mostly;
375 +extern unsigned long uksm_zero_pfn __read_mostly;
376 +extern struct page *empty_uksm_zero_page;
378 +/* must be done before linked to mm */
379 +extern void uksm_vma_add_new(struct vm_area_struct *vma);
380 +extern void uksm_remove_vma(struct vm_area_struct *vma);
382 +#define UKSM_SLOT_NEED_SORT (1 << 0)
383 +#define UKSM_SLOT_NEED_RERAND (1 << 1)
384 +#define UKSM_SLOT_SCANNED (1 << 2) /* It's scanned in this round */
385 +#define UKSM_SLOT_FUL_SCANNED (1 << 3)
386 +#define UKSM_SLOT_IN_UKSM (1 << 4)
388 +struct vma_slot {
389 + struct sradix_tree_node *snode;
390 + unsigned long sindex;
392 + struct list_head slot_list;
393 + unsigned long fully_scanned_round;
394 + unsigned long dedup_num;
395 + unsigned long pages_scanned;
396 + unsigned long this_sampled;
397 + unsigned long last_scanned;
398 + unsigned long pages_to_scan;
399 + struct scan_rung *rung;
400 + struct page **rmap_list_pool;
401 + unsigned int *pool_counts;
402 + unsigned long pool_size;
403 + struct vm_area_struct *vma;
404 + struct mm_struct *mm;
405 + unsigned long ctime_j;
406 + unsigned long pages;
407 + unsigned long flags;
408 + unsigned long pages_cowed; /* pages cowed this round */
409 + unsigned long pages_merged; /* pages merged this round */
410 + unsigned long pages_bemerged;
412 + /* when it has page merged in this eval round */
413 + struct list_head dedup_list;
416 +static inline void uksm_unmap_zero_page(pte_t pte)
418 + if (pte_pfn(pte) == uksm_zero_pfn)
419 + __dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES);
422 +static inline void uksm_map_zero_page(pte_t pte)
424 + if (pte_pfn(pte) == uksm_zero_pfn)
425 + __inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES);
428 +static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page)
430 + if (vma->uksm_vma_slot && PageKsm(page))
431 + vma->uksm_vma_slot->pages_cowed++;
434 +static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte)
436 + if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn)
437 + vma->uksm_vma_slot->pages_cowed++;
440 +static inline int uksm_flags_can_scan(unsigned long vm_flags)
442 +#ifdef VM_SAO
443 + if (vm_flags & VM_SAO)
444 + return 0;
445 +#endif
447 + return !(vm_flags & (VM_PFNMAP | VM_IO | VM_DONTEXPAND |
448 + VM_HUGETLB | VM_MIXEDMAP | VM_SHARED
449 + | VM_MAYSHARE | VM_GROWSUP | VM_GROWSDOWN));
452 +static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p)
454 + if (uksm_flags_can_scan(*vm_flags_p))
455 + *vm_flags_p |= VM_MERGEABLE;
459 + * Just a BUG_ON wrapper for places where the ksm zero page must not appear. TODO: it will
460 + * be removed when the uksm zero page patch is stable enough.
461 + */
462 +static inline void uksm_bugon_zeropage(pte_t pte)
464 + BUG_ON(pte_pfn(pte) == uksm_zero_pfn);
466 +#else
467 +static inline void uksm_vma_add_new(struct vm_area_struct *vma)
471 +static inline void uksm_remove_vma(struct vm_area_struct *vma)
475 +static inline void uksm_unmap_zero_page(pte_t pte)
479 +static inline void uksm_map_zero_page(pte_t pte)
483 +static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page)
487 +static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte)
491 +static inline int uksm_flags_can_scan(unsigned long vm_flags)
493 + return 0;
496 +static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p)
500 +static inline void uksm_bugon_zeropage(pte_t pte)
503 +#endif /* !CONFIG_UKSM */
504 +#endif /* __LINUX_UKSM_H */
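uksm_flags_can_scan() above rejects shared, PFN-mapped, I/O, hugetlb, mixed-map and growable mappings, so only ordinary private anonymous VMAs receive VM_MERGEABLE through uksm_vm_flags_mod() (called from do_mmap() in the mm/mmap.c hunk below). A small user-space sketch (not part of the patch) contrasting an eligible and an ineligible mapping:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4UL << 20;
	/* Private anonymous: passes uksm_flags_can_scan(), so UKSM scans it. */
	void *scanned = mmap(NULL, len, PROT_READ | PROT_WRITE,
			     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	/* Shared anonymous: VM_SHARED/VM_MAYSHARE are set, so it is skipped. */
	void *skipped = mmap(NULL, len, PROT_READ | PROT_WRITE,
			     MAP_SHARED | MAP_ANONYMOUS, -1, 0);

	printf("scanned by UKSM: %p, ignored by UKSM: %p\n", scanned, skipped);
	return 0;
}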
505 diff --git a/kernel/fork.c b/kernel/fork.c
506 index 426cd0c51..5fd356ca7 100644
507 --- a/kernel/fork.c
508 +++ b/kernel/fork.c
509 @@ -588,7 +588,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
510 __vma_link_rb(mm, tmp, rb_link, rb_parent);
511 rb_link = &tmp->vm_rb.rb_right;
512 rb_parent = &tmp->vm_rb;
514 + uksm_vma_add_new(tmp);
515 mm->map_count++;
516 if (!(tmp->vm_flags & VM_WIPEONFORK))
517 retval = copy_page_range(tmp, mpnt);
518 diff --git a/lib/Makefile b/lib/Makefile
519 index b5307d3ee..480b099e1 100644
520 --- a/lib/Makefile
521 +++ b/lib/Makefile
522 @@ -28,7 +28,7 @@ CFLAGS_string.o += -fno-stack-protector
523 endif
525 lib-y := ctype.o string.o vsprintf.o cmdline.o \
526 - rbtree.o radix-tree.o timerqueue.o xarray.o \
527 + rbtree.o radix-tree.o sradix-tree.o timerqueue.o xarray.o \
528 idr.o extable.o sha1.o irq_regs.o argv_split.o \
529 flex_proportions.o ratelimit.o show_mem.o \
530 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
531 diff --git a/lib/sradix-tree.c b/lib/sradix-tree.c
532 new file mode 100644
533 index 000000000..ab21e6309
534 --- /dev/null
535 +++ b/lib/sradix-tree.c
536 @@ -0,0 +1,476 @@
537 +#include <linux/errno.h>
538 +#include <linux/mm.h>
539 +#include <linux/mman.h>
540 +#include <linux/spinlock.h>
541 +#include <linux/slab.h>
542 +#include <linux/gcd.h>
543 +#include <linux/sradix-tree.h>
545 +static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node)
547 + return node->fulls == root->stores_size ||
548 + (node->height == 1 && node->count == root->stores_size);
552 + * Extend a sradix tree so it can store key @index.
553 + */
554 +static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index)
556 + struct sradix_tree_node *node;
557 + unsigned int height;
559 + if (unlikely(root->rnode == NULL)) {
560 + if (!(node = root->alloc()))
561 + return -ENOMEM;
563 + node->height = 1;
564 + root->rnode = node;
565 + root->height = 1;
568 + /* Figure out what the height should be. */
569 + height = root->height;
570 + index >>= root->shift * height;
572 + while (index) {
573 + index >>= root->shift;
574 + height++;
577 + while (height > root->height) {
578 + unsigned int newheight;
580 + if (!(node = root->alloc()))
581 + return -ENOMEM;
583 + /* Increase the height. */
584 + node->stores[0] = root->rnode;
585 + root->rnode->parent = node;
586 + if (root->extend)
587 + root->extend(node, root->rnode);
589 + newheight = root->height + 1;
590 + node->height = newheight;
591 + node->count = 1;
592 + if (sradix_node_full(root, root->rnode))
593 + node->fulls = 1;
595 + root->rnode = node;
596 + root->height = newheight;
599 + return 0;
603 + * Search for the next item from the current node that is not NULL
604 + * and can satisfy root->iter().
605 + */
606 +void *sradix_tree_next(struct sradix_tree_root *root,
607 + struct sradix_tree_node *node, unsigned long index,
608 + int (*iter)(void *item, unsigned long height))
610 + unsigned long offset;
611 + void *item;
613 + if (unlikely(node == NULL)) {
614 + node = root->rnode;
615 + for (offset = 0; offset < root->stores_size; offset++) {
616 + item = node->stores[offset];
617 + if (item && (!iter || iter(item, node->height)))
618 + break;
621 + if (unlikely(offset >= root->stores_size))
622 + return NULL;
624 + if (node->height == 1)
625 + return item;
626 + else
627 + goto go_down;
630 + while (node) {
631 + offset = (index & root->mask) + 1;
632 + for (; offset < root->stores_size; offset++) {
633 + item = node->stores[offset];
634 + if (item && (!iter || iter(item, node->height)))
635 + break;
638 + if (offset < root->stores_size)
639 + break;
641 + node = node->parent;
642 + index >>= root->shift;
645 + if (!node)
646 + return NULL;
648 + while (node->height > 1) {
649 +go_down:
650 + node = item;
651 + for (offset = 0; offset < root->stores_size; offset++) {
652 + item = node->stores[offset];
653 + if (item && (!iter || iter(item, node->height)))
654 + break;
657 + if (unlikely(offset >= root->stores_size))
658 + return NULL;
661 + BUG_ON(offset > root->stores_size);
663 + return item;
667 + * Blindly insert the item into the tree. Typically, we reuse the
668 + * first empty store item.
669 + */
670 +int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num)
672 + unsigned long index;
673 + unsigned int height;
674 + struct sradix_tree_node *node, *tmp = NULL;
675 + int offset, offset_saved;
676 + void **store = NULL;
677 + int error, i, j, shift;
679 +go_on:
680 + index = root->min;
682 + if (root->enter_node && !sradix_node_full(root, root->enter_node)) {
683 + node = root->enter_node;
684 + BUG_ON((index >> (root->shift * root->height)));
685 + } else {
686 + node = root->rnode;
687 + if (node == NULL || (index >> (root->shift * root->height))
688 + || sradix_node_full(root, node)) {
689 + error = sradix_tree_extend(root, index);
690 + if (error)
691 + return error;
693 + node = root->rnode;
698 + height = node->height;
699 + shift = (height - 1) * root->shift;
700 + offset = (index >> shift) & root->mask;
701 + while (shift > 0) {
702 + offset_saved = offset;
703 + for (; offset < root->stores_size; offset++) {
704 + store = &node->stores[offset];
705 + tmp = *store;
707 + if (!tmp || !sradix_node_full(root, tmp))
708 + break;
710 + BUG_ON(offset >= root->stores_size);
712 + if (offset != offset_saved) {
713 + index += (offset - offset_saved) << shift;
714 + index &= ~((1UL << shift) - 1);
717 + if (!tmp) {
718 + if (!(tmp = root->alloc()))
719 + return -ENOMEM;
721 + tmp->height = shift / root->shift;
722 + *store = tmp;
723 + tmp->parent = node;
724 + node->count++;
725 +// if (root->extend)
726 +// root->extend(node, tmp);
729 + node = tmp;
730 + shift -= root->shift;
731 + offset = (index >> shift) & root->mask;
734 + BUG_ON(node->height != 1);
737 + store = &node->stores[offset];
738 + for (i = 0, j = 0;
739 + j < root->stores_size - node->count &&
740 + i < root->stores_size - offset && j < num; i++) {
741 + if (!store[i]) {
742 + store[i] = item[j];
743 + if (root->assign)
744 + root->assign(node, index + i, item[j]);
745 + j++;
749 + node->count += j;
750 + root->num += j;
751 + num -= j;
753 + while (sradix_node_full(root, node)) {
754 + node = node->parent;
755 + if (!node)
756 + break;
758 + node->fulls++;
761 + if (unlikely(!node)) {
762 + /* All nodes are full */
763 + root->min = 1 << (root->height * root->shift);
764 + root->enter_node = NULL;
765 + } else {
766 + root->min = index + i - 1;
767 + root->min |= (1UL << (node->height - 1)) - 1;
768 + root->min++;
769 + root->enter_node = node;
772 + if (num) {
773 + item += j;
774 + goto go_on;
777 + return 0;
781 +/**
782 + * sradix_tree_shrink - shrink height of a sradix tree to minimal
783 + * @root sradix tree root
785 + */
786 +static inline void sradix_tree_shrink(struct sradix_tree_root *root)
788 + /* try to shrink tree height */
789 + while (root->height > 1) {
790 + struct sradix_tree_node *to_free = root->rnode;
792 + /*
793 + * The candidate node has more than one child, or its child
794 + * is not at the leftmost store, we cannot shrink.
795 + */
796 + if (to_free->count != 1 || !to_free->stores[0])
797 + break;
799 + root->rnode = to_free->stores[0];
800 + root->rnode->parent = NULL;
801 + root->height--;
802 + if (unlikely(root->enter_node == to_free))
803 + root->enter_node = NULL;
804 + root->free(to_free);
809 + * Del the item on the known leaf node and index
810 + */
811 +void sradix_tree_delete_from_leaf(struct sradix_tree_root *root,
812 + struct sradix_tree_node *node, unsigned long index)
814 + unsigned int offset;
815 + struct sradix_tree_node *start, *end;
817 + BUG_ON(node->height != 1);
819 + start = node;
820 + while (node && !(--node->count))
821 + node = node->parent;
823 + end = node;
824 + if (!node) {
825 + root->rnode = NULL;
826 + root->height = 0;
827 + root->min = 0;
828 + root->num = 0;
829 + root->enter_node = NULL;
830 + } else {
831 + offset = (index >> (root->shift * (node->height - 1))) & root->mask;
832 + if (root->rm)
833 + root->rm(node, offset);
834 + node->stores[offset] = NULL;
835 + root->num--;
836 + if (root->min > index) {
837 + root->min = index;
838 + root->enter_node = node;
842 + if (start != end) {
843 + do {
844 + node = start;
845 + start = start->parent;
846 + if (unlikely(root->enter_node == node))
847 + root->enter_node = end;
848 + root->free(node);
849 + } while (start != end);
851 + /*
852 + * Note that shrink may free "end", so enter_node still needs to
853 + * be checked inside.
854 + */
855 + sradix_tree_shrink(root);
856 + } else if (node->count == root->stores_size - 1) {
857 + /* It WAS a full leaf node. Update the ancestors */
858 + node = node->parent;
859 + while (node) {
860 + node->fulls--;
861 + if (node->fulls != root->stores_size - 1)
862 + break;
864 + node = node->parent;
869 +void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index)
871 + unsigned int height, offset;
872 + struct sradix_tree_node *node;
873 + int shift;
875 + node = root->rnode;
876 + if (node == NULL || (index >> (root->shift * root->height)))
877 + return NULL;
879 + height = root->height;
880 + shift = (height - 1) * root->shift;
882 + do {
883 + offset = (index >> shift) & root->mask;
884 + node = node->stores[offset];
885 + if (!node)
886 + return NULL;
888 + shift -= root->shift;
889 + } while (shift >= 0);
891 + return node;
895 + * Return the item if it exists, otherwise create it in place
896 + * and return the created item.
897 + */
898 +void *sradix_tree_lookup_create(struct sradix_tree_root *root,
899 + unsigned long index, void *(*item_alloc)(void))
901 + unsigned int height, offset;
902 + struct sradix_tree_node *node, *tmp;
903 + void *item;
904 + int shift, error;
906 + if (root->rnode == NULL || (index >> (root->shift * root->height))) {
907 + if (item_alloc) {
908 + error = sradix_tree_extend(root, index);
909 + if (error)
910 + return NULL;
911 + } else {
912 + return NULL;
916 + node = root->rnode;
917 + height = root->height;
918 + shift = (height - 1) * root->shift;
920 + do {
921 + offset = (index >> shift) & root->mask;
922 + if (!node->stores[offset]) {
923 + if (!(tmp = root->alloc()))
924 + return NULL;
926 + tmp->height = shift / root->shift;
927 + node->stores[offset] = tmp;
928 + tmp->parent = node;
929 + node->count++;
930 + node = tmp;
931 + } else {
932 + node = node->stores[offset];
935 + shift -= root->shift;
936 + } while (shift > 0);
938 + BUG_ON(node->height != 1);
939 + offset = index & root->mask;
940 + if (node->stores[offset]) {
941 + return node->stores[offset];
942 + } else if (item_alloc) {
943 + if (!(item = item_alloc()))
944 + return NULL;
946 + node->stores[offset] = item;
948 + /*
949 + * NOTE: we do NOT call root->assign here, since this item is
950 + * newly created by us and has no meaning yet. The caller can call it
951 + * if necessary.
952 + */
954 + node->count++;
955 + root->num++;
957 + while (sradix_node_full(root, node)) {
958 + node = node->parent;
959 + if (!node)
960 + break;
962 + node->fulls++;
965 + if (unlikely(!node)) {
966 + /* All nodes are full */
967 + root->min = 1 << (root->height * root->shift);
968 + } else {
969 + if (root->min == index) {
970 + root->min |= (1UL << (node->height - 1)) - 1;
971 + root->min++;
972 + root->enter_node = node;
976 + return item;
977 + } else {
978 + return NULL;
983 +int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index)
985 + unsigned int height, offset;
986 + struct sradix_tree_node *node;
987 + int shift;
989 + node = root->rnode;
990 + if (node == NULL || (index >> (root->shift * root->height)))
991 + return -ENOENT;
993 + height = root->height;
994 + shift = (height - 1) * root->shift;
996 + do {
997 + offset = (index >> shift) & root->mask;
998 + node = node->stores[offset];
999 + if (!node)
1000 + return -ENOENT;
1002 + shift -= root->shift;
1003 + } while (shift > 0);
1005 + offset = index & root->mask;
1006 + if (!node->stores[offset])
1007 + return -ENOENT;
1009 + sradix_tree_delete_from_leaf(root, node, index);
1011 + return 0;
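Every walk above reduces an index to one offset per level with offset = (index >> shift) & root->mask, starting from shift = (height - 1) * root->shift. A standalone sketch of that arithmetic (not part of the patch) for a node shift of 8, the SLOT_TREE_NODE_SHIFT that mm/uksm.c's slot tree uses further down:

#include <stdio.h>

int main(void)
{
	unsigned long index = 0x12345;		/* arbitrary key */
	unsigned int node_shift = 8;		/* root->shift */
	unsigned long mask = (1UL << node_shift) - 1;
	unsigned int height = 3;		/* enough levels for this key */
	int shift;

	/* Same descent as sradix_tree_lookup(): one masked slice per level. */
	for (shift = (height - 1) * node_shift; shift >= 0; shift -= node_shift)
		printf("level shift %2d -> offset %lu\n",
		       shift, (index >> shift) & mask);
	/* Prints 1, 35, 69: 0x12345 == (1 << 16) | (0x23 << 8) | 0x45. */
	return 0;
}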
1013 diff --git a/mm/Kconfig b/mm/Kconfig
1014 index 24c045b24..3ce98ecc2 100644
1015 --- a/mm/Kconfig
1016 +++ b/mm/Kconfig
1017 @@ -317,6 +317,32 @@ config KSM
1018 See Documentation/vm/ksm.rst for more information: KSM is inactive
1019 until a program has madvised that an area is MADV_MERGEABLE, and
1020 root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
1021 +choice
1022 + prompt "Choose UKSM/KSM strategy"
1023 + default UKSM
1024 + depends on KSM
1025 + help
1026 + This option allows you to select a UKSM/KSM strategy.
1028 +config UKSM
1029 + bool "Ultra-KSM for page merging"
1030 + depends on KSM
1031 + help
1032 + UKSM is inspired by the Linux kernel project KSM (Kernel Samepage
1033 + Merging), but with a fundamentally rewritten core algorithm. With
1034 + an advanced algorithm, UKSM can now transparently scan all anonymously
1035 + mapped user space applications with significantly improved scan speed
1036 + and CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit from
1037 + UKSM. UKSM now has its first stable release and its first real-world enterprise user.
1038 + For more information, please go to its project page:
1039 + (github.com/dolohow/uksm)
1041 +config KSM_LEGACY
1042 + bool "Legacy KSM implementation"
1043 + depends on KSM
1044 + help
1045 + The legacy KSM implementation from Red Hat.
1046 +endchoice
1048 config DEFAULT_MMAP_MIN_ADDR
1049 int "Low address space to protect from user allocation"
1050 diff --git a/mm/Makefile b/mm/Makefile
1051 index 72227b24a..fd50a3a51 100644
1052 --- a/mm/Makefile
1053 +++ b/mm/Makefile
1054 @@ -76,7 +76,8 @@ obj-$(CONFIG_SPARSEMEM) += sparse.o
1055 obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
1056 obj-$(CONFIG_SLOB) += slob.o
1057 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
1058 -obj-$(CONFIG_KSM) += ksm.o
1059 +obj-$(CONFIG_KSM_LEGACY) += ksm.o
1060 +obj-$(CONFIG_UKSM) += uksm.o
1061 obj-$(CONFIG_PAGE_POISONING) += page_poison.o
1062 obj-$(CONFIG_SLAB) += slab.o
1063 obj-$(CONFIG_SLUB) += slub.o
1064 diff --git a/mm/ksm.c b/mm/ksm.c
1065 index 9694ee2c7..63af6a528 100644
1066 --- a/mm/ksm.c
1067 +++ b/mm/ksm.c
1068 @@ -858,17 +858,6 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
1069 return err;
1072 -static inline struct stable_node *page_stable_node(struct page *page)
1074 - return PageKsm(page) ? page_rmapping(page) : NULL;
1077 -static inline void set_page_stable_node(struct page *page,
1078 - struct stable_node *stable_node)
1080 - page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
1083 #ifdef CONFIG_SYSFS
1085 * Only called through the sysfs control interface:
1086 diff --git a/mm/memory.c b/mm/memory.c
1087 index 550405fc3..b4005b195 100644
1088 --- a/mm/memory.c
1089 +++ b/mm/memory.c
1090 @@ -158,6 +158,25 @@ EXPORT_SYMBOL(zero_pfn);
1092 unsigned long highest_memmap_pfn __read_mostly;
1094 +#ifdef CONFIG_UKSM
1095 +unsigned long uksm_zero_pfn __read_mostly;
1096 +EXPORT_SYMBOL_GPL(uksm_zero_pfn);
1097 +struct page *empty_uksm_zero_page;
1099 +static int __init setup_uksm_zero_page(void)
1101 + empty_uksm_zero_page = alloc_pages(__GFP_ZERO & ~__GFP_MOVABLE, 0);
1102 + if (!empty_uksm_zero_page)
1103 + panic("Oh boy, that early out of memory?");
1105 + SetPageReserved(empty_uksm_zero_page);
1106 + uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page);
1108 + return 0;
1110 +core_initcall(setup_uksm_zero_page);
1111 +#endif
1114 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
1116 @@ -173,6 +192,7 @@ void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
1117 trace_rss_stat(mm, member, count);
1121 #if defined(SPLIT_RSS_COUNTING)
1123 void sync_mm_rss(struct mm_struct *mm)
1124 @@ -875,6 +895,11 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1125 get_page(page);
1126 page_dup_rmap(page, false);
1127 rss[mm_counter(page)]++;
1129 + /* Should return NULL in vm_normal_page() */
1130 + uksm_bugon_zeropage(pte);
1131 + } else {
1132 + uksm_map_zero_page(pte);
1136 @@ -1254,8 +1279,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
1137 ptent = ptep_get_and_clear_full(mm, addr, pte,
1138 tlb->fullmm);
1139 tlb_remove_tlb_entry(tlb, pte, addr);
1140 - if (unlikely(!page))
1141 + if (unlikely(!page)) {
1142 + uksm_unmap_zero_page(ptent);
1143 continue;
1146 if (!PageAnon(page)) {
1147 if (pte_dirty(ptent)) {
1148 @@ -2603,6 +2630,7 @@ static inline bool cow_user_page(struct page *dst, struct page *src,
1150 if (likely(src)) {
1151 copy_user_highpage(dst, src, addr, vma);
1152 + uksm_cow_page(vma, src);
1153 return true;
1156 @@ -2849,6 +2877,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
1157 vmf->address);
1158 if (!new_page)
1159 goto oom;
1160 + uksm_cow_pte(vma, vmf->orig_pte);
1161 } else {
1162 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
1163 vmf->address);
1164 @@ -2891,7 +2920,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
1165 mm_counter_file(old_page));
1166 inc_mm_counter_fast(mm, MM_ANONPAGES);
1168 + uksm_bugon_zeropage(vmf->orig_pte);
1169 } else {
1170 + uksm_unmap_zero_page(vmf->orig_pte);
1171 inc_mm_counter_fast(mm, MM_ANONPAGES);
1173 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
1174 diff --git a/mm/mmap.c b/mm/mmap.c
1175 index 3f287599a..dc719db43 100644
1176 --- a/mm/mmap.c
1177 +++ b/mm/mmap.c
1178 @@ -46,6 +46,7 @@
1179 #include <linux/moduleparam.h>
1180 #include <linux/pkeys.h>
1181 #include <linux/oom.h>
1182 +#include <linux/ksm.h>
1183 #include <linux/sched/mm.h>
1185 #include <linux/uaccess.h>
1186 @@ -181,6 +182,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
1187 if (vma->vm_file)
1188 fput(vma->vm_file);
1189 mpol_put(vma_policy(vma));
1190 + uksm_remove_vma(vma);
1191 vm_area_free(vma);
1192 return next;
1194 @@ -748,9 +750,16 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
1195 long adjust_next = 0;
1196 int remove_next = 0;
1199 + * to avoid deadlock, uksm_remove_vma must be done before any spin_lock is
1200 + * acquired
1201 + */
1202 + uksm_remove_vma(vma);
1204 if (next && !insert) {
1205 struct vm_area_struct *exporter = NULL, *importer = NULL;
1207 + uksm_remove_vma(next);
1208 if (end >= next->vm_end) {
1210 * vma expands, overlapping all the next, and
1211 @@ -881,6 +890,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
1212 end_changed = true;
1214 vma->vm_pgoff = pgoff;
1216 if (adjust_next) {
1217 next->vm_start += adjust_next;
1218 next->vm_pgoff += adjust_next >> PAGE_SHIFT;
1219 @@ -985,6 +995,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
1220 if (remove_next == 2) {
1221 remove_next = 1;
1222 end = next->vm_end;
1223 + uksm_remove_vma(next);
1224 goto again;
1226 else if (next)
1227 @@ -1011,10 +1022,14 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
1229 VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma));
1231 + } else {
1232 + if (next && !insert)
1233 + uksm_vma_add_new(next);
1235 if (insert && file)
1236 uprobe_mmap(insert);
1238 + uksm_vma_add_new(vma);
1239 validate_mm(mm);
1241 return 0;
1242 @@ -1470,6 +1485,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
1243 vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
1244 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
1246 + /* If uksm is enabled, we add VM_MERGEABLE to new VMAs. */
1247 + uksm_vm_flags_mod(&vm_flags);
1249 if (flags & MAP_LOCKED)
1250 if (!can_do_mlock())
1251 return -EPERM;
1252 @@ -1865,6 +1883,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
1253 allow_write_access(file);
1255 file = vma->vm_file;
1256 + uksm_vma_add_new(vma);
1257 out:
1258 perf_event_mmap(vma);
1260 @@ -1907,6 +1926,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
1261 if (vm_flags & VM_DENYWRITE)
1262 allow_write_access(file);
1263 free_vma:
1264 + uksm_remove_vma(vma);
1265 vm_area_free(vma);
1266 unacct_error:
1267 if (charged)
1268 @@ -2766,6 +2786,8 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
1269 else
1270 err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
1272 + uksm_vma_add_new(new);
1274 /* Success. */
1275 if (!err)
1276 return 0;
1277 @@ -3073,6 +3095,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
1278 if ((flags & (~VM_EXEC)) != 0)
1279 return -EINVAL;
1280 flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
1281 + uksm_vm_flags_mod(&flags);
1283 mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
1284 if (IS_ERR_VALUE(mapped_addr))
1285 @@ -3118,6 +3141,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
1286 vma->vm_flags = flags;
1287 vma->vm_page_prot = vm_get_page_prot(flags);
1288 vma_link(mm, vma, prev, rb_link, rb_parent);
1289 + uksm_vma_add_new(vma);
1290 out:
1291 perf_event_mmap(vma);
1292 mm->total_vm += len >> PAGE_SHIFT;
1293 @@ -3195,6 +3219,12 @@ void exit_mmap(struct mm_struct *mm)
1294 mmap_write_unlock(mm);
1297 + /*
1298 + * Taking write lock on mmap does not harm others,
1299 + * but it's crucial for uksm to avoid races.
1300 + */
1301 + mmap_write_lock(mm);
1303 if (mm->locked_vm) {
1304 vma = mm->mmap;
1305 while (vma) {
1306 @@ -3230,6 +3260,11 @@ void exit_mmap(struct mm_struct *mm)
1307 cond_resched();
1309 vm_unacct_memory(nr_accounted);
1311 + mm->mmap = NULL;
1312 + mm->mm_rb = RB_ROOT;
1313 + vmacache_invalidate(mm);
1314 + mmap_write_unlock(mm);
1317 /* Insert vm structure into process list sorted by address
1318 @@ -3337,6 +3372,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
1319 new_vma->vm_ops->open(new_vma);
1320 vma_link(mm, new_vma, prev, rb_link, rb_parent);
1321 *need_rmap_locks = false;
1322 + uksm_vma_add_new(new_vma);
1324 return new_vma;
1326 @@ -3505,6 +3541,7 @@ static struct vm_area_struct *__install_special_mapping(
1327 vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);
1329 perf_event_mmap(vma);
1330 + uksm_vma_add_new(vma);
1332 return vma;
1334 diff --git a/mm/uksm.c b/mm/uksm.c
1335 new file mode 100644
1336 index 000000000..e4732c00b
1337 --- /dev/null
1338 +++ b/mm/uksm.c
1339 @@ -0,0 +1,5614 @@
1341 + * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
1343 + * This is an improvement upon KSM. Some basic data structures and routines
1344 + * are borrowed from ksm.c .
1346 + * Its new features:
1347 + * 1. Full system scan:
1348 + * It automatically scans all user processes' anonymous VMAs. Kernel-user
1349 + * interaction to submit a memory area to KSM is no longer needed.
1351 + * 2. Rich area detection:
1352 + * It automatically detects rich areas containing abundant duplicated
1353 + * pages. Rich areas are given a full scan speed. Poor areas are
1354 + * sampled at a reasonable speed with very low CPU consumption.
1356 + * 3. Ultra Per-page scan speed improvement:
1357 + * A new hash algorithm is proposed. As a result, on a machine with
1358 + * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it
1359 + * can scan memory areas that do not contain duplicated pages at a speed of
1360 + * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of
1361 + * 477MB/sec ~ 923MB/sec.
1363 + * 4. Thrashing area avoidance:
1364 + * Thrashing areas (VMAs that have frequent KSM page break-outs) can be
1365 + * filtered out. My benchmark shows it's more efficient than KSM's per-page
1366 + * hash value based volatile page detection.
1369 + * 5. Misc changes upon KSM:
1370 + * * It has a fully x86-optimized memcmp dedicated for 4-byte-aligned page
1371 + * comparison. It's much faster than default C version on x86.
1372 + * * rmap_item now has a struct page *page member to loosely cache an
1373 + * address->page mapping, which avoids many time-costly
1374 + * follow_page() calls.
1375 + * * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
1376 + * * try_to_merge_two_pages() now can revert a pte if it fails. No break_
1377 + * ksm is needed for this case.
1379 + * 6. Full Zero Page consideration (contributed by Figo Zhang)
1380 + * Now uksmd considers full zero pages as special pages and merges them into a
1381 + * special unswappable uksm zero page.
1382 + */
1384 +#include <linux/errno.h>
1385 +#include <linux/mm.h>
1386 +#include <linux/fs.h>
1387 +#include <linux/mman.h>
1388 +#include <linux/sched.h>
1389 +#include <linux/sched/mm.h>
1390 +#include <linux/sched/coredump.h>
1391 +#include <linux/sched/cputime.h>
1392 +#include <linux/rwsem.h>
1393 +#include <linux/pagemap.h>
1394 +#include <linux/rmap.h>
1395 +#include <linux/spinlock.h>
1396 +#include <linux/jhash.h>
1397 +#include <linux/delay.h>
1398 +#include <linux/kthread.h>
1399 +#include <linux/wait.h>
1400 +#include <linux/slab.h>
1401 +#include <linux/rbtree.h>
1402 +#include <linux/memory.h>
1403 +#include <linux/mmu_notifier.h>
1404 +#include <linux/swap.h>
1405 +#include <linux/ksm.h>
1406 +#include <linux/crypto.h>
1407 +#include <linux/scatterlist.h>
1408 +#include <crypto/hash.h>
1409 +#include <linux/random.h>
1410 +#include <linux/math64.h>
1411 +#include <linux/gcd.h>
1412 +#include <linux/freezer.h>
1413 +#include <linux/oom.h>
1414 +#include <linux/numa.h>
1415 +#include <linux/sradix-tree.h>
1417 +#include <asm/tlbflush.h>
1418 +#include "internal.h"
1420 +#ifdef CONFIG_X86
1421 +#undef memcmp
1423 +#ifdef CONFIG_X86_32
1424 +#define memcmp memcmpx86_32
1426 + * Compare 4-byte-aligned addresses s1 and s2, with length n
1427 + */
1428 +int memcmpx86_32(void *s1, void *s2, size_t n)
1430 + size_t num = n / 4;
1431 + register int res;
1433 + __asm__ __volatile__
1435 + "testl %3,%3\n\t"
1436 + "repe; cmpsd\n\t"
1437 + "je 1f\n\t"
1438 + "sbbl %0,%0\n\t"
1439 + "orl $1,%0\n"
1440 + "1:"
1441 + : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
1442 + : "0" (0)
1443 + : "cc");
1445 + return res;
1449 + * Check whether the page is all zero.
1450 + */
1451 +static int is_full_zero(const void *s1, size_t len)
1453 + unsigned char same;
1455 + len /= 4;
1457 + __asm__ __volatile__
1458 + ("repe; scasl;"
1459 + "sete %0"
1460 + : "=qm" (same), "+D" (s1), "+c" (len)
1461 + : "a" (0)
1462 + : "cc");
1464 + return same;
1468 +#elif defined(CONFIG_X86_64)
1469 +#define memcmp memcmpx86_64
1471 + * Compare 8-byte-aligned addresses s1 and s2, with length n
1472 + */
1473 +int memcmpx86_64(void *s1, void *s2, size_t n)
1475 + size_t num = n / 8;
1476 + register int res;
1478 + __asm__ __volatile__
1480 + "testq %q3,%q3\n\t"
1481 + "repe; cmpsq\n\t"
1482 + "je 1f\n\t"
1483 + "sbbq %q0,%q0\n\t"
1484 + "orq $1,%q0\n"
1485 + "1:"
1486 + : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
1487 + : "0" (0)
1488 + : "cc");
1490 + return res;
1493 +static int is_full_zero(const void *s1, size_t len)
1495 + unsigned char same;
1497 + len /= 8;
1499 + __asm__ __volatile__
1500 + ("repe; scasq;"
1501 + "sete %0"
1502 + : "=qm" (same), "+D" (s1), "+c" (len)
1503 + : "a" (0)
1504 + : "cc");
1506 + return same;
1509 +#endif
1510 +#else
1511 +static int is_full_zero(const void *s1, size_t len)
1513 + unsigned long *src = s1;
1514 + int i;
1516 + len /= sizeof(*src);
1518 + for (i = 0; i < len; i++) {
1519 + if (src[i])
1520 + return 0;
1523 + return 1;
1525 +#endif
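The x86 assembly above only accelerates two primitives: a word-wise memcmp for page comparison and a repe/scas based all-zero test. A user-space copy of the generic is_full_zero() fallback (not part of the patch), run against a clean and a dirtied page:

#include <stdio.h>
#include <stdlib.h>

#define MY_PAGE_SIZE 4096

static int is_full_zero(const void *s1, size_t len)
{
	const unsigned long *src = s1;
	size_t i;

	len /= sizeof(*src);
	for (i = 0; i < len; i++)
		if (src[i])
			return 0;
	return 1;
}

int main(void)
{
	unsigned char *page = calloc(1, MY_PAGE_SIZE);

	if (!page)
		return 1;
	printf("zeroed page: %d\n", is_full_zero(page, MY_PAGE_SIZE));	/* 1 */
	page[123] = 1;
	printf("dirty page:  %d\n", is_full_zero(page, MY_PAGE_SIZE));	/* 0 */
	free(page);
	return 0;
}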
1527 +#define UKSM_RUNG_ROUND_FINISHED (1 << 0)
1528 +#define TIME_RATIO_SCALE 10000
1530 +#define SLOT_TREE_NODE_SHIFT 8
1531 +#define SLOT_TREE_NODE_STORE_SIZE (1UL << SLOT_TREE_NODE_SHIFT)
1532 +struct slot_tree_node {
1533 + unsigned long size;
1534 + struct sradix_tree_node snode;
1535 + void *stores[SLOT_TREE_NODE_STORE_SIZE];
1538 +static struct kmem_cache *slot_tree_node_cachep;
1540 +static struct sradix_tree_node *slot_tree_node_alloc(void)
1542 + struct slot_tree_node *p;
1544 + p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL |
1545 + __GFP_NORETRY | __GFP_NOWARN);
1546 + if (!p)
1547 + return NULL;
1549 + return &p->snode;
1552 +static void slot_tree_node_free(struct sradix_tree_node *node)
1554 + struct slot_tree_node *p;
1556 + p = container_of(node, struct slot_tree_node, snode);
1557 + kmem_cache_free(slot_tree_node_cachep, p);
1560 +static void slot_tree_node_extend(struct sradix_tree_node *parent,
1561 + struct sradix_tree_node *child)
1563 + struct slot_tree_node *p, *c;
1565 + p = container_of(parent, struct slot_tree_node, snode);
1566 + c = container_of(child, struct slot_tree_node, snode);
1568 + p->size += c->size;
1571 +void slot_tree_node_assign(struct sradix_tree_node *node,
1572 + unsigned int index, void *item)
1574 + struct vma_slot *slot = item;
1575 + struct slot_tree_node *cur;
1577 + slot->snode = node;
1578 + slot->sindex = index;
1580 + while (node) {
1581 + cur = container_of(node, struct slot_tree_node, snode);
1582 + cur->size += slot->pages;
1583 + node = node->parent;
1587 +void slot_tree_node_rm(struct sradix_tree_node *node, unsigned int offset)
1589 + struct vma_slot *slot;
1590 + struct slot_tree_node *cur;
1591 + unsigned long pages;
1593 + if (node->height == 1) {
1594 + slot = node->stores[offset];
1595 + pages = slot->pages;
1596 + } else {
1597 + cur = container_of(node->stores[offset],
1598 + struct slot_tree_node, snode);
1599 + pages = cur->size;
1602 + while (node) {
1603 + cur = container_of(node, struct slot_tree_node, snode);
1604 + cur->size -= pages;
1605 + node = node->parent;
1609 +unsigned long slot_iter_index;
1610 +int slot_iter(void *item, unsigned long height)
1612 + struct slot_tree_node *node;
1613 + struct vma_slot *slot;
1615 + if (height == 1) {
1616 + slot = item;
1617 + if (slot_iter_index < slot->pages) {
1618 + /*in this one*/
1619 + return 1;
1620 + } else {
1621 + slot_iter_index -= slot->pages;
1622 + return 0;
1625 + } else {
1626 + node = container_of(item, struct slot_tree_node, snode);
1627 + if (slot_iter_index < node->size) {
1628 + /*in this one*/
1629 + return 1;
1630 + } else {
1631 + slot_iter_index -= node->size;
1632 + return 0;
1638 +static inline void slot_tree_init_root(struct sradix_tree_root *root)
1640 + init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT);
1641 + root->alloc = slot_tree_node_alloc;
1642 + root->free = slot_tree_node_free;
1643 + root->extend = slot_tree_node_extend;
1644 + root->assign = slot_tree_node_assign;
1645 + root->rm = slot_tree_node_rm;
1648 +void slot_tree_init(void)
1650 + slot_tree_node_cachep = kmem_cache_create("slot_tree_node",
1651 + sizeof(struct slot_tree_node), 0,
1652 + SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
1653 + NULL);
1657 +/* Each rung of this ladder is a list of VMAs having the same scan ratio */
1658 +struct scan_rung {
1659 + //struct list_head scanned_list;
1660 + struct sradix_tree_root vma_root;
1661 + struct sradix_tree_root vma_root2;
1663 + struct vma_slot *current_scan;
1664 + unsigned long current_offset;
1666 + /*
1667 + * The initial value for current_offset; it should loop over
1668 + * [0 ~ step - 1] so that every slot has a chance to be scanned.
1669 + */
1670 + unsigned long offset_init;
1671 + unsigned long step; /* dynamic step for current_offset */
1672 + unsigned int flags;
1673 + unsigned long pages_to_scan;
1674 + //unsigned long fully_scanned_slots;
1675 + /*
1676 + * a little bit tricky - if cpu_time_ratio > 0, then the value is
1677 + * the cpu time ratio it can spend in rung_i for every scan
1678 + * period. if < 0, then it is the cpu time ratio relative to the
1679 + * max cpu percentage user specified. Both in unit of
1680 + * 1/TIME_RATIO_SCALE
1681 + */
1682 + int cpu_ratio;
1684 + /*
1685 + * How long will it take for all slots in this rung to be fully
1686 + * scanned? If it's zero, we don't care about the cover time:
1687 + * it's fully scanned.
1688 + */
1689 + unsigned int cover_msecs;
1690 + //unsigned long vma_num;
1691 + //unsigned long pages; /* Sum of all slot's pages in rung */
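The cpu_ratio comment above is easiest to read with numbers: values are in units of 1/TIME_RATIO_SCALE (10000), and a negative value is taken as a fraction of the user's max CPU percentage; with the uksm_cpu_preset row {20, 40, -2500, -10000} and max_cpu 95 further down, that works out to 0.2%, 0.4%, then 25% and 100% of the 95% budget. A small arithmetic sketch, not part of the patch:

#include <stdio.h>

#define TIME_RATIO_SCALE 10000

static double effective_cpu_percent(int cpu_ratio, unsigned int max_cpu)
{
	if (cpu_ratio >= 0)		/* absolute share of scan time */
		return 100.0 * cpu_ratio / TIME_RATIO_SCALE;
	/* negative: |ratio|/TIME_RATIO_SCALE of the user's max percentage */
	return (double)max_cpu * -cpu_ratio / TIME_RATIO_SCALE;
}

int main(void)
{
	int preset[4] = {20, 40, -2500, -10000};	/* "full" governor row */
	unsigned int max_cpu = 95;
	int i;

	for (i = 0; i < 4; i++)
		printf("rung %d: cpu_ratio %6d -> %.2f%% CPU\n",
		       i, preset[i], effective_cpu_percent(preset[i], max_cpu));
	return 0;
}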
1694 +/**
1695 + * node of either the stable or unstable rbtree
1697 + */
1698 +struct tree_node {
1699 + struct rb_node node; /* link in the main (un)stable rbtree */
1700 + struct rb_root sub_root; /* rb_root for sublevel collision rbtree */
1701 + u32 hash;
1702 + unsigned long count; /* TODO: merged with sub_root */
1703 + struct list_head all_list; /* all tree nodes in stable/unstable tree */
1706 +/**
1707 + * struct stable_node - node of the stable rbtree
1708 + * @node: rb node of this ksm page in the stable tree
1709 + * @hlist: hlist head of rmap_items using this ksm page
1710 + * @kpfn: page frame number of this ksm page
1711 + */
1712 +struct stable_node {
1713 + struct rb_node node; /* link in sub-rbtree */
1714 + struct tree_node *tree_node; /* its tree_node root in the stable tree, NULL if it's in the hell list */
1715 + struct hlist_head hlist;
1716 + unsigned long kpfn;
1717 + u32 hash_max; /* if ==0 then it's not been calculated yet */
1718 + struct list_head all_list; /* in a list for all stable nodes */
1721 +/**
1722 + * struct node_vma - group rmap_items linked in a same stable
1723 + * node together.
1724 + */
1725 +struct node_vma {
1726 + union {
1727 + struct vma_slot *slot;
1728 + unsigned long key; /* slot is used as key sorted on hlist */
1729 + };
1730 + struct hlist_node hlist;
1731 + struct hlist_head rmap_hlist;
1732 + struct stable_node *head;
1735 +/**
1736 + * struct rmap_item - reverse mapping item for virtual addresses
1737 + * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
1738 + * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
1739 + * @mm: the memory structure this rmap_item is pointing into
1740 + * @address: the virtual address this rmap_item tracks (+ flags in low bits)
1741 + * @node: rb node of this rmap_item in the unstable tree
1742 + * @head: pointer to stable_node heading this list in the stable tree
1743 + * @hlist: link into hlist of rmap_items hanging off that stable_node
1744 + */
1745 +struct rmap_item {
1746 + struct vma_slot *slot;
1747 + struct page *page;
1748 + unsigned long address; /* + low bits used for flags below */
1749 + unsigned long hash_round;
1750 + unsigned long entry_index;
1751 + union {
1752 + struct {/* when in unstable tree */
1753 + struct rb_node node;
1754 + struct tree_node *tree_node;
1755 + u32 hash_max;
1756 + };
1757 + struct { /* when in stable tree */
1758 + struct node_vma *head;
1759 + struct hlist_node hlist;
1760 + struct anon_vma *anon_vma;
1761 + };
1762 + };
1763 +} __aligned(4);
1765 +struct rmap_list_entry {
1766 + union {
1767 + struct rmap_item *item;
1768 + unsigned long addr;
1769 + };
1770 + /* lowest bit is used for is_addr tag */
1771 +} __aligned(4); /* 4-byte aligned to fit into pages */
1774 +/* Basic data structure definition ends */
1778 + * Flags for rmap_item to judge if it's listed in the stable/unstable tree.
1779 + * The flags use the low bits of rmap_item.address
1780 + */
1781 +#define UNSTABLE_FLAG 0x1
1782 +#define STABLE_FLAG 0x2
1783 +#define get_rmap_addr(x) ((x)->address & PAGE_MASK)
1786 + * rmap_list_entry helpers
1787 + */
1788 +#define IS_ADDR_FLAG 1
1789 +#define is_addr(ptr) ((unsigned long)(ptr) & IS_ADDR_FLAG)
1790 +#define set_is_addr(ptr) ((ptr) |= IS_ADDR_FLAG)
1791 +#define get_clean_addr(ptr) (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG))
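rmap_list_entry above packs either a struct rmap_item pointer or a raw address into one word and uses the word's low bit as the is_addr tag, which is safe because both values are at least 4-byte aligned. A standalone sketch of that tagging trick (not part of the patch), mirroring the macros above:

#include <assert.h>
#include <stdio.h>

#define IS_ADDR_FLAG		1UL
#define is_addr(ptr)		((unsigned long)(ptr) & IS_ADDR_FLAG)
#define set_is_addr(ptr)	((ptr) |= IS_ADDR_FLAG)
#define get_clean_addr(ptr)	((ptr) & ~IS_ADDR_FLAG)

int main(void)
{
	unsigned long slot;			/* plays the rmap_list_entry role */
	int item;				/* stands in for a struct rmap_item */
	unsigned long addr = 0x10001000UL;	/* a page-aligned address */

	slot = (unsigned long)&item;		/* store a pointer: bit 0 clear */
	assert(!is_addr(slot));

	slot = addr;				/* store an address: set the tag */
	set_is_addr(slot);
	assert(is_addr(slot));
	assert(get_clean_addr(slot) == addr);

	printf("tagged 0x%lx, clean 0x%lx\n", slot, get_clean_addr(slot));
	return 0;
}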
1795 + * High speed caches for frequently allocated and freed structs
1796 + */
1797 +static struct kmem_cache *rmap_item_cache;
1798 +static struct kmem_cache *stable_node_cache;
1799 +static struct kmem_cache *node_vma_cache;
1800 +static struct kmem_cache *vma_slot_cache;
1801 +static struct kmem_cache *tree_node_cache;
1802 +#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\
1803 + sizeof(struct __struct), __alignof__(struct __struct),\
1804 + (__flags), NULL)
1806 +/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */
1807 +#define SCAN_LADDER_SIZE 4
1808 +static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE];
1810 +/* The evaluation rounds uksmd has finished */
1811 +static unsigned long long uksm_eval_round = 1;
1814 + * we add 1 to this var when we consider we should rebuild the whole
1815 + * unstable tree.
1816 + */
1817 +static unsigned long uksm_hash_round = 1;
1820 + * How many times the whole memory is scanned.
1821 + */
1822 +static unsigned long long fully_scanned_round = 1;
1824 +/* The total number of virtual pages of all vma slots */
1825 +static u64 uksm_pages_total;
1827 +/* The number of pages that have been scanned since startup */
1828 +static u64 uksm_pages_scanned;
1830 +static u64 scanned_virtual_pages;
1832 +/* The number of pages that have been scanned since the last encode_benefit call */
1833 +static u64 uksm_pages_scanned_last;
1835 +/* If the scanned number is too large, we encode it here */
1836 +static u64 pages_scanned_stored;
1838 +static unsigned long pages_scanned_base;
1840 +/* The number of nodes in the stable tree */
1841 +static unsigned long uksm_pages_shared;
1843 +/* The number of page slots additionally sharing those nodes */
1844 +static unsigned long uksm_pages_sharing;
1846 +/* The number of nodes in the unstable tree */
1847 +static unsigned long uksm_pages_unshared;
1850 + * Milliseconds ksmd should sleep between scans,
1851 + * >= 100ms to be consistent with
1852 + * scan_time_to_sleep_msec()
1853 + */
1854 +static unsigned int uksm_sleep_jiffies;
1856 +/* The real value for the uksmd next sleep */
1857 +static unsigned int uksm_sleep_real;
1859 +/* Saved value for user input uksm_sleep_jiffies when it's enlarged */
1860 +static unsigned int uksm_sleep_saved;
1862 +/* Max percentage of cpu utilization ksmd can take to scan in one batch */
1863 +static unsigned int uksm_max_cpu_percentage;
1865 +static int uksm_cpu_governor;
1867 +static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" };
1869 +struct uksm_cpu_preset_s {
1870 + int cpu_ratio[SCAN_LADDER_SIZE];
1871 + unsigned int cover_msecs[SCAN_LADDER_SIZE];
1872 + unsigned int max_cpu; /* percentage */
1875 +struct uksm_cpu_preset_s uksm_cpu_preset[4] = {
1876 + { {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95},
1877 + { {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50},
1878 + { {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20},
1879 + { {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1},
1882 +/* The default value for uksm_ema_page_time if it's not initialized */
1883 +#define UKSM_PAGE_TIME_DEFAULT 500
1885 +/* cost to scan one page, as an exponential moving average, in nsecs */
1886 +static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
1888 +/* The exponential moving average alpha weight, in percentage. */
1889 +#define EMA_ALPHA 20
1892 + * The threshold used to filter out thrashing areas.
1893 + * If it is 0, filtering is disabled; otherwise it is the percentage upper bound
1894 + * on the thrashing ratio of all areas. Any area with a bigger thrashing ratio
1895 + * will be considered as having a zero duplication ratio.
1896 + */
1897 +static unsigned int uksm_thrash_threshold = 50;
1899 +/* How much dedup ratio is considered abundant */
1900 +static unsigned int uksm_abundant_threshold = 10;
1902 +/* All slots having merged pages in this eval round. */
1903 +struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup);
1905 +/* How many times the ksmd has slept since startup */
1906 +static unsigned long long uksm_sleep_times;
1908 +#define UKSM_RUN_STOP 0
1909 +#define UKSM_RUN_MERGE 1
1910 +static unsigned int uksm_run = 1;
1912 +static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait);
1913 +static DEFINE_MUTEX(uksm_thread_mutex);
1916 + * List vma_slot_new is for newly created vma_slot waiting to be added by
1917 + * ksmd. If one cannot be added (e.g. because it is too small), it is moved to
1918 + * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding
1919 + * VMA has been removed/freed.
1920 + */
1921 +struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new);
1922 +struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd);
1923 +struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del);
1924 +static DEFINE_SPINLOCK(vma_slot_list_lock);
1926 +/* The unstable tree heads */
1927 +static struct rb_root root_unstable_tree = RB_ROOT;
1930 + * All tree_nodes are in a list to be freed at once when unstable tree is
1931 + * freed after each scan round.
1932 + */
1933 +static struct list_head unstable_tree_node_list =
1934 + LIST_HEAD_INIT(unstable_tree_node_list);
1936 +/* List contains all stable nodes */
1937 +static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list);
1940 + * When the hash strength is changed, the stable tree must be delta_hashed and
1941 + * re-structured. We use two sets of the structs below to speed up the
1942 + * re-structuring of the stable tree.
1943 + */
1944 +static struct list_head
1945 +stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]),
1946 + LIST_HEAD_INIT(stable_tree_node_list[1])};
1948 +static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0];
1949 +static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT};
1950 +static struct rb_root *root_stable_treep = &root_stable_tree[0];
1951 +static unsigned long stable_tree_index;
1953 +/* The hash strength needed to hash a full page */
1954 +#define HASH_STRENGTH_FULL (PAGE_SIZE / sizeof(u32))
1956 +/* The hash strength needed for loop-back hashing */
1957 +#define HASH_STRENGTH_MAX (HASH_STRENGTH_FULL + 10)
1959 +/* The random offsets in a page */
1960 +static u32 *random_nums;
1962 +/* The hash strength */
1963 +static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4;
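/*
 * Worked example, assuming 4 KiB pages: HASH_STRENGTH_FULL is
 * 4096 / sizeof(u32) == 1024 sampled u32 positions, HASH_STRENGTH_MAX is
 * 1024 + 10 == 1034, and the default hash_strength of
 * HASH_STRENGTH_FULL >> 4 samples only 64 random u32 offsets per page.
 */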
1965 +/* The delta value each time the hash strength increases or decreases */
1966 +static unsigned long hash_strength_delta;
1967 +#define HASH_STRENGTH_DELTA_MAX 5
1969 +/* The time we have saved due to random_sample_hash */
1970 +static u64 rshash_pos;
1972 +/* The time we have wasted due to hash collision */
1973 +static u64 rshash_neg;
1975 +struct uksm_benefit {
1976 + u64 pos;
1977 + u64 neg;
1978 + u64 scanned;
1979 + unsigned long base;
1980 +} benefit;
1983 + * The relative cost of memcmp, compared to 1 time unit of random sample
1984 + * hash; this value is determined when the ksm module is initialized
1985 + */
1986 +static unsigned long memcmp_cost;
1988 +static unsigned long rshash_neg_cont_zero;
1989 +static unsigned long rshash_cont_obscure;
1991 +/* The possible states of hash strength adjustment heuristic */
1992 +enum rshash_states {
1993 + RSHASH_STILL,
1994 + RSHASH_TRYUP,
1995 + RSHASH_TRYDOWN,
1996 + RSHASH_NEW,
1997 + RSHASH_PRE_STILL,
2000 +/* The possible direction we are about to adjust hash strength */
2001 +enum rshash_direct {
2002 + GO_UP,
2003 + GO_DOWN,
2004 + OBSCURE,
2005 + STILL,
2008 +/* random sampling hash state machine */
2009 +static struct {
2010 + enum rshash_states state;
2011 + enum rshash_direct pre_direct;
2012 + u8 below_count;
2013 + /* Keep a lookup window of size 5; if above_count/below_count > 3
2014 + * in this window, we stop trying.
2015 + */
2016 + u8 lookup_window_index;
2017 + u64 stable_benefit;
2018 + unsigned long turn_point_down;
2019 + unsigned long turn_benefit_down;
2020 + unsigned long turn_point_up;
2021 + unsigned long turn_benefit_up;
2022 + unsigned long stable_point;
2023 +} rshash_state;
2025 +/* zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX] */
2026 +static u32 *zero_hash_table;
2028 +static inline struct node_vma *alloc_node_vma(void)
2030 + struct node_vma *node_vma;
2032 + node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL |
2033 + __GFP_NORETRY | __GFP_NOWARN);
2034 + if (node_vma) {
2035 + INIT_HLIST_HEAD(&node_vma->rmap_hlist);
2036 + INIT_HLIST_NODE(&node_vma->hlist);
2038 + return node_vma;
2041 +static inline void free_node_vma(struct node_vma *node_vma)
2043 + kmem_cache_free(node_vma_cache, node_vma);
2047 +static inline struct vma_slot *alloc_vma_slot(void)
2049 + struct vma_slot *slot;
2051 + /*
2052 + * In case ksm has not been initialized yet.
2053 + * We need to reconsider the call site of uksm_init() in the future.
2054 + */
2055 + if (!vma_slot_cache)
2056 + return NULL;
2058 + slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL |
2059 + __GFP_NORETRY | __GFP_NOWARN);
2060 + if (slot) {
2061 + INIT_LIST_HEAD(&slot->slot_list);
2062 + INIT_LIST_HEAD(&slot->dedup_list);
2063 + slot->flags |= UKSM_SLOT_NEED_RERAND;
2065 + return slot;
2068 +static inline void free_vma_slot(struct vma_slot *vma_slot)
2070 + kmem_cache_free(vma_slot_cache, vma_slot);
2075 +static inline struct rmap_item *alloc_rmap_item(void)
2077 + struct rmap_item *rmap_item;
2079 + rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
2080 + __GFP_NORETRY | __GFP_NOWARN);
2081 + if (rmap_item) {
2082 + /* BUG if the lowest bit is not clear; it's reserved for flag use */
2083 + BUG_ON(is_addr(rmap_item));
2085 + return rmap_item;
2088 +static inline void free_rmap_item(struct rmap_item *rmap_item)
2090 + rmap_item->slot = NULL; /* debug safety */
2091 + kmem_cache_free(rmap_item_cache, rmap_item);
2094 +static inline struct stable_node *alloc_stable_node(void)
2096 + struct stable_node *node;
2098 + node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL |
2099 + __GFP_NORETRY | __GFP_NOWARN);
2100 + if (!node)
2101 + return NULL;
2103 + INIT_HLIST_HEAD(&node->hlist);
2104 + list_add(&node->all_list, &stable_node_list);
2105 + return node;
2108 +static inline void free_stable_node(struct stable_node *stable_node)
2110 + list_del(&stable_node->all_list);
2111 + kmem_cache_free(stable_node_cache, stable_node);
2114 +static inline struct tree_node *alloc_tree_node(struct list_head *list)
2116 + struct tree_node *node;
2118 + node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL |
2119 + __GFP_NORETRY | __GFP_NOWARN);
2120 + if (!node)
2121 + return NULL;
2123 + list_add(&node->all_list, list);
2124 + return node;
2127 +static inline void free_tree_node(struct tree_node *node)
2129 + list_del(&node->all_list);
2130 + kmem_cache_free(tree_node_cache, node);
2133 +static void uksm_drop_anon_vma(struct rmap_item *rmap_item)
2135 + struct anon_vma *anon_vma = rmap_item->anon_vma;
2137 + put_anon_vma(anon_vma);
2141 +/**
2142 + * Remove a stable node from stable_tree, may unlink from its tree_node and
2143 + * may remove its parent tree_node if no other stable node is pending.
2145 + * @stable_node The node to be removed
2146 + * @unlink_rb Will this node be unlinked from the rbtree?
2147 + * @remove_tree_node Will its tree_node be removed if empty?
2148 + */
2149 +static void remove_node_from_stable_tree(struct stable_node *stable_node,
2150 + int unlink_rb, int remove_tree_node)
2152 + struct node_vma *node_vma;
2153 + struct rmap_item *rmap_item;
2154 + struct hlist_node *n;
2156 + if (!hlist_empty(&stable_node->hlist)) {
2157 + hlist_for_each_entry_safe(node_vma, n,
2158 + &stable_node->hlist, hlist) {
2159 + hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) {
2160 + uksm_pages_sharing--;
2162 + uksm_drop_anon_vma(rmap_item);
2163 + rmap_item->address &= PAGE_MASK;
2165 + free_node_vma(node_vma);
2166 + cond_resched();
2169 + /* the last one is counted as shared */
2170 + uksm_pages_shared--;
2171 + uksm_pages_sharing++;
2174 + if (stable_node->tree_node && unlink_rb) {
2175 + rb_erase(&stable_node->node,
2176 + &stable_node->tree_node->sub_root);
2178 + if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) &&
2179 + remove_tree_node) {
2180 + rb_erase(&stable_node->tree_node->node,
2181 + root_stable_treep);
2182 + free_tree_node(stable_node->tree_node);
2183 + } else {
2184 + stable_node->tree_node->count--;
2188 + free_stable_node(stable_node);
2193 + * get_uksm_page: checks if the page indicated by the stable node
2194 + * is still its ksm page, despite having held no reference to it.
2195 + * In which case we can trust the content of the page, and it
2196 + * returns the gotten page; but if the page has now been zapped,
2197 + * remove the stale node from the stable tree and return NULL.
2199 + * You would expect the stable_node to hold a reference to the ksm page.
2200 + * But if it increments the page's count, swapping out has to wait for
2201 + * ksmd to come around again before it can free the page, which may take
2202 + * seconds or even minutes: much too unresponsive. So instead we use a
2203 + * "keyhole reference": access to the ksm page from the stable node peeps
2204 + * out through its keyhole to see if that page still holds the right key,
2205 + * pointing back to this stable node. This relies on freeing a PageAnon
2206 + * page to reset its page->mapping to NULL, and relies on no other use of
2207 + * a page to put something that might look like our key in page->mapping.
2209 + * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
2210 + * but this is different - made simpler by uksm_thread_mutex being held, but
2211 + * interesting for assuming that no other use of the struct page could ever
2212 + * put our expected_mapping into page->mapping (or a field of the union which
2213 + * coincides with page->mapping). The RCU calls are not for KSM at all, but
2214 + * to keep the page_count protocol described with page_cache_get_speculative.
2216 + * Note: it is possible that get_uksm_page() will return NULL one moment,
2217 + * then page the next, if the page is in between page_freeze_refs() and
2218 + * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
2219 + * is on its way to being freed; but it is an anomaly to bear in mind.
2221 + * @unlink_rb: whether the removal of this node will first unlink it from
2222 + * its rbtree. stable_node_reinsert will prevent this when restructuring the
2223 + * node from its old tree.
2225 + * @remove_tree_node: if this is the last one of its tree_node, will the
2226 + * tree_node be freed? If we are inserting a stable node, this tree_node may
2227 + * be reused, so don't free it.
2228 + */
2229 +static struct page *get_uksm_page(struct stable_node *stable_node,
2230 + int unlink_rb, int remove_tree_node)
2232 + struct page *page;
2233 + void *expected_mapping;
2234 + unsigned long kpfn;
2236 + expected_mapping = (void *)((unsigned long)stable_node |
2237 + PAGE_MAPPING_KSM);
2238 +again:
2239 + kpfn = READ_ONCE(stable_node->kpfn);
2240 + page = pfn_to_page(kpfn);
2242 + /*
2243 + * page is computed from kpfn, so on most architectures reading
2244 + * page->mapping is naturally ordered after reading node->kpfn,
2245 + * but on Alpha we need to be more careful.
2246 + */
2247 + smp_rmb();
2249 + if (READ_ONCE(page->mapping) != expected_mapping)
2250 + goto stale;
2252 + /*
2253 + * We cannot do anything with the page while its refcount is 0.
2254 + * Usually 0 means free, or tail of a higher-order page: in which
2255 + * case this node is no longer referenced, and should be freed;
2256 + * however, it might mean that the page is under page_freeze_refs().
2257 + * The __remove_mapping() case is easy, again the node is now stale;
2258 + * but if page is swapcache in migrate_page_move_mapping(), it might
2259 + * still be our page, in which case it's essential to keep the node.
2260 + */
2261 + while (!get_page_unless_zero(page)) {
2262 + /*
2263 + * Another check for page->mapping != expected_mapping would
2264 + * work here too. We have chosen the !PageSwapCache test to
2265 + * optimize the common case, when the page is or is about to
2266 + * be freed: PageSwapCache is cleared (under spin_lock_irq)
2267 + * in the freeze_refs section of __remove_mapping(); but Anon
2268 + * page->mapping reset to NULL later, in free_pages_prepare().
2269 + */
2270 + if (!PageSwapCache(page))
2271 + goto stale;
2272 + cpu_relax();
2275 + if (READ_ONCE(page->mapping) != expected_mapping) {
2276 + put_page(page);
2277 + goto stale;
2280 + lock_page(page);
2281 + if (READ_ONCE(page->mapping) != expected_mapping) {
2282 + unlock_page(page);
2283 + put_page(page);
2284 + goto stale;
2286 + unlock_page(page);
2287 + return page;
2288 +stale:
2289 + /*
2290 + * We come here from above when page->mapping or !PageSwapCache
2291 + * suggests that the node is stale; but it might be under migration.
2292 + * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
2293 + * before checking whether node->kpfn has been changed.
2294 + */
2295 + smp_rmb();
2296 + if (stable_node->kpfn != kpfn)
2297 + goto again;
2299 + remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node);
2301 + return NULL;
2305 + * Removing rmap_item from stable or unstable tree.
2306 + * This function will clean the information from the stable/unstable tree.
2307 + */
2308 +static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
2310 + if (rmap_item->address & STABLE_FLAG) {
2311 + struct stable_node *stable_node;
2312 + struct node_vma *node_vma;
2313 + struct page *page;
2315 + node_vma = rmap_item->head;
2316 + stable_node = node_vma->head;
2317 + page = get_uksm_page(stable_node, 1, 1);
2318 + if (!page)
2319 + goto out;
2321 + /*
2322 + * page lock is needed because it's racing with
2323 + * try_to_unmap_ksm(), etc.
2324 + */
2325 + lock_page(page);
2326 + hlist_del(&rmap_item->hlist);
2328 + if (hlist_empty(&node_vma->rmap_hlist)) {
2329 + hlist_del(&node_vma->hlist);
2330 + free_node_vma(node_vma);
2332 + unlock_page(page);
2334 + put_page(page);
2335 + if (hlist_empty(&stable_node->hlist)) {
2336 + /* do NOT call remove_node_from_stable_tree() here,
2337 + * it's possible for a forked rmap_item not to be in
2338 + * the stable tree while the in-tree rmap_items have been
2339 + * deleted.
2340 + */
2341 + uksm_pages_shared--;
2342 + } else
2343 + uksm_pages_sharing--;
2346 + uksm_drop_anon_vma(rmap_item);
2347 + } else if (rmap_item->address & UNSTABLE_FLAG) {
2348 + if (rmap_item->hash_round == uksm_hash_round) {
2350 + rb_erase(&rmap_item->node,
2351 + &rmap_item->tree_node->sub_root);
2352 + if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) {
2353 + rb_erase(&rmap_item->tree_node->node,
2354 + &root_unstable_tree);
2356 + free_tree_node(rmap_item->tree_node);
2357 + } else
2358 + rmap_item->tree_node->count--;
2360 + uksm_pages_unshared--;
2363 + rmap_item->address &= PAGE_MASK;
2364 + rmap_item->hash_max = 0;
2366 +out:
2367 + cond_resched(); /* we're called from many long loops */
2370 +static inline int slot_in_uksm(struct vma_slot *slot)
2372 + return list_empty(&slot->slot_list);
2376 + * Test if the mm is exiting
2377 + */
2378 +static inline bool uksm_test_exit(struct mm_struct *mm)
2380 + return atomic_read(&mm->mm_users) == 0;
2383 +static inline unsigned long vma_pool_size(struct vma_slot *slot)
2385 + return round_up(sizeof(struct rmap_list_entry) * slot->pages,
2386 + PAGE_SIZE) >> PAGE_SHIFT;
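/*
 * Worked example, assuming 4 KiB pages and a (hypothetical) 16-byte
 * struct rmap_list_entry: a 1024-page VMA needs
 * round_up(16 * 1024, 4096) >> PAGE_SHIFT == 4 pool pages for its
 * rmap_list entries.
 */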
2389 +#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta))
2391 +/* must be done with sem locked */
2392 +static int slot_pool_alloc(struct vma_slot *slot)
2394 + unsigned long pool_size;
2396 + if (slot->rmap_list_pool)
2397 + return 0;
2399 + pool_size = vma_pool_size(slot);
2400 + slot->rmap_list_pool = kcalloc(pool_size, sizeof(struct page *),
2401 + GFP_KERNEL);
2402 + if (!slot->rmap_list_pool)
2403 + return -ENOMEM;
2405 + slot->pool_counts = kcalloc(pool_size, sizeof(unsigned int),
2406 + GFP_KERNEL);
2407 + if (!slot->pool_counts) {
2408 + kfree(slot->rmap_list_pool);
2409 + return -ENOMEM;
2412 + slot->pool_size = pool_size;
2413 + BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages));
2414 + slot->flags |= UKSM_SLOT_IN_UKSM;
2415 + uksm_pages_total += slot->pages;
2417 + return 0;
2421 + * Called after vma is unlinked from its mm
2422 + */
2423 +void uksm_remove_vma(struct vm_area_struct *vma)
2425 + struct vma_slot *slot;
2427 + if (!vma->uksm_vma_slot)
2428 + return;
2430 + spin_lock(&vma_slot_list_lock);
2431 + slot = vma->uksm_vma_slot;
2432 + if (!slot)
2433 + goto out;
2435 + if (slot_in_uksm(slot)) {
2436 + /**
2437 + * This slot has been added by ksmd, so move to the del list
2438 + * waiting for ksmd to free it.
2439 + */
2440 + list_add_tail(&slot->slot_list, &vma_slot_del);
2441 + } else {
2442 + /**
2443 + * It's still on the new list. It's OK to free the slot directly.
2444 + */
2445 + list_del(&slot->slot_list);
2446 + free_vma_slot(slot);
2448 +out:
2449 + vma->uksm_vma_slot = NULL;
2450 + spin_unlock(&vma_slot_list_lock);
2453 +/**
2454 + * Need to do two things:
2455 + * 1. check if slot was moved to del list
2456 + * 2. make sure the mmap_sem is manipulated under valid vma.
2458 + * My concern here is that in some cases, this may make
2459 + * vma_slot_list_lock waiters be serialized further by some
2460 + * sem->wait_lock; can this really be expensive?
2463 + * @return
2464 + * 0: if successfully locked mmap_sem
2465 + * -ENOENT: this slot was moved to del list
2466 + * -EBUSY: vma lock failed
2467 + */
2468 +static int try_down_read_slot_mmap_sem(struct vma_slot *slot)
2470 + struct vm_area_struct *vma;
2471 + struct mm_struct *mm;
2472 + struct rw_semaphore *sem;
2474 + spin_lock(&vma_slot_list_lock);
2476 + /* the slot_list was removed and re-inited from the new list when it entered
2477 + * the uksm list. If it's not empty now, then it must have been moved to the del list
2478 + */
2479 + if (!slot_in_uksm(slot)) {
2480 + spin_unlock(&vma_slot_list_lock);
2481 + return -ENOENT;
2484 + BUG_ON(slot->pages != vma_pages(slot->vma));
2485 + /* Ok, vma still valid */
2486 + vma = slot->vma;
2487 + mm = vma->vm_mm;
2488 + sem = &mm->mmap_lock;
2490 + if (uksm_test_exit(mm)) {
2491 + spin_unlock(&vma_slot_list_lock);
2492 + return -ENOENT;
2495 + if (down_read_trylock(sem)) {
2496 + spin_unlock(&vma_slot_list_lock);
2497 + if (slot_pool_alloc(slot)) {
2498 + uksm_remove_vma(vma);
2499 + up_read(sem);
2500 + return -ENOENT;
2502 + return 0;
2505 + spin_unlock(&vma_slot_list_lock);
2506 + return -EBUSY;
2509 +static inline unsigned long
2510 +vma_page_address(struct page *page, struct vm_area_struct *vma)
2512 + pgoff_t pgoff = page->index;
2513 + unsigned long address;
2515 + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
2516 + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
2517 + /* page should be within @vma mapping range */
2518 + return -EFAULT;
2520 + return address;
2524 +/* return 0 on success with the item's mmap_sem locked */
2525 +static inline int get_mergeable_page_lock_mmap(struct rmap_item *item)
2527 + struct mm_struct *mm;
2528 + struct vma_slot *slot = item->slot;
2529 + int err = -EINVAL;
2531 + struct page *page;
2533 + /*
2534 + * try_down_read_slot_mmap_sem() returns non-zero if the slot
2535 + * has been removed by uksm_remove_vma().
2536 + */
2537 + if (try_down_read_slot_mmap_sem(slot))
2538 + return -EBUSY;
2540 + mm = slot->vma->vm_mm;
2542 + if (uksm_test_exit(mm))
2543 + goto failout_up;
2545 + page = item->page;
2546 + rcu_read_lock();
2547 + if (!get_page_unless_zero(page)) {
2548 + rcu_read_unlock();
2549 + goto failout_up;
2552 + /* No need to consider huge page here. */
2553 + if (item->slot->vma->anon_vma != page_anon_vma(page) ||
2554 + vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) {
2555 + /*
2556 + * TODO:
2557 + * should we release this item because of its stale page
2558 + * mapping?
2559 + */
2560 + put_page(page);
2561 + rcu_read_unlock();
2562 + goto failout_up;
2564 + rcu_read_unlock();
2565 + return 0;
2567 +failout_up:
2568 + mmap_read_unlock(mm);
2569 + return err;
2573 + * What kind of VMA is considered?
2574 + */
2575 +static inline int vma_can_enter(struct vm_area_struct *vma)
2577 + return uksm_flags_can_scan(vma->vm_flags);
2581 + * Called whenever a fresh new vma is created. A new vma_slot
2582 + * is created and inserted into a global list. Must be called
2583 + * after the vma is inserted into its mm.
2584 + */
2585 +void uksm_vma_add_new(struct vm_area_struct *vma)
2587 + struct vma_slot *slot;
2589 + if (!vma_can_enter(vma)) {
2590 + vma->uksm_vma_slot = NULL;
2591 + return;
2594 + slot = alloc_vma_slot();
2595 + if (!slot) {
2596 + vma->uksm_vma_slot = NULL;
2597 + return;
2600 + vma->uksm_vma_slot = slot;
2601 + vma->vm_flags |= VM_MERGEABLE;
2602 + slot->vma = vma;
2603 + slot->mm = vma->vm_mm;
2604 + slot->ctime_j = jiffies;
2605 + slot->pages = vma_pages(vma);
2606 + spin_lock(&vma_slot_list_lock);
2607 + list_add_tail(&slot->slot_list, &vma_slot_new);
2608 + spin_unlock(&vma_slot_list_lock);
2611 +/* 32/3 < they < 32/2 */
2612 +#define shiftl 8
2613 +#define shiftr 12
2615 +#define HASH_FROM_TO(from, to) \
2616 +for (index = from; index < to; index++) { \
2617 + pos = random_nums[index]; \
2618 + hash += key[pos]; \
2619 + hash += (hash << shiftl); \
2620 + hash ^= (hash >> shiftr); \
2624 +#define HASH_FROM_DOWN_TO(from, to) \
2625 +for (index = from - 1; index >= to; index--) { \
2626 + hash ^= (hash >> shiftr); \
2627 + hash ^= (hash >> (shiftr*2)); \
2628 + hash -= (hash << shiftl); \
2629 + hash += (hash << (shiftl*2)); \
2630 + pos = random_nums[index]; \
2631 + hash -= key[pos]; \
2635 + * The main random sample hash function.
2636 + */
2637 +static u32 random_sample_hash(void *addr, u32 hash_strength)
2639 + u32 hash = 0xdeadbeef;
2640 + int index, pos, loop = hash_strength;
2641 + u32 *key = (u32 *)addr;
2643 + if (loop > HASH_STRENGTH_FULL)
2644 + loop = HASH_STRENGTH_FULL;
2646 + HASH_FROM_TO(0, loop);
2648 + if (hash_strength > HASH_STRENGTH_FULL) {
2649 + loop = hash_strength - HASH_STRENGTH_FULL;
2650 + HASH_FROM_TO(0, loop);
2653 + return hash;
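/*
 * Reading note on random_sample_hash() above: with hash_strength <=
 * HASH_STRENGTH_FULL it mixes in that many random u32 offsets taken from
 * random_nums[]; with hash_strength == HASH_STRENGTH_MAX
 * (HASH_STRENGTH_FULL + 10) the whole page is hashed once and the first 10
 * random offsets are folded in a second time ("loop-back" hashing).
 * HASH_FROM_DOWN_TO is designed as the inverse of HASH_FROM_TO, which is
 * what lets delta_hash() below adjust an existing hash when the strength
 * decreases.
 */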
2657 +/**
2658 + * It's used when hash strength is adjusted
2660 + * @addr The page's virtual address
2661 + * @from The original hash strength
2662 + * @to The hash strength changed to
2663 + * @hash The hash value generated with the "from" hash strength
2665 + * return the hash value
2666 + */
2667 +static u32 delta_hash(void *addr, int from, int to, u32 hash)
2669 + u32 *key = (u32 *)addr;
2670 + int index, pos; /* make sure they are int type */
2672 + if (to > from) {
2673 + if (from >= HASH_STRENGTH_FULL) {
2674 + from -= HASH_STRENGTH_FULL;
2675 + to -= HASH_STRENGTH_FULL;
2676 + HASH_FROM_TO(from, to);
2677 + } else if (to <= HASH_STRENGTH_FULL) {
2678 + HASH_FROM_TO(from, to);
2679 + } else {
2680 + HASH_FROM_TO(from, HASH_STRENGTH_FULL);
2681 + HASH_FROM_TO(0, to - HASH_STRENGTH_FULL);
2683 + } else {
2684 + if (from <= HASH_STRENGTH_FULL) {
2685 + HASH_FROM_DOWN_TO(from, to);
2686 + } else if (to >= HASH_STRENGTH_FULL) {
2687 + from -= HASH_STRENGTH_FULL;
2688 + to -= HASH_STRENGTH_FULL;
2689 + HASH_FROM_DOWN_TO(from, to);
2690 + } else {
2691 + HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0);
2692 + HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to);
2696 + return hash;
2699 +/**
2701 + * Called when: rshash_pos or rshash_neg is about to overflow or a scan round
2702 + * has finished.
2704 + * return 0 if no page has been scanned since last call, 1 otherwise.
2705 + */
2706 +static inline int encode_benefit(void)
2708 + u64 scanned_delta, pos_delta, neg_delta;
2709 + unsigned long base = benefit.base;
2711 + scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last;
2713 + if (!scanned_delta)
2714 + return 0;
2716 + scanned_delta >>= base;
2717 + pos_delta = rshash_pos >> base;
2718 + neg_delta = rshash_neg >> base;
2720 + if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) ||
2721 + CAN_OVERFLOW_U64(benefit.neg, neg_delta) ||
2722 + CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) {
2723 + benefit.scanned >>= 1;
2724 + benefit.neg >>= 1;
2725 + benefit.pos >>= 1;
2726 + benefit.base++;
2727 + scanned_delta >>= 1;
2728 + pos_delta >>= 1;
2729 + neg_delta >>= 1;
2732 + benefit.pos += pos_delta;
2733 + benefit.neg += neg_delta;
2734 + benefit.scanned += scanned_delta;
2736 + BUG_ON(!benefit.scanned);
2738 + rshash_pos = rshash_neg = 0;
2739 + uksm_pages_scanned_last = uksm_pages_scanned;
2741 + return 1;
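/*
 * Worked example of the rescaling above: with benefit.base == 2, new deltas
 * are first shifted down by 2; if any accumulator would still overflow,
 * every accumulator and the deltas are halved once more and base becomes 3.
 * Halving everything together keeps the relative proportions of pos, neg
 * and scanned intact (up to rounding).
 */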
2744 +static inline void reset_benefit(void)
2746 + benefit.pos = 0;
2747 + benefit.neg = 0;
2748 + benefit.base = 0;
2749 + benefit.scanned = 0;
2752 +static inline void inc_rshash_pos(unsigned long delta)
2754 + if (CAN_OVERFLOW_U64(rshash_pos, delta))
2755 + encode_benefit();
2757 + rshash_pos += delta;
2760 +static inline void inc_rshash_neg(unsigned long delta)
2762 + if (CAN_OVERFLOW_U64(rshash_neg, delta))
2763 + encode_benefit();
2765 + rshash_neg += delta;
2769 +static inline u32 page_hash(struct page *page, unsigned long hash_strength,
2770 + int cost_accounting)
2772 + u32 val;
2773 + unsigned long delta;
2775 + void *addr = kmap_atomic(page);
2777 + val = random_sample_hash(addr, hash_strength);
2778 + kunmap_atomic(addr);
2780 + if (cost_accounting) {
2781 + if (hash_strength < HASH_STRENGTH_FULL)
2782 + delta = HASH_STRENGTH_FULL - hash_strength;
2783 + else
2784 + delta = 0;
2786 + inc_rshash_pos(delta);
2789 + return val;
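/*
 * Cost-accounting note on page_hash() above: when hash_strength is below
 * HASH_STRENGTH_FULL, the work avoided (HASH_STRENGTH_FULL - hash_strength
 * samples) is credited to rshash_pos as time saved by sampling; the cost of
 * hash collisions is charged to rshash_neg elsewhere (see
 * memcmp_pages_with_cost() and check_collision()).
 */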
2792 +static int memcmp_pages_with_cost(struct page *page1, struct page *page2,
2793 + int cost_accounting)
2795 + char *addr1, *addr2;
2796 + int ret;
2798 + addr1 = kmap_atomic(page1);
2799 + addr2 = kmap_atomic(page2);
2800 + ret = memcmp(addr1, addr2, PAGE_SIZE);
2801 + kunmap_atomic(addr2);
2802 + kunmap_atomic(addr1);
2804 + if (cost_accounting)
2805 + inc_rshash_neg(memcmp_cost);
2807 + return ret;
2810 +static inline int pages_identical_with_cost(struct page *page1, struct page *page2)
2812 + return !memcmp_pages_with_cost(page1, page2, 0);
2815 +static inline int is_page_full_zero(struct page *page)
2817 + char *addr;
2818 + int ret;
2820 + addr = kmap_atomic(page);
2821 + ret = is_full_zero(addr, PAGE_SIZE);
2822 + kunmap_atomic(addr);
2824 + return ret;
2827 +static int write_protect_page(struct vm_area_struct *vma, struct page *page,
2828 + pte_t *orig_pte, pte_t *old_pte)
2830 + struct mm_struct *mm = vma->vm_mm;
2831 + struct page_vma_mapped_walk pvmw = {
2832 + .page = page,
2833 + .vma = vma,
2834 + };
2835 + struct mmu_notifier_range range;
2836 + int swapped;
2837 + int err = -EFAULT;
2839 + pvmw.address = page_address_in_vma(page, vma);
2840 + if (pvmw.address == -EFAULT)
2841 + goto out;
2843 + BUG_ON(PageTransCompound(page));
2845 + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, pvmw.address,
2846 + pvmw.address + PAGE_SIZE);
2847 + mmu_notifier_invalidate_range_start(&range);
2849 + if (!page_vma_mapped_walk(&pvmw))
2850 + goto out_mn;
2851 + if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
2852 + goto out_unlock;
2854 + if (old_pte)
2855 + *old_pte = *pvmw.pte;
2857 + if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
2858 + (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) || mm_tlb_flush_pending(mm)) {
2859 + pte_t entry;
2861 + swapped = PageSwapCache(page);
2862 + flush_cache_page(vma, pvmw.address, page_to_pfn(page));
2863 + /*
2864 + * Ok this is tricky: when get_user_pages_fast() runs it doesn't
2865 + * take any lock, therefore the check that we are going to make
2866 + * with the pagecount against the mapcount is racy and
2867 + * O_DIRECT can happen right after the check.
2868 + * So we clear the pte and flush the tlb before the check;
2869 + * this assures us that no O_DIRECT can happen after the check
2870 + * or in the middle of the check.
2871 + */
2872 + entry = ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte);
2873 + /*
2874 + * Check that no O_DIRECT or similar I/O is in progress on the
2875 + * page
2876 + */
2877 + if (page_mapcount(page) + 1 + swapped != page_count(page)) {
2878 + set_pte_at(mm, pvmw.address, pvmw.pte, entry);
2879 + goto out_unlock;
2881 + if (pte_dirty(entry))
2882 + set_page_dirty(page);
2884 + if (pte_protnone(entry))
2885 + entry = pte_mkclean(pte_clear_savedwrite(entry));
2886 + else
2887 + entry = pte_mkclean(pte_wrprotect(entry));
2889 + set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
2891 + *orig_pte = *pvmw.pte;
2892 + err = 0;
2894 +out_unlock:
2895 + page_vma_mapped_walk_done(&pvmw);
2896 +out_mn:
2897 + mmu_notifier_invalidate_range_end(&range);
2898 +out:
2899 + return err;
2902 +#define MERGE_ERR_PGERR 1 /* the page is invalid, cannot continue */
2903 +#define MERGE_ERR_COLLI 2 /* there is a collision */
2904 +#define MERGE_ERR_COLLI_MAX 3 /* collision at the max hash strength */
2905 +#define MERGE_ERR_CHANGED 4 /* the page has changed since last hash */
2908 +/**
2909 + * replace_page - replace page in vma by new ksm page
2910 + * @vma: vma that holds the pte pointing to page
2911 + * @page: the page we are replacing by kpage
2912 + * @kpage: the ksm page we replace page by
2913 + * @orig_pte: the original value of the pte
2915 + * Returns 0 on success, MERGE_ERR_PGERR on failure.
2916 + */
2917 +static int replace_page(struct vm_area_struct *vma, struct page *page,
2918 + struct page *kpage, pte_t orig_pte)
2920 + struct mm_struct *mm = vma->vm_mm;
2921 + struct mmu_notifier_range range;
2922 + pgd_t *pgd;
2923 + p4d_t *p4d;
2924 + pud_t *pud;
2925 + pmd_t *pmd;
2926 + pte_t *ptep;
2927 + spinlock_t *ptl;
2928 + pte_t entry;
2930 + unsigned long addr;
2931 + int err = MERGE_ERR_PGERR;
2933 + addr = page_address_in_vma(page, vma);
2934 + if (addr == -EFAULT)
2935 + goto out;
2937 + pgd = pgd_offset(mm, addr);
2938 + if (!pgd_present(*pgd))
2939 + goto out;
2941 + p4d = p4d_offset(pgd, addr);
2942 + pud = pud_offset(p4d, addr);
2943 + if (!pud_present(*pud))
2944 + goto out;
2946 + pmd = pmd_offset(pud, addr);
2947 + BUG_ON(pmd_trans_huge(*pmd));
2948 + if (!pmd_present(*pmd))
2949 + goto out;
2951 + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
2952 + addr + PAGE_SIZE);
2953 + mmu_notifier_invalidate_range_start(&range);
2955 + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
2956 + if (!pte_same(*ptep, orig_pte)) {
2957 + pte_unmap_unlock(ptep, ptl);
2958 + goto out_mn;
2961 + flush_cache_page(vma, addr, pte_pfn(*ptep));
2962 + ptep_clear_flush_notify(vma, addr, ptep);
2963 + entry = mk_pte(kpage, vma->vm_page_prot);
2965 + /* special treatment is needed for zero_page */
2966 + if ((page_to_pfn(kpage) == uksm_zero_pfn) ||
2967 + (page_to_pfn(kpage) == zero_pfn)) {
2968 + entry = pte_mkspecial(entry);
2969 + dec_mm_counter(mm, MM_ANONPAGES);
2970 + inc_zone_page_state(page, NR_UKSM_ZERO_PAGES);
2971 + } else {
2972 + get_page(kpage);
2973 + page_add_anon_rmap(kpage, vma, addr, false);
2976 + set_pte_at_notify(mm, addr, ptep, entry);
2978 + page_remove_rmap(page, false);
2979 + if (!page_mapped(page))
2980 + try_to_free_swap(page);
2981 + put_page(page);
2983 + pte_unmap_unlock(ptep, ptl);
2984 + err = 0;
2985 +out_mn:
2986 + mmu_notifier_invalidate_range_end(&range);
2987 +out:
2988 + return err;
2992 +/**
2993 + * Fully hash a page with HASH_STRENGTH_MAX and return a non-zero hash value. The
2994 + * zero hash value at HASH_STRENGTH_MAX is used to indicate that its
2995 + * hash_max member has not been calculated.
2997 + * @page The page to be hashed
2998 + * @hash_old The hash value calculated with current hash strength
3000 + * return the new hash value calculated at HASH_STRENGTH_MAX
3001 + */
3002 +static inline u32 page_hash_max(struct page *page, u32 hash_old)
3004 + u32 hash_max = 0;
3005 + void *addr;
3007 + addr = kmap_atomic(page);
3008 + hash_max = delta_hash(addr, hash_strength,
3009 + HASH_STRENGTH_MAX, hash_old);
3011 + kunmap_atomic(addr);
3013 + if (!hash_max)
3014 + hash_max = 1;
3016 + inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
3017 + return hash_max;
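/*
 * Note on the zero remap above: a hash_max of 0 is reserved to mean "not
 * yet computed" (see rmap_item_hash_max() and stable_node_hash_max() later
 * in this file), so a genuine full-strength hash of 0 is stored as 1.
 */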
3021 + * We compare the hash again, to ensure that it is really a hash collision
3022 + * instead of being caused by a page write.
3023 + */
3024 +static inline int check_collision(struct rmap_item *rmap_item,
3025 + u32 hash)
3027 + int err;
3028 + struct page *page = rmap_item->page;
3030 + /* if this rmap_item has already been hash_maxed, then the collision
3031 + * must appear in the second-level rbtree search. In this case we check
3032 + * if its hash_max value has been changed. Otherwise, the collision
3033 + * happens in the first-level rbtree search, so we check against its
3034 + * current hash value.
3035 + */
3036 + if (rmap_item->hash_max) {
3037 + inc_rshash_neg(memcmp_cost);
3038 + inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
3040 + if (rmap_item->hash_max == page_hash_max(page, hash))
3041 + err = MERGE_ERR_COLLI;
3042 + else
3043 + err = MERGE_ERR_CHANGED;
3044 + } else {
3045 + inc_rshash_neg(memcmp_cost + hash_strength);
3047 + if (page_hash(page, hash_strength, 0) == hash)
3048 + err = MERGE_ERR_COLLI;
3049 + else
3050 + err = MERGE_ERR_CHANGED;
3053 + return err;
3056 +/**
3057 + * Try to merge a rmap_item.page with a kpage in stable node. kpage must
3058 + * already be a ksm page.
3060 + * @return 0 if the pages were merged, -EFAULT otherwise.
3061 + */
3062 +static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item,
3063 + struct page *kpage, u32 hash)
3065 + struct vm_area_struct *vma = rmap_item->slot->vma;
3066 + struct mm_struct *mm = vma->vm_mm;
3067 + pte_t orig_pte = __pte(0);
3068 + int err = MERGE_ERR_PGERR;
3069 + struct page *page;
3071 + if (uksm_test_exit(mm))
3072 + goto out;
3074 + page = rmap_item->page;
3076 + if (page == kpage) { /* ksm page forked */
3077 + err = 0;
3078 + goto out;
3081 + /*
3082 + * We need the page lock to read a stable PageSwapCache in
3083 + * write_protect_page(). We use trylock_page() instead of
3084 + * lock_page() because we don't want to wait here - we
3085 + * prefer to continue scanning and merging different pages,
3086 + * then come back to this page when it is unlocked.
3087 + */
3088 + if (!trylock_page(page))
3089 + goto out;
3091 + if (!PageAnon(page) || !PageKsm(kpage))
3092 + goto out_unlock;
3094 + if (PageTransCompound(page)) {
3095 + err = split_huge_page(page);
3096 + if (err)
3097 + goto out_unlock;
3100 + /*
3101 + * If this anonymous page is mapped only here, its pte may need
3102 + * to be write-protected. If it's mapped elsewhere, all of its
3103 + * ptes are necessarily already write-protected. But in either
3104 + * case, we need to lock and check page_count is not raised.
3105 + */
3106 + if (write_protect_page(vma, page, &orig_pte, NULL) == 0) {
3107 + if (pages_identical_with_cost(page, kpage))
3108 + err = replace_page(vma, page, kpage, orig_pte);
3109 + else
3110 + err = check_collision(rmap_item, hash);
3113 + if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
3114 + munlock_vma_page(page);
3115 + if (!PageMlocked(kpage)) {
3116 + unlock_page(page);
3117 + lock_page(kpage);
3118 + mlock_vma_page(kpage);
3119 + page = kpage; /* for final unlock */
3123 +out_unlock:
3124 + unlock_page(page);
3125 +out:
3126 + return err;
3131 +/**
3132 + * If two pages fail to merge in try_to_merge_two_pages, then we have a chance
3133 + * to restore a page mapping that has been changed in try_to_merge_two_pages.
3135 + * @return 0 on success.
3136 + */
3137 +static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr,
3138 + pte_t orig_pte, pte_t wprt_pte)
3140 + struct mm_struct *mm = vma->vm_mm;
3141 + pgd_t *pgd;
3142 + p4d_t *p4d;
3143 + pud_t *pud;
3144 + pmd_t *pmd;
3145 + pte_t *ptep;
3146 + spinlock_t *ptl;
3148 + int err = -EFAULT;
3150 + pgd = pgd_offset(mm, addr);
3151 + if (!pgd_present(*pgd))
3152 + goto out;
3154 + p4d = p4d_offset(pgd, addr);
3155 + pud = pud_offset(p4d, addr);
3156 + if (!pud_present(*pud))
3157 + goto out;
3159 + pmd = pmd_offset(pud, addr);
3160 + if (!pmd_present(*pmd))
3161 + goto out;
3163 + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
3164 + if (!pte_same(*ptep, wprt_pte)) {
3165 + /* already copied, let it be */
3166 + pte_unmap_unlock(ptep, ptl);
3167 + goto out;
3170 + /*
3171 + * Good boy, still here. While we still hold the ksm page, it does not
3172 + * return to the free page pool, so there is no way that a pte was changed
3173 + * to another page and then changed back to this page. And remember that ksm
3174 + * pages are not reused in do_wp_page(). So it's safe to restore the original
3175 + * pte.
3176 + */
3177 + flush_cache_page(vma, addr, pte_pfn(*ptep));
3178 + ptep_clear_flush_notify(vma, addr, ptep);
3179 + set_pte_at_notify(mm, addr, ptep, orig_pte);
3181 + pte_unmap_unlock(ptep, ptl);
3182 + err = 0;
3183 +out:
3184 + return err;
3187 +/**
3188 + * try_to_merge_two_pages() - take two identical pages and prepare
3189 + * them to be merged into one page (rmap_item->page)
3191 + * @return 0 if we successfully merged two identical pages into
3192 + * one ksm page. MERGE_ERR_COLLI if it's only a hash collision
3193 + * search in rbtree. MERGE_ERR_CHANGED if rmap_item has been
3194 + * changed since it was hashed. MERGE_ERR_PGERR otherwise.
3196 + */
3197 +static int try_to_merge_two_pages(struct rmap_item *rmap_item,
3198 + struct rmap_item *tree_rmap_item,
3199 + u32 hash)
3201 + pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0);
3202 + pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0);
3203 + struct vm_area_struct *vma1 = rmap_item->slot->vma;
3204 + struct vm_area_struct *vma2 = tree_rmap_item->slot->vma;
3205 + struct page *page = rmap_item->page;
3206 + struct page *tree_page = tree_rmap_item->page;
3207 + int err = MERGE_ERR_PGERR;
3208 + struct address_space *saved_mapping;
3211 + if (rmap_item->page == tree_rmap_item->page)
3212 + goto out;
3214 + if (!trylock_page(page))
3215 + goto out;
3217 + if (!PageAnon(page))
3218 + goto out_unlock;
3220 + if (PageTransCompound(page)) {
3221 + err = split_huge_page(page);
3222 + if (err)
3223 + goto out_unlock;
3226 + if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) {
3227 + unlock_page(page);
3228 + goto out;
3231 + /*
3232 + * While we hold page lock, upgrade page from
3233 + * PageAnon+anon_vma to PageKsm+NULL stable_node:
3234 + * stable_tree_insert() will update stable_node.
3235 + */
3236 + saved_mapping = page->mapping;
3237 + set_page_stable_node(page, NULL);
3238 + mark_page_accessed(page);
3239 + if (!PageDirty(page))
3240 + SetPageDirty(page);
3242 + unlock_page(page);
3244 + if (!trylock_page(tree_page))
3245 + goto restore_out;
3247 + if (!PageAnon(tree_page)) {
3248 + unlock_page(tree_page);
3249 + goto restore_out;
3252 + if (PageTransCompound(tree_page)) {
3253 + err = split_huge_page(tree_page);
3254 + if (err) {
3255 + unlock_page(tree_page);
3256 + goto restore_out;
3260 + if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) {
3261 + unlock_page(tree_page);
3262 + goto restore_out;
3265 + if (pages_identical_with_cost(page, tree_page)) {
3266 + err = replace_page(vma2, tree_page, page, wprt_pte2);
3267 + if (err) {
3268 + unlock_page(tree_page);
3269 + goto restore_out;
3272 + if ((vma2->vm_flags & VM_LOCKED)) {
3273 + munlock_vma_page(tree_page);
3274 + if (!PageMlocked(page)) {
3275 + unlock_page(tree_page);
3276 + lock_page(page);
3277 + mlock_vma_page(page);
3278 + tree_page = page; /* for final unlock */
3282 + unlock_page(tree_page);
3284 + goto out; /* success */
3286 + } else {
3287 + if (tree_rmap_item->hash_max &&
3288 + tree_rmap_item->hash_max == rmap_item->hash_max) {
3289 + err = MERGE_ERR_COLLI_MAX;
3290 + } else if (page_hash(page, hash_strength, 0) ==
3291 + page_hash(tree_page, hash_strength, 0)) {
3292 + inc_rshash_neg(memcmp_cost + hash_strength * 2);
3293 + err = MERGE_ERR_COLLI;
3294 + } else {
3295 + err = MERGE_ERR_CHANGED;
3298 + unlock_page(tree_page);
3301 +restore_out:
3302 + lock_page(page);
3303 + if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item),
3304 + orig_pte1, wprt_pte1))
3305 + page->mapping = saved_mapping;
3307 +out_unlock:
3308 + unlock_page(page);
3309 +out:
3310 + return err;
3313 +static inline int hash_cmp(u32 new_val, u32 node_val)
3315 + if (new_val > node_val)
3316 + return 1;
3317 + else if (new_val < node_val)
3318 + return -1;
3319 + else
3320 + return 0;
3323 +static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash)
3325 + u32 hash_max = item->hash_max;
3327 + if (!hash_max) {
3328 + hash_max = page_hash_max(item->page, hash);
3330 + item->hash_max = hash_max;
3333 + return hash_max;
3338 +/**
3339 + * stable_tree_search() - search the stable tree for a page
3341 + * @item: the rmap_item we are comparing with
3342 + * @hash: the hash value of this item->page already calculated
3344 + * @return the page we have found, NULL otherwise. The page returned has
3345 + * been gotten (its refcount has been raised).
3346 + */
3347 +static struct page *stable_tree_search(struct rmap_item *item, u32 hash)
3349 + struct rb_node *node = root_stable_treep->rb_node;
3350 + struct tree_node *tree_node;
3351 + unsigned long hash_max;
3352 + struct page *page = item->page;
3353 + struct stable_node *stable_node;
3355 + stable_node = page_stable_node(page);
3356 + if (stable_node) {
3357 + /* ksm page forked, that is
3358 + * if (PageKsm(page) && !in_stable_tree(rmap_item))
3359 + * it's actually gotten once outside.
3360 + */
3361 + get_page(page);
3362 + return page;
3365 + while (node) {
3366 + int cmp;
3368 + tree_node = rb_entry(node, struct tree_node, node);
3370 + cmp = hash_cmp(hash, tree_node->hash);
3372 + if (cmp < 0)
3373 + node = node->rb_left;
3374 + else if (cmp > 0)
3375 + node = node->rb_right;
3376 + else
3377 + break;
3380 + if (!node)
3381 + return NULL;
3383 + if (tree_node->count == 1) {
3384 + stable_node = rb_entry(tree_node->sub_root.rb_node,
3385 + struct stable_node, node);
3386 + BUG_ON(!stable_node);
3388 + goto get_page_out;
3391 + /*
3392 + * ok, we have to search the second
3393 + * level subtree, hash the page to a
3394 + * full strength.
3395 + */
3396 + node = tree_node->sub_root.rb_node;
3397 + BUG_ON(!node);
3398 + hash_max = rmap_item_hash_max(item, hash);
3400 + while (node) {
3401 + int cmp;
3403 + stable_node = rb_entry(node, struct stable_node, node);
3405 + cmp = hash_cmp(hash_max, stable_node->hash_max);
3407 + if (cmp < 0)
3408 + node = node->rb_left;
3409 + else if (cmp > 0)
3410 + node = node->rb_right;
3411 + else
3412 + goto get_page_out;
3415 + return NULL;
3417 +get_page_out:
3418 + page = get_uksm_page(stable_node, 1, 1);
3419 + return page;
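/*
 * Structure note on the search above: the stable tree is two-level. The
 * first-level rbtree of tree_nodes is keyed by the sampled hash; when a
 * tree_node holds more than one stable_node, its sub_root rbtree is keyed
 * by the full-strength hash_max, computed lazily via rmap_item_hash_max().
 */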
3422 +static int try_merge_rmap_item(struct rmap_item *item,
3423 + struct page *kpage,
3424 + struct page *tree_page)
3426 + struct vm_area_struct *vma = item->slot->vma;
3427 + struct page_vma_mapped_walk pvmw = {
3428 + .page = kpage,
3429 + .vma = vma,
3430 + };
3432 + pvmw.address = get_rmap_addr(item);
3433 + if (!page_vma_mapped_walk(&pvmw))
3434 + return 0;
3436 + if (pte_write(*pvmw.pte)) {
3437 + /* has changed, abort! */
3438 + page_vma_mapped_walk_done(&pvmw);
3439 + return 0;
3442 + get_page(tree_page);
3443 + page_add_anon_rmap(tree_page, vma, pvmw.address, false);
3445 + flush_cache_page(vma, pvmw.address, page_to_pfn(kpage));
3446 + ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte);
3447 + set_pte_at_notify(vma->vm_mm, pvmw.address, pvmw.pte,
3448 + mk_pte(tree_page, vma->vm_page_prot));
3450 + page_remove_rmap(kpage, false);
3451 + put_page(kpage);
3453 + page_vma_mapped_walk_done(&pvmw);
3455 + return 1;
3458 +/**
3459 + * try_to_merge_with_stable_page() - when two rmap_items need to be inserted
3460 + * into the stable tree, but the page was found to be identical to a stable ksm
3461 + * page; this is the last chance to merge them into one.
3463 + * @item1: the rmap_item holding the page which we wanted to insert
3464 + * into stable tree.
3466 + * @item2: the other rmap_item we found during the unstable tree search
3466 + * @oldpage: the page currently mapped by the two rmap_items
3467 + * @tree_page: the page we found identical in stable tree node
3468 + * @success1: return if item1 is successfully merged
3469 + * @success2: return if item2 is successfully merged
3470 + */
3471 +static void try_merge_with_stable(struct rmap_item *item1,
3472 + struct rmap_item *item2,
3473 + struct page **kpage,
3474 + struct page *tree_page,
3475 + int *success1, int *success2)
3477 + struct vm_area_struct *vma1 = item1->slot->vma;
3478 + struct vm_area_struct *vma2 = item2->slot->vma;
3479 + *success1 = 0;
3480 + *success2 = 0;
3482 + if (unlikely(*kpage == tree_page)) {
3483 + /* I don't think this can really happen */
3484 + pr_warn("UKSM: unexpected condition detected in "
3485 + "%s -- *kpage == tree_page !\n", __func__);
3486 + *success1 = 1;
3487 + *success2 = 1;
3488 + return;
3491 + if (!PageAnon(*kpage) || !PageKsm(*kpage))
3492 + goto failed;
3494 + if (!trylock_page(tree_page))
3495 + goto failed;
3497 + /* If the oldpage is still ksm and still pointed
3498 + * to in the right place, and still write protected,
3499 + * we are confident it's not changed, no need to
3500 + * memcmp anymore.
3501 + * Beware: we cannot take nested pte locks,
3502 + * deadlock risk.
3503 + */
3504 + if (!try_merge_rmap_item(item1, *kpage, tree_page))
3505 + goto unlock_failed;
3507 + /* ok, then vma2, remind that pte1 already set */
3508 + if (!try_merge_rmap_item(item2, *kpage, tree_page))
3509 + goto success_1;
3511 + *success2 = 1;
3512 +success_1:
3513 + *success1 = 1;
3516 + if ((*success1 && vma1->vm_flags & VM_LOCKED) ||
3517 + (*success2 && vma2->vm_flags & VM_LOCKED)) {
3518 + munlock_vma_page(*kpage);
3519 + if (!PageMlocked(tree_page))
3520 + mlock_vma_page(tree_page);
3523 + /*
3524 + * We do not need oldpage any more in the caller, so can break the lock
3525 + * now.
3526 + */
3527 + unlock_page(*kpage);
3528 + *kpage = tree_page; /* Get unlocked outside. */
3529 + return;
3531 +unlock_failed:
3532 + unlock_page(tree_page);
3533 +failed:
3534 + return;
3537 +static inline void stable_node_hash_max(struct stable_node *node,
3538 + struct page *page, u32 hash)
3540 + u32 hash_max = node->hash_max;
3542 + if (!hash_max) {
3543 + hash_max = page_hash_max(page, hash);
3544 + node->hash_max = hash_max;
3548 +static inline
3549 +struct stable_node *new_stable_node(struct tree_node *tree_node,
3550 + struct page *kpage, u32 hash_max)
3552 + struct stable_node *new_stable_node;
3554 + new_stable_node = alloc_stable_node();
3555 + if (!new_stable_node)
3556 + return NULL;
3558 + new_stable_node->kpfn = page_to_pfn(kpage);
3559 + new_stable_node->hash_max = hash_max;
3560 + new_stable_node->tree_node = tree_node;
3561 + set_page_stable_node(kpage, new_stable_node);
3563 + return new_stable_node;
3566 +static inline
3567 +struct stable_node *first_level_insert(struct tree_node *tree_node,
3568 + struct rmap_item *rmap_item,
3569 + struct rmap_item *tree_rmap_item,
3570 + struct page **kpage, u32 hash,
3571 + int *success1, int *success2)
3573 + int cmp;
3574 + struct page *tree_page;
3575 + u32 hash_max = 0;
3576 + struct stable_node *stable_node, *new_snode;
3577 + struct rb_node *parent = NULL, **new;
3579 + /* this tree node contains no sub-tree yet */
3580 + stable_node = rb_entry(tree_node->sub_root.rb_node,
3581 + struct stable_node, node);
3583 + tree_page = get_uksm_page(stable_node, 1, 0);
3584 + if (tree_page) {
3585 + cmp = memcmp_pages_with_cost(*kpage, tree_page, 1);
3586 + if (!cmp) {
3587 + try_merge_with_stable(rmap_item, tree_rmap_item, kpage,
3588 + tree_page, success1, success2);
3589 + put_page(tree_page);
3590 + if (!*success1 && !*success2)
3591 + goto failed;
3593 + return stable_node;
3595 + } else {
3596 + /*
3597 + * Collision in the first level; try to create a subtree.
3598 + * A new node needs to be created.
3599 + */
3600 + put_page(tree_page);
3602 + stable_node_hash_max(stable_node, tree_page,
3603 + tree_node->hash);
3604 + hash_max = rmap_item_hash_max(rmap_item, hash);
3605 + cmp = hash_cmp(hash_max, stable_node->hash_max);
3607 + parent = &stable_node->node;
3608 + if (cmp < 0)
3609 + new = &parent->rb_left;
3610 + else if (cmp > 0)
3611 + new = &parent->rb_right;
3612 + else
3613 + goto failed;
3616 + } else {
3617 + /* the only stable_node was deleted, we reuse its tree_node.
3618 + */
3619 + parent = NULL;
3620 + new = &tree_node->sub_root.rb_node;
3623 + new_snode = new_stable_node(tree_node, *kpage, hash_max);
3624 + if (!new_snode)
3625 + goto failed;
3627 + rb_link_node(&new_snode->node, parent, new);
3628 + rb_insert_color(&new_snode->node, &tree_node->sub_root);
3629 + tree_node->count++;
3630 + *success1 = *success2 = 1;
3632 + return new_snode;
3634 +failed:
3635 + return NULL;
3638 +static inline
3639 +struct stable_node *stable_subtree_insert(struct tree_node *tree_node,
3640 + struct rmap_item *rmap_item,
3641 + struct rmap_item *tree_rmap_item,
3642 + struct page **kpage, u32 hash,
3643 + int *success1, int *success2)
3645 + struct page *tree_page;
3646 + u32 hash_max;
3647 + struct stable_node *stable_node, *new_snode;
3648 + struct rb_node *parent, **new;
3650 +research:
3651 + parent = NULL;
3652 + new = &tree_node->sub_root.rb_node;
3653 + BUG_ON(!*new);
3654 + hash_max = rmap_item_hash_max(rmap_item, hash);
3655 + while (*new) {
3656 + int cmp;
3658 + stable_node = rb_entry(*new, struct stable_node, node);
3660 + cmp = hash_cmp(hash_max, stable_node->hash_max);
3662 + if (cmp < 0) {
3663 + parent = *new;
3664 + new = &parent->rb_left;
3665 + } else if (cmp > 0) {
3666 + parent = *new;
3667 + new = &parent->rb_right;
3668 + } else {
3669 + tree_page = get_uksm_page(stable_node, 1, 0);
3670 + if (tree_page) {
3671 + cmp = memcmp_pages_with_cost(*kpage, tree_page, 1);
3672 + if (!cmp) {
3673 + try_merge_with_stable(rmap_item,
3674 + tree_rmap_item, kpage,
3675 + tree_page, success1, success2);
3677 + put_page(tree_page);
3678 + if (!*success1 && !*success2)
3679 + goto failed;
3680 + /*
3681 + * successfully merged with a stable
3682 + * node
3683 + */
3684 + return stable_node;
3685 + } else {
3686 + put_page(tree_page);
3687 + goto failed;
3689 + } else {
3690 + /*
3691 + * the stable node may have been deleted,
3692 + * and the subtree may have been
3693 + * restructured; we cannot
3694 + * continue, so search it again.
3695 + */
3696 + if (tree_node->count) {
3697 + goto research;
3698 + } else {
3699 + /* reuse the tree node*/
3700 + parent = NULL;
3701 + new = &tree_node->sub_root.rb_node;
3707 + new_snode = new_stable_node(tree_node, *kpage, hash_max);
3708 + if (!new_snode)
3709 + goto failed;
3711 + rb_link_node(&new_snode->node, parent, new);
3712 + rb_insert_color(&new_snode->node, &tree_node->sub_root);
3713 + tree_node->count++;
3714 + *success1 = *success2 = 1;
3716 + return new_snode;
3718 +failed:
3719 + return NULL;
3723 +/**
3724 + * stable_tree_insert() - try to insert a merged page in unstable tree to
3725 + * the stable tree
3727 + * @kpage: the page to be inserted
3728 + * @hash: the current hash of this page
3729 + * @rmap_item: the rmap_item being scanned
3730 + * @tree_rmap_item: the rmap_item found on unstable tree
3731 + * @success1: return if rmap_item is merged
3732 + * @success2: return if tree_rmap_item is merged
3734 + * @return the stable_node on stable tree if at least one
3735 + * rmap_item is inserted into stable tree, NULL
3736 + * otherwise.
3737 + */
3738 +static struct stable_node *
3739 +stable_tree_insert(struct page **kpage, u32 hash,
3740 + struct rmap_item *rmap_item,
3741 + struct rmap_item *tree_rmap_item,
3742 + int *success1, int *success2)
3744 + struct rb_node **new = &root_stable_treep->rb_node;
3745 + struct rb_node *parent = NULL;
3746 + struct stable_node *stable_node;
3747 + struct tree_node *tree_node;
3748 + u32 hash_max = 0;
3750 + *success1 = *success2 = 0;
3752 + while (*new) {
3753 + int cmp;
3755 + tree_node = rb_entry(*new, struct tree_node, node);
3757 + cmp = hash_cmp(hash, tree_node->hash);
3759 + if (cmp < 0) {
3760 + parent = *new;
3761 + new = &parent->rb_left;
3762 + } else if (cmp > 0) {
3763 + parent = *new;
3764 + new = &parent->rb_right;
3765 + } else
3766 + break;
3769 + if (*new) {
3770 + if (tree_node->count == 1) {
3771 + stable_node = first_level_insert(tree_node, rmap_item,
3772 + tree_rmap_item, kpage,
3773 + hash, success1, success2);
3774 + } else {
3775 + stable_node = stable_subtree_insert(tree_node,
3776 + rmap_item, tree_rmap_item, kpage,
3777 + hash, success1, success2);
3779 + } else {
3781 + /* no tree node found */
3782 + tree_node = alloc_tree_node(stable_tree_node_listp);
3783 + if (!tree_node) {
3784 + stable_node = NULL;
3785 + goto out;
3788 + stable_node = new_stable_node(tree_node, *kpage, hash_max);
3789 + if (!stable_node) {
3790 + free_tree_node(tree_node);
3791 + goto out;
3794 + tree_node->hash = hash;
3795 + rb_link_node(&tree_node->node, parent, new);
3796 + rb_insert_color(&tree_node->node, root_stable_treep);
3797 + parent = NULL;
3798 + new = &tree_node->sub_root.rb_node;
3800 + rb_link_node(&stable_node->node, parent, new);
3801 + rb_insert_color(&stable_node->node, &tree_node->sub_root);
3802 + tree_node->count++;
3803 + *success1 = *success2 = 1;
3806 +out:
3807 + return stable_node;
3811 +/**
3812 + * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem
3814 + * @return 0 on success, -EBUSY if unable to lock the mmap_sem,
3815 + * -EINVAL if the page mapping has been changed.
3816 + */
3817 +static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item)
3819 + int err;
3821 + err = get_mergeable_page_lock_mmap(tree_rmap_item);
3823 + if (err == -EINVAL) {
3824 + /* its page map has been changed, remove it */
3825 + remove_rmap_item_from_tree(tree_rmap_item);
3828 + /* The page is gotten and mmap_sem is locked now. */
3829 + return err;
3833 +/**
3834 + * unstable_tree_search_insert() - search an unstable tree rmap_item with the
3835 + * same hash value. Get its page and trylock the mmap_sem
3836 + */
3837 +static inline
3838 +struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
3839 + u32 hash)
3842 + struct rb_node **new = &root_unstable_tree.rb_node;
3843 + struct rb_node *parent = NULL;
3844 + struct tree_node *tree_node;
3845 + u32 hash_max;
3846 + struct rmap_item *tree_rmap_item;
3848 + while (*new) {
3849 + int cmp;
3851 + tree_node = rb_entry(*new, struct tree_node, node);
3853 + cmp = hash_cmp(hash, tree_node->hash);
3855 + if (cmp < 0) {
3856 + parent = *new;
3857 + new = &parent->rb_left;
3858 + } else if (cmp > 0) {
3859 + parent = *new;
3860 + new = &parent->rb_right;
3861 + } else
3862 + break;
3865 + if (*new) {
3866 + /* got the tree_node */
3867 + if (tree_node->count == 1) {
3868 + tree_rmap_item = rb_entry(tree_node->sub_root.rb_node,
3869 + struct rmap_item, node);
3870 + BUG_ON(!tree_rmap_item);
3872 + goto get_page_out;
3875 + /* well, search the collision subtree */
3876 + new = &tree_node->sub_root.rb_node;
3877 + BUG_ON(!*new);
3878 + hash_max = rmap_item_hash_max(rmap_item, hash);
3880 + while (*new) {
3881 + int cmp;
3883 + tree_rmap_item = rb_entry(*new, struct rmap_item,
3884 + node);
3886 + cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
3887 + parent = *new;
3888 + if (cmp < 0)
3889 + new = &parent->rb_left;
3890 + else if (cmp > 0)
3891 + new = &parent->rb_right;
3892 + else
3893 + goto get_page_out;
3895 + } else {
3896 + /* alloc a new tree_node */
3897 + tree_node = alloc_tree_node(&unstable_tree_node_list);
3898 + if (!tree_node)
3899 + return NULL;
3901 + tree_node->hash = hash;
3902 + rb_link_node(&tree_node->node, parent, new);
3903 + rb_insert_color(&tree_node->node, &root_unstable_tree);
3904 + parent = NULL;
3905 + new = &tree_node->sub_root.rb_node;
3908 + /* not found even in the sub-tree */
3909 + rmap_item->tree_node = tree_node;
3910 + rmap_item->address |= UNSTABLE_FLAG;
3911 + rmap_item->hash_round = uksm_hash_round;
3912 + rb_link_node(&rmap_item->node, parent, new);
3913 + rb_insert_color(&rmap_item->node, &tree_node->sub_root);
3915 + uksm_pages_unshared++;
3916 + return NULL;
3918 +get_page_out:
3919 + if (tree_rmap_item->page == rmap_item->page)
3920 + return NULL;
3922 + if (get_tree_rmap_item_page(tree_rmap_item))
3923 + return NULL;
3925 + return tree_rmap_item;
3928 +static void hold_anon_vma(struct rmap_item *rmap_item,
3929 + struct anon_vma *anon_vma)
3931 + rmap_item->anon_vma = anon_vma;
3932 + get_anon_vma(anon_vma);
3936 +/**
3937 + * stable_tree_append() - append a rmap_item to a stable node. Deduplication
3938 + * ratio statistics are updated in this function.
3940 + */
3941 +static void stable_tree_append(struct rmap_item *rmap_item,
3942 + struct stable_node *stable_node, int logdedup)
3944 + struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_cont = NULL;
3945 + unsigned long key = (unsigned long)rmap_item->slot;
3946 + unsigned long factor = rmap_item->slot->rung->step;
3948 + BUG_ON(!stable_node);
3949 + rmap_item->address |= STABLE_FLAG;
3951 + if (hlist_empty(&stable_node->hlist)) {
3952 + uksm_pages_shared++;
3953 + goto node_vma_new;
3954 + } else {
3955 + uksm_pages_sharing++;
3958 + hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) {
3959 + if (node_vma->key >= key)
3960 + break;
3962 + if (logdedup) {
3963 + node_vma->slot->pages_bemerged += factor;
3964 + if (list_empty(&node_vma->slot->dedup_list))
3965 + list_add(&node_vma->slot->dedup_list,
3966 + &vma_slot_dedup);
3970 + if (node_vma) {
3971 + if (node_vma->key == key) {
3972 + node_vma_cont = hlist_entry_safe(node_vma->hlist.next, struct node_vma, hlist);
3973 + goto node_vma_ok;
3974 + } else if (node_vma->key > key) {
3975 + node_vma_cont = node_vma;
3979 +node_vma_new:
3980 + /* no same vma already in node, alloc a new node_vma */
3981 + new_node_vma = alloc_node_vma();
3982 + BUG_ON(!new_node_vma);
3983 + new_node_vma->head = stable_node;
3984 + new_node_vma->slot = rmap_item->slot;
3986 + if (!node_vma) {
3987 + hlist_add_head(&new_node_vma->hlist, &stable_node->hlist);
3988 + } else if (node_vma->key != key) {
3989 + if (node_vma->key < key)
3990 + hlist_add_behind(&new_node_vma->hlist, &node_vma->hlist);
3991 + else {
3992 + hlist_add_before(&new_node_vma->hlist,
3993 + &node_vma->hlist);
3997 + node_vma = new_node_vma;
3999 +node_vma_ok: /* ok, ready to add to the list */
4000 + rmap_item->head = node_vma;
4001 + hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist);
4002 + hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma);
4003 + if (logdedup) {
4004 + rmap_item->slot->pages_merged++;
4005 + if (node_vma_cont) {
4006 + node_vma = node_vma_cont;
4007 + hlist_for_each_entry_continue(node_vma, hlist) {
4008 + node_vma->slot->pages_bemerged += factor;
4009 + if (list_empty(&node_vma->slot->dedup_list))
4010 + list_add(&node_vma->slot->dedup_list,
4011 + &vma_slot_dedup);
4018 + * We use break_ksm to break COW on a ksm page: it's a stripped down
4020 + * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1)
4021 + * put_page(page);
4023 + * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
4024 + * in case the application has unmapped and remapped mm,addr meanwhile.
4025 + * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
4026 + * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
4027 + */
4028 +static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
4030 + struct page *page;
4031 + int ret = 0;
4033 + do {
4034 + cond_resched();
4035 + page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
4036 + if (IS_ERR_OR_NULL(page))
4037 + break;
4038 + if (PageKsm(page)) {
4039 + ret = handle_mm_fault(vma, addr,
4040 + FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
4041 + NULL);
4042 + } else
4043 + ret = VM_FAULT_WRITE;
4044 + put_page(page);
4045 + } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
4046 + /*
4047 + * We must loop because handle_mm_fault() may back out if there's
4048 + * any difficulty e.g. if pte accessed bit gets updated concurrently.
4050 + * VM_FAULT_WRITE is what we have been hoping for: it indicates that
4051 + * COW has been broken, even if the vma does not permit VM_WRITE;
4052 + * but note that a concurrent fault might break PageKsm for us.
4054 + * VM_FAULT_SIGBUS could occur if we race with truncation of the
4055 + * backing file, which also invalidates anonymous pages: that's
4056 + * okay, that truncation will have unmapped the PageKsm for us.
4058 + * VM_FAULT_OOM: at the time of writing (late July 2009), setting
4059 + * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
4060 + * current task has TIF_MEMDIE set, and will be OOM killed on return
4061 + * to user; and ksmd, having no mm, would never be chosen for that.
4063 + * But if the mm is in a limited mem_cgroup, then the fault may fail
4064 + * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
4065 + * even ksmd can fail in this way - though it's usually breaking ksm
4066 + * just to undo a merge it made a moment before, so unlikely to oom.
4068 + * That's a pity: we might therefore have more kernel pages allocated
4069 + * than we're counting as nodes in the stable tree; but uksm_do_scan
4070 + * will retry to break_cow on each pass, so should recover the page
4071 + * in due course. The important thing is to not let VM_MERGEABLE
4072 + * be cleared while any such pages might remain in the area.
4073 + */
4074 + return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
4077 +static void break_cow(struct rmap_item *rmap_item)
4079 + struct vm_area_struct *vma = rmap_item->slot->vma;
4080 + struct mm_struct *mm = vma->vm_mm;
4081 + unsigned long addr = get_rmap_addr(rmap_item);
4083 + if (uksm_test_exit(mm))
4084 + goto out;
4086 + break_ksm(vma, addr);
4087 +out:
4088 + return;
4092 + * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
4093 + * than check every pte of a given vma, the locking doesn't quite work for
4094 + * that - an rmap_item is assigned to the stable tree after inserting ksm
4095 + * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
4096 + * rmap_items from parent to child at fork time (so as not to waste time
4097 + * if exit comes before the next scan reaches it).
4099 + * Similarly, although we'd like to remove rmap_items (so updating counts
4100 + * and freeing memory) when unmerging an area, it's easier to leave that
4101 + * to the next pass of ksmd - consider, for example, how ksmd might be
4102 + * in cmp_and_merge_page on one of the rmap_items we would be removing.
4103 + */
4104 +inline int unmerge_uksm_pages(struct vm_area_struct *vma,
4105 + unsigned long start, unsigned long end)
4107 + unsigned long addr;
4108 + int err = 0;
4110 + for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
4111 + if (uksm_test_exit(vma->vm_mm))
4112 + break;
4113 + if (signal_pending(current))
4114 + err = -ERESTARTSYS;
4115 + else
4116 + err = break_ksm(vma, addr);
4118 + return err;
4121 +static inline void inc_uksm_pages_scanned(void)
4123 + u64 delta;
4126 + if (uksm_pages_scanned == U64_MAX) {
4127 + encode_benefit();
4129 + delta = uksm_pages_scanned >> pages_scanned_base;
4131 + if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) {
4132 + pages_scanned_stored >>= 1;
4133 + delta >>= 1;
4134 + pages_scanned_base++;
4137 + pages_scanned_stored += delta;
4139 + uksm_pages_scanned = uksm_pages_scanned_last = 0;
4142 + uksm_pages_scanned++;
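
The folding above keeps a lifetime total that can never overflow: the true count is approximately pages_scanned_stored << pages_scanned_base, and whenever adding a new round's contribution would overflow, both the stored value and the delta are halved and the base bumped. A minimal userspace sketch of that bookkeeping, with hypothetical names and no kernel dependencies:

    #include <stdint.h>
    #include <stdio.h>

    /* lifetime total is approximately: stored << base */
    static uint64_t stored, base;

    static void fold_round(uint64_t round_pages)
    {
        uint64_t delta = round_pages >> base;

        while (delta && stored > UINT64_MAX - delta) {
            stored >>= 1;    /* adding would overflow: halve both sides       */
            delta  >>= 1;
            base++;          /* and remember the lost precision in the shift  */
        }
        stored += delta;
    }

    int main(void)
    {
        fold_round(1000000);
        fold_round(2500000);
        printf("~%llu pages scanned (stored=%llu, base=%llu)\n",
               (unsigned long long)(stored << base),
               (unsigned long long)stored, (unsigned long long)base);
        return 0;
    }
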
4145 +static inline int find_zero_page_hash(int strength, u32 hash)
4147 + return (zero_hash_table[strength] == hash);
4150 +static
4151 +int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page)
4153 + struct page *zero_page = empty_uksm_zero_page;
4154 + struct mm_struct *mm = vma->vm_mm;
4155 + pte_t orig_pte = __pte(0);
4156 + int err = -EFAULT;
4158 + if (uksm_test_exit(mm))
4159 + goto out;
4161 + if (!trylock_page(page))
4162 + goto out;
4164 + if (!PageAnon(page))
4165 + goto out_unlock;
4167 + if (PageTransCompound(page)) {
4168 + err = split_huge_page(page);
4169 + if (err)
4170 + goto out_unlock;
4173 + if (write_protect_page(vma, page, &orig_pte, 0) == 0) {
4174 + if (is_page_full_zero(page))
4175 + err = replace_page(vma, page, zero_page, orig_pte);
4178 +out_unlock:
4179 + unlock_page(page);
4180 +out:
4181 + return err;
4185 + * cmp_and_merge_page() - first see if page can be merged into the stable
4186 + * tree; if not, compare hash to previous and if it's the same, see if page
4187 + * can be inserted into the unstable tree, or merged with a page already there
4188 + * and both transferred to the stable tree.
4190 + * @page: the page for which we are searching an identical page.
4191 + * @rmap_item: the reverse mapping into the virtual address of this page
4192 + */
4193 +static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash)
4195 + struct rmap_item *tree_rmap_item;
4196 + struct page *page;
4197 + struct page *kpage = NULL;
4198 + u32 hash_max;
4199 + int err;
4200 + unsigned int success1, success2;
4201 + struct stable_node *snode;
4202 + int cmp;
4203 + struct rb_node *parent = NULL, **new;
4205 + remove_rmap_item_from_tree(rmap_item);
4206 + page = rmap_item->page;
4208 + /* We first start with searching the page inside the stable tree */
4209 + kpage = stable_tree_search(rmap_item, hash);
4210 + if (kpage) {
4211 + err = try_to_merge_with_uksm_page(rmap_item, kpage,
4212 + hash);
4213 + if (!err) {
4214 + /*
4215 + * The page was successfully merged, add
4216 + * its rmap_item to the stable tree.
4217 + * page lock is needed because it's
4218 + * racing with try_to_unmap_ksm(), etc.
4219 + */
4220 + lock_page(kpage);
4221 + snode = page_stable_node(kpage);
4222 + stable_tree_append(rmap_item, snode, 1);
4223 + unlock_page(kpage);
4224 + put_page(kpage);
4225 + return; /* success */
4227 + put_page(kpage);
4229 + /*
4230 + * if it's a collision and it has been searched in the sub-rbtree
4231 + * (hash_max != 0), we want to abort, because if it is
4232 + * successfully merged in the unstable tree, the collision tends to
4233 + * happen again.
4234 + */
4235 + if (err == MERGE_ERR_COLLI && rmap_item->hash_max)
4236 + return;
4239 + tree_rmap_item =
4240 + unstable_tree_search_insert(rmap_item, hash);
4241 + if (tree_rmap_item) {
4242 + err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash);
4243 + /*
4244 + * As soon as we merge this page, we want to remove the
4245 + * rmap_item of the page we have merged with from the unstable
4246 + * tree, and insert it instead as new node in the stable tree.
4247 + */
4248 + if (!err) {
4249 + kpage = page;
4250 + remove_rmap_item_from_tree(tree_rmap_item);
4251 + lock_page(kpage);
4252 + snode = stable_tree_insert(&kpage, hash,
4253 + rmap_item, tree_rmap_item,
4254 + &success1, &success2);
4256 + /*
4257 + * Do not log dedup for tree item, it's not counted as
4258 + * scanned in this round.
4259 + */
4260 + if (success2)
4261 + stable_tree_append(tree_rmap_item, snode, 0);
4263 + /*
4264 + * The order of these two stable_tree_append() calls is important:
4265 + * we are scanning rmap_item.
4266 + */
4267 + if (success1)
4268 + stable_tree_append(rmap_item, snode, 1);
4270 + /*
4271 + * The original kpage may be unlocked inside
4272 + * stable_tree_insert() already. This page
4273 + * should be unlocked before doing
4274 + * break_cow().
4275 + */
4276 + unlock_page(kpage);
4278 + if (!success1)
4279 + break_cow(rmap_item);
4281 + if (!success2)
4282 + break_cow(tree_rmap_item);
4284 + } else if (err == MERGE_ERR_COLLI) {
4285 + BUG_ON(tree_rmap_item->tree_node->count > 1);
4287 + rmap_item_hash_max(tree_rmap_item,
4288 + tree_rmap_item->tree_node->hash);
4290 + hash_max = rmap_item_hash_max(rmap_item, hash);
4291 + cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
4292 + parent = &tree_rmap_item->node;
4293 + if (cmp < 0)
4294 + new = &parent->rb_left;
4295 + else if (cmp > 0)
4296 + new = &parent->rb_right;
4297 + else
4298 + goto put_up_out;
4300 + rmap_item->tree_node = tree_rmap_item->tree_node;
4301 + rmap_item->address |= UNSTABLE_FLAG;
4302 + rmap_item->hash_round = uksm_hash_round;
4303 + rb_link_node(&rmap_item->node, parent, new);
4304 + rb_insert_color(&rmap_item->node,
4305 + &tree_rmap_item->tree_node->sub_root);
4306 + rmap_item->tree_node->count++;
4307 + } else {
4308 + /*
4309 + * either one of the pages has changed or they collide
4310 + * at the max hash; we consider them ill items.
4311 + */
4312 + remove_rmap_item_from_tree(tree_rmap_item);
4314 +put_up_out:
4315 + put_page(tree_rmap_item->page);
4316 + mmap_read_unlock(tree_rmap_item->slot->vma->vm_mm);
4323 +static inline unsigned long get_pool_index(struct vma_slot *slot,
4324 + unsigned long index)
4326 + unsigned long pool_index;
4328 + pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT;
4329 + if (pool_index >= slot->pool_size)
4330 + BUG();
4331 + return pool_index;
4334 +static inline unsigned long index_page_offset(unsigned long index)
4336 + return offset_in_page(sizeof(struct rmap_list_entry *) * index);
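
A slot's rmap_list is not one contiguous array but a pool of independently allocated pages; get_pool_index() and index_page_offset() split a flat entry index into (pool page, byte offset). A small userspace sketch of that split, assuming 4 KiB pages and 8-byte entries purely for illustration:

    #include <stdio.h>

    #define PAGE_SHIFT_ 12
    #define PAGE_SIZE_  (1UL << PAGE_SHIFT_)
    #define ENTRY_SIZE  8UL                 /* assumed pointer-sized entry */

    static unsigned long pool_page(unsigned long index)
    {
        return (ENTRY_SIZE * index) >> PAGE_SHIFT_;      /* which pool page  */
    }

    static unsigned long pool_offset(unsigned long index)
    {
        return (ENTRY_SIZE * index) & (PAGE_SIZE_ - 1);  /* offset inside it */
    }

    int main(void)
    {
        unsigned long idx = 1000;

        printf("entry %lu -> pool page %lu, offset %lu\n",
               idx, pool_page(idx), pool_offset(idx));   /* page 1, offset 3904 */
        return 0;
    }
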
4339 +static inline
4340 +struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot,
4341 + unsigned long index, int need_alloc)
4343 + unsigned long pool_index;
4344 + struct page *page;
4345 + void *addr;
4348 + pool_index = get_pool_index(slot, index);
4349 + if (!slot->rmap_list_pool[pool_index]) {
4350 + if (!need_alloc)
4351 + return NULL;
4353 + page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN);
4354 + if (!page)
4355 + return NULL;
4357 + slot->rmap_list_pool[pool_index] = page;
4360 + addr = kmap(slot->rmap_list_pool[pool_index]);
4361 + addr += index_page_offset(index);
4363 + return addr;
4366 +static inline void put_rmap_list_entry(struct vma_slot *slot,
4367 + unsigned long index)
4369 + unsigned long pool_index;
4371 + pool_index = get_pool_index(slot, index);
4372 + BUG_ON(!slot->rmap_list_pool[pool_index]);
4373 + kunmap(slot->rmap_list_pool[pool_index]);
4376 +static inline int entry_is_new(struct rmap_list_entry *entry)
4378 + return !entry->item;
4381 +static inline unsigned long get_index_orig_addr(struct vma_slot *slot,
4382 + unsigned long index)
4384 + return slot->vma->vm_start + (index << PAGE_SHIFT);
4387 +static inline unsigned long get_entry_address(struct rmap_list_entry *entry)
4389 + unsigned long addr;
4391 + if (is_addr(entry->addr))
4392 + addr = get_clean_addr(entry->addr);
4393 + else if (entry->item)
4394 + addr = get_rmap_addr(entry->item);
4395 + else
4396 + BUG();
4398 + return addr;
4401 +static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry)
4403 + if (is_addr(entry->addr))
4404 + return NULL;
4406 + return entry->item;
4409 +static inline void inc_rmap_list_pool_count(struct vma_slot *slot,
4410 + unsigned long index)
4412 + unsigned long pool_index;
4414 + pool_index = get_pool_index(slot, index);
4415 + BUG_ON(!slot->rmap_list_pool[pool_index]);
4416 + slot->pool_counts[pool_index]++;
4419 +static inline void dec_rmap_list_pool_count(struct vma_slot *slot,
4420 + unsigned long index)
4422 + unsigned long pool_index;
4424 + pool_index = get_pool_index(slot, index);
4425 + BUG_ON(!slot->rmap_list_pool[pool_index]);
4426 + BUG_ON(!slot->pool_counts[pool_index]);
4427 + slot->pool_counts[pool_index]--;
4430 +static inline int entry_has_rmap(struct rmap_list_entry *entry)
4432 + return !is_addr(entry->addr) && entry->item;
4435 +static inline void swap_entries(struct rmap_list_entry *entry1,
4436 + unsigned long index1,
4437 + struct rmap_list_entry *entry2,
4438 + unsigned long index2)
4440 + struct rmap_list_entry tmp;
4442 + /* swapping two new entries is meaningless */
4443 + BUG_ON(entry_is_new(entry1) && entry_is_new(entry2));
4445 + tmp = *entry1;
4446 + *entry1 = *entry2;
4447 + *entry2 = tmp;
4449 + if (entry_has_rmap(entry1))
4450 + entry1->item->entry_index = index1;
4452 + if (entry_has_rmap(entry2))
4453 + entry2->item->entry_index = index2;
4455 + if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) {
4456 + inc_rmap_list_pool_count(entry1->item->slot, index1);
4457 + dec_rmap_list_pool_count(entry1->item->slot, index2);
4458 + } else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) {
4459 + inc_rmap_list_pool_count(entry2->item->slot, index2);
4460 + dec_rmap_list_pool_count(entry2->item->slot, index1);
4464 +static inline void free_entry_item(struct rmap_list_entry *entry)
4466 + unsigned long index;
4467 + struct rmap_item *item;
4469 + if (!is_addr(entry->addr)) {
4470 + BUG_ON(!entry->item);
4471 + item = entry->item;
4472 + entry->addr = get_rmap_addr(item);
4473 + set_is_addr(entry->addr);
4474 + index = item->entry_index;
4475 + remove_rmap_item_from_tree(item);
4476 + dec_rmap_list_pool_count(item->slot, index);
4477 + free_rmap_item(item);
4481 +static inline int pool_entry_boundary(unsigned long index)
4483 + unsigned long linear_addr;
4485 + linear_addr = sizeof(struct rmap_list_entry *) * index;
4486 + return index && !offset_in_page(linear_addr);
4489 +static inline void try_free_last_pool(struct vma_slot *slot,
4490 + unsigned long index)
4492 + unsigned long pool_index;
4494 + pool_index = get_pool_index(slot, index);
4495 + if (slot->rmap_list_pool[pool_index] &&
4496 + !slot->pool_counts[pool_index]) {
4497 + __free_page(slot->rmap_list_pool[pool_index]);
4498 + slot->rmap_list_pool[pool_index] = NULL;
4499 + slot->flags |= UKSM_SLOT_NEED_SORT;
4504 +static inline unsigned long vma_item_index(struct vm_area_struct *vma,
4505 + struct rmap_item *item)
4507 + return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT;
4510 +static int within_same_pool(struct vma_slot *slot,
4511 + unsigned long i, unsigned long j)
4513 + unsigned long pool_i, pool_j;
4515 + pool_i = get_pool_index(slot, i);
4516 + pool_j = get_pool_index(slot, j);
4518 + return (pool_i == pool_j);
4521 +static void sort_rmap_entry_list(struct vma_slot *slot)
4523 + unsigned long i, j;
4524 + struct rmap_list_entry *entry, *swap_entry;
4526 + entry = get_rmap_list_entry(slot, 0, 0);
4527 + for (i = 0; i < slot->pages; ) {
4529 + if (!entry)
4530 + goto skip_whole_pool;
4532 + if (entry_is_new(entry))
4533 + goto next_entry;
4535 + if (is_addr(entry->addr)) {
4536 + entry->addr = 0;
4537 + goto next_entry;
4540 + j = vma_item_index(slot->vma, entry->item);
4541 + if (j == i)
4542 + goto next_entry;
4544 + if (within_same_pool(slot, i, j))
4545 + swap_entry = entry + j - i;
4546 + else
4547 + swap_entry = get_rmap_list_entry(slot, j, 1);
4549 + swap_entries(entry, i, swap_entry, j);
4550 + if (!within_same_pool(slot, i, j))
4551 + put_rmap_list_entry(slot, j);
4552 + continue;
4554 +skip_whole_pool:
4555 + i += PAGE_SIZE / sizeof(*entry);
4556 + if (i < slot->pages)
4557 + entry = get_rmap_list_entry(slot, i, 0);
4558 + continue;
4560 +next_entry:
4561 + if (i >= slot->pages - 1 ||
4562 + !within_same_pool(slot, i, i + 1)) {
4563 + put_rmap_list_entry(slot, i);
4564 + if (i + 1 < slot->pages)
4565 + entry = get_rmap_list_entry(slot, i + 1, 0);
4566 + } else
4567 + entry++;
4568 + i++;
4569 + continue;
4572 + /* free empty pool entries which contain no rmap_item */
4573 + /* CAN be simplified to rely on pool_counts alone once proven bug-free */
4574 + for (i = 0; i < slot->pool_size; i++) {
4575 + unsigned char has_rmap;
4576 + void *addr;
4578 + if (!slot->rmap_list_pool[i])
4579 + continue;
4581 + has_rmap = 0;
4582 + addr = kmap(slot->rmap_list_pool[i]);
4583 + BUG_ON(!addr);
4584 + for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
4585 + entry = (struct rmap_list_entry *)addr + j;
4586 + if (is_addr(entry->addr))
4587 + continue;
4588 + if (!entry->item)
4589 + continue;
4590 + has_rmap = 1;
4592 + kunmap(slot->rmap_list_pool[i]);
4593 + if (!has_rmap) {
4594 + BUG_ON(slot->pool_counts[i]);
4595 + __free_page(slot->rmap_list_pool[i]);
4596 + slot->rmap_list_pool[i] = NULL;
4600 + slot->flags &= ~UKSM_SLOT_NEED_SORT;
4604 + * vma_fully_scanned() - return true if all the pages in this slot have been scanned.
4605 + */
4606 +static inline int vma_fully_scanned(struct vma_slot *slot)
4608 + return slot->pages_scanned == slot->pages;
4611 +/**
4612 + * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to
4613 + * its random permutation. This function also contains the random
4614 + * permutation index management code.
4615 + */
4616 +static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash)
4618 + unsigned long rand_range, addr, swap_index, scan_index;
4619 + struct rmap_item *item = NULL;
4620 + struct rmap_list_entry *scan_entry, *swap_entry = NULL;
4621 + struct page *page;
4623 + scan_index = swap_index = slot->pages_scanned % slot->pages;
4625 + if (pool_entry_boundary(scan_index))
4626 + try_free_last_pool(slot, scan_index - 1);
4628 + if (vma_fully_scanned(slot)) {
4629 + if (slot->flags & UKSM_SLOT_NEED_SORT)
4630 + slot->flags |= UKSM_SLOT_NEED_RERAND;
4631 + else
4632 + slot->flags &= ~UKSM_SLOT_NEED_RERAND;
4633 + if (slot->flags & UKSM_SLOT_NEED_SORT)
4634 + sort_rmap_entry_list(slot);
4637 + scan_entry = get_rmap_list_entry(slot, scan_index, 1);
4638 + if (!scan_entry)
4639 + return NULL;
4641 + if (entry_is_new(scan_entry)) {
4642 + scan_entry->addr = get_index_orig_addr(slot, scan_index);
4643 + set_is_addr(scan_entry->addr);
4646 + if (slot->flags & UKSM_SLOT_NEED_RERAND) {
4647 + rand_range = slot->pages - scan_index;
4648 + BUG_ON(!rand_range);
4649 + swap_index = scan_index + (prandom_u32() % rand_range);
4652 + if (swap_index != scan_index) {
4653 + swap_entry = get_rmap_list_entry(slot, swap_index, 1);
4655 + if (!swap_entry)
4656 + return NULL;
4658 + if (entry_is_new(swap_entry)) {
4659 + swap_entry->addr = get_index_orig_addr(slot,
4660 + swap_index);
4661 + set_is_addr(swap_entry->addr);
4663 + swap_entries(scan_entry, scan_index, swap_entry, swap_index);
4666 + addr = get_entry_address(scan_entry);
4667 + item = get_entry_item(scan_entry);
4668 + BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start);
4670 + page = follow_page(slot->vma, addr, FOLL_GET);
4671 + if (IS_ERR_OR_NULL(page))
4672 + goto nopage;
4674 + if (!PageAnon(page))
4675 + goto putpage;
4677 + /* check whether this is the zero_page pfn or the uksm_zero_page */
4678 + if ((page_to_pfn(page) == zero_pfn)
4679 + || (page_to_pfn(page) == uksm_zero_pfn))
4680 + goto putpage;
4682 + flush_anon_page(slot->vma, page, addr);
4683 + flush_dcache_page(page);
4686 + *hash = page_hash(page, hash_strength, 1);
4687 + inc_uksm_pages_scanned();
4688 + /* if the page content is all zero, re-map it to the zero page */
4689 + if (find_zero_page_hash(hash_strength, *hash)) {
4690 + if (!cmp_and_merge_zero_page(slot->vma, page)) {
4691 + slot->pages_merged++;
4693 + /* For full-zero pages, no need to create rmap item */
4694 + goto putpage;
4695 + } else {
4696 + inc_rshash_neg(memcmp_cost / 2);
4700 + if (!item) {
4701 + item = alloc_rmap_item();
4702 + if (item) {
4703 + /* It has already been zeroed */
4704 + item->slot = slot;
4705 + item->address = addr;
4706 + item->entry_index = scan_index;
4707 + scan_entry->item = item;
4708 + inc_rmap_list_pool_count(slot, scan_index);
4709 + } else
4710 + goto putpage;
4713 + BUG_ON(item->slot != slot);
4714 + /* the page may have changed */
4715 + item->page = page;
4716 + put_rmap_list_entry(slot, scan_index);
4717 + if (swap_entry)
4718 + put_rmap_list_entry(slot, swap_index);
4719 + return item;
4721 +putpage:
4722 + put_page(page);
4723 + page = NULL;
4724 +nopage:
4725 + /* no page, store addr back and free rmap_item if possible */
4726 + free_entry_item(scan_entry);
4727 + put_rmap_list_entry(slot, scan_index);
4728 + if (swap_entry)
4729 + put_rmap_list_entry(slot, swap_index);
4730 + return NULL;
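
get_next_rmap_item() above visits a slot in random order by swapping the current scan index with a randomly chosen later index before visiting it, which is an incremental Fisher-Yates shuffle. A self-contained sketch of just that ordering; the kernel code performs the swap on lazily allocated rmap_list entries rather than a plain array:

    #include <stdio.h>
    #include <stdlib.h>

    static void scan_in_random_order(unsigned long *idx, unsigned long n)
    {
        for (unsigned long i = 0; i < n; i++) {
            unsigned long j = i + (unsigned long)rand() % (n - i);  /* j in [i, n) */
            unsigned long tmp = idx[i];

            idx[i] = idx[j];            /* swap, then visit position i */
            idx[j] = tmp;
            printf("visit page index %lu\n", idx[i]);
        }
    }

    int main(void)
    {
        unsigned long idx[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };

        srand(42);
        scan_in_random_order(idx, 8);
        return 0;
    }
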
4733 +static inline int in_stable_tree(struct rmap_item *rmap_item)
4735 + return rmap_item->address & STABLE_FLAG;
4738 +/**
4739 + * scan_vma_one_page() - scan the next page in a vma_slot. Called with
4740 + * mmap_sem locked.
4741 + */
4742 +static noinline void scan_vma_one_page(struct vma_slot *slot)
4744 + u32 hash;
4745 + struct mm_struct *mm;
4746 + struct rmap_item *rmap_item = NULL;
4747 + struct vm_area_struct *vma = slot->vma;
4749 + mm = vma->vm_mm;
4750 + BUG_ON(!mm);
4751 + BUG_ON(!slot);
4753 + rmap_item = get_next_rmap_item(slot, &hash);
4754 + if (!rmap_item)
4755 + goto out1;
4757 + if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item))
4758 + goto out2;
4760 + cmp_and_merge_page(rmap_item, hash);
4761 +out2:
4762 + put_page(rmap_item->page);
4763 +out1:
4764 + slot->pages_scanned++;
4765 + slot->this_sampled++;
4766 + if (slot->fully_scanned_round != fully_scanned_round)
4767 + scanned_virtual_pages++;
4769 + if (vma_fully_scanned(slot))
4770 + slot->fully_scanned_round = fully_scanned_round;
4773 +static inline unsigned long rung_get_pages(struct scan_rung *rung)
4775 + struct slot_tree_node *node;
4777 + if (!rung->vma_root.rnode)
4778 + return 0;
4780 + node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode);
4782 + return node->size;
4785 +#define RUNG_SAMPLED_MIN 3
4787 +static inline
4788 +void uksm_calc_rung_step(struct scan_rung *rung,
4789 + unsigned long page_time, unsigned long ratio)
4791 + unsigned long sampled, pages;
4793 + /* will be fully scanned ? */
4794 + if (!rung->cover_msecs) {
4795 + rung->step = 1;
4796 + return;
4799 + sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE)
4800 + * ratio / page_time;
4802 + /*
4803 + * Before we finish a scan round and its expensive per-round jobs,
4804 + * we need a chance to estimate the per-page time. So
4805 + * the sampled number cannot be too small.
4806 + */
4807 + if (sampled < RUNG_SAMPLED_MIN)
4808 + sampled = RUNG_SAMPLED_MIN;
4810 + pages = rung_get_pages(rung);
4811 + if (likely(pages > sampled))
4812 + rung->step = pages / sampled;
4813 + else
4814 + rung->step = 1;
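
In other words, the step is chosen so that roughly "sampled" pages are visited per cover period: the period's CPU-time budget divided by the per-page cost, clamped to at least RUNG_SAMPLED_MIN samples. A rough standalone recreation of that arithmetic; TIME_RATIO_SCALE = 1000 and the numbers in main() are assumptions made only for illustration:

    #include <stdio.h>

    #define NSEC_PER_MSEC     1000000UL
    #define TIME_RATIO_SCALE  1000UL    /* assumed scale, illustration only */
    #define RUNG_SAMPLED_MIN  3UL

    static unsigned long calc_step(unsigned long pages, unsigned long cover_msecs,
                                   unsigned long ratio, unsigned long page_time_ns)
    {
        unsigned long sampled;

        if (!cover_msecs)
            return 1;                   /* "cover everything": full scan speed */

        /* CPU-time budget for one cover period, divided by the cost per page */
        sampled = cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE)
                  * ratio / page_time_ns;
        if (sampled < RUNG_SAMPLED_MIN)
            sampled = RUNG_SAMPLED_MIN;

        return pages > sampled ? pages / sampled : 1;
    }

    int main(void)
    {
        /* 100000 pages, 2000 ms cover time, ratio 20/1000 of CPU, 500 ns/page */
        printf("step = %lu\n", calc_step(100000, 2000, 20, 500));
        return 0;
    }
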
4817 +static inline int step_need_recalc(struct scan_rung *rung)
4819 + unsigned long pages, stepmax;
4821 + pages = rung_get_pages(rung);
4822 + stepmax = pages / RUNG_SAMPLED_MIN;
4824 + return pages && (rung->step > pages ||
4825 + (stepmax && rung->step > stepmax));
4828 +static inline
4829 +void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc)
4831 + struct vma_slot *slot;
4833 + if (finished)
4834 + rung->flags |= UKSM_RUNG_ROUND_FINISHED;
4836 + if (step_recalc || step_need_recalc(rung)) {
4837 + uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
4838 + BUG_ON(step_need_recalc(rung));
4841 + slot_iter_index = prandom_u32() % rung->step;
4842 + BUG_ON(!rung->vma_root.rnode);
4843 + slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter);
4844 + BUG_ON(!slot);
4846 + rung->current_scan = slot;
4847 + rung->current_offset = slot_iter_index;
4850 +static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot)
4852 + return &slot->rung->vma_root;
4856 + * Return 1 if the current scan position was reset.
4857 + */
4858 +static int advance_current_scan(struct scan_rung *rung)
4860 + unsigned short n;
4861 + struct vma_slot *slot, *next = NULL;
4863 + BUG_ON(!rung->vma_root.num);
4865 + slot = rung->current_scan;
4866 + n = (slot->pages - rung->current_offset) % rung->step;
4867 + slot_iter_index = rung->step - n;
4868 + next = sradix_tree_next(&rung->vma_root, slot->snode,
4869 + slot->sindex, slot_iter);
4871 + if (next) {
4872 + rung->current_offset = slot_iter_index;
4873 + rung->current_scan = next;
4874 + return 0;
4875 + } else {
4876 + reset_current_scan(rung, 1, 0);
4877 + return 1;
4881 +static inline void rung_rm_slot(struct vma_slot *slot)
4883 + struct scan_rung *rung = slot->rung;
4884 + struct sradix_tree_root *root;
4886 + if (rung->current_scan == slot)
4887 + advance_current_scan(rung);
4889 + root = slot_get_root(slot);
4890 + sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex);
4891 + slot->snode = NULL;
4892 + if (step_need_recalc(rung)) {
4893 + uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
4894 + BUG_ON(step_need_recalc(rung));
4897 + /* In case advance_current_scan() looped back to this slot again */
4898 + if (rung->vma_root.num && rung->current_scan == slot)
4899 + reset_current_scan(slot->rung, 1, 0);
4902 +static inline void rung_add_new_slots(struct scan_rung *rung,
4903 + struct vma_slot **slots, unsigned long num)
4905 + int err;
4906 + struct vma_slot *slot;
4907 + unsigned long i;
4908 + struct sradix_tree_root *root = &rung->vma_root;
4910 + err = sradix_tree_enter(root, (void **)slots, num);
4911 + BUG_ON(err);
4913 + for (i = 0; i < num; i++) {
4914 + slot = slots[i];
4915 + slot->rung = rung;
4916 + BUG_ON(vma_fully_scanned(slot));
4919 + if (rung->vma_root.num == num)
4920 + reset_current_scan(rung, 0, 1);
4923 +static inline int rung_add_one_slot(struct scan_rung *rung,
4924 + struct vma_slot *slot)
4926 + int err;
4928 + err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1);
4929 + if (err)
4930 + return err;
4932 + slot->rung = rung;
4933 + if (rung->vma_root.num == 1)
4934 + reset_current_scan(rung, 0, 1);
4936 + return 0;
4940 + * Return true if the slot is deleted from its rung.
4941 + */
4942 +static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung)
4944 + struct scan_rung *old_rung = slot->rung;
4945 + int err;
4947 + if (old_rung == rung)
4948 + return 0;
4950 + rung_rm_slot(slot);
4951 + err = rung_add_one_slot(rung, slot);
4952 + if (err) {
4953 + err = rung_add_one_slot(old_rung, slot);
4954 + WARN_ON(err); /* OOPS, badly OOM, we lost this slot */
4957 + return 1;
4960 +static inline int vma_rung_up(struct vma_slot *slot)
4962 + struct scan_rung *rung;
4964 + rung = slot->rung;
4965 + if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1])
4966 + rung++;
4968 + return vma_rung_enter(slot, rung);
4971 +static inline int vma_rung_down(struct vma_slot *slot)
4973 + struct scan_rung *rung;
4975 + rung = slot->rung;
4976 + if (slot->rung != &uksm_scan_ladder[0])
4977 + rung--;
4979 + return vma_rung_enter(slot, rung);
4982 +/**
4983 + * cal_dedup_ratio() - Calculate the deduplication ratio for this slot.
4984 + */
4985 +static unsigned long cal_dedup_ratio(struct vma_slot *slot)
4987 + unsigned long ret;
4988 + unsigned long pages;
4990 + pages = slot->this_sampled;
4991 + if (!pages)
4992 + return 0;
4994 + BUG_ON(slot->pages_scanned == slot->last_scanned);
4996 + ret = slot->pages_merged;
4998 + /* Thrashing area filtering */
4999 + if (ret && uksm_thrash_threshold) {
5000 + if (slot->pages_cowed * 100 / slot->pages_merged
5001 + > uksm_thrash_threshold) {
5002 + ret = 0;
5003 + } else {
5004 + ret = slot->pages_merged - slot->pages_cowed;
5008 + return ret * 100 / pages;
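
In plain terms: pages merged minus pages later broken back by COW, as a percentage of the pages sampled this round, with the whole slot zeroed out once its cowed/merged proportion crosses the thrash threshold. A small standalone sketch of that calculation, using made-up numbers:

    #include <stdio.h>

    static unsigned long dedup_ratio(unsigned long merged, unsigned long cowed,
                                     unsigned long sampled,
                                     unsigned long thrash_threshold)
    {
        unsigned long ret = merged;

        if (!sampled)
            return 0;

        if (ret && thrash_threshold) {
            if (cowed * 100 / merged > thrash_threshold)
                ret = 0;                     /* thrashing area: ignore it  */
            else
                ret = merged - cowed;        /* count only the stable gain */
        }
        return ret * 100 / sampled;
    }

    int main(void)
    {
        /* 400 merged, 40 broken back by COW, 1000 pages sampled, threshold 50 */
        printf("%lu%%\n", dedup_ratio(400, 40, 1000, 50));   /* prints 36% */
        return 0;
    }
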
5011 +/**
5012 + * cal_dedup_ratio_old() - Calculate the deduplication ratio for this slot.
5013 + */
5014 +static unsigned long cal_dedup_ratio_old(struct vma_slot *slot)
5016 + unsigned long ret;
5017 + unsigned long pages;
5019 + pages = slot->pages;
5020 + if (!pages)
5021 + return 0;
5023 + ret = slot->pages_bemerged;
5025 + /* Thrashing area filtering */
5026 + if (ret && uksm_thrash_threshold) {
5027 + if (slot->pages_cowed * 100 / slot->pages_bemerged
5028 + > uksm_thrash_threshold) {
5029 + ret = 0;
5030 + } else {
5031 + ret = slot->pages_bemerged - slot->pages_cowed;
5035 + return ret * 100 / pages;
5038 +/**
5039 + * stable_node_reinsert() - When the hash_strength has been adjusted, the
5040 + * stable tree needs to be restructured; this is the function re-inserting the
5041 + * stable node.
5042 + */
5043 +static inline void stable_node_reinsert(struct stable_node *new_node,
5044 + struct page *page,
5045 + struct rb_root *root_treep,
5046 + struct list_head *tree_node_listp,
5047 + u32 hash)
5049 + struct rb_node **new = &root_treep->rb_node;
5050 + struct rb_node *parent = NULL;
5051 + struct stable_node *stable_node;
5052 + struct tree_node *tree_node;
5053 + struct page *tree_page;
5054 + int cmp;
5056 + while (*new) {
5057 + int cmp;
5059 + tree_node = rb_entry(*new, struct tree_node, node);
5061 + cmp = hash_cmp(hash, tree_node->hash);
5063 + if (cmp < 0) {
5064 + parent = *new;
5065 + new = &parent->rb_left;
5066 + } else if (cmp > 0) {
5067 + parent = *new;
5068 + new = &parent->rb_right;
5069 + } else
5070 + break;
5073 + if (*new) {
5074 + /* found a stable tree node with the same first-level hash value */
5075 + stable_node_hash_max(new_node, page, hash);
5076 + if (tree_node->count == 1) {
5077 + stable_node = rb_entry(tree_node->sub_root.rb_node,
5078 + struct stable_node, node);
5079 + tree_page = get_uksm_page(stable_node, 1, 0);
5080 + if (tree_page) {
5081 + stable_node_hash_max(stable_node,
5082 + tree_page, hash);
5083 + put_page(tree_page);
5085 + /* prepare for stable node insertion */
5087 + cmp = hash_cmp(new_node->hash_max,
5088 + stable_node->hash_max);
5089 + parent = &stable_node->node;
5090 + if (cmp < 0)
5091 + new = &parent->rb_left;
5092 + else if (cmp > 0)
5093 + new = &parent->rb_right;
5094 + else
5095 + goto failed;
5097 + goto add_node;
5098 + } else {
5099 + /* the only stable_node was deleted, but the tree node
5100 + * was not.
5101 + */
5102 + goto tree_node_reuse;
5106 + /* well, search the collision subtree */
5107 + new = &tree_node->sub_root.rb_node;
5108 + parent = NULL;
5109 + BUG_ON(!*new);
5110 + while (*new) {
5111 + int cmp;
5113 + stable_node = rb_entry(*new, struct stable_node, node);
5115 + cmp = hash_cmp(new_node->hash_max,
5116 + stable_node->hash_max);
5118 + if (cmp < 0) {
5119 + parent = *new;
5120 + new = &parent->rb_left;
5121 + } else if (cmp > 0) {
5122 + parent = *new;
5123 + new = &parent->rb_right;
5124 + } else {
5125 + /* oh, no, still a collision */
5126 + goto failed;
5130 + goto add_node;
5133 + /* no tree node found */
5134 + tree_node = alloc_tree_node(tree_node_listp);
5135 + if (!tree_node) {
5136 + pr_err("UKSM: memory allocation error!\n");
5137 + goto failed;
5138 + } else {
5139 + tree_node->hash = hash;
5140 + rb_link_node(&tree_node->node, parent, new);
5141 + rb_insert_color(&tree_node->node, root_treep);
5143 +tree_node_reuse:
5144 + /* prepare for stable node insertion */
5145 + parent = NULL;
5146 + new = &tree_node->sub_root.rb_node;
5149 +add_node:
5150 + rb_link_node(&new_node->node, parent, new);
5151 + rb_insert_color(&new_node->node, &tree_node->sub_root);
5152 + new_node->tree_node = tree_node;
5153 + tree_node->count++;
5154 + return;
5156 +failed:
5157 + /* This can only happen when two nodes have collided
5158 + * at both levels.
5159 + */
5160 + new_node->tree_node = NULL;
5161 + return;
5164 +static inline void free_all_tree_nodes(struct list_head *list)
5166 + struct tree_node *node, *tmp;
5168 + list_for_each_entry_safe(node, tmp, list, all_list) {
5169 + free_tree_node(node);
5173 +/**
5174 + * stable_tree_delta_hash() - Delta hash the stable tree from previous hash
5175 + * strength to the current hash_strength. It re-structures the whole tree.
5176 + */
5177 +static inline void stable_tree_delta_hash(u32 prev_hash_strength)
5179 + struct stable_node *node, *tmp;
5180 + struct rb_root *root_new_treep;
5181 + struct list_head *new_tree_node_listp;
5183 + stable_tree_index = (stable_tree_index + 1) % 2;
5184 + root_new_treep = &root_stable_tree[stable_tree_index];
5185 + new_tree_node_listp = &stable_tree_node_list[stable_tree_index];
5186 + *root_new_treep = RB_ROOT;
5187 + BUG_ON(!list_empty(new_tree_node_listp));
5189 + /*
5190 + * we need to be safe, the node could be removed by get_uksm_page()
5191 + */
5192 + list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) {
5193 + void *addr;
5194 + struct page *node_page;
5195 + u32 hash;
5197 + /*
5198 + * We are completely re-structuring the stable nodes to a new
5199 + * stable tree. We don't bother unlinking from the old tree or touching the
5200 + * old tree_nodes; they will all be freed at once.
5201 + */
5202 + node_page = get_uksm_page(node, 0, 0);
5203 + if (!node_page)
5204 + continue;
5206 + if (node->tree_node) {
5207 + hash = node->tree_node->hash;
5209 + addr = kmap_atomic(node_page);
5211 + hash = delta_hash(addr, prev_hash_strength,
5212 + hash_strength, hash);
5213 + kunmap_atomic(addr);
5214 + } else {
5215 + /*
5216 + * it was not inserted into the rbtree due to a collision in the
5217 + * last scan round.
5218 + */
5219 + hash = page_hash(node_page, hash_strength, 0);
5222 + stable_node_reinsert(node, node_page, root_new_treep,
5223 + new_tree_node_listp, hash);
5224 + put_page(node_page);
5227 + root_stable_treep = root_new_treep;
5228 + free_all_tree_nodes(stable_tree_node_listp);
5229 + BUG_ON(!list_empty(stable_tree_node_listp));
5230 + stable_tree_node_listp = new_tree_node_listp;
5233 +static inline void inc_hash_strength(unsigned long delta)
5235 + hash_strength += 1 << delta;
5236 + if (hash_strength > HASH_STRENGTH_MAX)
5237 + hash_strength = HASH_STRENGTH_MAX;
5240 +static inline void dec_hash_strength(unsigned long delta)
5242 + unsigned long change = 1 << delta;
5244 + if (hash_strength <= change + 1)
5245 + hash_strength = 1;
5246 + else
5247 + hash_strength -= change;
5250 +static inline void inc_hash_strength_delta(void)
5252 + hash_strength_delta++;
5253 + if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX)
5254 + hash_strength_delta = HASH_STRENGTH_DELTA_MAX;
5257 +static inline unsigned long get_current_neg_ratio(void)
5259 + u64 pos = benefit.pos;
5260 + u64 neg = benefit.neg;
5262 + if (!neg)
5263 + return 0;
5265 + if (!pos || neg > pos)
5266 + return 100;
5268 + if (neg > div64_u64(U64_MAX, 100))
5269 + pos = div64_u64(pos, 100);
5270 + else
5271 + neg *= 100;
5273 + return div64_u64(neg, pos);
5276 +static inline unsigned long get_current_benefit(void)
5278 + u64 pos = benefit.pos;
5279 + u64 neg = benefit.neg;
5280 + u64 scanned = benefit.scanned;
5282 + if (neg > pos)
5283 + return 0;
5285 + return div64_u64((pos - neg), scanned);
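
Both helpers reduce the accumulated benefit counters to simple numbers: the negative ratio is neg as a percentage of pos (rescaling one side when neg * 100 would overflow 64 bits), and the benefit is the net gain per page scanned. A userspace sketch of the same arithmetic, with plain 64-bit division standing in for div64_u64():

    #include <stdint.h>
    #include <stdio.h>

    static unsigned long neg_ratio(uint64_t pos, uint64_t neg)
    {
        if (!neg)
            return 0;
        if (!pos || neg > pos)
            return 100;
        if (neg > UINT64_MAX / 100)     /* neg * 100 would overflow:         */
            pos /= 100;                 /* scale the other side down instead */
        else
            neg *= 100;
        return (unsigned long)(neg / pos);
    }

    static uint64_t benefit_per_page(uint64_t pos, uint64_t neg, uint64_t scanned)
    {
        if (neg > pos || !scanned)
            return 0;
        return (pos - neg) / scanned;
    }

    int main(void)
    {
        printf("neg ratio = %lu%%\n", neg_ratio(1000, 150));             /* 15% */
        printf("benefit   = %llu\n",
               (unsigned long long)benefit_per_page(1000, 150, 50));     /* 17  */
        return 0;
    }
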
5288 +static inline int judge_rshash_direction(void)
5290 + u64 current_neg_ratio, stable_benefit;
5291 + u64 current_benefit, delta = 0;
5292 + int ret = STILL;
5294 + /*
5295 + * Try to probe a value shortly after boot, and again in case the system
5296 + * stays still for a long time.
5297 + */
5298 + if ((fully_scanned_round & 0xFFULL) == 10) {
5299 + ret = OBSCURE;
5300 + goto out;
5303 + current_neg_ratio = get_current_neg_ratio();
5305 + if (current_neg_ratio == 0) {
5306 + rshash_neg_cont_zero++;
5307 + if (rshash_neg_cont_zero > 2)
5308 + return GO_DOWN;
5309 + else
5310 + return STILL;
5312 + rshash_neg_cont_zero = 0;
5314 + if (current_neg_ratio > 90) {
5315 + ret = GO_UP;
5316 + goto out;
5319 + current_benefit = get_current_benefit();
5320 + stable_benefit = rshash_state.stable_benefit;
5322 + if (!stable_benefit) {
5323 + ret = OBSCURE;
5324 + goto out;
5327 + if (current_benefit > stable_benefit)
5328 + delta = current_benefit - stable_benefit;
5329 + else if (current_benefit < stable_benefit)
5330 + delta = stable_benefit - current_benefit;
5332 + delta = div64_u64(100 * delta, stable_benefit);
5334 + if (delta > 50) {
5335 + rshash_cont_obscure++;
5336 + if (rshash_cont_obscure > 2)
5337 + return OBSCURE;
5338 + else
5339 + return STILL;
5342 +out:
5343 + rshash_cont_obscure = 0;
5344 + return ret;
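
Stripped of its probing phases, the controller that consumes these verdicts is a hill climber: keep moving hash_strength in the current direction while the measured benefit improves, and reverse (resetting the growing step) once it stops paying off. A toy sketch of that core idea only; the real rshash_adjust() below additionally runs explicit TRYDOWN/TRYUP probes around a remembered stable point:

    #include <stdio.h>

    static unsigned long strength = 8;
    static unsigned long step = 1;
    static int direction = +1;

    static void adjust(unsigned long prev_benefit, unsigned long cur_benefit)
    {
        if (cur_benefit < prev_benefit) {
            direction = -direction;     /* got worse: turn around        */
            step = 1;                   /* and restart with a small step */
        } else {
            step *= 2;                  /* still improving: accelerate   */
        }

        if (direction > 0)
            strength += step;
        else
            strength = strength > step ? strength - step : 1;
    }

    int main(void)
    {
        adjust(100, 120);   /* better -> keep climbing, step grows to 2 */
        adjust(120, 90);    /* worse  -> reverse, step resets to 1      */
        printf("strength=%lu step=%lu dir=%+d\n", strength, step, direction);
        return 0;
    }
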
5347 +/**
5348 + * rshash_adjust() - The main function to control the random sampling state
5349 + * machine for hash strength adaptation.
5351 + * Return true if hash_strength has changed.
5352 + */
5353 +static inline int rshash_adjust(void)
5355 + unsigned long prev_hash_strength = hash_strength;
5357 + if (!encode_benefit())
5358 + return 0;
5360 + switch (rshash_state.state) {
5361 + case RSHASH_STILL:
5362 + switch (judge_rshash_direction()) {
5363 + case GO_UP:
5364 + if (rshash_state.pre_direct == GO_DOWN)
5365 + hash_strength_delta = 0;
5367 + inc_hash_strength(hash_strength_delta);
5368 + inc_hash_strength_delta();
5369 + rshash_state.stable_benefit = get_current_benefit();
5370 + rshash_state.pre_direct = GO_UP;
5371 + break;
5373 + case GO_DOWN:
5374 + if (rshash_state.pre_direct == GO_UP)
5375 + hash_strength_delta = 0;
5377 + dec_hash_strength(hash_strength_delta);
5378 + inc_hash_strength_delta();
5379 + rshash_state.stable_benefit = get_current_benefit();
5380 + rshash_state.pre_direct = GO_DOWN;
5381 + break;
5383 + case OBSCURE:
5384 + rshash_state.stable_point = hash_strength;
5385 + rshash_state.turn_point_down = hash_strength;
5386 + rshash_state.turn_point_up = hash_strength;
5387 + rshash_state.turn_benefit_down = get_current_benefit();
5388 + rshash_state.turn_benefit_up = get_current_benefit();
5389 + rshash_state.lookup_window_index = 0;
5390 + rshash_state.state = RSHASH_TRYDOWN;
5391 + dec_hash_strength(hash_strength_delta);
5392 + inc_hash_strength_delta();
5393 + break;
5395 + case STILL:
5396 + break;
5397 + default:
5398 + BUG();
5400 + break;
5402 + case RSHASH_TRYDOWN:
5403 + if (rshash_state.lookup_window_index++ % 5 == 0)
5404 + rshash_state.below_count = 0;
5406 + if (get_current_benefit() < rshash_state.stable_benefit)
5407 + rshash_state.below_count++;
5408 + else if (get_current_benefit() >
5409 + rshash_state.turn_benefit_down) {
5410 + rshash_state.turn_point_down = hash_strength;
5411 + rshash_state.turn_benefit_down = get_current_benefit();
5414 + if (rshash_state.below_count >= 3 ||
5415 + judge_rshash_direction() == GO_UP ||
5416 + hash_strength == 1) {
5417 + hash_strength = rshash_state.stable_point;
5418 + hash_strength_delta = 0;
5419 + inc_hash_strength(hash_strength_delta);
5420 + inc_hash_strength_delta();
5421 + rshash_state.lookup_window_index = 0;
5422 + rshash_state.state = RSHASH_TRYUP;
5423 + hash_strength_delta = 0;
5424 + } else {
5425 + dec_hash_strength(hash_strength_delta);
5426 + inc_hash_strength_delta();
5428 + break;
5430 + case RSHASH_TRYUP:
5431 + if (rshash_state.lookup_window_index++ % 5 == 0)
5432 + rshash_state.below_count = 0;
5434 + if (get_current_benefit() < rshash_state.turn_benefit_down)
5435 + rshash_state.below_count++;
5436 + else if (get_current_benefit() > rshash_state.turn_benefit_up) {
5437 + rshash_state.turn_point_up = hash_strength;
5438 + rshash_state.turn_benefit_up = get_current_benefit();
5441 + if (rshash_state.below_count >= 3 ||
5442 + judge_rshash_direction() == GO_DOWN ||
5443 + hash_strength == HASH_STRENGTH_MAX) {
5444 + hash_strength = rshash_state.turn_benefit_up >
5445 + rshash_state.turn_benefit_down ?
5446 + rshash_state.turn_point_up :
5447 + rshash_state.turn_point_down;
5449 + rshash_state.state = RSHASH_PRE_STILL;
5450 + } else {
5451 + inc_hash_strength(hash_strength_delta);
5452 + inc_hash_strength_delta();
5455 + break;
5457 + case RSHASH_NEW:
5458 + case RSHASH_PRE_STILL:
5459 + rshash_state.stable_benefit = get_current_benefit();
5460 + rshash_state.state = RSHASH_STILL;
5461 + hash_strength_delta = 0;
5462 + break;
5463 + default:
5464 + BUG();
5467 + /* rshash_neg = rshash_pos = 0; */
5468 + reset_benefit();
5470 + if (prev_hash_strength != hash_strength)
5471 + stable_tree_delta_hash(prev_hash_strength);
5473 + return prev_hash_strength != hash_strength;
5476 +/**
5477 + * round_update_ladder() - The main function that updates all the
5478 + * adjustments whenever a scan round is finished.
5479 + */
5480 +static noinline void round_update_ladder(void)
5482 + int i;
5483 + unsigned long dedup;
5484 + struct vma_slot *slot, *tmp_slot;
5486 + for (i = 0; i < SCAN_LADDER_SIZE; i++)
5487 + uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED;
5489 + list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) {
5491 + /* the slot may have been removed by rung_rm_slot() when its mm exited */
5492 + if (slot->snode) {
5493 + dedup = cal_dedup_ratio_old(slot);
5494 + if (dedup && dedup >= uksm_abundant_threshold)
5495 + vma_rung_up(slot);
5498 + slot->pages_bemerged = 0;
5499 + slot->pages_cowed = 0;
5501 + list_del_init(&slot->dedup_list);
5505 +static void uksm_del_vma_slot(struct vma_slot *slot)
5507 + int i, j;
5508 + struct rmap_list_entry *entry;
5510 + if (slot->snode) {
5511 + /*
5512 + * If the slot failed when entering the rung, snode is NULL and
5513 + * removal is not necessary.
5514 + */
5515 + rung_rm_slot(slot);
5518 + if (!list_empty(&slot->dedup_list))
5519 + list_del(&slot->dedup_list);
5521 + if (!slot->rmap_list_pool || !slot->pool_counts) {
5522 + /* In case it OOMed in uksm_vma_enter() */
5523 + goto out;
5526 + for (i = 0; i < slot->pool_size; i++) {
5527 + void *addr;
5529 + if (!slot->rmap_list_pool[i])
5530 + continue;
5532 + addr = kmap(slot->rmap_list_pool[i]);
5533 + for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
5534 + entry = (struct rmap_list_entry *)addr + j;
5535 + if (is_addr(entry->addr))
5536 + continue;
5537 + if (!entry->item)
5538 + continue;
5540 + remove_rmap_item_from_tree(entry->item);
5541 + free_rmap_item(entry->item);
5542 + slot->pool_counts[i]--;
5544 + BUG_ON(slot->pool_counts[i]);
5545 + kunmap(slot->rmap_list_pool[i]);
5546 + __free_page(slot->rmap_list_pool[i]);
5548 + kfree(slot->rmap_list_pool);
5549 + kfree(slot->pool_counts);
5551 +out:
5552 + slot->rung = NULL;
5553 + if (slot->flags & UKSM_SLOT_IN_UKSM) {
5554 + BUG_ON(uksm_pages_total < slot->pages);
5555 + uksm_pages_total -= slot->pages;
5558 + if (slot->fully_scanned_round == fully_scanned_round)
5559 + scanned_virtual_pages -= slot->pages;
5560 + else
5561 + scanned_virtual_pages -= slot->pages_scanned;
5562 + free_vma_slot(slot);
5566 +#define SPIN_LOCK_PERIOD 32
5567 +static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD];
5568 +static inline void cleanup_vma_slots(void)
5570 + struct vma_slot *slot;
5571 + int i;
5573 + i = 0;
5574 + spin_lock(&vma_slot_list_lock);
5575 + while (!list_empty(&vma_slot_del)) {
5576 + slot = list_entry(vma_slot_del.next,
5577 + struct vma_slot, slot_list);
5578 + list_del(&slot->slot_list);
5579 + cleanup_slots[i++] = slot;
5580 + if (i == SPIN_LOCK_PERIOD) {
5581 + spin_unlock(&vma_slot_list_lock);
5582 + while (--i >= 0)
5583 + uksm_del_vma_slot(cleanup_slots[i]);
5584 + i = 0;
5585 + spin_lock(&vma_slot_list_lock);
5588 + spin_unlock(&vma_slot_list_lock);
5590 + while (--i >= 0)
5591 + uksm_del_vma_slot(cleanup_slots[i]);
5595 + * Exponential moving average formula
5596 + */
5597 +static inline unsigned long ema(unsigned long curr, unsigned long last_ema)
5599 + /*
5600 + * For a very high burst, even the ema cannot work well: a falsely very
5601 + * high per-page time estimate can feed back into very high
5602 + * overhead of context switches and rung updates -- this then leads
5603 + * to an even higher per-page time, which may not converge.
5605 + * Instead, we try to approach this value in a binary manner.
5606 + */
5607 + if (curr > last_ema * 10)
5608 + return last_ema * 2;
5610 + return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100;
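
A standalone rendering of the smoothing rule: new = (alpha * curr + (100 - alpha) * last) / 100, with the burst clamp limiting any single sample to at most doubling the estimate. EMA_ALPHA = 20 is an assumed weight chosen only for this illustration:

    #include <stdio.h>

    #define EMA_ALPHA 20    /* assumed weight, illustration only */

    static unsigned long ema(unsigned long curr, unsigned long last)
    {
        if (curr > last * 10)            /* burst clamp */
            return last * 2;
        return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last) / 100;
    }

    int main(void)
    {
        unsigned long t = 500;           /* say, ns per page */

        t = ema(600, t);                 /* gentle update -> 520  */
        t = ema(100000, t);              /* huge burst    -> 1040 */
        printf("%lu\n", t);
        return 0;
    }
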
5614 + * convert a cpu ratio (in units of 1/TIME_RATIO_SCALE, configured by the user)
5615 + * to nanoseconds based on the current uksm_sleep_jiffies.
5616 + */
5617 +static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio)
5619 + return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) /
5620 + (TIME_RATIO_SCALE - ratio) * ratio;
5624 +static inline unsigned long rung_real_ratio(int cpu_time_ratio)
5626 + unsigned long ret;
5628 + BUG_ON(!cpu_time_ratio);
5630 + if (cpu_time_ratio > 0)
5631 + ret = cpu_time_ratio;
5632 + else
5633 + ret = (unsigned long)(-cpu_time_ratio) *
5634 + uksm_max_cpu_percentage / 100UL;
5636 + return ret ? ret : 1;
5639 +static noinline void uksm_calc_scan_pages(void)
5641 + struct scan_rung *ladder = uksm_scan_ladder;
5642 + unsigned long sleep_usecs, nsecs;
5643 + unsigned long ratio;
5644 + int i;
5645 + unsigned long per_page;
5647 + if (uksm_ema_page_time > 100000 ||
5648 + (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL))
5649 + uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
5651 + per_page = uksm_ema_page_time;
5652 + BUG_ON(!per_page);
5654 + /*
5655 + * Every 8 eval rounds, we try to probe a uksm_sleep_jiffies value
5656 + * based on the saved user input.
5657 + */
5658 + if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL)
5659 + uksm_sleep_jiffies = uksm_sleep_saved;
5661 + /* We require that a rung scans at least 1 page in a period. */
5662 + nsecs = per_page;
5663 + ratio = rung_real_ratio(ladder[0].cpu_ratio);
5664 + if (cpu_ratio_to_nsec(ratio) < nsecs) {
5665 + sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio
5666 + / NSEC_PER_USEC;
5667 + uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1;
5670 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
5671 + ratio = rung_real_ratio(ladder[i].cpu_ratio);
5672 + ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) /
5673 + per_page;
5674 + BUG_ON(!ladder[i].pages_to_scan);
5675 + uksm_calc_rung_step(&ladder[i], per_page, ratio);
5680 + * Convert the scan time of this round (ns) to the next expected minimum sleep
5681 + * time (ms), being careful of possible overflows. ratio is taken from
5682 + * rung_real_ratio()
5683 + */
5684 +static inline
5685 +unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio)
5687 + scan_time >>= 20; /* to msec level now */
5688 + BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE));
5690 + return (unsigned int) ((unsigned long) scan_time *
5691 + (TIME_RATIO_SCALE - ratio) / ratio);
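
This conversion is the complement of cpu_ratio_to_nsec(): having just spent scan_time at an allowed CPU share of ratio/TIME_RATIO_SCALE, the scanner should now sleep roughly scan_time * (TIME_RATIO_SCALE - ratio) / ratio. A quick numeric sketch, again assuming TIME_RATIO_SCALE = 1000 purely for illustration:

    #include <stdio.h>

    #define TIME_RATIO_SCALE 1000UL     /* assumed scale, illustration only */

    /* scan_time in ns, result in ms; ">> 20" mirrors the coarse ns-to-ms step */
    static unsigned long scan_time_to_sleep_ms(unsigned long long scan_time_ns,
                                               unsigned long ratio)
    {
        unsigned long long scan_ms = scan_time_ns >> 20;

        return (unsigned long)(scan_ms * (TIME_RATIO_SCALE - ratio) / ratio);
    }

    int main(void)
    {
        /* 50 ms of scanning at a 5% CPU budget (ratio 50 out of 1000) */
        printf("sleep ~%lu ms\n",
               scan_time_to_sleep_ms(50ULL * 1000 * 1000, 50));
        return 0;
    }
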
5694 +#define __round_mask(x, y) ((__typeof__(x))((y)-1))
5695 +#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
5697 +static void uksm_vma_enter(struct vma_slot **slots, unsigned long num)
5699 + struct scan_rung *rung;
5701 + rung = &uksm_scan_ladder[0];
5702 + rung_add_new_slots(rung, slots, num);
5705 +static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE];
5707 +static void uksm_enter_all_slots(void)
5709 + struct vma_slot *slot;
5710 + unsigned long index;
5711 + struct list_head empty_vma_list;
5712 + int i;
5714 + i = 0;
5715 + index = 0;
5716 + INIT_LIST_HEAD(&empty_vma_list);
5718 + spin_lock(&vma_slot_list_lock);
5719 + while (!list_empty(&vma_slot_new)) {
5720 + slot = list_entry(vma_slot_new.next,
5721 + struct vma_slot, slot_list);
5723 + if (!slot->vma->anon_vma) {
5724 + list_move(&slot->slot_list, &empty_vma_list);
5725 + } else if (vma_can_enter(slot->vma)) {
5726 + batch_slots[index++] = slot;
5727 + list_del_init(&slot->slot_list);
5728 + } else {
5729 + list_move(&slot->slot_list, &vma_slot_noadd);
5732 + if (++i == SPIN_LOCK_PERIOD ||
5733 + (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) {
5734 + spin_unlock(&vma_slot_list_lock);
5736 + if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) {
5737 + uksm_vma_enter(batch_slots, index);
5738 + index = 0;
5740 + i = 0;
5741 + cond_resched();
5742 + spin_lock(&vma_slot_list_lock);
5746 + list_splice(&empty_vma_list, &vma_slot_new);
5748 + spin_unlock(&vma_slot_list_lock);
5750 + if (index)
5751 + uksm_vma_enter(batch_slots, index);
5755 +static inline int rung_round_finished(struct scan_rung *rung)
5757 + return rung->flags & UKSM_RUNG_ROUND_FINISHED;
5760 +static inline void judge_slot(struct vma_slot *slot)
5762 + struct scan_rung *rung = slot->rung;
5763 + unsigned long dedup;
5764 + int deleted;
5766 + dedup = cal_dedup_ratio(slot);
5767 + if (vma_fully_scanned(slot) && uksm_thrash_threshold)
5768 + deleted = vma_rung_enter(slot, &uksm_scan_ladder[0]);
5769 + else if (dedup && dedup >= uksm_abundant_threshold)
5770 + deleted = vma_rung_up(slot);
5771 + else
5772 + deleted = vma_rung_down(slot);
5774 + slot->pages_merged = 0;
5775 + slot->pages_cowed = 0;
5776 + slot->this_sampled = 0;
5778 + if (vma_fully_scanned(slot))
5779 + slot->pages_scanned = 0;
5781 + slot->last_scanned = slot->pages_scanned;
5783 + /* If it was deleted above, then the rung was already advanced. */
5784 + if (!deleted)
5785 + advance_current_scan(rung);
5789 +static inline int hash_round_finished(void)
5791 + if (scanned_virtual_pages > (uksm_pages_total >> 2)) {
5792 + scanned_virtual_pages = 0;
5793 + if (uksm_pages_scanned)
5794 + fully_scanned_round++;
5796 + return 1;
5797 + } else {
5798 + return 0;
5802 +#define UKSM_MMSEM_BATCH 5
5803 +#define BUSY_RETRY 100
5805 +/**
5806 + * uksm_do_scan() - the main worker function.
5807 + */
5808 +static noinline void uksm_do_scan(void)
5810 + struct vma_slot *slot, *iter;
5811 + struct mm_struct *busy_mm;
5812 + unsigned char round_finished, all_rungs_emtpy;
5813 + int i, err, mmsem_batch;
5814 + unsigned long pcost;
5815 + long long delta_exec;
5816 + unsigned long vpages, max_cpu_ratio;
5817 + unsigned long long start_time, end_time, scan_time;
5818 + unsigned int expected_jiffies;
5820 + might_sleep();
5822 + vpages = 0;
5824 + start_time = task_sched_runtime(current);
5825 + max_cpu_ratio = 0;
5826 + mmsem_batch = 0;
5828 + for (i = 0; i < SCAN_LADDER_SIZE;) {
5829 + struct scan_rung *rung = &uksm_scan_ladder[i];
5830 + unsigned long ratio;
5831 + int busy_retry;
5833 + if (!rung->pages_to_scan) {
5834 + i++;
5835 + continue;
5838 + if (!rung->vma_root.num) {
5839 + rung->pages_to_scan = 0;
5840 + i++;
5841 + continue;
5844 + ratio = rung_real_ratio(rung->cpu_ratio);
5845 + if (ratio > max_cpu_ratio)
5846 + max_cpu_ratio = ratio;
5848 + busy_retry = BUSY_RETRY;
5849 + /*
5850 + * Do not consider rung_round_finished() here, just use up the
5851 + * rung->pages_to_scan quota.
5852 + */
5853 + while (rung->pages_to_scan && rung->vma_root.num &&
5854 + likely(!freezing(current))) {
5855 + int reset = 0;
5857 + slot = rung->current_scan;
5859 + BUG_ON(vma_fully_scanned(slot));
5861 + if (mmsem_batch)
5862 + err = 0;
5863 + else
5864 + err = try_down_read_slot_mmap_sem(slot);
5866 + if (err == -ENOENT) {
5867 +rm_slot:
5868 + rung_rm_slot(slot);
5869 + continue;
5872 + busy_mm = slot->mm;
5874 + if (err == -EBUSY) {
5875 + /* skip other vmas on the same mm */
5876 + do {
5877 + reset = advance_current_scan(rung);
5878 + iter = rung->current_scan;
5879 + busy_retry--;
5880 + if (iter->vma->vm_mm != busy_mm ||
5881 + !busy_retry || reset)
5882 + break;
5883 + } while (1);
5885 + if (iter->vma->vm_mm != busy_mm) {
5886 + continue;
5887 + } else {
5888 + /* scan round finished */
5889 + break;
5893 + BUG_ON(!vma_can_enter(slot->vma));
5894 + if (uksm_test_exit(slot->vma->vm_mm)) {
5895 + mmsem_batch = 0;
5896 + mmap_read_unlock(slot->vma->vm_mm);
5897 + goto rm_slot;
5900 + if (mmsem_batch)
5901 + mmsem_batch--;
5902 + else
5903 + mmsem_batch = UKSM_MMSEM_BATCH;
5905 + /* Ok, we have taken the mmap_sem, ready to scan */
5906 + scan_vma_one_page(slot);
5907 + rung->pages_to_scan--;
5908 + vpages++;
5910 + if (rung->current_offset + rung->step > slot->pages - 1
5911 + || vma_fully_scanned(slot)) {
5912 + mmap_read_unlock(slot->vma->vm_mm);
5913 + judge_slot(slot);
5914 + mmsem_batch = 0;
5915 + } else {
5916 + rung->current_offset += rung->step;
5917 + if (!mmsem_batch)
5918 + mmap_read_unlock(slot->vma->vm_mm);
5921 + busy_retry = BUSY_RETRY;
5922 + cond_resched();
5925 + if (mmsem_batch) {
5926 + mmap_read_unlock(slot->vma->vm_mm);
5927 + mmsem_batch = 0;
5930 + if (freezing(current))
5931 + break;
5933 + cond_resched();
5935 + end_time = task_sched_runtime(current);
5936 + delta_exec = end_time - start_time;
5938 + if (freezing(current))
5939 + return;
5941 + cleanup_vma_slots();
5942 + uksm_enter_all_slots();
5944 + round_finished = 1;
5945 + all_rungs_emtpy = 1;
5946 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
5947 + struct scan_rung *rung = &uksm_scan_ladder[i];
5949 + if (rung->vma_root.num) {
5950 + all_rungs_emtpy = 0;
5951 + if (!rung_round_finished(rung))
5952 + round_finished = 0;
5956 + if (all_rungs_emtpy)
5957 + round_finished = 0;
5959 + if (round_finished) {
5960 + round_update_ladder();
5961 + uksm_eval_round++;
5963 + if (hash_round_finished() && rshash_adjust()) {
5964 + /* Reset the unstable root iff hash strength changed */
5965 + uksm_hash_round++;
5966 + root_unstable_tree = RB_ROOT;
5967 + free_all_tree_nodes(&unstable_tree_node_list);
5970 + /*
5971 + * A number of pages can hang around indefinitely on per-cpu
5972 + * pagevecs, raised page count preventing write_protect_page
5973 + * from merging them. Though it doesn't really matter much,
5974 + * it is puzzling to see some stuck in pages_volatile until
5975 + * other activity jostles them out, and they also prevented
5976 + * LTP's KSM test from succeeding deterministically; so drain
5977 + * them here (here rather than on entry to uksm_do_scan(),
5978 + * so we don't IPI too often when pages_to_scan is set low).
5979 + */
5980 + lru_add_drain_all();
5984 + if (vpages && delta_exec > 0) {
5985 + pcost = (unsigned long) delta_exec / vpages;
5986 + if (likely(uksm_ema_page_time))
5987 + uksm_ema_page_time = ema(pcost, uksm_ema_page_time);
5988 + else
5989 + uksm_ema_page_time = pcost;
5992 + uksm_calc_scan_pages();
5993 + uksm_sleep_real = uksm_sleep_jiffies;
5994 + /* in case of radical cpu bursts, apply the upper bound */
5995 + end_time = task_sched_runtime(current);
5996 + if (max_cpu_ratio && end_time > start_time) {
5997 + scan_time = end_time - start_time;
5998 + expected_jiffies = msecs_to_jiffies(
5999 + scan_time_to_sleep(scan_time, max_cpu_ratio));
6001 + if (expected_jiffies > uksm_sleep_real)
6002 + uksm_sleep_real = expected_jiffies;
6004 + /* We have a 1 second upper bound for responsiveness. */
6005 + if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC)
6006 + uksm_sleep_real = msecs_to_jiffies(1000);
6009 + return;
6012 +static int ksmd_should_run(void)
6014 + return uksm_run & UKSM_RUN_MERGE;
6017 +static int uksm_scan_thread(void *nothing)
6019 + set_freezable();
6020 + set_user_nice(current, 5);
6022 + while (!kthread_should_stop()) {
6023 + mutex_lock(&uksm_thread_mutex);
6024 + if (ksmd_should_run())
6025 + uksm_do_scan();
6026 + mutex_unlock(&uksm_thread_mutex);
6028 + try_to_freeze();
6030 + if (ksmd_should_run()) {
6031 + schedule_timeout_interruptible(uksm_sleep_real);
6032 + uksm_sleep_times++;
6033 + } else {
6034 + wait_event_freezable(uksm_thread_wait,
6035 + ksmd_should_run() || kthread_should_stop());
6038 + return 0;
6041 +void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
6043 + struct stable_node *stable_node;
6044 + struct node_vma *node_vma;
6045 + struct rmap_item *rmap_item;
6046 + int search_new_forks = 0;
6047 + unsigned long address;
6049 + VM_BUG_ON_PAGE(!PageKsm(page), page);
6050 + VM_BUG_ON_PAGE(!PageLocked(page), page);
6052 + stable_node = page_stable_node(page);
6053 + if (!stable_node)
6054 + return;
6055 +again:
6056 + hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) {
6057 + hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) {
6058 + struct anon_vma *anon_vma = rmap_item->anon_vma;
6059 + struct anon_vma_chain *vmac;
6060 + struct vm_area_struct *vma;
6062 + cond_resched();
6063 + anon_vma_lock_read(anon_vma);
6064 + anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
6065 + 0, ULONG_MAX) {
6066 + cond_resched();
6067 + vma = vmac->vma;
6068 + address = get_rmap_addr(rmap_item);
6070 + if (address < vma->vm_start ||
6071 + address >= vma->vm_end)
6072 + continue;
6074 + if ((rmap_item->slot->vma == vma) ==
6075 + search_new_forks)
6076 + continue;
6078 + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
6079 + continue;
6081 + if (!rwc->rmap_one(page, vma, address, rwc->arg)) {
6082 + anon_vma_unlock_read(anon_vma);
6083 + return;
6086 + if (rwc->done && rwc->done(page)) {
6087 + anon_vma_unlock_read(anon_vma);
6088 + return;
6091 + anon_vma_unlock_read(anon_vma);
6094 + if (!search_new_forks++)
6095 + goto again;
6098 +#ifdef CONFIG_MIGRATION
6099 +/* Common ksm interface, but the implementation may be uksm-specific */
6100 +void ksm_migrate_page(struct page *newpage, struct page *oldpage)
6102 + struct stable_node *stable_node;
6104 + VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
6105 + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
6106 + VM_BUG_ON(newpage->mapping != oldpage->mapping);
6108 + stable_node = page_stable_node(newpage);
6109 + if (stable_node) {
6110 + VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
6111 + stable_node->kpfn = page_to_pfn(newpage);
6112 + /*
6113 + * newpage->mapping was set in advance; now we need smp_wmb()
6114 + * to make sure that the new stable_node->kpfn is visible
6115 + * to get_ksm_page() before it can see that oldpage->mapping
6116 + * has gone stale (or that PageSwapCache has been cleared).
6117 + */
6118 + smp_wmb();
6119 + set_page_stable_node(oldpage, NULL);
6122 +#endif /* CONFIG_MIGRATION */
6124 +#ifdef CONFIG_MEMORY_HOTREMOVE
6125 +static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn,
6126 + unsigned long end_pfn)
6128 + struct rb_node *node;
6130 + for (node = rb_first(root_stable_treep); node; node = rb_next(node)) {
6131 + struct stable_node *stable_node;
6133 + stable_node = rb_entry(node, struct stable_node, node);
6134 + if (stable_node->kpfn >= start_pfn &&
6135 + stable_node->kpfn < end_pfn)
6136 + return stable_node;
6138 + return NULL;
6141 +static int uksm_memory_callback(struct notifier_block *self,
6142 + unsigned long action, void *arg)
6144 + struct memory_notify *mn = arg;
6145 + struct stable_node *stable_node;
6147 + switch (action) {
6148 + case MEM_GOING_OFFLINE:
6149 + /*
6150 + * Keep it very simple for now: just lock out ksmd and
6151 + * MADV_UNMERGEABLE while any memory is going offline.
6152 + * mutex_lock_nested() is necessary because lockdep was alarmed
6153 + * that here we take uksm_thread_mutex inside notifier chain
6154 + * mutex, and later take notifier chain mutex inside
6155 + * uksm_thread_mutex to unlock it. But that's safe because both
6156 + * are inside mem_hotplug_mutex.
6157 + */
6158 + mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING);
6159 + break;
6161 + case MEM_OFFLINE:
6162 + /*
6163 + * Most of the work is done by page migration; but there might
6164 + * be a few stable_nodes left over, still pointing to struct
6165 + * pages which have been offlined: prune those from the tree.
6166 + */
6167 + while ((stable_node = uksm_check_stable_tree(mn->start_pfn,
6168 + mn->start_pfn + mn->nr_pages)) != NULL)
6169 + remove_node_from_stable_tree(stable_node, 1, 1);
6170 + /* fallthrough */
6172 + case MEM_CANCEL_OFFLINE:
6173 + mutex_unlock(&uksm_thread_mutex);
6174 + break;
6176 + return NOTIFY_OK;
6178 +#endif /* CONFIG_MEMORY_HOTREMOVE */
6180 +#ifdef CONFIG_SYSFS
6181 +/*
6182 + * This all compiles without CONFIG_SYSFS, but is a waste of space.
6183 + */
6185 +#define UKSM_ATTR_RO(_name) \
6186 + static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
6187 +#define UKSM_ATTR(_name) \
6188 + static struct kobj_attribute _name##_attr = \
6189 + __ATTR(_name, 0644, _name##_show, _name##_store)
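For reference, in kernel context the wrapper above expands as follows for the run attribute defined below, i.e. a read-write kobj_attribute wired to the matching _show/_store pair:

static struct kobj_attribute run_attr =
	__ATTR(run, 0644, run_show, run_store);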
6191 +static ssize_t max_cpu_percentage_show(struct kobject *kobj,
6192 + struct kobj_attribute *attr, char *buf)
6194 + return sprintf(buf, "%u\n", uksm_max_cpu_percentage);
6197 +static ssize_t max_cpu_percentage_store(struct kobject *kobj,
6198 + struct kobj_attribute *attr,
6199 + const char *buf, size_t count)
6201 + unsigned long max_cpu_percentage;
6202 + int err;
6204 + err = kstrtoul(buf, 10, &max_cpu_percentage);
6205 + if (err || max_cpu_percentage > 100)
6206 + return -EINVAL;
6208 + if (max_cpu_percentage == 100)
6209 + max_cpu_percentage = 99;
6210 + else if (max_cpu_percentage < 10)
6211 + max_cpu_percentage = 10;
6213 + uksm_max_cpu_percentage = max_cpu_percentage;
6215 + return count;
6217 +UKSM_ATTR(max_cpu_percentage);
6219 +static ssize_t sleep_millisecs_show(struct kobject *kobj,
6220 + struct kobj_attribute *attr, char *buf)
6222 + return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies));
6225 +static ssize_t sleep_millisecs_store(struct kobject *kobj,
6226 + struct kobj_attribute *attr,
6227 + const char *buf, size_t count)
6229 + unsigned long msecs;
6230 + int err;
6232 + err = kstrtoul(buf, 10, &msecs);
6233 + if (err || msecs > MSEC_PER_SEC)
6234 + return -EINVAL;
6236 + uksm_sleep_jiffies = msecs_to_jiffies(msecs);
6237 + uksm_sleep_saved = uksm_sleep_jiffies;
6239 + return count;
6241 +UKSM_ATTR(sleep_millisecs);
6244 +static ssize_t cpu_governor_show(struct kobject *kobj,
6245 + struct kobj_attribute *attr, char *buf)
6247 + int n = sizeof(uksm_cpu_governor_str) / sizeof(char *);
6248 + int i;
6250 + buf[0] = '\0';
6251 + for (i = 0; i < n ; i++) {
6252 + if (uksm_cpu_governor == i)
6253 + strcat(buf, "[");
6255 + strcat(buf, uksm_cpu_governor_str[i]);
6257 + if (uksm_cpu_governor == i)
6258 + strcat(buf, "]");
6260 + strcat(buf, " ");
6262 + strcat(buf, "\n");
6264 + return strlen(buf);
6267 +static inline void init_performance_values(void)
6269 + int i;
6270 + struct scan_rung *rung;
6271 + struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor;
6274 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6275 + rung = uksm_scan_ladder + i;
6276 + rung->cpu_ratio = preset->cpu_ratio[i];
6277 + rung->cover_msecs = preset->cover_msecs[i];
6280 + uksm_max_cpu_percentage = preset->max_cpu;
6283 +static ssize_t cpu_governor_store(struct kobject *kobj,
6284 + struct kobj_attribute *attr,
6285 + const char *buf, size_t count)
6287 + int n = sizeof(uksm_cpu_governor_str) / sizeof(char *);
6289 + for (n--; n >= 0 ; n--) {
6290 + if (!strncmp(buf, uksm_cpu_governor_str[n],
6291 + strlen(uksm_cpu_governor_str[n])))
6292 + break;
6295 + if (n < 0)
6296 + return -EINVAL;
6297 + else
6298 + uksm_cpu_governor = n;
6300 + init_performance_values();
6302 + return count;
6304 +UKSM_ATTR(cpu_governor);
6306 +static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
6307 + char *buf)
6309 + return sprintf(buf, "%u\n", uksm_run);
6312 +static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
6313 + const char *buf, size_t count)
6315 + int err;
6316 + unsigned long flags;
6318 + err = kstrtoul(buf, 10, &flags);
6319 + if (err || flags > UINT_MAX)
6320 + return -EINVAL;
6321 + if (flags > UKSM_RUN_MERGE)
6322 + return -EINVAL;
6324 + mutex_lock(&uksm_thread_mutex);
6325 + if (uksm_run != flags)
6326 + uksm_run = flags;
6327 + mutex_unlock(&uksm_thread_mutex);
6329 + if (flags & UKSM_RUN_MERGE)
6330 + wake_up_interruptible(&uksm_thread_wait);
6332 + return count;
6334 +UKSM_ATTR(run);
6336 +static ssize_t abundant_threshold_show(struct kobject *kobj,
6337 + struct kobj_attribute *attr, char *buf)
6339 + return sprintf(buf, "%u\n", uksm_abundant_threshold);
6342 +static ssize_t abundant_threshold_store(struct kobject *kobj,
6343 + struct kobj_attribute *attr,
6344 + const char *buf, size_t count)
6346 + int err;
6347 + unsigned long flags;
6349 + err = kstrtoul(buf, 10, &flags);
6350 + if (err || flags > 99)
6351 + return -EINVAL;
6353 + uksm_abundant_threshold = flags;
6355 + return count;
6357 +UKSM_ATTR(abundant_threshold);
6359 +static ssize_t thrash_threshold_show(struct kobject *kobj,
6360 + struct kobj_attribute *attr, char *buf)
6362 + return sprintf(buf, "%u\n", uksm_thrash_threshold);
6365 +static ssize_t thrash_threshold_store(struct kobject *kobj,
6366 + struct kobj_attribute *attr,
6367 + const char *buf, size_t count)
6369 + int err;
6370 + unsigned long flags;
6372 + err = kstrtoul(buf, 10, &flags);
6373 + if (err || flags > 99)
6374 + return -EINVAL;
6376 + uksm_thrash_threshold = flags;
6378 + return count;
6380 +UKSM_ATTR(thrash_threshold);
6382 +static ssize_t cpu_ratios_show(struct kobject *kobj,
6383 + struct kobj_attribute *attr, char *buf)
6385 + int i, size;
6386 + struct scan_rung *rung;
6387 + char *p = buf;
6389 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6390 + rung = &uksm_scan_ladder[i];
6392 + if (rung->cpu_ratio > 0)
6393 + size = sprintf(p, "%d ", rung->cpu_ratio);
6394 + else
6395 + size = sprintf(p, "MAX/%d ",
6396 + TIME_RATIO_SCALE / -rung->cpu_ratio);
6398 + p += size;
6401 + *p++ = '\n';
6402 + *p = '\0';
6404 + return p - buf;
6407 +static ssize_t cpu_ratios_store(struct kobject *kobj,
6408 + struct kobj_attribute *attr,
6409 + const char *buf, size_t count)
6411 + int i, cpuratios[SCAN_LADDER_SIZE], err;
6412 + unsigned long value;
6413 + struct scan_rung *rung;
6414 + char *p, *base, *end = NULL;
+ ssize_t ret = count;
6416 + base = p = kzalloc(count + 2, GFP_KERNEL);
6417 + if (!p)
6418 + return -ENOMEM;
6420 + memcpy(p, buf, count);
6422 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6423 + if (i != SCAN_LADDER_SIZE - 1) {
6424 + end = strchr(p, ' ');
6425 + if (!end) {
6426 + ret = -EINVAL;
+ goto out;
+ }
6428 + *end = '\0';
+ }
6431 + if (strstr(p, "MAX/")) {
6432 + p = strchr(p, '/') + 1;
6433 + err = kstrtoul(p, 10, &value);
6434 + if (err || value > TIME_RATIO_SCALE || !value) {
6435 + ret = -EINVAL;
+ goto out;
+ }
6437 + cpuratios[i] = -(int) (TIME_RATIO_SCALE / value);
6438 + } else {
6439 + err = kstrtoul(p, 10, &value);
6440 + if (err || value > TIME_RATIO_SCALE || !value) {
6441 + ret = -EINVAL;
+ goto out;
+ }
6443 + cpuratios[i] = value;
+ }
6446 + p = end + 1;
+ }
6449 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6450 + rung = &uksm_scan_ladder[i];
6452 + rung->cpu_ratio = cpuratios[i];
+ }
+out:
+ /* free the original allocation, not the advanced cursor */
+ kfree(base);
6455 + return ret;
6457 +UKSM_ATTR(cpu_ratios);
6459 +static ssize_t eval_intervals_show(struct kobject *kobj,
6460 + struct kobj_attribute *attr, char *buf)
6462 + int i, size;
6463 + struct scan_rung *rung;
6464 + char *p = buf;
6466 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6467 + rung = &uksm_scan_ladder[i];
6468 + size = sprintf(p, "%u ", rung->cover_msecs);
6469 + p += size;
6472 + *p++ = '\n';
6473 + *p = '\0';
6475 + return p - buf;
6478 +static ssize_t eval_intervals_store(struct kobject *kobj,
6479 + struct kobj_attribute *attr,
6480 + const char *buf, size_t count)
6482 + int i, err;
6483 + unsigned long values[SCAN_LADDER_SIZE];
6484 + struct scan_rung *rung;
6485 + char *p, *base, *end = NULL;
6486 + ssize_t ret = count;
6488 + base = p = kzalloc(count + 2, GFP_KERNEL);
6489 + if (!p)
6490 + return -ENOMEM;
6492 + memcpy(p, buf, count);
6494 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6495 + if (i != SCAN_LADDER_SIZE - 1) {
6496 + end = strchr(p, ' ');
6497 + if (!end) {
6498 + ret = -EINVAL;
6499 + goto out;
6502 + *end = '\0';
6505 + err = kstrtoul(p, 10, &values[i]);
6506 + if (err) {
6507 + ret = -EINVAL;
6508 + goto out;
6511 + p = end + 1;
6514 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6515 + rung = &uksm_scan_ladder[i];
6517 + rung->cover_msecs = values[i];
6520 +out:
6521 + kfree(base);
6522 + return ret;
6524 +UKSM_ATTR(eval_intervals);
6526 +static ssize_t ema_per_page_time_show(struct kobject *kobj,
6527 + struct kobj_attribute *attr, char *buf)
6529 + return sprintf(buf, "%lu\n", uksm_ema_page_time);
6531 +UKSM_ATTR_RO(ema_per_page_time);
6533 +static ssize_t pages_shared_show(struct kobject *kobj,
6534 + struct kobj_attribute *attr, char *buf)
6536 + return sprintf(buf, "%lu\n", uksm_pages_shared);
6538 +UKSM_ATTR_RO(pages_shared);
6540 +static ssize_t pages_sharing_show(struct kobject *kobj,
6541 + struct kobj_attribute *attr, char *buf)
6543 + return sprintf(buf, "%lu\n", uksm_pages_sharing);
6545 +UKSM_ATTR_RO(pages_sharing);
6547 +static ssize_t pages_unshared_show(struct kobject *kobj,
6548 + struct kobj_attribute *attr, char *buf)
6550 + return sprintf(buf, "%lu\n", uksm_pages_unshared);
6552 +UKSM_ATTR_RO(pages_unshared);
6554 +static ssize_t full_scans_show(struct kobject *kobj,
6555 + struct kobj_attribute *attr, char *buf)
6557 + return sprintf(buf, "%llu\n", fully_scanned_round);
6559 +UKSM_ATTR_RO(full_scans);
6561 +static ssize_t pages_scanned_show(struct kobject *kobj,
6562 + struct kobj_attribute *attr, char *buf)
6564 + unsigned long base = 0;
6565 + u64 delta, ret;
6567 + if (pages_scanned_stored) {
6568 + base = pages_scanned_base;
6569 + ret = pages_scanned_stored;
6570 + delta = uksm_pages_scanned >> base;
6571 + if (CAN_OVERFLOW_U64(ret, delta)) {
6572 + ret >>= 1;
6573 + delta >>= 1;
6574 + base++;
6575 + ret += delta;
6577 + } else {
6578 + ret = uksm_pages_scanned;
6581 + while (ret > ULONG_MAX) {
6582 + ret >>= 1;
6583 + base++;
6586 + if (base)
6587 + return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base);
6588 + else
6589 + return sprintf(buf, "%lu\n", (unsigned long)ret);
6591 +UKSM_ATTR_RO(pages_scanned);
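The scaled reading above can be decoded by shifting; a tiny sketch with sample values (a reading of "300 * 2^4" means roughly 4800 pages):

#include <stdio.h>

int main(void)
{
	unsigned long mantissa = 300, base = 4;	/* sample reading: "300 * 2^4" */

	printf("pages scanned ~= %lu\n", mantissa << base);	/* prints 4800 */
	return 0;
}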
6593 +static ssize_t hash_strength_show(struct kobject *kobj,
6594 + struct kobj_attribute *attr, char *buf)
6596 + return sprintf(buf, "%lu\n", hash_strength);
6598 +UKSM_ATTR_RO(hash_strength);
6600 +static ssize_t sleep_times_show(struct kobject *kobj,
6601 + struct kobj_attribute *attr, char *buf)
6603 + return sprintf(buf, "%llu\n", uksm_sleep_times);
6605 +UKSM_ATTR_RO(sleep_times);
6608 +static struct attribute *uksm_attrs[] = {
6609 + &max_cpu_percentage_attr.attr,
6610 + &sleep_millisecs_attr.attr,
6611 + &cpu_governor_attr.attr,
6612 + &run_attr.attr,
6613 + &ema_per_page_time_attr.attr,
6614 + &pages_shared_attr.attr,
6615 + &pages_sharing_attr.attr,
6616 + &pages_unshared_attr.attr,
6617 + &full_scans_attr.attr,
6618 + &pages_scanned_attr.attr,
6619 + &hash_strength_attr.attr,
6620 + &sleep_times_attr.attr,
6621 + &thrash_threshold_attr.attr,
6622 + &abundant_threshold_attr.attr,
6623 + &cpu_ratios_attr.attr,
6624 + &eval_intervals_attr.attr,
6625 + NULL,
6628 +static struct attribute_group uksm_attr_group = {
6629 + .attrs = uksm_attrs,
6630 + .name = "uksm",
6632 +#endif /* CONFIG_SYSFS */
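A minimal userspace sketch of using these knobs once uksm_init() below registers the group on mm_kobj; the /sys/kernel/mm/uksm/ path follows from the group name above, and the program assumes it runs as root:

#include <stdio.h>

int main(void)
{
	unsigned long pages_sharing = 0;
	FILE *f;

	/* start uksmd scanning (UKSM_RUN_MERGE) */
	f = fopen("/sys/kernel/mm/uksm/run", "w");
	if (!f)
		return 1;
	fputs("1\n", f);
	fclose(f);

	/* read one of the read-only statistics exported above */
	f = fopen("/sys/kernel/mm/uksm/pages_sharing", "r");
	if (!f)
		return 1;
	if (fscanf(f, "%lu", &pages_sharing) == 1)
		printf("pages_sharing=%lu\n", pages_sharing);
	fclose(f);
	return 0;
}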
6634 +static inline void init_scan_ladder(void)
6636 + int i;
6637 + struct scan_rung *rung;
6639 + for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6640 + rung = uksm_scan_ladder + i;
6641 + slot_tree_init_root(&rung->vma_root);
6644 + init_performance_values();
6645 + uksm_calc_scan_pages();
6648 +static inline int cal_positive_negative_costs(void)
6650 + struct page *p1, *p2;
6651 + unsigned char *addr1, *addr2;
6652 + unsigned long i, time_start, hash_cost;
6653 + unsigned long loopnum = 0;
6655 + /* IMPORTANT: volatile is needed to prevent over-optimization by gcc. */
6656 + volatile u32 hash;
6657 + volatile int ret;
6659 + p1 = alloc_page(GFP_KERNEL);
6660 + if (!p1)
6661 + return -ENOMEM;
6663 + p2 = alloc_page(GFP_KERNEL);
6664 + if (!p2) {
+ __free_page(p1);
6665 + return -ENOMEM;
+ }
6667 + addr1 = kmap_atomic(p1);
6668 + addr2 = kmap_atomic(p2);
6669 + memset(addr1, prandom_u32(), PAGE_SIZE);
6670 + memcpy(addr2, addr1, PAGE_SIZE);
6672 + /* make sure that the two pages differ in the last byte */
6673 + addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1];
6674 + kunmap_atomic(addr2);
6675 + kunmap_atomic(addr1);
6677 + time_start = jiffies;
6678 + while (jiffies - time_start < 100) {
6679 + for (i = 0; i < 100; i++)
6680 + hash = page_hash(p1, HASH_STRENGTH_FULL, 0);
6681 + loopnum += 100;
6683 + hash_cost = (jiffies - time_start);
6685 + time_start = jiffies;
6686 + for (i = 0; i < loopnum; i++)
6687 + ret = pages_identical_with_cost(p1, p2);
6688 + memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start);
6689 + memcmp_cost /= hash_cost;
6690 + pr_info("UKSM: relative memcmp_cost = %lu "
6691 + "hash=%u cmp_ret=%d.\n",
6692 + memcmp_cost, hash, ret);
6694 + __free_page(p1);
6695 + __free_page(p2);
6696 + return 0;
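A worked illustration of the scaling above with made-up timings; HASH_STRENGTH_FULL is defined earlier in this patch and is assumed here to be PAGE_SIZE / sizeof(u32), i.e. 1024 on 4 KiB pages:

#include <stdio.h>

#define HASH_STRENGTH_FULL_EXAMPLE	1024	/* assumed: PAGE_SIZE / sizeof(u32) */

int main(void)
{
	unsigned long hash_jiffies = 100;	/* time for N full-strength hashes */
	unsigned long cmp_jiffies = 25;		/* time for N page comparisons */
	unsigned long memcmp_cost;

	/* comparison cost in units of hash strength: 1024 * 25 / 100 = 256 */
	memcmp_cost = HASH_STRENGTH_FULL_EXAMPLE * cmp_jiffies / hash_jiffies;
	printf("relative memcmp_cost = %lu\n", memcmp_cost);
	return 0;
}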
6699 +static int init_zeropage_hash_table(void)
6701 + struct page *page;
6702 + char *addr;
6703 + int i;
6705 + page = alloc_page(GFP_KERNEL);
6706 + if (!page)
6707 + return -ENOMEM;
6709 + addr = kmap_atomic(page);
6710 + memset(addr, 0, PAGE_SIZE);
6711 + kunmap_atomic(addr);
6713 + zero_hash_table = kmalloc_array(HASH_STRENGTH_MAX, sizeof(u32),
6714 + GFP_KERNEL);
6715 + if (!zero_hash_table) {
+ __free_page(page);
6716 + return -ENOMEM;
+ }
6718 + for (i = 0; i < HASH_STRENGTH_MAX; i++)
6719 + zero_hash_table[i] = page_hash(page, i, 0);
6721 + __free_page(page);
6723 + return 0;
6726 +static inline int init_random_sampling(void)
6728 + unsigned long i;
6730 + random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL);
6731 + if (!random_nums)
6732 + return -ENOMEM;
6734 + for (i = 0; i < HASH_STRENGTH_FULL; i++)
6735 + random_nums[i] = i;
6737 + for (i = 0; i < HASH_STRENGTH_FULL; i++) {
6738 + unsigned long rand_range, swap_index, tmp;
6740 + rand_range = HASH_STRENGTH_FULL - i;
6741 + swap_index = i + prandom_u32() % rand_range;
6742 + tmp = random_nums[i];
6743 + random_nums[i] = random_nums[swap_index];
6744 + random_nums[swap_index] = tmp;
6747 + rshash_state.state = RSHASH_NEW;
6748 + rshash_state.below_count = 0;
6749 + rshash_state.lookup_window_index = 0;
6751 + return cal_positive_negative_costs();
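The permutation loop above is a Fisher-Yates shuffle of the word indices used for sampled hashing; a generic standalone sketch of the same scheme:

#include <stdlib.h>

/* shuffle a[0..n-1] in place, as done above for random_nums */
static void shuffle(unsigned long *a, unsigned long n)
{
	unsigned long i, j, tmp;

	for (i = 0; i < n; i++) {
		j = i + (unsigned long)rand() % (n - i);	/* pick from the unshuffled tail */
		tmp = a[i];
		a[i] = a[j];
		a[j] = tmp;
	}
}

int main(void)
{
	unsigned long idx[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };

	shuffle(idx, 8);
	return 0;
}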
6754 +static int __init uksm_slab_init(void)
6756 + rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0);
6757 + if (!rmap_item_cache)
6758 + goto out;
6760 + stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0);
6761 + if (!stable_node_cache)
6762 + goto out_free1;
6764 + node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0);
6765 + if (!node_vma_cache)
6766 + goto out_free2;
6768 + vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0);
6769 + if (!vma_slot_cache)
6770 + goto out_free3;
6772 + tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0);
6773 + if (!tree_node_cache)
6774 + goto out_free4;
6776 + return 0;
6778 +out_free4:
6779 + kmem_cache_destroy(vma_slot_cache);
6780 +out_free3:
6781 + kmem_cache_destroy(node_vma_cache);
6782 +out_free2:
6783 + kmem_cache_destroy(stable_node_cache);
6784 +out_free1:
6785 + kmem_cache_destroy(rmap_item_cache);
6786 +out:
6787 + return -ENOMEM;
6790 +static void __init uksm_slab_free(void)
6792 + kmem_cache_destroy(stable_node_cache);
6793 + kmem_cache_destroy(rmap_item_cache);
6794 + kmem_cache_destroy(node_vma_cache);
6795 + kmem_cache_destroy(vma_slot_cache);
6796 + kmem_cache_destroy(tree_node_cache);
6799 +/* Common interface shared with ksm, but the behaviour here differs from ksm's. */
6800 +int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
6801 + unsigned long end, int advice, unsigned long *vm_flags)
6803 + int err;
6805 + switch (advice) {
6806 + case MADV_MERGEABLE:
6807 + return 0; /* just ignore the advice */
6809 + case MADV_UNMERGEABLE:
6810 + if (!(*vm_flags & VM_MERGEABLE) || !uksm_flags_can_scan(*vm_flags))
6811 + return 0; /* just ignore the advice */
6813 + if (vma->anon_vma) {
6814 + err = unmerge_uksm_pages(vma, start, end);
6815 + if (err)
6816 + return err;
6819 + uksm_remove_vma(vma);
6820 + *vm_flags &= ~VM_MERGEABLE;
6821 + break;
6824 + return 0;
6827 +/* Common interface to ksm; the implementation is the same as ksm's. */
6828 +struct page *ksm_might_need_to_copy(struct page *page,
6829 + struct vm_area_struct *vma, unsigned long address)
6831 + struct anon_vma *anon_vma = page_anon_vma(page);
6832 + struct page *new_page;
6834 + if (PageKsm(page)) {
6835 + if (page_stable_node(page))
6836 + return page; /* no need to copy it */
6837 + } else if (!anon_vma) {
6838 + return page; /* no need to copy it */
6839 + } else if (anon_vma->root == vma->anon_vma->root &&
6840 + page->index == linear_page_index(vma, address)) {
6841 + return page; /* still no need to copy it */
6843 + if (!PageUptodate(page))
6844 + return page; /* let do_swap_page report the error */
6846 + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
6847 + if (new_page) {
6848 + copy_user_highpage(new_page, page, address, vma);
6850 + SetPageDirty(new_page);
6851 + __SetPageUptodate(new_page);
6852 + __SetPageLocked(new_page);
6855 + return new_page;
6858 +/* Copied from mm/ksm.c; required since kernel 5.1 */
6859 +bool reuse_ksm_page(struct page *page,
6860 + struct vm_area_struct *vma,
6861 + unsigned long address)
6863 +#ifdef CONFIG_DEBUG_VM
6864 + if (WARN_ON(is_zero_pfn(page_to_pfn(page))) ||
6865 + WARN_ON(!page_mapped(page)) ||
6866 + WARN_ON(!PageLocked(page))) {
6867 + dump_page(page, "reuse_ksm_page");
6868 + return false;
6870 +#endif
6872 + if (PageSwapCache(page) || !page_stable_node(page))
6873 + return false;
6874 + /* Prohibit parallel get_ksm_page() */
6875 + if (!page_ref_freeze(page, 1))
6876 + return false;
6878 + page_move_anon_rmap(page, vma);
6879 + page->index = linear_page_index(vma, address);
6880 + page_ref_unfreeze(page, 1);
6882 + return true;
6885 +static int __init uksm_init(void)
6887 + struct task_struct *uksm_thread;
6888 + int err;
6890 + uksm_sleep_jiffies = msecs_to_jiffies(100);
6891 + uksm_sleep_saved = uksm_sleep_jiffies;
6893 + slot_tree_init();
6894 + init_scan_ladder();
6897 + err = init_random_sampling();
6898 + if (err)
6899 + goto out_free2;
6901 + err = uksm_slab_init();
6902 + if (err)
6903 + goto out_free1;
6905 + err = init_zeropage_hash_table();
6906 + if (err)
6907 + goto out_free0;
6909 + uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd");
6910 + if (IS_ERR(uksm_thread)) {
6911 + pr_err("uksm: creating kthread failed\n");
6912 + err = PTR_ERR(uksm_thread);
6913 + goto out_free;
6916 +#ifdef CONFIG_SYSFS
6917 + err = sysfs_create_group(mm_kobj, &uksm_attr_group);
6918 + if (err) {
6919 + pr_err("uksm: register sysfs failed\n");
6920 + kthread_stop(uksm_thread);
6921 + goto out_free;
6923 +#else
6924 + uksm_run = UKSM_RUN_MERGE; /* no way for user to start it */
6926 +#endif /* CONFIG_SYSFS */
6928 +#ifdef CONFIG_MEMORY_HOTREMOVE
6929 + /*
6930 + * Choose a high priority since the callback takes uksm_thread_mutex:
6931 + * later callbacks could only be taking locks which nest within that.
6932 + */
6933 + hotplug_memory_notifier(uksm_memory_callback, 100);
6934 +#endif
6935 + return 0;
6937 +out_free:
6938 + kfree(zero_hash_table);
6939 +out_free0:
6940 + uksm_slab_free();
6941 +out_free1:
6942 + kfree(random_nums);
6943 +out_free2:
6944 + kfree(uksm_scan_ladder);
6945 + return err;
6948 +#ifdef MODULE
6949 +subsys_initcall(uksm_init);
6950 +#else
6951 +late_initcall(uksm_init);
6952 +#endif
6954 diff --git a/mm/vmstat.c b/mm/vmstat.c
6955 index 74b2c374b..ae42103a8 100644
6956 --- a/mm/vmstat.c
6957 +++ b/mm/vmstat.c
6958 @@ -1231,6 +1231,9 @@ const char * const vmstat_text[] = {
6959 "nr_swapcached",
6960 #endif
6962 +#ifdef CONFIG_UKSM
6963 + "nr_uksm_zero_pages",
6964 +#endif
6965 /* enum writeback_stat_item counters */
6966 "nr_dirty_threshold",
6967 "nr_dirty_background_threshold",
6969 2.31.1.305.gd1b10fc6d8