[aur-mirror.git] / kernel26-uksm / uksm-2.6.38-20111223.patch
blob ba8efaf0d076432f30948055b9e37d5e7dc1c8bf
1 diff -Nur linux-2.6.38/arch/x86/kernel/entry_32.S uksm-2.6.38/arch/x86/kernel/entry_32.S
2 --- linux-2.6.38/arch/x86/kernel/entry_32.S 2011-03-15 09:20:32.000000000 +0800
3 +++ uksm-2.6.38/arch/x86/kernel/entry_32.S 2011-12-16 01:09:35.000000000 +0800
4 @@ -1413,7 +1413,7 @@
5 CFI_ADJUST_CFA_OFFSET 4
6 jmp error_code
7 CFI_ENDPROC
8 -END(apf_page_fault)
9 +END(async_page_fault)
10 #endif
13 diff -Nur linux-2.6.38/arch/x86/kernel/entry_64.S uksm-2.6.38/arch/x86/kernel/entry_64.S
14 --- linux-2.6.38/arch/x86/kernel/entry_64.S 2011-03-15 09:20:32.000000000 +0800
15 +++ uksm-2.6.38/arch/x86/kernel/entry_64.S 2011-12-16 01:09:35.000000000 +0800
16 @@ -1248,7 +1248,7 @@
17 decl PER_CPU_VAR(irq_count)
18 jmp error_exit
19 CFI_ENDPROC
20 -END(do_hypervisor_callback)
21 +END(xen_do_hypervisor_callback)
24 * Hypervisor uses this for application faults while it executes.
25 diff -Nur linux-2.6.38/fs/exec.c uksm-2.6.38/fs/exec.c
26 --- linux-2.6.38/fs/exec.c 2011-03-15 09:20:32.000000000 +0800
27 +++ uksm-2.6.38/fs/exec.c 2011-12-16 01:10:09.000000000 +0800
28 @@ -19,7 +19,7 @@
29 * current->executable is only used by the procfs. This allows a dispatch
30 * table to check for several different types of binary formats. We keep
31 * trying until we recognize the file or we run out of supported binary
32 - * formats.
33 + * formats.
36 #include <linux/slab.h>
37 @@ -55,6 +55,7 @@
38 #include <linux/fs_struct.h>
39 #include <linux/pipe_fs_i.h>
40 #include <linux/oom.h>
41 +#include <linux/ksm.h>
43 #include <asm/uaccess.h>
44 #include <asm/mmu_context.h>
45 @@ -85,7 +86,7 @@
46 insert ? list_add(&fmt->lh, &formats) :
47 list_add_tail(&fmt->lh, &formats);
48 write_unlock(&binfmt_lock);
49 - return 0;
50 + return 0;
53 EXPORT_SYMBOL(__register_binfmt);
54 @@ -1106,7 +1107,7 @@
55 group */
57 current->self_exec_id++;
60 flush_signal_handlers(current, 0);
61 flush_old_files(current->files);
63 @@ -1196,8 +1197,8 @@
64 return res;
67 -/*
68 - * Fill the binprm structure from the inode.
69 +/*
70 + * Fill the binprm structure from the inode.
71 * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
73 * This may be called multiple times for binary chains (scripts for example).
74 diff -Nur linux-2.6.38/fs/proc/meminfo.c uksm-2.6.38/fs/proc/meminfo.c
75 --- linux-2.6.38/fs/proc/meminfo.c 2011-03-15 09:20:32.000000000 +0800
76 +++ uksm-2.6.38/fs/proc/meminfo.c 2011-12-16 01:10:10.000000000 +0800
77 @@ -87,6 +87,9 @@
78 "SUnreclaim: %8lu kB\n"
79 "KernelStack: %8lu kB\n"
80 "PageTables: %8lu kB\n"
81 +#ifdef CONFIG_KSM
82 + "KsmSharing: %8lu kB\n"
83 +#endif
84 #ifdef CONFIG_QUICKLIST
85 "Quicklists: %8lu kB\n"
86 #endif
87 @@ -145,6 +148,9 @@
88 K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
89 global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024,
90 K(global_page_state(NR_PAGETABLE)),
91 +#ifdef CONFIG_KSM
92 + K(global_page_state(NR_KSM_PAGES_SHARING)),
93 +#endif
94 #ifdef CONFIG_QUICKLIST
95 K(quicklist_total_size()),
96 #endif
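With this hunk applied and CONFIG_KSM enabled, /proc/meminfo reports the new NR_KSM_PAGES_SHARING counter as one extra line; the value shown here is illustrative only:

    KsmSharing:         1024 kB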
97 diff -Nur linux-2.6.38/include/linux/ksm.h uksm-2.6.38/include/linux/ksm.h
98 --- linux-2.6.38/include/linux/ksm.h 2011-03-15 09:20:32.000000000 +0800
99 +++ uksm-2.6.38/include/linux/ksm.h 2011-12-22 17:46:52.213988023 +0800
100 @@ -20,24 +20,6 @@
101 struct vm_area_struct *vma, unsigned long address);
103 #ifdef CONFIG_KSM
104 -int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
105 - unsigned long end, int advice, unsigned long *vm_flags);
106 -int __ksm_enter(struct mm_struct *mm);
107 -void __ksm_exit(struct mm_struct *mm);
109 -static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
111 - if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
112 - return __ksm_enter(mm);
113 - return 0;
116 -static inline void ksm_exit(struct mm_struct *mm)
118 - if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
119 - __ksm_exit(mm);
123 * A KSM page is one of those write-protected "shared pages" or "merged pages"
124 * which KSM maps into multiple mms, wherever identical anonymous page content
125 @@ -62,6 +44,13 @@
126 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
129 +/* must be done before linked to mm */
130 +extern void ksm_vma_add_new(struct vm_area_struct *vma);
132 +extern void ksm_remove_vma(struct vm_area_struct *vma);
133 +extern int unmerge_ksm_pages(struct vm_area_struct *vma,
134 + unsigned long start, unsigned long end);
137 * When do_swap_page() first faults in from swap what used to be a KSM page,
138 * no problem, it will be assigned to this vma's anon_vma; but thereafter,
139 @@ -90,16 +79,184 @@
140 struct vm_area_struct *, unsigned long, void *), void *arg);
141 void ksm_migrate_page(struct page *newpage, struct page *oldpage);
143 -#else /* !CONFIG_KSM */
144 +/* Each rung of this ladder is a list of VMAs having a same scan ratio */
145 +struct scan_rung {
146 + struct list_head vma_list;
147 + //spinlock_t vma_list_lock;
148 + //struct semaphore sem;
149 + struct list_head *current_scan;
150 + unsigned int pages_to_scan;
151 + unsigned char round_finished; /* rung is ready for the next round */
152 + unsigned char busy_searched;
153 + unsigned long fully_scanned_slots;
154 + unsigned long scan_ratio;
155 + unsigned long vma_num;
156 + //unsigned long vma_finished;
157 + unsigned long scan_turn;
160 +struct vma_slot {
161 + struct list_head ksm_list;
162 + struct list_head slot_list;
163 + unsigned long dedup_ratio;
164 + unsigned long dedup_num;
165 + int ksm_index; /* -1 if vma is not in inter-table,
166 + positive otherwise */
167 + unsigned long pages_scanned;
168 + unsigned long last_scanned;
169 + unsigned long pages_to_scan;
170 + struct scan_rung *rung;
171 + struct page **rmap_list_pool;
172 + unsigned long *pool_counts;
173 + unsigned long pool_size;
174 + struct vm_area_struct *vma;
175 + struct mm_struct *mm;
176 + unsigned long ctime_j;
177 + unsigned long pages;
178 + unsigned char need_sort;
179 + unsigned char need_rerand;
180 + unsigned long slot_scanned; /* It's scanned in this round */
181 + unsigned long fully_scanned; /* the above four to be merged to status bits */
182 + unsigned long pages_cowed; /* pages cowed this round */
183 + unsigned long pages_merged; /* pages merged this round */
185 + /* used for dup vma pair */
186 + struct radix_tree_root dup_tree;
189 -static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
191 - return 0;
194 + * A few notes about the KSM scanning process,
195 + * to make it easier to understand the data structures below:
197 + * In order to reduce excessive scanning, KSM sorts the memory pages by their
198 + * contents into a data structure that holds pointers to the pages' locations.
200 + * Since the contents of the pages may change at any moment, KSM cannot just
201 + * insert the pages into a normal sorted tree and expect it to find anything.
202 + * Therefore KSM uses two data structures - the stable and the unstable tree.
204 + * The stable tree holds pointers to all the merged pages (ksm pages), sorted
205 + * by their contents. Because each such page is write-protected, searching on
206 + * this tree is fully assured to be working (except when pages are unmapped),
207 + * and therefore this tree is called the stable tree.
209 + * In addition to the stable tree, KSM uses a second data structure called the
210 + * unstable tree: this tree holds pointers to pages which have been found to
211 + * be "unchanged for a period of time". The unstable tree sorts these pages
212 + * by their contents, but since they are not write-protected, KSM cannot rely
213 + * upon the unstable tree to work correctly - the unstable tree is liable to
214 + * be corrupted as its contents are modified, and so it is called unstable.
216 + * KSM solves this problem by several techniques:
218 + * 1) The unstable tree is flushed every time KSM completes scanning all
219 + * memory areas, and then the tree is rebuilt again from the beginning.
220 + * 2) KSM will only insert into the unstable tree, pages whose hash value
221 + * has not changed since the previous scan of all memory areas.
222 + * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
223 + * colors of the nodes and not on their contents, assuring that even when
224 + * the tree gets "corrupted" it won't get out of balance, so scanning time
225 + * remains the same (also, searching and inserting nodes in an rbtree uses
226 + * the same algorithm, so we have no overhead when we flush and rebuild).
227 + * 4) KSM never flushes the stable tree, which means that even if it were to
228 + * take 10 attempts to find a page in the unstable tree, once it is found,
229 + * it is secured in the stable tree. (When we scan a new page, we first
230 + * compare it against the stable tree, and then against the unstable tree.)
231 + */
233 -static inline void ksm_exit(struct mm_struct *mm)
237 +/**
238 + * node of either the stable or unstable rbtree
240 + */
241 +struct tree_node {
242 + struct rb_node node; /* link in the main (un)stable rbtree */
243 + struct rb_root sub_root; /* rb_root for sublevel collision rbtree */
244 + u32 hash;
245 + unsigned long count; /* how many sublevel tree nodes */
246 + struct list_head all_list; /* all tree nodes in stable/unstable tree */
250 +/**
251 + * struct stable_node - node of the stable rbtree
252 + * @node: rb node of this ksm page in the stable tree
253 + * @hlist: hlist head of rmap_items using this ksm page
254 + * @kpfn: page frame number of this ksm page
255 + */
256 +struct stable_node {
257 + struct rb_node node; /* link in sub-rbtree */
258 + struct tree_node *tree_node; /* it's tree node root in stable tree, NULL if it's in hell list */
259 + struct hlist_head hlist;
260 + unsigned long kpfn;
261 + u32 hash_max; /* if ==0 then it's not been calculated yet */
262 + //struct vm_area_struct *old_vma;
263 + struct list_head all_list; /* in a list for all stable nodes */
269 +/**
270 + * struct node_vma - group rmap_items linked in a same stable
271 + * node together.
272 + */
273 +struct node_vma {
274 + union {
275 + struct vma_slot *slot;
276 + unsigned long key; /* slot is used as key sorted on hlist */
277 + };
278 + struct hlist_node hlist;
279 + struct hlist_head rmap_hlist;
280 + struct stable_node *head;
281 + unsigned long last_update;
284 +/**
285 + * struct rmap_item - reverse mapping item for virtual addresses
286 + * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
287 + * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
288 + * @mm: the memory structure this rmap_item is pointing into
289 + * @address: the virtual address this rmap_item tracks (+ flags in low bits)
290 + * @node: rb node of this rmap_item in the unstable tree
291 + * @head: pointer to stable_node heading this list in the stable tree
292 + * @hlist: link into hlist of rmap_items hanging off that stable_node
293 + */
294 +struct rmap_item {
295 + struct vma_slot *slot;
296 + struct page *page;
297 + unsigned long address; /* + low bits used for flags below */
298 + /* The scan round on which it was appended to the (un)stable tree */
299 + unsigned long append_round;
301 + /* Which rung scan turn it was last scanned */
302 + //unsigned long last_scan;
303 + unsigned long entry_index;
304 + union {
305 + struct {/* when in unstable tree */
306 + struct rb_node node;
307 + struct tree_node *tree_node;
308 + u32 hash_max;
309 + };
310 + struct { /* when in stable tree */
311 + struct node_vma *head;
312 + struct hlist_node hlist;
313 + struct anon_vma *anon_vma;
314 + };
315 + };
316 +} __attribute__((aligned(4)));
318 +struct rmap_list_entry {
319 + union {
320 + struct rmap_item *item;
321 + unsigned long addr;
322 + };
323 + // lowest bit is used for is_addr tag
324 + //unsigned char is_addr;
325 +} __attribute__((aligned(4))); // 4-byte aligned so entries fit into pages
327 +//extern struct semaphore ksm_scan_sem;
328 +#else /* !CONFIG_KSM */
330 static inline int PageKsm(struct page *page)
332 @@ -107,8 +264,9 @@
335 #ifdef CONFIG_MMU
336 -static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
337 - unsigned long end, int advice, unsigned long *vm_flags)
339 +extern inline int unmerge_ksm_pages(struct vm_area_struct *vma,
340 + unsigned long start, unsigned long end)
342 return 0;
344 diff -Nur linux-2.6.38/include/linux/mm_types.h uksm-2.6.38/include/linux/mm_types.h
345 --- linux-2.6.38/include/linux/mm_types.h 2011-03-15 09:20:32.000000000 +0800
346 +++ uksm-2.6.38/include/linux/mm_types.h 2011-12-16 01:10:13.000000000 +0800
347 @@ -183,6 +183,9 @@
348 #ifdef CONFIG_NUMA
349 struct mempolicy *vm_policy; /* NUMA policy for the VMA */
350 #endif
351 +#ifdef CONFIG_KSM
352 + struct vma_slot *ksm_vma_slot;
353 +#endif
356 struct core_thread {
357 diff -Nur linux-2.6.38/include/linux/mmzone.h uksm-2.6.38/include/linux/mmzone.h
358 --- linux-2.6.38/include/linux/mmzone.h 2011-03-15 09:20:32.000000000 +0800
359 +++ uksm-2.6.38/include/linux/mmzone.h 2011-12-16 01:10:13.000000000 +0800
360 @@ -115,6 +115,9 @@
361 NUMA_OTHER, /* allocation from other node */
362 #endif
363 NR_ANON_TRANSPARENT_HUGEPAGES,
364 +#ifdef CONFIG_KSM
365 + NR_KSM_PAGES_SHARING,
366 +#endif
367 NR_VM_ZONE_STAT_ITEMS };
370 @@ -344,7 +347,7 @@
371 ZONE_PADDING(_pad1_)
373 /* Fields commonly accessed by the page reclaim scanner */
374 - spinlock_t lru_lock;
375 + spinlock_t lru_lock;
376 struct zone_lru {
377 struct list_head list;
378 } lru[NR_LRU_LISTS];
379 @@ -722,7 +725,7 @@
383 - * is_highmem - helper function to quickly check if a struct zone is a
384 + * is_highmem - helper function to quickly check if a struct zone is a
385 * highmem zone or not. This is an attempt to keep references
386 * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum.
387 * @zone - pointer to struct zone variable
388 diff -Nur linux-2.6.38/include/linux/sched.h uksm-2.6.38/include/linux/sched.h
389 --- linux-2.6.38/include/linux/sched.h 2011-03-15 09:20:32.000000000 +0800
390 +++ uksm-2.6.38/include/linux/sched.h 2011-12-16 01:10:13.000000000 +0800
391 @@ -433,7 +433,6 @@
392 # define MMF_DUMP_MASK_DEFAULT_ELF 0
393 #endif
394 /* leave room for more dump flags */
395 -#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */
396 #define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */
398 #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
399 @@ -1280,9 +1279,9 @@
400 unsigned long stack_canary;
401 #endif
403 - /*
404 + /*
405 * pointers to (original) parent process, youngest child, younger sibling,
406 - * older sibling, respectively. (p->father can be replaced with
407 + * older sibling, respectively. (p->father can be replaced with
408 * p->real_parent->pid)
410 struct task_struct *real_parent; /* real parent process */
411 @@ -2080,7 +2079,7 @@
412 spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
414 return ret;
418 extern void block_all_signals(int (*notifier)(void *priv), void *priv,
419 sigset_t *mask);
420 diff -Nur linux-2.6.38/kernel/fork.c uksm-2.6.38/kernel/fork.c
421 --- linux-2.6.38/kernel/fork.c 2011-03-15 09:20:32.000000000 +0800
422 +++ uksm-2.6.38/kernel/fork.c 2011-12-16 01:10:14.000000000 +0800
423 @@ -328,9 +328,6 @@
424 rb_link = &mm->mm_rb.rb_node;
425 rb_parent = NULL;
426 pprev = &mm->mmap;
427 - retval = ksm_fork(mm, oldmm);
428 - if (retval)
429 - goto out;
430 retval = khugepaged_fork(mm, oldmm);
431 if (retval)
432 goto out;
433 @@ -353,7 +350,7 @@
434 goto fail_nomem;
435 charge = len;
437 - tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
438 + tmp = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
439 if (!tmp)
440 goto fail_nomem;
441 *tmp = *mpnt;
442 @@ -406,7 +403,9 @@
443 __vma_link_rb(mm, tmp, rb_link, rb_parent);
444 rb_link = &tmp->vm_rb.rb_right;
445 rb_parent = &tmp->vm_rb;
447 +#ifdef CONFIG_KSM
448 + ksm_vma_add_new(tmp);
449 +#endif
450 mm->map_count++;
451 retval = copy_page_range(mm, oldmm, mpnt);
453 @@ -549,7 +548,6 @@
455 if (atomic_dec_and_test(&mm->mm_users)) {
456 exit_aio(mm);
457 - ksm_exit(mm);
458 khugepaged_exit(mm); /* must run before exit_mmap */
459 exit_mmap(mm);
460 set_mm_exe_file(mm, NULL);
461 diff -Nur linux-2.6.38/mm/ksm.c uksm-2.6.38/mm/ksm.c
462 --- linux-2.6.38/mm/ksm.c 2011-03-15 09:20:32.000000000 +0800
463 +++ uksm-2.6.38/mm/ksm.c 2011-12-22 17:46:27.967320547 +0800
464 @@ -12,6 +12,47 @@
465 * Hugh Dickins
467 * This work is licensed under the terms of the GNU GPL, version 2.
471 + * Ultra KSM. Copyright (C) 2011 Nai Xia
473 + * This is an improvement upon KSM. Its features:
474 + * 1. Full system scan:
475 + * It automatically scans all user processes' anonymous VMAs. Kernel-user
476 + * interaction to submit a memory area to KSM is no longer needed.
478 + * 2. Rich area detection based on random sampling:
479 + * It automatically detects rich areas containing abundant duplicated
480 + * pages based on their randomly-sampled history. Rich areas are given
481 + * a full scan speed. Poor areas are sampled at a reasonable speed with
482 + * very low CPU consumption.
484 + * 3. Per-page scan speed improvement:
485 + * A new hash algorithm (random_sample_hash) is proposed. Quite often,
486 + * it is enough to distinguish pages by hashing part of their content
487 + * instead of full pages. This algorithm can automatically adapt to this
488 + * situation. For the best case, only one 32-bit-word/page is needed to
489 + * get the hash value for distinguishing pages. For the worst case, it's as
490 + * fast as SuperFastHash.
492 + * 4. Thrashing area avoidance:
493 + * A thrashing area (a VMA with frequent KSM page break-outs) can be
494 + * filtered out. My benchmark shows it's more efficient than KSM's per-page
495 + * hash value based volatile page detection.
497 + * 5. Hash-value-based identical page detection:
498 + * It no longer uses "memcmp"-based page detection.
500 + * 6. Misc changes upon KSM:
501 + * * It has a fully x86-optimized memcmp dedicated to 4-byte-aligned page
502 + * comparison. It's much faster than the default C version on x86.
503 + * * rmap_item now has a struct page *page member to loosely cache an
504 + * address-->page mapping, which avoids many costly calls to
505 + * follow_page().
506 + * * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
507 + * * try_to_merge_two_pages() now can revert a pte if it fails. No break_
508 + * ksm is needed for this case.
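The random_sample_hash idea in point 3 can be illustrated with a small, self-contained userspace sketch; it is not code from this patch. Only `strength` sampled 32-bit words of a page are fed to the hash, so a low strength is cheap while a high strength approaches hashing the whole page. The offsets array plays the role of the patch's random_nums[]; the multiply-and-add mixer is a deliberately trivial placeholder, not the patch's hash function.

#include <stddef.h>
#include <stdint.h>

#define TOY_PAGE_WORDS (4096 / sizeof(uint32_t))	/* 1024 words per 4 KiB page */

/* Hash only `strength` sampled words of a page. */
static uint32_t sample_hash(const uint32_t *page, const uint32_t *offsets,
			    unsigned long strength)
{
	uint32_t hash = 0;
	unsigned long i;

	for (i = 0; i < strength; i++)
		hash = hash * 31 + page[offsets[i] % TOY_PAGE_WORDS];
	return hash;
}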
511 #include <linux/errno.h>
512 @@ -33,142 +74,157 @@
513 #include <linux/mmu_notifier.h>
514 #include <linux/swap.h>
515 #include <linux/ksm.h>
516 -#include <linux/hash.h>
517 +#include <linux/crypto.h>
518 +#include <linux/scatterlist.h>
519 +#include <crypto/hash.h>
520 +#include <linux/random.h>
521 +#include <linux/math64.h>
522 +#include <linux/gcd.h>
523 #include <linux/freezer.h>
525 #include <asm/tlbflush.h>
526 #include "internal.h"
528 +#ifdef CONFIG_X86
529 +#undef memcmp
531 +#ifdef CONFIG_X86_32
532 +#define memcmp memcmpx86_32
534 - * A few notes about the KSM scanning process,
535 - * to make it easier to understand the data structures below:
537 - * In order to reduce excessive scanning, KSM sorts the memory pages by their
538 - * contents into a data structure that holds pointers to the pages' locations.
540 - * Since the contents of the pages may change at any moment, KSM cannot just
541 - * insert the pages into a normal sorted tree and expect it to find anything.
542 - * Therefore KSM uses two data structures - the stable and the unstable tree.
544 - * The stable tree holds pointers to all the merged pages (ksm pages), sorted
545 - * by their contents. Because each such page is write-protected, searching on
546 - * this tree is fully assured to be working (except when pages are unmapped),
547 - * and therefore this tree is called the stable tree.
549 - * In addition to the stable tree, KSM uses a second data structure called the
550 - * unstable tree: this tree holds pointers to pages which have been found to
551 - * be "unchanged for a period of time". The unstable tree sorts these pages
552 - * by their contents, but since they are not write-protected, KSM cannot rely
553 - * upon the unstable tree to work correctly - the unstable tree is liable to
554 - * be corrupted as its contents are modified, and so it is called unstable.
556 - * KSM solves this problem by several techniques:
558 - * 1) The unstable tree is flushed every time KSM completes scanning all
559 - * memory areas, and then the tree is rebuilt again from the beginning.
560 - * 2) KSM will only insert into the unstable tree, pages whose hash value
561 - * has not changed since the previous scan of all memory areas.
562 - * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
563 - * colors of the nodes and not on their contents, assuring that even when
564 - * the tree gets "corrupted" it won't get out of balance, so scanning time
565 - * remains the same (also, searching and inserting nodes in an rbtree uses
566 - * the same algorithm, so we have no overhead when we flush and rebuild).
567 - * 4) KSM never flushes the stable tree, which means that even if it were to
568 - * take 10 attempts to find a page in the unstable tree, once it is found,
569 - * it is secured in the stable tree. (When we scan a new page, we first
570 - * compare it against the stable tree, and then against the unstable tree.)
571 + * Compare the 4-byte-aligned addresses s1 and s2, of length n bytes
573 +int memcmpx86_32(void *s1, void *s2, size_t n)
575 + size_t num = n / 4;
576 + register int res;
577 + __asm__ __volatile__
578 + ("cld\n\t"
579 + "testl %3,%3\n\t"
580 + "repe; cmpsd\n\t"
581 + "je 1f\n\t"
582 + "sbbl %0,%0\n\t"
583 + "orl $1,%0\n"
584 + "1:"
585 + : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
586 + : "0" (0)
587 + : "cc");
589 -/**
590 - * struct mm_slot - ksm information per mm that is being scanned
591 - * @link: link to the mm_slots hash list
592 - * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
593 - * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
594 - * @mm: the mm that this information is valid for
595 - */
596 -struct mm_slot {
597 - struct hlist_node link;
598 - struct list_head mm_list;
599 - struct rmap_item *rmap_list;
600 - struct mm_struct *mm;
602 + return res;
605 -/**
606 - * struct ksm_scan - cursor for scanning
607 - * @mm_slot: the current mm_slot we are scanning
608 - * @address: the next address inside that to be scanned
609 - * @rmap_list: link to the next rmap to be scanned in the rmap_list
610 - * @seqnr: count of completed full scans (needed when removing unstable node)
612 - * There is only the one ksm_scan instance of this cursor structure.
613 +#elif defined(CONFIG_X86_64)
614 +#define memcmp memcmpx86_64
616 + * Compare the 8-byte-aligned addresses s1 and s2, of length n bytes
618 -struct ksm_scan {
619 - struct mm_slot *mm_slot;
620 - unsigned long address;
621 - struct rmap_item **rmap_list;
622 - unsigned long seqnr;
624 +int memcmpx86_64(void *s1, void *s2, size_t n)
626 + size_t num = n / 8;
627 + register int res;
628 + __asm__ __volatile__
629 + ("cld\n\t"
630 + "testq %q3,%q3\n\t"
631 + "repe; cmpsq\n\t"
632 + "je 1f\n\t"
633 + "sbbq %q0,%q0\n\t"
634 + "orq $1,%q0\n"
635 + "1:"
636 + : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
637 + : "0" (0)
638 + : "cc");
640 -/**
641 - * struct stable_node - node of the stable rbtree
642 - * @node: rb node of this ksm page in the stable tree
643 - * @hlist: hlist head of rmap_items using this ksm page
644 - * @kpfn: page frame number of this ksm page
645 - */
646 -struct stable_node {
647 - struct rb_node node;
648 - struct hlist_head hlist;
649 - unsigned long kpfn;
651 + return res;
653 +#endif
654 +#endif
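For readers who do not read x86 assembly, the following self-contained C sketch (not part of the patch) does the same job as the two routines above: compare two word-aligned buffers word by word and return zero only if every word matches. The patch keeps the asm versions on x86 and leaves the stock memcmp in place on other architectures.

#include <stddef.h>
#include <stdint.h>

/* Word-wise comparison of two 4-byte-aligned buffers of n bytes; returns 0
 * only if every word matches.  The sign of a non-zero result only loosely
 * mirrors what the sbb/or trick in the asm versions produces.
 */
static int memcmp_words(const void *s1, const void *s2, size_t n)
{
	const uint32_t *a = s1, *b = s2;
	size_t i, num = n / 4;

	for (i = 0; i < num; i++)
		if (a[i] != b[i])
			return a[i] < b[i] ? -1 : 1;
	return 0;
}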
656 -/**
657 - * struct rmap_item - reverse mapping item for virtual addresses
658 - * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
659 - * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
660 - * @mm: the memory structure this rmap_item is pointing into
661 - * @address: the virtual address this rmap_item tracks (+ flags in low bits)
662 - * @oldchecksum: previous checksum of the page at that virtual address
663 - * @node: rb node of this rmap_item in the unstable tree
664 - * @head: pointer to stable_node heading this list in the stable tree
665 - * @hlist: link into hlist of rmap_items hanging off that stable_node
666 - */
667 -struct rmap_item {
668 - struct rmap_item *rmap_list;
669 - struct anon_vma *anon_vma; /* when stable */
670 - struct mm_struct *mm;
671 - unsigned long address; /* + low bits used for flags below */
672 - unsigned int oldchecksum; /* when unstable */
673 - union {
674 - struct rb_node node; /* when node of unstable tree */
675 - struct { /* when listed from stable tree */
676 - struct stable_node *head;
677 - struct hlist_node hlist;
678 - };
679 - };
681 +#ifdef CONFIG_X86
682 +#ifdef CONFIG_X86_32
684 + * Check whether the page is all zeros
685 + */
686 +static int check_zero_page(const void *s1, size_t len)
688 + unsigned char diff;
689 + len = len>>2;
690 + asm("repe; scasl; setnz %0"
691 + : "=qm" (diff), "+D" (s1), "+c" (len)
692 + : "a" (0)
693 + :"cc");
694 + return diff;
697 +#elif defined(CONFIG_X86_64)
698 +static int check_zero_page(const void *s1, size_t len)
700 + unsigned char diff;
701 + len = len>>3;
702 + asm("repe; scasq; setnz %0"
703 + : "=qm" (diff), "+D" (s1), "+c" (len)
704 + : "a" (0)
705 + :"cc");
706 + return diff;
708 +#endif
709 +#else
710 +static int check_zero_page(const void *s1, size_t len)
712 + int ret = 0;
713 + u32 *src = (u32 *)s1;
714 + u32 z = 0;
715 + len = len>>2;
716 + while(len--)
717 + if ((ret = *src++ - z) != 0)
718 + break;
720 -#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
721 -#define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */
722 -#define STABLE_FLAG 0x200 /* is listed from the stable tree */
723 + return ret;
725 +#endif
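A short userspace demonstration of the same convention the three variants above share (0 means "all zero", non-zero otherwise); this is an illustration only, not code from the patch.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Returns 0 iff every 32-bit word of the buffer is zero. */
static int check_zero_words(const uint32_t *src, size_t len)
{
	size_t i, words = len >> 2;

	for (i = 0; i < words; i++)
		if (src[i])
			return 1;
	return 0;
}

int main(void)
{
	uint32_t page[1024];		/* one 4 KiB page worth of 32-bit words */

	memset(page, 0, sizeof(page));
	printf("%d\n", check_zero_words(page, sizeof(page)));	/* 0: all zero */
	page[7] = 42;
	printf("%d\n", check_zero_words(page, sizeof(page)));	/* 1: not zero */
	return 0;
}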
727 -/* The stable and unstable tree heads */
728 -static struct rb_root root_stable_tree = RB_ROOT;
729 -static struct rb_root root_unstable_tree = RB_ROOT;
730 +#define U64_MAX (~((u64)0))
732 -#define MM_SLOTS_HASH_SHIFT 10
733 -#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT)
734 -static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS];
736 -static struct mm_slot ksm_mm_head = {
737 - .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
739 -static struct ksm_scan ksm_scan = {
740 - .mm_slot = &ksm_mm_head,
743 + * Flags for rmap_item to indicate whether it is listed in the stable/unstable tree.
744 + * The flags use the low bits of rmap_item.address
745 + */
746 +#define UNSTABLE_FLAG 0x1
747 +#define STABLE_FLAG 0x2
748 +#define get_rmap_addr(x) ((x)->address & PAGE_MASK)
751 + * rmap_list_entry helpers
752 + */
753 +#define IS_ADDR_FLAG 1
754 +#define is_addr(ptr) ((unsigned long)(ptr) & IS_ADDR_FLAG)
755 +#define set_is_addr(ptr) ((ptr) |= IS_ADDR_FLAG)
756 +#define get_clean_addr(ptr) (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG))
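The four helpers above implement a common kernel trick: a single unsigned long field in rmap_list_entry holds either a pointer to a 4-byte-aligned rmap_item or an address, and the lowest bit serves as the tag that tells them apart. A tiny self-contained demonstration (the macro bodies are copied verbatim from the patch; the rest is illustrative only and builds with GCC, as the kernel does):

#include <assert.h>

#define IS_ADDR_FLAG	1
#define is_addr(ptr)		((unsigned long)(ptr) & IS_ADDR_FLAG)
#define set_is_addr(ptr)	((ptr) |= IS_ADDR_FLAG)
#define get_clean_addr(ptr)	(((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG))

int main(void)
{
	unsigned long entry = 0x1000;	/* a page-aligned address, low bit clear */

	set_is_addr(entry);		/* tag the entry as an address ... */
	assert(is_addr(entry));		/* ... which is_addr() can detect ... */
	assert(get_clean_addr(entry) == 0x1000);	/* ... and strip again */
	return 0;
}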
760 + * High speed caches for frequently allocated and freed structs
761 + */
762 static struct kmem_cache *rmap_item_cache;
763 static struct kmem_cache *stable_node_cache;
764 -static struct kmem_cache *mm_slot_cache;
765 +static struct kmem_cache *node_vma_cache;
766 +static struct kmem_cache *vma_slot_cache;
767 +static struct kmem_cache *tree_node_cache;
768 +#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
769 + sizeof(struct __struct), __alignof__(struct __struct),\
770 + (__flags), NULL)
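For example (matching a use that appears later in this patch), rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0) expands to:

	rmap_item_cache = kmem_cache_create("ksm_rmap_item",
				sizeof(struct rmap_item),
				__alignof__(struct rmap_item),
				0, NULL);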
772 +/* The scan rounds ksmd is currently in */
773 +static unsigned long long ksm_scan_round = 1;
775 +/* The number of pages scanned since startup */
776 +static u64 ksm_pages_scanned;
778 +/* The number of pages scanned when the last scan round finished */
779 +static u64 ksm_pages_scanned_last;
781 +/* If the scanned number grows too large, we encode it here */
782 +static u64 pages_scanned_stored;
783 +static unsigned long pages_scanned_base;
785 /* The number of nodes in the stable tree */
786 static unsigned long ksm_pages_shared;
787 @@ -179,345 +235,408 @@
788 /* The number of nodes in the unstable tree */
789 static unsigned long ksm_pages_unshared;
791 -/* The number of rmap_items in use: to calculate pages_volatile */
792 -static unsigned long ksm_rmap_items;
793 +/* The number of pages remapped to zero pages */
794 +static unsigned long ksm_remap_zero_pages;
796 -/* Number of pages ksmd should scan in one batch */
797 -static unsigned int ksm_thread_pages_to_scan = 100;
799 + * Number of pages ksmd should scan in one batch. This is the top speed for
800 + * richly duplicated areas.
801 + */
802 +static unsigned long ksm_scan_batch_pages = 60000;
804 /* Milliseconds ksmd should sleep between batches */
805 -static unsigned int ksm_thread_sleep_millisecs = 20;
806 +static unsigned int ksm_sleep_jiffies = 2;
809 + * The threshold used to filter out thrashing areas.
810 + * If it is 0, filtering is disabled; otherwise it is the percentage upper bound
811 + * of the thrashing ratio of all areas. Any area with a bigger thrashing ratio
812 + * is considered to have a zero duplication ratio.
813 + */
814 +static unsigned int ksm_thrash_threshold = 50;
816 +/* To avoid floating point arithmetic, this is the scale factor of a
817 + * deduplication ratio number.
818 + */
819 +#define KSM_DEDUP_RATIO_SCALE 100
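A worked example of this fixed-point convention, using the macro defined just above (illustration only; the exact formula used elsewhere in the patch is not part of this excerpt):

	unsigned long pages_merged = 37, pages_scanned = 1000;
	unsigned long dedup_ratio = pages_merged * KSM_DEDUP_RATIO_SCALE / pages_scanned;
	/* dedup_ratio == 3, i.e. roughly 3%, with no floating point involved */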
822 +#define KSM_SCAN_RATIO_MAX 125
824 +/* minimum scan ratio for a vma, in unit of 1/KSM_SCAN_RATIO_MAX */
825 +static unsigned int ksm_min_scan_ratio = 1;
828 + * After each scan round, the scan ratio of an area with a big deduplication
829 + * ratio is upgraded by *=ksm_scan_ratio_delta
830 + */
831 +static unsigned int ksm_scan_ratio_delta = 5;
834 + * Inter-vma duplication number table page pointer array, initialized at
835 + * startup. Whenever ksmd finds that two areas have an identical page,
836 + * their corresponding table entry is increased. After each scan round
837 + * is finished, this table is scanned to calculate the estimated
838 + * duplication ratio for VMAs. Only a limited number (2048) of VMAs is
839 + * supported for now. We will migrate it to more scalable data structures
840 + * in the future.
841 + */
842 +#define KSM_DUP_VMA_MAX 2048
844 +#define INDIRECT_OFFSET 1
847 + * For mapping of vma_slot and its index in inter-vma duplication number
848 + * table
849 + */
850 +static struct radix_tree_root ksm_vma_tree;
851 +static unsigned long ksm_vma_tree_num;
852 +static unsigned long ksm_vma_tree_index_end;
854 +/* Array of all scan_rung, ksm_scan_ladder[0] having the minimum scan ratio */
855 +static struct scan_rung *ksm_scan_ladder;
856 +static unsigned int ksm_scan_ladder_size;
858 +/* The number of VMAs we are keeping track of */
859 +static unsigned long ksm_vma_slot_num;
861 +/* How many times the ksmd has slept since startup */
862 +static u64 ksm_sleep_times;
864 #define KSM_RUN_STOP 0
865 #define KSM_RUN_MERGE 1
866 -#define KSM_RUN_UNMERGE 2
867 -static unsigned int ksm_run = KSM_RUN_STOP;
868 +static unsigned int ksm_run = KSM_RUN_MERGE;
870 static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
871 static DEFINE_MUTEX(ksm_thread_mutex);
872 -static DEFINE_SPINLOCK(ksm_mmlist_lock);
874 -#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
875 - sizeof(struct __struct), __alignof__(struct __struct),\
876 - (__flags), NULL)
878 + * List vma_slot_new is for newly created vma_slot waiting to be added by
879 + * ksmd. If one cannot be added (e.g. because it is too small), it is moved to
880 + * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding
881 + * VMA has been removed/freed.
882 + */
883 +struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new);
884 +struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd);
885 +struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del);
886 +static DEFINE_SPINLOCK(vma_slot_list_lock);
888 -static int __init ksm_slab_init(void)
889 +/* The unstable tree heads */
890 +static struct rb_root root_unstable_tree = RB_ROOT;
893 + * All tree_nodes are in a list to be freed at once when unstable tree is
894 + * freed after each scan round.
895 + */
896 +static struct list_head unstable_tree_node_list =
897 + LIST_HEAD_INIT(unstable_tree_node_list);
899 +/* List contains all stable nodes */
900 +static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list);
903 + * When the hash strength is changed, the stable tree must be delta_hashed and
904 + * re-structured. We use two sets of the structs below to speed up the
905 + * re-structuring of the stable tree.
906 + */
907 +static struct list_head
908 +stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]),
909 + LIST_HEAD_INIT(stable_tree_node_list[1])};
911 +static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0];
912 +static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT};
913 +static struct rb_root *root_stable_treep = &root_stable_tree[0];
914 +static unsigned long stable_tree_index;
916 +/* The hash strength needed to hash a full page */
917 +#define HASH_STRENGTH_FULL (PAGE_SIZE / sizeof(u32))
919 +/* The hash strength needed for loop-back hashing */
920 +#define HASH_STRENGTH_MAX (HASH_STRENGTH_FULL + 10)
922 +/* The random offsets in a page */
923 +static u32 *random_nums;
925 +/* The hash strength */
926 +static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4;
928 +/* The delta value each time the hash strength increases or decreases */
929 +static unsigned long hash_strength_delta;
930 +#define HASH_STRENGTH_DELTA_MAX 5
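Worked numbers for the definitions above, assuming a 4 KiB page size: HASH_STRENGTH_FULL = 4096 / sizeof(u32) = 1024 sampled words, HASH_STRENGTH_MAX = 1024 + 10 = 1034, and the initial hash_strength of HASH_STRENGTH_FULL >> 4 = 64 means ksmd starts out sampling only 64 of the 1024 words in each page.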
932 +/* The time we have saved due to random_sample_hash */
933 +static u64 rshash_pos;
935 +/* The time we have wasted due to hash collision */
936 +static u64 rshash_neg;
938 +struct ksm_benefit {
939 + u64 pos;
940 + u64 neg;
941 + u64 scanned;
942 + unsigned long base;
943 +} benefit;
946 + * The relative cost of memcmp, compared to 1 time unit of random sample
947 + * hash. This value is measured when the ksm module is initialized.
948 + */
949 +static unsigned long memcmp_cost;
951 +static unsigned long rshash_neg_cont_zero;
952 +static unsigned long rshash_cont_obscure;
954 +/* The possible states of hash strength adjustment heuristic */
955 +enum rshash_states {
956 + RSHASH_STILL,
957 + RSHASH_TRYUP,
958 + RSHASH_TRYDOWN,
959 + RSHASH_NEW,
960 + RSHASH_PRE_STILL,
963 +/* The possible direction we are about to adjust hash strength */
964 +enum rshash_direct {
965 + GO_UP,
966 + GO_DOWN,
967 + OBSCURE,
968 + STILL,
971 +/* random sampling hash state machine */
972 +static struct {
973 + enum rshash_states state;
974 + enum rshash_direct pre_direct;
975 + u8 below_count;
976 + /* Keep a lookup window of size 5; if above_count/below_count > 3
977 + * in this window, we stop trying.
978 + */
979 + u8 lookup_window_index;
980 + u64 stable_benefit;
981 + unsigned long turn_point_down;
982 + unsigned long turn_benefit_down;
983 + unsigned long turn_point_up;
984 + unsigned long turn_benefit_up;
985 + unsigned long stable_point;
986 +} rshash_state;
988 +/*zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX]*/
989 +static u32 *zero_hash_table;
991 +extern unsigned long zero_pfn __read_mostly;
993 +static inline struct node_vma *alloc_node_vma(void)
995 - rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
996 - if (!rmap_item_cache)
997 - goto out;
998 + struct node_vma *node_vma;
999 + node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL);
1000 + if (node_vma) {
1001 + INIT_HLIST_HEAD(&node_vma->rmap_hlist);
1002 + INIT_HLIST_NODE(&node_vma->hlist);
1003 + node_vma->last_update = 0;
1005 + return node_vma;
1008 - stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
1009 - if (!stable_node_cache)
1010 - goto out_free1;
1011 +static inline void free_node_vma(struct node_vma *node_vma)
1013 + kmem_cache_free(node_vma_cache, node_vma);
1016 - mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
1017 - if (!mm_slot_cache)
1018 - goto out_free2;
1020 - return 0;
1021 +static inline struct vma_slot *alloc_vma_slot(void)
1023 + struct vma_slot *slot;
1025 -out_free2:
1026 - kmem_cache_destroy(stable_node_cache);
1027 -out_free1:
1028 - kmem_cache_destroy(rmap_item_cache);
1029 -out:
1030 - return -ENOMEM;
1031 + /*
1032 + * In case ksm has not been initialized by now.
1033 + * We still need to reconsider the call site of ksm_init() in the future.
1034 + */
1035 + if (!vma_slot_cache)
1036 + return NULL;
1038 + slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL);
1039 + if (slot) {
1040 + INIT_LIST_HEAD(&slot->ksm_list);
1041 + INIT_LIST_HEAD(&slot->slot_list);
1042 + INIT_RADIX_TREE(&slot->dup_tree, GFP_KERNEL);
1043 + slot->ksm_index = -1;
1044 + slot->need_rerand = 1;
1046 + return slot;
1049 -static void __init ksm_slab_free(void)
1050 +static inline void free_vma_slot(struct vma_slot *vma_slot)
1052 - kmem_cache_destroy(mm_slot_cache);
1053 - kmem_cache_destroy(stable_node_cache);
1054 - kmem_cache_destroy(rmap_item_cache);
1055 - mm_slot_cache = NULL;
1056 + kmem_cache_free(vma_slot_cache, vma_slot);
1061 static inline struct rmap_item *alloc_rmap_item(void)
1063 struct rmap_item *rmap_item;
1065 rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
1066 - if (rmap_item)
1067 - ksm_rmap_items++;
1068 + if (rmap_item) {
1069 + /* BUG if the lowest bit is not clear; it is reserved for flag use */
1070 + BUG_ON(is_addr(rmap_item));
1072 return rmap_item;
1075 static inline void free_rmap_item(struct rmap_item *rmap_item)
1077 - ksm_rmap_items--;
1078 - rmap_item->mm = NULL; /* debug safety */
1079 + rmap_item->slot = NULL; /* debug safety */
1080 kmem_cache_free(rmap_item_cache, rmap_item);
1083 static inline struct stable_node *alloc_stable_node(void)
1085 - return kmem_cache_alloc(stable_node_cache, GFP_KERNEL);
1086 + struct stable_node *node;
1087 + node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | GFP_ATOMIC);
1088 + if (!node)
1089 + return NULL;
1091 + INIT_HLIST_HEAD(&node->hlist);
1092 + list_add(&node->all_list, &stable_node_list);
1093 + return node;
1096 static inline void free_stable_node(struct stable_node *stable_node)
1098 + list_del(&stable_node->all_list);
1099 kmem_cache_free(stable_node_cache, stable_node);
1102 -static inline struct mm_slot *alloc_mm_slot(void)
1103 +static inline struct tree_node *alloc_tree_node(struct list_head *list)
1105 - if (!mm_slot_cache) /* initialization failed */
1106 + struct tree_node *node;
1107 + node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | GFP_ATOMIC);
1108 + if (!node)
1109 return NULL;
1110 - return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
1112 + list_add(&node->all_list, list);
1113 + return node;
1116 -static inline void free_mm_slot(struct mm_slot *mm_slot)
1117 +static inline void free_tree_node(struct tree_node *node)
1119 - kmem_cache_free(mm_slot_cache, mm_slot);
1120 + list_del(&node->all_list);
1121 + kmem_cache_free(tree_node_cache, node);
1124 -static struct mm_slot *get_mm_slot(struct mm_struct *mm)
1125 +static void ksm_drop_anon_vma(struct rmap_item *rmap_item)
1127 - struct mm_slot *mm_slot;
1128 - struct hlist_head *bucket;
1129 - struct hlist_node *node;
1130 + struct anon_vma *anon_vma = rmap_item->anon_vma;
1132 - bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
1133 - hlist_for_each_entry(mm_slot, node, bucket, link) {
1134 - if (mm == mm_slot->mm)
1135 - return mm_slot;
1137 - return NULL;
1138 + drop_anon_vma(anon_vma);
1141 -static void insert_to_mm_slots_hash(struct mm_struct *mm,
1142 - struct mm_slot *mm_slot)
1144 +/**
1145 + * Remove a stable node from stable_tree, may unlink from its tree_node and
1146 + * may remove its parent tree_node if no other stable node is pending.
1148 + * @stable_node The node to be removed
1149 + * @unlink_rb Will this node be unlinked from the rbtree?
1150 + * @remove_tree_node Will its tree_node be removed if empty?
1151 + */
1152 +static void remove_node_from_stable_tree(struct stable_node *stable_node,
1153 + int unlink_rb, int remove_tree_node)
1155 - struct hlist_head *bucket;
1156 + struct node_vma *node_vma;
1157 + struct rmap_item *rmap_item;
1158 + struct hlist_node *hlist, *rmap_hlist, *n;
1160 - bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
1161 - mm_slot->mm = mm;
1162 - hlist_add_head(&mm_slot->link, bucket);
1164 + if (!hlist_empty(&stable_node->hlist)) {
1165 + hlist_for_each_entry_safe(node_vma, hlist, n,
1166 + &stable_node->hlist, hlist) {
1167 + hlist_for_each_entry(rmap_item, rmap_hlist,
1168 + &node_vma->rmap_hlist, hlist) {
1169 + ksm_pages_sharing--;
1171 -static inline int in_stable_tree(struct rmap_item *rmap_item)
1173 - return rmap_item->address & STABLE_FLAG;
1175 + ksm_drop_anon_vma(rmap_item);
1176 + rmap_item->address &= PAGE_MASK;
1178 + free_node_vma(node_vma);
1179 + cond_resched();
1182 -static void hold_anon_vma(struct rmap_item *rmap_item,
1183 - struct anon_vma *anon_vma)
1185 - rmap_item->anon_vma = anon_vma;
1186 - get_anon_vma(anon_vma);
1188 + /* the last one is counted as shared */
1189 + ksm_pages_shared--;
1190 + ksm_pages_sharing++;
1193 -static void ksm_drop_anon_vma(struct rmap_item *rmap_item)
1195 - struct anon_vma *anon_vma = rmap_item->anon_vma;
1196 + if (stable_node->tree_node && unlink_rb) {
1197 + rb_erase(&stable_node->node,
1198 + &stable_node->tree_node->sub_root);
1200 + if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) &&
1201 + remove_tree_node) {
1202 + rb_erase(&stable_node->tree_node->node,
1203 + root_stable_treep);
1204 + free_tree_node(stable_node->tree_node);
1205 + } else {
1206 + stable_node->tree_node->count--;
1210 - drop_anon_vma(anon_vma);
1211 + free_stable_node(stable_node);
1215 - * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
1216 - * page tables after it has passed through ksm_exit() - which, if necessary,
1217 - * takes mmap_sem briefly to serialize against them. ksm_exit() does not set
1218 - * a special flag: they can just back out as soon as mm_users goes to zero.
1219 - * ksm_test_exit() is used throughout to make this test for exit: in some
1220 - * places for correctness, in some places just to avoid unnecessary work.
1221 - */
1222 -static inline bool ksm_test_exit(struct mm_struct *mm)
1224 - return atomic_read(&mm->mm_users) == 0;
1228 - * We use break_ksm to break COW on a ksm page: it's a stripped down
1229 + * get_ksm_page: checks if the page indicated by the stable node
1230 + * is still its ksm page, despite having held no reference to it.
1231 + * In which case we can trust the content of the page, and it
1232 + * returns the gotten page; but if the page has now been zapped,
1233 + * remove the stale node from the stable tree and return NULL.
1235 - * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
1236 - * put_page(page);
1237 + * You would expect the stable_node to hold a reference to the ksm page.
1238 + * But if it increments the page's count, swapping out has to wait for
1239 + * ksmd to come around again before it can free the page, which may take
1240 + * seconds or even minutes: much too unresponsive. So instead we use a
1241 + * "keyhole reference": access to the ksm page from the stable node peeps
1242 + * out through its keyhole to see if that page still holds the right key,
1243 + * pointing back to this stable node. This relies on freeing a PageAnon
1244 + * page to reset its page->mapping to NULL, and relies on no other use of
1245 + * a page to put something that might look like our key in page->mapping.
1247 - * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
1248 - * in case the application has unmapped and remapped mm,addr meanwhile.
1249 - * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
1250 - * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
1251 + * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
1252 + * but this is different - made simpler by ksm_thread_mutex being held, but
1253 + * interesting for assuming that no other use of the struct page could ever
1254 + * put our expected_mapping into page->mapping (or a field of the union which
1255 + * coincides with page->mapping). The RCU calls are not for KSM at all, but
1256 + * to keep the page_count protocol described with page_cache_get_speculative.
1258 + * Note: it is possible that get_ksm_page() will return NULL one moment,
1259 + * then page the next, if the page is in between page_freeze_refs() and
1260 + * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
1261 + * is on its way to being freed; but it is an anomaly to bear in mind.
1263 + * @unlink_rb: whether removal of this node should first unlink it from
1264 + * its rbtree. stable_node_reinsert will prevent this when restructuring the
1265 + * node from its old tree.
1267 + * @remove_tree_node: if this is the last one of its tree_node, will the
1268 + * tree_node be freed? If we are inserting a stable node, this tree_node may
1269 + * be reused, so don't free it.
1271 -static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
1272 +static struct page *get_ksm_page(struct stable_node *stable_node,
1273 + int unlink_rb, int remove_tree_node)
1275 struct page *page;
1276 - int ret = 0;
1277 + void *expected_mapping;
1279 - do {
1280 - cond_resched();
1281 - page = follow_page(vma, addr, FOLL_GET);
1282 - if (IS_ERR_OR_NULL(page))
1283 - break;
1284 - if (PageKsm(page))
1285 - ret = handle_mm_fault(vma->vm_mm, vma, addr,
1286 - FAULT_FLAG_WRITE);
1287 - else
1288 - ret = VM_FAULT_WRITE;
1289 - put_page(page);
1290 - } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
1291 - /*
1292 - * We must loop because handle_mm_fault() may back out if there's
1293 - * any difficulty e.g. if pte accessed bit gets updated concurrently.
1295 - * VM_FAULT_WRITE is what we have been hoping for: it indicates that
1296 - * COW has been broken, even if the vma does not permit VM_WRITE;
1297 - * but note that a concurrent fault might break PageKsm for us.
1299 - * VM_FAULT_SIGBUS could occur if we race with truncation of the
1300 - * backing file, which also invalidates anonymous pages: that's
1301 - * okay, that truncation will have unmapped the PageKsm for us.
1303 - * VM_FAULT_OOM: at the time of writing (late July 2009), setting
1304 - * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
1305 - * current task has TIF_MEMDIE set, and will be OOM killed on return
1306 - * to user; and ksmd, having no mm, would never be chosen for that.
1308 - * But if the mm is in a limited mem_cgroup, then the fault may fail
1309 - * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
1310 - * even ksmd can fail in this way - though it's usually breaking ksm
1311 - * just to undo a merge it made a moment before, so unlikely to oom.
1313 - * That's a pity: we might therefore have more kernel pages allocated
1314 - * than we're counting as nodes in the stable tree; but ksm_do_scan
1315 - * will retry to break_cow on each pass, so should recover the page
1316 - * in due course. The important thing is to not let VM_MERGEABLE
1317 - * be cleared while any such pages might remain in the area.
1318 - */
1319 - return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
1322 -static void break_cow(struct rmap_item *rmap_item)
1324 - struct mm_struct *mm = rmap_item->mm;
1325 - unsigned long addr = rmap_item->address;
1326 - struct vm_area_struct *vma;
1328 - /*
1329 - * It is not an accident that whenever we want to break COW
1330 - * to undo, we also need to drop a reference to the anon_vma.
1331 - */
1332 - ksm_drop_anon_vma(rmap_item);
1334 - down_read(&mm->mmap_sem);
1335 - if (ksm_test_exit(mm))
1336 - goto out;
1337 - vma = find_vma(mm, addr);
1338 - if (!vma || vma->vm_start > addr)
1339 - goto out;
1340 - if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
1341 - goto out;
1342 - break_ksm(vma, addr);
1343 -out:
1344 - up_read(&mm->mmap_sem);
1347 -static struct page *page_trans_compound_anon(struct page *page)
1349 - if (PageTransCompound(page)) {
1350 - struct page *head = compound_trans_head(page);
1351 - /*
1352 - * head may actually be splitted and freed from under
1353 - * us but it's ok here.
1354 - */
1355 - if (PageAnon(head))
1356 - return head;
1358 - return NULL;
1361 -static struct page *get_mergeable_page(struct rmap_item *rmap_item)
1363 - struct mm_struct *mm = rmap_item->mm;
1364 - unsigned long addr = rmap_item->address;
1365 - struct vm_area_struct *vma;
1366 - struct page *page;
1368 - down_read(&mm->mmap_sem);
1369 - if (ksm_test_exit(mm))
1370 - goto out;
1371 - vma = find_vma(mm, addr);
1372 - if (!vma || vma->vm_start > addr)
1373 - goto out;
1374 - if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
1375 - goto out;
1377 - page = follow_page(vma, addr, FOLL_GET);
1378 - if (IS_ERR_OR_NULL(page))
1379 - goto out;
1380 - if (PageAnon(page) || page_trans_compound_anon(page)) {
1381 - flush_anon_page(vma, page, addr);
1382 - flush_dcache_page(page);
1383 - } else {
1384 - put_page(page);
1385 -out: page = NULL;
1387 - up_read(&mm->mmap_sem);
1388 - return page;
1391 -static void remove_node_from_stable_tree(struct stable_node *stable_node)
1393 - struct rmap_item *rmap_item;
1394 - struct hlist_node *hlist;
1396 - hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
1397 - if (rmap_item->hlist.next)
1398 - ksm_pages_sharing--;
1399 - else
1400 - ksm_pages_shared--;
1401 - ksm_drop_anon_vma(rmap_item);
1402 - rmap_item->address &= PAGE_MASK;
1403 - cond_resched();
1406 - rb_erase(&stable_node->node, &root_stable_tree);
1407 - free_stable_node(stable_node);
1411 - * get_ksm_page: checks if the page indicated by the stable node
1412 - * is still its ksm page, despite having held no reference to it.
1413 - * In which case we can trust the content of the page, and it
1414 - * returns the gotten page; but if the page has now been zapped,
1415 - * remove the stale node from the stable tree and return NULL.
1417 - * You would expect the stable_node to hold a reference to the ksm page.
1418 - * But if it increments the page's count, swapping out has to wait for
1419 - * ksmd to come around again before it can free the page, which may take
1420 - * seconds or even minutes: much too unresponsive. So instead we use a
1421 - * "keyhole reference": access to the ksm page from the stable node peeps
1422 - * out through its keyhole to see if that page still holds the right key,
1423 - * pointing back to this stable node. This relies on freeing a PageAnon
1424 - * page to reset its page->mapping to NULL, and relies on no other use of
1425 - * a page to put something that might look like our key in page->mapping.
1427 - * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
1428 - * but this is different - made simpler by ksm_thread_mutex being held, but
1429 - * interesting for assuming that no other use of the struct page could ever
1430 - * put our expected_mapping into page->mapping (or a field of the union which
1431 - * coincides with page->mapping). The RCU calls are not for KSM at all, but
1432 - * to keep the page_count protocol described with page_cache_get_speculative.
1434 - * Note: it is possible that get_ksm_page() will return NULL one moment,
1435 - * then page the next, if the page is in between page_freeze_refs() and
1436 - * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
1437 - * is on its way to being freed; but it is an anomaly to bear in mind.
1438 - */
1439 -static struct page *get_ksm_page(struct stable_node *stable_node)
1441 - struct page *page;
1442 - void *expected_mapping;
1444 - page = pfn_to_page(stable_node->kpfn);
1445 - expected_mapping = (void *)stable_node +
1446 - (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
1447 - rcu_read_lock();
1448 - if (page->mapping != expected_mapping)
1449 - goto stale;
1450 - if (!get_page_unless_zero(page))
1451 - goto stale;
1452 - if (page->mapping != expected_mapping) {
1453 + page = pfn_to_page(stable_node->kpfn);
1454 + expected_mapping = (void *)stable_node +
1455 + (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
1456 + rcu_read_lock();
1457 + if (page->mapping != expected_mapping)
1458 + goto stale;
1459 + if (!get_page_unless_zero(page))
1460 + goto stale;
1461 + if (page->mapping != expected_mapping) {
1462 put_page(page);
1463 goto stale;
1465 @@ -525,7 +644,8 @@
1466 return page;
1467 stale:
1468 rcu_read_unlock();
1469 - remove_node_from_stable_tree(stable_node);
1470 + remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node);
1472 return NULL;
1475 @@ -533,32 +653,46 @@
1476 * Removing rmap_item from stable or unstable tree.
1477 * This function will clean the information from the stable/unstable tree.
1479 -static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
1480 +static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
1482 if (rmap_item->address & STABLE_FLAG) {
1483 struct stable_node *stable_node;
1484 + struct node_vma *node_vma;
1485 struct page *page;
1487 - stable_node = rmap_item->head;
1488 - page = get_ksm_page(stable_node);
1489 + node_vma = rmap_item->head;
1490 + stable_node = node_vma->head;
1491 + page = get_ksm_page(stable_node, 1, 1);
1492 if (!page)
1493 goto out;
1495 + /*
1496 + * page lock is needed because it's racing with
1497 + * try_to_unmap_ksm(), etc.
1498 + */
1499 lock_page(page);
1500 hlist_del(&rmap_item->hlist);
1502 + if (hlist_empty(&node_vma->rmap_hlist)) {
1503 + hlist_del(&node_vma->hlist);
1504 + free_node_vma(node_vma);
1506 unlock_page(page);
1507 - put_page(page);
1509 - if (stable_node->hlist.first)
1510 - ksm_pages_sharing--;
1511 - else
1512 + put_page(page);
1513 + if (hlist_empty(&stable_node->hlist)) {
1514 + /* do NOT call remove_node_from_stable_tree() here,
1515 + * it's possible for a forked rmap_item to not be in the
1516 + * stable tree while the in-tree rmap_items have been
1517 + * deleted.
1518 + */
1519 ksm_pages_shared--;
1520 + } else
1521 + ksm_pages_sharing--;
1523 - ksm_drop_anon_vma(rmap_item);
1524 - rmap_item->address &= PAGE_MASK;
1526 + ksm_drop_anon_vma(rmap_item);
1527 } else if (rmap_item->address & UNSTABLE_FLAG) {
1528 - unsigned char age;
1530 * Usually ksmd can and must skip the rb_erase, because
1531 * root_unstable_tree was already reset to RB_ROOT.
1532 @@ -566,173 +700,458 @@
1533 * if this rmap_item was inserted by this scan, rather
1534 * than left over from before.
1536 - age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
1537 - BUG_ON(age > 1);
1538 - if (!age)
1539 - rb_erase(&rmap_item->node, &root_unstable_tree);
1541 + if (rmap_item->append_round == ksm_scan_round) {
1542 + rb_erase(&rmap_item->node,
1543 + &rmap_item->tree_node->sub_root);
1544 + if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) {
1545 + rb_erase(&rmap_item->tree_node->node,
1546 + &root_unstable_tree);
1548 + free_tree_node(rmap_item->tree_node);
1549 + } else
1550 + rmap_item->tree_node->count--;
1552 ksm_pages_unshared--;
1553 - rmap_item->address &= PAGE_MASK;
1556 + rmap_item->address &= PAGE_MASK;
1557 + rmap_item->hash_max = 0;
1559 out:
1560 cond_resched(); /* we're called from many long loops */
1563 -static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
1564 - struct rmap_item **rmap_list)
1565 +/**
1566 + * Need to do two things:
1567 + * 1. check if slot was moved to del list
1568 + * 2. make sure the mmap_sem is manipulated under valid vma.
1570 + * My concern here is that in some cases, this may cause
1571 + * vma_slot_list_lock() waiters to be serialized further by some
1572 + * sem->wait_lock; can this really be expensive?
1575 + * @return
1576 + * 0: if successfully locked mmap_sem
1577 + * -ENOENT: this slot was moved to del list
1578 + * -EBUSY: vma lock failed
1579 + */
1580 +static int try_down_read_slot_mmap_sem(struct vma_slot *slot)
1582 - while (*rmap_list) {
1583 - struct rmap_item *rmap_item = *rmap_list;
1584 - *rmap_list = rmap_item->rmap_list;
1585 - remove_rmap_item_from_tree(rmap_item);
1586 - free_rmap_item(rmap_item);
1587 + struct vm_area_struct *vma;
1588 + struct mm_struct *mm;
1589 + struct rw_semaphore *sem;
1591 + spin_lock(&vma_slot_list_lock);
1593 + /* The slot_list entry is removed and re-inited from the new list when the slot
1594 + * enters ksm_list. If it is now not empty, the slot must have been moved to the del list
1595 + */
1596 + if (!list_empty(&slot->slot_list)) {
1597 + spin_unlock(&vma_slot_list_lock);
1598 + return -ENOENT;
1601 + BUG_ON(slot->pages != vma_pages(slot->vma));
1602 + /* Ok, vma still valid */
1603 + vma = slot->vma;
1604 + mm = vma->vm_mm;
1605 + sem = &mm->mmap_sem;
1606 + if (down_read_trylock(sem)) {
1607 + spin_unlock(&vma_slot_list_lock);
1608 + return 0;
1611 + spin_unlock(&vma_slot_list_lock);
1612 + return -EBUSY;
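A hypothetical caller, sketched here only to show how the three documented return values would typically be handled; it is not code from this patch.

/* Illustrative sketch of a caller of try_down_read_slot_mmap_sem(). */
static void scan_one_slot_sketch(struct vma_slot *slot)
{
	int err = try_down_read_slot_mmap_sem(slot);

	switch (err) {
	case 0:
		/* mmap_sem is now held for read; slot->vma may be walked. */
		/* ... do the per-VMA scanning work here ... */
		up_read(&slot->vma->vm_mm->mmap_sem);
		break;
	case -ENOENT:
		/* Slot was moved to the del list; stop tracking it. */
		break;
	case -EBUSY:
		/* Could not take the lock; retry the slot on a later pass. */
		break;
	}
}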
1616 - * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
1617 - * than check every pte of a given vma, the locking doesn't quite work for
1618 - * that - an rmap_item is assigned to the stable tree after inserting ksm
1619 - * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
1620 - * rmap_items from parent to child at fork time (so as not to waste time
1621 - * if exit comes before the next scan reaches it).
1623 - * Similarly, although we'd like to remove rmap_items (so updating counts
1624 - * and freeing memory) when unmerging an area, it's easier to leave that
1625 - * to the next pass of ksmd - consider, for example, how ksmd might be
1626 - * in cmp_and_merge_page on one of the rmap_items we would be removing.
1627 - */
1628 -static int unmerge_ksm_pages(struct vm_area_struct *vma,
1629 - unsigned long start, unsigned long end)
1630 +static inline unsigned long
1631 +vma_page_address(struct page *page, struct vm_area_struct *vma)
1633 - unsigned long addr;
1634 - int err = 0;
1635 + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1636 + unsigned long address;
1638 - for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
1639 - if (ksm_test_exit(vma->vm_mm))
1640 - break;
1641 - if (signal_pending(current))
1642 - err = -ERESTARTSYS;
1643 - else
1644 - err = break_ksm(vma, addr);
1645 + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
1646 + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
1647 + /* page should be within @vma mapping range */
1648 + return -EFAULT;
1650 - return err;
1651 + return address;
1654 -#ifdef CONFIG_SYSFS
1656 - * Only called through the sysfs control interface:
1657 + * Test if the mm is exiting
1659 -static int unmerge_and_remove_all_rmap_items(void)
1660 +static inline bool ksm_test_exit(struct mm_struct *mm)
1662 + return atomic_read(&mm->mm_users) == 0;
1665 +/* return 0 on success with the item's mmap_sem locked */
1666 +static inline int get_mergeable_page_lock_mmap(struct rmap_item *item)
1668 - struct mm_slot *mm_slot;
1669 struct mm_struct *mm;
1670 struct vm_area_struct *vma;
1671 - int err = 0;
1672 + struct vma_slot *slot = item->slot;
1673 + int err = -EINVAL;
1675 - spin_lock(&ksm_mmlist_lock);
1676 - ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
1677 - struct mm_slot, mm_list);
1678 - spin_unlock(&ksm_mmlist_lock);
1680 - for (mm_slot = ksm_scan.mm_slot;
1681 - mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
1682 - mm = mm_slot->mm;
1683 - down_read(&mm->mmap_sem);
1684 - for (vma = mm->mmap; vma; vma = vma->vm_next) {
1685 - if (ksm_test_exit(mm))
1686 - break;
1687 - if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
1688 - continue;
1689 - err = unmerge_ksm_pages(vma,
1690 - vma->vm_start, vma->vm_end);
1691 - if (err)
1692 - goto error;
1695 - remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
1697 - spin_lock(&ksm_mmlist_lock);
1698 - ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
1699 - struct mm_slot, mm_list);
1700 - if (ksm_test_exit(mm)) {
1701 - hlist_del(&mm_slot->link);
1702 - list_del(&mm_slot->mm_list);
1703 - spin_unlock(&ksm_mmlist_lock);
1705 - free_mm_slot(mm_slot);
1706 - clear_bit(MMF_VM_MERGEABLE, &mm->flags);
1707 - up_read(&mm->mmap_sem);
1708 - mmdrop(mm);
1709 - } else {
1710 - spin_unlock(&ksm_mmlist_lock);
1711 - up_read(&mm->mmap_sem);
1713 + struct page *page;
1715 + BUG_ON(!item->slot);
1716 + /*
1717 + * try_down_read_slot_mmap_sem() returns non-zero if the slot
1718 + * has been removed by ksm_remove_vma().
1719 + */
1720 + if (try_down_read_slot_mmap_sem(slot))
1721 + return -EBUSY;
1723 + mm = slot->vma->vm_mm;
1724 + vma = slot->vma;
1726 + if (ksm_test_exit(mm))
1727 + goto failout_up;
1729 + page = item->page;
1730 + rcu_read_lock();
1731 + if (!get_page_unless_zero(page)) {
1732 + rcu_read_unlock();
1733 + goto failout_up;
1736 - ksm_scan.seqnr = 0;
1737 + /* No need to consider huge pages here. */
1738 + if (item->slot->vma->anon_vma != page_anon_vma(page) ||
1739 + vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) {
1740 + /*
1741 + * TODO:
1742 + * should we release this item because of its stale page
1743 + * mapping?
1744 + */
1745 + put_page(page);
1746 + rcu_read_unlock();
1747 + goto failout_up;
1749 + rcu_read_unlock();
1750 return 0;
1752 -error:
1753 +failout_up:
1754 up_read(&mm->mmap_sem);
1755 - spin_lock(&ksm_mmlist_lock);
1756 - ksm_scan.mm_slot = &ksm_mm_head;
1757 - spin_unlock(&ksm_mmlist_lock);
1758 return err;
1760 -#endif /* CONFIG_SYSFS */
1762 -static u32 calc_checksum(struct page *page)
1764 + * What kind of VMA is considered?
1765 + */
1766 +static inline int vma_can_enter(struct vm_area_struct *vma)
1768 - u32 checksum;
1769 - void *addr = kmap_atomic(page, KM_USER0);
1770 - checksum = jhash2(addr, PAGE_SIZE / 4, 17);
1771 - kunmap_atomic(addr, KM_USER0);
1772 - return checksum;
1773 + return !(vma->vm_flags & (VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1774 + VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
1775 + VM_NONLINEAR | VM_MIXEDMAP | VM_SAO |
1776 + VM_SHARED | VM_MAYSHARE | VM_GROWSUP
1777 + | VM_GROWSDOWN));
1780 -static int memcmp_pages(struct page *page1, struct page *page2)
1782 + * Called whenever a fresh new vma is created. A new vma_slot
1783 + * is created and inserted into a global list. Must be called
1784 + * after the vma is inserted into its mm.
1785 + */
1786 +inline void ksm_vma_add_new(struct vm_area_struct *vma)
1788 - char *addr1, *addr2;
1789 - int ret;
1790 + struct vma_slot *slot;
1792 - addr1 = kmap_atomic(page1, KM_USER0);
1793 - addr2 = kmap_atomic(page2, KM_USER1);
1794 - ret = memcmp(addr1, addr2, PAGE_SIZE);
1795 - kunmap_atomic(addr2, KM_USER1);
1796 - kunmap_atomic(addr1, KM_USER0);
1797 - return ret;
1798 + if (!vma_can_enter(vma)) {
1799 + vma->ksm_vma_slot = NULL;
1800 + return;
1803 + slot = alloc_vma_slot();
1804 + if (!slot) {
1805 + vma->ksm_vma_slot = NULL;
1806 + return;
1809 + vma->ksm_vma_slot = slot;
1810 + slot->vma = vma;
1811 + slot->mm = vma->vm_mm;
1812 + slot->ctime_j = jiffies;
1813 + slot->pages = vma_pages(vma);
1814 + spin_lock(&vma_slot_list_lock);
1815 + list_add_tail(&slot->slot_list, &vma_slot_new);
1816 + spin_unlock(&vma_slot_list_lock);
1819 -static inline int pages_identical(struct page *page1, struct page *page2)
1821 + * Called after vma is unlinked from its mm
1822 + */
1823 +void ksm_remove_vma(struct vm_area_struct *vma)
1825 - return !memcmp_pages(page1, page2);
1826 + struct vma_slot *slot;
1828 + if (!vma->ksm_vma_slot)
1829 + return;
1831 + slot = vma->ksm_vma_slot;
1832 + spin_lock(&vma_slot_list_lock);
1833 + if (list_empty(&slot->slot_list)) {
1834 + /**
1835 + * This slot has been added by ksmd, so move to the del list
1836 + * waiting for ksmd to free it.
1837 + */
1838 + list_add_tail(&slot->slot_list, &vma_slot_del);
1839 + } else {
1840 + /**
1841 + * It's still on new list. It's ok to free slot directly.
1842 + */
1843 + list_del(&slot->slot_list);
1844 + free_vma_slot(slot);
1846 + spin_unlock(&vma_slot_list_lock);
1847 + vma->ksm_vma_slot = NULL;
1850 -static int write_protect_page(struct vm_area_struct *vma, struct page *page,
1851 - pte_t *orig_pte)
1852 +/* 32/3 < they < 32/2 */
1853 +#define shiftl 8
1854 +#define shiftr 12
1856 +#define HASH_FROM_TO(from, to) \
1857 +for (index = from; index < to; index++) { \
1858 + pos = random_nums[index]; \
1859 + hash += key[pos]; \
1860 + hash += (hash << shiftl); \
1861 + hash ^= (hash >> shiftr); \
1865 +#define HASH_FROM_DOWN_TO(from, to) \
1866 +for (index = from - 1; index >= to; index--) { \
1867 + hash ^= (hash >> shiftr); \
1868 + hash ^= (hash >> (shiftr*2)); \
1869 + hash -= (hash << shiftl); \
1870 + hash += (hash << (shiftl*2)); \
1871 + pos = random_nums[index]; \
1872 + hash -= key[pos]; \
1876 + * The main random sample hash function.
1877 + */
1878 +static u32 random_sample_hash(void *addr, u32 hash_strength)
1880 - struct mm_struct *mm = vma->vm_mm;
1881 - unsigned long addr;
1882 - pte_t *ptep;
1883 - spinlock_t *ptl;
1884 - int swapped;
1885 - int err = -EFAULT;
1886 + u32 hash = 0xdeadbeef;
1887 + int index, pos, loop = hash_strength;
1888 + u32 *key = (u32 *)addr;
1890 - addr = page_address_in_vma(page, vma);
1891 - if (addr == -EFAULT)
1892 - goto out;
1893 + if (loop > HASH_STRENGTH_FULL)
1894 + loop = HASH_STRENGTH_FULL;
1896 - BUG_ON(PageTransCompound(page));
1897 - ptep = page_check_address(page, mm, addr, &ptl, 0);
1898 - if (!ptep)
1899 - goto out;
1900 + HASH_FROM_TO(0, loop);
1902 - if (pte_write(*ptep) || pte_dirty(*ptep)) {
1903 - pte_t entry;
1904 + if (hash_strength > HASH_STRENGTH_FULL) {
1905 + loop = hash_strength - HASH_STRENGTH_FULL;
1906 + HASH_FROM_TO(0, loop);
1909 - swapped = PageSwapCache(page);
1910 - flush_cache_page(vma, addr, page_to_pfn(page));
1911 + return hash;
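
For illustration, the random sample hash above mixes only hash_strength randomly chosen 32-bit words of the page, so the per-page cost scales with the strength rather than with the page size. Below is a minimal user-space sketch of the same idea, assuming a 4 KiB page, a Fisher-Yates initialisation of the sample table, and simple clamping of over-full strengths (the patch instead re-mixes words a second time); the sketch_ names are illustrative only, not the patch's identifiers.

#include <stdint.h>
#include <stdlib.h>

#define SKETCH_PAGE_SIZE     4096
#define SKETCH_STRENGTH_FULL (SKETCH_PAGE_SIZE / sizeof(uint32_t))
#define SKETCH_SHIFTL        8
#define SKETCH_SHIFTR        12

static uint32_t sketch_random_nums[SKETCH_STRENGTH_FULL];

static void sketch_init_random_nums(void)
{
        /* identity permutation followed by a Fisher-Yates shuffle */
        for (uint32_t i = 0; i < SKETCH_STRENGTH_FULL; i++)
                sketch_random_nums[i] = i;
        for (uint32_t i = SKETCH_STRENGTH_FULL - 1; i > 0; i--) {
                uint32_t j = (uint32_t)rand() % (i + 1);
                uint32_t tmp = sketch_random_nums[i];

                sketch_random_nums[i] = sketch_random_nums[j];
                sketch_random_nums[j] = tmp;
        }
}

static uint32_t sketch_sample_hash(const void *addr, uint32_t strength)
{
        const uint32_t *key = addr;
        uint32_t hash = 0xdeadbeef;

        if (strength > SKETCH_STRENGTH_FULL)
                strength = SKETCH_STRENGTH_FULL;        /* simplification */

        /* mix only `strength` randomly chosen 32-bit words of the page */
        for (uint32_t i = 0; i < strength; i++) {
                hash += key[sketch_random_nums[i]];
                hash += hash << SKETCH_SHIFTL;
                hash ^= hash >> SKETCH_SHIFTR;
        }
        return hash;
}

A weaker strength makes scanning cheaper but raises the collision risk, which is what the rshash_pos/rshash_neg accounting further down tries to balance.
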
1915 +/**
1916 + * Used when the hash strength is adjusted.
1918 + * @addr The page's virtual address
1919 + * @from The original hash strength
1920 + * @to The hash strength changed to
1921 + * @hash The hash value generated with the "from" hash strength
1923 + * @return the new hash value
1924 + */
1925 +static u32 delta_hash(void *addr, int from, int to, u32 hash)
1927 + u32 *key = (u32 *)addr;
1928 + int index, pos; /* make sure they are int type */
1930 + if (to > from) {
1931 + if (from >= HASH_STRENGTH_FULL) {
1932 + from -= HASH_STRENGTH_FULL;
1933 + to -= HASH_STRENGTH_FULL;
1934 + HASH_FROM_TO(from, to);
1935 + } else if (to <= HASH_STRENGTH_FULL) {
1936 + HASH_FROM_TO(from, to);
1937 + } else {
1938 + HASH_FROM_TO(from, HASH_STRENGTH_FULL);
1939 + HASH_FROM_TO(0, to - HASH_STRENGTH_FULL);
1941 + } else {
1942 + if (from <= HASH_STRENGTH_FULL) {
1943 + HASH_FROM_DOWN_TO(from, to);
1944 + } else if (to >= HASH_STRENGTH_FULL) {
1945 + from -= HASH_STRENGTH_FULL;
1946 + to -= HASH_STRENGTH_FULL;
1947 + HASH_FROM_DOWN_TO(from, to);
1948 + } else {
1949 + HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0);
1950 + HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to);
1954 + return hash;
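
delta_hash() above avoids re-hashing the whole sample when the strength changes: raising the strength only mixes in the newly sampled words, while lowering it un-mixes the dropped words step by step (HASH_FROM_DOWN_TO inverts each add/shift/xor). A minimal sketch of the cheap raising direction is shown below, reusing the helpers from the previous sketch and assuming both strengths stay at or below the full strength; it is not the patch's code.

/* extend a hash computed over `from` sampled words to `to` sampled words
 * (from <= to <= SKETCH_STRENGTH_FULL); only the extra words are mixed in */
static uint32_t sketch_extend_hash(const void *addr, uint32_t from,
                                   uint32_t to, uint32_t hash)
{
        const uint32_t *key = addr;

        for (uint32_t i = from; i < to; i++) {
                hash += key[sketch_random_nums[i]];
                hash += hash << SKETCH_SHIFTL;
                hash ^= hash >> SKETCH_SHIFTR;
        }
        return hash;
}
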
1960 +#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta))
1962 +/**
1964 + * Called when: rshash_pos or rshash_neg is about to overflow or a scan round
1965 + * has finished.
1967 + */
1968 +static inline void encode_benefit(void)
1970 + u64 scanned_delta, pos_delta, neg_delta;
1971 + unsigned long base = benefit.base;
1973 + scanned_delta = (ksm_pages_scanned - ksm_pages_scanned_last) >> base;
1974 + pos_delta = rshash_pos >> base;
1975 + neg_delta = rshash_neg >> base;
1977 + if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) ||
1978 + CAN_OVERFLOW_U64(benefit.neg, neg_delta) ||
1979 + CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) {
1980 + benefit.scanned >>= 1;
1981 + benefit.neg >>= 1;
1982 + benefit.pos >>= 1;
1983 + benefit.base++;
1984 + scanned_delta >>= 1;
1985 + pos_delta >>= 1;
1986 + neg_delta >>= 1;
1989 + benefit.pos += pos_delta;
1990 + benefit.neg += neg_delta;
1991 + benefit.scanned += scanned_delta;
1993 + BUG_ON(!benefit.scanned);
1995 + rshash_pos = rshash_neg = 0;
1997 + /* -1 to make rshash_adjust() work */
1998 + ksm_pages_scanned_last = ksm_pages_scanned - 1;
2001 +static inline void reset_benefit(void)
2003 + benefit.pos = 0;
2004 + benefit.neg = 0;
2005 + benefit.base = 0;
2006 + benefit.scanned = 0;
2009 +static inline void inc_rshash_pos(unsigned long delta)
2011 + if (CAN_OVERFLOW_U64(rshash_pos, delta))
2012 + encode_benefit();
2014 + rshash_pos += delta;
2017 +static inline void inc_rshash_neg(unsigned long delta)
2019 + if (CAN_OVERFLOW_U64(rshash_neg, delta))
2020 + encode_benefit();
2022 + rshash_neg += delta;
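
rshash_pos roughly accounts the hashing work saved by a weakened hash and rshash_neg the work wasted on memcmp and re-hashing; encode_benefit() folds both, together with the pages-scanned delta, into a fixed-point accumulator whose fields share one exponent (base) and are halved whenever any of them would overflow a u64. A self-contained sketch of that rescaling pattern, with illustrative names only:

#include <stdint.h>

struct benefit_acc {
        uint64_t pos, neg, scanned;     /* all three scaled down by 2^base */
        unsigned int base;
};

#define ACC_CAN_OVERFLOW(x, d)  (UINT64_MAX - (x) < (d))

/* fold one round's raw deltas into the accumulator, halving everything
 * (and bumping the shared exponent) when any field would overflow */
static void benefit_add(struct benefit_acc *b,
                        uint64_t pos, uint64_t neg, uint64_t scanned)
{
        pos     >>= b->base;
        neg     >>= b->base;
        scanned >>= b->base;

        if (ACC_CAN_OVERFLOW(b->pos, pos) ||
            ACC_CAN_OVERFLOW(b->neg, neg) ||
            ACC_CAN_OVERFLOW(b->scanned, scanned)) {
                b->pos >>= 1;
                b->neg >>= 1;
                b->scanned >>= 1;
                pos >>= 1;
                neg >>= 1;
                scanned >>= 1;
                b->base++;
        }
        b->pos += pos;
        b->neg += neg;
        b->scanned += scanned;
}
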
2026 +static inline u32 page_hash(struct page *page, unsigned long hash_strength,
2027 + int cost_accounting)
2029 + u32 val;
2030 + unsigned long delta;
2032 + void *addr = kmap_atomic(page, KM_USER0);
2034 + val = random_sample_hash(addr, hash_strength);
2035 + kunmap_atomic(addr, KM_USER0);
2037 + if (cost_accounting) {
2038 + if (HASH_STRENGTH_FULL > hash_strength)
2039 + delta = HASH_STRENGTH_FULL - hash_strength;
2040 + else
2041 + delta = 0;
2043 + inc_rshash_pos(delta);
2046 + return val;
2049 +static int memcmp_pages(struct page *page1, struct page *page2,
2050 + int cost_accounting)
2052 + char *addr1, *addr2;
2053 + int ret;
2055 + addr1 = kmap_atomic(page1, KM_USER0);
2056 + addr2 = kmap_atomic(page2, KM_USER1);
2057 + ret = memcmp(addr1, addr2, PAGE_SIZE);
2058 + kunmap_atomic(addr2, KM_USER1);
2059 + kunmap_atomic(addr1, KM_USER0);
2061 + if (cost_accounting)
2062 + inc_rshash_neg(memcmp_cost);
2064 + return ret;
2067 +static inline int pages_identical(struct page *page1, struct page *page2)
2069 + return !memcmp_pages(page1, page2, 0);
2072 +static inline int is_zero_page(struct page *page)
2074 + char *addr;
2075 + int ret;
2077 + addr = kmap_atomic(page, KM_USER0);
2078 + ret = check_zero_page(addr, PAGE_SIZE);
2079 + kunmap_atomic(addr, KM_USER0);
2081 + return ret;
2084 +static int write_protect_page(struct vm_area_struct *vma, struct page *page,
2085 + pte_t *orig_pte, pte_t *old_pte)
2087 + struct mm_struct *mm = vma->vm_mm;
2088 + unsigned long addr;
2089 + pte_t *ptep;
2090 + spinlock_t *ptl;
2091 + int swapped;
2092 + int err = -EFAULT;
2094 + addr = page_address_in_vma(page, vma);
2095 + if (addr == -EFAULT)
2096 + goto out;
2098 + BUG_ON(PageTransCompound(page));
2099 + ptep = page_check_address(page, mm, addr, &ptl, 0);
2100 + if (!ptep)
2101 + goto out;
2103 + if (old_pte)
2104 + *old_pte = *ptep;
2106 + if (pte_write(*ptep) || pte_dirty(*ptep)) {
2107 + pte_t entry;
2109 + swapped = PageSwapCache(page);
2110 + flush_cache_page(vma, addr, page_to_pfn(page));
2112 * Ok this is tricky, when get_user_pages_fast() run it doesnt
2113 * take any lock, therefore the check that we are going to make
2114 @@ -765,6 +1184,11 @@
2115 return err;
2118 +#define MERGE_ERR_PGERR 1 /* the page is invalid cannot continue */
2119 +#define MERGE_ERR_COLLI 2 /* there is a collision */
2120 +#define MERGE_ERR_CHANGED 3 /* the page has changed since last hash */
2124 * replace_page - replace page in vma by new ksm page
2125 * @vma: vma that holds the pte pointing to page
2126 @@ -772,7 +1196,7 @@
2127 * @kpage: the ksm page we replace page by
2128 * @orig_pte: the original value of the pte
2130 - * Returns 0 on success, -EFAULT on failure.
2131 + * Returns 0 on success, MERGE_ERR_PGERR on failure.
2133 static int replace_page(struct vm_area_struct *vma, struct page *page,
2134 struct page *kpage, pte_t orig_pte)
2135 @@ -784,7 +1208,7 @@
2136 pte_t *ptep;
2137 spinlock_t *ptl;
2138 unsigned long addr;
2139 - int err = -EFAULT;
2140 + int err = MERGE_ERR_PGERR;
2142 addr = page_address_in_vma(page, vma);
2143 if (addr == -EFAULT)
2144 @@ -827,6 +1251,85 @@
2145 return err;
2149 +/**
2150 + * Fully hash a page with HASH_STRENGTH_MAX and return a non-zero hash value.
2151 + * A zero hash value at HASH_STRENGTH_MAX is used to indicate that the
2152 + * hash_max member has not been calculated yet.
2154 + * @page The page that needs to be hashed
2155 + * @hash_old The hash value calculated with the current hash strength
2157 + * @return the new hash value calculated at HASH_STRENGTH_MAX
2158 + */
2159 +static inline u32 page_hash_max(struct page *page, u32 hash_old)
2161 + u32 hash_max = 0;
2162 + void *addr;
2164 + addr = kmap_atomic(page, KM_USER0);
2165 + hash_max = delta_hash(addr, hash_strength,
2166 + HASH_STRENGTH_MAX, hash_old);
2168 + kunmap_atomic(addr, KM_USER0);
2170 + if (!hash_max)
2171 + hash_max = 1;
2173 + inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
2174 + return hash_max;
2178 + * We compare the hash again, to ensure that it is really a hash collision
2179 + * instead of being caused by a page write.
2180 + */
2181 +static inline int check_collision(struct rmap_item *rmap_item,
2182 + u32 hash)
2184 + int err;
2185 + struct page *page = rmap_item->page;
2187 + /* if this rmap_item has already been hash_maxed, then the collision
2188 + * must appear in the second-level rbtree search. In this case we check
2189 + * whether its hash_max value has changed. Otherwise, the collision
2190 + * happened in the first-level rbtree search, so we check against its
2191 + * current hash value.
2192 + */
2193 + if (rmap_item->hash_max) {
2194 + inc_rshash_neg(memcmp_cost);
2195 + inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
2197 + if (rmap_item->hash_max == page_hash_max(page, hash))
2198 + err = MERGE_ERR_COLLI;
2199 + else
2200 + err = MERGE_ERR_CHANGED;
2201 + } else {
2202 + inc_rshash_neg(memcmp_cost + hash_strength);
2204 + if (page_hash(page, hash_strength, 0) == hash)
2205 + err = MERGE_ERR_COLLI;
2206 + else
2207 + err = MERGE_ERR_CHANGED;
2210 + return err;
2213 +static struct page *page_trans_compound_anon(struct page *page)
2215 + if (PageTransCompound(page)) {
2216 + struct page *head = compound_trans_head(page);
2217 + /*
2218 + * head may actually be split and freed from under
2219 + * us, but that's ok here.
2220 + */
2221 + if (PageAnon(head))
2222 + return head;
2224 + return NULL;
2227 static int page_trans_compound_anon_split(struct page *page)
2229 int ret = 0;
2230 @@ -854,30 +1357,36 @@
2231 return ret;
2235 - * try_to_merge_one_page - take two pages and merge them into one
2236 - * @vma: the vma that holds the pte pointing to page
2237 - * @page: the PageAnon page that we want to replace with kpage
2238 - * @kpage: the PageKsm page that we want to map instead of page,
2239 - * or NULL the first time when we want to use page as kpage.
2240 +/**
2241 + * Try to merge a rmap_item.page with a kpage in a stable node. kpage must
2242 + * already be a ksm page.
2244 - * This function returns 0 if the pages were merged, -EFAULT otherwise.
2245 + * @return 0 if the pages were merged, -EFAULT otherwise.
2247 -static int try_to_merge_one_page(struct vm_area_struct *vma,
2248 - struct page *page, struct page *kpage)
2249 +static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
2250 + struct page *kpage, u32 hash)
2252 + struct vm_area_struct *vma = rmap_item->slot->vma;
2253 + struct mm_struct *mm = vma->vm_mm;
2254 pte_t orig_pte = __pte(0);
2255 - int err = -EFAULT;
2256 + int err = MERGE_ERR_PGERR;
2257 + struct page *page;
2259 - if (page == kpage) /* ksm page forked */
2260 - return 0;
2261 + if (ksm_test_exit(mm))
2262 + goto out;
2264 + page = rmap_item->page;
2266 - if (!(vma->vm_flags & VM_MERGEABLE))
2267 + if (page == kpage) { /* ksm page forked */
2268 + err = 0;
2269 goto out;
2272 if (PageTransCompound(page) && page_trans_compound_anon_split(page))
2273 goto out;
2274 BUG_ON(PageTransCompound(page));
2275 - if (!PageAnon(page))
2277 + if (!PageAnon(page) || !PageKsm(kpage))
2278 goto out;
2281 @@ -895,18 +1404,27 @@
2282 * ptes are necessarily already write-protected. But in either
2283 * case, we need to lock and check page_count is not raised.
2285 - if (write_protect_page(vma, page, &orig_pte) == 0) {
2286 + if (write_protect_page(vma, page, &orig_pte, NULL) == 0) {
2287 if (!kpage) {
2288 + long map_sharing = atomic_read(&page->_mapcount);
2290 * While we hold page lock, upgrade page from
2291 * PageAnon+anon_vma to PageKsm+NULL stable_node:
2292 * stable_tree_insert() will update stable_node.
2294 set_page_stable_node(page, NULL);
2295 + if (map_sharing)
2296 + add_zone_page_state(page_zone(page),
2297 + NR_KSM_PAGES_SHARING,
2298 + map_sharing);
2299 mark_page_accessed(page);
2300 err = 0;
2301 - } else if (pages_identical(page, kpage))
2302 - err = replace_page(vma, page, kpage, orig_pte);
2303 + } else {
2304 + if (pages_identical(page, kpage))
2305 + err = replace_page(vma, page, kpage, orig_pte);
2306 + else
2307 + err = check_collision(rmap_item, hash);
2311 if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
2312 @@ -924,378 +1442,2683 @@
2313 return err;
2317 - * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
2318 - * but no new kernel page is allocated: kpage must already be a ksm page.
2321 +/**
2322 + * If two pages fail to merge in try_to_merge_two_pages, then we have a chance
2323 + * to restore a page mapping that has been changed in try_to_merge_two_pages.
2325 - * This function returns 0 if the pages were merged, -EFAULT otherwise.
2326 + * @return 0 on success.
2328 -static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
2329 - struct page *page, struct page *kpage)
2330 +static int restore_ksm_page_pte(struct vm_area_struct *vma, unsigned long addr,
2331 + pte_t orig_pte, pte_t wprt_pte)
2333 - struct mm_struct *mm = rmap_item->mm;
2334 - struct vm_area_struct *vma;
2335 + struct mm_struct *mm = vma->vm_mm;
2336 + pgd_t *pgd;
2337 + pud_t *pud;
2338 + pmd_t *pmd;
2339 + pte_t *ptep;
2340 + spinlock_t *ptl;
2342 int err = -EFAULT;
2344 - down_read(&mm->mmap_sem);
2345 - if (ksm_test_exit(mm))
2346 - goto out;
2347 - vma = find_vma(mm, rmap_item->address);
2348 - if (!vma || vma->vm_start > rmap_item->address)
2349 + pgd = pgd_offset(mm, addr);
2350 + if (!pgd_present(*pgd))
2351 + goto out;
2353 + pud = pud_offset(pgd, addr);
2354 + if (!pud_present(*pud))
2355 + goto out;
2357 + pmd = pmd_offset(pud, addr);
2358 + if (!pmd_present(*pmd))
2359 + goto out;
2361 + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
2362 + if (!pte_same(*ptep, wprt_pte)) {
2363 + /* already copied, let it be */
2364 + pte_unmap_unlock(ptep, ptl);
2365 + goto out;
2368 + /*
2369 + * Good boy, still here. While we still hold the ksm page, it cannot
2370 + * return to the free page pool, so there is no way a pte was changed
2371 + * to another page and then back to this page. And remember that a ksm
2372 + * page is not reused in do_wp_page(). So it's safe to restore the
2373 + * original pte.
2374 + */
2375 + flush_cache_page(vma, addr, pte_pfn(*ptep));
2376 + ptep_clear_flush(vma, addr, ptep);
2377 + set_pte_at_notify(mm, addr, ptep, orig_pte);
2379 + pte_unmap_unlock(ptep, ptl);
2380 + err = 0;
2381 +out:
2382 + return err;
2385 +/**
2386 + * try_to_merge_two_pages() - take two identical pages and prepare
2387 + * them to be merged into one page (rmap_item->page)
2389 + * @return 0 if we successfully merged two identical pages into
2390 + * one ksm page. MERGE_ERR_COLLI if it was only a hash collision
2391 + * during the rbtree search. MERGE_ERR_CHANGED if rmap_item has been
2392 + * changed since it was hashed. MERGE_ERR_PGERR otherwise.
2394 + */
2395 +static int try_to_merge_two_pages(struct rmap_item *rmap_item,
2396 + struct rmap_item *tree_rmap_item,
2397 + u32 hash)
2399 + pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0);
2400 + pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0);
2401 + struct vm_area_struct *vma1 = rmap_item->slot->vma;
2402 + struct vm_area_struct *vma2 = tree_rmap_item->slot->vma;
2403 + struct page *page = rmap_item->page;
2404 + struct page *tree_page = tree_rmap_item->page;
2405 + int err = MERGE_ERR_PGERR;
2407 + long map_sharing;
2408 + struct address_space *saved_mapping;
2411 + if (rmap_item->page == tree_rmap_item->page)
2412 + goto out;
2414 + if (PageTransCompound(page) && page_trans_compound_anon_split(page))
2415 + goto out;
2416 + BUG_ON(PageTransCompound(page));
2418 + if (PageTransCompound(tree_page) && page_trans_compound_anon_split(tree_page))
2419 + goto out;
2420 + BUG_ON(PageTransCompound(tree_page));
2422 + if (!PageAnon(page) || !PageAnon(tree_page))
2423 + goto out;
2425 + if (!trylock_page(page))
2426 + goto out;
2429 + if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) {
2430 + unlock_page(page);
2431 + goto out;
2434 + /*
2435 + * While we hold page lock, upgrade page from
2436 + * PageAnon+anon_vma to PageKsm+NULL stable_node:
2437 + * stable_tree_insert() will update stable_node.
2438 + */
2439 + saved_mapping = page->mapping;
2440 + map_sharing = atomic_read(&page->_mapcount);
2441 + set_page_stable_node(page, NULL);
2442 + if (map_sharing)
2443 + add_zone_page_state(page_zone(page),
2444 + NR_KSM_PAGES_SHARING,
2445 + map_sharing);
2446 + mark_page_accessed(page);
2447 + unlock_page(page);
2449 + if (!trylock_page(tree_page))
2450 + goto restore_out;
2452 + if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) {
2453 + unlock_page(tree_page);
2454 + goto restore_out;
2457 + if (pages_identical(page, tree_page)) {
2458 + err = replace_page(vma2, tree_page, page, wprt_pte2);
2459 + if (err)
2460 + goto restore_out;
2462 + if ((vma2->vm_flags & VM_LOCKED)) {
2463 + munlock_vma_page(tree_page);
2464 + if (!PageMlocked(page)) {
2465 + unlock_page(tree_page);
2466 + lock_page(page);
2467 + mlock_vma_page(page);
2468 + tree_page = page; /* for final unlock */
2472 + unlock_page(tree_page);
2474 + goto out; /* success */
2476 + } else {
2477 + if (page_hash(page, hash_strength, 0) ==
2478 + page_hash(tree_page, hash_strength, 0)) {
2479 + inc_rshash_neg(memcmp_cost + hash_strength * 2);
2480 + err = MERGE_ERR_COLLI;
2481 + } else
2482 + err = MERGE_ERR_CHANGED;
2484 + unlock_page(tree_page);
2487 +restore_out:
2488 + lock_page(page);
2489 + if (!restore_ksm_page_pte(vma1, get_rmap_addr(rmap_item),
2490 + orig_pte1, wprt_pte1))
2491 + page->mapping = saved_mapping;
2493 + unlock_page(page);
2494 +out:
2495 + return err;
2498 +static inline int hash_cmp(u32 new_val, u32 node_val)
2500 + if (new_val > node_val)
2501 + return 1;
2502 + else if (new_val < node_val)
2503 + return -1;
2504 + else
2505 + return 0;
2508 +static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash)
2510 + u32 hash_max = item->hash_max;
2512 + if (!hash_max) {
2513 + hash_max = page_hash_max(item->page, hash);
2515 + item->hash_max = hash_max;
2518 + return hash_max;
2523 +/**
2524 + * stable_tree_search() - search the stable tree for a page
2526 + * @item: the rmap_item we are comparing with
2527 + * @hash: the hash value of this item->page already calculated
2529 + * @return the page we have found, NULL otherwise. The page returned has
2530 + * been gotten.
2531 + */
2532 +static struct page *stable_tree_search(struct rmap_item *item, u32 hash)
2534 + struct rb_node *node = root_stable_treep->rb_node;
2535 + struct tree_node *tree_node;
2536 + unsigned long hash_max;
2537 + struct page *page = item->page;
2538 + struct stable_node *stable_node;
2540 + stable_node = page_stable_node(page);
2541 + if (stable_node) {
2542 + /* ksm page forked, that is
2543 + * if (PageKsm(page) && !in_stable_tree(rmap_item))
2544 + * it's actually gotten once outside.
2545 + */
2546 + get_page(page);
2547 + return page;
2550 + while (node) {
2551 + int cmp;
2553 + tree_node = rb_entry(node, struct tree_node, node);
2555 + cmp = hash_cmp(hash, tree_node->hash);
2557 + if (cmp < 0)
2558 + node = node->rb_left;
2559 + else if (cmp > 0)
2560 + node = node->rb_right;
2561 + else
2562 + break;
2565 + if (!node)
2566 + return NULL;
2568 + if (tree_node->count == 1) {
2569 + stable_node = rb_entry(tree_node->sub_root.rb_node,
2570 + struct stable_node, node);
2571 + BUG_ON(!stable_node);
2573 + goto get_page_out;
2576 + /*
2577 + * ok, we have to search the second
2578 + * level subtree, hash the page to a
2579 + * full strength.
2580 + */
2581 + node = tree_node->sub_root.rb_node;
2582 + BUG_ON(!node);
2583 + hash_max = rmap_item_hash_max(item, hash);
2585 + while (node) {
2586 + int cmp;
2588 + stable_node = rb_entry(node, struct stable_node, node);
2590 + cmp = hash_cmp(hash_max, stable_node->hash_max);
2592 + if (cmp < 0)
2593 + node = node->rb_left;
2594 + else if (cmp > 0)
2595 + node = node->rb_right;
2596 + else
2597 + goto get_page_out;
2600 + return NULL;
2602 +get_page_out:
2603 + page = get_ksm_page(stable_node, 1, 1);
2604 + return page;
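
Both the stable and unstable trees in this patch are two-level structures: the first-level rbtree is keyed by the cheap partial-strength hash, and only when several pages tie on that key is the expensive full-strength hash (hash_max) computed and used to order a per-node sub-tree. The sketch below shows that comparison logic on a plain binary search tree; compute_hash_max() here is only a stand-in that derives a value from the partial hash, whereas the patch re-hashes the page at HASH_STRENGTH_MAX, and the names are illustrative, not the patch's.

#include <stdint.h>
#include <stddef.h>

struct two_level_key {
        uint32_t hash;          /* partial-strength hash, always available  */
        uint32_t hash_max;      /* full-strength hash, 0 = not computed yet */
};

struct tl_node {
        struct two_level_key key;
        struct tl_node *left, *right;
};

/* stand-in for page_hash_max(); the real code re-hashes the page itself */
static uint32_t compute_hash_max(struct two_level_key *k)
{
        if (!k->hash_max) {
                k->hash_max = k->hash * 2654435761u + 1;
                if (!k->hash_max)
                        k->hash_max = 1;        /* keep 0 as "not computed" */
        }
        return k->hash_max;
}

static int two_level_cmp(struct two_level_key *a, struct two_level_key *b)
{
        if (a->hash != b->hash)
                return a->hash < b->hash ? -1 : 1;
        /* first-level tie: fall back to the full-strength hash */
        if (compute_hash_max(a) != compute_hash_max(b))
                return compute_hash_max(a) < compute_hash_max(b) ? -1 : 1;
        return 0;
}

static struct tl_node *two_level_search(struct tl_node *root,
                                        struct two_level_key *k)
{
        while (root) {
                int cmp = two_level_cmp(k, &root->key);

                if (!cmp)
                        return root;
                root = cmp < 0 ? root->left : root->right;
        }
        return NULL;
}
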
2608 +/**
2609 + * try_merge_with_stable() - when two rmap_items need to be inserted
2610 + * into the stable tree and the page was found to be identical to a stable
2611 + * ksm page, this is the last chance we have to merge them into one.
2613 + * @item1: the rmap_item holding the page which we wanted to insert
2614 + * into the stable tree.
2615 + * @item2: the other rmap_item we found during the unstable tree search
2616 + * @kpage: the page currently mapped by the two rmap_items
2617 + * @tree_page: the page we found identical in stable tree node
2618 + * @success1: return if item1 is successfully merged
2619 + * @success2: return if item2 is successfully merged
2620 + */
2621 +static void try_merge_with_stable(struct rmap_item *item1,
2622 + struct rmap_item *item2,
2623 + struct page **kpage,
2624 + struct page *tree_page,
2625 + int *success1, int *success2)
2627 + spinlock_t *ptl1, *ptl2;
2628 + pte_t *ptep1, *ptep2;
2629 + unsigned long addr1, addr2;
2630 + struct vm_area_struct *vma1 = item1->slot->vma;
2631 + struct vm_area_struct *vma2 = item2->slot->vma;
2633 + *success1 = 0;
2634 + *success2 = 0;
2636 + if (unlikely(*kpage == tree_page)) {
2637 + /* I don't think this can really happen */
2638 + goto success_both;
2641 + if (!PageAnon(*kpage) || !PageKsm(*kpage))
2642 + goto failed;
2644 + if (!trylock_page(tree_page))
2645 + goto failed;
2647 + /* If the oldpage is still ksm and still pointed
2648 + * to in the right place, and still write protected,
2649 + * we are confident it has not changed, no need to
2650 + * memcmp anymore.
2651 + * Beware: we cannot take nested pte locks,
2652 + * deadlock risk.
2653 + */
2654 + addr1 = get_rmap_addr(item1);
2656 + ptep1 = page_check_address(*kpage, vma1->vm_mm, addr1, &ptl1, 0);
2657 + if (!ptep1)
2658 + goto failed;
2660 + if (pte_write(*ptep1)) {
2661 + /* has changed, abort! */
2662 + pte_unmap_unlock(ptep1, ptl1);
2663 + goto failed;
2666 + get_page(tree_page);
2667 + page_add_anon_rmap(tree_page, vma1, addr1);
2669 + flush_cache_page(vma1, addr1, pte_pfn(*ptep1));
2670 + ptep_clear_flush(vma1, addr1, ptep1);
2671 + set_pte_at_notify(vma1->vm_mm, addr1, ptep1,
2672 + mk_pte(tree_page, vma1->vm_page_prot));
2674 + page_remove_rmap(*kpage);
2675 + put_page(*kpage);
2677 + pte_unmap_unlock(ptep1, ptl1);
2680 + /* ok, then vma2, remind that pte1 already set */
2681 + addr2 = get_rmap_addr(item2);
2683 + ptep2 = page_check_address(*kpage, vma2->vm_mm, addr2, &ptl2, 0);
2684 + if (!ptep2)
2685 + goto success1;
2687 + if (pte_write(*ptep2)) {
2688 + /* has changed, abort! */
2689 + pte_unmap_unlock(ptep2, ptl2);
2690 + goto success1;
2693 + get_page(tree_page);
2694 + page_add_anon_rmap(tree_page, vma2, addr2);
2696 + flush_cache_page(vma2, addr2, pte_pfn(*ptep2));
2697 + ptep_clear_flush(vma2, addr2, ptep2);
2698 + set_pte_at_notify(vma2->vm_mm, addr2, ptep2,
2699 + mk_pte(tree_page, vma2->vm_page_prot));
2701 + page_remove_rmap(*kpage);
2702 + put_page(*kpage);
2704 + pte_unmap_unlock(ptep2, ptl2);
2707 +success_both:
2708 + *success2 = 1;
2709 +success1:
2710 + *success1 = 1;
2713 + if ((*success1 && vma1->vm_flags & VM_LOCKED) ||
2714 + (*success2 && vma2->vm_flags & VM_LOCKED)) {
2715 + munlock_vma_page(*kpage);
2716 + if (!PageMlocked(tree_page))
2717 + mlock_vma_page(tree_page);
2720 + /*
2721 + * We do not need oldpage any more in the caller, so we can drop the lock
2722 + * now.
2723 + */
2724 + unlock_page(*kpage);
2725 + *kpage = tree_page; /* Get unlocked outside. */
2726 +failed:
2727 + return;
2730 +static inline void stable_node_hash_max(struct stable_node *node,
2731 + struct page *page, u32 hash)
2733 + u32 hash_max = node->hash_max;
2735 + if (!hash_max) {
2736 + hash_max = page_hash_max(page, hash);
2737 + node->hash_max = hash_max;
2741 +static inline
2742 +struct stable_node *new_stable_node(struct tree_node *tree_node,
2743 + struct page *kpage, u32 hash_max)
2745 + struct stable_node *new_stable_node;
2747 + new_stable_node = alloc_stable_node();
2748 + if (!new_stable_node)
2749 + return NULL;
2751 + new_stable_node->kpfn = page_to_pfn(kpage);
2752 + new_stable_node->hash_max = hash_max;
2753 + new_stable_node->tree_node = tree_node;
2754 + set_page_stable_node(kpage, new_stable_node);
2756 + return new_stable_node;
2759 +static inline
2760 +struct stable_node *first_level_insert(struct tree_node *tree_node,
2761 + struct rmap_item *rmap_item,
2762 + struct rmap_item *tree_rmap_item,
2763 + struct page **kpage, u32 hash,
2764 + int *success1, int *success2)
2766 + int cmp;
2767 + struct page *tree_page;
2768 + u32 hash_max = 0;
2769 + struct stable_node *stable_node, *new_snode;
2770 + struct rb_node *parent = NULL, **new;
2772 + /* this tree node contains no sub-tree yet */
2773 + stable_node = rb_entry(tree_node->sub_root.rb_node,
2774 + struct stable_node, node);
2776 + tree_page = get_ksm_page(stable_node, 1, 0);
2777 + if (tree_page) {
2778 + cmp = memcmp_pages(*kpage, tree_page, 1);
2779 + if (!cmp) {
2780 + try_merge_with_stable(rmap_item, tree_rmap_item, kpage,
2781 + tree_page, success1, success2);
2782 + put_page(tree_page);
2783 + if (!*success1 && !*success2)
2784 + goto failed;
2786 + return stable_node;
2788 + } else {
2789 + /*
2790 + * collision at the first level, try to create a subtree.
2791 + * A new node needs to be created.
2792 + */
2793 + put_page(tree_page);
2795 + stable_node_hash_max(stable_node, tree_page,
2796 + tree_node->hash);
2797 + hash_max = rmap_item_hash_max(rmap_item, hash);
2798 + cmp = hash_cmp(hash_max, stable_node->hash_max);
2800 + parent = &stable_node->node;
2801 + if (cmp < 0) {
2802 + new = &parent->rb_left;
2803 + } else if (cmp > 0) {
2804 + new = &parent->rb_right;
2805 + } else {
2806 + goto failed;
2810 + } else {
2811 + /* the only stable_node was deleted, so we reuse its tree_node.
2812 + */
2813 + parent = NULL;
2814 + new = &tree_node->sub_root.rb_node;
2817 + new_snode = new_stable_node(tree_node, *kpage, hash_max);
2818 + if (!new_snode)
2819 + goto failed;
2821 + rb_link_node(&new_snode->node, parent, new);
2822 + rb_insert_color(&new_snode->node, &tree_node->sub_root);
2823 + tree_node->count++;
2824 + *success1 = *success2 = 1;
2826 + return new_snode;
2828 +failed:
2829 + return NULL;
2832 +static inline
2833 +struct stable_node *stable_subtree_insert(struct tree_node *tree_node,
2834 + struct rmap_item *rmap_item,
2835 + struct rmap_item *tree_rmap_item,
2836 + struct page **kpage, u32 hash,
2837 + int *success1, int *success2)
2839 + struct page *tree_page;
2840 + u32 hash_max;
2841 + struct stable_node *stable_node, *new_snode;
2842 + struct rb_node *parent, **new;
2844 +research:
2845 + parent = NULL;
2846 + new = &tree_node->sub_root.rb_node;
2847 + BUG_ON(!*new);
2848 + hash_max = rmap_item_hash_max(rmap_item, hash);
2849 + while (*new) {
2850 + int cmp;
2852 + stable_node = rb_entry(*new, struct stable_node, node);
2854 + cmp = hash_cmp(hash_max, stable_node->hash_max);
2856 + if (cmp < 0) {
2857 + parent = *new;
2858 + new = &parent->rb_left;
2859 + } else if (cmp > 0) {
2860 + parent = *new;
2861 + new = &parent->rb_right;
2862 + } else {
2863 + tree_page = get_ksm_page(stable_node, 1, 0);
2864 + if (tree_page) {
2865 + cmp = memcmp_pages(*kpage, tree_page, 1);
2866 + if (!cmp) {
2867 + try_merge_with_stable(rmap_item,
2868 + tree_rmap_item, kpage,
2869 + tree_page, success1, success2);
2871 + put_page(tree_page);
2872 + if (!*success1 && !*success2)
2873 + goto failed;
2874 + /*
2875 + * successfully merged with a stable
2876 + * node
2877 + */
2878 + return stable_node;
2879 + } else {
2880 + put_page(tree_page);
2881 + goto failed;
2883 + } else {
2884 + /*
2885 + * the stable node may have been deleted,
2886 + * and the subtree may have been
2887 + * restructured; we cannot
2888 + * continue, so re-search it.
2889 + */
2890 + if (tree_node->count) {
2891 + goto research;
2892 + } else {
2893 + /* reuse the tree node */
2894 + parent = NULL;
2895 + new = &tree_node->sub_root.rb_node;
2901 + new_snode = new_stable_node(tree_node, *kpage, hash_max);
2902 + if (!new_snode)
2903 + goto failed;
2905 + rb_link_node(&new_snode->node, parent, new);
2906 + rb_insert_color(&new_snode->node, &tree_node->sub_root);
2907 + tree_node->count++;
2908 + *success1 = *success2 = 1;
2910 + return new_snode;
2912 +failed:
2913 + return NULL;
2917 +/**
2918 + * stable_tree_insert() - try to insert a page merged in the unstable tree
2919 + * into the stable tree
2921 + * @kpage: the page that needs to be inserted
2922 + * @hash: the current hash of this page
2923 + * @rmap_item: the rmap_item being scanned
2924 + * @tree_rmap_item: the rmap_item found on unstable tree
2925 + * @success1: return if rmap_item is merged
2926 + * @success2: return if tree_rmap_item is merged
2928 + * @return the stable_node on stable tree if at least one
2929 + * rmap_item is inserted into stable tree, NULL
2930 + * otherwise.
2931 + */
2932 +static struct stable_node *
2933 +stable_tree_insert(struct page **kpage, u32 hash,
2934 + struct rmap_item *rmap_item,
2935 + struct rmap_item *tree_rmap_item,
2936 + int *success1, int *success2)
2938 + struct rb_node **new = &root_stable_treep->rb_node;
2939 + struct rb_node *parent = NULL;
2940 + struct stable_node *stable_node;
2941 + struct tree_node *tree_node;
2942 + u32 hash_max = 0;
2944 + *success1 = *success2 = 0;
2946 + while (*new) {
2947 + int cmp;
2949 + tree_node = rb_entry(*new, struct tree_node, node);
2951 + cmp = hash_cmp(hash, tree_node->hash);
2953 + if (cmp < 0) {
2954 + parent = *new;
2955 + new = &parent->rb_left;
2956 + } else if (cmp > 0) {
2957 + parent = *new;
2958 + new = &parent->rb_right;
2959 + } else
2960 + break;
2963 + if (*new) {
2964 + if (tree_node->count == 1) {
2965 + stable_node = first_level_insert(tree_node, rmap_item,
2966 + tree_rmap_item, kpage,
2967 + hash, success1, success2);
2968 + } else {
2969 + stable_node = stable_subtree_insert(tree_node,
2970 + rmap_item, tree_rmap_item, kpage,
2971 + hash, success1, success2);
2973 + } else {
2975 + /* no tree node found */
2976 + tree_node = alloc_tree_node(stable_tree_node_listp);
2977 + if (!tree_node) {
2978 + stable_node = NULL;
2979 + goto out;
2982 + stable_node = new_stable_node(tree_node, *kpage, hash_max);
2983 + if (!stable_node) {
2984 + free_tree_node(tree_node);
2985 + goto out;
2988 + tree_node->hash = hash;
2989 + rb_link_node(&tree_node->node, parent, new);
2990 + rb_insert_color(&tree_node->node, root_stable_treep);
2991 + parent = NULL;
2992 + new = &tree_node->sub_root.rb_node;
2994 + rb_link_node(&stable_node->node, parent, new);
2995 + rb_insert_color(&stable_node->node, &tree_node->sub_root);
2996 + tree_node->count++;
2997 + *success1 = *success2 = 1;
3000 +out:
3001 + return stable_node;
3005 +/**
3006 + * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem
3008 + * @return 0 on success, -EBUSY if unable to lock the mmap_sem,
3009 + * -EINVAL if the page mapping has been changed.
3010 + */
3011 +static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item)
3013 + int err;
3015 + err = get_mergeable_page_lock_mmap(tree_rmap_item);
3017 + if (err == -EINVAL) {
3018 + /* its page map has been changed, remove it */
3019 + remove_rmap_item_from_tree(tree_rmap_item);
3022 + /* The page is gotten and mmap_sem is locked now. */
3023 + return err;
3027 +/**
3028 + * unstable_tree_search_insert() - search the unstable tree for an rmap_item
3029 + * with the same hash value. Get its page and trylock the mmap_sem.
3030 + */
3031 +static inline
3032 +struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
3033 + u32 hash)
3036 + struct rb_node **new = &root_unstable_tree.rb_node;
3037 + struct rb_node *parent = NULL;
3038 + struct tree_node *tree_node;
3039 + u32 hash_max;
3040 + struct rmap_item *tree_rmap_item;
3042 + while (*new) {
3043 + int cmp;
3045 + tree_node = rb_entry(*new, struct tree_node, node);
3047 + cmp = hash_cmp(hash, tree_node->hash);
3049 + if (cmp < 0) {
3050 + parent = *new;
3051 + new = &parent->rb_left;
3052 + } else if (cmp > 0) {
3053 + parent = *new;
3054 + new = &parent->rb_right;
3055 + } else
3056 + break;
3059 + if (*new) {
3060 + /* got the tree_node */
3061 + if (tree_node->count == 1) {
3062 + tree_rmap_item = rb_entry(tree_node->sub_root.rb_node,
3063 + struct rmap_item, node);
3064 + BUG_ON(!tree_rmap_item);
3066 + goto get_page_out;
3069 + /* well, search the collision subtree */
3070 + new = &tree_node->sub_root.rb_node;
3071 + BUG_ON(!*new);
3072 + hash_max = rmap_item_hash_max(rmap_item, hash);
3074 + while (*new) {
3075 + int cmp;
3077 + tree_rmap_item = rb_entry(*new, struct rmap_item,
3078 + node);
3080 + cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
3081 + parent = *new;
3082 + if (cmp < 0)
3083 + new = &parent->rb_left;
3084 + else if (cmp > 0)
3085 + new = &parent->rb_right;
3086 + else
3087 + goto get_page_out;
3089 + } else {
3090 + /* alloc a new tree_node */
3091 + tree_node = alloc_tree_node(&unstable_tree_node_list);
3092 + if (!tree_node)
3093 + return NULL;
3095 + tree_node->hash = hash;
3096 + rb_link_node(&tree_node->node, parent, new);
3097 + rb_insert_color(&tree_node->node, &root_unstable_tree);
3098 + parent = NULL;
3099 + new = &tree_node->sub_root.rb_node;
3102 + /* not found even in the sub-tree */
3103 + rmap_item->tree_node = tree_node;
3104 + rmap_item->address |= UNSTABLE_FLAG;
3105 + rmap_item->append_round = ksm_scan_round;
3106 + rb_link_node(&rmap_item->node, parent, new);
3107 + rb_insert_color(&rmap_item->node, &tree_node->sub_root);
3109 + ksm_pages_unshared++;
3110 + return NULL;
3112 +get_page_out:
3113 + if (tree_rmap_item->page == rmap_item->page)
3114 + return NULL;
3116 + if (get_tree_rmap_item_page(tree_rmap_item))
3117 + return NULL;
3119 + return tree_rmap_item;
3122 +static void enter_vma_tree(struct vma_slot *slot)
3124 + unsigned long i;
3125 + int ret;
3127 + i = ksm_vma_tree_index_end;
3129 + ret = radix_tree_insert(&ksm_vma_tree, i, slot);
3130 + BUG_ON(ret);
3132 + slot->ksm_index = i;
3133 + ksm_vma_tree_num++;
3134 + ksm_vma_tree_index_end++;
3137 +static inline void get_sub_dup_vma(struct vma_slot **slot,
3138 + struct vma_slot **sub_slot)
3140 + struct vma_slot *tmp;
3142 + if ((*slot)->ksm_index > (*sub_slot)->ksm_index) {
3143 + tmp = *slot;
3144 + *slot = *sub_slot;
3145 + *sub_slot = tmp;
3150 + * Inc or dec the dup page count stored in a slot, and return the count
3151 + * after the operation.
3152 + */
3153 +static inline unsigned long dup_pages_mod(void **slot, int inc)
3155 + unsigned long item, ret;
3157 + item = (unsigned long)(*slot) >> INDIRECT_OFFSET;
3158 + if (inc) {
3159 + item++;
3160 + BUG_ON(!item);
3161 + } else {
3162 + BUG_ON(!item);
3163 + item--;
3165 + ret = item;
3166 + item <<= INDIRECT_OFFSET;
3167 + *slot = (void *)item;
3169 + return ret;
3172 +static void inc_dup_vma(struct vma_slot *slot, struct vma_slot *sub_slot)
3174 + void **dup_slot;
3175 + unsigned long dup_pages;
3176 + int ret;
3178 + if (slot->ksm_index == -1)
3179 + enter_vma_tree(slot);
3181 + if (sub_slot->ksm_index == -1)
3182 + enter_vma_tree(sub_slot);
3184 + get_sub_dup_vma(&slot, &sub_slot);
3186 + dup_slot = radix_tree_lookup_slot(&slot->dup_tree, sub_slot->ksm_index);
3187 + if (dup_slot)
3188 + goto found;
3190 + /*
3191 + * In order to store dup_pages in radix tree, we must make
3192 + * radix_tree_is_indirect_ptr() happy.
3193 + */
3194 + dup_pages = 1 << INDIRECT_OFFSET;
3196 + /* no such entry yet, insert one */
3197 + ret = radix_tree_insert(&slot->dup_tree, sub_slot->ksm_index,
3198 + (void *)dup_pages);
3199 + BUG_ON(ret);
3201 + return;
3203 +found:
3204 + dup_pages_mod(dup_slot, 1);
3207 +static void dec_dup_vma(struct vma_slot *slot, struct vma_slot *sub_slot)
3209 + void **dup_slot;
3210 + unsigned long dup_pages;
3212 + BUG_ON(slot->ksm_index == -1 || sub_slot->ksm_index == -1);
3214 + get_sub_dup_vma(&slot, &sub_slot);
3216 + dup_slot = radix_tree_lookup_slot(&slot->dup_tree, sub_slot->ksm_index);
3217 + BUG_ON(!dup_slot);
3219 + dup_pages = dup_pages_mod(dup_slot, 0);
3221 + /* dup_pages == 0, we need to kick it out */
3222 + if (!dup_pages)
3223 + radix_tree_delete(&slot->dup_tree, sub_slot->ksm_index);
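
inc_dup_vma()/dec_dup_vma() store a small per-VMA-pair duplicate count directly in a radix-tree slot instead of allocating a separate object, shifting the count left by INDIRECT_OFFSET so the stored value cannot be mistaken for an indirect pointer. The sketch below shows only the packing arithmetic; the offset value of 1 is an assumption for illustration (the patch defines its own constant elsewhere), no radix tree is involved, and the sketch_ names are not the patch's.

#define SKETCH_INDIRECT_OFFSET 1        /* assumed tag width, see note above */

static inline void *sketch_count_to_slot(unsigned long count)
{
        return (void *)(count << SKETCH_INDIRECT_OFFSET);
}

static inline unsigned long sketch_slot_to_count(void *slot)
{
        return (unsigned long)slot >> SKETCH_INDIRECT_OFFSET;
}

/* increment or decrement the packed counter and return the new count */
static unsigned long sketch_dup_pages_mod(void **slot, int inc)
{
        unsigned long count = sketch_slot_to_count(*slot);

        count = inc ? count + 1 : count - 1;
        *slot = sketch_count_to_slot(count);
        return count;
}

When the count drops to zero, the caller removes the slot from the tree, as dec_dup_vma() does above.
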
3226 +static void hold_anon_vma(struct rmap_item *rmap_item,
3227 + struct anon_vma *anon_vma)
3229 + rmap_item->anon_vma = anon_vma;
3230 + get_anon_vma(anon_vma);
3234 +/**
3235 + * stable_tree_append() - append a rmap_item to a stable node. Deduplication
3236 + * ratio statistics are updated in this function.
3238 + */
3239 +static void stable_tree_append(struct rmap_item *rmap_item,
3240 + struct stable_node *stable_node)
3242 + struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_iter = NULL;
3243 + struct hlist_node *hlist, *cont_p = NULL;
3244 + unsigned long key = (unsigned long)rmap_item->slot;
3246 + BUG_ON(!stable_node);
3247 + rmap_item->address |= STABLE_FLAG;
3248 + rmap_item->append_round = ksm_scan_round;
3250 + if (hlist_empty(&stable_node->hlist)) {
3251 + ksm_pages_shared++;
3252 + goto node_vma_new;
3253 + } else {
3254 + ksm_pages_sharing++;
3257 + hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
3258 + if (node_vma->last_update == ksm_scan_round)
3259 + inc_dup_vma(rmap_item->slot, node_vma->slot);
3261 + if (node_vma->key >= key)
3262 + break;
3265 + cont_p = hlist;
3267 + if (node_vma && node_vma->key == key) {
3268 + if (node_vma->last_update == ksm_scan_round) {
3269 + /**
3270 + * we consider this page an inner duplicate, cancel
3271 + * the other updates
3272 + */
3273 + hlist_for_each_entry(node_vma_iter, hlist,
3274 + &stable_node->hlist, hlist) {
3275 + if (node_vma_iter->key == key)
3276 + break;
3278 + /* only need to increase the same vma */
3279 + if (node_vma_iter->last_update ==
3280 + ksm_scan_round) {
3281 + dec_dup_vma(rmap_item->slot,
3282 + node_vma_iter->slot);
3285 + } else {
3286 + /**
3287 + * Although it's the same vma, it contains no duplicate for this
3288 + * round. Continue scanning the other vmas.
3289 + */
3290 + hlist_for_each_entry_continue(node_vma_iter,
3291 + hlist, hlist) {
3292 + if (node_vma_iter->last_update ==
3293 + ksm_scan_round) {
3294 + inc_dup_vma(rmap_item->slot,
3295 + node_vma_iter->slot);
3301 + goto node_vma_ok;
3304 +node_vma_new:
3305 + /* no same vma already in node, alloc a new node_vma */
3306 + new_node_vma = alloc_node_vma();
3307 + BUG_ON(!new_node_vma);
3308 + new_node_vma->head = stable_node;
3309 + new_node_vma->slot = rmap_item->slot;
3311 + if (!node_vma) {
3312 + hlist_add_head(&new_node_vma->hlist, &stable_node->hlist);
3313 + } else if (node_vma->key != key) {
3314 + if (node_vma->key < key)
3315 + hlist_add_after(&node_vma->hlist, &new_node_vma->hlist);
3316 + else {
3317 + hlist_for_each_entry_continue(node_vma_iter, cont_p,
3318 + hlist) {
3319 + if (node_vma_iter->last_update ==
3320 + ksm_scan_round) {
3321 + inc_dup_vma(rmap_item->slot,
3322 + node_vma_iter->slot);
3325 + hlist_add_before(&new_node_vma->hlist,
3326 + &node_vma->hlist);
3330 + node_vma = new_node_vma;
3332 +node_vma_ok: /* ok, ready to add to the list */
3333 + rmap_item->head = node_vma;
3334 + hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist);
3335 + node_vma->last_update = ksm_scan_round;
3336 + hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma);
3337 + rmap_item->slot->pages_merged++;
3341 + * We use break_ksm to break COW on a ksm page: it's a stripped down
3343 + * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
3344 + * put_page(page);
3346 + * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
3347 + * in case the application has unmapped and remapped mm,addr meanwhile.
3348 + * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
3349 + * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
3350 + */
3351 +static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
3353 + struct page *page;
3354 + int ret = 0;
3356 + do {
3357 + cond_resched();
3358 + page = follow_page(vma, addr, FOLL_GET);
3359 + if (IS_ERR_OR_NULL(page))
3360 + break;
3361 + if (PageKsm(page)) {
3362 + ret = handle_mm_fault(vma->vm_mm, vma, addr,
3363 + FAULT_FLAG_WRITE);
3364 + } else
3365 + ret = VM_FAULT_WRITE;
3366 + put_page(page);
3367 + } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
3368 + /*
3369 + * We must loop because handle_mm_fault() may back out if there's
3370 + * any difficulty e.g. if pte accessed bit gets updated concurrently.
3372 + * VM_FAULT_WRITE is what we have been hoping for: it indicates that
3373 + * COW has been broken, even if the vma does not permit VM_WRITE;
3374 + * but note that a concurrent fault might break PageKsm for us.
3376 + * VM_FAULT_SIGBUS could occur if we race with truncation of the
3377 + * backing file, which also invalidates anonymous pages: that's
3378 + * okay, that truncation will have unmapped the PageKsm for us.
3380 + * VM_FAULT_OOM: at the time of writing (late July 2009), setting
3381 + * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
3382 + * current task has TIF_MEMDIE set, and will be OOM killed on return
3383 + * to user; and ksmd, having no mm, would never be chosen for that.
3385 + * But if the mm is in a limited mem_cgroup, then the fault may fail
3386 + * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
3387 + * even ksmd can fail in this way - though it's usually breaking ksm
3388 + * just to undo a merge it made a moment before, so unlikely to oom.
3390 + * That's a pity: we might therefore have more kernel pages allocated
3391 + * than we're counting as nodes in the stable tree; but ksm_do_scan
3392 + * will retry to break_cow on each pass, so should recover the page
3393 + * in due course. The important thing is to not let VM_MERGEABLE
3394 + * be cleared while any such pages might remain in the area.
3395 + */
3396 + return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
3399 +static void break_cow(struct rmap_item *rmap_item)
3401 + struct vm_area_struct *vma = rmap_item->slot->vma;
3402 + struct mm_struct *mm = vma->vm_mm;
3403 + unsigned long addr = get_rmap_addr(rmap_item);
3405 + if (ksm_test_exit(mm))
3406 + goto out;
3408 + break_ksm(vma, addr);
3409 +out:
3410 + return;
3414 + * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
3415 + * than check every pte of a given vma, the locking doesn't quite work for
3416 + * that - an rmap_item is assigned to the stable tree after inserting ksm
3417 + * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
3418 + * rmap_items from parent to child at fork time (so as not to waste time
3419 + * if exit comes before the next scan reaches it).
3421 + * Similarly, although we'd like to remove rmap_items (so updating counts
3422 + * and freeing memory) when unmerging an area, it's easier to leave that
3423 + * to the next pass of ksmd - consider, for example, how ksmd might be
3424 + * in cmp_and_merge_page on one of the rmap_items we would be removing.
3425 + */
3426 +inline int unmerge_ksm_pages(struct vm_area_struct *vma,
3427 + unsigned long start, unsigned long end)
3429 + unsigned long addr;
3430 + int err = 0;
3432 + for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
3433 + if (ksm_test_exit(vma->vm_mm))
3434 + break;
3435 + if (signal_pending(current))
3436 + err = -ERESTARTSYS;
3437 + else
3438 + err = break_ksm(vma, addr);
3440 + return err;
3443 +static inline void inc_ksm_pages_scanned(void)
3445 + u64 delta;
3448 + if (ksm_pages_scanned == U64_MAX) {
3449 + encode_benefit();
3451 + delta = ksm_pages_scanned >> pages_scanned_base;
3453 + if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) {
3454 + pages_scanned_stored >>= 1;
3455 + delta >>= 1;
3456 + pages_scanned_base++;
3459 + pages_scanned_stored += delta;
3461 + ksm_pages_scanned = ksm_pages_scanned_last = 0;
3464 + ksm_pages_scanned++;
3467 +static int find_zero_page_hash(int strength, u32 hash)
3469 + return (zero_hash_table[strength] == hash);
3472 +static int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page)
3474 + struct page *zero_page = ZERO_PAGE(0);
3475 + pte_t orig_pte = __pte(0);
3477 + int err = -EFAULT;
3479 + if (!trylock_page(page))
3480 + goto out;
3482 + if (write_protect_page(vma, page, &orig_pte, 0) == 0) {
3483 + if (is_zero_page(page) == 0)
3484 + err = replace_page(vma, page, zero_page, orig_pte);
3487 + unlock_page(page);
3488 +out:
3489 + return err;
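
find_zero_page_hash() and cmp_and_merge_zero_page() first compare the page's hash against a precomputed hash of the all-zero page and only then verify the bytes, since an equal hash alone could still be a collision. A minimal user-space version of that check, assuming zero_hash was precomputed with the same hash function and strength as the page's hash:

#include <stdint.h>
#include <string.h>

#define SKETCH_PAGE_BYTES 4096

/* returns 1 if the page is all zeroes, 0 otherwise; the hash test is only
 * a cheap filter, the memcmp is what actually decides */
static int sketch_page_is_zero(const void *addr, uint32_t hash,
                               uint32_t zero_hash)
{
        static const unsigned char zeroes[SKETCH_PAGE_BYTES];

        if (hash != zero_hash)
                return 0;
        return memcmp(addr, zeroes, SKETCH_PAGE_BYTES) == 0;
}
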
3493 + * cmp_and_merge_page() - first see if page can be merged into the stable
3494 + * tree; if not, compare hash to previous and if it's the same, see if page
3495 + * can be inserted into the unstable tree, or merged with a page already there
3496 + * and both transferred to the stable tree.
3498 + * @page: the page that we are searching identical page to.
3499 + * @rmap_item: the reverse mapping into the virtual address of this page
3500 + */
3501 +static void cmp_and_merge_page(struct rmap_item *rmap_item)
3503 + struct rmap_item *tree_rmap_item;
3504 + struct page *page;
3505 + struct page *kpage = NULL;
3506 + u32 hash, hash_max;
3507 + int err;
3508 + unsigned int success1, success2;
3509 + struct stable_node *snode;
3510 + int cmp;
3511 + struct rb_node *parent = NULL, **new;
3513 + remove_rmap_item_from_tree(rmap_item);
3515 + page = rmap_item->page;
3517 + hash = page_hash(page, hash_strength, 1);
3519 + /* if the page content is all zero, re-map it to the zero page */
3520 + if (find_zero_page_hash(hash_strength, hash)) {
3521 + if (!cmp_and_merge_zero_page(rmap_item->slot->vma, page)) {
3522 + ksm_remap_zero_pages++;
3523 + return;
3526 + //ksm_pages_scanned++;
3527 + inc_ksm_pages_scanned();
3529 + /* We first start with searching the page inside the stable tree */
3530 + kpage = stable_tree_search(rmap_item, hash);
3531 + if (kpage) {
3532 + err = try_to_merge_with_ksm_page(rmap_item, kpage,
3533 + hash);
3534 + if (!err) {
3535 + /*
3536 + * The page was successfully merged, add
3537 + * its rmap_item to the stable tree.
3538 + * page lock is needed because it's
3539 + * racing with try_to_unmap_ksm(), etc.
3540 + */
3541 + lock_page(kpage);
3542 + stable_tree_append(rmap_item, page_stable_node(kpage));
3543 + unlock_page(kpage);
3544 + put_page(kpage);
3545 + return; /* success */
3547 + put_page(kpage);
3549 + /*
3550 + * if it's a collision and it has been searched in the sub-rbtree
3551 + * (hash_max != 0), we want to abort, because if it is
3552 + * successfully merged in the unstable tree, the collision tends to
3553 + * happen again.
3554 + */
3555 + if (err == MERGE_ERR_COLLI && rmap_item->hash_max)
3556 + return;
3559 + tree_rmap_item =
3560 + unstable_tree_search_insert(rmap_item, hash);
3561 + if (tree_rmap_item) {
3562 + err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash);
3563 + /*
3564 + * As soon as we merge this page, we want to remove the
3565 + * rmap_item of the page we have merged with from the unstable
3566 + * tree, and insert it instead as new node in the stable tree.
3567 + */
3568 + if (!err) {
3569 + kpage = page;
3570 + remove_rmap_item_from_tree(tree_rmap_item);
3571 + lock_page(kpage);
3572 + snode = stable_tree_insert(&kpage, hash,
3573 + rmap_item, tree_rmap_item,
3574 + &success1, &success2);
3576 + if (success1)
3577 + stable_tree_append(rmap_item, snode);
3578 + else
3579 + break_cow(rmap_item);
3581 + if (success2)
3582 + stable_tree_append(tree_rmap_item, snode);
3583 + else
3584 + break_cow(tree_rmap_item);
3586 + /*
3587 + * The original kpage may already have been unlocked inside
3588 + * stable_tree_insert().
3589 + */
3590 + unlock_page(kpage);
3592 + } else if (err == MERGE_ERR_COLLI) {
3593 + if (tree_rmap_item->tree_node->count == 1) {
3594 + rmap_item_hash_max(tree_rmap_item,
3595 + tree_rmap_item->tree_node->hash);
3596 + } else
3597 + BUG_ON(!(tree_rmap_item->hash_max));
3599 + hash_max = rmap_item_hash_max(rmap_item, hash);
3600 + cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
3601 + parent = &tree_rmap_item->node;
3602 + if (cmp < 0)
3603 + new = &parent->rb_left;
3604 + else if (cmp > 0)
3605 + new = &parent->rb_right;
3606 + else
3607 + goto put_up_out;
3609 + rmap_item->tree_node = tree_rmap_item->tree_node;
3610 + rmap_item->address |= UNSTABLE_FLAG;
3611 + rmap_item->append_round = ksm_scan_round;
3612 + rb_link_node(&rmap_item->node, parent, new);
3613 + rb_insert_color(&rmap_item->node,
3614 + &tree_rmap_item->tree_node->sub_root);
3615 + rmap_item->tree_node->count++;
3617 +put_up_out:
3618 + put_page(tree_rmap_item->page);
3619 + up_read(&tree_rmap_item->slot->vma->vm_mm->mmap_sem);
3626 +static inline unsigned long get_pool_index(struct vma_slot *slot,
3627 + unsigned long index)
3629 + unsigned long pool_index;
3631 + pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT;
3632 + if (pool_index >= slot->pool_size)
3633 + BUG();
3634 + return pool_index;
3637 +static inline unsigned long index_page_offset(unsigned long index)
3639 + return offset_in_page(sizeof(struct rmap_list_entry *) * index);
3642 +static inline
3643 +struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot,
3644 + unsigned long index, int need_alloc)
3646 + unsigned long pool_index;
3647 + void *addr;
3650 + pool_index = get_pool_index(slot, index);
3651 + if (!slot->rmap_list_pool[pool_index]) {
3652 + if (!need_alloc)
3653 + return NULL;
3655 + slot->rmap_list_pool[pool_index] =
3656 + alloc_page(GFP_KERNEL | __GFP_ZERO);
3657 + BUG_ON(!slot->rmap_list_pool[pool_index]);
3660 + addr = kmap(slot->rmap_list_pool[pool_index]);
3661 + addr += index_page_offset(index);
3663 + return addr;
3666 +static inline void put_rmap_list_entry(struct vma_slot *slot,
3667 + unsigned long index)
3669 + unsigned long pool_index;
3671 + pool_index = get_pool_index(slot, index);
3672 + BUG_ON(!slot->rmap_list_pool[pool_index]);
3673 + kunmap(slot->rmap_list_pool[pool_index]);
3676 +static inline int entry_is_new(struct rmap_list_entry *entry)
3678 + return !entry->item;
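3680 +/* A zeroed, never-initialized entry carries neither an rmap_item nor an address. */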
3681 +static inline unsigned long get_index_orig_addr(struct vma_slot *slot,
3682 + unsigned long index)
3684 + return slot->vma->vm_start + (index << PAGE_SHIFT);
3687 +static inline unsigned long get_entry_address(struct rmap_list_entry *entry)
3689 + unsigned long addr;
3691 + if (is_addr(entry->addr))
3692 + addr = get_clean_addr(entry->addr);
3693 + else if (entry->item)
3694 + addr = get_rmap_addr(entry->item);
3695 + else
3696 + BUG();
3698 + return addr;
3701 +static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry)
3703 + if (is_addr(entry->addr))
3704 + return NULL;
3706 + return entry->item;
3709 +static inline void inc_rmap_list_pool_count(struct vma_slot *slot,
3710 + unsigned long index)
3712 + unsigned long pool_index;
3714 + pool_index = get_pool_index(slot, index);
3715 + BUG_ON(!slot->rmap_list_pool[pool_index]);
3716 + slot->pool_counts[pool_index]++;
3719 +static inline void dec_rmap_list_pool_count(struct vma_slot *slot,
3720 + unsigned long index)
3722 + unsigned long pool_index;
3724 + pool_index = get_pool_index(slot, index);
3725 + BUG_ON(!slot->rmap_list_pool[pool_index]);
3726 + BUG_ON(!slot->pool_counts[pool_index]);
3727 + slot->pool_counts[pool_index]--;
3730 +static inline int entry_has_rmap(struct rmap_list_entry *entry)
3732 + return !is_addr(entry->addr) && entry->item;
3735 +static inline void swap_entries(struct rmap_list_entry *entry1,
3736 + unsigned long index1,
3737 + struct rmap_list_entry *entry2,
3738 + unsigned long index2)
3740 + struct rmap_list_entry tmp;
3742 + /* swapping two new entries is meaningless */
3743 + BUG_ON(entry_is_new(entry1) && entry_is_new(entry2));
3745 + tmp = *entry1;
3746 + *entry1 = *entry2;
3747 + *entry2 = tmp;
3749 + if (entry_has_rmap(entry1))
3750 + entry1->item->entry_index = index1;
3752 + if (entry_has_rmap(entry2))
3753 + entry2->item->entry_index = index2;
3755 + if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) {
3756 + inc_rmap_list_pool_count(entry1->item->slot, index1);
3757 + dec_rmap_list_pool_count(entry1->item->slot, index2);
3758 + } else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) {
3759 + inc_rmap_list_pool_count(entry2->item->slot, index2);
3760 + dec_rmap_list_pool_count(entry2->item->slot, index1);
3764 +static inline void free_entry_item(struct rmap_list_entry *entry)
3766 + unsigned long index;
3767 + struct rmap_item *item;
3769 + if (!is_addr(entry->addr)) {
3770 + BUG_ON(!entry->item);
3771 + item = entry->item;
3772 + entry->addr = get_rmap_addr(item);
3773 + set_is_addr(entry->addr);
3774 + index = item->entry_index;
3775 + remove_rmap_item_from_tree(item);
3776 + dec_rmap_list_pool_count(item->slot, index);
3777 + free_rmap_item(item);
3781 +static inline int pool_entry_boundary(unsigned long index)
3783 + unsigned long linear_addr;
3785 + linear_addr = sizeof(struct rmap_list_entry *) * index;
3786 + return index && !offset_in_page(linear_addr);
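3787 +/* True when this index is the first entry of a new pool page (index 0 excluded). */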
3789 +static inline void try_free_last_pool(struct vma_slot *slot,
3790 + unsigned long index)
3792 + unsigned long pool_index;
3794 + pool_index = get_pool_index(slot, index);
3795 + if (slot->rmap_list_pool[pool_index] &&
3796 + !slot->pool_counts[pool_index]) {
3797 + __free_page(slot->rmap_list_pool[pool_index]);
3798 + slot->rmap_list_pool[pool_index] = NULL;
3799 + slot->need_sort = 1;
3804 +static inline unsigned long vma_item_index(struct vm_area_struct *vma,
3805 + struct rmap_item *item)
3807 + return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT;
3810 +static int within_same_pool(struct vma_slot *slot,
3811 + unsigned long i, unsigned long j)
3813 + unsigned long pool_i, pool_j;
3815 + pool_i = get_pool_index(slot, i);
3816 + pool_j = get_pool_index(slot, j);
3818 + return (pool_i == pool_j);
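3819 +/* Re-sort the rmap_list entries back to their natural, address-ordered
3820 + * positions, then free any pool page left holding no rmap_item. */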
3821 +static void sort_rmap_entry_list(struct vma_slot *slot)
3823 + unsigned long i, j;
3824 + struct rmap_list_entry *entry, *swap_entry;
3826 + entry = get_rmap_list_entry(slot, 0, 0);
3827 + for (i = 0; i < slot->pages; ) {
3829 + if (!entry)
3830 + goto skip_whole_pool;
3832 + if (entry_is_new(entry))
3833 + goto next_entry;
3835 + if (is_addr(entry->addr)) {
3836 + entry->addr = 0;
3837 + goto next_entry;
3840 + j = vma_item_index(slot->vma, entry->item);
3841 + if (j == i)
3842 + goto next_entry;
3844 + if (within_same_pool(slot, i, j))
3845 + swap_entry = entry + j - i;
3846 + else
3847 + swap_entry = get_rmap_list_entry(slot, j, 1);
3849 + swap_entries(entry, i, swap_entry, j);
3850 + if (!within_same_pool(slot, i, j))
3851 + put_rmap_list_entry(slot, j);
3852 + continue;
3854 +skip_whole_pool:
3855 + i += PAGE_SIZE / sizeof(*entry);
3856 + if (i < slot->pages)
3857 + entry = get_rmap_list_entry(slot, i, 0);
3858 + continue;
3860 +next_entry:
3861 + if (i >= slot->pages - 1 ||
3862 + !within_same_pool(slot, i, i + 1)) {
3863 + put_rmap_list_entry(slot, i);
3864 + if (i + 1 < slot->pages)
3865 + entry = get_rmap_list_entry(slot, i + 1, 0);
3866 + } else
3867 + entry++;
3868 + i++;
3869 + continue;
3872 +	/* free pool pages that contain no rmap_item */
3873 +	/* This can be simplified to rely only on pool_counts once it is known to be bug-free. */
3874 + for (i = 0; i < slot->pool_size; i++) {
3875 + unsigned char has_rmap;
3876 + void *addr;
3878 + if (!slot->rmap_list_pool[i])
3879 + continue;
3881 + has_rmap = 0;
3882 + addr = kmap(slot->rmap_list_pool[i]);
3883 + BUG_ON(!addr);
3884 + for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
3885 + entry = (struct rmap_list_entry *)addr + j;
3886 + if (is_addr(entry->addr))
3887 + continue;
3888 + if (!entry->item)
3889 + continue;
3890 + has_rmap = 1;
3892 + kunmap(slot->rmap_list_pool[i]);
3893 + if (!has_rmap) {
3894 + BUG_ON(slot->pool_counts[i]);
3895 + __free_page(slot->rmap_list_pool[i]);
3896 + slot->rmap_list_pool[i] = NULL;
3900 + slot->need_sort = 0;
3904 + * vma_fully_scanned() - return true if all the pages in this slot have been scanned.
3905 + */
3906 +static inline int vma_fully_scanned(struct vma_slot *slot)
3908 + return slot->pages_scanned && !(slot->pages_scanned % slot->pages);
3911 +/**
3912 + * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to
3913 + * its random permutation. The random permutation index management
3914 + * code is embedded in this function.
3915 + */
3916 +static struct rmap_item *get_next_rmap_item(struct vma_slot *slot)
3918 + unsigned long rand_range, addr, swap_index, scan_index;
3919 + struct rmap_item *item = NULL;
3920 + struct rmap_list_entry *scan_entry, *swap_entry = NULL;
3921 + struct page *page;
3923 + scan_index = swap_index = slot->pages_scanned % slot->pages;
3925 + if (pool_entry_boundary(scan_index))
3926 + try_free_last_pool(slot, scan_index - 1);
3928 + if (vma_fully_scanned(slot)) {
3929 + slot->need_rerand = slot->need_sort;
3930 + if (slot->need_sort)
3931 + sort_rmap_entry_list(slot);
3934 + scan_entry = get_rmap_list_entry(slot, scan_index, 1);
3935 + if (entry_is_new(scan_entry)) {
3936 + scan_entry->addr = get_index_orig_addr(slot, scan_index);
3937 + set_is_addr(scan_entry->addr);
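3938 +	/* When re-randomization is needed, pick a random index in
3939 +	 * [scan_index, slot->pages) and swap the two entries. */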
3940 + if (slot->need_rerand) {
3941 + rand_range = slot->pages - scan_index;
3942 + BUG_ON(!rand_range);
3943 + swap_index = scan_index + (random32() % rand_range);
3946 + if (swap_index != scan_index) {
3947 + swap_entry = get_rmap_list_entry(slot, swap_index, 1);
3948 + if (entry_is_new(swap_entry)) {
3949 + swap_entry->addr = get_index_orig_addr(slot,
3950 + swap_index);
3951 + set_is_addr(swap_entry->addr);
3953 + swap_entries(scan_entry, scan_index, swap_entry, swap_index);
3956 + addr = get_entry_address(scan_entry);
3957 + item = get_entry_item(scan_entry);
3958 + BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start);
3960 + page = follow_page(slot->vma, addr, FOLL_GET);
3961 + if (IS_ERR_OR_NULL(page))
3962 + goto nopage;
3964 + if (!PageAnon(page) && !page_trans_compound_anon(page))
3965 + goto putpage;
3967 +	/* check whether this is the zero page's pfn */
3968 + if (page_to_pfn(page) == zero_pfn)
3969 + goto putpage;
3971 + flush_anon_page(slot->vma, page, addr);
3972 + flush_dcache_page(page);
3974 + if (!item) {
3975 + item = alloc_rmap_item();
3976 + if (item) {
3977 + /* It has already been zeroed */
3978 + item->slot = slot;
3979 + item->address = addr;
3980 + item->entry_index = scan_index;
3981 + scan_entry->item = item;
3982 + inc_rmap_list_pool_count(slot, scan_index);
3983 + } else
3984 + goto putpage;
3987 + BUG_ON(item->slot != slot);
3988 + /* the page may have changed */
3989 + item->page = page;
3990 + put_rmap_list_entry(slot, scan_index);
3991 + if (swap_entry)
3992 + put_rmap_list_entry(slot, swap_index);
3993 + return item;
3995 +putpage:
3996 + put_page(page);
3997 + page = NULL;
3998 +nopage:
3999 + /* no page, store addr back and free rmap_item if possible */
4000 + free_entry_item(scan_entry);
4001 + put_rmap_list_entry(slot, scan_index);
4002 + if (swap_entry)
4003 + put_rmap_list_entry(slot, swap_index);
4004 + return NULL;
4007 +static inline int in_stable_tree(struct rmap_item *rmap_item)
4009 + return rmap_item->address & STABLE_FLAG;
4012 +/**
4013 + * scan_vma_one_page() - scan the next page in a vma_slot. Called with
4014 + * mmap_sem locked.
4015 + */
4016 +static void scan_vma_one_page(struct vma_slot *slot)
4018 + struct mm_struct *mm;
4019 + struct rmap_item *rmap_item = NULL;
4020 + struct vm_area_struct *vma = slot->vma;
4022 + mm = vma->vm_mm;
4023 + BUG_ON(!mm);
4024 + BUG_ON(!slot);
4026 + rmap_item = get_next_rmap_item(slot);
4027 + if (!rmap_item)
4028 + goto out1;
4030 + if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item))
4031 + goto out2;
4033 + cmp_and_merge_page(rmap_item);
4034 +out2:
4035 + put_page(rmap_item->page);
4036 +out1:
4037 + slot->pages_scanned++;
4038 + slot->slot_scanned = 1;
4039 + if (vma_fully_scanned(slot)) {
4040 + slot->fully_scanned = 1;
4041 + slot->rung->fully_scanned_slots++;
4042 + BUG_ON(!slot->rung->fully_scanned_slots);
4046 +static unsigned long get_vma_random_scan_num(struct vma_slot *slot,
4047 + unsigned long scan_ratio)
4049 + return slot->pages * scan_ratio / KSM_SCAN_RATIO_MAX;
4052 +static inline void vma_rung_enter(struct vma_slot *slot,
4053 + struct scan_rung *rung)
4055 + unsigned long pages_to_scan;
4056 + struct scan_rung *old_rung = slot->rung;
4058 + /* leave the old rung it was in */
4059 + BUG_ON(list_empty(&slot->ksm_list));
4061 + if (old_rung->current_scan == &slot->ksm_list)
4062 + old_rung->current_scan = slot->ksm_list.next;
4063 + list_del_init(&slot->ksm_list);
4064 + old_rung->vma_num--;
4065 + if (slot->fully_scanned)
4066 + old_rung->fully_scanned_slots--;
4068 + if (old_rung->current_scan == &old_rung->vma_list) {
4069 + /* This rung finishes a round */
4070 + old_rung->round_finished = 1;
4071 + old_rung->current_scan = old_rung->vma_list.next;
4072 + BUG_ON(old_rung->current_scan == &old_rung->vma_list &&
4073 + !list_empty(&old_rung->vma_list));
4076 + /* enter the new rung */
4077 + while (!(pages_to_scan =
4078 + get_vma_random_scan_num(slot, rung->scan_ratio))) {
4079 + rung++;
4080 + BUG_ON(rung > &ksm_scan_ladder[ksm_scan_ladder_size - 1]);
4082 + if (list_empty(&rung->vma_list))
4083 + rung->current_scan = &slot->ksm_list;
4084 + list_add(&slot->ksm_list, &rung->vma_list);
4085 + slot->rung = rung;
4086 + slot->pages_to_scan = pages_to_scan;
4087 + slot->rung->vma_num++;
4088 + if (slot->fully_scanned)
4089 + rung->fully_scanned_slots++;
4091 + BUG_ON(rung->current_scan == &rung->vma_list &&
4092 + !list_empty(&rung->vma_list));
4095 +static inline void vma_rung_up(struct vma_slot *slot)
4097 + if (slot->rung == &ksm_scan_ladder[ksm_scan_ladder_size-1])
4098 + return;
4100 + vma_rung_enter(slot, slot->rung + 1);
4103 +static inline void vma_rung_down(struct vma_slot *slot)
4105 + if (slot->rung == &ksm_scan_ladder[0])
4106 + return;
4108 + vma_rung_enter(slot, slot->rung - 1);
4111 +/**
4112 + * cal_dedup_ratio() - Calculate the deduplication ratio for this slot.
4113 + */
4114 +static unsigned long cal_dedup_ratio(struct vma_slot *slot)
4116 + struct vma_slot *slot2;
4117 + void **dup_slot;
4118 + unsigned long dup_pages;
4119 + unsigned long dedup_num, pages1, scanned1;
4120 + unsigned long ret;
4121 + int i;
4123 + if (!slot->pages_scanned)
4124 + return 0;
4126 + pages1 = slot->pages;
4127 + scanned1 = slot->pages_scanned - slot->last_scanned;
4128 + BUG_ON(scanned1 > slot->pages_scanned);
4130 + for (i = slot->ksm_index; i < ksm_vma_tree_index_end; i++) {
4131 + unsigned long pages2, scanned2;
4133 + dup_slot = radix_tree_lookup_slot(&slot->dup_tree, i);
4134 + if (!dup_slot)
4135 + continue;
4137 + dup_pages = (unsigned long)(*dup_slot) >> INDIRECT_OFFSET;
4139 + slot2 = radix_tree_lookup(&ksm_vma_tree, i);
4140 + BUG_ON(!slot2 || !slot2->pages_scanned);
4142 + pages2 = slot2->pages;
4143 + scanned2 = slot2->pages_scanned - slot2->last_scanned;
4144 + BUG_ON(scanned2 > slot2->pages_scanned);
4146 + BUG_ON(!scanned1 || !scanned2);
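4147 +		/* extrapolate the sampled dup_pages by each slot's pages/scanned ratio */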
4148 + dedup_num = dup_pages * pages1 / scanned1 * pages2 / scanned2;
4149 + slot->dedup_num += dedup_num;
4150 + slot2->dedup_num += dedup_num;
4153 + ret = (slot->dedup_num * KSM_DEDUP_RATIO_SCALE / pages1);
4155 + /* Thrashing area filtering */
4156 + if (ksm_thrash_threshold) {
4157 + if (slot->pages_cowed * 100 / slot->pages_merged
4158 + > ksm_thrash_threshold) {
4159 + ret = 0;
4160 + } else {
4161 + ret = ret * (slot->pages_merged - slot->pages_cowed)
4162 + / slot->pages_merged;
4166 + return ret;
4170 +/**
4171 + * stable_node_reinsert() - When the hash_strength has been adjusted, the
4172 + * stable tree needs to be restructured; this is the function that re-inserts a
4173 + * stable node.
4174 + */
4175 +static inline void stable_node_reinsert(struct stable_node *new_node,
4176 + struct page *page,
4177 + struct rb_root *root_treep,
4178 + struct list_head *tree_node_listp,
4179 + u32 hash)
4181 + struct rb_node **new = &root_treep->rb_node;
4182 + struct rb_node *parent = NULL;
4183 + struct stable_node *stable_node;
4184 + struct tree_node *tree_node;
4185 + struct page *tree_page;
4186 + int cmp;
4188 + while (*new) {
4189 + int cmp;
4191 + tree_node = rb_entry(*new, struct tree_node, node);
4193 + cmp = hash_cmp(hash, tree_node->hash);
4195 + if (cmp < 0) {
4196 + parent = *new;
4197 + new = &parent->rb_left;
4198 + } else if (cmp > 0) {
4199 + parent = *new;
4200 + new = &parent->rb_right;
4201 + } else
4202 + break;
4205 + if (*new) {
4206 +		/* found a stable tree node with the same first-level hash value */
4207 + stable_node_hash_max(new_node, page, hash);
4208 + if (tree_node->count == 1) {
4209 + stable_node = rb_entry(tree_node->sub_root.rb_node,
4210 + struct stable_node, node);
4211 + tree_page = get_ksm_page(stable_node, 1, 0);
4212 + if (tree_page) {
4213 + stable_node_hash_max(stable_node,
4214 + tree_page, hash);
4215 + put_page(tree_page);
4217 + /* prepare for stable node insertion */
4219 + cmp = hash_cmp(new_node->hash_max,
4220 + stable_node->hash_max);
4221 + parent = &stable_node->node;
4222 + if (cmp < 0)
4223 + new = &parent->rb_left;
4224 + else if (cmp > 0)
4225 + new = &parent->rb_right;
4226 + else
4227 + goto failed;
4229 + goto add_node;
4230 + } else {
4231 +			/* the only stable_node was deleted, but the
4232 +			 * tree_node was not.
4233 + */
4234 + goto tree_node_reuse;
4238 + /* well, search the collision subtree */
4239 + new = &tree_node->sub_root.rb_node;
4240 + parent = NULL;
4241 + BUG_ON(!*new);
4242 + while (*new) {
4243 + int cmp;
4245 + stable_node = rb_entry(*new, struct stable_node, node);
4247 + cmp = hash_cmp(new_node->hash_max,
4248 + stable_node->hash_max);
4250 + if (cmp < 0) {
4251 + parent = *new;
4252 + new = &parent->rb_left;
4253 + } else if (cmp > 0) {
4254 + parent = *new;
4255 + new = &parent->rb_right;
4256 + } else {
4257 + /* oh, no, still a collision */
4258 + goto failed;
4262 + goto add_node;
4265 + /* no tree node found */
4266 + tree_node = alloc_tree_node(tree_node_listp);
4267 + if (!tree_node) {
4268 + printk(KERN_ERR "UKSM: memory allocation error!\n");
4269 + goto failed;
4270 + } else {
4271 + tree_node->hash = hash;
4272 + rb_link_node(&tree_node->node, parent, new);
4273 + rb_insert_color(&tree_node->node, root_treep);
4275 +tree_node_reuse:
4276 + /* prepare for stable node insertion */
4277 + parent = NULL;
4278 + new = &tree_node->sub_root.rb_node;
4281 +add_node:
4282 + rb_link_node(&new_node->node, parent, new);
4283 + rb_insert_color(&new_node->node, &tree_node->sub_root);
4284 + new_node->tree_node = tree_node;
4285 + tree_node->count++;
4286 + return;
4288 +failed:
4289 + /* This can only happen when two nodes have collided
4290 +	 * at both hash levels.
4291 + */
4292 + new_node->tree_node = NULL;
4293 + return;
4296 +static inline void free_all_tree_nodes(struct list_head *list)
4298 + struct tree_node *node, *tmp;
4300 + list_for_each_entry_safe(node, tmp, list, all_list) {
4301 + free_tree_node(node);
4305 +/**
4306 + * stable_tree_delta_hash() - Delta hash the stable tree from previous hash
4307 + * strength to the current hash_strength. It restructures the whole tree.
4308 + */
4309 +static inline void stable_tree_delta_hash(u32 prev_hash_strength)
4311 + struct stable_node *node, *tmp;
4312 + struct rb_root *root_new_treep;
4313 + struct list_head *new_tree_node_listp;
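4314 +	/* flip to the other of the two pre-allocated stable tree roots and node lists */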
4315 + stable_tree_index = (stable_tree_index + 1) % 2;
4316 + root_new_treep = &root_stable_tree[stable_tree_index];
4317 + new_tree_node_listp = &stable_tree_node_list[stable_tree_index];
4318 + *root_new_treep = RB_ROOT;
4319 + BUG_ON(!list_empty(new_tree_node_listp));
4321 + /*
4322 +	 * We must iterate safely: the node could be removed by get_ksm_page().
4323 + */
4324 + list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) {
4325 + void *addr;
4326 + struct page *node_page;
4327 + u32 hash;
4329 + /*
4330 + * We are completely re-structuring the stable nodes to a new
4331 +		 * stable tree. We don't bother to unlink from the old tree or touch
4332 +		 * its tree_nodes; the old tree_nodes will be freed all at once.
4333 + */
4334 + node_page = get_ksm_page(node, 0, 0);
4335 + if (!node_page)
4336 + continue;
4338 + if (node->tree_node) {
4339 + hash = node->tree_node->hash;
4341 + addr = kmap_atomic(node_page, KM_USER0);
4343 + hash = delta_hash(addr, prev_hash_strength,
4344 + hash_strength, hash);
4345 + kunmap_atomic(addr, KM_USER0);
4346 + } else {
4347 + /*
4348 +			 * It was not inserted into the rbtree due to a collision
4349 +			 * in the last scan round.
4350 + */
4351 + hash = page_hash(node_page, hash_strength, 0);
4354 + stable_node_reinsert(node, node_page, root_new_treep,
4355 + new_tree_node_listp, hash);
4356 + put_page(node_page);
4359 + root_stable_treep = root_new_treep;
4360 + free_all_tree_nodes(stable_tree_node_listp);
4361 + BUG_ON(!list_empty(stable_tree_node_listp));
4362 + stable_tree_node_listp = new_tree_node_listp;
4365 +static inline void inc_hash_strength(unsigned long delta)
4367 + hash_strength += 1 << delta;
4368 + if (hash_strength > HASH_STRENGTH_MAX)
4369 + hash_strength = HASH_STRENGTH_MAX;
4372 +static inline void dec_hash_strength(unsigned long delta)
4374 + unsigned long change = 1 << delta;
4376 + if (hash_strength <= change + 1)
4377 + hash_strength = 1;
4378 + else
4379 + hash_strength -= change;
4382 +static inline void inc_hash_strength_delta(void)
4384 + hash_strength_delta++;
4385 + if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX)
4386 + hash_strength_delta = HASH_STRENGTH_DELTA_MAX;
4390 +static inline unsigned long get_current_neg_ratio(void)
4392 + if (!rshash_pos || rshash_neg > rshash_pos)
4393 + return 100;
4395 + return div64_u64(100 * rshash_neg , rshash_pos);
4399 +static inline unsigned long get_current_neg_ratio(void)
4401 + u64 pos = benefit.pos;
4402 + u64 neg = benefit.neg;
4404 + if (!neg)
4405 + return 0;
4407 + if (!pos || neg > pos)
4408 + return 100;
4410 + if (neg > div64_u64(U64_MAX, 100))
4411 + pos = div64_u64(pos, 100);
4412 + else
4413 + neg *= 100;
4415 + return div64_u64(neg, pos);
4418 +static inline unsigned long get_current_benefit(void)
4420 + u64 pos = benefit.pos;
4421 + u64 neg = benefit.neg;
4422 + u64 scanned = benefit.scanned;
4424 + if (neg > pos)
4425 + return 0;
4427 + return div64_u64((pos - neg), scanned);
4430 +static inline int judge_rshash_direction(void)
4432 + u64 current_neg_ratio, stable_benefit;
4433 + u64 current_benefit, delta = 0;
4434 + int ret = STILL;
4436 +	/* In case the system stays still for a long time. */
4437 + if (ksm_scan_round % 1024 == 3) {
4438 + ret = OBSCURE;
4439 + goto out;
4442 + current_neg_ratio = get_current_neg_ratio();
4444 + if (current_neg_ratio == 0) {
4445 + rshash_neg_cont_zero++;
4446 + if (rshash_neg_cont_zero > 2)
4447 + return GO_DOWN;
4448 + else
4449 + return STILL;
4451 + rshash_neg_cont_zero = 0;
4453 + if (current_neg_ratio > 90) {
4454 + ret = GO_UP;
4455 goto out;
4458 - err = try_to_merge_one_page(vma, page, kpage);
4459 - if (err)
4460 + current_benefit = get_current_benefit();
4461 + stable_benefit = rshash_state.stable_benefit;
4463 + if (!stable_benefit) {
4464 + ret = OBSCURE;
4465 goto out;
4468 + if (current_benefit > stable_benefit)
4469 + delta = current_benefit - stable_benefit;
4470 + else if (current_benefit < stable_benefit)
4471 + delta = stable_benefit - current_benefit;
4473 + delta = div64_u64(100 * delta , stable_benefit);
4475 + if (delta > 50) {
4476 + rshash_cont_obscure++;
4477 + if (rshash_cont_obscure > 2)
4478 + return OBSCURE;
4479 + else
4480 + return STILL;
4483 - /* Must get reference to anon_vma while still holding mmap_sem */
4484 - hold_anon_vma(rmap_item, vma->anon_vma);
4485 out:
4486 - up_read(&mm->mmap_sem);
4487 - return err;
4488 + rshash_cont_obscure = 0;
4489 + return ret;
4493 - * try_to_merge_two_pages - take two identical pages and prepare them
4494 - * to be merged into one page.
4496 - * This function returns the kpage if we successfully merged two identical
4497 - * pages into one ksm page, NULL otherwise.
4499 - * Note that this function upgrades page to ksm page: if one of the pages
4500 - * is already a ksm page, try_to_merge_with_ksm_page should be used.
4501 +/**
4502 + * rshash_adjust() - The main function to control the random sampling state
4503 + * machine for hash strength adaptation.
4505 -static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
4506 - struct page *page,
4507 - struct rmap_item *tree_rmap_item,
4508 - struct page *tree_page)
4509 +static void rshash_adjust(void)
4511 - int err;
4512 + unsigned long prev_hash_strength = hash_strength;
4514 - err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
4515 - if (!err) {
4516 - err = try_to_merge_with_ksm_page(tree_rmap_item,
4517 - tree_page, page);
4518 - /*
4519 - * If that fails, we have a ksm page with only one pte
4520 - * pointing to it: so break it.
4521 - */
4522 - if (err)
4523 - break_cow(rmap_item);
4525 - return err ? NULL : page;
4527 + if (ksm_pages_scanned == ksm_pages_scanned_last)
4528 + return;
4531 - * stable_tree_search - search for page inside the stable tree
4533 - * This function checks if there is a page inside the stable tree
4534 - * with identical content to the page that we are scanning right now.
4536 - * This function returns the stable tree node of identical content if found,
4537 - * NULL otherwise.
4538 - */
4539 -static struct page *stable_tree_search(struct page *page)
4541 - struct rb_node *node = root_stable_tree.rb_node;
4542 - struct stable_node *stable_node;
4543 + encode_benefit();
4545 - stable_node = page_stable_node(page);
4546 - if (stable_node) { /* ksm page forked */
4547 - get_page(page);
4548 - return page;
4549 + switch (rshash_state.state) {
4550 + case RSHASH_STILL:
4551 + switch (judge_rshash_direction()) {
4552 + case GO_UP:
4553 + if (rshash_state.pre_direct == GO_DOWN)
4554 + hash_strength_delta = 0;
4556 + inc_hash_strength(hash_strength_delta);
4557 + inc_hash_strength_delta();
4558 + rshash_state.stable_benefit = get_current_benefit();
4559 + rshash_state.pre_direct = GO_UP;
4560 + break;
4562 + case GO_DOWN:
4563 + if (rshash_state.pre_direct == GO_UP)
4564 + hash_strength_delta = 0;
4566 + dec_hash_strength(hash_strength_delta);
4567 + inc_hash_strength_delta();
4568 + rshash_state.stable_benefit = get_current_benefit();
4569 + rshash_state.pre_direct = GO_DOWN;
4570 + break;
4572 + case OBSCURE:
4573 + rshash_state.stable_point = hash_strength;
4574 + rshash_state.turn_point_down = hash_strength;
4575 + rshash_state.turn_point_up = hash_strength;
4576 + rshash_state.turn_benefit_down = get_current_benefit();
4577 + rshash_state.turn_benefit_up = get_current_benefit();
4578 + rshash_state.lookup_window_index = 0;
4579 + rshash_state.state = RSHASH_TRYDOWN;
4580 + dec_hash_strength(hash_strength_delta);
4581 + inc_hash_strength_delta();
4582 + break;
4584 + case STILL:
4585 + break;
4586 + default:
4587 + BUG();
4589 + break;
4591 + case RSHASH_TRYDOWN:
4592 + if (rshash_state.lookup_window_index++ % 5 == 0)
4593 + rshash_state.below_count = 0;
4595 + if (get_current_benefit() < rshash_state.stable_benefit)
4596 + rshash_state.below_count++;
4597 + else if (get_current_benefit() >
4598 + rshash_state.turn_benefit_down) {
4599 + rshash_state.turn_point_down = hash_strength;
4600 + rshash_state.turn_benefit_down = get_current_benefit();
4603 + if (rshash_state.below_count >= 3 ||
4604 + judge_rshash_direction() == GO_UP ||
4605 + hash_strength == 1) {
4606 + hash_strength = rshash_state.stable_point;
4607 + hash_strength_delta = 0;
4608 + inc_hash_strength(hash_strength_delta);
4609 + inc_hash_strength_delta();
4610 + rshash_state.lookup_window_index = 0;
4611 + rshash_state.state = RSHASH_TRYUP;
4612 + hash_strength_delta = 0;
4613 + } else {
4614 + dec_hash_strength(hash_strength_delta);
4615 + inc_hash_strength_delta();
4617 + break;
4619 + case RSHASH_TRYUP:
4620 + if (rshash_state.lookup_window_index++ % 5 == 0)
4621 + rshash_state.below_count = 0;
4623 + if (get_current_benefit() < rshash_state.turn_benefit_down)
4624 + rshash_state.below_count++;
4625 + else if (get_current_benefit() > rshash_state.turn_benefit_up) {
4626 + rshash_state.turn_point_up = hash_strength;
4627 + rshash_state.turn_benefit_up = get_current_benefit();
4630 + if (rshash_state.below_count >= 3 ||
4631 + judge_rshash_direction() == GO_DOWN ||
4632 + hash_strength == HASH_STRENGTH_MAX) {
4633 + hash_strength = rshash_state.turn_benefit_up >
4634 + rshash_state.turn_benefit_down ?
4635 + rshash_state.turn_point_up :
4636 + rshash_state.turn_point_down;
4638 + rshash_state.state = RSHASH_PRE_STILL;
4639 + } else {
4640 + inc_hash_strength(hash_strength_delta);
4641 + inc_hash_strength_delta();
4644 + break;
4646 + case RSHASH_NEW:
4647 + case RSHASH_PRE_STILL:
4648 + rshash_state.stable_benefit = get_current_benefit();
4649 + rshash_state.state = RSHASH_STILL;
4650 + hash_strength_delta = 0;
4651 + break;
4652 + default:
4653 + BUG();
4656 - while (node) {
4657 - struct page *tree_page;
4658 - int ret;
4659 + /* rshash_neg = rshash_pos = 0; */
4660 + reset_benefit();
4662 - cond_resched();
4663 - stable_node = rb_entry(node, struct stable_node, node);
4664 - tree_page = get_ksm_page(stable_node);
4665 - if (!tree_page)
4666 - return NULL;
4667 + if (prev_hash_strength != hash_strength)
4668 + stable_tree_delta_hash(prev_hash_strength);
4671 - ret = memcmp_pages(page, tree_page);
4672 +static void free_vma_dup_tree(struct vma_slot *slot)
4674 + struct vma_slot *tmp_slot;
4675 + int i;
4677 - if (ret < 0) {
4678 - put_page(tree_page);
4679 - node = node->rb_left;
4680 - } else if (ret > 0) {
4681 - put_page(tree_page);
4682 - node = node->rb_right;
4683 - } else
4684 - return tree_page;
4685 + /* step 1: free entries in smaller vmas' dup tree */
4686 + for (i = 0; i < slot->ksm_index; i++) {
4687 + tmp_slot = radix_tree_lookup(&ksm_vma_tree, i);
4688 + if (tmp_slot)
4689 + radix_tree_delete(&tmp_slot->dup_tree, slot->ksm_index);
4692 - return NULL;
4693 + /* step 2: free my own dup tree */
4694 + for (i = slot->ksm_index; i < ksm_vma_tree_index_end; i++)
4695 + radix_tree_delete(&slot->dup_tree, i);
4697 + BUG_ON(slot->dup_tree.rnode);
4701 - * stable_tree_insert - insert rmap_item pointing to new ksm page
4702 - * into the stable tree.
4704 - * This function returns the stable tree node just allocated on success,
4705 - * NULL otherwise.
4706 +/**
4707 + * round_update_ladder() - The main function that updates all the
4708 + * adjustments whenever a scan round is finished.
4710 -static struct stable_node *stable_tree_insert(struct page *kpage)
4711 +static void round_update_ladder(void)
4713 - struct rb_node **new = &root_stable_tree.rb_node;
4714 - struct rb_node *parent = NULL;
4715 - struct stable_node *stable_node;
4716 + int i;
4717 + struct vma_slot *slot, *tmp_slot;
4718 + unsigned long dedup_ratio_max = 0, dedup_ratio_mean = 0;
4719 + unsigned long threshold;
4721 + for (i = 0; i < ksm_vma_tree_index_end; i++) {
4722 + slot = radix_tree_lookup(&ksm_vma_tree, i);
4724 + if (slot) {
4725 + slot->dedup_ratio = cal_dedup_ratio(slot);
4726 + if (dedup_ratio_max < slot->dedup_ratio)
4727 + dedup_ratio_max = slot->dedup_ratio;
4728 + dedup_ratio_mean += slot->dedup_ratio;
4732 - while (*new) {
4733 - struct page *tree_page;
4734 - int ret;
4735 + dedup_ratio_mean /= ksm_vma_slot_num;
4736 + threshold = dedup_ratio_mean;
4738 - cond_resched();
4739 - stable_node = rb_entry(*new, struct stable_node, node);
4740 - tree_page = get_ksm_page(stable_node);
4741 - if (!tree_page)
4742 - return NULL;
4743 + for (i = 0; i < ksm_vma_tree_index_end; i++) {
4744 + slot = radix_tree_lookup(&ksm_vma_tree, i);
4746 - ret = memcmp_pages(kpage, tree_page);
4747 - put_page(tree_page);
4748 + if (slot) {
4749 + if (slot->dedup_ratio &&
4750 + slot->dedup_ratio >= threshold) {
4751 + vma_rung_up(slot);
4752 + } else {
4753 + vma_rung_down(slot);
4756 - parent = *new;
4757 - if (ret < 0)
4758 - new = &parent->rb_left;
4759 - else if (ret > 0)
4760 - new = &parent->rb_right;
4761 - else {
4762 + free_vma_dup_tree(slot);
4763 + radix_tree_delete(&ksm_vma_tree, i);
4764 + ksm_vma_tree_num--;
4765 + slot->ksm_index = -1;
4766 + slot->slot_scanned = 0;
4767 + slot->dedup_ratio = 0;
4768 + slot->dedup_num = 0;
4772 + for (i = 0; i < ksm_scan_ladder_size; i++) {
4773 + list_for_each_entry_safe(slot, tmp_slot,
4774 + &ksm_scan_ladder[i].vma_list,
4775 + ksm_list) {
4777 - * It is not a bug that stable_tree_search() didn't
4778 - * find this node: because at that time our page was
4779 - * not yet write-protected, so may have changed since.
4780 +			 * The slots were scanned but are not in inter_tab, so their
4781 +			 * dedup ratio must be 0.
4783 - return NULL;
4784 + if (slot->slot_scanned) {
4785 + BUG_ON(slot->dedup_ratio != 0);
4786 + vma_rung_down(slot);
4789 + slot->dedup_ratio = 0;
4793 - stable_node = alloc_stable_node();
4794 - if (!stable_node)
4795 - return NULL;
4796 + BUG_ON(ksm_vma_tree_num != 0);
4797 + ksm_vma_tree_index_end = 0;
4799 - rb_link_node(&stable_node->node, parent, new);
4800 - rb_insert_color(&stable_node->node, &root_stable_tree);
4801 + for (i = 0; i < ksm_scan_ladder_size; i++) {
4802 + ksm_scan_ladder[i].round_finished = 0;
4803 + ksm_scan_ladder[i].busy_searched = 0;
4805 + list_for_each_entry(slot, &ksm_scan_ladder[i].vma_list,
4806 + ksm_list) {
4807 + slot->last_scanned = slot->pages_scanned;
4808 + slot->slot_scanned = 0;
4809 + slot->pages_cowed = 0;
4810 + slot->pages_merged = 0;
4811 + if (slot->fully_scanned) {
4812 + slot->fully_scanned = 0;
4813 + ksm_scan_ladder[i].fully_scanned_slots--;
4815 + BUG_ON(slot->ksm_index != -1);
4818 - INIT_HLIST_HEAD(&stable_node->hlist);
4819 + BUG_ON(ksm_scan_ladder[i].fully_scanned_slots);
4822 - stable_node->kpfn = page_to_pfn(kpage);
4823 - set_page_stable_node(kpage, stable_node);
4824 + rshash_adjust();
4826 - return stable_node;
4827 + //ksm_pages_scanned_last = ksm_pages_scanned;
4831 - * unstable_tree_search_insert - search for identical page,
4832 - * else insert rmap_item into the unstable tree.
4834 - * This function searches for a page in the unstable tree identical to the
4835 - * page currently being scanned; and if no identical page is found in the
4836 - * tree, we insert rmap_item as a new object into the unstable tree.
4838 - * This function returns pointer to rmap_item found to be identical
4839 - * to the currently scanned page, NULL otherwise.
4841 - * This function does both searching and inserting, because they share
4842 - * the same walking algorithm in an rbtree.
4843 - */
4844 -static
4845 -struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
4846 - struct page *page,
4847 - struct page **tree_pagep)
4848 +static inline unsigned int ksm_pages_to_scan(unsigned int batch_pages)
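4849 +	/* batch_pages is a fraction of total RAM, expressed in parts per million */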
4850 + return totalram_pages * batch_pages / 1000000;
4853 +static inline void cal_ladder_pages_to_scan(unsigned int num)
4855 - struct rb_node **new = &root_unstable_tree.rb_node;
4856 - struct rb_node *parent = NULL;
4857 + int i;
4859 - while (*new) {
4860 - struct rmap_item *tree_rmap_item;
4861 - struct page *tree_page;
4862 - int ret;
4863 + for (i = 0; i < ksm_scan_ladder_size; i++) {
4864 + ksm_scan_ladder[i].pages_to_scan = num
4865 + * ksm_scan_ladder[i].scan_ratio / KSM_SCAN_RATIO_MAX;
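4866 +	/* the two lowest rungs get an extra cut in their per-batch quota */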
4867 + ksm_scan_ladder[0].pages_to_scan /= 16;
4868 + ksm_scan_ladder[1].pages_to_scan /= 4;
4871 - cond_resched();
4872 - tree_rmap_item = rb_entry(*new, struct rmap_item, node);
4873 - tree_page = get_mergeable_page(tree_rmap_item);
4874 - if (IS_ERR_OR_NULL(tree_page))
4875 - return NULL;
4876 +static inline void ksm_del_vma_slot(struct vma_slot *slot)
4878 + int i, j;
4879 + struct rmap_list_entry *entry;
4880 + struct vma_slot *tmp;
4882 - /*
4883 - * Don't substitute a ksm page for a forked page.
4884 - */
4885 - if (page == tree_page) {
4886 - put_page(tree_page);
4887 - return NULL;
4889 +	/* mutex lock contention may be intensive; any better idea? */
4890 + BUG_ON(list_empty(&slot->ksm_list) || !slot->rung);
4892 - ret = memcmp_pages(page, tree_page);
4893 + if (slot->rung->current_scan == &slot->ksm_list)
4894 + slot->rung->current_scan = slot->rung->current_scan->next;
4896 - parent = *new;
4897 - if (ret < 0) {
4898 - put_page(tree_page);
4899 - new = &parent->rb_left;
4900 - } else if (ret > 0) {
4901 - put_page(tree_page);
4902 - new = &parent->rb_right;
4903 - } else {
4904 - *tree_pagep = tree_page;
4905 - return tree_rmap_item;
4907 + list_del_init(&slot->ksm_list);
4908 + slot->rung->vma_num--;
4909 + if (slot->fully_scanned)
4910 + slot->rung->fully_scanned_slots--;
4912 + if (slot->rung->current_scan == &slot->rung->vma_list) {
4913 + /* This rung finishes a round */
4914 + slot->rung->round_finished = 1;
4915 + slot->rung->current_scan = slot->rung->vma_list.next;
4916 + BUG_ON(slot->rung->current_scan == &slot->rung->vma_list
4917 + && !list_empty(&slot->rung->vma_list));
4920 - rmap_item->address |= UNSTABLE_FLAG;
4921 - rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
4922 - rb_link_node(&rmap_item->node, parent, new);
4923 - rb_insert_color(&rmap_item->node, &root_unstable_tree);
4924 + if (slot->ksm_index == -1)
4925 + goto skip;
4927 - ksm_pages_unshared++;
4928 - return NULL;
4929 + tmp = radix_tree_delete(&ksm_vma_tree, slot->ksm_index);
4930 + BUG_ON(!tmp || tmp != slot);
4931 + free_vma_dup_tree(slot);
4932 + ksm_vma_tree_num--;
4933 + if (slot->ksm_index == ksm_vma_tree_index_end - 1)
4934 + ksm_vma_tree_index_end--;
4936 +skip:
4937 + if (!slot->rmap_list_pool)
4938 + goto out;
4940 + for (i = 0; i < slot->pool_size; i++) {
4941 + void *addr;
4943 + if (!slot->rmap_list_pool[i])
4944 + continue;
4946 + addr = kmap(slot->rmap_list_pool[i]);
4947 + BUG_ON(!addr);
4948 + for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
4949 + entry = (struct rmap_list_entry *)addr + j;
4950 + if (is_addr(entry->addr))
4951 + continue;
4952 + if (!entry->item)
4953 + continue;
4955 + remove_rmap_item_from_tree(entry->item);
4956 + free_rmap_item(entry->item);
4957 + slot->pool_counts[i]--;
4959 + BUG_ON(slot->pool_counts[i]);
4960 + kunmap(slot->rmap_list_pool[i]);
4961 + __free_page(slot->rmap_list_pool[i]);
4963 + kfree(slot->rmap_list_pool);
4964 + kfree(slot->pool_counts);
4966 +out:
4967 + slot->rung = NULL;
4968 + free_vma_slot(slot);
4969 + BUG_ON(!ksm_vma_slot_num);
4970 + ksm_vma_slot_num--;
4974 - * stable_tree_append - add another rmap_item to the linked list of
4975 - * rmap_items hanging off a given node of the stable tree, all sharing
4976 - * the same ksm page.
4977 - */
4978 -static void stable_tree_append(struct rmap_item *rmap_item,
4979 - struct stable_node *stable_node)
4981 +static inline void cleanup_vma_slots(void)
4983 - rmap_item->head = stable_node;
4984 - rmap_item->address |= STABLE_FLAG;
4985 - hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
4986 + struct vma_slot *slot;
4988 - if (rmap_item->hlist.next)
4989 - ksm_pages_sharing++;
4990 - else
4991 - ksm_pages_shared++;
4992 + spin_lock(&vma_slot_list_lock);
4993 + while (!list_empty(&vma_slot_del)) {
4994 + slot = list_entry(vma_slot_del.next,
4995 + struct vma_slot, slot_list);
4996 + list_del(&slot->slot_list);
4997 + spin_unlock(&vma_slot_list_lock);
4998 + ksm_del_vma_slot(slot);
4999 + spin_lock(&vma_slot_list_lock);
5001 + spin_unlock(&vma_slot_list_lock);
5005 - * cmp_and_merge_page - first see if page can be merged into the stable tree;
5006 - * if not, compare checksum to previous and if it's the same, see if page can
5007 - * be inserted into the unstable tree, or merged with a page already there and
5008 - * both transferred to the stable tree.
5010 - * @page: the page that we are searching identical page to.
5011 - * @rmap_item: the reverse mapping into the virtual address of this page
5012 +static inline int rung_fully_scanned(struct scan_rung *rung)
5014 + return (rung->fully_scanned_slots == rung->vma_num &&
5015 + rung->fully_scanned_slots);
5018 +/**
5019 + * ksm_do_scan() - the main worker function.
5021 -static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
5022 +static void ksm_do_scan(void)
5024 - struct rmap_item *tree_rmap_item;
5025 - struct page *tree_page = NULL;
5026 - struct stable_node *stable_node;
5027 - struct page *kpage;
5028 - unsigned int checksum;
5029 - int err;
5030 + struct vma_slot *slot, *iter;
5031 + struct list_head *next_scan, *iter_head;
5032 + struct mm_struct *busy_mm;
5033 + unsigned char round_finished, all_rungs_emtpy;
5034 + int i, err;
5035 + unsigned long rest_pages;
5037 + might_sleep();
5039 + rest_pages = 0;
5040 +repeat_all:
5041 + for (i = ksm_scan_ladder_size - 1; i >= 0; i--) {
5042 + struct scan_rung *rung = &ksm_scan_ladder[i];
5044 - remove_rmap_item_from_tree(rmap_item);
5045 + if (!rung->pages_to_scan)
5046 + continue;
5048 - /* We first start with searching the page inside the stable tree */
5049 - kpage = stable_tree_search(page);
5050 - if (kpage) {
5051 - err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
5052 - if (!err) {
5053 - /*
5054 - * The page was successfully merged:
5055 - * add its rmap_item to the stable tree.
5056 - */
5057 - lock_page(kpage);
5058 - stable_tree_append(rmap_item, page_stable_node(kpage));
5059 - unlock_page(kpage);
5060 + if (list_empty(&rung->vma_list)) {
5061 + rung->pages_to_scan = 0;
5062 + continue;
5064 - put_page(kpage);
5065 - return;
5068 - /*
5069 - * If the hash value of the page has changed from the last time
5070 - * we calculated it, this page is changing frequently: therefore we
5071 - * don't want to insert it in the unstable tree, and we don't want
5072 - * to waste our time searching for something identical to it there.
5073 - */
5074 - checksum = calc_checksum(page);
5075 - if (rmap_item->oldchecksum != checksum) {
5076 - rmap_item->oldchecksum = checksum;
5077 - return;
5080 - tree_rmap_item =
5081 - unstable_tree_search_insert(rmap_item, page, &tree_page);
5082 - if (tree_rmap_item) {
5083 - kpage = try_to_merge_two_pages(rmap_item, page,
5084 - tree_rmap_item, tree_page);
5085 - put_page(tree_page);
5087 - * As soon as we merge this page, we want to remove the
5088 - * rmap_item of the page we have merged with from the unstable
5089 - * tree, and insert it instead as new node in the stable tree.
5090 +		 * If a higher rung is fully scanned, its remaining pages should be
5091 +		 * propagated to the lower rungs. This prevents the higher
5092 +		 * rung from waiting a long time while it still has its
5093 + * pages_to_scan quota.
5096 - if (kpage) {
5097 - remove_rmap_item_from_tree(tree_rmap_item);
5098 + if (rung_fully_scanned(rung)) {
5099 + rest_pages += rung->pages_to_scan;
5100 + rung->pages_to_scan = 0;
5101 + continue;
5104 - lock_page(kpage);
5105 - stable_node = stable_tree_insert(kpage);
5106 - if (stable_node) {
5107 - stable_tree_append(tree_rmap_item, stable_node);
5108 - stable_tree_append(rmap_item, stable_node);
5109 + rung->pages_to_scan += rest_pages;
5110 + rest_pages = 0;
5111 + while (rung->pages_to_scan && likely(!freezing(current))) {
5112 +cleanup:
5113 + cleanup_vma_slots();
5115 + if (list_empty(&rung->vma_list))
5116 + break;
5118 +rescan:
5119 + BUG_ON(rung->current_scan == &rung->vma_list &&
5120 + !list_empty(&rung->vma_list));
5122 + slot = list_entry(rung->current_scan,
5123 + struct vma_slot, ksm_list);
5126 + if (slot->fully_scanned)
5127 + goto next_scan;
5129 + err = try_down_read_slot_mmap_sem(slot);
5130 + if (err == -ENOENT)
5131 + goto cleanup;
5133 + busy_mm = slot->mm;
5135 +busy:
5136 + if (err == -EBUSY) {
5137 + /* skip other vmas on the same mm */
5138 + rung->busy_searched = 1;
5139 + iter = slot;
5140 + iter_head = slot->ksm_list.next;
5142 + while (iter_head != &rung->vma_list) {
5143 + iter = list_entry(iter_head,
5144 + struct vma_slot,
5145 + ksm_list);
5146 + if (iter->vma->vm_mm != busy_mm)
5147 + break;
5148 + iter_head = iter_head->next;
5151 + if (iter->vma->vm_mm != busy_mm) {
5152 + rung->current_scan = &iter->ksm_list;
5153 + goto rescan;
5154 + } else {
5155 + /* at the end, but still busy */
5156 + rung->current_scan = iter->ksm_list.next;
5157 + goto next_scan;
5158 + break;
5161 - unlock_page(kpage);
5163 - /*
5164 - * If we fail to insert the page into the stable tree,
5165 - * we will have 2 virtual addresses that are pointing
5166 - * to a ksm page left outside the stable tree,
5167 - * in which case we need to break_cow on both.
5168 - */
5169 - if (!stable_node) {
5170 - break_cow(tree_rmap_item);
5171 - break_cow(rmap_item);
5172 + BUG_ON(!vma_can_enter(slot->vma));
5173 + if (ksm_test_exit(slot->vma->vm_mm)) {
5174 + busy_mm = slot->vma->vm_mm;
5175 + up_read(&slot->vma->vm_mm->mmap_sem);
5176 + err = -EBUSY;
5177 + goto busy;
5180 + if (rung->busy_searched)
5181 + rung->busy_searched = 0;
5182 +			/* OK, we have taken the mmap_sem, ready to scan */
5183 + scan_vma_one_page(slot);
5184 + up_read(&slot->vma->vm_mm->mmap_sem);
5185 + rung->pages_to_scan--;
5187 + if ((slot->pages_scanned &&
5188 + slot->pages_scanned % slot->pages_to_scan == 0)
5189 + || slot->fully_scanned) {
5190 +next_scan:
5191 + next_scan = rung->current_scan->next;
5192 + if (next_scan == &rung->vma_list) {
5193 + /*
5194 + * All the slots in this rung
5195 +					 * have been traversed in this
5196 + * round.
5197 + */
5198 + rung->round_finished = 1;
5199 + rung->current_scan =
5200 + rung->vma_list.next;
5201 + if (rung_fully_scanned(rung) ||
5202 + rung->busy_searched) {
5203 + /*
5204 + * All the pages in all slots
5205 + * have been scanned. Or we
5206 + * did not make any progress
5207 +					 * because of a busy mm.
5208 + */
5209 + rest_pages +=
5210 + rung->pages_to_scan;
5211 + rung->pages_to_scan = 0;
5212 + break;
5214 + } else {
5215 + rung->current_scan = next_scan;
5219 + cond_resched();
5222 + if (freezing(current))
5223 + break;
5227 -static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
5228 - struct rmap_item **rmap_list,
5229 - unsigned long addr)
5231 - struct rmap_item *rmap_item;
5232 + if (freezing(current))
5233 + return;
5235 - while (*rmap_list) {
5236 - rmap_item = *rmap_list;
5237 - if ((rmap_item->address & PAGE_MASK) == addr)
5238 - return rmap_item;
5239 - if (rmap_item->address > addr)
5240 + round_finished = 1;
5241 + all_rungs_emtpy = 1;
5242 + for (i = 0; i < ksm_scan_ladder_size; i++) {
5243 + struct scan_rung *rung = &ksm_scan_ladder[i];
5245 + if (!list_empty(&rung->vma_list)) {
5246 + all_rungs_emtpy = 0;
5247 + if (!rung->round_finished)
5248 + round_finished = 0;
5249 break;
5250 - *rmap_list = rmap_item->rmap_list;
5251 - remove_rmap_item_from_tree(rmap_item);
5252 - free_rmap_item(rmap_item);
5256 - rmap_item = alloc_rmap_item();
5257 - if (rmap_item) {
5258 - /* It has already been zeroed */
5259 - rmap_item->mm = mm_slot->mm;
5260 - rmap_item->address = addr;
5261 - rmap_item->rmap_list = *rmap_list;
5262 - *rmap_list = rmap_item;
5264 - return rmap_item;
5266 + if (all_rungs_emtpy)
5267 + round_finished = 0;
5269 -static struct rmap_item *scan_get_next_rmap_item(struct page **page)
5271 - struct mm_struct *mm;
5272 - struct mm_slot *slot;
5273 - struct vm_area_struct *vma;
5274 - struct rmap_item *rmap_item;
5275 + cleanup_vma_slots();
5277 - if (list_empty(&ksm_mm_head.mm_list))
5278 - return NULL;
5279 + if (round_finished) {
5280 + round_update_ladder();
5282 - slot = ksm_scan.mm_slot;
5283 - if (slot == &ksm_mm_head) {
5285 * A number of pages can hang around indefinitely on per-cpu
5286 * pagevecs, raised page count preventing write_protect_page
5287 @@ -1308,266 +4131,160 @@
5289 lru_add_drain_all();
5291 + /* sync with ksm_remove_vma for rb_erase */
5292 + ksm_scan_round++;
5293 root_unstable_tree = RB_ROOT;
5295 - spin_lock(&ksm_mmlist_lock);
5296 - slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
5297 - ksm_scan.mm_slot = slot;
5298 - spin_unlock(&ksm_mmlist_lock);
5299 -next_mm:
5300 - ksm_scan.address = 0;
5301 - ksm_scan.rmap_list = &slot->rmap_list;
5304 - mm = slot->mm;
5305 - down_read(&mm->mmap_sem);
5306 - if (ksm_test_exit(mm))
5307 - vma = NULL;
5308 - else
5309 - vma = find_vma(mm, ksm_scan.address);
5311 - for (; vma; vma = vma->vm_next) {
5312 - if (!(vma->vm_flags & VM_MERGEABLE))
5313 - continue;
5314 - if (ksm_scan.address < vma->vm_start)
5315 - ksm_scan.address = vma->vm_start;
5316 - if (!vma->anon_vma)
5317 - ksm_scan.address = vma->vm_end;
5319 - while (ksm_scan.address < vma->vm_end) {
5320 - if (ksm_test_exit(mm))
5321 - break;
5322 - *page = follow_page(vma, ksm_scan.address, FOLL_GET);
5323 - if (IS_ERR_OR_NULL(*page)) {
5324 - ksm_scan.address += PAGE_SIZE;
5325 - cond_resched();
5326 - continue;
5328 - if (PageAnon(*page) ||
5329 - page_trans_compound_anon(*page)) {
5330 - flush_anon_page(vma, *page, ksm_scan.address);
5331 - flush_dcache_page(*page);
5332 - rmap_item = get_next_rmap_item(slot,
5333 - ksm_scan.rmap_list, ksm_scan.address);
5334 - if (rmap_item) {
5335 - ksm_scan.rmap_list =
5336 - &rmap_item->rmap_list;
5337 - ksm_scan.address += PAGE_SIZE;
5338 - } else
5339 - put_page(*page);
5340 - up_read(&mm->mmap_sem);
5341 - return rmap_item;
5343 - put_page(*page);
5344 - ksm_scan.address += PAGE_SIZE;
5345 - cond_resched();
5349 - if (ksm_test_exit(mm)) {
5350 - ksm_scan.address = 0;
5351 - ksm_scan.rmap_list = &slot->rmap_list;
5353 - /*
5354 - * Nuke all the rmap_items that are above this current rmap:
5355 - * because there were no VM_MERGEABLE vmas with such addresses.
5356 - */
5357 - remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
5359 - spin_lock(&ksm_mmlist_lock);
5360 - ksm_scan.mm_slot = list_entry(slot->mm_list.next,
5361 - struct mm_slot, mm_list);
5362 - if (ksm_scan.address == 0) {
5363 - /*
5364 - * We've completed a full scan of all vmas, holding mmap_sem
5365 - * throughout, and found no VM_MERGEABLE: so do the same as
5366 - * __ksm_exit does to remove this mm from all our lists now.
5367 - * This applies either when cleaning up after __ksm_exit
5368 - * (but beware: we can reach here even before __ksm_exit),
5369 - * or when all VM_MERGEABLE areas have been unmapped (and
5370 - * mmap_sem then protects against race with MADV_MERGEABLE).
5371 - */
5372 - hlist_del(&slot->link);
5373 - list_del(&slot->mm_list);
5374 - spin_unlock(&ksm_mmlist_lock);
5376 - free_mm_slot(slot);
5377 - clear_bit(MMF_VM_MERGEABLE, &mm->flags);
5378 - up_read(&mm->mmap_sem);
5379 - mmdrop(mm);
5380 - } else {
5381 - spin_unlock(&ksm_mmlist_lock);
5382 - up_read(&mm->mmap_sem);
5383 + free_all_tree_nodes(&unstable_tree_node_list);
5386 - /* Repeat until we've completed scanning the whole list */
5387 - slot = ksm_scan.mm_slot;
5388 - if (slot != &ksm_mm_head)
5389 - goto next_mm;
5391 - ksm_scan.seqnr++;
5392 - return NULL;
5395 -/**
5396 - * ksm_do_scan - the ksm scanner main worker function.
5397 - * @scan_npages - number of pages we want to scan before we return.
5398 - */
5399 -static void ksm_do_scan(unsigned int scan_npages)
5401 - struct rmap_item *rmap_item;
5402 - struct page *uninitialized_var(page);
5404 - while (scan_npages-- && likely(!freezing(current))) {
5405 - cond_resched();
5406 - rmap_item = scan_get_next_rmap_item(&page);
5407 - if (!rmap_item)
5408 - return;
5409 - if (!PageKsm(page) || !in_stable_tree(rmap_item))
5410 - cmp_and_merge_page(page, rmap_item);
5411 - put_page(page);
5412 + for (i = 0; i < ksm_scan_ladder_size; i++) {
5413 + struct scan_rung *rung = &ksm_scan_ladder[i];
5415 + /*
5416 +		 * Before we can go to sleep, we should make sure that all the
5417 +		 * pages_to_scan quota for this scan round has been consumed.
5418 + */
5419 + if (!list_empty(&rung->vma_list) && rung->pages_to_scan)
5420 + goto repeat_all;
5423 + cal_ladder_pages_to_scan(ksm_scan_batch_pages);
5426 static int ksmd_should_run(void)
5428 - return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
5429 + return ksm_run & KSM_RUN_MERGE;
5432 -static int ksm_scan_thread(void *nothing)
5434 - set_freezable();
5435 - set_user_nice(current, 5);
5437 - while (!kthread_should_stop()) {
5438 - mutex_lock(&ksm_thread_mutex);
5439 - if (ksmd_should_run())
5440 - ksm_do_scan(ksm_thread_pages_to_scan);
5441 - mutex_unlock(&ksm_thread_mutex);
5443 - try_to_freeze();
5444 +#define __round_mask(x, y) ((__typeof__(x))((y)-1))
5445 +#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
5447 - if (ksmd_should_run()) {
5448 - schedule_timeout_interruptible(
5449 - msecs_to_jiffies(ksm_thread_sleep_millisecs));
5450 - } else {
5451 - wait_event_freezable(ksm_thread_wait,
5452 - ksmd_should_run() || kthread_should_stop());
5455 - return 0;
5456 +static inline unsigned long vma_pool_size(struct vm_area_struct *vma)
5458 + return round_up(sizeof(struct rmap_list_entry) * vma_pages(vma),
5459 + PAGE_SIZE) >> PAGE_SHIFT;
5462 -int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
5463 - unsigned long end, int advice, unsigned long *vm_flags)
5464 +/**
5468 + * @param slot	the new vma_slot to be added to the lowest scan rung
5470 + * @return 1 on success, 0 on failure
5471 + */
5472 +static int ksm_vma_enter(struct vma_slot *slot)
5474 - struct mm_struct *mm = vma->vm_mm;
5475 - int err;
5477 - switch (advice) {
5478 - case MADV_MERGEABLE:
5479 - /*
5480 - * Be somewhat over-protective for now!
5481 - */
5482 - if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
5483 - VM_PFNMAP | VM_IO | VM_DONTEXPAND |
5484 - VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
5485 - VM_NONLINEAR | VM_MIXEDMAP | VM_SAO))
5486 - return 0; /* just ignore the advice */
5488 - if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
5489 - err = __ksm_enter(mm);
5490 - if (err)
5491 - return err;
5494 - *vm_flags |= VM_MERGEABLE;
5495 - break;
5496 + struct scan_rung *rung;
5497 + unsigned long pages_to_scan, pool_size;
5499 - case MADV_UNMERGEABLE:
5500 - if (!(*vm_flags & VM_MERGEABLE))
5501 - return 0; /* just ignore the advice */
5502 + BUG_ON(slot->pages != vma_pages(slot->vma));
5503 + rung = &ksm_scan_ladder[0];
5505 - if (vma->anon_vma) {
5506 - err = unmerge_ksm_pages(vma, start, end);
5507 - if (err)
5508 - return err;
5509 + pages_to_scan = get_vma_random_scan_num(slot, rung->scan_ratio);
5510 + if (pages_to_scan) {
5511 + if (list_empty(&rung->vma_list))
5512 + rung->current_scan = &slot->ksm_list;
5513 + BUG_ON(!list_empty(&slot->ksm_list));
5515 + list_add(&slot->ksm_list, &rung->vma_list);
5516 + slot->rung = rung;
5517 + slot->pages_to_scan = pages_to_scan;
5518 + slot->rung->vma_num++;
5519 + BUG_ON(PAGE_SIZE % sizeof(struct rmap_list_entry) != 0);
5521 + pool_size = vma_pool_size(slot->vma);
5523 + slot->rmap_list_pool = kzalloc(sizeof(struct page *) *
5524 + pool_size, GFP_NOWAIT);
5525 + slot->pool_counts = kzalloc(sizeof(unsigned long) * pool_size,
5526 + GFP_NOWAIT);
5527 + slot->pool_size = pool_size;
5528 + if (!slot->rmap_list_pool)
5529 + goto failed;
5531 + if (!slot->pool_counts) {
5532 + kfree(slot->rmap_list_pool);
5533 + goto failed;
5536 - *vm_flags &= ~VM_MERGEABLE;
5537 - break;
5538 + BUG_ON(rung->current_scan == &rung->vma_list &&
5539 + !list_empty(&rung->vma_list));
5541 + ksm_vma_slot_num++;
5542 + BUG_ON(!ksm_vma_slot_num);
5543 + return 1;
5546 +failed:
5547 return 0;
5550 -int __ksm_enter(struct mm_struct *mm)
5552 - struct mm_slot *mm_slot;
5553 - int needs_wakeup;
5555 - mm_slot = alloc_mm_slot();
5556 - if (!mm_slot)
5557 - return -ENOMEM;
5559 - /* Check ksm_run too? Would need tighter locking */
5560 - needs_wakeup = list_empty(&ksm_mm_head.mm_list);
5562 - spin_lock(&ksm_mmlist_lock);
5563 - insert_to_mm_slots_hash(mm, mm_slot);
5564 - /*
5565 - * Insert just behind the scanning cursor, to let the area settle
5566 - * down a little; when fork is followed by immediate exec, we don't
5567 - * want ksmd to waste time setting up and tearing down an rmap_list.
5568 - */
5569 - list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
5570 - spin_unlock(&ksm_mmlist_lock);
5571 +static void ksm_enter_all_slots(void)
5573 + struct vma_slot *slot;
5574 + int added;
5576 - set_bit(MMF_VM_MERGEABLE, &mm->flags);
5577 - atomic_inc(&mm->mm_count);
5578 + spin_lock(&vma_slot_list_lock);
5579 + while (!list_empty(&vma_slot_new)) {
5580 + slot = list_entry(vma_slot_new.next,
5581 + struct vma_slot, slot_list);
5582 + /**
5583 + * Slots are sorted by ctime_j; if one is found to be too
5584 + * young, just stop scanning the remaining ones.
5585 + */
5586 + /*
5588 - if (needs_wakeup)
5589 - wake_up_interruptible(&ksm_thread_wait);
5590 + if (time_before(jiffies, slot->ctime_j +
5591 + msecs_to_jiffies(1000))) {
5592 + spin_unlock(&vma_slot_list_lock);
5593 + return;
5595 + */
5597 - return 0;
5598 + list_del_init(&slot->slot_list);
5599 + added = 0;
5600 + if (vma_can_enter(slot->vma))
5601 + added = ksm_vma_enter(slot);
5603 + if (!added) {
5604 +			/* Put back to be deleted by its creator */
5605 + slot->ctime_j = jiffies;
5606 + list_del(&slot->slot_list);
5607 + list_add_tail(&slot->slot_list, &vma_slot_noadd);
5609 + spin_unlock(&vma_slot_list_lock);
5610 + cond_resched();
5611 + spin_lock(&vma_slot_list_lock);
5613 + spin_unlock(&vma_slot_list_lock);
5616 -void __ksm_exit(struct mm_struct *mm)
5617 +static int ksm_scan_thread(void *nothing)
5619 - struct mm_slot *mm_slot;
5620 - int easy_to_free = 0;
5621 + set_freezable();
5622 + set_user_nice(current, 5);
5624 - /*
5625 - * This process is exiting: if it's straightforward (as is the
5626 - * case when ksmd was never running), free mm_slot immediately.
5627 - * But if it's at the cursor or has rmap_items linked to it, use
5628 - * mmap_sem to synchronize with any break_cows before pagetables
5629 - * are freed, and leave the mm_slot on the list for ksmd to free.
5630 - * Beware: ksm may already have noticed it exiting and freed the slot.
5631 - */
5632 + while (!kthread_should_stop()) {
5633 + mutex_lock(&ksm_thread_mutex);
5634 + if (ksmd_should_run()) {
5635 + ksm_enter_all_slots();
5636 + ksm_do_scan();
5638 + mutex_unlock(&ksm_thread_mutex);
5640 + try_to_freeze();
5642 - spin_lock(&ksm_mmlist_lock);
5643 - mm_slot = get_mm_slot(mm);
5644 - if (mm_slot && ksm_scan.mm_slot != mm_slot) {
5645 - if (!mm_slot->rmap_list) {
5646 - hlist_del(&mm_slot->link);
5647 - list_del(&mm_slot->mm_list);
5648 - easy_to_free = 1;
5649 + if (ksmd_should_run()) {
5650 + schedule_timeout_interruptible(ksm_sleep_jiffies);
5651 + ksm_sleep_times++;
5652 } else {
5653 - list_move(&mm_slot->mm_list,
5654 - &ksm_scan.mm_slot->mm_list);
5655 + wait_event_freezable(ksm_thread_wait,
5656 + ksmd_should_run() || kthread_should_stop());
5659 - spin_unlock(&ksm_mmlist_lock);
5661 - if (easy_to_free) {
5662 - free_mm_slot(mm_slot);
5663 - clear_bit(MMF_VM_MERGEABLE, &mm->flags);
5664 - mmdrop(mm);
5665 - } else if (mm_slot) {
5666 - down_write(&mm->mmap_sem);
5667 - up_write(&mm->mmap_sem);
5669 + return 0;
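ksm_scan_thread() above follows the standard freezable-kthread pattern: do a batch of work, honor the freezer, then either sleep for a fixed interval or block until woken. Below is a stripped-down module sketch of that pattern only; it is not UKSM itself, and demo_enabled / demo_do_work() are placeholders for ksmd_should_run() and the real scan (the ksm_thread_mutex serialization is omitted).

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/jiffies.h>
#include <linux/err.h>

static struct task_struct *demo_task;
static DECLARE_WAIT_QUEUE_HEAD(demo_wait);
static int demo_enabled = 1;            /* placeholder for ksmd_should_run() */

static void demo_do_work(void)
{
    /* placeholder for ksm_enter_all_slots() + ksm_do_scan() */
}

static int demo_thread(void *unused)
{
    set_freezable();                 /* let suspend/hibernate freeze us */
    set_user_nice(current, 5);       /* run at a slightly lower priority */

    while (!kthread_should_stop()) {
        if (demo_enabled)
            demo_do_work();

        try_to_freeze();

        if (demo_enabled)
            schedule_timeout_interruptible(msecs_to_jiffies(100));
        else
            wait_event_freezable(demo_wait,
                    demo_enabled || kthread_should_stop());
    }
    return 0;
}

static int __init demo_init(void)
{
    demo_task = kthread_run(demo_thread, NULL, "demo_scand");
    return IS_ERR(demo_task) ? PTR_ERR(demo_task) : 0;
}

static void __exit demo_exit(void)
{
    kthread_stop(demo_task);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");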
5672 struct page *ksm_does_need_to_copy(struct page *page,
5673 @@ -1597,11 +4314,13 @@
5674 unsigned long *vm_flags)
5676 struct stable_node *stable_node;
5677 + struct node_vma *node_vma;
5678 struct rmap_item *rmap_item;
5679 - struct hlist_node *hlist;
5680 + struct hlist_node *hlist, *rmap_hlist;
5681 unsigned int mapcount = page_mapcount(page);
5682 int referenced = 0;
5683 int search_new_forks = 0;
5684 + unsigned long address;
5686 VM_BUG_ON(!PageKsm(page));
5687 VM_BUG_ON(!PageLocked(page));
5688 @@ -1609,38 +4328,51 @@
5689 stable_node = page_stable_node(page);
5690 if (!stable_node)
5691 return 0;
5692 -again:
5693 - hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
5694 - struct anon_vma *anon_vma = rmap_item->anon_vma;
5695 - struct anon_vma_chain *vmac;
5696 - struct vm_area_struct *vma;
5698 - anon_vma_lock(anon_vma);
5699 - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
5700 - vma = vmac->vma;
5701 - if (rmap_item->address < vma->vm_start ||
5702 - rmap_item->address >= vma->vm_end)
5703 - continue;
5704 - /*
5705 - * Initially we examine only the vma which covers this
5706 - * rmap_item; but later, if there is still work to do,
5707 - * we examine covering vmas in other mms: in case they
5708 - * were forked from the original since ksmd passed.
5709 - */
5710 - if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
5711 - continue;
5713 - if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
5714 - continue;
5716 - referenced += page_referenced_one(page, vma,
5717 - rmap_item->address, &mapcount, vm_flags);
5718 - if (!search_new_forks || !mapcount)
5719 - break;
5720 +again:
5721 + hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
5722 + hlist_for_each_entry(rmap_item, rmap_hlist,
5723 + &node_vma->rmap_hlist, hlist) {
5724 + struct anon_vma *anon_vma = rmap_item->anon_vma;
5725 + struct anon_vma_chain *vmac;
5726 + struct vm_area_struct *vma;
5728 + anon_vma_lock(anon_vma);
5729 + list_for_each_entry(vmac, &anon_vma->head,
5730 + same_anon_vma) {
5731 + vma = vmac->vma;
5732 + address = get_rmap_addr(rmap_item);
5734 + if (address < vma->vm_start ||
5735 + address >= vma->vm_end)
5736 + continue;
5737 + /*
5738 + * Initially we examine only the vma which
5739 + * covers this rmap_item; but later, if there
5740 + * is still work to do, we examine covering
5741 + * vmas in other mms: in case they were forked
5742 + * from the original since ksmd passed.
5743 + */
5744 + if ((rmap_item->slot->vma == vma) ==
5745 + search_new_forks)
5746 + continue;
5748 + if (memcg &&
5749 + !mm_match_cgroup(vma->vm_mm, memcg))
5750 + continue;
5752 + referenced +=
5753 + page_referenced_one(page, vma,
5754 + address, &mapcount, vm_flags);
5755 + if (!search_new_forks || !mapcount)
5756 + break;
5759 + anon_vma_unlock(anon_vma);
5760 + if (!mapcount)
5761 + goto out;
5763 - anon_vma_unlock(anon_vma);
5764 - if (!mapcount)
5765 - goto out;
5767 if (!search_new_forks++)
5768 goto again;
5769 @@ -1651,10 +4383,12 @@
5770 int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
5772 struct stable_node *stable_node;
5773 - struct hlist_node *hlist;
5774 + struct node_vma *node_vma;
5775 + struct hlist_node *hlist, *rmap_hlist;
5776 struct rmap_item *rmap_item;
5777 int ret = SWAP_AGAIN;
5778 int search_new_forks = 0;
5779 + unsigned long address;
5781 VM_BUG_ON(!PageKsm(page));
5782 VM_BUG_ON(!PageLocked(page));
5783 @@ -1663,34 +4397,42 @@
5784 if (!stable_node)
5785 return SWAP_FAIL;
5786 again:
5787 - hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
5788 - struct anon_vma *anon_vma = rmap_item->anon_vma;
5789 - struct anon_vma_chain *vmac;
5790 - struct vm_area_struct *vma;
5792 - anon_vma_lock(anon_vma);
5793 - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
5794 - vma = vmac->vma;
5795 - if (rmap_item->address < vma->vm_start ||
5796 - rmap_item->address >= vma->vm_end)
5797 - continue;
5798 - /*
5799 - * Initially we examine only the vma which covers this
5800 - * rmap_item; but later, if there is still work to do,
5801 - * we examine covering vmas in other mms: in case they
5802 - * were forked from the original since ksmd passed.
5803 - */
5804 - if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
5805 - continue;
5807 - ret = try_to_unmap_one(page, vma,
5808 - rmap_item->address, flags);
5809 - if (ret != SWAP_AGAIN || !page_mapped(page)) {
5810 - anon_vma_unlock(anon_vma);
5811 - goto out;
5812 + hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
5813 + hlist_for_each_entry(rmap_item, rmap_hlist,
5814 + &node_vma->rmap_hlist, hlist) {
5815 + struct anon_vma *anon_vma = rmap_item->anon_vma;
5816 + struct anon_vma_chain *vmac;
5817 + struct vm_area_struct *vma;
5819 + anon_vma_lock(anon_vma);
5820 + list_for_each_entry(vmac, &anon_vma->head,
5821 + same_anon_vma) {
5822 + vma = vmac->vma;
5823 + address = get_rmap_addr(rmap_item);
5825 + if (address < vma->vm_start ||
5826 + address >= vma->vm_end)
5827 + continue;
5828 + /*
5829 + * Initially we examine only the vma which
5830 + * covers this rmap_item; but later, if there
5831 + * is still work to do, we examine covering
5832 + * vmas in other mms: in case they were forked
5833 + * from the original since ksmd passed.
5834 + */
5835 + if ((rmap_item->slot->vma == vma) ==
5836 + search_new_forks)
5837 + continue;
5839 + ret = try_to_unmap_one(page, vma,
5840 + address, flags);
5841 + if (ret != SWAP_AGAIN || !page_mapped(page)) {
5842 + anon_vma_unlock(anon_vma);
5843 + goto out;
5846 + anon_vma_unlock(anon_vma);
5848 - anon_vma_unlock(anon_vma);
5850 if (!search_new_forks++)
5851 goto again;
5852 @@ -1703,10 +4445,12 @@
5853 struct vm_area_struct *, unsigned long, void *), void *arg)
5855 struct stable_node *stable_node;
5856 - struct hlist_node *hlist;
5857 + struct node_vma *node_vma;
5858 + struct hlist_node *hlist, *rmap_hlist;
5859 struct rmap_item *rmap_item;
5860 int ret = SWAP_AGAIN;
5861 int search_new_forks = 0;
5862 + unsigned long address;
5864 VM_BUG_ON(!PageKsm(page));
5865 VM_BUG_ON(!PageLocked(page));
5866 @@ -1715,33 +4459,35 @@
5867 if (!stable_node)
5868 return ret;
5869 again:
5870 - hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
5871 - struct anon_vma *anon_vma = rmap_item->anon_vma;
5872 - struct anon_vma_chain *vmac;
5873 - struct vm_area_struct *vma;
5875 - anon_vma_lock(anon_vma);
5876 - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
5877 - vma = vmac->vma;
5878 - if (rmap_item->address < vma->vm_start ||
5879 - rmap_item->address >= vma->vm_end)
5880 - continue;
5881 - /*
5882 - * Initially we examine only the vma which covers this
5883 - * rmap_item; but later, if there is still work to do,
5884 - * we examine covering vmas in other mms: in case they
5885 - * were forked from the original since ksmd passed.
5886 - */
5887 - if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
5888 - continue;
5890 - ret = rmap_one(page, vma, rmap_item->address, arg);
5891 - if (ret != SWAP_AGAIN) {
5892 - anon_vma_unlock(anon_vma);
5893 - goto out;
5894 + hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
5895 + hlist_for_each_entry(rmap_item, rmap_hlist,
5896 + &node_vma->rmap_hlist, hlist) {
5897 + struct anon_vma *anon_vma = rmap_item->anon_vma;
5898 + struct anon_vma_chain *vmac;
5899 + struct vm_area_struct *vma;
5901 + anon_vma_lock(anon_vma);
5902 + list_for_each_entry(vmac, &anon_vma->head,
5903 + same_anon_vma) {
5904 + vma = vmac->vma;
5905 + address = get_rmap_addr(rmap_item);
5907 + if (address < vma->vm_start ||
5908 + address >= vma->vm_end)
5909 + continue;
5911 + if ((rmap_item->slot->vma == vma) ==
5912 + search_new_forks)
5913 + continue;
5915 + ret = rmap_one(page, vma, address, arg);
5916 + if (ret != SWAP_AGAIN) {
5917 + anon_vma_unlock(anon_vma);
5918 + goto out;
5921 + anon_vma_unlock(anon_vma);
5923 - anon_vma_unlock(anon_vma);
5925 if (!search_new_forks++)
5926 goto again;
5927 @@ -1771,7 +4517,7 @@
5929 struct rb_node *node;
5931 - for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) {
5932 + for (node = rb_first(root_stable_treep); node; node = rb_next(node)) {
5933 struct stable_node *stable_node;
5935 stable_node = rb_entry(node, struct stable_node, node);
5936 @@ -1810,7 +4556,7 @@
5938 while ((stable_node = ksm_check_stable_tree(mn->start_pfn,
5939 mn->start_pfn + mn->nr_pages)) != NULL)
5940 - remove_node_from_stable_tree(stable_node);
5941 + remove_node_from_stable_tree(stable_node, 1, 1);
5942 /* fallthrough */
5944 case MEM_CANCEL_OFFLINE:
5945 @@ -1835,7 +4581,7 @@
5946 static ssize_t sleep_millisecs_show(struct kobject *kobj,
5947 struct kobj_attribute *attr, char *buf)
5949 - return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
5950 + return sprintf(buf, "%u\n", jiffies_to_msecs(ksm_sleep_jiffies));
5953 static ssize_t sleep_millisecs_store(struct kobject *kobj,
5954 @@ -1849,34 +4595,58 @@
5955 if (err || msecs > UINT_MAX)
5956 return -EINVAL;
5958 - ksm_thread_sleep_millisecs = msecs;
5959 + ksm_sleep_jiffies = msecs_to_jiffies(msecs);
5961 return count;
5963 KSM_ATTR(sleep_millisecs);
5965 -static ssize_t pages_to_scan_show(struct kobject *kobj,
5966 +static ssize_t min_scan_ratio_show(struct kobject *kobj,
5967 + struct kobj_attribute *attr, char *buf)
5969 + return sprintf(buf, "%u\n", ksm_min_scan_ratio);
5972 +static ssize_t min_scan_ratio_store(struct kobject *kobj,
5973 + struct kobj_attribute *attr,
5974 + const char *buf, size_t count)
5976 + unsigned long msr;
5977 + int err;
5979 + err = strict_strtoul(buf, 10, &msr);
5980 + if (err || msr > UINT_MAX)
5981 + return -EINVAL;
5983 + ksm_min_scan_ratio = msr;
5985 + return count;
5987 +KSM_ATTR(min_scan_ratio);
5989 +static ssize_t scan_batch_pages_show(struct kobject *kobj,
5990 struct kobj_attribute *attr, char *buf)
5992 - return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
5993 + return sprintf(buf, "%lu\n", ksm_scan_batch_pages);
5996 -static ssize_t pages_to_scan_store(struct kobject *kobj,
5997 +static ssize_t scan_batch_pages_store(struct kobject *kobj,
5998 struct kobj_attribute *attr,
5999 const char *buf, size_t count)
6001 int err;
6002 - unsigned long nr_pages;
6003 + unsigned long batch_pages;
6005 - err = strict_strtoul(buf, 10, &nr_pages);
6006 - if (err || nr_pages > UINT_MAX)
6007 + err = strict_strtoul(buf, 10, &batch_pages);
6008 + if (err || batch_pages > UINT_MAX)
6009 return -EINVAL;
6011 - ksm_thread_pages_to_scan = nr_pages;
6012 + ksm_scan_batch_pages = batch_pages;
6013 + cal_ladder_pages_to_scan(ksm_scan_batch_pages);
6015 return count;
6017 -KSM_ATTR(pages_to_scan);
6018 +KSM_ATTR(scan_batch_pages);
6020 static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
6021 char *buf)
6022 @@ -1893,28 +4663,12 @@
6023 err = strict_strtoul(buf, 10, &flags);
6024 if (err || flags > UINT_MAX)
6025 return -EINVAL;
6026 - if (flags > KSM_RUN_UNMERGE)
6027 + if (flags > KSM_RUN_MERGE)
6028 return -EINVAL;
6030 - /*
6031 - * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
6032 - * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
6033 - * breaking COW to free the pages_shared (but leaves mm_slots
6034 - * on the list for when ksmd may be set running again).
6035 - */
6037 mutex_lock(&ksm_thread_mutex);
6038 if (ksm_run != flags) {
6039 ksm_run = flags;
6040 - if (flags & KSM_RUN_UNMERGE) {
6041 - current->flags |= PF_OOM_ORIGIN;
6042 - err = unmerge_and_remove_all_rmap_items();
6043 - current->flags &= ~PF_OOM_ORIGIN;
6044 - if (err) {
6045 - ksm_run = KSM_RUN_STOP;
6046 - count = err;
6050 mutex_unlock(&ksm_thread_mutex);
6052 @@ -1925,6 +4679,30 @@
6054 KSM_ATTR(run);
6057 +static ssize_t thrash_threshold_show(struct kobject *kobj,
6058 + struct kobj_attribute *attr, char *buf)
6060 + return sprintf(buf, "%u\n", ksm_thrash_threshold);
6063 +static ssize_t thrash_threshold_store(struct kobject *kobj,
6064 + struct kobj_attribute *attr,
6065 + const char *buf, size_t count)
6067 + int err;
6068 + unsigned long flags;
6070 + err = strict_strtoul(buf, 10, &flags);
6071 + if (err || flags > 99)
6072 + return -EINVAL;
6074 + ksm_thrash_threshold = flags;
6076 + return count;
6078 +KSM_ATTR(thrash_threshold);
6080 static ssize_t pages_shared_show(struct kobject *kobj,
6081 struct kobj_attribute *attr, char *buf)
6083 @@ -1946,60 +4724,300 @@
6085 KSM_ATTR_RO(pages_unshared);
6087 -static ssize_t pages_volatile_show(struct kobject *kobj,
6088 +static ssize_t pages_remap_zeropage_show(struct kobject *kobj,
6089 struct kobj_attribute *attr, char *buf)
6091 - long ksm_pages_volatile;
6093 - ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
6094 - - ksm_pages_sharing - ksm_pages_unshared;
6095 - /*
6096 - * It was not worth any locking to calculate that statistic,
6097 - * but it might therefore sometimes be negative: conceal that.
6098 - */
6099 - if (ksm_pages_volatile < 0)
6100 - ksm_pages_volatile = 0;
6101 - return sprintf(buf, "%ld\n", ksm_pages_volatile);
6102 + return sprintf(buf, "%lu\n", ksm_remap_zero_pages);
6104 -KSM_ATTR_RO(pages_volatile);
6105 +KSM_ATTR_RO(pages_remap_zeropage);
6107 static ssize_t full_scans_show(struct kobject *kobj,
6108 struct kobj_attribute *attr, char *buf)
6110 - return sprintf(buf, "%lu\n", ksm_scan.seqnr);
6111 + return sprintf(buf, "%llu\n", ksm_scan_round);
6113 KSM_ATTR_RO(full_scans);
6115 +static ssize_t pages_scanned_show(struct kobject *kobj,
6116 + struct kobj_attribute *attr, char *buf)
6118 + unsigned long base = 0;
6119 + u64 delta, ret;
6121 + if (pages_scanned_stored) {
6122 + base = pages_scanned_base;
6123 + ret = pages_scanned_stored;
6124 + delta = ksm_pages_scanned >> base;
6125 + if (CAN_OVERFLOW_U64(ret, delta)) {
6126 + ret >>= 1;
6127 + delta >>= 1;
6128 + base++;
6129 + ret += delta;
6131 + } else {
6132 + ret = ksm_pages_scanned;
6135 + while (ret > ULONG_MAX) {
6136 + ret >>= 1;
6137 + base++;
6140 + if (base)
6141 + return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base);
6142 + else
6143 + return sprintf(buf, "%lu\n", (unsigned long)ret);
6145 +KSM_ATTR_RO(pages_scanned);
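pages_scanned_show() reports very large totals as "value * 2^base" so the running count never overflows a u64. A userspace sketch of the same accumulate-and-halve idea; the CAN_OVERFLOW_U64 macro used here is an assumed definition, since the patch's own macro is defined elsewhere:

#include <stdio.h>
#include <stdint.h>

#define CAN_OVERFLOW_U64(x, delta) (UINT64_MAX - (x) < (delta))

static uint64_t stored;          /* compressed count: stored * 2^base */
static unsigned long base;

/* Fold 'delta' newly scanned pages into the compressed counter. */
static void account_scanned(uint64_t delta)
{
    delta >>= base;              /* bring delta into the current scale */
    if (CAN_OVERFLOW_U64(stored, delta)) {
        stored >>= 1;            /* halve both and bump the exponent */
        delta >>= 1;
        base++;
    }
    stored += delta;
}

int main(void)
{
    int i;

    for (i = 0; i < 5; i++)
        account_scanned(UINT64_MAX / 2);  /* force the overflow path */

    printf("pages scanned ~= %llu * 2^%lu\n",
           (unsigned long long)stored, base);
    return 0;
}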
6147 +static ssize_t hash_strength_show(struct kobject *kobj,
6148 + struct kobj_attribute *attr, char *buf)
6150 + return sprintf(buf, "%lu\n", hash_strength);
6152 +KSM_ATTR_RO(hash_strength);
6154 +static ssize_t sleep_times_show(struct kobject *kobj,
6155 + struct kobj_attribute *attr, char *buf)
6157 + return sprintf(buf, "%llu\n", ksm_sleep_times);
6159 +KSM_ATTR_RO(sleep_times);
6162 static struct attribute *ksm_attrs[] = {
6163 &sleep_millisecs_attr.attr,
6164 - &pages_to_scan_attr.attr,
6165 + &scan_batch_pages_attr.attr,
6166 &run_attr.attr,
6167 &pages_shared_attr.attr,
6168 &pages_sharing_attr.attr,
6169 &pages_unshared_attr.attr,
6170 - &pages_volatile_attr.attr,
6171 + &pages_remap_zeropage_attr.attr,
6172 &full_scans_attr.attr,
6173 + &min_scan_ratio_attr.attr,
6174 + &pages_scanned_attr.attr,
6175 + &hash_strength_attr.attr,
6176 + &sleep_times_attr.attr,
6177 + &thrash_threshold_attr.attr,
6178 NULL,
6181 static struct attribute_group ksm_attr_group = {
6182 .attrs = ksm_attrs,
6183 - .name = "ksm",
6184 + .name = "uksm",
6186 #endif /* CONFIG_SYSFS */
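Since the attribute group is registered on mm_kobj with the name "uksm", these knobs should appear under /sys/kernel/mm/uksm/. A small userspace sketch that reads one attribute and writes another; the path and the value written are illustrative and require the patched kernel plus root privileges:

#include <stdio.h>
#include <string.h>

/* Assumes the patched kernel exposing the "uksm" group on mm_kobj. */
#define UKSM_DIR "/sys/kernel/mm/uksm/"

/* Read a single sysfs value into buf; returns 0 on success. */
static int sysfs_read(const char *name, char *buf, size_t len)
{
    char path[256];
    FILE *f;

    snprintf(path, sizeof(path), UKSM_DIR "%s", name);
    f = fopen(path, "r");
    if (!f)
        return -1;
    if (!fgets(buf, len, f)) {
        fclose(f);
        return -1;
    }
    fclose(f);
    buf[strcspn(buf, "\n")] = '\0';
    return 0;
}

/* Write a value to a sysfs file; returns 0 on success. */
static int sysfs_write(const char *name, const char *value)
{
    char path[256];
    FILE *f;
    int ok;

    snprintf(path, sizeof(path), UKSM_DIR "%s", name);
    f = fopen(path, "w");
    if (!f)
        return -1;
    ok = fputs(value, f) >= 0;
    return (fclose(f) == 0 && ok) ? 0 : -1;
}

int main(void)
{
    char buf[64];

    if (sysfs_read("sleep_millisecs", buf, sizeof(buf)) == 0)
        printf("sleep_millisecs = %s\n", buf);

    /* Needs root; "1" corresponds to KSM_RUN_MERGE in this patch. */
    if (sysfs_write("run", "1") != 0)
        perror("write run");

    return 0;
}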
6188 +static inline void init_scan_ladder(void)
6190 + int i;
6191 + unsigned long mul = 1;
6193 + unsigned long pages_to_scan;
6195 + pages_to_scan = ksm_scan_batch_pages;
6197 + for (i = 0; i < ksm_scan_ladder_size; i++,
6198 + mul *= ksm_scan_ratio_delta) {
6200 + ksm_scan_ladder[i].scan_ratio = ksm_min_scan_ratio * mul;
6201 + INIT_LIST_HEAD(&ksm_scan_ladder[i].vma_list);
6202 + ksm_scan_ladder[i].vma_num = 0;
6203 + ksm_scan_ladder[i].round_finished = 0;
6204 + ksm_scan_ladder[i].fully_scanned_slots = 0;
6205 + ksm_scan_ladder[i].busy_searched = 0;
6208 + cal_ladder_pages_to_scan(ksm_scan_batch_pages);
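init_scan_ladder() builds a ladder of scan rungs whose scan_ratio grows geometrically from ksm_min_scan_ratio by a factor of ksm_scan_ratio_delta per rung, and ksm_init() sizes the ladder by multiplying until the ratio would exceed the maximum. A userspace sketch of that sizing and the resulting ratios, using assumed example constants (the real defaults are defined elsewhere in the patch):

#include <stdio.h>

/* Example values only; the patch defines the real defaults elsewhere. */
#define SCAN_RATIO_MAX 125
static unsigned int min_scan_ratio = 1;
static unsigned int scan_ratio_delta = 5;

int main(void)
{
    unsigned int sr = min_scan_ratio;
    unsigned long mul = 1;
    int ladder_size = 1, i;

    /* Same sizing loop as ksm_init(): multiply until the ratio
     * would pass the maximum. */
    while (sr < SCAN_RATIO_MAX) {
        sr *= scan_ratio_delta;
        ladder_size++;
    }

    printf("ladder size = %d\n", ladder_size);
    for (i = 0; i < ladder_size; i++, mul *= scan_ratio_delta)
        printf("rung %d: scan_ratio = %lu\n", i, min_scan_ratio * mul);

    return 0;
}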
6211 +static inline int cal_positive_negative_costs(void)
6213 + struct page *p1, *p2;
6214 + unsigned char *addr1, *addr2;
6215 + unsigned long i, time_start, hash_cost;
6216 + unsigned long loopnum = 0;
6218 + /* IMPORTANT: volatile is needed so gcc does not optimize the benchmark away. */
6219 + volatile u32 hash;
6220 + volatile int ret;
6222 + p1 = alloc_page(GFP_KERNEL);
6223 + if (!p1)
6224 + return -ENOMEM;
6226 + p2 = alloc_page(GFP_KERNEL);
6227 + if (!p2) {
+ __free_page(p1); /* don't leak p1 on the error path */
6228 + return -ENOMEM;
+ }
6230 + addr1 = kmap_atomic(p1, KM_USER0);
6231 + addr2 = kmap_atomic(p2, KM_USER1);
6232 + memset(addr1, random32(), PAGE_SIZE);
6233 + memcpy(addr2, addr1, PAGE_SIZE);
6236 + /* make sure that the two pages differ in the last byte */
6236 + addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1];
6237 + kunmap_atomic(addr2, KM_USER1);
6238 + kunmap_atomic(addr1, KM_USER0);
6240 + time_start = jiffies;
6241 + while (jiffies - time_start < 100) {
6242 + for (i = 0; i < 100; i++)
6243 + hash = page_hash(p1, HASH_STRENGTH_FULL, 0);
6244 + loopnum += 100;
6246 + hash_cost = (jiffies - time_start);
6248 + time_start = jiffies;
6249 + for (i = 0; i < loopnum; i++)
6250 + ret = pages_identical(p1, p2);
6251 + memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start);
6252 + memcmp_cost /= hash_cost;
6253 + printk(KERN_INFO "UKSM: relative memcmp_cost = %lu.\n", memcmp_cost);
6255 + __free_page(p1);
6256 + __free_page(p2);
6257 + return 0;
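cal_positive_negative_costs() benchmarks the partial hash against a full page compare at startup, so the sampler knows how many hash-strength steps one memcmp is worth. A userspace analogue of the same calibration; the toy_hash() stand-in and the loop counts are assumptions, since page_hash() and HASH_STRENGTH_FULL are not reproduced here:

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>

#define PAGE_SIZE 4096
#define HASH_STRENGTH_FULL (PAGE_SIZE / 4)

/* Toy stand-in for page_hash(): mix the first 'strength' bytes. */
static unsigned int toy_hash(const unsigned char *page, unsigned long strength)
{
    unsigned int h = 0;
    unsigned long i;

    for (i = 0; i < strength && i < PAGE_SIZE; i++)
        h = h * 31 + page[i];
    return h;
}

int main(void)
{
    static unsigned char p1[PAGE_SIZE], p2[PAGE_SIZE];
    volatile unsigned int hash;  /* volatile keeps the work from being optimized out */
    volatile int ret;
    unsigned long i, loops = 200000;
    clock_t start;
    double hash_cost, memcmp_cost;

    memset(p1, rand() & 0xff, PAGE_SIZE);
    memcpy(p2, p1, PAGE_SIZE);
    p2[PAGE_SIZE - 1] = ~p2[PAGE_SIZE - 1];  /* differ only in the last byte */

    start = clock();
    for (i = 0; i < loops; i++)
        hash = toy_hash(p1, HASH_STRENGTH_FULL);
    hash_cost = (double)(clock() - start);
    if (hash_cost < 1)
        hash_cost = 1;           /* guard against clock() granularity */

    start = clock();
    for (i = 0; i < loops; i++)
        ret = memcmp(p1, p2, PAGE_SIZE);
    memcmp_cost = (double)(clock() - start);

    (void)hash;
    (void)ret;

    /* Relative cost of a full compare in units of one hash-strength step,
     * analogous to the patch's memcmp_cost value. */
    printf("relative memcmp cost = %.1f\n",
           HASH_STRENGTH_FULL * memcmp_cost / hash_cost);
    return 0;
}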
6260 +static int init_zeropage_hash_table(void)
6262 + struct page *page;
6263 + char *addr;
6264 + int i;
6266 + page = alloc_page(GFP_KERNEL);
6267 + if (!page)
6268 + return -ENOMEM;
6270 + addr = kmap_atomic(page, KM_USER0);
6271 + memset(addr, 0, PAGE_SIZE);
6272 + kunmap_atomic(addr, KM_USER0);
6274 + zero_hash_table = kmalloc(HASH_STRENGTH_MAX * sizeof(u32),
6275 + GFP_KERNEL);
6276 + if (!zero_hash_table)
6277 + return -ENOMEM;
6279 + for (i = 0; i < HASH_STRENGTH_MAX; i++) {
6280 + zero_hash_table[i] = page_hash(page, i, 0);
6283 + __free_page(page);
6285 + return 0;
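init_zeropage_hash_table() precomputes the hash of an all-zero page at every sampling strength, so a scanned page whose hash matches the table entry for the current strength can be treated as a likely zero page. A userspace sketch of the same precompute-and-lookup idea with a toy byte-sum hash (page_hash() itself is not reproduced here):

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096
#define STRENGTH_MAX 8       /* stands in for HASH_STRENGTH_MAX */

/* Toy stand-in for page_hash(): mix the first 'strength' bytes. */
static unsigned int toy_hash(const unsigned char *page, int strength)
{
    unsigned int h = 0;
    int i;

    for (i = 0; i < strength && i < PAGE_SIZE; i++)
        h = h * 31 + page[i];
    return h;
}

int main(void)
{
    static unsigned char zero_page[PAGE_SIZE];   /* all zeroes */
    static unsigned char candidate[PAGE_SIZE];
    unsigned int zero_hash_table[STRENGTH_MAX];
    int i, strength = 4;

    /* Precompute the zero-page hash once per strength, as the patch does. */
    for (i = 0; i < STRENGTH_MAX; i++)
        zero_hash_table[i] = toy_hash(zero_page, i);

    memset(candidate, 0, PAGE_SIZE);
    if (toy_hash(candidate, strength) == zero_hash_table[strength])
        printf("candidate hashes like the zero page at strength %d\n",
               strength);
    return 0;
}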
6288 +static inline int init_random_sampling(void)
6290 + unsigned long i;
6291 + random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL);
6292 + if (!random_nums)
6293 + return -ENOMEM;
6295 + for (i = 0; i < HASH_STRENGTH_FULL; i++)
6296 + random_nums[i] = i;
6298 + for (i = 0; i < HASH_STRENGTH_FULL; i++) {
6299 + unsigned long rand_range, swap_index, tmp;
6301 + rand_range = HASH_STRENGTH_FULL - i;
6302 + swap_index = i + random32() % rand_range;
6303 + tmp = random_nums[i];
6304 + random_nums[i] = random_nums[swap_index];
6305 + random_nums[swap_index] = tmp;
6308 + rshash_state.state = RSHASH_NEW;
6309 + rshash_state.below_count = 0;
6310 + rshash_state.lookup_window_index = 0;
6312 + return cal_positive_negative_costs();
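init_random_sampling() fills random_nums with 0..HASH_STRENGTH_FULL-1 and then applies a Fisher-Yates style shuffle, so the hash samples page offsets in a random order. The same shuffle in plain userspace C; the array size and RNG are illustrative (rand() has a small modulo bias, which is fine for a sketch):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define N 16   /* stands in for HASH_STRENGTH_FULL */

int main(void)
{
    unsigned long nums[N];
    unsigned long i, rand_range, swap_index, tmp;

    srand((unsigned)time(NULL));

    for (i = 0; i < N; i++)
        nums[i] = i;

    /* Fisher-Yates: swap each slot with a random slot at or after it. */
    for (i = 0; i < N; i++) {
        rand_range = N - i;
        swap_index = i + (unsigned long)rand() % rand_range;
        tmp = nums[i];
        nums[i] = nums[swap_index];
        nums[swap_index] = tmp;
    }

    for (i = 0; i < N; i++)
        printf("%lu ", nums[i]);
    printf("\n");
    return 0;
}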
6315 +static int __init ksm_slab_init(void)
6317 + rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
6318 + if (!rmap_item_cache)
6319 + goto out;
6321 + stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
6322 + if (!stable_node_cache)
6323 + goto out_free1;
6325 + node_vma_cache = KSM_KMEM_CACHE(node_vma, 0);
6326 + if (!node_vma_cache)
6327 + goto out_free2;
6329 + vma_slot_cache = KSM_KMEM_CACHE(vma_slot, 0);
6330 + if (!vma_slot_cache)
6331 + goto out_free3;
6333 + tree_node_cache = KSM_KMEM_CACHE(tree_node, 0);
6334 + if (!tree_node_cache)
6335 + goto out_free4;
6337 + return 0;
6339 +out_free4:
6340 + kmem_cache_destroy(vma_slot_cache);
6341 +out_free3:
6342 + kmem_cache_destroy(node_vma_cache);
6343 +out_free2:
6344 + kmem_cache_destroy(stable_node_cache);
6345 +out_free1:
6346 + kmem_cache_destroy(rmap_item_cache);
6347 +out:
6348 + return -ENOMEM;
6351 +static void __init ksm_slab_free(void)
6353 + kmem_cache_destroy(stable_node_cache);
6354 + kmem_cache_destroy(rmap_item_cache);
6355 + kmem_cache_destroy(node_vma_cache);
6356 + kmem_cache_destroy(vma_slot_cache);
6357 + kmem_cache_destroy(tree_node_cache);
6360 static int __init ksm_init(void)
6362 struct task_struct *ksm_thread;
6363 int err;
6364 + unsigned int sr = ksm_min_scan_ratio;
6366 + ksm_scan_ladder_size = 1;
6367 + while (sr < KSM_SCAN_RATIO_MAX) {
6368 + sr *= ksm_scan_ratio_delta;
6369 + ksm_scan_ladder_size++;
6371 + ksm_scan_ladder = kzalloc(sizeof(struct scan_rung) *
6372 + ksm_scan_ladder_size, GFP_KERNEL);
6373 + if (!ksm_scan_ladder) {
6374 + printk(KERN_ERR "uksm scan ladder allocation failed, size=%d\n",
6375 + ksm_scan_ladder_size);
6376 + err = -ENOMEM;
6377 + goto out;
6379 + init_scan_ladder();
6381 + INIT_RADIX_TREE(&ksm_vma_tree, GFP_KERNEL);
6383 + err = init_random_sampling();
6384 + if (err)
6385 + goto out_free2;
6387 err = ksm_slab_init();
6388 if (err)
6389 - goto out;
6390 + goto out_free1;
6392 - ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
6393 + err = init_zeropage_hash_table();
6394 + if (err)
6395 + goto out_free0;
6397 + ksm_thread = kthread_run(ksm_scan_thread, NULL, "uksmd");
6398 if (IS_ERR(ksm_thread)) {
6399 - printk(KERN_ERR "ksm: creating kthread failed\n");
6400 + printk(KERN_ERR "uksm: creating kthread failed\n");
6401 err = PTR_ERR(ksm_thread);
6402 goto out_free;
6404 @@ -2007,7 +5025,7 @@
6405 #ifdef CONFIG_SYSFS
6406 err = sysfs_create_group(mm_kobj, &ksm_attr_group);
6407 if (err) {
6408 - printk(KERN_ERR "ksm: register sysfs failed\n");
6409 + printk(KERN_ERR "uksm: register sysfs failed\n");
6410 kthread_stop(ksm_thread);
6411 goto out_free;
6413 @@ -2027,7 +5045,19 @@
6415 out_free:
6416 ksm_slab_free();
6417 +out_free0:
6418 + kfree(zero_hash_table);
6419 +out_free1:
6420 + kfree(random_nums);
6421 +out_free2:
6422 + kfree(ksm_scan_ladder);
6423 out:
6424 return err;
6427 +#ifdef MODULE
6428 module_init(ksm_init)
6429 +#else
6430 +late_initcall(ksm_init);
6431 +#endif
6434 diff -Nur linux-2.6.38/mm/madvise.c uksm-2.6.38/mm/madvise.c
6435 --- linux-2.6.38/mm/madvise.c 2011-03-15 09:20:32.000000000 +0800
6436 +++ uksm-2.6.38/mm/madvise.c 2011-12-16 01:10:14.000000000 +0800
6437 @@ -65,12 +65,6 @@
6439 new_flags &= ~VM_DONTCOPY;
6440 break;
6441 - case MADV_MERGEABLE:
6442 - case MADV_UNMERGEABLE:
6443 - error = ksm_madvise(vma, start, end, behavior, &new_flags);
6444 - if (error)
6445 - goto out;
6446 - break;
6447 case MADV_HUGEPAGE:
6448 case MADV_NOHUGEPAGE:
6449 error = hugepage_madvise(vma, &new_flags, behavior);
6450 @@ -285,10 +279,6 @@
6451 case MADV_REMOVE:
6452 case MADV_WILLNEED:
6453 case MADV_DONTNEED:
6454 -#ifdef CONFIG_KSM
6455 - case MADV_MERGEABLE:
6456 - case MADV_UNMERGEABLE:
6457 -#endif
6458 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
6459 case MADV_HUGEPAGE:
6460 case MADV_NOHUGEPAGE:
6461 diff -Nur linux-2.6.38/mm/memory.c uksm-2.6.38/mm/memory.c
6462 --- linux-2.6.38/mm/memory.c 2011-03-15 09:20:32.000000000 +0800
6463 +++ uksm-2.6.38/mm/memory.c 2011-12-16 01:10:14.000000000 +0800
6464 @@ -719,6 +719,10 @@
6465 rss[MM_ANONPAGES]++;
6466 else
6467 rss[MM_FILEPAGES]++;
6468 +#ifdef CONFIG_KSM
6469 + if (PageKsm(page)) /* follows page_dup_rmap() */
6470 + inc_zone_page_state(page, NR_KSM_PAGES_SHARING);
6471 +#endif
6474 out_set_pte:
6475 @@ -1423,7 +1427,7 @@
6477 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
6479 - /*
6480 + /*
6481 * Require read or write permissions.
6482 * If FOLL_FORCE is set, we only require the "MAY" flags.
6484 @@ -2158,8 +2162,13 @@
6485 clear_page(kaddr);
6486 kunmap_atomic(kaddr, KM_USER0);
6487 flush_dcache_page(dst);
6488 - } else
6489 + } else {
6490 copy_user_highpage(dst, src, va, vma);
6491 +#ifdef CONFIG_KSM
6492 + if (vma->ksm_vma_slot && PageKsm(src))
6493 + vma->ksm_vma_slot->pages_cowed++;
6494 +#endif
6499 diff -Nur linux-2.6.38/mm/mmap.c uksm-2.6.38/mm/mmap.c
6500 --- linux-2.6.38/mm/mmap.c 2011-03-15 09:20:32.000000000 +0800
6501 +++ uksm-2.6.38/mm/mmap.c 2011-12-16 01:10:14.000000000 +0800
6502 @@ -30,6 +30,7 @@
6503 #include <linux/perf_event.h>
6504 #include <linux/audit.h>
6505 #include <linux/khugepaged.h>
6506 +#include <linux/ksm.h>
6508 #include <asm/uaccess.h>
6509 #include <asm/cacheflush.h>
6510 @@ -65,7 +66,7 @@
6511 * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes
6512 * w: (no) no w: (no) no w: (yes) yes w: (no) no
6513 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
6514 - *
6516 * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes
6517 * w: (no) no w: (no) no w: (copy) copy w: (no) no
6518 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
6519 @@ -240,6 +241,9 @@
6520 removed_exe_file_vma(vma->vm_mm);
6522 mpol_put(vma_policy(vma));
6523 +#ifdef CONFIG_KSM
6524 + ksm_remove_vma(vma);
6525 +#endif
6526 kmem_cache_free(vm_area_cachep, vma);
6527 return next;
6529 @@ -529,9 +533,20 @@
6530 long adjust_next = 0;
6531 int remove_next = 0;
6534 + * To avoid deadlock, ksm_remove_vma() must be called before any
6535 + * spinlock is acquired.
6536 + */
6537 +#ifdef CONFIG_KSM
6538 + ksm_remove_vma(vma);
6539 +#endif
6541 if (next && !insert) {
6542 struct vm_area_struct *exporter = NULL;
6544 +#ifdef CONFIG_KSM
6545 + ksm_remove_vma(next);
6546 +#endif
6547 if (end >= next->vm_end) {
6549 * vma expands, overlapping all the next, and
6550 @@ -616,10 +631,10 @@
6551 if (adjust_next)
6552 vma_prio_tree_remove(next, root);
6555 vma->vm_start = start;
6556 vma->vm_end = end;
6557 vma->vm_pgoff = pgoff;
6559 if (adjust_next) {
6560 next->vm_start += adjust_next << PAGE_SHIFT;
6561 next->vm_pgoff += adjust_next;
6562 @@ -672,10 +687,22 @@
6564 if (remove_next == 2) {
6565 next = vma->vm_next;
6566 +#ifdef CONFIG_KSM
6567 + ksm_remove_vma(next);
6568 +#endif
6569 goto again;
6571 + } else {
6572 +#ifdef CONFIG_KSM
6573 + if (next && !insert)
6574 + ksm_vma_add_new(next);
6575 +#endif
6578 +#ifdef CONFIG_KSM
6579 + ksm_vma_add_new(vma);
6580 +#endif
6582 validate_mm(mm);
6584 return 0;
6585 @@ -1352,6 +1379,9 @@
6587 vma_link(mm, vma, prev, rb_link, rb_parent);
6588 file = vma->vm_file;
6589 +#ifdef CONFIG_KSM
6590 + ksm_vma_add_new(vma);
6591 +#endif
6593 /* Once vma denies write, undo our temporary denial count */
6594 if (correct_wcount)
6595 @@ -1378,6 +1408,9 @@
6596 unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
6597 charged = 0;
6598 free_vma:
6599 +#ifdef CONFIG_KSM
6600 + ksm_remove_vma(vma);
6601 +#endif
6602 kmem_cache_free(vm_area_cachep, vma);
6603 unacct_error:
6604 if (charged)
6605 @@ -1453,7 +1486,7 @@
6606 addr = vma->vm_end;
6609 -#endif
6610 +#endif
6612 void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
6614 @@ -2014,6 +2047,10 @@
6615 else
6616 err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
6618 +#ifdef CONFIG_KSM
6619 + ksm_vma_add_new(new);
6620 +#endif
6622 /* Success. */
6623 if (!err)
6624 return 0;
6625 @@ -2250,6 +2287,9 @@
6626 vma->vm_flags = flags;
6627 vma->vm_page_prot = vm_get_page_prot(flags);
6628 vma_link(mm, vma, prev, rb_link, rb_parent);
6629 +#ifdef CONFIG_KSM
6630 + ksm_vma_add_new(vma);
6631 +#endif
6632 out:
6633 perf_event_mmap(vma);
6634 mm->total_vm += len >> PAGE_SHIFT;
6635 @@ -2273,6 +2313,12 @@
6636 /* mm's last user has gone, and its about to be pulled down */
6637 mmu_notifier_release(mm);
6639 + /*
6640 + * Taking the write lock on mmap_sem does not harm others, but it is
6641 + * crucial for uksm to avoid races.
6642 + */
6643 + down_write(&mm->mmap_sem);
6645 if (mm->locked_vm) {
6646 vma = mm->mmap;
6647 while (vma) {
6648 @@ -2306,6 +2352,11 @@
6649 while (vma)
6650 vma = remove_vma(vma);
6652 + mm->mmap = NULL;
6653 + mm->mm_rb = RB_ROOT;
6654 + mm->mmap_cache = NULL;
6655 + up_write(&mm->mmap_sem);
6657 BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
6660 @@ -2397,6 +2448,9 @@
6661 if (new_vma->vm_ops && new_vma->vm_ops->open)
6662 new_vma->vm_ops->open(new_vma);
6663 vma_link(mm, new_vma, prev, rb_link, rb_parent);
6664 +#ifdef CONFIG_KSM
6665 + ksm_vma_add_new(new_vma);
6666 +#endif
6669 return new_vma;
6670 @@ -2502,11 +2556,14 @@
6671 ret = insert_vm_struct(mm, vma);
6672 if (ret)
6673 goto out;
6675 mm->total_vm += len >> PAGE_SHIFT;
6677 perf_event_mmap(vma);
6679 +#ifdef CONFIG_KSM
6680 + ksm_vma_add_new(vma);
6681 +#endif
6683 return 0;
6685 out:
6686 diff -Nur linux-2.6.38/mm/mremap.c uksm-2.6.38/mm/mremap.c
6687 --- linux-2.6.38/mm/mremap.c 2011-03-15 09:20:32.000000000 +0800
6688 +++ uksm-2.6.38/mm/mremap.c 2011-12-16 01:10:14.000000000 +0800
6689 @@ -191,8 +191,7 @@
6690 * pages recently unmapped. But leave vma->vm_flags as it was,
6691 * so KSM can come around to merge on vma and new_vma afterwards.
6693 - err = ksm_madvise(vma, old_addr, old_addr + old_len,
6694 - MADV_UNMERGEABLE, &vm_flags);
6695 + err = unmerge_ksm_pages(vma, old_addr, old_addr + old_len);
6696 if (err)
6697 return err;
6699 diff -Nur linux-2.6.38/mm/rmap.c uksm-2.6.38/mm/rmap.c
6700 --- linux-2.6.38/mm/rmap.c 2011-03-15 09:20:32.000000000 +0800
6701 +++ uksm-2.6.38/mm/rmap.c 2011-12-16 01:10:14.000000000 +0800
6702 @@ -817,9 +817,9 @@
6705 * __page_set_anon_rmap - set up new anonymous rmap
6706 - * @page: Page to add to rmap
6707 + * @page: Page to add to rmap
6708 * @vma: VM area to add page to.
6709 - * @address: User virtual address of the mapping
6710 + * @address: User virtual address of the mapping
6711 * @exclusive: the page is exclusively owned by the current process
6713 static void __page_set_anon_rmap(struct page *page,
6714 @@ -905,9 +905,12 @@
6715 __inc_zone_page_state(page,
6716 NR_ANON_TRANSPARENT_HUGEPAGES);
6718 - if (unlikely(PageKsm(page)))
6719 +#ifdef CONFIG_KSM
6720 + if (unlikely(PageKsm(page))) {
6721 + __inc_zone_page_state(page, NR_KSM_PAGES_SHARING);
6722 return;
6725 +#endif
6726 VM_BUG_ON(!PageLocked(page));
6727 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
6728 if (first)
6729 @@ -965,6 +968,10 @@
6731 void page_remove_rmap(struct page *page)
6733 +#ifdef CONFIG_KSM
6734 + if (PageKsm(page))
6735 + __dec_zone_page_state(page, NR_KSM_PAGES_SHARING);
6736 +#endif
6737 /* page still mapped by someone else? */
6738 if (!atomic_add_negative(-1, &page->_mapcount))
6739 return;