[aur-mirror.git] / kernel26-uksm / uksm-2.6.38-20120110.patch
1 diff -urN linux-2.6.38/arch/x86/kernel/entry_32.S uksm-2.6.38-zhang/arch/x86/kernel/entry_32.S
2 --- linux-2.6.38/arch/x86/kernel/entry_32.S 2011-03-15 09:20:32.000000000 +0800
3 +++ uksm-2.6.38-zhang/arch/x86/kernel/entry_32.S 2012-01-09 10:05:23.642269166 +0800
4 @@ -1413,7 +1413,7 @@
5 CFI_ADJUST_CFA_OFFSET 4
6 jmp error_code
7 CFI_ENDPROC
8 -END(apf_page_fault)
9 +END(async_page_fault)
10 #endif
13 diff -urN linux-2.6.38/arch/x86/kernel/entry_64.S uksm-2.6.38-zhang/arch/x86/kernel/entry_64.S
14 --- linux-2.6.38/arch/x86/kernel/entry_64.S 2011-03-15 09:20:32.000000000 +0800
15 +++ uksm-2.6.38-zhang/arch/x86/kernel/entry_64.S 2012-01-09 10:05:23.642269166 +0800
16 @@ -1248,7 +1248,7 @@
17 decl PER_CPU_VAR(irq_count)
18 jmp error_exit
19 CFI_ENDPROC
20 -END(do_hypervisor_callback)
21 +END(xen_do_hypervisor_callback)
24 * Hypervisor uses this for application faults while it executes.
25 diff -urN linux-2.6.38/fs/exec.c uksm-2.6.38-zhang/fs/exec.c
26 --- linux-2.6.38/fs/exec.c 2011-03-15 09:20:32.000000000 +0800
27 +++ uksm-2.6.38-zhang/fs/exec.c 2012-01-09 10:05:55.168936883 +0800
28 @@ -19,7 +19,7 @@
29 * current->executable is only used by the procfs. This allows a dispatch
30 * table to check for several different types of binary formats. We keep
31 * trying until we recognize the file or we run out of supported binary
32 - * formats.
33 + * formats.
36 #include <linux/slab.h>
37 @@ -55,6 +55,7 @@
38 #include <linux/fs_struct.h>
39 #include <linux/pipe_fs_i.h>
40 #include <linux/oom.h>
41 +#include <linux/ksm.h>
43 #include <asm/uaccess.h>
44 #include <asm/mmu_context.h>
45 @@ -85,7 +86,7 @@
46 insert ? list_add(&fmt->lh, &formats) :
47 list_add_tail(&fmt->lh, &formats);
48 write_unlock(&binfmt_lock);
49 - return 0;
50 + return 0;
53 EXPORT_SYMBOL(__register_binfmt);
54 @@ -1106,7 +1107,7 @@
55 group */
57 current->self_exec_id++;
60 flush_signal_handlers(current, 0);
61 flush_old_files(current->files);
63 @@ -1196,8 +1197,8 @@
64 return res;
67 -/*
68 - * Fill the binprm structure from the inode.
69 +/*
70 + * Fill the binprm structure from the inode.
71 * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
73 * This may be called multiple times for binary chains (scripts for example).
74 diff -urN linux-2.6.38/fs/proc/meminfo.c uksm-2.6.38-zhang/fs/proc/meminfo.c
75 --- linux-2.6.38/fs/proc/meminfo.c 2011-03-15 09:20:32.000000000 +0800
76 +++ uksm-2.6.38-zhang/fs/proc/meminfo.c 2012-01-09 10:05:56.362270256 +0800
77 @@ -87,6 +87,10 @@
78 "SUnreclaim: %8lu kB\n"
79 "KernelStack: %8lu kB\n"
80 "PageTables: %8lu kB\n"
81 +#ifdef CONFIG_KSM
82 + "KsmSharing: %8lu kB\n"
83 + "KsmZeroPages: %8lu kB\n"
84 +#endif
85 #ifdef CONFIG_QUICKLIST
86 "Quicklists: %8lu kB\n"
87 #endif
88 @@ -145,6 +149,10 @@
89 K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
90 global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024,
91 K(global_page_state(NR_PAGETABLE)),
92 +#ifdef CONFIG_KSM
93 + K(global_page_state(NR_KSM_PAGES_SHARING)),
94 + K(global_page_state(NR_KSM_ZERO_PAGES)),
95 +#endif
96 #ifdef CONFIG_QUICKLIST
97 K(quicklist_total_size()),
98 #endif
100 diff -urN linux-2.6.38/include/linux/ksm.h uksm-2.6.38-zhang/include/linux/ksm.h
101 --- linux-2.6.38/include/linux/ksm.h 2011-03-15 09:20:32.000000000 +0800
102 +++ uksm-2.6.38-zhang/include/linux/ksm.h 2012-01-09 10:11:24.218947858 +0800
103 @@ -20,23 +20,10 @@
104 struct vm_area_struct *vma, unsigned long address);
106 #ifdef CONFIG_KSM
107 -int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
108 - unsigned long end, int advice, unsigned long *vm_flags);
109 -int __ksm_enter(struct mm_struct *mm);
110 -void __ksm_exit(struct mm_struct *mm);
112 -static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
114 - if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
115 - return __ksm_enter(mm);
116 - return 0;
119 -static inline void ksm_exit(struct mm_struct *mm)
121 - if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
122 - __ksm_exit(mm);
124 +extern unsigned long zero_pfn __read_mostly;
125 +extern unsigned long ksm_zero_pfn __read_mostly;
126 +extern struct page *empty_ksm_zero_page;
129 * A KSM page is one of those write-protected "shared pages" or "merged pages"
130 @@ -62,6 +49,13 @@
131 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
134 +/* must be done before linked to mm */
135 +extern void ksm_vma_add_new(struct vm_area_struct *vma);
137 +extern void ksm_remove_vma(struct vm_area_struct *vma);
138 +extern int unmerge_ksm_pages(struct vm_area_struct *vma,
139 + unsigned long start, unsigned long end);
142 * When do_swap_page() first faults in from swap what used to be a KSM page,
143 * no problem, it will be assigned to this vma's anon_vma; but thereafter,
144 @@ -90,16 +84,184 @@
145 struct vm_area_struct *, unsigned long, void *), void *arg);
146 void ksm_migrate_page(struct page *newpage, struct page *oldpage);
148 -#else /* !CONFIG_KSM */
149 +/* Each rung of this ladder is a list of VMAs having a same scan ratio */
150 +struct scan_rung {
151 + struct list_head vma_list;
152 + //spinlock_t vma_list_lock;
153 + //struct semaphore sem;
154 + struct list_head *current_scan;
155 + unsigned int pages_to_scan;
156 + unsigned char round_finished; /* rung is ready for the next round */
157 + unsigned char busy_searched;
158 + unsigned long fully_scanned_slots;
159 + unsigned long scan_ratio;
160 + unsigned long vma_num;
161 + //unsigned long vma_finished;
162 + unsigned long scan_turn;
165 +struct vma_slot {
166 + struct list_head ksm_list;
167 + struct list_head slot_list;
168 + unsigned long dedup_ratio;
169 + unsigned long dedup_num;
170 +	int ksm_index;		/* -1 if vma is not in the inter-vma table,
171 +				   positive otherwise */
172 + unsigned long pages_scanned;
173 + unsigned long last_scanned;
174 + unsigned long pages_to_scan;
175 + struct scan_rung *rung;
176 + struct page **rmap_list_pool;
177 + unsigned long *pool_counts;
178 + unsigned long pool_size;
179 + struct vm_area_struct *vma;
180 + struct mm_struct *mm;
181 + unsigned long ctime_j;
182 + unsigned long pages;
183 + unsigned char need_sort;
184 + unsigned char need_rerand;
185 + unsigned long slot_scanned; /* It's scanned in this round */
186 + unsigned long fully_scanned; /* the above four to be merged to status bits */
187 + unsigned long pages_cowed; /* pages cowed this round */
188 + unsigned long pages_merged; /* pages merged this round */
190 + /* used for dup vma pair */
191 + struct radix_tree_root dup_tree;
194 -static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
196 - return 0;
199 + * A few notes about the KSM scanning process,
200 + * to make it easier to understand the data structures below:
202 + * In order to reduce excessive scanning, KSM sorts the memory pages by their
203 + * contents into a data structure that holds pointers to the pages' locations.
205 + * Since the contents of the pages may change at any moment, KSM cannot just
206 + * insert the pages into a normal sorted tree and expect it to find anything.
207 + * Therefore KSM uses two data structures - the stable and the unstable tree.
209 + * The stable tree holds pointers to all the merged pages (ksm pages), sorted
210 + * by their contents. Because each such page is write-protected, searching on
211 + * this tree is fully assured to be working (except when pages are unmapped),
212 + * and therefore this tree is called the stable tree.
214 + * In addition to the stable tree, KSM uses a second data structure called the
215 + * unstable tree: this tree holds pointers to pages which have been found to
216 + * be "unchanged for a period of time". The unstable tree sorts these pages
217 + * by their contents, but since they are not write-protected, KSM cannot rely
218 + * upon the unstable tree to work correctly - the unstable tree is liable to
219 + * be corrupted as its contents are modified, and so it is called unstable.
221 + * KSM solves this problem by several techniques:
223 + * 1) The unstable tree is flushed every time KSM completes scanning all
224 + * memory areas, and then the tree is rebuilt again from the beginning.
225 + * 2) KSM will only insert into the unstable tree, pages whose hash value
226 + * has not changed since the previous scan of all memory areas.
227 + * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
228 + * colors of the nodes and not on their contents, assuring that even when
229 + * the tree gets "corrupted" it won't get out of balance, so scanning time
230 + * remains the same (also, searching and inserting nodes in an rbtree uses
231 + * the same algorithm, so we have no overhead when we flush and rebuild).
232 + * 4) KSM never flushes the stable tree, which means that even if it were to
233 + * take 10 attempts to find a page in the unstable tree, once it is found,
234 + * it is secured in the stable tree. (When we scan a new page, we first
235 + * compare it against the stable tree, and then against the unstable tree.)
236 + */
238 -static inline void ksm_exit(struct mm_struct *mm)
242 +/**
243 + * node of either the stable or unstable rbtree
245 + */
246 +struct tree_node {
247 + struct rb_node node; /* link in the main (un)stable rbtree */
248 + struct rb_root sub_root; /* rb_root for sublevel collision rbtree */
249 + u32 hash;
250 + unsigned long count; /* how many sublevel tree nodes */
251 + struct list_head all_list; /* all tree nodes in stable/unstable tree */
255 +/**
256 + * struct stable_node - node of the stable rbtree
257 + * @node: rb node of this ksm page in the stable tree
258 + * @hlist: hlist head of rmap_items using this ksm page
259 + * @kpfn: page frame number of this ksm page
260 + */
261 +struct stable_node {
262 + struct rb_node node; /* link in sub-rbtree */
263 +	struct tree_node *tree_node;	/* its tree_node root in the stable tree, NULL if it's in the hell list */
264 +	struct hlist_head hlist;
265 +	unsigned long kpfn;
266 +	u32 hash_max;	/* if == 0, it has not been calculated yet */
267 + //struct vm_area_struct *old_vma;
268 + struct list_head all_list; /* in a list for all stable nodes */
274 +/**
275 + * struct node_vma - group rmap_items linked in the same stable
276 + * node together.
277 + */
278 +struct node_vma {
279 + union {
280 + struct vma_slot *slot;
281 + unsigned long key; /* slot is used as key sorted on hlist */
282 + };
283 + struct hlist_node hlist;
284 + struct hlist_head rmap_hlist;
285 + struct stable_node *head;
286 + unsigned long last_update;
289 +/**
290 + * struct rmap_item - reverse mapping item for virtual addresses
291 + * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
292 + * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
293 + * @mm: the memory structure this rmap_item is pointing into
294 + * @address: the virtual address this rmap_item tracks (+ flags in low bits)
295 + * @node: rb node of this rmap_item in the unstable tree
296 + * @head: pointer to stable_node heading this list in the stable tree
297 + * @hlist: link into hlist of rmap_items hanging off that stable_node
298 + */
299 +struct rmap_item {
300 + struct vma_slot *slot;
301 + struct page *page;
302 + unsigned long address; /* + low bits used for flags below */
303 +	/* Appended to the (un)stable tree in which scan round */
304 + unsigned long append_round;
306 + /* Which rung scan turn it was last scanned */
307 + //unsigned long last_scan;
308 + unsigned long entry_index;
309 + union {
310 + struct {/* when in unstable tree */
311 + struct rb_node node;
312 + struct tree_node *tree_node;
313 + u32 hash_max;
314 + };
315 + struct { /* when in stable tree */
316 + struct node_vma *head;
317 + struct hlist_node hlist;
318 + struct anon_vma *anon_vma;
319 + };
320 + };
321 +} __attribute__((aligned(4)));
323 +struct rmap_list_entry {
324 + union {
325 + struct rmap_item *item;
326 + unsigned long addr;
327 + };
328 + // lowest bit is used for is_addr tag
329 + //unsigned char is_addr;
330 +} __attribute__((aligned(4))); // 4-aligned to fit into pages
332 +//extern struct semaphore ksm_scan_sem;
333 +#else /* !CONFIG_KSM */
335 static inline int PageKsm(struct page *page)
337 @@ -107,8 +269,9 @@
340 #ifdef CONFIG_MMU
341 -static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
342 - unsigned long end, int advice, unsigned long *vm_flags)
344 +extern inline int unmerge_ksm_pages(struct vm_area_struct *vma,
345 + unsigned long start, unsigned long end)
347 return 0;
349 diff -urN linux-2.6.38/include/linux/mm_types.h uksm-2.6.38-zhang/include/linux/mm_types.h
350 --- linux-2.6.38/include/linux/mm_types.h 2011-03-15 09:20:32.000000000 +0800
351 +++ uksm-2.6.38-zhang/include/linux/mm_types.h 2012-01-09 10:05:57.562270296 +0800
352 @@ -183,6 +183,9 @@
353 #ifdef CONFIG_NUMA
354 struct mempolicy *vm_policy; /* NUMA policy for the VMA */
355 #endif
356 +#ifdef CONFIG_KSM
357 + struct vma_slot *ksm_vma_slot;
358 +#endif
361 struct core_thread {
362 diff -urN linux-2.6.38/include/linux/mmzone.h uksm-2.6.38-zhang/include/linux/mmzone.h
363 --- linux-2.6.38/include/linux/mmzone.h 2011-03-15 09:20:32.000000000 +0800
364 +++ uksm-2.6.38-zhang/include/linux/mmzone.h 2012-01-09 10:05:57.562270296 +0800
365 @@ -115,6 +115,10 @@
366 NUMA_OTHER, /* allocation from other node */
367 #endif
368 NR_ANON_TRANSPARENT_HUGEPAGES,
369 +#ifdef CONFIG_KSM
370 + NR_KSM_PAGES_SHARING,
371 + NR_KSM_ZERO_PAGES,
372 +#endif
373 NR_VM_ZONE_STAT_ITEMS };
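/*
 * [Editor's illustrative sketch, not part of the patch] New zone stat items
 * such as NR_KSM_ZERO_PAGES are normally maintained through the standard
 * vmstat helpers that already exist in 2.6.38 (inc_zone_page_state /
 * dec_zone_page_state). Whether uksm updates the counters at exactly this
 * kind of call site is an assumption; only the counter names come from the
 * hunk above.
 */
static inline void demo_account_ksm_zero_page(struct page *page, int mapped)
{
	if (mapped)
		inc_zone_page_state(page, NR_KSM_ZERO_PAGES);
	else
		dec_zone_page_state(page, NR_KSM_ZERO_PAGES);
}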
376 @@ -344,7 +348,7 @@
377 ZONE_PADDING(_pad1_)
379 /* Fields commonly accessed by the page reclaim scanner */
380 - spinlock_t lru_lock;
381 + spinlock_t lru_lock;
382 struct zone_lru {
383 struct list_head list;
384 } lru[NR_LRU_LISTS];
385 @@ -722,7 +726,7 @@
389 - * is_highmem - helper function to quickly check if a struct zone is a
390 + * is_highmem - helper function to quickly check if a struct zone is a
391 * highmem zone or not. This is an attempt to keep references
392 * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum.
393 * @zone - pointer to struct zone variable
394 diff -urN linux-2.6.38/include/linux/sched.h uksm-2.6.38-zhang/include/linux/sched.h
395 --- linux-2.6.38/include/linux/sched.h 2011-03-15 09:20:32.000000000 +0800
396 +++ uksm-2.6.38-zhang/include/linux/sched.h 2012-01-09 10:05:57.815603639 +0800
397 @@ -433,7 +433,6 @@
398 # define MMF_DUMP_MASK_DEFAULT_ELF 0
399 #endif
400 /* leave room for more dump flags */
401 -#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */
402 #define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */
404 #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
405 @@ -1280,9 +1279,9 @@
406 unsigned long stack_canary;
407 #endif
409 - /*
410 + /*
411 * pointers to (original) parent process, youngest child, younger sibling,
412 - * older sibling, respectively. (p->father can be replaced with
413 + * older sibling, respectively. (p->father can be replaced with
414 * p->real_parent->pid)
416 struct task_struct *real_parent; /* real parent process */
417 @@ -2080,7 +2079,7 @@
418 spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
420 return ret;
424 extern void block_all_signals(int (*notifier)(void *priv), void *priv,
425 sigset_t *mask);
426 diff -urN linux-2.6.38/kernel/fork.c uksm-2.6.38-zhang/kernel/fork.c
427 --- linux-2.6.38/kernel/fork.c 2011-03-15 09:20:32.000000000 +0800
428 +++ uksm-2.6.38-zhang/kernel/fork.c 2012-01-09 10:05:59.635603699 +0800
429 @@ -328,9 +328,6 @@
430 rb_link = &mm->mm_rb.rb_node;
431 rb_parent = NULL;
432 pprev = &mm->mmap;
433 - retval = ksm_fork(mm, oldmm);
434 - if (retval)
435 - goto out;
436 retval = khugepaged_fork(mm, oldmm);
437 if (retval)
438 goto out;
439 @@ -353,7 +350,7 @@
440 goto fail_nomem;
441 charge = len;
443 - tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
444 + tmp = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
445 if (!tmp)
446 goto fail_nomem;
447 *tmp = *mpnt;
448 @@ -406,7 +403,9 @@
449 __vma_link_rb(mm, tmp, rb_link, rb_parent);
450 rb_link = &tmp->vm_rb.rb_right;
451 rb_parent = &tmp->vm_rb;
453 +#ifdef CONFIG_KSM
454 + ksm_vma_add_new(tmp);
455 +#endif
456 mm->map_count++;
457 retval = copy_page_range(mm, oldmm, mpnt);
459 @@ -549,7 +548,6 @@
461 if (atomic_dec_and_test(&mm->mm_users)) {
462 exit_aio(mm);
463 - ksm_exit(mm);
464 khugepaged_exit(mm); /* must run before exit_mmap */
465 exit_mmap(mm);
466 set_mm_exe_file(mm, NULL);
467 diff -urN linux-2.6.38/mm/ksm.c uksm-2.6.38-zhang/mm/ksm.c
468 --- linux-2.6.38/mm/ksm.c 2011-03-15 09:20:32.000000000 +0800
469 +++ uksm-2.6.38-zhang/mm/ksm.c 2012-01-09 10:05:59.862270375 +0800
470 @@ -12,6 +12,47 @@
471 * Hugh Dickins
473 * This work is licensed under the terms of the GNU GPL, version 2.
477 + * Ultra KSM. Copyright (C) 2011 Nai Xia
479 + * This is an improvement upon KSM. Its features:
480 + * 1. Full system scan:
481 + * It automatically scans all user processes' anonymous VMAs. Kernel-user
482 + * interaction to submit a memory area to KSM is no longer needed.
484 + * 2. Rich area detection based on random sampling:
485 + * It automatically detects rich areas containing abundant duplicated
486 + * pages based on their randomly-sampled history. Rich areas are given
487 + * a full scan speed. Poor areas are sampled at a reasonable speed with
488 + * very low CPU consumption.
490 + * 3. Per-page scan speed improvement:
491 + * A new hash algorithm (random_sample_hash) is proposed. Quite often,
492 + * it's enough to distinguish pages by hashing part of their content
493 + * instead of full pages. This algorithm automatically adapts to this
494 + * situation. In the best case, only one 32-bit word per page is needed to
495 + * get a hash value that distinguishes pages. In the worst case, it's as
496 + * fast as SuperFastHash.
498 + * 4. Thrashing area avoidance:
499 + * A thrashing area (a VMA with frequent KSM page break-outs) can be
500 + * filtered out. My benchmark shows it's more efficient than KSM's per-page
501 + * hash value based volatile page detection.
503 + * 5. Hash-value-based identical page detection:
504 + * It no longer uses "memcmp"-based page detection.
506 + * 6. Misc changes upon KSM:
507 + * * It has a fully x86-optimized memcmp dedicated to 4-byte-aligned page
508 + * comparison. It's much faster than the default C version on x86.
509 + * * rmap_item now has a struct page * member to loosely cache an
510 + * address->page mapping, which avoids many time-costly
511 + * follow_page() calls.
512 + * * The VMA creation/exit procedures are hooked so that Ultra KSM is notified.
513 + * * try_to_merge_two_pages() can now revert a pte if it fails. No break_
514 + * ksm is needed in this case.
517 #include <linux/errno.h>
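/*
 * [Editor's illustrative sketch, not part of the patch] The idea behind
 * feature 3 ("random_sample_hash") described above: hash only "strength"
 * 32-bit words picked at precomputed pseudo-random offsets instead of the
 * whole page. The mixing function, seed, and all names below are stand-ins,
 * not the patch's actual random_sample_hash()/random_nums implementation;
 * the snippet is a stand-alone user-space sketch.
 */
#include <stdint.h>
#include <stddef.h>

#define DEMO_WORDS_PER_PAGE (4096 / sizeof(uint32_t))

static uint32_t demo_sample_hash(const uint32_t *page_words,
				 const uint32_t *offsets, size_t strength)
{
	uint32_t hash = 0xdeadbeef;	/* arbitrary seed */
	size_t i;

	for (i = 0; i < strength; i++) {
		/* offsets[] holds precomputed random word indices */
		hash ^= page_words[offsets[i] % DEMO_WORDS_PER_PAGE];
		hash = (hash << 7) | (hash >> 25);	/* cheap rotate-mix */
	}
	return hash;
}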
518 @@ -33,142 +74,168 @@
519 #include <linux/mmu_notifier.h>
520 #include <linux/swap.h>
521 #include <linux/ksm.h>
522 -#include <linux/hash.h>
523 +#include <linux/crypto.h>
524 +#include <linux/scatterlist.h>
525 +#include <crypto/hash.h>
526 +#include <linux/random.h>
527 +#include <linux/math64.h>
528 +#include <linux/gcd.h>
529 #include <linux/freezer.h>
531 #include <asm/tlbflush.h>
532 #include "internal.h"
534 +#ifdef CONFIG_X86
535 +#undef memcmp
537 +#ifdef CONFIG_X86_32
538 +#define memcmp memcmpx86_32
540 - * A few notes about the KSM scanning process,
541 - * to make it easier to understand the data structures below:
543 - * In order to reduce excessive scanning, KSM sorts the memory pages by their
544 - * contents into a data structure that holds pointers to the pages' locations.
546 - * Since the contents of the pages may change at any moment, KSM cannot just
547 - * insert the pages into a normal sorted tree and expect it to find anything.
548 - * Therefore KSM uses two data structures - the stable and the unstable tree.
550 - * The stable tree holds pointers to all the merged pages (ksm pages), sorted
551 - * by their contents. Because each such page is write-protected, searching on
552 - * this tree is fully assured to be working (except when pages are unmapped),
553 - * and therefore this tree is called the stable tree.
555 - * In addition to the stable tree, KSM uses a second data structure called the
556 - * unstable tree: this tree holds pointers to pages which have been found to
557 - * be "unchanged for a period of time". The unstable tree sorts these pages
558 - * by their contents, but since they are not write-protected, KSM cannot rely
559 - * upon the unstable tree to work correctly - the unstable tree is liable to
560 - * be corrupted as its contents are modified, and so it is called unstable.
562 - * KSM solves this problem by several techniques:
564 - * 1) The unstable tree is flushed every time KSM completes scanning all
565 - * memory areas, and then the tree is rebuilt again from the beginning.
566 - * 2) KSM will only insert into the unstable tree, pages whose hash value
567 - * has not changed since the previous scan of all memory areas.
568 - * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
569 - * colors of the nodes and not on their contents, assuring that even when
570 - * the tree gets "corrupted" it won't get out of balance, so scanning time
571 - * remains the same (also, searching and inserting nodes in an rbtree uses
572 - * the same algorithm, so we have no overhead when we flush and rebuild).
573 - * 4) KSM never flushes the stable tree, which means that even if it were to
574 - * take 10 attempts to find a page in the unstable tree, once it is found,
575 - * it is secured in the stable tree. (When we scan a new page, we first
576 - * compare it against the stable tree, and then against the unstable tree.)
577 + * Compare the 4-byte-aligned addresses s1 and s2, of length n bytes
579 +int memcmpx86_32(void *s1, void *s2, size_t n)
581 + size_t num = n / 4;
582 + register int res;
584 -/**
585 - * struct mm_slot - ksm information per mm that is being scanned
586 - * @link: link to the mm_slots hash list
587 - * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
588 - * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
589 - * @mm: the mm that this information is valid for
590 - */
591 -struct mm_slot {
592 - struct hlist_node link;
593 - struct list_head mm_list;
594 - struct rmap_item *rmap_list;
595 - struct mm_struct *mm;
597 + __asm__ __volatile__
599 + "testl %3,%3\n\t"
600 + "repe; cmpsd\n\t"
601 + "je 1f\n\t"
602 + "sbbl %0,%0\n\t"
603 + "orl $1,%0\n"
604 + "1:"
605 + : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
606 + : "0" (0)
607 + : "cc");
609 -/**
610 - * struct ksm_scan - cursor for scanning
611 - * @mm_slot: the current mm_slot we are scanning
612 - * @address: the next address inside that to be scanned
613 - * @rmap_list: link to the next rmap to be scanned in the rmap_list
614 - * @seqnr: count of completed full scans (needed when removing unstable node)
616 - * There is only the one ksm_scan instance of this cursor structure.
617 + return res;
621 + * Check whether the page is all zeros.
623 -struct ksm_scan {
624 - struct mm_slot *mm_slot;
625 - unsigned long address;
626 - struct rmap_item **rmap_list;
627 - unsigned long seqnr;
629 +static int is_full_zero(const void *s1, size_t len)
631 + unsigned char same;
633 -/**
634 - * struct stable_node - node of the stable rbtree
635 - * @node: rb node of this ksm page in the stable tree
636 - * @hlist: hlist head of rmap_items using this ksm page
637 - * @kpfn: page frame number of this ksm page
638 - */
639 -struct stable_node {
640 - struct rb_node node;
641 - struct hlist_head hlist;
642 - unsigned long kpfn;
644 + len /= 4;
646 -/**
647 - * struct rmap_item - reverse mapping item for virtual addresses
648 - * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
649 - * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
650 - * @mm: the memory structure this rmap_item is pointing into
651 - * @address: the virtual address this rmap_item tracks (+ flags in low bits)
652 - * @oldchecksum: previous checksum of the page at that virtual address
653 - * @node: rb node of this rmap_item in the unstable tree
654 - * @head: pointer to stable_node heading this list in the stable tree
655 - * @hlist: link into hlist of rmap_items hanging off that stable_node
656 - */
657 -struct rmap_item {
658 - struct rmap_item *rmap_list;
659 - struct anon_vma *anon_vma; /* when stable */
660 - struct mm_struct *mm;
661 - unsigned long address; /* + low bits used for flags below */
662 - unsigned int oldchecksum; /* when unstable */
663 - union {
664 - struct rb_node node; /* when node of unstable tree */
665 - struct { /* when listed from stable tree */
666 - struct stable_node *head;
667 - struct hlist_node hlist;
668 - };
669 - };
671 + __asm__ __volatile__
672 + ("repe; scasl;"
673 + "sete %0"
674 + : "=qm" (same), "+D" (s1), "+c" (len)
675 + : "a" (0)
676 + : "cc");
678 + return same;
681 -#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
682 -#define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */
683 -#define STABLE_FLAG 0x200 /* is listed from the stable tree */
685 -/* The stable and unstable tree heads */
686 -static struct rb_root root_stable_tree = RB_ROOT;
687 -static struct rb_root root_unstable_tree = RB_ROOT;
688 +#elif defined(CONFIG_X86_64)
689 +#define memcmp memcmpx86_64
691 + * Compare the 8-byte-aligned addresses s1 and s2, of length n bytes
692 + */
693 +int memcmpx86_64(void *s1, void *s2, size_t n)
695 + size_t num = n / 8;
696 + register int res;
698 -#define MM_SLOTS_HASH_SHIFT 10
699 -#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT)
700 -static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS];
701 + __asm__ __volatile__
703 + "testq %q3,%q3\n\t"
704 + "repe; cmpsq\n\t"
705 + "je 1f\n\t"
706 + "sbbq %q0,%q0\n\t"
707 + "orq $1,%q0\n"
708 + "1:"
709 + : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
710 + : "0" (0)
711 + : "cc");
713 -static struct mm_slot ksm_mm_head = {
714 - .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
716 -static struct ksm_scan ksm_scan = {
717 - .mm_slot = &ksm_mm_head,
719 + return res;
722 +static int is_full_zero(const void *s1, size_t len)
724 + unsigned char same;
726 + len /= 8;
728 + __asm__ __volatile__
729 + ("repe; scasq;"
730 + "sete %0"
731 + : "=qm" (same), "+D" (s1), "+c" (len)
732 + : "a" (0)
733 + : "cc");
735 + return same;
738 +#endif
739 +#else
740 +static int is_full_zero(const void *s1, size_t len)
742 + unsigned long *src = s1;
743 + int i;
745 + len /= sizeof(*src);
747 + for (i = 0; i < len; i++) {
748 + if (src[i])
749 + return 0;
752 + return 1;
754 +#endif
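/*
 * [Editor's illustrative usage sketch, not part of the patch] How a caller
 * might use the is_full_zero() helper defined above on a page-sized buffer.
 * In the kernel the page data would first be mapped (e.g. via kmap_atomic());
 * this stand-alone stand-in just uses a static buffer.
 */
#include <assert.h>
#include <string.h>

static void demo_is_full_zero(void)
{
	static unsigned long buf[4096 / sizeof(unsigned long)];

	memset(buf, 0, sizeof(buf));
	assert(is_full_zero(buf, sizeof(buf)));		/* all zeros -> true  */

	buf[17] = 1UL;
	assert(!is_full_zero(buf, sizeof(buf)));	/* one set word -> false */
}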
756 +#define U64_MAX (~((u64)0))
760 + * Flags for rmap_item to indicate whether it's listed in the stable/unstable tree.
761 + * The flags use the low bits of rmap_item.address
762 + */
763 +#define UNSTABLE_FLAG 0x1
764 +#define STABLE_FLAG 0x2
765 +#define get_rmap_addr(x) ((x)->address & PAGE_MASK)
768 + * rmap_list_entry helpers
769 + */
770 +#define IS_ADDR_FLAG 1
771 +#define is_addr(ptr) ((unsigned long)(ptr) & IS_ADDR_FLAG)
772 +#define set_is_addr(ptr) ((ptr) |= IS_ADDR_FLAG)
773 +#define get_clean_addr(ptr) (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG))
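/*
 * [Editor's illustrative sketch, not part of the patch] What the IS_ADDR_FLAG
 * helpers above encode: a single word in a rmap_list_entry holds either a
 * 4-byte-aligned struct rmap_item pointer, whose low bit is naturally clear,
 * or a page-aligned address tagged with the low bit set. The demo value is
 * arbitrary.
 */
static void demo_is_addr_tagging(void)
{
	unsigned long entry = 0x1000UL;		/* page-aligned address, low bit clear */

	set_is_addr(entry);			/* tag it as an address          */
	BUG_ON(!is_addr(entry));		/* the tag must now be visible   */
	BUG_ON(get_clean_addr(entry) != 0x1000UL);	/* stripping the tag restores it */
}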
777 + * High speed caches for frequently allocated and freed structs
778 + */
779 static struct kmem_cache *rmap_item_cache;
780 static struct kmem_cache *stable_node_cache;
781 -static struct kmem_cache *mm_slot_cache;
782 +static struct kmem_cache *node_vma_cache;
783 +static struct kmem_cache *vma_slot_cache;
784 +static struct kmem_cache *tree_node_cache;
785 +#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
786 + sizeof(struct __struct), __alignof__(struct __struct),\
787 + (__flags), NULL)
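/*
 * [Editor's illustrative sketch, not part of the patch] How the caches
 * declared above would typically be created with KSM_KMEM_CACHE(), mirroring
 * the ksm_slab_init() pattern that this patch removes later in the file.
 * Error unwinding is abbreviated; this is not the patch's actual init code.
 */
static int __init demo_ksm_slab_init(void)
{
	rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
	stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
	node_vma_cache = KSM_KMEM_CACHE(node_vma, 0);
	vma_slot_cache = KSM_KMEM_CACHE(vma_slot, 0);
	tree_node_cache = KSM_KMEM_CACHE(tree_node, 0);

	if (!rmap_item_cache || !stable_node_cache || !node_vma_cache ||
	    !vma_slot_cache || !tree_node_cache)
		return -ENOMEM;	/* real code would destroy the caches it did create */
	return 0;
}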
789 +/* The scan round ksmd is currently in */
790 +static unsigned long long ksm_scan_round = 1;
792 +/* The number of pages that have been scanned since startup */
793 +static u64 ksm_pages_scanned;
795 +/* The number of pages that had been scanned when the last scan round finished */
796 +static u64 ksm_pages_scanned_last;
798 +/* If the scanned number grows too large, we encode it here */
799 +static u64 pages_scanned_stored;
800 +static unsigned long pages_scanned_base;
802 /* The number of nodes in the stable tree */
803 static unsigned long ksm_pages_shared;
804 @@ -179,345 +246,403 @@
805 /* The number of nodes in the unstable tree */
806 static unsigned long ksm_pages_unshared;
808 -/* The number of rmap_items in use: to calculate pages_volatile */
809 -static unsigned long ksm_rmap_items;
811 -/* Number of pages ksmd should scan in one batch */
812 -static unsigned int ksm_thread_pages_to_scan = 100;
814 + * Number of pages ksmd should scan in one batch. This is the top speed for
815 + * richly duplicated areas.
816 + */
817 +static unsigned long ksm_scan_batch_pages = 60000;
819 /* Milliseconds ksmd should sleep between batches */
820 -static unsigned int ksm_thread_sleep_millisecs = 20;
821 +static unsigned int ksm_sleep_jiffies = 2;
824 + * The threshold used to filter out thrashing areas.
825 + * If it is 0, filtering is disabled; otherwise it is the percentage upper bound
826 + * of the thrashing ratio of all areas. Any area with a bigger thrashing ratio
827 + * is considered to have a zero duplication ratio.
828 + */
829 +static unsigned int ksm_thrash_threshold = 50;
831 +/* To avoid floating point arithmetic, this is the scale of a
832 + * deduplication ratio number.
833 + */
834 +#define KSM_DEDUP_RATIO_SCALE 100
837 +#define KSM_SCAN_RATIO_MAX 125
839 +/* minimum scan ratio for a vma, in unit of 1/KSM_SCAN_RATIO_MAX */
840 +static unsigned int ksm_min_scan_ratio = 1;
843 + * After each scan round, the scan ratio of an area with a big deduplication
844 + * ratio is upgraded by *=ksm_scan_ratio_delta
845 + */
846 +static unsigned int ksm_scan_ratio_delta = 5;
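/*
 * [Editor's illustrative sketch, not part of the patch] The upgrade rule the
 * comment above describes, using the constants defined earlier. Clamping to
 * KSM_SCAN_RATIO_MAX is an assumption made for the illustration.
 */
static unsigned int demo_upgrade_scan_ratio(unsigned int ratio)
{
	ratio *= ksm_scan_ratio_delta;		/* "upgraded by *= ksm_scan_ratio_delta" */
	if (ratio > KSM_SCAN_RATIO_MAX)
		ratio = KSM_SCAN_RATIO_MAX;	/* assumed clamp to the maximum ratio */
	return ratio;
}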
849 + * Inter-vma duplication number table page pointer array, initialized at
850 + * startup. Whenever ksmd finds that two areas have an identical page,
851 + * their corresponding table entry is increased. After each scan round
852 + * is finished, this table is scanned to calculate the estimated
853 + * duplication ratio for VMAs. A limited number (2048) of VMAs is
854 + * supported for now. We will migrate it to more scalable data structures
855 + * in the future.
856 + */
857 +#define KSM_DUP_VMA_MAX 2048
859 +#define INDIRECT_OFFSET 1
862 + * For mapping of vma_slot and its index in inter-vma duplication number
863 + * table
864 + */
865 +static struct radix_tree_root ksm_vma_tree;
866 +static unsigned long ksm_vma_tree_num;
867 +static unsigned long ksm_vma_tree_index_end;
869 +/* Array of all scan_rung, ksm_scan_ladder[0] having the minimum scan ratio */
870 +static struct scan_rung *ksm_scan_ladder;
871 +static unsigned int ksm_scan_ladder_size;
873 +/* The number of VMAs we are keeping track of */
874 +static unsigned long ksm_vma_slot_num;
876 +/* How many times ksmd has slept since startup */
877 +static u64 ksm_sleep_times;
879 #define KSM_RUN_STOP 0
880 #define KSM_RUN_MERGE 1
881 -#define KSM_RUN_UNMERGE 2
882 -static unsigned int ksm_run = KSM_RUN_STOP;
883 +static unsigned int ksm_run = KSM_RUN_MERGE;
885 static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
886 static DEFINE_MUTEX(ksm_thread_mutex);
887 -static DEFINE_SPINLOCK(ksm_mmlist_lock);
889 -#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
890 - sizeof(struct __struct), __alignof__(struct __struct),\
891 - (__flags), NULL)
893 + * List vma_slot_new is for newly created vma_slot waiting to be added by
894 + * ksmd. If one cannot be added (e.g. because it is too small), it is moved to
895 + * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding
896 + * VMA has been removed/freed.
897 + */
898 +struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new);
899 +struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd);
900 +struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del);
901 +static DEFINE_SPINLOCK(vma_slot_list_lock);
903 -static int __init ksm_slab_init(void)
904 +/* The unstable tree heads */
905 +static struct rb_root root_unstable_tree = RB_ROOT;
908 + * All tree_nodes are in a list to be freed at once when the unstable tree is
909 + * freed after each scan round.
910 + */
911 +static struct list_head unstable_tree_node_list =
912 + LIST_HEAD_INIT(unstable_tree_node_list);
914 +/* List contains all stable nodes */
915 +static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list);
918 + * When the hash strength is changed, the stable tree must be delta_hashed and
919 + * re-structured. We use two sets of the structs below to speed up the
920 + * re-structuring of the stable tree.
921 + */
922 +static struct list_head
923 +stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]),
924 + LIST_HEAD_INIT(stable_tree_node_list[1])};
926 +static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0];
927 +static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT};
928 +static struct rb_root *root_stable_treep = &root_stable_tree[0];
929 +static unsigned long stable_tree_index;
931 +/* The hash strength needed to hash a full page */
932 +#define HASH_STRENGTH_FULL (PAGE_SIZE / sizeof(u32))
934 +/* The hash strength needed for loop-back hashing */
935 +#define HASH_STRENGTH_MAX (HASH_STRENGTH_FULL + 10)
937 +/* The random offsets in a page */
938 +static u32 *random_nums;
940 +/* The hash strength */
941 +static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4;
943 +/* The delta value each time the hash strength increases or decreases */
944 +static unsigned long hash_strength_delta;
945 +#define HASH_STRENGTH_DELTA_MAX 5
947 +/* The time we have saved due to random_sample_hash */
948 +static u64 rshash_pos;
950 +/* The time we have wasted due to hash collision */
951 +static u64 rshash_neg;
953 +struct ksm_benefit {
954 + u64 pos;
955 + u64 neg;
956 + u64 scanned;
957 + unsigned long base;
958 +} benefit;
961 + * The relative cost of memcmp, compared to 1 time unit of random sample
962 + * hash. This value is determined when the ksm module is initialized.
963 + */
964 +static unsigned long memcmp_cost;
966 +static unsigned long rshash_neg_cont_zero;
967 +static unsigned long rshash_cont_obscure;
969 +/* The possible states of hash strength adjustment heuristic */
970 +enum rshash_states {
971 + RSHASH_STILL,
972 + RSHASH_TRYUP,
973 + RSHASH_TRYDOWN,
974 + RSHASH_NEW,
975 + RSHASH_PRE_STILL,
978 +/* The possible direction we are about to adjust hash strength */
979 +enum rshash_direct {
980 + GO_UP,
981 + GO_DOWN,
982 + OBSCURE,
983 + STILL,
986 +/* random sampling hash state machine */
987 +static struct {
988 + enum rshash_states state;
989 + enum rshash_direct pre_direct;
990 + u8 below_count;
991 +	/* Keep a lookup window of size 5; if above_count/below_count > 3
992 +	 * within this window, we stop trying.
993 + */
994 + u8 lookup_window_index;
995 + u64 stable_benefit;
996 + unsigned long turn_point_down;
997 + unsigned long turn_benefit_down;
998 + unsigned long turn_point_up;
999 + unsigned long turn_benefit_up;
1000 + unsigned long stable_point;
1001 +} rshash_state;
1003 +/* zero page hash table, for hash_strength in [0, HASH_STRENGTH_MAX] */
1004 +static u32 *zero_hash_table;
1006 +static inline struct node_vma *alloc_node_vma(void)
1008 - rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
1009 - if (!rmap_item_cache)
1010 - goto out;
1011 + struct node_vma *node_vma;
1012 + node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL);
1013 + if (node_vma) {
1014 + INIT_HLIST_HEAD(&node_vma->rmap_hlist);
1015 + INIT_HLIST_NODE(&node_vma->hlist);
1016 + node_vma->last_update = 0;
1018 + return node_vma;
1021 - stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
1022 - if (!stable_node_cache)
1023 - goto out_free1;
1024 +static inline void free_node_vma(struct node_vma *node_vma)
1026 + kmem_cache_free(node_vma_cache, node_vma);
1029 - mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
1030 - if (!mm_slot_cache)
1031 - goto out_free2;
1033 - return 0;
1034 +static inline struct vma_slot *alloc_vma_slot(void)
1036 + struct vma_slot *slot;
1038 -out_free2:
1039 - kmem_cache_destroy(stable_node_cache);
1040 -out_free1:
1041 - kmem_cache_destroy(rmap_item_cache);
1042 -out:
1043 - return -ENOMEM;
1044 + /*
1045 +	 * In case ksm has not been initialized yet.
1046 +	 * We need to reconsider the call site of ksm_init() in the future.
1047 + */
1048 + if (!vma_slot_cache)
1049 + return NULL;
1051 + slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL);
1052 + if (slot) {
1053 + INIT_LIST_HEAD(&slot->ksm_list);
1054 + INIT_LIST_HEAD(&slot->slot_list);
1055 + INIT_RADIX_TREE(&slot->dup_tree, GFP_KERNEL);
1056 + slot->ksm_index = -1;
1057 + slot->need_rerand = 1;
1059 + return slot;
1062 -static void __init ksm_slab_free(void)
1063 +static inline void free_vma_slot(struct vma_slot *vma_slot)
1065 - kmem_cache_destroy(mm_slot_cache);
1066 - kmem_cache_destroy(stable_node_cache);
1067 - kmem_cache_destroy(rmap_item_cache);
1068 - mm_slot_cache = NULL;
1069 + kmem_cache_free(vma_slot_cache, vma_slot);
1074 static inline struct rmap_item *alloc_rmap_item(void)
1076 struct rmap_item *rmap_item;
1078 rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
1079 - if (rmap_item)
1080 - ksm_rmap_items++;
1081 + if (rmap_item) {
1082 +		/* BUG if the lowest bit is not clear; it is reserved for flag use */
1083 + BUG_ON(is_addr(rmap_item));
1085 return rmap_item;
1088 static inline void free_rmap_item(struct rmap_item *rmap_item)
1090 - ksm_rmap_items--;
1091 - rmap_item->mm = NULL; /* debug safety */
1092 + rmap_item->slot = NULL; /* debug safety */
1093 kmem_cache_free(rmap_item_cache, rmap_item);
1096 static inline struct stable_node *alloc_stable_node(void)
1098 - return kmem_cache_alloc(stable_node_cache, GFP_KERNEL);
1099 + struct stable_node *node;
1100 + node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | GFP_ATOMIC);
1101 + if (!node)
1102 + return NULL;
1104 + INIT_HLIST_HEAD(&node->hlist);
1105 + list_add(&node->all_list, &stable_node_list);
1106 + return node;
1109 static inline void free_stable_node(struct stable_node *stable_node)
1111 + list_del(&stable_node->all_list);
1112 kmem_cache_free(stable_node_cache, stable_node);
1115 -static inline struct mm_slot *alloc_mm_slot(void)
1116 +static inline struct tree_node *alloc_tree_node(struct list_head *list)
1118 - if (!mm_slot_cache) /* initialization failed */
1119 + struct tree_node *node;
1120 + node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | GFP_ATOMIC);
1121 + if (!node)
1122 return NULL;
1123 - return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
1125 + list_add(&node->all_list, list);
1126 + return node;
1129 -static inline void free_mm_slot(struct mm_slot *mm_slot)
1130 +static inline void free_tree_node(struct tree_node *node)
1132 - kmem_cache_free(mm_slot_cache, mm_slot);
1133 + list_del(&node->all_list);
1134 + kmem_cache_free(tree_node_cache, node);
1137 -static struct mm_slot *get_mm_slot(struct mm_struct *mm)
1138 +static void ksm_drop_anon_vma(struct rmap_item *rmap_item)
1140 - struct mm_slot *mm_slot;
1141 - struct hlist_head *bucket;
1142 - struct hlist_node *node;
1143 + struct anon_vma *anon_vma = rmap_item->anon_vma;
1145 - bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
1146 - hlist_for_each_entry(mm_slot, node, bucket, link) {
1147 - if (mm == mm_slot->mm)
1148 - return mm_slot;
1150 - return NULL;
1151 + drop_anon_vma(anon_vma);
1154 -static void insert_to_mm_slots_hash(struct mm_struct *mm,
1155 - struct mm_slot *mm_slot)
1157 +/**
1158 + * Remove a stable node from the stable tree; it may be unlinked from its tree_node,
1159 + * and its parent tree_node may be removed if no other stable node is pending.
1161 + * @stable_node The node to be removed
1162 + * @unlink_rb Will this node be unlinked from the rbtree?
1163 + * @remove_tree_node Will its tree_node be removed if empty?
1164 + */
1165 +static void remove_node_from_stable_tree(struct stable_node *stable_node,
1166 + int unlink_rb, int remove_tree_node)
1168 - struct hlist_head *bucket;
1169 + struct node_vma *node_vma;
1170 + struct rmap_item *rmap_item;
1171 + struct hlist_node *hlist, *rmap_hlist, *n;
1173 - bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
1174 - mm_slot->mm = mm;
1175 - hlist_add_head(&mm_slot->link, bucket);
1177 + if (!hlist_empty(&stable_node->hlist)) {
1178 + hlist_for_each_entry_safe(node_vma, hlist, n,
1179 + &stable_node->hlist, hlist) {
1180 + hlist_for_each_entry(rmap_item, rmap_hlist,
1181 + &node_vma->rmap_hlist, hlist) {
1182 + ksm_pages_sharing--;
1184 -static inline int in_stable_tree(struct rmap_item *rmap_item)
1186 - return rmap_item->address & STABLE_FLAG;
1188 + ksm_drop_anon_vma(rmap_item);
1189 + rmap_item->address &= PAGE_MASK;
1191 + free_node_vma(node_vma);
1192 + cond_resched();
1195 -static void hold_anon_vma(struct rmap_item *rmap_item,
1196 - struct anon_vma *anon_vma)
1198 - rmap_item->anon_vma = anon_vma;
1199 - get_anon_vma(anon_vma);
1201 + /* the last one is counted as shared */
1202 + ksm_pages_shared--;
1203 + ksm_pages_sharing++;
1206 -static void ksm_drop_anon_vma(struct rmap_item *rmap_item)
1208 - struct anon_vma *anon_vma = rmap_item->anon_vma;
1209 + if (stable_node->tree_node && unlink_rb) {
1210 + rb_erase(&stable_node->node,
1211 + &stable_node->tree_node->sub_root);
1213 + if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) &&
1214 + remove_tree_node) {
1215 + rb_erase(&stable_node->tree_node->node,
1216 + root_stable_treep);
1217 + free_tree_node(stable_node->tree_node);
1218 + } else {
1219 + stable_node->tree_node->count--;
1223 - drop_anon_vma(anon_vma);
1224 + free_stable_node(stable_node);
1228 - * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
1229 - * page tables after it has passed through ksm_exit() - which, if necessary,
1230 - * takes mmap_sem briefly to serialize against them. ksm_exit() does not set
1231 - * a special flag: they can just back out as soon as mm_users goes to zero.
1232 - * ksm_test_exit() is used throughout to make this test for exit: in some
1233 - * places for correctness, in some places just to avoid unnecessary work.
1234 - */
1235 -static inline bool ksm_test_exit(struct mm_struct *mm)
1237 - return atomic_read(&mm->mm_users) == 0;
1241 - * We use break_ksm to break COW on a ksm page: it's a stripped down
1242 + * get_ksm_page: checks if the page indicated by the stable node
1243 + * is still its ksm page, despite having held no reference to it.
1244 + * In which case we can trust the content of the page, and it
1245 + * returns the gotten page; but if the page has now been zapped,
1246 + * remove the stale node from the stable tree and return NULL.
1248 - * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
1249 - * put_page(page);
1250 + * You would expect the stable_node to hold a reference to the ksm page.
1251 + * But if it increments the page's count, swapping out has to wait for
1252 + * ksmd to come around again before it can free the page, which may take
1253 + * seconds or even minutes: much too unresponsive. So instead we use a
1254 + * "keyhole reference": access to the ksm page from the stable node peeps
1255 + * out through its keyhole to see if that page still holds the right key,
1256 + * pointing back to this stable node. This relies on freeing a PageAnon
1257 + * page to reset its page->mapping to NULL, and relies on no other use of
1258 + * a page to put something that might look like our key in page->mapping.
1260 - * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
1261 - * in case the application has unmapped and remapped mm,addr meanwhile.
1262 - * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
1263 - * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
1264 + * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
1265 + * but this is different - made simpler by ksm_thread_mutex being held, but
1266 + * interesting for assuming that no other use of the struct page could ever
1267 + * put our expected_mapping into page->mapping (or a field of the union which
1268 + * coincides with page->mapping). The RCU calls are not for KSM at all, but
1269 + * to keep the page_count protocol described with page_cache_get_speculative.
1271 + * Note: it is possible that get_ksm_page() will return NULL one moment,
1272 + * then page the next, if the page is in between page_freeze_refs() and
1273 + * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
1274 + * is on its way to being freed; but it is an anomaly to bear in mind.
1276 + * @unlink_rb: whether the removal of this node will first unlink it from
1277 + * its rbtree. stable_node_reinsert will prevent this when restructuring the
1278 + * node from its old tree.
1280 + * @remove_tree_node: if this is the last one in its tree_node, will the
1281 + * tree_node be freed? If we are inserting a stable node, this tree_node may
1282 + * be reused, so don't free it.
1284 -static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
1285 +static struct page *get_ksm_page(struct stable_node *stable_node,
1286 + int unlink_rb, int remove_tree_node)
1288 struct page *page;
1289 - int ret = 0;
1290 + void *expected_mapping;
1292 - do {
1293 - cond_resched();
1294 - page = follow_page(vma, addr, FOLL_GET);
1295 - if (IS_ERR_OR_NULL(page))
1296 - break;
1297 - if (PageKsm(page))
1298 - ret = handle_mm_fault(vma->vm_mm, vma, addr,
1299 - FAULT_FLAG_WRITE);
1300 - else
1301 - ret = VM_FAULT_WRITE;
1302 - put_page(page);
1303 - } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
1304 - /*
1305 - * We must loop because handle_mm_fault() may back out if there's
1306 - * any difficulty e.g. if pte accessed bit gets updated concurrently.
1308 - * VM_FAULT_WRITE is what we have been hoping for: it indicates that
1309 - * COW has been broken, even if the vma does not permit VM_WRITE;
1310 - * but note that a concurrent fault might break PageKsm for us.
1312 - * VM_FAULT_SIGBUS could occur if we race with truncation of the
1313 - * backing file, which also invalidates anonymous pages: that's
1314 - * okay, that truncation will have unmapped the PageKsm for us.
1316 - * VM_FAULT_OOM: at the time of writing (late July 2009), setting
1317 - * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
1318 - * current task has TIF_MEMDIE set, and will be OOM killed on return
1319 - * to user; and ksmd, having no mm, would never be chosen for that.
1321 - * But if the mm is in a limited mem_cgroup, then the fault may fail
1322 - * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
1323 - * even ksmd can fail in this way - though it's usually breaking ksm
1324 - * just to undo a merge it made a moment before, so unlikely to oom.
1326 - * That's a pity: we might therefore have more kernel pages allocated
1327 - * than we're counting as nodes in the stable tree; but ksm_do_scan
1328 - * will retry to break_cow on each pass, so should recover the page
1329 - * in due course. The important thing is to not let VM_MERGEABLE
1330 - * be cleared while any such pages might remain in the area.
1331 - */
1332 - return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
1335 -static void break_cow(struct rmap_item *rmap_item)
1337 - struct mm_struct *mm = rmap_item->mm;
1338 - unsigned long addr = rmap_item->address;
1339 - struct vm_area_struct *vma;
1341 - /*
1342 - * It is not an accident that whenever we want to break COW
1343 - * to undo, we also need to drop a reference to the anon_vma.
1344 - */
1345 - ksm_drop_anon_vma(rmap_item);
1347 - down_read(&mm->mmap_sem);
1348 - if (ksm_test_exit(mm))
1349 - goto out;
1350 - vma = find_vma(mm, addr);
1351 - if (!vma || vma->vm_start > addr)
1352 - goto out;
1353 - if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
1354 - goto out;
1355 - break_ksm(vma, addr);
1356 -out:
1357 - up_read(&mm->mmap_sem);
1360 -static struct page *page_trans_compound_anon(struct page *page)
1362 - if (PageTransCompound(page)) {
1363 - struct page *head = compound_trans_head(page);
1364 - /*
1365 - * head may actually be splitted and freed from under
1366 - * us but it's ok here.
1367 - */
1368 - if (PageAnon(head))
1369 - return head;
1371 - return NULL;
1374 -static struct page *get_mergeable_page(struct rmap_item *rmap_item)
1376 - struct mm_struct *mm = rmap_item->mm;
1377 - unsigned long addr = rmap_item->address;
1378 - struct vm_area_struct *vma;
1379 - struct page *page;
1381 - down_read(&mm->mmap_sem);
1382 - if (ksm_test_exit(mm))
1383 - goto out;
1384 - vma = find_vma(mm, addr);
1385 - if (!vma || vma->vm_start > addr)
1386 - goto out;
1387 - if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
1388 - goto out;
1390 - page = follow_page(vma, addr, FOLL_GET);
1391 - if (IS_ERR_OR_NULL(page))
1392 - goto out;
1393 - if (PageAnon(page) || page_trans_compound_anon(page)) {
1394 - flush_anon_page(vma, page, addr);
1395 - flush_dcache_page(page);
1396 - } else {
1397 - put_page(page);
1398 -out: page = NULL;
1400 - up_read(&mm->mmap_sem);
1401 - return page;
1404 -static void remove_node_from_stable_tree(struct stable_node *stable_node)
1406 - struct rmap_item *rmap_item;
1407 - struct hlist_node *hlist;
1409 - hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
1410 - if (rmap_item->hlist.next)
1411 - ksm_pages_sharing--;
1412 - else
1413 - ksm_pages_shared--;
1414 - ksm_drop_anon_vma(rmap_item);
1415 - rmap_item->address &= PAGE_MASK;
1416 - cond_resched();
1419 - rb_erase(&stable_node->node, &root_stable_tree);
1420 - free_stable_node(stable_node);
1424 - * get_ksm_page: checks if the page indicated by the stable node
1425 - * is still its ksm page, despite having held no reference to it.
1426 - * In which case we can trust the content of the page, and it
1427 - * returns the gotten page; but if the page has now been zapped,
1428 - * remove the stale node from the stable tree and return NULL.
1430 - * You would expect the stable_node to hold a reference to the ksm page.
1431 - * But if it increments the page's count, swapping out has to wait for
1432 - * ksmd to come around again before it can free the page, which may take
1433 - * seconds or even minutes: much too unresponsive. So instead we use a
1434 - * "keyhole reference": access to the ksm page from the stable node peeps
1435 - * out through its keyhole to see if that page still holds the right key,
1436 - * pointing back to this stable node. This relies on freeing a PageAnon
1437 - * page to reset its page->mapping to NULL, and relies on no other use of
1438 - * a page to put something that might look like our key in page->mapping.
1440 - * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
1441 - * but this is different - made simpler by ksm_thread_mutex being held, but
1442 - * interesting for assuming that no other use of the struct page could ever
1443 - * put our expected_mapping into page->mapping (or a field of the union which
1444 - * coincides with page->mapping). The RCU calls are not for KSM at all, but
1445 - * to keep the page_count protocol described with page_cache_get_speculative.
1447 - * Note: it is possible that get_ksm_page() will return NULL one moment,
1448 - * then page the next, if the page is in between page_freeze_refs() and
1449 - * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
1450 - * is on its way to being freed; but it is an anomaly to bear in mind.
1451 - */
1452 -static struct page *get_ksm_page(struct stable_node *stable_node)
1454 - struct page *page;
1455 - void *expected_mapping;
1457 - page = pfn_to_page(stable_node->kpfn);
1458 - expected_mapping = (void *)stable_node +
1459 - (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
1460 - rcu_read_lock();
1461 - if (page->mapping != expected_mapping)
1462 - goto stale;
1463 - if (!get_page_unless_zero(page))
1464 - goto stale;
1465 - if (page->mapping != expected_mapping) {
1466 + page = pfn_to_page(stable_node->kpfn);
1467 + expected_mapping = (void *)stable_node +
1468 + (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
1469 + rcu_read_lock();
1470 + if (page->mapping != expected_mapping)
1471 + goto stale;
1472 + if (!get_page_unless_zero(page))
1473 + goto stale;
1474 + if (page->mapping != expected_mapping) {
1475 put_page(page);
1476 goto stale;
1478 @@ -525,7 +650,8 @@
1479 return page;
1480 stale:
1481 rcu_read_unlock();
1482 - remove_node_from_stable_tree(stable_node);
1483 + remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node);
1485 return NULL;
1488 @@ -533,32 +659,46 @@
1489 * Removing rmap_item from stable or unstable tree.
1490 * This function will clean the information from the stable/unstable tree.
1492 -static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
1493 +static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
1495 if (rmap_item->address & STABLE_FLAG) {
1496 struct stable_node *stable_node;
1497 + struct node_vma *node_vma;
1498 struct page *page;
1500 - stable_node = rmap_item->head;
1501 - page = get_ksm_page(stable_node);
1502 + node_vma = rmap_item->head;
1503 + stable_node = node_vma->head;
1504 + page = get_ksm_page(stable_node, 1, 1);
1505 if (!page)
1506 goto out;
1508 + /*
1509 + * page lock is needed because it's racing with
1510 + * try_to_unmap_ksm(), etc.
1511 + */
1512 lock_page(page);
1513 hlist_del(&rmap_item->hlist);
1515 + if (hlist_empty(&node_vma->rmap_hlist)) {
1516 + hlist_del(&node_vma->hlist);
1517 + free_node_vma(node_vma);
1519 unlock_page(page);
1520 - put_page(page);
1522 - if (stable_node->hlist.first)
1523 - ksm_pages_sharing--;
1524 - else
1525 + put_page(page);
1526 + if (hlist_empty(&stable_node->hlist)) {
1527 + /* do NOT call remove_node_from_stable_tree() here,
1528 +			 * it's possible for a forked rmap_item to not be in the
1529 +			 * stable tree while the in-tree rmap_items have been
1530 +			 * deleted.
1531 + */
1532 ksm_pages_shared--;
1533 + } else
1534 + ksm_pages_sharing--;
1536 - ksm_drop_anon_vma(rmap_item);
1537 - rmap_item->address &= PAGE_MASK;
1539 + ksm_drop_anon_vma(rmap_item);
1540 } else if (rmap_item->address & UNSTABLE_FLAG) {
1541 - unsigned char age;
1543 * Usually ksmd can and must skip the rb_erase, because
1544 * root_unstable_tree was already reset to RB_ROOT.
1545 @@ -566,169 +706,454 @@
1546 * if this rmap_item was inserted by this scan, rather
1547 * than left over from before.
1549 - age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
1550 - BUG_ON(age > 1);
1551 - if (!age)
1552 - rb_erase(&rmap_item->node, &root_unstable_tree);
1554 + if (rmap_item->append_round == ksm_scan_round) {
1555 + rb_erase(&rmap_item->node,
1556 + &rmap_item->tree_node->sub_root);
1557 + if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) {
1558 + rb_erase(&rmap_item->tree_node->node,
1559 + &root_unstable_tree);
1561 + free_tree_node(rmap_item->tree_node);
1562 + } else
1563 + rmap_item->tree_node->count--;
1565 ksm_pages_unshared--;
1566 - rmap_item->address &= PAGE_MASK;
1569 + rmap_item->address &= PAGE_MASK;
1570 + rmap_item->hash_max = 0;
1572 out:
1573 cond_resched(); /* we're called from many long loops */
1576 -static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
1577 - struct rmap_item **rmap_list)
1578 +/**
1579 + * Need to do two things:
1580 + * 1. check if slot was moved to del list
1581 + * 2. make sure the mmap_sem is manipulated under valid vma.
1583 + * My concern here is that in some cases, this may make
1584 + * vma_slot_list_lock() waiters to serialized further by some
1585 + * sem->wait_lock, can this really be expensive?
1588 + * @return
1589 + * 0: if successfully locked mmap_sem
1590 + * -ENOENT: this slot was moved to del list
1591 + * -EBUSY: vma lock failed
1592 + */
1593 +static int try_down_read_slot_mmap_sem(struct vma_slot *slot)
1595 - while (*rmap_list) {
1596 - struct rmap_item *rmap_item = *rmap_list;
1597 - *rmap_list = rmap_item->rmap_list;
1598 - remove_rmap_item_from_tree(rmap_item);
1599 - free_rmap_item(rmap_item);
1600 + struct vm_area_struct *vma;
1601 + struct mm_struct *mm;
1602 + struct rw_semaphore *sem;
1604 + spin_lock(&vma_slot_list_lock);
1606 +	/* the slot_list was removed and re-initialized from the new list when it entered
1607 +	 * ksm_list. If it is not empty now, then it must have been moved to the del list
1608 + */
1609 + if (!list_empty(&slot->slot_list)) {
1610 + spin_unlock(&vma_slot_list_lock);
1611 + return -ENOENT;
1614 + BUG_ON(slot->pages != vma_pages(slot->vma));
1615 + /* Ok, vma still valid */
1616 + vma = slot->vma;
1617 + mm = vma->vm_mm;
1618 + sem = &mm->mmap_sem;
1619 + if (down_read_trylock(sem)) {
1620 + spin_unlock(&vma_slot_list_lock);
1621 + return 0;
1624 + spin_unlock(&vma_slot_list_lock);
1625 + return -EBUSY;
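The locking order above (validate the slot under the list spinlock, then trylock the heavier per-mm lock) can be sketched in plain userspace C. The pthread primitives and the struct layout below are illustrative stand-ins for the kernel objects, not the real API:

#include <errno.h>
#include <pthread.h>

struct slot {
	int on_del_list;             /* set once the slot is moved to the del list */
	pthread_mutex_t list_lock;   /* stands in for vma_slot_list_lock */
	pthread_rwlock_t mmap_sem;   /* stands in for mm->mmap_sem */
};

/* Mirror of the return convention above: 0, -ENOENT or -EBUSY. */
int try_lock_slot(struct slot *s)
{
	int ret = -EBUSY;

	pthread_mutex_lock(&s->list_lock);
	if (s->on_del_list)
		ret = -ENOENT;                                /* slot already deleted */
	else if (pthread_rwlock_tryrdlock(&s->mmap_sem) == 0)
		ret = 0;                                      /* read lock acquired */
	pthread_mutex_unlock(&s->list_lock);
	return ret;
}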
1629 - * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
1630 - * than check every pte of a given vma, the locking doesn't quite work for
1631 - * that - an rmap_item is assigned to the stable tree after inserting ksm
1632 - * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
1633 - * rmap_items from parent to child at fork time (so as not to waste time
1634 - * if exit comes before the next scan reaches it).
1636 - * Similarly, although we'd like to remove rmap_items (so updating counts
1637 - * and freeing memory) when unmerging an area, it's easier to leave that
1638 - * to the next pass of ksmd - consider, for example, how ksmd might be
1639 - * in cmp_and_merge_page on one of the rmap_items we would be removing.
1640 - */
1641 -static int unmerge_ksm_pages(struct vm_area_struct *vma,
1642 - unsigned long start, unsigned long end)
1643 +static inline unsigned long
1644 +vma_page_address(struct page *page, struct vm_area_struct *vma)
1646 - unsigned long addr;
1647 - int err = 0;
1648 + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1649 + unsigned long address;
1651 - for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
1652 - if (ksm_test_exit(vma->vm_mm))
1653 - break;
1654 - if (signal_pending(current))
1655 - err = -ERESTARTSYS;
1656 - else
1657 - err = break_ksm(vma, addr);
1658 + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
1659 + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
1660 + /* page should be within @vma mapping range */
1661 + return -EFAULT;
1663 - return err;
1664 + return address;
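For concreteness, the address arithmetic above with made-up numbers (a standalone sketch, not kernel code):

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	/* hypothetical vma: starts at 0x400000 and begins at file page 0x10 */
	unsigned long vm_start = 0x400000, vm_pgoff = 0x10;
	unsigned long page_index = 0x12;     /* page->index of the anon page */
	unsigned long addr = vm_start + ((page_index - vm_pgoff) << PAGE_SHIFT);

	printf("page 0x%lx maps at 0x%lx\n", page_index, addr);   /* 0x402000 */
	return 0;
}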
1667 -#ifdef CONFIG_SYSFS
1669 - * Only called through the sysfs control interface:
1670 + * Test if the mm is exiting
1672 -static int unmerge_and_remove_all_rmap_items(void)
1673 +static inline bool ksm_test_exit(struct mm_struct *mm)
1675 + return atomic_read(&mm->mm_users) == 0;
1678 +/* return 0 on success with the item's mmap_sem locked */
1679 +static inline int get_mergeable_page_lock_mmap(struct rmap_item *item)
1681 - struct mm_slot *mm_slot;
1682 struct mm_struct *mm;
1683 struct vm_area_struct *vma;
1684 - int err = 0;
1685 + struct vma_slot *slot = item->slot;
1686 + int err = -EINVAL;
1688 - spin_lock(&ksm_mmlist_lock);
1689 - ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
1690 - struct mm_slot, mm_list);
1691 - spin_unlock(&ksm_mmlist_lock);
1693 - for (mm_slot = ksm_scan.mm_slot;
1694 - mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
1695 - mm = mm_slot->mm;
1696 - down_read(&mm->mmap_sem);
1697 - for (vma = mm->mmap; vma; vma = vma->vm_next) {
1698 - if (ksm_test_exit(mm))
1699 - break;
1700 - if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
1701 - continue;
1702 - err = unmerge_ksm_pages(vma,
1703 - vma->vm_start, vma->vm_end);
1704 - if (err)
1705 - goto error;
1708 - remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
1710 - spin_lock(&ksm_mmlist_lock);
1711 - ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
1712 - struct mm_slot, mm_list);
1713 - if (ksm_test_exit(mm)) {
1714 - hlist_del(&mm_slot->link);
1715 - list_del(&mm_slot->mm_list);
1716 - spin_unlock(&ksm_mmlist_lock);
1718 - free_mm_slot(mm_slot);
1719 - clear_bit(MMF_VM_MERGEABLE, &mm->flags);
1720 - up_read(&mm->mmap_sem);
1721 - mmdrop(mm);
1722 - } else {
1723 - spin_unlock(&ksm_mmlist_lock);
1724 - up_read(&mm->mmap_sem);
1726 + struct page *page;
1728 + BUG_ON(!item->slot);
1729 + /*
1730 +	 * try_down_read_slot_mmap_sem() returns non-zero if the slot
1731 +	 * has been removed by ksm_remove_vma() or its mmap_sem cannot be locked.
1732 + */
1733 + if (try_down_read_slot_mmap_sem(slot))
1734 + return -EBUSY;
1736 + mm = slot->vma->vm_mm;
1737 + vma = slot->vma;
1739 + if (ksm_test_exit(mm))
1740 + goto failout_up;
1742 + page = item->page;
1743 + rcu_read_lock();
1744 + if (!get_page_unless_zero(page)) {
1745 + rcu_read_unlock();
1746 + goto failout_up;
1749 - ksm_scan.seqnr = 0;
1750 + /* No need to consider huge page here. */
1751 + if (item->slot->vma->anon_vma != page_anon_vma(page) ||
1752 + vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) {
1753 + /*
1754 + * TODO:
1755 + * should we release this item becase of its stale page
1756 + * mapping?
1757 + */
1758 + put_page(page);
1759 + rcu_read_unlock();
1760 + goto failout_up;
1762 + rcu_read_unlock();
1763 return 0;
1765 -error:
1766 +failout_up:
1767 up_read(&mm->mmap_sem);
1768 - spin_lock(&ksm_mmlist_lock);
1769 - ksm_scan.mm_slot = &ksm_mm_head;
1770 - spin_unlock(&ksm_mmlist_lock);
1771 return err;
1773 -#endif /* CONFIG_SYSFS */
1775 -static u32 calc_checksum(struct page *page)
1777 + * What kinds of VMA are considered?
1778 + */
1779 +static inline int vma_can_enter(struct vm_area_struct *vma)
1781 - u32 checksum;
1782 - void *addr = kmap_atomic(page, KM_USER0);
1783 - checksum = jhash2(addr, PAGE_SIZE / 4, 17);
1784 - kunmap_atomic(addr, KM_USER0);
1785 - return checksum;
1786 + return !(vma->vm_flags & (VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1787 + VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
1788 + VM_NONLINEAR | VM_MIXEDMAP | VM_SAO |
1789 + VM_SHARED | VM_MAYSHARE | VM_GROWSUP
1790 + | VM_GROWSDOWN));
1793 -static int memcmp_pages(struct page *page1, struct page *page2)
1795 + * Called whenever a fresh new vma is created. A new vma_slot
1796 + * is created and inserted into a global list. Must be called
1797 + * after the vma is inserted into its mm.
1798 + */
1799 +inline void ksm_vma_add_new(struct vm_area_struct *vma)
1801 - char *addr1, *addr2;
1802 - int ret;
1803 + struct vma_slot *slot;
1805 - addr1 = kmap_atomic(page1, KM_USER0);
1806 - addr2 = kmap_atomic(page2, KM_USER1);
1807 - ret = memcmp(addr1, addr2, PAGE_SIZE);
1808 - kunmap_atomic(addr2, KM_USER1);
1809 - kunmap_atomic(addr1, KM_USER0);
1810 - return ret;
1812 + if (!vma_can_enter(vma)) {
1813 + vma->ksm_vma_slot = NULL;
1814 + return;
1817 -static inline int pages_identical(struct page *page1, struct page *page2)
1819 - return !memcmp_pages(page1, page2);
1820 + slot = alloc_vma_slot();
1821 + if (!slot) {
1822 + vma->ksm_vma_slot = NULL;
1823 + return;
1826 + vma->ksm_vma_slot = slot;
1827 + slot->vma = vma;
1828 + slot->mm = vma->vm_mm;
1829 + slot->ctime_j = jiffies;
1830 + slot->pages = vma_pages(vma);
1831 + spin_lock(&vma_slot_list_lock);
1832 + list_add_tail(&slot->slot_list, &vma_slot_new);
1833 + spin_unlock(&vma_slot_list_lock);
1836 -static int write_protect_page(struct vm_area_struct *vma, struct page *page,
1837 - pte_t *orig_pte)
1839 + * Called after vma is unlinked from its mm
1840 + */
1841 +void ksm_remove_vma(struct vm_area_struct *vma)
1843 - struct mm_struct *mm = vma->vm_mm;
1844 - unsigned long addr;
1845 - pte_t *ptep;
1846 - spinlock_t *ptl;
1847 - int swapped;
1848 - int err = -EFAULT;
1849 + struct vma_slot *slot;
1851 - addr = page_address_in_vma(page, vma);
1852 - if (addr == -EFAULT)
1853 - goto out;
1854 + if (!vma->ksm_vma_slot)
1855 + return;
1857 - BUG_ON(PageTransCompound(page));
1858 - ptep = page_check_address(page, mm, addr, &ptl, 0);
1859 - if (!ptep)
1860 - goto out;
1861 + slot = vma->ksm_vma_slot;
1862 + spin_lock(&vma_slot_list_lock);
1863 + if (list_empty(&slot->slot_list)) {
1864 + /**
1865 +		 * This slot has been added by ksmd, so move it to the del list
1866 +		 * to wait for ksmd to free it.
1867 + */
1868 + list_add_tail(&slot->slot_list, &vma_slot_del);
1869 + } else {
1870 + /**
1871 + * It's still on new list. It's ok to free slot directly.
1872 + */
1873 + list_del(&slot->slot_list);
1874 + free_vma_slot(slot);
1876 + spin_unlock(&vma_slot_list_lock);
1877 + vma->ksm_vma_slot = NULL;
1880 - if (pte_write(*ptep) || pte_dirty(*ptep)) {
1881 +/* 32/3 < they < 32/2 */
1882 +#define shiftl 8
1883 +#define shiftr 12
1885 +#define HASH_FROM_TO(from, to) \
1886 +for (index = from; index < to; index++) { \
1887 + pos = random_nums[index]; \
1888 + hash += key[pos]; \
1889 + hash += (hash << shiftl); \
1890 + hash ^= (hash >> shiftr); \
1894 +#define HASH_FROM_DOWN_TO(from, to) \
1895 +for (index = from - 1; index >= to; index--) { \
1896 + hash ^= (hash >> shiftr); \
1897 + hash ^= (hash >> (shiftr*2)); \
1898 + hash -= (hash << shiftl); \
1899 + hash += (hash << (shiftl*2)); \
1900 + pos = random_nums[index]; \
1901 + hash -= key[pos]; \
1905 + * The main random sample hash function.
1906 + */
1907 +static u32 random_sample_hash(void *addr, u32 hash_strength)
1909 + u32 hash = 0xdeadbeef;
1910 + int index, pos, loop = hash_strength;
1911 + u32 *key = (u32 *)addr;
1913 + if (loop > HASH_STRENGTH_FULL)
1914 + loop = HASH_STRENGTH_FULL;
1916 + HASH_FROM_TO(0, loop);
1918 + if (hash_strength > HASH_STRENGTH_FULL) {
1919 + loop = hash_strength - HASH_STRENGTH_FULL;
1920 + HASH_FROM_TO(0, loop);
1923 + return hash;
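A minimal userspace sketch of the sampling hash: only `strength` randomly chosen 32-bit words of the page feed the mix. The permutation built here stands in for the kernel's random_nums table, and the constants mirror shiftl/shiftr above:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define WORDS_PER_PAGE 1024          /* 4096-byte page / 4-byte words */
#define SHIFTL 8
#define SHIFTR 12

static uint32_t sample_positions[WORDS_PER_PAGE];   /* stand-in for random_nums */

static void init_positions(void)
{
	/* fixed pseudo-random permutation of 0..1023 (Fisher-Yates) */
	for (uint32_t i = 0; i < WORDS_PER_PAGE; i++)
		sample_positions[i] = i;
	srand(17);
	for (uint32_t i = WORDS_PER_PAGE - 1; i > 0; i--) {
		uint32_t j = rand() % (i + 1);
		uint32_t t = sample_positions[i];

		sample_positions[i] = sample_positions[j];
		sample_positions[j] = t;
	}
}

/* hash only `strength` sampled words of the page */
static uint32_t sample_hash(const uint32_t *page_words, int strength)
{
	uint32_t hash = 0xdeadbeef;

	for (int i = 0; i < strength; i++) {
		hash += page_words[sample_positions[i]];
		hash += hash << SHIFTL;
		hash ^= hash >> SHIFTR;
	}
	return hash;
}

int main(void)
{
	uint32_t page[WORDS_PER_PAGE] = { 0 };

	init_positions();
	page[7] = 42;                /* arbitrary page content */
	printf("hash at strength 128: 0x%08x\n", sample_hash(page, 128));
	return 0;
}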
1927 +/**
1928 + * Used when the hash strength is adjusted.
1930 + * @addr The page's virtual address
1931 + * @from The original hash strength
1932 + * @to The hash strength changed to
1933 + * @hash The hash value computed at the "from" strength
1935 + * @return the hash value recomputed at the "to" strength
1936 + */
1937 +static u32 delta_hash(void *addr, int from, int to, u32 hash)
1939 + u32 *key = (u32 *)addr;
1940 + int index, pos; /* make sure they are int type */
1942 + if (to > from) {
1943 + if (from >= HASH_STRENGTH_FULL) {
1944 + from -= HASH_STRENGTH_FULL;
1945 + to -= HASH_STRENGTH_FULL;
1946 + HASH_FROM_TO(from, to);
1947 + } else if (to <= HASH_STRENGTH_FULL) {
1948 + HASH_FROM_TO(from, to);
1949 + } else {
1950 + HASH_FROM_TO(from, HASH_STRENGTH_FULL);
1951 + HASH_FROM_TO(0, to - HASH_STRENGTH_FULL);
1953 + } else {
1954 + if (from <= HASH_STRENGTH_FULL) {
1955 + HASH_FROM_DOWN_TO(from, to);
1956 + } else if (to >= HASH_STRENGTH_FULL) {
1957 + from -= HASH_STRENGTH_FULL;
1958 + to -= HASH_STRENGTH_FULL;
1959 + HASH_FROM_DOWN_TO(from, to);
1960 + } else {
1961 + HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0);
1962 + HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to);
1966 + return hash;
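Why delta_hash() can move in both directions: each reverse step of HASH_FROM_DOWN_TO exactly undoes one forward step of HASH_FROM_TO (the xor-shift and the multiply-by-257 are both invertible mod 2^32). A standalone check of that round trip, under the same shift constants:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SHIFTL 8
#define SHIFTR 12

/* one forward step of the sampling hash (the HASH_FROM_TO body) */
static uint32_t step(uint32_t hash, uint32_t word)
{
	hash += word;
	hash += hash << SHIFTL;
	hash ^= hash >> SHIFTR;
	return hash;
}

/* the exact inverse (the HASH_FROM_DOWN_TO body): undo the xor-shift,
 * then the multiply-by-257, then the addition of the sampled word */
static uint32_t unstep(uint32_t hash, uint32_t word)
{
	hash ^= hash >> SHIFTR;
	hash ^= hash >> (SHIFTR * 2);
	hash -= hash << SHIFTL;
	hash += hash << (SHIFTL * 2);
	hash -= word;
	return hash;
}

int main(void)
{
	uint32_t h = 0xdeadbeef, w = 0x12345678;

	assert(unstep(step(h, w), w) == h);   /* shrinking undoes extending */
	printf("round trip ok\n");
	return 0;
}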
1972 +#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta))
1974 +/**
1976 + * Called when: rshash_pos or rshash_neg is about to overflow or a scan round
1977 + * has finished.
1979 + */
1980 +static inline void encode_benefit(void)
1982 + u64 scanned_delta, pos_delta, neg_delta;
1983 + unsigned long base = benefit.base;
1985 + scanned_delta = (ksm_pages_scanned - ksm_pages_scanned_last) >> base;
1986 + pos_delta = rshash_pos >> base;
1987 + neg_delta = rshash_neg >> base;
1989 + if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) ||
1990 + CAN_OVERFLOW_U64(benefit.neg, neg_delta) ||
1991 + CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) {
1992 + benefit.scanned >>= 1;
1993 + benefit.neg >>= 1;
1994 + benefit.pos >>= 1;
1995 + benefit.base++;
1996 + scanned_delta >>= 1;
1997 + pos_delta >>= 1;
1998 + neg_delta >>= 1;
2001 + benefit.pos += pos_delta;
2002 + benefit.neg += neg_delta;
2003 + benefit.scanned += scanned_delta;
2005 + BUG_ON(!benefit.scanned);
2007 + rshash_pos = rshash_neg = 0;
2009 + /* -1 to make rshash_adjust() work */
2010 + ksm_pages_scanned_last = ksm_pages_scanned - 1;
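The point of the `base` field: when any accumulator is about to overflow, pos, neg and scanned are all halved together, so the ratios the tuning logic consumes are preserved up to rounding. A trivial standalone illustration with made-up counter values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t pos = 900000000000ULL, scanned = 3000000000000ULL;

	printf("pos/scanned before rescale: %.6f\n", (double)pos / scanned);
	pos >>= 1;              /* both halved; base would be bumped by one */
	scanned >>= 1;
	printf("pos/scanned after  rescale: %.6f\n", (double)pos / scanned);
	return 0;
}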
2013 +static inline void reset_benefit(void)
2015 + benefit.pos = 0;
2016 + benefit.neg = 0;
2017 + benefit.base = 0;
2018 + benefit.scanned = 0;
2021 +static inline void inc_rshash_pos(unsigned long delta)
2023 + if (CAN_OVERFLOW_U64(rshash_pos, delta))
2024 + encode_benefit();
2026 + rshash_pos += delta;
2029 +static inline void inc_rshash_neg(unsigned long delta)
2031 + if (CAN_OVERFLOW_U64(rshash_neg, delta))
2032 + encode_benefit();
2034 + rshash_neg += delta;
2038 +static inline u32 page_hash(struct page *page, unsigned long hash_strength,
2039 + int cost_accounting)
2041 + u32 val;
2042 + unsigned long delta;
2044 + void *addr = kmap_atomic(page, KM_USER0);
2046 + val = random_sample_hash(addr, hash_strength);
2047 + kunmap_atomic(addr, KM_USER0);
2049 + if (cost_accounting) {
2050 + if (HASH_STRENGTH_FULL > hash_strength)
2051 + delta = HASH_STRENGTH_FULL - hash_strength;
2052 + else
2053 + delta = 0;
2055 + inc_rshash_pos(delta);
2058 + return val;
2061 +static int memcmp_pages(struct page *page1, struct page *page2,
2062 + int cost_accounting)
2064 + char *addr1, *addr2;
2065 + int ret;
2067 + addr1 = kmap_atomic(page1, KM_USER0);
2068 + addr2 = kmap_atomic(page2, KM_USER1);
2069 + ret = memcmp(addr1, addr2, PAGE_SIZE);
2070 + kunmap_atomic(addr2, KM_USER1);
2071 + kunmap_atomic(addr1, KM_USER0);
2073 + if (cost_accounting)
2074 + inc_rshash_neg(memcmp_cost);
2076 + return ret;
2079 +static inline int pages_identical(struct page *page1, struct page *page2)
2081 + return !memcmp_pages(page1, page2, 0);
2084 +static inline int is_page_full_zero(struct page *page)
2086 + char *addr;
2087 + int ret;
2089 + addr = kmap_atomic(page, KM_USER0);
2090 + ret = is_full_zero(addr, PAGE_SIZE);
2091 + kunmap_atomic(addr, KM_USER0);
2093 + return ret;
2096 +static int write_protect_page(struct vm_area_struct *vma, struct page *page,
2097 + pte_t *orig_pte, pte_t *old_pte)
2099 + struct mm_struct *mm = vma->vm_mm;
2100 + unsigned long addr;
2101 + pte_t *ptep;
2102 + spinlock_t *ptl;
2103 + int swapped;
2104 + int err = -EFAULT;
2106 + addr = page_address_in_vma(page, vma);
2107 + if (addr == -EFAULT)
2108 + goto out;
2110 + BUG_ON(PageTransCompound(page));
2111 + ptep = page_check_address(page, mm, addr, &ptl, 0);
2112 + if (!ptep)
2113 + goto out;
2115 + if (old_pte)
2116 + *old_pte = *ptep;
2118 + if (pte_write(*ptep) || pte_dirty(*ptep)) {
2119 pte_t entry;
2121 swapped = PageSwapCache(page);
2122 @@ -765,6 +1190,11 @@
2123 return err;
2126 +#define MERGE_ERR_PGERR		1 /* the page is invalid, cannot continue */
2127 +#define MERGE_ERR_COLLI 2 /* there is a collision */
2128 +#define MERGE_ERR_CHANGED 3 /* the page has changed since last hash */
2132 * replace_page - replace page in vma by new ksm page
2133 * @vma: vma that holds the pte pointing to page
2134 @@ -772,7 +1202,7 @@
2135 * @kpage: the ksm page we replace page by
2136 * @orig_pte: the original value of the pte
2138 - * Returns 0 on success, -EFAULT on failure.
2139 + * Returns 0 on success, MERGE_ERR_PGERR on failure.
2141 static int replace_page(struct vm_area_struct *vma, struct page *page,
2142 struct page *kpage, pte_t orig_pte)
2143 @@ -783,8 +1213,10 @@
2144 pmd_t *pmd;
2145 pte_t *ptep;
2146 spinlock_t *ptl;
2147 + pte_t entry;
2149 unsigned long addr;
2150 - int err = -EFAULT;
2151 + int err = MERGE_ERR_PGERR;
2153 addr = page_address_in_vma(page, vma);
2154 if (addr == -EFAULT)
2155 @@ -809,12 +1241,20 @@
2156 goto out;
2159 - get_page(kpage);
2160 - page_add_anon_rmap(kpage, vma, addr);
2162 flush_cache_page(vma, addr, pte_pfn(*ptep));
2163 ptep_clear_flush(vma, addr, ptep);
2164 - set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
2165 + entry = mk_pte(kpage, vma->vm_page_prot);
2167 + /* special treatment is needed for zero_page */
2168 + if ((page_to_pfn(kpage) == ksm_zero_pfn) ||
2169 + (page_to_pfn(kpage) == zero_pfn))
2170 + entry = pte_mkspecial(entry);
2171 + else {
2172 + get_page(kpage);
2173 + page_add_anon_rmap(kpage, vma, addr);
2176 + set_pte_at_notify(mm, addr, ptep, entry);
2178 page_remove_rmap(page);
2179 if (!page_mapped(page))
2180 @@ -827,6 +1267,85 @@
2181 return err;
2185 +/**
2186 + * Fully hash a page with HASH_STRENGTH_MAX and return a non-zero hash value. A
2187 + * zero hash value at HASH_STRENGTH_MAX is reserved to indicate that the
2188 + * hash_max member has not been calculated yet.
2190 + * @page The page to be hashed
2191 + * @hash_old The hash value calculated at the current hash strength
2193 + * @return the new hash value calculated at HASH_STRENGTH_MAX
2194 + */
2195 +static inline u32 page_hash_max(struct page *page, u32 hash_old)
2197 + u32 hash_max = 0;
2198 + void *addr;
2200 + addr = kmap_atomic(page, KM_USER0);
2201 + hash_max = delta_hash(addr, hash_strength,
2202 + HASH_STRENGTH_MAX, hash_old);
2204 + kunmap_atomic(addr, KM_USER0);
2206 + if (!hash_max)
2207 + hash_max = 1;
2209 + inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
2210 + return hash_max;
2214 + * We compare the hash again, to ensure that it is really a hash collision
2215 + * instead of being caused by a page write.
2216 + */
2217 +static inline int check_collision(struct rmap_item *rmap_item,
2218 + u32 hash)
2220 + int err;
2221 + struct page *page = rmap_item->page;
2223 +	/* If this rmap_item has already been hash_maxed, then the collision
2224 +	 * must have appeared in the second-level rbtree search. In that case we
2225 +	 * check whether its hash_max value has changed. Otherwise, the collision
2226 +	 * happened in the first-level rbtree search, so we check against its
2227 +	 * current hash value.
2228 + */
2229 + if (rmap_item->hash_max) {
2230 + inc_rshash_neg(memcmp_cost);
2231 + inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
2233 + if (rmap_item->hash_max == page_hash_max(page, hash))
2234 + err = MERGE_ERR_COLLI;
2235 + else
2236 + err = MERGE_ERR_CHANGED;
2237 + } else {
2238 + inc_rshash_neg(memcmp_cost + hash_strength);
2240 + if (page_hash(page, hash_strength, 0) == hash)
2241 + err = MERGE_ERR_COLLI;
2242 + else
2243 + err = MERGE_ERR_CHANGED;
2246 + return err;
2249 +static struct page *page_trans_compound_anon(struct page *page)
2251 + if (PageTransCompound(page)) {
2252 + struct page *head = compound_trans_head(page);
2253 + /*
2254 +		 * head may actually be split and freed from under
2255 + * us but it's ok here.
2256 + */
2257 + if (PageAnon(head))
2258 + return head;
2260 + return NULL;
2263 static int page_trans_compound_anon_split(struct page *page)
2265 int ret = 0;
2266 @@ -854,30 +1373,36 @@
2267 return ret;
2271 - * try_to_merge_one_page - take two pages and merge them into one
2272 - * @vma: the vma that holds the pte pointing to page
2273 - * @page: the PageAnon page that we want to replace with kpage
2274 - * @kpage: the PageKsm page that we want to map instead of page,
2275 - * or NULL the first time when we want to use page as kpage.
2276 +/**
2277 + * Try to merge rmap_item->page with a kpage in a stable node. kpage must
2278 + * already be a ksm page.
2280 - * This function returns 0 if the pages were merged, -EFAULT otherwise.
2281 + * @return 0 if the pages were merged, -EFAULT otherwise.
2283 -static int try_to_merge_one_page(struct vm_area_struct *vma,
2284 - struct page *page, struct page *kpage)
2285 +static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
2286 + struct page *kpage, u32 hash)
2288 + struct vm_area_struct *vma = rmap_item->slot->vma;
2289 + struct mm_struct *mm = vma->vm_mm;
2290 pte_t orig_pte = __pte(0);
2291 - int err = -EFAULT;
2292 + int err = MERGE_ERR_PGERR;
2293 + struct page *page;
2295 - if (page == kpage) /* ksm page forked */
2296 - return 0;
2297 + if (ksm_test_exit(mm))
2298 + goto out;
2300 + page = rmap_item->page;
2302 - if (!(vma->vm_flags & VM_MERGEABLE))
2303 + if (page == kpage) { /* ksm page forked */
2304 + err = 0;
2305 goto out;
2308 if (PageTransCompound(page) && page_trans_compound_anon_split(page))
2309 goto out;
2310 BUG_ON(PageTransCompound(page));
2311 - if (!PageAnon(page))
2313 + if (!PageAnon(page) || !PageKsm(kpage))
2314 goto out;
2317 @@ -895,18 +1420,27 @@
2318 * ptes are necessarily already write-protected. But in either
2319 * case, we need to lock and check page_count is not raised.
2321 - if (write_protect_page(vma, page, &orig_pte) == 0) {
2322 + if (write_protect_page(vma, page, &orig_pte, NULL) == 0) {
2323 if (!kpage) {
2324 + long map_sharing = atomic_read(&page->_mapcount);
2326 * While we hold page lock, upgrade page from
2327 * PageAnon+anon_vma to PageKsm+NULL stable_node:
2328 * stable_tree_insert() will update stable_node.
2330 set_page_stable_node(page, NULL);
2331 + if (map_sharing)
2332 + add_zone_page_state(page_zone(page),
2333 + NR_KSM_PAGES_SHARING,
2334 + map_sharing);
2335 mark_page_accessed(page);
2336 err = 0;
2337 - } else if (pages_identical(page, kpage))
2338 - err = replace_page(vma, page, kpage, orig_pte);
2339 + } else {
2340 + if (pages_identical(page, kpage))
2341 + err = replace_page(vma, page, kpage, orig_pte);
2342 + else
2343 + err = check_collision(rmap_item, hash);
2347 if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
2348 @@ -924,378 +1458,2697 @@
2349 return err;
2353 - * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
2354 - * but no new kernel page is allocated: kpage must already be a ksm page.
2357 +/**
2358 + * If two pages fail to merge in try_to_merge_two_pages(), then we have a
2359 + * chance to restore a page mapping that was changed there.
2361 - * This function returns 0 if the pages were merged, -EFAULT otherwise.
2362 + * @return 0 on success.
2364 -static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
2365 - struct page *page, struct page *kpage)
2366 +static int restore_ksm_page_pte(struct vm_area_struct *vma, unsigned long addr,
2367 + pte_t orig_pte, pte_t wprt_pte)
2369 - struct mm_struct *mm = rmap_item->mm;
2370 - struct vm_area_struct *vma;
2371 + struct mm_struct *mm = vma->vm_mm;
2372 + pgd_t *pgd;
2373 + pud_t *pud;
2374 + pmd_t *pmd;
2375 + pte_t *ptep;
2376 + spinlock_t *ptl;
2378 int err = -EFAULT;
2380 - down_read(&mm->mmap_sem);
2381 - if (ksm_test_exit(mm))
2382 - goto out;
2383 - vma = find_vma(mm, rmap_item->address);
2384 - if (!vma || vma->vm_start > rmap_item->address)
2385 + pgd = pgd_offset(mm, addr);
2386 + if (!pgd_present(*pgd))
2387 + goto out;
2389 + pud = pud_offset(pgd, addr);
2390 + if (!pud_present(*pud))
2391 + goto out;
2393 + pmd = pmd_offset(pud, addr);
2394 + if (!pmd_present(*pmd))
2395 + goto out;
2397 + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
2398 + if (!pte_same(*ptep, wprt_pte)) {
2399 + /* already copied, let it be */
2400 + pte_unmap_unlock(ptep, ptl);
2401 + goto out;
2404 + /*
2405 +	 * Good, still here. As long as we hold the ksm page, it does not
2406 +	 * return to the free page pool, so there is no way a pte could have
2407 +	 * been changed to another page and then back to this one. Remember
2408 +	 * also that ksm pages are not reused in do_wp_page(). So it's safe to
2409 +	 * restore the original pte.
2410 + */
2411 + flush_cache_page(vma, addr, pte_pfn(*ptep));
2412 + ptep_clear_flush(vma, addr, ptep);
2413 + set_pte_at_notify(mm, addr, ptep, orig_pte);
2415 + pte_unmap_unlock(ptep, ptl);
2416 + err = 0;
2417 +out:
2418 + return err;
2421 +/**
2422 + * try_to_merge_two_pages() - take two identical pages and prepare
2423 + * them to be merged into one page (rmap_item->page)
2425 + * @return 0 if we successfully merged two identical pages into
2426 + * one ksm page. MERGE_ERR_COLLI if it was only a hash collision
2427 + * found during the rbtree search. MERGE_ERR_CHANGED if the rmap_item's page
2428 + * has changed since it was hashed. MERGE_ERR_PGERR otherwise.
2430 + */
2431 +static int try_to_merge_two_pages(struct rmap_item *rmap_item,
2432 + struct rmap_item *tree_rmap_item,
2433 + u32 hash)
2435 + pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0);
2436 + pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0);
2437 + struct vm_area_struct *vma1 = rmap_item->slot->vma;
2438 + struct vm_area_struct *vma2 = tree_rmap_item->slot->vma;
2439 + struct page *page = rmap_item->page;
2440 + struct page *tree_page = tree_rmap_item->page;
2441 + int err = MERGE_ERR_PGERR;
2443 + long map_sharing;
2444 + struct address_space *saved_mapping;
2447 + if (rmap_item->page == tree_rmap_item->page)
2448 + goto out;
2450 + if (PageTransCompound(page) && page_trans_compound_anon_split(page))
2451 + goto out;
2452 + BUG_ON(PageTransCompound(page));
2454 + if (PageTransCompound(tree_page) && page_trans_compound_anon_split(tree_page))
2455 + goto out;
2456 + BUG_ON(PageTransCompound(tree_page));
2458 + if (!PageAnon(page) || !PageAnon(tree_page))
2459 + goto out;
2461 + if (!trylock_page(page))
2462 + goto out;
2465 + if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) {
2466 + unlock_page(page);
2467 + goto out;
2470 + /*
2471 + * While we hold page lock, upgrade page from
2472 + * PageAnon+anon_vma to PageKsm+NULL stable_node:
2473 + * stable_tree_insert() will update stable_node.
2474 + */
2475 + saved_mapping = page->mapping;
2476 + map_sharing = atomic_read(&page->_mapcount);
2477 + set_page_stable_node(page, NULL);
2478 + if (map_sharing)
2479 + add_zone_page_state(page_zone(page),
2480 + NR_KSM_PAGES_SHARING,
2481 + map_sharing);
2482 + mark_page_accessed(page);
2483 + unlock_page(page);
2485 + if (!trylock_page(tree_page))
2486 + goto restore_out;
2488 + if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) {
2489 + unlock_page(tree_page);
2490 + goto restore_out;
2493 + if (pages_identical(page, tree_page)) {
2494 + err = replace_page(vma2, tree_page, page, wprt_pte2);
2495 + if (err)
2496 + goto restore_out;
2498 + if ((vma2->vm_flags & VM_LOCKED)) {
2499 + munlock_vma_page(tree_page);
2500 + if (!PageMlocked(page)) {
2501 + unlock_page(tree_page);
2502 + lock_page(page);
2503 + mlock_vma_page(page);
2504 + tree_page = page; /* for final unlock */
2508 + unlock_page(tree_page);
2510 + goto out; /* success */
2512 + } else {
2513 + if (page_hash(page, hash_strength, 0) ==
2514 + page_hash(tree_page, hash_strength, 0)) {
2515 + inc_rshash_neg(memcmp_cost + hash_strength * 2);
2516 + err = MERGE_ERR_COLLI;
2517 + } else
2518 + err = MERGE_ERR_CHANGED;
2520 + unlock_page(tree_page);
2523 +restore_out:
2524 + lock_page(page);
2525 + if (!restore_ksm_page_pte(vma1, get_rmap_addr(rmap_item),
2526 + orig_pte1, wprt_pte1))
2527 + page->mapping = saved_mapping;
2529 + unlock_page(page);
2530 +out:
2531 + return err;
2534 +static inline int hash_cmp(u32 new_val, u32 node_val)
2536 + if (new_val > node_val)
2537 + return 1;
2538 + else if (new_val < node_val)
2539 + return -1;
2540 + else
2541 + return 0;
2544 +static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash)
2546 + u32 hash_max = item->hash_max;
2548 + if (!hash_max) {
2549 + hash_max = page_hash_max(item->page, hash);
2551 + item->hash_max = hash_max;
2554 + return hash_max;
2559 +/**
2560 + * stable_tree_search() - search the stable tree for a page
2562 + * @item: the rmap_item we are comparing with
2563 + * @hash: the hash value of this item->page already calculated
2565 + * @return the page we have found, NULL otherwise. The returned page has
2566 + * been gotten (its reference count has been raised).
2567 + */
2568 +static struct page *stable_tree_search(struct rmap_item *item, u32 hash)
2570 + struct rb_node *node = root_stable_treep->rb_node;
2571 + struct tree_node *tree_node;
2572 + unsigned long hash_max;
2573 + struct page *page = item->page;
2574 + struct stable_node *stable_node;
2576 + stable_node = page_stable_node(page);
2577 + if (stable_node) {
2578 + /* ksm page forked, that is
2579 + * if (PageKsm(page) && !in_stable_tree(rmap_item))
2580 + * it's actually gotten once outside.
2581 + */
2582 + get_page(page);
2583 + return page;
2586 + while (node) {
2587 + int cmp;
2589 + tree_node = rb_entry(node, struct tree_node, node);
2591 + cmp = hash_cmp(hash, tree_node->hash);
2593 + if (cmp < 0)
2594 + node = node->rb_left;
2595 + else if (cmp > 0)
2596 + node = node->rb_right;
2597 + else
2598 + break;
2601 + if (!node)
2602 + return NULL;
2604 + if (tree_node->count == 1) {
2605 + stable_node = rb_entry(tree_node->sub_root.rb_node,
2606 + struct stable_node, node);
2607 + BUG_ON(!stable_node);
2609 + goto get_page_out;
2612 + /*
2613 + * ok, we have to search the second
2614 + * level subtree, hash the page to a
2615 + * full strength.
2616 + */
2617 + node = tree_node->sub_root.rb_node;
2618 + BUG_ON(!node);
2619 + hash_max = rmap_item_hash_max(item, hash);
2621 + while (node) {
2622 + int cmp;
2624 + stable_node = rb_entry(node, struct stable_node, node);
2626 + cmp = hash_cmp(hash_max, stable_node->hash_max);
2628 + if (cmp < 0)
2629 + node = node->rb_left;
2630 + else if (cmp > 0)
2631 + node = node->rb_right;
2632 + else
2633 + goto get_page_out;
2636 + return NULL;
2638 +get_page_out:
2639 + page = get_ksm_page(stable_node, 1, 1);
2640 + return page;
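The shape of the lookup above, reduced to a standalone sketch: the first level is keyed by the cheap sampled hash, and only on a first-level hit with more than one candidate is the expensive full-strength hash computed to walk the second level. Flat arrays stand in for the two rbtrees here, and the struct layouts are illustrative only:

#include <stddef.h>
#include <stdint.h>

struct stable_entry {
	uint32_t hash_max;        /* full-strength hash, computed lazily */
	void *page;
};

struct first_level {
	uint32_t hash;            /* cheap sampled hash */
	struct stable_entry *sub; /* second level, one entry per stable page */
	size_t count;
};

/* Two-level search: cheap hash first, full-strength hash only on collision. */
void *two_level_lookup(struct first_level *levels, size_t n, uint32_t hash,
		       uint32_t (*hash_max_of_page)(void *ctx), void *ctx)
{
	for (size_t i = 0; i < n; i++) {
		if (levels[i].hash != hash)
			continue;
		if (levels[i].count == 1)
			return levels[i].sub[0].page;   /* strong hash not needed */

		uint32_t hmax = hash_max_of_page(ctx);  /* pay the full cost now */

		for (size_t j = 0; j < levels[i].count; j++)
			if (levels[i].sub[j].hash_max == hmax)
				return levels[i].sub[j].page;
		return NULL;
	}
	return NULL;
}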
2644 +/**
2645 + * try_merge_with_stable() - when two rmap_items need to be inserted
2646 + * into the stable tree and the page was found to be identical to a stable
2647 + * ksm page, this is the last chance we can merge them into one.
2649 + * @item1: the rmap_item holding the page which we wanted to insert
2650 + *	   into the stable tree.
2651 + * @item2: the other rmap_item we found during the unstable tree search
2652 + * @kpage: the page currently mapped by the two rmap_items
2653 + * @tree_page: the page we found identical in the stable tree node
2654 + * @success1: set if item1 is successfully merged
2655 + * @success2: set if item2 is successfully merged
2656 + */
2657 +static void try_merge_with_stable(struct rmap_item *item1,
2658 + struct rmap_item *item2,
2659 + struct page **kpage,
2660 + struct page *tree_page,
2661 + int *success1, int *success2)
2663 + spinlock_t *ptl1, *ptl2;
2664 + pte_t *ptep1, *ptep2;
2665 + unsigned long addr1, addr2;
2666 + struct vm_area_struct *vma1 = item1->slot->vma;
2667 + struct vm_area_struct *vma2 = item2->slot->vma;
2669 + *success1 = 0;
2670 + *success2 = 0;
2672 + if (unlikely(*kpage == tree_page)) {
2673 + /* I don't think this can really happen */
2674 + goto success_both;
2677 + if (!PageAnon(*kpage) || !PageKsm(*kpage))
2678 + goto failed;
2680 + if (!trylock_page(tree_page))
2681 + goto failed;
2683 +	/* If the oldpage is still a ksm page, still pointed
2684 +	 * to from the right place, and still write protected,
2685 +	 * we are confident it has not changed; no need to
2686 +	 * memcmp anymore.
2687 +	 * Beware: we cannot take nested pte locks,
2688 +	 * that would be a deadlock risk.
2689 + */
2690 + addr1 = get_rmap_addr(item1);
2692 + ptep1 = page_check_address(*kpage, vma1->vm_mm, addr1, &ptl1, 0);
2693 + if (!ptep1)
2694 + goto failed;
2696 + if (pte_write(*ptep1)) {
2697 + /* has changed, abort! */
2698 + pte_unmap_unlock(ptep1, ptl1);
2699 + goto failed;
2702 + get_page(tree_page);
2703 + page_add_anon_rmap(tree_page, vma1, addr1);
2705 + flush_cache_page(vma1, addr1, pte_pfn(*ptep1));
2706 + ptep_clear_flush(vma1, addr1, ptep1);
2707 + set_pte_at_notify(vma1->vm_mm, addr1, ptep1,
2708 + mk_pte(tree_page, vma1->vm_page_prot));
2710 + page_remove_rmap(*kpage);
2711 + put_page(*kpage);
2713 + pte_unmap_unlock(ptep1, ptl1);
2716 + /* ok, then vma2, remind that pte1 already set */
2717 + addr2 = get_rmap_addr(item2);
2719 + ptep2 = page_check_address(*kpage, vma2->vm_mm, addr2, &ptl2, 0);
2720 + if (!ptep2)
2721 + goto success1;
2723 + if (pte_write(*ptep2)) {
2724 + /* has changed, abort! */
2725 + pte_unmap_unlock(ptep2, ptl2);
2726 + goto success1;
2729 + get_page(tree_page);
2730 + page_add_anon_rmap(tree_page, vma2, addr2);
2732 + flush_cache_page(vma2, addr2, pte_pfn(*ptep2));
2733 + ptep_clear_flush(vma2, addr2, ptep2);
2734 + set_pte_at_notify(vma2->vm_mm, addr2, ptep2,
2735 + mk_pte(tree_page, vma2->vm_page_prot));
2737 + page_remove_rmap(*kpage);
2738 + put_page(*kpage);
2740 + pte_unmap_unlock(ptep2, ptl2);
2743 +success_both:
2744 + *success2 = 1;
2745 +success1:
2746 + *success1 = 1;
2749 + if ((*success1 && vma1->vm_flags & VM_LOCKED) ||
2750 + (*success2 && vma2->vm_flags & VM_LOCKED)) {
2751 + munlock_vma_page(*kpage);
2752 + if (!PageMlocked(tree_page))
2753 + mlock_vma_page(tree_page);
2756 + /*
2757 +	 * We do not need oldpage any more in the caller, so we can break the
2758 +	 * lock now.
2759 + */
2760 + unlock_page(*kpage);
2761 + *kpage = tree_page; /* Get unlocked outside. */
2762 +failed:
2763 + return;
2766 +static inline void stable_node_hash_max(struct stable_node *node,
2767 + struct page *page, u32 hash)
2769 + u32 hash_max = node->hash_max;
2771 + if (!hash_max) {
2772 + hash_max = page_hash_max(page, hash);
2773 + node->hash_max = hash_max;
2777 +static inline
2778 +struct stable_node *new_stable_node(struct tree_node *tree_node,
2779 + struct page *kpage, u32 hash_max)
2781 + struct stable_node *new_stable_node;
2783 + new_stable_node = alloc_stable_node();
2784 + if (!new_stable_node)
2785 + return NULL;
2787 + new_stable_node->kpfn = page_to_pfn(kpage);
2788 + new_stable_node->hash_max = hash_max;
2789 + new_stable_node->tree_node = tree_node;
2790 + set_page_stable_node(kpage, new_stable_node);
2792 + return new_stable_node;
2795 +static inline
2796 +struct stable_node *first_level_insert(struct tree_node *tree_node,
2797 + struct rmap_item *rmap_item,
2798 + struct rmap_item *tree_rmap_item,
2799 + struct page **kpage, u32 hash,
2800 + int *success1, int *success2)
2802 + int cmp;
2803 + struct page *tree_page;
2804 + u32 hash_max = 0;
2805 + struct stable_node *stable_node, *new_snode;
2806 + struct rb_node *parent = NULL, **new;
2808 + /* this tree node contains no sub-tree yet */
2809 + stable_node = rb_entry(tree_node->sub_root.rb_node,
2810 + struct stable_node, node);
2812 + tree_page = get_ksm_page(stable_node, 1, 0);
2813 + if (tree_page) {
2814 + cmp = memcmp_pages(*kpage, tree_page, 1);
2815 + if (!cmp) {
2816 + try_merge_with_stable(rmap_item, tree_rmap_item, kpage,
2817 + tree_page, success1, success2);
2818 + put_page(tree_page);
2819 + if (!*success1 && !*success2)
2820 + goto failed;
2822 + return stable_node;
2824 + } else {
2825 + /*
2826 +		 * Collision at the first level; try to create a subtree.
2827 +		 * A new node needs to be created.
2828 + */
2829 + put_page(tree_page);
2831 + stable_node_hash_max(stable_node, tree_page,
2832 + tree_node->hash);
2833 + hash_max = rmap_item_hash_max(rmap_item, hash);
2834 + cmp = hash_cmp(hash_max, stable_node->hash_max);
2836 + parent = &stable_node->node;
2837 + if (cmp < 0) {
2838 + new = &parent->rb_left;
2839 + } else if (cmp > 0) {
2840 + new = &parent->rb_right;
2841 + } else {
2842 + goto failed;
2846 + } else {
2847 +		/* the only stable_node has been deleted, so we reuse its tree_node.
2848 + */
2849 + parent = NULL;
2850 + new = &tree_node->sub_root.rb_node;
2853 + new_snode = new_stable_node(tree_node, *kpage, hash_max);
2854 + if (!new_snode)
2855 + goto failed;
2857 + rb_link_node(&new_snode->node, parent, new);
2858 + rb_insert_color(&new_snode->node, &tree_node->sub_root);
2859 + tree_node->count++;
2860 + *success1 = *success2 = 1;
2862 + return new_snode;
2864 +failed:
2865 + return NULL;
2868 +static inline
2869 +struct stable_node *stable_subtree_insert(struct tree_node *tree_node,
2870 + struct rmap_item *rmap_item,
2871 + struct rmap_item *tree_rmap_item,
2872 + struct page **kpage, u32 hash,
2873 + int *success1, int *success2)
2875 + struct page *tree_page;
2876 + u32 hash_max;
2877 + struct stable_node *stable_node, *new_snode;
2878 + struct rb_node *parent, **new;
2880 +research:
2881 + parent = NULL;
2882 + new = &tree_node->sub_root.rb_node;
2883 + BUG_ON(!*new);
2884 + hash_max = rmap_item_hash_max(rmap_item, hash);
2885 + while (*new) {
2886 + int cmp;
2888 + stable_node = rb_entry(*new, struct stable_node, node);
2890 + cmp = hash_cmp(hash_max, stable_node->hash_max);
2892 + if (cmp < 0) {
2893 + parent = *new;
2894 + new = &parent->rb_left;
2895 + } else if (cmp > 0) {
2896 + parent = *new;
2897 + new = &parent->rb_right;
2898 + } else {
2899 + tree_page = get_ksm_page(stable_node, 1, 0);
2900 + if (tree_page) {
2901 + cmp = memcmp_pages(*kpage, tree_page, 1);
2902 + if (!cmp) {
2903 + try_merge_with_stable(rmap_item,
2904 + tree_rmap_item, kpage,
2905 + tree_page, success1, success2);
2907 + put_page(tree_page);
2908 + if (!*success1 && !*success2)
2909 + goto failed;
2910 + /*
2911 + * successfully merged with a stable
2912 + * node
2913 + */
2914 + return stable_node;
2915 + } else {
2916 + put_page(tree_page);
2917 + goto failed;
2919 + } else {
2920 + /*
2921 + * stable node may be deleted,
2922 + * and subtree maybe
2923 + * restructed, cannot
2924 + * continue, research it.
2925 + */
2926 + if (tree_node->count) {
2927 + goto research;
2928 + } else {
2929 +					/* reuse the tree node */
2930 + parent = NULL;
2931 + new = &tree_node->sub_root.rb_node;
2937 + new_snode = new_stable_node(tree_node, *kpage, hash_max);
2938 + if (!new_snode)
2939 + goto failed;
2941 + rb_link_node(&new_snode->node, parent, new);
2942 + rb_insert_color(&new_snode->node, &tree_node->sub_root);
2943 + tree_node->count++;
2944 + *success1 = *success2 = 1;
2946 + return new_snode;
2948 +failed:
2949 + return NULL;
2953 +/**
2954 + * stable_tree_insert() - try to insert a page merged from the unstable
2955 + * tree into the stable tree
2957 + * @kpage: the page to be inserted
2958 + * @hash: the current hash of this page
2959 + * @rmap_item: the rmap_item being scanned
2960 + * @tree_rmap_item: the rmap_item found on unstable tree
2961 + * @success1: return if rmap_item is merged
2962 + * @success2: return if tree_rmap_item is merged
2964 + * @return the stable_node on stable tree if at least one
2965 + * rmap_item is inserted into stable tree, NULL
2966 + * otherwise.
2967 + */
2968 +static struct stable_node *
2969 +stable_tree_insert(struct page **kpage, u32 hash,
2970 + struct rmap_item *rmap_item,
2971 + struct rmap_item *tree_rmap_item,
2972 + int *success1, int *success2)
2974 + struct rb_node **new = &root_stable_treep->rb_node;
2975 + struct rb_node *parent = NULL;
2976 + struct stable_node *stable_node;
2977 + struct tree_node *tree_node;
2978 + u32 hash_max = 0;
2980 + *success1 = *success2 = 0;
2982 + while (*new) {
2983 + int cmp;
2985 + tree_node = rb_entry(*new, struct tree_node, node);
2987 + cmp = hash_cmp(hash, tree_node->hash);
2989 + if (cmp < 0) {
2990 + parent = *new;
2991 + new = &parent->rb_left;
2992 + } else if (cmp > 0) {
2993 + parent = *new;
2994 + new = &parent->rb_right;
2995 + } else
2996 + break;
2999 + if (*new) {
3000 + if (tree_node->count == 1) {
3001 + stable_node = first_level_insert(tree_node, rmap_item,
3002 + tree_rmap_item, kpage,
3003 + hash, success1, success2);
3004 + } else {
3005 + stable_node = stable_subtree_insert(tree_node,
3006 + rmap_item, tree_rmap_item, kpage,
3007 + hash, success1, success2);
3009 + } else {
3011 + /* no tree node found */
3012 + tree_node = alloc_tree_node(stable_tree_node_listp);
3013 + if (!tree_node) {
3014 + stable_node = NULL;
3015 + goto out;
3018 + stable_node = new_stable_node(tree_node, *kpage, hash_max);
3019 + if (!stable_node) {
3020 + free_tree_node(tree_node);
3021 + goto out;
3024 + tree_node->hash = hash;
3025 + rb_link_node(&tree_node->node, parent, new);
3026 + rb_insert_color(&tree_node->node, root_stable_treep);
3027 + parent = NULL;
3028 + new = &tree_node->sub_root.rb_node;
3030 + rb_link_node(&stable_node->node, parent, new);
3031 + rb_insert_color(&stable_node->node, &tree_node->sub_root);
3032 + tree_node->count++;
3033 + *success1 = *success2 = 1;
3036 +out:
3037 + return stable_node;
3041 +/**
3042 + * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem
3044 + * @return 0 on success, -EBUSY if unable to lock the mmap_sem,
3045 + * -EINVAL if the page mapping has been changed.
3046 + */
3047 +static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item)
3049 + int err;
3051 + err = get_mergeable_page_lock_mmap(tree_rmap_item);
3053 + if (err == -EINVAL) {
3054 + /* its page map has been changed, remove it */
3055 + remove_rmap_item_from_tree(tree_rmap_item);
3058 +	/* The page has been gotten and the mmap_sem is locked now. */
3059 + return err;
3063 +/**
3064 + * unstable_tree_search_insert() - search an unstable tree rmap_item with the
3065 + * same hash value. Get its page and trylock the mmap_sem
3066 + */
3067 +static inline
3068 +struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
3069 + u32 hash)
3072 + struct rb_node **new = &root_unstable_tree.rb_node;
3073 + struct rb_node *parent = NULL;
3074 + struct tree_node *tree_node;
3075 + u32 hash_max;
3076 + struct rmap_item *tree_rmap_item;
3078 + while (*new) {
3079 + int cmp;
3081 + tree_node = rb_entry(*new, struct tree_node, node);
3083 + cmp = hash_cmp(hash, tree_node->hash);
3085 + if (cmp < 0) {
3086 + parent = *new;
3087 + new = &parent->rb_left;
3088 + } else if (cmp > 0) {
3089 + parent = *new;
3090 + new = &parent->rb_right;
3091 + } else
3092 + break;
3095 + if (*new) {
3096 + /* got the tree_node */
3097 + if (tree_node->count == 1) {
3098 + tree_rmap_item = rb_entry(tree_node->sub_root.rb_node,
3099 + struct rmap_item, node);
3100 + BUG_ON(!tree_rmap_item);
3102 + goto get_page_out;
3105 + /* well, search the collision subtree */
3106 + new = &tree_node->sub_root.rb_node;
3107 + BUG_ON(!*new);
3108 + hash_max = rmap_item_hash_max(rmap_item, hash);
3110 + while (*new) {
3111 + int cmp;
3113 + tree_rmap_item = rb_entry(*new, struct rmap_item,
3114 + node);
3116 + cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
3117 + parent = *new;
3118 + if (cmp < 0)
3119 + new = &parent->rb_left;
3120 + else if (cmp > 0)
3121 + new = &parent->rb_right;
3122 + else
3123 + goto get_page_out;
3125 + } else {
3126 + /* alloc a new tree_node */
3127 + tree_node = alloc_tree_node(&unstable_tree_node_list);
3128 + if (!tree_node)
3129 + return NULL;
3131 + tree_node->hash = hash;
3132 + rb_link_node(&tree_node->node, parent, new);
3133 + rb_insert_color(&tree_node->node, &root_unstable_tree);
3134 + parent = NULL;
3135 + new = &tree_node->sub_root.rb_node;
3138 +	/* not found even in the sub-tree */
3139 + rmap_item->tree_node = tree_node;
3140 + rmap_item->address |= UNSTABLE_FLAG;
3141 + rmap_item->append_round = ksm_scan_round;
3142 + rb_link_node(&rmap_item->node, parent, new);
3143 + rb_insert_color(&rmap_item->node, &tree_node->sub_root);
3145 + ksm_pages_unshared++;
3146 + return NULL;
3148 +get_page_out:
3149 + if (tree_rmap_item->page == rmap_item->page)
3150 + return NULL;
3152 + if (get_tree_rmap_item_page(tree_rmap_item))
3153 + return NULL;
3155 + return tree_rmap_item;
3158 +static void enter_vma_tree(struct vma_slot *slot)
3160 + unsigned long i;
3161 + int ret;
3163 + i = ksm_vma_tree_index_end;
3165 + ret = radix_tree_insert(&ksm_vma_tree, i, slot);
3166 + BUG_ON(ret);
3168 + slot->ksm_index = i;
3169 + ksm_vma_tree_num++;
3170 + ksm_vma_tree_index_end++;
3173 +static inline void get_sub_dup_vma(struct vma_slot **slot,
3174 + struct vma_slot **sub_slot)
3176 + struct vma_slot *tmp;
3178 + if ((*slot)->ksm_index > (*sub_slot)->ksm_index) {
3179 + tmp = *slot;
3180 + *slot = *sub_slot;
3181 + *sub_slot = tmp;
3186 + * Inc or dec the dup pages stored in a slot, return the dup page num after
3187 + * the operation.
3188 + */
3189 +static inline unsigned long dup_pages_mod(void **slot, int inc)
3191 + unsigned long item, ret;
3193 + item = (unsigned long)(*slot) >> INDIRECT_OFFSET;
3194 + if (inc) {
3195 + item++;
3196 + BUG_ON(!item);
3197 + } else {
3198 + BUG_ON(!item);
3199 + item--;
3201 + ret = item;
3202 + item <<= INDIRECT_OFFSET;
3203 + *slot = (void *)item;
3205 + return ret;
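The packing trick used by dup_pages_mod(): the count is stored directly in the radix-tree slot, shifted left so its low bit can never be mistaken for the tree's indirect-pointer tag. A standalone model of that encoding; the INDIRECT_OFFSET value of 1 is an assumption here, not taken from the patch:

#include <assert.h>
#include <stdint.h>

#define INDIRECT_OFFSET 1    /* low bit reserved for the indirect-pointer tag */

static uintptr_t count_to_slot(unsigned long count)
{
	return (uintptr_t)count << INDIRECT_OFFSET;
}

static unsigned long slot_to_count(uintptr_t slot)
{
	return (unsigned long)(slot >> INDIRECT_OFFSET);
}

/* increment or decrement the packed count, returning the new value */
static unsigned long packed_count_mod(uintptr_t *slot, int inc)
{
	unsigned long count = slot_to_count(*slot);

	count = inc ? count + 1 : count - 1;
	*slot = count_to_slot(count);
	return count;
}

int main(void)
{
	uintptr_t slot = count_to_slot(1);      /* first duplicated pair */

	assert(packed_count_mod(&slot, 1) == 2);
	assert(packed_count_mod(&slot, 0) == 1);
	assert((slot & 1) == 0);                /* never looks like an indirect pointer */
	return 0;
}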
3208 +static void inc_dup_vma(struct vma_slot *slot, struct vma_slot *sub_slot)
3210 + void **dup_slot;
3211 + unsigned long dup_pages;
3212 + int ret;
3214 + if (slot->ksm_index == -1)
3215 + enter_vma_tree(slot);
3217 + if (sub_slot->ksm_index == -1)
3218 + enter_vma_tree(sub_slot);
3220 + get_sub_dup_vma(&slot, &sub_slot);
3222 + dup_slot = radix_tree_lookup_slot(&slot->dup_tree, sub_slot->ksm_index);
3223 + if (dup_slot)
3224 + goto found;
3226 + /*
3227 + * In order to store dup_pages in radix tree, we must make
3228 + * radix_tree_is_indirect_ptr() happy.
3229 + */
3230 + dup_pages = 1 << INDIRECT_OFFSET;
3232 + /* no such entry yet, insert one */
3233 + ret = radix_tree_insert(&slot->dup_tree, sub_slot->ksm_index,
3234 + (void *)dup_pages);
3235 + BUG_ON(ret);
3237 + return;
3239 +found:
3240 + dup_pages_mod(dup_slot, 1);
3243 +static void dec_dup_vma(struct vma_slot *slot, struct vma_slot *sub_slot)
3245 + void **dup_slot;
3246 + unsigned long dup_pages;
3248 + BUG_ON(slot->ksm_index == -1 || sub_slot->ksm_index == -1);
3250 + get_sub_dup_vma(&slot, &sub_slot);
3252 + dup_slot = radix_tree_lookup_slot(&slot->dup_tree, sub_slot->ksm_index);
3253 + BUG_ON(!dup_slot);
3255 + dup_pages = dup_pages_mod(dup_slot, 0);
3257 + /* dup_pages == 0, we need to kick it out */
3258 + if (!dup_pages)
3259 + radix_tree_delete(&slot->dup_tree, sub_slot->ksm_index);
3262 +static void hold_anon_vma(struct rmap_item *rmap_item,
3263 + struct anon_vma *anon_vma)
3265 + rmap_item->anon_vma = anon_vma;
3266 + get_anon_vma(anon_vma);
3270 +/**
3271 + * stable_tree_append() - append a rmap_item to a stable node. Deduplication
3272 + * ratio statistics is done in this function.
3274 + */
3275 +static void stable_tree_append(struct rmap_item *rmap_item,
3276 + struct stable_node *stable_node)
3278 + struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_iter = NULL;
3279 + struct hlist_node *hlist, *cont_p = NULL;
3280 + unsigned long key = (unsigned long)rmap_item->slot;
3282 + BUG_ON(!stable_node);
3283 + rmap_item->address |= STABLE_FLAG;
3284 + rmap_item->append_round = ksm_scan_round;
3286 + if (hlist_empty(&stable_node->hlist)) {
3287 + ksm_pages_shared++;
3288 + goto node_vma_new;
3289 + } else {
3290 + ksm_pages_sharing++;
3293 + hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
3294 + if (node_vma->last_update == ksm_scan_round)
3295 + inc_dup_vma(rmap_item->slot, node_vma->slot);
3297 + if (node_vma->key >= key)
3298 + break;
3301 + cont_p = hlist;
3303 + if (node_vma && node_vma->key == key) {
3304 + if (node_vma->last_update == ksm_scan_round) {
3305 + /**
3306 +			 * we consider this page an inner duplicate, cancel
3307 + * other updates
3308 + */
3309 + hlist_for_each_entry(node_vma_iter, hlist,
3310 + &stable_node->hlist, hlist) {
3311 + if (node_vma_iter->key == key)
3312 + break;
3314 + /* only need to increase the same vma */
3315 + if (node_vma_iter->last_update ==
3316 + ksm_scan_round) {
3317 + dec_dup_vma(rmap_item->slot,
3318 + node_vma_iter->slot);
3321 + } else {
3322 + /**
3323 +			 * Although it's the same vma, it contains no duplicate for this
3324 +			 * round. Continue scanning other vmas.
3325 + */
3326 + hlist_for_each_entry_continue(node_vma_iter,
3327 + hlist, hlist) {
3328 + if (node_vma_iter->last_update ==
3329 + ksm_scan_round) {
3330 + inc_dup_vma(rmap_item->slot,
3331 + node_vma_iter->slot);
3337 + goto node_vma_ok;
3340 +node_vma_new:
3341 + /* no same vma already in node, alloc a new node_vma */
3342 + new_node_vma = alloc_node_vma();
3343 + BUG_ON(!new_node_vma);
3344 + new_node_vma->head = stable_node;
3345 + new_node_vma->slot = rmap_item->slot;
3347 + if (!node_vma) {
3348 + hlist_add_head(&new_node_vma->hlist, &stable_node->hlist);
3349 + } else if (node_vma->key != key) {
3350 + if (node_vma->key < key)
3351 + hlist_add_after(&node_vma->hlist, &new_node_vma->hlist);
3352 + else {
3353 + hlist_for_each_entry_continue(node_vma_iter, cont_p,
3354 + hlist) {
3355 + if (node_vma_iter->last_update ==
3356 + ksm_scan_round) {
3357 + inc_dup_vma(rmap_item->slot,
3358 + node_vma_iter->slot);
3361 + hlist_add_before(&new_node_vma->hlist,
3362 + &node_vma->hlist);
3366 + node_vma = new_node_vma;
3368 +node_vma_ok: /* ok, ready to add to the list */
3369 + rmap_item->head = node_vma;
3370 + hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist);
3371 + node_vma->last_update = ksm_scan_round;
3372 + hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma);
3373 + rmap_item->slot->pages_merged++;
3377 + * We use break_ksm to break COW on a ksm page: it's a stripped down
3379 + * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
3380 + * put_page(page);
3382 + * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
3383 + * in case the application has unmapped and remapped mm,addr meanwhile.
3384 + * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
3385 + * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
3386 + */
3387 +static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
3389 + struct page *page;
3390 + int ret = 0;
3392 + do {
3393 + cond_resched();
3394 + page = follow_page(vma, addr, FOLL_GET);
3395 + if (IS_ERR_OR_NULL(page))
3396 + break;
3397 + if (PageKsm(page)) {
3398 + ret = handle_mm_fault(vma->vm_mm, vma, addr,
3399 + FAULT_FLAG_WRITE);
3400 + } else
3401 + ret = VM_FAULT_WRITE;
3402 + put_page(page);
3403 + } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
3404 + /*
3405 + * We must loop because handle_mm_fault() may back out if there's
3406 + * any difficulty e.g. if pte accessed bit gets updated concurrently.
3408 + * VM_FAULT_WRITE is what we have been hoping for: it indicates that
3409 + * COW has been broken, even if the vma does not permit VM_WRITE;
3410 + * but note that a concurrent fault might break PageKsm for us.
3412 + * VM_FAULT_SIGBUS could occur if we race with truncation of the
3413 + * backing file, which also invalidates anonymous pages: that's
3414 + * okay, that truncation will have unmapped the PageKsm for us.
3416 + * VM_FAULT_OOM: at the time of writing (late July 2009), setting
3417 + * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
3418 + * current task has TIF_MEMDIE set, and will be OOM killed on return
3419 + * to user; and ksmd, having no mm, would never be chosen for that.
3421 + * But if the mm is in a limited mem_cgroup, then the fault may fail
3422 + * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
3423 + * even ksmd can fail in this way - though it's usually breaking ksm
3424 + * just to undo a merge it made a moment before, so unlikely to oom.
3426 + * That's a pity: we might therefore have more kernel pages allocated
3427 + * than we're counting as nodes in the stable tree; but ksm_do_scan
3428 + * will retry to break_cow on each pass, so should recover the page
3429 + * in due course. The important thing is to not let VM_MERGEABLE
3430 + * be cleared while any such pages might remain in the area.
3431 + */
3432 + return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
3435 +static void break_cow(struct rmap_item *rmap_item)
3437 + struct vm_area_struct *vma = rmap_item->slot->vma;
3438 + struct mm_struct *mm = vma->vm_mm;
3439 + unsigned long addr = get_rmap_addr(rmap_item);
3441 + if (ksm_test_exit(mm))
3442 + goto out;
3444 + break_ksm(vma, addr);
3445 +out:
3446 + return;
3450 + * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
3451 + * than check every pte of a given vma, the locking doesn't quite work for
3452 + * that - an rmap_item is assigned to the stable tree after inserting ksm
3453 + * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
3454 + * rmap_items from parent to child at fork time (so as not to waste time
3455 + * if exit comes before the next scan reaches it).
3457 + * Similarly, although we'd like to remove rmap_items (so updating counts
3458 + * and freeing memory) when unmerging an area, it's easier to leave that
3459 + * to the next pass of ksmd - consider, for example, how ksmd might be
3460 + * in cmp_and_merge_page on one of the rmap_items we would be removing.
3461 + */
3462 +inline int unmerge_ksm_pages(struct vm_area_struct *vma,
3463 + unsigned long start, unsigned long end)
3465 + unsigned long addr;
3466 + int err = 0;
3468 + for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
3469 + if (ksm_test_exit(vma->vm_mm))
3470 + break;
3471 + if (signal_pending(current))
3472 + err = -ERESTARTSYS;
3473 + else
3474 + err = break_ksm(vma, addr);
3476 + return err;
3479 +static inline void inc_ksm_pages_scanned(void)
3481 + u64 delta;
3484 + if (ksm_pages_scanned == U64_MAX) {
3485 + encode_benefit();
3487 + delta = ksm_pages_scanned >> pages_scanned_base;
3489 + if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) {
3490 + pages_scanned_stored >>= 1;
3491 + delta >>= 1;
3492 + pages_scanned_base++;
3495 + pages_scanned_stored += delta;
3497 + ksm_pages_scanned = ksm_pages_scanned_last = 0;
3500 + ksm_pages_scanned++;
3503 +static inline int find_zero_page_hash(int strength, u32 hash)
3505 + return (zero_hash_table[strength] == hash);
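Why a single table lookup suffices: for an all-zero page every sampled word is 0, so the sampled hash depends only on the number of mixing steps, not on which words were picked. The table of "zero hashes", one per strength, can therefore be precomputed once, as in this standalone sketch (constants as above; the real table also covers strengths beyond one full page pass). A match is only a hint; the caller still verifies with is_page_full_zero() before remapping.

#include <stdint.h>

#define WORDS_PER_PAGE 1024
#define SHIFTL 8
#define SHIFTR 12

static uint32_t zero_hash_by_strength[WORDS_PER_PAGE + 1];

static void build_zero_hash_table(void)
{
	uint32_t hash = 0xdeadbeef;

	zero_hash_by_strength[0] = hash;
	for (int s = 1; s <= WORDS_PER_PAGE; s++) {
		/* the sampled word of a zero page is always 0, so only mix */
		hash += hash << SHIFTL;
		hash ^= hash >> SHIFTR;
		zero_hash_by_strength[s] = hash;
	}
}

static int hash_matches_zero_page(int strength, uint32_t hash)
{
	return zero_hash_by_strength[strength] == hash;
}

int main(void)
{
	build_zero_hash_table();
	/* strength 0 performs no mixing, so only the seed matches */
	return hash_matches_zero_page(0, 0xdeadbeef) ? 0 : 1;
}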
3508 +static
3509 +int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page)
3511 + struct page *zero_page = empty_ksm_zero_page;
3512 + struct mm_struct *mm = vma->vm_mm;
3513 + pte_t orig_pte = __pte(0);
3514 + int err = -EFAULT;
3516 + if (ksm_test_exit(mm))
3517 + goto out;
3519 + if (PageTransCompound(page) && page_trans_compound_anon_split(page))
3520 + goto out;
3521 + BUG_ON(PageTransCompound(page));
3523 + if (!PageAnon(page))
3524 + goto out;
3526 + if (!trylock_page(page))
3527 + goto out;
3529 + if (write_protect_page(vma, page, &orig_pte, 0) == 0) {
3530 + if (is_page_full_zero(page))
3531 + err = replace_page(vma, page, zero_page, orig_pte);
3534 + unlock_page(page);
3535 +out:
3536 + return err;
3540 + * cmp_and_merge_page() - first see if page can be merged into the stable
3541 + * tree; if not, hash it and see if it can be inserted into the unstable
3542 + * tree, or merged with a page already there and both
3543 + * transferred to the stable tree.
3545 + * @page: the page we are searching for an identical page to.
3546 + * @rmap_item: the reverse mapping into the virtual address of this page
3547 + */
3548 +static void cmp_and_merge_page(struct rmap_item *rmap_item)
3550 + struct rmap_item *tree_rmap_item;
3551 + struct page *page;
3552 + struct page *kpage = NULL;
3553 + u32 hash, hash_max;
3554 + int err;
3555 + unsigned int success1, success2;
3556 + struct stable_node *snode;
3557 + int cmp;
3558 + struct rb_node *parent = NULL, **new;
3560 + remove_rmap_item_from_tree(rmap_item);
3561 + inc_ksm_pages_scanned();
3563 + page = rmap_item->page;
3565 + hash = page_hash(page, hash_strength, 1);
3567 +	/* if the page content is all zero, re-map it to the zero page */
3568 + if (find_zero_page_hash(hash_strength, hash)) {
3569 + if (!cmp_and_merge_zero_page(rmap_item->slot->vma, page)) {
3570 + __inc_zone_page_state(page, NR_KSM_ZERO_PAGES);
3571 + return ;
3572 + } else {
3573 + inc_rshash_neg(memcmp_cost / 2);
3576 + //ksm_pages_scanned++;
3578 + /* We first start with searching the page inside the stable tree */
3579 + kpage = stable_tree_search(rmap_item, hash);
3580 + if (kpage) {
3581 + err = try_to_merge_with_ksm_page(rmap_item, kpage,
3582 + hash);
3583 + if (!err) {
3584 + /*
3585 + * The page was successfully merged, add
3586 + * its rmap_item to the stable tree.
3587 + * page lock is needed because it's
3588 + * racing with try_to_unmap_ksm(), etc.
3589 + */
3590 + lock_page(kpage);
3591 + stable_tree_append(rmap_item, page_stable_node(kpage));
3592 + unlock_page(kpage);
3593 + put_page(kpage);
3594 + return; /* success */
3596 + put_page(kpage);
3598 + /*
3599 +		 * if it's a collision and the sub-rbtree has already been searched
3600 +		 * (hash_max != 0), we want to abort, because even if it were
3601 +		 * successfully merged in the unstable tree, the collision would tend
3602 +		 * to happen again.
3603 + */
3604 + if (err == MERGE_ERR_COLLI && rmap_item->hash_max)
3605 + return;
3608 + tree_rmap_item =
3609 + unstable_tree_search_insert(rmap_item, hash);
3610 + if (tree_rmap_item) {
3611 + err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash);
3612 + /*
3613 + * As soon as we merge this page, we want to remove the
3614 + * rmap_item of the page we have merged with from the unstable
3615 + * tree, and insert it instead as new node in the stable tree.
3616 + */
3617 + if (!err) {
3618 + kpage = page;
3619 + remove_rmap_item_from_tree(tree_rmap_item);
3620 + lock_page(kpage);
3621 + snode = stable_tree_insert(&kpage, hash,
3622 + rmap_item, tree_rmap_item,
3623 + &success1, &success2);
3625 + if (success1)
3626 + stable_tree_append(rmap_item, snode);
3627 + else
3628 + break_cow(rmap_item);
3630 + if (success2)
3631 + stable_tree_append(tree_rmap_item, snode);
3632 + else
3633 + break_cow(tree_rmap_item);
3635 + /*
3636 + * The original kpage may be unlocked inside
3637 + * stable_tree_insert() already.
3638 + */
3639 + unlock_page(kpage);
3641 + } else if (err == MERGE_ERR_COLLI) {
3642 + if (tree_rmap_item->tree_node->count == 1) {
3643 + rmap_item_hash_max(tree_rmap_item,
3644 + tree_rmap_item->tree_node->hash);
3645 + } else
3646 + BUG_ON(!(tree_rmap_item->hash_max));
3648 + hash_max = rmap_item_hash_max(rmap_item, hash);
3649 + cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
3650 + parent = &tree_rmap_item->node;
3651 + if (cmp < 0)
3652 + new = &parent->rb_left;
3653 + else if (cmp > 0)
3654 + new = &parent->rb_right;
3655 + else
3656 + goto put_up_out;
3658 + rmap_item->tree_node = tree_rmap_item->tree_node;
3659 + rmap_item->address |= UNSTABLE_FLAG;
3660 + rmap_item->append_round = ksm_scan_round;
3661 + rb_link_node(&rmap_item->node, parent, new);
3662 + rb_insert_color(&rmap_item->node,
3663 + &tree_rmap_item->tree_node->sub_root);
3664 + rmap_item->tree_node->count++;
3666 +put_up_out:
3667 + put_page(tree_rmap_item->page);
3668 + up_read(&tree_rmap_item->slot->vma->vm_mm->mmap_sem);
3675 +static inline unsigned long get_pool_index(struct vma_slot *slot,
3676 + unsigned long index)
3678 + unsigned long pool_index;
3680 + pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT;
3681 + if (pool_index >= slot->pool_size)
3682 + BUG();
3683 + return pool_index;
3686 +static inline unsigned long index_page_offset(unsigned long index)
3688 + return offset_in_page(sizeof(struct rmap_list_entry *) * index);
3691 +static inline
3692 +struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot,
3693 + unsigned long index, int need_alloc)
3695 + unsigned long pool_index;
3696 + void *addr;
3699 + pool_index = get_pool_index(slot, index);
3700 + if (!slot->rmap_list_pool[pool_index]) {
3701 + if (!need_alloc)
3702 + return NULL;
3704 + slot->rmap_list_pool[pool_index] =
3705 + alloc_page(GFP_KERNEL | __GFP_ZERO);
3706 + BUG_ON(!slot->rmap_list_pool[pool_index]);
3709 + addr = kmap(slot->rmap_list_pool[pool_index]);
3710 + addr += index_page_offset(index);
3712 + return addr;
3715 +static inline void put_rmap_list_entry(struct vma_slot *slot,
3716 + unsigned long index)
3718 + unsigned long pool_index;
3720 + pool_index = get_pool_index(slot, index);
3721 + BUG_ON(!slot->rmap_list_pool[pool_index]);
3722 + kunmap(slot->rmap_list_pool[pool_index]);
3725 +static inline int entry_is_new(struct rmap_list_entry *entry)
3727 + return !entry->item;
3730 +static inline unsigned long get_index_orig_addr(struct vma_slot *slot,
3731 + unsigned long index)
3733 + return slot->vma->vm_start + (index << PAGE_SHIFT);
3736 +static inline unsigned long get_entry_address(struct rmap_list_entry *entry)
3738 + unsigned long addr;
3740 + if (is_addr(entry->addr))
3741 + addr = get_clean_addr(entry->addr);
3742 + else if (entry->item)
3743 + addr = get_rmap_addr(entry->item);
3744 + else
3745 + BUG();
3747 + return addr;
3750 +static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry)
3752 + if (is_addr(entry->addr))
3753 + return NULL;
3755 + return entry->item;
3758 +static inline void inc_rmap_list_pool_count(struct vma_slot *slot,
3759 + unsigned long index)
3761 + unsigned long pool_index;
3763 + pool_index = get_pool_index(slot, index);
3764 + BUG_ON(!slot->rmap_list_pool[pool_index]);
3765 + slot->pool_counts[pool_index]++;
3768 +static inline void dec_rmap_list_pool_count(struct vma_slot *slot,
3769 + unsigned long index)
3771 + unsigned long pool_index;
3773 + pool_index = get_pool_index(slot, index);
3774 + BUG_ON(!slot->rmap_list_pool[pool_index]);
3775 + BUG_ON(!slot->pool_counts[pool_index]);
3776 + slot->pool_counts[pool_index]--;
3779 +static inline int entry_has_rmap(struct rmap_list_entry *entry)
3781 + return !is_addr(entry->addr) && entry->item;
3784 +static inline void swap_entries(struct rmap_list_entry *entry1,
3785 + unsigned long index1,
3786 + struct rmap_list_entry *entry2,
3787 + unsigned long index2)
3789 + struct rmap_list_entry tmp;
3791 + /* swapping two new entries is meaningless */
3792 + BUG_ON(entry_is_new(entry1) && entry_is_new(entry2));
3794 + tmp = *entry1;
3795 + *entry1 = *entry2;
3796 + *entry2 = tmp;
3798 + if (entry_has_rmap(entry1))
3799 + entry1->item->entry_index = index1;
3801 + if (entry_has_rmap(entry2))
3802 + entry2->item->entry_index = index2;
3804 + if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) {
3805 + inc_rmap_list_pool_count(entry1->item->slot, index1);
3806 + dec_rmap_list_pool_count(entry1->item->slot, index2);
3807 + } else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) {
3808 + inc_rmap_list_pool_count(entry2->item->slot, index2);
3809 + dec_rmap_list_pool_count(entry2->item->slot, index1);
3813 +static inline void free_entry_item(struct rmap_list_entry *entry)
3815 + unsigned long index;
3816 + struct rmap_item *item;
3818 + if (!is_addr(entry->addr)) {
3819 + BUG_ON(!entry->item);
3820 + item = entry->item;
3821 + entry->addr = get_rmap_addr(item);
3822 + set_is_addr(entry->addr);
3823 + index = item->entry_index;
3824 + remove_rmap_item_from_tree(item);
3825 + dec_rmap_list_pool_count(item->slot, index);
3826 + free_rmap_item(item);
3830 +static inline int pool_entry_boundary(unsigned long index)
3832 + unsigned long linear_addr;
3834 + linear_addr = sizeof(struct rmap_list_entry *) * index;
3835 + return index && !offset_in_page(linear_addr);
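+/*
+ * Free the pool page containing @index if it no longer holds any live
+ * rmap_item, and mark the list as needing a re-sort.
+ */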
3838 +static inline void try_free_last_pool(struct vma_slot *slot,
3839 + unsigned long index)
3841 + unsigned long pool_index;
3843 + pool_index = get_pool_index(slot, index);
3844 + if (slot->rmap_list_pool[pool_index] &&
3845 + !slot->pool_counts[pool_index]) {
3846 + __free_page(slot->rmap_list_pool[pool_index]);
3847 + slot->rmap_list_pool[pool_index] = NULL;
3848 + slot->need_sort = 1;
3853 +static inline unsigned long vma_item_index(struct vm_area_struct *vma,
3854 + struct rmap_item *item)
3856 + return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT;
3859 +static int within_same_pool(struct vma_slot *slot,
3860 + unsigned long i, unsigned long j)
3862 + unsigned long pool_i, pool_j;
3864 + pool_i = get_pool_index(slot, i);
3865 + pool_j = get_pool_index(slot, j);
3867 + return (pool_i == pool_j);
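+/*
+ * Re-sort the rmap_list so that each entry ends up at the slot matching its
+ * page index in the vma, then free pool pages that hold no rmap_item at all.
+ */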
3870 +static void sort_rmap_entry_list(struct vma_slot *slot)
3872 + unsigned long i, j;
3873 + struct rmap_list_entry *entry, *swap_entry;
3875 + entry = get_rmap_list_entry(slot, 0, 0);
3876 + for (i = 0; i < slot->pages; ) {
3878 + if (!entry)
3879 + goto skip_whole_pool;
3881 + if (entry_is_new(entry))
3882 + goto next_entry;
3884 + if (is_addr(entry->addr)) {
3885 + entry->addr = 0;
3886 + goto next_entry;
3889 + j = vma_item_index(slot->vma, entry->item);
3890 + if (j == i)
3891 + goto next_entry;
3893 + if (within_same_pool(slot, i, j))
3894 + swap_entry = entry + j - i;
3895 + else
3896 + swap_entry = get_rmap_list_entry(slot, j, 1);
3898 + swap_entries(entry, i, swap_entry, j);
3899 + if (!within_same_pool(slot, i, j))
3900 + put_rmap_list_entry(slot, j);
3901 + continue;
3903 +skip_whole_pool:
3904 + i += PAGE_SIZE / sizeof(*entry);
3905 + if (i < slot->pages)
3906 + entry = get_rmap_list_entry(slot, i, 0);
3907 + continue;
3909 +next_entry:
3910 + if (i >= slot->pages - 1 ||
3911 + !within_same_pool(slot, i, i + 1)) {
3912 + put_rmap_list_entry(slot, i);
3913 + if (i + 1 < slot->pages)
3914 + entry = get_rmap_list_entry(slot, i + 1, 0);
3915 + } else
3916 + entry++;
3917 + i++;
3918 + continue;
3921 + /* free pool pages which contain no rmap_item */
3922 + /* TODO: can be simplified to rely only on pool_counts once known to be bug-free */
3923 + for (i = 0; i < slot->pool_size; i++) {
3924 + unsigned char has_rmap;
3925 + void *addr;
3927 + if (!slot->rmap_list_pool[i])
3928 + continue;
3930 + has_rmap = 0;
3931 + addr = kmap(slot->rmap_list_pool[i]);
3932 + BUG_ON(!addr);
3933 + for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
3934 + entry = (struct rmap_list_entry *)addr + j;
3935 + if (is_addr(entry->addr))
3936 + continue;
3937 + if (!entry->item)
3938 + continue;
3939 + has_rmap = 1;
3941 + kunmap(slot->rmap_list_pool[i]);
3942 + if (!has_rmap) {
3943 + BUG_ON(slot->pool_counts[i]);
3944 + __free_page(slot->rmap_list_pool[i]);
3945 + slot->rmap_list_pool[i] = NULL;
3949 + slot->need_sort = 0;
3953 + * vma_fully_scanned() - return whether all the pages in this slot have been scanned.
3954 + */
3955 +static inline int vma_fully_scanned(struct vma_slot *slot)
3957 + return slot->pages_scanned && !(slot->pages_scanned % slot->pages);
3960 +/**
3961 + * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to
3962 + * its random permutation. The random permutation index management code
3963 + * is embedded in this function.
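+ *
+ * Each call takes the entry at the current scan index and, when the slot
+ * needs re-randomizing, swaps it with a random entry from the not-yet-scanned
+ * tail, i.e. an incremental Fisher-Yates shuffle of the scan order.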
3964 + */
3965 +static struct rmap_item *get_next_rmap_item(struct vma_slot *slot)
3967 + unsigned long rand_range, addr, swap_index, scan_index;
3968 + struct rmap_item *item = NULL;
3969 + struct rmap_list_entry *scan_entry, *swap_entry = NULL;
3970 + struct page *page;
3972 + scan_index = swap_index = slot->pages_scanned % slot->pages;
3974 + if (pool_entry_boundary(scan_index))
3975 + try_free_last_pool(slot, scan_index - 1);
3977 + if (vma_fully_scanned(slot)) {
3978 + slot->need_rerand = slot->need_sort;
3979 + if (slot->need_sort)
3980 + sort_rmap_entry_list(slot);
3983 + scan_entry = get_rmap_list_entry(slot, scan_index, 1);
3984 + if (entry_is_new(scan_entry)) {
3985 + scan_entry->addr = get_index_orig_addr(slot, scan_index);
3986 + set_is_addr(scan_entry->addr);
3989 + if (slot->need_rerand) {
3990 + rand_range = slot->pages - scan_index;
3991 + BUG_ON(!rand_range);
3992 + swap_index = scan_index + (random32() % rand_range);
3995 + if (swap_index != scan_index) {
3996 + swap_entry = get_rmap_list_entry(slot, swap_index, 1);
3997 + if (entry_is_new(swap_entry)) {
3998 + swap_entry->addr = get_index_orig_addr(slot,
3999 + swap_index);
4000 + set_is_addr(swap_entry->addr);
4002 + swap_entries(scan_entry, scan_index, swap_entry, swap_index);
4005 + addr = get_entry_address(scan_entry);
4006 + item = get_entry_item(scan_entry);
4007 + BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start);
4009 + page = follow_page(slot->vma, addr, FOLL_GET);
4010 + if (IS_ERR_OR_NULL(page))
4011 + goto nopage;
4013 + if (!PageAnon(page) && !page_trans_compound_anon(page))
4014 + goto putpage;
4016 + /* check whether this is the zero_page pfn or the ksm_zero_page */
4017 + if ((page_to_pfn(page) == zero_pfn)
4018 + || (page_to_pfn(page) == ksm_zero_pfn))
4019 + goto putpage;
4021 + flush_anon_page(slot->vma, page, addr);
4022 + flush_dcache_page(page);
4024 + if (!item) {
4025 + item = alloc_rmap_item();
4026 + if (item) {
4027 + /* It has already been zeroed */
4028 + item->slot = slot;
4029 + item->address = addr;
4030 + item->entry_index = scan_index;
4031 + scan_entry->item = item;
4032 + inc_rmap_list_pool_count(slot, scan_index);
4033 + } else
4034 + goto putpage;
4037 + BUG_ON(item->slot != slot);
4038 + /* the page may have changed */
4039 + item->page = page;
4040 + put_rmap_list_entry(slot, scan_index);
4041 + if (swap_entry)
4042 + put_rmap_list_entry(slot, swap_index);
4043 + return item;
4045 +putpage:
4046 + put_page(page);
4047 + page = NULL;
4048 +nopage:
4049 + /* no page, store addr back and free rmap_item if possible */
4050 + free_entry_item(scan_entry);
4051 + put_rmap_list_entry(slot, scan_index);
4052 + if (swap_entry)
4053 + put_rmap_list_entry(slot, swap_index);
4054 + return NULL;
4057 +static inline int in_stable_tree(struct rmap_item *rmap_item)
4059 + return rmap_item->address & STABLE_FLAG;
4062 +/**
4063 + * scan_vma_one_page() - scan the next page in a vma_slot. Called with
4064 + * mmap_sem locked.
4065 + */
4066 +static void scan_vma_one_page(struct vma_slot *slot)
4068 + struct mm_struct *mm;
4069 + struct rmap_item *rmap_item = NULL;
4070 + struct vm_area_struct *vma = slot->vma;
4072 + mm = vma->vm_mm;
4073 + BUG_ON(!mm);
4074 + BUG_ON(!slot);
4076 + rmap_item = get_next_rmap_item(slot);
4077 + if (!rmap_item)
4078 + goto out1;
4080 + if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item))
4081 + goto out2;
4083 + cmp_and_merge_page(rmap_item);
4084 +out2:
4085 + put_page(rmap_item->page);
4086 +out1:
4087 + slot->pages_scanned++;
4088 + slot->slot_scanned = 1;
4089 + if (vma_fully_scanned(slot)) {
4090 + slot->fully_scanned = 1;
4091 + slot->rung->fully_scanned_slots++;
4092 + BUG_ON(!slot->rung->fully_scanned_slots);
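+/*
+ * Pages to scan for a slot in one round: the slot size scaled by the rung's
+ * scan_ratio out of KSM_SCAN_RATIO_MAX.
+ */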
4096 +static unsigned long get_vma_random_scan_num(struct vma_slot *slot,
4097 + unsigned long scan_ratio)
4099 + return slot->pages * scan_ratio / KSM_SCAN_RATIO_MAX;
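+/*
+ * Move a vma_slot out of its current rung and into @rung, or into the next
+ * higher rung that yields a non-zero scan quota for this slot.
+ */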
4102 +static inline void vma_rung_enter(struct vma_slot *slot,
4103 + struct scan_rung *rung)
4105 + unsigned long pages_to_scan;
4106 + struct scan_rung *old_rung = slot->rung;
4108 + /* leave the old rung it was in */
4109 + BUG_ON(list_empty(&slot->ksm_list));
4111 + if (old_rung->current_scan == &slot->ksm_list)
4112 + old_rung->current_scan = slot->ksm_list.next;
4113 + list_del_init(&slot->ksm_list);
4114 + old_rung->vma_num--;
4115 + if (slot->fully_scanned)
4116 + old_rung->fully_scanned_slots--;
4118 + if (old_rung->current_scan == &old_rung->vma_list) {
4119 + /* This rung finishes a round */
4120 + old_rung->round_finished = 1;
4121 + old_rung->current_scan = old_rung->vma_list.next;
4122 + BUG_ON(old_rung->current_scan == &old_rung->vma_list &&
4123 + !list_empty(&old_rung->vma_list));
4126 + /* enter the new rung */
4127 + while (!(pages_to_scan =
4128 + get_vma_random_scan_num(slot, rung->scan_ratio))) {
4129 + rung++;
4130 + BUG_ON(rung > &ksm_scan_ladder[ksm_scan_ladder_size - 1]);
4132 + if (list_empty(&rung->vma_list))
4133 + rung->current_scan = &slot->ksm_list;
4134 + list_add(&slot->ksm_list, &rung->vma_list);
4135 + slot->rung = rung;
4136 + slot->pages_to_scan = pages_to_scan;
4137 + slot->rung->vma_num++;
4138 + if (slot->fully_scanned)
4139 + rung->fully_scanned_slots++;
4141 + BUG_ON(rung->current_scan == &rung->vma_list &&
4142 + !list_empty(&rung->vma_list));
4145 +static inline void vma_rung_up(struct vma_slot *slot)
4147 + if (slot->rung == &ksm_scan_ladder[ksm_scan_ladder_size-1])
4148 + return;
4150 + vma_rung_enter(slot, slot->rung + 1);
4153 +static inline void vma_rung_down(struct vma_slot *slot)
4155 + if (slot->rung == &ksm_scan_ladder[0])
4156 + return;
4158 + vma_rung_enter(slot, slot->rung - 1);
4161 +/**
4162 + * cal_dedup_ratio() - Calculate the deduplication ratio for this slot.
4163 + */
4164 +static unsigned long cal_dedup_ratio(struct vma_slot *slot)
4166 + struct vma_slot *slot2;
4167 + void **dup_slot;
4168 + unsigned long dup_pages;
4169 + unsigned long dedup_num, pages1, scanned1;
4170 + unsigned long ret;
4171 + int i;
4173 + if (!slot->pages_scanned)
4174 + return 0;
4176 + pages1 = slot->pages;
4177 + scanned1 = slot->pages_scanned - slot->last_scanned;
4178 + BUG_ON(scanned1 > slot->pages_scanned);
4180 + for (i = slot->ksm_index; i < ksm_vma_tree_index_end; i++) {
4181 + unsigned long pages2, scanned2;
4183 + dup_slot = radix_tree_lookup_slot(&slot->dup_tree, i);
4184 + if (!dup_slot)
4185 + continue;
4187 + dup_pages = (unsigned long)(*dup_slot) >> INDIRECT_OFFSET;
4189 + slot2 = radix_tree_lookup(&ksm_vma_tree, i);
4190 + BUG_ON(!slot2 || !slot2->pages_scanned);
4192 + pages2 = slot2->pages;
4193 + scanned2 = slot2->pages_scanned - slot2->last_scanned;
4194 + BUG_ON(scanned2 > slot2->pages_scanned);
4196 + BUG_ON(!scanned1 || !scanned2);
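+ /*
+ * dup_pages was sampled from the pages scanned this round; scale it by
+ * pages/scanned for both slots to estimate the duplication over the
+ * whole of the two vmas.
+ */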
4198 + dedup_num = dup_pages * pages1 / scanned1 * pages2 / scanned2;
4199 + slot->dedup_num += dedup_num;
4200 + slot2->dedup_num += dedup_num;
4203 + ret = (slot->dedup_num * KSM_DEDUP_RATIO_SCALE / pages1);
4205 + /* Thrashing area filtering */
4206 + if (ksm_thrash_threshold) {
4207 + if (slot->pages_cowed * 100 / slot->pages_merged
4208 + > ksm_thrash_threshold) {
4209 + ret = 0;
4210 + } else {
4211 + ret = ret * (slot->pages_merged - slot->pages_cowed)
4212 + / slot->pages_merged;
4216 + return ret;
4220 +/**
4221 + * stable_node_reinsert() - When the hash_strength has been adjusted, the
4222 + * stable tree needs to be restructured; this is the function that
4223 + * re-inserts a stable node.
4224 + */
4225 +static inline void stable_node_reinsert(struct stable_node *new_node,
4226 + struct page *page,
4227 + struct rb_root *root_treep,
4228 + struct list_head *tree_node_listp,
4229 + u32 hash)
4231 + struct rb_node **new = &root_treep->rb_node;
4232 + struct rb_node *parent = NULL;
4233 + struct stable_node *stable_node;
4234 + struct tree_node *tree_node;
4235 + struct page *tree_page;
4236 + int cmp;
4238 + while (*new) {
4239 + int cmp;
4241 + tree_node = rb_entry(*new, struct tree_node, node);
4243 + cmp = hash_cmp(hash, tree_node->hash);
4245 + if (cmp < 0) {
4246 + parent = *new;
4247 + new = &parent->rb_left;
4248 + } else if (cmp > 0) {
4249 + parent = *new;
4250 + new = &parent->rb_right;
4251 + } else
4252 + break;
4255 + if (*new) {
4256 + /* find a stable tree node with the same first-level hash value */
4257 + stable_node_hash_max(new_node, page, hash);
4258 + if (tree_node->count == 1) {
4259 + stable_node = rb_entry(tree_node->sub_root.rb_node,
4260 + struct stable_node, node);
4261 + tree_page = get_ksm_page(stable_node, 1, 0);
4262 + if (tree_page) {
4263 + stable_node_hash_max(stable_node,
4264 + tree_page, hash);
4265 + put_page(tree_page);
4267 + /* prepare for stable node insertion */
4269 + cmp = hash_cmp(new_node->hash_max,
4270 + stable_node->hash_max);
4271 + parent = &stable_node->node;
4272 + if (cmp < 0)
4273 + new = &parent->rb_left;
4274 + else if (cmp > 0)
4275 + new = &parent->rb_right;
4276 + else
4277 + goto failed;
4279 + goto add_node;
4280 + } else {
4281 + /* the only stable_node was deleted, but the
4282 + * tree node was not.
4283 + */
4284 + goto tree_node_reuse;
4288 + /* well, search the collision subtree */
4289 + new = &tree_node->sub_root.rb_node;
4290 + parent = NULL;
4291 + BUG_ON(!*new);
4292 + while (*new) {
4293 + int cmp;
4295 + stable_node = rb_entry(*new, struct stable_node, node);
4297 + cmp = hash_cmp(new_node->hash_max,
4298 + stable_node->hash_max);
4300 + if (cmp < 0) {
4301 + parent = *new;
4302 + new = &parent->rb_left;
4303 + } else if (cmp > 0) {
4304 + parent = *new;
4305 + new = &parent->rb_right;
4306 + } else {
4307 + /* oh, no, still a collision */
4308 + goto failed;
4312 + goto add_node;
4315 + /* no tree node found */
4316 + tree_node = alloc_tree_node(tree_node_listp);
4317 + if (!tree_node) {
4318 + printk(KERN_ERR "UKSM: memory allocation error!\n");
4319 + goto failed;
4320 + } else {
4321 + tree_node->hash = hash;
4322 + rb_link_node(&tree_node->node, parent, new);
4323 + rb_insert_color(&tree_node->node, root_treep);
4325 +tree_node_reuse:
4326 + /* prepare for stable node insertion */
4327 + parent = NULL;
4328 + new = &tree_node->sub_root.rb_node;
4331 +add_node:
4332 + rb_link_node(&new_node->node, parent, new);
4333 + rb_insert_color(&new_node->node, &tree_node->sub_root);
4334 + new_node->tree_node = tree_node;
4335 + tree_node->count++;
4336 + return;
4338 +failed:
4339 + /* This can only happen when two nodes have collided
4340 + * on both hash levels.
4341 + */
4342 + new_node->tree_node = NULL;
4343 + return;
4346 +static inline void free_all_tree_nodes(struct list_head *list)
4348 + struct tree_node *node, *tmp;
4350 + list_for_each_entry_safe(node, tmp, list, all_list) {
4351 + free_tree_node(node);
4355 +/**
4356 + * stable_tree_delta_hash() - Delta hash the stable tree from previous hash
4357 + * strength to the current hash_strength. It re-structures the whole tree.
4358 + */
4359 +static inline void stable_tree_delta_hash(u32 prev_hash_strength)
4361 + struct stable_node *node, *tmp;
4362 + struct rb_root *root_new_treep;
4363 + struct list_head *new_tree_node_listp;
4365 + stable_tree_index = (stable_tree_index + 1) % 2;
4366 + root_new_treep = &root_stable_tree[stable_tree_index];
4367 + new_tree_node_listp = &stable_tree_node_list[stable_tree_index];
4368 + *root_new_treep = RB_ROOT;
4369 + BUG_ON(!list_empty(new_tree_node_listp));
4371 + /*
4372 + * We need to be safe; the node could be removed by get_ksm_page()
4373 + */
4374 + list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) {
4375 + void *addr;
4376 + struct page *node_page;
4377 + u32 hash;
4379 + /*
4380 + * We are completely re-structuring the stable nodes to a new
4381 + * stable tree. We don't want to touch the old tree unlinks and
4382 + * old tree_nodes. The old tree_nodes will be freed at once.
4383 + */
4384 + node_page = get_ksm_page(node, 0, 0);
4385 + if (!node_page)
4386 + continue;
4388 + if (node->tree_node) {
4389 + hash = node->tree_node->hash;
4391 + addr = kmap_atomic(node_page, KM_USER0);
4393 + hash = delta_hash(addr, prev_hash_strength,
4394 + hash_strength, hash);
4395 + kunmap_atomic(addr, KM_USER0);
4396 + } else {
4397 + /*
4398 + * It was not inserted into the rbtree due to a collision
4399 + * in the last scan round.
4400 + */
4401 + hash = page_hash(node_page, hash_strength, 0);
4404 + stable_node_reinsert(node, node_page, root_new_treep,
4405 + new_tree_node_listp, hash);
4406 + put_page(node_page);
4409 + root_stable_treep = root_new_treep;
4410 + free_all_tree_nodes(stable_tree_node_listp);
4411 + BUG_ON(!list_empty(stable_tree_node_listp));
4412 + stable_tree_node_listp = new_tree_node_listp;
4415 +static inline void inc_hash_strength(unsigned long delta)
4417 + hash_strength += 1 << delta;
4418 + if (hash_strength > HASH_STRENGTH_MAX)
4419 + hash_strength = HASH_STRENGTH_MAX;
4422 +static inline void dec_hash_strength(unsigned long delta)
4424 + unsigned long change = 1 << delta;
4426 + if (hash_strength <= change + 1)
4427 + hash_strength = 1;
4428 + else
4429 + hash_strength -= change;
4432 +static inline void inc_hash_strength_delta(void)
4434 + hash_strength_delta++;
4435 + if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX)
4436 + hash_strength_delta = HASH_STRENGTH_DELTA_MAX;
4440 +static inline unsigned long get_current_neg_ratio(void)
4442 + if (!rshash_pos || rshash_neg > rshash_pos)
4443 + return 100;
4445 + return div64_u64(100 * rshash_neg, rshash_pos);
4449 +static inline unsigned long get_current_neg_ratio(void)
4451 + u64 pos = benefit.pos;
4452 + u64 neg = benefit.neg;
4454 + if (!neg)
4455 + return 0;
4457 + if (!pos || neg > pos)
4458 + return 100;
4460 + if (neg > div64_u64(U64_MAX, 100))
4461 + pos = div64_u64(pos, 100);
4462 + else
4463 + neg *= 100;
4465 + return div64_u64(neg, pos);
4468 +static inline unsigned long get_current_benefit(void)
4470 + u64 pos = benefit.pos;
4471 + u64 neg = benefit.neg;
4472 + u64 scanned = benefit.scanned;
4474 + if (neg > pos)
4475 + return 0;
4477 + return div64_u64((pos - neg), scanned);
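+/*
+ * Decide whether the hash strength should go up, go down, or stay, based on
+ * the current negative ratio and on how much the benefit has drifted from
+ * the last stable point.
+ */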
4480 +static inline int judge_rshash_direction(void)
4482 + u64 current_neg_ratio, stable_benefit;
4483 + u64 current_benefit, delta = 0;
4484 + int ret = STILL;
4486 + /* In case the system has been still for a long time. */
4487 + if (ksm_scan_round % 1024 == 3) {
4488 + ret = OBSCURE;
4489 + goto out;
4492 + current_neg_ratio = get_current_neg_ratio();
4494 + if (current_neg_ratio == 0) {
4495 + rshash_neg_cont_zero++;
4496 + if (rshash_neg_cont_zero > 2)
4497 + return GO_DOWN;
4498 + else
4499 + return STILL;
4501 + rshash_neg_cont_zero = 0;
4503 + if (current_neg_ratio > 90) {
4504 + ret = GO_UP;
4505 goto out;
4508 - err = try_to_merge_one_page(vma, page, kpage);
4509 - if (err)
4510 + current_benefit = get_current_benefit();
4511 + stable_benefit = rshash_state.stable_benefit;
4513 + if (!stable_benefit) {
4514 + ret = OBSCURE;
4515 goto out;
4518 + if (current_benefit > stable_benefit)
4519 + delta = current_benefit - stable_benefit;
4520 + else if (current_benefit < stable_benefit)
4521 + delta = stable_benefit - current_benefit;
4523 + delta = div64_u64(100 * delta, stable_benefit);
4525 + if (delta > 50) {
4526 + rshash_cont_obscure++;
4527 + if (rshash_cont_obscure > 2)
4528 + return OBSCURE;
4529 + else
4530 + return STILL;
4533 - /* Must get reference to anon_vma while still holding mmap_sem */
4534 - hold_anon_vma(rmap_item, vma->anon_vma);
4535 out:
4536 - up_read(&mm->mmap_sem);
4537 - return err;
4538 + rshash_cont_obscure = 0;
4539 + return ret;
4543 - * try_to_merge_two_pages - take two identical pages and prepare them
4544 - * to be merged into one page.
4546 - * This function returns the kpage if we successfully merged two identical
4547 - * pages into one ksm page, NULL otherwise.
4549 - * Note that this function upgrades page to ksm page: if one of the pages
4550 - * is already a ksm page, try_to_merge_with_ksm_page should be used.
4551 +/**
4552 + * rshash_adjust() - The main function to control the random sampling state
4553 + * machine for adapting the hash strength.
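+ *
+ * The state machine cycles through STILL, TRYDOWN, TRYUP and PRE_STILL: from
+ * a stable point it probes lower and then higher hash strengths and settles
+ * on whichever turning point gave the larger benefit.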
4555 -static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
4556 - struct page *page,
4557 - struct rmap_item *tree_rmap_item,
4558 - struct page *tree_page)
4559 +static void rshash_adjust(void)
4561 - int err;
4562 + unsigned long prev_hash_strength = hash_strength;
4564 - err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
4565 - if (!err) {
4566 - err = try_to_merge_with_ksm_page(tree_rmap_item,
4567 - tree_page, page);
4568 - /*
4569 - * If that fails, we have a ksm page with only one pte
4570 - * pointing to it: so break it.
4571 - */
4572 - if (err)
4573 - break_cow(rmap_item);
4575 - return err ? NULL : page;
4577 + if (ksm_pages_scanned == ksm_pages_scanned_last)
4578 + return;
4581 - * stable_tree_search - search for page inside the stable tree
4583 - * This function checks if there is a page inside the stable tree
4584 - * with identical content to the page that we are scanning right now.
4586 - * This function returns the stable tree node of identical content if found,
4587 - * NULL otherwise.
4588 - */
4589 -static struct page *stable_tree_search(struct page *page)
4591 - struct rb_node *node = root_stable_tree.rb_node;
4592 - struct stable_node *stable_node;
4593 + encode_benefit();
4595 - stable_node = page_stable_node(page);
4596 - if (stable_node) { /* ksm page forked */
4597 - get_page(page);
4598 - return page;
4599 + switch (rshash_state.state) {
4600 + case RSHASH_STILL:
4601 + switch (judge_rshash_direction()) {
4602 + case GO_UP:
4603 + if (rshash_state.pre_direct == GO_DOWN)
4604 + hash_strength_delta = 0;
4606 + inc_hash_strength(hash_strength_delta);
4607 + inc_hash_strength_delta();
4608 + rshash_state.stable_benefit = get_current_benefit();
4609 + rshash_state.pre_direct = GO_UP;
4610 + break;
4612 + case GO_DOWN:
4613 + if (rshash_state.pre_direct == GO_UP)
4614 + hash_strength_delta = 0;
4616 + dec_hash_strength(hash_strength_delta);
4617 + inc_hash_strength_delta();
4618 + rshash_state.stable_benefit = get_current_benefit();
4619 + rshash_state.pre_direct = GO_DOWN;
4620 + break;
4622 + case OBSCURE:
4623 + rshash_state.stable_point = hash_strength;
4624 + rshash_state.turn_point_down = hash_strength;
4625 + rshash_state.turn_point_up = hash_strength;
4626 + rshash_state.turn_benefit_down = get_current_benefit();
4627 + rshash_state.turn_benefit_up = get_current_benefit();
4628 + rshash_state.lookup_window_index = 0;
4629 + rshash_state.state = RSHASH_TRYDOWN;
4630 + dec_hash_strength(hash_strength_delta);
4631 + inc_hash_strength_delta();
4632 + break;
4634 + case STILL:
4635 + break;
4636 + default:
4637 + BUG();
4639 + break;
4641 + case RSHASH_TRYDOWN:
4642 + if (rshash_state.lookup_window_index++ % 5 == 0)
4643 + rshash_state.below_count = 0;
4645 + if (get_current_benefit() < rshash_state.stable_benefit)
4646 + rshash_state.below_count++;
4647 + else if (get_current_benefit() >
4648 + rshash_state.turn_benefit_down) {
4649 + rshash_state.turn_point_down = hash_strength;
4650 + rshash_state.turn_benefit_down = get_current_benefit();
4653 + if (rshash_state.below_count >= 3 ||
4654 + judge_rshash_direction() == GO_UP ||
4655 + hash_strength == 1) {
4656 + hash_strength = rshash_state.stable_point;
4657 + hash_strength_delta = 0;
4658 + inc_hash_strength(hash_strength_delta);
4659 + inc_hash_strength_delta();
4660 + rshash_state.lookup_window_index = 0;
4661 + rshash_state.state = RSHASH_TRYUP;
4662 + hash_strength_delta = 0;
4663 + } else {
4664 + dec_hash_strength(hash_strength_delta);
4665 + inc_hash_strength_delta();
4667 + break;
4669 + case RSHASH_TRYUP:
4670 + if (rshash_state.lookup_window_index++ % 5 == 0)
4671 + rshash_state.below_count = 0;
4673 + if (get_current_benefit() < rshash_state.turn_benefit_down)
4674 + rshash_state.below_count++;
4675 + else if (get_current_benefit() > rshash_state.turn_benefit_up) {
4676 + rshash_state.turn_point_up = hash_strength;
4677 + rshash_state.turn_benefit_up = get_current_benefit();
4680 + if (rshash_state.below_count >= 3 ||
4681 + judge_rshash_direction() == GO_DOWN ||
4682 + hash_strength == HASH_STRENGTH_MAX) {
4683 + hash_strength = rshash_state.turn_benefit_up >
4684 + rshash_state.turn_benefit_down ?
4685 + rshash_state.turn_point_up :
4686 + rshash_state.turn_point_down;
4688 + rshash_state.state = RSHASH_PRE_STILL;
4689 + } else {
4690 + inc_hash_strength(hash_strength_delta);
4691 + inc_hash_strength_delta();
4694 + break;
4696 + case RSHASH_NEW:
4697 + case RSHASH_PRE_STILL:
4698 + rshash_state.stable_benefit = get_current_benefit();
4699 + rshash_state.state = RSHASH_STILL;
4700 + hash_strength_delta = 0;
4701 + break;
4702 + default:
4703 + BUG();
4706 - while (node) {
4707 - struct page *tree_page;
4708 - int ret;
4709 + /* rshash_neg = rshash_pos = 0; */
4710 + reset_benefit();
4712 - cond_resched();
4713 - stable_node = rb_entry(node, struct stable_node, node);
4714 - tree_page = get_ksm_page(stable_node);
4715 - if (!tree_page)
4716 - return NULL;
4717 + if (prev_hash_strength != hash_strength)
4718 + stable_tree_delta_hash(prev_hash_strength);
4721 - ret = memcmp_pages(page, tree_page);
4722 +static void free_vma_dup_tree(struct vma_slot *slot)
4724 + struct vma_slot *tmp_slot;
4725 + int i;
4727 - if (ret < 0) {
4728 - put_page(tree_page);
4729 - node = node->rb_left;
4730 - } else if (ret > 0) {
4731 - put_page(tree_page);
4732 - node = node->rb_right;
4733 - } else
4734 - return tree_page;
4735 + /* step 1: free entries in smaller vmas' dup tree */
4736 + for (i = 0; i < slot->ksm_index; i++) {
4737 + tmp_slot = radix_tree_lookup(&ksm_vma_tree, i);
4738 + if (tmp_slot)
4739 + radix_tree_delete(&tmp_slot->dup_tree, slot->ksm_index);
4742 - return NULL;
4743 + /* step 2: free my own dup tree */
4744 + for (i = slot->ksm_index; i < ksm_vma_tree_index_end; i++)
4745 + radix_tree_delete(&slot->dup_tree, i);
4747 + BUG_ON(slot->dup_tree.rnode);
4751 - * stable_tree_insert - insert rmap_item pointing to new ksm page
4752 - * into the stable tree.
4754 - * This function returns the stable tree node just allocated on success,
4755 - * NULL otherwise.
4756 +/**
4757 + * round_update_ladder() - The main function that updates all the
4758 + * adjustments whenever a scan round is finished.
4760 -static struct stable_node *stable_tree_insert(struct page *kpage)
4761 +static void round_update_ladder(void)
4763 - struct rb_node **new = &root_stable_tree.rb_node;
4764 - struct rb_node *parent = NULL;
4765 - struct stable_node *stable_node;
4766 + int i;
4767 + struct vma_slot *slot, *tmp_slot;
4768 + unsigned long dedup_ratio_max = 0, dedup_ratio_mean = 0;
4769 + unsigned long threshold;
4771 + for (i = 0; i < ksm_vma_tree_index_end; i++) {
4772 + slot = radix_tree_lookup(&ksm_vma_tree, i);
4774 + if (slot) {
4775 + slot->dedup_ratio = cal_dedup_ratio(slot);
4776 + if (dedup_ratio_max < slot->dedup_ratio)
4777 + dedup_ratio_max = slot->dedup_ratio;
4778 + dedup_ratio_mean += slot->dedup_ratio;
4782 - while (*new) {
4783 - struct page *tree_page;
4784 - int ret;
4785 + dedup_ratio_mean /= ksm_vma_slot_num;
4786 + threshold = dedup_ratio_mean;
4788 - cond_resched();
4789 - stable_node = rb_entry(*new, struct stable_node, node);
4790 - tree_page = get_ksm_page(stable_node);
4791 - if (!tree_page)
4792 - return NULL;
4793 + for (i = 0; i < ksm_vma_tree_index_end; i++) {
4794 + slot = radix_tree_lookup(&ksm_vma_tree, i);
4796 - ret = memcmp_pages(kpage, tree_page);
4797 - put_page(tree_page);
4798 + if (slot) {
4799 + if (slot->dedup_ratio &&
4800 + slot->dedup_ratio >= threshold) {
4801 + vma_rung_up(slot);
4802 + } else {
4803 + vma_rung_down(slot);
4806 - parent = *new;
4807 - if (ret < 0)
4808 - new = &parent->rb_left;
4809 - else if (ret > 0)
4810 - new = &parent->rb_right;
4811 - else {
4812 + free_vma_dup_tree(slot);
4813 + radix_tree_delete(&ksm_vma_tree, i);
4814 + ksm_vma_tree_num--;
4815 + slot->ksm_index = -1;
4816 + slot->slot_scanned = 0;
4817 + slot->dedup_ratio = 0;
4818 + slot->dedup_num = 0;
4822 + for (i = 0; i < ksm_scan_ladder_size; i++) {
4823 + list_for_each_entry_safe(slot, tmp_slot,
4824 + &ksm_scan_ladder[i].vma_list,
4825 + ksm_list) {
4827 - * It is not a bug that stable_tree_search() didn't
4828 - * find this node: because at that time our page was
4829 - * not yet write-protected, so may have changed since.
4830 + * The slots were scanned but are not in inter_tab, so their
4831 + * dedup must be 0.
4833 - return NULL;
4834 + if (slot->slot_scanned) {
4835 + BUG_ON(slot->dedup_ratio != 0);
4836 + vma_rung_down(slot);
4839 + slot->dedup_ratio = 0;
4843 - stable_node = alloc_stable_node();
4844 - if (!stable_node)
4845 - return NULL;
4846 + BUG_ON(ksm_vma_tree_num != 0);
4847 + ksm_vma_tree_index_end = 0;
4849 - rb_link_node(&stable_node->node, parent, new);
4850 - rb_insert_color(&stable_node->node, &root_stable_tree);
4851 + for (i = 0; i < ksm_scan_ladder_size; i++) {
4852 + ksm_scan_ladder[i].round_finished = 0;
4853 + ksm_scan_ladder[i].busy_searched = 0;
4855 + list_for_each_entry(slot, &ksm_scan_ladder[i].vma_list,
4856 + ksm_list) {
4857 + slot->last_scanned = slot->pages_scanned;
4858 + slot->slot_scanned = 0;
4859 + slot->pages_cowed = 0;
4860 + slot->pages_merged = 0;
4861 + if (slot->fully_scanned) {
4862 + slot->fully_scanned = 0;
4863 + ksm_scan_ladder[i].fully_scanned_slots--;
4865 + BUG_ON(slot->ksm_index != -1);
4868 - INIT_HLIST_HEAD(&stable_node->hlist);
4869 + BUG_ON(ksm_scan_ladder[i].fully_scanned_slots);
4872 - stable_node->kpfn = page_to_pfn(kpage);
4873 - set_page_stable_node(kpage, stable_node);
4874 + rshash_adjust();
4876 - return stable_node;
4877 + //ksm_pages_scanned_last = ksm_pages_scanned;
4881 - * unstable_tree_search_insert - search for identical page,
4882 - * else insert rmap_item into the unstable tree.
4884 - * This function searches for a page in the unstable tree identical to the
4885 - * page currently being scanned; and if no identical page is found in the
4886 - * tree, we insert rmap_item as a new object into the unstable tree.
4888 - * This function returns pointer to rmap_item found to be identical
4889 - * to the currently scanned page, NULL otherwise.
4891 - * This function does both searching and inserting, because they share
4892 - * the same walking algorithm in an rbtree.
4893 - */
4894 -static
4895 -struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
4896 - struct page *page,
4897 - struct page **tree_pagep)
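+/*
+ * Convert a batch size expressed in millionths of total RAM into a number of
+ * pages to scan.
+ */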
4898 +static inline unsigned int ksm_pages_to_scan(unsigned int batch_pages)
4900 + return totalram_pages * batch_pages / 1000000;
4903 +static inline void cal_ladder_pages_to_scan(unsigned int num)
4905 - struct rb_node **new = &root_unstable_tree.rb_node;
4906 - struct rb_node *parent = NULL;
4907 + int i;
4909 - while (*new) {
4910 - struct rmap_item *tree_rmap_item;
4911 - struct page *tree_page;
4912 - int ret;
4913 + for (i = 0; i < ksm_scan_ladder_size; i++) {
4914 + ksm_scan_ladder[i].pages_to_scan = num
4915 + * ksm_scan_ladder[i].scan_ratio / KSM_SCAN_RATIO_MAX;
4917 + ksm_scan_ladder[0].pages_to_scan /= 16;
4918 + ksm_scan_ladder[1].pages_to_scan /= 4;
4921 - cond_resched();
4922 - tree_rmap_item = rb_entry(*new, struct rmap_item, node);
4923 - tree_page = get_mergeable_page(tree_rmap_item);
4924 - if (IS_ERR_OR_NULL(tree_page))
4925 - return NULL;
4926 +static inline void ksm_del_vma_slot(struct vma_slot *slot)
4928 + int i, j;
4929 + struct rmap_list_entry *entry;
4930 + struct vma_slot *tmp;
4932 - /*
4933 - * Don't substitute a ksm page for a forked page.
4934 - */
4935 - if (page == tree_page) {
4936 - put_page(tree_page);
4937 - return NULL;
4939 + /* mutex lock contention may be intensive; is there a better approach? */
4940 + BUG_ON(list_empty(&slot->ksm_list) || !slot->rung);
4942 - ret = memcmp_pages(page, tree_page);
4943 + if (slot->rung->current_scan == &slot->ksm_list)
4944 + slot->rung->current_scan = slot->rung->current_scan->next;
4946 - parent = *new;
4947 - if (ret < 0) {
4948 - put_page(tree_page);
4949 - new = &parent->rb_left;
4950 - } else if (ret > 0) {
4951 - put_page(tree_page);
4952 - new = &parent->rb_right;
4953 - } else {
4954 - *tree_pagep = tree_page;
4955 - return tree_rmap_item;
4957 + list_del_init(&slot->ksm_list);
4958 + slot->rung->vma_num--;
4959 + if (slot->fully_scanned)
4960 + slot->rung->fully_scanned_slots--;
4962 + if (slot->rung->current_scan == &slot->rung->vma_list) {
4963 + /* This rung finishes a round */
4964 + slot->rung->round_finished = 1;
4965 + slot->rung->current_scan = slot->rung->vma_list.next;
4966 + BUG_ON(slot->rung->current_scan == &slot->rung->vma_list
4967 + && !list_empty(&slot->rung->vma_list));
4970 - rmap_item->address |= UNSTABLE_FLAG;
4971 - rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
4972 - rb_link_node(&rmap_item->node, parent, new);
4973 - rb_insert_color(&rmap_item->node, &root_unstable_tree);
4974 + if (slot->ksm_index == -1)
4975 + goto skip;
4977 - ksm_pages_unshared++;
4978 - return NULL;
4979 + tmp = radix_tree_delete(&ksm_vma_tree, slot->ksm_index);
4980 + BUG_ON(!tmp || tmp != slot);
4981 + free_vma_dup_tree(slot);
4982 + ksm_vma_tree_num--;
4983 + if (slot->ksm_index == ksm_vma_tree_index_end - 1)
4984 + ksm_vma_tree_index_end--;
4986 +skip:
4987 + if (!slot->rmap_list_pool)
4988 + goto out;
4990 + for (i = 0; i < slot->pool_size; i++) {
4991 + void *addr;
4993 + if (!slot->rmap_list_pool[i])
4994 + continue;
4996 + addr = kmap(slot->rmap_list_pool[i]);
4997 + BUG_ON(!addr);
4998 + for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
4999 + entry = (struct rmap_list_entry *)addr + j;
5000 + if (is_addr(entry->addr))
5001 + continue;
5002 + if (!entry->item)
5003 + continue;
5005 + remove_rmap_item_from_tree(entry->item);
5006 + free_rmap_item(entry->item);
5007 + slot->pool_counts[i]--;
5009 + BUG_ON(slot->pool_counts[i]);
5010 + kunmap(slot->rmap_list_pool[i]);
5011 + __free_page(slot->rmap_list_pool[i]);
5013 + kfree(slot->rmap_list_pool);
5014 + kfree(slot->pool_counts);
5016 +out:
5017 + slot->rung = NULL;
5018 + free_vma_slot(slot);
5019 + BUG_ON(!ksm_vma_slot_num);
5020 + ksm_vma_slot_num--;
5024 - * stable_tree_append - add another rmap_item to the linked list of
5025 - * rmap_items hanging off a given node of the stable tree, all sharing
5026 - * the same ksm page.
5027 - */
5028 -static void stable_tree_append(struct rmap_item *rmap_item,
5029 - struct stable_node *stable_node)
5031 +static inline void cleanup_vma_slots(void)
5033 - rmap_item->head = stable_node;
5034 - rmap_item->address |= STABLE_FLAG;
5035 - hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
5036 + struct vma_slot *slot;
5038 - if (rmap_item->hlist.next)
5039 - ksm_pages_sharing++;
5040 - else
5041 - ksm_pages_shared++;
5042 + spin_lock(&vma_slot_list_lock);
5043 + while (!list_empty(&vma_slot_del)) {
5044 + slot = list_entry(vma_slot_del.next,
5045 + struct vma_slot, slot_list);
5046 + list_del(&slot->slot_list);
5047 + spin_unlock(&vma_slot_list_lock);
5048 + ksm_del_vma_slot(slot);
5049 + spin_lock(&vma_slot_list_lock);
5051 + spin_unlock(&vma_slot_list_lock);
5055 - * cmp_and_merge_page - first see if page can be merged into the stable tree;
5056 - * if not, compare checksum to previous and if it's the same, see if page can
5057 - * be inserted into the unstable tree, or merged with a page already there and
5058 - * both transferred to the stable tree.
5060 - * @page: the page that we are searching identical page to.
5061 - * @rmap_item: the reverse mapping into the virtual address of this page
5062 +static inline int rung_fully_scanned(struct scan_rung *rung)
5064 + return (rung->fully_scanned_slots == rung->vma_num &&
5065 + rung->fully_scanned_slots);
5068 +/**
5069 + * ksm_do_scan() - the main worker function.
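+ *
+ * Walks the scan ladder from the highest rung down; each rung scans up to its
+ * pages_to_scan quota from its list of vma_slots, and quota left unused by a
+ * fully scanned rung is propagated to the rungs below it.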
5071 -static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
5072 +static void ksm_do_scan(void)
5074 - struct rmap_item *tree_rmap_item;
5075 - struct page *tree_page = NULL;
5076 - struct stable_node *stable_node;
5077 - struct page *kpage;
5078 - unsigned int checksum;
5079 - int err;
5080 + struct vma_slot *slot, *iter;
5081 + struct list_head *next_scan, *iter_head;
5082 + struct mm_struct *busy_mm;
5083 + unsigned char round_finished, all_rungs_emtpy;
5084 + int i, err;
5085 + unsigned long rest_pages;
5087 + might_sleep();
5089 + rest_pages = 0;
5090 +repeat_all:
5091 + for (i = ksm_scan_ladder_size - 1; i >= 0; i--) {
5092 + struct scan_rung *rung = &ksm_scan_ladder[i];
5094 - remove_rmap_item_from_tree(rmap_item);
5095 + if (!rung->pages_to_scan)
5096 + continue;
5098 - /* We first start with searching the page inside the stable tree */
5099 - kpage = stable_tree_search(page);
5100 - if (kpage) {
5101 - err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
5102 - if (!err) {
5103 - /*
5104 - * The page was successfully merged:
5105 - * add its rmap_item to the stable tree.
5106 - */
5107 - lock_page(kpage);
5108 - stable_tree_append(rmap_item, page_stable_node(kpage));
5109 - unlock_page(kpage);
5110 + if (list_empty(&rung->vma_list)) {
5111 + rung->pages_to_scan = 0;
5112 + continue;
5114 - put_page(kpage);
5115 - return;
5118 - /*
5119 - * If the hash value of the page has changed from the last time
5120 - * we calculated it, this page is changing frequently: therefore we
5121 - * don't want to insert it in the unstable tree, and we don't want
5122 - * to waste our time searching for something identical to it there.
5123 - */
5124 - checksum = calc_checksum(page);
5125 - if (rmap_item->oldchecksum != checksum) {
5126 - rmap_item->oldchecksum = checksum;
5127 - return;
5130 - tree_rmap_item =
5131 - unstable_tree_search_insert(rmap_item, page, &tree_page);
5132 - if (tree_rmap_item) {
5133 - kpage = try_to_merge_two_pages(rmap_item, page,
5134 - tree_rmap_item, tree_page);
5135 - put_page(tree_page);
5137 - * As soon as we merge this page, we want to remove the
5138 - * rmap_item of the page we have merged with from the unstable
5139 - * tree, and insert it instead as new node in the stable tree.
5140 + * if a higher rung is fully scanned, its remaining pages should be
5141 + * propagated to the lower rungs. This can prevent the higher
5142 + * rung from waiting a long time while it still has its
5143 + * pages_to_scan quota.
5146 - if (kpage) {
5147 - remove_rmap_item_from_tree(tree_rmap_item);
5148 + if (rung_fully_scanned(rung)) {
5149 + rest_pages += rung->pages_to_scan;
5150 + rung->pages_to_scan = 0;
5151 + continue;
5154 - lock_page(kpage);
5155 - stable_node = stable_tree_insert(kpage);
5156 - if (stable_node) {
5157 - stable_tree_append(tree_rmap_item, stable_node);
5158 - stable_tree_append(rmap_item, stable_node);
5159 + rung->pages_to_scan += rest_pages;
5160 + rest_pages = 0;
5161 + while (rung->pages_to_scan && likely(!freezing(current))) {
5162 +cleanup:
5163 + cleanup_vma_slots();
5165 + if (list_empty(&rung->vma_list))
5166 + break;
5168 +rescan:
5169 + BUG_ON(rung->current_scan == &rung->vma_list &&
5170 + !list_empty(&rung->vma_list));
5172 + slot = list_entry(rung->current_scan,
5173 + struct vma_slot, ksm_list);
5176 + if (slot->fully_scanned)
5177 + goto next_scan;
5179 + err = try_down_read_slot_mmap_sem(slot);
5180 + if (err == -ENOENT)
5181 + goto cleanup;
5183 + busy_mm = slot->mm;
5185 +busy:
5186 + if (err == -EBUSY) {
5187 + /* skip other vmas on the same mm */
5188 + rung->busy_searched = 1;
5189 + iter = slot;
5190 + iter_head = slot->ksm_list.next;
5192 + while (iter_head != &rung->vma_list) {
5193 + iter = list_entry(iter_head,
5194 + struct vma_slot,
5195 + ksm_list);
5196 + if (iter->vma->vm_mm != busy_mm)
5197 + break;
5198 + iter_head = iter_head->next;
5201 + if (iter->vma->vm_mm != busy_mm) {
5202 + rung->current_scan = &iter->ksm_list;
5203 + goto rescan;
5204 + } else {
5205 + /* at the end, but still busy */
5206 + rung->current_scan = iter->ksm_list.next;
5207 + goto next_scan;
5208 + break;
5211 - unlock_page(kpage);
5213 - /*
5214 - * If we fail to insert the page into the stable tree,
5215 - * we will have 2 virtual addresses that are pointing
5216 - * to a ksm page left outside the stable tree,
5217 - * in which case we need to break_cow on both.
5218 - */
5219 - if (!stable_node) {
5220 - break_cow(tree_rmap_item);
5221 - break_cow(rmap_item);
5222 + BUG_ON(!vma_can_enter(slot->vma));
5223 + if (ksm_test_exit(slot->vma->vm_mm)) {
5224 + busy_mm = slot->vma->vm_mm;
5225 + up_read(&slot->vma->vm_mm->mmap_sem);
5226 + err = -EBUSY;
5227 + goto busy;
5230 + if (rung->busy_searched)
5231 + rung->busy_searched = 0;
5232 + /* OK, we have taken the mmap_sem, ready to scan */
5233 + scan_vma_one_page(slot);
5234 + up_read(&slot->vma->vm_mm->mmap_sem);
5235 + rung->pages_to_scan--;
5237 + if ((slot->pages_scanned &&
5238 + slot->pages_scanned % slot->pages_to_scan == 0)
5239 + || slot->fully_scanned) {
5240 +next_scan:
5241 + next_scan = rung->current_scan->next;
5242 + if (next_scan == &rung->vma_list) {
5243 + /*
5244 + * All the slots in this rung
5245 + * have been traversed in this
5246 + * round.
5247 + */
5248 + rung->round_finished = 1;
5249 + rung->current_scan =
5250 + rung->vma_list.next;
5251 + if (rung_fully_scanned(rung) ||
5252 + rung->busy_searched) {
5253 + /*
5254 + * All the pages in all slots
5255 + * have been scanned. Or we
5256 + * did not make any progress
5257 + * because of a busy mm.
5258 + */
5259 + rest_pages +=
5260 + rung->pages_to_scan;
5261 + rung->pages_to_scan = 0;
5262 + break;
5264 + } else {
5265 + rung->current_scan = next_scan;
5269 + cond_resched();
5272 + if (freezing(current))
5273 + break;
5277 -static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
5278 - struct rmap_item **rmap_list,
5279 - unsigned long addr)
5281 - struct rmap_item *rmap_item;
5282 + if (freezing(current))
5283 + return;
5285 - while (*rmap_list) {
5286 - rmap_item = *rmap_list;
5287 - if ((rmap_item->address & PAGE_MASK) == addr)
5288 - return rmap_item;
5289 - if (rmap_item->address > addr)
5290 + round_finished = 1;
5291 + all_rungs_emtpy = 1;
5292 + for (i = 0; i < ksm_scan_ladder_size; i++) {
5293 + struct scan_rung *rung = &ksm_scan_ladder[i];
5295 + if (!list_empty(&rung->vma_list)) {
5296 + all_rungs_emtpy = 0;
5297 + if (!rung->round_finished)
5298 + round_finished = 0;
5299 break;
5300 - *rmap_list = rmap_item->rmap_list;
5301 - remove_rmap_item_from_tree(rmap_item);
5302 - free_rmap_item(rmap_item);
5306 - rmap_item = alloc_rmap_item();
5307 - if (rmap_item) {
5308 - /* It has already been zeroed */
5309 - rmap_item->mm = mm_slot->mm;
5310 - rmap_item->address = addr;
5311 - rmap_item->rmap_list = *rmap_list;
5312 - *rmap_list = rmap_item;
5314 - return rmap_item;
5316 + if (all_rungs_emtpy)
5317 + round_finished = 0;
5319 -static struct rmap_item *scan_get_next_rmap_item(struct page **page)
5321 - struct mm_struct *mm;
5322 - struct mm_slot *slot;
5323 - struct vm_area_struct *vma;
5324 - struct rmap_item *rmap_item;
5325 + cleanup_vma_slots();
5327 - if (list_empty(&ksm_mm_head.mm_list))
5328 - return NULL;
5329 + if (round_finished) {
5330 + round_update_ladder();
5332 - slot = ksm_scan.mm_slot;
5333 - if (slot == &ksm_mm_head) {
5335 * A number of pages can hang around indefinitely on per-cpu
5336 * pagevecs, raised page count preventing write_protect_page
5337 @@ -1308,266 +4161,160 @@
5339 lru_add_drain_all();
5341 + /* sync with ksm_remove_vma for rb_erase */
5342 + ksm_scan_round++;
5343 root_unstable_tree = RB_ROOT;
5345 - spin_lock(&ksm_mmlist_lock);
5346 - slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
5347 - ksm_scan.mm_slot = slot;
5348 - spin_unlock(&ksm_mmlist_lock);
5349 -next_mm:
5350 - ksm_scan.address = 0;
5351 - ksm_scan.rmap_list = &slot->rmap_list;
5354 - mm = slot->mm;
5355 - down_read(&mm->mmap_sem);
5356 - if (ksm_test_exit(mm))
5357 - vma = NULL;
5358 - else
5359 - vma = find_vma(mm, ksm_scan.address);
5361 - for (; vma; vma = vma->vm_next) {
5362 - if (!(vma->vm_flags & VM_MERGEABLE))
5363 - continue;
5364 - if (ksm_scan.address < vma->vm_start)
5365 - ksm_scan.address = vma->vm_start;
5366 - if (!vma->anon_vma)
5367 - ksm_scan.address = vma->vm_end;
5369 - while (ksm_scan.address < vma->vm_end) {
5370 - if (ksm_test_exit(mm))
5371 - break;
5372 - *page = follow_page(vma, ksm_scan.address, FOLL_GET);
5373 - if (IS_ERR_OR_NULL(*page)) {
5374 - ksm_scan.address += PAGE_SIZE;
5375 - cond_resched();
5376 - continue;
5378 - if (PageAnon(*page) ||
5379 - page_trans_compound_anon(*page)) {
5380 - flush_anon_page(vma, *page, ksm_scan.address);
5381 - flush_dcache_page(*page);
5382 - rmap_item = get_next_rmap_item(slot,
5383 - ksm_scan.rmap_list, ksm_scan.address);
5384 - if (rmap_item) {
5385 - ksm_scan.rmap_list =
5386 - &rmap_item->rmap_list;
5387 - ksm_scan.address += PAGE_SIZE;
5388 - } else
5389 - put_page(*page);
5390 - up_read(&mm->mmap_sem);
5391 - return rmap_item;
5393 - put_page(*page);
5394 - ksm_scan.address += PAGE_SIZE;
5395 - cond_resched();
5399 - if (ksm_test_exit(mm)) {
5400 - ksm_scan.address = 0;
5401 - ksm_scan.rmap_list = &slot->rmap_list;
5403 - /*
5404 - * Nuke all the rmap_items that are above this current rmap:
5405 - * because there were no VM_MERGEABLE vmas with such addresses.
5406 - */
5407 - remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
5409 - spin_lock(&ksm_mmlist_lock);
5410 - ksm_scan.mm_slot = list_entry(slot->mm_list.next,
5411 - struct mm_slot, mm_list);
5412 - if (ksm_scan.address == 0) {
5413 - /*
5414 - * We've completed a full scan of all vmas, holding mmap_sem
5415 - * throughout, and found no VM_MERGEABLE: so do the same as
5416 - * __ksm_exit does to remove this mm from all our lists now.
5417 - * This applies either when cleaning up after __ksm_exit
5418 - * (but beware: we can reach here even before __ksm_exit),
5419 - * or when all VM_MERGEABLE areas have been unmapped (and
5420 - * mmap_sem then protects against race with MADV_MERGEABLE).
5421 - */
5422 - hlist_del(&slot->link);
5423 - list_del(&slot->mm_list);
5424 - spin_unlock(&ksm_mmlist_lock);
5426 - free_mm_slot(slot);
5427 - clear_bit(MMF_VM_MERGEABLE, &mm->flags);
5428 - up_read(&mm->mmap_sem);
5429 - mmdrop(mm);
5430 - } else {
5431 - spin_unlock(&ksm_mmlist_lock);
5432 - up_read(&mm->mmap_sem);
5433 + free_all_tree_nodes(&unstable_tree_node_list);
5436 - /* Repeat until we've completed scanning the whole list */
5437 - slot = ksm_scan.mm_slot;
5438 - if (slot != &ksm_mm_head)
5439 - goto next_mm;
5441 - ksm_scan.seqnr++;
5442 - return NULL;
5445 -/**
5446 - * ksm_do_scan - the ksm scanner main worker function.
5447 - * @scan_npages - number of pages we want to scan before we return.
5448 - */
5449 -static void ksm_do_scan(unsigned int scan_npages)
5451 - struct rmap_item *rmap_item;
5452 - struct page *uninitialized_var(page);
5453 + for (i = 0; i < ksm_scan_ladder_size; i++) {
5454 + struct scan_rung *rung = &ksm_scan_ladder[i];
5456 - while (scan_npages-- && likely(!freezing(current))) {
5457 - cond_resched();
5458 - rmap_item = scan_get_next_rmap_item(&page);
5459 - if (!rmap_item)
5460 - return;
5461 - if (!PageKsm(page) || !in_stable_tree(rmap_item))
5462 - cmp_and_merge_page(page, rmap_item);
5463 - put_page(page);
5464 + /*
5465 + * Before we can go to sleep, we should make sure that all the
5466 + * pages_to_scan quota for this scan has been consumed.
5467 + */
5468 + if (!list_empty(&rung->vma_list) && rung->pages_to_scan)
5469 + goto repeat_all;
5472 + cal_ladder_pages_to_scan(ksm_scan_batch_pages);
5475 static int ksmd_should_run(void)
5477 - return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
5478 + return ksm_run & KSM_RUN_MERGE;
5481 -static int ksm_scan_thread(void *nothing)
5483 - set_freezable();
5484 - set_user_nice(current, 5);
5486 - while (!kthread_should_stop()) {
5487 - mutex_lock(&ksm_thread_mutex);
5488 - if (ksmd_should_run())
5489 - ksm_do_scan(ksm_thread_pages_to_scan);
5490 - mutex_unlock(&ksm_thread_mutex);
5492 - try_to_freeze();
5493 +#define __round_mask(x, y) ((__typeof__(x))((y)-1))
5494 +#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
5496 - if (ksmd_should_run()) {
5497 - schedule_timeout_interruptible(
5498 - msecs_to_jiffies(ksm_thread_sleep_millisecs));
5499 - } else {
5500 - wait_event_freezable(ksm_thread_wait,
5501 - ksmd_should_run() || kthread_should_stop());
5504 - return 0;
5505 +static inline unsigned long vma_pool_size(struct vm_area_struct *vma)
5507 + return round_up(sizeof(struct rmap_list_entry) * vma_pages(vma),
5508 + PAGE_SIZE) >> PAGE_SHIFT;
5511 -int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
5512 - unsigned long end, int advice, unsigned long *vm_flags)
5513 +/**
5517 + * @param slot
5519 + * @return int, 1 on success, 0 on failure
5520 + */
5521 +static int ksm_vma_enter(struct vma_slot *slot)
5523 - struct mm_struct *mm = vma->vm_mm;
5524 - int err;
5526 - switch (advice) {
5527 - case MADV_MERGEABLE:
5528 - /*
5529 - * Be somewhat over-protective for now!
5530 - */
5531 - if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
5532 - VM_PFNMAP | VM_IO | VM_DONTEXPAND |
5533 - VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
5534 - VM_NONLINEAR | VM_MIXEDMAP | VM_SAO))
5535 - return 0; /* just ignore the advice */
5537 - if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
5538 - err = __ksm_enter(mm);
5539 - if (err)
5540 - return err;
5543 - *vm_flags |= VM_MERGEABLE;
5544 - break;
5545 + struct scan_rung *rung;
5546 + unsigned long pages_to_scan, pool_size;
5548 - case MADV_UNMERGEABLE:
5549 - if (!(*vm_flags & VM_MERGEABLE))
5550 - return 0; /* just ignore the advice */
5551 + BUG_ON(slot->pages != vma_pages(slot->vma));
5552 + rung = &ksm_scan_ladder[0];
5554 - if (vma->anon_vma) {
5555 - err = unmerge_ksm_pages(vma, start, end);
5556 - if (err)
5557 - return err;
5558 + pages_to_scan = get_vma_random_scan_num(slot, rung->scan_ratio);
5559 + if (pages_to_scan) {
5560 + if (list_empty(&rung->vma_list))
5561 + rung->current_scan = &slot->ksm_list;
5562 + BUG_ON(!list_empty(&slot->ksm_list));
5564 + list_add(&slot->ksm_list, &rung->vma_list);
5565 + slot->rung = rung;
5566 + slot->pages_to_scan = pages_to_scan;
5567 + slot->rung->vma_num++;
5568 + BUG_ON(PAGE_SIZE % sizeof(struct rmap_list_entry) != 0);
5570 + pool_size = vma_pool_size(slot->vma);
5572 + slot->rmap_list_pool = kzalloc(sizeof(struct page *) *
5573 + pool_size, GFP_NOWAIT);
5574 + slot->pool_counts = kzalloc(sizeof(unsigned long) * pool_size,
5575 + GFP_NOWAIT);
5576 + slot->pool_size = pool_size;
5577 + if (!slot->rmap_list_pool)
5578 + goto failed;
5580 + if (!slot->pool_counts) {
5581 + kfree(slot->rmap_list_pool);
5582 + goto failed;
5585 - *vm_flags &= ~VM_MERGEABLE;
5586 - break;
5587 + BUG_ON(rung->current_scan == &rung->vma_list &&
5588 + !list_empty(&rung->vma_list));
5590 + ksm_vma_slot_num++;
5591 + BUG_ON(!ksm_vma_slot_num);
5592 + return 1;
5595 +failed:
5596 return 0;
5599 -int __ksm_enter(struct mm_struct *mm)
5601 - struct mm_slot *mm_slot;
5602 - int needs_wakeup;
5604 - mm_slot = alloc_mm_slot();
5605 - if (!mm_slot)
5606 - return -ENOMEM;
5608 - /* Check ksm_run too? Would need tighter locking */
5609 - needs_wakeup = list_empty(&ksm_mm_head.mm_list);
5611 - spin_lock(&ksm_mmlist_lock);
5612 - insert_to_mm_slots_hash(mm, mm_slot);
5613 - /*
5614 - * Insert just behind the scanning cursor, to let the area settle
5615 - * down a little; when fork is followed by immediate exec, we don't
5616 - * want ksmd to waste time setting up and tearing down an rmap_list.
5617 - */
5618 - list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
5619 - spin_unlock(&ksm_mmlist_lock);
5620 +static void ksm_enter_all_slots(void)
5622 + struct vma_slot *slot;
5623 + int added;
5625 - set_bit(MMF_VM_MERGEABLE, &mm->flags);
5626 - atomic_inc(&mm->mm_count);
5627 + spin_lock(&vma_slot_list_lock);
5628 + while (!list_empty(&vma_slot_new)) {
5629 + slot = list_entry(vma_slot_new.next,
5630 + struct vma_slot, slot_list);
5631 + /**
5632 + * slots are sorted by ctime_j; if one is found to be too
5633 + * young, just stop scanning the rest.
5634 + */
5635 + /*
5637 - if (needs_wakeup)
5638 - wake_up_interruptible(&ksm_thread_wait);
5639 + if (time_before(jiffies, slot->ctime_j +
5640 + msecs_to_jiffies(1000))) {
5641 + spin_unlock(&vma_slot_list_lock);
5642 + return;
5644 + */
5646 - return 0;
5647 + list_del_init(&slot->slot_list);
5648 + added = 0;
5649 + if (vma_can_enter(slot->vma))
5650 + added = ksm_vma_enter(slot);
5652 + if (!added) {
5653 + /* Put back to new list to be del by its creator */
5654 + slot->ctime_j = jiffies;
5655 + list_del(&slot->slot_list);
5656 + list_add_tail(&slot->slot_list, &vma_slot_noadd);
5658 + spin_unlock(&vma_slot_list_lock);
5659 + cond_resched();
5660 + spin_lock(&vma_slot_list_lock);
5662 + spin_unlock(&vma_slot_list_lock);
5665 -void __ksm_exit(struct mm_struct *mm)
5666 +static int ksm_scan_thread(void *nothing)
5668 - struct mm_slot *mm_slot;
5669 - int easy_to_free = 0;
5670 + set_freezable();
5671 + set_user_nice(current, 5);
5673 - /*
5674 - * This process is exiting: if it's straightforward (as is the
5675 - * case when ksmd was never running), free mm_slot immediately.
5676 - * But if it's at the cursor or has rmap_items linked to it, use
5677 - * mmap_sem to synchronize with any break_cows before pagetables
5678 - * are freed, and leave the mm_slot on the list for ksmd to free.
5679 - * Beware: ksm may already have noticed it exiting and freed the slot.
5680 - */
5681 + while (!kthread_should_stop()) {
5682 + mutex_lock(&ksm_thread_mutex);
5683 + if (ksmd_should_run()) {
5684 + ksm_enter_all_slots();
5685 + ksm_do_scan();
5687 + mutex_unlock(&ksm_thread_mutex);
5689 + try_to_freeze();
5691 - spin_lock(&ksm_mmlist_lock);
5692 - mm_slot = get_mm_slot(mm);
5693 - if (mm_slot && ksm_scan.mm_slot != mm_slot) {
5694 - if (!mm_slot->rmap_list) {
5695 - hlist_del(&mm_slot->link);
5696 - list_del(&mm_slot->mm_list);
5697 - easy_to_free = 1;
5698 + if (ksmd_should_run()) {
5699 + schedule_timeout_interruptible(ksm_sleep_jiffies);
5700 + ksm_sleep_times++;
5701 } else {
5702 - list_move(&mm_slot->mm_list,
5703 - &ksm_scan.mm_slot->mm_list);
5704 + wait_event_freezable(ksm_thread_wait,
5705 + ksmd_should_run() || kthread_should_stop());
5708 - spin_unlock(&ksm_mmlist_lock);
5710 - if (easy_to_free) {
5711 - free_mm_slot(mm_slot);
5712 - clear_bit(MMF_VM_MERGEABLE, &mm->flags);
5713 - mmdrop(mm);
5714 - } else if (mm_slot) {
5715 - down_write(&mm->mmap_sem);
5716 - up_write(&mm->mmap_sem);
5718 + return 0;
5721 struct page *ksm_does_need_to_copy(struct page *page,
5722 @@ -1597,11 +4344,13 @@
5723 unsigned long *vm_flags)
5725 struct stable_node *stable_node;
5726 + struct node_vma *node_vma;
5727 struct rmap_item *rmap_item;
5728 - struct hlist_node *hlist;
5729 + struct hlist_node *hlist, *rmap_hlist;
5730 unsigned int mapcount = page_mapcount(page);
5731 int referenced = 0;
5732 int search_new_forks = 0;
5733 + unsigned long address;
5735 VM_BUG_ON(!PageKsm(page));
5736 VM_BUG_ON(!PageLocked(page));
5737 @@ -1609,38 +4358,51 @@
5738 stable_node = page_stable_node(page);
5739 if (!stable_node)
5740 return 0;
5741 -again:
5742 - hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
5743 - struct anon_vma *anon_vma = rmap_item->anon_vma;
5744 - struct anon_vma_chain *vmac;
5745 - struct vm_area_struct *vma;
5747 - anon_vma_lock(anon_vma);
5748 - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
5749 - vma = vmac->vma;
5750 - if (rmap_item->address < vma->vm_start ||
5751 - rmap_item->address >= vma->vm_end)
5752 - continue;
5753 - /*
5754 - * Initially we examine only the vma which covers this
5755 - * rmap_item; but later, if there is still work to do,
5756 - * we examine covering vmas in other mms: in case they
5757 - * were forked from the original since ksmd passed.
5758 - */
5759 - if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
5760 - continue;
5762 - if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
5763 - continue;
5765 - referenced += page_referenced_one(page, vma,
5766 - rmap_item->address, &mapcount, vm_flags);
5767 - if (!search_new_forks || !mapcount)
5768 - break;
5769 +again:
5770 + hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
5771 + hlist_for_each_entry(rmap_item, rmap_hlist,
5772 + &node_vma->rmap_hlist, hlist) {
5773 + struct anon_vma *anon_vma = rmap_item->anon_vma;
5774 + struct anon_vma_chain *vmac;
5775 + struct vm_area_struct *vma;
5777 + anon_vma_lock(anon_vma);
5778 + list_for_each_entry(vmac, &anon_vma->head,
5779 + same_anon_vma) {
5780 + vma = vmac->vma;
5781 + address = get_rmap_addr(rmap_item);
5783 + if (address < vma->vm_start ||
5784 + address >= vma->vm_end)
5785 + continue;
5786 + /*
5787 + * Initially we examine only the vma which
5788 + * covers this rmap_item; but later, if there
5789 + * is still work to do, we examine covering
5790 + * vmas in other mms: in case they were forked
5791 + * from the original since ksmd passed.
5792 + */
5793 + if ((rmap_item->slot->vma == vma) ==
5794 + search_new_forks)
5795 + continue;
5797 + if (memcg &&
5798 + !mm_match_cgroup(vma->vm_mm, memcg))
5799 + continue;
5801 + referenced +=
5802 + page_referenced_one(page, vma,
5803 + address, &mapcount, vm_flags);
5804 + if (!search_new_forks || !mapcount)
5805 + break;
5808 + anon_vma_unlock(anon_vma);
5809 + if (!mapcount)
5810 + goto out;
5812 - anon_vma_unlock(anon_vma);
5813 - if (!mapcount)
5814 - goto out;
5816 if (!search_new_forks++)
5817 goto again;
5818 @@ -1651,10 +4413,12 @@
5819 int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
5821 struct stable_node *stable_node;
5822 - struct hlist_node *hlist;
5823 + struct node_vma *node_vma;
5824 + struct hlist_node *hlist, *rmap_hlist;
5825 struct rmap_item *rmap_item;
5826 int ret = SWAP_AGAIN;
5827 int search_new_forks = 0;
5828 + unsigned long address;
5830 VM_BUG_ON(!PageKsm(page));
5831 VM_BUG_ON(!PageLocked(page));
5832 @@ -1663,34 +4427,42 @@
5833 if (!stable_node)
5834 return SWAP_FAIL;
5835 again:
5836 - hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
5837 - struct anon_vma *anon_vma = rmap_item->anon_vma;
5838 - struct anon_vma_chain *vmac;
5839 - struct vm_area_struct *vma;
5841 - anon_vma_lock(anon_vma);
5842 - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
5843 - vma = vmac->vma;
5844 - if (rmap_item->address < vma->vm_start ||
5845 - rmap_item->address >= vma->vm_end)
5846 - continue;
5847 - /*
5848 - * Initially we examine only the vma which covers this
5849 - * rmap_item; but later, if there is still work to do,
5850 - * we examine covering vmas in other mms: in case they
5851 - * were forked from the original since ksmd passed.
5852 - */
5853 - if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
5854 - continue;
5856 - ret = try_to_unmap_one(page, vma,
5857 - rmap_item->address, flags);
5858 - if (ret != SWAP_AGAIN || !page_mapped(page)) {
5859 - anon_vma_unlock(anon_vma);
5860 - goto out;
5861 + hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
5862 + hlist_for_each_entry(rmap_item, rmap_hlist,
5863 + &node_vma->rmap_hlist, hlist) {
5864 + struct anon_vma *anon_vma = rmap_item->anon_vma;
5865 + struct anon_vma_chain *vmac;
5866 + struct vm_area_struct *vma;
5868 + anon_vma_lock(anon_vma);
5869 + list_for_each_entry(vmac, &anon_vma->head,
5870 + same_anon_vma) {
5871 + vma = vmac->vma;
5872 + address = get_rmap_addr(rmap_item);
5874 + if (address < vma->vm_start ||
5875 + address >= vma->vm_end)
5876 + continue;
5877 + /*
5878 + * Initially we examine only the vma which
5879 + * covers this rmap_item; but later, if there
5880 + * is still work to do, we examine covering
5881 + * vmas in other mms: in case they were forked
5882 + * from the original since ksmd passed.
5883 + */
5884 + if ((rmap_item->slot->vma == vma) ==
5885 + search_new_forks)
5886 + continue;
5888 + ret = try_to_unmap_one(page, vma,
5889 + address, flags);
5890 + if (ret != SWAP_AGAIN || !page_mapped(page)) {
5891 + anon_vma_unlock(anon_vma);
5892 + goto out;
5895 + anon_vma_unlock(anon_vma);
5897 - anon_vma_unlock(anon_vma);
5899 if (!search_new_forks++)
5900 goto again;
5901 @@ -1703,10 +4475,12 @@
5902 struct vm_area_struct *, unsigned long, void *), void *arg)
5904 struct stable_node *stable_node;
5905 - struct hlist_node *hlist;
5906 + struct node_vma *node_vma;
5907 + struct hlist_node *hlist, *rmap_hlist;
5908 struct rmap_item *rmap_item;
5909 int ret = SWAP_AGAIN;
5910 int search_new_forks = 0;
5911 + unsigned long address;
5913 VM_BUG_ON(!PageKsm(page));
5914 VM_BUG_ON(!PageLocked(page));
5915 @@ -1715,33 +4489,35 @@
5916 if (!stable_node)
5917 return ret;
5918 again:
5919 - hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
5920 - struct anon_vma *anon_vma = rmap_item->anon_vma;
5921 - struct anon_vma_chain *vmac;
5922 - struct vm_area_struct *vma;
5924 - anon_vma_lock(anon_vma);
5925 - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
5926 - vma = vmac->vma;
5927 - if (rmap_item->address < vma->vm_start ||
5928 - rmap_item->address >= vma->vm_end)
5929 - continue;
5930 - /*
5931 - * Initially we examine only the vma which covers this
5932 - * rmap_item; but later, if there is still work to do,
5933 - * we examine covering vmas in other mms: in case they
5934 - * were forked from the original since ksmd passed.
5935 - */
5936 - if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
5937 - continue;
5939 - ret = rmap_one(page, vma, rmap_item->address, arg);
5940 - if (ret != SWAP_AGAIN) {
5941 - anon_vma_unlock(anon_vma);
5942 - goto out;
5943 + hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
5944 + hlist_for_each_entry(rmap_item, rmap_hlist,
5945 + &node_vma->rmap_hlist, hlist) {
5946 + struct anon_vma *anon_vma = rmap_item->anon_vma;
5947 + struct anon_vma_chain *vmac;
5948 + struct vm_area_struct *vma;
5950 + anon_vma_lock(anon_vma);
5951 + list_for_each_entry(vmac, &anon_vma->head,
5952 + same_anon_vma) {
5953 + vma = vmac->vma;
5954 + address = get_rmap_addr(rmap_item);
5956 + if (address < vma->vm_start ||
5957 + address >= vma->vm_end)
5958 + continue;
5960 + if ((rmap_item->slot->vma == vma) ==
5961 + search_new_forks)
5962 + continue;
5964 + ret = rmap_one(page, vma, address, arg);
5965 + if (ret != SWAP_AGAIN) {
5966 + anon_vma_unlock(anon_vma);
5967 + goto out;
5970 + anon_vma_unlock(anon_vma);
5972 - anon_vma_unlock(anon_vma);
5974 if (!search_new_forks++)
5975 goto again;
5976 @@ -1771,7 +4547,7 @@
5978 struct rb_node *node;
5980 - for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) {
5981 + for (node = rb_first(root_stable_treep); node; node = rb_next(node)) {
5982 struct stable_node *stable_node;
5984 stable_node = rb_entry(node, struct stable_node, node);
5985 @@ -1810,7 +4586,7 @@
5987 while ((stable_node = ksm_check_stable_tree(mn->start_pfn,
5988 mn->start_pfn + mn->nr_pages)) != NULL)
5989 - remove_node_from_stable_tree(stable_node);
5990 + remove_node_from_stable_tree(stable_node, 1, 1);
5991 /* fallthrough */
5993 case MEM_CANCEL_OFFLINE:
5994 @@ -1835,7 +4611,7 @@
5995 static ssize_t sleep_millisecs_show(struct kobject *kobj,
5996 struct kobj_attribute *attr, char *buf)
5998 - return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
5999 + return sprintf(buf, "%u\n", jiffies_to_msecs(ksm_sleep_jiffies));
6002 static ssize_t sleep_millisecs_store(struct kobject *kobj,
6003 @@ -1849,34 +4625,58 @@
6004 if (err || msecs > UINT_MAX)
6005 return -EINVAL;
6007 - ksm_thread_sleep_millisecs = msecs;
6008 + ksm_sleep_jiffies = msecs_to_jiffies(msecs);
6010 return count;
6012 KSM_ATTR(sleep_millisecs);
6014 -static ssize_t pages_to_scan_show(struct kobject *kobj,
6015 +static ssize_t min_scan_ratio_show(struct kobject *kobj,
6016 + struct kobj_attribute *attr, char *buf)
6018 + return sprintf(buf, "%u\n", ksm_min_scan_ratio);
6021 +static ssize_t min_scan_ratio_store(struct kobject *kobj,
6022 + struct kobj_attribute *attr,
6023 + const char *buf, size_t count)
6025 + unsigned long msr;
6026 + int err;
6028 + err = strict_strtoul(buf, 10, &msr);
6029 + if (err || msr > UINT_MAX)
6030 + return -EINVAL;
6032 + ksm_min_scan_ratio = msr;
6034 + return count;
6036 +KSM_ATTR(min_scan_ratio);
6038 +static ssize_t scan_batch_pages_show(struct kobject *kobj,
6039 struct kobj_attribute *attr, char *buf)
6041 - return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
6042 + return sprintf(buf, "%lu\n", ksm_scan_batch_pages);
6045 -static ssize_t pages_to_scan_store(struct kobject *kobj,
6046 +static ssize_t scan_batch_pages_store(struct kobject *kobj,
6047 struct kobj_attribute *attr,
6048 const char *buf, size_t count)
6050 int err;
6051 - unsigned long nr_pages;
6052 + unsigned long batch_pages;
6054 - err = strict_strtoul(buf, 10, &nr_pages);
6055 - if (err || nr_pages > UINT_MAX)
6056 + err = strict_strtoul(buf, 10, &batch_pages);
6057 + if (err || batch_pages > UINT_MAX)
6058 return -EINVAL;
6060 - ksm_thread_pages_to_scan = nr_pages;
6061 + ksm_scan_batch_pages = batch_pages;
6062 + cal_ladder_pages_to_scan(ksm_scan_batch_pages);
6064 return count;
6066 -KSM_ATTR(pages_to_scan);
6067 +KSM_ATTR(scan_batch_pages);
6069 static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
6070 char *buf)
6071 @@ -1893,28 +4693,12 @@
6072 err = strict_strtoul(buf, 10, &flags);
6073 if (err || flags > UINT_MAX)
6074 return -EINVAL;
6075 - if (flags > KSM_RUN_UNMERGE)
6076 + if (flags > KSM_RUN_MERGE)
6077 return -EINVAL;
6079 - /*
6080 - * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
6081 - * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
6082 - * breaking COW to free the pages_shared (but leaves mm_slots
6083 - * on the list for when ksmd may be set running again).
6084 - */
6086 mutex_lock(&ksm_thread_mutex);
6087 if (ksm_run != flags) {
6088 ksm_run = flags;
6089 - if (flags & KSM_RUN_UNMERGE) {
6090 - current->flags |= PF_OOM_ORIGIN;
6091 - err = unmerge_and_remove_all_rmap_items();
6092 - current->flags &= ~PF_OOM_ORIGIN;
6093 - if (err) {
6094 - ksm_run = KSM_RUN_STOP;
6095 - count = err;
6099 mutex_unlock(&ksm_thread_mutex);
6101 @@ -1925,6 +4709,30 @@
6103 KSM_ATTR(run);
6106 +static ssize_t thrash_threshold_show(struct kobject *kobj,
6107 + struct kobj_attribute *attr, char *buf)
6109 + return sprintf(buf, "%u\n", ksm_thrash_threshold);
6112 +static ssize_t thrash_threshold_store(struct kobject *kobj,
6113 + struct kobj_attribute *attr,
6114 + const char *buf, size_t count)
6116 + int err;
6117 + unsigned long flags;
6119 + err = strict_strtoul(buf, 10, &flags);
6120 + if (err || flags > 99)
6121 + return -EINVAL;
6123 + ksm_thrash_threshold = flags;
6125 + return count;
6127 +KSM_ATTR(thrash_threshold);
6129 static ssize_t pages_shared_show(struct kobject *kobj,
6130 struct kobj_attribute *attr, char *buf)
6132 @@ -1946,60 +4754,291 @@
6134 KSM_ATTR_RO(pages_unshared);
6136 -static ssize_t pages_volatile_show(struct kobject *kobj,
6137 - struct kobj_attribute *attr, char *buf)
6138 +static ssize_t full_scans_show(struct kobject *kobj,
6139 + struct kobj_attribute *attr, char *buf)
6141 - long ksm_pages_volatile;
6142 + return sprintf(buf, "%llu\n", ksm_scan_round);
6144 +KSM_ATTR_RO(full_scans);
6146 - ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
6147 - - ksm_pages_sharing - ksm_pages_unshared;
6148 - /*
6149 - * It was not worth any locking to calculate that statistic,
6150 - * but it might therefore sometimes be negative: conceal that.
6151 - */
6152 - if (ksm_pages_volatile < 0)
6153 - ksm_pages_volatile = 0;
6154 - return sprintf(buf, "%ld\n", ksm_pages_volatile);
6155 +static ssize_t pages_scanned_show(struct kobject *kobj,
6156 + struct kobj_attribute *attr, char *buf)
6158 + unsigned long base = 0;
6159 + u64 delta, ret;
6161 + if (pages_scanned_stored) {
6162 + base = pages_scanned_base;
6163 + ret = pages_scanned_stored;
6164 + delta = ksm_pages_scanned >> base;
6165 + if (CAN_OVERFLOW_U64(ret, delta)) {
6166 + ret >>= 1;
6167 + delta >>= 1;
6168 + base++;
6169 + ret += delta;
6171 + } else {
6172 + ret = ksm_pages_scanned;
6175 + while (ret > ULONG_MAX) {
6176 + ret >>= 1;
6177 + base++;
6180 + if (base)
6181 + return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base);
6182 + else
6183 + return sprintf(buf, "%lu\n", (unsigned long)ret);
6185 -KSM_ATTR_RO(pages_volatile);
6186 +KSM_ATTR_RO(pages_scanned);
6188 -static ssize_t full_scans_show(struct kobject *kobj,
6189 - struct kobj_attribute *attr, char *buf)
6190 +static ssize_t hash_strength_show(struct kobject *kobj,
6191 + struct kobj_attribute *attr, char *buf)
6193 - return sprintf(buf, "%lu\n", ksm_scan.seqnr);
6194 + return sprintf(buf, "%lu\n", hash_strength);
6196 -KSM_ATTR_RO(full_scans);
6197 +KSM_ATTR_RO(hash_strength);
6199 +static ssize_t sleep_times_show(struct kobject *kobj,
6200 + struct kobj_attribute *attr, char *buf)
6202 + return sprintf(buf, "%llu\n", ksm_sleep_times);
6204 +KSM_ATTR_RO(sleep_times);
6207 static struct attribute *ksm_attrs[] = {
6208 &sleep_millisecs_attr.attr,
6209 - &pages_to_scan_attr.attr,
6210 + &scan_batch_pages_attr.attr,
6211 &run_attr.attr,
6212 &pages_shared_attr.attr,
6213 &pages_sharing_attr.attr,
6214 &pages_unshared_attr.attr,
6215 - &pages_volatile_attr.attr,
6216 &full_scans_attr.attr,
6217 + &min_scan_ratio_attr.attr,
6218 + &pages_scanned_attr.attr,
6219 + &hash_strength_attr.attr,
6220 + &sleep_times_attr.attr,
6221 + &thrash_threshold_attr.attr,
6222 NULL,
6225 static struct attribute_group ksm_attr_group = {
6226 .attrs = ksm_attrs,
6227 - .name = "ksm",
6228 + .name = "uksm",
6230 #endif /* CONFIG_SYSFS */
6232 +static inline void init_scan_ladder(void)
6234 + int i;
6235 + unsigned long mul = 1;
6237 + unsigned long pages_to_scan;
6239 + pages_to_scan = ksm_scan_batch_pages;
6241 + for (i = 0; i < ksm_scan_ladder_size; i++,
6242 + mul *= ksm_scan_ratio_delta) {
6244 + ksm_scan_ladder[i].scan_ratio = ksm_min_scan_ratio * mul;
6245 + INIT_LIST_HEAD(&ksm_scan_ladder[i].vma_list);
6246 + ksm_scan_ladder[i].vma_num = 0;
6247 + ksm_scan_ladder[i].round_finished = 0;
6248 + ksm_scan_ladder[i].fully_scanned_slots = 0;
6249 + ksm_scan_ladder[i].busy_searched = 0;
6252 + cal_ladder_pages_to_scan(ksm_scan_batch_pages);
6255 +static inline int cal_positive_negative_costs(void)
6257 + struct page *p1, *p2;
6258 + unsigned char *addr1, *addr2;
6259 + unsigned long i, time_start, hash_cost;
6260 + unsigned long loopnum = 0;
6262 + /* IMPORTANT: volatile is needed to keep gcc from optimizing these loops away. */
6263 + volatile u32 hash;
6264 + volatile int ret;
6266 + p1 = alloc_page(GFP_KERNEL);
6267 + if (!p1)
6268 + return -ENOMEM;
6270 + p2 = alloc_page(GFP_KERNEL);
6271 + if (!p2)
6272 + return -ENOMEM;
6274 + addr1 = kmap_atomic(p1, KM_USER0);
6275 + addr2 = kmap_atomic(p2, KM_USER1);
6276 + memset(addr1, random32(), PAGE_SIZE);
6277 + memcpy(addr2, addr1, PAGE_SIZE);
6279 + /* make sure that the two pages differ in the last byte */
6280 + addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1];
6281 + kunmap_atomic(addr2, KM_USER1);
6282 + kunmap_atomic(addr1, KM_USER0);
6284 + time_start = jiffies;
6285 + while (jiffies - time_start < 100) {
6286 + for (i = 0; i < 100; i++)
6287 + hash = page_hash(p1, HASH_STRENGTH_FULL, 0);
6288 + loopnum += 100;
6290 + hash_cost = (jiffies - time_start);
6292 + time_start = jiffies;
6293 + for (i = 0; i < loopnum; i++)
6294 + ret = pages_identical(p1, p2);
6295 + memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start);
6296 + memcmp_cost /= hash_cost;
6297 + printk(KERN_INFO "UKSM: relative memcmp_cost = %lu.\n", memcmp_cost);
6299 + __free_page(p1);
6300 + __free_page(p2);
6301 + return 0;
6304 +static int init_zeropage_hash_table(void)
6306 + struct page *page;
6307 + char *addr;
6308 + int i;
6310 + page = alloc_page(GFP_KERNEL);
6311 + if (!page)
6312 + return -ENOMEM;
6314 + addr = kmap_atomic(page, KM_USER0);
6315 + memset(addr, 0, PAGE_SIZE);
6316 + kunmap_atomic(addr, KM_USER0);
6318 + zero_hash_table = kmalloc(HASH_STRENGTH_MAX * sizeof(u32),
6319 + GFP_KERNEL);
6320 + if (!zero_hash_table)
6321 + return -ENOMEM;
6323 + for (i = 0; i < HASH_STRENGTH_MAX; i++)
6324 + zero_hash_table[i] = page_hash(page, i, 0);
6326 + __free_page(page);
6328 + return 0;
6331 +static inline int init_random_sampling(void)
6333 + unsigned long i;
6334 + random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL);
6335 + if (!random_nums)
6336 + return -ENOMEM;
6338 + for (i = 0; i < HASH_STRENGTH_FULL; i++)
6339 + random_nums[i] = i;
6341 + for (i = 0; i < HASH_STRENGTH_FULL; i++) {
6342 + unsigned long rand_range, swap_index, tmp;
6344 + rand_range = HASH_STRENGTH_FULL - i;
6345 + swap_index = i + random32() % rand_range;
6346 + tmp = random_nums[i];
6347 + random_nums[i] = random_nums[swap_index];
6348 + random_nums[swap_index] = tmp;
6351 + rshash_state.state = RSHASH_NEW;
6352 + rshash_state.below_count = 0;
6353 + rshash_state.lookup_window_index = 0;
6355 + return cal_positive_negative_costs();
6358 +static int __init ksm_slab_init(void)
6360 + rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
6361 + if (!rmap_item_cache)
6362 + goto out;
6364 + stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
6365 + if (!stable_node_cache)
6366 + goto out_free1;
6368 + node_vma_cache = KSM_KMEM_CACHE(node_vma, 0);
6369 + if (!node_vma_cache)
6370 + goto out_free2;
6372 + vma_slot_cache = KSM_KMEM_CACHE(vma_slot, 0);
6373 + if (!vma_slot_cache)
6374 + goto out_free3;
6376 + tree_node_cache = KSM_KMEM_CACHE(tree_node, 0);
6377 + if (!tree_node_cache)
6378 + goto out_free4;
6380 + return 0;
6382 +out_free4:
6383 + kmem_cache_destroy(vma_slot_cache);
6384 +out_free3:
6385 + kmem_cache_destroy(node_vma_cache);
6386 +out_free2:
6387 + kmem_cache_destroy(stable_node_cache);
6388 +out_free1:
6389 + kmem_cache_destroy(rmap_item_cache);
6390 +out:
6391 + return -ENOMEM;
6394 +static void __init ksm_slab_free(void)
6396 + kmem_cache_destroy(stable_node_cache);
6397 + kmem_cache_destroy(rmap_item_cache);
6398 + kmem_cache_destroy(node_vma_cache);
6399 + kmem_cache_destroy(vma_slot_cache);
6400 + kmem_cache_destroy(tree_node_cache);
6403 static int __init ksm_init(void)
6405 struct task_struct *ksm_thread;
6406 int err;
6407 + unsigned int sr = ksm_min_scan_ratio;
6409 + ksm_scan_ladder_size = 1;
6410 + while (sr < KSM_SCAN_RATIO_MAX) {
6411 + sr *= ksm_scan_ratio_delta;
6412 + ksm_scan_ladder_size++;
6414 + ksm_scan_ladder = kzalloc(sizeof(struct scan_rung) *
6415 + ksm_scan_ladder_size, GFP_KERNEL);
6416 + if (!ksm_scan_ladder) {
6417 + printk(KERN_ERR "uksm scan ladder allocation failed, size=%d\n",
6418 + ksm_scan_ladder_size);
6419 + err = -ENOMEM;
6420 + goto out;
6422 + init_scan_ladder();
6424 + INIT_RADIX_TREE(&ksm_vma_tree, GFP_KERNEL);
6426 + err = init_random_sampling();
6427 + if (err)
6428 + goto out_free2;
6430 err = ksm_slab_init();
6431 if (err)
6432 - goto out;
6433 + goto out_free1;
6435 - ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
6436 + err = init_zeropage_hash_table();
6437 + if (err)
6438 + goto out_free0;
6440 + ksm_thread = kthread_run(ksm_scan_thread, NULL, "uksmd");
6441 if (IS_ERR(ksm_thread)) {
6442 - printk(KERN_ERR "ksm: creating kthread failed\n");
6443 + printk(KERN_ERR "uksm: creating kthread failed\n");
6444 err = PTR_ERR(ksm_thread);
6445 goto out_free;
6447 @@ -2007,7 +5046,7 @@
6448 #ifdef CONFIG_SYSFS
6449 err = sysfs_create_group(mm_kobj, &ksm_attr_group);
6450 if (err) {
6451 - printk(KERN_ERR "ksm: register sysfs failed\n");
6452 + printk(KERN_ERR "uksm: register sysfs failed\n");
6453 kthread_stop(ksm_thread);
6454 goto out_free;
6456 @@ -2026,8 +5065,20 @@
6457 return 0;
6459 out_free:
6460 + kfree(zero_hash_table);
6461 +out_free0:
6462 ksm_slab_free();
6463 +out_free1:
6464 + kfree(random_nums);
6465 +out_free2:
6466 + kfree(ksm_scan_ladder);
6467 out:
6468 return err;
6471 +#ifdef MODULE
6472 module_init(ksm_init)
6473 +#else
6474 +late_initcall(ksm_init);
6475 +#endif
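
For reference, pages_scanned_show() above reports the lifetime scan count as "value * 2^base": whenever folding the current ksm_pages_scanned delta into the stored total would overflow a u64, both are halved and the exponent is bumped, trading one bit of precision for unbounded range. A minimal user-space model of that scheme (a sketch; the struct and function names here are illustrative, not the patch's):

/* scaled_counter.c - build with: cc -Wall -o scaled_counter scaled_counter.c */
#include <stdint.h>
#include <stdio.h>

struct scaled_counter {
	uint64_t value;		/* mantissa of the running total */
	unsigned long base;	/* total is approximately value << base */
};

static void scaled_add(struct scaled_counter *c, uint64_t raw_delta)
{
	uint64_t delta = raw_delta >> c->base;	/* scale delta down to the current base */

	if (delta > UINT64_MAX - c->value) {	/* adding would overflow: drop one bit */
		c->value >>= 1;
		delta >>= 1;
		c->base++;
	}
	c->value += delta;
}

int main(void)
{
	struct scaled_counter c = { 0, 0 };
	int i;

	for (i = 0; i < 5; i++)
		scaled_add(&c, UINT64_MAX / 2);	/* force a few rescales */
	printf("%llu * 2^%lu\n", (unsigned long long)c.value, c.base);
	return 0;
}

The repeated rescaling keeps the sysfs output meaningful even after the raw counter has grown past 64-bit range; the printed form matches the "%lu * 2^%lu" output of the handler above.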
6477 diff -urN linux-2.6.38/mm/madvise.c uksm-2.6.38-zhang/mm/madvise.c
6478 --- linux-2.6.38/mm/madvise.c 2011-03-15 09:20:32.000000000 +0800
6479 +++ uksm-2.6.38-zhang/mm/madvise.c 2012-01-09 10:05:59.862270375 +0800
6480 @@ -65,12 +65,6 @@
6482 new_flags &= ~VM_DONTCOPY;
6483 break;
6484 - case MADV_MERGEABLE:
6485 - case MADV_UNMERGEABLE:
6486 - error = ksm_madvise(vma, start, end, behavior, &new_flags);
6487 - if (error)
6488 - goto out;
6489 - break;
6490 case MADV_HUGEPAGE:
6491 case MADV_NOHUGEPAGE:
6492 error = hugepage_madvise(vma, &new_flags, behavior);
6493 @@ -285,10 +279,6 @@
6494 case MADV_REMOVE:
6495 case MADV_WILLNEED:
6496 case MADV_DONTNEED:
6497 -#ifdef CONFIG_KSM
6498 - case MADV_MERGEABLE:
6499 - case MADV_UNMERGEABLE:
6500 -#endif
6501 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
6502 case MADV_HUGEPAGE:
6503 case MADV_NOHUGEPAGE:
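
With MADV_MERGEABLE / MADV_UNMERGEABLE dropped from both the madvise() behaviour switch and the list of accepted behaviours above, an explicit opt-in is no longer needed (uksm scans VMAs on its own), and such a call is expected to be rejected as an invalid behaviour on a patched kernel. A small user-space check, as a sketch:

/* madv_check.c - build with: cc -Wall -o madv_check madv_check.c */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 16 * 4096;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
#ifdef MADV_MERGEABLE
	if (madvise(p, len, MADV_MERGEABLE) != 0)
		printf("madvise(MADV_MERGEABLE): %s (expected with this patch applied)\n",
		       strerror(errno));
	else
		printf("madvise(MADV_MERGEABLE) accepted (stock KSM behaviour)\n");
#else
	printf("MADV_MERGEABLE is not defined by the libc headers on this system\n");
#endif
	munmap(p, len);
	return 0;
}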
6504 diff -urN linux-2.6.38/mm/memory.c uksm-2.6.38-zhang/mm/memory.c
6505 --- linux-2.6.38/mm/memory.c 2011-03-15 09:20:32.000000000 +0800
6506 +++ uksm-2.6.38-zhang/mm/memory.c 2012-01-09 10:09:28.235610655 +0800
6507 @@ -112,6 +112,37 @@
6508 unsigned long zero_pfn __read_mostly;
6509 unsigned long highest_memmap_pfn __read_mostly;
6511 +#ifdef CONFIG_KSM
6512 +unsigned long ksm_zero_pfn __read_mostly;
6513 +struct page *empty_ksm_zero_page;
6515 +static int __init setup_ksm_zero_page(void)
6517 + unsigned long addr;
6518 + addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, 0);
6519 + if (!addr)
6520 + panic("Oh boy, that early out of memory?");
6522 + empty_ksm_zero_page = virt_to_page((void *) addr);
6523 + SetPageReserved(empty_ksm_zero_page);
6525 + ksm_zero_pfn = page_to_pfn(empty_ksm_zero_page);
6527 + return 0;
6529 +core_initcall(setup_ksm_zero_page);
6531 +static inline int is_ksm_zero_pfn(unsigned long pfn)
6533 + return pfn == ksm_zero_pfn;
6535 +#else
6536 +static inline int is_ksm_zero_pfn(unsigned long pfn)
6538 + return 0;
6540 +#endif
6543 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
6545 @@ -123,6 +154,7 @@
6546 core_initcall(init_zero_pfn);
6550 #if defined(SPLIT_RSS_COUNTING)
6552 static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
6553 @@ -609,7 +641,7 @@
6554 goto check_pfn;
6555 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
6556 return NULL;
6557 - if (!is_zero_pfn(pfn))
6558 + if (!is_zero_pfn(pfn) && !is_ksm_zero_pfn(pfn))
6559 print_bad_pte(vma, addr, pte, NULL);
6560 return NULL;
6562 @@ -631,7 +663,7 @@
6566 - if (is_zero_pfn(pfn))
6567 + if (is_zero_pfn(pfn) || is_ksm_zero_pfn(pfn))
6568 return NULL;
6569 check_pfn:
6570 if (unlikely(pfn > highest_memmap_pfn)) {
6571 @@ -719,6 +751,10 @@
6572 rss[MM_ANONPAGES]++;
6573 else
6574 rss[MM_FILEPAGES]++;
6575 +#ifdef CONFIG_KSM
6576 + if (PageKsm(page)) /* follows page_dup_rmap() */
6577 + inc_zone_page_state(page, NR_KSM_PAGES_SHARING);
6578 +#endif
6581 out_set_pte:
6582 @@ -1341,7 +1377,8 @@
6583 page = vm_normal_page(vma, address, pte);
6584 if (unlikely(!page)) {
6585 if ((flags & FOLL_DUMP) ||
6586 - !is_zero_pfn(pte_pfn(pte)))
6587 + (!is_zero_pfn(pte_pfn(pte)) &&
6588 + !is_ksm_zero_pfn(pte_pfn(pte))))
6589 goto bad_page;
6590 page = pte_page(pte);
6592 @@ -1423,7 +1460,7 @@
6594 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
6596 - /*
6597 + /*
6598 * Require read or write permissions.
6599 * If FOLL_FORCE is set, we only require the "MAY" flags.
6601 @@ -1470,7 +1507,8 @@
6602 page = vm_normal_page(gate_vma, start, *pte);
6603 if (!page) {
6604 if (!(gup_flags & FOLL_DUMP) &&
6605 - is_zero_pfn(pte_pfn(*pte)))
6606 + (is_zero_pfn(pte_pfn(*pte)) ||
6607 + is_ksm_zero_pfn(pte_pfn(*pte))))
6608 page = pte_page(*pte);
6609 else {
6610 pte_unmap(pte);
6611 @@ -2158,8 +2196,13 @@
6612 clear_page(kaddr);
6613 kunmap_atomic(kaddr, KM_USER0);
6614 flush_dcache_page(dst);
6615 - } else
6616 + } else {
6617 copy_user_highpage(dst, src, va, vma);
6618 +#ifdef CONFIG_KSM
6619 + if (vma->ksm_vma_slot && PageKsm(src))
6620 + vma->ksm_vma_slot->pages_cowed++;
6621 +#endif
6626 @@ -2353,10 +2396,15 @@
6627 if (unlikely(anon_vma_prepare(vma)))
6628 goto oom;
6630 - if (is_zero_pfn(pte_pfn(orig_pte))) {
6631 + if (is_zero_pfn(pte_pfn(orig_pte))
6632 + || is_ksm_zero_pfn(pte_pfn(orig_pte))) {
6633 new_page = alloc_zeroed_user_highpage_movable(vma, address);
6634 if (!new_page)
6635 goto oom;
6636 +#ifdef CONFIG_KSM
6637 + if (vma->ksm_vma_slot && is_ksm_zero_pfn(pte_pfn(orig_pte)))
6638 + vma->ksm_vma_slot->pages_cowed++;
6639 +#endif
6640 } else {
6641 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
6642 if (!new_page)
6643 @@ -2378,6 +2426,11 @@
6644 dec_mm_counter_fast(mm, MM_FILEPAGES);
6645 inc_mm_counter_fast(mm, MM_ANONPAGES);
6647 +#ifdef CONFIG_KSM
6648 + if (is_ksm_zero_pfn(pte_pfn(orig_pte)))
6649 + __dec_zone_page_state(old_page,
6650 + NR_KSM_ZERO_PAGES);
6651 +#endif
6652 } else
6653 inc_mm_counter_fast(mm, MM_ANONPAGES);
6654 flush_cache_page(vma, address, pte_pfn(orig_pte));
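
In the follow_page() hunk above, the two negated pfn tests are combined with &&, so the pte is only treated as bad when it maps neither the ordinary zero page nor the KSM zero page; by De Morgan this matches the positive is_zero_pfn() || is_ksm_zero_pfn() form used in the gate-area hunk. A quick user-space truth-table comparison of the two possible combinations (a sketch, with plain ints standing in for the pfn tests):

/* zero_pfn_guard.c - build with: cc -Wall -o zero_pfn_guard zero_pfn_guard.c */
#include <stdio.h>

int main(void)
{
	int dump, is_zero, is_ksm_zero;

	printf("dump zero ksm  or-form  and-form\n");
	for (dump = 0; dump <= 1; dump++)
		for (is_zero = 0; is_zero <= 1; is_zero++)
			for (is_ksm_zero = 0; is_ksm_zero <= 1; is_ksm_zero++) {
				/* or-form would reject even the plain zero page */
				int or_form  = dump || !is_zero || !is_ksm_zero;
				/* and-form rejects only when it is neither zero page */
				int and_form = dump || (!is_zero && !is_ksm_zero);

				printf("%4d %4d %3d  %7d  %8d\n",
				       dump, is_zero, is_ksm_zero, or_form, and_form);
			}
	return 0;
}

Only the &&-form lets a FOLL_DUMP-less get_user_pages() follow either zero page, which is what the surrounding hunk intends (the row with both tests set cannot occur for a single pfn and is shown only for completeness).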
6656 diff -urN linux-2.6.38/mm/mmap.c uksm-2.6.38-zhang/mm/mmap.c
6657 --- linux-2.6.38/mm/mmap.c 2011-03-15 09:20:32.000000000 +0800
6658 +++ uksm-2.6.38-zhang/mm/mmap.c 2012-01-09 10:05:59.872270374 +0800
6659 @@ -30,6 +30,7 @@
6660 #include <linux/perf_event.h>
6661 #include <linux/audit.h>
6662 #include <linux/khugepaged.h>
6663 +#include <linux/ksm.h>
6665 #include <asm/uaccess.h>
6666 #include <asm/cacheflush.h>
6667 @@ -65,7 +66,7 @@
6668 * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes
6669 * w: (no) no w: (no) no w: (yes) yes w: (no) no
6670 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
6671 - *
6673 * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes
6674 * w: (no) no w: (no) no w: (copy) copy w: (no) no
6675 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
6676 @@ -240,6 +241,9 @@
6677 removed_exe_file_vma(vma->vm_mm);
6679 mpol_put(vma_policy(vma));
6680 +#ifdef CONFIG_KSM
6681 + ksm_remove_vma(vma);
6682 +#endif
6683 kmem_cache_free(vm_area_cachep, vma);
6684 return next;
6686 @@ -529,9 +533,20 @@
6687 long adjust_next = 0;
6688 int remove_next = 0;
6690 + /*
6691 + * To avoid deadlock, ksm_remove_vma() must be called before any
6692 + * spinlock is acquired.
6693 + */
6694 +#ifdef CONFIG_KSM
6695 + ksm_remove_vma(vma);
6696 +#endif
6698 if (next && !insert) {
6699 struct vm_area_struct *exporter = NULL;
6701 +#ifdef CONFIG_KSM
6702 + ksm_remove_vma(next);
6703 +#endif
6704 if (end >= next->vm_end) {
6706 * vma expands, overlapping all the next, and
6707 @@ -616,10 +631,10 @@
6708 if (adjust_next)
6709 vma_prio_tree_remove(next, root);
6712 vma->vm_start = start;
6713 vma->vm_end = end;
6714 vma->vm_pgoff = pgoff;
6716 if (adjust_next) {
6717 next->vm_start += adjust_next << PAGE_SHIFT;
6718 next->vm_pgoff += adjust_next;
6719 @@ -672,10 +687,22 @@
6721 if (remove_next == 2) {
6722 next = vma->vm_next;
6723 +#ifdef CONFIG_KSM
6724 + ksm_remove_vma(next);
6725 +#endif
6726 goto again;
6728 + } else {
6729 +#ifdef CONFIG_KSM
6730 + if (next && !insert)
6731 + ksm_vma_add_new(next);
6732 +#endif
6735 +#ifdef CONFIG_KSM
6736 + ksm_vma_add_new(vma);
6737 +#endif
6739 validate_mm(mm);
6741 return 0;
6742 @@ -1352,6 +1379,9 @@
6744 vma_link(mm, vma, prev, rb_link, rb_parent);
6745 file = vma->vm_file;
6746 +#ifdef CONFIG_KSM
6747 + ksm_vma_add_new(vma);
6748 +#endif
6750 /* Once vma denies write, undo our temporary denial count */
6751 if (correct_wcount)
6752 @@ -1378,6 +1408,9 @@
6753 unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
6754 charged = 0;
6755 free_vma:
6756 +#ifdef CONFIG_KSM
6757 + ksm_remove_vma(vma);
6758 +#endif
6759 kmem_cache_free(vm_area_cachep, vma);
6760 unacct_error:
6761 if (charged)
6762 @@ -1453,7 +1486,7 @@
6763 addr = vma->vm_end;
6766 -#endif
6767 +#endif
6769 void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
6771 @@ -2014,6 +2047,10 @@
6772 else
6773 err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
6775 +#ifdef CONFIG_KSM
6776 + ksm_vma_add_new(new);
6777 +#endif
6779 /* Success. */
6780 if (!err)
6781 return 0;
6782 @@ -2250,6 +2287,9 @@
6783 vma->vm_flags = flags;
6784 vma->vm_page_prot = vm_get_page_prot(flags);
6785 vma_link(mm, vma, prev, rb_link, rb_parent);
6786 +#ifdef CONFIG_KSM
6787 + ksm_vma_add_new(vma);
6788 +#endif
6789 out:
6790 perf_event_mmap(vma);
6791 mm->total_vm += len >> PAGE_SHIFT;
6792 @@ -2273,6 +2313,12 @@
6793 /* mm's last user has gone, and its about to be pulled down */
6794 mmu_notifier_release(mm);
6796 + /*
6797 + * Taking the write lock on mmap_sem does not harm others,
6798 + * but it is crucial for uksm to avoid races.
6799 + */
6800 + down_write(&mm->mmap_sem);
6802 if (mm->locked_vm) {
6803 vma = mm->mmap;
6804 while (vma) {
6805 @@ -2306,6 +2352,11 @@
6806 while (vma)
6807 vma = remove_vma(vma);
6809 + mm->mmap = NULL;
6810 + mm->mm_rb = RB_ROOT;
6811 + mm->mmap_cache = NULL;
6812 + up_write(&mm->mmap_sem);
6814 BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
6817 @@ -2397,6 +2448,9 @@
6818 if (new_vma->vm_ops && new_vma->vm_ops->open)
6819 new_vma->vm_ops->open(new_vma);
6820 vma_link(mm, new_vma, prev, rb_link, rb_parent);
6821 +#ifdef CONFIG_KSM
6822 + ksm_vma_add_new(new_vma);
6823 +#endif
6826 return new_vma;
6827 @@ -2502,11 +2556,14 @@
6828 ret = insert_vm_struct(mm, vma);
6829 if (ret)
6830 goto out;
6832 mm->total_vm += len >> PAGE_SHIFT;
6834 perf_event_mmap(vma);
6836 +#ifdef CONFIG_KSM
6837 + ksm_vma_add_new(vma);
6838 +#endif
6840 return 0;
6842 out:
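
The mmap.c hunks above register every newly linked or adjusted VMA with uksm (ksm_vma_add_new) and unregister it before it is freed or re-adjusted (ksm_remove_vma), with each call site wrapped in #ifdef CONFIG_KSM. A common alternative (not what this patch does) is to keep the call sites unconditional and let the header provide no-op inline stubs when the feature is compiled out; a minimal sketch with placeholder names only:

/* hook_stubs.h - placeholder sketch; names and types are illustrative, not the patch's */
#ifndef HOOK_STUBS_H
#define HOOK_STUBS_H

struct tracked_vma;	/* stand-in for struct vm_area_struct */

#ifdef CONFIG_KSM
/* real implementations live in the scanner */
void hook_vma_add(struct tracked_vma *vma);
void hook_vma_remove(struct tracked_vma *vma);
#else
/* compiled-out build: call sites stay unconditional and cost nothing */
static inline void hook_vma_add(struct tracked_vma *vma) { (void)vma; }
static inline void hook_vma_remove(struct tracked_vma *vma) { (void)vma; }
#endif

#endif /* HOOK_STUBS_H */

Either way, the pairing visible in the hunks is what matters: every path that creates or links a vma adds a slot, and every path that frees or re-adjusts one drops its slot first.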
6843 diff -urN linux-2.6.38/mm/mremap.c uksm-2.6.38-zhang/mm/mremap.c
6844 --- linux-2.6.38/mm/mremap.c 2011-03-15 09:20:32.000000000 +0800
6845 +++ uksm-2.6.38-zhang/mm/mremap.c 2012-01-09 10:05:59.872270374 +0800
6846 @@ -191,8 +191,7 @@
6847 * pages recently unmapped. But leave vma->vm_flags as it was,
6848 * so KSM can come around to merge on vma and new_vma afterwards.
6850 - err = ksm_madvise(vma, old_addr, old_addr + old_len,
6851 - MADV_UNMERGEABLE, &vm_flags);
6852 + err = unmerge_ksm_pages(vma, old_addr, old_addr + old_len);
6853 if (err)
6854 return err;
6856 diff -urN linux-2.6.38/mm/rmap.c uksm-2.6.38-zhang/mm/rmap.c
6857 --- linux-2.6.38/mm/rmap.c 2011-03-15 09:20:32.000000000 +0800
6858 +++ uksm-2.6.38-zhang/mm/rmap.c 2012-01-09 10:05:59.875603707 +0800
6859 @@ -817,9 +817,9 @@
6862 * __page_set_anon_rmap - set up new anonymous rmap
6863 - * @page: Page to add to rmap
6864 + * @page: Page to add to rmap
6865 * @vma: VM area to add page to.
6866 - * @address: User virtual address of the mapping
6867 + * @address: User virtual address of the mapping
6868 * @exclusive: the page is exclusively owned by the current process
6870 static void __page_set_anon_rmap(struct page *page,
6871 @@ -905,9 +905,12 @@
6872 __inc_zone_page_state(page,
6873 NR_ANON_TRANSPARENT_HUGEPAGES);
6875 - if (unlikely(PageKsm(page)))
6876 +#ifdef CONFIG_KSM
6877 + if (unlikely(PageKsm(page))) {
6878 + __inc_zone_page_state(page, NR_KSM_PAGES_SHARING);
6879 return;
6882 +#endif
6883 VM_BUG_ON(!PageLocked(page));
6884 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
6885 if (first)
6886 @@ -965,6 +968,10 @@
6888 void page_remove_rmap(struct page *page)
6890 +#ifdef CONFIG_KSM
6891 + if (PageKsm(page))
6892 + __dec_zone_page_state(page, NR_KSM_PAGES_SHARING);
6893 +#endif
6894 /* page still mapped by someone else? */
6895 if (!atomic_add_negative(-1, &page->_mapcount))
6896 return;
6897 diff -urN linux-2.6.38/security/apparmor/capability_names.h uksm-2.6.38-zhang/security/apparmor/capability_names.h
6898 --- linux-2.6.38/security/apparmor/capability_names.h 1970-01-01 08:00:00.000000000 +0800
6899 +++ uksm-2.6.38-zhang/security/apparmor/capability_names.h 2012-01-10 09:30:37.569678996 +0800
6900 @@ -0,0 +1,37 @@
6901 +static const char *capability_names[] = {
6902 +[0] = "chown",
6903 +[1] = "dac_override",
6904 +[2] = "dac_read_search",
6905 +[3] = "fowner",
6906 +[4] = "fsetid",
6907 +[5] = "kill",
6908 +[6] = "setgid",
6909 +[7] = "setuid",
6910 +[8] = "setpcap",
6911 +[9] = "linux_immutable",
6912 +[10] = "net_bind_service",
6913 +[11] = "net_broadcast",
6914 +[12] = "net_admin",
6915 +[13] = "net_raw",
6916 +[14] = "ipc_lock",
6917 +[15] = "ipc_owner",
6918 +[16] = "sys_module",
6919 +[17] = "sys_rawio",
6920 +[18] = "sys_chroot",
6921 +[19] = "sys_ptrace",
6922 +[20] = "sys_pacct",
6923 +[21] = "sys_admin",
6924 +[22] = "sys_boot",
6925 +[23] = "sys_nice",
6926 +[24] = "sys_resource",
6927 +[25] = "sys_time",
6928 +[26] = "sys_tty_config",
6929 +[27] = "mknod",
6930 +[28] = "lease",
6931 +[29] = "audit_write",
6932 +[30] = "audit_control",
6933 +[31] = "setfcap",
6934 +[32] = "mac_override",
6935 +[33] = "mac_admin",
6936 +[34] = "syslog",
6938 diff -urN linux-2.6.38/security/apparmor/rlim_names.h uksm-2.6.38-zhang/security/apparmor/rlim_names.h
6939 --- linux-2.6.38/security/apparmor/rlim_names.h 1970-01-01 08:00:00.000000000 +0800
6940 +++ uksm-2.6.38-zhang/security/apparmor/rlim_names.h 2012-01-10 09:30:41.073012457 +0800
6941 @@ -0,0 +1,36 @@
6942 +static const char *rlim_names[] = {
6943 +[0] = "cpu",
6944 +[1] = "fsize",
6945 +[2] = "data",
6946 +[3] = "stack",
6947 +[4] = "core",
6948 +[5] = "rss",
6949 +[6] = "nproc",
6950 +[7] = "nofile",
6951 +[8] = "memlock",
6952 +[9] = "as",
6953 +[10] = "locks",
6954 +[11] = "sigpending",
6955 +[12] = "msgqueue",
6956 +[13] = "nice",
6957 +[14] = "rtprio",
6958 +[15] = "rttime",
6960 +static const int rlim_map[] = {
6961 +RLIMIT_CPU,
6962 +RLIMIT_FSIZE,
6963 +RLIMIT_DATA,
6964 +RLIMIT_STACK,
6965 +RLIMIT_CORE,
6966 +RLIMIT_RSS,
6967 +RLIMIT_NPROC,
6968 +RLIMIT_NOFILE,
6969 +RLIMIT_MEMLOCK,
6970 +RLIMIT_AS,
6971 +RLIMIT_LOCKS,
6972 +RLIMIT_SIGPENDING,
6973 +RLIMIT_MSGQUEUE,
6974 +RLIMIT_NICE,
6975 +RLIMIT_RTPRIO,
6976 +RLIMIT_RTTIME,