mm/swap.c

   1 /*
   2  *  linux/mm/swap.c
   3  *
   4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5  */
   6
   7 /*
   8  * This file contains the default values for the operation of the
   9  * Linux VM subsystem. Fine-tuning documentation can be found in
  10  * Documentation/sysctl/vm.txt.
  11  * Started 18.12.91
  12  * Swap aging added 23.2.95, Stephen Tweedie.
  13  * Buffermem limits added 12.3.98, Rik van Riel.
  14  */
  15
  16 #include <linux/mm.h>
  17 #include <linux/sched.h>
  18 #include <linux/kernel_stat.h>
  19 #include <linux/swap.h>
  20 #include <linux/mman.h>
  21 #include <linux/pagemap.h>
  22 #include <linux/pagevec.h>
  23 #include <linux/init.h>
  24 #include <linux/export.h>
  25 #include <linux/mm_inline.h>
  26 #include <linux/percpu_counter.h>
  27 #include <linux/percpu.h>
  28 #include <linux/cpu.h>
  29 #include <linux/notifier.h>
  30 #include <linux/backing-dev.h>
  31 #include <linux/memcontrol.h>
  32 #include <linux/gfp.h>
  33 #include <linux/uio.h>
  34 #include <linux/hugetlb.h>
  35
  36 #include "internal.h"
  37
  38 #define CREATE_TRACE_POINTS
  39 #include <trace/events/pagemap.h>
  40
  41 /* How many pages do we try to swap or page in/out together? */
  42 int page_cluster;
  43
  44 static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
  45 static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
  46 static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
  47
  48 /*
  49  * This path almost never happens for VM activity - pages are normally
  50  * freed via pagevecs.  But it gets used by networking.
  51  */
  52 static void __page_cache_release(struct page *page)
  53 {
  54         if (PageLRU(page)) {
  55                 struct zone *zone = page_zone(page);
  56                 struct lruvec *lruvec;
  57                 unsigned long flags;
  58
  59                 spin_lock_irqsave(&zone->lru_lock, flags);
  60                 lruvec = mem_cgroup_page_lruvec(page, zone);
  61                 VM_BUG_ON(!PageLRU(page));
  62                 __ClearPageLRU(page);
  63                 del_page_from_lru_list(page, lruvec, page_off_lru(page));
  64                 spin_unlock_irqrestore(&zone->lru_lock, flags);
  65         }
  66 }
  67
  68 static void __put_single_page(struct page *page)
  69 {
  70         __page_cache_release(page);
  71         free_hot_cold_page(page, 0);
  72 }
  73
  74 static void __put_compound_page(struct page *page)
  75 {
  76         compound_page_dtor *dtor;
  77
  78         __page_cache_release(page);
  79         dtor = get_compound_page_dtor(page);
  80         (*dtor)(page);
  81 }
  82
  83 static void put_compound_page(struct page *page)
  84 {
  85         /*
  86          * hugetlbfs pages cannot be split from under us.  If this is a
  87          * hugetlbfs page, check refcount on head page and release the page if
  88          * the refcount becomes zero.
  89          */
  90         if (PageHuge(page)) {
  91                 page = compound_head(page);
  92                 if (put_page_testzero(page))
  93                         __put_compound_page(page);
  94
  95                 return;
  96         }
  97
  98         if (unlikely(PageTail(page))) {
  99                 /* __split_huge_page_refcount can run under us */
 100                 struct page *page_head = compound_trans_head(page);
 101
 102                 if (likely(page != page_head &&
 103                            get_page_unless_zero(page_head))) {
 104                         unsigned long flags;
 105
 106                         /*
 107                          * THP can not break up slab pages so avoid taking
 108                          * compound_lock().  Slab performs non-atomic bit ops
 109                          * on page->flags for better performance.  In particular
 110                          * slab_unlock() in slub used to be a hot path.  It is
 111                          * still hot on arches that do not support
 112                          * this_cpu_cmpxchg_double().
 113                          */
 114                         if (PageSlab(page_head)) {
 115                                 if (PageTail(page)) {
 116                                         if (put_page_testzero(page_head))
 117                                                 VM_BUG_ON(1);
 118
 119                                         atomic_dec(&page->_mapcount);
 120                                         goto skip_lock_tail;
 121                                 } else
 122                                         goto skip_lock;
 123                         }
 124                         /*
 125                          * page_head wasn't a dangling pointer but it
 126                          * may not be a head page anymore by the time
 127                          * we obtain the lock. That is ok as long as it
 128                          * can't be freed from under us.
 129                          */
 130                         flags = compound_lock_irqsave(page_head);
 131                         if (unlikely(!PageTail(page))) {
 132                                 /* __split_huge_page_refcount run before us */
 133                                 compound_unlock_irqrestore(page_head, flags);
 134 skip_lock:
 135                                 if (put_page_testzero(page_head))
 136                                         __put_single_page(page_head);
 137 out_put_single:
 138                                 if (put_page_testzero(page))
 139                                         __put_single_page(page);
 140                                 return;
 141                         }
 142                         VM_BUG_ON(page_head != page->first_page);
 143                         /*
 144                          * We can release the refcount taken by
 145                          * get_page_unless_zero() now that
 146                          * __split_huge_page_refcount() is blocked on
 147                          * the compound_lock.
 148                          */
 149                         if (put_page_testzero(page_head))
 150                                 VM_BUG_ON(1);
 151                         /* __split_huge_page_refcount will wait now */
 152                         VM_BUG_ON(page_mapcount(page) <= 0);
 153                         atomic_dec(&page->_mapcount);
 154                         VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
 155                         VM_BUG_ON(atomic_read(&page->_count) != 0);
 156                         compound_unlock_irqrestore(page_head, flags);
 157
 158 skip_lock_tail:
 159                         if (put_page_testzero(page_head)) {
 160                                 if (PageHead(page_head))
 161                                         __put_compound_page(page_head);
 162                                 else
 163                                         __put_single_page(page_head);
 164                         }
 165                 } else {
 166                         /* page_head is a dangling pointer */
 167                         VM_BUG_ON(PageTail(page));
 168                         goto out_put_single;
 169                 }
 170         } else if (put_page_testzero(page)) {
 171                 if (PageHead(page))
 172                         __put_compound_page(page);
 173                 else
 174                         __put_single_page(page);
 175         }
 176 }
 177
 178 void put_page(struct page *page)
 179 {
 180         if (unlikely(PageCompound(page)))
 181                 put_compound_page(page);
 182         else if (put_page_testzero(page))
 183                 __put_single_page(page);
 184 }
 185 EXPORT_SYMBOL(put_page);
 186
 187 /*
 188  * This function is exported but must not be called by anything other
 189  * than get_page(). It implements the slow path of get_page().
 190  */
 191 bool __get_page_tail(struct page *page)
 192 {
 193         /*
 194          * This takes care of get_page() if run on a tail page
 195          * returned by one of the get_user_pages/follow_page variants.
 196          * get_user_pages/follow_page itself doesn't need the compound
 197          * lock because it runs __get_page_tail_foll() under the
 198          * proper PT lock that already serializes against
 199          * split_huge_page().
 200          */
 201         bool got = false;
 202         struct page *page_head;
 203
 204         /*
 205          * If this is a hugetlbfs page it cannot be split under us.  Simply
 206          * increment refcount for the head page.
 207          */
 208         if (PageHuge(page)) {
 209                 page_head = compound_head(page);
 210                 atomic_inc(&page_head->_count);
 211                 got = true;
 212         } else {
 213                 unsigned long flags;
 214
 215                 page_head = compound_trans_head(page);
 216                 if (likely(page != page_head &&
 217                                         get_page_unless_zero(page_head))) {
 218
 219                         /* Ref to put_compound_page() comment. */
 220                         if (PageSlab(page_head)) {
 221                                 if (likely(PageTail(page))) {
 222                                         __get_page_tail_foll(page, false);
 223                                         return true;
 224                                 } else {
 225                                         put_page(page_head);
 226                                         return false;
 227                                 }
 228                         }
 229
 230                         /*
 231                          * page_head wasn't a dangling pointer but it
 232                          * may not be a head page anymore by the time
 233                          * we obtain the lock. That is ok as long as it
 234                          * can't be freed from under us.
 235                          */
 236                         flags = compound_lock_irqsave(page_head);
 237                         /* here __split_huge_page_refcount won't run anymore */
 238                         if (likely(PageTail(page))) {
 239                                 __get_page_tail_foll(page, false);
 240                                 got = true;
 241                         }
 242                         compound_unlock_irqrestore(page_head, flags);
 243                         if (unlikely(!got))
 244                                 put_page(page_head);
 245                 }
 246         }
 247         return got;
 248 }
 249 EXPORT_SYMBOL(__get_page_tail);
 250
 251 /**
 252  * put_pages_list() - release a list of pages
 253  * @pages: list of pages threaded on page->lru
 254  *
 255  * Release a list of pages which are strung together on page.lru.  Currently
 256  * used by read_cache_pages() and related error recovery code.
 257  */
 258 void put_pages_list(struct list_head *pages)
 259 {
 260         while (!list_empty(pages)) {
 261                 struct page *victim;
 262
 263                 victim = list_entry(pages->prev, struct page, lru);
 264                 list_del(&victim->lru);
 265                 page_cache_release(victim);
 266         }
 267 }
 268 EXPORT_SYMBOL(put_pages_list);
 269
 270 /*
 271  * get_kernel_pages() - pin kernel pages in memory
 272  * @kiov:       An array of struct kvec structures
 273  * @nr_segs:    number of segments to pin
 274  * @write:      pinning for read/write, currently ignored
 275  * @pages:      array that receives pointers to the pages pinned.
 276  *              Should be at least nr_segs long.
 277  *
 278  * Returns number of pages pinned. This may be fewer than the number
 279  * requested. If nr_pages is 0 or negative, returns 0. If no pages
 280  * were pinned, returns -errno. Each page returned must be released
 281  * with a put_page() call when it is finished with.
 282  */
 283 int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
 284                 struct page **pages)
 285 {
 286         int seg;
 287
 288         for (seg = 0; seg < nr_segs; seg++) {
 289                 if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
 290                         return seg;
 291
 292                 pages[seg] = kmap_to_page(kiov[seg].iov_base);
 293                 page_cache_get(pages[seg]);
 294         }
 295
 296         return seg;
 297 }
 298 EXPORT_SYMBOL_GPL(get_kernel_pages);
 299
 300 /*
 301  * get_kernel_page() - pin a kernel page in memory
 302  * @start:      starting kernel address
 303  * @write:      pinning for read/write, currently ignored
 304  * @pages:      array that receives pointer to the page pinned.
 305  *              Must be at least nr_segs long.
 306  *
 307  * Returns 1 if page is pinned. If the page was not pinned, returns
 308  * -errno. The page returned must be released with a put_page() call
 309  * when it is finished with.
 310  */
 311 int get_kernel_page(unsigned long start, int write, struct page **pages)
 312 {
 313         const struct kvec kiov = {
 314                 .iov_base = (void *)start,
 315                 .iov_len = PAGE_SIZE
 316         };
 317
 318         return get_kernel_pages(&kiov, 1, write, pages);
 319 }
 320 EXPORT_SYMBOL_GPL(get_kernel_page);
 321
 322 static void pagevec_lru_move_fn(struct pagevec *pvec,
 323         void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
 324         void *arg)
 325 {
 326         int i;
 327         struct zone *zone = NULL;
 328         struct lruvec *lruvec;
 329         unsigned long flags = 0;
 330
 331         for (i = 0; i < pagevec_count(pvec); i++) {
 332                 struct page *page = pvec->pages[i];
 333                 struct zone *pagezone = page_zone(page);
 334
 335                 if (pagezone != zone) {
 336                         if (zone)
 337                                 spin_unlock_irqrestore(&zone->lru_lock, flags);
 338                         zone = pagezone;
 339                         spin_lock_irqsave(&zone->lru_lock, flags);
 340                 }
 341
 342                 lruvec = mem_cgroup_page_lruvec(page, zone);
 343                 (*move_fn)(page, lruvec, arg);
 344         }
 345         if (zone)
 346                 spin_unlock_irqrestore(&zone->lru_lock, flags);
 347         release_pages(pvec->pages, pvec->nr, pvec->cold);
 348         pagevec_reinit(pvec);
 349 }
 350
 351 static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
 352                                  void *arg)
 353 {
 354         int *pgmoved = arg;
 355
 356         if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
 357                 enum lru_list lru = page_lru_base_type(page);
 358                 list_move_tail(&page->lru, &lruvec->lists[lru]);
 359                 (*pgmoved)++;
 360         }
 361 }
 362
 363 /*
 364  * pagevec_move_tail() must be called with IRQ disabled.
 365  * Otherwise this may cause nasty races.
 366  */
 367 static void pagevec_move_tail(struct pagevec *pvec)
 368 {
 369         int pgmoved = 0;
 370
 371         pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
 372         __count_vm_events(PGROTATED, pgmoved);
 373 }
 374
 375 /*
 376  * Writeback is about to end against a page which has been marked for immediate
 377  * reclaim.  If it still appears to be reclaimable, move it to the tail of the
 378  * inactive list.
 379  */
 380 void rotate_reclaimable_page(struct page *page)
 381 {
 382         if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
 383             !PageUnevictable(page) && PageLRU(page)) {
 384                 struct pagevec *pvec;
 385                 unsigned long flags;
 386
 387                 page_cache_get(page);
 388                 local_irq_save(flags);
 389                 pvec = &__get_cpu_var(lru_rotate_pvecs);
 390                 if (!pagevec_add(pvec, page))
 391                         pagevec_move_tail(pvec);
 392                 local_irq_restore(flags);
 393         }
 394 }
 395
 396 static void update_page_reclaim_stat(struct lruvec *lruvec,
 397                                      int file, int rotated)
 398 {
 399         struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
 400
 401         reclaim_stat->recent_scanned[file]++;
 402         if (rotated)
 403                 reclaim_stat->recent_rotated[file]++;
 404 }
 405
 406 static void __activate_page(struct page *page, struct lruvec *lruvec,
 407                             void *arg)
 408 {
 409         if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
 410                 int file = page_is_file_cache(page);
 411                 int lru = page_lru_base_type(page);
 412
 413                 del_page_from_lru_list(page, lruvec, lru);
 414                 SetPageActive(page);
 415                 lru += LRU_ACTIVE;
 416                 add_page_to_lru_list(page, lruvec, lru);
 417                 trace_mm_lru_activate(page, page_to_pfn(page));
 418
 419                 __count_vm_event(PGACTIVATE);
 420                 update_page_reclaim_stat(lruvec, file, 1);
 421         }
 422 }
 423
 424 #ifdef CONFIG_SMP
 425 static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
 426
 427 static void activate_page_drain(int cpu)
 428 {
 429         struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);
 430
 431         if (pagevec_count(pvec))
 432                 pagevec_lru_move_fn(pvec, __activate_page, NULL);
 433 }
 434
 435 static bool need_activate_page_drain(int cpu)
 436 {
 437         return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
 438 }
 439
 440 void activate_page(struct page *page)
 441 {
 442         if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
 443                 struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
 444
 445                 page_cache_get(page);
 446                 if (!pagevec_add(pvec, page))
 447                         pagevec_lru_move_fn(pvec, __activate_page, NULL);
 448                 put_cpu_var(activate_page_pvecs);
 449         }
 450 }
 451
 452 #else
 453 static inline void activate_page_drain(int cpu)
 454 {
 455 }
 456
 457 static bool need_activate_page_drain(int cpu)
 458 {
 459         return false;
 460 }
 461
 462 void activate_page(struct page *page)
 463 {
 464         struct zone *zone = page_zone(page);
 465
 466         spin_lock_irq(&zone->lru_lock);
 467         __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL);
 468         spin_unlock_irq(&zone->lru_lock);
 469 }
 470 #endif
 471
 472 static void __lru_cache_activate_page(struct page *page)
 473 {
 474         struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
 475         int i;
 476
 477         /*
 478          * Search backwards on the optimistic assumption that the page being
 479          * activated has just been added to this pagevec. Note that only
 480          * the local pagevec is examined as a !PageLRU page could be in the
 481          * process of being released, reclaimed, migrated or on a remote
 482          * pagevec that is currently being drained. Furthermore, marking
 483          * a remote pagevec's page PageActive potentially hits a race where
 484          * a page is marked PageActive just after it is added to the inactive
 485          * list causing accounting errors and BUG_ON checks to trigger.
 486          */
 487         for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
 488                 struct page *pagevec_page = pvec->pages[i];
 489
 490                 if (pagevec_page == page) {
 491                         SetPageActive(page);
 492                         break;
 493                 }
 494         }
 495
 496         put_cpu_var(lru_add_pvec);
 497 }
 498
 499 /*
 500  * Mark a page as having seen activity.
 501  *
 502  * inactive,unreferenced        ->      inactive,referenced
 503  * inactive,referenced          ->      active,unreferenced
 504  * active,unreferenced          ->      active,referenced
 505  */
 506 void mark_page_accessed(struct page *page)
 507 {
 508         if (!PageActive(page) && !PageUnevictable(page) &&
 509                         PageReferenced(page)) {
 510
 511                 /*
 512                  * If the page is on the LRU, queue it for activation via
 513                  * activate_page_pvecs. Otherwise, assume the page is on a
 514                  * pagevec, mark it active and it'll be moved to the active
 515                  * LRU on the next drain.
 516                  */
 517                 if (PageLRU(page))
 518                         activate_page(page);
 519                 else
 520                         __lru_cache_activate_page(page);
 521                 ClearPageReferenced(page);
 522         } else if (!PageReferenced(page)) {
 523                 SetPageReferenced(page);
 524         }
 525 }
 526 EXPORT_SYMBOL(mark_page_accessed);
 527
 528 /*
 529  * Queue the page for addition to the LRU via pagevec. The decision on whether
 530  * to add the page to the [in]active [file|anon] list is deferred until the
 531  * pagevec is drained. This gives a chance for the caller of __lru_cache_add()
 532  * have the page added to the active list using mark_page_accessed().
 533  */
 534 void __lru_cache_add(struct page *page)
 535 {
 536         struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
 537
 538         page_cache_get(page);
 539         if (!pagevec_space(pvec))
 540                 __pagevec_lru_add(pvec);
 541         pagevec_add(pvec, page);
 542         put_cpu_var(lru_add_pvec);
 543 }
 544 EXPORT_SYMBOL(__lru_cache_add);
 545
 546 /**
 547  * lru_cache_add - add a page to a page list
 548  * @page: the page to be added to the LRU.
 549  */
 550 void lru_cache_add(struct page *page)
 551 {
 552         VM_BUG_ON(PageActive(page) && PageUnevictable(page));
 553         VM_BUG_ON(PageLRU(page));
 554         __lru_cache_add(page);
 555 }
 556
 557 /**
 558  * add_page_to_unevictable_list - add a page to the unevictable list
 559  * @page:  the page to be added to the unevictable list
 560  *
 561  * Add page directly to its zone's unevictable list.  To avoid races with
 562  * tasks that might be making the page evictable, through eg. munlock,
 563  * munmap or exit, while it's not on the lru, we want to add the page
 564  * while it's locked or otherwise "invisible" to other tasks.  This is
 565  * difficult to do when using the pagevec cache, so bypass that.
 566  */
 567 void add_page_to_unevictable_list(struct page *page)
 568 {
 569         struct zone *zone = page_zone(page);
 570         struct lruvec *lruvec;
 571
 572         spin_lock_irq(&zone->lru_lock);
 573         lruvec = mem_cgroup_page_lruvec(page, zone);
 574         ClearPageActive(page);
 575         SetPageUnevictable(page);
 576         SetPageLRU(page);
 577         add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
 578         spin_unlock_irq(&zone->lru_lock);
 579 }
 580
 581 /*
 582  * If the page can not be invalidated, it is moved to the
 583  * inactive list to speed up its reclaim.  It is moved to the
 584  * head of the list, rather than the tail, to give the flusher
 585  * threads some time to write it out, as this is much more
 586  * effective than the single-page writeout from reclaim.
 587  *
 588  * If the page isn't page_mapped and dirty/writeback, the page
 589  * could reclaim asap using PG_reclaim.
 590  *
 591  * 1. active, mapped page -> none
 592  * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
 593  * 3. inactive, mapped page -> none
 594  * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
 595  * 5. inactive, clean -> inactive, tail
 596  * 6. Others -> none
 597  *
 598  * In 4, why it moves inactive's head, the VM expects the page would
 599  * be write it out by flusher threads as this is much more effective
 600  * than the single-page writeout from reclaim.
 601  */
 602 static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
 603                               void *arg)
 604 {
 605         int lru, file;
 606         bool active;
 607
 608         if (!PageLRU(page))
 609                 return;
 610
 611         if (PageUnevictable(page))
 612                 return;
 613
 614         /* Some processes are using the page */
 615         if (page_mapped(page))
 616                 return;
 617
 618         active = PageActive(page);
 619         file = page_is_file_cache(page);
 620         lru = page_lru_base_type(page);
 621
 622         del_page_from_lru_list(page, lruvec, lru + active);
 623         ClearPageActive(page);
 624         ClearPageReferenced(page);
 625         add_page_to_lru_list(page, lruvec, lru);
 626
 627         if (PageWriteback(page) || PageDirty(page)) {
 628                 /*
 629                  * PG_reclaim could be raced with end_page_writeback
 630                  * It can make readahead confusing.  But race window
 631                  * is _really_ small and  it's non-critical problem.
 632                  */
 633                 SetPageReclaim(page);
 634         } else {
 635                 /*
 636                  * The page's writeback ends up during pagevec
 637                  * We moves tha page into tail of inactive.
 638                  */
 639                 list_move_tail(&page->lru, &lruvec->lists[lru]);
 640                 __count_vm_event(PGROTATED);
 641         }
 642
 643         if (active)
 644                 __count_vm_event(PGDEACTIVATE);
 645         update_page_reclaim_stat(lruvec, file, 0);
 646 }
 647
 648 /*
 649  * Drain pages out of the cpu's pagevecs.
 650  * Either "cpu" is the current CPU, and preemption has already been
 651  * disabled; or "cpu" is being hot-unplugged, and is already dead.
 652  */
 653 void lru_add_drain_cpu(int cpu)
 654 {
 655         struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu);
 656
 657         if (pagevec_count(pvec))
 658                 __pagevec_lru_add(pvec);
 659
 660         pvec = &per_cpu(lru_rotate_pvecs, cpu);
 661         if (pagevec_count(pvec)) {
 662                 unsigned long flags;
 663
 664                 /* No harm done if a racing interrupt already did this */
 665                 local_irq_save(flags);
 666                 pagevec_move_tail(pvec);
 667                 local_irq_restore(flags);
 668         }
 669
 670         pvec = &per_cpu(lru_deactivate_pvecs, cpu);
 671         if (pagevec_count(pvec))
 672                 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
 673
 674         activate_page_drain(cpu);
 675 }
 676
 677 /**
 678  * deactivate_page - forcefully deactivate a page
 679  * @page: page to deactivate
 680  *
 681  * This function hints the VM that @page is a good reclaim candidate,
 682  * for example if its invalidation fails due to the page being dirty
 683  * or under writeback.
 684  */
 685 void deactivate_page(struct page *page)
 686 {
 687         /*
 688          * In a workload with many unevictable page such as mprotect, unevictable
 689          * page deactivation for accelerating reclaim is pointless.
 690          */
 691         if (PageUnevictable(page))
 692                 return;
 693
 694         if (likely(get_page_unless_zero(page))) {
 695                 struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
 696
 697                 if (!pagevec_add(pvec, page))
 698                         pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
 699                 put_cpu_var(lru_deactivate_pvecs);
 700         }
 701 }
 702
 703 void lru_add_drain(void)
 704 {
 705         lru_add_drain_cpu(get_cpu());
 706         put_cpu();
 707 }
 708
 709 static void lru_add_drain_per_cpu(struct work_struct *dummy)
 710 {
 711         lru_add_drain();
 712 }
 713
 714 static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
 715
 716 void lru_add_drain_all(void)
 717 {
 718         static DEFINE_MUTEX(lock);
 719         static struct cpumask has_work;
 720         int cpu;
 721
 722         mutex_lock(&lock);
 723         get_online_cpus();
 724         cpumask_clear(&has_work);
 725
 726         for_each_online_cpu(cpu) {
 727                 struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
 728
 729                 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
 730                     pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
 731                     pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
 732                     need_activate_page_drain(cpu)) {
 733                         INIT_WORK(work, lru_add_drain_per_cpu);
 734                         schedule_work_on(cpu, work);
 735                         cpumask_set_cpu(cpu, &has_work);
 736                 }
 737         }
 738
 739         for_each_cpu(cpu, &has_work)
 740                 flush_work(&per_cpu(lru_add_drain_work, cpu));
 741
 742         put_online_cpus();
 743         mutex_unlock(&lock);
 744 }
 745
 746 /*
 747  * Batched page_cache_release().  Decrement the reference count on all the
 748  * passed pages.  If it fell to zero then remove the page from the LRU and
 749  * free it.
 750  *
 751  * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
 752  * for the remainder of the operation.
 753  *
 754  * The locking in this function is against shrink_inactive_list(): we recheck
 755  * the page count inside the lock to see whether shrink_inactive_list()
 756  * grabbed the page via the LRU.  If it did, give up: shrink_inactive_list()
 757  * will free it.
 758  */
 759 void release_pages(struct page **pages, int nr, int cold)
 760 {
 761         int i;
 762         LIST_HEAD(pages_to_free);
 763         struct zone *zone = NULL;
 764         struct lruvec *lruvec;
 765         unsigned long uninitialized_var(flags);
 766
 767         for (i = 0; i < nr; i++) {
 768                 struct page *page = pages[i];
 769
 770                 if (unlikely(PageCompound(page))) {
 771                         if (zone) {
 772                                 spin_unlock_irqrestore(&zone->lru_lock, flags);
 773                                 zone = NULL;
 774                         }
 775                         put_compound_page(page);
 776                         continue;
 777                 }
 778
 779                 if (!put_page_testzero(page))
 780                         continue;
 781
 782                 if (PageLRU(page)) {
 783                         struct zone *pagezone = page_zone(page);
 784
 785                         if (pagezone != zone) {
 786                                 if (zone)
 787                                         spin_unlock_irqrestore(&zone->lru_lock,
 788                                                                         flags);
 789                                 zone = pagezone;
 790                                 spin_lock_irqsave(&zone->lru_lock, flags);
 791                         }
 792
 793                         lruvec = mem_cgroup_page_lruvec(page, zone);
 794                         VM_BUG_ON(!PageLRU(page));
 795                         __ClearPageLRU(page);
 796                         del_page_from_lru_list(page, lruvec, page_off_lru(page));
 797                 }
 798
 799                 /* Clear Active bit in case of parallel mark_page_accessed */
 800                 ClearPageActive(page);
 801
 802                 list_add(&page->lru, &pages_to_free);
 803         }
 804         if (zone)
 805                 spin_unlock_irqrestore(&zone->lru_lock, flags);
 806
 807         free_hot_cold_page_list(&pages_to_free, cold);
 808 }
 809 EXPORT_SYMBOL(release_pages);
 810
 811 /*
 812  * The pages which we're about to release may be in the deferred lru-addition
 813  * queues.  That would prevent them from really being freed right now.  That's
 814  * OK from a correctness point of view but is inefficient - those pages may be
 815  * cache-warm and we want to give them back to the page allocator ASAP.
 816  *
 817  * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
 818  * and __pagevec_lru_add_active() call release_pages() directly to avoid
 819  * mutual recursion.
 820  */
 821 void __pagevec_release(struct pagevec *pvec)
 822 {
 823         lru_add_drain();
 824         release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
 825         pagevec_reinit(pvec);
 826 }
 827 EXPORT_SYMBOL(__pagevec_release);
 828
 829 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 830 /* used by __split_huge_page_refcount() */
 831 void lru_add_page_tail(struct page *page, struct page *page_tail,
 832                        struct lruvec *lruvec, struct list_head *list)
 833 {
 834         const int file = 0;
 835
 836         VM_BUG_ON(!PageHead(page));
 837         VM_BUG_ON(PageCompound(page_tail));
 838         VM_BUG_ON(PageLRU(page_tail));
 839         VM_BUG_ON(NR_CPUS != 1 &&
 840                   !spin_is_locked(&lruvec_zone(lruvec)->lru_lock));
 841
 842         if (!list)
 843                 SetPageLRU(page_tail);
 844
 845         if (likely(PageLRU(page)))
 846                 list_add_tail(&page_tail->lru, &page->lru);
 847         else if (list) {
 848                 /* page reclaim is reclaiming a huge page */
 849                 get_page(page_tail);
 850                 list_add_tail(&page_tail->lru, list);
 851         } else {
 852                 struct list_head *list_head;
 853                 /*
 854                  * Head page has not yet been counted, as an hpage,
 855                  * so we must account for each subpage individually.
 856                  *
 857                  * Use the standard add function to put page_tail on the list,
 858                  * but then correct its position so they all end up in order.
 859                  */
 860                 add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail));
 861                 list_head = page_tail->lru.prev;
 862                 list_move_tail(&page_tail->lru, list_head);
 863         }
 864
 865         if (!PageUnevictable(page))
 866                 update_page_reclaim_stat(lruvec, file, PageActive(page_tail));
 867 }
 868 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 869
 870 static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
 871                                  void *arg)
 872 {
 873         int file = page_is_file_cache(page);
 874         int active = PageActive(page);
 875         enum lru_list lru = page_lru(page);
 876
 877         VM_BUG_ON(PageLRU(page));
 878
 879         SetPageLRU(page);
 880         add_page_to_lru_list(page, lruvec, lru);
 881         update_page_reclaim_stat(lruvec, file, active);
 882         trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page));
 883 }
 884
 885 /*
 886  * Add the passed pages to the LRU, then drop the caller's refcount
 887  * on them.  Reinitialises the caller's pagevec.
 888  */
 889 void __pagevec_lru_add(struct pagevec *pvec)
 890 {
 891         pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
 892 }
 893 EXPORT_SYMBOL(__pagevec_lru_add);
 894
 895 /**
 896  * pagevec_lookup - gang pagecache lookup
 897  * @pvec:       Where the resulting pages are placed
 898  * @mapping:    The address_space to search
 899  * @start:      The starting page index
 900  * @nr_pages:   The maximum number of pages
 901  *
 902  * pagevec_lookup() will search for and return a group of up to @nr_pages pages
 903  * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
 904  * reference against the pages in @pvec.
 905  *
 906  * The search returns a group of mapping-contiguous pages with ascending
 907  * indexes.  There may be holes in the indices due to not-present pages.
 908  *
 909  * pagevec_lookup() returns the number of pages which were found.
 910  */
 911 unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
 912                 pgoff_t start, unsigned nr_pages)
 913 {
 914         pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
 915         return pagevec_count(pvec);
 916 }
 917 EXPORT_SYMBOL(pagevec_lookup);
 918
 919 unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
 920                 pgoff_t *index, int tag, unsigned nr_pages)
 921 {
 922         pvec->nr = find_get_pages_tag(mapping, index, tag,
 923                                         nr_pages, pvec->pages);
 924         return pagevec_count(pvec);
 925 }
 926 EXPORT_SYMBOL(pagevec_lookup_tag);
 927
 928 /*
 929  * Perform any setup for the swap system
 930  */
 931 void __init swap_setup(void)
 932 {
 933         unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
 934 #ifdef CONFIG_SWAP
 935         int i;
 936
 937         bdi_init(swapper_spaces[0].backing_dev_info);
 938         for (i = 0; i < MAX_SWAPFILES; i++) {
 939                 spin_lock_init(&swapper_spaces[i].tree_lock);
 940                 INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
 941         }
 942 #endif
 943
 944         /* Use a smaller cluster for small-memory machines */
 945         if (megs < 16)
 946                 page_cluster = 2;
 947         else
 948                 page_cluster = 3;
 949         /*
 950          * Right now other parts of the system means that we
 951          * _really_ don't want to cluster much more
 952          */
 953 }