/*
 *  linux/mm/swap.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/sysctl/vm.txt.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm_inline.h>
#include <linux/buffer_head.h>	/* for try_to_release_page() */
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>
#include <linux/interrupt.h>

#include "internal.h"

/* How many pages do we try to swap or page in/out together? */
int page_cluster;

/*
 * On PREEMPT_RT we don't want to disable preemption for cpu variables.
 * We grab a cpu and then use that cpu to lock the variables accordingly.
 *
 * (On !PREEMPT_RT this turns into normal preempt-off sections, as before.)
 */
static DEFINE_PER_CPU_LOCKED(struct pagevec[NR_LRU_LISTS], lru_add_pvecs);
static DEFINE_PER_CPU_LOCKED(struct pagevec, lru_rotate_pvecs);

#define swap_get_cpu_var_irq_save(var, flags, cpu)	\
	({						\
		(void)flags;				\
		&get_cpu_var_locked(var, &cpu);		\
	})

#define swap_put_cpu_var_irq_restore(var, flags, cpu)	\
	put_cpu_var_locked(var, cpu)

#define swap_get_cpu_var(var, cpu)			\
	&get_cpu_var_locked(var, &cpu)

#define swap_put_cpu_var(var, cpu)			\
	put_cpu_var_locked(var, cpu)

#define swap_per_cpu_lock(var, cpu)			\
	({						\
		spin_lock(&__get_cpu_lock(var, cpu));	\
		&__get_cpu_var_locked(var, cpu);	\
	})

#define swap_per_cpu_unlock(var, cpu)			\
	spin_unlock(&__get_cpu_lock(var, cpu))

#define swap_get_cpu()		raw_smp_processor_id()

#define swap_put_cpu()

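/*
 * Illustrative sketch, not part of the original file: the intended usage
 * pattern for the locked per-cpu accessors above.  The helper name
 * example_queue_on_this_cpu() is hypothetical; swap_get_cpu_var() returns
 * this cpu's copy with its per-cpu lock held, swap_put_cpu_var() drops it.
 */
#if 0
static void example_queue_on_this_cpu(struct page *page)
{
	struct pagevec *pvec;
	int cpu;

	pvec = swap_get_cpu_var(lru_rotate_pvecs, cpu);	/* lock + fetch */
	pagevec_add(pvec, page);			/* safe: lock is held */
	swap_put_cpu_var(lru_rotate_pvecs, cpu);	/* release */
}
#endif
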
/*
 * This path almost never happens for VM activity - pages are normally
 * freed via pagevecs.  But it gets used by networking.
 */
static void __page_cache_release(struct page *page)
{
	if (PageLRU(page)) {
		unsigned long flags;
		struct zone *zone = page_zone(page);

		spin_lock_irqsave(&zone->lru_lock, flags);
		VM_BUG_ON(!PageLRU(page));
		__ClearPageLRU(page);
		del_page_from_lru(zone, page);
		spin_unlock_irqrestore(&zone->lru_lock, flags);
	}
	free_hot_page(page);
}

static void put_compound_page(struct page *page)
{
	page = compound_head(page);
	if (put_page_testzero(page)) {
		compound_page_dtor *dtor;

		dtor = get_compound_page_dtor(page);
		(*dtor)(page);
	}
}

void put_page(struct page *page)
{
	if (unlikely(PageCompound(page)))
		put_compound_page(page);
	else if (put_page_testzero(page))
		__page_cache_release(page);
}
EXPORT_SYMBOL(put_page);

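/*
 * Illustrative sketch, not part of the original file: a caller that takes
 * its own reference must balance it with put_page().  The helper name
 * example_inspect_page() is hypothetical.
 */
#if 0
static void example_inspect_page(struct page *page)
{
	get_page(page);		/* pin the page while we look at it */
	/* ... read page state, e.g. PageDirty(page) ... */
	put_page(page);		/* may free the page if this was the last ref */
}
#endif
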
/**
 * put_pages_list() - release a list of pages
 * @pages: list of pages threaded on page->lru
 *
 * Release a list of pages which are strung together on page.lru.  Currently
 * used by read_cache_pages() and related error recovery code.
 */
void put_pages_list(struct list_head *pages)
{
	while (!list_empty(pages)) {
		struct page *victim;

		victim = list_entry(pages->prev, struct page, lru);
		list_del(&victim->lru);
		page_cache_release(victim);
	}
}
EXPORT_SYMBOL(put_pages_list);

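/*
 * Illustrative sketch, not part of the original file: callers thread pages
 * together on page->lru and hand the whole list over.  The helper name
 * example_drop_list() is hypothetical.
 */
#if 0
static void example_drop_list(void)
{
	LIST_HEAD(pages);
	struct page *page = alloc_page(GFP_KERNEL);

	if (page)
		list_add(&page->lru, &pages);
	put_pages_list(&pages);	/* drops one reference on every page listed */
}
#endif
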
/*
 * pagevec_move_tail() must be called with IRQ disabled.
 * Otherwise this may cause nasty races.
 */
static void pagevec_move_tail(struct pagevec *pvec)
{
	int i;
	int pgmoved = 0;
	struct zone *zone = NULL;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);

		if (pagezone != zone) {
			if (zone)
				spin_unlock(&zone->lru_lock);
			zone = pagezone;
			spin_lock(&zone->lru_lock);
		}
		if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
			int lru = page_is_file_cache(page);
			list_move_tail(&page->lru, &zone->lru[lru].list);
			pgmoved++;
		}
	}
	if (zone)
		spin_unlock(&zone->lru_lock);
	__count_vm_events(PGROTATED, pgmoved);
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}

/*
 * Writeback is about to end against a page which has been marked for immediate
 * reclaim.  If it still appears to be reclaimable, move it to the tail of the
 * inactive list.
 */
void rotate_reclaimable_page(struct page *page)
{
	if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
	    !PageUnevictable(page) && PageLRU(page)) {
		struct pagevec *pvec;
		unsigned long flags;
		int cpu;

		page_cache_get(page);
		pvec = swap_get_cpu_var_irq_save(lru_rotate_pvecs, flags, cpu);
		if (!pagevec_add(pvec, page))
			pagevec_move_tail(pvec);
		swap_put_cpu_var_irq_restore(lru_rotate_pvecs, flags, cpu);
	}
}

static void update_page_reclaim_stat(struct zone *zone, struct page *page,
				     int file, int rotated)
{
	struct zone_reclaim_stat *reclaim_stat = &zone->reclaim_stat;
	struct zone_reclaim_stat *memcg_reclaim_stat;

	memcg_reclaim_stat = mem_cgroup_get_reclaim_stat_from_page(page);

	reclaim_stat->recent_scanned[file]++;
	if (rotated)
		reclaim_stat->recent_rotated[file]++;

	if (!memcg_reclaim_stat)
		return;

	memcg_reclaim_stat->recent_scanned[file]++;
	if (rotated)
		memcg_reclaim_stat->recent_rotated[file]++;
}

/*
 * FIXME: speed this up?
 */
void activate_page(struct page *page)
{
	struct zone *zone = page_zone(page);

	spin_lock_irq(&zone->lru_lock);
	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		int file = page_is_file_cache(page);
		int lru = LRU_BASE + file;
		del_page_from_lru_list(zone, page, lru);

		SetPageActive(page);
		lru += LRU_ACTIVE;
		add_page_to_lru_list(zone, page, lru);
		__count_vm_event(PGACTIVATE);

		update_page_reclaim_stat(zone, page, !!file, 1);
	}
	spin_unlock_irq(&zone->lru_lock);
}

/*
 * Mark a page as having seen activity.
 *
 * inactive,unreferenced	->	inactive,referenced
 * inactive,referenced		->	active,unreferenced
 * active,unreferenced		->	active,referenced
 */
void mark_page_accessed(struct page *page)
{
	if (!PageActive(page) && !PageUnevictable(page) &&
			PageReferenced(page) && PageLRU(page)) {
		activate_page(page);
		ClearPageReferenced(page);
	} else if (!PageReferenced(page)) {
		SetPageReferenced(page);
	}
}
EXPORT_SYMBOL(mark_page_accessed);

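/*
 * Illustrative sketch, not part of the original file: two accesses in a row
 * walk an inactive page through the state machine documented above.  The
 * helper name example_touch_twice() is hypothetical.
 */
#if 0
static void example_touch_twice(struct page *page)
{
	mark_page_accessed(page); /* inactive,unreferenced -> inactive,referenced */
	mark_page_accessed(page); /* inactive,referenced -> active,unreferenced */
}
#endif
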
void __lru_cache_add(struct page *page, enum lru_list lru)
{
	struct pagevec *pvec;
	int cpu;

	pvec = swap_get_cpu_var(lru_add_pvecs, cpu)[lru];
	page_cache_get(page);
	if (!pagevec_add(pvec, page))
		____pagevec_lru_add(pvec, lru);
	swap_put_cpu_var(lru_add_pvecs, cpu);
}

/**
 * lru_cache_add_lru - add a page to a page list
 * @page: the page to be added to the LRU.
 * @lru: the LRU list to which the page is added.
 */
void lru_cache_add_lru(struct page *page, enum lru_list lru)
{
	if (PageActive(page)) {
		VM_BUG_ON(PageUnevictable(page));
		ClearPageActive(page);
	} else if (PageUnevictable(page)) {
		VM_BUG_ON(PageActive(page));
		ClearPageUnevictable(page);
	}

	VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page));
	__lru_cache_add(page, lru);
}

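/*
 * Illustrative sketch, not part of the original file: a freshly allocated
 * page-cache page would typically be queued for the inactive file LRU.  The
 * helper name example_add_file_page() is hypothetical.
 */
#if 0
static void example_add_file_page(struct page *page)
{
	/* page is not yet on any LRU; the pagevec adds it in a batch later */
	lru_cache_add_lru(page, LRU_INACTIVE_FILE);
}
#endif
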
/**
 * add_page_to_unevictable_list - add a page to the unevictable list
 * @page:  the page to be added to the unevictable list
 *
 * Add page directly to its zone's unevictable list.  To avoid races with
 * tasks that might be making the page evictable, through eg. munlock,
 * munmap or exit, while it's not on the lru, we want to add the page
 * while it's locked or otherwise "invisible" to other tasks.  This is
 * difficult to do when using the pagevec cache, so bypass that.
 */
void add_page_to_unevictable_list(struct page *page)
{
	struct zone *zone = page_zone(page);

	spin_lock_irq(&zone->lru_lock);
	SetPageUnevictable(page);
	SetPageLRU(page);
	add_page_to_lru_list(zone, page, LRU_UNEVICTABLE);
	spin_unlock_irq(&zone->lru_lock);
}

/*
 * Drain pages out of the cpu's pagevecs.
 * Either "cpu" is the current CPU, and preemption has already been
 * disabled; or "cpu" is being hot-unplugged, and is already dead.
 */
static void drain_cpu_pagevecs(int cpu)
{
	struct pagevec *pvecs, *pvec;
	int lru;

	pvecs = swap_per_cpu_lock(lru_add_pvecs, cpu)[0];
	for_each_lru(lru) {
		pvec = &pvecs[lru - LRU_BASE];
		if (pagevec_count(pvec))
			____pagevec_lru_add(pvec, lru);
	}
	swap_per_cpu_unlock(lru_add_pvecs, cpu);

	pvec = swap_per_cpu_lock(lru_rotate_pvecs, cpu);
	if (pagevec_count(pvec)) {
		unsigned long flags;

		/* No harm done if a racing interrupt already did this */
		local_irq_save_nort(flags);
		pagevec_move_tail(pvec);
		local_irq_restore_nort(flags);
	}
	swap_per_cpu_unlock(lru_rotate_pvecs, cpu);
}

void lru_add_drain(void)
{
	int cpu;

	cpu = swap_get_cpu();
	drain_cpu_pagevecs(cpu);
	swap_put_cpu();
}

static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
	lru_add_drain();
}

/*
 * Returns 0 for success
 */
int lru_add_drain_all(void)
{
	return schedule_on_each_cpu(lru_add_drain_per_cpu);
}

/*
 * Batched page_cache_release().  Decrement the reference count on all the
 * passed pages.  If it fell to zero then remove the page from the LRU and
 * free it.
 *
 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
 * for the remainder of the operation.
 *
 * The locking in this function is against shrink_inactive_list(): we recheck
 * the page count inside the lock to see whether shrink_inactive_list()
 * grabbed the page via the LRU.  If it did, give up: shrink_inactive_list()
 * will free it.
 */
void release_pages(struct page **pages, int nr, int cold)
{
	int i;
	struct pagevec pages_to_free;
	struct zone *zone = NULL;
	unsigned long uninitialized_var(flags);

	pagevec_init(&pages_to_free, cold);
	for (i = 0; i < nr; i++) {
		struct page *page = pages[i];

		if (unlikely(PageCompound(page))) {
			if (zone) {
				spin_unlock_irqrestore(&zone->lru_lock, flags);
				zone = NULL;
			}
			put_compound_page(page);
			continue;
		}

		if (!put_page_testzero(page))
			continue;

		if (PageLRU(page)) {
			struct zone *pagezone = page_zone(page);

			if (pagezone != zone) {
				if (zone)
					spin_unlock_irqrestore(&zone->lru_lock,
									flags);
				zone = pagezone;
				spin_lock_irqsave(&zone->lru_lock, flags);
			}
			VM_BUG_ON(!PageLRU(page));
			__ClearPageLRU(page);
			del_page_from_lru(zone, page);
		}

		if (!pagevec_add(&pages_to_free, page)) {
			if (zone) {
				spin_unlock_irqrestore(&zone->lru_lock, flags);
				zone = NULL;
			}
			__pagevec_free(&pages_to_free);
			pagevec_reinit(&pages_to_free);
		}
	}
	if (zone)
		spin_unlock_irqrestore(&zone->lru_lock, flags);

	pagevec_free(&pages_to_free);
}

/*
 * The pages which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
 * OK from a correctness point of view but is inefficient - those pages may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
 * and __pagevec_lru_add_active() call release_pages() directly to avoid
 * mutual recursion.
 */
void __pagevec_release(struct pagevec *pvec)
{
	lru_add_drain();
	release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
	pagevec_reinit(pvec);
}

EXPORT_SYMBOL(__pagevec_release);

/*
 * Add the passed pages to the LRU, then drop the caller's refcount
 * on them.  Reinitialises the caller's pagevec.
 */
void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
{
	int i;
	struct zone *zone = NULL;

	VM_BUG_ON(is_unevictable_lru(lru));

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);
		int file;
		int active;

		if (pagezone != zone) {
			if (zone)
				spin_unlock_irq(&zone->lru_lock);
			zone = pagezone;
			spin_lock_irq(&zone->lru_lock);
		}
		VM_BUG_ON(PageActive(page));
		VM_BUG_ON(PageUnevictable(page));
		VM_BUG_ON(PageLRU(page));
		SetPageLRU(page);
		active = is_active_lru(lru);
		file = is_file_lru(lru);
		if (active)
			SetPageActive(page);
		update_page_reclaim_stat(zone, page, file, active);
		add_page_to_lru_list(zone, page, lru);
	}
	if (zone)
		spin_unlock_irq(&zone->lru_lock);
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}

EXPORT_SYMBOL(____pagevec_lru_add);

/*
 * Try to drop buffers from the pages in a pagevec
 */
void pagevec_strip(struct pagevec *pvec)
{
	int i;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];

		if (PagePrivate(page) && trylock_page(page)) {
			if (PagePrivate(page))
				try_to_release_page(page, 0);
			unlock_page(page);
		}
	}
}

/**
 * pagevec_swap_free - try to free swap space from the pages in a pagevec
 * @pvec: pagevec with swapcache pages to free the swap space of
 *
 * The caller needs to hold an extra reference to each page and
 * not hold the page lock on the pages.  This function uses a
 * trylock on the page lock so it may not always free the swap
 * space associated with a page.
 */
void pagevec_swap_free(struct pagevec *pvec)
{
	int i;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];

		if (PageSwapCache(page) && trylock_page(page)) {
			try_to_free_swap(page);
			unlock_page(page);
		}
	}
}

/**
 * pagevec_lookup - gang pagecache lookup
 * @pvec:	Where the resulting pages are placed
 * @mapping:	The address_space to search
 * @start:	The starting page index
 * @nr_pages:	The maximum number of pages
 *
 * pagevec_lookup() will search for and return a group of up to @nr_pages pages
 * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
 * reference against the pages in @pvec.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * pagevec_lookup() returns the number of pages which were found.
 */
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t start, unsigned nr_pages)
{
	pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
	return pagevec_count(pvec);
}

EXPORT_SYMBOL(pagevec_lookup);

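/*
 * Illustrative sketch, not part of the original file: walking a mapping in
 * PAGEVEC_SIZE batches with pagevec_lookup().  The helper name
 * example_walk_mapping() is hypothetical.
 */
#if 0
static void example_walk_mapping(struct address_space *mapping)
{
	struct pagevec pvec;
	pgoff_t index = 0;

	pagevec_init(&pvec, 0);
	while (pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE)) {
		int i;

		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			index = page->index + 1;	/* resume after this page */
			/* ... examine the page ... */
		}
		pagevec_release(&pvec);	/* drop the references taken by lookup */
	}
}
#endif
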
unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t *index, int tag, unsigned nr_pages)
{
	pvec->nr = find_get_pages_tag(mapping, index, tag,
					nr_pages, pvec->pages);
	return pagevec_count(pvec);
}

EXPORT_SYMBOL(pagevec_lookup_tag);

/*
 * Perform any setup for the swap system
 */
void __init swap_setup(void)
{
	unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);

#ifdef CONFIG_SWAP
	bdi_init(swapper_space.backing_dev_info);
#endif

	/* Use a smaller cluster for small-memory machines */
	if (megs < 16)
		page_cluster = 2;
	else
		page_cluster = 3;
	/*
	 * Right now other parts of the system mean that we
	 * _really_ don't want to cluster much more.
	 */
}
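
/*
 * Illustrative arithmetic, not part of the original file: with 4 KiB pages
 * PAGE_SHIFT is 12, so num_physpages >> (20 - 12) converts a page count
 * into megabytes.  A 128 MB machine therefore gets page_cluster = 3,
 * i.e. 1 << 3 = 8 pages swapped in or out together; below 16 MB the
 * cluster shrinks to 1 << 2 = 4 pages.
 */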