mm/swap.c

   1 /*
   2  *  linux/mm/swap.c
   3  *
   4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5  */
   6
   7 /*
   8  * This file contains the default values for the operation of the
   9  * Linux VM subsystem. Fine-tuning documentation can be found in
  10  * linux/Documentation/sysctl/vm.txt.
  11  * Started 18.12.91
  12  * Swap aging added 23.2.95, Stephen Tweedie.
  13  * Buffermem limits added 12.3.98, Rik van Riel.
  14  */
  15
  16 #include <linux/mm.h>
  17 #include <linux/kernel_stat.h>
  18 #include <linux/swap.h>
  19 #include <linux/mman.h>
  20 #include <linux/pagemap.h>
  21 #include <linux/pagevec.h>
  22 #include <linux/init.h>
  23 #include <linux/module.h>
  24 #include <linux/mm_inline.h>
  25 #include <linux/buffer_head.h>  /* for try_to_release_page() */
  26 #include <linux/percpu.h>
  27
/*
 * How many pages do we try to swap or page in/out together?
 *
 * Assigned once by swap_setup(): 2 when fewer than 16 megabytes are derived
 * from num_physpages, 3 otherwise.
 * NOTE(review): such small values suggest this is an exponent rather than a
 * literal page count - confirm against the users of page_cluster.
 */
int page_cluster;
  30
  31 /*
  32  * Writeback is about to end against a page which has been marked for immediate
  33  * reclaim.  If it still appears to be reclaimable, move it to the tail of the
  34  * inactive list.  The page still has PageWriteback set, which will pin it.
  35  *
  36  * We don't expect many pages to come through here, so don't bother batching
  37  * things up.
  38  *
  39  * To avoid placing the page at the tail of the LRU while PG_writeback is still
  40  * set, this function will clear PG_writeback before performing the page
  41  * motion.  Do that inside the lru lock because once PG_writeback is cleared
  42  * we may not touch the page.
  43  *
  44  * Returns zero if it cleared PG_writeback.
  45  */
  46 int rotate_reclaimable_page(struct page *page)
  47 {
  48         struct zone *zone;
  49         unsigned long flags;
  50
  51         if (PageLocked(page))
  52                 return 1;
  53         if (PageDirty(page))
  54                 return 1;
  55         if (PageActive(page))
  56                 return 1;
  57         if (!PageLRU(page))
  58                 return 1;
  59
  60         zone = page_zone(page);
  61         spin_lock_irqsave(&zone->lru_lock, flags);
  62         if (PageLRU(page) && !PageActive(page)) {
  63                 list_del(&page->lru);
  64                 list_add_tail(&page->lru, &zone->inactive_list);
  65                 inc_page_state(pgrotated);
  66         }
  67         if (!TestClearPageWriteback(page))
  68                 BUG();
  69         spin_unlock_irqrestore(&zone->lru_lock, flags);
  70         return 0;
  71 }
  72
  73 /*
  74  * FIXME: speed this up?
  75  */
  76 void activate_page(struct page *page)
  77 {
  78         struct zone *zone = page_zone(page);
  79
  80         spin_lock_irq(&zone->lru_lock);
  81         if (PageLRU(page) && !PageActive(page)) {
  82                 del_page_from_inactive_list(zone, page);
  83                 SetPageActive(page);
  84                 add_page_to_active_list(zone, page);
  85                 inc_page_state(pgactivate);
  86         }
  87         spin_unlock_irq(&zone->lru_lock);
  88 }
  89
  90 /*
  91  * Mark a page as having seen activity.
  92  *
  93  * inactive,unreferenced        ->      inactive,referenced
  94  * inactive,referenced          ->      active,unreferenced
  95  * active,unreferenced          ->      active,referenced
  96  */
  97 void mark_page_accessed(struct page *page)
  98 {
  99         if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
 100                 activate_page(page);
 101                 ClearPageReferenced(page);
 102         } else if (!PageReferenced(page)) {
 103                 SetPageReferenced(page);
 104         }
 105 }
 106
 107 /**
 108  * lru_cache_add: add a page to the page lists
 109  * @page: the page to add
 110  */
 111 static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
 112 static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
 113
 114 void lru_cache_add(struct page *page)
 115 {
 116         struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
 117
 118         page_cache_get(page);
 119         if (!pagevec_add(pvec, page))
 120                 __pagevec_lru_add(pvec);
 121         put_cpu_var(lru_add_pvecs);
 122 }
 123
 124 void lru_cache_add_active(struct page *page)
 125 {
 126         struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs);
 127
 128         page_cache_get(page);
 129         if (!pagevec_add(pvec, page))
 130                 __pagevec_lru_add_active(pvec);
 131         put_cpu_var(lru_add_active_pvecs);
 132 }
 133
 134 void lru_add_drain(void)
 135 {
 136         struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
 137
 138         if (pagevec_count(pvec))
 139                 __pagevec_lru_add(pvec);
 140         pvec = &__get_cpu_var(lru_add_active_pvecs);
 141         if (pagevec_count(pvec))
 142                 __pagevec_lru_add_active(pvec);
 143         put_cpu_var(lru_add_pvecs);
 144 }
 145
 146 /*
 147  * This path almost never happens for VM activity - pages are normally
 148  * freed via pagevecs.  But it gets used by networking.
 149  */
 150 void __page_cache_release(struct page *page)
 151 {
 152         unsigned long flags;
 153         struct zone *zone = page_zone(page);
 154
 155         spin_lock_irqsave(&zone->lru_lock, flags);
 156         if (TestClearPageLRU(page))
 157                 del_page_from_lru(zone, page);
 158         if (page_count(page) != 0)
 159                 page = NULL;
 160         spin_unlock_irqrestore(&zone->lru_lock, flags);
 161         if (page)
 162                 free_hot_page(page);
 163 }
 164
 165 /*
 166  * Batched page_cache_release().  Decrement the reference count on all the
 167  * passed pages.  If it fell to zero then remove the page from the LRU and
 168  * free it.
 169  *
 170  * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
 171  * for the remainder of the operation.
 172  *
 173  * The locking in this function is against shrink_cache(): we recheck the
 174  * page count inside the lock to see whether shrink_cache grabbed the page
 175  * via the LRU.  If it did, give up: shrink_cache will free it.
 176  */
 177 void release_pages(struct page **pages, int nr, int cold)
 178 {
 179         int i;
 180         struct pagevec pages_to_free;
 181         struct zone *zone = NULL;
 182
 183         pagevec_init(&pages_to_free, cold);
 184         for (i = 0; i < nr; i++) {
 185                 struct page *page = pages[i];
 186                 struct zone *pagezone;
 187
 188                 if (PageReserved(page) || !put_page_testzero(page))
 189                         continue;
 190
 191                 pagezone = page_zone(page);
 192                 if (pagezone != zone) {
 193                         if (zone)
 194                                 spin_unlock_irq(&zone->lru_lock);
 195                         zone = pagezone;
 196                         spin_lock_irq(&zone->lru_lock);
 197                 }
 198                 if (TestClearPageLRU(page))
 199                         del_page_from_lru(zone, page);
 200                 if (page_count(page) == 0) {
 201                         if (!pagevec_add(&pages_to_free, page)) {
 202                                 spin_unlock_irq(&zone->lru_lock);
 203                                 __pagevec_free(&pages_to_free);
 204                                 pagevec_reinit(&pages_to_free);
 205                                 zone = NULL;    /* No lock is held */
 206                         }
 207                 }
 208         }
 209         if (zone)
 210                 spin_unlock_irq(&zone->lru_lock);
 211
 212         pagevec_free(&pages_to_free);
 213 }
 214
 215 /*
 216  * The pages which we're about to release may be in the deferred lru-addition
 217  * queues.  That would prevent them from really being freed right now.  That's
 218  * OK from a correctness point of view but is inefficient - those pages may be
 219  * cache-warm and we want to give them back to the page allocator ASAP.
 220  *
 221  * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
 222  * and __pagevec_lru_add_active() call release_pages() directly to avoid
 223  * mutual recursion.
 224  */
 225 void __pagevec_release(struct pagevec *pvec)
 226 {
 227         lru_add_drain();
 228         release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
 229         pagevec_reinit(pvec);
 230 }
 231
 232 /*
 233  * pagevec_release() for pages which are known to not be on the LRU
 234  *
 235  * This function reinitialises the caller's pagevec.
 236  */
 237 void __pagevec_release_nonlru(struct pagevec *pvec)
 238 {
 239         int i;
 240         struct pagevec pages_to_free;
 241
 242         pagevec_init(&pages_to_free, pvec->cold);
 243         pages_to_free.cold = pvec->cold;
 244         for (i = 0; i < pagevec_count(pvec); i++) {
 245                 struct page *page = pvec->pages[i];
 246
 247                 BUG_ON(PageLRU(page));
 248                 if (put_page_testzero(page))
 249                         pagevec_add(&pages_to_free, page);
 250         }
 251         pagevec_free(&pages_to_free);
 252         pagevec_reinit(pvec);
 253 }
 254
 255 /*
 256  * Add the passed pages to the LRU, then drop the caller's refcount
 257  * on them.  Reinitialises the caller's pagevec.
 258  */
 259 void __pagevec_lru_add(struct pagevec *pvec)
 260 {
 261         int i;
 262         struct zone *zone = NULL;
 263
 264         for (i = 0; i < pagevec_count(pvec); i++) {
 265                 struct page *page = pvec->pages[i];
 266                 struct zone *pagezone = page_zone(page);
 267
 268                 if (pagezone != zone) {
 269                         if (zone)
 270                                 spin_unlock_irq(&zone->lru_lock);
 271                         zone = pagezone;
 272                         spin_lock_irq(&zone->lru_lock);
 273                 }
 274                 if (TestSetPageLRU(page))
 275                         BUG();
 276                 add_page_to_inactive_list(zone, page);
 277         }
 278         if (zone)
 279                 spin_unlock_irq(&zone->lru_lock);
 280         release_pages(pvec->pages, pvec->nr, pvec->cold);
 281         pagevec_reinit(pvec);
 282 }
 283
 284 void __pagevec_lru_add_active(struct pagevec *pvec)
 285 {
 286         int i;
 287         struct zone *zone = NULL;
 288
 289         for (i = 0; i < pagevec_count(pvec); i++) {
 290                 struct page *page = pvec->pages[i];
 291                 struct zone *pagezone = page_zone(page);
 292
 293                 if (pagezone != zone) {
 294                         if (zone)
 295                                 spin_unlock_irq(&zone->lru_lock);
 296                         zone = pagezone;
 297                         spin_lock_irq(&zone->lru_lock);
 298                 }
 299                 if (TestSetPageLRU(page))
 300                         BUG();
 301                 if (TestSetPageActive(page))
 302                         BUG();
 303                 add_page_to_active_list(zone, page);
 304         }
 305         if (zone)
 306                 spin_unlock_irq(&zone->lru_lock);
 307         release_pages(pvec->pages, pvec->nr, pvec->cold);
 308         pagevec_reinit(pvec);
 309 }
 310
 311 /*
 312  * Try to drop buffers from the pages in a pagevec
 313  */
 314 void pagevec_strip(struct pagevec *pvec)
 315 {
 316         int i;
 317
 318         for (i = 0; i < pagevec_count(pvec); i++) {
 319                 struct page *page = pvec->pages[i];
 320
 321                 if (PagePrivate(page) && !TestSetPageLocked(page)) {
 322                         try_to_release_page(page, 0);
 323                         unlock_page(page);
 324                 }
 325         }
 326 }
 327
 328 /**
 329  * pagevec_lookup - gang pagecache lookup
 330  * @pvec:       Where the resulting pages are placed
 331  * @mapping:    The address_space to search
 332  * @start:      The starting page index
 333  * @nr_pages:   The maximum number of pages
 334  *
 335  * pagevec_lookup() will search for and return a group of up to @nr_pages pages
 336  * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
 337  * reference against the pages in @pvec.
 338  *
 339  * The search returns a group of mapping-contiguous pages with ascending
 340  * indexes.  There may be holes in the indices due to not-present pages.
 341  *
 342  * pagevec_lookup() returns the number of pages which were found.
 343  */
 344 unsigned int pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
 345                 pgoff_t start, unsigned int nr_pages)
 346 {
 347         pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
 348         return pagevec_count(pvec);
 349 }
 350
 351
 352 #ifdef CONFIG_SMP
 353 /*
 354  * We tolerate a little inaccuracy to avoid ping-ponging the counter between
 355  * CPUs
 356  */
 357 #define ACCT_THRESHOLD  max(16, NR_CPUS * 2)
 358
 359 static DEFINE_PER_CPU(long, committed_space) = 0;
 360
 361 void vm_acct_memory(long pages)
 362 {
 363         long *local;
 364
 365         preempt_disable();
 366         local = &__get_cpu_var(committed_space);
 367         *local += pages;
 368         if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
 369                 atomic_add(*local, &vm_committed_space);
 370                 *local = 0;
 371         }
 372         preempt_enable();
 373 }
 374 EXPORT_SYMBOL(vm_acct_memory);
 375 #endif
 376
 377
 378 /*
 379  * Perform any setup for the swap system
 380  */
 381 void __init swap_setup(void)
 382 {
 383         unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);
 384
 385         /* Use a smaller cluster for small-memory machines */
 386         if (megs < 16)
 387                 page_cluster = 2;
 388         else
 389                 page_cluster = 3;
 390         /*
 391          * Right now other parts of the system means that we
 392          * _really_ don't want to cluster much more
 393          */
 394 }