mm/page-writeback.c

   1 /*
   2  * mm/page-writeback.c.
   3  *
   4  * Copyright (C) 2002, Linus Torvalds.
   5  *
   6  * Contains functions related to writing back dirty pages at the
   7  * address_space level.
   8  *
   9  * 10Apr2002    akpm@zip.com.au
  10  *              Initial version
  11  */
  12
  13 #include <linux/kernel.h>
  14 #include <linux/module.h>
  15 #include <linux/spinlock.h>
  16 #include <linux/fs.h>
  17 #include <linux/mm.h>
  18 #include <linux/swap.h>
  19 #include <linux/slab.h>
  20 #include <linux/pagemap.h>
  21 #include <linux/writeback.h>
  22 #include <linux/init.h>
  23 #include <linux/sysrq.h>
  24 #include <linux/backing-dev.h>
  25 #include <linux/blkdev.h>
  26 #include <linux/mpage.h>
  27 #include <linux/percpu.h>
  28 #include <linux/notifier.h>
  29 #include <linux/smp.h>
  30
  31 /*
  32  * The maximum number of pages to writeout in a single bdflush/kupdate
  33  * operation.  We do this so we don't hold I_LOCK against an inode for
  34  * enormous amounts of time, which would block a userspace task which has
  35  * been forced to throttle against that inode.  Also, the code reevaluates
  36  * the dirty state each time it has written this many pages.
  37  */
  38 #define MAX_WRITEBACK_PAGES     1024
  39
  40 /*
  41  * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
  42  * will look to see if it needs to force writeback or throttling.
  43  */
  44 static long ratelimit_pages = 32;
  45
  46 static long total_pages;        /* The total number of pages in the machine. */
  47 static int dirty_exceeded;      /* Dirty mem may be over limit */
  48
  49 /*
  50  * When balance_dirty_pages decides that the caller needs to perform some
  51  * non-background writeback, this is how many pages it will attempt to write.
  52  * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably
  53  * large amounts of I/O are submitted.
  54  */
  55 static inline long sync_writeback_pages(void)
  56 {
  57         return ratelimit_pages + ratelimit_pages / 2;
  58 }
  59
  60 /* The following parameters are exported via /proc/sys/vm */
  61
  62 /*
  63  * Start background writeback (via pdflush) at this percentage
  64  */
  65 int dirty_background_ratio = 10;
  66
  67 /*
  68  * The generator of dirty data starts writeback at this percentage
  69  */
  70 int vm_dirty_ratio = 40;
  71
  72 /*
  73  * The interval between `kupdate'-style writebacks, in centiseconds
  74  * (hundredths of a second)
  75  */
  76 int dirty_writeback_centisecs = 5 * 100;
  77
  78 /*
  79  * The longest number of centiseconds for which data is allowed to remain dirty
  80  */
  81 int dirty_expire_centisecs = 30 * 100;
  82
  83 /* End of sysctl-exported parameters */
  84
  85
  86 static void background_writeout(unsigned long _min_pages);
  87
  88 /*
  89  * Work out the current dirty-memory clamping and background writeout
  90  * thresholds.
  91  *
  92  * The main aim here is to lower them aggressively if there is a lot of mapped
  93  * memory around.  To avoid stressing page reclaim with lots of unreclaimable
  94  * pages.  It is better to clamp down on writers than to start swapping, and
  95  * performing lots of scanning.
  96  *
  97  * We only allow 1/2 of the currently-unmapped memory to be dirtied.
  98  *
  99  * We don't permit the clamping level to fall below 5% - that is getting rather
 100  * excessive.
 101  *
 102  * We make sure that the background writeout level is below the adjusted
 103  * clamping level.
 104  */
 105 static void
 106 get_dirty_limits(struct page_state *ps, long *background, long *dirty)
 107 {
 108         int background_ratio;           /* Percentages */
 109         int dirty_ratio;
 110         int unmapped_ratio;
 111
 112         get_page_state(ps);
 113
 114         unmapped_ratio = 100 - (ps->nr_mapped * 100) / total_pages;
 115
 116         dirty_ratio = vm_dirty_ratio;
 117         if (dirty_ratio > unmapped_ratio / 2)
 118                 dirty_ratio = unmapped_ratio / 2;
 119
 120         if (dirty_ratio < 5)
 121                 dirty_ratio = 5;
 122
 123         background_ratio = dirty_background_ratio;
 124         if (background_ratio >= dirty_ratio)
 125                 background_ratio = dirty_ratio / 2;
 126
 127         *background = (background_ratio * total_pages) / 100;
 128         *dirty = (dirty_ratio * total_pages) / 100;
 129 }
 130
 131 /*
 132  * balance_dirty_pages() must be called by processes which are generating dirty
 133  * data.  It looks at the number of dirty pages in the machine and will force
 134  * the caller to perform writeback if the system is over `vm_dirty_ratio'.
 135  * If we're over `background_thresh' then pdflush is woken to perform some
 136  * writeout.
 137  */
 138 void balance_dirty_pages(struct address_space *mapping)
 139 {
 140         struct page_state ps;
 141         long nr_reclaimable;
 142         long background_thresh;
 143         long dirty_thresh;
 144         unsigned long pages_written = 0;
 145         unsigned long write_chunk = sync_writeback_pages();
 146
 147         struct backing_dev_info *bdi = mapping->backing_dev_info;
 148
 149         for (;;) {
 150                 struct writeback_control wbc = {
 151                         .bdi            = bdi,
 152                         .sync_mode      = WB_SYNC_NONE,
 153                         .older_than_this = NULL,
 154                         .nr_to_write    = write_chunk,
 155                 };
 156
 157                 get_dirty_limits(&ps, &background_thresh, &dirty_thresh);
 158                 nr_reclaimable = ps.nr_dirty + ps.nr_unstable;
 159                 if (nr_reclaimable + ps.nr_writeback <= dirty_thresh)
 160                         break;
 161
 162                 dirty_exceeded = 1;
 163
 164                 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
 165                  * Unstable writes are a feature of certain networked
 166                  * filesystems (i.e. NFS) in which data may have been
 167                  * written to the server's write cache, but has not yet
 168                  * been flushed to permanent storage.
 169                  */
 170                 if (nr_reclaimable) {
 171                         writeback_inodes(&wbc);
 172                         get_dirty_limits(&ps, &background_thresh,
 173                                         &dirty_thresh);
 174                         nr_reclaimable = ps.nr_dirty + ps.nr_unstable;
 175                         if (nr_reclaimable + ps.nr_writeback <= dirty_thresh)
 176                                 break;
 177                         pages_written += write_chunk - wbc.nr_to_write;
 178                         if (pages_written >= write_chunk)
 179                                 break;          /* We've done our duty */
 180                 }
 181                 blk_congestion_wait(WRITE, HZ/10);
 182         }
 183
 184         if (nr_reclaimable + ps.nr_writeback <= dirty_thresh)
 185                 dirty_exceeded = 0;
 186
 187         if (!writeback_in_progress(bdi) && nr_reclaimable > background_thresh)
 188                 pdflush_operation(background_writeout, 0);
 189 }
 190
 191 /**
 192  * balance_dirty_pages_ratelimited - balance dirty memory state
 193  * @mapping - address_space which was dirtied
 194  *
 195  * Processes which are dirtying memory should call in here once for each page
 196  * which was newly dirtied.  The function will periodically check the system's
 197  * dirty state and will initiate writeback if needed.
 198  *
 199  * On really big machines, get_page_state is expensive, so try to avoid calling
 200  * it too often (ratelimiting).  But once we're over the dirty memory limit we
 201  * decrease the ratelimiting by a lot, to prevent individual processes from
 202  * overshooting the limit by (ratelimit_pages) each.
 203  */
 204 void balance_dirty_pages_ratelimited(struct address_space *mapping)
 205 {
 206         static DEFINE_PER_CPU(int, ratelimits) = 0;
 207         int cpu;
 208         long ratelimit;
 209
 210         ratelimit = ratelimit_pages;
 211         if (dirty_exceeded)
 212                 ratelimit = 8;
 213
 214         cpu = get_cpu();
 215         if (per_cpu(ratelimits, cpu)++ >= ratelimit) {
 216                 per_cpu(ratelimits, cpu) = 0;
 217                 put_cpu();
 218                 balance_dirty_pages(mapping);
 219                 return;
 220         }
 221         put_cpu();
 222 }
 223 EXPORT_SYMBOL_GPL(balance_dirty_pages_ratelimited);
 224
 225 /*
 226  * writeback at least _min_pages, and keep writing until the amount of dirty
 227  * memory is less than the background threshold, or until we're all clean.
 228  */
 229 static void background_writeout(unsigned long _min_pages)
 230 {
 231         long min_pages = _min_pages;
 232         struct writeback_control wbc = {
 233                 .bdi            = NULL,
 234                 .sync_mode      = WB_SYNC_NONE,
 235                 .older_than_this = NULL,
 236                 .nr_to_write    = 0,
 237                 .nonblocking    = 1,
 238         };
 239
 240         CHECK_EMERGENCY_SYNC
 241         for ( ; ; ) {
 242                 struct page_state ps;
 243                 long background_thresh;
 244                 long dirty_thresh;
 245
 246                 get_dirty_limits(&ps, &background_thresh, &dirty_thresh);
 247                 if (ps.nr_dirty + ps.nr_unstable < background_thresh
 248                                 && min_pages <= 0)
 249                         break;
 250                 wbc.encountered_congestion = 0;
 251                 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 252                 writeback_inodes(&wbc);
 253                 min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 254                 if (wbc.nr_to_write > 0) {
 255                         /* Wrote less than expected */
 256                         if (wbc.encountered_congestion)
 257                                 blk_congestion_wait(WRITE, HZ/10);
 258                         else
 259                                 break;
 260                 }
 261         }
 262 }
 263
 264 /*
 265  * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
 266  * the whole world.  Returns 0 if a pdflush thread was dispatched.  Returns
 267  * -1 if all pdflush threads were busy.
 268  */
 269 int wakeup_bdflush(long nr_pages)
 270 {
 271         if (nr_pages == 0) {
 272                 struct page_state ps;
 273
 274                 get_page_state(&ps);
 275                 nr_pages = ps.nr_dirty;
 276         }
 277         return pdflush_operation(background_writeout, nr_pages);
 278 }
 279
 280 static struct timer_list wb_timer;
 281
 282 /*
 283  * Periodic writeback of "old" data.
 284  *
 285  * Define "old": the first time one of an inode's pages is dirtied, we mark the
 286  * dirtying-time in the inode's address_space.  So this periodic writeback code
 287  * just walks the superblock inode list, writing back any inodes which are
 288  * older than a specific point in time.
 289  *
 290  * Try to run once per dirty_writeback_centisecs.  But if a writeback event
 291  * takes longer than a dirty_writeback_centisecs interval, then leave a
 292  * one-second gap.
 293  *
 294  * older_than_this takes precedence over nr_to_write.  So we'll only write back
 295  * all dirty pages if they are all attached to "old" mappings.
 296  */
 297 static void wb_kupdate(unsigned long arg)
 298 {
 299         unsigned long oldest_jif;
 300         unsigned long start_jif;
 301         unsigned long next_jif;
 302         long nr_to_write;
 303         struct page_state ps;
 304         struct writeback_control wbc = {
 305                 .bdi            = NULL,
 306                 .sync_mode      = WB_SYNC_NONE,
 307                 .older_than_this = &oldest_jif,
 308                 .nr_to_write    = 0,
 309                 .nonblocking    = 1,
 310                 .for_kupdate    = 1,
 311         };
 312
 313         sync_supers();
 314
 315         get_page_state(&ps);
 316         oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
 317         start_jif = jiffies;
 318         next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
 319         nr_to_write = ps.nr_dirty + ps.nr_unstable;
 320         while (nr_to_write > 0) {
 321                 wbc.encountered_congestion = 0;
 322                 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 323                 writeback_inodes(&wbc);
 324                 if (wbc.nr_to_write > 0) {
 325                         if (wbc.encountered_congestion)
 326                                 blk_congestion_wait(WRITE, HZ/10);
 327                         else
 328                                 break;  /* All the old data is written */
 329                 }
 330                 nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 331         }
 332         if (time_before(next_jif, jiffies + HZ))
 333                 next_jif = jiffies + HZ;
 334         mod_timer(&wb_timer, next_jif);
 335 }
 336
 337 static void wb_timer_fn(unsigned long unused)
 338 {
 339         if (pdflush_operation(wb_kupdate, 0) < 0)
 340                 mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
 341
 342 }
 343
 344 /*
 345  * If ratelimit_pages is too high then we can get into dirty-data overload
 346  * if a large number of processes all perform writes at the same time.
 347  * If it is too low then SMP machines will call the (expensive) get_page_state
 348  * too often.
 349  *
 350  * Here we set ratelimit_pages to a level which ensures that when all CPUs are
 351  * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
 352  * thresholds before writeback cuts in.
 353  *
 354  * But the limit should not be set too high.  Because it also controls the
 355  * amount of memory which the balance_dirty_pages() caller has to write back.
 356  * If this is too large then the caller will block on the IO queue all the
 357  * time.  So limit it to four megabytes - the balance_dirty_pages() caller
 358  * will write six megabyte chunks, max.
 359  */
 360
 361 static void set_ratelimit(void)
 362 {
 363         ratelimit_pages = total_pages / (num_online_cpus() * 32);
 364         if (ratelimit_pages < 16)
 365                 ratelimit_pages = 16;
 366         if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
 367                 ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
 368 }
 369
 370 static int
 371 ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
 372 {
 373         set_ratelimit();
 374         return 0;
 375 }
 376
 377 static struct notifier_block ratelimit_nb = {
 378         .notifier_call  = ratelimit_handler,
 379         .next           = NULL,
 380 };
 381
 382 /*
 383  * If the machine has a large highmem:lowmem ratio then scale back the default
 384  * dirty memory thresholds: allowing too much dirty highmem pins an excessive
 385  * number of buffer_heads.
 386  */
 387 void __init page_writeback_init(void)
 388 {
 389         long buffer_pages = nr_free_buffer_pages();
 390         long correction;
 391
 392         total_pages = nr_free_pagecache_pages();
 393
 394         correction = (100 * 4 * buffer_pages) / total_pages;
 395
 396         if (correction < 100) {
 397                 dirty_background_ratio *= correction;
 398                 dirty_background_ratio /= 100;
 399                 vm_dirty_ratio *= correction;
 400                 vm_dirty_ratio /= 100;
 401         }
 402
 403         init_timer(&wb_timer);
 404         wb_timer.expires = jiffies + (dirty_writeback_centisecs * HZ) / 100;
 405         wb_timer.data = 0;
 406         wb_timer.function = wb_timer_fn;
 407         add_timer(&wb_timer);
 408         set_ratelimit();
 409         register_cpu_notifier(&ratelimit_nb);
 410 }
 411
 412 int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 413 {
 414         if (mapping->a_ops->writepages)
 415                 return mapping->a_ops->writepages(mapping, wbc);
 416         return generic_writepages(mapping, wbc);
 417 }
 418
 419 /**
 420  * write_one_page - write out a single page and optionally wait on I/O
 421  *
 422  * @page - the page to write
 423  * @wait - if true, wait on writeout
 424  *
 425  * The page must be locked by the caller and will be unlocked upon return.
 426  *
 427  * write_one_page() returns a negative error code if I/O failed.
 428  */
 429 int write_one_page(struct page *page, int wait)
 430 {
 431         struct address_space *mapping = page->mapping;
 432         int ret = 0;
 433         struct writeback_control wbc = {
 434                 .sync_mode = WB_SYNC_ALL,
 435         };
 436
 437         BUG_ON(!PageLocked(page));
 438
 439         if (wait && PageWriteback(page))
 440                 wait_on_page_writeback(page);
 441
 442         spin_lock(&mapping->page_lock);
 443         list_del(&page->list);
 444         if (test_clear_page_dirty(page)) {
 445                 list_add(&page->list, &mapping->locked_pages);
 446                 page_cache_get(page);
 447                 spin_unlock(&mapping->page_lock);
 448                 ret = mapping->a_ops->writepage(page, &wbc);
 449                 if (ret == 0 && wait) {
 450                         wait_on_page_writeback(page);
 451                         if (PageError(page))
 452                                 ret = -EIO;
 453                 }
 454                 page_cache_release(page);
 455         } else {
 456                 list_add(&page->list, &mapping->clean_pages);
 457                 spin_unlock(&mapping->page_lock);
 458                 unlock_page(page);
 459         }
 460         return ret;
 461 }
 462 EXPORT_SYMBOL(write_one_page);
 463
 464 /*
 465  * For address_spaces which do not use buffers.  Just set the page's dirty bit
 466  * and move it to the dirty_pages list.  Also perform space reservation if
 467  * required.
 468  *
 469  * __set_page_dirty_nobuffers() may return -ENOSPC.  But if it does, the page
 470  * is still safe, as long as it actually manages to find some blocks at
 471  * writeback time.
 472  *
 473  * This is also used when a single buffer is being dirtied: we want to set the
 474  * page dirty in that case, but not all the buffers.  This is a "bottom-up"
 475  * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
 476  */
 477 int __set_page_dirty_nobuffers(struct page *page)
 478 {
 479         int ret = 0;
 480
 481         if (!TestSetPageDirty(page)) {
 482                 struct address_space *mapping = page->mapping;
 483
 484                 if (mapping) {
 485                         spin_lock(&mapping->page_lock);
 486                         if (page->mapping) {    /* Race with truncate? */
 487                                 BUG_ON(page->mapping != mapping);
 488                                 if (!mapping->backing_dev_info->memory_backed)
 489                                         inc_page_state(nr_dirty);
 490                                 list_del(&page->list);
 491                                 list_add(&page->list, &mapping->dirty_pages);
 492                         }
 493                         spin_unlock(&mapping->page_lock);
 494                         __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 495                 }
 496         }
 497         return ret;
 498 }
 499 EXPORT_SYMBOL(__set_page_dirty_nobuffers);
 500
 501 /*
 502  * set_page_dirty() is racy if the caller has no reference against
 503  * page->mapping->host, and if the page is unlocked.  This is because another
 504  * CPU could truncate the page off the mapping and then free the mapping.
 505  *
 506  * Usually, the page _is_ locked, or the caller is a user-space process which
 507  * holds a reference on the inode by having an open file.
 508  *
 509  * In other cases, the page should be locked before running set_page_dirty().
 510  */
 511 int set_page_dirty_lock(struct page *page)
 512 {
 513         int ret;
 514
 515         lock_page(page);
 516         ret = set_page_dirty(page);
 517         unlock_page(page);
 518         return ret;
 519 }
 520
 521 /*
 522  * Clear a page's dirty flag, while caring for dirty memory accounting.
 523  * Returns true if the page was previously dirty.
 524  */
 525 int test_clear_page_dirty(struct page *page)
 526 {
 527         if (TestClearPageDirty(page)) {
 528                 struct address_space *mapping = page->mapping;
 529
 530                 if (mapping && !mapping->backing_dev_info->memory_backed)
 531                         dec_page_state(nr_dirty);
 532                 return 1;
 533         }
 534         return 0;
 535 }
 536 EXPORT_SYMBOL(test_clear_page_dirty);