[dragonfly.git] / sys / vm / vm_swapcache.c
/*
 * (MPSAFE)
 *
 * Copyright (c) 2010,2019 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Implement the swapcache daemon.  When enabled swap is assumed to be
 * configured on a fast storage device such as a SSD.  Swap is assigned
 * to clean vnode-backed pages in the inactive queue, clustered by object
 * if possible, and written out.  The swap assignment sticks around even
 * after the underlying pages have been recycled.
 *
 * The daemon manages write bandwidth based on sysctl settings to control
 * wear on the SSD.
 *
 * The vnode strategy code will check for the swap assignments and divert
 * reads to the swap device when the data is present in the swapcache.
 *
 * This operates on both regular files and the block device vnodes used by
 * filesystems to manage meta-data.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>
#include <sys/eventhandler.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>

#include <sys/spinlock2.h>
#include <vm/vm_page2.h>
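/*
 * Marker used by the SWAPC_CLEANING object scan.  dummy_obj is threaded
 * onto the vm_object hash lists as a placeholder so the scan can resume
 * where it left off; save_obj and save_off track the object currently
 * being cleaned and how many bytes have been cleaned within it.
 */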
struct swmarker {
        struct vm_object dummy_obj;
        struct vm_object *save_obj;
        vm_ooffset_t save_off;
};

typedef struct swmarker swmarker_t;
/* the kernel process "swapcached" */
static int vm_swapcached_flush (vm_page_t m, int isblkdev);
static int vm_swapcache_test(vm_page_t m);
static int vm_swapcache_writing_heuristic(void);
static int vm_swapcache_writing(vm_page_t marker, int count, int scount);
static void vm_swapcache_cleaning(swmarker_t *marker,
                        struct vm_object_hash **swindexp);
static void vm_swapcache_movemarker(swmarker_t *marker,
                        struct vm_object_hash *swindex, vm_object_t object);

struct thread *swapcached_thread;
SYSCTL_NODE(_vm, OID_AUTO, swapcache, CTLFLAG_RW, NULL, NULL);
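/*
 * Tunables and counters exported below via the vm.swapcache sysctl tree.
 * The burst and accrate values are in bytes and bound how much data the
 * daemon is willing to write to swap per unit time.
 */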
int vm_swapcache_read_enable;
static long vm_swapcache_wtrigger;
static int vm_swapcache_sleep;
static int vm_swapcache_maxscan = PQ_L2_SIZE * 8;
static int vm_swapcache_maxlaunder = PQ_L2_SIZE * 4;
static int vm_swapcache_data_enable = 0;
static int vm_swapcache_meta_enable = 0;
static int vm_swapcache_maxswappct = 75;
static int vm_swapcache_hysteresis;
static int vm_swapcache_min_hysteresis;
int vm_swapcache_use_chflags = 0;       /* require chflags cache */
static int64_t vm_swapcache_minburst = 10000000LL;      /* 10MB */
static int64_t vm_swapcache_curburst = 4000000000LL;    /* 4G after boot */
static int64_t vm_swapcache_maxburst = 2000000000LL;    /* 2G nominal max */
static int64_t vm_swapcache_accrate = 100000LL;         /* 100K/s */
static int64_t vm_swapcache_write_count;
static int64_t vm_swapcache_maxfilesize;
static int64_t vm_swapcache_cleanperobj = 16*1024*1024;
SYSCTL_INT(_vm_swapcache, OID_AUTO, maxlaunder,
        CTLFLAG_RW, &vm_swapcache_maxlaunder, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, maxscan,
        CTLFLAG_RW, &vm_swapcache_maxscan, 0, "");

SYSCTL_INT(_vm_swapcache, OID_AUTO, data_enable,
        CTLFLAG_RW, &vm_swapcache_data_enable, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, meta_enable,
        CTLFLAG_RW, &vm_swapcache_meta_enable, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, read_enable,
        CTLFLAG_RW, &vm_swapcache_read_enable, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, maxswappct,
        CTLFLAG_RW, &vm_swapcache_maxswappct, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, hysteresis,
        CTLFLAG_RD, &vm_swapcache_hysteresis, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, min_hysteresis,
        CTLFLAG_RW, &vm_swapcache_min_hysteresis, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, use_chflags,
        CTLFLAG_RW, &vm_swapcache_use_chflags, 0, "");

SYSCTL_QUAD(_vm_swapcache, OID_AUTO, minburst,
        CTLFLAG_RW, &vm_swapcache_minburst, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, curburst,
        CTLFLAG_RW, &vm_swapcache_curburst, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, maxburst,
        CTLFLAG_RW, &vm_swapcache_maxburst, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, maxfilesize,
        CTLFLAG_RW, &vm_swapcache_maxfilesize, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, accrate,
        CTLFLAG_RW, &vm_swapcache_accrate, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, write_count,
        CTLFLAG_RW, &vm_swapcache_write_count, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, cleanperobj,
        CTLFLAG_RW, &vm_swapcache_cleanperobj, 0, "");
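/*
 * SWAPMAX(adj) computes a threshold in bytes: (maxswappct + adj) percent
 * of the total configured swap.  The adj argument lets callers offset the
 * threshold, e.g. SWAPMAX(0) vs SWAPMAX(-10) for the write/clean hysteresis.
 */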
#define SWAPMAX(adj)    \
        ((int64_t)vm_swap_max * (vm_swapcache_maxswappct + (adj)) / 100)
/*
 * When shutting down the machine we want to stop swapcache operation
 * immediately so swap is not accessed after devices have been shuttered.
 */
static void
shutdown_swapcache(void *arg __unused)
{
        vm_swapcache_read_enable = 0;
        vm_swapcache_data_enable = 0;
        vm_swapcache_meta_enable = 0;
        wakeup(&vm_swapcache_sleep); /* shortcut 5-second wait */
}
/*
 * vm_swapcached is the high level swapcache daemon.
 *
 * No requirements.
 */
static void
vm_swapcached_thread(void)
{
        enum { SWAPC_WRITING, SWAPC_CLEANING } state = SWAPC_WRITING;
        enum { SWAPB_BURSTING, SWAPB_RECOVERING } burst = SWAPB_BURSTING;
        static struct vm_page page_marker[PQ_L2_SIZE];
        static swmarker_t swmarker;
        static struct vm_object_hash *swindex;
        int q;

        /*
         * Thread setup
         */
        curthread->td_flags |= TDF_SYSTHREAD;
        EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc,
                              swapcached_thread, SHUTDOWN_PRI_FIRST);
        EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_swapcache,
                              NULL, SHUTDOWN_PRI_SECOND);

        /*
         * Initialize our marker for the inactive scan (SWAPC_WRITING)
         */
        bzero(&page_marker, sizeof(page_marker));
        for (q = 0; q < PQ_L2_SIZE; ++q) {
                page_marker[q].flags = PG_FICTITIOUS | PG_MARKER;
                page_marker[q].busy_count = PBUSY_LOCKED;
                page_marker[q].queue = PQ_INACTIVE + q;
                page_marker[q].pc = q;
                page_marker[q].wire_count = 1;
                vm_page_queues_spin_lock(PQ_INACTIVE + q);
                TAILQ_INSERT_HEAD(
                        &vm_page_queues[PQ_INACTIVE + q].pl,
                        &page_marker[q], pageq);
                vm_page_queues_spin_unlock(PQ_INACTIVE + q);
        }

        vm_swapcache_min_hysteresis = 1024;
        vm_swapcache_hysteresis = vm_swapcache_min_hysteresis;
        vm_swapcache_wtrigger = -vm_swapcache_hysteresis;

        /*
         * Initialize our marker for the vm_object scan (SWAPC_CLEANING)
         */
        bzero(&swmarker, sizeof(swmarker));
        swmarker.dummy_obj.type = OBJT_MARKER;
        swindex = &vm_object_hash[0];
        lwkt_gettoken(&swindex->token);
        TAILQ_INSERT_HEAD(&swindex->list, &swmarker.dummy_obj, object_entry);
        lwkt_reltoken(&swindex->token);
        for (;;) {
                int reached_end;
                int scount;
                int count;

                /*
                 * Handle shutdown
                 */
                kproc_suspend_loop();

                /*
                 * Check every 5 seconds when not enabled or if no swap
                 * is present.
                 */
                if ((vm_swapcache_data_enable == 0 &&
                     vm_swapcache_meta_enable == 0 &&
                     vm_swap_cache_use <= SWAPMAX(0)) ||
                    vm_swap_max == 0) {
                        tsleep(&vm_swapcache_sleep, 0, "csleep", hz * 5);
                        continue;
                }

                /*
                 * Polling rate when enabled is approximately 10 hz.
                 */
                tsleep(&vm_swapcache_sleep, 0, "csleep", hz / 10);

                /*
                 * State hysteresis.  Generate write activity up to 75% of
                 * swap, then clean out swap assignments down to 70%, then
                 * repeat.
                 */
                if (state == SWAPC_WRITING) {
                        if (vm_swap_cache_use > SWAPMAX(0))
                                state = SWAPC_CLEANING;
                } else {
                        if (vm_swap_cache_use < SWAPMAX(-10))
                                state = SWAPC_WRITING;
                }

                /*
                 * We are allowed to continue accumulating burst value
                 * in either state.  Allow the user to set curburst > maxburst
                 * for the initial load-in.
                 */
                if (vm_swapcache_curburst < vm_swapcache_maxburst) {
                        vm_swapcache_curburst += vm_swapcache_accrate / 10;
                        if (vm_swapcache_curburst > vm_swapcache_maxburst)
                                vm_swapcache_curburst = vm_swapcache_maxburst;
                }
                /*
                 * We don't want to nickel-and-dime the scan as that will
                 * create unnecessary fragmentation.  The minimum burst
                 * is one second's worth of accumulation.
                 */
                if (state != SWAPC_WRITING) {
                        vm_swapcache_cleaning(&swmarker, &swindex);
                        continue;
                }
                if (vm_swapcache_curburst < vm_swapcache_accrate)
                        continue;
                reached_end = 0;
                count = vm_swapcache_maxlaunder / PQ_L2_SIZE + 2;
                scount = vm_swapcache_maxscan / PQ_L2_SIZE + 2;

                if (burst == SWAPB_BURSTING) {
                        if (vm_swapcache_writing_heuristic()) {
                                for (q = 0; q < PQ_L2_SIZE; ++q) {
                                        reached_end +=
                                                vm_swapcache_writing(
                                                        &page_marker[q],
                                                        count,
                                                        scount);
                                }
                        }
                        if (vm_swapcache_curburst <= 0)
                                burst = SWAPB_RECOVERING;
                } else if (vm_swapcache_curburst > vm_swapcache_minburst) {
                        if (vm_swapcache_writing_heuristic()) {
                                for (q = 0; q < PQ_L2_SIZE; ++q) {
                                        reached_end +=
                                                vm_swapcache_writing(
                                                        &page_marker[q],
                                                        count,
                                                        scount);
                                }
                        }
                        burst = SWAPB_BURSTING;
                }
                if (reached_end == PQ_L2_SIZE) {
                        vm_swapcache_wtrigger = -vm_swapcache_hysteresis;
                }
        }

        /*
         * Cleanup (NOT REACHED)
         */
        for (q = 0; q < PQ_L2_SIZE; ++q) {
                vm_page_queues_spin_lock(PQ_INACTIVE + q);
                TAILQ_REMOVE(
                        &vm_page_queues[PQ_INACTIVE + q].pl,
                        &page_marker[q], pageq);
                vm_page_queues_spin_unlock(PQ_INACTIVE + q);
        }

        lwkt_gettoken(&swindex->token);
        TAILQ_REMOVE(&swindex->list, &swmarker.dummy_obj, object_entry);
        lwkt_reltoken(&swindex->token);
}
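/*
 * Register the swapcached kernel thread; it is started by kproc_start()
 * during the SI_SUB_KTHREAD_PAGE SYSINIT stage.
 */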
static struct kproc_desc swpc_kp = {
        "swapcached",
        vm_swapcached_thread,
        &swapcached_thread
};
SYSINIT(swapcached, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start, &swpc_kp);
/*
 * Deal with an overflow of the heuristic counter or if the user
 * manually changes the hysteresis.
 *
 * Try to avoid small incremental pageouts by waiting for enough
 * pages to build up in the inactive queue to hopefully get a good
 * burst in.  This heuristic is bumped by the VM system and reset
 * when our scan hits the end of the queue.
 *
 * Return TRUE if we need to take a writing pass.
 */
static int
vm_swapcache_writing_heuristic(void)
{
        int hyst;
        int q;
        long adds;

        hyst = vmstats.v_inactive_count / 4;
        if (hyst < vm_swapcache_min_hysteresis)
                hyst = vm_swapcache_min_hysteresis;
        cpu_ccfence();
        vm_swapcache_hysteresis = hyst;

        adds = 0;
        for (q = PQ_INACTIVE; q < PQ_INACTIVE + PQ_L2_SIZE; ++q) {
                adds += atomic_swap_long(&vm_page_queues[q].adds, 0);
        }
        vm_swapcache_wtrigger += adds;
        if (vm_swapcache_wtrigger < -hyst)
                vm_swapcache_wtrigger = -hyst;
        return (vm_swapcache_wtrigger >= 0);
}
/*
 * Take a writing pass on one of the inactive queues, return non-zero if
 * we hit the end of the queue.
 */
static int
vm_swapcache_writing(vm_page_t marker, int count, int scount)
{
        vm_object_t object;
        struct vnode *vp;
        vm_page_t m;
        int isblkdev;

        /*
         * Scan the inactive queue from our marker to locate
         * suitable pages to push to the swap cache.
         *
         * We are looking for clean vnode-backed pages.
         */
        vm_page_queues_spin_lock(marker->queue);
        while ((m = TAILQ_NEXT(marker, pageq)) != NULL &&
               count > 0 && scount-- > 0) {
                KKASSERT(m->queue == marker->queue);
                /*
                 * Stop using swap if panicked, dumping, or dumped.
                 * Don't try to write if our curburst has been exhausted.
                 */
                if (panicstr || dumping)
                        break;
                if (vm_swapcache_curburst < 0)
                        break;

                /*
                 * Move marker
                 */
                TAILQ_REMOVE(
                        &vm_page_queues[marker->queue].pl, marker, pageq);
                TAILQ_INSERT_AFTER(
                        &vm_page_queues[marker->queue].pl, m, marker, pageq);
                /*
                 * Ignore markers and ignore pages that already have a swap
                 * assignment.
                 */
                if (m->flags & (PG_MARKER | PG_SWAPPED))
                        continue;
                if (vm_page_busy_try(m, TRUE))
                        continue;
                vm_page_queues_spin_unlock(marker->queue);

                if ((object = m->object) == NULL) {
                        vm_page_wakeup(m);
                        vm_page_queues_spin_lock(marker->queue);
                        continue;
                }
                vm_object_hold(object);
                if (m->object != object) {
                        vm_object_drop(object);
                        vm_page_wakeup(m);
                        vm_page_queues_spin_lock(marker->queue);
                        continue;
                }
                if (vm_swapcache_test(m)) {
                        vm_object_drop(object);
                        vm_page_wakeup(m);
                        vm_page_queues_spin_lock(marker->queue);
                        continue;
                }

                vp = object->handle;
                if (vp == NULL) {
                        vm_object_drop(object);
                        vm_page_wakeup(m);
                        vm_page_queues_spin_lock(marker->queue);
                        continue;
                }

                switch(vp->v_type) {
                case VREG:
                        /*
                         * PG_NOTMETA generically means 'don't swapcache this',
                         * and HAMMER will set this for regular data buffers
                         * (and leave it unset for meta-data buffers) as
                         * appropriate when double buffering is enabled.
                         */
                        if (m->flags & PG_NOTMETA) {
                                vm_object_drop(object);
                                vm_page_wakeup(m);
                                vm_page_queues_spin_lock(marker->queue);
                                continue;
                        }

                        /*
                         * If data_enable is 0 do not try to swapcache data.
                         * If use_chflags is set then only swapcache data for
                         * VSWAPCACHE marked vnodes, otherwise any vnode.
                         */
                        if (vm_swapcache_data_enable == 0 ||
                            ((vp->v_flag & VSWAPCACHE) == 0 &&
                             vm_swapcache_use_chflags)) {
                                vm_object_drop(object);
                                vm_page_wakeup(m);
                                vm_page_queues_spin_lock(marker->queue);
                                continue;
                        }
                        if (vm_swapcache_maxfilesize &&
                            object->size >
                            (vm_swapcache_maxfilesize >> PAGE_SHIFT)) {
                                vm_object_drop(object);
                                vm_page_wakeup(m);
                                vm_page_queues_spin_lock(marker->queue);
                                continue;
                        }
                        isblkdev = 0;
                        break;
                case VCHR:
                        /*
                         * PG_NOTMETA generically means 'don't swapcache this',
                         * and HAMMER will set this for regular data buffers
                         * (and leave it unset for meta-data buffers) as
                         * appropriate when double buffering is enabled.
                         */
                        if (m->flags & PG_NOTMETA) {
                                vm_object_drop(object);
                                vm_page_wakeup(m);
                                vm_page_queues_spin_lock(marker->queue);
                                continue;
                        }
                        if (vm_swapcache_meta_enable == 0) {
                                vm_object_drop(object);
                                vm_page_wakeup(m);
                                vm_page_queues_spin_lock(marker->queue);
                                continue;
                        }
                        isblkdev = 1;
                        break;
                default:
                        vm_object_drop(object);
                        vm_page_wakeup(m);
                        vm_page_queues_spin_lock(marker->queue);
                        continue;
                }

                /*
                 * Assign swap and initiate I/O.
                 *
                 * (adjust for the --count which also occurs in the loop)
                 */
                count -= vm_swapcached_flush(m, isblkdev);

                /*
                 * Setup for next loop using marker.
                 */
                vm_object_drop(object);
                vm_page_queues_spin_lock(marker->queue);
        }
        /*
         * The marker could wind up at the end, which is ok.  If we hit the
         * end of the list adjust the heuristic.
         *
         * Earlier inactive pages that were dirty and become clean
         * are typically moved to the end of PQ_INACTIVE by virtue
         * of vfs_vmio_release() when they become unwired from the
         * buffer cache.
         */
        vm_page_queues_spin_unlock(marker->queue);

        /*
         * m invalid but can be used to test for NULL
         */
        return (m == NULL);
}
/*
 * Flush the specified page using the swap_pager.  The page
 * must be busied by the caller and its disposition will become
 * the responsibility of this function.
 *
 * Try to collect surrounding pages, including pages which may
 * have already been assigned swap.  Try to cluster within a
 * contiguous aligned SWAP_META_PAGES (typ 16 x PAGE_SIZE) block
 * to match what swap_pager_putpages() can do.
 *
 * We also want to try to match against the buffer cache blocksize
 * but we don't really know what it is here.  Since the buffer cache
 * wires and unwires pages in groups the fact that we skip wired pages
 * should be sufficient.
 *
 * Returns a count of pages we might have flushed (minimum 1)
 */
static
int
vm_swapcached_flush(vm_page_t m, int isblkdev)
{
        vm_object_t object;
        vm_page_t marray[SWAP_META_PAGES];
        vm_pindex_t basei;
        int rtvals[SWAP_META_PAGES];
        int x;
        int i;
        int j;
        int count;
        int error;

        vm_page_io_start(m);
        vm_page_protect(m, VM_PROT_READ);
        object = m->object;
        vm_object_hold(object);
        /*
         * Try to cluster around (m), keeping in mind that the swap pager
         * can only do SWAP_META_PAGES worth of contiguous write.
         */
        x = (int)m->pindex & SWAP_META_MASK;
        marray[x] = m;
        basei = m->pindex;
        vm_page_wakeup(m);
        for (i = x - 1; i >= 0; --i) {
                m = vm_page_lookup_busy_try(object, basei - x + i,
                                            TRUE, &error);
                if (error || m == NULL)
                        break;
                if (vm_swapcache_test(m)) {
                        vm_page_wakeup(m);
                        break;
                }
                if (isblkdev && (m->flags & PG_NOTMETA)) {
                        vm_page_wakeup(m);
                        break;
                }
                vm_page_io_start(m);
                vm_page_protect(m, VM_PROT_READ);
                if (m->queue - m->pc == PQ_CACHE) {
                        vm_page_unqueue_nowakeup(m);
                        vm_page_deactivate(m);
                }
                marray[i] = m;
                vm_page_wakeup(m);
        }
        ++i;

        for (j = x + 1; j < SWAP_META_PAGES; ++j) {
                m = vm_page_lookup_busy_try(object, basei - x + j,
                                            TRUE, &error);
                if (error || m == NULL)
                        break;
                if (vm_swapcache_test(m)) {
                        vm_page_wakeup(m);
                        break;
                }
                if (isblkdev && (m->flags & PG_NOTMETA)) {
                        vm_page_wakeup(m);
                        break;
                }
                vm_page_io_start(m);
                vm_page_protect(m, VM_PROT_READ);
                if (m->queue - m->pc == PQ_CACHE) {
                        vm_page_unqueue_nowakeup(m);
                        vm_page_deactivate(m);
                }
                marray[j] = m;
                vm_page_wakeup(m);
        }

        count = j - i;
        vm_object_pip_add(object, count);
        swap_pager_putpages(object, marray + i, count, FALSE, rtvals + i);
        vm_swapcache_write_count += count * PAGE_SIZE;
        vm_swapcache_curburst -= count * PAGE_SIZE;

        while (i < j) {
                if (rtvals[i] != VM_PAGER_PEND) {
                        vm_page_busy_wait(marray[i], FALSE, "swppgfd");
                        vm_page_io_finish(marray[i]);
                        vm_page_wakeup(marray[i]);
                        vm_object_pip_wakeup(object);
                }
                ++i;
        }
        vm_object_drop(object);
        return(count);
}
/*
 * Test whether a VM page is suitable for writing to the swapcache.
 * Does not test m->queue, PG_MARKER, or PG_SWAPPED.
 *
 * Returns 0 on success, 1 on failure
 */
static int
vm_swapcache_test(vm_page_t m)
{
        vm_object_t object;

        if (m->flags & (PG_UNQUEUED | PG_FICTITIOUS))
                return(1);
        if (m->hold_count || m->wire_count)
                return(1);
        if (m->valid != VM_PAGE_BITS_ALL)
                return(1);
        if (m->dirty & m->valid)
                return(1);
        if ((object = m->object) == NULL)
                return(1);
        if (object->type != OBJT_VNODE ||
            (object->flags & OBJ_DEAD)) {
                return(1);
        }
        vm_page_test_dirty(m);
        if (m->dirty & m->valid)
                return(1);
        return(0);
}
/*
 * Cleaning pass.
 *
 * We clean whole objects up to 16MB
 */
static
void
vm_swapcache_cleaning(swmarker_t *marker, struct vm_object_hash **swindexp)
{
        vm_object_t object;
        struct vnode *vp;
        int count;
        int scount;
        int n;
        int didmove;

        count = vm_swapcache_maxlaunder;
        scount = vm_swapcache_maxscan;

        /*
         * Look for vnode objects
         */
        lwkt_gettoken(&(*swindexp)->token);

        didmove = 0;
outerloop:
        while ((object = TAILQ_NEXT(&marker->dummy_obj,
                                    object_entry)) != NULL) {
                /*
                 * We have to skip markers.  We cannot hold/drop marker
                 * objects!
                 */
                if (object->type == OBJT_MARKER) {
                        vm_swapcache_movemarker(marker, *swindexp, object);
                        didmove = 1;
                        continue;
                }

                /*
                 * Safety, or in case there are millions of VM objects
                 * without swapcache backing.
                 */
                if (--scount <= 0)
                        goto breakout;

                /*
                 * We must hold the object before potentially yielding.
                 */
                vm_object_hold(object);
                lwkt_yield();

                /*
                 * Only operate on live VNODE objects that are either
                 * VREG or VCHR (VCHR for meta-data).
                 */
                if ((object->type != OBJT_VNODE) ||
                    ((object->flags & OBJ_DEAD) ||
                     object->swblock_count == 0) ||
                    ((vp = object->handle) == NULL) ||
                    (vp->v_type != VREG && vp->v_type != VCHR)) {
                        vm_object_drop(object);
                        /* object may be invalid now */
                        vm_swapcache_movemarker(marker, *swindexp, object);
                        didmove = 1;
                        continue;
                }
                /*
                 * Reset the object pindex stored in the marker if the
                 * working object has changed.
                 */
                if (marker->save_obj != object || didmove) {
                        marker->dummy_obj.size = 0;
                        marker->save_off = 0;
                        marker->save_obj = object;
                        didmove = 0;
                }

                /*
                 * Look for swblocks starting at our iterator.
                 *
                 * The swap_pager_condfree() function attempts to free
                 * swap space starting at the specified index.  The index
                 * will be updated on return.  The function will return
                 * a scan factor (NOT the number of blocks freed).
                 *
                 * If it must cut its scan of the object short due to an
                 * excessive number of swblocks, or is able to free the
                 * requested number of blocks, it will return n >= count
                 * and we break and pick it back up on a future attempt.
                 *
                 * Scan the object linearly and try to batch large sets of
                 * blocks that are likely to clean out entire swap radix
                 * tree leafs.
                 */
                lwkt_token_swap();
                lwkt_reltoken(&(*swindexp)->token);

                n = swap_pager_condfree(object, &marker->dummy_obj.size,
                                        (count + SWAP_META_MASK) & ~SWAP_META_MASK);

                vm_object_drop(object);         /* object may be invalid now */
                lwkt_gettoken(&(*swindexp)->token);
                /*
                 * If we have exhausted the object or deleted our per-pass
                 * page limit then move us to the next object.  Note that
                 * the current object may no longer be on the vm_object
                 * hash list.
                 */
                if (n <= 0 ||
                    marker->save_off > vm_swapcache_cleanperobj) {
                        vm_swapcache_movemarker(marker, *swindexp, object);
                        didmove = 1;
                }

                /*
                 * If we have exhausted our max-launder stop for now.
                 */
                count -= n;
                marker->save_off += n * PAGE_SIZE;
                if (count < 0)
                        goto breakout;
        }
        /*
         * Iterate vm_object_hash[] hash table
         */
        TAILQ_REMOVE(&(*swindexp)->list, &marker->dummy_obj, object_entry);
        lwkt_reltoken(&(*swindexp)->token);
        if (++*swindexp >= &vm_object_hash[VMOBJ_HSIZE])
                *swindexp = &vm_object_hash[0];
        lwkt_gettoken(&(*swindexp)->token);
        TAILQ_INSERT_HEAD(&(*swindexp)->list, &marker->dummy_obj, object_entry);

        if (*swindexp != &vm_object_hash[0])
                goto outerloop;

breakout:
        lwkt_reltoken(&(*swindexp)->token);
}
/*
 * Move the marker past the current object.  Object can be stale, but we
 * still need it to determine if the marker has to be moved.  If the object
 * is still the 'current object' (object after the marker), we hop-scotch
 * the marker past it.
 */
static void
vm_swapcache_movemarker(swmarker_t *marker, struct vm_object_hash *swindex,
                        vm_object_t object)
{
        if (TAILQ_NEXT(&marker->dummy_obj, object_entry) == object) {
                TAILQ_REMOVE(&swindex->list, &marker->dummy_obj, object_entry);
                TAILQ_INSERT_AFTER(&swindex->list, object,
                                   &marker->dummy_obj, object_entry);
        }
}