sys/vm/swap_pager.c

   1 /*
   2  * (MPSAFE)
   3  *
   4  * Copyright (c) 1998-2010 The DragonFly Project.  All rights reserved.
   5  *
   6  * This code is derived from software contributed to The DragonFly Project
   7  * by Matthew Dillon <dillon@backplane.com>
   8  *
   9  * Redistribution and use in source and binary forms, with or without
  10  * modification, are permitted provided that the following conditions
  11  * are met:
  12  *
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in
  17  *    the documentation and/or other materials provided with the
  18  *    distribution.
  19  * 3. Neither the name of The DragonFly Project nor the names of its
  20  *    contributors may be used to endorse or promote products derived
  21  *    from this software without specific, prior written permission.
  22  *
  23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  34  * SUCH DAMAGE.
  35  *
  36  * Copyright (c) 1994 John S. Dyson
  37  * Copyright (c) 1990 University of Utah.
  38  * Copyright (c) 1991, 1993
  39  *      The Regents of the University of California.  All rights reserved.
  40  *
  41  * This code is derived from software contributed to Berkeley by
  42  * the Systems Programming Group of the University of Utah Computer
  43  * Science Department.
  44  *
  45  * Redistribution and use in source and binary forms, with or without
  46  * modification, are permitted provided that the following conditions
  47  * are met:
  48  * 1. Redistributions of source code must retain the above copyright
  49  *    notice, this list of conditions and the following disclaimer.
  50  * 2. Redistributions in binary form must reproduce the above copyright
  51  *    notice, this list of conditions and the following disclaimer in the
  52  *    documentation and/or other materials provided with the distribution.
  53  * 3. Neither the name of the University nor the names of its contributors
  54  *    may be used to endorse or promote products derived from this software
  55  *    without specific prior written permission.
  56  *
  57  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  58  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  59  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  60  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  61  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  62  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  63  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  64  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  65  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  66  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  67  * SUCH DAMAGE.
  68  *
  69  *                              New Swap System
  70  *                              Matthew Dillon
  71  *
  72  * Radix Bitmap 'blists'.
  73  *
  74  *      - The new swapper uses the new radix bitmap code.  This should scale
  75  *        to arbitrarily small or arbitrarily large swap spaces and an almost
  76  *        arbitrary degree of fragmentation.
  77  *
  78  * Features:
  79  *
  80  *      - on the fly reallocation of swap during putpages.  The new system
  81  *        does not try to keep previously allocated swap blocks for dirty
  82  *        pages.
  83  *
  84  *      - on the fly deallocation of swap
  85  *
  86  *      - No more garbage collection required.  Unnecessarily allocated swap
  87  *        blocks only exist for dirty vm_page_t's now and these are already
  88  *        cycled (in a high-load system) by the pager.  We also do on-the-fly
  89  *        removal of invalidated swap blocks when a page is destroyed
  90  *        or renamed.
  91  *
  92  * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
  93  * @(#)swap_pager.c     8.9 (Berkeley) 3/21/94
  94  * $FreeBSD: src/sys/vm/swap_pager.c,v 1.130.2.12 2002/08/31 21:15:55 dillon Exp $
  95  */
  96
  97 #include "opt_swap.h"
  98 #include <sys/param.h>
  99 #include <sys/systm.h>
 100 #include <sys/conf.h>
 101 #include <sys/kernel.h>
 102 #include <sys/proc.h>
 103 #include <sys/buf.h>
 104 #include <sys/vnode.h>
 105 #include <sys/malloc.h>
 106 #include <sys/vmmeter.h>
 107 #include <sys/sysctl.h>
 108 #include <sys/blist.h>
 109 #include <sys/lock.h>
 110 #include <sys/kcollect.h>
 111
 112 #include <vm/vm.h>
 113 #include <vm/vm_object.h>
 114 #include <vm/vm_page.h>
 115 #include <vm/vm_pager.h>
 116 #include <vm/vm_pageout.h>
 117 #include <vm/swap_pager.h>
 118 #include <vm/vm_extern.h>
 119 #include <vm/vm_zone.h>
 120 #include <vm/vnode_pager.h>
 121
 122 #include <sys/buf2.h>
 123 #include <vm/vm_page2.h>
 124
 125 #ifndef MAX_PAGEOUT_CLUSTER
 126 #define MAX_PAGEOUT_CLUSTER     SWB_NPAGES
 127 #endif
 128
 129 #define SWM_FREE        0x02    /* free, period                 */
 130 #define SWM_POP         0x04    /* pop out                      */
 131
 132 #define SWBIO_READ      0x01
 133 #define SWBIO_WRITE     0x02
 134 #define SWBIO_SYNC      0x04
 135 #define SWBIO_TTC       0x08    /* for VM_PAGER_TRY_TO_CACHE */
 136
 137 struct swfreeinfo {
 138         vm_object_t     object;
 139         vm_pindex_t     basei;
 140         vm_pindex_t     begi;
 141         vm_pindex_t     endi;   /* inclusive */
 142 };
 143
 144 struct swswapoffinfo {
 145         vm_object_t     object;
 146         int             devidx;
 147         int             shared;
 148 };
 149
 150 /*
 151  * vm_swap_size is in page-sized chunks now.  It was DEV_BSIZE'd chunks
 152  * in the old system.
 153  */
 154
 155 int swap_pager_full;            /* swap space exhaustion (task killing) */
 156 int swap_fail_ticks;            /* when we became exhausted */
 157 int swap_pager_almost_full;     /* swap space exhaustion (w/ hysteresis)*/
 158 swblk_t vm_swap_cache_use;
 159 swblk_t vm_swap_anon_use;
 160 static int vm_report_swap_allocs;
 161
 162 static struct krate kswaprate = { 1 };
 163 static int nsw_rcount;          /* free read buffers                    */
 164 static int nsw_wcount_sync;     /* limit write buffers / synchronous    */
 165 static int nsw_wcount_async;    /* limit write buffers / asynchronous   */
 166 static int nsw_wcount_async_max;/* assigned maximum                     */
 167 static int nsw_cluster_max;     /* maximum VOP I/O allowed              */
 168
 169 struct blist *swapblist;
 170 static int swap_async_max = 4;  /* maximum in-progress async I/O's      */
 171 static int swap_burst_read = 0; /* allow burst reading */
 172 static swblk_t swapiterator;    /* linearize allocations */
 173 int swap_user_async = 0;        /* user swap pager operation can be async */
 174
 175 static struct spinlock swapbp_spin = SPINLOCK_INITIALIZER(&swapbp_spin, "swapbp_spin");
 176
 177 /* from vm_swap.c */
 178 extern struct vnode *swapdev_vp;
 179 extern struct swdevt *swdevt;
 180 extern int nswdev;
 181
 182 #define BLK2DEVIDX(blk) (nswdev > 1 ? blk / SWB_DMMAX % nswdev : 0)
 183
 184 SYSCTL_INT(_vm, OID_AUTO, swap_async_max,
 185         CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops");
 186 SYSCTL_INT(_vm, OID_AUTO, swap_burst_read,
 187         CTLFLAG_RW, &swap_burst_read, 0, "Allow burst reads for pageins");
 188 SYSCTL_INT(_vm, OID_AUTO, swap_user_async,
 189         CTLFLAG_RW, &swap_user_async, 0, "Allow async uuser swap write I/O");
 190
 191 #if SWBLK_BITS == 64
 192 SYSCTL_LONG(_vm, OID_AUTO, swap_cache_use,
 193         CTLFLAG_RD, &vm_swap_cache_use, 0, "");
 194 SYSCTL_LONG(_vm, OID_AUTO, swap_anon_use,
 195         CTLFLAG_RD, &vm_swap_anon_use, 0, "");
 196 SYSCTL_LONG(_vm, OID_AUTO, swap_size,
 197         CTLFLAG_RD, &vm_swap_size, 0, "");
 198 #else
 199 SYSCTL_INT(_vm, OID_AUTO, swap_cache_use,
 200         CTLFLAG_RD, &vm_swap_cache_use, 0, "");
 201 SYSCTL_INT(_vm, OID_AUTO, swap_anon_use,
 202         CTLFLAG_RD, &vm_swap_anon_use, 0, "");
 203 SYSCTL_INT(_vm, OID_AUTO, swap_size,
 204         CTLFLAG_RD, &vm_swap_size, 0, "");
 205 #endif
 206 SYSCTL_INT(_vm, OID_AUTO, report_swap_allocs,
 207         CTLFLAG_RW, &vm_report_swap_allocs, 0, "");
 208
 209 __read_mostly vm_zone_t swap_zone;
 210
 211 /*
 212  * Red-Black tree for swblock entries
 213  *
 214  * The caller must hold vm_token
 215  */
 216 RB_GENERATE2(swblock_rb_tree, swblock, swb_entry, rb_swblock_compare,
 217              vm_pindex_t, swb_index);
 218
 219 int
 220 rb_swblock_compare(struct swblock *swb1, struct swblock *swb2)
 221 {
 222         if (swb1->swb_index < swb2->swb_index)
 223                 return(-1);
 224         if (swb1->swb_index > swb2->swb_index)
 225                 return(1);
 226         return(0);
 227 }
 228
 229 static
 230 int
 231 rb_swblock_scancmp(struct swblock *swb, void *data)
 232 {
 233         struct swfreeinfo *info = data;
 234
 235         if (swb->swb_index < info->basei)
 236                 return(-1);
 237         if (swb->swb_index > info->endi)
 238                 return(1);
 239         return(0);
 240 }
 241
 242 static
 243 int
 244 rb_swblock_condcmp(struct swblock *swb, void *data)
 245 {
 246         struct swfreeinfo *info = data;
 247
 248         if (swb->swb_index < info->basei)
 249                 return(-1);
 250         return(0);
 251 }
 252
 253 /*
 254  * pagerops for OBJT_SWAP - "swap pager".  Some ops are also global procedure
 255  * calls hooked from other parts of the VM system and do not appear here.
 256  * (see vm/swap_pager.h).
 257  */
 258
 259 static void     swap_pager_dealloc (vm_object_t object);
 260 static int      swap_pager_getpage (vm_object_t, vm_page_t *, int);
 261 static void     swap_chain_iodone(struct bio *biox);
 262
 263 struct pagerops swappagerops = {
 264         swap_pager_dealloc,     /* deallocate an OBJT_SWAP object       */
 265         swap_pager_getpage,     /* pagein                               */
 266         swap_pager_putpages,    /* pageout                              */
 267         swap_pager_haspage      /* get backing store status for page    */
 268 };
 269
 270 /*
 271  * SWB_DMMAX is in page-sized chunks with the new swap system.  It was
 272  * dev-bsized chunks in the old.  SWB_DMMAX is always a power of 2.
 273  *
 274  * swap_*() routines are externally accessible.  swp_*() routines are
 275  * internal.
 276  */
 277
 278 int nswap_lowat = 128;          /* in pages, swap_pager_almost_full warn */
 279 int nswap_hiwat = 512;          /* in pages, swap_pager_almost_full warn */
 280
 281 static __inline void    swp_sizecheck (void);
 282 static void     swp_pager_async_iodone (struct bio *bio);
 283
 284 /*
 285  * Swap bitmap functions
 286  */
 287
 288 static __inline void    swp_pager_freeswapspace(vm_object_t object,
 289                                                 swblk_t blk, int npages);
 290 static __inline swblk_t swp_pager_getswapspace(vm_object_t object, int npages);
 291
 292 /*
 293  * Metadata functions
 294  */
 295
 296 static void swp_pager_meta_convert(vm_object_t);
 297 static void swp_pager_meta_build(vm_object_t, vm_pindex_t, swblk_t);
 298 static void swp_pager_meta_free(vm_object_t, vm_pindex_t, vm_pindex_t);
 299 static void swp_pager_meta_free_all(vm_object_t);
 300 static swblk_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int);
 301
 302 /*
 303  * SWP_SIZECHECK() -    update swap_pager_full indication
 304  *
 305  *      update the swap_pager_almost_full indication and warn when we are
 306  *      about to run out of swap space, using lowat/hiwat hysteresis.
 307  *
 308  *      Clear swap_pager_full ( task killing ) indication when lowat is met.
 309  *
 310  * No restrictions on call
 311  * This routine may not block.
 312  * SMP races are ok.
 313  */
 314 static __inline void
 315 swp_sizecheck(void)
 316 {
 317         if (vm_swap_size < nswap_lowat) {
 318                 if (swap_pager_almost_full == 0) {
 319                         kprintf("swap_pager: out of swap space\n");
 320                         swap_pager_almost_full = 1;
 321                         swap_fail_ticks = ticks;
 322                 }
 323         } else {
 324                 swap_pager_full = 0;
 325                 if (vm_swap_size > nswap_hiwat)
 326                         swap_pager_almost_full = 0;
 327         }
 328 }
 329
 330 /*
 331  * Long-term data collection on 10-second interval.  Return the value
 332  * for KCOLLECT_SWAPPCT and set the values for SWAPANO and SWAPCCAC.
 333  *
 334  * Return total swap in the scale field.  This can change if swap is
 335  * regularly added or removed and may cause some historical confusion
 336  * in that case, but SWAPPCT will always be historically accurate.
 337  */
 338
 339 #define PTOB(value)     ((uint64_t)(value) << PAGE_SHIFT)
 340
 341 static uint64_t
 342 collect_swap_callback(int n)
 343 {
 344         uint64_t total = vm_swap_max;
 345         uint64_t anon = vm_swap_anon_use;
 346         uint64_t cache = vm_swap_cache_use;
 347
 348         if (total == 0)         /* avoid divide by zero */
 349                 total = 1;
 350         kcollect_setvalue(KCOLLECT_SWAPANO, PTOB(anon));
 351         kcollect_setvalue(KCOLLECT_SWAPCAC, PTOB(cache));
 352         kcollect_setscale(KCOLLECT_SWAPANO,
 353                           KCOLLECT_SCALE(KCOLLECT_SWAPANO_FORMAT, PTOB(total)));
 354         kcollect_setscale(KCOLLECT_SWAPCAC,
 355                           KCOLLECT_SCALE(KCOLLECT_SWAPCAC_FORMAT, PTOB(total)));
 356         return (((anon + cache) * 10000 + (total >> 1)) / total);
 357 }
 358
 359 /*
 360  * SWAP_PAGER_INIT() -  initialize the swap pager!
 361  *
 362  *      Expected to be started from system init.  NOTE:  This code is run
 363  *      before much else so be careful what you depend on.  Most of the VM
 364  *      system has yet to be initialized at this point.
 365  *
 366  * Called from the low level boot code only.
 367  */
 368 static void
 369 swap_pager_init(void *arg __unused)
 370 {
 371         kcollect_register(KCOLLECT_SWAPPCT, "swapuse", collect_swap_callback,
 372                           KCOLLECT_SCALE(KCOLLECT_SWAPPCT_FORMAT, 0));
 373         kcollect_register(KCOLLECT_SWAPANO, "swapano", NULL,
 374                           KCOLLECT_SCALE(KCOLLECT_SWAPANO_FORMAT, 0));
 375         kcollect_register(KCOLLECT_SWAPCAC, "swapcac", NULL,
 376                           KCOLLECT_SCALE(KCOLLECT_SWAPCAC_FORMAT, 0));
 377 }
 378 SYSINIT(vm_mem, SI_BOOT1_VM, SI_ORDER_THIRD, swap_pager_init, NULL);
 379
 380 /*
 381  * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process
 382  *
 383  *      Expected to be started from pageout process once, prior to entering
 384  *      its main loop.
 385  *
 386  * Called from the low level boot code only.
 387  */
 388 void
 389 swap_pager_swap_init(void)
 390 {
 391         int n, n2;
 392
 393         /*
 394          * Number of in-transit swap bp operations.  Don't
 395          * exhaust the pbufs completely.  Make sure we
 396          * initialize workable values (0 will work for hysteresis
 397          * but it isn't very efficient).
 398          *
 399          * The nsw_cluster_max is constrained by the number of pages an XIO
 400          * holds, i.e., (MAXPHYS/PAGE_SIZE) and our locally defined
 401          * MAX_PAGEOUT_CLUSTER.   Also be aware that swap ops are
 402          * constrained by the swap device interleave stripe size.
 403          *
 404          * Currently we hardwire nsw_wcount_async to 4.  This limit is
 405          * designed to prevent other I/O from having high latencies due to
 406          * our pageout I/O.  The value 4 works well for one or two active swap
 407          * devices but is probably a little low if you have more.  Even so,
 408          * a higher value would probably generate only a limited improvement
 409          * with three or four active swap devices since the system does not
 410          * typically have to pageout at extreme bandwidths.   We will want
 411          * at least 2 per swap devices, and 4 is a pretty good value if you
 412          * have one NFS swap device due to the command/ack latency over NFS.
 413          * So it all works out pretty well.
 414          */
 415
 416         nsw_cluster_max = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER);
 417
 418         nsw_rcount = (nswbuf_kva + 1) / 2;
 419         nsw_wcount_sync = (nswbuf_kva + 3) / 4;
 420         nsw_wcount_async = 4;
 421         nsw_wcount_async_max = nsw_wcount_async;
 422
 423         /*
 424          * The zone is dynamically allocated so generally size it to
 425          * maxswzone (32MB to 256GB of KVM).  Set a minimum size based
 426          * on physical memory of around 8x (each swblock can hold 16 pages).
 427          *
 428          * With the advent of SSDs (vs HDs) the practical (swap:memory) ratio
 429          * has increased dramatically.
 430          */
 431         n = vmstats.v_page_count / 2;
 432         if (maxswzone && n < maxswzone / sizeof(struct swblock))
 433                 n = maxswzone / sizeof(struct swblock);
 434         n2 = n;
 435
 436         do {
 437                 swap_zone = zinit(
 438                         "SWAPMETA",
 439                         sizeof(struct swblock),
 440                         n,
 441                         ZONE_INTERRUPT);
 442                 if (swap_zone != NULL)
 443                         break;
 444                 /*
 445                  * if the allocation failed, try a zone two thirds the
 446                  * size of the previous attempt.
 447                  */
 448                 n -= ((n + 2) / 3);
 449         } while (n > 0);
 450
 451         if (swap_zone == NULL)
 452                 panic("swap_pager_swap_init: swap_zone == NULL");
 453         if (n2 != n)
 454                 kprintf("Swap zone entries reduced from %d to %d.\n", n2, n);
 455 }
 456
 457 /*
 458  * SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate
 459  *                      its metadata structures.
 460  *
 461  *      This routine is called from the mmap and fork code to create a new
 462  *      OBJT_SWAP object.  We do this by creating an OBJT_DEFAULT object
 463  *      and then converting it with swp_pager_meta_convert().
 464  *
 465  *      We only support unnamed objects.
 466  *
 467  * No restrictions.
 468  */
 469 vm_object_t
 470 swap_pager_alloc(void *handle, off_t size, vm_prot_t prot, off_t offset)
 471 {
 472         vm_object_t object;
 473
 474         KKASSERT(handle == NULL);
 475         object = vm_object_allocate_hold(OBJT_DEFAULT,
 476                                          OFF_TO_IDX(offset + PAGE_MASK + size));
 477         swp_pager_meta_convert(object);
 478         vm_object_drop(object);
 479
 480         return (object);
 481 }
 482
 483 /*
 484  * SWAP_PAGER_DEALLOC() -       remove swap metadata from object
 485  *
 486  *      The swap backing for the object is destroyed.  The code is
 487  *      designed such that we can reinstantiate it later, but this
 488  *      routine is typically called only when the entire object is
 489  *      about to be destroyed.
 490  *
 491  * The object must be locked or unreferenceable.
 492  * No other requirements.
 493  */
 494 static void
 495 swap_pager_dealloc(vm_object_t object)
 496 {
 497         vm_object_hold(object);
 498         vm_object_pip_wait(object, "swpdea");
 499
 500         /*
 501          * Free all remaining metadata.  We only bother to free it from
 502          * the swap meta data.  We do not attempt to free swapblk's still
 503          * associated with vm_page_t's for this object.  We do not care
 504          * if paging is still in progress on some objects.
 505          */
 506         swp_pager_meta_free_all(object);
 507         vm_object_drop(object);
 508 }
 509
 510 /************************************************************************
 511  *                      SWAP PAGER BITMAP ROUTINES                      *
 512  ************************************************************************/
 513
 514 /*
 515  * SWP_PAGER_GETSWAPSPACE() -   allocate raw swap space
 516  *
 517  *      Allocate swap for the requested number of pages.  The starting
 518  *      swap block number (a page index) is returned or SWAPBLK_NONE
 519  *      if the allocation failed.
 520  *
 521  *      Also has the side effect of advising that somebody made a mistake
 522  *      when they configured swap and didn't configure enough.
 523  *
 524  * The caller must hold the object.
 525  * This routine may not block.
 526  */
 527 static __inline swblk_t
 528 swp_pager_getswapspace(vm_object_t object, int npages)
 529 {
 530         swblk_t blk;
 531
 532         lwkt_gettoken(&vm_token);
 533         blk = blist_allocat(swapblist, npages, swapiterator);
 534         if (blk == SWAPBLK_NONE)
 535                 blk = blist_allocat(swapblist, npages, 0);
 536         if (blk == SWAPBLK_NONE) {
 537                 if (swap_pager_full != 2) {
 538                         if (vm_swap_max == 0) {
 539                                 krateprintf(&kswaprate,
 540                                         "Warning: The system would like to "
 541                                         "page to swap but no swap space "
 542                                         "is configured!\n");
 543                         } else {
 544                                 krateprintf(&kswaprate,
 545                                         "swap_pager_getswapspace: "
 546                                         "swap full allocating %d pages\n",
 547                                         npages);
 548                         }
 549                         swap_pager_full = 2;
 550                         if (swap_pager_almost_full == 0)
 551                                 swap_fail_ticks = ticks;
 552                         swap_pager_almost_full = 1;
 553                 }
 554         } else {
 555                 /* swapiterator = blk; disable for now, doesn't work well */
 556                 swapacctspace(blk, -npages);
 557                 if (object->type == OBJT_SWAP)
 558                         vm_swap_anon_use += npages;
 559                 else
 560                         vm_swap_cache_use += npages;
 561                 swp_sizecheck();
 562         }
 563         lwkt_reltoken(&vm_token);
 564         return(blk);
 565 }
 566
 567 /*
 568  * SWP_PAGER_FREESWAPSPACE() -  free raw swap space
 569  *
 570  *      This routine returns the specified swap blocks back to the bitmap.
 571  *
 572  *      Note:  This routine may not block (it could in the old swap code),
 573  *      and through the use of the new blist routines it does not block.
 574  *
 575  * This routine may not block.
 576  */
 577
 578 static __inline void
 579 swp_pager_freeswapspace(vm_object_t object, swblk_t blk, int npages)
 580 {
 581         struct swdevt *sp = &swdevt[BLK2DEVIDX(blk)];
 582
 583         lwkt_gettoken(&vm_token);
 584         sp->sw_nused -= npages;
 585         if (object->type == OBJT_SWAP)
 586                 vm_swap_anon_use -= npages;
 587         else
 588                 vm_swap_cache_use -= npages;
 589
 590         if (sp->sw_flags & SW_CLOSING) {
 591                 lwkt_reltoken(&vm_token);
 592                 return;
 593         }
 594
 595         blist_free(swapblist, blk, npages);
 596         vm_swap_size += npages;
 597         swp_sizecheck();
 598         lwkt_reltoken(&vm_token);
 599 }
 600
 601 /*
 602  * SWAP_PAGER_FREESPACE() -     frees swap blocks associated with a page
 603  *                              range within an object.
 604  *
 605  *      This is a globally accessible routine.
 606  *
 607  *      This routine removes swapblk assignments from swap metadata.
 608  *
 609  *      The external callers of this routine typically have already destroyed
 610  *      or renamed vm_page_t's associated with this range in the object so
 611  *      we should be ok.
 612  *
 613  * No requirements.
 614  */
 615 void
 616 swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_pindex_t size)
 617 {
 618         vm_object_hold(object);
 619         swp_pager_meta_free(object, start, size);
 620         vm_object_drop(object);
 621 }
 622
 623 /*
 624  * No requirements.
 625  */
 626 void
 627 swap_pager_freespace_all(vm_object_t object)
 628 {
 629         vm_object_hold(object);
 630         swp_pager_meta_free_all(object);
 631         vm_object_drop(object);
 632 }
 633
 634 /*
 635  * This function conditionally frees swap cache swap starting at
 636  * (*basei) in the object.  (count) swap blocks will be nominally freed.
 637  * The actual number of blocks freed can be more or less than the
 638  * requested number.
 639  *
 640  * This function nominally returns the number of blocks freed.  However,
 641  * the actual number of blocks freed may be less then the returned value.
 642  * If the function is unable to exhaust the object or if it is able to
 643  * free (approximately) the requested number of blocks it returns
 644  * a value n > count.
 645  *
 646  * If we exhaust the object we will return a value n <= count.
 647  *
 648  * The caller must hold the object.
 649  *
 650  * WARNING!  If count == 0 then -1 can be returned as a degenerate case,
 651  *           callers should always pass a count value > 0.
 652  */
 653 static int swap_pager_condfree_callback(struct swblock *swap, void *data);
 654
 655 int
 656 swap_pager_condfree(vm_object_t object, vm_pindex_t *basei, int count)
 657 {
 658         struct swfreeinfo info;
 659         int n;
 660         int t;
 661
 662         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
 663
 664         info.object = object;
 665         info.basei = *basei;    /* skip up to this page index */
 666         info.begi = count;      /* max swap pages to destroy */
 667         info.endi = count * 8;  /* max swblocks to scan */
 668
 669         swblock_rb_tree_RB_SCAN(&object->swblock_root, rb_swblock_condcmp,
 670                                 swap_pager_condfree_callback, &info);
 671         *basei = info.basei;
 672
 673         /*
 674          * Take the higher difference swblocks vs pages
 675          */
 676         n = count - (int)info.begi;
 677         t = count * 8 - (int)info.endi;
 678         if (n < t)
 679                 n = t;
 680         if (n < 1)
 681                 n = 1;
 682         return(n);
 683 }
 684
 685 /*
 686  * The idea is to free whole meta-block to avoid fragmenting
 687  * the swap space or disk I/O.  We only do this if NO VM pages
 688  * are present.
 689  *
 690  * We do not have to deal with clearing PG_SWAPPED in related VM
 691  * pages because there are no related VM pages.
 692  *
 693  * The caller must hold the object.
 694  */
 695 static int
 696 swap_pager_condfree_callback(struct swblock *swap, void *data)
 697 {
 698         struct swfreeinfo *info = data;
 699         vm_object_t object = info->object;
 700         int i;
 701
 702         for (i = 0; i < SWAP_META_PAGES; ++i) {
 703                 if (vm_page_lookup(object, swap->swb_index + i))
 704                         break;
 705         }
 706         info->basei = swap->swb_index + SWAP_META_PAGES;
 707         if (i == SWAP_META_PAGES) {
 708                 info->begi -= swap->swb_count;
 709                 swap_pager_freespace(object, swap->swb_index, SWAP_META_PAGES);
 710         }
 711         --info->endi;
 712         if ((int)info->begi < 0 || (int)info->endi < 0)
 713                 return(-1);
 714         lwkt_yield();
 715         return(0);
 716 }
 717
 718 /*
 719  * Called by vm_page_alloc() when a new VM page is inserted
 720  * into a VM object.  Checks whether swap has been assigned to
 721  * the page and sets PG_SWAPPED as necessary.
 722  *
 723  * (m) must be busied by caller and remains busied on return.
 724  */
 725 void
 726 swap_pager_page_inserted(vm_page_t m)
 727 {
 728         if (m->object->swblock_count) {
 729                 vm_object_hold(m->object);
 730                 if (swp_pager_meta_ctl(m->object, m->pindex, 0) != SWAPBLK_NONE)
 731                         vm_page_flag_set(m, PG_SWAPPED);
 732                 vm_object_drop(m->object);
 733         }
 734 }
 735
 736 /*
 737  * SWAP_PAGER_RESERVE() - reserve swap blocks in object
 738  *
 739  *      Assigns swap blocks to the specified range within the object.  The
 740  *      swap blocks are not zerod.  Any previous swap assignment is destroyed.
 741  *
 742  *      Returns 0 on success, -1 on failure.
 743  *
 744  * The caller is responsible for avoiding races in the specified range.
 745  * No other requirements.
 746  */
 747 int
 748 swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size)
 749 {
 750         int n = 0;
 751         swblk_t blk = SWAPBLK_NONE;
 752         vm_pindex_t beg = start;        /* save start index */
 753
 754         vm_object_hold(object);
 755
 756         while (size) {
 757                 if (n == 0) {
 758                         n = BLIST_MAX_ALLOC;
 759                         while ((blk = swp_pager_getswapspace(object, n)) ==
 760                                SWAPBLK_NONE)
 761                         {
 762                                 n >>= 1;
 763                                 if (n == 0) {
 764                                         swp_pager_meta_free(object, beg,
 765                                                             start - beg);
 766                                         vm_object_drop(object);
 767                                         return(-1);
 768                                 }
 769                         }
 770                 }
 771                 swp_pager_meta_build(object, start, blk);
 772                 --size;
 773                 ++start;
 774                 ++blk;
 775                 --n;
 776         }
 777         swp_pager_meta_free(object, start, n);
 778         vm_object_drop(object);
 779         return(0);
 780 }
 781
 782 /*
 783  * SWAP_PAGER_COPY() -  copy blocks from source pager to destination pager
 784  *                      and destroy the source.
 785  *
 786  *      Copy any valid swapblks from the source to the destination.  In
 787  *      cases where both the source and destination have a valid swapblk,
 788  *      we keep the destination's.
 789  *
 790  *      This routine is allowed to block.  It may block allocating metadata
 791  *      indirectly through swp_pager_meta_build() or if paging is still in
 792  *      progress on the source.
 793  *
 794  *      XXX vm_page_collapse() kinda expects us not to block because we
 795  *      supposedly do not need to allocate memory, but for the moment we
 796  *      *may* have to get a little memory from the zone allocator, but
 797  *      it is taken from the interrupt memory.  We should be ok.
 798  *
 799  *      The source object contains no vm_page_t's (which is just as well)
 800  *      The source object is of type OBJT_SWAP.
 801  *
 802  *      The source and destination objects must be held by the caller.
 803  */
 804 void
 805 swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject,
 806                 vm_pindex_t base_index, int destroysource)
 807 {
 808         vm_pindex_t i;
 809
 810         ASSERT_LWKT_TOKEN_HELD(vm_object_token(srcobject));
 811         ASSERT_LWKT_TOKEN_HELD(vm_object_token(dstobject));
 812
 813         /*
 814          * transfer source to destination.
 815          */
 816         for (i = 0; i < dstobject->size; ++i) {
 817                 swblk_t dstaddr;
 818
 819                 /*
 820                  * Locate (without changing) the swapblk on the destination,
 821                  * unless it is invalid in which case free it silently, or
 822                  * if the destination is a resident page, in which case the
 823                  * source is thrown away.
 824                  */
 825                 dstaddr = swp_pager_meta_ctl(dstobject, i, 0);
 826
 827                 if (dstaddr == SWAPBLK_NONE) {
 828                         /*
 829                          * Destination has no swapblk and is not resident,
 830                          * copy source.
 831                          */
 832                         swblk_t srcaddr;
 833
 834                         srcaddr = swp_pager_meta_ctl(srcobject,
 835                                                      base_index + i, SWM_POP);
 836
 837                         if (srcaddr != SWAPBLK_NONE)
 838                                 swp_pager_meta_build(dstobject, i, srcaddr);
 839                 } else {
 840                         /*
 841                          * Destination has valid swapblk or it is represented
 842                          * by a resident page.  We destroy the sourceblock.
 843                          */
 844                         swp_pager_meta_ctl(srcobject, base_index + i, SWM_FREE);
 845                 }
 846         }
 847
 848         /*
 849          * Free left over swap blocks in source.
 850          *
 851          * We have to revert the type to OBJT_DEFAULT so we do not accidently
 852          * double-remove the object from the swap queues.
 853          */
 854         if (destroysource) {
 855                 /*
 856                  * Reverting the type is not necessary, the caller is going
 857                  * to destroy srcobject directly, but I'm doing it here
 858                  * for consistency since we've removed the object from its
 859                  * queues.
 860                  */
 861                 swp_pager_meta_free_all(srcobject);
 862                 if (srcobject->type == OBJT_SWAP)
 863                         srcobject->type = OBJT_DEFAULT;
 864         }
 865 }
 866
 867 /*
 868  * SWAP_PAGER_HASPAGE() -       determine if we have good backing store for
 869  *                              the requested page.
 870  *
 871  *      We determine whether good backing store exists for the requested
 872  *      page and return TRUE if it does, FALSE if it doesn't.
 873  *
 874  *      If TRUE, we also try to determine how much valid, contiguous backing
 875  *      store exists before and after the requested page within a reasonable
 876  *      distance.  We do not try to restrict it to the swap device stripe
 877  *      (that is handled in getpages/putpages).  It probably isn't worth
 878  *      doing here.
 879  *
 880  * No requirements.
 881  */
 882 boolean_t
 883 swap_pager_haspage(vm_object_t object, vm_pindex_t pindex)
 884 {
 885         swblk_t blk0;
 886
 887         /*
 888          * do we have good backing store at the requested index ?
 889          */
 890         vm_object_hold(object);
 891         blk0 = swp_pager_meta_ctl(object, pindex, 0);
 892
 893         if (blk0 == SWAPBLK_NONE) {
 894                 vm_object_drop(object);
 895                 return (FALSE);
 896         }
 897         vm_object_drop(object);
 898         return (TRUE);
 899 }
 900
 901 /*
 902  * Object must be held exclusive or shared by the caller.
 903  */
 904 boolean_t
 905 swap_pager_haspage_locked(vm_object_t object, vm_pindex_t pindex)
 906 {
 907         if (swp_pager_meta_ctl(object, pindex, 0) == SWAPBLK_NONE)
 908                 return FALSE;
 909         return TRUE;
 910 }
 911
 912 /*
 913  * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page
 914  *
 915  * This removes any associated swap backing store, whether valid or
 916  * not, from the page.  This operates on any VM object, not just OBJT_SWAP
 917  * objects.
 918  *
 919  * This routine is typically called when a page is made dirty, at
 920  * which point any associated swap can be freed.  MADV_FREE also
 921  * calls us in a special-case situation
 922  *
 923  * NOTE!!!  If the page is clean and the swap was valid, the caller
 924  *          should make the page dirty before calling this routine.
 925  *          This routine does NOT change the m->dirty status of the page.
 926  *          Also: MADV_FREE depends on it.
 927  *
 928  * The page must be busied.
 929  * The caller can hold the object to avoid blocking, else we might block.
 930  * No other requirements.
 931  */
 932 void
 933 swap_pager_unswapped(vm_page_t m)
 934 {
 935         if (m->flags & PG_SWAPPED) {
 936                 vm_object_hold(m->object);
 937                 KKASSERT(m->flags & PG_SWAPPED);
 938                 swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE);
 939                 vm_page_flag_clear(m, PG_SWAPPED);
 940                 vm_object_drop(m->object);
 941         }
 942 }
 943
 944 /*
 945  * SWAP_PAGER_STRATEGY() - read, write, free blocks
 946  *
 947  * This implements a VM OBJECT strategy function using swap backing store.
 948  * This can operate on any VM OBJECT type, not necessarily just OBJT_SWAP
 949  * types.  Only BUF_CMD_{READ,WRITE,FREEBLKS} is supported, any other
 950  * requests will return EINVAL.
 951  *
 952  * This is intended to be a cacheless interface (i.e. caching occurs at
 953  * higher levels), and is also used as a swap-based SSD cache for vnode
 954  * and device objects.
 955  *
 956  * All I/O goes directly to and from the swap device.
 957  *
 958  * We currently attempt to run I/O synchronously or asynchronously as
 959  * the caller requests.  This isn't perfect because we loose error
 960  * sequencing when we run multiple ops in parallel to satisfy a request.
 961  * But this is swap, so we let it all hang out.
 962  *
 963  * NOTE: This function supports the KVABIO API wherein bp->b_data might
 964  *       not be synchronized to the current cpu.
 965  *
 966  * No requirements.
 967  */
 968 void
 969 swap_pager_strategy(vm_object_t object, struct bio *bio)
 970 {
 971         struct buf *bp = bio->bio_buf;
 972         struct bio *nbio;
 973         vm_pindex_t start;
 974         vm_pindex_t biox_blkno = 0;
 975         int count;
 976         char *data;
 977         struct bio *biox;
 978         struct buf *bufx;
 979 #if 0
 980         struct bio_track *track;
 981 #endif
 982
 983 #if 0
 984         /*
 985          * tracking for swapdev vnode I/Os
 986          */
 987         if (bp->b_cmd == BUF_CMD_READ)
 988                 track = &swapdev_vp->v_track_read;
 989         else
 990                 track = &swapdev_vp->v_track_write;
 991 #endif
 992
 993         /*
 994          * Only supported commands
 995          */
 996         if (bp->b_cmd != BUF_CMD_FREEBLKS &&
 997             bp->b_cmd != BUF_CMD_READ &&
 998             bp->b_cmd != BUF_CMD_WRITE) {
 999                 bp->b_error = EINVAL;
1000                 bp->b_flags |= B_ERROR | B_INVAL;
1001                 biodone(bio);
1002                 return;
1003         }
1004
1005         /*
1006          * bcount must be an integral number of pages.
1007          */
1008         if (bp->b_bcount & PAGE_MASK) {
1009                 bp->b_error = EINVAL;
1010                 bp->b_flags |= B_ERROR | B_INVAL;
1011                 biodone(bio);
1012                 kprintf("swap_pager_strategy: bp %p offset %lld size %d, "
1013                         "not page bounded\n",
1014                         bp, (long long)bio->bio_offset, (int)bp->b_bcount);
1015                 return;
1016         }
1017
1018         /*
1019          * Clear error indication, initialize page index, count, data pointer.
1020          */
1021         bp->b_error = 0;
1022         bp->b_flags &= ~B_ERROR;
1023         bp->b_resid = bp->b_bcount;
1024
1025         start = (vm_pindex_t)(bio->bio_offset >> PAGE_SHIFT);
1026         count = howmany(bp->b_bcount, PAGE_SIZE);
1027
1028         /*
1029          * WARNING!  Do not dereference *data without issuing a bkvasync()
1030          */
1031         data = bp->b_data;
1032
1033         /*
1034          * Deal with BUF_CMD_FREEBLKS
1035          */
1036         if (bp->b_cmd == BUF_CMD_FREEBLKS) {
1037                 /*
1038                  * FREE PAGE(s) - destroy underlying swap that is no longer
1039                  *                needed.
1040                  */
1041                 vm_object_hold(object);
1042                 swp_pager_meta_free(object, start, count);
1043                 vm_object_drop(object);
1044                 bp->b_resid = 0;
1045                 biodone(bio);
1046                 return;
1047         }
1048
1049         /*
1050          * We need to be able to create a new cluster of I/O's.  We cannot
1051          * use the caller fields of the passed bio so push a new one.
1052          *
1053          * Because nbio is just a placeholder for the cluster links,
1054          * we can biodone() the original bio instead of nbio to make
1055          * things a bit more efficient.
1056          */
1057         nbio = push_bio(bio);
1058         nbio->bio_offset = bio->bio_offset;
1059         nbio->bio_caller_info1.cluster_head = NULL;
1060         nbio->bio_caller_info2.cluster_tail = NULL;
1061
1062         biox = NULL;
1063         bufx = NULL;
1064
1065         /*
1066          * Execute read or write
1067          */
1068         vm_object_hold(object);
1069
1070         while (count > 0) {
1071                 swblk_t blk;
1072
1073                 /*
1074                  * Obtain block.  If block not found and writing, allocate a
1075                  * new block and build it into the object.
1076                  */
1077                 blk = swp_pager_meta_ctl(object, start, 0);
1078                 if ((blk == SWAPBLK_NONE) && bp->b_cmd == BUF_CMD_WRITE) {
1079                         blk = swp_pager_getswapspace(object, 1);
1080                         if (blk == SWAPBLK_NONE) {
1081                                 bp->b_error = ENOMEM;
1082                                 bp->b_flags |= B_ERROR;
1083                                 break;
1084                         }
1085                         swp_pager_meta_build(object, start, blk);
1086                 }
1087
1088                 /*
1089                  * Do we have to flush our current collection?  Yes if:
1090                  *
1091                  *      - no swap block at this index
1092                  *      - swap block is not contiguous
1093                  *      - we cross a physical disk boundry in the
1094                  *        stripe.
1095                  */
1096                 if (biox &&
1097                     (biox_blkno + btoc(bufx->b_bcount) != blk ||
1098                      ((biox_blkno ^ blk) & ~SWB_DMMASK))) {
1099                         switch(bp->b_cmd) {
1100                         case BUF_CMD_READ:
1101                                 ++mycpu->gd_cnt.v_swapin;
1102                                 mycpu->gd_cnt.v_swappgsin +=
1103                                         btoc(bufx->b_bcount);
1104                                 break;
1105                         case BUF_CMD_WRITE:
1106                                 ++mycpu->gd_cnt.v_swapout;
1107                                 mycpu->gd_cnt.v_swappgsout +=
1108                                         btoc(bufx->b_bcount);
1109                                 bufx->b_dirtyend = bufx->b_bcount;
1110                                 break;
1111                         default:
1112                                 /* NOT REACHED */
1113                                 break;
1114                         }
1115
1116                         /*
1117                          * Finished with this buf.
1118                          */
1119                         KKASSERT(bufx->b_bcount != 0);
1120                         if (bufx->b_cmd != BUF_CMD_READ)
1121                                 bufx->b_dirtyend = bufx->b_bcount;
1122                         biox = NULL;
1123                         bufx = NULL;
1124                 }
1125
1126                 /*
1127                  * Add new swapblk to biox, instantiating biox if necessary.
1128                  * Zero-fill reads are able to take a shortcut.
1129                  */
1130                 if (blk == SWAPBLK_NONE) {
1131                         /*
1132                          * We can only get here if we are reading.
1133                          */
1134                         bkvasync(bp);
1135                         bzero(data, PAGE_SIZE);
1136                         bp->b_resid -= PAGE_SIZE;
1137                 } else {
1138                         if (biox == NULL) {
1139                                 /* XXX chain count > 4, wait to <= 4 */
1140
1141                                 bufx = getpbuf(NULL);
1142                                 bufx->b_flags |= B_KVABIO;
1143                                 biox = &bufx->b_bio1;
1144                                 cluster_append(nbio, bufx);
1145                                 bufx->b_cmd = bp->b_cmd;
1146                                 biox->bio_done = swap_chain_iodone;
1147                                 biox->bio_offset = (off_t)blk << PAGE_SHIFT;
1148                                 biox->bio_caller_info1.cluster_parent = nbio;
1149                                 biox_blkno = blk;
1150                                 bufx->b_bcount = 0;
1151                                 bufx->b_data = data;
1152                         }
1153                         bufx->b_bcount += PAGE_SIZE;
1154                 }
1155                 --count;
1156                 ++start;
1157                 data += PAGE_SIZE;
1158         }
1159
1160         vm_object_drop(object);
1161
1162         /*
1163          *  Flush out last buffer
1164          */
1165         if (biox) {
1166                 if (bufx->b_cmd == BUF_CMD_READ) {
1167                         ++mycpu->gd_cnt.v_swapin;
1168                         mycpu->gd_cnt.v_swappgsin += btoc(bufx->b_bcount);
1169                 } else {
1170                         ++mycpu->gd_cnt.v_swapout;
1171                         mycpu->gd_cnt.v_swappgsout += btoc(bufx->b_bcount);
1172                         bufx->b_dirtyend = bufx->b_bcount;
1173                 }
1174                 KKASSERT(bufx->b_bcount);
1175                 if (bufx->b_cmd != BUF_CMD_READ)
1176                         bufx->b_dirtyend = bufx->b_bcount;
1177                 /* biox, bufx = NULL */
1178         }
1179
1180         /*
1181          * Now initiate all the I/O.  Be careful looping on our chain as
1182          * I/O's may complete while we are still initiating them.
1183          *
1184          * If the request is a 100% sparse read no bios will be present
1185          * and we just biodone() the buffer.
1186          */
1187         nbio->bio_caller_info2.cluster_tail = NULL;
1188         bufx = nbio->bio_caller_info1.cluster_head;
1189
1190         if (bufx) {
1191                 while (bufx) {
1192                         biox = &bufx->b_bio1;
1193                         BUF_KERNPROC(bufx);
1194                         bufx = bufx->b_cluster_next;
1195                         vn_strategy(swapdev_vp, biox);
1196                 }
1197         } else {
1198                 biodone(bio);
1199         }
1200
1201         /*
1202          * Completion of the cluster will also call biodone_chain(nbio).
1203          * We never call biodone(nbio) so we don't have to worry about
1204          * setting up a bio_done callback.  It's handled in the sub-IO.
1205          */
1206         /**/
1207 }
1208
1209 /*
1210  * biodone callback
1211  *
1212  * No requirements.
1213  */
1214 static void
1215 swap_chain_iodone(struct bio *biox)
1216 {
1217         struct buf **nextp;
1218         struct buf *bufx;       /* chained sub-buffer */
1219         struct bio *nbio;       /* parent nbio with chain glue */
1220         struct buf *bp;         /* original bp associated with nbio */
1221         int chain_empty;
1222
1223         bufx = biox->bio_buf;
1224         nbio = biox->bio_caller_info1.cluster_parent;
1225         bp = nbio->bio_buf;
1226
1227         /*
1228          * Update the original buffer
1229          */
1230         KKASSERT(bp != NULL);
1231         if (bufx->b_flags & B_ERROR) {
1232                 atomic_set_int(&bufx->b_flags, B_ERROR);
1233                 bp->b_error = bufx->b_error;    /* race ok */
1234         } else if (bufx->b_resid != 0) {
1235                 atomic_set_int(&bufx->b_flags, B_ERROR);
1236                 bp->b_error = EINVAL;           /* race ok */
1237         } else {
1238                 atomic_subtract_int(&bp->b_resid, bufx->b_bcount);
1239         }
1240
1241         /*
1242          * Remove us from the chain.
1243          */
1244         spin_lock(&swapbp_spin);
1245         nextp = &nbio->bio_caller_info1.cluster_head;
1246         while (*nextp != bufx) {
1247                 KKASSERT(*nextp != NULL);
1248                 nextp = &(*nextp)->b_cluster_next;
1249         }
1250         *nextp = bufx->b_cluster_next;
1251         chain_empty = (nbio->bio_caller_info1.cluster_head == NULL);
1252         spin_unlock(&swapbp_spin);
1253
1254         /*
1255          * Clean up bufx.  If the chain is now empty we finish out
1256          * the parent.  Note that we may be racing other completions
1257          * so we must use the chain_empty status from above.
1258          */
1259         if (chain_empty) {
1260                 if (bp->b_resid != 0 && !(bp->b_flags & B_ERROR)) {
1261                         atomic_set_int(&bp->b_flags, B_ERROR);
1262                         bp->b_error = EINVAL;
1263                 }
1264                 biodone_chain(nbio);
1265         }
1266         relpbuf(bufx, NULL);
1267 }
1268
1269 /*
1270  * SWAP_PAGER_GETPAGES() - bring page in from swap
1271  *
1272  * The requested page may have to be brought in from swap.  Calculate the
1273  * swap block and bring in additional pages if possible.  All pages must
1274  * have contiguous swap block assignments and reside in the same object.
1275  *
1276  * The caller has a single vm_object_pip_add() reference prior to
1277  * calling us and we should return with the same.
1278  *
1279  * The caller has BUSY'd the page.  We should return with (*mpp) left busy,
1280  * and any additinal pages unbusied.
1281  *
1282  * If the caller encounters a PG_RAM page it will pass it to us even though
1283  * it may be valid and dirty.  We cannot overwrite the page in this case!
1284  * The case is used to allow us to issue pure read-aheads.
1285  *
1286  * NOTE! XXX This code does not entirely pipeline yet due to the fact that
1287  *       the PG_RAM page is validated at the same time as mreq.  What we
1288  *       really need to do is issue a separate read-ahead pbuf.
1289  *
1290  * No requirements.
1291  */
1292 static int
1293 swap_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess)
1294 {
1295         struct buf *bp;
1296         struct bio *bio;
1297         vm_page_t mreq;
1298         vm_page_t m;
1299         vm_offset_t kva;
1300         swblk_t blk;
1301         int i;
1302         int j;
1303         int raonly;
1304         int error;
1305         u_int32_t busy_count;
1306         vm_page_t marray[XIO_INTERNAL_PAGES];
1307
1308         mreq = *mpp;
1309
1310         vm_object_hold(object);
1311         if (mreq->object != object) {
1312                 panic("swap_pager_getpages: object mismatch %p/%p",
1313                     object,
1314                     mreq->object
1315                 );
1316         }
1317
1318         /*
1319          * We don't want to overwrite a fully valid page as it might be
1320          * dirty.  This case can occur when e.g. vm_fault hits a perfectly
1321          * valid page with PG_RAM set.
1322          *
1323          * In this case we see if the next page is a suitable page-in
1324          * candidate and if it is we issue read-ahead.  PG_RAM will be
1325          * set on the last page of the read-ahead to continue the pipeline.
1326          */
1327         if (mreq->valid == VM_PAGE_BITS_ALL) {
1328                 if (swap_burst_read == 0 || mreq->pindex + 1 >= object->size) {
1329                         vm_object_drop(object);
1330                         return(VM_PAGER_OK);
1331                 }
1332                 blk = swp_pager_meta_ctl(object, mreq->pindex + 1, 0);
1333                 if (blk == SWAPBLK_NONE) {
1334                         vm_object_drop(object);
1335                         return(VM_PAGER_OK);
1336                 }
1337                 m = vm_page_lookup_busy_try(object, mreq->pindex + 1,
1338                                             TRUE, &error);
1339                 if (error) {
1340                         vm_object_drop(object);
1341                         return(VM_PAGER_OK);
1342                 } else if (m == NULL) {
1343                         /*
1344                          * Use VM_ALLOC_QUICK to avoid blocking on cache
1345                          * page reuse.
1346                          */
1347                         m = vm_page_alloc(object, mreq->pindex + 1,
1348                                           VM_ALLOC_QUICK);
1349                         if (m == NULL) {
1350                                 vm_object_drop(object);
1351                                 return(VM_PAGER_OK);
1352                         }
1353                 } else {
1354                         if (m->valid) {
1355                                 vm_page_wakeup(m);
1356                                 vm_object_drop(object);
1357                                 return(VM_PAGER_OK);
1358                         }
1359                         vm_page_unqueue_nowakeup(m);
1360                 }
1361                 /* page is busy */
1362                 mreq = m;
1363                 raonly = 1;
1364         } else {
1365                 raonly = 0;
1366         }
1367
1368         /*
1369          * Try to block-read contiguous pages from swap if sequential,
1370          * otherwise just read one page.  Contiguous pages from swap must
1371          * reside within a single device stripe because the I/O cannot be
1372          * broken up across multiple stripes.
1373          *
1374          * Note that blk and iblk can be SWAPBLK_NONE but the loop is
1375          * set up such that the case(s) are handled implicitly.
1376          */
1377         blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0);
1378         marray[0] = mreq;
1379
1380         for (i = 1; i <= swap_burst_read &&
1381                     i < XIO_INTERNAL_PAGES &&
1382                     mreq->pindex + i < object->size; ++i) {
1383                 swblk_t iblk;
1384
1385                 iblk = swp_pager_meta_ctl(object, mreq->pindex + i, 0);
1386                 if (iblk != blk + i)
1387                         break;
1388                 if ((blk ^ iblk) & ~SWB_DMMASK)
1389                         break;
1390                 m = vm_page_lookup_busy_try(object, mreq->pindex + i,
1391                                             TRUE, &error);
1392                 if (error) {
1393                         break;
1394                 } else if (m == NULL) {
1395                         /*
1396                          * Use VM_ALLOC_QUICK to avoid blocking on cache
1397                          * page reuse.
1398                          */
1399                         m = vm_page_alloc(object, mreq->pindex + i,
1400                                           VM_ALLOC_QUICK);
1401                         if (m == NULL)
1402                                 break;
1403                 } else {
1404                         if (m->valid) {
1405                                 vm_page_wakeup(m);
1406                                 break;
1407                         }
1408                         vm_page_unqueue_nowakeup(m);
1409                 }
1410                 /* page is busy */
1411                 marray[i] = m;
1412         }
1413         if (i > 1)
1414                 vm_page_flag_set(marray[i - 1], PG_RAM);
1415
1416         /*
1417          * If mreq is the requested page and we have nothing to do return
1418          * VM_PAGER_FAIL.  If raonly is set mreq is just another read-ahead
1419          * page and must be cleaned up.
1420          */
1421         if (blk == SWAPBLK_NONE) {
1422                 KKASSERT(i == 1);
1423                 if (raonly) {
1424                         vnode_pager_freepage(mreq);
1425                         vm_object_drop(object);
1426                         return(VM_PAGER_OK);
1427                 } else {
1428                         vm_object_drop(object);
1429                         return(VM_PAGER_FAIL);
1430                 }
1431         }
1432
1433         /*
1434          * Map our page(s) into kva for input
1435          *
1436          * Use the KVABIO API to avoid synchronizing the pmap.
1437          */
1438         bp = getpbuf_kva(&nsw_rcount);
1439         bio = &bp->b_bio1;
1440         kva = (vm_offset_t) bp->b_kvabase;
1441         bcopy(marray, bp->b_xio.xio_pages, i * sizeof(vm_page_t));
1442         pmap_qenter_noinval(kva, bp->b_xio.xio_pages, i);
1443
1444         bp->b_data = (caddr_t)kva;
1445         bp->b_bcount = PAGE_SIZE * i;
1446         bp->b_xio.xio_npages = i;
1447         bp->b_flags |= B_KVABIO;
1448         bio->bio_done = swp_pager_async_iodone;
1449         bio->bio_offset = (off_t)blk << PAGE_SHIFT;
1450         bio->bio_caller_info1.index = SWBIO_READ;
1451
1452         /*
1453          * Set index.  If raonly set the index beyond the array so all
1454          * the pages are treated the same, otherwise the original mreq is
1455          * at index 0.
1456          */
1457         if (raonly)
1458                 bio->bio_driver_info = (void *)(intptr_t)i;
1459         else
1460                 bio->bio_driver_info = (void *)(intptr_t)0;
1461
1462         for (j = 0; j < i; ++j) {
1463                 atomic_set_int(&bp->b_xio.xio_pages[j]->busy_count,
1464                                PBUSY_SWAPINPROG);
1465         }
1466
1467         mycpu->gd_cnt.v_swapin++;
1468         mycpu->gd_cnt.v_swappgsin += bp->b_xio.xio_npages;
1469
1470         /*
1471          * We still hold the lock on mreq, and our automatic completion routine
1472          * does not remove it.
1473          */
1474         vm_object_pip_add(object, bp->b_xio.xio_npages);
1475
1476         /*
1477          * perform the I/O.  NOTE!!!  bp cannot be considered valid after
1478          * this point because we automatically release it on completion.
1479          * Instead, we look at the one page we are interested in which we
1480          * still hold a lock on even through the I/O completion.
1481          *
1482          * The other pages in our m[] array are also released on completion,
1483          * so we cannot assume they are valid anymore either.
1484          */
1485         bp->b_cmd = BUF_CMD_READ;
1486         BUF_KERNPROC(bp);
1487         vn_strategy(swapdev_vp, bio);
1488
1489         /*
1490          * Wait for the page we want to complete.  PBUSY_SWAPINPROG is always
1491          * cleared on completion.  If an I/O error occurs, SWAPBLK_NONE
1492          * is set in the meta-data.
1493          *
1494          * If this is a read-ahead only we return immediately without
1495          * waiting for I/O.
1496          */
1497         if (raonly) {
1498                 vm_object_drop(object);
1499                 return(VM_PAGER_OK);
1500         }
1501
1502         /*
1503          * Read-ahead includes originally requested page case.
1504          */
1505         for (;;) {
1506                 busy_count = mreq->busy_count;
1507                 cpu_ccfence();
1508                 if ((busy_count & PBUSY_SWAPINPROG) == 0)
1509                         break;
1510                 tsleep_interlock(mreq, 0);
1511                 if (!atomic_cmpset_int(&mreq->busy_count, busy_count,
1512                                        busy_count |
1513                                         PBUSY_SWAPINPROG | PBUSY_WANTED)) {
1514                         continue;
1515                 }
1516                 atomic_set_int(&mreq->flags, PG_REFERENCED);
1517                 mycpu->gd_cnt.v_intrans++;
1518                 if (tsleep(mreq, PINTERLOCKED, "swread", hz*20)) {
1519                         kprintf(
1520                             "swap_pager: indefinite wait buffer: "
1521                                 " bp %p offset: %lld, size: %ld\n",
1522                             bp,
1523                             (long long)bio->bio_offset,
1524                             (long)bp->b_bcount
1525                         );
1526                 }
1527         }
1528
1529         /*
1530          * Disallow speculative reads prior to the SWAPINPROG test.
1531          */
1532         cpu_lfence();
1533
1534         /*
1535          * mreq is left busied after completion, but all the other pages
1536          * are freed.  If we had an unrecoverable read error the page will
1537          * not be valid.
1538          */
1539         vm_object_drop(object);
1540         if (mreq->valid != VM_PAGE_BITS_ALL)
1541                 return(VM_PAGER_ERROR);
1542         else
1543                 return(VM_PAGER_OK);
1544
1545         /*
1546          * A final note: in a low swap situation, we cannot deallocate swap
1547          * and mark a page dirty here because the caller is likely to mark
1548          * the page clean when we return, causing the page to possibly revert
1549          * to all-zero's later.
1550          */
1551 }
1552
1553 /*
1554  *      swap_pager_putpages:
1555  *
1556  *      Assign swap (if necessary) and initiate I/O on the specified pages.
1557  *
1558  *      We support both OBJT_DEFAULT and OBJT_SWAP objects.  DEFAULT objects
1559  *      are automatically converted to SWAP objects.
1560  *
1561  *      In a low memory situation we may block in vn_strategy(), but the new
1562  *      vm_page reservation system coupled with properly written VFS devices
1563  *      should ensure that no low-memory deadlock occurs.  This is an area
1564  *      which needs work.
1565  *
1566  *      The parent has N vm_object_pip_add() references prior to
1567  *      calling us and will remove references for rtvals[] that are
1568  *      not set to VM_PAGER_PEND.  We need to remove the rest on I/O
1569  *      completion.
1570  *
1571  *      The parent has soft-busy'd the pages it passes us and will unbusy
1572  *      those whose rtvals[] entry is not set to VM_PAGER_PEND on return.
1573  *      We need to unbusy the rest on I/O completion.
1574  *
1575  * No requirements.
1576  */
1577 void
1578 swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
1579                     int flags, int *rtvals)
1580 {
1581         int i;
1582         int n = 0;
1583
1584         vm_object_hold(object);
1585
1586         if (count && m[0]->object != object) {
1587                 panic("swap_pager_getpages: object mismatch %p/%p",
1588                     object,
1589                     m[0]->object
1590                 );
1591         }
1592
1593         /*
1594          * Step 1
1595          *
1596          * Turn object into OBJT_SWAP
1597          * Check for bogus sysops
1598          *
1599          * Force sync if not pageout process, we don't want any single
1600          * non-pageout process to be able to hog the I/O subsystem!  This
1601          * can be overridden by setting.
1602          */
1603         if (object->type == OBJT_DEFAULT) {
1604                 if (object->type == OBJT_DEFAULT)
1605                         swp_pager_meta_convert(object);
1606         }
1607
1608         /*
1609          * Normally we force synchronous swap I/O if this is not the
1610          * pageout daemon to prevent any single user process limited
1611          * via RLIMIT_RSS from hogging swap write bandwidth.
1612          */
1613         if (curthread != pagethread &&
1614             curthread != emergpager &&
1615             swap_user_async == 0) {
1616                 flags |= VM_PAGER_PUT_SYNC;
1617         }
1618
1619         /*
1620          * Step 2
1621          *
1622          * Update nsw parameters from swap_async_max sysctl values.
1623          * Do not let the sysop crash the machine with bogus numbers.
1624          */
1625         if (swap_async_max != nsw_wcount_async_max) {
1626                 int n;
1627
1628                 /*
1629                  * limit range
1630                  */
1631                 if ((n = swap_async_max) > nswbuf_kva / 2)
1632                         n = nswbuf_kva / 2;
1633                 if (n < 1)
1634                         n = 1;
1635                 swap_async_max = n;
1636
1637                 /*
1638                  * Adjust difference ( if possible ).  If the current async
1639                  * count is too low, we may not be able to make the adjustment
1640                  * at this time.
1641                  *
1642                  * vm_token needed for nsw_wcount sleep interlock
1643                  */
1644                 lwkt_gettoken(&vm_token);
1645                 n -= nsw_wcount_async_max;
1646                 if (nsw_wcount_async + n >= 0) {
1647                         nsw_wcount_async_max += n;
1648                         pbuf_adjcount(&nsw_wcount_async, n);
1649                 }
1650                 lwkt_reltoken(&vm_token);
1651         }
1652
1653         /*
1654          * Step 3
1655          *
1656          * Assign swap blocks and issue I/O.  We reallocate swap on the fly.
1657          * The page is left dirty until the pageout operation completes
1658          * successfully.
1659          */
1660
1661         for (i = 0; i < count; i += n) {
1662                 struct buf *bp;
1663                 struct bio *bio;
1664                 swblk_t blk;
1665                 int j;
1666
1667                 /*
1668                  * Maximum I/O size is limited by a number of factors.
1669                  */
1670
1671                 n = min(BLIST_MAX_ALLOC, count - i);
1672                 n = min(n, nsw_cluster_max);
1673
1674                 lwkt_gettoken(&vm_token);
1675
1676                 /*
1677                  * Get biggest block of swap we can.  If we fail, fall
1678                  * back and try to allocate a smaller block.  Don't go
1679                  * overboard trying to allocate space if it would overly
1680                  * fragment swap.
1681                  */
1682                 while (
1683                     (blk = swp_pager_getswapspace(object, n)) == SWAPBLK_NONE &&
1684                     n > 4
1685                 ) {
1686                         n >>= 1;
1687                 }
1688                 if (blk == SWAPBLK_NONE) {
1689                         for (j = 0; j < n; ++j)
1690                                 rtvals[i+j] = VM_PAGER_FAIL;
1691                         lwkt_reltoken(&vm_token);
1692                         continue;
1693                 }
1694                 if (vm_report_swap_allocs > 0) {
1695                         kprintf("swap_alloc %08jx,%d\n", (intmax_t)blk, n);
1696                         --vm_report_swap_allocs;
1697                 }
1698
1699                 /*
1700                  * The I/O we are constructing cannot cross a physical
1701                  * disk boundry in the swap stripe.
1702                  */
1703                 if ((blk ^ (blk + n)) & ~SWB_DMMASK) {
1704                         j = ((blk + SWB_DMMAX) & ~SWB_DMMASK) - blk;
1705                         swp_pager_freeswapspace(object, blk + j, n - j);
1706                         n = j;
1707                 }
1708
1709                 /*
1710                  * All I/O parameters have been satisfied, build the I/O
1711                  * request and assign the swap space.
1712                  *
1713                  * Use the KVABIO API to avoid synchronizing the pmap.
1714                  */
1715                 if ((flags & VM_PAGER_PUT_SYNC))
1716                         bp = getpbuf_kva(&nsw_wcount_sync);
1717                 else
1718                         bp = getpbuf_kva(&nsw_wcount_async);
1719                 bio = &bp->b_bio1;
1720
1721                 lwkt_reltoken(&vm_token);
1722
1723                 pmap_qenter_noinval((vm_offset_t)bp->b_data, &m[i], n);
1724
1725                 bp->b_flags |= B_KVABIO;
1726                 bp->b_bcount = PAGE_SIZE * n;
1727                 bio->bio_offset = (off_t)blk << PAGE_SHIFT;
1728
1729                 for (j = 0; j < n; ++j) {
1730                         vm_page_t mreq = m[i+j];
1731
1732                         swp_pager_meta_build(mreq->object, mreq->pindex,
1733                                              blk + j);
1734                         if (object->type == OBJT_SWAP)
1735                                 vm_page_dirty(mreq);
1736                         rtvals[i+j] = VM_PAGER_OK;
1737
1738                         atomic_set_int(&mreq->busy_count, PBUSY_SWAPINPROG);
1739                         bp->b_xio.xio_pages[j] = mreq;
1740                 }
1741                 bp->b_xio.xio_npages = n;
1742
1743                 mycpu->gd_cnt.v_swapout++;
1744                 mycpu->gd_cnt.v_swappgsout += bp->b_xio.xio_npages;
1745
1746                 bp->b_dirtyoff = 0;             /* req'd for NFS */
1747                 bp->b_dirtyend = bp->b_bcount;  /* req'd for NFS */
1748                 bp->b_cmd = BUF_CMD_WRITE;
1749                 bio->bio_caller_info1.index = SWBIO_WRITE;
1750
1751                 /*
1752                  * asynchronous
1753                  */
1754                 if ((flags & VM_PAGER_PUT_SYNC) == 0) {
1755                         bio->bio_done = swp_pager_async_iodone;
1756                         BUF_KERNPROC(bp);
1757                         vn_strategy(swapdev_vp, bio);
1758
1759                         for (j = 0; j < n; ++j)
1760                                 rtvals[i+j] = VM_PAGER_PEND;
1761                         continue;
1762                 }
1763
1764                 /*
1765                  * Issue synchrnously.
1766                  *
1767                  * Wait for the sync I/O to complete, then update rtvals.
1768                  * We just set the rtvals[] to VM_PAGER_PEND so we can call
1769                  * our async completion routine at the end, thus avoiding a
1770                  * double-free.
1771                  */
1772                 bio->bio_caller_info1.index |= SWBIO_SYNC;
1773                 if (flags & VM_PAGER_TRY_TO_CACHE)
1774                         bio->bio_caller_info1.index |= SWBIO_TTC;
1775                 bio->bio_done = biodone_sync;
1776                 bio->bio_flags |= BIO_SYNC;
1777                 vn_strategy(swapdev_vp, bio);
1778                 biowait(bio, "swwrt");
1779
1780                 for (j = 0; j < n; ++j)
1781                         rtvals[i+j] = VM_PAGER_PEND;
1782
1783                 /*
1784                  * Now that we are through with the bp, we can call the
1785                  * normal async completion, which frees everything up.
1786                  */
1787                 swp_pager_async_iodone(bio);
1788         }
1789         vm_object_drop(object);
1790 }
1791
1792 /*
1793  * No requirements.
1794  *
1795  * Recalculate the low and high-water marks.
1796  */
1797 void
1798 swap_pager_newswap(void)
1799 {
1800         /*
1801          * NOTE: vm_swap_max cannot exceed 1 billion blocks, which is the
1802          *       limitation imposed by the blist code.  Remember that this
1803          *       will be divided by NSWAP_MAX (4), so each swap device is
1804          *       limited to around a terrabyte.
1805          */
1806         if (vm_swap_max) {
1807                 nswap_lowat = (int64_t)vm_swap_max * 4 / 100;   /* 4% left */
1808                 nswap_hiwat = (int64_t)vm_swap_max * 6 / 100;   /* 6% left */
1809                 kprintf("swap low/high-water marks set to %d/%d\n",
1810                         nswap_lowat, nswap_hiwat);
1811         } else {
1812                 nswap_lowat = 128;
1813                 nswap_hiwat = 512;
1814         }
1815         swp_sizecheck();
1816 }
1817
/*
 *      swp_pager_async_iodone:
 *
 *      Completion routine for asynchronous reads and writes from/to swap.
 *      Also called manually by synchronous code to finish up a bp.
 *
 *      bio - the completed swap bio.  bio_caller_info1.index carries the
 *            SWBIO_* flags (READ/WRITE/SYNC/TTC) and, for reads,
 *            bio_driver_info carries the index of the requested page
 *            within bp->b_xio.xio_pages[].
 *
 *      For READ operations every page except the requested one is
 *      deactivated and released with vm_page_wakeup(); the requested page
 *      only gets a vm_page_flash() and is left for the caller to release.
 *
 *      For WRITE operations each page is busied with vm_page_busy_wait(),
 *      finished with vm_page_io_finish() and then released (we can do this
 *      because the pages were all marked VM_PAGER_PEND on return from
 *      putpages).
 *
 *      On the way out the kernel mapping is removed, the object's pip
 *      count is dropped by the number of pages and the pbuf is returned
 *      to the matching nsw_* pool.
 *
 *      This routine may not block.
 *      NOTE(review): the write paths below call vm_page_busy_wait() and
 *      vm_object_hold(); confirm these cannot block in this context.
 *
 * No requirements.
 */
static void
swp_pager_async_iodone(struct bio *bio)
{
        struct buf *bp = bio->bio_buf;
        vm_object_t object = NULL;
        int i;
        int *nswptr;

        /*
         * report error
         */
        if (bp->b_flags & B_ERROR) {
                kprintf(
                    "swap_pager: I/O error - %s failed; offset %lld,"
                        "size %ld, error %d\n",
                    ((bio->bio_caller_info1.index & SWBIO_READ) ?
                        "pagein" : "pageout"),
                    (long long)bio->bio_offset,
                    (long)bp->b_bcount,
                    bp->b_error
                );
        }

        /*
         * set object.  All pages of the bp are taken to belong to the
         * object of the first page; it stays NULL for an empty bp.
         */
        if (bp->b_xio.xio_npages)
                object = bp->b_xio.xio_pages[0]->object;

#if 0
        /* PMAP TESTING CODE (useful, keep it in but #if 0'd) */
        if (bio->bio_caller_info1.index & SWBIO_WRITE) {
                if (bio->bio_crc != iscsi_crc32(bp->b_data, bp->b_bcount)) {
                        kprintf("SWAPOUT: BADCRC %08x %08x\n",
                                bio->bio_crc,
                                iscsi_crc32(bp->b_data, bp->b_bcount));
                        for (i = 0; i < bp->b_xio.xio_npages; ++i) {
                                vm_page_t m = bp->b_xio.xio_pages[i];
                                if ((m->flags & PG_WRITEABLE) &&
                                    (pmap_mapped_sync(m) & PG_WRITEABLE)) {
                                        kprintf("SWAPOUT: "
                                                "%d/%d %p writable\n",
                                                i, bp->b_xio.xio_npages, m);
                                }
                        }
                }
        }
#endif

        /*
         * remove the mapping for kernel virtual
         */
        pmap_qremove((vm_offset_t)bp->b_data, bp->b_xio.xio_npages);

        /*
         * cleanup pages.  If an error occurs writing to swap, we are in
         * very serious trouble.  If it happens to be a disk error, though,
         * we may be able to recover by reassigning the swap later on.  So
         * in this case we remove the m->swapblk assignment for the page
         * but do not free it in the rlist.  The erroneous block(s) are thus
         * never reallocated as swap.  Redirty the page and continue.
         *
         * Four cases are handled per page: read error, write error,
         * successful read, successful write.
         */
        for (i = 0; i < bp->b_xio.xio_npages; ++i) {
                vm_page_t m = bp->b_xio.xio_pages[i];

                if (bp->b_flags & B_ERROR) {
                        /*
                         * If an error occurs I'd love to throw the swapblk
                         * away without freeing it back to swapspace, so it
                         * can never be used again.  But I can't from an
                         * interrupt.
                         */

                        if (bio->bio_caller_info1.index & SWBIO_READ) {
                                /*
                                 * When reading, the requested page needs
                                 * to stay locked for the parent, but all
                                 * other pages can be released.  We still
                                 * want to wakeup the parent waiting on the
                                 * page, though.  ( also: the requested
                                 * page index can be -1 and not match
                                 * anything ).
                                 *
                                 * We have to wake specifically requested pages
                                 * up too because we cleared SWAPINPROG and
                                 * someone may be waiting for that.
                                 *
                                 * NOTE: For reads, m->dirty will probably
                                 *       be overridden by the original caller
                                 *       of getpages so don't play cute tricks
                                 *       here.
                                 *
                                 * NOTE: We can't actually free the page from
                                 *       here, because this is an interrupt.
                                 *       It is not legal to mess with
                                 *       object->memq from an interrupt.
                                 *       Deactivate the page instead.
                                 *
                                 * WARNING! The instant SWAPINPROG is
                                 *          cleared another cpu may start
                                 *          using the mreq page (it will
                                 *          check m->valid immediately).
                                 *          m->valid must therefore be
                                 *          zeroed before the flag is
                                 *          cleared.
                                 */

                                m->valid = 0;
                                atomic_clear_int(&m->busy_count,
                                                 PBUSY_SWAPINPROG);

                                /*
                                 * bio_driver_info holds the requested page
                                 * index.
                                 */
                                if (i != (int)(intptr_t)bio->bio_driver_info) {
                                        vm_page_deactivate(m);
                                        vm_page_wakeup(m);
                                } else {
                                        vm_page_flash(m);
                                }
                                /*
                                 * If i is the requested page index, do not
                                 * unbusy the page.  The caller needs to.
                                 */
                        } else {
                                /*
                                 * If a write error occurs remove the swap
                                 * assignment (note that PG_SWAPPED may or
                                 * may not be set depending on prior activity).
                                 *
                                 * Re-dirty OBJT_SWAP pages as there is no
                                 * other backing store, we can't throw the
                                 * page away.
                                 *
                                 * Non-OBJT_SWAP pages (aka swapcache) must
                                 * not be dirtied since they may not have
                                 * been dirty in the first place, and they
                                 * do have backing store (the vnode).
                                 */
                                vm_page_busy_wait(m, FALSE, "swadpg");
                                vm_object_hold(m->object);
                                swp_pager_meta_ctl(m->object, m->pindex,
                                                   SWM_FREE);
                                vm_page_flag_clear(m, PG_SWAPPED);
                                vm_object_drop(m->object);
                                if (m->object->type == OBJT_SWAP) {
                                        vm_page_dirty(m);
                                        vm_page_activate(m);
                                }
                                vm_page_io_finish(m);
                                atomic_clear_int(&m->busy_count,
                                                 PBUSY_SWAPINPROG);
                                vm_page_wakeup(m);
                        }
                } else if (bio->bio_caller_info1.index & SWBIO_READ) {
                        /*
                         * NOTE: for reads, m->dirty will probably be
                         * overridden by the original caller of getpages so
                         * we cannot set them in order to free the underlying
                         * swap in a low-swap situation.  I don't think we'd
                         * want to do that anyway, but it was an optimization
                         * that existed in the old swapper for a time before
                         * it got ripped out due to precisely this problem.
                         *
                         * If not the requested page then deactivate it.
                         *
                         * Note that the requested page, reqpage, is left
                         * busied, but we still have to wake it up.  The
                         * other pages are released (unbusied) by
                         * vm_page_wakeup().
                         */

                        /*
                         * NOTE: Can't call pmap_clear_modify(m) from an
                         *       interrupt thread, the pmap code may have to
                         *       map non-kernel pmaps and currently asserts
                         *       the case.
                         *
                         * WARNING! The instant SWAPINPROG is
                         *          cleared another cpu may start
                         *          using the mreq page (it will
                         *          check m->valid immediately).
                         *          The page is therefore marked fully
                         *          valid, clean and PG_SWAPPED before
                         *          the flag is cleared.
                         */
                        /*pmap_clear_modify(m);*/
                        m->valid = VM_PAGE_BITS_ALL;
                        vm_page_undirty(m);
                        vm_page_flag_set(m, PG_SWAPPED);
                        atomic_clear_int(&m->busy_count, PBUSY_SWAPINPROG);

                        /*
                         * We have to wake specifically requested pages
                         * up too because we cleared SWAPINPROG and
                         * could be waiting for it in getpages.  However,
                         * be sure to not unbusy getpages specifically
                         * requested page - getpages expects it to be
                         * left busy.
                         *
                         * bio_driver_info holds the requested page index.
                         */
                        if (i != (int)(intptr_t)bio->bio_driver_info) {
                                vm_page_deactivate(m);
                                vm_page_wakeup(m);
                        } else {
                                vm_page_flash(m);
                        }
                } else {
                        /*
                         * Successful write.
                         *
                         * Mark the page clean but do not mess with the
                         * pmap-layer's modified state.  That state should
                         * also be clear since the caller protected the
                         * page VM_PROT_READ, but allow the case.
                         *
                         * We are in an interrupt, avoid pmap operations.
                         *
                         * If we have a severe page deficit, deactivate the
                         * page.  Do not try to cache it (which would also
                         * involve a pmap op), because the page might still
                         * be read-heavy.
                         *
                         * When using the swap to cache clean vnode pages
                         * we do not mess with the page dirty bits.
                         *
                         * NOTE! Nobody is waiting for the key mreq page
                         *       on write completion.
                         */
                        vm_page_busy_wait(m, FALSE, "swadpg");
                        if (m->object->type == OBJT_SWAP)
                                vm_page_undirty(m);
                        vm_page_flag_set(m, PG_SWAPPED);
                        atomic_clear_int(&m->busy_count, PBUSY_SWAPINPROG);
                        if (vm_page_count_severe())
                                vm_page_deactivate(m);
                        vm_page_io_finish(m);
                        /*
                         * SWBIO_TTC is only set by putpages for sync
                         * writes issued with VM_PAGER_TRY_TO_CACHE.
                         */
                        if (bio->bio_caller_info1.index & SWBIO_TTC)
                                vm_page_try_to_cache(m);
                        else
                                vm_page_wakeup(m);
                }
        }

        /*
         * adjust pip.  NOTE: the original parent may still have its own
         * pip refs on the object.
         */

        if (object)
                vm_object_pip_wakeup_n(object, bp->b_xio.xio_npages);

        /*
         * Release the physical I/O buffer back to the pool it was taken
         * from (read, synchronous write or asynchronous write).
         *
         * NOTE: Due to synchronous operations in the write case b_cmd may
         *       already be set to BUF_CMD_DONE and BIO_SYNC may have already
         *       been cleared.
         *
         * Use vm_token to interlock nsw_rcount/wcount wakeup?
         */
        lwkt_gettoken(&vm_token);
        if (bio->bio_caller_info1.index & SWBIO_READ)
                nswptr = &nsw_rcount;
        else if (bio->bio_caller_info1.index & SWBIO_SYNC)
                nswptr = &nsw_wcount_sync;
        else
                nswptr = &nsw_wcount_async;
        bp->b_cmd = BUF_CMD_DONE;
        relpbuf(bp, nswptr);
        lwkt_reltoken(&vm_token);
}
2100
2101 /*
2102  * Fault-in a potentially swapped page and remove the swap reference.
2103  * (used by swapoff code)
2104  *
2105  * object must be held.
2106  */
2107 static __inline void
2108 swp_pager_fault_page(vm_object_t object, int *sharedp, vm_pindex_t pindex)
2109 {
2110         struct vnode *vp;
2111         vm_page_t m;
2112         int error;
2113
2114         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
2115
2116         if (object->type == OBJT_VNODE) {
2117                 /*
2118                  * Any swap related to a vnode is due to swapcache.  We must
2119                  * vget() the vnode in case it is not active (otherwise
2120                  * vref() will panic).  Calling vm_object_page_remove() will
2121                  * ensure that any swap ref is removed interlocked with the
2122                  * page.  clean_only is set to TRUE so we don't throw away
2123                  * dirty pages.
2124                  */
2125                 vp = object->handle;
2126                 error = vget(vp, LK_SHARED | LK_RETRY | LK_CANRECURSE);
2127                 if (error == 0) {
2128                         vm_object_page_remove(object, pindex, pindex + 1, TRUE);
2129                         vput(vp);
2130                 }
2131         } else {
2132                 /*
2133                  * Otherwise it is a normal OBJT_SWAP object and we can
2134                  * fault the page in and remove the swap.
2135                  */
2136                 m = vm_fault_object_page(object, IDX_TO_OFF(pindex),
2137                                          VM_PROT_NONE,
2138                                          VM_FAULT_DIRTY | VM_FAULT_UNSWAP,
2139                                          sharedp, &error);
2140                 if (m)
2141                         vm_page_unhold(m);
2142         }
2143 }
2144
2145 /*
2146  * This removes all swap blocks related to a particular device.  We have
2147  * to be careful of ripups during the scan.
2148  */
2149 static int swp_pager_swapoff_callback(struct swblock *swap, void *data);
2150
2151 int
2152 swap_pager_swapoff(int devidx)
2153 {
2154         struct vm_object_hash *hash;
2155         struct swswapoffinfo info;
2156         struct vm_object marker;
2157         vm_object_t object;
2158         int n;
2159
2160         bzero(&marker, sizeof(marker));
2161         marker.type = OBJT_MARKER;
2162
2163         for (n = 0; n < VMOBJ_HSIZE; ++n) {
2164                 hash = &vm_object_hash[n];
2165
2166                 lwkt_gettoken(&hash->token);
2167                 TAILQ_INSERT_HEAD(&hash->list, &marker, object_entry);
2168
2169                 while ((object = TAILQ_NEXT(&marker, object_entry)) != NULL) {
2170                         if (object->type == OBJT_MARKER)
2171                                 goto skip;
2172                         if (object->type != OBJT_SWAP &&
2173                             object->type != OBJT_VNODE)
2174                                 goto skip;
2175                         vm_object_hold(object);
2176                         if (object->type != OBJT_SWAP &&
2177                             object->type != OBJT_VNODE) {
2178                                 vm_object_drop(object);
2179                                 goto skip;
2180                         }
2181
2182                         /*
2183                          * Object is special in that we can't just pagein
2184                          * into vm_page's in it (tmpfs, vn).
2185                          */
2186                         if ((object->flags & OBJ_NOPAGEIN) &&
2187                             RB_ROOT(&object->swblock_root)) {
2188                                 vm_object_drop(object);
2189                                 goto skip;
2190                         }
2191
2192                         info.object = object;
2193                         info.shared = 0;
2194                         info.devidx = devidx;
2195                         swblock_rb_tree_RB_SCAN(&object->swblock_root,
2196                                             NULL, swp_pager_swapoff_callback,
2197                                             &info);
2198                         vm_object_drop(object);
2199 skip:
2200                         if (object == TAILQ_NEXT(&marker, object_entry)) {
2201                                 TAILQ_REMOVE(&hash->list, &marker,
2202                                              object_entry);
2203                                 TAILQ_INSERT_AFTER(&hash->list, object,
2204                                                    &marker, object_entry);
2205                         }
2206                 }
2207                 TAILQ_REMOVE(&hash->list, &marker, object_entry);
2208                 lwkt_reltoken(&hash->token);
2209         }
2210
2211         /*
2212          * If we fail to locate all swblocks we just fail gracefully and
2213          * do not bother to restore paging on the swap device.  If the
2214          * user wants to retry the user can retry.
2215          */
2216         if (swdevt[devidx].sw_nused)
2217                 return (1);
2218         else
2219                 return (0);
2220 }
2221
2222 static
2223 int
2224 swp_pager_swapoff_callback(struct swblock *swap, void *data)
2225 {
2226         struct swswapoffinfo *info = data;
2227         vm_object_t object = info->object;
2228         vm_pindex_t index;
2229         swblk_t v;
2230         int i;
2231
2232         index = swap->swb_index;
2233         for (i = 0; i < SWAP_META_PAGES; ++i) {
2234                 /*
2235                  * Make sure we don't race a dying object.  This will
2236                  * kill the scan of the object's swap blocks entirely.
2237                  */
2238                 if (object->flags & OBJ_DEAD)
2239                         return(-1);
2240
2241                 /*
2242                  * Fault the page, which can obviously block.  If the swap
2243                  * structure disappears break out.
2244                  */
2245                 v = swap->swb_pages[i];
2246                 if (v != SWAPBLK_NONE && BLK2DEVIDX(v) == info->devidx) {
2247                         swp_pager_fault_page(object, &info->shared,
2248                                              swap->swb_index + i);
2249                         /* swap ptr might go away */
2250                         if (RB_LOOKUP(swblock_rb_tree,
2251                                       &object->swblock_root, index) != swap) {
2252                                 break;
2253                         }
2254                 }
2255         }
2256         return(0);
2257 }
2258
2259 /************************************************************************
2260  *                              SWAP META DATA                          *
2261  ************************************************************************
2262  *
2263  *      These routines manipulate the swap metadata stored in the
2264  *      OBJT_SWAP object.
2265  *
 *      Swap metadata is kept in a per-object red-black tree of swblock
 *      structures rooted at object->swblock_root.  Each swblock is keyed
 *      by its page index with the SWAP_META_MASK bits cleared.
2269  */
2270
2271 /*
2272  * Lookup the swblock containing the specified swap block index.
2273  *
2274  * The caller must hold the object.
2275  */
2276 static __inline
2277 struct swblock *
2278 swp_pager_lookup(vm_object_t object, vm_pindex_t index)
2279 {
2280         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
2281         index &= ~(vm_pindex_t)SWAP_META_MASK;
2282         return (RB_LOOKUP(swblock_rb_tree, &object->swblock_root, index));
2283 }
2284
2285 /*
2286  * Remove a swblock from the RB tree.
2287  *
2288  * The caller must hold the object.
2289  */
2290 static __inline
2291 void
2292 swp_pager_remove(vm_object_t object, struct swblock *swap)
2293 {
2294         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
2295         RB_REMOVE(swblock_rb_tree, &object->swblock_root, swap);
2296 }
2297
2298 /*
2299  * Convert default object to swap object if necessary
2300  *
2301  * The caller must hold the object.
2302  */
2303 static void
2304 swp_pager_meta_convert(vm_object_t object)
2305 {
2306         if (object->type == OBJT_DEFAULT) {
2307                 object->type = OBJT_SWAP;
2308                 KKASSERT(object->swblock_count == 0);
2309         }
2310 }
2311
2312 /*
2313  * SWP_PAGER_META_BUILD() -     add swap block to swap meta data for object
2314  *
2315  *      We first convert the object to a swap object if it is a default
2316  *      object.  Vnode objects do not need to be converted.
2317  *
2318  *      The specified swapblk is added to the object's swap metadata.  If
2319  *      the swapblk is not valid, it is freed instead.  Any previously
2320  *      assigned swapblk is freed.
2321  *
2322  * The caller must hold the object.
2323  */
2324 static void
2325 swp_pager_meta_build(vm_object_t object, vm_pindex_t index, swblk_t swapblk)
2326 {
2327         struct swblock *swap;
2328         struct swblock *oswap;
2329         vm_pindex_t v;
2330
2331         KKASSERT(swapblk != SWAPBLK_NONE);
2332         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
2333
2334         /*
2335          * Convert object if necessary
2336          */
2337         if (object->type == OBJT_DEFAULT)
2338                 swp_pager_meta_convert(object);
2339
2340         /*
2341          * Locate swblock.  If not found create, but if we aren't adding
2342          * anything just return.  If we run out of space in the map we wait
2343          * and, since the hash table may have changed, retry.
2344          */
2345 retry:
2346         swap = swp_pager_lookup(object, index);
2347
2348         if (swap == NULL) {
2349                 int i;
2350
2351                 swap = zalloc(swap_zone);
2352                 if (swap == NULL) {
2353                         vm_wait(0);
2354                         goto retry;
2355                 }
2356                 swap->swb_index = index & ~(vm_pindex_t)SWAP_META_MASK;
2357                 swap->swb_count = 0;
2358
2359                 ++object->swblock_count;
2360
2361                 for (i = 0; i < SWAP_META_PAGES; ++i)
2362                         swap->swb_pages[i] = SWAPBLK_NONE;
2363                 oswap = RB_INSERT(swblock_rb_tree, &object->swblock_root, swap);
2364                 KKASSERT(oswap == NULL);
2365         }
2366
2367         /*
2368          * Delete prior contents of metadata.
2369          *
2370          * NOTE: Decrement swb_count after the freeing operation (which
2371          *       might block) to prevent racing destruction of the swblock.
2372          */
2373         index &= SWAP_META_MASK;
2374
2375         while ((v = swap->swb_pages[index]) != SWAPBLK_NONE) {
2376                 swap->swb_pages[index] = SWAPBLK_NONE;
2377                 /* can block */
2378                 swp_pager_freeswapspace(object, v, 1);
2379                 --swap->swb_count;
2380                 --mycpu->gd_vmtotal.t_vm;
2381         }
2382
2383         /*
2384          * Enter block into metadata
2385          */
2386         swap->swb_pages[index] = swapblk;
2387         if (swapblk != SWAPBLK_NONE) {
2388                 ++swap->swb_count;
2389                 ++mycpu->gd_vmtotal.t_vm;
2390         }
2391 }
2392
2393 /*
2394  * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata
2395  *
2396  *      The requested range of blocks is freed, with any associated swap
2397  *      returned to the swap bitmap.
2398  *
2399  *      This routine will free swap metadata structures as they are cleaned
2400  *      out.  This routine does *NOT* operate on swap metadata associated
2401  *      with resident pages.
2402  *
2403  * The caller must hold the object.
2404  */
2405 static int swp_pager_meta_free_callback(struct swblock *swb, void *data);
2406
2407 static void
2408 swp_pager_meta_free(vm_object_t object, vm_pindex_t index, vm_pindex_t count)
2409 {
2410         struct swfreeinfo info;
2411
2412         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
2413
2414         /*
2415          * Nothing to do
2416          */
2417         if (object->swblock_count == 0) {
2418                 KKASSERT(RB_EMPTY(&object->swblock_root));
2419                 return;
2420         }
2421         if (count == 0)
2422                 return;
2423
2424         /*
2425          * Setup for RB tree scan.  Note that the pindex range can be huge
2426          * due to the 64 bit page index space so we cannot safely iterate.
2427          */
2428         info.object = object;
2429         info.basei = index & ~(vm_pindex_t)SWAP_META_MASK;
2430         info.begi = index;
2431         info.endi = index + count - 1;
2432         swblock_rb_tree_RB_SCAN(&object->swblock_root, rb_swblock_scancmp,
2433                                 swp_pager_meta_free_callback, &info);
2434 }
2435
2436 /*
2437  * The caller must hold the object.
2438  */
2439 static
2440 int
2441 swp_pager_meta_free_callback(struct swblock *swap, void *data)
2442 {
2443         struct swfreeinfo *info = data;
2444         vm_object_t object = info->object;
2445         int index;
2446         int eindex;
2447
2448         /*
2449          * Figure out the range within the swblock.  The wider scan may
2450          * return edge-case swap blocks when the start and/or end points
2451          * are in the middle of a block.
2452          */
2453         if (swap->swb_index < info->begi)
2454                 index = (int)info->begi & SWAP_META_MASK;
2455         else
2456                 index = 0;
2457
2458         if (swap->swb_index + SWAP_META_PAGES > info->endi)
2459                 eindex = (int)info->endi & SWAP_META_MASK;
2460         else
2461                 eindex = SWAP_META_MASK;
2462
2463         /*
2464          * Scan and free the blocks.  The loop terminates early
2465          * if (swap) runs out of blocks and could be freed.
2466          *
2467          * NOTE: Decrement swb_count after swp_pager_freeswapspace()
2468          *       to deal with a zfree race.
2469          */
2470         while (index <= eindex) {
2471                 swblk_t v = swap->swb_pages[index];
2472
2473                 if (v != SWAPBLK_NONE) {
2474                         swap->swb_pages[index] = SWAPBLK_NONE;
2475                         /* can block */
2476                         swp_pager_freeswapspace(object, v, 1);
2477                         --mycpu->gd_vmtotal.t_vm;
2478                         if (--swap->swb_count == 0) {
2479                                 swp_pager_remove(object, swap);
2480                                 zfree(swap_zone, swap);
2481                                 --object->swblock_count;
2482                                 break;
2483                         }
2484                 }
2485                 ++index;
2486         }
2487
2488         /* swap may be invalid here due to zfree above */
2489         lwkt_yield();
2490
2491         return(0);
2492 }
2493
2494 /*
2495  * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object
2496  *
2497  *      This routine locates and destroys all swap metadata associated with
2498  *      an object.
2499  *
2500  * NOTE: Decrement swb_count after the freeing operation (which
2501  *       might block) to prevent racing destruction of the swblock.
2502  *
2503  * The caller must hold the object.
2504  */
2505 static void
2506 swp_pager_meta_free_all(vm_object_t object)
2507 {
2508         struct swblock *swap;
2509         int i;
2510
2511         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
2512
2513         while ((swap = RB_ROOT(&object->swblock_root)) != NULL) {
2514                 swp_pager_remove(object, swap);
2515                 for (i = 0; i < SWAP_META_PAGES; ++i) {
2516                         swblk_t v = swap->swb_pages[i];
2517                         if (v != SWAPBLK_NONE) {
2518                                 /* can block */
2519                                 swp_pager_freeswapspace(object, v, 1);
2520                                 --swap->swb_count;
2521                                 --mycpu->gd_vmtotal.t_vm;
2522                         }
2523                 }
2524                 if (swap->swb_count != 0)
2525                         panic("swap_pager_meta_free_all: swb_count != 0");
2526                 zfree(swap_zone, swap);
2527                 --object->swblock_count;
2528                 lwkt_yield();
2529         }
2530         KKASSERT(object->swblock_count == 0);
2531 }
2532
2533 /*
2534  * SWP_PAGER_METACTL() -  misc control of swap and vm_page_t meta data.
2535  *
2536  *      This routine is capable of looking up, popping, or freeing
2537  *      swapblk assignments in the swap meta data or in the vm_page_t.
2538  *      The routine typically returns the swapblk being looked-up, or popped,
2539  *      or SWAPBLK_NONE if the block was freed, or SWAPBLK_NONE if the block
2540  *      was invalid.  This routine will automatically free any invalid
2541  *      meta-data swapblks.
2542  *
2543  *      It is not possible to store invalid swapblks in the swap meta data
2544  *      (other then a literal 'SWAPBLK_NONE'), so we don't bother checking.
2545  *
2546  *      When acting on a busy resident page and paging is in progress, we
2547  *      have to wait until paging is complete but otherwise can act on the
2548  *      busy page.
2549  *
2550  *      SWM_FREE        remove and free swap block from metadata
2551  *      SWM_POP         remove from meta data but do not free.. pop it out
2552  *
2553  * The caller must hold the object.
2554  */
2555 static swblk_t
2556 swp_pager_meta_ctl(vm_object_t object, vm_pindex_t index, int flags)
2557 {
2558         struct swblock *swap;
2559         swblk_t r1;
2560
2561         if (object->swblock_count == 0)
2562                 return(SWAPBLK_NONE);
2563
2564         r1 = SWAPBLK_NONE;
2565         swap = swp_pager_lookup(object, index);
2566
2567         if (swap != NULL) {
2568                 index &= SWAP_META_MASK;
2569                 r1 = swap->swb_pages[index];
2570
2571                 if (r1 != SWAPBLK_NONE) {
2572                         if (flags & (SWM_FREE|SWM_POP)) {
2573                                 swap->swb_pages[index] = SWAPBLK_NONE;
2574                                 --mycpu->gd_vmtotal.t_vm;
2575                                 if (--swap->swb_count == 0) {
2576                                         swp_pager_remove(object, swap);
2577                                         zfree(swap_zone, swap);
2578                                         --object->swblock_count;
2579                                 }
2580                         }
2581                         /* swap ptr may be invalid */
2582                         if (flags & SWM_FREE) {
2583                                 swp_pager_freeswapspace(object, r1, 1);
2584                                 r1 = SWAPBLK_NONE;
2585                         }
2586                 }
2587                 /* swap ptr may be invalid */
2588         }
2589         return(r1);
2590 }