sys/vm/vm_object.c

   1 /*
   2  * Copyright (c) 1991, 1993, 2013
   3  *      The Regents of the University of California.  All rights reserved.
   4  *
   5  * This code is derived from software contributed to Berkeley by
   6  * The Mach Operating System project at Carnegie-Mellon University.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  * 3. Neither the name of the University nor the names of its contributors
  17  *    may be used to endorse or promote products derived from this software
  18  *    without specific prior written permission.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  30  * SUCH DAMAGE.
  31  *
  32  *      from: @(#)vm_object.c   8.5 (Berkeley) 3/22/94
  33  *
  34  *
  35  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  36  * All rights reserved.
  37  *
  38  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  39  *
  40  * Permission to use, copy, modify and distribute this software and
  41  * its documentation is hereby granted, provided that both the copyright
  42  * notice and this permission notice appear in all copies of the
  43  * software, derivative works or modified versions, and any portions
  44  * thereof, and that both notices appear in supporting documentation.
  45  *
  46  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  47  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  48  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  49  *
  50  * Carnegie Mellon requests users of this software to return to
  51  *
  52  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  53  *  School of Computer Science
  54  *  Carnegie Mellon University
  55  *  Pittsburgh PA 15213-3890
  56  *
  57  * any improvements or extensions that they make and grant Carnegie the
  58  * rights to redistribute these changes.
  59  *
  60  * $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $
  61  */
  62
  63 /*
  64  *      Virtual memory object module.
  65  */
  66
  67 #include <sys/param.h>
  68 #include <sys/systm.h>
  69 #include <sys/proc.h>           /* for curproc, pageproc */
  70 #include <sys/thread.h>
  71 #include <sys/vnode.h>
  72 #include <sys/vmmeter.h>
  73 #include <sys/mman.h>
  74 #include <sys/mount.h>
  75 #include <sys/kernel.h>
  76 #include <sys/sysctl.h>
  77 #include <sys/refcount.h>
  78
  79 #include <vm/vm.h>
  80 #include <vm/vm_param.h>
  81 #include <vm/pmap.h>
  82 #include <vm/vm_map.h>
  83 #include <vm/vm_object.h>
  84 #include <vm/vm_page.h>
  85 #include <vm/vm_pageout.h>
  86 #include <vm/vm_pager.h>
  87 #include <vm/swap_pager.h>
  88 #include <vm/vm_kern.h>
  89 #include <vm/vm_extern.h>
  90 #include <vm/vm_zone.h>
  91
  92 #include <vm/vm_page2.h>
  93
  94 #include <machine/specialreg.h>
  95
  96 #define EASY_SCAN_FACTOR        8
  97
  98 static void     vm_object_qcollapse(vm_object_t object,
  99                                     vm_object_t backing_object);
 100 static void     vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
 101                                              int pagerflags);
 102 static void     vm_object_lock_init(vm_object_t);
 103
 104
 105 /*
 106  *      Virtual memory objects maintain the actual data
 107  *      associated with allocated virtual memory.  A given
 108  *      page of memory exists within exactly one object.
 109  *
 110  *      An object is only deallocated when all "references"
 111  *      are given up.  Only one "reference" to a given
 112  *      region of an object should be writeable.
 113  *
 114  *      Associated with each object is a list of all resident
 115  *      memory pages belonging to that object; this list is
 116  *      maintained by the "vm_page" module, and locked by the object's
 117  *      lock.
 118  *
 119  *      Each object also records a "pager" routine which is
 120  *      used to retrieve (and store) pages to the proper backing
 121  *      storage.  In addition, objects may be backed by other
 122  *      objects from which they were virtual-copied.
 123  *
 124  *      The only items within the object structure which are
 125  *      modified after time of creation are:
 126  *              reference count         locked by object's lock
 127  *              pager routine           locked by object's lock
 128  *
 129  */
 130
 131 struct vm_object kernel_object;
 132
 133 static long vm_object_count;
 134
 135 static long object_collapses;
 136 static long object_bypasses;
 137 static int next_index;
 138 static vm_zone_t obj_zone;
 139 static struct vm_zone obj_zone_store;
 140 #define VM_OBJECTS_INIT 256
 141 static struct vm_object vm_objects_init[VM_OBJECTS_INIT];
 142
 143 struct object_q vm_object_lists[VMOBJ_HSIZE];
 144 struct lwkt_token vmobj_tokens[VMOBJ_HSIZE];
 145
 146 #if defined(DEBUG_LOCKS)
 147
 148 #define vm_object_vndeallocate(obj, vpp)        \
 149                 debugvm_object_vndeallocate(obj, vpp, __FILE__, __LINE__)
 150
 151 /*
 152  * Debug helper to track hold/drop/ref/deallocate calls.
 153  */
 154 static void
 155 debugvm_object_add(vm_object_t obj, char *file, int line, int addrem)
 156 {
 157         int i;
 158
 159         i = atomic_fetchadd_int(&obj->debug_index, 1);
 160         i = i & (VMOBJ_DEBUG_ARRAY_SIZE - 1);
 161         ksnprintf(obj->debug_hold_thrs[i],
 162                   sizeof(obj->debug_hold_thrs[i]),
 163                   "%c%d:(%d):%s",
 164                   (addrem == -1 ? '-' : (addrem == 1 ? '+' : '=')),
 165                   (curthread->td_proc ? curthread->td_proc->p_pid : -1),
 166                   obj->ref_count,
 167                   curthread->td_comm);
 168         obj->debug_hold_file[i] = file;
 169         obj->debug_hold_line[i] = line;
 170 #if 0
 171         /* Uncomment for debugging obj refs/derefs in reproducable cases */
 172         if (strcmp(curthread->td_comm, "sshd") == 0) {
 173                 kprintf("%d %p refs=%d ar=%d file: %s/%d\n",
 174                         (curthread->td_proc ? curthread->td_proc->p_pid : -1),
 175                         obj, obj->ref_count, addrem, file, line);
 176         }
 177 #endif
 178 }
 179
 180 #endif
 181
 182 /*
 183  * Misc low level routines
 184  */
 185 static void
 186 vm_object_lock_init(vm_object_t obj)
 187 {
 188 #if defined(DEBUG_LOCKS)
 189         int i;
 190
 191         obj->debug_index = 0;
 192         for (i = 0; i < VMOBJ_DEBUG_ARRAY_SIZE; i++) {
 193                 obj->debug_hold_thrs[i][0] = 0;
 194                 obj->debug_hold_file[i] = NULL;
 195                 obj->debug_hold_line[i] = 0;
 196         }
 197 #endif
 198 }
 199
 200 void
 201 vm_object_lock_swap(void)
 202 {
 203         lwkt_token_swap();
 204 }
 205
 206 void
 207 vm_object_lock(vm_object_t obj)
 208 {
 209         lwkt_gettoken(&obj->token);
 210 }
 211
 212 /*
 213  * Returns TRUE on sucesss
 214  */
 215 static int
 216 vm_object_lock_try(vm_object_t obj)
 217 {
 218         return(lwkt_trytoken(&obj->token));
 219 }
 220
 221 void
 222 vm_object_lock_shared(vm_object_t obj)
 223 {
 224         lwkt_gettoken_shared(&obj->token);
 225 }
 226
 227 void
 228 vm_object_unlock(vm_object_t obj)
 229 {
 230         lwkt_reltoken(&obj->token);
 231 }
 232
 233 void
 234 vm_object_upgrade(vm_object_t obj)
 235 {
 236         lwkt_reltoken(&obj->token);
 237         lwkt_gettoken(&obj->token);
 238 }
 239
 240 void
 241 vm_object_downgrade(vm_object_t obj)
 242 {
 243         lwkt_reltoken(&obj->token);
 244         lwkt_gettoken_shared(&obj->token);
 245 }
 246
 247 static __inline void
 248 vm_object_assert_held(vm_object_t obj)
 249 {
 250         ASSERT_LWKT_TOKEN_HELD(&obj->token);
 251 }
 252
 253 static __inline int
 254 vm_quickcolor(void)
 255 {
 256         globaldata_t gd = mycpu;
 257         int pg_color;
 258
 259         pg_color = (int)(intptr_t)gd->gd_curthread >> 10;
 260         pg_color += ++gd->gd_quick_color;
 261
 262         return pg_color;
 263 }
 264
 265 void
 266 VMOBJDEBUG(vm_object_hold)(vm_object_t obj VMOBJDBARGS)
 267 {
 268         KKASSERT(obj != NULL);
 269
 270         /*
 271          * Object must be held (object allocation is stable due to callers
 272          * context, typically already holding the token on a parent object)
 273          * prior to potentially blocking on the lock, otherwise the object
 274          * can get ripped away from us.
 275          */
 276         refcount_acquire(&obj->hold_count);
 277         vm_object_lock(obj);
 278
 279 #if defined(DEBUG_LOCKS)
 280         debugvm_object_add(obj, file, line, 1);
 281 #endif
 282 }
 283
 284 int
 285 VMOBJDEBUG(vm_object_hold_try)(vm_object_t obj VMOBJDBARGS)
 286 {
 287         KKASSERT(obj != NULL);
 288
 289         /*
 290          * Object must be held (object allocation is stable due to callers
 291          * context, typically already holding the token on a parent object)
 292          * prior to potentially blocking on the lock, otherwise the object
 293          * can get ripped away from us.
 294          */
 295         refcount_acquire(&obj->hold_count);
 296         if (vm_object_lock_try(obj) == 0) {
 297                 if (refcount_release(&obj->hold_count)) {
 298                         if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD))
 299                                 zfree(obj_zone, obj);
 300                 }
 301                 return(0);
 302         }
 303
 304 #if defined(DEBUG_LOCKS)
 305         debugvm_object_add(obj, file, line, 1);
 306 #endif
 307         return(1);
 308 }
 309
 310 void
 311 VMOBJDEBUG(vm_object_hold_shared)(vm_object_t obj VMOBJDBARGS)
 312 {
 313         KKASSERT(obj != NULL);
 314
 315         /*
 316          * Object must be held (object allocation is stable due to callers
 317          * context, typically already holding the token on a parent object)
 318          * prior to potentially blocking on the lock, otherwise the object
 319          * can get ripped away from us.
 320          */
 321         refcount_acquire(&obj->hold_count);
 322         vm_object_lock_shared(obj);
 323
 324 #if defined(DEBUG_LOCKS)
 325         debugvm_object_add(obj, file, line, 1);
 326 #endif
 327 }
 328
 329 /*
 330  * Drop the token and hold_count on the object.
 331  *
 332  * WARNING! Token might be shared.
 333  */
 334 void
 335 VMOBJDEBUG(vm_object_drop)(vm_object_t obj VMOBJDBARGS)
 336 {
 337         if (obj == NULL)
 338                 return;
 339
 340         /*
 341          * No new holders should be possible once we drop hold_count 1->0 as
 342          * there is no longer any way to reference the object.
 343          */
 344         KKASSERT(obj->hold_count > 0);
 345         if (refcount_release(&obj->hold_count)) {
 346 #if defined(DEBUG_LOCKS)
 347                 debugvm_object_add(obj, file, line, -1);
 348 #endif
 349
 350                 if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) {
 351                         vm_object_unlock(obj);
 352                         zfree(obj_zone, obj);
 353                 } else {
 354                         vm_object_unlock(obj);
 355                 }
 356         } else {
 357 #if defined(DEBUG_LOCKS)
 358                 debugvm_object_add(obj, file, line, -1);
 359 #endif
 360                 vm_object_unlock(obj);
 361         }
 362 }
 363
 364 /*
 365  * Initialize a freshly allocated object, returning a held object.
 366  *
 367  * Used only by vm_object_allocate() and zinitna().
 368  *
 369  * No requirements.
 370  */
 371 void
 372 _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
 373 {
 374         int incr;
 375         int n;
 376
 377         RB_INIT(&object->rb_memq);
 378         LIST_INIT(&object->shadow_head);
 379         lwkt_token_init(&object->token, "vmobj");
 380
 381         object->type = type;
 382         object->size = size;
 383         object->ref_count = 1;
 384         object->memattr = VM_MEMATTR_DEFAULT;
 385         object->hold_count = 0;
 386         object->flags = 0;
 387         if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
 388                 vm_object_set_flag(object, OBJ_ONEMAPPING);
 389         object->paging_in_progress = 0;
 390         object->resident_page_count = 0;
 391         object->agg_pv_list_count = 0;
 392         object->shadow_count = 0;
 393         /* cpu localization twist */
 394         object->pg_color = vm_quickcolor();
 395         if ( size > (PQ_L2_SIZE / 3 + PQ_PRIME1))
 396                 incr = PQ_L2_SIZE / 3 + PQ_PRIME1;
 397         else
 398                 incr = size;
 399         next_index = (next_index + incr) & PQ_L2_MASK;
 400         object->handle = NULL;
 401         object->backing_object = NULL;
 402         object->backing_object_offset = (vm_ooffset_t)0;
 403
 404         object->generation++;
 405         object->swblock_count = 0;
 406         RB_INIT(&object->swblock_root);
 407         vm_object_lock_init(object);
 408         pmap_object_init(object);
 409
 410         vm_object_hold(object);
 411
 412         n = VMOBJ_HASH(object);
 413         atomic_add_long(&vm_object_count, 1);
 414         lwkt_gettoken(&vmobj_tokens[n]);
 415         TAILQ_INSERT_TAIL(&vm_object_lists[n], object, object_list);
 416         lwkt_reltoken(&vmobj_tokens[n]);
 417 }
 418
 419 /*
 420  * Initialize the VM objects module.
 421  *
 422  * Called from the low level boot code only.
 423  */
 424 void
 425 vm_object_init(void)
 426 {
 427         int i;
 428
 429         for (i = 0; i < VMOBJ_HSIZE; ++i) {
 430                 TAILQ_INIT(&vm_object_lists[i]);
 431                 lwkt_token_init(&vmobj_tokens[i], "vmobjlst");
 432         }
 433
 434         _vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(KvaEnd),
 435                             &kernel_object);
 436         vm_object_drop(&kernel_object);
 437
 438         obj_zone = &obj_zone_store;
 439         zbootinit(obj_zone, "VM OBJECT", sizeof (struct vm_object),
 440                 vm_objects_init, VM_OBJECTS_INIT);
 441 }
 442
 443 void
 444 vm_object_init2(void)
 445 {
 446         zinitna(obj_zone, NULL, NULL, 0, 0, ZONE_PANICFAIL, 1);
 447 }
 448
 449 /*
 450  * Allocate and return a new object of the specified type and size.
 451  *
 452  * No requirements.
 453  */
 454 vm_object_t
 455 vm_object_allocate(objtype_t type, vm_pindex_t size)
 456 {
 457         vm_object_t result;
 458
 459         result = (vm_object_t) zalloc(obj_zone);
 460
 461         _vm_object_allocate(type, size, result);
 462         vm_object_drop(result);
 463
 464         return (result);
 465 }
 466
 467 /*
 468  * This version returns a held object, allowing further atomic initialization
 469  * of the object.
 470  */
 471 vm_object_t
 472 vm_object_allocate_hold(objtype_t type, vm_pindex_t size)
 473 {
 474         vm_object_t result;
 475
 476         result = (vm_object_t) zalloc(obj_zone);
 477
 478         _vm_object_allocate(type, size, result);
 479
 480         return (result);
 481 }
 482
 483 /*
 484  * Add an additional reference to a vm_object.  The object must already be
 485  * held.  The original non-lock version is no longer supported.  The object
 486  * must NOT be chain locked by anyone at the time the reference is added.
 487  *
 488  * Referencing a chain-locked object can blow up the fairly sensitive
 489  * ref_count and shadow_count tests in the deallocator.  Most callers
 490  * will call vm_object_chain_wait() prior to calling
 491  * vm_object_reference_locked() to avoid the case.
 492  *
 493  * The object must be held, but may be held shared if desired (hence why
 494  * we use an atomic op).
 495  */
 496 void
 497 VMOBJDEBUG(vm_object_reference_locked)(vm_object_t object VMOBJDBARGS)
 498 {
 499         KKASSERT(object != NULL);
 500         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
 501         KKASSERT((object->chainlk & (CHAINLK_EXCL | CHAINLK_MASK)) == 0);
 502         atomic_add_int(&object->ref_count, 1);
 503         if (object->type == OBJT_VNODE) {
 504                 vref(object->handle);
 505                 /* XXX what if the vnode is being destroyed? */
 506         }
 507 #if defined(DEBUG_LOCKS)
 508         debugvm_object_add(object, file, line, 1);
 509 #endif
 510 }
 511
 512 /*
 513  * This version is only allowed for vnode objects.
 514  */
 515 void
 516 VMOBJDEBUG(vm_object_reference_quick)(vm_object_t object VMOBJDBARGS)
 517 {
 518         KKASSERT(object->type == OBJT_VNODE);
 519         atomic_add_int(&object->ref_count, 1);
 520         vref(object->handle);
 521 #if defined(DEBUG_LOCKS)
 522         debugvm_object_add(object, file, line, 1);
 523 #endif
 524 }
 525
 526 /*
 527  * Object OBJ_CHAINLOCK lock handling.
 528  *
 529  * The caller can chain-lock backing objects recursively and then
 530  * use vm_object_chain_release_all() to undo the whole chain.
 531  *
 532  * Chain locks are used to prevent collapses and are only applicable
 533  * to OBJT_DEFAULT and OBJT_SWAP objects.  Chain locking operations
 534  * on other object types are ignored.  This is also important because
 535  * it allows e.g. the vnode underlying a memory mapping to take concurrent
 536  * faults.
 537  *
 538  * The object must usually be held on entry, though intermediate
 539  * objects need not be held on release.  The object must be held exclusively,
 540  * NOT shared.  Note that the prefault path checks the shared state and
 541  * avoids using the chain functions.
 542  */
 543 void
 544 vm_object_chain_wait(vm_object_t object, int shared)
 545 {
 546         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
 547         for (;;) {
 548                 uint32_t chainlk = object->chainlk;
 549
 550                 cpu_ccfence();
 551                 if (shared) {
 552                         if (chainlk & (CHAINLK_EXCL | CHAINLK_EXCLREQ)) {
 553                                 tsleep_interlock(object, 0);
 554                                 if (atomic_cmpset_int(&object->chainlk,
 555                                                       chainlk,
 556                                                       chainlk | CHAINLK_WAIT)) {
 557                                         tsleep(object, PINTERLOCKED,
 558                                                "objchns", 0);
 559                                 }
 560                                 /* retry */
 561                         } else {
 562                                 break;
 563                         }
 564                         /* retry */
 565                 } else {
 566                         if (chainlk & (CHAINLK_MASK | CHAINLK_EXCL)) {
 567                                 tsleep_interlock(object, 0);
 568                                 if (atomic_cmpset_int(&object->chainlk,
 569                                                       chainlk,
 570                                                       chainlk | CHAINLK_WAIT))
 571                                 {
 572                                         tsleep(object, PINTERLOCKED,
 573                                                "objchnx", 0);
 574                                 }
 575                                 /* retry */
 576                         } else {
 577                                 if (atomic_cmpset_int(&object->chainlk,
 578                                                       chainlk,
 579                                                       chainlk & ~CHAINLK_WAIT))
 580                                 {
 581                                         if (chainlk & CHAINLK_WAIT)
 582                                                 wakeup(object);
 583                                         break;
 584                                 }
 585                                 /* retry */
 586                         }
 587                 }
 588                 /* retry */
 589         }
 590 }
 591
 592 void
 593 vm_object_chain_acquire(vm_object_t object, int shared)
 594 {
 595         if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP)
 596                 return;
 597         if (vm_shared_fault == 0)
 598                 shared = 0;
 599
 600         for (;;) {
 601                 uint32_t chainlk = object->chainlk;
 602
 603                 cpu_ccfence();
 604                 if (shared) {
 605                         if (chainlk & (CHAINLK_EXCL | CHAINLK_EXCLREQ)) {
 606                                 tsleep_interlock(object, 0);
 607                                 if (atomic_cmpset_int(&object->chainlk,
 608                                                       chainlk,
 609                                                       chainlk | CHAINLK_WAIT)) {
 610                                         tsleep(object, PINTERLOCKED,
 611                                                "objchns", 0);
 612                                 }
 613                                 /* retry */
 614                         } else if (atomic_cmpset_int(&object->chainlk,
 615                                               chainlk, chainlk + 1)) {
 616                                 break;
 617                         }
 618                         /* retry */
 619                 } else {
 620                         if (chainlk & (CHAINLK_MASK | CHAINLK_EXCL)) {
 621                                 tsleep_interlock(object, 0);
 622                                 if (atomic_cmpset_int(&object->chainlk,
 623                                                       chainlk,
 624                                                       chainlk |
 625                                                        CHAINLK_WAIT |
 626                                                        CHAINLK_EXCLREQ)) {
 627                                         tsleep(object, PINTERLOCKED,
 628                                                "objchnx", 0);
 629                                 }
 630                                 /* retry */
 631                         } else {
 632                                 if (atomic_cmpset_int(&object->chainlk,
 633                                                       chainlk,
 634                                                       (chainlk | CHAINLK_EXCL) &
 635                                                       ~(CHAINLK_EXCLREQ |
 636                                                         CHAINLK_WAIT))) {
 637                                         if (chainlk & CHAINLK_WAIT)
 638                                                 wakeup(object);
 639                                         break;
 640                                 }
 641                                 /* retry */
 642                         }
 643                 }
 644                 /* retry */
 645         }
 646 }
 647
 648 void
 649 vm_object_chain_release(vm_object_t object)
 650 {
 651         /*ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));*/
 652         if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP)
 653                 return;
 654         KKASSERT(object->chainlk & (CHAINLK_MASK | CHAINLK_EXCL));
 655         for (;;) {
 656                 uint32_t chainlk = object->chainlk;
 657
 658                 cpu_ccfence();
 659                 if (chainlk & CHAINLK_MASK) {
 660                         if ((chainlk & CHAINLK_MASK) == 1 &&
 661                             atomic_cmpset_int(&object->chainlk,
 662                                               chainlk,
 663                                               (chainlk - 1) & ~CHAINLK_WAIT)) {
 664                                 if (chainlk & CHAINLK_WAIT)
 665                                         wakeup(object);
 666                                 break;
 667                         }
 668                         if ((chainlk & CHAINLK_MASK) > 1 &&
 669                             atomic_cmpset_int(&object->chainlk,
 670                                               chainlk, chainlk - 1)) {
 671                                 break;
 672                         }
 673                         /* retry */
 674                 } else {
 675                         KKASSERT(chainlk & CHAINLK_EXCL);
 676                         if (atomic_cmpset_int(&object->chainlk,
 677                                               chainlk,
 678                                               chainlk & ~(CHAINLK_EXCL |
 679                                                           CHAINLK_WAIT))) {
 680                                 if (chainlk & CHAINLK_WAIT)
 681                                         wakeup(object);
 682                                 break;
 683                         }
 684                 }
 685         }
 686 }
 687
 688 /*
 689  * Release the chain from first_object through and including stopobj.
 690  * The caller is typically holding the first and last object locked
 691  * (shared or exclusive) to prevent destruction races.
 692  *
 693  * We release stopobj first as an optimization as this object is most
 694  * likely to be shared across multiple processes.
 695  */
 696 void
 697 vm_object_chain_release_all(vm_object_t first_object, vm_object_t stopobj)
 698 {
 699         vm_object_t backing_object;
 700         vm_object_t object;
 701
 702         vm_object_chain_release(stopobj);
 703         object = first_object;
 704
 705         while (object != stopobj) {
 706                 KKASSERT(object);
 707                 backing_object = object->backing_object;
 708                 vm_object_chain_release(object);
 709                 object = backing_object;
 710         }
 711 }
 712
 713 /*
 714  * Dereference an object and its underlying vnode.  The object may be
 715  * held shared.  On return the object will remain held.
 716  *
 717  * This function may return a vnode in *vpp which the caller must release
 718  * after the caller drops its own lock.  If vpp is NULL, we assume that
 719  * the caller was holding an exclusive lock on the object and we vrele()
 720  * the vp ourselves.
 721  */
 722 static void
 723 VMOBJDEBUG(vm_object_vndeallocate)(vm_object_t object, struct vnode **vpp
 724                                    VMOBJDBARGS)
 725 {
 726         struct vnode *vp = (struct vnode *) object->handle;
 727
 728         KASSERT(object->type == OBJT_VNODE,
 729             ("vm_object_vndeallocate: not a vnode object"));
 730         KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
 731         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
 732 #ifdef INVARIANTS
 733         if (object->ref_count == 0) {
 734                 vprint("vm_object_vndeallocate", vp);
 735                 panic("vm_object_vndeallocate: bad object reference count");
 736         }
 737 #endif
 738         for (;;) {
 739                 int count = object->ref_count;
 740                 cpu_ccfence();
 741                 if (count == 1) {
 742                         vm_object_upgrade(object);
 743                         if (atomic_cmpset_int(&object->ref_count, count, 0)) {
 744                                 vclrflags(vp, VTEXT);
 745                                 break;
 746                         }
 747                 } else {
 748                         if (atomic_cmpset_int(&object->ref_count,
 749                                               count, count - 1)) {
 750                                 break;
 751                         }
 752                 }
 753                 /* retry */
 754         }
 755 #if defined(DEBUG_LOCKS)
 756         debugvm_object_add(object, file, line, -1);
 757 #endif
 758
 759         /*
 760          * vrele or return the vp to vrele.  We can only safely vrele(vp)
 761          * if the object was locked exclusively.  But there are two races
 762          * here.
 763          *
 764          * We had to upgrade the object above to safely clear VTEXT
 765          * but the alternative path where the shared lock is retained
 766          * can STILL race to 0 in other paths and cause our own vrele()
 767          * to terminate the vnode.  We can't allow that if the VM object
 768          * is still locked shared.
 769          */
 770         if (vpp)
 771                 *vpp = vp;
 772         else
 773                 vrele(vp);
 774 }
 775
 776 /*
 777  * Release a reference to the specified object, gained either through a
 778  * vm_object_allocate or a vm_object_reference call.  When all references
 779  * are gone, storage associated with this object may be relinquished.
 780  *
 781  * The caller does not have to hold the object locked but must have control
 782  * over the reference in question in order to guarantee that the object
 783  * does not get ripped out from under us.
 784  *
 785  * XXX Currently all deallocations require an exclusive lock.
 786  */
 787 void
 788 VMOBJDEBUG(vm_object_deallocate)(vm_object_t object VMOBJDBARGS)
 789 {
 790         struct vnode *vp;
 791         int count;
 792
 793         if (object == NULL)
 794                 return;
 795
 796         for (;;) {
 797                 count = object->ref_count;
 798                 cpu_ccfence();
 799
 800                 /*
 801                  * If decrementing the count enters into special handling
 802                  * territory (0, 1, or 2) we have to do it the hard way.
 803                  * Fortunate though, objects with only a few refs like this
 804                  * are not likely to be heavily contended anyway.
 805                  *
 806                  * For vnode objects we only care about 1->0 transitions.
 807                  */
 808                 if (count <= 3 || (object->type == OBJT_VNODE && count <= 1)) {
 809 #if defined(DEBUG_LOCKS)
 810                         debugvm_object_add(object, file, line, 0);
 811 #endif
 812                         vm_object_hold(object);
 813                         vm_object_deallocate_locked(object);
 814                         vm_object_drop(object);
 815                         break;
 816                 }
 817
 818                 /*
 819                  * Try to decrement ref_count without acquiring a hold on
 820                  * the object.  This is particularly important for the exec*()
 821                  * and exit*() code paths because the program binary may
 822                  * have a great deal of sharing and an exclusive lock will
 823                  * crowbar performance in those circumstances.
 824                  */
 825                 if (object->type == OBJT_VNODE) {
 826                         vp = (struct vnode *)object->handle;
 827                         if (atomic_cmpset_int(&object->ref_count,
 828                                               count, count - 1)) {
 829 #if defined(DEBUG_LOCKS)
 830                                 debugvm_object_add(object, file, line, -1);
 831 #endif
 832
 833                                 vrele(vp);
 834                                 break;
 835                         }
 836                         /* retry */
 837                 } else {
 838                         if (atomic_cmpset_int(&object->ref_count,
 839                                               count, count - 1)) {
 840 #if defined(DEBUG_LOCKS)
 841                                 debugvm_object_add(object, file, line, -1);
 842 #endif
 843                                 break;
 844                         }
 845                         /* retry */
 846                 }
 847                 /* retry */
 848         }
 849 }
 850
 851 void
 852 VMOBJDEBUG(vm_object_deallocate_locked)(vm_object_t object VMOBJDBARGS)
 853 {
 854         struct vm_object_dealloc_list *dlist = NULL;
 855         struct vm_object_dealloc_list *dtmp;
 856         vm_object_t temp;
 857         int must_drop = 0;
 858
 859         /*
 860          * We may chain deallocate object, but additional objects may
 861          * collect on the dlist which also have to be deallocated.  We
 862          * must avoid a recursion, vm_object chains can get deep.
 863          */
 864
 865 again:
 866         while (object != NULL) {
 867                 /*
 868                  * vnode case, caller either locked the object exclusively
 869                  * or this is a recursion with must_drop != 0 and the vnode
 870                  * object will be locked shared.
 871                  *
 872                  * If locked shared we have to drop the object before we can
 873                  * call vrele() or risk a shared/exclusive livelock.
 874                  */
 875                 if (object->type == OBJT_VNODE) {
 876                         ASSERT_LWKT_TOKEN_HELD(&object->token);
 877                         if (must_drop) {
 878                                 struct vnode *tmp_vp;
 879
 880                                 vm_object_vndeallocate(object, &tmp_vp);
 881                                 vm_object_drop(object);
 882                                 must_drop = 0;
 883                                 object = NULL;
 884                                 vrele(tmp_vp);
 885                         } else {
 886                                 vm_object_vndeallocate(object, NULL);
 887                         }
 888                         break;
 889                 }
 890                 ASSERT_LWKT_TOKEN_HELD_EXCL(&object->token);
 891
 892                 /*
 893                  * Normal case (object is locked exclusively)
 894                  */
 895                 if (object->ref_count == 0) {
 896                         panic("vm_object_deallocate: object deallocated "
 897                               "too many times: %d", object->type);
 898                 }
 899                 if (object->ref_count > 2) {
 900                         atomic_add_int(&object->ref_count, -1);
 901 #if defined(DEBUG_LOCKS)
 902                         debugvm_object_add(object, file, line, -1);
 903 #endif
 904                         break;
 905                 }
 906
 907                 /*
 908                  * Here on ref_count of one or two, which are special cases for
 909                  * objects.
 910                  *
 911                  * Nominal ref_count > 1 case if the second ref is not from
 912                  * a shadow.
 913                  *
 914                  * (ONEMAPPING only applies to DEFAULT AND SWAP objects)
 915                  */
 916                 if (object->ref_count == 2 && object->shadow_count == 0) {
 917                         if (object->type == OBJT_DEFAULT ||
 918                             object->type == OBJT_SWAP) {
 919                                 vm_object_set_flag(object, OBJ_ONEMAPPING);
 920                         }
 921                         atomic_add_int(&object->ref_count, -1);
 922 #if defined(DEBUG_LOCKS)
 923                         debugvm_object_add(object, file, line, -1);
 924 #endif
 925                         break;
 926                 }
 927
 928                 /*
 929                  * If the second ref is from a shadow we chain along it
 930                  * upwards if object's handle is exhausted.
 931                  *
 932                  * We have to decrement object->ref_count before potentially
 933                  * collapsing the first shadow object or the collapse code
 934                  * will not be able to handle the degenerate case to remove
 935                  * object.  However, if we do it too early the object can
 936                  * get ripped out from under us.
 937                  */
 938                 if (object->ref_count == 2 && object->shadow_count == 1 &&
 939                     object->handle == NULL && (object->type == OBJT_DEFAULT ||
 940                                                object->type == OBJT_SWAP)) {
 941                         temp = LIST_FIRST(&object->shadow_head);
 942                         KKASSERT(temp != NULL);
 943                         vm_object_hold(temp);
 944
 945                         /*
 946                          * Wait for any paging to complete so the collapse
 947                          * doesn't (or isn't likely to) qcollapse.  pip
 948                          * waiting must occur before we acquire the
 949                          * chainlock.
 950                          */
 951                         while (
 952                                 temp->paging_in_progress ||
 953                                 object->paging_in_progress
 954                         ) {
 955                                 vm_object_pip_wait(temp, "objde1");
 956                                 vm_object_pip_wait(object, "objde2");
 957                         }
 958
 959                         /*
 960                          * If the parent is locked we have to give up, as
 961                          * otherwise we would be acquiring locks in the
 962                          * wrong order and potentially deadlock.
 963                          */
 964                         if (temp->chainlk & (CHAINLK_EXCL | CHAINLK_MASK)) {
 965                                 vm_object_drop(temp);
 966                                 goto skip;
 967                         }
 968                         vm_object_chain_acquire(temp, 0);
 969
 970                         /*
 971                          * Recheck/retry after the hold and the paging
 972                          * wait, both of which can block us.
 973                          */
 974                         if (object->ref_count != 2 ||
 975                             object->shadow_count != 1 ||
 976                             object->handle ||
 977                             LIST_FIRST(&object->shadow_head) != temp ||
 978                             (object->type != OBJT_DEFAULT &&
 979                              object->type != OBJT_SWAP)) {
 980                                 vm_object_chain_release(temp);
 981                                 vm_object_drop(temp);
 982                                 continue;
 983                         }
 984
 985                         /*
 986                          * We can safely drop object's ref_count now.
 987                          */
 988                         KKASSERT(object->ref_count == 2);
 989                         atomic_add_int(&object->ref_count, -1);
 990 #if defined(DEBUG_LOCKS)
 991                         debugvm_object_add(object, file, line, -1);
 992 #endif
 993
 994                         /*
 995                          * If our single parent is not collapseable just
 996                          * decrement ref_count (2->1) and stop.
 997                          */
 998                         if (temp->handle || (temp->type != OBJT_DEFAULT &&
 999                                              temp->type != OBJT_SWAP)) {
1000                                 vm_object_chain_release(temp);
1001                                 vm_object_drop(temp);
1002                                 break;
1003                         }
1004
1005                         /*
1006                          * At this point we have already dropped object's
1007                          * ref_count so it is possible for a race to
1008                          * deallocate obj out from under us.  Any collapse
1009                          * will re-check the situation.  We must not block
1010                          * until we are able to collapse.
1011                          *
1012                          * Bump temp's ref_count to avoid an unwanted
1013                          * degenerate recursion (can't call
1014                          * vm_object_reference_locked() because it asserts
1015                          * that CHAINLOCK is not set).
1016                          */
1017                         atomic_add_int(&temp->ref_count, 1);
1018                         KKASSERT(temp->ref_count > 1);
1019
1020                         /*
1021                          * Collapse temp, then deallocate the extra ref
1022                          * formally.
1023                          */
1024                         vm_object_collapse(temp, &dlist);
1025                         vm_object_chain_release(temp);
1026                         if (must_drop) {
1027                                 vm_object_lock_swap();
1028                                 vm_object_drop(object);
1029                         }
1030                         object = temp;
1031                         must_drop = 1;
1032                         continue;
1033                 }
1034
1035                 /*
1036                  * Drop the ref and handle termination on the 1->0 transition.
1037                  * We may have blocked above so we have to recheck.
1038                  */
1039 skip:
1040                 KKASSERT(object->ref_count != 0);
1041                 if (object->ref_count >= 2) {
1042                         atomic_add_int(&object->ref_count, -1);
1043 #if defined(DEBUG_LOCKS)
1044                         debugvm_object_add(object, file, line, -1);
1045 #endif
1046                         break;
1047                 }
1048                 KKASSERT(object->ref_count == 1);
1049
1050                 /*
1051                  * 1->0 transition.  Chain through the backing_object.
1052                  * Maintain the ref until we've located the backing object,
1053                  * then re-check.
1054                  */
1055                 while ((temp = object->backing_object) != NULL) {
1056                         if (temp->type == OBJT_VNODE)
1057                                 vm_object_hold_shared(temp);
1058                         else
1059                                 vm_object_hold(temp);
1060                         if (temp == object->backing_object)
1061                                 break;
1062                         vm_object_drop(temp);
1063                 }
1064
1065                 /*
1066                  * 1->0 transition verified, retry if ref_count is no longer
1067                  * 1.  Otherwise disconnect the backing_object (temp) and
1068                  * clean up.
1069                  */
1070                 if (object->ref_count != 1) {
1071                         vm_object_drop(temp);
1072                         continue;
1073                 }
1074
1075                 /*
1076                  * It shouldn't be possible for the object to be chain locked
1077                  * if we're removing the last ref on it.
1078                  *
1079                  * Removing object from temp's shadow list requires dropping
1080                  * temp, which we will do on loop.
1081                  *
1082                  * NOTE! vnodes do not use the shadow list, but still have
1083                  *       the backing_object reference.
1084                  */
1085                 KKASSERT((object->chainlk & (CHAINLK_EXCL|CHAINLK_MASK)) == 0);
1086
1087                 if (temp) {
1088                         if (object->flags & OBJ_ONSHADOW) {
1089                                 LIST_REMOVE(object, shadow_list);
1090                                 temp->shadow_count--;
1091                                 temp->generation++;
1092                                 vm_object_clear_flag(object, OBJ_ONSHADOW);
1093                         }
1094                         object->backing_object = NULL;
1095                 }
1096
1097                 atomic_add_int(&object->ref_count, -1);
1098                 if ((object->flags & OBJ_DEAD) == 0)
1099                         vm_object_terminate(object);
1100                 if (must_drop && temp)
1101                         vm_object_lock_swap();
1102                 if (must_drop)
1103                         vm_object_drop(object);
1104                 object = temp;
1105                 must_drop = 1;
1106         }
1107
1108         if (must_drop && object)
1109                 vm_object_drop(object);
1110
1111         /*
1112          * Additional tail recursion on dlist.  Avoid a recursion.  Objects
1113          * on the dlist have a hold count but are not locked.
1114          */
1115         if ((dtmp = dlist) != NULL) {
1116                 dlist = dtmp->next;
1117                 object = dtmp->object;
1118                 kfree(dtmp, M_TEMP);
1119
1120                 vm_object_lock(object); /* already held, add lock */
1121                 must_drop = 1;          /* and we're responsible for it */
1122                 goto again;
1123         }
1124 }
1125
1126 /*
1127  * Destroy the specified object, freeing up related resources.
1128  *
1129  * The object must have zero references.
1130  *
1131  * The object must held.  The caller is responsible for dropping the object
1132  * after terminate returns.  Terminate does NOT drop the object.
1133  */
1134 static int vm_object_terminate_callback(vm_page_t p, void *data);
1135
1136 void
1137 vm_object_terminate(vm_object_t object)
1138 {
1139         struct rb_vm_page_scan_info info;
1140         int n;
1141
1142         /*
1143          * Make sure no one uses us.  Once we set OBJ_DEAD we should be
1144          * able to safely block.
1145          */
1146         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1147         KKASSERT((object->flags & OBJ_DEAD) == 0);
1148         vm_object_set_flag(object, OBJ_DEAD);
1149
1150         /*
1151          * Wait for the pageout daemon to be done with the object
1152          */
1153         vm_object_pip_wait(object, "objtrm1");
1154
1155         KASSERT(!object->paging_in_progress,
1156                 ("vm_object_terminate: pageout in progress"));
1157
1158         /*
1159          * Clean and free the pages, as appropriate. All references to the
1160          * object are gone, so we don't need to lock it.
1161          */
1162         if (object->type == OBJT_VNODE) {
1163                 struct vnode *vp;
1164
1165                 /*
1166                  * Clean pages and flush buffers.
1167                  *
1168                  * NOTE!  TMPFS buffer flushes do not typically flush the
1169                  *        actual page to swap as this would be highly
1170                  *        inefficient, and normal filesystems usually wrap
1171                  *        page flushes with buffer cache buffers.
1172                  *
1173                  *        To deal with this we have to call vinvalbuf() both
1174                  *        before and after the vm_object_page_clean().
1175                  */
1176                 vp = (struct vnode *) object->handle;
1177                 vinvalbuf(vp, V_SAVE, 0, 0);
1178                 vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
1179                 vinvalbuf(vp, V_SAVE, 0, 0);
1180         }
1181
1182         /*
1183          * Wait for any I/O to complete, after which there had better not
1184          * be any references left on the object.
1185          */
1186         vm_object_pip_wait(object, "objtrm2");
1187
1188         if (object->ref_count != 0) {
1189                 panic("vm_object_terminate: object with references, "
1190                       "ref_count=%d", object->ref_count);
1191         }
1192
1193         /*
1194          * Cleanup any shared pmaps associated with this object.
1195          */
1196         pmap_object_free(object);
1197
1198         /*
1199          * Now free any remaining pages. For internal objects, this also
1200          * removes them from paging queues. Don't free wired pages, just
1201          * remove them from the object.
1202          */
1203         info.count = 0;
1204         info.object = object;
1205         vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
1206                                 vm_object_terminate_callback, &info);
1207
1208         /*
1209          * Let the pager know object is dead.
1210          */
1211         vm_pager_deallocate(object);
1212
1213         /*
1214          * Wait for the object hold count to hit 1, clean out pages as
1215          * we go.  vmobj_token interlocks any race conditions that might
1216          * pick the object up from the vm_object_list after we have cleared
1217          * rb_memq.
1218          */
1219         for (;;) {
1220                 if (RB_ROOT(&object->rb_memq) == NULL)
1221                         break;
1222                 kprintf("vm_object_terminate: Warning, object %p "
1223                         "still has %d pages\n",
1224                         object, object->resident_page_count);
1225                 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
1226                                         vm_object_terminate_callback, &info);
1227         }
1228
1229         /*
1230          * There had better not be any pages left
1231          */
1232         KKASSERT(object->resident_page_count == 0);
1233
1234         /*
1235          * Remove the object from the global object list.
1236          */
1237         n = VMOBJ_HASH(object);
1238         lwkt_gettoken(&vmobj_tokens[n]);
1239         TAILQ_REMOVE(&vm_object_lists[n], object, object_list);
1240         lwkt_reltoken(&vmobj_tokens[n]);
1241         atomic_add_long(&vm_object_count, -1);
1242
1243         if (object->ref_count != 0) {
1244                 panic("vm_object_terminate2: object with references, "
1245                       "ref_count=%d", object->ref_count);
1246         }
1247
1248         /*
1249          * NOTE: The object hold_count is at least 1, so we cannot zfree()
1250          *       the object here.  See vm_object_drop().
1251          */
1252 }
1253
1254 /*
1255  * The caller must hold the object.
1256  */
1257 static int
1258 vm_object_terminate_callback(vm_page_t p, void *data)
1259 {
1260         struct rb_vm_page_scan_info *info = data;
1261         vm_object_t object;
1262
1263         if ((++info->count & 63) == 0)
1264                 lwkt_user_yield();
1265         object = p->object;
1266         if (object != info->object) {
1267                 kprintf("vm_object_terminate_callback: obj/pg race %p/%p\n",
1268                         info->object, p);
1269                 return(0);
1270         }
1271         vm_page_busy_wait(p, TRUE, "vmpgtrm");
1272         if (object != p->object) {
1273                 kprintf("vm_object_terminate: Warning: Encountered "
1274                         "busied page %p on queue %d\n", p, p->queue);
1275                 vm_page_wakeup(p);
1276         } else if (p->wire_count == 0) {
1277                 /*
1278                  * NOTE: p->dirty and PG_NEED_COMMIT are ignored.
1279                  */
1280                 vm_page_free(p);
1281                 mycpu->gd_cnt.v_pfree++;
1282         } else {
1283                 if (p->queue != PQ_NONE)
1284                         kprintf("vm_object_terminate: Warning: Encountered "
1285                                 "wired page %p on queue %d\n", p, p->queue);
1286                 vm_page_remove(p);
1287                 vm_page_wakeup(p);
1288         }
1289         return(0);
1290 }
1291
1292 /*
1293  * Clean all dirty pages in the specified range of object.  Leaves page
1294  * on whatever queue it is currently on.   If NOSYNC is set then do not
1295  * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC),
1296  * leaving the object dirty.
1297  *
1298  * When stuffing pages asynchronously, allow clustering.  XXX we need a
1299  * synchronous clustering mode implementation.
1300  *
1301  * Odd semantics: if start == end, we clean everything.
1302  *
1303  * The object must be locked? XXX
1304  */
1305 static int vm_object_page_clean_pass1(struct vm_page *p, void *data);
1306 static int vm_object_page_clean_pass2(struct vm_page *p, void *data);
1307
1308 void
1309 vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
1310                      int flags)
1311 {
1312         struct rb_vm_page_scan_info info;
1313         struct vnode *vp;
1314         int wholescan;
1315         int pagerflags;
1316         int generation;
1317
1318         vm_object_hold(object);
1319         if (object->type != OBJT_VNODE ||
1320             (object->flags & OBJ_MIGHTBEDIRTY) == 0) {
1321                 vm_object_drop(object);
1322                 return;
1323         }
1324
1325         pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ?
1326                         VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK;
1327         pagerflags |= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0;
1328
1329         vp = object->handle;
1330
1331         /*
1332          * Interlock other major object operations.  This allows us to
1333          * temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY.
1334          */
1335         vm_object_set_flag(object, OBJ_CLEANING);
1336
1337         /*
1338          * Handle 'entire object' case
1339          */
1340         info.start_pindex = start;
1341         if (end == 0) {
1342                 info.end_pindex = object->size - 1;
1343         } else {
1344                 info.end_pindex = end - 1;
1345         }
1346         wholescan = (start == 0 && info.end_pindex == object->size - 1);
1347         info.limit = flags;
1348         info.pagerflags = pagerflags;
1349         info.object = object;
1350         info.count = 0;
1351
1352         /*
1353          * If cleaning the entire object do a pass to mark the pages read-only.
1354          * If everything worked out ok, clear OBJ_WRITEABLE and
1355          * OBJ_MIGHTBEDIRTY.
1356          */
1357         if (wholescan) {
1358                 info.error = 0;
1359                 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1360                                         vm_object_page_clean_pass1, &info);
1361                 if (info.error == 0) {
1362                         vm_object_clear_flag(object,
1363                                              OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
1364                         if (object->type == OBJT_VNODE &&
1365                             (vp = (struct vnode *)object->handle) != NULL) {
1366                                 /*
1367                                  * Use new-style interface to clear VISDIRTY
1368                                  * because the vnode is not necessarily removed
1369                                  * from the syncer list(s) as often as it was
1370                                  * under the old interface, which can leave
1371                                  * the vnode on the syncer list after reclaim.
1372                                  */
1373                                 vclrobjdirty(vp);
1374                         }
1375                 }
1376         }
1377
1378         /*
1379          * Do a pass to clean all the dirty pages we find.
1380          */
1381         do {
1382                 info.error = 0;
1383                 generation = object->generation;
1384                 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1385                                         vm_object_page_clean_pass2, &info);
1386         } while (info.error || generation != object->generation);
1387
1388         vm_object_clear_flag(object, OBJ_CLEANING);
1389         vm_object_drop(object);
1390 }
1391
1392 /*
1393  * The caller must hold the object.
1394  */
1395 static
1396 int
1397 vm_object_page_clean_pass1(struct vm_page *p, void *data)
1398 {
1399         struct rb_vm_page_scan_info *info = data;
1400
1401         if ((++info->count & 63) == 0)
1402                 lwkt_user_yield();
1403         if (p->object != info->object ||
1404             p->pindex < info->start_pindex ||
1405             p->pindex > info->end_pindex) {
1406                 kprintf("vm_object_page_clean_pass1: obj/pg race %p/%p\n",
1407                         info->object, p);
1408                 return(0);
1409         }
1410         vm_page_flag_set(p, PG_CLEANCHK);
1411         if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1412                 info->error = 1;
1413         } else if (vm_page_busy_try(p, FALSE) == 0) {
1414                 if (p->object == info->object)
1415                         vm_page_protect(p, VM_PROT_READ);
1416                 vm_page_wakeup(p);
1417         } else {
1418                 info->error = 1;
1419         }
1420         return(0);
1421 }
1422
1423 /*
1424  * The caller must hold the object
1425  */
1426 static
1427 int
1428 vm_object_page_clean_pass2(struct vm_page *p, void *data)
1429 {
1430         struct rb_vm_page_scan_info *info = data;
1431         int generation;
1432
1433         if (p->object != info->object ||
1434             p->pindex < info->start_pindex ||
1435             p->pindex > info->end_pindex) {
1436                 kprintf("vm_object_page_clean_pass2: obj/pg race %p/%p\n",
1437                         info->object, p);
1438                 return(0);
1439         }
1440
1441         /*
1442          * Do not mess with pages that were inserted after we started
1443          * the cleaning pass.
1444          */
1445         if ((p->flags & PG_CLEANCHK) == 0)
1446                 goto done;
1447
1448         generation = info->object->generation;
1449         vm_page_busy_wait(p, TRUE, "vpcwai");
1450
1451         if (p->object != info->object ||
1452             p->pindex < info->start_pindex ||
1453             p->pindex > info->end_pindex ||
1454             info->object->generation != generation) {
1455                 info->error = 1;
1456                 vm_page_wakeup(p);
1457                 goto done;
1458         }
1459
1460         /*
1461          * Before wasting time traversing the pmaps, check for trivial
1462          * cases where the page cannot be dirty.
1463          */
1464         if (p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) {
1465                 KKASSERT((p->dirty & p->valid) == 0 &&
1466                          (p->flags & PG_NEED_COMMIT) == 0);
1467                 vm_page_wakeup(p);
1468                 goto done;
1469         }
1470
1471         /*
1472          * Check whether the page is dirty or not.  The page has been set
1473          * to be read-only so the check will not race a user dirtying the
1474          * page.
1475          */
1476         vm_page_test_dirty(p);
1477         if ((p->dirty & p->valid) == 0 && (p->flags & PG_NEED_COMMIT) == 0) {
1478                 vm_page_flag_clear(p, PG_CLEANCHK);
1479                 vm_page_wakeup(p);
1480                 goto done;
1481         }
1482
1483         /*
1484          * If we have been asked to skip nosync pages and this is a
1485          * nosync page, skip it.  Note that the object flags were
1486          * not cleared in this case (because pass1 will have returned an
1487          * error), so we do not have to set them.
1488          */
1489         if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1490                 vm_page_flag_clear(p, PG_CLEANCHK);
1491                 vm_page_wakeup(p);
1492                 goto done;
1493         }
1494
1495         /*
1496          * Flush as many pages as we can.  PG_CLEANCHK will be cleared on
1497          * the pages that get successfully flushed.  Set info->error if
1498          * we raced an object modification.
1499          */
1500         vm_object_page_collect_flush(info->object, p, info->pagerflags);
1501         /* vm_wait_nominal(); this can deadlock the system in syncer/pageout */
1502 done:
1503         if ((++info->count & 63) == 0)
1504                 lwkt_user_yield();
1505
1506         return(0);
1507 }
1508
1509 /*
1510  * Collect the specified page and nearby pages and flush them out.
1511  * The number of pages flushed is returned.  The passed page is busied
1512  * by the caller and we are responsible for its disposition.
1513  *
1514  * The caller must hold the object.
1515  */
1516 static void
1517 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags)
1518 {
1519         int error;
1520         int is;
1521         int ib;
1522         int i;
1523         int page_base;
1524         vm_pindex_t pi;
1525         vm_page_t ma[BLIST_MAX_ALLOC];
1526
1527         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1528
1529         pi = p->pindex;
1530         page_base = pi % BLIST_MAX_ALLOC;
1531         ma[page_base] = p;
1532         ib = page_base - 1;
1533         is = page_base + 1;
1534
1535         while (ib >= 0) {
1536                 vm_page_t tp;
1537
1538                 tp = vm_page_lookup_busy_try(object, pi - page_base + ib,
1539                                              TRUE, &error);
1540                 if (error)
1541                         break;
1542                 if (tp == NULL)
1543                         break;
1544                 if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
1545                     (tp->flags & PG_CLEANCHK) == 0) {
1546                         vm_page_wakeup(tp);
1547                         break;
1548                 }
1549                 if ((tp->queue - tp->pc) == PQ_CACHE) {
1550                         vm_page_flag_clear(tp, PG_CLEANCHK);
1551                         vm_page_wakeup(tp);
1552                         break;
1553                 }
1554                 vm_page_test_dirty(tp);
1555                 if ((tp->dirty & tp->valid) == 0 &&
1556                     (tp->flags & PG_NEED_COMMIT) == 0) {
1557                         vm_page_flag_clear(tp, PG_CLEANCHK);
1558                         vm_page_wakeup(tp);
1559                         break;
1560                 }
1561                 ma[ib] = tp;
1562                 --ib;
1563         }
1564         ++ib;   /* fixup */
1565
1566         while (is < BLIST_MAX_ALLOC &&
1567                pi - page_base + is < object->size) {
1568                 vm_page_t tp;
1569
1570                 tp = vm_page_lookup_busy_try(object, pi - page_base + is,
1571                                              TRUE, &error);
1572                 if (error)
1573                         break;
1574                 if (tp == NULL)
1575                         break;
1576                 if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
1577                     (tp->flags & PG_CLEANCHK) == 0) {
1578                         vm_page_wakeup(tp);
1579                         break;
1580                 }
1581                 if ((tp->queue - tp->pc) == PQ_CACHE) {
1582                         vm_page_flag_clear(tp, PG_CLEANCHK);
1583                         vm_page_wakeup(tp);
1584                         break;
1585                 }
1586                 vm_page_test_dirty(tp);
1587                 if ((tp->dirty & tp->valid) == 0 &&
1588                     (tp->flags & PG_NEED_COMMIT) == 0) {
1589                         vm_page_flag_clear(tp, PG_CLEANCHK);
1590                         vm_page_wakeup(tp);
1591                         break;
1592                 }
1593                 ma[is] = tp;
1594                 ++is;
1595         }
1596
1597         /*
1598          * All pages in the ma[] array are busied now
1599          */
1600         for (i = ib; i < is; ++i) {
1601                 vm_page_flag_clear(ma[i], PG_CLEANCHK);
1602                 vm_page_hold(ma[i]);    /* XXX need this any more? */
1603         }
1604         vm_pageout_flush(&ma[ib], is - ib, pagerflags);
1605         for (i = ib; i < is; ++i)       /* XXX need this any more? */
1606                 vm_page_unhold(ma[i]);
1607 }
1608
1609 /*
1610  * Same as vm_object_pmap_copy, except range checking really
1611  * works, and is meant for small sections of an object.
1612  *
1613  * This code protects resident pages by making them read-only
1614  * and is typically called on a fork or split when a page
1615  * is converted to copy-on-write.
1616  *
1617  * NOTE: If the page is already at VM_PROT_NONE, calling
1618  * vm_page_protect will have no effect.
1619  */
1620 void
1621 vm_object_pmap_copy_1(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
1622 {
1623         vm_pindex_t idx;
1624         vm_page_t p;
1625
1626         if (object == NULL || (object->flags & OBJ_WRITEABLE) == 0)
1627                 return;
1628
1629         vm_object_hold(object);
1630         for (idx = start; idx < end; idx++) {
1631                 p = vm_page_lookup(object, idx);
1632                 if (p == NULL)
1633                         continue;
1634                 vm_page_protect(p, VM_PROT_READ);
1635         }
1636         vm_object_drop(object);
1637 }
1638
1639 /*
1640  * Removes all physical pages in the specified object range from all
1641  * physical maps.
1642  *
1643  * The object must *not* be locked.
1644  */
1645
1646 static int vm_object_pmap_remove_callback(vm_page_t p, void *data);
1647
1648 void
1649 vm_object_pmap_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
1650 {
1651         struct rb_vm_page_scan_info info;
1652
1653         if (object == NULL)
1654                 return;
1655         info.start_pindex = start;
1656         info.end_pindex = end - 1;
1657         info.count = 0;
1658         info.object = object;
1659
1660         vm_object_hold(object);
1661         vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1662                                 vm_object_pmap_remove_callback, &info);
1663         if (start == 0 && end == object->size)
1664                 vm_object_clear_flag(object, OBJ_WRITEABLE);
1665         vm_object_drop(object);
1666 }
1667
1668 /*
1669  * The caller must hold the object
1670  */
1671 static int
1672 vm_object_pmap_remove_callback(vm_page_t p, void *data)
1673 {
1674         struct rb_vm_page_scan_info *info = data;
1675
1676         if ((++info->count & 63) == 0)
1677                 lwkt_user_yield();
1678
1679         if (info->object != p->object ||
1680             p->pindex < info->start_pindex ||
1681             p->pindex > info->end_pindex) {
1682                 kprintf("vm_object_pmap_remove_callback: obj/pg race %p/%p\n",
1683                         info->object, p);
1684                 return(0);
1685         }
1686
1687         vm_page_protect(p, VM_PROT_NONE);
1688
1689         return(0);
1690 }
1691
1692 /*
1693  * Implements the madvise function at the object/page level.
1694  *
1695  * MADV_WILLNEED        (any object)
1696  *
1697  *      Activate the specified pages if they are resident.
1698  *
1699  * MADV_DONTNEED        (any object)
1700  *
1701  *      Deactivate the specified pages if they are resident.
1702  *
1703  * MADV_FREE    (OBJT_DEFAULT/OBJT_SWAP objects, OBJ_ONEMAPPING only)
1704  *
1705  *      Deactivate and clean the specified pages if they are
1706  *      resident.  This permits the process to reuse the pages
1707  *      without faulting or the kernel to reclaim the pages
1708  *      without I/O.
1709  *
1710  * No requirements.
1711  */
1712 void
1713 vm_object_madvise(vm_object_t object, vm_pindex_t pindex, int count, int advise)
1714 {
1715         vm_pindex_t end, tpindex;
1716         vm_object_t tobject;
1717         vm_object_t xobj;
1718         vm_page_t m;
1719         int error;
1720
1721         if (object == NULL)
1722                 return;
1723
1724         end = pindex + count;
1725
1726         vm_object_hold(object);
1727         tobject = object;
1728
1729         /*
1730          * Locate and adjust resident pages
1731          */
1732         for (; pindex < end; pindex += 1) {
1733 relookup:
1734                 if (tobject != object)
1735                         vm_object_drop(tobject);
1736                 tobject = object;
1737                 tpindex = pindex;
1738 shadowlookup:
1739                 /*
1740                  * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
1741                  * and those pages must be OBJ_ONEMAPPING.
1742                  */
1743                 if (advise == MADV_FREE) {
1744                         if ((tobject->type != OBJT_DEFAULT &&
1745                              tobject->type != OBJT_SWAP) ||
1746                             (tobject->flags & OBJ_ONEMAPPING) == 0) {
1747                                 continue;
1748                         }
1749                 }
1750
1751                 m = vm_page_lookup_busy_try(tobject, tpindex, TRUE, &error);
1752
1753                 if (error) {
1754                         vm_page_sleep_busy(m, TRUE, "madvpo");
1755                         goto relookup;
1756                 }
1757                 if (m == NULL) {
1758                         /*
1759                          * There may be swap even if there is no backing page
1760                          */
1761                         if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
1762                                 swap_pager_freespace(tobject, tpindex, 1);
1763
1764                         /*
1765                          * next object
1766                          */
1767                         while ((xobj = tobject->backing_object) != NULL) {
1768                                 KKASSERT(xobj != object);
1769                                 vm_object_hold(xobj);
1770                                 if (xobj == tobject->backing_object)
1771                                         break;
1772                                 vm_object_drop(xobj);
1773                         }
1774                         if (xobj == NULL)
1775                                 continue;
1776                         tpindex += OFF_TO_IDX(tobject->backing_object_offset);
1777                         if (tobject != object) {
1778                                 vm_object_lock_swap();
1779                                 vm_object_drop(tobject);
1780                         }
1781                         tobject = xobj;
1782                         goto shadowlookup;
1783                 }
1784
1785                 /*
1786                  * If the page is not in a normal active state, we skip it.
1787                  * If the page is not managed there are no page queues to
1788                  * mess with.  Things can break if we mess with pages in
1789                  * any of the below states.
1790                  */
1791                 if (m->wire_count ||
1792                     (m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) ||
1793                     m->valid != VM_PAGE_BITS_ALL
1794                 ) {
1795                         vm_page_wakeup(m);
1796                         continue;
1797                 }
1798
1799                 /*
1800                  * Theoretically once a page is known not to be busy, an
1801                  * interrupt cannot come along and rip it out from under us.
1802                  */
1803
1804                 if (advise == MADV_WILLNEED) {
1805                         vm_page_activate(m);
1806                 } else if (advise == MADV_DONTNEED) {
1807                         vm_page_dontneed(m);
1808                 } else if (advise == MADV_FREE) {
1809                         /*
1810                          * Mark the page clean.  This will allow the page
1811                          * to be freed up by the system.  However, such pages
1812                          * are often reused quickly by malloc()/free()
1813                          * so we do not do anything that would cause
1814                          * a page fault if we can help it.
1815                          *
1816                          * Specifically, we do not try to actually free
1817                          * the page now nor do we try to put it in the
1818                          * cache (which would cause a page fault on reuse).
1819                          *
1820                          * But we do make the page is freeable as we
1821                          * can without actually taking the step of unmapping
1822                          * it.
1823                          */
1824                         pmap_clear_modify(m);
1825                         m->dirty = 0;
1826                         m->act_count = 0;
1827                         vm_page_dontneed(m);
1828                         if (tobject->type == OBJT_SWAP)
1829                                 swap_pager_freespace(tobject, tpindex, 1);
1830                 }
1831                 vm_page_wakeup(m);
1832         }
1833         if (tobject != object)
1834                 vm_object_drop(tobject);
1835         vm_object_drop(object);
1836 }
1837
1838 /*
1839  * Create a new object which is backed by the specified existing object
1840  * range.  Replace the pointer and offset that was pointing at the existing
1841  * object with the pointer/offset for the new object.
1842  *
1843  * If addref is non-zero the returned object is given an additional reference.
1844  * This mechanic exists to avoid the situation where refs might be 1 and
1845  * race against a collapse when the caller intends to bump it.  So the
1846  * caller cannot add the ref after the fact.  Used when the caller is
1847  * duplicating a vm_map_entry.
1848  *
1849  * No other requirements.
1850  */
1851 void
1852 vm_object_shadow(vm_object_t *objectp, vm_ooffset_t *offset, vm_size_t length,
1853                  int addref)
1854 {
1855         vm_object_t source;
1856         vm_object_t result;
1857         int useshadowlist;
1858
1859         source = *objectp;
1860
1861         /*
1862          * Don't create the new object if the old object isn't shared.
1863          * We have to chain wait before adding the reference to avoid
1864          * racing a collapse or deallocation.
1865          *
1866          * Clear OBJ_ONEMAPPING flag when shadowing.
1867          *
1868          * The caller owns a ref on source via *objectp which we are going
1869          * to replace.  This ref is inherited by the backing_object assignment.
1870          * from nobject and does not need to be incremented here.
1871          *
1872          * However, we add a temporary extra reference to the original source
1873          * prior to holding nobject in case we block, to avoid races where
1874          * someone else might believe that the source can be collapsed.
1875          */
1876         useshadowlist = 0;
1877         if (source) {
1878                 if (source->type != OBJT_VNODE) {
1879                         useshadowlist = 1;
1880                         vm_object_hold(source);
1881                         vm_object_chain_wait(source, 0);
1882                         if (source->ref_count == 1 &&
1883                             source->handle == NULL &&
1884                             (source->type == OBJT_DEFAULT ||
1885                              source->type == OBJT_SWAP)) {
1886                                 if (addref) {
1887                                         vm_object_reference_locked(source);
1888                                         vm_object_clear_flag(source,
1889                                                              OBJ_ONEMAPPING);
1890                                 }
1891                                 vm_object_drop(source);
1892                                 return;
1893                         }
1894                         vm_object_reference_locked(source);
1895                         vm_object_clear_flag(source, OBJ_ONEMAPPING);
1896                 } else {
1897                         vm_object_reference_quick(source);
1898                         vm_object_clear_flag(source, OBJ_ONEMAPPING);
1899                 }
1900         }
1901
1902         /*
1903          * Allocate a new object with the given length.  The new object
1904          * is returned referenced but we may have to add another one.
1905          * If we are adding a second reference we must clear OBJ_ONEMAPPING.
1906          * (typically because the caller is about to clone a vm_map_entry).
1907          *
1908          * The source object currently has an extra reference to prevent
1909          * collapses into it while we mess with its shadow list, which
1910          * we will remove later in this routine.
1911          *
1912          * The target object may require a second reference if asked for one
1913          * by the caller.
1914          */
1915         result = vm_object_allocate(OBJT_DEFAULT, length);
1916         if (result == NULL)
1917                 panic("vm_object_shadow: no object for shadowing");
1918         vm_object_hold(result);
1919         if (addref) {
1920                 vm_object_reference_locked(result);
1921                 vm_object_clear_flag(result, OBJ_ONEMAPPING);
1922         }
1923
1924         /*
1925          * The new object shadows the source object.  Chain wait before
1926          * adjusting shadow_count or the shadow list to avoid races.
1927          *
1928          * Try to optimize the result object's page color when shadowing
1929          * in order to maintain page coloring consistency in the combined
1930          * shadowed object.
1931          *
1932          * The backing_object reference to source requires adding a ref to
1933          * source.  We simply inherit the ref from the original *objectp
1934          * (which we are replacing) so no additional refs need to be added.
1935          * (we must still clean up the extra ref we had to prevent collapse
1936          * races).
1937          *
1938          * SHADOWING IS NOT APPLICABLE TO OBJT_VNODE OBJECTS
1939          */
1940         KKASSERT(result->backing_object == NULL);
1941         result->backing_object = source;
1942         if (source) {
1943                 if (useshadowlist) {
1944                         vm_object_chain_wait(source, 0);
1945                         LIST_INSERT_HEAD(&source->shadow_head,
1946                                          result, shadow_list);
1947                         source->shadow_count++;
1948                         source->generation++;
1949                         vm_object_set_flag(result, OBJ_ONSHADOW);
1950                 }
1951                 /* cpu localization twist */
1952                 result->pg_color = vm_quickcolor();
1953         }
1954
1955         /*
1956          * Adjust the return storage.  Drop the ref on source before
1957          * returning.
1958          */
1959         result->backing_object_offset = *offset;
1960         vm_object_drop(result);
1961         *offset = 0;
1962         if (source) {
1963                 if (useshadowlist) {
1964                         vm_object_deallocate_locked(source);
1965                         vm_object_drop(source);
1966                 } else {
1967                         vm_object_deallocate(source);
1968                 }
1969         }
1970
1971         /*
1972          * Return the new things
1973          */
1974         *objectp = result;
1975 }
1976
1977 #define OBSC_TEST_ALL_SHADOWED  0x0001
1978 #define OBSC_COLLAPSE_NOWAIT    0x0002
1979 #define OBSC_COLLAPSE_WAIT      0x0004
1980
1981 static int vm_object_backing_scan_callback(vm_page_t p, void *data);
1982
1983 /*
1984  * The caller must hold the object.
1985  */
1986 static __inline int
1987 vm_object_backing_scan(vm_object_t object, vm_object_t backing_object, int op)
1988 {
1989         struct rb_vm_page_scan_info info;
1990         int n;
1991
1992         vm_object_assert_held(object);
1993         vm_object_assert_held(backing_object);
1994
1995         KKASSERT(backing_object == object->backing_object);
1996         info.backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
1997
1998         /*
1999          * Initial conditions
2000          */
2001         if (op & OBSC_TEST_ALL_SHADOWED) {
2002                 /*
2003                  * We do not want to have to test for the existence of
2004                  * swap pages in the backing object.  XXX but with the
2005                  * new swapper this would be pretty easy to do.
2006                  *
2007                  * XXX what about anonymous MAP_SHARED memory that hasn't
2008                  * been ZFOD faulted yet?  If we do not test for this, the
2009                  * shadow test may succeed! XXX
2010                  */
2011                 if (backing_object->type != OBJT_DEFAULT)
2012                         return(0);
2013         }
2014         if (op & OBSC_COLLAPSE_WAIT) {
2015                 KKASSERT((backing_object->flags & OBJ_DEAD) == 0);
2016                 vm_object_set_flag(backing_object, OBJ_DEAD);
2017
2018                 n = VMOBJ_HASH(backing_object);
2019                 lwkt_gettoken(&vmobj_tokens[n]);
2020                 TAILQ_REMOVE(&vm_object_lists[n], backing_object, object_list);
2021                 lwkt_reltoken(&vmobj_tokens[n]);
2022                 atomic_add_long(&vm_object_count, -1);
2023         }
2024
2025         /*
2026          * Our scan.   We have to retry if a negative error code is returned,
2027          * otherwise 0 or 1 will be returned in info.error.  0 Indicates that
2028          * the scan had to be stopped because the parent does not completely
2029          * shadow the child.
2030          */
2031         info.object = object;
2032         info.backing_object = backing_object;
2033         info.limit = op;
2034         do {
2035                 info.error = 1;
2036                 vm_page_rb_tree_RB_SCAN(&backing_object->rb_memq, NULL,
2037                                         vm_object_backing_scan_callback,
2038                                         &info);
2039         } while (info.error < 0);
2040
2041         return(info.error);
2042 }
2043
2044 /*
2045  * The caller must hold the object.
2046  */
2047 static int
2048 vm_object_backing_scan_callback(vm_page_t p, void *data)
2049 {
2050         struct rb_vm_page_scan_info *info = data;
2051         vm_object_t backing_object;
2052         vm_object_t object;
2053         vm_pindex_t pindex;
2054         vm_pindex_t new_pindex;
2055         vm_pindex_t backing_offset_index;
2056         int op;
2057
2058         pindex = p->pindex;
2059         new_pindex = pindex - info->backing_offset_index;
2060         op = info->limit;
2061         object = info->object;
2062         backing_object = info->backing_object;
2063         backing_offset_index = info->backing_offset_index;
2064
2065         if (op & OBSC_TEST_ALL_SHADOWED) {
2066                 vm_page_t pp;
2067
2068                 /*
2069                  * Ignore pages outside the parent object's range
2070                  * and outside the parent object's mapping of the
2071                  * backing object.
2072                  *
2073                  * note that we do not busy the backing object's
2074                  * page.
2075                  */
2076                 if (pindex < backing_offset_index ||
2077                     new_pindex >= object->size
2078                 ) {
2079                         return(0);
2080                 }
2081
2082                 /*
2083                  * See if the parent has the page or if the parent's
2084                  * object pager has the page.  If the parent has the
2085                  * page but the page is not valid, the parent's
2086                  * object pager must have the page.
2087                  *
2088                  * If this fails, the parent does not completely shadow
2089                  * the object and we might as well give up now.
2090                  */
2091                 pp = vm_page_lookup(object, new_pindex);
2092                 if ((pp == NULL || pp->valid == 0) &&
2093                     !vm_pager_has_page(object, new_pindex)
2094                 ) {
2095                         info->error = 0;        /* problemo */
2096                         return(-1);             /* stop the scan */
2097                 }
2098         }
2099
2100         /*
2101          * Check for busy page.  Note that we may have lost (p) when we
2102          * possibly blocked above.
2103          */
2104         if (op & (OBSC_COLLAPSE_WAIT | OBSC_COLLAPSE_NOWAIT)) {
2105                 vm_page_t pp;
2106
2107                 if (vm_page_busy_try(p, TRUE)) {
2108                         if (op & OBSC_COLLAPSE_NOWAIT) {
2109                                 return(0);
2110                         } else {
2111                                 /*
2112                                  * If we slept, anything could have
2113                                  * happened.   Ask that the scan be restarted.
2114                                  *
2115                                  * Since the object is marked dead, the
2116                                  * backing offset should not have changed.
2117                                  */
2118                                 vm_page_sleep_busy(p, TRUE, "vmocol");
2119                                 info->error = -1;
2120                                 return(-1);
2121                         }
2122                 }
2123
2124                 /*
2125                  * If (p) is no longer valid restart the scan.
2126                  */
2127                 if (p->object != backing_object || p->pindex != pindex) {
2128                         kprintf("vm_object_backing_scan: Warning: page "
2129                                 "%p ripped out from under us\n", p);
2130                         vm_page_wakeup(p);
2131                         info->error = -1;
2132                         return(-1);
2133                 }
2134
2135                 if (op & OBSC_COLLAPSE_NOWAIT) {
2136                         if (p->valid == 0 ||
2137                             p->wire_count ||
2138                             (p->flags & PG_NEED_COMMIT)) {
2139                                 vm_page_wakeup(p);
2140                                 return(0);
2141                         }
2142                 } else {
2143                         /* XXX what if p->valid == 0 , hold_count, etc? */
2144                 }
2145
2146                 KASSERT(
2147                     p->object == backing_object,
2148                     ("vm_object_qcollapse(): object mismatch")
2149                 );
2150
2151                 /*
2152                  * Destroy any associated swap
2153                  */
2154                 if (backing_object->type == OBJT_SWAP)
2155                         swap_pager_freespace(backing_object, p->pindex, 1);
2156
2157                 if (
2158                     p->pindex < backing_offset_index ||
2159                     new_pindex >= object->size
2160                 ) {
2161                         /*
2162                          * Page is out of the parent object's range, we
2163                          * can simply destroy it.
2164                          */
2165                         vm_page_protect(p, VM_PROT_NONE);
2166                         vm_page_free(p);
2167                         return(0);
2168                 }
2169
2170                 pp = vm_page_lookup(object, new_pindex);
2171                 if (pp != NULL || vm_pager_has_page(object, new_pindex)) {
2172                         /*
2173                          * page already exists in parent OR swap exists
2174                          * for this location in the parent.  Destroy
2175                          * the original page from the backing object.
2176                          *
2177                          * Leave the parent's page alone
2178                          */
2179                         vm_page_protect(p, VM_PROT_NONE);
2180                         vm_page_free(p);
2181                         return(0);
2182                 }
2183
2184                 /*
2185                  * Page does not exist in parent, rename the
2186                  * page from the backing object to the main object.
2187                  *
2188                  * If the page was mapped to a process, it can remain
2189                  * mapped through the rename.
2190                  */
2191                 if ((p->queue - p->pc) == PQ_CACHE)
2192                         vm_page_deactivate(p);
2193
2194                 vm_page_rename(p, object, new_pindex);
2195                 vm_page_wakeup(p);
2196                 /* page automatically made dirty by rename */
2197         }
2198         return(0);
2199 }
2200
2201 /*
2202  * This version of collapse allows the operation to occur earlier and
2203  * when paging_in_progress is true for an object...  This is not a complete
2204  * operation, but should plug 99.9% of the rest of the leaks.
2205  *
2206  * The caller must hold the object and backing_object and both must be
2207  * chainlocked.
2208  *
2209  * (only called from vm_object_collapse)
2210  */
2211 static void
2212 vm_object_qcollapse(vm_object_t object, vm_object_t backing_object)
2213 {
2214         if (backing_object->ref_count == 1) {
2215                 atomic_add_int(&backing_object->ref_count, 2);
2216 #if defined(DEBUG_LOCKS)
2217                 debugvm_object_add(backing_object, "qcollapse", 1, 2);
2218 #endif
2219                 vm_object_backing_scan(object, backing_object,
2220                                        OBSC_COLLAPSE_NOWAIT);
2221                 atomic_add_int(&backing_object->ref_count, -2);
2222 #if defined(DEBUG_LOCKS)
2223                 debugvm_object_add(backing_object, "qcollapse", 2, -2);
2224 #endif
2225         }
2226 }
2227
2228 /*
2229  * Collapse an object with the object backing it.  Pages in the backing
2230  * object are moved into the parent, and the backing object is deallocated.
2231  * Any conflict is resolved in favor of the parent's existing pages.
2232  *
2233  * object must be held and chain-locked on call.
2234  *
2235  * The caller must have an extra ref on object to prevent a race from
2236  * destroying it during the collapse.
2237  */
2238 void
2239 vm_object_collapse(vm_object_t object, struct vm_object_dealloc_list **dlistp)
2240 {
2241         struct vm_object_dealloc_list *dlist = NULL;
2242         vm_object_t backing_object;
2243
2244         /*
2245          * Only one thread is attempting a collapse at any given moment.
2246          * There are few restrictions for (object) that callers of this
2247          * function check so reentrancy is likely.
2248          */
2249         KKASSERT(object != NULL);
2250         vm_object_assert_held(object);
2251         KKASSERT(object->chainlk & (CHAINLK_MASK | CHAINLK_EXCL));
2252
2253         for (;;) {
2254                 vm_object_t bbobj;
2255                 int dodealloc;
2256
2257                 /*
2258                  * We can only collapse a DEFAULT/SWAP object with a
2259                  * DEFAULT/SWAP object.
2260                  */
2261                 if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP) {
2262                         backing_object = NULL;
2263                         break;
2264                 }
2265
2266                 backing_object = object->backing_object;
2267                 if (backing_object == NULL)
2268                         break;
2269                 if (backing_object->type != OBJT_DEFAULT &&
2270                     backing_object->type != OBJT_SWAP) {
2271                         backing_object = NULL;
2272                         break;
2273                 }
2274
2275                 /*
2276                  * Hold the backing_object and check for races
2277                  */
2278                 vm_object_hold(backing_object);
2279                 if (backing_object != object->backing_object ||
2280                     (backing_object->type != OBJT_DEFAULT &&
2281                      backing_object->type != OBJT_SWAP)) {
2282                         vm_object_drop(backing_object);
2283                         continue;
2284                 }
2285
2286                 /*
2287                  * Chain-lock the backing object too because if we
2288                  * successfully merge its pages into the top object we
2289                  * will collapse backing_object->backing_object as the
2290                  * new backing_object.  Re-check that it is still our
2291                  * backing object.
2292                  */
2293                 vm_object_chain_acquire(backing_object, 0);
2294                 if (backing_object != object->backing_object) {
2295                         vm_object_chain_release(backing_object);
2296                         vm_object_drop(backing_object);
2297                         continue;
2298                 }
2299
2300                 /*
2301                  * we check the backing object first, because it is most likely
2302                  * not collapsable.
2303                  */
2304                 if (backing_object->handle != NULL ||
2305                     (backing_object->type != OBJT_DEFAULT &&
2306                      backing_object->type != OBJT_SWAP) ||
2307                     (backing_object->flags & OBJ_DEAD) ||
2308                     object->handle != NULL ||
2309                     (object->type != OBJT_DEFAULT &&
2310                      object->type != OBJT_SWAP) ||
2311                     (object->flags & OBJ_DEAD)) {
2312                         break;
2313                 }
2314
2315                 /*
2316                  * If paging is in progress we can't do a normal collapse.
2317                  */
2318                 if (
2319                     object->paging_in_progress != 0 ||
2320                     backing_object->paging_in_progress != 0
2321                 ) {
2322                         vm_object_qcollapse(object, backing_object);
2323                         break;
2324                 }
2325
2326                 /*
2327                  * We know that we can either collapse the backing object (if
2328                  * the parent is the only reference to it) or (perhaps) have
2329                  * the parent bypass the object if the parent happens to shadow
2330                  * all the resident pages in the entire backing object.
2331                  *
2332                  * This is ignoring pager-backed pages such as swap pages.
2333                  * vm_object_backing_scan fails the shadowing test in this
2334                  * case.
2335                  */
2336                 if (backing_object->ref_count == 1) {
2337                         /*
2338                          * If there is exactly one reference to the backing
2339                          * object, we can collapse it into the parent.
2340                          */
2341                         KKASSERT(object->backing_object == backing_object);
2342                         vm_object_backing_scan(object, backing_object,
2343                                                OBSC_COLLAPSE_WAIT);
2344
2345                         /*
2346                          * Move the pager from backing_object to object.
2347                          */
2348                         if (backing_object->type == OBJT_SWAP) {
2349                                 vm_object_pip_add(backing_object, 1);
2350
2351                                 /*
2352                                  * scrap the paging_offset junk and do a
2353                                  * discrete copy.  This also removes major
2354                                  * assumptions about how the swap-pager
2355                                  * works from where it doesn't belong.  The
2356                                  * new swapper is able to optimize the
2357                                  * destroy-source case.
2358                                  */
2359                                 vm_object_pip_add(object, 1);
2360                                 swap_pager_copy(backing_object, object,
2361                                     OFF_TO_IDX(object->backing_object_offset),
2362                                     TRUE);
2363                                 vm_object_pip_wakeup(object);
2364                                 vm_object_pip_wakeup(backing_object);
2365                         }
2366
2367                         /*
2368                          * Object now shadows whatever backing_object did.
2369                          * Remove object from backing_object's shadow_list.
2370                          *
2371                          * Removing object from backing_objects shadow list
2372                          * requires releasing object, which we will do below.
2373                          */
2374                         KKASSERT(object->backing_object == backing_object);
2375                         if (object->flags & OBJ_ONSHADOW) {
2376                                 LIST_REMOVE(object, shadow_list);
2377                                 backing_object->shadow_count--;
2378                                 backing_object->generation++;
2379                                 vm_object_clear_flag(object, OBJ_ONSHADOW);
2380                         }
2381
2382                         /*
2383                          * backing_object->backing_object moves from within
2384                          * backing_object to within object.
2385                          *
2386                          * OBJT_VNODE bbobj's should have empty shadow lists.
2387                          */
2388                         while ((bbobj = backing_object->backing_object) != NULL) {
2389                                 if (bbobj->type == OBJT_VNODE)
2390                                         vm_object_hold_shared(bbobj);
2391                                 else
2392                                         vm_object_hold(bbobj);
2393                                 if (bbobj == backing_object->backing_object)
2394                                         break;
2395                                 vm_object_drop(bbobj);
2396                         }
2397
2398                         /*
2399                          * We are removing backing_object from bbobj's
2400                          * shadow list and adding object to bbobj's shadow
2401                          * list, so the ref_count on bbobj is unchanged.
2402                          */
2403                         if (bbobj) {
2404                                 if (backing_object->flags & OBJ_ONSHADOW) {
2405                                         /* not locked exclusively if vnode */
2406                                         KKASSERT(bbobj->type != OBJT_VNODE);
2407                                         LIST_REMOVE(backing_object,
2408                                                     shadow_list);
2409                                         bbobj->shadow_count--;
2410                                         bbobj->generation++;
2411                                         vm_object_clear_flag(backing_object,
2412                                                              OBJ_ONSHADOW);
2413                                 }
2414                                 backing_object->backing_object = NULL;
2415                         }
2416                         object->backing_object = bbobj;
2417                         if (bbobj) {
2418                                 if (bbobj->type != OBJT_VNODE) {
2419                                         LIST_INSERT_HEAD(&bbobj->shadow_head,
2420                                                          object, shadow_list);
2421                                         bbobj->shadow_count++;
2422                                         bbobj->generation++;
2423                                         vm_object_set_flag(object,
2424                                                            OBJ_ONSHADOW);
2425                                 }
2426                         }
2427
2428                         object->backing_object_offset +=
2429                                 backing_object->backing_object_offset;
2430
2431                         vm_object_drop(bbobj);
2432
2433                         /*
2434                          * Discard the old backing_object.  Nothing should be
2435                          * able to ref it, other than a vm_map_split(),
2436                          * and vm_map_split() will stall on our chain lock.
2437                          * And we control the parent so it shouldn't be
2438                          * possible for it to go away either.
2439                          *
2440                          * Since the backing object has no pages, no pager
2441                          * left, and no object references within it, all
2442                          * that is necessary is to dispose of it.
2443                          */
2444                         KASSERT(backing_object->ref_count == 1,
2445                                 ("backing_object %p was somehow "
2446                                  "re-referenced during collapse!",
2447                                  backing_object));
2448                         KASSERT(RB_EMPTY(&backing_object->rb_memq),
2449                                 ("backing_object %p somehow has left "
2450                                  "over pages during collapse!",
2451                                  backing_object));
2452
2453                         /*
2454                          * The object can be destroyed.
2455                          *
2456                          * XXX just fall through and dodealloc instead
2457                          *     of forcing destruction?
2458                          */
2459                         atomic_add_int(&backing_object->ref_count, -1);
2460 #if defined(DEBUG_LOCKS)
2461                         debugvm_object_add(backing_object, "collapse", 1, -1);
2462 #endif
2463                         if ((backing_object->flags & OBJ_DEAD) == 0)
2464                                 vm_object_terminate(backing_object);
2465                         object_collapses++;
2466                         dodealloc = 0;
2467                 } else {
2468                         /*
2469                          * If we do not entirely shadow the backing object,
2470                          * there is nothing we can do so we give up.
2471                          */
2472                         if (vm_object_backing_scan(object, backing_object,
2473                                                 OBSC_TEST_ALL_SHADOWED) == 0) {
2474                                 break;
2475                         }
2476
2477                         /*
2478                          * bbobj is backing_object->backing_object.  Since
2479                          * object completely shadows backing_object we can
2480                          * bypass it and become backed by bbobj instead.
2481                          *
2482                          * The shadow list for vnode backing objects is not
2483                          * used and a shared hold is allowed.
2484                          */
2485                         while ((bbobj = backing_object->backing_object) != NULL) {
2486                                 if (bbobj->type == OBJT_VNODE)
2487                                         vm_object_hold_shared(bbobj);
2488                                 else
2489                                         vm_object_hold(bbobj);
2490                                 if (bbobj == backing_object->backing_object)
2491                                         break;
2492                                 vm_object_drop(bbobj);
2493                         }
2494
2495                         /*
2496                          * Make object shadow bbobj instead of backing_object.
2497                          * Remove object from backing_object's shadow list.
2498                          *
2499                          * Deallocating backing_object will not remove
2500                          * it, since its reference count is at least 2.
2501                          *
2502                          * Removing object from backing_object's shadow
2503                          * list requires releasing a ref, which we do
2504                          * below by setting dodealloc to 1.
2505                          */
2506                         KKASSERT(object->backing_object == backing_object);
2507                         if (object->flags & OBJ_ONSHADOW) {
2508                                 LIST_REMOVE(object, shadow_list);
2509                                 backing_object->shadow_count--;
2510                                 backing_object->generation++;
2511                                 vm_object_clear_flag(object, OBJ_ONSHADOW);
2512                         }
2513
2514                         /*
2515                          * Add a ref to bbobj, bbobj now shadows object.
2516                          *
2517                          * NOTE: backing_object->backing_object still points
2518                          *       to bbobj.  That relationship remains intact
2519                          *       because backing_object has > 1 ref, so
2520                          *       someone else is pointing to it (hence why
2521                          *       we can't collapse it into object and can
2522                          *       only handle the all-shadowed bypass case).
2523                          */
2524                         if (bbobj) {
2525                                 if (bbobj->type != OBJT_VNODE) {
2526                                         vm_object_chain_wait(bbobj, 0);
2527                                         vm_object_reference_locked(bbobj);
2528                                         LIST_INSERT_HEAD(&bbobj->shadow_head,
2529                                                          object, shadow_list);
2530                                         bbobj->shadow_count++;
2531                                         bbobj->generation++;
2532                                         vm_object_set_flag(object,
2533                                                            OBJ_ONSHADOW);
2534                                 } else {
2535                                         vm_object_reference_quick(bbobj);
2536                                 }
2537                                 object->backing_object_offset +=
2538                                         backing_object->backing_object_offset;
2539                                 object->backing_object = bbobj;
2540                                 vm_object_drop(bbobj);
2541                         } else {
2542                                 object->backing_object = NULL;
2543                         }
2544
2545                         /*
2546                          * Drop the reference count on backing_object.  To
2547                          * handle ref_count races properly we can't assume
2548                          * that the ref_count is still at least 2 so we
2549                          * have to actually call vm_object_deallocate()
2550                          * (after clearing the chainlock).
2551                          */
2552                         object_bypasses++;
2553                         dodealloc = 1;
2554                 }
2555
2556                 /*
2557                  * Ok, we want to loop on the new object->bbobj association,
2558                  * possibly collapsing it further.  However if dodealloc is
2559                  * non-zero we have to deallocate the backing_object which
2560                  * itself can potentially undergo a collapse, creating a
2561                  * recursion depth issue with the LWKT token subsystem.
2562                  *
2563                  * In the case where we must deallocate the backing_object
2564                  * it is possible now that the backing_object has a single
2565                  * shadow count on some other object (not represented here
2566                  * as yet), since it no longer shadows us.  Thus when we
2567                  * call vm_object_deallocate() it may attempt to collapse
2568                  * itself into its remaining parent.
2569                  */
2570                 if (dodealloc) {
2571                         struct vm_object_dealloc_list *dtmp;
2572
2573                         vm_object_chain_release(backing_object);
2574                         vm_object_unlock(backing_object);
2575                         /* backing_object remains held */
2576
2577                         /*
2578                          * Auto-deallocation list for caller convenience.
2579                          */
2580                         if (dlistp == NULL)
2581                                 dlistp = &dlist;
2582
2583                         dtmp = kmalloc(sizeof(*dtmp), M_TEMP, M_WAITOK);
2584                         dtmp->object = backing_object;
2585                         dtmp->next = *dlistp;
2586                         *dlistp = dtmp;
2587                 } else {
2588                         vm_object_chain_release(backing_object);
2589                         vm_object_drop(backing_object);
2590                 }
2591                 /* backing_object = NULL; not needed */
2592                 /* loop */
2593         }
2594
2595         /*
2596          * Clean up any left over backing_object
2597          */
2598         if (backing_object) {
2599                 vm_object_chain_release(backing_object);
2600                 vm_object_drop(backing_object);
2601         }
2602
2603         /*
2604          * Clean up any auto-deallocation list.  This is a convenience
2605          * for top-level callers so they don't have to pass &dlist.
2606          * Do not clean up any caller-passed dlistp, the caller will
2607          * do that.
2608          */
2609         if (dlist)
2610                 vm_object_deallocate_list(&dlist);
2611
2612 }
2613
2614 /*
2615  * vm_object_collapse() may collect additional objects in need of
2616  * deallocation.  This routine deallocates these objects.  The
2617  * deallocation itself can trigger additional collapses (which the
2618  * deallocate function takes care of).  This procedure is used to
2619  * reduce procedural recursion since these vm_object shadow chains
2620  * can become quite long.
2621  */
2622 void
2623 vm_object_deallocate_list(struct vm_object_dealloc_list **dlistp)
2624 {
2625         struct vm_object_dealloc_list *dlist;
2626
2627         while ((dlist = *dlistp) != NULL) {
2628                 *dlistp = dlist->next;
2629                 vm_object_lock(dlist->object);
2630                 vm_object_deallocate_locked(dlist->object);
2631                 vm_object_drop(dlist->object);
2632                 kfree(dlist, M_TEMP);
2633         }
2634 }
2635
2636 /*
2637  * Removes all physical pages in the specified object range from the
2638  * object's list of pages.
2639  *
2640  * No requirements.
2641  */
2642 static int vm_object_page_remove_callback(vm_page_t p, void *data);
2643
2644 void
2645 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
2646                       boolean_t clean_only)
2647 {
2648         struct rb_vm_page_scan_info info;
2649         int all;
2650
2651         /*
2652          * Degenerate cases and assertions
2653          */
2654         vm_object_hold(object);
2655         if (object == NULL ||
2656             (object->resident_page_count == 0 && object->swblock_count == 0)) {
2657                 vm_object_drop(object);
2658                 return;
2659         }
2660         KASSERT(object->type != OBJT_PHYS,
2661                 ("attempt to remove pages from a physical object"));
2662
2663         /*
2664          * Indicate that paging is occuring on the object
2665          */
2666         vm_object_pip_add(object, 1);
2667
2668         /*
2669          * Figure out the actual removal range and whether we are removing
2670          * the entire contents of the object or not.  If removing the entire
2671          * contents, be sure to get all pages, even those that might be
2672          * beyond the end of the object.
2673          */
2674         info.object = object;
2675         info.start_pindex = start;
2676         if (end == 0)
2677                 info.end_pindex = (vm_pindex_t)-1;
2678         else
2679                 info.end_pindex = end - 1;
2680         info.limit = clean_only;
2681         all = (start == 0 && info.end_pindex >= object->size - 1);
2682
2683         /*
2684          * Loop until we are sure we have gotten them all.
2685          */
2686         do {
2687                 info.error = 0;
2688                 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
2689                                         vm_object_page_remove_callback, &info);
2690         } while (info.error);
2691
2692         /*
2693          * Remove any related swap if throwing away pages, or for
2694          * non-swap objects (the swap is a clean copy in that case).
2695          */
2696         if (object->type != OBJT_SWAP || clean_only == FALSE) {
2697                 if (all)
2698                         swap_pager_freespace_all(object);
2699                 else
2700                         swap_pager_freespace(object, info.start_pindex,
2701                              info.end_pindex - info.start_pindex + 1);
2702         }
2703
2704         /*
2705          * Cleanup
2706          */
2707         vm_object_pip_wakeup(object);
2708         vm_object_drop(object);
2709 }
2710
2711 /*
2712  * The caller must hold the object
2713  */
2714 static int
2715 vm_object_page_remove_callback(vm_page_t p, void *data)
2716 {
2717         struct rb_vm_page_scan_info *info = data;
2718
2719         if ((++info->count & 63) == 0)
2720                 lwkt_user_yield();
2721
2722         if (info->object != p->object ||
2723             p->pindex < info->start_pindex ||
2724             p->pindex > info->end_pindex) {
2725                 kprintf("vm_object_page_remove_callbackA: obj/pg race %p/%p\n",
2726                         info->object, p);
2727                 return(0);
2728         }
2729         if (vm_page_busy_try(p, TRUE)) {
2730                 vm_page_sleep_busy(p, TRUE, "vmopar");
2731                 info->error = 1;
2732                 return(0);
2733         }
2734         if (info->object != p->object) {
2735                 /* this should never happen */
2736                 kprintf("vm_object_page_remove_callbackB: obj/pg race %p/%p\n",
2737                         info->object, p);
2738                 vm_page_wakeup(p);
2739                 return(0);
2740         }
2741
2742         /*
2743          * Wired pages cannot be destroyed, but they can be invalidated
2744          * and we do so if clean_only (limit) is not set.
2745          *
2746          * WARNING!  The page may be wired due to being part of a buffer
2747          *           cache buffer, and the buffer might be marked B_CACHE.
2748          *           This is fine as part of a truncation but VFSs must be
2749          *           sure to fix the buffer up when re-extending the file.
2750          *
2751          * NOTE!     PG_NEED_COMMIT is ignored.
2752          */
2753         if (p->wire_count != 0) {
2754                 vm_page_protect(p, VM_PROT_NONE);
2755                 if (info->limit == 0)
2756                         p->valid = 0;
2757                 vm_page_wakeup(p);
2758                 return(0);
2759         }
2760
2761         /*
2762          * limit is our clean_only flag.  If set and the page is dirty or
2763          * requires a commit, do not free it.  If set and the page is being
2764          * held by someone, do not free it.
2765          */
2766         if (info->limit && p->valid) {
2767                 vm_page_test_dirty(p);
2768                 if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
2769                         vm_page_wakeup(p);
2770                         return(0);
2771                 }
2772         }
2773
2774         /*
2775          * Destroy the page
2776          */
2777         vm_page_protect(p, VM_PROT_NONE);
2778         vm_page_free(p);
2779
2780         return(0);
2781 }
2782
2783 /*
2784  * Coalesces two objects backing up adjoining regions of memory into a
2785  * single object.
2786  *
2787  * returns TRUE if objects were combined.
2788  *
2789  * NOTE: Only works at the moment if the second object is NULL -
2790  *       if it's not, which object do we lock first?
2791  *
2792  * Parameters:
2793  *      prev_object     First object to coalesce
2794  *      prev_offset     Offset into prev_object
2795  *      next_object     Second object into coalesce
2796  *      next_offset     Offset into next_object
2797  *
2798  *      prev_size       Size of reference to prev_object
2799  *      next_size       Size of reference to next_object
2800  *
2801  * The caller does not need to hold (prev_object) but must have a stable
2802  * pointer to it (typically by holding the vm_map locked).
2803  */
2804 boolean_t
2805 vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
2806                    vm_size_t prev_size, vm_size_t next_size)
2807 {
2808         vm_pindex_t next_pindex;
2809
2810         if (prev_object == NULL)
2811                 return (TRUE);
2812
2813         vm_object_hold(prev_object);
2814
2815         if (prev_object->type != OBJT_DEFAULT &&
2816             prev_object->type != OBJT_SWAP) {
2817                 vm_object_drop(prev_object);
2818                 return (FALSE);
2819         }
2820
2821         /*
2822          * Try to collapse the object first
2823          */
2824         vm_object_chain_acquire(prev_object, 0);
2825         vm_object_collapse(prev_object, NULL);
2826
2827         /*
2828          * Can't coalesce if: . more than one reference . paged out . shadows
2829          * another object . has a copy elsewhere (any of which mean that the
2830          * pages not mapped to prev_entry may be in use anyway)
2831          */
2832
2833         if (prev_object->backing_object != NULL) {
2834                 vm_object_chain_release(prev_object);
2835                 vm_object_drop(prev_object);
2836                 return (FALSE);
2837         }
2838
2839         prev_size >>= PAGE_SHIFT;
2840         next_size >>= PAGE_SHIFT;
2841         next_pindex = prev_pindex + prev_size;
2842
2843         if ((prev_object->ref_count > 1) &&
2844             (prev_object->size != next_pindex)) {
2845                 vm_object_chain_release(prev_object);
2846                 vm_object_drop(prev_object);
2847                 return (FALSE);
2848         }
2849
2850         /*
2851          * Remove any pages that may still be in the object from a previous
2852          * deallocation.
2853          */
2854         if (next_pindex < prev_object->size) {
2855                 vm_object_page_remove(prev_object,
2856                                       next_pindex,
2857                                       next_pindex + next_size, FALSE);
2858                 if (prev_object->type == OBJT_SWAP)
2859                         swap_pager_freespace(prev_object,
2860                                              next_pindex, next_size);
2861         }
2862
2863         /*
2864          * Extend the object if necessary.
2865          */
2866         if (next_pindex + next_size > prev_object->size)
2867                 prev_object->size = next_pindex + next_size;
2868
2869         vm_object_chain_release(prev_object);
2870         vm_object_drop(prev_object);
2871         return (TRUE);
2872 }
2873
2874 /*
2875  * Make the object writable and flag is being possibly dirty.
2876  *
2877  * The object might not be held (or might be held but held shared),
2878  * the related vnode is probably not held either.  Object and vnode are
2879  * stable by virtue of the vm_page busied by the caller preventing
2880  * destruction.
2881  *
2882  * If the related mount is flagged MNTK_THR_SYNC we need to call
2883  * vsetobjdirty().  Filesystems using this option usually shortcut
2884  * synchronization by only scanning the syncer list.
2885  */
2886 void
2887 vm_object_set_writeable_dirty(vm_object_t object)
2888 {
2889         struct vnode *vp;
2890
2891         /*vm_object_assert_held(object);*/
2892         /*
2893          * Avoid contention in vm fault path by checking the state before
2894          * issuing an atomic op on it.
2895          */
2896         if ((object->flags & (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) !=
2897             (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) {
2898                 vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
2899         }
2900         if (object->type == OBJT_VNODE &&
2901             (vp = (struct vnode *)object->handle) != NULL) {
2902                 if ((vp->v_flag & VOBJDIRTY) == 0) {
2903                         if (vp->v_mount &&
2904                             (vp->v_mount->mnt_kern_flag & MNTK_THR_SYNC)) {
2905                                 /*
2906                                  * New style THR_SYNC places vnodes on the
2907                                  * syncer list more deterministically.
2908                                  */
2909                                 vsetobjdirty(vp);
2910                         } else {
2911                                 /*
2912                                  * Old style scan would not necessarily place
2913                                  * a vnode on the syncer list when possibly
2914                                  * modified via mmap.
2915                                  */
2916                                 vsetflags(vp, VOBJDIRTY);
2917                         }
2918                 }
2919         }
2920 }
2921
2922 #include "opt_ddb.h"
2923 #ifdef DDB
2924 #include <sys/kernel.h>
2925
2926 #include <sys/cons.h>
2927
2928 #include <ddb/ddb.h>
2929
2930 static int      _vm_object_in_map (vm_map_t map, vm_object_t object,
2931                                        vm_map_entry_t entry);
2932 static int      vm_object_in_map (vm_object_t object);
2933
2934 /*
2935  * The caller must hold the object.
2936  */
2937 static int
2938 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
2939 {
2940         vm_map_t tmpm;
2941         vm_map_entry_t tmpe;
2942         vm_object_t obj, nobj;
2943         int entcount;
2944
2945         if (map == 0)
2946                 return 0;
2947         if (entry == 0) {
2948                 tmpe = map->header.next;
2949                 entcount = map->nentries;
2950                 while (entcount-- && (tmpe != &map->header)) {
2951                         if( _vm_object_in_map(map, object, tmpe)) {
2952                                 return 1;
2953                         }
2954                         tmpe = tmpe->next;
2955                 }
2956                 return (0);
2957         }
2958         switch(entry->maptype) {
2959         case VM_MAPTYPE_SUBMAP:
2960                 tmpm = entry->object.sub_map;
2961                 tmpe = tmpm->header.next;
2962                 entcount = tmpm->nentries;
2963                 while (entcount-- && tmpe != &tmpm->header) {
2964                         if( _vm_object_in_map(tmpm, object, tmpe)) {
2965                                 return 1;
2966                         }
2967                         tmpe = tmpe->next;
2968                 }
2969                 break;
2970         case VM_MAPTYPE_NORMAL:
2971         case VM_MAPTYPE_VPAGETABLE:
2972                 obj = entry->object.vm_object;
2973                 while (obj) {
2974                         if (obj == object) {
2975                                 if (obj != entry->object.vm_object)
2976                                         vm_object_drop(obj);
2977                                 return 1;
2978                         }
2979                         while ((nobj = obj->backing_object) != NULL) {
2980                                 vm_object_hold(nobj);
2981                                 if (nobj == obj->backing_object)
2982                                         break;
2983                                 vm_object_drop(nobj);
2984                         }
2985                         if (obj != entry->object.vm_object) {
2986                                 if (nobj)
2987                                         vm_object_lock_swap();
2988                                 vm_object_drop(obj);
2989                         }
2990                         obj = nobj;
2991                 }
2992                 break;
2993         default:
2994                 break;
2995         }
2996         return 0;
2997 }
2998
2999 static int vm_object_in_map_callback(struct proc *p, void *data);
3000
3001 struct vm_object_in_map_info {
3002         vm_object_t object;
3003         int rv;
3004 };
3005
3006 /*
3007  * Debugging only
3008  */
3009 static int
3010 vm_object_in_map(vm_object_t object)
3011 {
3012         struct vm_object_in_map_info info;
3013
3014         info.rv = 0;
3015         info.object = object;
3016
3017         allproc_scan(vm_object_in_map_callback, &info);
3018         if (info.rv)
3019                 return 1;
3020         if( _vm_object_in_map(&kernel_map, object, 0))
3021                 return 1;
3022         if( _vm_object_in_map(&pager_map, object, 0))
3023                 return 1;
3024         if( _vm_object_in_map(&buffer_map, object, 0))
3025                 return 1;
3026         return 0;
3027 }
3028
3029 /*
3030  * Debugging only
3031  */
3032 static int
3033 vm_object_in_map_callback(struct proc *p, void *data)
3034 {
3035         struct vm_object_in_map_info *info = data;
3036
3037         if (p->p_vmspace) {
3038                 if (_vm_object_in_map(&p->p_vmspace->vm_map, info->object, 0)) {
3039                         info->rv = 1;
3040                         return -1;
3041                 }
3042         }
3043         return (0);
3044 }
3045
3046 DB_SHOW_COMMAND(vmochk, vm_object_check)
3047 {
3048         vm_object_t object;
3049         int n;
3050
3051         /*
3052          * make sure that internal objs are in a map somewhere
3053          * and none have zero ref counts.
3054          */
3055         for (n = 0; n < VMOBJ_HSIZE; ++n) {
3056                 for (object = TAILQ_FIRST(&vm_object_lists[n]);
3057                                 object != NULL;
3058                                 object = TAILQ_NEXT(object, object_list)) {
3059                         if (object->type == OBJT_MARKER)
3060                                 continue;
3061                         if (object->handle != NULL ||
3062                             (object->type != OBJT_DEFAULT &&
3063                              object->type != OBJT_SWAP)) {
3064                                 continue;
3065                         }
3066                         if (object->ref_count == 0) {
3067                                 db_printf("vmochk: internal obj has "
3068                                           "zero ref count: %ld\n",
3069                                           (long)object->size);
3070                         }
3071                         if (vm_object_in_map(object))
3072                                 continue;
3073                         db_printf("vmochk: internal obj is not in a map: "
3074                                   "ref: %d, size: %lu: 0x%lx, "
3075                                   "backing_object: %p\n",
3076                                   object->ref_count, (u_long)object->size,
3077                                   (u_long)object->size,
3078                                   (void *)object->backing_object);
3079                 }
3080         }
3081 }
3082
3083 /*
3084  * Debugging only
3085  */
3086 DB_SHOW_COMMAND(object, vm_object_print_static)
3087 {
3088         /* XXX convert args. */
3089         vm_object_t object = (vm_object_t)addr;
3090         boolean_t full = have_addr;
3091
3092         vm_page_t p;
3093
3094         /* XXX count is an (unused) arg.  Avoid shadowing it. */
3095 #define count   was_count
3096
3097         int count;
3098
3099         if (object == NULL)
3100                 return;
3101
3102         db_iprintf(
3103             "Object %p: type=%d, size=0x%lx, res=%d, ref=%d, flags=0x%x\n",
3104             object, (int)object->type, (u_long)object->size,
3105             object->resident_page_count, object->ref_count, object->flags);
3106         /*
3107          * XXX no %qd in kernel.  Truncate object->backing_object_offset.
3108          */
3109         db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%lx\n",
3110             object->shadow_count,
3111             object->backing_object ? object->backing_object->ref_count : 0,
3112             object->backing_object, (long)object->backing_object_offset);
3113
3114         if (!full)
3115                 return;
3116
3117         db_indent += 2;
3118         count = 0;
3119         RB_FOREACH(p, vm_page_rb_tree, &object->rb_memq) {
3120                 if (count == 0)
3121                         db_iprintf("memory:=");
3122                 else if (count == 6) {
3123                         db_printf("\n");
3124                         db_iprintf(" ...");
3125                         count = 0;
3126                 } else
3127                         db_printf(",");
3128                 count++;
3129
3130                 db_printf("(off=0x%lx,page=0x%lx)",
3131                     (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p));
3132         }
3133         if (count != 0)
3134                 db_printf("\n");
3135         db_indent -= 2;
3136 }
3137
3138 /* XXX. */
3139 #undef count
3140
3141 /*
3142  * XXX need this non-static entry for calling from vm_map_print.
3143  *
3144  * Debugging only
3145  */
3146 void
3147 vm_object_print(/* db_expr_t */ long addr,
3148                 boolean_t have_addr,
3149                 /* db_expr_t */ long count,
3150                 char *modif)
3151 {
3152         vm_object_print_static(addr, have_addr, count, modif);
3153 }
3154
3155 /*
3156  * Debugging only
3157  */
3158 DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
3159 {
3160         vm_object_t object;
3161         int nl = 0;
3162         int c;
3163         int n;
3164
3165         for (n = 0; n < VMOBJ_HSIZE; ++n) {
3166                 for (object = TAILQ_FIRST(&vm_object_lists[n]);
3167                                 object != NULL;
3168                                 object = TAILQ_NEXT(object, object_list)) {
3169                         vm_pindex_t idx, fidx;
3170                         vm_pindex_t osize;
3171                         vm_paddr_t pa = -1, padiff;
3172                         int rcount;
3173                         vm_page_t m;
3174
3175                         if (object->type == OBJT_MARKER)
3176                                 continue;
3177                         db_printf("new object: %p\n", (void *)object);
3178                         if ( nl > 18) {
3179                                 c = cngetc();
3180                                 if (c != ' ')
3181                                         return;
3182                                 nl = 0;
3183                         }
3184                         nl++;
3185                         rcount = 0;
3186                         fidx = 0;
3187                         osize = object->size;
3188                         if (osize > 128)
3189                                 osize = 128;
3190                         for (idx = 0; idx < osize; idx++) {
3191                                 m = vm_page_lookup(object, idx);
3192                                 if (m == NULL) {
3193                                         if (rcount) {
3194                                                 db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
3195                                                         (long)fidx, rcount, (long)pa);
3196                                                 if ( nl > 18) {
3197                                                         c = cngetc();
3198                                                         if (c != ' ')
3199                                                                 return;
3200                                                         nl = 0;
3201                                                 }
3202                                                 nl++;
3203                                                 rcount = 0;
3204                                         }
3205                                         continue;
3206                                 }
3207
3208                                 if (rcount &&
3209                                         (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
3210                                         ++rcount;
3211                                         continue;
3212                                 }
3213                                 if (rcount) {
3214                                         padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m);
3215                                         padiff >>= PAGE_SHIFT;
3216                                         padiff &= PQ_L2_MASK;
3217                                         if (padiff == 0) {
3218                                                 pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE;
3219                                                 ++rcount;
3220                                                 continue;
3221                                         }
3222                                         db_printf(" index(%ld)run(%d)pa(0x%lx)",
3223                                                 (long)fidx, rcount, (long)pa);
3224                                         db_printf("pd(%ld)\n", (long)padiff);
3225                                         if ( nl > 18) {
3226                                                 c = cngetc();
3227                                                 if (c != ' ')
3228                                                         return;
3229                                                 nl = 0;
3230                                         }
3231                                         nl++;
3232                                 }
3233                                 fidx = idx;
3234                                 pa = VM_PAGE_TO_PHYS(m);
3235                                 rcount = 1;
3236                         }
3237                         if (rcount) {
3238                                 db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
3239                                         (long)fidx, rcount, (long)pa);
3240                                 if ( nl > 18) {
3241                                         c = cngetc();
3242                                         if (c != ' ')
3243                                                 return;
3244                                         nl = 0;
3245                                 }
3246                                 nl++;
3247                         }
3248                 }
3249         }
3250 }
3251 #endif /* DDB */