dragonfly.git: sys/vm/vm_object.c
blob 62cd0243d419b9a5bec00592a5db9e4e85a6905f
1 /*
2 * Copyright (c) 1991, 1993, 2013
3 * The Regents of the University of California. All rights reserved.
5 * This code is derived from software contributed to Berkeley by
6 * The Mach Operating System project at Carnegie-Mellon University.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
32 * from: @(#)vm_object.c 8.5 (Berkeley) 3/22/94
35 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
36 * All rights reserved.
38 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
40 * Permission to use, copy, modify and distribute this software and
41 * its documentation is hereby granted, provided that both the copyright
42 * notice and this permission notice appear in all copies of the
43 * software, derivative works or modified versions, and any portions
44 * thereof, and that both notices appear in supporting documentation.
46 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
47 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
48 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
50 * Carnegie Mellon requests users of this software to return to
52 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
53 * School of Computer Science
54 * Carnegie Mellon University
55 * Pittsburgh PA 15213-3890
57 * any improvements or extensions that they make and grant Carnegie the
58 * rights to redistribute these changes.
60 * $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $
64 * Virtual memory object module.
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/proc.h> /* for curproc, pageproc */
70 #include <sys/thread.h>
71 #include <sys/vnode.h>
72 #include <sys/vmmeter.h>
73 #include <sys/mman.h>
74 #include <sys/mount.h>
75 #include <sys/kernel.h>
76 #include <sys/malloc.h>
77 #include <sys/sysctl.h>
78 #include <sys/refcount.h>
80 #include <vm/vm.h>
81 #include <vm/vm_param.h>
82 #include <vm/pmap.h>
83 #include <vm/vm_map.h>
84 #include <vm/vm_object.h>
85 #include <vm/vm_page.h>
86 #include <vm/vm_pageout.h>
87 #include <vm/vm_pager.h>
88 #include <vm/swap_pager.h>
89 #include <vm/vm_kern.h>
90 #include <vm/vm_extern.h>
91 #include <vm/vm_zone.h>
93 #include <vm/vm_page2.h>
95 #include <machine/specialreg.h>
97 #define EASY_SCAN_FACTOR 8
99 static void vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
100 int pagerflags);
101 static void vm_object_lock_init(vm_object_t);
104 * Virtual memory objects maintain the actual data
105 * associated with allocated virtual memory. A given
106 * page of memory exists within exactly one object.
108 * An object is only deallocated when all "references"
109 * are given up. Only one "reference" to a given
110 * region of an object should be writeable.
112 * Associated with each object is a list of all resident
113 * memory pages belonging to that object; this list is
114 * maintained by the "vm_page" module, and locked by the object's
115 * lock.
117 * Each object also records a "pager" routine which is
118 * used to retrieve (and store) pages to the proper backing
119 * storage. In addition, objects may be backed by other
120 * objects from which they were virtual-copied.
122 * The only items within the object structure which are
123 * modified after time of creation are:
124 * reference count locked by object's lock
125 * pager routine locked by object's lock
129 static struct vm_object kernel_object_store;
130 struct vm_object *kernel_object = &kernel_object_store;
132 struct vm_object_hash vm_object_hash[VMOBJ_HSIZE];
134 static MALLOC_DEFINE_OBJ(M_VM_OBJECT, sizeof(struct vm_object),
135 "vm_object", "vm_object structures");
137 #define VMOBJ_HASH_PRIME1 66555444443333333ULL
138 #define VMOBJ_HASH_PRIME2 989042931893ULL
140 int vm_object_debug;
141 SYSCTL_INT(_vm, OID_AUTO, object_debug, CTLFLAG_RW, &vm_object_debug, 0, "");
143 static __inline
144 struct vm_object_hash *
145 vmobj_hash(vm_object_t obj)
147 uintptr_t hash1;
148 uintptr_t hash2;
150 hash1 = (uintptr_t)obj + ((uintptr_t)obj >> 18);
151 hash1 %= VMOBJ_HASH_PRIME1;
152 hash2 = ((uintptr_t)obj >> 8) + ((uintptr_t)obj >> 24);
153 hash2 %= VMOBJ_HASH_PRIME2;
154 return (&vm_object_hash[(hash1 ^ hash2) & VMOBJ_HMASK]);
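/*
 * Illustrative bucket usage (sketch, mirroring _vm_object_allocate() and
 * vm_object_terminate() below): the per-bucket token serializes changes
 * to that bucket's object list:
 *
 *	hash = vmobj_hash(object);
 *	lwkt_gettoken(&hash->token);
 *	TAILQ_INSERT_TAIL(&hash->list, object, object_entry);
 *	lwkt_reltoken(&hash->token);
 */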
157 #if defined(DEBUG_LOCKS)
159 #define vm_object_vndeallocate(obj, vpp) \
160 debugvm_object_vndeallocate(obj, vpp, __FILE__, __LINE__)
163 * Debug helper to track hold/drop/ref/deallocate calls.
165 static void
166 debugvm_object_add(vm_object_t obj, char *file, int line, int addrem)
168 int i;
170 i = atomic_fetchadd_int(&obj->debug_index, 1);
171 i = i & (VMOBJ_DEBUG_ARRAY_SIZE - 1);
172 ksnprintf(obj->debug_hold_thrs[i],
173 sizeof(obj->debug_hold_thrs[i]),
174 "%c%d:(%d):%s",
175 (addrem == -1 ? '-' : (addrem == 1 ? '+' : '=')),
176 (curthread->td_proc ? curthread->td_proc->p_pid : -1),
177 obj->ref_count,
178 curthread->td_comm);
179 obj->debug_hold_file[i] = file;
180 obj->debug_hold_line[i] = line;
181 #if 0
182 /* Uncomment for debugging obj refs/derefs in reproducible cases */
183 if (strcmp(curthread->td_comm, "sshd") == 0) {
184 kprintf("%d %p refs=%d ar=%d file: %s/%d\n",
185 (curthread->td_proc ? curthread->td_proc->p_pid : -1),
186 obj, obj->ref_count, addrem, file, line);
188 #endif
191 #endif
194 * Misc low level routines
196 static void
197 vm_object_lock_init(vm_object_t obj)
199 #if defined(DEBUG_LOCKS)
200 int i;
202 obj->debug_index = 0;
203 for (i = 0; i < VMOBJ_DEBUG_ARRAY_SIZE; i++) {
204 obj->debug_hold_thrs[i][0] = 0;
205 obj->debug_hold_file[i] = NULL;
206 obj->debug_hold_line[i] = 0;
208 #endif
211 void
212 vm_object_lock_swap(void)
214 lwkt_token_swap();
217 void
218 vm_object_lock(vm_object_t obj)
220 lwkt_gettoken(&obj->token);
224 * Returns TRUE on success
226 static int
227 vm_object_lock_try(vm_object_t obj)
229 return(lwkt_trytoken(&obj->token));
232 void
233 vm_object_lock_shared(vm_object_t obj)
235 lwkt_gettoken_shared(&obj->token);
238 void
239 vm_object_unlock(vm_object_t obj)
241 lwkt_reltoken(&obj->token);
244 void
245 vm_object_upgrade(vm_object_t obj)
247 lwkt_reltoken(&obj->token);
248 lwkt_gettoken(&obj->token);
251 void
252 vm_object_downgrade(vm_object_t obj)
254 lwkt_reltoken(&obj->token);
255 lwkt_gettoken_shared(&obj->token);
258 static __inline void
259 vm_object_assert_held(vm_object_t obj)
261 ASSERT_LWKT_TOKEN_HELD(&obj->token);
265 * Acquire a semi-random base page color for a new object. Our main concern
266 * is that the color be spread out a bit. Further spreading out occurs in
267 * bio_page_alloc().
270 vm_quickcolor(void)
272 globaldata_t gd = mycpu;
273 int pg_color;
275 pg_color = (int)(intptr_t)gd->gd_curthread >> 10;
276 pg_color += gd->gd_quick_color;
277 gd->gd_quick_color += PQ_PRIME2;
279 return pg_color;
282 void
283 VMOBJDEBUG(vm_object_hold)(vm_object_t obj VMOBJDBARGS)
285 KKASSERT(obj != NULL);
288 * Object must be held (object allocation is stable due to the caller's
289 * context, typically already holding the token on a parent object)
290 * prior to potentially blocking on the lock, otherwise the object
291 * can get ripped away from us.
293 refcount_acquire(&obj->hold_count);
294 vm_object_lock(obj);
296 #if defined(DEBUG_LOCKS)
297 debugvm_object_add(obj, file, line, 1);
298 #endif
302 VMOBJDEBUG(vm_object_hold_try)(vm_object_t obj VMOBJDBARGS)
304 KKASSERT(obj != NULL);
307 * Object must be held (object allocation is stable due to the caller's
308 * context, typically already holding the token on a parent object)
309 * prior to potentially blocking on the lock, otherwise the object
310 * can get ripped away from us.
312 refcount_acquire(&obj->hold_count);
313 if (vm_object_lock_try(obj) == 0) {
314 if (refcount_release(&obj->hold_count)) {
315 if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD))
316 kfree_obj(obj, M_VM_OBJECT);
318 return(0);
321 #if defined(DEBUG_LOCKS)
322 debugvm_object_add(obj, file, line, 1);
323 #endif
324 return(1);
327 void
328 VMOBJDEBUG(vm_object_hold_shared)(vm_object_t obj VMOBJDBARGS)
330 KKASSERT(obj != NULL);
333 * Object must be held (object allocation is stable due to the caller's
334 * context, typically already holding the token on a parent object)
335 * prior to potentially blocking on the lock, otherwise the object
336 * can get ripped away from us.
338 refcount_acquire(&obj->hold_count);
339 vm_object_lock_shared(obj);
341 #if defined(DEBUG_LOCKS)
342 debugvm_object_add(obj, file, line, 1);
343 #endif
347 * Drop the token and hold_count on the object.
349 * WARNING! Token might be shared.
351 void
352 VMOBJDEBUG(vm_object_drop)(vm_object_t obj VMOBJDBARGS)
354 if (obj == NULL)
355 return;
358 * No new holders should be possible once we drop hold_count 1->0 as
359 * there is no longer any way to reference the object.
361 KKASSERT(obj->hold_count > 0);
362 if (refcount_release(&obj->hold_count)) {
363 #if defined(DEBUG_LOCKS)
364 debugvm_object_add(obj, file, line, -1);
365 #endif
367 if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) {
368 vm_object_unlock(obj);
369 kfree_obj(obj, M_VM_OBJECT);
370 } else {
371 vm_object_unlock(obj);
373 } else {
374 #if defined(DEBUG_LOCKS)
375 debugvm_object_add(obj, file, line, -1);
376 #endif
377 vm_object_unlock(obj);
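/*
 * Illustrative hold/drop pairing (sketch): a caller with a stable pointer
 * holds the object around potentially blocking work, then drops it:
 *
 *	vm_object_hold(object);
 *	... object token held, object cannot be ripped out from under us ...
 *	vm_object_drop(object);
 */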
382 * Initialize a freshly allocated object, returning a held object.
384 * Used only by vm_object_allocate(), zinitna() and vm_object_init().
386 * No requirements.
388 void
389 _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object,
390 const char *ident)
392 struct vm_object_hash *hash;
394 RB_INIT(&object->rb_memq);
395 lwkt_token_init(&object->token, ident);
397 TAILQ_INIT(&object->backing_list);
398 lockinit(&object->backing_lk, "baclk", 0, 0);
400 object->type = type;
401 object->size = size;
402 object->ref_count = 1;
403 object->memattr = VM_MEMATTR_DEFAULT;
404 object->hold_count = 0;
405 object->flags = 0;
406 if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
407 vm_object_set_flag(object, OBJ_ONEMAPPING);
408 object->paging_in_progress = 0;
409 object->resident_page_count = 0;
410 /* cpu localization twist */
411 object->pg_color = vm_quickcolor();
412 object->handle = NULL;
414 atomic_add_int(&object->generation, 1);
415 object->swblock_count = 0;
416 RB_INIT(&object->swblock_root);
417 vm_object_lock_init(object);
418 pmap_object_init(object);
420 vm_object_hold(object);
422 hash = vmobj_hash(object);
423 lwkt_gettoken(&hash->token);
424 TAILQ_INSERT_TAIL(&hash->list, object, object_entry);
425 lwkt_reltoken(&hash->token);
429 * Initialize a VM object.
431 void
432 vm_object_init(vm_object_t object, vm_pindex_t size)
434 _vm_object_allocate(OBJT_DEFAULT, size, object, "vmobj");
435 vm_object_drop(object);
439 * Initialize the VM objects module.
441 * Called from the low level boot code only. Note that this occurs before
442 * kmalloc is initialized so we cannot allocate any VM objects.
444 void
445 vm_object_init1(void)
447 int i;
449 for (i = 0; i < VMOBJ_HSIZE; ++i) {
450 TAILQ_INIT(&vm_object_hash[i].list);
451 lwkt_token_init(&vm_object_hash[i].token, "vmobjlst");
454 _vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(KvaEnd),
455 kernel_object, "kobj");
456 vm_object_drop(kernel_object);
459 void
460 vm_object_init2(void)
462 kmalloc_obj_set_unlimited(M_VM_OBJECT);
466 * Allocate and return a new object of the specified type and size.
468 * No requirements.
470 vm_object_t
471 vm_object_allocate(objtype_t type, vm_pindex_t size)
473 vm_object_t obj;
475 obj = kmalloc_obj(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
476 _vm_object_allocate(type, size, obj, "vmobj");
477 vm_object_drop(obj);
479 return (obj);
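/*
 * Illustrative usage (sketch, hypothetical caller; 'len' is assumed):
 * allocate an anonymous object sized in pages and release the reference
 * when done. OFF_TO_IDX() converts a byte length to a page count.
 *
 *	vm_object_t obj;
 *
 *	obj = vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(len));
 *	...
 *	vm_object_deallocate(obj);
 */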
483 * This version returns a held object, allowing further atomic initialization
484 * of the object.
486 vm_object_t
487 vm_object_allocate_hold(objtype_t type, vm_pindex_t size)
489 vm_object_t obj;
491 obj = kmalloc_obj(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
492 _vm_object_allocate(type, size, obj, "vmobj");
494 return (obj);
498 * Add an additional reference to a vm_object. The object must already be
499 * held. The original non-lock version is no longer supported. The object
500 * must NOT be chain locked by anyone at the time the reference is added.
502 * The object must be held, but may be held shared if desired (hence why
503 * we use an atomic op).
505 void
506 VMOBJDEBUG(vm_object_reference_locked)(vm_object_t object VMOBJDBARGS)
508 KKASSERT(object != NULL);
509 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
510 atomic_add_int(&object->ref_count, 1);
511 if (object->type == OBJT_VNODE) {
512 vref(object->handle);
513 /* XXX what if the vnode is being destroyed? */
515 #if defined(DEBUG_LOCKS)
516 debugvm_object_add(object, file, line, 1);
517 #endif
521 * This version is only allowed in situations where the caller
522 * already knows that the object is deterministically referenced
523 * (usually because its taken from a ref'd vnode, or during a map_entry
524 * replication).
526 void
527 VMOBJDEBUG(vm_object_reference_quick)(vm_object_t object VMOBJDBARGS)
529 KKASSERT(object->type == OBJT_VNODE || object->ref_count > 0);
530 atomic_add_int(&object->ref_count, 1);
531 if (object->type == OBJT_VNODE)
532 vref(object->handle);
533 #if defined(DEBUG_LOCKS)
534 debugvm_object_add(object, file, line, 1);
535 #endif
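/*
 * Illustrative use (sketch, hypothetical caller): when replicating a map
 * entry whose backing object is already deterministically referenced, the
 * copy may take its own reference without holding the object:
 *
 *	vm_object_reference_quick(ba->object);
 *
 * ('ba' here is a hypothetical vm_map_backing pointer.)
 */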
539 * Dereference an object and its underlying vnode. The object may be
540 * held shared. On return the object will remain held.
542 * This function may return a vnode in *vpp which the caller must release
543 * after the caller drops its own lock. If vpp is NULL, we assume that
544 * the caller was holding an exclusive lock on the object and we vrele()
545 * the vp ourselves.
547 static void
548 VMOBJDEBUG(vm_object_vndeallocate)(vm_object_t object, struct vnode **vpp
549 VMOBJDBARGS)
551 struct vnode *vp = (struct vnode *) object->handle;
552 int count;
554 KASSERT(object->type == OBJT_VNODE,
555 ("vm_object_vndeallocate: not a vnode object"));
556 KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
557 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
558 #ifdef INVARIANTS
559 if (object->ref_count == 0) {
560 vprint("vm_object_vndeallocate", vp);
561 panic("vm_object_vndeallocate: bad object reference count");
563 #endif
564 count = object->ref_count;
565 cpu_ccfence();
566 for (;;) {
567 if (count == 1) {
568 vm_object_upgrade(object);
569 if (atomic_fcmpset_int(&object->ref_count, &count, 0)) {
570 vclrflags(vp, VTEXT);
571 break;
573 } else {
574 if (atomic_fcmpset_int(&object->ref_count,
575 &count, count - 1)) {
576 break;
579 cpu_pause();
580 /* retry */
582 #if defined(DEBUG_LOCKS)
583 debugvm_object_add(object, file, line, -1);
584 #endif
587 * vrele or return the vp to vrele. We can only safely vrele(vp)
588 * if the object was locked exclusively. But there are two races
589 * here.
591 * We had to upgrade the object above to safely clear VTEXT
592 * but the alternative path where the shared lock is retained
593 * can STILL race to 0 in other paths and cause our own vrele()
594 * to terminate the vnode. We can't allow that if the VM object
595 * is still locked shared.
597 if (vpp)
598 *vpp = vp;
599 else
600 vrele(vp);
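/*
 * Illustrative *vpp protocol (sketch, hypothetical caller holding the
 * object shared): the returned vnode is released only after the caller
 * drops its own lock, per the comment above:
 *
 *	struct vnode *vp = NULL;
 *
 *	vm_object_vndeallocate(object, &vp);
 *	vm_object_drop(object);
 *	if (vp)
 *		vrele(vp);
 */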
604 * Release a reference to the specified object, gained either through a
605 * vm_object_allocate or a vm_object_reference call. When all references
606 * are gone, storage associated with this object may be relinquished.
608 * The caller does not have to hold the object locked but must have control
609 * over the reference in question in order to guarantee that the object
610 * does not get ripped out from under us.
612 * XXX Currently all deallocations require an exclusive lock.
614 void
615 VMOBJDEBUG(vm_object_deallocate)(vm_object_t object VMOBJDBARGS)
617 struct vnode *vp;
618 int count;
620 if (object == NULL)
621 return;
623 count = object->ref_count;
624 cpu_ccfence();
625 for (;;) {
627 * If decrementing the count enters into special handling
628 * territory (0, 1, or 2) we have to do it the hard way.
629 * Fortunately though, objects with only a few refs like this
630 * are not likely to be heavily contended anyway.
632 * For vnode objects we only care about 1->0 transitions.
634 if (count <= 3 || (object->type == OBJT_VNODE && count <= 1)) {
635 #if defined(DEBUG_LOCKS)
636 debugvm_object_add(object, file, line, 0);
637 #endif
638 vm_object_hold(object);
639 vm_object_deallocate_locked(object);
640 vm_object_drop(object);
641 break;
645 * Try to decrement ref_count without acquiring a hold on
646 * the object. This is particularly important for the exec*()
647 * and exit*() code paths because the program binary may
648 * have a great deal of sharing and an exclusive lock will
649 * crowbar performance in those circumstances.
651 if (object->type == OBJT_VNODE) {
652 vp = (struct vnode *)object->handle;
653 if (atomic_fcmpset_int(&object->ref_count,
654 &count, count - 1)) {
655 #if defined(DEBUG_LOCKS)
656 debugvm_object_add(object, file, line, -1);
657 #endif
659 vrele(vp);
660 break;
662 /* retry */
663 } else {
664 if (atomic_fcmpset_int(&object->ref_count,
665 &count, count - 1)) {
666 #if defined(DEBUG_LOCKS)
667 debugvm_object_add(object, file, line, -1);
668 #endif
669 break;
671 /* retry */
673 cpu_pause();
674 /* retry */
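/*
 * Sketch of the atomic_fcmpset_int() retry idiom used above (illustrative
 * only): on failure the primitive reloads 'count' with the current value,
 * so the loop retries without an explicit re-read:
 *
 *	count = object->ref_count;
 *	for (;;) {
 *		if (atomic_fcmpset_int(&object->ref_count, &count, count - 1))
 *			break;
 *		cpu_pause();
 *	}
 */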
678 void
679 VMOBJDEBUG(vm_object_deallocate_locked)(vm_object_t object VMOBJDBARGS)
682 * Degenerate case
684 if (object == NULL)
685 return;
688 * vnode case, caller either locked the object exclusively
689 * or this is a recursion with must_drop != 0 and the vnode
690 * object will be locked shared.
692 * If locked shared we have to drop the object before we can
693 * call vrele() or risk a shared/exclusive livelock.
695 if (object->type == OBJT_VNODE) {
696 ASSERT_LWKT_TOKEN_HELD(&object->token);
697 vm_object_vndeallocate(object, NULL);
698 return;
700 ASSERT_LWKT_TOKEN_HELD_EXCL(&object->token);
703 * Normal case (object is locked exclusively)
705 if (object->ref_count == 0) {
706 panic("vm_object_deallocate: object deallocated "
707 "too many times: %d", object->type);
709 if (object->ref_count > 2) {
710 atomic_add_int(&object->ref_count, -1);
711 #if defined(DEBUG_LOCKS)
712 debugvm_object_add(object, file, line, -1);
713 #endif
714 return;
718 * Drop the ref and handle termination on the 1->0 transition.
719 * We may have blocked above so we have to recheck.
721 KKASSERT(object->ref_count != 0);
722 if (object->ref_count >= 2) {
723 atomic_add_int(&object->ref_count, -1);
724 #if defined(DEBUG_LOCKS)
725 debugvm_object_add(object, file, line, -1);
726 #endif
727 return;
730 atomic_add_int(&object->ref_count, -1);
731 if ((object->flags & OBJ_DEAD) == 0)
732 vm_object_terminate(object);
736 * Destroy the specified object, freeing up related resources.
738 * The object must have zero references.
740 * The object must held. The caller is responsible for dropping the object
741 * after terminate returns. Terminate does NOT drop the object.
743 static int vm_object_terminate_callback(vm_page_t p, void *data);
745 void
746 vm_object_terminate(vm_object_t object)
748 struct rb_vm_page_scan_info info;
749 struct vm_object_hash *hash;
752 * Make sure no one uses us. Once we set OBJ_DEAD we should be
753 * able to safely block.
755 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
756 KKASSERT((object->flags & OBJ_DEAD) == 0);
757 vm_object_set_flag(object, OBJ_DEAD);
760 * Wait for the pageout daemon to be done with the object
762 vm_object_pip_wait(object, "objtrm1");
764 KASSERT(!object->paging_in_progress,
765 ("vm_object_terminate: pageout in progress"));
768 * Clean and free the pages, as appropriate. All references to the
769 * object are gone, so we don't need to lock it.
771 if (object->type == OBJT_VNODE) {
772 struct vnode *vp;
775 * Clean pages and flush buffers.
777 * NOTE! TMPFS buffer flushes do not typically flush the
778 * actual page to swap as this would be highly
779 * inefficient, and normal filesystems usually wrap
780 * page flushes with buffer cache buffers.
782 * To deal with this we have to call vinvalbuf() both
783 * before and after the vm_object_page_clean().
785 vp = (struct vnode *) object->handle;
786 vinvalbuf(vp, V_SAVE, 0, 0);
787 vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
788 vinvalbuf(vp, V_SAVE, 0, 0);
792 * Wait for any I/O to complete, after which there had better not
793 * be any references left on the object.
795 vm_object_pip_wait(object, "objtrm2");
797 if (object->ref_count != 0) {
798 panic("vm_object_terminate: object with references, "
799 "ref_count=%d", object->ref_count);
803 * Cleanup any shared pmaps associated with this object.
805 pmap_object_free(object);
808 * Now free any remaining pages. For internal objects, this also
809 * removes them from paging queues. Don't free wired pages, just
810 * remove them from the object.
812 info.count = 0;
813 info.object = object;
814 do {
815 info.error = 0;
816 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
817 vm_object_terminate_callback, &info);
818 } while (info.error);
821 * Let the pager know object is dead.
823 vm_pager_deallocate(object);
826 * Wait for the object hold count to hit 1, clean out pages as
827 * we go. vmobj_token interlocks any race conditions that might
828 * pick the object up from the vm_object_list after we have cleared
829 * rb_memq.
831 for (;;) {
832 if (RB_ROOT(&object->rb_memq) == NULL)
833 break;
834 kprintf("vm_object_terminate: Warning, object %p "
835 "still has %ld pages\n",
836 object, object->resident_page_count);
837 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
838 vm_object_terminate_callback, &info);
842 * There had better not be any pages left
844 KKASSERT(object->resident_page_count == 0);
847 * Remove the object from the global object list.
849 hash = vmobj_hash(object);
850 lwkt_gettoken(&hash->token);
851 TAILQ_REMOVE(&hash->list, object, object_entry);
852 lwkt_reltoken(&hash->token);
854 if (object->ref_count != 0) {
855 panic("vm_object_terminate2: object with references, "
856 "ref_count=%d", object->ref_count);
860 * NOTE: The object hold_count is at least 1, so we cannot kfree()
861 * the object here. See vm_object_drop().
866 * The caller must hold the object.
868 * NOTE: It is possible for vm_page's to remain flagged PG_MAPPED
869 * or PG_MAPPED|PG_WRITEABLE, even after pmap_mapped_sync()
870 * is called, due to normal pmap operations. This is because only
871 * global pmap operations on the vm_page can clear the bits and not
872 * just local operations on individual pmaps.
874 * Most interactions that necessitate the clearing of these bits
875 * proactively call vm_page_protect(), and we must do so here as well.
877 static int
878 vm_object_terminate_callback(vm_page_t p, void *data)
880 struct rb_vm_page_scan_info *info = data;
881 vm_object_t object;
883 object = p->object;
884 KKASSERT(object == info->object);
885 if (vm_page_busy_try(p, TRUE)) {
886 vm_page_sleep_busy(p, TRUE, "vmotrm");
887 info->error = 1;
888 return 0;
890 if (object != p->object) {
891 /* XXX remove once we determine it can't happen */
892 kprintf("vm_object_terminate: Warning: Encountered "
893 "busied page %p on queue %d\n", p, p->queue);
894 vm_page_wakeup(p);
895 info->error = 1;
896 } else if (p->wire_count == 0) {
898 * NOTE: p->dirty and PG_NEED_COMMIT are ignored.
900 if (pmap_mapped_sync(p) & (PG_MAPPED | PG_WRITEABLE))
901 vm_page_protect(p, VM_PROT_NONE);
902 vm_page_free(p);
903 mycpu->gd_cnt.v_pfree++;
904 } else {
905 if (p->queue != PQ_NONE) {
906 kprintf("vm_object_terminate: Warning: Encountered "
907 "wired page %p on queue %d\n", p, p->queue);
908 if (vm_object_debug > 0) {
909 --vm_object_debug;
910 print_backtrace(10);
913 if (pmap_mapped_sync(p) & (PG_MAPPED | PG_WRITEABLE))
914 vm_page_protect(p, VM_PROT_NONE);
915 vm_page_remove(p);
916 vm_page_wakeup(p);
920 * Must be at end to avoid SMP races, caller holds object token
922 if ((++info->count & 63) == 0)
923 lwkt_user_yield();
924 return(0);
928 * Clean all dirty pages in the specified range of the object. Leaves the page
929 * on whatever queue it is currently on. If NOSYNC is set then do not
930 * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC),
931 * leaving the object dirty.
933 * When stuffing pages asynchronously, allow clustering. XXX we need a
934 * synchronous clustering mode implementation.
936 * Odd semantics: if start == end, we clean everything.
938 * The object must be locked? XXX
940 static int vm_object_page_clean_pass1(struct vm_page *p, void *data);
941 static int vm_object_page_clean_pass2(struct vm_page *p, void *data);
943 void
944 vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
945 int flags)
947 struct rb_vm_page_scan_info info;
948 struct vnode *vp;
949 int wholescan;
950 int pagerflags;
951 int generation;
953 vm_object_hold(object);
954 if (object->type != OBJT_VNODE ||
955 (object->flags & OBJ_MIGHTBEDIRTY) == 0) {
956 vm_object_drop(object);
957 return;
960 pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ?
961 OBJPC_SYNC : OBJPC_CLUSTER_OK;
962 pagerflags |= (flags & OBJPC_INVAL) ? OBJPC_INVAL : 0;
964 vp = object->handle;
967 * Interlock other major object operations. This allows us to
968 * temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY.
970 vm_object_set_flag(object, OBJ_CLEANING);
973 * Handle 'entire object' case
975 info.start_pindex = start;
976 if (end == 0) {
977 info.end_pindex = object->size - 1;
978 } else {
979 info.end_pindex = end - 1;
981 wholescan = (start == 0 && info.end_pindex == object->size - 1);
982 info.limit = flags;
983 info.pagerflags = pagerflags;
984 info.object = object;
987 * If cleaning the entire object do a pass to mark the pages read-only.
988 * If everything worked out ok, clear OBJ_WRITEABLE and
989 * OBJ_MIGHTBEDIRTY.
991 if (wholescan) {
992 info.error = 0;
993 info.count = 0;
994 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
995 vm_object_page_clean_pass1, &info);
996 if (info.error == 0) {
997 vm_object_clear_flag(object,
998 OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
999 if (object->type == OBJT_VNODE &&
1000 (vp = (struct vnode *)object->handle) != NULL) {
1002 * Use new-style interface to clear VISDIRTY
1003 * because the vnode is not necessarily removed
1004 * from the syncer list(s) as often as it was
1005 * under the old interface, which can leave
1006 * the vnode on the syncer list after reclaim.
1008 vclrobjdirty(vp);
1014 * Do a pass to clean all the dirty pages we find.
1016 do {
1017 info.error = 0;
1018 info.count = 0;
1019 generation = object->generation;
1020 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1021 vm_object_page_clean_pass2, &info);
1022 } while (info.error || generation != object->generation);
1024 vm_object_clear_flag(object, OBJ_CLEANING);
1025 vm_object_drop(object);
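/*
 * Illustrative call (sketch): vm_object_terminate() above flushes an
 * entire vnode object synchronously with:
 *
 *	vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
 */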
1029 * The caller must hold the object.
1031 static
1032 int
1033 vm_object_page_clean_pass1(struct vm_page *p, void *data)
1035 struct rb_vm_page_scan_info *info = data;
1037 KKASSERT(p->object == info->object);
1039 vm_page_flag_set(p, PG_CLEANCHK);
1040 if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1041 info->error = 1;
1042 } else if (vm_page_busy_try(p, FALSE)) {
1043 info->error = 1;
1044 } else {
1045 KKASSERT(p->object == info->object);
1046 vm_page_protect(p, VM_PROT_READ);
1047 vm_page_wakeup(p);
1051 * Must be at end to avoid SMP races, caller holds object token
1053 if ((++info->count & 63) == 0)
1054 lwkt_user_yield();
1055 return(0);
1059 * The caller must hold the object
1061 static
1062 int
1063 vm_object_page_clean_pass2(struct vm_page *p, void *data)
1065 struct rb_vm_page_scan_info *info = data;
1066 int generation;
1068 KKASSERT(p->object == info->object);
1071 * Do not mess with pages that were inserted after we started
1072 * the cleaning pass.
1074 if ((p->flags & PG_CLEANCHK) == 0)
1075 goto done;
1077 generation = info->object->generation;
1079 if (vm_page_busy_try(p, TRUE)) {
1080 vm_page_sleep_busy(p, TRUE, "vpcwai");
1081 info->error = 1;
1082 goto done;
1085 KKASSERT(p->object == info->object &&
1086 info->object->generation == generation);
1089 * Before wasting time traversing the pmaps, check for trivial
1090 * cases where the page cannot be dirty.
1092 if (p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) {
1093 KKASSERT((p->dirty & p->valid) == 0 &&
1094 (p->flags & PG_NEED_COMMIT) == 0);
1095 vm_page_wakeup(p);
1096 goto done;
1100 * Check whether the page is dirty or not. The page has been set
1101 * to be read-only so the check will not race a user dirtying the
1102 * page.
1104 vm_page_test_dirty(p);
1105 if ((p->dirty & p->valid) == 0 && (p->flags & PG_NEED_COMMIT) == 0) {
1106 vm_page_flag_clear(p, PG_CLEANCHK);
1107 vm_page_wakeup(p);
1108 goto done;
1112 * If we have been asked to skip nosync pages and this is a
1113 * nosync page, skip it. Note that the object flags were
1114 * not cleared in this case (because pass1 will have returned an
1115 * error), so we do not have to set them.
1117 if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1118 vm_page_flag_clear(p, PG_CLEANCHK);
1119 vm_page_wakeup(p);
1120 goto done;
1124 * Flush as many pages as we can. PG_CLEANCHK will be cleared on
1125 * the pages that get successfully flushed. Set info->error if
1126 * we raced an object modification.
1128 vm_object_page_collect_flush(info->object, p, info->pagerflags);
1129 /* vm_wait_nominal(); this can deadlock the system in syncer/pageout */
1132 * Must be at end to avoid SMP races, caller holds object token
1134 done:
1135 if ((++info->count & 63) == 0)
1136 lwkt_user_yield();
1137 return(0);
1141 * Collect the specified page and nearby pages and flush them out.
1142 * The passed page is busied by the caller and we are responsible
1143 * for its disposition.
1145 * The caller must hold the object.
1147 static void
1148 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags)
1150 int error;
1151 int is;
1152 int ib;
1153 int i;
1154 int page_base;
1155 vm_pindex_t pi;
1156 vm_page_t ma[BLIST_MAX_ALLOC];
1158 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1160 pi = p->pindex;
1161 page_base = pi % BLIST_MAX_ALLOC;
1162 ma[page_base] = p;
1163 ib = page_base - 1;
1164 is = page_base + 1;
1166 while (ib >= 0) {
1167 vm_page_t tp;
1169 tp = vm_page_lookup_busy_try(object, pi - page_base + ib,
1170 TRUE, &error);
1171 if (error)
1172 break;
1173 if (tp == NULL)
1174 break;
1175 if ((pagerflags & OBJPC_IGNORE_CLEANCHK) == 0 &&
1176 (tp->flags & PG_CLEANCHK) == 0) {
1177 vm_page_wakeup(tp);
1178 break;
1180 if ((tp->queue - tp->pc) == PQ_CACHE) {
1181 vm_page_flag_clear(tp, PG_CLEANCHK);
1182 vm_page_wakeup(tp);
1183 break;
1185 vm_page_test_dirty(tp);
1186 if ((tp->dirty & tp->valid) == 0 &&
1187 (tp->flags & PG_NEED_COMMIT) == 0) {
1188 vm_page_flag_clear(tp, PG_CLEANCHK);
1189 vm_page_wakeup(tp);
1190 break;
1192 ma[ib] = tp;
1193 --ib;
1195 ++ib; /* fixup */
1197 while (is < BLIST_MAX_ALLOC &&
1198 pi - page_base + is < object->size) {
1199 vm_page_t tp;
1201 tp = vm_page_lookup_busy_try(object, pi - page_base + is,
1202 TRUE, &error);
1203 if (error)
1204 break;
1205 if (tp == NULL)
1206 break;
1207 if ((pagerflags & OBJPC_IGNORE_CLEANCHK) == 0 &&
1208 (tp->flags & PG_CLEANCHK) == 0) {
1209 vm_page_wakeup(tp);
1210 break;
1212 if ((tp->queue - tp->pc) == PQ_CACHE) {
1213 vm_page_flag_clear(tp, PG_CLEANCHK);
1214 vm_page_wakeup(tp);
1215 break;
1217 vm_page_test_dirty(tp);
1218 if ((tp->dirty & tp->valid) == 0 &&
1219 (tp->flags & PG_NEED_COMMIT) == 0) {
1220 vm_page_flag_clear(tp, PG_CLEANCHK);
1221 vm_page_wakeup(tp);
1222 break;
1224 ma[is] = tp;
1225 ++is;
1229 * All pages in the ma[] array are busied now
1231 for (i = ib; i < is; ++i) {
1232 vm_page_flag_clear(ma[i], PG_CLEANCHK);
1233 vm_page_hold(ma[i]); /* XXX need this any more? */
1235 vm_pageout_flush(&ma[ib], is - ib, pagerflags);
1236 for (i = ib; i < is; ++i) /* XXX need this any more? */
1237 vm_page_unhold(ma[i]);
1241 * Implements the madvise function at the object/page level.
1243 * MADV_WILLNEED (any object)
1245 * Activate the specified pages if they are resident.
1247 * MADV_DONTNEED (any object)
1249 * Deactivate the specified pages if they are resident.
1251 * MADV_FREE (OBJT_DEFAULT/OBJT_SWAP objects, OBJ_ONEMAPPING only)
1253 * Deactivate and clean the specified pages if they are
1254 * resident. This permits the process to reuse the pages
1255 * without faulting or the kernel to reclaim the pages
1256 * without I/O.
1258 * No requirements.
1260 void
1261 vm_object_madvise(vm_object_t object, vm_pindex_t pindex,
1262 vm_pindex_t count, int advise)
1264 vm_pindex_t end;
1265 vm_page_t m;
1266 int error;
1268 if (object == NULL)
1269 return;
1271 end = pindex + count;
1273 vm_object_hold(object);
1276 * Locate and adjust resident pages. This only applies to the
1277 * primary object in the mapping.
1279 for (; pindex < end; pindex += 1) {
1280 relookup:
1282 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
1283 * and those pages must be OBJ_ONEMAPPING.
1285 if (advise == MADV_FREE) {
1286 if ((object->type != OBJT_DEFAULT &&
1287 object->type != OBJT_SWAP) ||
1288 (object->flags & OBJ_ONEMAPPING) == 0) {
1289 continue;
1293 m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
1295 if (error) {
1296 vm_page_sleep_busy(m, TRUE, "madvpo");
1297 goto relookup;
1299 if (m == NULL) {
1301 * There may be swap even if there is no backing page
1303 if (advise == MADV_FREE && object->type == OBJT_SWAP)
1304 swap_pager_freespace(object, pindex, 1);
1305 continue;
1309 * If the page is not in a normal active state, we skip it.
1310 * If the page is not managed there are no page queues to
1311 * mess with. Things can break if we mess with pages in
1312 * any of the below states.
1314 if (m->wire_count ||
1315 (m->flags & (PG_FICTITIOUS | PG_UNQUEUED |
1316 PG_NEED_COMMIT)) ||
1317 m->valid != VM_PAGE_BITS_ALL
1319 vm_page_wakeup(m);
1320 continue;
1324 * Theoretically once a page is known not to be busy, an
1325 * interrupt cannot come along and rip it out from under us.
1327 if (advise == MADV_WILLNEED) {
1328 vm_page_activate(m);
1329 } else if (advise == MADV_DONTNEED) {
1330 vm_page_dontneed(m);
1331 } else if (advise == MADV_FREE) {
1333 * Mark the page clean. This will allow the page
1334 * to be freed up by the system. However, such pages
1335 * are often reused quickly by malloc()/free()
1336 * so we do not do anything that would cause
1337 * a page fault if we can help it.
1339 * Specifically, we do not try to actually free
1340 * the page now nor do we try to put it in the
1341 * cache (which would cause a page fault on reuse).
1343 * But we do make the page as freeable as we
1344 * can without actually taking the step of unmapping
1345 * it.
1347 pmap_clear_modify(m);
1348 m->dirty = 0;
1349 m->act_count = 0;
1350 vm_page_dontneed(m);
1351 if (object->type == OBJT_SWAP)
1352 swap_pager_freespace(object, pindex, 1);
1354 vm_page_wakeup(m);
1356 vm_object_drop(object);
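/*
 * Illustrative call (sketch, hypothetical caller; 'pindex' and 'npages'
 * are assumed): release npages of anonymous pages starting at pindex
 * without forcing I/O:
 *
 *	vm_object_madvise(object, pindex, npages, MADV_FREE);
 */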
1360 * Removes all physical pages in the specified object range from the
1361 * object's list of pages.
1363 * No requirements.
1365 static int vm_object_page_remove_callback(vm_page_t p, void *data);
1367 void
1368 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
1369 boolean_t clean_only)
1371 struct rb_vm_page_scan_info info;
1372 int all;
1375 * Degenerate cases and assertions.
1377 * NOTE: Don't shortcut on resident_page_count for MGTDEVICE objects.
1378 * These objects do not have to have their pages entered into
1379 * them and are handled via their vm_map_backing lists.
1381 vm_object_hold(object);
1382 if (object == NULL ||
1383 (object->type != OBJT_MGTDEVICE &&
1384 object->resident_page_count == 0 && object->swblock_count == 0)) {
1385 vm_object_drop(object);
1386 return;
1388 KASSERT(object->type != OBJT_PHYS,
1389 ("attempt to remove pages from a physical object"));
1392 * Indicate that paging is occurring on the object
1394 vm_object_pip_add(object, 1);
1397 * Figure out the actual removal range and whether we are removing
1398 * the entire contents of the object or not. If removing the entire
1399 * contents, be sure to get all pages, even those that might be
1400 * beyond the end of the object.
1402 * NOTE: end is non-inclusive, but info.end_pindex is inclusive.
1404 info.object = object;
1405 info.start_pindex = start;
1406 if (end == 0 || end == (vm_pindex_t)-1) {
1407 info.end_pindex = (vm_pindex_t)-1;
1408 end = object->size;
1409 } else {
1410 info.end_pindex = end - 1;
1412 info.limit = clean_only;
1413 info.count = 0;
1414 all = (start == 0 && info.end_pindex >= object->size - 1);
1417 * Efficiently remove pages from the pmap via a backing scan.
1419 * NOTE: This is the only way pages can be removed and unwired
1420 * from OBJT_MGTDEVICE devices which typically do not enter
1421 * their pages into the vm_object's RB tree. And possibly
1422 * other OBJT_* types in the future.
1425 vm_map_backing_t ba;
1426 vm_pindex_t sba, eba;
1427 vm_offset_t sva, eva;
1429 lockmgr(&object->backing_lk, LK_EXCLUSIVE);
1430 TAILQ_FOREACH(ba, &object->backing_list, entry) {
1432 * object offset range within the ba, intersected
1433 * with the page range specified for the object
1435 sba = OFF_TO_IDX(ba->offset);
1436 eba = sba + OFF_TO_IDX(ba->end - ba->start);
1437 if (sba < start)
1438 sba = start;
1439 if (eba > end)
1440 eba = end;
1443 * If the intersection is valid, remove the related
1444 * pages.
1446 * NOTE! This may also remove other incidental pages
1447 * in the pmap, as the backing area may be
1448 * overloaded.
1450 * NOTE! pages for MGTDEVICE objects are only removed
1451 * here, they aren't entered into rb_memq, so
1452 * we must use pmap_remove() instead of
1453 * the non-TLB-invalidating pmap_remove_pages().
1455 if (sba < eba) {
1456 sva = ba->start + IDX_TO_OFF(sba) - ba->offset;
1457 eva = sva + IDX_TO_OFF(eba - sba);
1458 #if 0
1459 kprintf("VM_OBJECT_PAGE_REMOVE "
1460 "%p[%016jx] %016jx-%016jx\n",
1461 ba->pmap, ba->start, sva, eva);
1462 #endif
1463 pmap_remove(ba->pmap, sva, eva);
1466 lockmgr(&object->backing_lk, LK_RELEASE);
1470 * Remove and free pages entered onto the object list. Note that
1471 * for OBJT_MGTDEVICE objects, there are typically no pages entered.
1473 * Loop until we are sure we have gotten them all.
1475 do {
1476 info.error = 0;
1477 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1478 vm_object_page_remove_callback, &info);
1479 } while (info.error);
1482 * Remove any related swap if throwing away pages, or for
1483 * non-swap objects (the swap is a clean copy in that case).
1485 if (object->type != OBJT_SWAP || clean_only == FALSE) {
1486 if (all)
1487 swap_pager_freespace_all(object);
1488 else
1489 swap_pager_freespace(object, info.start_pindex,
1490 info.end_pindex - info.start_pindex + 1);
1494 * Cleanup
1496 vm_object_pip_wakeup(object);
1497 vm_object_drop(object);
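/*
 * Illustrative calls (sketch): vm_object_coalesce() below trims a stale
 * range before reusing it, and passing end == 0 removes every resident
 * page:
 *
 *	vm_object_page_remove(object, next_pindex,
 *			      next_pindex + next_size, FALSE);
 *	vm_object_page_remove(object, 0, 0, FALSE);
 */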
1501 * The caller must hold the object.
1503 * NOTE: User yields are allowed when removing more than one page, but not
1504 * allowed if only removing one page (the path for single page removals
1505 * might hold a spinlock).
1507 static int
1508 vm_object_page_remove_callback(vm_page_t p, void *data)
1510 struct rb_vm_page_scan_info *info = data;
1512 if (info->object != p->object ||
1513 p->pindex < info->start_pindex ||
1514 p->pindex > info->end_pindex) {
1515 kprintf("vm_object_page_remove_callbackA: obj/pg race %p/%p\n",
1516 info->object, p);
1517 return(0);
1519 if (vm_page_busy_try(p, TRUE)) {
1520 vm_page_sleep_busy(p, TRUE, "vmopar");
1521 info->error = 1;
1522 return(0);
1524 if (info->object != p->object) {
1525 /* this should never happen */
1526 kprintf("vm_object_page_remove_callbackB: obj/pg race %p/%p\n",
1527 info->object, p);
1528 vm_page_wakeup(p);
1529 return(0);
1533 * Wired pages cannot be destroyed, but they can be invalidated
1534 * and we do so if clean_only (limit) is not set.
1536 * WARNING! The page may be wired due to being part of a buffer
1537 * cache buffer, and the buffer might be marked B_CACHE.
1538 * This is fine as part of a truncation but VFSs must be
1539 * sure to fix the buffer up when re-extending the file.
1541 * NOTE! PG_NEED_COMMIT is ignored.
1543 if (p->wire_count != 0) {
1544 vm_page_protect(p, VM_PROT_NONE);
1545 if (info->limit == 0)
1546 p->valid = 0;
1547 vm_page_wakeup(p);
1548 goto done;
1552 * limit is our clean_only flag. If set and the page is dirty or
1553 * requires a commit, do not free it. If set and the page is being
1554 * held by someone, do not free it.
1556 if (info->limit && p->valid) {
1557 vm_page_test_dirty(p);
1558 if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
1559 vm_page_wakeup(p);
1560 goto done;
1565 * Destroy the page. But we have to re-test whether it's dirty after
1566 * removing it from its pmaps.
1568 vm_page_protect(p, VM_PROT_NONE);
1569 if (info->limit && p->valid) {
1570 vm_page_test_dirty(p);
1571 if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
1572 vm_page_wakeup(p);
1573 goto done;
1576 vm_page_free(p);
1579 * Must be at end to avoid SMP races, caller holds object token
1581 done:
1582 if ((++info->count & 63) == 0)
1583 lwkt_user_yield();
1585 return(0);
1589 * Try to extend prev_object into an adjoining region of virtual
1590 * memory, return TRUE on success.
1592 * The caller does not need to hold (prev_object) but must have a stable
1593 * pointer to it (typically by holding the vm_map locked).
1595 * This function only works for anonymous memory objects which either
1596 * have (a) one reference or (b) we are extending the object's size.
1597 * Otherwise the related VM pages we want to use for the object might
1598 * be in use by another mapping.
1600 boolean_t
1601 vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
1602 vm_size_t prev_size, vm_size_t next_size)
1604 vm_pindex_t next_pindex;
1606 if (prev_object == NULL)
1607 return (TRUE);
1609 vm_object_hold(prev_object);
1611 if (prev_object->type != OBJT_DEFAULT &&
1612 prev_object->type != OBJT_SWAP) {
1613 vm_object_drop(prev_object);
1614 return (FALSE);
1617 #if 0
1618 /* caller now checks this */
1620 * Try to collapse the object first
1622 vm_object_collapse(prev_object, NULL);
1623 #endif
1625 #if 0
1626 /* caller now checks this */
1628 * We can't coalesce if we shadow another object (figuring out the
1629 * relationships becomes too complex).
1631 if (prev_object->backing_object != NULL) {
1632 vm_object_chain_release(prev_object);
1633 vm_object_drop(prev_object);
1634 return (FALSE);
1636 #endif
1638 prev_size >>= PAGE_SHIFT;
1639 next_size >>= PAGE_SHIFT;
1640 next_pindex = prev_pindex + prev_size;
1643 * We can't if the object has more than one ref count unless we
1644 * are extending it into newly minted space.
1646 if (prev_object->ref_count > 1 &&
1647 prev_object->size != next_pindex) {
1648 vm_object_drop(prev_object);
1649 return (FALSE);
1653 * Remove any pages that may still be in the object from a previous
1654 * deallocation.
1656 if (next_pindex < prev_object->size) {
1657 vm_object_page_remove(prev_object,
1658 next_pindex,
1659 next_pindex + next_size, FALSE);
1660 if (prev_object->type == OBJT_SWAP)
1661 swap_pager_freespace(prev_object,
1662 next_pindex, next_size);
1666 * Extend the object if necessary.
1668 if (next_pindex + next_size > prev_object->size)
1669 prev_object->size = next_pindex + next_size;
1670 vm_object_drop(prev_object);
1672 return (TRUE);
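/*
 * Illustrative call (sketch, hypothetical caller extending an anonymous
 * mapping): prev_size and next_size are byte lengths and are converted to
 * page counts inside the function:
 *
 *	if (vm_object_coalesce(prev_object, prev_pindex,
 *			       prev_size, next_size))
 *		... reuse prev_object for the adjoining range ...
 */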
1676 * Make the object writable and flag it as possibly being dirty.
1678 * The object might not be held (or might be held but held shared),
1679 * the related vnode is probably not held either. Object and vnode are
1680 * stable by virtue of the vm_page busied by the caller preventing
1681 * destruction.
1683 * If the related mount is flagged MNTK_THR_SYNC we need to call
1684 * vsetobjdirty(). Filesystems using this option usually shortcut
1685 * synchronization by only scanning the syncer list.
1687 void
1688 vm_object_set_writeable_dirty(vm_object_t object)
1690 struct vnode *vp;
1692 /*vm_object_assert_held(object);*/
1694 * Avoid contention in vm fault path by checking the state before
1695 * issuing an atomic op on it.
1697 if ((object->flags & (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) !=
1698 (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) {
1699 vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
1701 if (object->type == OBJT_VNODE &&
1702 (vp = (struct vnode *)object->handle) != NULL) {
1703 if ((vp->v_flag & VOBJDIRTY) == 0) {
1704 if (vp->v_mount &&
1705 (vp->v_mount->mnt_kern_flag & MNTK_THR_SYNC)) {
1707 * New style THR_SYNC places vnodes on the
1708 * syncer list more deterministically.
1710 vsetobjdirty(vp);
1711 } else {
1713 * Old style scan would not necessarily place
1714 * a vnode on the syncer list when possibly
1715 * modified via mmap.
1717 vsetflags(vp, VOBJDIRTY);
1723 #include "opt_ddb.h"
1724 #ifdef DDB
1725 #include <sys/cons.h>
1727 #include <ddb/ddb.h>
1729 static int _vm_object_in_map (vm_map_t map, vm_object_t object,
1730 vm_map_entry_t entry);
1731 static int vm_object_in_map (vm_object_t object);
1734 * The caller must hold the object.
1736 static int
1737 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
1739 vm_map_backing_t ba;
1740 vm_map_t tmpm;
1741 vm_map_entry_t tmpe;
1742 int entcount;
1744 if (map == NULL)
1745 return 0;
1746 if (entry == NULL) {
1747 tmpe = RB_MIN(vm_map_rb_tree, &map->rb_root);
1748 entcount = map->nentries;
1749 while (entcount-- && tmpe) {
1750 if( _vm_object_in_map(map, object, tmpe)) {
1751 return 1;
1753 tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
1755 return (0);
1757 switch(entry->maptype) {
1758 case VM_MAPTYPE_SUBMAP:
1759 tmpm = entry->ba.sub_map;
1760 tmpe = RB_MIN(vm_map_rb_tree, &tmpm->rb_root);
1761 entcount = tmpm->nentries;
1762 while (entcount-- && tmpe) {
1763 if( _vm_object_in_map(tmpm, object, tmpe)) {
1764 return 1;
1766 tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
1768 break;
1769 case VM_MAPTYPE_NORMAL:
1770 ba = &entry->ba;
1771 while (ba) {
1772 if (ba->object == object)
1773 return TRUE;
1774 ba = ba->backing_ba;
1776 break;
1777 default:
1778 break;
1780 return 0;
1783 static int vm_object_in_map_callback(struct proc *p, void *data);
1785 struct vm_object_in_map_info {
1786 vm_object_t object;
1787 int rv;
1791 * Debugging only
1793 static int
1794 vm_object_in_map(vm_object_t object)
1796 struct vm_object_in_map_info info;
1798 info.rv = 0;
1799 info.object = object;
1801 allproc_scan(vm_object_in_map_callback, &info, 0);
1802 if (info.rv)
1803 return 1;
1804 if( _vm_object_in_map(kernel_map, object, 0))
1805 return 1;
1806 if( _vm_object_in_map(pager_map, object, 0))
1807 return 1;
1808 if( _vm_object_in_map(buffer_map, object, 0))
1809 return 1;
1810 return 0;
1814 * Debugging only
1816 static int
1817 vm_object_in_map_callback(struct proc *p, void *data)
1819 struct vm_object_in_map_info *info = data;
1821 if (p->p_vmspace) {
1822 if (_vm_object_in_map(&p->p_vmspace->vm_map, info->object, 0)) {
1823 info->rv = 1;
1824 return -1;
1827 return (0);
1830 DB_SHOW_COMMAND(vmochk, vm_object_check)
1832 struct vm_object_hash *hash;
1833 vm_object_t object;
1834 int n;
1837 * make sure that internal objs are in a map somewhere
1838 * and none have zero ref counts.
1840 for (n = 0; n < VMOBJ_HSIZE; ++n) {
1841 hash = &vm_object_hash[n];
1842 for (object = TAILQ_FIRST(&hash->list);
1843 object != NULL;
1844 object = TAILQ_NEXT(object, object_entry)) {
1845 if (object->type == OBJT_MARKER)
1846 continue;
1847 if (object->handle != NULL ||
1848 (object->type != OBJT_DEFAULT &&
1849 object->type != OBJT_SWAP)) {
1850 continue;
1852 if (object->ref_count == 0) {
1853 db_printf("vmochk: internal obj has "
1854 "zero ref count: %ld\n",
1855 (long)object->size);
1857 if (vm_object_in_map(object))
1858 continue;
1859 db_printf("vmochk: internal obj is not in a map: "
1860 "ref: %d, size: %lu: 0x%lx\n",
1861 object->ref_count, (u_long)object->size,
1862 (u_long)object->size);
1868 * Debugging only
1870 DB_SHOW_COMMAND(object, vm_object_print_static)
1872 /* XXX convert args. */
1873 vm_object_t object = (vm_object_t)addr;
1874 boolean_t full = have_addr;
1876 vm_page_t p;
1878 /* XXX count is an (unused) arg. Avoid shadowing it. */
1879 #define count was_count
1881 int count;
1883 if (object == NULL)
1884 return;
1886 db_iprintf(
1887 "Object %p: type=%d, size=0x%lx, res=%ld, ref=%d, flags=0x%x\n",
1888 object, (int)object->type, (u_long)object->size,
1889 object->resident_page_count, object->ref_count, object->flags);
1891 * XXX no %qd in kernel. Truncate object->backing_object_offset.
1893 db_iprintf("\n");
1895 if (!full)
1896 return;
1898 db_indent += 2;
1899 count = 0;
1900 RB_FOREACH(p, vm_page_rb_tree, &object->rb_memq) {
1901 if (count == 0)
1902 db_iprintf("memory:=");
1903 else if (count == 6) {
1904 db_printf("\n");
1905 db_iprintf(" ...");
1906 count = 0;
1907 } else
1908 db_printf(",");
1909 count++;
1911 db_printf("(off=0x%lx,page=0x%lx)",
1912 (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p));
1914 if (count != 0)
1915 db_printf("\n");
1916 db_indent -= 2;
1919 /* XXX. */
1920 #undef count
1923 * XXX need this non-static entry for calling from vm_map_print.
1925 * Debugging only
1927 void
1928 vm_object_print(/* db_expr_t */ long addr,
1929 boolean_t have_addr,
1930 /* db_expr_t */ long count,
1931 char *modif)
1933 vm_object_print_static(addr, have_addr, count, modif);
1937 * Debugging only
1939 DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
1941 struct vm_object_hash *hash;
1942 vm_object_t object;
1943 int nl = 0;
1944 int c;
1945 int n;
1947 for (n = 0; n < VMOBJ_HSIZE; ++n) {
1948 hash = &vm_object_hash[n];
1949 for (object = TAILQ_FIRST(&hash->list);
1950 object != NULL;
1951 object = TAILQ_NEXT(object, object_entry)) {
1952 vm_pindex_t idx, fidx;
1953 vm_pindex_t osize;
1954 vm_paddr_t pa = -1, padiff;
1955 int rcount;
1956 vm_page_t m;
1958 if (object->type == OBJT_MARKER)
1959 continue;
1960 db_printf("new object: %p\n", (void *)object);
1961 if ( nl > 18) {
1962 c = cngetc();
1963 if (c != ' ')
1964 return;
1965 nl = 0;
1967 nl++;
1968 rcount = 0;
1969 fidx = 0;
1970 osize = object->size;
1971 if (osize > 128)
1972 osize = 128;
1973 for (idx = 0; idx < osize; idx++) {
1974 m = vm_page_lookup(object, idx);
1975 if (m == NULL) {
1976 if (rcount) {
1977 db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
1978 (long)fidx, rcount, (long)pa);
1979 if ( nl > 18) {
1980 c = cngetc();
1981 if (c != ' ')
1982 return;
1983 nl = 0;
1985 nl++;
1986 rcount = 0;
1988 continue;
1991 if (rcount &&
1992 (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
1993 ++rcount;
1994 continue;
1996 if (rcount) {
1997 padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m);
1998 padiff >>= PAGE_SHIFT;
1999 padiff &= PQ_L2_MASK;
2000 if (padiff == 0) {
2001 pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE;
2002 ++rcount;
2003 continue;
2005 db_printf(" index(%ld)run(%d)pa(0x%lx)",
2006 (long)fidx, rcount, (long)pa);
2007 db_printf("pd(%ld)\n", (long)padiff);
2008 if ( nl > 18) {
2009 c = cngetc();
2010 if (c != ' ')
2011 return;
2012 nl = 0;
2014 nl++;
2016 fidx = idx;
2017 pa = VM_PAGE_TO_PHYS(m);
2018 rcount = 1;
2020 if (rcount) {
2021 db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
2022 (long)fidx, rcount, (long)pa);
2023 if ( nl > 18) {
2024 c = cngetc();
2025 if (c != ' ')
2026 return;
2027 nl = 0;
2029 nl++;
2034 #endif /* DDB */