kernel - VM rework part 9 - Precursor work for terminal pv_entry removal
[dragonfly.git] / sys / vm / vm_object.c
1 /*
2 * Copyright (c) 1991, 1993, 2013
3 * The Regents of the University of California. All rights reserved.
5 * This code is derived from software contributed to Berkeley by
6 * The Mach Operating System project at Carnegie-Mellon University.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
32 * from: @(#)vm_object.c 8.5 (Berkeley) 3/22/94
35 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
36 * All rights reserved.
38 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
40 * Permission to use, copy, modify and distribute this software and
41 * its documentation is hereby granted, provided that both the copyright
42 * notice and this permission notice appear in all copies of the
43 * software, derivative works or modified versions, and any portions
44 * thereof, and that both notices appear in supporting documentation.
46 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
47 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
48 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
50 * Carnegie Mellon requests users of this software to return to
52 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
53 * School of Computer Science
54 * Carnegie Mellon University
55 * Pittsburgh PA 15213-3890
57 * any improvements or extensions that they make and grant Carnegie the
58 * rights to redistribute these changes.
60 * $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $
64 * Virtual memory object module.
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/proc.h> /* for curproc, pageproc */
70 #include <sys/thread.h>
71 #include <sys/vnode.h>
72 #include <sys/vmmeter.h>
73 #include <sys/mman.h>
74 #include <sys/mount.h>
75 #include <sys/kernel.h>
76 #include <sys/sysctl.h>
77 #include <sys/refcount.h>
79 #include <vm/vm.h>
80 #include <vm/vm_param.h>
81 #include <vm/pmap.h>
82 #include <vm/vm_map.h>
83 #include <vm/vm_object.h>
84 #include <vm/vm_page.h>
85 #include <vm/vm_pageout.h>
86 #include <vm/vm_pager.h>
87 #include <vm/swap_pager.h>
88 #include <vm/vm_kern.h>
89 #include <vm/vm_extern.h>
90 #include <vm/vm_zone.h>
92 #include <vm/vm_page2.h>
94 #include <machine/specialreg.h>
96 #define EASY_SCAN_FACTOR 8
98 static void vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
99 int pagerflags);
100 static void vm_object_lock_init(vm_object_t);
103 * Virtual memory objects maintain the actual data
104 * associated with allocated virtual memory. A given
105 * page of memory exists within exactly one object.
107 * An object is only deallocated when all "references"
108 * are given up. Only one "reference" to a given
109 * region of an object should be writeable.
111 * Associated with each object is a list of all resident
112 * memory pages belonging to that object; this list is
113 * maintained by the "vm_page" module, and locked by the object's
114 * lock.
116 * Each object also records a "pager" routine which is
117 * used to retrieve (and store) pages to the proper backing
118 * storage. In addition, objects may be backed by other
119 * objects from which they were virtual-copied.
121 * The only items within the object structure which are
122 * modified after time of creation are:
123 * reference count locked by object's lock
124 * pager routine locked by object's lock
128 struct vm_object kernel_object;
130 struct vm_object_hash vm_object_hash[VMOBJ_HSIZE];
132 MALLOC_DEFINE(M_VM_OBJECT, "vm_object", "vm_object structures");
134 #define VMOBJ_HASH_PRIME1 66555444443333333ULL
135 #define VMOBJ_HASH_PRIME2 989042931893ULL
137 int vm_object_debug;
138 SYSCTL_INT(_vm, OID_AUTO, object_debug, CTLFLAG_RW, &vm_object_debug, 0, "");
140 static __inline
141 struct vm_object_hash *
142 vmobj_hash(vm_object_t obj)
144 uintptr_t hash1;
145 uintptr_t hash2;
147 hash1 = (uintptr_t)obj + ((uintptr_t)obj >> 18);
148 hash1 %= VMOBJ_HASH_PRIME1;
149 hash2 = ((uintptr_t)obj >> 8) + ((uintptr_t)obj >> 24);
150 hash2 %= VMOBJ_HASH_PRIME2;
151 return (&vm_object_hash[(hash1 ^ hash2) & VMOBJ_HMASK]);
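/*
 * Illustrative sketch only (not part of the build): the per-bucket token
 * returned by vmobj_hash() is how insert/remove operations on the global
 * object lists are serialized, mirroring the pattern used later in
 * _vm_object_allocate() and vm_object_terminate().
 */
#if 0
	struct vm_object_hash *hash;

	hash = vmobj_hash(object);
	lwkt_gettoken(&hash->token);
	TAILQ_INSERT_TAIL(&hash->list, object, object_entry);
	lwkt_reltoken(&hash->token);
#endif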
154 #if defined(DEBUG_LOCKS)
156 #define vm_object_vndeallocate(obj, vpp) \
157 debugvm_object_vndeallocate(obj, vpp, __FILE__, __LINE__)
160 * Debug helper to track hold/drop/ref/deallocate calls.
162 static void
163 debugvm_object_add(vm_object_t obj, char *file, int line, int addrem)
165 int i;
167 i = atomic_fetchadd_int(&obj->debug_index, 1);
168 i = i & (VMOBJ_DEBUG_ARRAY_SIZE - 1);
169 ksnprintf(obj->debug_hold_thrs[i],
170 sizeof(obj->debug_hold_thrs[i]),
171 "%c%d:(%d):%s",
172 (addrem == -1 ? '-' : (addrem == 1 ? '+' : '=')),
173 (curthread->td_proc ? curthread->td_proc->p_pid : -1),
174 obj->ref_count,
175 curthread->td_comm);
176 obj->debug_hold_file[i] = file;
177 obj->debug_hold_line[i] = line;
178 #if 0
179 /* Uncomment for debugging obj refs/derefs in reproducible cases */
180 if (strcmp(curthread->td_comm, "sshd") == 0) {
181 kprintf("%d %p refs=%d ar=%d file: %s/%d\n",
182 (curthread->td_proc ? curthread->td_proc->p_pid : -1),
183 obj, obj->ref_count, addrem, file, line);
185 #endif
188 #endif
191 * Misc low level routines
193 static void
194 vm_object_lock_init(vm_object_t obj)
196 #if defined(DEBUG_LOCKS)
197 int i;
199 obj->debug_index = 0;
200 for (i = 0; i < VMOBJ_DEBUG_ARRAY_SIZE; i++) {
201 obj->debug_hold_thrs[i][0] = 0;
202 obj->debug_hold_file[i] = NULL;
203 obj->debug_hold_line[i] = 0;
205 #endif
208 void
209 vm_object_lock_swap(void)
211 lwkt_token_swap();
214 void
215 vm_object_lock(vm_object_t obj)
217 lwkt_gettoken(&obj->token);
221 * Returns TRUE on success
223 static int
224 vm_object_lock_try(vm_object_t obj)
226 return(lwkt_trytoken(&obj->token));
229 void
230 vm_object_lock_shared(vm_object_t obj)
232 lwkt_gettoken_shared(&obj->token);
235 void
236 vm_object_unlock(vm_object_t obj)
238 lwkt_reltoken(&obj->token);
241 void
242 vm_object_upgrade(vm_object_t obj)
244 lwkt_reltoken(&obj->token);
245 lwkt_gettoken(&obj->token);
248 void
249 vm_object_downgrade(vm_object_t obj)
251 lwkt_reltoken(&obj->token);
252 lwkt_gettoken_shared(&obj->token);
255 static __inline void
256 vm_object_assert_held(vm_object_t obj)
258 ASSERT_LWKT_TOKEN_HELD(&obj->token);
262 vm_quickcolor(void)
264 globaldata_t gd = mycpu;
265 int pg_color;
267 pg_color = (int)(intptr_t)gd->gd_curthread >> 10;
268 pg_color += gd->gd_quick_color;
269 gd->gd_quick_color += PQ_PRIME2;
271 return pg_color;
274 void
275 VMOBJDEBUG(vm_object_hold)(vm_object_t obj VMOBJDBARGS)
277 KKASSERT(obj != NULL);
280 * Object must be held (object allocation is stable due to the caller's
281 * context, typically already holding the token on a parent object)
282 * prior to potentially blocking on the lock, otherwise the object
283 * can get ripped away from us.
285 refcount_acquire(&obj->hold_count);
286 vm_object_lock(obj);
288 #if defined(DEBUG_LOCKS)
289 debugvm_object_add(obj, file, line, 1);
290 #endif
294 VMOBJDEBUG(vm_object_hold_try)(vm_object_t obj VMOBJDBARGS)
296 KKASSERT(obj != NULL);
299 * Object must be held (object allocation is stable due to the caller's
300 * context, typically already holding the token on a parent object)
301 * prior to potentially blocking on the lock, otherwise the object
302 * can get ripped away from us.
304 refcount_acquire(&obj->hold_count);
305 if (vm_object_lock_try(obj) == 0) {
306 if (refcount_release(&obj->hold_count)) {
307 if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD))
308 kfree(obj, M_VM_OBJECT);
310 return(0);
313 #if defined(DEBUG_LOCKS)
314 debugvm_object_add(obj, file, line, 1);
315 #endif
316 return(1);
319 void
320 VMOBJDEBUG(vm_object_hold_shared)(vm_object_t obj VMOBJDBARGS)
322 KKASSERT(obj != NULL);
325 * Object must be held (object allocation is stable due to the caller's
326 * context, typically already holding the token on a parent object)
327 * prior to potentially blocking on the lock, otherwise the object
328 * can get ripped away from us.
330 refcount_acquire(&obj->hold_count);
331 vm_object_lock_shared(obj);
333 #if defined(DEBUG_LOCKS)
334 debugvm_object_add(obj, file, line, 1);
335 #endif
339 * Drop the token and hold_count on the object.
341 * WARNING! Token might be shared.
343 void
344 VMOBJDEBUG(vm_object_drop)(vm_object_t obj VMOBJDBARGS)
346 if (obj == NULL)
347 return;
350 * No new holders should be possible once we drop hold_count 1->0 as
351 * there is no longer any way to reference the object.
353 KKASSERT(obj->hold_count > 0);
354 if (refcount_release(&obj->hold_count)) {
355 #if defined(DEBUG_LOCKS)
356 debugvm_object_add(obj, file, line, -1);
357 #endif
359 if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) {
360 vm_object_unlock(obj);
361 kfree(obj, M_VM_OBJECT);
362 } else {
363 vm_object_unlock(obj);
365 } else {
366 #if defined(DEBUG_LOCKS)
367 debugvm_object_add(obj, file, line, -1);
368 #endif
369 vm_object_unlock(obj);
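/*
 * Usage sketch (illustrative, not compiled): a caller with a stable
 * pointer to the object, typically via a token held on a parent
 * structure, takes a hold before potentially blocking on the object
 * token and drops it when done.  The hold prevents the final kfree()
 * from occurring while the token is still being acquired or held.
 */
#if 0
	vm_object_hold(object);
	/* ... operate on the object with its token held ... */
	vm_object_drop(object);
#endif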
374 * Initialize a freshly allocated object, returning a held object.
376 * Used only by vm_object_allocate(), zinitna() and vm_object_init().
378 * No requirements.
380 void
381 _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
383 struct vm_object_hash *hash;
385 RB_INIT(&object->rb_memq);
386 lwkt_token_init(&object->token, "vmobj");
388 TAILQ_INIT(&object->backing_list);
389 object->type = type;
390 object->size = size;
391 object->ref_count = 1;
392 object->memattr = VM_MEMATTR_DEFAULT;
393 object->hold_count = 0;
394 object->flags = 0;
395 if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
396 vm_object_set_flag(object, OBJ_ONEMAPPING);
397 object->paging_in_progress = 0;
398 object->resident_page_count = 0;
399 /* cpu localization twist */
400 object->pg_color = vm_quickcolor();
401 object->handle = NULL;
403 atomic_add_int(&object->generation, 1);
404 object->swblock_count = 0;
405 RB_INIT(&object->swblock_root);
406 vm_object_lock_init(object);
407 pmap_object_init(object);
409 vm_object_hold(object);
411 hash = vmobj_hash(object);
412 lwkt_gettoken(&hash->token);
413 TAILQ_INSERT_TAIL(&hash->list, object, object_entry);
414 lwkt_reltoken(&hash->token);
418 * Initialize a VM object.
420 void
421 vm_object_init(vm_object_t object, vm_pindex_t size)
423 _vm_object_allocate(OBJT_DEFAULT, size, object);
424 vm_object_drop(object);
428 * Initialize the VM objects module.
430 * Called from the low level boot code only. Note that this occurs before
431 * kmalloc is initialized so we cannot allocate any VM objects.
433 void
434 vm_object_init1(void)
436 int i;
438 for (i = 0; i < VMOBJ_HSIZE; ++i) {
439 TAILQ_INIT(&vm_object_hash[i].list);
440 lwkt_token_init(&vm_object_hash[i].token, "vmobjlst");
443 _vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(KvaEnd),
444 &kernel_object);
445 vm_object_drop(&kernel_object);
448 void
449 vm_object_init2(void)
451 kmalloc_set_unlimited(M_VM_OBJECT);
455 * Allocate and return a new object of the specified type and size.
457 * No requirements.
459 vm_object_t
460 vm_object_allocate(objtype_t type, vm_pindex_t size)
462 vm_object_t obj;
464 obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
465 _vm_object_allocate(type, size, obj);
466 vm_object_drop(obj);
468 return (obj);
472 * This version returns a held object, allowing further atomic initialization
473 * of the object.
475 vm_object_t
476 vm_object_allocate_hold(objtype_t type, vm_pindex_t size)
478 vm_object_t obj;
480 obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
481 _vm_object_allocate(type, size, obj);
483 return (obj);
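/*
 * Illustrative sketch (hypothetical caller, not taken from this file):
 * the held variant lets a caller finish initializing fields such as the
 * pager handle before dropping the object into general visibility.
 */
#if 0
	vm_object_t obj;

	obj = vm_object_allocate_hold(OBJT_DEFAULT, size);
	obj->handle = handle;		/* hypothetical handle setup */
	vm_object_drop(obj);
#endif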
487 * Add an additional reference to a vm_object. The object must already be
488 * held. The original non-lock version is no longer supported. The object
489 * must NOT be chain locked by anyone at the time the reference is added.
491 * The object must be held, but may be held shared if desired (hence why
492 * we use an atomic op).
494 void
495 VMOBJDEBUG(vm_object_reference_locked)(vm_object_t object VMOBJDBARGS)
497 KKASSERT(object != NULL);
498 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
499 atomic_add_int(&object->ref_count, 1);
500 if (object->type == OBJT_VNODE) {
501 vref(object->handle);
502 /* XXX what if the vnode is being destroyed? */
504 #if defined(DEBUG_LOCKS)
505 debugvm_object_add(object, file, line, 1);
506 #endif
510 * This version is only allowed in situations where the caller
511 * already knows that the object is deterministically referenced
512 * (usually because its taken from a ref'd vnode, or during a map_entry
513 * replication).
515 void
516 VMOBJDEBUG(vm_object_reference_quick)(vm_object_t object VMOBJDBARGS)
518 KKASSERT(object->type == OBJT_VNODE || object->ref_count > 0);
519 atomic_add_int(&object->ref_count, 1);
520 if (object->type == OBJT_VNODE)
521 vref(object->handle);
522 #if defined(DEBUG_LOCKS)
523 debugvm_object_add(object, file, line, 1);
524 #endif
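/*
 * Illustrative pairing (sketch only): every reference added via the
 * locked or quick variants must eventually be matched by a
 * vm_object_deallocate() (or vm_object_deallocate_locked()).
 */
#if 0
	vm_object_hold(object);
	vm_object_reference_locked(object);
	vm_object_drop(object);
	/* ... later, when the reference is no longer needed ... */
	vm_object_deallocate(object);
#endif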
528 * Dereference an object and its underlying vnode. The object may be
529 * held shared. On return the object will remain held.
531 * This function may return a vnode in *vpp which the caller must release
532 * after the caller drops its own lock. If vpp is NULL, we assume that
533 * the caller was holding an exclusive lock on the object and we vrele()
534 * the vp ourselves.
536 static void
537 VMOBJDEBUG(vm_object_vndeallocate)(vm_object_t object, struct vnode **vpp
538 VMOBJDBARGS)
540 struct vnode *vp = (struct vnode *) object->handle;
542 KASSERT(object->type == OBJT_VNODE,
543 ("vm_object_vndeallocate: not a vnode object"));
544 KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
545 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
546 #ifdef INVARIANTS
547 if (object->ref_count == 0) {
548 vprint("vm_object_vndeallocate", vp);
549 panic("vm_object_vndeallocate: bad object reference count");
551 #endif
552 for (;;) {
553 int count = object->ref_count;
554 cpu_ccfence();
555 if (count == 1) {
556 vm_object_upgrade(object);
557 if (atomic_cmpset_int(&object->ref_count, count, 0)) {
558 vclrflags(vp, VTEXT);
559 break;
561 } else {
562 if (atomic_cmpset_int(&object->ref_count,
563 count, count - 1)) {
564 break;
567 /* retry */
569 #if defined(DEBUG_LOCKS)
570 debugvm_object_add(object, file, line, -1);
571 #endif
574 * vrele or return the vp to vrele. We can only safely vrele(vp)
575 * if the object was locked exclusively. But there are two races
576 * here.
578 * We had to upgrade the object above to safely clear VTEXT
579 * but the alternative path where the shared lock is retained
580 * can STILL race to 0 in other paths and cause our own vrele()
581 * to terminate the vnode. We can't allow that if the VM object
582 * is still locked shared.
584 if (vpp)
585 *vpp = vp;
586 else
587 vrele(vp);
591 * Release a reference to the specified object, gained either through a
592 * vm_object_allocate or a vm_object_reference call. When all references
593 * are gone, storage associated with this object may be relinquished.
595 * The caller does not have to hold the object locked but must have control
596 * over the reference in question in order to guarantee that the object
597 * does not get ripped out from under us.
599 * XXX Currently all deallocations require an exclusive lock.
601 void
602 VMOBJDEBUG(vm_object_deallocate)(vm_object_t object VMOBJDBARGS)
604 struct vnode *vp;
605 int count;
607 if (object == NULL)
608 return;
610 for (;;) {
611 count = object->ref_count;
612 cpu_ccfence();
615 * If decrementing the count enters into special handling
616 * territory (0, 1, or 2) we have to do it the hard way.
617 * Fortunately though, objects with only a few refs like this
618 * are not likely to be heavily contended anyway.
620 * For vnode objects we only care about 1->0 transitions.
622 if (count <= 3 || (object->type == OBJT_VNODE && count <= 1)) {
623 #if defined(DEBUG_LOCKS)
624 debugvm_object_add(object, file, line, 0);
625 #endif
626 vm_object_hold(object);
627 vm_object_deallocate_locked(object);
628 vm_object_drop(object);
629 break;
633 * Try to decrement ref_count without acquiring a hold on
634 * the object. This is particularly important for the exec*()
635 * and exit*() code paths because the program binary may
636 * have a great deal of sharing and an exclusive lock will
637 * crowbar performance in those circumstances.
639 if (object->type == OBJT_VNODE) {
640 vp = (struct vnode *)object->handle;
641 if (atomic_cmpset_int(&object->ref_count,
642 count, count - 1)) {
643 #if defined(DEBUG_LOCKS)
644 debugvm_object_add(object, file, line, -1);
645 #endif
647 vrele(vp);
648 break;
650 /* retry */
651 } else {
652 if (atomic_cmpset_int(&object->ref_count,
653 count, count - 1)) {
654 #if defined(DEBUG_LOCKS)
655 debugvm_object_add(object, file, line, -1);
656 #endif
657 break;
659 /* retry */
661 /* retry */
665 void
666 VMOBJDEBUG(vm_object_deallocate_locked)(vm_object_t object VMOBJDBARGS)
669 * Degenerate case
671 if (object == NULL)
672 return;
675 * vnode case, caller either locked the object exclusively
676 * or this is a recursion with must_drop != 0 and the vnode
677 * object will be locked shared.
679 * If locked shared we have to drop the object before we can
680 * call vrele() or risk a shared/exclusive livelock.
682 if (object->type == OBJT_VNODE) {
683 ASSERT_LWKT_TOKEN_HELD(&object->token);
684 vm_object_vndeallocate(object, NULL);
685 return;
687 ASSERT_LWKT_TOKEN_HELD_EXCL(&object->token);
690 * Normal case (object is locked exclusively)
692 if (object->ref_count == 0) {
693 panic("vm_object_deallocate: object deallocated "
694 "too many times: %d", object->type);
696 if (object->ref_count > 2) {
697 atomic_add_int(&object->ref_count, -1);
698 #if defined(DEBUG_LOCKS)
699 debugvm_object_add(object, file, line, -1);
700 #endif
701 return;
705 * Drop the ref and handle termination on the 1->0 transition.
706 * We may have blocked above so we have to recheck.
708 KKASSERT(object->ref_count != 0);
709 if (object->ref_count >= 2) {
710 atomic_add_int(&object->ref_count, -1);
711 #if defined(DEBUG_LOCKS)
712 debugvm_object_add(object, file, line, -1);
713 #endif
714 return;
717 atomic_add_int(&object->ref_count, -1);
718 if ((object->flags & OBJ_DEAD) == 0)
719 vm_object_terminate(object);
723 * Destroy the specified object, freeing up related resources.
725 * The object must have zero references.
727 * The object must held. The caller is responsible for dropping the object
728 * after terminate returns. Terminate does NOT drop the object.
730 static int vm_object_terminate_callback(vm_page_t p, void *data);
732 void
733 vm_object_terminate(vm_object_t object)
735 struct rb_vm_page_scan_info info;
736 struct vm_object_hash *hash;
739 * Make sure no one uses us. Once we set OBJ_DEAD we should be
740 * able to safely block.
742 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
743 KKASSERT((object->flags & OBJ_DEAD) == 0);
744 vm_object_set_flag(object, OBJ_DEAD);
747 * Wait for the pageout daemon to be done with the object
749 vm_object_pip_wait(object, "objtrm1");
751 KASSERT(!object->paging_in_progress,
752 ("vm_object_terminate: pageout in progress"));
755 * Clean and free the pages, as appropriate. All references to the
756 * object are gone, so we don't need to lock it.
758 if (object->type == OBJT_VNODE) {
759 struct vnode *vp;
762 * Clean pages and flush buffers.
764 * NOTE! TMPFS buffer flushes do not typically flush the
765 * actual page to swap as this would be highly
766 * inefficient, and normal filesystems usually wrap
767 * page flushes with buffer cache buffers.
769 * To deal with this we have to call vinvalbuf() both
770 * before and after the vm_object_page_clean().
772 vp = (struct vnode *) object->handle;
773 vinvalbuf(vp, V_SAVE, 0, 0);
774 vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
775 vinvalbuf(vp, V_SAVE, 0, 0);
779 * Wait for any I/O to complete, after which there had better not
780 * be any references left on the object.
782 vm_object_pip_wait(object, "objtrm2");
784 if (object->ref_count != 0) {
785 panic("vm_object_terminate: object with references, "
786 "ref_count=%d", object->ref_count);
790 * Cleanup any shared pmaps associated with this object.
792 pmap_object_free(object);
795 * Now free any remaining pages. For internal objects, this also
796 * removes them from paging queues. Don't free wired pages, just
797 * remove them from the object.
799 info.count = 0;
800 info.object = object;
801 do {
802 info.error = 0;
803 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
804 vm_object_terminate_callback, &info);
805 } while (info.error);
808 * Let the pager know object is dead.
810 vm_pager_deallocate(object);
813 * Wait for the object hold count to hit 1, clean out pages as
814 * we go. vmobj_token interlocks any race conditions that might
815 * pick the object up from the vm_object_list after we have cleared
816 * rb_memq.
818 for (;;) {
819 if (RB_ROOT(&object->rb_memq) == NULL)
820 break;
821 kprintf("vm_object_terminate: Warning, object %p "
822 "still has %ld pages\n",
823 object, object->resident_page_count);
824 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
825 vm_object_terminate_callback, &info);
829 * There had better not be any pages left
831 KKASSERT(object->resident_page_count == 0);
834 * Remove the object from the global object list.
836 hash = vmobj_hash(object);
837 lwkt_gettoken(&hash->token);
838 TAILQ_REMOVE(&hash->list, object, object_entry);
839 lwkt_reltoken(&hash->token);
841 if (object->ref_count != 0) {
842 panic("vm_object_terminate2: object with references, "
843 "ref_count=%d", object->ref_count);
847 * NOTE: The object hold_count is at least 1, so we cannot kfree()
848 * the object here. See vm_object_drop().
853 * The caller must hold the object.
855 static int
856 vm_object_terminate_callback(vm_page_t p, void *data)
858 struct rb_vm_page_scan_info *info = data;
859 vm_object_t object;
861 object = p->object;
862 KKASSERT(object == info->object);
863 if (vm_page_busy_try(p, TRUE)) {
864 vm_page_sleep_busy(p, TRUE, "vmotrm");
865 info->error = 1;
866 return 0;
868 if (object != p->object) {
869 /* XXX remove once we determine it can't happen */
870 kprintf("vm_object_terminate: Warning: Encountered "
871 "busied page %p on queue %d\n", p, p->queue);
872 vm_page_wakeup(p);
873 info->error = 1;
874 } else if (p->wire_count == 0) {
876 * NOTE: p->dirty and PG_NEED_COMMIT are ignored.
878 vm_page_free(p);
879 mycpu->gd_cnt.v_pfree++;
880 } else {
881 if (p->queue != PQ_NONE) {
882 kprintf("vm_object_terminate: Warning: Encountered "
883 "wired page %p on queue %d\n", p, p->queue);
884 if (vm_object_debug > 0) {
885 --vm_object_debug;
886 print_backtrace(10);
889 vm_page_remove(p);
890 vm_page_wakeup(p);
894 * Must be at end to avoid SMP races, caller holds object token
896 if ((++info->count & 63) == 0)
897 lwkt_user_yield();
898 return(0);
902 * Clean all dirty pages in the specified range of object. Leaves page
903 * on whatever queue it is currently on. If NOSYNC is set then do not
904 * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC),
905 * leaving the object dirty.
907 * When stuffing pages asynchronously, allow clustering. XXX we need a
908 * synchronous clustering mode implementation.
910 * Odd semantics: if start == end, we clean everything.
912 * The object must be locked? XXX
914 static int vm_object_page_clean_pass1(struct vm_page *p, void *data);
915 static int vm_object_page_clean_pass2(struct vm_page *p, void *data);
917 void
918 vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
919 int flags)
921 struct rb_vm_page_scan_info info;
922 struct vnode *vp;
923 int wholescan;
924 int pagerflags;
925 int generation;
927 vm_object_hold(object);
928 if (object->type != OBJT_VNODE ||
929 (object->flags & OBJ_MIGHTBEDIRTY) == 0) {
930 vm_object_drop(object);
931 return;
934 pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ?
935 VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK;
936 pagerflags |= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0;
938 vp = object->handle;
941 * Interlock other major object operations. This allows us to
942 * temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY.
944 vm_object_set_flag(object, OBJ_CLEANING);
947 * Handle 'entire object' case
949 info.start_pindex = start;
950 if (end == 0) {
951 info.end_pindex = object->size - 1;
952 } else {
953 info.end_pindex = end - 1;
955 wholescan = (start == 0 && info.end_pindex == object->size - 1);
956 info.limit = flags;
957 info.pagerflags = pagerflags;
958 info.object = object;
961 * If cleaning the entire object do a pass to mark the pages read-only.
962 * If everything worked out ok, clear OBJ_WRITEABLE and
963 * OBJ_MIGHTBEDIRTY.
965 if (wholescan) {
966 info.error = 0;
967 info.count = 0;
968 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
969 vm_object_page_clean_pass1, &info);
970 if (info.error == 0) {
971 vm_object_clear_flag(object,
972 OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
973 if (object->type == OBJT_VNODE &&
974 (vp = (struct vnode *)object->handle) != NULL) {
976 * Use new-style interface to clear VISDIRTY
977 * because the vnode is not necessarily removed
978 * from the syncer list(s) as often as it was
979 * under the old interface, which can leave
980 * the vnode on the syncer list after reclaim.
982 vclrobjdirty(vp);
988 * Do a pass to clean all the dirty pages we find.
990 do {
991 info.error = 0;
992 info.count = 0;
993 generation = object->generation;
994 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
995 vm_object_page_clean_pass2, &info);
996 } while (info.error || generation != object->generation);
998 vm_object_clear_flag(object, OBJ_CLEANING);
999 vm_object_drop(object);
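/*
 * Usage sketch (mirrors the call made from vm_object_terminate()):
 * passing start == end == 0 cleans the entire object, and OBJPC_SYNC
 * forces synchronous pageouts.
 */
#if 0
	vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
#endif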
1003 * The caller must hold the object.
1005 static
1007 vm_object_page_clean_pass1(struct vm_page *p, void *data)
1009 struct rb_vm_page_scan_info *info = data;
1011 KKASSERT(p->object == info->object);
1013 vm_page_flag_set(p, PG_CLEANCHK);
1014 if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1015 info->error = 1;
1016 } else if (vm_page_busy_try(p, FALSE)) {
1017 info->error = 1;
1018 } else {
1019 KKASSERT(p->object == info->object);
1020 vm_page_protect(p, VM_PROT_READ);
1021 vm_page_wakeup(p);
1025 * Must be at end to avoid SMP races, caller holds object token
1027 if ((++info->count & 63) == 0)
1028 lwkt_user_yield();
1029 return(0);
1033 * The caller must hold the object
1035 static
1037 vm_object_page_clean_pass2(struct vm_page *p, void *data)
1039 struct rb_vm_page_scan_info *info = data;
1040 int generation;
1042 KKASSERT(p->object == info->object);
1045 * Do not mess with pages that were inserted after we started
1046 * the cleaning pass.
1048 if ((p->flags & PG_CLEANCHK) == 0)
1049 goto done;
1051 generation = info->object->generation;
1053 if (vm_page_busy_try(p, TRUE)) {
1054 vm_page_sleep_busy(p, TRUE, "vpcwai");
1055 info->error = 1;
1056 goto done;
1059 KKASSERT(p->object == info->object &&
1060 info->object->generation == generation);
1063 * Before wasting time traversing the pmaps, check for trivial
1064 * cases where the page cannot be dirty.
1066 if (p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) {
1067 KKASSERT((p->dirty & p->valid) == 0 &&
1068 (p->flags & PG_NEED_COMMIT) == 0);
1069 vm_page_wakeup(p);
1070 goto done;
1074 * Check whether the page is dirty or not. The page has been set
1075 * to be read-only so the check will not race a user dirtying the
1076 * page.
1078 vm_page_test_dirty(p);
1079 if ((p->dirty & p->valid) == 0 && (p->flags & PG_NEED_COMMIT) == 0) {
1080 vm_page_flag_clear(p, PG_CLEANCHK);
1081 vm_page_wakeup(p);
1082 goto done;
1086 * If we have been asked to skip nosync pages and this is a
1087 * nosync page, skip it. Note that the object flags were
1088 * not cleared in this case (because pass1 will have returned an
1089 * error), so we do not have to set them.
1091 if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1092 vm_page_flag_clear(p, PG_CLEANCHK);
1093 vm_page_wakeup(p);
1094 goto done;
1098 * Flush as many pages as we can. PG_CLEANCHK will be cleared on
1099 * the pages that get successfully flushed. Set info->error if
1100 * we raced an object modification.
1102 vm_object_page_collect_flush(info->object, p, info->pagerflags);
1103 /* vm_wait_nominal(); this can deadlock the system in syncer/pageout */
1106 * Must be at end to avoid SMP races, caller holds object token
1108 done:
1109 if ((++info->count & 63) == 0)
1110 lwkt_user_yield();
1111 return(0);
1115 * Collect the specified page and nearby pages and flush them out
1116 * (the routine itself returns nothing). The passed page is busied
1117 * by the caller and we are responsible for its disposition.
1119 * The caller must hold the object.
1121 static void
1122 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags)
1124 int error;
1125 int is;
1126 int ib;
1127 int i;
1128 int page_base;
1129 vm_pindex_t pi;
1130 vm_page_t ma[BLIST_MAX_ALLOC];
1132 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1134 pi = p->pindex;
1135 page_base = pi % BLIST_MAX_ALLOC;
1136 ma[page_base] = p;
1137 ib = page_base - 1;
1138 is = page_base + 1;
1140 while (ib >= 0) {
1141 vm_page_t tp;
1143 tp = vm_page_lookup_busy_try(object, pi - page_base + ib,
1144 TRUE, &error);
1145 if (error)
1146 break;
1147 if (tp == NULL)
1148 break;
1149 if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
1150 (tp->flags & PG_CLEANCHK) == 0) {
1151 vm_page_wakeup(tp);
1152 break;
1154 if ((tp->queue - tp->pc) == PQ_CACHE) {
1155 vm_page_flag_clear(tp, PG_CLEANCHK);
1156 vm_page_wakeup(tp);
1157 break;
1159 vm_page_test_dirty(tp);
1160 if ((tp->dirty & tp->valid) == 0 &&
1161 (tp->flags & PG_NEED_COMMIT) == 0) {
1162 vm_page_flag_clear(tp, PG_CLEANCHK);
1163 vm_page_wakeup(tp);
1164 break;
1166 ma[ib] = tp;
1167 --ib;
1169 ++ib; /* fixup */
1171 while (is < BLIST_MAX_ALLOC &&
1172 pi - page_base + is < object->size) {
1173 vm_page_t tp;
1175 tp = vm_page_lookup_busy_try(object, pi - page_base + is,
1176 TRUE, &error);
1177 if (error)
1178 break;
1179 if (tp == NULL)
1180 break;
1181 if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
1182 (tp->flags & PG_CLEANCHK) == 0) {
1183 vm_page_wakeup(tp);
1184 break;
1186 if ((tp->queue - tp->pc) == PQ_CACHE) {
1187 vm_page_flag_clear(tp, PG_CLEANCHK);
1188 vm_page_wakeup(tp);
1189 break;
1191 vm_page_test_dirty(tp);
1192 if ((tp->dirty & tp->valid) == 0 &&
1193 (tp->flags & PG_NEED_COMMIT) == 0) {
1194 vm_page_flag_clear(tp, PG_CLEANCHK);
1195 vm_page_wakeup(tp);
1196 break;
1198 ma[is] = tp;
1199 ++is;
1203 * All pages in the ma[] array are busied now
1205 for (i = ib; i < is; ++i) {
1206 vm_page_flag_clear(ma[i], PG_CLEANCHK);
1207 vm_page_hold(ma[i]); /* XXX need this any more? */
1209 vm_pageout_flush(&ma[ib], is - ib, pagerflags);
1210 for (i = ib; i < is; ++i) /* XXX need this any more? */
1211 vm_page_unhold(ma[i]);
1215 * Implements the madvise function at the object/page level.
1217 * MADV_WILLNEED (any object)
1219 * Activate the specified pages if they are resident.
1221 * MADV_DONTNEED (any object)
1223 * Deactivate the specified pages if they are resident.
1225 * MADV_FREE (OBJT_DEFAULT/OBJT_SWAP objects, OBJ_ONEMAPPING only)
1227 * Deactivate and clean the specified pages if they are
1228 * resident. This permits the process to reuse the pages
1229 * without faulting or the kernel to reclaim the pages
1230 * without I/O.
1232 * No requirements.
1234 void
1235 vm_object_madvise(vm_object_t object, vm_pindex_t pindex,
1236 vm_pindex_t count, int advise)
1238 vm_pindex_t end;
1239 vm_page_t m;
1240 int error;
1242 if (object == NULL)
1243 return;
1245 end = pindex + count;
1247 vm_object_hold(object);
1250 * Locate and adjust resident pages. This only applies to the
1251 * primary object in the mapping.
1253 for (; pindex < end; pindex += 1) {
1254 relookup:
1256 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
1257 * and those pages must be OBJ_ONEMAPPING.
1259 if (advise == MADV_FREE) {
1260 if ((object->type != OBJT_DEFAULT &&
1261 object->type != OBJT_SWAP) ||
1262 (object->flags & OBJ_ONEMAPPING) == 0) {
1263 continue;
1267 m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
1269 if (error) {
1270 vm_page_sleep_busy(m, TRUE, "madvpo");
1271 goto relookup;
1273 if (m == NULL) {
1275 * There may be swap even if there is no backing page
1277 if (advise == MADV_FREE && object->type == OBJT_SWAP)
1278 swap_pager_freespace(object, pindex, 1);
1279 continue;
1283 * If the page is not in a normal active state, we skip it.
1284 * If the page is not managed there are no page queues to
1285 * mess with. Things can break if we mess with pages in
1286 * any of the below states.
1288 if (m->wire_count ||
1289 (m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) ||
1290 m->valid != VM_PAGE_BITS_ALL
1292 vm_page_wakeup(m);
1293 continue;
1297 * Theoretically once a page is known not to be busy, an
1298 * interrupt cannot come along and rip it out from under us.
1300 if (advise == MADV_WILLNEED) {
1301 vm_page_activate(m);
1302 } else if (advise == MADV_DONTNEED) {
1303 vm_page_dontneed(m);
1304 } else if (advise == MADV_FREE) {
1306 * Mark the page clean. This will allow the page
1307 * to be freed up by the system. However, such pages
1308 * are often reused quickly by malloc()/free()
1309 * so we do not do anything that would cause
1310 * a page fault if we can help it.
1312 * Specifically, we do not try to actually free
1313 * the page now nor do we try to put it in the
1314 * cache (which would cause a page fault on reuse).
1316 * But we do make the page as freeable as we
1317 * can without actually taking the step of unmapping
1318 * it.
1320 pmap_clear_modify(m);
1321 m->dirty = 0;
1322 m->act_count = 0;
1323 vm_page_dontneed(m);
1324 if (object->type == OBJT_SWAP)
1325 swap_pager_freespace(object, pindex, 1);
1327 vm_page_wakeup(m);
1329 vm_object_drop(object);
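/*
 * Userland-side sketch (illustrative only) of the advice values this
 * routine ultimately services; the exact call chain from madvise(2)
 * down to here is assumed rather than shown.
 */
#if 0
	madvise(addr, len, MADV_WILLNEED);	/* activate resident pages */
	madvise(addr, len, MADV_DONTNEED);	/* deactivate resident pages */
	madvise(addr, len, MADV_FREE);		/* mark clean / drop swap */
#endif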
1333 * Removes all physical pages in the specified object range from the
1334 * object's list of pages.
1336 * No requirements.
1338 static int vm_object_page_remove_callback(vm_page_t p, void *data);
1340 void
1341 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
1342 boolean_t clean_only)
1344 struct rb_vm_page_scan_info info;
1345 int all;
1348 * Degenerate cases and assertions
1350 vm_object_hold(object);
1351 if (object == NULL ||
1352 (object->resident_page_count == 0 && object->swblock_count == 0)) {
1353 vm_object_drop(object);
1354 return;
1356 KASSERT(object->type != OBJT_PHYS,
1357 ("attempt to remove pages from a physical object"));
1360 * Indicate that paging is occurring on the object
1362 vm_object_pip_add(object, 1);
1365 * Figure out the actual removal range and whether we are removing
1366 * the entire contents of the object or not. If removing the entire
1367 * contents, be sure to get all pages, even those that might be
1368 * beyond the end of the object.
1370 info.object = object;
1371 info.start_pindex = start;
1372 if (end == 0)
1373 info.end_pindex = (vm_pindex_t)-1;
1374 else
1375 info.end_pindex = end - 1;
1376 info.limit = clean_only;
1377 info.count = 0;
1378 all = (start == 0 && info.end_pindex >= object->size - 1);
1381 * Loop until we are sure we have gotten them all.
1383 do {
1384 info.error = 0;
1385 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1386 vm_object_page_remove_callback, &info);
1387 } while (info.error);
1390 * Remove any related swap if throwing away pages, or for
1391 * non-swap objects (the swap is a clean copy in that case).
1393 if (object->type != OBJT_SWAP || clean_only == FALSE) {
1394 if (all)
1395 swap_pager_freespace_all(object);
1396 else
1397 swap_pager_freespace(object, info.start_pindex,
1398 info.end_pindex - info.start_pindex + 1);
1402 * Cleanup
1404 vm_object_pip_wakeup(object);
1405 vm_object_drop(object);
1409 * The caller must hold the object.
1411 * NOTE: User yields are allowed when removing more than one page, but not
1412 * allowed if only removing one page (the path for single page removals
1413 * might hold a spinlock).
1415 static int
1416 vm_object_page_remove_callback(vm_page_t p, void *data)
1418 struct rb_vm_page_scan_info *info = data;
1420 if (info->object != p->object ||
1421 p->pindex < info->start_pindex ||
1422 p->pindex > info->end_pindex) {
1423 kprintf("vm_object_page_remove_callbackA: obj/pg race %p/%p\n",
1424 info->object, p);
1425 return(0);
1427 if (vm_page_busy_try(p, TRUE)) {
1428 vm_page_sleep_busy(p, TRUE, "vmopar");
1429 info->error = 1;
1430 return(0);
1432 if (info->object != p->object) {
1433 /* this should never happen */
1434 kprintf("vm_object_page_remove_callbackB: obj/pg race %p/%p\n",
1435 info->object, p);
1436 vm_page_wakeup(p);
1437 return(0);
1441 * Wired pages cannot be destroyed, but they can be invalidated
1442 * and we do so if clean_only (limit) is not set.
1444 * WARNING! The page may be wired due to being part of a buffer
1445 * cache buffer, and the buffer might be marked B_CACHE.
1446 * This is fine as part of a truncation but VFSs must be
1447 * sure to fix the buffer up when re-extending the file.
1449 * NOTE! PG_NEED_COMMIT is ignored.
1451 if (p->wire_count != 0) {
1452 vm_page_protect(p, VM_PROT_NONE);
1453 if (info->limit == 0)
1454 p->valid = 0;
1455 vm_page_wakeup(p);
1456 goto done;
1460 * limit is our clean_only flag. If set and the page is dirty or
1461 * requires a commit, do not free it. If set and the page is being
1462 * held by someone, do not free it.
1464 if (info->limit && p->valid) {
1465 vm_page_test_dirty(p);
1466 if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
1467 vm_page_wakeup(p);
1468 goto done;
1473 * Destroy the page
1475 vm_page_protect(p, VM_PROT_NONE);
1476 vm_page_free(p);
1479 * Must be at end to avoid SMP races, caller holds object token
1481 done:
1482 if ((++info->count & 63) == 0)
1483 lwkt_user_yield();
1485 return(0);
1489 * Try to extend prev_object into an adjoining region of virtual
1490 * memory, return TRUE on success.
1492 * The caller does not need to hold (prev_object) but must have a stable
1493 * pointer to it (typically by holding the vm_map locked).
1495 * This function only works for anonymous memory objects which either
1496 * have (a) one reference or (b) we are extending the object's size.
1497 * Otherwise the related VM pages we want to use for the object might
1498 * be in use by another mapping.
1500 boolean_t
1501 vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
1502 vm_size_t prev_size, vm_size_t next_size)
1504 vm_pindex_t next_pindex;
1506 if (prev_object == NULL)
1507 return (TRUE);
1509 vm_object_hold(prev_object);
1511 if (prev_object->type != OBJT_DEFAULT &&
1512 prev_object->type != OBJT_SWAP) {
1513 vm_object_drop(prev_object);
1514 return (FALSE);
1517 #if 0
1518 /* caller now checks this */
1520 * Try to collapse the object first
1522 vm_object_collapse(prev_object, NULL);
1523 #endif
1525 #if 0
1526 /* caller now checks this */
1528 * We can't coalesce if we shadow another object (figuring out the
1529 * relationships become too complex).
1531 if (prev_object->backing_object != NULL) {
1532 vm_object_chain_release(prev_object);
1533 vm_object_drop(prev_object);
1534 return (FALSE);
1536 #endif
1538 prev_size >>= PAGE_SHIFT;
1539 next_size >>= PAGE_SHIFT;
1540 next_pindex = prev_pindex + prev_size;
1543 * We can't if the object has more than one ref count unless we
1544 * are extending it into newly minted space.
1546 if (prev_object->ref_count > 1 &&
1547 prev_object->size != next_pindex) {
1548 vm_object_drop(prev_object);
1549 return (FALSE);
1553 * Remove any pages that may still be in the object from a previous
1554 * deallocation.
1556 if (next_pindex < prev_object->size) {
1557 vm_object_page_remove(prev_object,
1558 next_pindex,
1559 next_pindex + next_size, FALSE);
1560 if (prev_object->type == OBJT_SWAP)
1561 swap_pager_freespace(prev_object,
1562 next_pindex, next_size);
1566 * Extend the object if necessary.
1568 if (next_pindex + next_size > prev_object->size)
1569 prev_object->size = next_pindex + next_size;
1570 vm_object_drop(prev_object);
1572 return (TRUE);
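/*
 * Illustrative caller pattern (assumed, loosely modeled on the map
 * insertion path): when a new anonymous range lands directly after an
 * existing one, the previous object can simply be grown instead of
 * allocating a new one.
 */
#if 0
	if (vm_object_coalesce(prev_object, prev_pindex,
			       prev_size, next_size)) {
		/* extend the existing mapping's object */
	} else {
		/* fall back to allocating a fresh object */
	}
#endif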
1576 * Make the object writable and flag it as being possibly dirty.
1578 * The object might not be held (or might be held but held shared),
1579 * the related vnode is probably not held either. Object and vnode are
1580 * stable by virtue of the vm_page busied by the caller preventing
1581 * destruction.
1583 * If the related mount is flagged MNTK_THR_SYNC we need to call
1584 * vsetobjdirty(). Filesystems using this option usually shortcut
1585 * synchronization by only scanning the syncer list.
1587 void
1588 vm_object_set_writeable_dirty(vm_object_t object)
1590 struct vnode *vp;
1592 /*vm_object_assert_held(object);*/
1594 * Avoid contention in vm fault path by checking the state before
1595 * issuing an atomic op on it.
1597 if ((object->flags & (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) !=
1598 (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) {
1599 vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
1601 if (object->type == OBJT_VNODE &&
1602 (vp = (struct vnode *)object->handle) != NULL) {
1603 if ((vp->v_flag & VOBJDIRTY) == 0) {
1604 if (vp->v_mount &&
1605 (vp->v_mount->mnt_kern_flag & MNTK_THR_SYNC)) {
1607 * New style THR_SYNC places vnodes on the
1608 * syncer list more deterministically.
1610 vsetobjdirty(vp);
1611 } else {
1613 * Old style scan would not necessarily place
1614 * a vnode on the syncer list when possibly
1615 * modified via mmap.
1617 vsetflags(vp, VOBJDIRTY);
1623 #include "opt_ddb.h"
1624 #ifdef DDB
1625 #include <sys/cons.h>
1627 #include <ddb/ddb.h>
1629 static int _vm_object_in_map (vm_map_t map, vm_object_t object,
1630 vm_map_entry_t entry);
1631 static int vm_object_in_map (vm_object_t object);
1634 * The caller must hold the object.
1636 static int
1637 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
1639 vm_map_backing_t ba;
1640 vm_map_t tmpm;
1641 vm_map_entry_t tmpe;
1642 int entcount;
1644 if (map == NULL)
1645 return 0;
1646 if (entry == NULL) {
1647 tmpe = RB_MIN(vm_map_rb_tree, &map->rb_root);
1648 entcount = map->nentries;
1649 while (entcount-- && tmpe) {
1650 if( _vm_object_in_map(map, object, tmpe)) {
1651 return 1;
1653 tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
1655 return (0);
1657 switch(entry->maptype) {
1658 case VM_MAPTYPE_SUBMAP:
1659 tmpm = entry->ba.sub_map;
1660 tmpe = RB_MIN(vm_map_rb_tree, &tmpm->rb_root);
1661 entcount = tmpm->nentries;
1662 while (entcount-- && tmpe) {
1663 if( _vm_object_in_map(tmpm, object, tmpe)) {
1664 return 1;
1666 tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
1668 break;
1669 case VM_MAPTYPE_NORMAL:
1670 case VM_MAPTYPE_VPAGETABLE:
1671 ba = &entry->ba;
1672 while (ba) {
1673 if (ba->object == object)
1674 return TRUE;
1675 ba = ba->backing_ba;
1677 break;
1678 default:
1679 break;
1681 return 0;
1684 static int vm_object_in_map_callback(struct proc *p, void *data);
1686 struct vm_object_in_map_info {
1687 vm_object_t object;
1688 int rv;
1692 * Debugging only
1694 static int
1695 vm_object_in_map(vm_object_t object)
1697 struct vm_object_in_map_info info;
1699 info.rv = 0;
1700 info.object = object;
1702 allproc_scan(vm_object_in_map_callback, &info, 0);
1703 if (info.rv)
1704 return 1;
1705 if( _vm_object_in_map(&kernel_map, object, 0))
1706 return 1;
1707 if( _vm_object_in_map(&pager_map, object, 0))
1708 return 1;
1709 if( _vm_object_in_map(&buffer_map, object, 0))
1710 return 1;
1711 return 0;
1715 * Debugging only
1717 static int
1718 vm_object_in_map_callback(struct proc *p, void *data)
1720 struct vm_object_in_map_info *info = data;
1722 if (p->p_vmspace) {
1723 if (_vm_object_in_map(&p->p_vmspace->vm_map, info->object, 0)) {
1724 info->rv = 1;
1725 return -1;
1728 return (0);
1731 DB_SHOW_COMMAND(vmochk, vm_object_check)
1733 struct vm_object_hash *hash;
1734 vm_object_t object;
1735 int n;
1738 * make sure that internal objs are in a map somewhere
1739 * and none have zero ref counts.
1741 for (n = 0; n < VMOBJ_HSIZE; ++n) {
1742 hash = &vm_object_hash[n];
1743 for (object = TAILQ_FIRST(&hash->list);
1744 object != NULL;
1745 object = TAILQ_NEXT(object, object_entry)) {
1746 if (object->type == OBJT_MARKER)
1747 continue;
1748 if (object->handle != NULL ||
1749 (object->type != OBJT_DEFAULT &&
1750 object->type != OBJT_SWAP)) {
1751 continue;
1753 if (object->ref_count == 0) {
1754 db_printf("vmochk: internal obj has "
1755 "zero ref count: %ld\n",
1756 (long)object->size);
1758 if (vm_object_in_map(object))
1759 continue;
1760 db_printf("vmochk: internal obj is not in a map: "
1761 "ref: %d, size: %lu: 0x%lx\n",
1762 object->ref_count, (u_long)object->size,
1763 (u_long)object->size);
1769 * Debugging only
1771 DB_SHOW_COMMAND(object, vm_object_print_static)
1773 /* XXX convert args. */
1774 vm_object_t object = (vm_object_t)addr;
1775 boolean_t full = have_addr;
1777 vm_page_t p;
1779 /* XXX count is an (unused) arg. Avoid shadowing it. */
1780 #define count was_count
1782 int count;
1784 if (object == NULL)
1785 return;
1787 db_iprintf(
1788 "Object %p: type=%d, size=0x%lx, res=%ld, ref=%d, flags=0x%x\n",
1789 object, (int)object->type, (u_long)object->size,
1790 object->resident_page_count, object->ref_count, object->flags);
1792 * XXX no %qd in kernel. Truncate object->backing_object_offset.
1794 db_iprintf("\n");
1796 if (!full)
1797 return;
1799 db_indent += 2;
1800 count = 0;
1801 RB_FOREACH(p, vm_page_rb_tree, &object->rb_memq) {
1802 if (count == 0)
1803 db_iprintf("memory:=");
1804 else if (count == 6) {
1805 db_printf("\n");
1806 db_iprintf(" ...");
1807 count = 0;
1808 } else
1809 db_printf(",");
1810 count++;
1812 db_printf("(off=0x%lx,page=0x%lx)",
1813 (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p));
1815 if (count != 0)
1816 db_printf("\n");
1817 db_indent -= 2;
1820 /* XXX. */
1821 #undef count
1824 * XXX need this non-static entry for calling from vm_map_print.
1826 * Debugging only
1828 void
1829 vm_object_print(/* db_expr_t */ long addr,
1830 boolean_t have_addr,
1831 /* db_expr_t */ long count,
1832 char *modif)
1834 vm_object_print_static(addr, have_addr, count, modif);
1838 * Debugging only
1840 DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
1842 struct vm_object_hash *hash;
1843 vm_object_t object;
1844 int nl = 0;
1845 int c;
1846 int n;
1848 for (n = 0; n < VMOBJ_HSIZE; ++n) {
1849 hash = &vm_object_hash[n];
1850 for (object = TAILQ_FIRST(&hash->list);
1851 object != NULL;
1852 object = TAILQ_NEXT(object, object_entry)) {
1853 vm_pindex_t idx, fidx;
1854 vm_pindex_t osize;
1855 vm_paddr_t pa = -1, padiff;
1856 int rcount;
1857 vm_page_t m;
1859 if (object->type == OBJT_MARKER)
1860 continue;
1861 db_printf("new object: %p\n", (void *)object);
1862 if ( nl > 18) {
1863 c = cngetc();
1864 if (c != ' ')
1865 return;
1866 nl = 0;
1868 nl++;
1869 rcount = 0;
1870 fidx = 0;
1871 osize = object->size;
1872 if (osize > 128)
1873 osize = 128;
1874 for (idx = 0; idx < osize; idx++) {
1875 m = vm_page_lookup(object, idx);
1876 if (m == NULL) {
1877 if (rcount) {
1878 db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
1879 (long)fidx, rcount, (long)pa);
1880 if ( nl > 18) {
1881 c = cngetc();
1882 if (c != ' ')
1883 return;
1884 nl = 0;
1886 nl++;
1887 rcount = 0;
1889 continue;
1892 if (rcount &&
1893 (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
1894 ++rcount;
1895 continue;
1897 if (rcount) {
1898 padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m);
1899 padiff >>= PAGE_SHIFT;
1900 padiff &= PQ_L2_MASK;
1901 if (padiff == 0) {
1902 pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE;
1903 ++rcount;
1904 continue;
1906 db_printf(" index(%ld)run(%d)pa(0x%lx)",
1907 (long)fidx, rcount, (long)pa);
1908 db_printf("pd(%ld)\n", (long)padiff);
1909 if ( nl > 18) {
1910 c = cngetc();
1911 if (c != ' ')
1912 return;
1913 nl = 0;
1915 nl++;
1917 fidx = idx;
1918 pa = VM_PAGE_TO_PHYS(m);
1919 rcount = 1;
1921 if (rcount) {
1922 db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
1923 (long)fidx, rcount, (long)pa);
1924 if ( nl > 18) {
1925 c = cngetc();
1926 if (c != ' ')
1927 return;
1928 nl = 0;
1930 nl++;
1935 #endif /* DDB */