sys/vm/vm_map.c

   1 /*
   2  * Copyright (c) 1991, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  * Copyright (c) 2003-2022 The DragonFly Project.  All rights reserved.
   5  *
   6  * This code is derived from software contributed to Berkeley by
   7  * The Mach Operating System project at Carnegie-Mellon University.
   8  *
   9  * This code is derived from software contributed to The DragonFly Project
  10  * by Matthew Dillon <dillon@backplane.com>
  11  *
  12  * Redistribution and use in source and binary forms, with or without
  13  * modification, are permitted provided that the following conditions
  14  * are met:
  15  * 1. Redistributions of source code must retain the above copyright
  16  *    notice, this list of conditions and the following disclaimer.
  17  * 2. Redistributions in binary form must reproduce the above copyright
  18  *    notice, this list of conditions and the following disclaimer in the
  19  *    documentation and/or other materials provided with the distribution.
  20  * 3. Neither the name of the University nor the names of its contributors
  21  *    may be used to endorse or promote products derived from this software
  22  *    without specific prior written permission.
  23  *
  24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  34  * SUCH DAMAGE.
  35  *
  36  *      from: @(#)vm_map.c      8.3 (Berkeley) 1/12/94
  37  *
  38  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  39  * All rights reserved.
  40  *
  41  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  42  *
  43  * Permission to use, copy, modify and distribute this software and
  44  * its documentation is hereby granted, provided that both the copyright
  45  * notice and this permission notice appear in all copies of the
  46  * software, derivative works or modified versions, and any portions
  47  * thereof, and that both notices appear in supporting documentation.
  48  *
  49  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  50  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  51  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  52  *
  53  * Carnegie Mellon requests users of this software to return to
  54  *
  55  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  56  *  School of Computer Science
  57  *  Carnegie Mellon University
  58  *  Pittsburgh PA 15213-3890
  59  *
  60  * any improvements or extensions that they make and grant Carnegie the
  61  * rights to redistribute these changes.
  62  */
  63 #include <sys/param.h>
  64 #include <sys/systm.h>
  65 #include <sys/kernel.h>
  66 #include <sys/proc.h>
  67 #include <sys/serialize.h>
  68 #include <sys/lock.h>
  69 #include <sys/vmmeter.h>
  70 #include <sys/mman.h>
  71 #include <sys/vnode.h>
  72 #include <sys/resourcevar.h>
  73 #include <sys/shm.h>
  74 #include <sys/tree.h>
  75 #include <sys/malloc.h>
  76 #include <sys/objcache.h>
  77 #include <sys/kern_syscall.h>
  78
  79 #include <vm/vm.h>
  80 #include <vm/vm_param.h>
  81 #include <vm/pmap.h>
  82 #include <vm/vm_map.h>
  83 #include <vm/vm_page.h>
  84 #include <vm/vm_object.h>
  85 #include <vm/vm_pager.h>
  86 #include <vm/vm_kern.h>
  87 #include <vm/vm_extern.h>
  88 #include <vm/swap_pager.h>
  89 #include <vm/vm_zone.h>
  90
  91 #include <sys/random.h>
  92 #include <sys/sysctl.h>
  93 #include <sys/spinlock.h>
  94
  95 #include <sys/thread2.h>
  96 #include <sys/spinlock2.h>
  97
  98 /*
  99  * Virtual memory maps provide for the mapping, protection, and sharing
 100  * of virtual memory objects.  In addition, this module provides for an
 101  * efficient virtual copy of memory from one map to another.
 102  *
 103  * Synchronization is required prior to most operations.
 104  *
 105  * Maps consist of an ordered doubly-linked list of simple entries.
 106  * A hint and a RB tree is used to speed-up lookups.
 107  *
 108  * Callers looking to modify maps specify start/end addresses which cause
 109  * the related map entry to be clipped if necessary, and then later
 110  * recombined if the pieces remained compatible.
 111  *
 112  * Virtual copy operations are performed by copying VM object references
 113  * from one map to another, and then marking both regions as copy-on-write.
 114  */
 115 static boolean_t vmspace_ctor(void *obj, void *privdata, int ocflags);
 116 static void vmspace_dtor(void *obj, void *privdata);
 117 static void vmspace_terminate(struct vmspace *vm, int final);
 118
 119 MALLOC_DEFINE(M_VMSPACE, "vmspace", "vmspace objcache backingstore");
 120 MALLOC_DEFINE(M_MAP_BACKING, "map_backing", "vm_map_backing to entry");
 121 static struct objcache *vmspace_cache;
 122
 123 /*
 124  * per-cpu page table cross mappings are initialized in early boot
 125  * and might require a considerable number of vm_map_entry structures.
 126  */
 127 #define MAPENTRYBSP_CACHE       (MAXCPU+1)
 128 #define MAPENTRYAP_CACHE        8
 129
 130 /*
 131  * Partioning threaded programs with large anonymous memory areas can
 132  * improve concurrent fault performance.
 133  */
 134 #define MAP_ENTRY_PARTITION_SIZE        ((vm_offset_t)(32 * 1024 * 1024))
 135 #define MAP_ENTRY_PARTITION_MASK        (MAP_ENTRY_PARTITION_SIZE - 1)
 136
 137 #define VM_MAP_ENTRY_WITHIN_PARTITION(entry)    \
 138         ((((entry)->ba.start ^ (entry)->ba.end) & ~MAP_ENTRY_PARTITION_MASK) == 0)
 139
 140 static struct vm_zone mapentzone_store;
 141 __read_mostly static vm_zone_t mapentzone;
 142
 143 static struct vm_map_entry map_entry_init[MAX_MAPENT];
 144 static struct vm_map_entry cpu_map_entry_init_bsp[MAPENTRYBSP_CACHE];
 145 static struct vm_map_entry cpu_map_entry_init_ap[MAXCPU][MAPENTRYAP_CACHE];
 146
 147 __read_mostly static int randomize_mmap;
 148 SYSCTL_INT(_vm, OID_AUTO, randomize_mmap, CTLFLAG_RW, &randomize_mmap, 0,
 149     "Randomize mmap offsets");
 150 __read_mostly static int vm_map_relock_enable = 1;
 151 SYSCTL_INT(_vm, OID_AUTO, map_relock_enable, CTLFLAG_RW,
 152            &vm_map_relock_enable, 0, "insert pop pgtable optimization");
 153 __read_mostly static int vm_map_partition_enable = 1;
 154 SYSCTL_INT(_vm, OID_AUTO, map_partition_enable, CTLFLAG_RW,
 155            &vm_map_partition_enable, 0, "Break up larger vm_map_entry's");
 156 __read_mostly static int vm_map_backing_limit = 5;
 157 SYSCTL_INT(_vm, OID_AUTO, map_backing_limit, CTLFLAG_RW,
 158            &vm_map_backing_limit, 0, "ba.backing_ba link depth");
 159 __read_mostly static int vm_map_backing_shadow_test = 1;
 160 SYSCTL_INT(_vm, OID_AUTO, map_backing_shadow_test, CTLFLAG_RW,
 161            &vm_map_backing_shadow_test, 0, "ba.object shadow test");
 162
 163 static void vmspace_drop_notoken(struct vmspace *vm);
 164 static void vm_map_entry_shadow(vm_map_entry_t entry);
 165 static vm_map_entry_t vm_map_entry_create(int *);
 166 static void vm_map_entry_dispose (vm_map_t map, vm_map_entry_t entry, int *);
 167 static void vm_map_entry_dispose_ba (vm_map_entry_t entry, vm_map_backing_t ba);
 168 static void vm_map_backing_replicated(vm_map_t map,
 169                 vm_map_entry_t entry, int flags);
 170 static void vm_map_backing_adjust_start(vm_map_entry_t entry,
 171                 vm_ooffset_t start);
 172 static void vm_map_backing_adjust_end(vm_map_entry_t entry,
 173                 vm_ooffset_t end);
 174 static void vm_map_backing_attach (vm_map_entry_t entry, vm_map_backing_t ba);
 175 static void vm_map_backing_detach (vm_map_entry_t entry, vm_map_backing_t ba);
 176 static void _vm_map_clip_end (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
 177 static void _vm_map_clip_start (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
 178 static void vm_map_entry_delete (vm_map_t, vm_map_entry_t, int *);
 179 static void vm_map_entry_unwire (vm_map_t, vm_map_entry_t);
 180 static void vm_map_copy_entry (vm_map_t, vm_map_t, vm_map_entry_t,
 181                 vm_map_entry_t);
 182 static void vm_map_unclip_range (vm_map_t map, vm_map_entry_t start_entry,
 183                 vm_offset_t start, vm_offset_t end, int *countp, int flags);
 184 static void vm_map_entry_partition(vm_map_t map, vm_map_entry_t entry,
 185                 vm_offset_t vaddr, int *countp);
 186
 187 #define MAP_BACK_CLIPPED        0x0001
 188 #define MAP_BACK_BASEOBJREFD    0x0002
 189
 190 /*
 191  * Initialize the vm_map module.  Must be called before any other vm_map
 192  * routines.
 193  *
 194  * Map and entry structures are allocated from the general purpose
 195  * memory pool with some exceptions:
 196  *
 197  *      - The kernel map is allocated statically.
 198  *      - Initial kernel map entries are allocated out of a static pool.
 199  *      - We must set ZONE_SPECIAL here or the early boot code can get
 200  *        stuck if there are >63 cores.
 201  *
 202  *      These restrictions are necessary since malloc() uses the
 203  *      maps and requires map entries.
 204  *
 205  * Called from the low level boot code only.
 206  */
 207 void
 208 vm_map_startup(void)
 209 {
 210         mapentzone = &mapentzone_store;
 211         zbootinit(mapentzone, "MAP ENTRY", sizeof (struct vm_map_entry),
 212                   map_entry_init, MAX_MAPENT);
 213         mapentzone_store.zflags |= ZONE_SPECIAL;
 214 }
 215
 216 /*
 217  * Called prior to any vmspace allocations.
 218  *
 219  * Called from the low level boot code only.
 220  */
 221 void
 222 vm_init2(void)
 223 {
 224         vmspace_cache = objcache_create_mbacked(M_VMSPACE,
 225                                                 sizeof(struct vmspace),
 226                                                 0, ncpus * 4,
 227                                                 vmspace_ctor, vmspace_dtor,
 228                                                 NULL);
 229         zinitna(mapentzone, NULL, 0, 0, ZONE_USE_RESERVE | ZONE_SPECIAL);
 230         pmap_init2();
 231         vm_object_init2();
 232 }
 233
 234 /*
 235  * objcache support.  We leave the pmap root cached as long as possible
 236  * for performance reasons.
 237  */
 238 static
 239 boolean_t
 240 vmspace_ctor(void *obj, void *privdata, int ocflags)
 241 {
 242         struct vmspace *vm = obj;
 243
 244         bzero(vm, sizeof(*vm));
 245         vm->vm_refcnt = VM_REF_DELETED;
 246
 247         return 1;
 248 }
 249
 250 static
 251 void
 252 vmspace_dtor(void *obj, void *privdata)
 253 {
 254         struct vmspace *vm = obj;
 255
 256         KKASSERT(vm->vm_refcnt == VM_REF_DELETED);
 257         pmap_puninit(vmspace_pmap(vm));
 258 }
 259
 260 /*
 261  * Red black tree functions
 262  *
 263  * The caller must hold the related map lock.
 264  */
 265 static int rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b);
 266 RB_GENERATE(vm_map_rb_tree, vm_map_entry, rb_entry, rb_vm_map_compare);
 267
 268 /* a->ba.start is address, and the only field which must be initialized */
 269 static int
 270 rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b)
 271 {
 272         if (a->ba.start < b->ba.start)
 273                 return(-1);
 274         else if (a->ba.start > b->ba.start)
 275                 return(1);
 276         return(0);
 277 }
 278
 279 /*
 280  * Initialize vmspace ref/hold counts vmspace0.  There is a holdcnt for
 281  * every refcnt.
 282  */
 283 void
 284 vmspace_initrefs(struct vmspace *vm)
 285 {
 286         vm->vm_refcnt = 1;
 287         vm->vm_holdcnt = 1;
 288 }
 289
 290 /*
 291  * Allocate a vmspace structure, including a vm_map and pmap.
 292  * Initialize numerous fields.  While the initial allocation is zerod,
 293  * subsequence reuse from the objcache leaves elements of the structure
 294  * intact (particularly the pmap), so portions must be zerod.
 295  *
 296  * Returns a referenced vmspace.
 297  *
 298  * No requirements.
 299  */
 300 struct vmspace *
 301 vmspace_alloc(vm_offset_t min, vm_offset_t max)
 302 {
 303         struct vmspace *vm;
 304
 305         vm = objcache_get(vmspace_cache, M_WAITOK);
 306
 307         bzero(&vm->vm_startcopy,
 308               (char *)&vm->vm_endcopy - (char *)&vm->vm_startcopy);
 309         vm_map_init(&vm->vm_map, min, max, NULL);       /* initializes token */
 310
 311         /*
 312          * NOTE: hold to acquires token for safety.
 313          *
 314          * On return vmspace is referenced (refs=1, hold=1).  That is,
 315          * each refcnt also has a holdcnt.  There can be additional holds
 316          * (holdcnt) above and beyond the refcnt.  Finalization is handled in
 317          * two stages, one on refs 1->0, and the the second on hold 1->0.
 318          */
 319         KKASSERT(vm->vm_holdcnt == 0);
 320         KKASSERT(vm->vm_refcnt == VM_REF_DELETED);
 321         vmspace_initrefs(vm);
 322         vmspace_hold(vm);
 323         pmap_pinit(vmspace_pmap(vm));           /* (some fields reused) */
 324         vm->vm_map.pmap = vmspace_pmap(vm);     /* XXX */
 325         vm->vm_shm = NULL;
 326         vm->vm_flags = 0;
 327         cpu_vmspace_alloc(vm);
 328         vmspace_drop(vm);
 329
 330         return (vm);
 331 }
 332
 333 /*
 334  * NOTE: Can return 0 if the vmspace is exiting.
 335  */
 336 int
 337 vmspace_getrefs(struct vmspace *vm)
 338 {
 339         int32_t n;
 340
 341         n = vm->vm_refcnt;
 342         cpu_ccfence();
 343         if (n & VM_REF_DELETED)
 344                 n = -1;
 345         return n;
 346 }
 347
 348 void
 349 vmspace_hold(struct vmspace *vm)
 350 {
 351         atomic_add_int(&vm->vm_holdcnt, 1);
 352         lwkt_gettoken(&vm->vm_map.token);
 353 }
 354
 355 /*
 356  * Drop with final termination interlock.
 357  */
 358 void
 359 vmspace_drop(struct vmspace *vm)
 360 {
 361         lwkt_reltoken(&vm->vm_map.token);
 362         vmspace_drop_notoken(vm);
 363 }
 364
 365 static void
 366 vmspace_drop_notoken(struct vmspace *vm)
 367 {
 368         if (atomic_fetchadd_int(&vm->vm_holdcnt, -1) == 1) {
 369                 if (vm->vm_refcnt & VM_REF_DELETED)
 370                         vmspace_terminate(vm, 1);
 371         }
 372 }
 373
 374 /*
 375  * A vmspace object must not be in a terminated state to be able to obtain
 376  * additional refs on it.
 377  *
 378  * These are official references to the vmspace, the count is used to check
 379  * for vmspace sharing.  Foreign accessors should use 'hold' and not 'ref'.
 380  *
 381  * XXX we need to combine hold & ref together into one 64-bit field to allow
 382  * holds to prevent stage-1 termination.
 383  */
 384 void
 385 vmspace_ref(struct vmspace *vm)
 386 {
 387         uint32_t n;
 388
 389         atomic_add_int(&vm->vm_holdcnt, 1);
 390         n = atomic_fetchadd_int(&vm->vm_refcnt, 1);
 391         KKASSERT((n & VM_REF_DELETED) == 0);
 392 }
 393
 394 /*
 395  * Release a ref on the vmspace.  On the 1->0 transition we do stage-1
 396  * termination of the vmspace.  Then, on the final drop of the hold we
 397  * will do stage-2 final termination.
 398  */
 399 void
 400 vmspace_rel(struct vmspace *vm)
 401 {
 402         uint32_t n;
 403
 404         /*
 405          * Drop refs.  Each ref also has a hold which is also dropped.
 406          *
 407          * When refs hits 0 compete to get the VM_REF_DELETED flag (hold
 408          * prevent finalization) to start termination processing.
 409          * Finalization occurs when the last hold count drops to 0.
 410          */
 411         n = atomic_fetchadd_int(&vm->vm_refcnt, -1) - 1;
 412         while (n == 0) {
 413                 if (atomic_cmpset_int(&vm->vm_refcnt, 0, VM_REF_DELETED)) {
 414                         vmspace_terminate(vm, 0);
 415                         break;
 416                 }
 417                 n = vm->vm_refcnt;
 418                 cpu_ccfence();
 419         }
 420         vmspace_drop_notoken(vm);
 421 }
 422
 423 /*
 424  * This is called during exit indicating that the vmspace is no
 425  * longer in used by an exiting process, but the process has not yet
 426  * been reaped.
 427  *
 428  * We drop refs, allowing for stage-1 termination, but maintain a holdcnt
 429  * to prevent stage-2 until the process is reaped.  Note hte order of
 430  * operation, we must hold first.
 431  *
 432  * No requirements.
 433  */
 434 void
 435 vmspace_relexit(struct vmspace *vm)
 436 {
 437         atomic_add_int(&vm->vm_holdcnt, 1);
 438         vmspace_rel(vm);
 439 }
 440
 441 /*
 442  * Called during reap to disconnect the remainder of the vmspace from
 443  * the process.  On the hold drop the vmspace termination is finalized.
 444  *
 445  * No requirements.
 446  */
 447 void
 448 vmspace_exitfree(struct proc *p)
 449 {
 450         struct vmspace *vm;
 451
 452         vm = p->p_vmspace;
 453         p->p_vmspace = NULL;
 454         vmspace_drop_notoken(vm);
 455 }
 456
 457 /*
 458  * Called in two cases:
 459  *
 460  * (1) When the last refcnt is dropped and the vmspace becomes inactive,
 461  *     called with final == 0.  refcnt will be (u_int)-1 at this point,
 462  *     and holdcnt will still be non-zero.
 463  *
 464  * (2) When holdcnt becomes 0, called with final == 1.  There should no
 465  *     longer be anyone with access to the vmspace.
 466  *
 467  * VMSPACE_EXIT1 flags the primary deactivation
 468  * VMSPACE_EXIT2 flags the last reap
 469  */
 470 static void
 471 vmspace_terminate(struct vmspace *vm, int final)
 472 {
 473         int count;
 474
 475         lwkt_gettoken(&vm->vm_map.token);
 476         if (final == 0) {
 477                 KKASSERT((vm->vm_flags & VMSPACE_EXIT1) == 0);
 478                 vm->vm_flags |= VMSPACE_EXIT1;
 479
 480                 /*
 481                  * Get rid of most of the resources.  Leave the kernel pmap
 482                  * intact.
 483                  *
 484                  * If the pmap does not contain wired pages we can bulk-delete
 485                  * the pmap as a performance optimization before removing the
 486                  * related mappings.
 487                  *
 488                  * If the pmap contains wired pages we cannot do this
 489                  * pre-optimization because currently vm_fault_unwire()
 490                  * expects the pmap pages to exist and will not decrement
 491                  * p->wire_count if they do not.
 492                  */
 493                 shmexit(vm);
 494                 if (vmspace_pmap(vm)->pm_stats.wired_count) {
 495                         vm_map_remove(&vm->vm_map, VM_MIN_USER_ADDRESS,
 496                                       VM_MAX_USER_ADDRESS);
 497                         pmap_remove_pages(vmspace_pmap(vm), VM_MIN_USER_ADDRESS,
 498                                           VM_MAX_USER_ADDRESS);
 499                 } else {
 500                         pmap_remove_pages(vmspace_pmap(vm), VM_MIN_USER_ADDRESS,
 501                                           VM_MAX_USER_ADDRESS);
 502                         vm_map_remove(&vm->vm_map, VM_MIN_USER_ADDRESS,
 503                                       VM_MAX_USER_ADDRESS);
 504                 }
 505                 lwkt_reltoken(&vm->vm_map.token);
 506         } else {
 507                 KKASSERT((vm->vm_flags & VMSPACE_EXIT1) != 0);
 508                 KKASSERT((vm->vm_flags & VMSPACE_EXIT2) == 0);
 509
 510                 /*
 511                  * Get rid of remaining basic resources.
 512                  */
 513                 vm->vm_flags |= VMSPACE_EXIT2;
 514                 shmexit(vm);
 515
 516                 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
 517                 vm_map_lock(&vm->vm_map);
 518                 cpu_vmspace_free(vm);
 519
 520                 /*
 521                  * Lock the map, to wait out all other references to it.
 522                  * Delete all of the mappings and pages they hold, then call
 523                  * the pmap module to reclaim anything left.
 524                  */
 525                 vm_map_delete(&vm->vm_map,
 526                               vm_map_min(&vm->vm_map),
 527                               vm_map_max(&vm->vm_map),
 528                               &count);
 529                 vm_map_unlock(&vm->vm_map);
 530                 vm_map_entry_release(count);
 531
 532                 pmap_release(vmspace_pmap(vm));
 533                 lwkt_reltoken(&vm->vm_map.token);
 534                 objcache_put(vmspace_cache, vm);
 535         }
 536 }
 537
 538 /*
 539  * Swap useage is determined by taking the proportional swap used by
 540  * VM objects backing the VM map.  To make up for fractional losses,
 541  * if the VM object has any swap use at all the associated map entries
 542  * count for at least 1 swap page.
 543  *
 544  * No requirements.
 545  */
 546 vm_offset_t
 547 vmspace_swap_count(struct vmspace *vm)
 548 {
 549         vm_map_t map = &vm->vm_map;
 550         vm_map_entry_t cur;
 551         vm_object_t object;
 552         vm_offset_t count = 0;
 553         vm_offset_t n;
 554
 555         vmspace_hold(vm);
 556
 557         RB_FOREACH(cur, vm_map_rb_tree, &map->rb_root) {
 558                 switch(cur->maptype) {
 559                 case VM_MAPTYPE_NORMAL:
 560                         if ((object = cur->ba.object) == NULL)
 561                                 break;
 562                         if (object->swblock_count) {
 563                                 n = (cur->ba.end - cur->ba.start) / PAGE_SIZE;
 564                                 count += object->swblock_count *
 565                                     SWAP_META_PAGES * n / object->size + 1;
 566                         }
 567                         break;
 568                 default:
 569                         break;
 570                 }
 571         }
 572         vmspace_drop(vm);
 573
 574         return(count);
 575 }
 576
 577 /*
 578  * Calculate the approximate number of anonymous pages in use by
 579  * this vmspace.  To make up for fractional losses, we count each
 580  * VM object as having at least 1 anonymous page.
 581  *
 582  * No requirements.
 583  */
 584 vm_offset_t
 585 vmspace_anonymous_count(struct vmspace *vm)
 586 {
 587         vm_map_t map = &vm->vm_map;
 588         vm_map_entry_t cur;
 589         vm_object_t object;
 590         vm_offset_t count = 0;
 591
 592         vmspace_hold(vm);
 593         RB_FOREACH(cur, vm_map_rb_tree, &map->rb_root) {
 594                 switch(cur->maptype) {
 595                 case VM_MAPTYPE_NORMAL:
 596                         if ((object = cur->ba.object) == NULL)
 597                                 break;
 598                         if (object->type != OBJT_DEFAULT &&
 599                             object->type != OBJT_SWAP) {
 600                                 break;
 601                         }
 602                         count += object->resident_page_count;
 603                         break;
 604                 default:
 605                         break;
 606                 }
 607         }
 608         vmspace_drop(vm);
 609
 610         return(count);
 611 }
 612
 613 /*
 614  * Initialize an existing vm_map structure such as that in the vmspace
 615  * structure.  The pmap is initialized elsewhere.
 616  *
 617  * No requirements.
 618  */
 619 void
 620 vm_map_init(struct vm_map *map, vm_offset_t min_addr, vm_offset_t max_addr,
 621             pmap_t pmap)
 622 {
 623         RB_INIT(&map->rb_root);
 624         spin_init(&map->ilock_spin, "ilock");
 625         map->ilock_base = NULL;
 626         map->nentries = 0;
 627         map->size = 0;
 628         map->system_map = 0;
 629         vm_map_min(map) = min_addr;
 630         vm_map_max(map) = max_addr;
 631         map->pmap = pmap;
 632         map->timestamp = 0;
 633         map->flags = 0;
 634         bzero(&map->freehint, sizeof(map->freehint));
 635         lwkt_token_init(&map->token, "vm_map");
 636         lockinit(&map->lock, "vm_maplk", (hz + 9) / 10, 0);
 637 }
 638
 639 /*
 640  * Find the first possible free address for the specified request length.
 641  * Returns 0 if we don't have one cached.
 642  */
 643 static
 644 vm_offset_t
 645 vm_map_freehint_find(vm_map_t map, vm_size_t length, vm_size_t align)
 646 {
 647         vm_map_freehint_t *scan;
 648
 649         scan = &map->freehint[0];
 650         while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
 651                 if (scan->length == length && scan->align == align)
 652                         return(scan->start);
 653                 ++scan;
 654         }
 655         return 0;
 656 }
 657
 658 /*
 659  * Unconditionally set the freehint.  Called by vm_map_findspace() after
 660  * it finds an address.  This will help us iterate optimally on the next
 661  * similar findspace.
 662  */
 663 static
 664 void
 665 vm_map_freehint_update(vm_map_t map, vm_offset_t start,
 666                        vm_size_t length, vm_size_t align)
 667 {
 668         vm_map_freehint_t *scan;
 669
 670         scan = &map->freehint[0];
 671         while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
 672                 if (scan->length == length && scan->align == align) {
 673                         scan->start = start;
 674                         return;
 675                 }
 676                 ++scan;
 677         }
 678         scan = &map->freehint[map->freehint_newindex & VM_MAP_FFMASK];
 679         scan->start = start;
 680         scan->align = align;
 681         scan->length = length;
 682         ++map->freehint_newindex;
 683 }
 684
 685 /*
 686  * Update any existing freehints (for any alignment), for the hole we just
 687  * added.
 688  */
 689 static
 690 void
 691 vm_map_freehint_hole(vm_map_t map, vm_offset_t start, vm_size_t length)
 692 {
 693         vm_map_freehint_t *scan;
 694
 695         scan = &map->freehint[0];
 696         while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
 697                 if (scan->length <= length && scan->start > start)
 698                         scan->start = start;
 699                 ++scan;
 700         }
 701 }
 702
 703 /*
 704  * This function handles MAP_ENTRY_NEEDS_COPY by inserting a fronting
 705  * object in the entry for COW faults.
 706  *
 707  * The entire chain including entry->ba (prior to inserting the fronting
 708  * object) essentially becomes set in stone... elements of it can be paged
 709  * in or out, but cannot be further modified.
 710  *
 711  * NOTE: If we do not optimize the backing chain then a unique copy is not
 712  *       needed.  Note, however, that because portions of the chain are
 713  *       shared across pmaps we cannot make any changes to the vm_map_backing
 714  *       elements themselves.
 715  *
 716  * If the map segment is governed by a virtual page table then it is
 717  * possible to address offsets beyond the mapped area.  Just allocate
 718  * a maximally sized object for this case.
 719  *
 720  * If addref is non-zero an additional reference is added to the returned
 721  * entry.  This mechanic exists because the additional reference might have
 722  * to be added atomically and not after return to prevent a premature
 723  * collapse.  XXX currently there is no collapse code.
 724  *
 725  * The vm_map must be exclusively locked.
 726  * No other requirements.
 727  */
 728 static
 729 void
 730 vm_map_entry_shadow(vm_map_entry_t entry)
 731 {
 732         vm_map_backing_t ba;
 733         vm_size_t length;
 734         vm_object_t source;
 735         vm_object_t result;
 736
 737         /*
 738          * Number of bytes we have to shadow
 739          */
 740         length = atop(entry->ba.end - entry->ba.start);
 741
 742         /*
 743          * Don't create the new object if the old object isn't shared.
 744          * This case occurs quite often when programs fork/exec/wait.
 745          *
 746          * Caller ensures source exists (all backing_ba's must have objects),
 747          * typically indirectly by virtue of the NEEDS_COPY flag being set.
 748          * We have a ref on source by virtue of the entry and do not need
 749          * to lock it to do this test.
 750          */
 751         source = entry->ba.object;
 752         KKASSERT(source);
 753
 754         if (source->type != OBJT_VNODE) {
 755                 if (source->ref_count == 1 &&
 756                     source->handle == NULL &&
 757                     (source->type == OBJT_DEFAULT ||
 758                      source->type == OBJT_SWAP)) {
 759                         goto done;
 760                 }
 761         }
 762         ba = kmalloc(sizeof(*ba), M_MAP_BACKING, M_INTWAIT); /* copied later */
 763         vm_object_hold_shared(source);
 764
 765         /*
 766          * Once it becomes part of a backing_ba chain it can wind up anywhere,
 767          * drop the ONEMAPPING flag now.
 768          */
 769         vm_object_clear_flag(source, OBJ_ONEMAPPING);
 770
 771         /*
 772          * Allocate a new object with the given length.  The new object
 773          * is returned referenced but we may have to add another one.
 774          * If we are adding a second reference we must clear OBJ_ONEMAPPING.
 775          * (typically because the caller is about to clone a vm_map_entry).
 776          *
 777          * The source object currently has an extra reference to prevent
 778          * collapses into it while we mess with its shadow list, which
 779          * we will remove later in this routine.
 780          *
 781          * The target object may require a second reference if asked for one
 782          * by the caller.
 783          */
 784         result = vm_object_allocate_hold(OBJT_DEFAULT, length);
 785         if (result == NULL)
 786                 panic("vm_object_shadow: no object for shadowing");
 787
 788         /*
 789          * The new object shadows the source object.
 790          *
 791          * Try to optimize the result object's page color when shadowing
 792          * in order to maintain page coloring consistency in the combined
 793          * shadowed object.
 794          *
 795          * The source object is moved to ba, retaining its existing ref-count.
 796          * No additional ref is needed.
 797          *
 798          * SHADOWING IS NOT APPLICABLE TO OBJT_VNODE OBJECTS
 799          */
 800         vm_map_backing_detach(entry, &entry->ba);
 801         *ba = entry->ba;                /* previous ba */
 802         entry->ba.object = result;      /* new ba (at head of entry) */
 803         entry->ba.backing_ba = ba;
 804         entry->ba.backing_count = ba->backing_count + 1;
 805         entry->ba.offset = 0;
 806
 807         /* cpu localization twist */
 808         result->pg_color = vm_quickcolor();
 809
 810         vm_map_backing_attach(entry, &entry->ba);
 811         vm_map_backing_attach(entry, ba);
 812
 813         /*
 814          * Adjust the return storage.  Drop the ref on source before
 815          * returning.
 816          */
 817         vm_object_drop(result);
 818         vm_object_drop(source);
 819 done:
 820         entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
 821 }
 822
 823 /*
 824  * Allocate an object for a vm_map_entry.
 825  *
 826  * Object allocation for anonymous mappings is defered as long as possible.
 827  * This function is called when we can defer no longer, generally when a map
 828  * entry might be split or forked or takes a page fault.
 829  *
 830  * If the map segment is governed by a virtual page table then it is
 831  * possible to address offsets beyond the mapped area.  Just allocate
 832  * a maximally sized object for this case.
 833  *
 834  * The vm_map must be exclusively locked.
 835  * No other requirements.
 836  */
 837 void
 838 vm_map_entry_allocate_object(vm_map_entry_t entry)
 839 {
 840         vm_object_t obj;
 841
 842         /*
 843          * ba.offset is NOT cumulatively added in the backing_ba scan like
 844          * it was in the old object chain, so we can assign whatever offset
 845          * we like to the new object.
 846          *
 847          * For now assign a value of 0 to make debugging object sizes
 848          * easier.
 849          */
 850         entry->ba.offset = 0;
 851
 852         obj = vm_object_allocate(OBJT_DEFAULT,
 853                                  atop(entry->ba.end - entry->ba.start) +
 854                                  entry->ba.offset);
 855         entry->ba.object = obj;
 856         vm_map_backing_attach(entry, &entry->ba);
 857 }
 858
 859 /*
 860  * Set an initial negative count so the first attempt to reserve
 861  * space preloads a bunch of vm_map_entry's for this cpu.  Also
 862  * pre-allocate 2 vm_map_entries which will be needed by zalloc() to
 863  * map a new page for vm_map_entry structures.  SMP systems are
 864  * particularly sensitive.
 865  *
 866  * This routine is called in early boot so we cannot just call
 867  * vm_map_entry_reserve().
 868  *
 869  * Called from the low level boot code only (for each cpu)
 870  *
 871  * WARNING! Take care not to have too-big a static/BSS structure here
 872  *          as MAXCPU can be 256+, otherwise the loader's 64MB heap
 873  *          can get blown out by the kernel plus the initrd image.
 874  */
 875 void
 876 vm_map_entry_reserve_cpu_init(globaldata_t gd)
 877 {
 878         vm_map_entry_t entry;
 879         int count;
 880         int i;
 881
 882         atomic_add_int(&gd->gd_vme_avail, -MAP_RESERVE_COUNT * 2);
 883         if (gd->gd_cpuid == 0) {
 884                 entry = &cpu_map_entry_init_bsp[0];
 885                 count = MAPENTRYBSP_CACHE;
 886         } else {
 887                 entry = &cpu_map_entry_init_ap[gd->gd_cpuid][0];
 888                 count = MAPENTRYAP_CACHE;
 889         }
 890         for (i = 0; i < count; ++i, ++entry) {
 891                 MAPENT_FREELIST(entry) = gd->gd_vme_base;
 892                 gd->gd_vme_base = entry;
 893         }
 894 }
 895
 896 /*
 897  * Reserves vm_map_entry structures so code later-on can manipulate
 898  * map_entry structures within a locked map without blocking trying
 899  * to allocate a new vm_map_entry.
 900  *
 901  * No requirements.
 902  *
 903  * WARNING!  We must not decrement gd_vme_avail until after we have
 904  *           ensured that sufficient entries exist, otherwise we can
 905  *           get into an endless call recursion in the zalloc code
 906  *           itself.
 907  */
 908 int
 909 vm_map_entry_reserve(int count)
 910 {
 911         struct globaldata *gd = mycpu;
 912         vm_map_entry_t entry;
 913
 914         /*
 915          * Make sure we have enough structures in gd_vme_base to handle
 916          * the reservation request.
 917          *
 918          * Use a critical section to protect against VM faults.  It might
 919          * not be needed, but we have to be careful here.
 920          */
 921         if (gd->gd_vme_avail < count) {
 922                 crit_enter();
 923                 while (gd->gd_vme_avail < count) {
 924                         entry = zalloc(mapentzone);
 925                         MAPENT_FREELIST(entry) = gd->gd_vme_base;
 926                         gd->gd_vme_base = entry;
 927                         atomic_add_int(&gd->gd_vme_avail, 1);
 928                 }
 929                 crit_exit();
 930         }
 931         atomic_add_int(&gd->gd_vme_avail, -count);
 932
 933         return(count);
 934 }
 935
 936 /*
 937  * Releases previously reserved vm_map_entry structures that were not
 938  * used.  If we have too much junk in our per-cpu cache clean some of
 939  * it out.
 940  *
 941  * No requirements.
 942  */
 943 void
 944 vm_map_entry_release(int count)
 945 {
 946         struct globaldata *gd = mycpu;
 947         vm_map_entry_t entry;
 948         vm_map_entry_t efree;
 949
 950         count = atomic_fetchadd_int(&gd->gd_vme_avail, count) + count;
 951         if (gd->gd_vme_avail > MAP_RESERVE_SLOP) {
 952                 efree = NULL;
 953                 crit_enter();
 954                 while (gd->gd_vme_avail > MAP_RESERVE_HYST) {
 955                         entry = gd->gd_vme_base;
 956                         KKASSERT(entry != NULL);
 957                         gd->gd_vme_base = MAPENT_FREELIST(entry);
 958                         atomic_add_int(&gd->gd_vme_avail, -1);
 959                         MAPENT_FREELIST(entry) = efree;
 960                         efree = entry;
 961                 }
 962                 crit_exit();
 963                 while ((entry = efree) != NULL) {
 964                         efree = MAPENT_FREELIST(efree);
 965                         zfree(mapentzone, entry);
 966                 }
 967         }
 968 }
 969
 970 /*
 971  * Reserve map entry structures for use in kernel_map itself.  These
 972  * entries have *ALREADY* been reserved on a per-cpu basis when the map
 973  * was inited.  This function is used by zalloc() to avoid a recursion
 974  * when zalloc() itself needs to allocate additional kernel memory.
 975  *
 976  * This function works like the normal reserve but does not load the
 977  * vm_map_entry cache (because that would result in an infinite
 978  * recursion).  Note that gd_vme_avail may go negative.  This is expected.
 979  *
 980  * Any caller of this function must be sure to renormalize after
 981  * potentially eating entries to ensure that the reserve supply
 982  * remains intact.
 983  *
 984  * No requirements.
 985  */
 986 int
 987 vm_map_entry_kreserve(int count)
 988 {
 989         struct globaldata *gd = mycpu;
 990
 991         atomic_add_int(&gd->gd_vme_avail, -count);
 992         KASSERT(gd->gd_vme_base != NULL,
 993                 ("no reserved entries left, gd_vme_avail = %d",
 994                 gd->gd_vme_avail));
 995         return(count);
 996 }
 997
 998 /*
 999  * Release previously reserved map entries for kernel_map.  We do not
1000  * attempt to clean up like the normal release function as this would
1001  * cause an unnecessary (but probably not fatal) deep procedure call.
1002  *
1003  * No requirements.
1004  */
1005 void
1006 vm_map_entry_krelease(int count)
1007 {
1008         struct globaldata *gd = mycpu;
1009
1010         atomic_add_int(&gd->gd_vme_avail, count);
1011 }
1012
1013 /*
1014  * Allocates a VM map entry for insertion.  No entry fields are filled in.
1015  *
1016  * The entries should have previously been reserved.  The reservation count
1017  * is tracked in (*countp).
1018  *
1019  * No requirements.
1020  */
1021 static vm_map_entry_t
1022 vm_map_entry_create(int *countp)
1023 {
1024         struct globaldata *gd = mycpu;
1025         vm_map_entry_t entry;
1026
1027         KKASSERT(*countp > 0);
1028         --*countp;
1029         crit_enter();
1030         entry = gd->gd_vme_base;
1031         KASSERT(entry != NULL, ("gd_vme_base NULL! count %d", *countp));
1032         gd->gd_vme_base = MAPENT_FREELIST(entry);
1033         crit_exit();
1034
1035         return(entry);
1036 }
1037
1038 /*
1039  * Attach and detach backing store elements
1040  */
1041 static void
1042 vm_map_backing_attach(vm_map_entry_t entry, vm_map_backing_t ba)
1043 {
1044         vm_object_t obj;
1045
1046         switch(entry->maptype) {
1047         case VM_MAPTYPE_NORMAL:
1048                 obj = ba->object;
1049                 lockmgr(&obj->backing_lk, LK_EXCLUSIVE);
1050                 TAILQ_INSERT_TAIL(&obj->backing_list, ba, entry);
1051                 lockmgr(&obj->backing_lk, LK_RELEASE);
1052                 break;
1053         case VM_MAPTYPE_UKSMAP:
1054                 ba->uksmap(ba, UKSMAPOP_ADD, entry->aux.dev, NULL);
1055                 break;
1056         }
1057 }
1058
1059 static void
1060 vm_map_backing_detach(vm_map_entry_t entry, vm_map_backing_t ba)
1061 {
1062         vm_object_t obj;
1063
1064         switch(entry->maptype) {
1065         case VM_MAPTYPE_NORMAL:
1066                 obj = ba->object;
1067                 lockmgr(&obj->backing_lk, LK_EXCLUSIVE);
1068                 TAILQ_REMOVE(&obj->backing_list, ba, entry);
1069                 lockmgr(&obj->backing_lk, LK_RELEASE);
1070                 break;
1071         case VM_MAPTYPE_UKSMAP:
1072                 ba->uksmap(ba, UKSMAPOP_REM, entry->aux.dev, NULL);
1073                 break;
1074         }
1075 }
1076
1077 /*
1078  * Dispose of the dynamically allocated backing_ba chain associated
1079  * with a vm_map_entry.
1080  *
1081  * We decrement the (possibly shared) element and kfree() on the
1082  * 1->0 transition.  We only iterate to the next backing_ba when
1083  * the previous one went through a 1->0 transition.
1084  *
1085  * These can only be normal vm_object based backings.
1086  */
1087 static void
1088 vm_map_entry_dispose_ba(vm_map_entry_t entry, vm_map_backing_t ba)
1089 {
1090         vm_map_backing_t next;
1091
1092         while (ba) {
1093                 if (ba->map_object) {
1094                         vm_map_backing_detach(entry, ba);
1095                         vm_object_deallocate(ba->object);
1096                 }
1097                 next = ba->backing_ba;
1098                 kfree(ba, M_MAP_BACKING);
1099                 ba = next;
1100         }
1101 }
1102
1103 /*
1104  * Dispose of a vm_map_entry that is no longer being referenced.
1105  *
1106  * No requirements.
1107  */
1108 static void
1109 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry, int *countp)
1110 {
1111         struct globaldata *gd = mycpu;
1112
1113         /*
1114          * Dispose of the base object and the backing link.
1115          */
1116         switch(entry->maptype) {
1117         case VM_MAPTYPE_NORMAL:
1118                 if (entry->ba.map_object) {
1119                         vm_map_backing_detach(entry, &entry->ba);
1120                         vm_object_deallocate(entry->ba.object);
1121                 }
1122                 break;
1123         case VM_MAPTYPE_SUBMAP:
1124                 break;
1125         case VM_MAPTYPE_UKSMAP:
1126                 vm_map_backing_detach(entry, &entry->ba);
1127                 break;
1128         default:
1129                 break;
1130         }
1131         vm_map_entry_dispose_ba(entry, entry->ba.backing_ba);
1132
1133         /*
1134          * Cleanup for safety.
1135          */
1136         entry->ba.backing_ba = NULL;
1137         entry->ba.object = NULL;
1138         entry->ba.offset = 0;
1139
1140         ++*countp;
1141         crit_enter();
1142         MAPENT_FREELIST(entry) = gd->gd_vme_base;
1143         gd->gd_vme_base = entry;
1144         crit_exit();
1145 }
1146
1147
1148 /*
1149  * Insert/remove entries from maps.
1150  *
1151  * The related map must be exclusively locked.
1152  * The caller must hold map->token
1153  * No other requirements.
1154  */
1155 static __inline void
1156 vm_map_entry_link(vm_map_t map, vm_map_entry_t entry)
1157 {
1158         ASSERT_VM_MAP_LOCKED(map);
1159
1160         map->nentries++;
1161         if (vm_map_rb_tree_RB_INSERT(&map->rb_root, entry))
1162                 panic("vm_map_entry_link: dup addr map %p ent %p", map, entry);
1163 }
1164
1165 static __inline void
1166 vm_map_entry_unlink(vm_map_t map,
1167                     vm_map_entry_t entry)
1168 {
1169         ASSERT_VM_MAP_LOCKED(map);
1170
1171         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1172                 panic("vm_map_entry_unlink: attempt to mess with "
1173                       "locked entry! %p", entry);
1174         }
1175         vm_map_rb_tree_RB_REMOVE(&map->rb_root, entry);
1176         map->nentries--;
1177 }
1178
1179 /*
1180  * Finds the map entry containing (or immediately preceding) the specified
1181  * address in the given map.  The entry is returned in (*entry).
1182  *
1183  * The boolean result indicates whether the address is actually contained
1184  * in the map.
1185  *
1186  * The related map must be locked.
1187  * No other requirements.
1188  */
1189 boolean_t
1190 vm_map_lookup_entry(vm_map_t map, vm_offset_t address, vm_map_entry_t *entry)
1191 {
1192         vm_map_entry_t tmp;
1193         vm_map_entry_t last;
1194
1195         ASSERT_VM_MAP_LOCKED(map);
1196
1197         /*
1198          * Locate the record from the top of the tree.  'last' tracks the
1199          * closest prior record and is returned if no match is found, which
1200          * in binary tree terms means tracking the most recent right-branch
1201          * taken.  If there is no prior record, *entry is set to NULL.
1202          */
1203         last = NULL;
1204         tmp = RB_ROOT(&map->rb_root);
1205
1206         while (tmp) {
1207                 if (address >= tmp->ba.start) {
1208                         if (address < tmp->ba.end) {
1209                                 *entry = tmp;
1210                                 return(TRUE);
1211                         }
1212                         last = tmp;
1213                         tmp = RB_RIGHT(tmp, rb_entry);
1214                 } else {
1215                         tmp = RB_LEFT(tmp, rb_entry);
1216                 }
1217         }
1218         *entry = last;
1219         return (FALSE);
1220 }
1221
1222 /*
1223  * Inserts the given whole VM object into the target map at the specified
1224  * address range.  The object's size should match that of the address range.
1225  *
1226  * The map must be exclusively locked.
1227  * The object must be held.
1228  * The caller must have reserved sufficient vm_map_entry structures.
1229  *
1230  * If object is non-NULL, ref count must be bumped by caller prior to
1231  * making call to account for the new entry.  XXX API is a bit messy.
1232  */
1233 int
1234 vm_map_insert(vm_map_t map, int *countp,
1235               void *map_object, void *map_aux,
1236               vm_ooffset_t offset, void *aux_info,
1237               vm_offset_t start, vm_offset_t end,
1238               vm_maptype_t maptype, vm_subsys_t id,
1239               vm_prot_t prot, vm_prot_t max, int cow)
1240 {
1241         vm_map_entry_t new_entry;
1242         vm_map_entry_t prev_entry;
1243         vm_map_entry_t next;
1244         vm_map_entry_t temp_entry;
1245         vm_eflags_t protoeflags;
1246         vm_object_t object;
1247         int must_drop = 0;
1248
1249         if (maptype == VM_MAPTYPE_UKSMAP)
1250                 object = NULL;
1251         else
1252                 object = map_object;
1253
1254         ASSERT_VM_MAP_LOCKED(map);
1255         if (object)
1256                 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1257
1258         /*
1259          * Check that the start and end points are not bogus.
1260          */
1261         if ((start < vm_map_min(map)) || (end > vm_map_max(map)) ||
1262             (start >= end)) {
1263                 return (KERN_INVALID_ADDRESS);
1264         }
1265
1266         /*
1267          * Find the entry prior to the proposed starting address; if it's part
1268          * of an existing entry, this range is bogus.
1269          */
1270         if (vm_map_lookup_entry(map, start, &temp_entry))
1271                 return (KERN_NO_SPACE);
1272         prev_entry = temp_entry;
1273
1274         /*
1275          * Assert that the next entry doesn't overlap the end point.
1276          */
1277         if (prev_entry)
1278                 next = vm_map_rb_tree_RB_NEXT(prev_entry);
1279         else
1280                 next = RB_MIN(vm_map_rb_tree, &map->rb_root);
1281         if (next && next->ba.start < end)
1282                 return (KERN_NO_SPACE);
1283
1284         protoeflags = 0;
1285
1286         if (cow & MAP_COPY_ON_WRITE)
1287                 protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY;
1288
1289         if (cow & MAP_NOFAULT) {
1290                 protoeflags |= MAP_ENTRY_NOFAULT;
1291
1292                 KASSERT(object == NULL,
1293                         ("vm_map_insert: paradoxical MAP_NOFAULT request"));
1294         }
1295         if (cow & MAP_DISABLE_SYNCER)
1296                 protoeflags |= MAP_ENTRY_NOSYNC;
1297         if (cow & MAP_DISABLE_COREDUMP)
1298                 protoeflags |= MAP_ENTRY_NOCOREDUMP;
1299         if (cow & MAP_IS_STACK)
1300                 protoeflags |= MAP_ENTRY_STACK;
1301         if (cow & MAP_IS_KSTACK)
1302                 protoeflags |= MAP_ENTRY_KSTACK;
1303
1304         lwkt_gettoken(&map->token);
1305
1306         if (object) {
1307                 ;
1308         } else if (prev_entry &&
1309                  (prev_entry->eflags == protoeflags) &&
1310                  (prev_entry->ba.end == start) &&
1311                  (prev_entry->wired_count == 0) &&
1312                  (prev_entry->id == id) &&
1313                  prev_entry->maptype == maptype &&
1314                  maptype == VM_MAPTYPE_NORMAL &&
1315                  prev_entry->ba.backing_ba == NULL &&   /* not backed */
1316                  ((prev_entry->ba.object == NULL) ||
1317                   vm_object_coalesce(prev_entry->ba.object,
1318                                      OFF_TO_IDX(prev_entry->ba.offset),
1319                                      (vm_size_t)(prev_entry->ba.end - prev_entry->ba.start),
1320                                      (vm_size_t)(end - prev_entry->ba.end)))) {
1321                 /*
1322                  * We were able to extend the object.  Determine if we
1323                  * can extend the previous map entry to include the
1324                  * new range as well.
1325                  */
1326                 if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) &&
1327                     (prev_entry->protection == prot) &&
1328                     (prev_entry->max_protection == max)) {
1329                         map->size += (end - prev_entry->ba.end);
1330                         vm_map_backing_adjust_end(prev_entry, end);
1331                         vm_map_simplify_entry(map, prev_entry, countp);
1332                         lwkt_reltoken(&map->token);
1333                         return (KERN_SUCCESS);
1334                 }
1335
1336                 /*
1337                  * If we can extend the object but cannot extend the
1338                  * map entry, we have to create a new map entry.  We
1339                  * must bump the ref count on the extended object to
1340                  * account for it.  object may be NULL.
1341                  */
1342                 object = prev_entry->ba.object;
1343                 offset = prev_entry->ba.offset +
1344                         (prev_entry->ba.end - prev_entry->ba.start);
1345                 if (object) {
1346                         vm_object_hold(object);
1347                         vm_object_lock_swap(); /* map->token order */
1348                         vm_object_reference_locked(object);
1349                         map_object = object;
1350                         must_drop = 1;
1351                 }
1352         }
1353
1354         /*
1355          * NOTE: if conditionals fail, object can be NULL here.  This occurs
1356          * in things like the buffer map where we manage kva but do not manage
1357          * backing objects.
1358          */
1359
1360         /*
1361          * Create a new entry
1362          */
1363         new_entry = vm_map_entry_create(countp);
1364         new_entry->ba.pmap = map->pmap;
1365         new_entry->ba.start = start;
1366         new_entry->ba.end = end;
1367         new_entry->id = id;
1368
1369         new_entry->maptype = maptype;
1370         new_entry->eflags = protoeflags;
1371         new_entry->aux.master_pde = 0;          /* in case size is different */
1372         new_entry->aux.map_aux = map_aux;
1373         new_entry->ba.map_object = map_object;
1374         new_entry->ba.backing_ba = NULL;
1375         new_entry->ba.backing_count = 0;
1376         new_entry->ba.offset = offset;
1377         new_entry->ba.aux_info = aux_info;
1378         new_entry->ba.flags = 0;
1379         new_entry->ba.pmap = map->pmap;
1380
1381         new_entry->inheritance = VM_INHERIT_DEFAULT;
1382         new_entry->protection = prot;
1383         new_entry->max_protection = max;
1384         new_entry->wired_count = 0;
1385
1386         /*
1387          * Insert the new entry into the list
1388          */
1389         vm_map_backing_replicated(map, new_entry, MAP_BACK_BASEOBJREFD);
1390         vm_map_entry_link(map, new_entry);
1391         map->size += new_entry->ba.end - new_entry->ba.start;
1392
1393         /*
1394          * Don't worry about updating freehint[] when inserting, allow
1395          * addresses to be lower than the actual first free spot.
1396          */
1397 #if 0
1398         /*
1399          * Temporarily removed to avoid MAP_STACK panic, due to
1400          * MAP_STACK being a huge hack.  Will be added back in
1401          * when MAP_STACK (and the user stack mapping) is fixed.
1402          */
1403         /*
1404          * It may be possible to simplify the entry
1405          */
1406         vm_map_simplify_entry(map, new_entry, countp);
1407 #endif
1408
1409         /*
1410          * Try to pre-populate the page table.  Mappings governed by virtual
1411          * page tables cannot be prepopulated without a lot of work, so
1412          * don't try.
1413          */
1414         if ((cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) &&
1415             maptype != VM_MAPTYPE_UKSMAP) {
1416                 int dorelock = 0;
1417                 if (vm_map_relock_enable && (cow & MAP_PREFAULT_RELOCK)) {
1418                         dorelock = 1;
1419                         vm_object_lock_swap();
1420                         vm_object_drop(object);
1421                 }
1422                 pmap_object_init_pt(map->pmap, new_entry,
1423                                     new_entry->ba.start,
1424                                     new_entry->ba.end - new_entry->ba.start,
1425                                     cow & MAP_PREFAULT_PARTIAL);
1426                 if (dorelock) {
1427                         vm_object_hold(object);
1428                         vm_object_lock_swap();
1429                 }
1430         }
1431         lwkt_reltoken(&map->token);
1432         if (must_drop)
1433                 vm_object_drop(object);
1434
1435         return (KERN_SUCCESS);
1436 }
1437
1438 /*
1439  * Find sufficient space for `length' bytes in the given map, starting at
1440  * `start'.  Returns 0 on success, 1 on no space.
1441  *
1442  * This function will returned an arbitrarily aligned pointer.  If no
1443  * particular alignment is required you should pass align as 1.  Note that
1444  * the map may return PAGE_SIZE aligned pointers if all the lengths used in
1445  * the map are a multiple of PAGE_SIZE, even if you pass a smaller align
1446  * argument.
1447  *
1448  * 'align' should be a power of 2 but is not required to be.
1449  *
1450  * The map must be exclusively locked.
1451  * No other requirements.
1452  */
1453 int
1454 vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length,
1455                  vm_size_t align, int flags, vm_offset_t *addr)
1456 {
1457         vm_map_entry_t entry;
1458         vm_map_entry_t tmp;
1459         vm_offset_t hole_start;
1460         vm_offset_t end;
1461         vm_offset_t align_mask;
1462
1463         if (start < vm_map_min(map))
1464                 start = vm_map_min(map);
1465         if (start > vm_map_max(map))
1466                 return (1);
1467
1468         /*
1469          * If the alignment is not a power of 2 we will have to use
1470          * a mod/division, set align_mask to a special value.
1471          */
1472         if ((align | (align - 1)) + 1 != (align << 1))
1473                 align_mask = (vm_offset_t)-1;
1474         else
1475                 align_mask = align - 1;
1476
1477         /*
1478          * Use freehint to adjust the start point, hopefully reducing
1479          * the iteration to O(1).
1480          */
1481         hole_start = vm_map_freehint_find(map, length, align);
1482         if (start < hole_start)
1483                 start = hole_start;
1484         if (vm_map_lookup_entry(map, start, &tmp))
1485                 start = tmp->ba.end;
1486         entry = tmp;    /* may be NULL */
1487
1488         /*
1489          * Look through the rest of the map, trying to fit a new region in the
1490          * gap between existing regions, or after the very last region.
1491          */
1492         for (;;) {
1493                 /*
1494                  * Adjust the proposed start by the requested alignment,
1495                  * be sure that we didn't wrap the address.
1496                  */
1497                 if (align_mask == (vm_offset_t)-1)
1498                         end = roundup(start, align);
1499                 else
1500                         end = (start + align_mask) & ~align_mask;
1501                 if (end < start)
1502                         return (1);
1503                 start = end;
1504
1505                 /*
1506                  * Find the end of the proposed new region.  Be sure we didn't
1507                  * go beyond the end of the map, or wrap around the address.
1508                  * Then check to see if this is the last entry or if the
1509                  * proposed end fits in the gap between this and the next
1510                  * entry.
1511                  */
1512                 end = start + length;
1513                 if (end > vm_map_max(map) || end < start)
1514                         return (1);
1515
1516                 /*
1517                  * Locate the next entry, we can stop if this is the
1518                  * last entry (we know we are in-bounds so that would
1519                  * be a sucess).
1520                  */
1521                 if (entry)
1522                         entry = vm_map_rb_tree_RB_NEXT(entry);
1523                 else
1524                         entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
1525                 if (entry == NULL)
1526                         break;
1527
1528                 /*
1529                  * Determine if the proposed area would overlap the
1530                  * next entry.
1531                  *
1532                  * When matching against a STACK entry, only allow the
1533                  * memory map to intrude on the ungrown portion of the
1534                  * STACK entry when MAP_TRYFIXED is set.
1535                  */
1536                 if (entry->ba.start >= end) {
1537                         if ((entry->eflags & MAP_ENTRY_STACK) == 0)
1538                                 break;
1539                         if (flags & MAP_TRYFIXED)
1540                                 break;
1541                         if (entry->ba.start - entry->aux.avail_ssize >= end)
1542                                 break;
1543                 }
1544                 start = entry->ba.end;
1545         }
1546
1547         /*
1548          * Update the freehint
1549          */
1550         vm_map_freehint_update(map, start, length, align);
1551
1552         /*
1553          * Grow the kernel_map if necessary.  pmap_growkernel() will panic
1554          * if it fails.  The kernel_map is locked and nothing can steal
1555          * our address space if pmap_growkernel() blocks.
1556          *
1557          * NOTE: This may be unconditionally called for kldload areas on
1558          *       x86_64 because these do not bump kernel_vm_end (which would
1559          *       fill 128G worth of page tables!).  Therefore we must not
1560          *       retry.
1561          */
1562         if (map == kernel_map) {
1563                 vm_offset_t kstop;
1564
1565                 kstop = round_page(start + length);
1566                 if (kstop > kernel_vm_end)
1567                         pmap_growkernel(start, kstop);
1568         }
1569         *addr = start;
1570         return (0);
1571 }
1572
1573 /*
1574  * vm_map_find finds an unallocated region in the target address map with
1575  * the given length and allocates it.  The search is defined to be first-fit
1576  * from the specified address; the region found is returned in the same
1577  * parameter.
1578  *
1579  * If object is non-NULL, ref count must be bumped by caller
1580  * prior to making call to account for the new entry.
1581  *
1582  * No requirements.  This function will lock the map temporarily.
1583  */
1584 int
1585 vm_map_find(vm_map_t map, void *map_object, void *map_aux,
1586             vm_ooffset_t offset, vm_offset_t *addr,
1587             vm_size_t length, vm_size_t align, boolean_t fitit,
1588             vm_maptype_t maptype, vm_subsys_t id,
1589             vm_prot_t prot, vm_prot_t max, int cow)
1590 {
1591         vm_offset_t start;
1592         vm_object_t object;
1593         void *aux_info;
1594         int result;
1595         int count;
1596
1597         /*
1598          * Certain UKSMAPs may need aux_info.
1599          *
1600          * (map_object is the callback function, aux_info is the process
1601          *  or thread, if necessary).
1602          */
1603         aux_info = NULL;
1604         if (maptype == VM_MAPTYPE_UKSMAP) {
1605                 KKASSERT(map_aux != NULL && map_object != NULL);
1606
1607                 switch(minor(((struct cdev *)map_aux))) {
1608                 case 5:
1609                         /*
1610                          * /dev/upmap
1611                          */
1612                         aux_info = curproc;
1613                         break;
1614                 case 6:
1615                         /*
1616                          * /dev/kpmap
1617                          */
1618                         break;
1619                 case 7:
1620                         /*
1621                          * /dev/lpmap
1622                          */
1623                         aux_info = curthread->td_lwp;
1624                         break;
1625                 }
1626                 object = NULL;
1627         } else {
1628                 object = map_object;
1629         }
1630
1631         start = *addr;
1632
1633         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1634         vm_map_lock(map);
1635         if (object)
1636                 vm_object_hold_shared(object);
1637         if (fitit) {
1638                 if (vm_map_findspace(map, start, length, align, 0, addr)) {
1639                         if (object)
1640                                 vm_object_drop(object);
1641                         vm_map_unlock(map);
1642                         vm_map_entry_release(count);
1643                         return (KERN_NO_SPACE);
1644                 }
1645                 start = *addr;
1646         }
1647         result = vm_map_insert(map, &count,
1648                                map_object, map_aux,
1649                                offset, aux_info,
1650                                start, start + length,
1651                                maptype, id, prot, max, cow);
1652         if (object)
1653                 vm_object_drop(object);
1654         vm_map_unlock(map);
1655         vm_map_entry_release(count);
1656
1657         return (result);
1658 }
1659
1660 /*
1661  * Simplify the given map entry by merging with either neighbor.  This
1662  * routine also has the ability to merge with both neighbors.
1663  *
1664  * This routine guarentees that the passed entry remains valid (though
1665  * possibly extended).  When merging, this routine may delete one or
1666  * both neighbors.  No action is taken on entries which have their
1667  * in-transition flag set.
1668  *
1669  * The map must be exclusively locked.
1670  */
1671 void
1672 vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry, int *countp)
1673 {
1674         vm_map_entry_t next, prev;
1675         vm_size_t prevsize, esize;
1676
1677         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1678                 ++mycpu->gd_cnt.v_intrans_coll;
1679                 return;
1680         }
1681
1682         if (entry->maptype == VM_MAPTYPE_SUBMAP)
1683                 return;
1684         if (entry->maptype == VM_MAPTYPE_UKSMAP)
1685                 return;
1686
1687         prev = vm_map_rb_tree_RB_PREV(entry);
1688         if (prev) {
1689                 prevsize = prev->ba.end - prev->ba.start;
1690                 if ( (prev->ba.end == entry->ba.start) &&
1691                      (prev->maptype == entry->maptype) &&
1692                      (prev->ba.object == entry->ba.object) &&
1693                      (prev->ba.backing_ba == entry->ba.backing_ba) &&
1694                      (!prev->ba.object ||
1695                         (prev->ba.offset + prevsize == entry->ba.offset)) &&
1696                      (prev->eflags == entry->eflags) &&
1697                      (prev->protection == entry->protection) &&
1698                      (prev->max_protection == entry->max_protection) &&
1699                      (prev->inheritance == entry->inheritance) &&
1700                      (prev->id == entry->id) &&
1701                      (prev->wired_count == entry->wired_count)) {
1702                         /*
1703                          * NOTE: order important.  Unlink before gumming up
1704                          *       the RBTREE w/adjust, adjust before disposal
1705                          *       of prior entry, to avoid pmap snafus.
1706                          */
1707                         vm_map_entry_unlink(map, prev);
1708                         vm_map_backing_adjust_start(entry, prev->ba.start);
1709                         if (entry->ba.object == NULL)
1710                                 entry->ba.offset = 0;
1711                         vm_map_entry_dispose(map, prev, countp);
1712                 }
1713         }
1714
1715         next = vm_map_rb_tree_RB_NEXT(entry);
1716         if (next) {
1717                 esize = entry->ba.end - entry->ba.start;
1718                 if ((entry->ba.end == next->ba.start) &&
1719                     (next->maptype == entry->maptype) &&
1720                     (next->ba.object == entry->ba.object) &&
1721                      (prev->ba.backing_ba == entry->ba.backing_ba) &&
1722                      (!entry->ba.object ||
1723                         (entry->ba.offset + esize == next->ba.offset)) &&
1724                     (next->eflags == entry->eflags) &&
1725                     (next->protection == entry->protection) &&
1726                     (next->max_protection == entry->max_protection) &&
1727                     (next->inheritance == entry->inheritance) &&
1728                     (next->id == entry->id) &&
1729                     (next->wired_count == entry->wired_count)) {
1730                         /*
1731                          * NOTE: order important.  Unlink before gumming up
1732                          *       the RBTREE w/adjust, adjust before disposal
1733                          *       of prior entry, to avoid pmap snafus.
1734                          */
1735                         vm_map_entry_unlink(map, next);
1736                         vm_map_backing_adjust_end(entry, next->ba.end);
1737                         vm_map_entry_dispose(map, next, countp);
1738                 }
1739         }
1740 }
1741
1742 /*
1743  * Asserts that the given entry begins at or after the specified address.
1744  * If necessary, it splits the entry into two.
1745  */
1746 #define vm_map_clip_start(map, entry, startaddr, countp)                \
1747 {                                                                       \
1748         if (startaddr > entry->ba.start)                                \
1749                 _vm_map_clip_start(map, entry, startaddr, countp);      \
1750 }
1751
1752 /*
1753  * This routine is called only when it is known that the entry must be split.
1754  *
1755  * The map must be exclusively locked.
1756  */
1757 static void
1758 _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start,
1759                    int *countp)
1760 {
1761         vm_map_entry_t new_entry;
1762
1763         /*
1764          * Split off the front portion -- note that we must insert the new
1765          * entry BEFORE this one, so that this entry has the specified
1766          * starting address.
1767          */
1768
1769         vm_map_simplify_entry(map, entry, countp);
1770
1771         /*
1772          * If there is no object backing this entry, we might as well create
1773          * one now.  If we defer it, an object can get created after the map
1774          * is clipped, and individual objects will be created for the split-up
1775          * map.  This is a bit of a hack, but is also about the best place to
1776          * put this improvement.
1777          */
1778         if (entry->ba.object == NULL && !map->system_map &&
1779             VM_MAP_ENTRY_WITHIN_PARTITION(entry)) {
1780                 vm_map_entry_allocate_object(entry);
1781         }
1782
1783         /*
1784          * NOTE: The replicated function will adjust start, end, and offset
1785          *       for the remainder of the backing_ba linkages.  We must fixup
1786          *       the embedded ba.
1787          */
1788         new_entry = vm_map_entry_create(countp);
1789         *new_entry = *entry;
1790         new_entry->ba.end = start;
1791
1792         /*
1793          * Ordering is important, make sure the new entry is replicated
1794          * before we cut the exiting entry.
1795          */
1796         vm_map_backing_replicated(map, new_entry, MAP_BACK_CLIPPED);
1797         vm_map_backing_adjust_start(entry, start);
1798         vm_map_entry_link(map, new_entry);
1799 }
1800
1801 /*
1802  * Asserts that the given entry ends at or before the specified address.
1803  * If necessary, it splits the entry into two.
1804  *
1805  * The map must be exclusively locked.
1806  */
1807 #define vm_map_clip_end(map, entry, endaddr, countp)            \
1808 {                                                               \
1809         if (endaddr < entry->ba.end)                            \
1810                 _vm_map_clip_end(map, entry, endaddr, countp);  \
1811 }
1812
1813 /*
1814  * This routine is called only when it is known that the entry must be split.
1815  *
1816  * The map must be exclusively locked.
1817  */
1818 static void
1819 _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end,
1820                  int *countp)
1821 {
1822         vm_map_entry_t new_entry;
1823
1824         /*
1825          * If there is no object backing this entry, we might as well create
1826          * one now.  If we defer it, an object can get created after the map
1827          * is clipped, and individual objects will be created for the split-up
1828          * map.  This is a bit of a hack, but is also about the best place to
1829          * put this improvement.
1830          */
1831
1832         if (entry->ba.object == NULL && !map->system_map &&
1833             VM_MAP_ENTRY_WITHIN_PARTITION(entry)) {
1834                 vm_map_entry_allocate_object(entry);
1835         }
1836
1837         /*
1838          * Create a new entry and insert it AFTER the specified entry
1839          *
1840          * NOTE: The replicated function will adjust start, end, and offset
1841          *       for the remainder of the backing_ba linkages.  We must fixup
1842          *       the embedded ba.
1843          */
1844         new_entry = vm_map_entry_create(countp);
1845         *new_entry = *entry;
1846         new_entry->ba.start = end;
1847         new_entry->ba.offset += (new_entry->ba.start - entry->ba.start);
1848
1849         /*
1850          * Ordering is important, make sure the new entry is replicated
1851          * before we cut the exiting entry.
1852          */
1853         vm_map_backing_replicated(map, new_entry, MAP_BACK_CLIPPED);
1854         vm_map_backing_adjust_end(entry, end);
1855         vm_map_entry_link(map, new_entry);
1856 }
1857
1858 /*
1859  * Asserts that the starting and ending region addresses fall within the
1860  * valid range for the map.
1861  */
1862 #define VM_MAP_RANGE_CHECK(map, start, end)     \
1863 {                                               \
1864         if (start < vm_map_min(map))            \
1865                 start = vm_map_min(map);        \
1866         if (end > vm_map_max(map))              \
1867                 end = vm_map_max(map);          \
1868         if (start > end)                        \
1869                 start = end;                    \
1870 }
1871
1872 /*
1873  * Used to block when an in-transition collison occurs.  The map
1874  * is unlocked for the sleep and relocked before the return.
1875  */
1876 void
1877 vm_map_transition_wait(vm_map_t map, int relock)
1878 {
1879         tsleep_interlock(map, 0);
1880         vm_map_unlock(map);
1881         tsleep(map, PINTERLOCKED, "vment", 0);
1882         if (relock)
1883                 vm_map_lock(map);
1884 }
1885
1886 /*
1887  * When we do blocking operations with the map lock held it is
1888  * possible that a clip might have occured on our in-transit entry,
1889  * requiring an adjustment to the entry in our loop.  These macros
1890  * help the pageable and clip_range code deal with the case.  The
1891  * conditional costs virtually nothing if no clipping has occured.
1892  */
1893
1894 #define CLIP_CHECK_BACK(entry, save_start)                      \
1895     do {                                                        \
1896             while (entry->ba.start != save_start) {             \
1897                     entry = vm_map_rb_tree_RB_PREV(entry);      \
1898                     KASSERT(entry, ("bad entry clip"));         \
1899             }                                                   \
1900     } while(0)
1901
1902 #define CLIP_CHECK_FWD(entry, save_end)                         \
1903     do {                                                        \
1904             while (entry->ba.end != save_end) {                 \
1905                     entry = vm_map_rb_tree_RB_NEXT(entry);      \
1906                     KASSERT(entry, ("bad entry clip"));         \
1907             }                                                   \
1908     } while(0)
1909
1910
1911 /*
1912  * Clip the specified range and return the base entry.  The
1913  * range may cover several entries starting at the returned base
1914  * and the first and last entry in the covering sequence will be
1915  * properly clipped to the requested start and end address.
1916  *
1917  * If no holes are allowed you should pass the MAP_CLIP_NO_HOLES
1918  * flag.
1919  *
1920  * The MAP_ENTRY_IN_TRANSITION flag will be set for the entries
1921  * covered by the requested range.
1922  *
1923  * The map must be exclusively locked on entry and will remain locked
1924  * on return. If no range exists or the range contains holes and you
1925  * specified that no holes were allowed, NULL will be returned.  This
1926  * routine may temporarily unlock the map in order avoid a deadlock when
1927  * sleeping.
1928  */
1929 static
1930 vm_map_entry_t
1931 vm_map_clip_range(vm_map_t map, vm_offset_t start, vm_offset_t end,
1932                   int *countp, int flags)
1933 {
1934         vm_map_entry_t start_entry;
1935         vm_map_entry_t entry;
1936         vm_map_entry_t next;
1937
1938         /*
1939          * Locate the entry and effect initial clipping.  The in-transition
1940          * case does not occur very often so do not try to optimize it.
1941          */
1942 again:
1943         if (vm_map_lookup_entry(map, start, &start_entry) == FALSE)
1944                 return (NULL);
1945         entry = start_entry;
1946         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1947                 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1948                 ++mycpu->gd_cnt.v_intrans_coll;
1949                 ++mycpu->gd_cnt.v_intrans_wait;
1950                 vm_map_transition_wait(map, 1);
1951                 /*
1952                  * entry and/or start_entry may have been clipped while
1953                  * we slept, or may have gone away entirely.  We have
1954                  * to restart from the lookup.
1955                  */
1956                 goto again;
1957         }
1958
1959         /*
1960          * Since we hold an exclusive map lock we do not have to restart
1961          * after clipping, even though clipping may block in zalloc.
1962          */
1963         vm_map_clip_start(map, entry, start, countp);
1964         vm_map_clip_end(map, entry, end, countp);
1965         entry->eflags |= MAP_ENTRY_IN_TRANSITION;
1966
1967         /*
1968          * Scan entries covered by the range.  When working on the next
1969          * entry a restart need only re-loop on the current entry which
1970          * we have already locked, since 'next' may have changed.  Also,
1971          * even though entry is safe, it may have been clipped so we
1972          * have to iterate forwards through the clip after sleeping.
1973          */
1974         for (;;) {
1975                 next = vm_map_rb_tree_RB_NEXT(entry);
1976                 if (next == NULL || next->ba.start >= end)
1977                         break;
1978                 if (flags & MAP_CLIP_NO_HOLES) {
1979                         if (next->ba.start > entry->ba.end) {
1980                                 vm_map_unclip_range(map, start_entry,
1981                                         start, entry->ba.end, countp, flags);
1982                                 return(NULL);
1983                         }
1984                 }
1985
1986                 if (next->eflags & MAP_ENTRY_IN_TRANSITION) {
1987                         vm_offset_t save_end = entry->ba.end;
1988                         next->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1989                         ++mycpu->gd_cnt.v_intrans_coll;
1990                         ++mycpu->gd_cnt.v_intrans_wait;
1991                         vm_map_transition_wait(map, 1);
1992
1993                         /*
1994                          * clips might have occured while we blocked.
1995                          */
1996                         CLIP_CHECK_FWD(entry, save_end);
1997                         CLIP_CHECK_BACK(start_entry, start);
1998                         continue;
1999                 }
2000
2001                 /*
2002                  * No restart necessary even though clip_end may block, we
2003                  * are holding the map lock.
2004                  */
2005                 vm_map_clip_end(map, next, end, countp);
2006                 next->eflags |= MAP_ENTRY_IN_TRANSITION;
2007                 entry = next;
2008         }
2009         if (flags & MAP_CLIP_NO_HOLES) {
2010                 if (entry->ba.end != end) {
2011                         vm_map_unclip_range(map, start_entry,
2012                                 start, entry->ba.end, countp, flags);
2013                         return(NULL);
2014                 }
2015         }
2016         return(start_entry);
2017 }
2018
2019 /*
2020  * Undo the effect of vm_map_clip_range().  You should pass the same
2021  * flags and the same range that you passed to vm_map_clip_range().
2022  * This code will clear the in-transition flag on the entries and
2023  * wake up anyone waiting.  This code will also simplify the sequence
2024  * and attempt to merge it with entries before and after the sequence.
2025  *
2026  * The map must be locked on entry and will remain locked on return.
2027  *
2028  * Note that you should also pass the start_entry returned by
2029  * vm_map_clip_range().  However, if you block between the two calls
2030  * with the map unlocked please be aware that the start_entry may
2031  * have been clipped and you may need to scan it backwards to find
2032  * the entry corresponding with the original start address.  You are
2033  * responsible for this, vm_map_unclip_range() expects the correct
2034  * start_entry to be passed to it and will KASSERT otherwise.
2035  */
2036 static
2037 void
2038 vm_map_unclip_range(vm_map_t map, vm_map_entry_t start_entry,
2039                     vm_offset_t start, vm_offset_t end,
2040                     int *countp, int flags)
2041 {
2042         vm_map_entry_t entry;
2043
2044         entry = start_entry;
2045
2046         KASSERT(entry->ba.start == start, ("unclip_range: illegal base entry"));
2047         while (entry && entry->ba.start < end) {
2048                 KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
2049                         ("in-transition flag not set during unclip on: %p",
2050                         entry));
2051                 KASSERT(entry->ba.end <= end,
2052                         ("unclip_range: tail wasn't clipped"));
2053                 entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
2054                 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
2055                         entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
2056                         wakeup(map);
2057                 }
2058                 entry = vm_map_rb_tree_RB_NEXT(entry);
2059         }
2060
2061         /*
2062          * Simplification does not block so there is no restart case.
2063          */
2064         entry = start_entry;
2065         while (entry && entry->ba.start < end) {
2066                 vm_map_simplify_entry(map, entry, countp);
2067                 entry = vm_map_rb_tree_RB_NEXT(entry);
2068         }
2069 }
2070
2071 /*
2072  * Mark the given range as handled by a subordinate map.
2073  *
2074  * This range must have been created with vm_map_find(), and no other
2075  * operations may have been performed on this range prior to calling
2076  * vm_map_submap().
2077  *
2078  * Submappings cannot be removed.
2079  *
2080  * No requirements.
2081  */
2082 int
2083 vm_map_submap(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_t submap)
2084 {
2085         vm_map_entry_t entry;
2086         int result = KERN_INVALID_ARGUMENT;
2087         int count;
2088
2089         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2090         vm_map_lock(map);
2091
2092         VM_MAP_RANGE_CHECK(map, start, end);
2093
2094         if (vm_map_lookup_entry(map, start, &entry)) {
2095                 vm_map_clip_start(map, entry, start, &count);
2096         } else if (entry) {
2097                 entry = vm_map_rb_tree_RB_NEXT(entry);
2098         } else {
2099                 entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
2100         }
2101
2102         vm_map_clip_end(map, entry, end, &count);
2103
2104         if ((entry->ba.start == start) && (entry->ba.end == end) &&
2105             ((entry->eflags & MAP_ENTRY_COW) == 0) &&
2106             (entry->ba.object == NULL)) {
2107                 entry->ba.sub_map = submap;
2108                 entry->maptype = VM_MAPTYPE_SUBMAP;
2109                 result = KERN_SUCCESS;
2110         }
2111         vm_map_unlock(map);
2112         vm_map_entry_release(count);
2113
2114         return (result);
2115 }
2116
2117 /*
2118  * Sets the protection of the specified address region in the target map.
2119  * If "set_max" is specified, the maximum protection is to be set;
2120  * otherwise, only the current protection is affected.
2121  *
2122  * The protection is not applicable to submaps, but is applicable to normal
2123  * maps and maps governed by virtual page tables.  For example, when operating
2124  * on a virtual page table our protection basically controls how COW occurs
2125  * on the backing object, whereas the virtual page table abstraction itself
2126  * is an abstraction for userland.
2127  *
2128  * No requirements.
2129  */
2130 int
2131 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
2132                vm_prot_t new_prot, boolean_t set_max)
2133 {
2134         vm_map_entry_t current;
2135         vm_map_entry_t entry;
2136         int count;
2137
2138         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2139         vm_map_lock(map);
2140
2141         VM_MAP_RANGE_CHECK(map, start, end);
2142
2143         if (vm_map_lookup_entry(map, start, &entry)) {
2144                 vm_map_clip_start(map, entry, start, &count);
2145         } else if (entry) {
2146                 entry = vm_map_rb_tree_RB_NEXT(entry);
2147         } else {
2148                 entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
2149         }
2150
2151         /*
2152          * Make a first pass to check for protection violations.
2153          */
2154         current = entry;
2155         while (current && current->ba.start < end) {
2156                 if (current->maptype == VM_MAPTYPE_SUBMAP) {
2157                         vm_map_unlock(map);
2158                         vm_map_entry_release(count);
2159                         return (KERN_INVALID_ARGUMENT);
2160                 }
2161                 if ((new_prot & current->max_protection) != new_prot) {
2162                         vm_map_unlock(map);
2163                         vm_map_entry_release(count);
2164                         return (KERN_PROTECTION_FAILURE);
2165                 }
2166
2167                 /*
2168                  * When making a SHARED+RW file mmap writable, update
2169                  * v_lastwrite_ts.
2170                  */
2171                 if (new_prot & PROT_WRITE &&
2172                     (current->eflags & MAP_ENTRY_NEEDS_COPY) == 0 &&
2173                     current->maptype == VM_MAPTYPE_NORMAL &&
2174                     current->ba.object &&
2175                     current->ba.object->type == OBJT_VNODE) {
2176                         struct vnode *vp;
2177
2178                         vp = current->ba.object->handle;
2179                         if (vp && vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT) == 0) {
2180                                 vfs_timestamp(&vp->v_lastwrite_ts);
2181                                 vsetflags(vp, VLASTWRITETS);
2182                                 vn_unlock(vp);
2183                         }
2184                 }
2185                 current = vm_map_rb_tree_RB_NEXT(current);
2186         }
2187
2188         /*
2189          * Go back and fix up protections. [Note that clipping is not
2190          * necessary the second time.]
2191          */
2192         current = entry;
2193
2194         while (current && current->ba.start < end) {
2195                 vm_prot_t old_prot;
2196
2197                 vm_map_clip_end(map, current, end, &count);
2198
2199                 old_prot = current->protection;
2200                 if (set_max) {
2201                         current->max_protection = new_prot;
2202                         current->protection = new_prot & old_prot;
2203                 } else {
2204                         current->protection = new_prot;
2205                 }
2206
2207                 /*
2208                  * Update physical map if necessary. Worry about copy-on-write
2209                  * here -- CHECK THIS XXX
2210                  */
2211                 if (current->protection != old_prot) {
2212 #define MASK(entry)     (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
2213                                                         VM_PROT_ALL)
2214
2215                         pmap_protect(map->pmap, current->ba.start,
2216                             current->ba.end,
2217                             current->protection & MASK(current));
2218 #undef  MASK
2219                 }
2220
2221                 vm_map_simplify_entry(map, current, &count);
2222
2223                 current = vm_map_rb_tree_RB_NEXT(current);
2224         }
2225         vm_map_unlock(map);
2226         vm_map_entry_release(count);
2227         return (KERN_SUCCESS);
2228 }
2229
2230 /*
2231  * This routine traverses a processes map handling the madvise
2232  * system call.  Advisories are classified as either those effecting
2233  * the vm_map_entry structure, or those effecting the underlying
2234  * objects.
2235  *
2236  * The <value> argument is used for extended madvise calls.
2237  *
2238  * No requirements.
2239  */
2240 int
2241 vm_map_madvise(vm_map_t map, vm_offset_t start, vm_offset_t end,
2242                int behav, off_t value)
2243 {
2244         vm_map_entry_t current, entry;
2245         int modify_map = 0;
2246         int error = 0;
2247         int count;
2248
2249         /*
2250          * Some madvise calls directly modify the vm_map_entry, in which case
2251          * we need to use an exclusive lock on the map and we need to perform
2252          * various clipping operations.  Otherwise we only need a read-lock
2253          * on the map.
2254          */
2255         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2256
2257         switch(behav) {
2258         case MADV_NORMAL:
2259         case MADV_SEQUENTIAL:
2260         case MADV_RANDOM:
2261         case MADV_NOSYNC:
2262         case MADV_AUTOSYNC:
2263         case MADV_NOCORE:
2264         case MADV_CORE:
2265         case MADV_SETMAP:
2266                 modify_map = 1;
2267                 vm_map_lock(map);
2268                 break;
2269         case MADV_INVAL:
2270         case MADV_WILLNEED:
2271         case MADV_DONTNEED:
2272         case MADV_FREE:
2273                 vm_map_lock_read(map);
2274                 break;
2275         default:
2276                 vm_map_entry_release(count);
2277                 return (EINVAL);
2278         }
2279
2280         /*
2281          * Locate starting entry and clip if necessary.
2282          */
2283
2284         VM_MAP_RANGE_CHECK(map, start, end);
2285
2286         if (vm_map_lookup_entry(map, start, &entry)) {
2287                 if (modify_map)
2288                         vm_map_clip_start(map, entry, start, &count);
2289         } else if (entry) {
2290                 entry = vm_map_rb_tree_RB_NEXT(entry);
2291         } else {
2292                 entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
2293         }
2294
2295         if (modify_map) {
2296                 /*
2297                  * madvise behaviors that are implemented in the vm_map_entry.
2298                  *
2299                  * We clip the vm_map_entry so that behavioral changes are
2300                  * limited to the specified address range.
2301                  */
2302                 for (current = entry;
2303                      current && current->ba.start < end;
2304                      current = vm_map_rb_tree_RB_NEXT(current)) {
2305                         /*
2306                          * Ignore submaps
2307                          */
2308                         if (current->maptype == VM_MAPTYPE_SUBMAP)
2309                                 continue;
2310
2311                         vm_map_clip_end(map, current, end, &count);
2312
2313                         switch (behav) {
2314                         case MADV_NORMAL:
2315                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
2316                                 break;
2317                         case MADV_SEQUENTIAL:
2318                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
2319                                 break;
2320                         case MADV_RANDOM:
2321                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
2322                                 break;
2323                         case MADV_NOSYNC:
2324                                 current->eflags |= MAP_ENTRY_NOSYNC;
2325                                 break;
2326                         case MADV_AUTOSYNC:
2327                                 current->eflags &= ~MAP_ENTRY_NOSYNC;
2328                                 break;
2329                         case MADV_NOCORE:
2330                                 current->eflags |= MAP_ENTRY_NOCOREDUMP;
2331                                 break;
2332                         case MADV_CORE:
2333                                 current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
2334                                 break;
2335                         case MADV_SETMAP:
2336                                 /*
2337                                  * Set the page directory page for a map
2338                                  * governed by a virtual page table.
2339                                  *
2340                                  * Software virtual page table support has
2341                                  * been removed, this MADV is no longer
2342                                  * supported.
2343                                  */
2344                                 error = EINVAL;
2345                                 break;
2346                         case MADV_INVAL:
2347                                 /*
2348                                  * Invalidate the related pmap entries, used
2349                                  * to flush portions of the real kernel's
2350                                  * pmap when the caller has removed or
2351                                  * modified existing mappings in a virtual
2352                                  * page table.
2353                                  *
2354                                  * (exclusive locked map version does not
2355                                  * need the range interlock).
2356                                  */
2357                                 pmap_remove(map->pmap,
2358                                             current->ba.start, current->ba.end);
2359                                 break;
2360                         default:
2361                                 error = EINVAL;
2362                                 break;
2363                         }
2364                         vm_map_simplify_entry(map, current, &count);
2365                 }
2366                 vm_map_unlock(map);
2367         } else {
2368                 vm_pindex_t pindex;
2369                 vm_pindex_t delta;
2370
2371                 /*
2372                  * madvise behaviors that are implemented in the underlying
2373                  * vm_object.
2374                  *
2375                  * Since we don't clip the vm_map_entry, we have to clip
2376                  * the vm_object pindex and count.
2377                  *
2378                  * NOTE!  These functions are only supported on normal maps.
2379                  *
2380                  * NOTE!  These functions only apply to the top-most object.
2381                  *        It is not applicable to backing objects.
2382                  */
2383                 for (current = entry;
2384                      current && current->ba.start < end;
2385                      current = vm_map_rb_tree_RB_NEXT(current)) {
2386                         vm_offset_t useStart;
2387
2388                         if (current->maptype != VM_MAPTYPE_NORMAL)
2389                                 continue;
2390
2391                         pindex = OFF_TO_IDX(current->ba.offset);
2392                         delta = atop(current->ba.end - current->ba.start);
2393                         useStart = current->ba.start;
2394
2395                         if (current->ba.start < start) {
2396                                 pindex += atop(start - current->ba.start);
2397                                 delta -= atop(start - current->ba.start);
2398                                 useStart = start;
2399                         }
2400                         if (current->ba.end > end)
2401                                 delta -= atop(current->ba.end - end);
2402
2403                         if ((vm_spindex_t)delta <= 0)
2404                                 continue;
2405
2406                         if (behav == MADV_INVAL) {
2407                                 /*
2408                                  * Invalidate the related pmap entries, used
2409                                  * to flush portions of the real kernel's
2410                                  * pmap when the caller has removed or
2411                                  * modified existing mappings in a virtual
2412                                  * page table.
2413                                  *
2414                                  * (shared locked map version needs the
2415                                  * interlock, see vm_fault()).
2416                                  */
2417                                 struct vm_map_ilock ilock;
2418
2419                                 KASSERT(useStart >= VM_MIN_USER_ADDRESS &&
2420                                             useStart + ptoa(delta) <=
2421                                             VM_MAX_USER_ADDRESS,
2422                                          ("Bad range %016jx-%016jx (%016jx)",
2423                                          useStart, useStart + ptoa(delta),
2424                                          delta));
2425                                 vm_map_interlock(map, &ilock,
2426                                                  useStart,
2427                                                  useStart + ptoa(delta));
2428                                 pmap_remove(map->pmap,
2429                                             useStart,
2430                                             useStart + ptoa(delta));
2431                                 vm_map_deinterlock(map, &ilock);
2432                         } else {
2433                                 vm_object_madvise(current->ba.object,
2434                                                   pindex, delta, behav);
2435                         }
2436
2437                         /*
2438                          * Try to pre-populate the page table.
2439                          */
2440                         if (behav == MADV_WILLNEED) {
2441                                 pmap_object_init_pt(
2442                                     map->pmap, current,
2443                                     useStart,
2444                                     (delta << PAGE_SHIFT),
2445                                     MAP_PREFAULT_MADVISE
2446                                 );
2447                         }
2448                 }
2449                 vm_map_unlock_read(map);
2450         }
2451         vm_map_entry_release(count);
2452         return(error);
2453 }
2454
2455
2456 /*
2457  * Sets the inheritance of the specified address range in the target map.
2458  * Inheritance affects how the map will be shared with child maps at the
2459  * time of vm_map_fork.
2460  */
2461 int
2462 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
2463                vm_inherit_t new_inheritance)
2464 {
2465         vm_map_entry_t entry;
2466         vm_map_entry_t temp_entry;
2467         int count;
2468
2469         switch (new_inheritance) {
2470         case VM_INHERIT_NONE:
2471         case VM_INHERIT_COPY:
2472         case VM_INHERIT_SHARE:
2473                 break;
2474         default:
2475                 return (KERN_INVALID_ARGUMENT);
2476         }
2477
2478         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2479         vm_map_lock(map);
2480
2481         VM_MAP_RANGE_CHECK(map, start, end);
2482
2483         if (vm_map_lookup_entry(map, start, &temp_entry)) {
2484                 entry = temp_entry;
2485                 vm_map_clip_start(map, entry, start, &count);
2486         } else if (temp_entry) {
2487                 entry = vm_map_rb_tree_RB_NEXT(temp_entry);
2488         } else {
2489                 entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
2490         }
2491
2492         while (entry && entry->ba.start < end) {
2493                 vm_map_clip_end(map, entry, end, &count);
2494
2495                 entry->inheritance = new_inheritance;
2496
2497                 vm_map_simplify_entry(map, entry, &count);
2498
2499                 entry = vm_map_rb_tree_RB_NEXT(entry);
2500         }
2501         vm_map_unlock(map);
2502         vm_map_entry_release(count);
2503         return (KERN_SUCCESS);
2504 }
2505
2506 /*
2507  * Wiring/Unwiring of memory for user-related operation.
2508  *
2509  * Implement the semantics of mlock
2510  */
2511 int
2512 vm_map_user_wiring(vm_map_t map, vm_offset_t start, vm_offset_t real_end,
2513                    boolean_t new_pageable)
2514 {
2515         vm_map_entry_t entry;
2516         vm_map_entry_t start_entry;
2517         vm_offset_t end;
2518         int rv = KERN_SUCCESS;
2519         int count;
2520
2521         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2522         vm_map_lock(map);
2523         VM_MAP_RANGE_CHECK(map, start, real_end);
2524         end = real_end;
2525
2526         start_entry = vm_map_clip_range(map, start, end, &count,
2527                                         MAP_CLIP_NO_HOLES);
2528         if (start_entry == NULL) {
2529                 vm_map_unlock(map);
2530                 vm_map_entry_release(count);
2531                 return (KERN_INVALID_ADDRESS);
2532         }
2533
2534         if (new_pageable == 0) {
2535                 entry = start_entry;
2536                 while (entry && entry->ba.start < end) {
2537                         vm_offset_t save_start;
2538                         vm_offset_t save_end;
2539
2540                         /*
2541                          * Already user wired or hard wired (trivial cases)
2542                          */
2543                         if (entry->eflags & MAP_ENTRY_USER_WIRED) {
2544                                 entry = vm_map_rb_tree_RB_NEXT(entry);
2545                                 continue;
2546                         }
2547                         if (entry->wired_count != 0) {
2548                                 entry->wired_count++;
2549                                 entry->eflags |= MAP_ENTRY_USER_WIRED;
2550                                 entry = vm_map_rb_tree_RB_NEXT(entry);
2551                                 continue;
2552                         }
2553
2554                         /*
2555                          * A new wiring requires instantiation of appropriate
2556                          * management structures and the faulting in of the
2557                          * page.
2558                          */
2559                         if (entry->maptype == VM_MAPTYPE_NORMAL) {
2560                                 int copyflag = entry->eflags &
2561                                                MAP_ENTRY_NEEDS_COPY;
2562                                 if (copyflag && ((entry->protection &
2563                                                   VM_PROT_WRITE) != 0)) {
2564                                         vm_map_entry_shadow(entry);
2565                                 } else if (entry->ba.object == NULL &&
2566                                            !map->system_map) {
2567                                         vm_map_entry_allocate_object(entry);
2568                                 }
2569                         }
2570                         entry->wired_count++;
2571                         entry->eflags |= MAP_ENTRY_USER_WIRED;
2572
2573                         /*
2574                          * Now fault in the area.  Note that vm_fault_wire()
2575                          * may release the map lock temporarily, it will be
2576                          * relocked on return.  The in-transition
2577                          * flag protects the entries.
2578                          */
2579                         save_start = entry->ba.start;
2580                         save_end = entry->ba.end;
2581                         rv = vm_fault_wire(map, entry, TRUE, 0);
2582                         if (rv) {
2583                                 CLIP_CHECK_BACK(entry, save_start);
2584                                 for (;;) {
2585                                         KASSERT(entry->wired_count == 1, ("bad wired_count on entry"));
2586                                         entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2587                                         entry->wired_count = 0;
2588                                         if (entry->ba.end == save_end)
2589                                                 break;
2590                                         entry = vm_map_rb_tree_RB_NEXT(entry);
2591                                         KASSERT(entry,
2592                                              ("bad entry clip during backout"));
2593                                 }
2594                                 end = save_start;       /* unwire the rest */
2595                                 break;
2596                         }
2597                         /*
2598                          * note that even though the entry might have been
2599                          * clipped, the USER_WIRED flag we set prevents
2600                          * duplication so we do not have to do a
2601                          * clip check.
2602                          */
2603                         entry = vm_map_rb_tree_RB_NEXT(entry);
2604                 }
2605
2606                 /*
2607                  * If we failed fall through to the unwiring section to
2608                  * unwire what we had wired so far.  'end' has already
2609                  * been adjusted.
2610                  */
2611                 if (rv)
2612                         new_pageable = 1;
2613
2614                 /*
2615                  * start_entry might have been clipped if we unlocked the
2616                  * map and blocked.  No matter how clipped it has gotten
2617                  * there should be a fragment that is on our start boundary.
2618                  */
2619                 CLIP_CHECK_BACK(start_entry, start);
2620         }
2621
2622         /*
2623          * Deal with the unwiring case.
2624          */
2625         if (new_pageable) {
2626                 /*
2627                  * This is the unwiring case.  We must first ensure that the
2628                  * range to be unwired is really wired down.  We know there
2629                  * are no holes.
2630                  */
2631                 entry = start_entry;
2632                 while (entry && entry->ba.start < end) {
2633                         if ((entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
2634                                 rv = KERN_INVALID_ARGUMENT;
2635                                 goto done;
2636                         }
2637                         KASSERT(entry->wired_count != 0,
2638                                 ("wired count was 0 with USER_WIRED set! %p",
2639                                  entry));
2640                         entry = vm_map_rb_tree_RB_NEXT(entry);
2641                 }
2642
2643                 /*
2644                  * Now decrement the wiring count for each region. If a region
2645                  * becomes completely unwired, unwire its physical pages and
2646                  * mappings.
2647                  */
2648                 /*
2649                  * The map entries are processed in a loop, checking to
2650                  * make sure the entry is wired and asserting it has a wired
2651                  * count. However, another loop was inserted more-or-less in
2652                  * the middle of the unwiring path. This loop picks up the
2653                  * "entry" loop variable from the first loop without first
2654                  * setting it to start_entry. Naturally, the secound loop
2655                  * is never entered and the pages backing the entries are
2656                  * never unwired. This can lead to a leak of wired pages.
2657                  */
2658                 entry = start_entry;
2659                 while (entry && entry->ba.start < end) {
2660                         KASSERT(entry->eflags & MAP_ENTRY_USER_WIRED,
2661                                 ("expected USER_WIRED on entry %p", entry));
2662                         entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2663                         entry->wired_count--;
2664                         if (entry->wired_count == 0)
2665                                 vm_fault_unwire(map, entry);
2666                         entry = vm_map_rb_tree_RB_NEXT(entry);
2667                 }
2668         }
2669 done:
2670         vm_map_unclip_range(map, start_entry, start, real_end, &count,
2671                             MAP_CLIP_NO_HOLES);
2672         vm_map_unlock(map);
2673         vm_map_entry_release(count);
2674
2675         return (rv);
2676 }
2677
2678 /*
2679  * Wiring/Unwiring of memory for kernel-related operation.
2680  *
2681  * Sets the pageability of the specified address range in the target map.
2682  * Regions specified as not pageable require locked-down physical
2683  * memory and physical page maps.
2684  *
2685  * The map must not be locked, but a reference must remain to the map
2686  * throughout the call.
2687  *
2688  * This function may be called via the zalloc path and must properly
2689  * reserve map entries for kernel_map.
2690  *
2691  * No requirements.
2692  */
2693 int
2694 vm_map_kernel_wiring(vm_map_t map, vm_offset_t start,
2695                      vm_offset_t real_end, int kmflags)
2696 {
2697         vm_map_entry_t entry;
2698         vm_map_entry_t start_entry;
2699         vm_offset_t end;
2700         int rv = KERN_SUCCESS;
2701         int count;
2702
2703         if (kmflags & KM_KRESERVE)
2704                 count = vm_map_entry_kreserve(MAP_RESERVE_COUNT);
2705         else
2706                 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2707         vm_map_lock(map);
2708         VM_MAP_RANGE_CHECK(map, start, real_end);
2709         end = real_end;
2710
2711         start_entry = vm_map_clip_range(map, start, end, &count,
2712                                         MAP_CLIP_NO_HOLES);
2713         if (start_entry == NULL) {
2714                 vm_map_unlock(map);
2715                 rv = KERN_INVALID_ADDRESS;
2716                 goto failure;
2717         }
2718         if ((kmflags & KM_PAGEABLE) == 0) {
2719                 /*
2720                  * Wiring.
2721                  *
2722                  * 1.  Holding the write lock, we create any shadow or zero-fill
2723                  * objects that need to be created. Then we clip each map
2724                  * entry to the region to be wired and increment its wiring
2725                  * count.  We create objects before clipping the map entries
2726                  * to avoid object proliferation.
2727                  *
2728                  * 2.  We downgrade to a read lock, and call vm_fault_wire to
2729                  * fault in the pages for any newly wired area (wired_count is
2730                  * 1).
2731                  *
2732                  * Downgrading to a read lock for vm_fault_wire avoids a
2733                  * possible deadlock with another process that may have faulted
2734                  * on one of the pages to be wired (it would mark the page busy,
2735                  * blocking us, then in turn block on the map lock that we
2736                  * hold).  Because of problems in the recursive lock package,
2737                  * we cannot upgrade to a write lock in vm_map_lookup.  Thus,
2738                  * any actions that require the write lock must be done
2739                  * beforehand.  Because we keep the read lock on the map, the
2740                  * copy-on-write status of the entries we modify here cannot
2741                  * change.
2742                  */
2743                 entry = start_entry;
2744                 while (entry && entry->ba.start < end) {
2745                         /*
2746                          * Trivial case if the entry is already wired
2747                          */
2748                         if (entry->wired_count) {
2749                                 entry->wired_count++;
2750                                 entry = vm_map_rb_tree_RB_NEXT(entry);
2751                                 continue;
2752                         }
2753
2754                         /*
2755                          * The entry is being newly wired, we have to setup
2756                          * appropriate management structures.  A shadow
2757                          * object is required for a copy-on-write region,
2758                          * or a normal object for a zero-fill region.  We
2759                          * do not have to do this for entries that point to sub
2760                          * maps because we won't hold the lock on the sub map.
2761                          */
2762                         if (entry->maptype == VM_MAPTYPE_NORMAL) {
2763                                 int copyflag = entry->eflags &
2764                                                MAP_ENTRY_NEEDS_COPY;
2765                                 if (copyflag && ((entry->protection &
2766                                                   VM_PROT_WRITE) != 0)) {
2767                                         vm_map_entry_shadow(entry);
2768                                 } else if (entry->ba.object == NULL &&
2769                                            !map->system_map) {
2770                                         vm_map_entry_allocate_object(entry);
2771                                 }
2772                         }
2773                         entry->wired_count++;
2774                         entry = vm_map_rb_tree_RB_NEXT(entry);
2775                 }
2776
2777                 /*
2778                  * Pass 2.
2779                  */
2780
2781                 /*
2782                  * HACK HACK HACK HACK
2783                  *
2784                  * vm_fault_wire() temporarily unlocks the map to avoid
2785                  * deadlocks.  The in-transition flag from vm_map_clip_range
2786                  * call should protect us from changes while the map is
2787                  * unlocked.  T
2788                  *
2789                  * NOTE: Previously this comment stated that clipping might
2790                  *       still occur while the entry is unlocked, but from
2791                  *       what I can tell it actually cannot.
2792                  *
2793                  *       It is unclear whether the CLIP_CHECK_*() calls
2794                  *       are still needed but we keep them in anyway.
2795                  *
2796                  * HACK HACK HACK HACK
2797                  */
2798
2799                 entry = start_entry;
2800                 while (entry && entry->ba.start < end) {
2801                         /*
2802                          * If vm_fault_wire fails for any page we need to undo
2803                          * what has been done.  We decrement the wiring count
2804                          * for those pages which have not yet been wired (now)
2805                          * and unwire those that have (later).
2806                          */
2807                         vm_offset_t save_start = entry->ba.start;
2808                         vm_offset_t save_end = entry->ba.end;
2809
2810                         if (entry->wired_count == 1)
2811                                 rv = vm_fault_wire(map, entry, FALSE, kmflags);
2812                         if (rv) {
2813                                 CLIP_CHECK_BACK(entry, save_start);
2814                                 for (;;) {
2815                                         KASSERT(entry->wired_count == 1,
2816                                           ("wired_count changed unexpectedly"));
2817                                         entry->wired_count = 0;
2818                                         if (entry->ba.end == save_end)
2819                                                 break;
2820                                         entry = vm_map_rb_tree_RB_NEXT(entry);
2821                                         KASSERT(entry,
2822                                           ("bad entry clip during backout"));
2823                                 }
2824                                 end = save_start;
2825                                 break;
2826                         }
2827                         CLIP_CHECK_FWD(entry, save_end);
2828                         entry = vm_map_rb_tree_RB_NEXT(entry);
2829                 }
2830
2831                 /*
2832                  * If a failure occured undo everything by falling through
2833                  * to the unwiring code.  'end' has already been adjusted
2834                  * appropriately.
2835                  */
2836                 if (rv)
2837                         kmflags |= KM_PAGEABLE;
2838
2839                 /*
2840                  * start_entry is still IN_TRANSITION but may have been
2841                  * clipped since vm_fault_wire() unlocks and relocks the
2842                  * map.  No matter how clipped it has gotten there should
2843                  * be a fragment that is on our start boundary.
2844                  */
2845                 CLIP_CHECK_BACK(start_entry, start);
2846         }
2847
2848         if (kmflags & KM_PAGEABLE) {
2849                 /*
2850                  * This is the unwiring case.  We must first ensure that the
2851                  * range to be unwired is really wired down.  We know there
2852                  * are no holes.
2853                  */
2854                 entry = start_entry;
2855                 while (entry && entry->ba.start < end) {
2856                         if (entry->wired_count == 0) {
2857                                 rv = KERN_INVALID_ARGUMENT;
2858                                 goto done;
2859                         }
2860                         entry = vm_map_rb_tree_RB_NEXT(entry);
2861                 }
2862
2863                 /*
2864                  * Now decrement the wiring count for each region. If a region
2865                  * becomes completely unwired, unwire its physical pages and
2866                  * mappings.
2867                  */
2868                 entry = start_entry;
2869                 while (entry && entry->ba.start < end) {
2870                         entry->wired_count--;
2871                         if (entry->wired_count == 0)
2872                                 vm_fault_unwire(map, entry);
2873                         entry = vm_map_rb_tree_RB_NEXT(entry);
2874                 }
2875         }
2876 done:
2877         vm_map_unclip_range(map, start_entry, start, real_end,
2878                             &count, MAP_CLIP_NO_HOLES);
2879         vm_map_unlock(map);
2880 failure:
2881         if (kmflags & KM_KRESERVE)
2882                 vm_map_entry_krelease(count);
2883         else
2884                 vm_map_entry_release(count);
2885         return (rv);
2886 }
2887
2888 /*
2889  * Mark a newly allocated address range as wired but do not fault in
2890  * the pages.  The caller is expected to load the pages into the object.
2891  *
2892  * The map must be locked on entry and will remain locked on return.
2893  * No other requirements.
2894  */
2895 void
2896 vm_map_set_wired_quick(vm_map_t map, vm_offset_t addr, vm_size_t size,
2897                        int *countp)
2898 {
2899         vm_map_entry_t scan;
2900         vm_map_entry_t entry;
2901
2902         entry = vm_map_clip_range(map, addr, addr + size,
2903                                   countp, MAP_CLIP_NO_HOLES);
2904         scan = entry;
2905         while (scan && scan->ba.start < addr + size) {
2906                 KKASSERT(scan->wired_count == 0);
2907                 scan->wired_count = 1;
2908                 scan = vm_map_rb_tree_RB_NEXT(scan);
2909         }
2910         vm_map_unclip_range(map, entry, addr, addr + size,
2911                             countp, MAP_CLIP_NO_HOLES);
2912 }
2913
2914 /*
2915  * Push any dirty cached pages in the address range to their pager.
2916  * If syncio is TRUE, dirty pages are written synchronously.
2917  * If invalidate is TRUE, any cached pages are freed as well.
2918  *
2919  * This routine is called by sys_msync()
2920  *
2921  * Returns an error if any part of the specified range is not mapped.
2922  *
2923  * No requirements.
2924  */
2925 int
2926 vm_map_clean(vm_map_t map, vm_offset_t start, vm_offset_t end,
2927              boolean_t syncio, boolean_t invalidate)
2928 {
2929         vm_map_entry_t current;
2930         vm_map_entry_t next;
2931         vm_map_entry_t entry;
2932         vm_map_backing_t ba;
2933         vm_size_t size;
2934         vm_object_t object;
2935         vm_ooffset_t offset;
2936
2937         vm_map_lock_read(map);
2938         VM_MAP_RANGE_CHECK(map, start, end);
2939         if (!vm_map_lookup_entry(map, start, &entry)) {
2940                 vm_map_unlock_read(map);
2941                 return (KERN_INVALID_ADDRESS);
2942         }
2943         lwkt_gettoken(&map->token);
2944
2945         /*
2946          * Make a first pass to check for holes.
2947          */
2948         current = entry;
2949         while (current && current->ba.start < end) {
2950                 if (current->maptype == VM_MAPTYPE_SUBMAP) {
2951                         lwkt_reltoken(&map->token);
2952                         vm_map_unlock_read(map);
2953                         return (KERN_INVALID_ARGUMENT);
2954                 }
2955                 next = vm_map_rb_tree_RB_NEXT(current);
2956                 if (end > current->ba.end &&
2957                     (next == NULL ||
2958                      current->ba.end != next->ba.start)) {
2959                         lwkt_reltoken(&map->token);
2960                         vm_map_unlock_read(map);
2961                         return (KERN_INVALID_ADDRESS);
2962                 }
2963                 current = next;
2964         }
2965
2966         if (invalidate)
2967                 pmap_remove(vm_map_pmap(map), start, end);
2968
2969         /*
2970          * Make a second pass, cleaning/uncaching pages from the indicated
2971          * objects as we go.
2972          */
2973         current = entry;
2974         while (current && current->ba.start < end) {
2975                 offset = current->ba.offset + (start - current->ba.start);
2976                 size = (end <= current->ba.end ? end : current->ba.end) - start;
2977
2978                 switch(current->maptype) {
2979                 case VM_MAPTYPE_SUBMAP:
2980                 {
2981                         vm_map_t smap;
2982                         vm_map_entry_t tentry;
2983                         vm_size_t tsize;
2984
2985                         smap = current->ba.sub_map;
2986                         vm_map_lock_read(smap);
2987                         vm_map_lookup_entry(smap, offset, &tentry);
2988                         if (tentry == NULL) {
2989                                 tsize = vm_map_max(smap) - offset;
2990                                 ba = NULL;
2991                                 offset = 0 + (offset - vm_map_min(smap));
2992                         } else {
2993                                 tsize = tentry->ba.end - offset;
2994                                 ba = &tentry->ba;
2995                                 offset = tentry->ba.offset +
2996                                          (offset - tentry->ba.start);
2997                         }
2998                         vm_map_unlock_read(smap);
2999                         if (tsize < size)
3000                                 size = tsize;
3001                         break;
3002                 }
3003                 case VM_MAPTYPE_NORMAL:
3004                         ba = &current->ba;
3005                         break;
3006                 default:
3007                         ba = NULL;
3008                         break;
3009                 }
3010                 if (ba) {
3011                         object = ba->object;
3012                         if (object)
3013                                 vm_object_hold(object);
3014                 } else {
3015                         object = NULL;
3016                 }
3017
3018                 /*
3019                  * Note that there is absolutely no sense in writing out
3020                  * anonymous objects, so we track down the vnode object
3021                  * to write out.
3022                  * We invalidate (remove) all pages from the address space
3023                  * anyway, for semantic correctness.
3024                  *
3025                  * note: certain anonymous maps, such as MAP_NOSYNC maps,
3026                  * may start out with a NULL object.
3027                  *
3028                  * XXX do we really want to stop at the first backing store
3029                  * here if there are more? XXX
3030                  */
3031                 if (ba) {
3032                         vm_object_t tobj;
3033
3034                         tobj = object;
3035                         while (ba->backing_ba != NULL) {
3036                                 offset -= ba->offset;
3037                                 ba = ba->backing_ba;
3038                                 offset += ba->offset;
3039                                 tobj = ba->object;
3040                                 if (tobj->size < OFF_TO_IDX(offset + size))
3041                                         size = IDX_TO_OFF(tobj->size) - offset;
3042                                 break; /* XXX this break is not correct */
3043                         }
3044                         if (object != tobj) {
3045                                 if (object)
3046                                         vm_object_drop(object);
3047                                 object = tobj;
3048                                 vm_object_hold(object);
3049                         }
3050                 }
3051
3052                 if (object && (object->type == OBJT_VNODE) &&
3053                     (current->protection & VM_PROT_WRITE) &&
3054                     (object->flags & OBJ_NOMSYNC) == 0) {
3055                         /*
3056                          * Flush pages if writing is allowed, invalidate them
3057                          * if invalidation requested.  Pages undergoing I/O
3058                          * will be ignored by vm_object_page_remove().
3059                          *
3060                          * We cannot lock the vnode and then wait for paging
3061                          * to complete without deadlocking against vm_fault.
3062                          * Instead we simply call vm_object_page_remove() and
3063                          * allow it to block internally on a page-by-page
3064                          * basis when it encounters pages undergoing async
3065                          * I/O.
3066                          */
3067                         int flags;
3068
3069                         /* no chain wait needed for vnode objects */
3070                         vm_object_reference_locked(object);
3071                         vn_lock(object->handle, LK_EXCLUSIVE | LK_RETRY);
3072                         flags = (syncio || invalidate) ? OBJPC_SYNC : 0;
3073                         flags |= invalidate ? OBJPC_INVAL : 0;
3074
3075                         if (current->maptype == VM_MAPTYPE_NORMAL) {
3076                                 vm_object_page_clean(object,
3077                                     OFF_TO_IDX(offset),
3078                                     OFF_TO_IDX(offset + size + PAGE_MASK),
3079                                     flags);
3080                         }
3081                         vn_unlock(((struct vnode *)object->handle));
3082                         vm_object_deallocate_locked(object);
3083                 }
3084                 if (object && invalidate &&
3085                    ((object->type == OBJT_VNODE) ||
3086                     (object->type == OBJT_DEVICE) ||
3087                     (object->type == OBJT_MGTDEVICE))) {
3088                         int clean_only =
3089                                 ((object->type == OBJT_DEVICE) ||
3090                                 (object->type == OBJT_MGTDEVICE)) ? FALSE : TRUE;
3091                         /* no chain wait needed for vnode/device objects */
3092                         vm_object_reference_locked(object);
3093                         if (current->maptype == VM_MAPTYPE_NORMAL) {
3094                                 vm_object_page_remove(object,
3095                                     OFF_TO_IDX(offset),
3096                                     OFF_TO_IDX(offset + size + PAGE_MASK),
3097                                     clean_only);
3098                         }
3099                         vm_object_deallocate_locked(object);
3100                 }
3101                 start += size;
3102                 if (object)
3103                         vm_object_drop(object);
3104                 current = vm_map_rb_tree_RB_NEXT(current);
3105         }
3106
3107         lwkt_reltoken(&map->token);
3108         vm_map_unlock_read(map);
3109
3110         return (KERN_SUCCESS);
3111 }
3112
3113 /*
3114  * Make the region specified by this entry pageable.
3115  *
3116  * The vm_map must be exclusively locked.
3117  */
3118 static void
3119 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
3120 {
3121         entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3122         entry->wired_count = 0;
3123         vm_fault_unwire(map, entry);
3124 }
3125
3126 /*
3127  * Deallocate the given entry from the target map.
3128  *
3129  * The vm_map must be exclusively locked.
3130  */
3131 static void
3132 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry, int *countp)
3133 {
3134         vm_map_entry_unlink(map, entry);
3135         map->size -= entry->ba.end - entry->ba.start;
3136         vm_map_entry_dispose(map, entry, countp);
3137 }
3138
3139 /*
3140  * Deallocates the given address range from the target map.
3141  *
3142  * The vm_map must be exclusively locked.
3143  */
3144 int
3145 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end, int *countp)
3146 {
3147         vm_object_t object;
3148         vm_map_entry_t entry;
3149         vm_map_entry_t first_entry;
3150         vm_offset_t hole_start;
3151
3152         ASSERT_VM_MAP_LOCKED(map);
3153         lwkt_gettoken(&map->token);
3154 again:
3155         /*
3156          * Find the start of the region, and clip it.  Set entry to point
3157          * at the first record containing the requested address or, if no
3158          * such record exists, the next record with a greater address.  The
3159          * loop will run from this point until a record beyond the termination
3160          * address is encountered.
3161          *
3162          * Adjust freehint[] for either the clip case or the extension case.
3163          *
3164          * GGG see other GGG comment.
3165          */
3166         if (vm_map_lookup_entry(map, start, &first_entry)) {
3167                 entry = first_entry;
3168                 vm_map_clip_start(map, entry, start, countp);
3169                 hole_start = start;
3170         } else {
3171                 if (first_entry) {
3172                         entry = vm_map_rb_tree_RB_NEXT(first_entry);
3173                         if (entry == NULL)
3174                                 hole_start = first_entry->ba.start;
3175                         else
3176                                 hole_start = first_entry->ba.end;
3177                 } else {
3178                         entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
3179                         if (entry == NULL)
3180                                 hole_start = vm_map_min(map);
3181                         else
3182                                 hole_start = vm_map_max(map);
3183                 }
3184         }
3185
3186         /*
3187          * Step through all entries in this region
3188          */
3189         while (entry && entry->ba.start < end) {
3190                 vm_map_entry_t next;
3191                 vm_offset_t s, e;
3192                 vm_pindex_t offidxstart, offidxend, count;
3193
3194                 /*
3195                  * If we hit an in-transition entry we have to sleep and
3196                  * retry.  It's easier (and not really slower) to just retry
3197                  * since this case occurs so rarely and the hint is already
3198                  * pointing at the right place.  We have to reset the
3199                  * start offset so as not to accidently delete an entry
3200                  * another process just created in vacated space.
3201                  */
3202                 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
3203                         entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
3204                         start = entry->ba.start;
3205                         ++mycpu->gd_cnt.v_intrans_coll;
3206                         ++mycpu->gd_cnt.v_intrans_wait;
3207                         vm_map_transition_wait(map, 1);
3208                         goto again;
3209                 }
3210                 vm_map_clip_end(map, entry, end, countp);
3211
3212                 s = entry->ba.start;
3213                 e = entry->ba.end;
3214                 next = vm_map_rb_tree_RB_NEXT(entry);
3215
3216                 offidxstart = OFF_TO_IDX(entry->ba.offset);
3217                 count = OFF_TO_IDX(e - s);
3218
3219                 switch(entry->maptype) {
3220                 case VM_MAPTYPE_NORMAL:
3221                 case VM_MAPTYPE_SUBMAP:
3222                         object = entry->ba.object;
3223                         break;
3224                 default:
3225                         object = NULL;
3226                         break;
3227                 }
3228
3229                 /*
3230                  * Unwire before removing addresses from the pmap; otherwise,
3231                  * unwiring will put the entries back in the pmap.
3232                  *
3233                  * Generally speaking, doing a bulk pmap_remove() before
3234                  * removing the pages from the VM object is better at
3235                  * reducing unnecessary IPIs.  The pmap code is now optimized
3236                  * to not blindly iterate the range when pt and pd pages
3237                  * are missing.
3238                  */
3239                 if (entry->wired_count != 0)
3240                         vm_map_entry_unwire(map, entry);
3241
3242                 offidxend = offidxstart + count;
3243
3244                 if (object == kernel_object) {
3245                         pmap_remove(map->pmap, s, e);
3246                         vm_object_hold(object);
3247                         vm_object_page_remove(object, offidxstart,
3248                                               offidxend, FALSE);
3249                         vm_object_drop(object);
3250                 } else if (object && object->type != OBJT_DEFAULT &&
3251                            object->type != OBJT_SWAP) {
3252                         /*
3253                          * vnode object routines cannot be chain-locked,
3254                          * but since we aren't removing pages from the
3255                          * object here we can use a shared hold.
3256                          */
3257                         vm_object_hold_shared(object);
3258                         pmap_remove(map->pmap, s, e);
3259                         vm_object_drop(object);
3260                 } else if (object) {
3261                         vm_object_hold(object);
3262                         pmap_remove(map->pmap, s, e);
3263
3264                         if (object != NULL &&
3265                             object->ref_count != 1 &&
3266                             (object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) ==
3267                              OBJ_ONEMAPPING &&
3268                             (object->type == OBJT_DEFAULT ||
3269                              object->type == OBJT_SWAP)) {
3270                                 /*
3271                                  * When ONEMAPPING is set we can destroy the
3272                                  * pages underlying the entry's range.
3273                                  */
3274                                 vm_object_page_remove(object, offidxstart,
3275                                                       offidxend, FALSE);
3276                                 if (object->type == OBJT_SWAP) {
3277                                         swap_pager_freespace(object,
3278                                                              offidxstart,
3279                                                              count);
3280                                 }
3281                                 if (offidxend >= object->size &&
3282                                     offidxstart < object->size) {
3283                                         object->size = offidxstart;
3284                                 }
3285                         }
3286                         vm_object_drop(object);
3287                 } else if (entry->maptype == VM_MAPTYPE_UKSMAP) {
3288                         pmap_remove(map->pmap, s, e);
3289                 }
3290
3291                 /*
3292                  * Delete the entry (which may delete the object) only after
3293                  * removing all pmap entries pointing to its pages.
3294                  * (Otherwise, its page frames may be reallocated, and any
3295                  * modify bits will be set in the wrong object!)
3296                  */
3297                 vm_map_entry_delete(map, entry, countp);
3298                 entry = next;
3299         }
3300
3301         /*
3302          * We either reached the end and use vm_map_max as the end
3303          * address, or we didn't and we use the next entry as the
3304          * end address.
3305          */
3306         if (entry == NULL) {
3307                 vm_map_freehint_hole(map, hole_start,
3308                                      vm_map_max(map) - hole_start);
3309         } else {
3310                 vm_map_freehint_hole(map, hole_start,
3311                                      entry->ba.start - hole_start);
3312         }
3313
3314         lwkt_reltoken(&map->token);
3315
3316         return (KERN_SUCCESS);
3317 }
3318
3319 /*
3320  * Remove the given address range from the target map.
3321  * This is the exported form of vm_map_delete.
3322  *
3323  * No requirements.
3324  */
3325 int
3326 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
3327 {
3328         int result;
3329         int count;
3330
3331         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3332         vm_map_lock(map);
3333         VM_MAP_RANGE_CHECK(map, start, end);
3334         result = vm_map_delete(map, start, end, &count);
3335         vm_map_unlock(map);
3336         vm_map_entry_release(count);
3337
3338         return (result);
3339 }
3340
3341 /*
3342  * Assert that the target map allows the specified privilege on the
3343  * entire address region given.  The entire region must be allocated.
3344  *
3345  * The caller must specify whether the vm_map is already locked or not.
3346  */
3347 boolean_t
3348 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
3349                         vm_prot_t protection, boolean_t have_lock)
3350 {
3351         vm_map_entry_t entry;
3352         vm_map_entry_t tmp_entry;
3353         boolean_t result;
3354
3355         if (have_lock == FALSE)
3356                 vm_map_lock_read(map);
3357
3358         if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
3359                 if (have_lock == FALSE)
3360                         vm_map_unlock_read(map);
3361                 return (FALSE);
3362         }
3363         entry = tmp_entry;
3364
3365         result = TRUE;
3366         while (start < end) {
3367                 if (entry == NULL) {
3368                         result = FALSE;
3369                         break;
3370                 }
3371
3372                 /*
3373                  * No holes allowed!
3374                  */
3375
3376                 if (start < entry->ba.start) {
3377                         result = FALSE;
3378                         break;
3379                 }
3380                 /*
3381                  * Check protection associated with entry.
3382                  */
3383
3384                 if ((entry->protection & protection) != protection) {
3385                         result = FALSE;
3386                         break;
3387                 }
3388                 /* go to next entry */
3389                 start = entry->ba.end;
3390                 entry = vm_map_rb_tree_RB_NEXT(entry);
3391         }
3392         if (have_lock == FALSE)
3393                 vm_map_unlock_read(map);
3394         return (result);
3395 }
3396
3397 /*
3398  * vm_map_backing structures are not shared across forks and must be
3399  * replicated.
3400  *
3401  * Generally speaking we must reallocate the backing_ba sequence and
3402  * also adjust it for any changes made to the base entry->ba.start and
3403  * entry->ba.end.  The first ba in the chain is of course &entry->ba,
3404  * so we only need to adjust subsequent ba's start, end, and offset.
3405  *
3406  * MAP_BACK_CLIPPED     - Called as part of a clipping replication.
3407  *                        Do not clear OBJ_ONEMAPPING.
3408  *
3409  * MAP_BACK_BASEOBJREFD - Called from vm_map_insert().  The base object
3410  *                        has already been referenced.
3411  */
3412 static
3413 void
3414 vm_map_backing_replicated(vm_map_t map, vm_map_entry_t entry, int flags)
3415 {
3416         vm_map_backing_t ba;
3417         vm_map_backing_t nba;
3418         vm_object_t object;
3419
3420         ba = &entry->ba;
3421         for (;;) {
3422                 ba->pmap = map->pmap;
3423
3424                 if (ba->map_object) {
3425                         switch(entry->maptype) {
3426                         case VM_MAPTYPE_NORMAL:
3427                                 object = ba->object;
3428                                 if (ba != &entry->ba ||
3429                                     (flags & MAP_BACK_BASEOBJREFD) == 0) {
3430                                         vm_object_reference_quick(object);
3431                                 }
3432                                 vm_map_backing_attach(entry, ba);
3433                                 if ((flags & MAP_BACK_CLIPPED) == 0 &&
3434                                     object->ref_count > 1) {
3435                                         vm_object_clear_flag(object,
3436                                                              OBJ_ONEMAPPING);
3437                                 }
3438                                 break;
3439                         case VM_MAPTYPE_UKSMAP:
3440                                 vm_map_backing_attach(entry, ba);
3441                                 break;
3442                         default:
3443                                 break;
3444                         }
3445                 }
3446                 if (ba->backing_ba == NULL)
3447                         break;
3448
3449                 /*
3450                  * NOTE: The aux_info field is retained.
3451                  */
3452                 nba = kmalloc(sizeof(*nba), M_MAP_BACKING, M_INTWAIT);
3453                 *nba = *ba->backing_ba;
3454                 nba->offset += (ba->start - nba->start);  /* += (new - old) */
3455                 nba->start = ba->start;
3456                 nba->end = ba->end;
3457                 ba->backing_ba = nba;
3458                 ba = nba;
3459                 /* pmap is replaced at the top of the loop */
3460         }
3461 }
3462
3463 static
3464 void
3465 vm_map_backing_adjust_start(vm_map_entry_t entry, vm_ooffset_t start)
3466 {
3467         vm_map_backing_t ba;
3468
3469         if (entry->maptype == VM_MAPTYPE_NORMAL) {
3470                 for (ba = &entry->ba; ba; ba = ba->backing_ba) {
3471                         if (ba->object) {
3472                                 lockmgr(&ba->object->backing_lk, LK_EXCLUSIVE);
3473                                 ba->offset += (start - ba->start);
3474                                 ba->start = start;
3475                                 lockmgr(&ba->object->backing_lk, LK_RELEASE);
3476                         } else {
3477                                 ba->offset += (start - ba->start);
3478                                 ba->start = start;
3479                         }
3480                 }
3481         } else {
3482                 /* not an object and can't be shadowed */
3483         }
3484 }
3485
3486 static
3487 void
3488 vm_map_backing_adjust_end(vm_map_entry_t entry, vm_ooffset_t end)
3489 {
3490         vm_map_backing_t ba;
3491
3492         if (entry->maptype == VM_MAPTYPE_NORMAL) {
3493                 for (ba = &entry->ba; ba; ba = ba->backing_ba) {
3494                         if (ba->object) {
3495                                 lockmgr(&ba->object->backing_lk, LK_EXCLUSIVE);
3496                                 ba->end = end;
3497                                 lockmgr(&ba->object->backing_lk, LK_RELEASE);
3498                         } else {
3499                                 ba->end = end;
3500                         }
3501                 }
3502         } /* else not an object and/or can't be shadowed */
3503 }
3504
3505 /*
3506  * Handles the dirty work of making src_entry and dst_entry copy-on-write
3507  * after src_entry has been cloned to dst_entry.  For normal entries only.
3508  *
3509  * The vm_maps must be exclusively locked.
3510  * The vm_map's token must be held.
3511  *
3512  * Because the maps are locked no faults can be in progress during the
3513  * operation.
3514  */
3515 static void
3516 vm_map_copy_entry(vm_map_t src_map, vm_map_t dst_map,
3517                   vm_map_entry_t src_entry, vm_map_entry_t dst_entry)
3518 {
3519         vm_object_t obj;
3520
3521         KKASSERT(dst_entry->maptype == VM_MAPTYPE_NORMAL);
3522
3523         if (src_entry->wired_count) {
3524                 /*
3525                  * Of course, wired down pages can't be set copy-on-write.
3526                  * Cause wired pages to be copied into the new map by
3527                  * simulating faults (the new pages are pageable)
3528                  *
3529                  * Scrap ba.object (its ref-count has not yet been adjusted
3530                  * so we can just NULL out the field).  Remove the backing
3531                  * store.
3532                  *
3533                  * Then call vm_fault_copy_entry() to create a new object
3534                  * in dst_entry and copy the wired pages from src to dst.
3535                  *
3536                  * The fault-copy code doesn't work with virtual page
3537                  * tables.
3538                  *
3539                  * NOTE: obj is not actually an object for all MAPTYPEs,
3540                  *       just test against NULL.
3541                  */
3542                 if (dst_entry->ba.map_object != NULL) {
3543                         vm_map_backing_detach(dst_entry, &dst_entry->ba);
3544                         dst_entry->ba.map_object = NULL;
3545                         vm_map_entry_dispose_ba(dst_entry,
3546                                                 dst_entry->ba.backing_ba);
3547                         dst_entry->ba.backing_ba = NULL;
3548                         dst_entry->ba.backing_count = 0;
3549                 }
3550                 vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry);
3551         } else {
3552                 if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
3553                         /*
3554                          * If the source entry is not already marked NEEDS_COPY
3555                          * we need to write-protect the PTEs.
3556                          */
3557                         pmap_protect(src_map->pmap,
3558                                      src_entry->ba.start,
3559                                      src_entry->ba.end,
3560                                      src_entry->protection & ~VM_PROT_WRITE);
3561                 }
3562
3563                 /*
3564                  * dst_entry.ba_object might be stale.  Update it (its
3565                  * ref-count has not yet been updated so just overwrite
3566                  * the field).
3567                  *
3568                  * If there is no object then we are golden.  Also, in
3569                  * this situation if there are no backing_ba linkages then
3570                  * we can set ba.offset to whatever we want.  For now we
3571                  * set the offset for 0 for make debugging object sizes
3572                  * easier.
3573                  */
3574                 obj = src_entry->ba.object;
3575
3576                 if (obj) {
3577                         src_entry->eflags |= (MAP_ENTRY_COW |
3578                                               MAP_ENTRY_NEEDS_COPY);
3579                         dst_entry->eflags |= (MAP_ENTRY_COW |
3580                                               MAP_ENTRY_NEEDS_COPY);
3581                         KKASSERT(dst_entry->ba.offset == src_entry->ba.offset);
3582                 } else {
3583                         dst_entry->ba.offset = 0;
3584                 }
3585
3586                 /*
3587                  * Normal, allow the backing_ba link depth to
3588                  * increase.
3589                  */
3590                 pmap_copy(dst_map->pmap, src_map->pmap,
3591                           dst_entry->ba.start,
3592                           dst_entry->ba.end - dst_entry->ba.start,
3593                           src_entry->ba.start);
3594         }
3595 }
3596
3597 /*
3598  * Create a vmspace for a new process and its related vm_map based on an
3599  * existing vmspace.  The new map inherits information from the old map
3600  * according to inheritance settings.
3601  *
3602  * The source map must not be locked.
3603  * No requirements.
3604  */
3605 static void vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map,
3606                           vm_map_entry_t old_entry, int *countp);
3607 static void vmspace_fork_uksmap_entry(struct proc *p2, struct lwp *lp2,
3608                           vm_map_t old_map, vm_map_t new_map,
3609                           vm_map_entry_t old_entry, int *countp);
3610
3611 struct vmspace *
3612 vmspace_fork(struct vmspace *vm1, struct proc *p2, struct lwp *lp2)
3613 {
3614         struct vmspace *vm2;
3615         vm_map_t old_map = &vm1->vm_map;
3616         vm_map_t new_map;
3617         vm_map_entry_t old_entry;
3618         int count;
3619
3620         lwkt_gettoken(&vm1->vm_map.token);
3621         vm_map_lock(old_map);
3622
3623         vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map));
3624         lwkt_gettoken(&vm2->vm_map.token);
3625
3626         /*
3627          * We must bump the timestamp to force any concurrent fault
3628          * to retry.
3629          */
3630         bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy,
3631               (caddr_t)&vm1->vm_endcopy - (caddr_t)&vm1->vm_startcopy);
3632         new_map = &vm2->vm_map; /* XXX */
3633         new_map->timestamp = 1;
3634
3635         vm_map_lock(new_map);
3636
3637         count = old_map->nentries;
3638         count = vm_map_entry_reserve(count + MAP_RESERVE_COUNT);
3639
3640         RB_FOREACH(old_entry, vm_map_rb_tree, &old_map->rb_root) {
3641                 switch(old_entry->maptype) {
3642                 case VM_MAPTYPE_SUBMAP:
3643                         panic("vm_map_fork: encountered a submap");
3644                         break;
3645                 case VM_MAPTYPE_UKSMAP:
3646                         vmspace_fork_uksmap_entry(p2, lp2,
3647                                                   old_map, new_map,
3648                                                   old_entry, &count);
3649                         break;
3650                 case VM_MAPTYPE_NORMAL:
3651                         vmspace_fork_normal_entry(old_map, new_map,
3652                                                   old_entry, &count);
3653                         break;
3654                 default:
3655                         /* nothing to do */
3656                         break;
3657                 }
3658         }
3659
3660         new_map->size = old_map->size;
3661         vm_map_unlock(new_map);
3662         vm_map_unlock(old_map);
3663         vm_map_entry_release(count);
3664
3665         lwkt_reltoken(&vm2->vm_map.token);
3666         lwkt_reltoken(&vm1->vm_map.token);
3667
3668         return (vm2);
3669 }
3670
3671 static
3672 void
3673 vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map,
3674                           vm_map_entry_t old_entry, int *countp)
3675 {
3676         vm_map_entry_t new_entry;
3677         vm_map_backing_t ba;
3678         vm_object_t object;
3679
3680         /*
3681          * If the backing_ba link list gets too long then fault it
3682          * all into the head object and dispose of the list.  We do
3683          * this in old_entry prior to cloning in order to benefit both
3684          * parent and child.
3685          *
3686          * We can test our fronting object's size against its
3687          * resident_page_count for a really cheap (but probably not perfect)
3688          * all-shadowed test, allowing us to disconnect the backing_ba
3689          * link list early.
3690          */
3691         object = old_entry->ba.object;
3692         if (old_entry->ba.backing_ba &&
3693             (old_entry->ba.backing_count >= vm_map_backing_limit ||
3694              (vm_map_backing_shadow_test && object &&
3695               object->size == object->resident_page_count))) {
3696                 /*
3697                  * If there are too many backing_ba linkages we
3698                  * collapse everything into the head
3699                  *
3700                  * This will also remove all the pte's.
3701                  */
3702                 if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY)
3703                         vm_map_entry_shadow(old_entry);
3704                 if (object == NULL)
3705                         vm_map_entry_allocate_object(old_entry);
3706                 if (vm_fault_collapse(old_map, old_entry) == KERN_SUCCESS) {
3707                         ba = old_entry->ba.backing_ba;
3708                         old_entry->ba.backing_ba = NULL;
3709                         old_entry->ba.backing_count = 0;
3710                         vm_map_entry_dispose_ba(old_entry, ba);
3711                 }
3712         }
3713         object = NULL;  /* object variable is now invalid */
3714
3715         /*
3716          * Fork the entry
3717          */
3718         switch (old_entry->inheritance) {
3719         case VM_INHERIT_NONE:
3720                 break;
3721         case VM_INHERIT_SHARE:
3722                 /*
3723                  * Clone the entry as a shared entry.  This will look like
3724                  * shared memory across the old and the new process.  We must
3725                  * ensure that the object is allocated.
3726                  */
3727                 if (old_entry->ba.object == NULL)
3728                         vm_map_entry_allocate_object(old_entry);
3729
3730                 if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
3731                         /*
3732                          * Create the fronting vm_map_backing for
3733                          * an entry which needs a copy, plus an extra
3734                          * ref because we are going to duplicate it
3735                          * in the fork.
3736                          *
3737                          * The call to vm_map_entry_shadow() will also clear
3738                          * OBJ_ONEMAPPING.
3739                          *
3740                          * XXX no more collapse.  Still need extra ref
3741                          * for the fork.
3742                          */
3743                         vm_map_entry_shadow(old_entry);
3744                 } else if (old_entry->ba.object) {
3745                         object = old_entry->ba.object;
3746                 }
3747
3748                 /*
3749                  * Clone the entry.  We've already bumped the ref on
3750                  * the vm_object for our new entry.
3751                  */
3752                 new_entry = vm_map_entry_create(countp);
3753                 *new_entry = *old_entry;
3754
3755                 new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3756                 new_entry->wired_count = 0;
3757
3758                 /*
3759                  * Replicate and index the vm_map_backing.  Don't share
3760                  * the vm_map_backing across vm_map's (only across clips).
3761                  *
3762                  * Insert the entry into the new map -- we know we're
3763                  * inserting at the end of the new map.
3764                  */
3765                 vm_map_backing_replicated(new_map, new_entry, 0);
3766                 vm_map_entry_link(new_map, new_entry);
3767
3768                 /*
3769                  * Update the physical map
3770                  */
3771                 pmap_copy(new_map->pmap, old_map->pmap,
3772                           new_entry->ba.start,
3773                           (old_entry->ba.end - old_entry->ba.start),
3774                           old_entry->ba.start);
3775                 break;
3776         case VM_INHERIT_COPY:
3777                 /*
3778                  * Clone the entry and link the copy into the new map.
3779                  *
3780                  * Note that ref-counting adjustment for old_entry->ba.object
3781                  * (if it isn't a special map that is) is handled by
3782                  * vm_map_copy_entry().
3783                  */
3784                 new_entry = vm_map_entry_create(countp);
3785                 *new_entry = *old_entry;
3786
3787                 new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3788                 new_entry->wired_count = 0;
3789
3790                 vm_map_backing_replicated(new_map, new_entry, 0);
3791                 vm_map_entry_link(new_map, new_entry);
3792
3793                 /*
3794                  * This does the actual dirty work of making both entries
3795                  * copy-on-write, and will also handle the fronting object.
3796                  */
3797                 vm_map_copy_entry(old_map, new_map, old_entry, new_entry);
3798                 break;
3799         }
3800 }
3801
3802 /*
3803  * When forking user-kernel shared maps, the map might change in the
3804  * child so do not try to copy the underlying pmap entries.
3805  */
3806 static
3807 void
3808 vmspace_fork_uksmap_entry(struct proc *p2, struct lwp *lp2,
3809                           vm_map_t old_map, vm_map_t new_map,
3810                           vm_map_entry_t old_entry, int *countp)
3811 {
3812         vm_map_entry_t new_entry;
3813
3814         /*
3815          * Do not fork lpmap entries whos TIDs do not match lp2's tid.
3816          *
3817          * XXX if p2 is NULL and lp2 is non-NULL, we retain the lpmap entry
3818          * (this is for e.g. resident'ing vmspace's) but set the field
3819          * to NULL.  Upon restore it should be restored. XXX NOT IMPL YET
3820          */
3821         if (old_entry->aux.dev) {
3822                 switch(minor(old_entry->aux.dev)) {
3823                 case 5:
3824                         break;
3825                 case 6:
3826                         break;
3827                 case 7:
3828                         if (lp2 == NULL)
3829                                 return;
3830                         if (old_entry->ba.aux_info == NULL)
3831                                 return;
3832                         if (((struct lwp *)old_entry->ba.aux_info)->lwp_tid !=
3833                             lp2->lwp_tid)
3834                                 return;
3835                         break;
3836                 }
3837         }
3838
3839         new_entry = vm_map_entry_create(countp);
3840         *new_entry = *old_entry;
3841
3842         new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3843         new_entry->wired_count = 0;
3844         KKASSERT(new_entry->ba.backing_ba == NULL);
3845
3846         if (new_entry->aux.dev) {
3847                 switch(minor(new_entry->aux.dev)) {
3848                 case 5:
3849                         /*
3850                          * upmap
3851                          */
3852                         new_entry->ba.aux_info = p2;
3853                         break;
3854                 case 6:
3855                         /*
3856                          * kpmap
3857                          */
3858                         new_entry->ba.aux_info = NULL;
3859                         break;
3860                 case 7:
3861                         /*
3862                          * lpmap
3863                          */
3864                         new_entry->ba.aux_info = lp2;
3865                         break;
3866                 }
3867         } else {
3868                 new_entry->ba.aux_info = NULL;
3869         }
3870
3871         vm_map_backing_replicated(new_map, new_entry, 0);
3872
3873         vm_map_entry_link(new_map, new_entry);
3874 }
3875
3876 /*
3877  * Create an auto-grow stack entry
3878  *
3879  * No requirements.
3880  */
3881 int
3882 vm_map_stack (vm_map_t map, vm_offset_t *addrbos, vm_size_t max_ssize,
3883               int flags, vm_prot_t prot, vm_prot_t max, int cow)
3884 {
3885         vm_map_entry_t  prev_entry;
3886         vm_map_entry_t  next;
3887         vm_size_t       init_ssize;
3888         int             rv;
3889         int             count;
3890         vm_offset_t     tmpaddr;
3891
3892         cow |= MAP_IS_STACK;
3893
3894         if (max_ssize < sgrowsiz)
3895                 init_ssize = max_ssize;
3896         else
3897                 init_ssize = sgrowsiz;
3898
3899         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3900         vm_map_lock(map);
3901
3902         /*
3903          * Find space for the mapping
3904          */
3905         if ((flags & (MAP_FIXED | MAP_TRYFIXED)) == 0) {
3906                 if (vm_map_findspace(map, *addrbos, max_ssize, 1,
3907                                      flags, &tmpaddr)) {
3908                         vm_map_unlock(map);
3909                         vm_map_entry_release(count);
3910                         return (KERN_NO_SPACE);
3911                 }
3912                 *addrbos = tmpaddr;
3913         }
3914
3915         /* If addr is already mapped, no go */
3916         if (vm_map_lookup_entry(map, *addrbos, &prev_entry)) {
3917                 vm_map_unlock(map);
3918                 vm_map_entry_release(count);
3919                 return (KERN_NO_SPACE);
3920         }
3921
3922 #if 0
3923         /* XXX already handled by kern_mmap() */
3924         /* If we would blow our VMEM resource limit, no go */
3925         if (map->size + init_ssize >
3926             curproc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
3927                 vm_map_unlock(map);
3928                 vm_map_entry_release(count);
3929                 return (KERN_NO_SPACE);
3930         }
3931 #endif
3932
3933         /*
3934          * If we can't accomodate max_ssize in the current mapping,
3935          * no go.  However, we need to be aware that subsequent user
3936          * mappings might map into the space we have reserved for
3937          * stack, and currently this space is not protected.
3938          *
3939          * Hopefully we will at least detect this condition
3940          * when we try to grow the stack.
3941          */
3942         if (prev_entry)
3943                 next = vm_map_rb_tree_RB_NEXT(prev_entry);
3944         else
3945                 next = RB_MIN(vm_map_rb_tree, &map->rb_root);
3946
3947         if (next && next->ba.start < *addrbos + max_ssize) {
3948                 vm_map_unlock(map);
3949                 vm_map_entry_release(count);
3950                 return (KERN_NO_SPACE);
3951         }
3952
3953         /*
3954          * We initially map a stack of only init_ssize.  We will
3955          * grow as needed later.  Since this is to be a grow
3956          * down stack, we map at the top of the range.
3957          *
3958          * Note: we would normally expect prot and max to be
3959          * VM_PROT_ALL, and cow to be 0.  Possibly we should
3960          * eliminate these as input parameters, and just
3961          * pass these values here in the insert call.
3962          */
3963         rv = vm_map_insert(map, &count,
3964                            NULL, NULL,
3965                            0, NULL,
3966                            *addrbos + max_ssize - init_ssize,
3967                            *addrbos + max_ssize,
3968                            VM_MAPTYPE_NORMAL,
3969                            VM_SUBSYS_STACK, prot, max, cow);
3970
3971         /* Now set the avail_ssize amount */
3972         if (rv == KERN_SUCCESS) {
3973                 if (prev_entry)
3974                         next = vm_map_rb_tree_RB_NEXT(prev_entry);
3975                 else
3976                         next = RB_MIN(vm_map_rb_tree, &map->rb_root);
3977                 if (prev_entry != NULL) {
3978                         vm_map_clip_end(map,
3979                                         prev_entry,
3980                                         *addrbos + max_ssize - init_ssize,
3981                                         &count);
3982                 }
3983                 if (next->ba.end   != *addrbos + max_ssize ||
3984                     next->ba.start != *addrbos + max_ssize - init_ssize){
3985                         panic ("Bad entry start/end for new stack entry");
3986                 } else {
3987                         next->aux.avail_ssize = max_ssize - init_ssize;
3988                 }
3989         }
3990
3991         vm_map_unlock(map);
3992         vm_map_entry_release(count);
3993         return (rv);
3994 }
3995
3996 /*
3997  * Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if the
3998  * desired address is already mapped, or if we successfully grow
3999  * the stack.  Also returns KERN_SUCCESS if addr is outside the
4000  * stack range (this is strange, but preserves compatibility with
4001  * the grow function in vm_machdep.c).
4002  *
4003  * No requirements.
4004  */
4005 int
4006 vm_map_growstack (vm_map_t map, vm_offset_t addr)
4007 {
4008         vm_map_entry_t prev_entry;
4009         vm_map_entry_t stack_entry;
4010         vm_map_entry_t next;
4011         struct vmspace *vm;
4012         struct lwp *lp;
4013         struct proc *p;
4014         vm_offset_t    end;
4015         int grow_amount;
4016         int rv = KERN_SUCCESS;
4017         int is_procstack;
4018         int use_read_lock = 1;
4019         int count;
4020
4021         /*
4022          * Find the vm
4023          */
4024         lp = curthread->td_lwp;
4025         p = curthread->td_proc;
4026         KKASSERT(lp != NULL);
4027         vm = lp->lwp_vmspace;
4028
4029         /*
4030          * Growstack is only allowed on the current process.  We disallow
4031          * other use cases, e.g. trying to access memory via procfs that
4032          * the stack hasn't grown into.
4033          */
4034         if (map != &vm->vm_map) {
4035                 return KERN_FAILURE;
4036         }
4037
4038         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
4039 Retry:
4040         if (use_read_lock)
4041                 vm_map_lock_read(map);
4042         else
4043                 vm_map_lock(map);
4044
4045         /*
4046          * If addr is already in the entry range, no need to grow.
4047          * prev_entry returns NULL if addr is at the head.
4048          */
4049         if (vm_map_lookup_entry(map, addr, &prev_entry))
4050                 goto done;
4051         if (prev_entry)
4052                 stack_entry = vm_map_rb_tree_RB_NEXT(prev_entry);
4053         else
4054                 stack_entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
4055
4056         if (stack_entry == NULL)
4057                 goto done;
4058         if (prev_entry == NULL)
4059                 end = stack_entry->ba.start - stack_entry->aux.avail_ssize;
4060         else
4061                 end = prev_entry->ba.end;
4062
4063         /*
4064          * This next test mimics the old grow function in vm_machdep.c.
4065          * It really doesn't quite make sense, but we do it anyway
4066          * for compatibility.
4067          *
4068          * If not growable stack, return success.  This signals the
4069          * caller to proceed as he would normally with normal vm.
4070          */
4071         if (stack_entry->aux.avail_ssize < 1 ||
4072             addr >= stack_entry->ba.start ||
4073             addr <  stack_entry->ba.start - stack_entry->aux.avail_ssize) {
4074                 goto done;
4075         }
4076
4077         /* Find the minimum grow amount */
4078         grow_amount = roundup (stack_entry->ba.start - addr, PAGE_SIZE);
4079         if (grow_amount > stack_entry->aux.avail_ssize) {
4080                 rv = KERN_NO_SPACE;
4081                 goto done;
4082         }
4083
4084         /*
4085          * If there is no longer enough space between the entries
4086          * nogo, and adjust the available space.  Note: this
4087          * should only happen if the user has mapped into the
4088          * stack area after the stack was created, and is
4089          * probably an error.
4090          *
4091          * This also effectively destroys any guard page the user
4092          * might have intended by limiting the stack size.
4093          */
4094         if (grow_amount > stack_entry->ba.start - end) {
4095                 if (use_read_lock && vm_map_lock_upgrade(map)) {
4096                         /* lost lock */
4097                         use_read_lock = 0;
4098                         goto Retry;
4099                 }
4100                 use_read_lock = 0;
4101                 stack_entry->aux.avail_ssize = stack_entry->ba.start - end;
4102                 rv = KERN_NO_SPACE;
4103                 goto done;
4104         }
4105
4106         is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr;
4107
4108         /* If this is the main process stack, see if we're over the
4109          * stack limit.
4110          */
4111         if (is_procstack && (vm->vm_ssize + grow_amount >
4112                              p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
4113                 rv = KERN_NO_SPACE;
4114                 goto done;
4115         }
4116
4117         /* Round up the grow amount modulo SGROWSIZ */
4118         grow_amount = roundup (grow_amount, sgrowsiz);
4119         if (grow_amount > stack_entry->aux.avail_ssize) {
4120                 grow_amount = stack_entry->aux.avail_ssize;
4121         }
4122         if (is_procstack && (vm->vm_ssize + grow_amount >
4123                              p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
4124                 grow_amount = p->p_rlimit[RLIMIT_STACK].rlim_cur - vm->vm_ssize;
4125         }
4126
4127         /* If we would blow our VMEM resource limit, no go */
4128         if (map->size + grow_amount > p->p_rlimit[RLIMIT_VMEM].rlim_cur) {
4129                 rv = KERN_NO_SPACE;
4130                 goto done;
4131         }
4132
4133         if (use_read_lock && vm_map_lock_upgrade(map)) {
4134                 /* lost lock */
4135                 use_read_lock = 0;
4136                 goto Retry;
4137         }
4138         use_read_lock = 0;
4139
4140         /* Get the preliminary new entry start value */
4141         addr = stack_entry->ba.start - grow_amount;
4142
4143         /* If this puts us into the previous entry, cut back our growth
4144          * to the available space.  Also, see the note above.
4145          */
4146         if (addr < end) {
4147                 stack_entry->aux.avail_ssize = stack_entry->ba.start - end;
4148                 addr = end;
4149         }
4150
4151         rv = vm_map_insert(map, &count,
4152                            NULL, NULL,
4153                            0, NULL,
4154                            addr, stack_entry->ba.start,
4155                            VM_MAPTYPE_NORMAL,
4156                            VM_SUBSYS_STACK, VM_PROT_ALL, VM_PROT_ALL, 0);
4157
4158         /* Adjust the available stack space by the amount we grew. */
4159         if (rv == KERN_SUCCESS) {
4160                 if (prev_entry) {
4161                         vm_map_clip_end(map, prev_entry, addr, &count);
4162                         next = vm_map_rb_tree_RB_NEXT(prev_entry);
4163                 } else {
4164                         next = RB_MIN(vm_map_rb_tree, &map->rb_root);
4165                 }
4166                 if (next->ba.end != stack_entry->ba.start  ||
4167                     next->ba.start != addr) {
4168                         panic ("Bad stack grow start/end in new stack entry");
4169                 } else {
4170                         next->aux.avail_ssize =
4171                                 stack_entry->aux.avail_ssize -
4172                                 (next->ba.end - next->ba.start);
4173                         if (is_procstack) {
4174                                 vm->vm_ssize += next->ba.end -
4175                                                 next->ba.start;
4176                         }
4177                 }
4178
4179                 if (map->flags & MAP_WIREFUTURE) {
4180                         vm_map_user_wiring(map,
4181                                            next->ba.start,
4182                                            next->ba.end,
4183                                            FALSE);
4184                 }
4185         }
4186
4187 done:
4188         if (use_read_lock)
4189                 vm_map_unlock_read(map);
4190         else
4191                 vm_map_unlock(map);
4192         vm_map_entry_release(count);
4193         return (rv);
4194 }
4195
4196 /*
4197  * Unshare the specified VM space for exec.  If other processes are
4198  * mapped to it, then create a new one.  The new vmspace is null.
4199  *
4200  * No requirements.
4201  */
4202 void
4203 vmspace_exec(struct proc *p, struct vmspace *vmcopy)
4204 {
4205         struct vmspace *oldvmspace = p->p_vmspace;
4206         struct vmspace *newvmspace;
4207         vm_map_t map = &p->p_vmspace->vm_map;
4208
4209         /*
4210          * If we are execing a resident vmspace we fork it, otherwise
4211          * we create a new vmspace.  Note that exitingcnt is not
4212          * copied to the new vmspace.
4213          */
4214         lwkt_gettoken(&oldvmspace->vm_map.token);
4215         if (vmcopy)  {
4216                 newvmspace = vmspace_fork(vmcopy, NULL, NULL);
4217                 lwkt_gettoken(&newvmspace->vm_map.token);
4218         } else {
4219                 newvmspace = vmspace_alloc(vm_map_min(map), vm_map_max(map));
4220                 lwkt_gettoken(&newvmspace->vm_map.token);
4221                 bcopy(&oldvmspace->vm_startcopy, &newvmspace->vm_startcopy,
4222                       (caddr_t)&oldvmspace->vm_endcopy -
4223                        (caddr_t)&oldvmspace->vm_startcopy);
4224         }
4225
4226         /*
4227          * Finish initializing the vmspace before assigning it
4228          * to the process.  The vmspace will become the current vmspace
4229          * if p == curproc.
4230          */
4231         pmap_pinit2(vmspace_pmap(newvmspace));
4232         pmap_replacevm(p, newvmspace, 0);
4233         lwkt_reltoken(&newvmspace->vm_map.token);
4234         lwkt_reltoken(&oldvmspace->vm_map.token);
4235         vmspace_rel(oldvmspace);
4236 }
4237
4238 /*
4239  * Unshare the specified VM space for forcing COW.  This
4240  * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
4241  */
4242 void
4243 vmspace_unshare(struct proc *p)
4244 {
4245         struct vmspace *oldvmspace = p->p_vmspace;
4246         struct vmspace *newvmspace;
4247
4248         lwkt_gettoken(&oldvmspace->vm_map.token);
4249         if (vmspace_getrefs(oldvmspace) == 1) {
4250                 lwkt_reltoken(&oldvmspace->vm_map.token);
4251                 return;
4252         }
4253         newvmspace = vmspace_fork(oldvmspace, NULL, NULL);
4254         lwkt_gettoken(&newvmspace->vm_map.token);
4255         pmap_pinit2(vmspace_pmap(newvmspace));
4256         pmap_replacevm(p, newvmspace, 0);
4257         lwkt_reltoken(&newvmspace->vm_map.token);
4258         lwkt_reltoken(&oldvmspace->vm_map.token);
4259         vmspace_rel(oldvmspace);
4260 }
4261
4262 /*
4263  * vm_map_hint: return the beginning of the best area suitable for
4264  * creating a new mapping with "prot" protection.
4265  *
4266  * No requirements.
4267  */
4268 vm_offset_t
4269 vm_map_hint(struct proc *p, vm_offset_t addr, vm_prot_t prot)
4270 {
4271         struct vmspace *vms = p->p_vmspace;
4272         struct rlimit limit;
4273         rlim_t dsiz;
4274
4275         /*
4276          * Acquire datasize limit for mmap() operation,
4277          * calculate nearest power of 2.
4278          */
4279         if (kern_getrlimit(RLIMIT_DATA, &limit))
4280                 limit.rlim_cur = maxdsiz;
4281         dsiz = limit.rlim_cur;
4282
4283         if (!randomize_mmap || addr != 0) {
4284                 /*
4285                  * Set a reasonable start point for the hint if it was
4286                  * not specified or if it falls within the heap space.
4287                  * Hinted mmap()s do not allocate out of the heap space.
4288                  */
4289                 if (addr == 0 ||
4290                     (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
4291                      addr < round_page((vm_offset_t)vms->vm_daddr + dsiz))) {
4292                         addr = round_page((vm_offset_t)vms->vm_daddr + dsiz);
4293                 }
4294
4295                 return addr;
4296         }
4297
4298         /*
4299          * randomize_mmap && addr == 0.  For now randomize the
4300          * address within a dsiz range beyond the data limit.
4301          */
4302         addr = (vm_offset_t)vms->vm_daddr + dsiz;
4303         if (dsiz)
4304                 addr += (karc4random64() & 0x7FFFFFFFFFFFFFFFLU) % dsiz;
4305         return (round_page(addr));
4306 }
4307
4308 /*
4309  * Finds the VM object, offset, and protection for a given virtual address
4310  * in the specified map, assuming a page fault of the type specified.
4311  *
4312  * Leaves the map in question locked for read; return values are guaranteed
4313  * until a vm_map_lookup_done call is performed.  Note that the map argument
4314  * is in/out; the returned map must be used in the call to vm_map_lookup_done.
4315  *
4316  * A handle (out_entry) is returned for use in vm_map_lookup_done, to make
4317  * that fast.
4318  *
4319  * If a lookup is requested with "write protection" specified, the map may
4320  * be changed to perform virtual copying operations, although the data
4321  * referenced will remain the same.
4322  *
4323  * No requirements.
4324  */
4325 int
4326 vm_map_lookup(vm_map_t *var_map,                /* IN/OUT */
4327               vm_offset_t vaddr,
4328               vm_prot_t fault_typea,
4329               vm_map_entry_t *out_entry,        /* OUT */
4330               struct vm_map_backing **bap,      /* OUT */
4331               vm_pindex_t *pindex,              /* OUT */
4332               vm_pindex_t *pcount,              /* OUT */
4333               vm_prot_t *out_prot,              /* OUT */
4334               int *wflags)                      /* OUT */
4335 {
4336         vm_map_entry_t entry;
4337         vm_map_t map = *var_map;
4338         vm_prot_t prot;
4339         vm_prot_t fault_type = fault_typea;
4340         int use_read_lock = 1;
4341         int rv = KERN_SUCCESS;
4342         int count;
4343         thread_t td = curthread;
4344
4345         /*
4346          * vm_map_entry_reserve() implements an important mitigation
4347          * against mmap() span running the kernel out of vm_map_entry
4348          * structures, but it can also cause an infinite call recursion.
4349          * Use td_nest_count to prevent an infinite recursion (allows
4350          * the vm_map code to dig into the pcpu vm_map_entry reserve).
4351          */
4352         count = 0;
4353         if (td->td_nest_count == 0) {
4354                 ++td->td_nest_count;
4355                 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
4356                 --td->td_nest_count;
4357         }
4358 RetryLookup:
4359         if (use_read_lock)
4360                 vm_map_lock_read(map);
4361         else
4362                 vm_map_lock(map);
4363
4364         /*
4365          * Always do a full lookup.  The hint doesn't get us much anymore
4366          * now that the map is RB'd.
4367          */
4368         cpu_ccfence();
4369         *out_entry = NULL;
4370         *bap = NULL;
4371
4372         {
4373                 vm_map_entry_t tmp_entry;
4374
4375                 if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
4376                         rv = KERN_INVALID_ADDRESS;
4377                         goto done;
4378                 }
4379                 entry = tmp_entry;
4380                 *out_entry = entry;
4381         }
4382
4383         /*
4384          * Handle submaps.
4385          */
4386         if (entry->maptype == VM_MAPTYPE_SUBMAP) {
4387                 vm_map_t old_map = map;
4388
4389                 *var_map = map = entry->ba.sub_map;
4390                 if (use_read_lock)
4391                         vm_map_unlock_read(old_map);
4392                 else
4393                         vm_map_unlock(old_map);
4394                 use_read_lock = 1;
4395                 goto RetryLookup;
4396         }
4397
4398         /*
4399          * Check whether this task is allowed to have this page.
4400          * Note the special case for MAP_ENTRY_COW pages with an override.
4401          * This is to implement a forced COW for debuggers.
4402          */
4403         if (fault_type & VM_PROT_OVERRIDE_WRITE)
4404                 prot = entry->max_protection;
4405         else
4406                 prot = entry->protection;
4407
4408         fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
4409         if ((fault_type & prot) != fault_type) {
4410                 rv = KERN_PROTECTION_FAILURE;
4411                 goto done;
4412         }
4413
4414         if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
4415             (entry->eflags & MAP_ENTRY_COW) &&
4416             (fault_type & VM_PROT_WRITE) &&
4417             (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) {
4418                 rv = KERN_PROTECTION_FAILURE;
4419                 goto done;
4420         }
4421
4422         /*
4423          * Flag regular pages that are supposed to be wired.  Remove prior
4424          * semantics that disallowed protection changes for such pages.
4425          *
4426          * The prior semantics are not used by modern systems.  Applications
4427          * do not assume an inability to change protection modes and may
4428          * operate incorrectly if we try to prevent protection changes.
4429          *
4430          * Modern applications are aware that even for locked memory,
4431          * changing protection modes, modifying MAP_PRIVATE mappings,
4432          * or fork() may still cause page faults on the locked memory.
4433          */
4434         *wflags = 0;
4435         if (entry->wired_count) {
4436                 *wflags |= FW_WIRED;
4437 #if 0
4438                 prot = fault_type = entry->protection;
4439 #endif
4440         }
4441
4442         if (curthread->td_lwp && curthread->td_lwp->lwp_vmspace &&
4443             pmap_emulate_ad_bits(&curthread->td_lwp->lwp_vmspace->vm_pmap)) {
4444                 if ((prot & VM_PROT_WRITE) == 0)
4445                         fault_type |= VM_PROT_WRITE;
4446         }
4447
4448         /*
4449          * Only NORMAL maps are object-based.  UKSMAPs are not.
4450          */
4451         if (entry->maptype != VM_MAPTYPE_NORMAL) {
4452                 *bap = NULL;
4453                 goto skip;
4454         }
4455
4456         /*
4457          * If the entry was copy-on-write, we either ...
4458          */
4459         if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
4460                 /*
4461                  * If we want to write the page, we may as well handle that
4462                  * now since we've got the map locked.
4463                  *
4464                  * If we don't need to write the page, we just demote the
4465                  * permissions allowed.
4466                  */
4467                 if (fault_type & VM_PROT_WRITE) {
4468                         /*
4469                          * Not allowed if TDF_NOFAULT is set as the shadowing
4470                          * operation can deadlock against the faulting
4471                          * function due to the copy-on-write.
4472                          */
4473                         if (curthread->td_flags & TDF_NOFAULT) {
4474                                 rv = KERN_FAILURE_NOFAULT;
4475                                 goto done;
4476                         }
4477
4478                         /*
4479                          * Make a new vm_map_backing + object, and place it
4480                          * in the object chain.  Note that no new references
4481                          * have appeared -- one just moved from the map to
4482                          * the new object.
4483                          */
4484                         if (use_read_lock && vm_map_lock_upgrade(map)) {
4485                                 /* lost lock */
4486                                 use_read_lock = 0;
4487                                 goto RetryLookup;
4488                         }
4489                         use_read_lock = 0;
4490                         vm_map_entry_shadow(entry);
4491                         *wflags |= FW_DIDCOW;
4492                 } else {
4493                         /*
4494                          * We're attempting to read a copy-on-write page --
4495                          * don't allow writes.
4496                          */
4497                         prot &= ~VM_PROT_WRITE;
4498                 }
4499         }
4500
4501         /*
4502          * Create an object if necessary.  This code also handles
4503          * partitioning large entries to improve vm_fault performance.
4504          */
4505         if (entry->ba.object == NULL && !map->system_map) {
4506                 if (use_read_lock && vm_map_lock_upgrade(map))  {
4507                         /* lost lock */
4508                         use_read_lock = 0;
4509                         goto RetryLookup;
4510                 }
4511                 use_read_lock = 0;
4512
4513                 /*
4514                  * Partition large entries, giving each its own VM object,
4515                  * to improve concurrent fault performance.  This is only
4516                  * applicable to userspace.
4517                  */
4518                 if (map != kernel_map &&
4519                     entry->maptype == VM_MAPTYPE_NORMAL &&
4520                     ((entry->ba.start ^ entry->ba.end) &
4521                      ~MAP_ENTRY_PARTITION_MASK) &&
4522                     vm_map_partition_enable) {
4523                         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
4524                                 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
4525                                 ++mycpu->gd_cnt.v_intrans_coll;
4526                                 ++mycpu->gd_cnt.v_intrans_wait;
4527                                 vm_map_transition_wait(map, 0);
4528                                 goto RetryLookup;
4529                         }
4530                         vm_map_entry_partition(map, entry, vaddr, &count);
4531                 }
4532                 vm_map_entry_allocate_object(entry);
4533         }
4534
4535         /*
4536          * Return the object/offset from this entry.  If the entry was
4537          * copy-on-write or empty, it has been fixed up.
4538          */
4539         *bap = &entry->ba;
4540
4541 skip:
4542         *pindex = OFF_TO_IDX((vaddr - entry->ba.start) + entry->ba.offset);
4543         *pcount = OFF_TO_IDX(entry->ba.end - trunc_page(vaddr));
4544
4545         /*
4546          * Return whether this is the only map sharing this data.  On
4547          * success we return with a read lock held on the map.  On failure
4548          * we return with the map unlocked.
4549          */
4550         *out_prot = prot;
4551 done:
4552         if (rv == KERN_SUCCESS) {
4553                 if (use_read_lock == 0)
4554                         vm_map_lock_downgrade(map);
4555         } else if (use_read_lock) {
4556                 vm_map_unlock_read(map);
4557         } else {
4558                 vm_map_unlock(map);
4559         }
4560         if (count > 0)
4561                 vm_map_entry_release(count);
4562
4563         return (rv);
4564 }
4565
4566 /*
4567  * Releases locks acquired by a vm_map_lookup()
4568  * (according to the handle returned by that lookup).
4569  *
4570  * No other requirements.
4571  */
4572 void
4573 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry, int count)
4574 {
4575         /*
4576          * Unlock the main-level map
4577          */
4578         vm_map_unlock_read(map);
4579         if (count)
4580                 vm_map_entry_release(count);
4581 }
4582
4583 static void
4584 vm_map_entry_partition(vm_map_t map, vm_map_entry_t entry,
4585                        vm_offset_t vaddr, int *countp)
4586 {
4587         vaddr &= ~MAP_ENTRY_PARTITION_MASK;
4588         vm_map_clip_start(map, entry, vaddr, countp);
4589         vaddr += MAP_ENTRY_PARTITION_SIZE;
4590         vm_map_clip_end(map, entry, vaddr, countp);
4591 }
4592
4593 /*
4594  * Quick hack, needs some help to make it more SMP friendly.
4595  */
4596 void
4597 vm_map_interlock(vm_map_t map, struct vm_map_ilock *ilock,
4598                  vm_offset_t ran_beg, vm_offset_t ran_end)
4599 {
4600         struct vm_map_ilock *scan;
4601
4602         ilock->ran_beg = ran_beg;
4603         ilock->ran_end = ran_end;
4604         ilock->flags = 0;
4605
4606         spin_lock(&map->ilock_spin);
4607 restart:
4608         for (scan = map->ilock_base; scan; scan = scan->next) {
4609                 if (ran_end > scan->ran_beg && ran_beg < scan->ran_end) {
4610                         scan->flags |= ILOCK_WAITING;
4611                         ssleep(scan, &map->ilock_spin, 0, "ilock", 0);
4612                         goto restart;
4613                 }
4614         }
4615         ilock->next = map->ilock_base;
4616         map->ilock_base = ilock;
4617         spin_unlock(&map->ilock_spin);
4618 }
4619
4620 void
4621 vm_map_deinterlock(vm_map_t map, struct  vm_map_ilock *ilock)
4622 {
4623         struct vm_map_ilock *scan;
4624         struct vm_map_ilock **scanp;
4625
4626         spin_lock(&map->ilock_spin);
4627         scanp = &map->ilock_base;
4628         while ((scan = *scanp) != NULL) {
4629                 if (scan == ilock) {
4630                         *scanp = ilock->next;
4631                         spin_unlock(&map->ilock_spin);
4632                         if (ilock->flags & ILOCK_WAITING)
4633                                 wakeup(ilock);
4634                         return;
4635                 }
4636                 scanp = &scan->next;
4637         }
4638         spin_unlock(&map->ilock_spin);
4639         panic("vm_map_deinterlock: missing ilock!");
4640 }
4641
4642 #include "opt_ddb.h"
4643 #ifdef DDB
4644 #include <ddb/ddb.h>
4645
4646 /*
4647  * Debugging only
4648  */
4649 DB_SHOW_COMMAND(map, vm_map_print)
4650 {
4651         static int nlines;
4652         /* XXX convert args. */
4653         vm_map_t map = (vm_map_t)addr;
4654         boolean_t full = have_addr;
4655
4656         vm_map_entry_t entry;
4657
4658         db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
4659             (void *)map,
4660             (void *)map->pmap, map->nentries, map->timestamp);
4661         nlines++;
4662
4663         if (!full && db_indent)
4664                 return;
4665
4666         db_indent += 2;
4667         RB_FOREACH(entry, vm_map_rb_tree, &map->rb_root) {
4668                 db_iprintf("map entry %p: start=%p, end=%p\n",
4669                     (void *)entry,
4670                     (void *)entry->ba.start, (void *)entry->ba.end);
4671                 nlines++;
4672                 {
4673                         static char *inheritance_name[4] =
4674                         {"share", "copy", "none", "donate_copy"};
4675
4676                         db_iprintf(" prot=%x/%x/%s",
4677                             entry->protection,
4678                             entry->max_protection,
4679                             inheritance_name[(int)(unsigned char)
4680                                                 entry->inheritance]);
4681                         if (entry->wired_count != 0)
4682                                 db_printf(", wired");
4683                 }
4684                 switch(entry->maptype) {
4685                 case VM_MAPTYPE_SUBMAP:
4686                         /* XXX no %qd in kernel.  Truncate entry->ba.offset. */
4687                         db_printf(", share=%p, offset=0x%lx\n",
4688                             (void *)entry->ba.sub_map,
4689                             (long)entry->ba.offset);
4690                         nlines++;
4691
4692                         db_indent += 2;
4693                         vm_map_print((db_expr_t)(intptr_t)entry->ba.sub_map,
4694                                      full, 0, NULL);
4695                         db_indent -= 2;
4696                         break;
4697                 case VM_MAPTYPE_NORMAL:
4698                         /* XXX no %qd in kernel.  Truncate entry->ba.offset. */
4699                         db_printf(", object=%p, offset=0x%lx",
4700                             (void *)entry->ba.object,
4701                             (long)entry->ba.offset);
4702                         if (entry->eflags & MAP_ENTRY_COW)
4703                                 db_printf(", copy (%s)",
4704                                     ((entry->eflags & MAP_ENTRY_NEEDS_COPY) ?
4705                                      "needed" : "done"));
4706                         db_printf("\n");
4707                         nlines++;
4708
4709                         if (entry->ba.object) {
4710                                 db_indent += 2;
4711                                 vm_object_print((db_expr_t)(intptr_t)
4712                                                 entry->ba.object,
4713                                                 full, 0, NULL);
4714                                 nlines += 4;
4715                                 db_indent -= 2;
4716                         }
4717                         break;
4718                 case VM_MAPTYPE_UKSMAP:
4719                         db_printf(", uksmap=%p, offset=0x%lx",
4720                             (void *)entry->ba.uksmap,
4721                             (long)entry->ba.offset);
4722                         if (entry->eflags & MAP_ENTRY_COW)
4723                                 db_printf(", copy (%s)",
4724                                     (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
4725                         db_printf("\n");
4726                         nlines++;
4727                         break;
4728                 default:
4729                         break;
4730                 }
4731         }
4732         db_indent -= 2;
4733         if (db_indent == 0)
4734                 nlines = 0;
4735 }
4736
4737 /*
4738  * Debugging only
4739  */
4740 DB_SHOW_COMMAND(procvm, procvm)
4741 {
4742         struct proc *p;
4743
4744         if (have_addr) {
4745                 p = (struct proc *) addr;
4746         } else {
4747                 p = curproc;
4748         }
4749
4750         db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
4751             (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
4752             (void *)vmspace_pmap(p->p_vmspace));
4753
4754         vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL);
4755 }
4756
4757 #endif /* DDB */